diff --git a/libnetwork/networkdb/cluster.go b/libnetwork/networkdb/cluster.go index d448c8caef..9156d0da68 100644 --- a/libnetwork/networkdb/cluster.go +++ b/libnetwork/networkdb/cluster.go @@ -284,7 +284,6 @@ func (nDB *NetworkDB) reconnectNode() { } if err := nDB.sendNodeEvent(NodeEventTypeJoin); err != nil { - logrus.Errorf("failed to send node join during reconnect: %v", err) return } diff --git a/libnetwork/networkdb/delegate.go b/libnetwork/networkdb/delegate.go index 2096ea622e..6df358382f 100644 --- a/libnetwork/networkdb/delegate.go +++ b/libnetwork/networkdb/delegate.go @@ -17,6 +17,29 @@ func (d *delegate) NodeMeta(limit int) []byte { return []byte{} } +// getNode looks up nEvent.NodeName in the failed, left and active node +// maps, in that order, without modifying them. It returns nil when the +// node is in none of them, or when the first entry found already has an +// ltime at or beyond nEvent.LTime. +func (nDB *NetworkDB) getNode(nEvent *NodeEvent) *node { + nDB.Lock() + defer nDB.Unlock() + + for _, nodes := range []map[string]*node{ + nDB.failedNodes, + nDB.leftNodes, + nDB.nodes, + } { + if n, ok := nodes[nEvent.NodeName]; ok { + if n.ltime >= nEvent.LTime { + return nil + } + return n + } + } + return nil +} + func (nDB *NetworkDB) checkAndGetNode(nEvent *NodeEvent) *node { nDB.Lock() defer nDB.Unlock() @@ -63,10 +86,34 @@ func (nDB *NetworkDB) purgeSameNode(n *node) { } func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool { - n := nDB.checkAndGetNode(nEvent) + // Update our local clock if the received message has a newer + // time. + nDB.networkClock.Witness(nEvent.LTime) + + n := nDB.getNode(nEvent) if n == nil { return false } + // If it's a node leave event for a manager and this is the only manager we + // know of, we want the reconnect logic to kick in. In a single manager + // cluster the manager's gossip can't be bootstrapped unless some other node + // connects to it. 
+ if nEvent.Type == NodeEventTypeLeave { + // bootStrapIP is written by Join under the lock, so read it under the lock as well. + nDB.Lock() + soleBootstrap := len(nDB.bootStrapIP) == 1 && nDB.bootStrapIP[0].Equal(n.Addr) + nDB.Unlock() + if soleBootstrap { + n.ltime = nEvent.LTime + return true + } + } + + n = nDB.checkAndGetNode(nEvent) + // checkAndGetNode re-checks under its own lock and can still return nil. + if n == nil { + return false + } nDB.purgeSameNode(n) n.ltime = nEvent.LTime @@ -76,11 +123,13 @@ func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool { nDB.Lock() nDB.nodes[n.Name] = n nDB.Unlock() + logrus.Infof("Node join event for %s/%s", n.Name, n.Addr) return true case NodeEventTypeLeave: nDB.Lock() nDB.leftNodes[n.Name] = n nDB.Unlock() + logrus.Infof("Node leave event for %s/%s", n.Name, n.Addr) return true } diff --git a/libnetwork/networkdb/event_delegate.go b/libnetwork/networkdb/event_delegate.go index 0b137bc818..23e16832e7 100644 --- a/libnetwork/networkdb/event_delegate.go +++ b/libnetwork/networkdb/event_delegate.go @@ -22,6 +22,7 @@ func (e *eventDelegate) broadcastNodeEvent(addr net.IP, op opType) { } func (e *eventDelegate) NotifyJoin(mn *memberlist.Node) { + logrus.Infof("Node %s/%s, joined gossip cluster", mn.Name, mn.Addr) e.broadcastNodeEvent(mn.Addr, opCreate) e.nDB.Lock() // In case the node is rejoining after a failure or leave, @@ -37,9 +38,12 @@ func (e *eventDelegate) NotifyJoin(mn *memberlist.Node) { e.nDB.nodes[mn.Name] = &node{Node: *mn} e.nDB.Unlock() + logrus.Infof("Node %s/%s, added to nodes list", mn.Name, mn.Addr) } func (e *eventDelegate) NotifyLeave(mn *memberlist.Node) { + var failed bool + logrus.Infof("Node %s/%s, left gossip cluster", mn.Name, mn.Addr) e.broadcastNodeEvent(mn.Addr, opDelete) e.nDB.deleteNodeTableEntries(mn.Name) e.nDB.deleteNetworkEntriesForNode(mn.Name) @@ -51,8 +55,13 @@ func (e *eventDelegate) NotifyLeave(mn *memberlist.Node) { // Explicit leave will have already removed the node from the list of nodes (nDB.nodes) and put it into the leftNodes map n.reapTime = nodeReapInterval e.nDB.failedNodes[mn.Name] = n + failed = true } e.nDB.Unlock() + if failed { + logrus.Infof("Node %s/%s, added to failed nodes list", 
mn.Name, mn.Addr) + } + } func (e *eventDelegate) NotifyUpdate(n *memberlist.Node) { diff --git a/libnetwork/networkdb/networkdb.go b/libnetwork/networkdb/networkdb.go index 86b0128b60..ecb2d714a4 100644 --- a/libnetwork/networkdb/networkdb.go +++ b/libnetwork/networkdb/networkdb.go @@ -4,6 +4,7 @@ package networkdb import ( "fmt" + "net" "strings" "sync" "time" @@ -88,6 +89,10 @@ type NetworkDB struct { // Reference to the memberlist's keyring to add & remove keys keyring *memberlist.Keyring + + // bootStrapIP is the list of IPs that can be used to bootstrap + // the gossip. + bootStrapIP []net.IP } // PeerInfo represents the peer (gossip cluster) nodes of a network @@ -194,6 +199,21 @@ func New(c *Config) (*NetworkDB, error) { // Join joins this NetworkDB instance with a list of peer NetworkDB // instances passed by the caller in the form of addr:port func (nDB *NetworkDB) Join(members []string) error { + // Rebuild the bootstrap list on every call so that repeated joins + // do not accumulate stale or duplicate entries. + bootStrapIP := make([]net.IP, 0, len(members)) + for _, m := range members { + host := m + // net.ParseIP only accepts a bare IP, so drop the port if present. + if h, _, err := net.SplitHostPort(m); err == nil { + host = h + } + // Unparsable members stay as nil so the length still counts them. + bootStrapIP = append(bootStrapIP, net.ParseIP(host)) + } + nDB.Lock() + nDB.bootStrapIP = bootStrapIP + nDB.Unlock() return nDB.clusterJoin(members) }