Kaynağa Gözat

NetworkDB incorrect number of entries in networkNodes

A rapid (within networkReapTime 30min) leave/join network
can corrupt the list of nodes per network with multiple copies
of the same nodes.
The fix makes sure that each node is present only once

Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
Flavio Crisciani 8 yıl önce
ebeveyn
işleme
051a0d5ce9

+ 1 - 2
libnetwork/networkdb/cluster.go

@@ -310,12 +310,11 @@ func (nDB *NetworkDB) reapState() {
 
 
 func (nDB *NetworkDB) reapNetworks() {
 func (nDB *NetworkDB) reapNetworks() {
 	nDB.Lock()
 	nDB.Lock()
-	for name, nn := range nDB.networks {
+	for _, nn := range nDB.networks {
 		for id, n := range nn {
 		for id, n := range nn {
 			if n.leaving {
 			if n.leaving {
 				if n.reapTime <= 0 {
 				if n.reapTime <= 0 {
 					delete(nn, id)
 					delete(nn, id)
-					nDB.deleteNetworkNode(id, name)
 					continue
 					continue
 				}
 				}
 				n.reapTime -= reapPeriod
 				n.reapTime -= reapPeriod

+ 6 - 1
libnetwork/networkdb/delegate.go

@@ -179,7 +179,12 @@ func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
 			flushEntries = true
 			flushEntries = true
 		}
 		}
 
 
-		nDB.addNetworkNode(nEvent.NetworkID, nEvent.NodeName)
+		if nEvent.Type == NetworkEventTypeLeave {
+			nDB.deleteNetworkNode(nEvent.NetworkID, nEvent.NodeName)
+		} else {
+			nDB.addNetworkNode(nEvent.NetworkID, nEvent.NodeName)
+		}
+
 		return true
 		return true
 	}
 	}
 
 

+ 7 - 2
libnetwork/networkdb/networkdb.go

@@ -487,7 +487,7 @@ func (nDB *NetworkDB) JoinNetwork(nid string) error {
 		},
 		},
 		RetransmitMult: 4,
 		RetransmitMult: 4,
 	}
 	}
-	nDB.networkNodes[nid] = append(nDB.networkNodes[nid], nDB.config.NodeName)
+	nDB.addNetworkNode(nid, nDB.config.NodeName)
 	networkNodes := nDB.networkNodes[nid]
 	networkNodes := nDB.networkNodes[nid]
 	nDB.Unlock()
 	nDB.Unlock()
 
 
@@ -522,6 +522,8 @@ func (nDB *NetworkDB) LeaveNetwork(nid string) error {
 		entries []*entry
 		entries []*entry
 	)
 	)
 
 
+	nDB.deleteNetworkNode(nid, nDB.config.NodeName)
+
 	nwWalker := func(path string, v interface{}) bool {
 	nwWalker := func(path string, v interface{}) bool {
 		entry, ok := v.(*entry)
 		entry, ok := v.(*entry)
 		if !ok {
 		if !ok {
@@ -580,7 +582,10 @@ func (nDB *NetworkDB) addNetworkNode(nid string, nodeName string) {
 // passed network. Caller should hold the NetworkDB lock while calling
 // passed network. Caller should hold the NetworkDB lock while calling
 // this
 // this
 func (nDB *NetworkDB) deleteNetworkNode(nid string, nodeName string) {
 func (nDB *NetworkDB) deleteNetworkNode(nid string, nodeName string) {
-	nodes := nDB.networkNodes[nid]
+	nodes, ok := nDB.networkNodes[nid]
+	if !ok || len(nodes) == 0 {
+		return
+	}
 	newNodes := make([]string, 0, len(nodes)-1)
 	newNodes := make([]string, 0, len(nodes)-1)
 	for _, name := range nodes {
 	for _, name := range nodes {
 		if name == nodeName {
 		if name == nodeName {

+ 78 - 0
libnetwork/networkdb/networkdb_test.go

@@ -436,3 +436,81 @@ func TestNetworkDBCRUDMediumCluster(t *testing.T) {
 	log.Print("Closing DB instances...")
 	log.Print("Closing DB instances...")
 	closeNetworkDBInstances(dbs)
 	closeNetworkDBInstances(dbs)
 }
 }
+
+func TestNetworkDBNodeJoinLeaveIteration(t *testing.T) {
+	maxRetry := 5
+	dbs := createNetworkDBInstances(t, 2, "node")
+
+	// Single node Join/Leave
+	err := dbs[0].JoinNetwork("network1")
+	assert.NoError(t, err)
+
+	if len(dbs[0].networkNodes["network1"]) != 1 {
+		t.Fatalf("The networkNodes list has to have be 1 instead of %d", len(dbs[0].networkNodes["network1"]))
+	}
+
+	err = dbs[0].LeaveNetwork("network1")
+	assert.NoError(t, err)
+
+	if len(dbs[0].networkNodes["network1"]) != 0 {
+		t.Fatalf("The networkNodes list has to have be 0 instead of %d", len(dbs[0].networkNodes["network1"]))
+	}
+
+	// Multiple nodes Join/Leave
+	err = dbs[0].JoinNetwork("network1")
+	assert.NoError(t, err)
+
+	err = dbs[1].JoinNetwork("network1")
+	assert.NoError(t, err)
+
+	// Wait for the propagation on db[0]
+	for i := 0; i < maxRetry; i++ {
+		if len(dbs[0].networkNodes["network1"]) == 2 {
+			break
+		}
+		time.Sleep(1 * time.Second)
+	}
+	if len(dbs[0].networkNodes["network1"]) != 2 {
+		t.Fatalf("The networkNodes list has to have be 2 instead of %d - %v", len(dbs[0].networkNodes["network1"]), dbs[0].networkNodes["network1"])
+	}
+
+	// Wait for the propagation on db[1]
+	for i := 0; i < maxRetry; i++ {
+		if len(dbs[1].networkNodes["network1"]) == 2 {
+			break
+		}
+		time.Sleep(1 * time.Second)
+	}
+	if len(dbs[1].networkNodes["network1"]) != 2 {
+		t.Fatalf("The networkNodes list has to have be 2 instead of %d - %v", len(dbs[1].networkNodes["network1"]), dbs[1].networkNodes["network1"])
+	}
+
+	// Try a quick leave/join
+	err = dbs[0].LeaveNetwork("network1")
+	assert.NoError(t, err)
+	err = dbs[0].JoinNetwork("network1")
+	assert.NoError(t, err)
+
+	for i := 0; i < maxRetry; i++ {
+		if len(dbs[0].networkNodes["network1"]) == 2 {
+			break
+		}
+		time.Sleep(1 * time.Second)
+	}
+	if len(dbs[0].networkNodes["network1"]) != 2 {
+		t.Fatalf("The networkNodes list has to have be 2 instead of %d - %v", len(dbs[0].networkNodes["network1"]), dbs[0].networkNodes["network1"])
+	}
+
+	for i := 0; i < maxRetry; i++ {
+		if len(dbs[1].networkNodes["network1"]) == 2 {
+			break
+		}
+		time.Sleep(1 * time.Second)
+	}
+	if len(dbs[1].networkNodes["network1"]) != 2 {
+		t.Fatalf("The networkNodes list has to have be 2 instead of %d - %v", len(dbs[1].networkNodes["network1"]), dbs[1].networkNodes["network1"])
+	}
+
+	dbs[0].Close()
+	dbs[1].Close()
+}