преди 4 години · d7a2635537
--- a/libnetwork/networkdb/cluster.go
+++ b/libnetwork/networkdb/cluster.go
@@ -18,12 +18,10 @@ import (
 
				 )
			
 
				 
			
 
				 const (
			
 
				-	reapPeriod            = 5 * time.Second
			
 
				-	rejoinClusterDuration = 10 * time.Second
			
 
				-	rejoinInterval        = 60 * time.Second
			
 
				-	retryInterval         = 1 * time.Second
			
 
				-	nodeReapInterval      = 24 * time.Hour
			
 
				-	nodeReapPeriod        = 2 * time.Hour
			
 
				+	reapPeriod       = 5 * time.Second
			
 
				+	retryInterval    = 1 * time.Second
			
 
				+	nodeReapInterval = 24 * time.Hour
			
 
				+	nodeReapPeriod   = 2 * time.Hour
			
 
				 	// considering a cluster with > 20 nodes and a drain speed of 100 msg/s
			
 
				 	// the following is roughly 1 minute
			
 
				 	maxQueueLenBroadcastOnSync = 500
			
@@ -172,7 +170,7 @@ func (nDB *NetworkDB) clusterInit() error {
 
				 		{config.PushPullInterval, nDB.bulkSyncTables},
			
 
				 		{retryInterval, nDB.reconnectNode},
			
 
				 		{nodeReapPeriod, nDB.reapDeadNode},
			
 
				-		{rejoinInterval, nDB.rejoinClusterBootStrap},
			
 
				+		{nDB.config.rejoinClusterInterval, nDB.rejoinClusterBootStrap},
			
 
				 	} {
			
 
				 		t := time.NewTicker(trigger.interval)
			
 
				 		go nDB.triggerFunc(trigger.interval, t.C, trigger.fn)
			
@@ -210,7 +208,8 @@ func (nDB *NetworkDB) clusterJoin(members []string) error {
 
				 
			
 
				 	if _, err := mlist.Join(members); err != nil {
			
 
				 		// In case of failure, we no longer need to explicitly call retryJoin.
			
 
				-		// rejoinClusterBootStrap, which runs every minute, will retryJoin for 10sec
			
 
				+		// rejoinClusterBootStrap, which runs every nDB.config.rejoinClusterInterval,
			
 
				+		// will retryJoin for nDB.config.rejoinClusterDuration.
			
 
				 		return fmt.Errorf("could not join node to memberlist: %v", err)
			
 
				 	}
			
 
				 
			
@@ -324,7 +323,7 @@ func (nDB *NetworkDB) rejoinClusterBootStrap() {
 
				 	}
			
 
				 	// None of the bootStrap nodes are in the cluster, call memberlist join
			
 
				 	logrus.Debugf("rejoinClusterBootStrap, calling cluster join with bootStrap %v", bootStrapIPs)
			
 
				-	ctx, cancel := context.WithTimeout(nDB.ctx, rejoinClusterDuration)
			
 
				+	ctx, cancel := context.WithTimeout(nDB.ctx, nDB.config.rejoinClusterDuration)
			
 
				 	defer cancel()
			
 
				 	nDB.retryJoin(ctx, bootStrapIPs)
			
 
				 }
			
--- a/libnetwork/networkdb/networkdb.go
+++ b/libnetwork/networkdb/networkdb.go
@@ -192,6 +192,14 @@ type Config struct {
 
				 	// NOTE this MUST always be higher than reapEntryInterval
			
 
				 	reapNetworkInterval time.Duration
			
 
				 
			
 
				+	// rejoinClusterDuration is the retryJoin timeout used by rejoinClusterBootStrap.
			
 
				+	// Default is 10sec.
			
 
				+	rejoinClusterDuration time.Duration
			
 
				+
			
 
				+	// rejoinClusterInterval is the interval at which rejoinClusterBootStrap runs.
			
 
				+	// Default is 60sec.
			
 
				+	rejoinClusterInterval time.Duration
			
 
				+
			
 
				 	// StatsPrintPeriod the period to use to print queue stats
			
 
				 	// Default is 5min
			
 
				 	StatsPrintPeriod time.Duration
			
@@ -225,13 +233,15 @@ type entry struct {
 
				 func DefaultConfig() *Config {
			
 
				 	hostname, _ := os.Hostname()
			
 
				 	return &Config{
			
 
				-		NodeID:            stringid.TruncateID(stringid.GenerateRandomID()),
			
 
				-		Hostname:          hostname,
			
 
				-		BindAddr:          "0.0.0.0",
			
 
				-		PacketBufferSize:  1400,
			
 
				-		StatsPrintPeriod:  5 * time.Minute,
			
 
				-		HealthPrintPeriod: 1 * time.Minute,
			
 
				-		reapEntryInterval: 30 * time.Minute,
			
 
				+		NodeID:                stringid.TruncateID(stringid.GenerateRandomID()),
			
 
				+		Hostname:              hostname,
			
 
				+		BindAddr:              "0.0.0.0",
			
 
				+		PacketBufferSize:      1400,
			
 
				+		StatsPrintPeriod:      5 * time.Minute,
			
 
				+		HealthPrintPeriod:     1 * time.Minute,
			
 
				+		reapEntryInterval:     30 * time.Minute,
			
 
				+		rejoinClusterDuration: 10 * time.Second,
			
 
				+		rejoinClusterInterval: 60 * time.Second,
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/libnetwork/networkdb/networkdb_test.go
+++ b/libnetwork/networkdb/networkdb_test.go
@@ -819,8 +819,24 @@ func TestParallelDelete(t *testing.T) {
 
				 }
			
 
				 
			
 
				 func TestNetworkDBIslands(t *testing.T) {
			
 
				+	pollTimeout := func() time.Duration {
			
 
				+		const defaultTimeout = 120 * time.Second
			
 
				+		dl, ok := t.Deadline()
			
 
				+		if !ok {
			
 
				+			return defaultTimeout
			
 
				+		}
			
 
				+		if d := time.Until(dl); d <= defaultTimeout {
			
 
				+			return d
			
 
				+		}
			
 
				+		return defaultTimeout
			
 
				+	}
			
 
				+
			
 
				 	logrus.SetLevel(logrus.DebugLevel)
			
 
				-	dbs := createNetworkDBInstances(t, 5, "node", DefaultConfig())
			
 
				+	conf := DefaultConfig()
			
 
				+	// Shorten durations to speed up test execution.
			
 
				+	conf.rejoinClusterDuration = conf.rejoinClusterDuration / 10
			
 
				+	conf.rejoinClusterInterval = conf.rejoinClusterInterval / 10
			
 
				+	dbs := createNetworkDBInstances(t, 5, "node", conf)
			
 
				 
			
 
				 	// Get the node IP used currently
			
 
				 	node := dbs[0].nodes[dbs[0].config.NodeID]
			
@@ -868,7 +884,7 @@ func TestNetworkDBIslands(t *testing.T) {
 
				 		}
			
 
				 		return poll.Success()
			
 
				 	}
			
 
				-	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(120*time.Second))
			
 
				+	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))
			
 
				 
			
 
				 	// Spawn again the first 3 nodes with different names but same IP:port
			
 
				 	for i := 0; i < 3; i++ {
			
@@ -877,7 +893,7 @@ func TestNetworkDBIslands(t *testing.T) {
 
				 		dbs[i] = launchNode(t, *dbs[i].config)
			
 
				 	}
			
 
				 
			
 
				-	// Give some time for the reconnect routine to run, it runs every 60s
			
 
				+	// Give some time for rejoinClusterBootStrap to run; it fires every conf.rejoinClusterInterval (6s in this test).
			
 
				 	check = func(t poll.LogT) poll.Result {
			
 
				 		// Verify that the cluster is again all connected. Note that the 3 previous node did not do any join
			
 
				 		for i := 0; i < 5; i++ {
			
@@ -908,6 +924,6 @@ func TestNetworkDBIslands(t *testing.T) {
 
				 		}
			
 
				 		return poll.Success()
			
 
				 	}
			
 
				-	poll.WaitOn(t, check, poll.WithDelay(10*time.Second), poll.WithTimeout(120*time.Second))
			
 
				+	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))
			
 
				 	closeNetworkDBInstances(t, dbs)
			
 
				 }