4 years ago · d7a2635537
--- a/libnetwork/networkdb/cluster.go
+++ b/libnetwork/networkdb/cluster.go
@@ -18,12 +18,10 @@ import (
 
															 )
														
 
															 const (
														
 
															-	reapPeriod            = 5 * time.Second
														
 
															-	rejoinClusterDuration = 10 * time.Second
														
 
															-	rejoinInterval        = 60 * time.Second
														
 
															-	retryInterval         = 1 * time.Second
														
 
															-	nodeReapInterval      = 24 * time.Hour
														
 
															-	nodeReapPeriod        = 2 * time.Hour
														
 
															+	reapPeriod       = 5 * time.Second
														
 
															+	retryInterval    = 1 * time.Second
														
 
															+	nodeReapInterval = 24 * time.Hour
														
 
															+	nodeReapPeriod   = 2 * time.Hour
														
 
															 	// considering a cluster with > 20 nodes and a drain speed of 100 msg/s
														
 
															 	// the following is roughly 1 minute
														
 
															 	maxQueueLenBroadcastOnSync = 500
														
@@ -172,7 +170,7 @@ func (nDB *NetworkDB) clusterInit() error {
 
															 		{config.PushPullInterval, nDB.bulkSyncTables},
														
 
															 		{retryInterval, nDB.reconnectNode},
														
 
															 		{nodeReapPeriod, nDB.reapDeadNode},
														
 
															-		{rejoinInterval, nDB.rejoinClusterBootStrap},
														
 
															+		{nDB.config.rejoinClusterInterval, nDB.rejoinClusterBootStrap},
														
 
															 	} {
														
 
															 		t := time.NewTicker(trigger.interval)
														
 
															 		go nDB.triggerFunc(trigger.interval, t.C, trigger.fn)
														
@@ -210,7 +208,8 @@ func (nDB *NetworkDB) clusterJoin(members []string) error {
 
															 	if _, err := mlist.Join(members); err != nil {
														
 
															 		// In case of failure, we no longer need to explicitly call retryJoin.
														
 
															-		// rejoinClusterBootStrap, which runs every minute, will retryJoin for 10sec
														
 
															+		// rejoinClusterBootStrap, which runs every nDB.config.rejoinClusterInterval,
														
 
															+		// will retryJoin for nDB.config.rejoinClusterDuration.
														
 
															 		return fmt.Errorf("could not join node to memberlist: %v", err)
														
 
															 	}
														
@@ -324,7 +323,7 @@ func (nDB *NetworkDB) rejoinClusterBootStrap() {
 
															 	}
														
 
															 	// None of the bootStrap nodes are in the cluster, call memberlist join
														
 
															 	logrus.Debugf("rejoinClusterBootStrap, calling cluster join with bootStrap %v", bootStrapIPs)
														
 
															-	ctx, cancel := context.WithTimeout(nDB.ctx, rejoinClusterDuration)
														
 
															+	ctx, cancel := context.WithTimeout(nDB.ctx, nDB.config.rejoinClusterDuration)
														
 
															 	defer cancel()
														
 
															 	nDB.retryJoin(ctx, bootStrapIPs)
														
 
															 }
														
--- a/libnetwork/networkdb/networkdb.go
+++ b/libnetwork/networkdb/networkdb.go
@@ -192,6 +192,14 @@ type Config struct {
 
															 	// NOTE this MUST always be higher than reapEntryInterval
														
 
															 	reapNetworkInterval time.Duration
														
 
															+	// rejoinClusterDuration is the timeout that rejoinClusterBootStrap applies to its retryJoin call.
														
 
															+	// Default is 10sec.
														
 
															+	rejoinClusterDuration time.Duration
														
 
															+
														
 
															+	// rejoinClusterInterval is the interval at which rejoinClusterBootStrap runs.
														
 
															+	// Default is 60sec.
														
 
															+	rejoinClusterInterval time.Duration
														
 
															+
														
 
															 	// StatsPrintPeriod the period to use to print queue stats
														
 
															 	// Default is 5min
														
 
															 	StatsPrintPeriod time.Duration
														
@@ -225,13 +233,15 @@ type entry struct {
 
															 func DefaultConfig() *Config {
														
 
															 	hostname, _ := os.Hostname()
														
 
															 	return &Config{
														
 
															-		NodeID:            stringid.TruncateID(stringid.GenerateRandomID()),
														
 
															-		Hostname:          hostname,
														
 
															-		BindAddr:          "0.0.0.0",
														
 
															-		PacketBufferSize:  1400,
														
 
															-		StatsPrintPeriod:  5 * time.Minute,
														
 
															-		HealthPrintPeriod: 1 * time.Minute,
														
 
															-		reapEntryInterval: 30 * time.Minute,
														
 
															+		NodeID:                stringid.TruncateID(stringid.GenerateRandomID()),
														
 
															+		Hostname:              hostname,
														
 
															+		BindAddr:              "0.0.0.0",
														
 
															+		PacketBufferSize:      1400,
														
 
															+		StatsPrintPeriod:      5 * time.Minute,
														
 
															+		HealthPrintPeriod:     1 * time.Minute,
														
 
															+		reapEntryInterval:     30 * time.Minute,
														
 
															+		rejoinClusterDuration: 10 * time.Second,
														
 
															+		rejoinClusterInterval: 60 * time.Second,
														
 
															 	}
														
 
															 }
														
--- a/libnetwork/networkdb/networkdb_test.go
+++ b/libnetwork/networkdb/networkdb_test.go
@@ -819,8 +819,24 @@ func TestParallelDelete(t *testing.T) {
 
															 }
														
 
															 func TestNetworkDBIslands(t *testing.T) {
														
 
															+	pollTimeout := func() time.Duration {
														
 
															+		const defaultTimeout = 120 * time.Second
														
 
															+		dl, ok := t.Deadline()
														
 
															+		if !ok {
														
 
															+			return defaultTimeout
														
 
															+		}
														
 
															+		if d := time.Until(dl); d <= defaultTimeout {
														
 
															+			return d
														
 
															+		}
														
 
															+		return defaultTimeout
														
 
															+	}
														
 
															+
														
 
															 	logrus.SetLevel(logrus.DebugLevel)
														
 
															-	dbs := createNetworkDBInstances(t, 5, "node", DefaultConfig())
														
 
															+	conf := DefaultConfig()
														
 
															+	// Shorten durations to speed up test execution.
														
 
															+	conf.rejoinClusterDuration = conf.rejoinClusterDuration / 10
														
 
															+	conf.rejoinClusterInterval = conf.rejoinClusterInterval / 10
														
 
															+	dbs := createNetworkDBInstances(t, 5, "node", conf)
														
 
															 	// Get the node IP used currently
														
 
															 	node := dbs[0].nodes[dbs[0].config.NodeID]
														
@@ -868,7 +884,7 @@ func TestNetworkDBIslands(t *testing.T) {
 
															 		}
														
 
															 		return poll.Success()
														
 
															 	}
														
 
															-	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(120*time.Second))
														
 
															+	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))
														
 
															 	// Spawn again the first 3 nodes with different names but same IP:port
														
 
															 	for i := 0; i < 3; i++ {
														
@@ -877,7 +893,7 @@ func TestNetworkDBIslands(t *testing.T) {
 
															 		dbs[i] = launchNode(t, *dbs[i].config)
														
 
															 	}
														
 
															-	// Give some time for the reconnect routine to run, it runs every 60s
														
 
															+	// Give the rejoin routine time to run; with the shortened config it runs every 6s.
														
 
															 	check = func(t poll.LogT) poll.Result {
														
 
															 		// Verify that the cluster is again all connected. Note that the 3 previous node did not do any join
														
 
															 		for i := 0; i < 5; i++ {
														
@@ -908,6 +924,6 @@ func TestNetworkDBIslands(t *testing.T) {
 
															 		}
														
 
															 		return poll.Success()
														
 
															 	}
														
 
															-	poll.WaitOn(t, check, poll.WithDelay(10*time.Second), poll.WithTimeout(120*time.Second))
														
 
															+	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))
														
 
															 	closeNetworkDBInstances(t, dbs)
														
 
															 }