Browse Source

libnetwork: make rejoin intervals configurable

This allows the rejoin intervals to be chosen according to the context
in which the component is used. In particular, it allows lower intervals
to be used in the TestNetworkDBIslands test.

Signed-off-by: Roman Volosatovs <roman.volosatovs@docker.com>
Roman Volosatovs 4 years ago
parent
commit
d7a2635537

+ 8 - 9
libnetwork/networkdb/cluster.go

@@ -18,12 +18,10 @@ import (
 )
 )
 
 
 const (
 const (
-	reapPeriod            = 5 * time.Second
-	rejoinClusterDuration = 10 * time.Second
-	rejoinInterval        = 60 * time.Second
-	retryInterval         = 1 * time.Second
-	nodeReapInterval      = 24 * time.Hour
-	nodeReapPeriod        = 2 * time.Hour
+	reapPeriod       = 5 * time.Second
+	retryInterval    = 1 * time.Second
+	nodeReapInterval = 24 * time.Hour
+	nodeReapPeriod   = 2 * time.Hour
 	// considering a cluster with > 20 nodes and a drain speed of 100 msg/s
 	// considering a cluster with > 20 nodes and a drain speed of 100 msg/s
 	// the following is roughly 1 minute
 	// the following is roughly 1 minute
 	maxQueueLenBroadcastOnSync = 500
 	maxQueueLenBroadcastOnSync = 500
@@ -172,7 +170,7 @@ func (nDB *NetworkDB) clusterInit() error {
 		{config.PushPullInterval, nDB.bulkSyncTables},
 		{config.PushPullInterval, nDB.bulkSyncTables},
 		{retryInterval, nDB.reconnectNode},
 		{retryInterval, nDB.reconnectNode},
 		{nodeReapPeriod, nDB.reapDeadNode},
 		{nodeReapPeriod, nDB.reapDeadNode},
-		{rejoinInterval, nDB.rejoinClusterBootStrap},
+		{nDB.config.rejoinClusterInterval, nDB.rejoinClusterBootStrap},
 	} {
 	} {
 		t := time.NewTicker(trigger.interval)
 		t := time.NewTicker(trigger.interval)
 		go nDB.triggerFunc(trigger.interval, t.C, trigger.fn)
 		go nDB.triggerFunc(trigger.interval, t.C, trigger.fn)
@@ -210,7 +208,8 @@ func (nDB *NetworkDB) clusterJoin(members []string) error {
 
 
 	if _, err := mlist.Join(members); err != nil {
 	if _, err := mlist.Join(members); err != nil {
 		// In case of failure, we no longer need to explicitly call retryJoin.
 		// In case of failure, we no longer need to explicitly call retryJoin.
-		// rejoinClusterBootStrap, which runs every minute, will retryJoin for 10sec
+		// rejoinClusterBootStrap, which runs every nDB.config.rejoinClusterInterval,
+		// will retryJoin for nDB.config.rejoinClusterDuration.
 		return fmt.Errorf("could not join node to memberlist: %v", err)
 		return fmt.Errorf("could not join node to memberlist: %v", err)
 	}
 	}
 
 
@@ -324,7 +323,7 @@ func (nDB *NetworkDB) rejoinClusterBootStrap() {
 	}
 	}
 	// None of the bootStrap nodes are in the cluster, call memberlist join
 	// None of the bootStrap nodes are in the cluster, call memberlist join
 	logrus.Debugf("rejoinClusterBootStrap, calling cluster join with bootStrap %v", bootStrapIPs)
 	logrus.Debugf("rejoinClusterBootStrap, calling cluster join with bootStrap %v", bootStrapIPs)
-	ctx, cancel := context.WithTimeout(nDB.ctx, rejoinClusterDuration)
+	ctx, cancel := context.WithTimeout(nDB.ctx, nDB.config.rejoinClusterDuration)
 	defer cancel()
 	defer cancel()
 	nDB.retryJoin(ctx, bootStrapIPs)
 	nDB.retryJoin(ctx, bootStrapIPs)
 }
 }

+ 17 - 7
libnetwork/networkdb/networkdb.go

@@ -192,6 +192,14 @@ type Config struct {
 	// NOTE this MUST always be higher than reapEntryInterval
 	// NOTE this MUST always be higher than reapEntryInterval
 	reapNetworkInterval time.Duration
 	reapNetworkInterval time.Duration
 
 
+	// rejoinClusterDuration represents retryJoin timeout used by rejoinClusterBootStrap.
+	// Default is 10sec.
+	rejoinClusterDuration time.Duration
+
+	// rejoinClusterInterval represents interval on which rejoinClusterBootStrap runs.
+	// Default is 60sec.
+	rejoinClusterInterval time.Duration
+
 	// StatsPrintPeriod the period to use to print queue stats
 	// StatsPrintPeriod the period to use to print queue stats
 	// Default is 5min
 	// Default is 5min
 	StatsPrintPeriod time.Duration
 	StatsPrintPeriod time.Duration
@@ -225,13 +233,15 @@ type entry struct {
 func DefaultConfig() *Config {
 func DefaultConfig() *Config {
 	hostname, _ := os.Hostname()
 	hostname, _ := os.Hostname()
 	return &Config{
 	return &Config{
-		NodeID:            stringid.TruncateID(stringid.GenerateRandomID()),
-		Hostname:          hostname,
-		BindAddr:          "0.0.0.0",
-		PacketBufferSize:  1400,
-		StatsPrintPeriod:  5 * time.Minute,
-		HealthPrintPeriod: 1 * time.Minute,
-		reapEntryInterval: 30 * time.Minute,
+		NodeID:                stringid.TruncateID(stringid.GenerateRandomID()),
+		Hostname:              hostname,
+		BindAddr:              "0.0.0.0",
+		PacketBufferSize:      1400,
+		StatsPrintPeriod:      5 * time.Minute,
+		HealthPrintPeriod:     1 * time.Minute,
+		reapEntryInterval:     30 * time.Minute,
+		rejoinClusterDuration: 10 * time.Second,
+		rejoinClusterInterval: 60 * time.Second,
 	}
 	}
 }
 }
 
 

+ 20 - 4
libnetwork/networkdb/networkdb_test.go

@@ -819,8 +819,24 @@ func TestParallelDelete(t *testing.T) {
 }
 }
 
 
 func TestNetworkDBIslands(t *testing.T) {
 func TestNetworkDBIslands(t *testing.T) {
+	pollTimeout := func() time.Duration {
+		const defaultTimeout = 120 * time.Second
+		dl, ok := t.Deadline()
+		if !ok {
+			return defaultTimeout
+		}
+		if d := time.Until(dl); d <= defaultTimeout {
+			return d
+		}
+		return defaultTimeout
+	}
+
 	logrus.SetLevel(logrus.DebugLevel)
 	logrus.SetLevel(logrus.DebugLevel)
-	dbs := createNetworkDBInstances(t, 5, "node", DefaultConfig())
+	conf := DefaultConfig()
+	// Shorten durations to speed up test execution.
+	conf.rejoinClusterDuration = conf.rejoinClusterDuration / 10
+	conf.rejoinClusterInterval = conf.rejoinClusterInterval / 10
+	dbs := createNetworkDBInstances(t, 5, "node", conf)
 
 
 	// Get the node IP used currently
 	// Get the node IP used currently
 	node := dbs[0].nodes[dbs[0].config.NodeID]
 	node := dbs[0].nodes[dbs[0].config.NodeID]
@@ -868,7 +884,7 @@ func TestNetworkDBIslands(t *testing.T) {
 		}
 		}
 		return poll.Success()
 		return poll.Success()
 	}
 	}
-	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(120*time.Second))
+	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))
 
 
 	// Spawn again the first 3 nodes with different names but same IP:port
 	// Spawn again the first 3 nodes with different names but same IP:port
 	for i := 0; i < 3; i++ {
 	for i := 0; i < 3; i++ {
@@ -877,7 +893,7 @@ func TestNetworkDBIslands(t *testing.T) {
 		dbs[i] = launchNode(t, *dbs[i].config)
 		dbs[i] = launchNode(t, *dbs[i].config)
 	}
 	}
 
 
-	// Give some time for the reconnect routine to run, it runs every 60s
+	// Give some time for the reconnect routine to run, it runs every 6s.
 	check = func(t poll.LogT) poll.Result {
 	check = func(t poll.LogT) poll.Result {
 		// Verify that the cluster is again all connected. Note that the 3 previous node did not do any join
 		// Verify that the cluster is again all connected. Note that the 3 previous node did not do any join
 		for i := 0; i < 5; i++ {
 		for i := 0; i < 5; i++ {
@@ -908,6 +924,6 @@ func TestNetworkDBIslands(t *testing.T) {
 		}
 		}
 		return poll.Success()
 		return poll.Success()
 	}
 	}
-	poll.WaitOn(t, check, poll.WithDelay(10*time.Second), poll.WithTimeout(120*time.Second))
+	poll.WaitOn(t, check, poll.WithDelay(time.Second), poll.WithTimeout(pollTimeout()))
 	closeNetworkDBInstances(t, dbs)
 	closeNetworkDBInstances(t, dbs)
 }
 }