Make discovery ttl and heartbeat configurable

Docker daemon uses kv-store as the host-discovery backend.
Discovery module tracks the liveness of a node through a simple
keepalive mechanism.  The keepalive mechanism depends on every
node performing heartbeat by registering itself with the discovery
module (via KV-Store Put operation). And for every Put operation,
the discovery module in all other nodes will receive a Watch
notification. That keeps the node alive.
Any node that fails to register itself within the TTL timer is
considered dead and removed from the discovery database.

The default timers (heartbeat = 20 seconds & ttl = 60 seconds)
work fine for small clusters.  But for large clusters, these
default timers are extremely aggressive: they cause high CPU
usage, with most of the processing spent managing node discovery,
and that impacts normal daemon operation.

Hence we need a way to make the discovery ttl and heartbeat
configurable.  As the cluster size grows, the user can change
these timers to make sure the daemon scales.

Signed-off-by: Madhu Venugopal <madhu@docker.com>
This commit is contained in:
Madhu Venugopal 2015-11-11 16:18:06 -08:00
parent 900c8f5847
commit 2efdb8cbf5
3 changed files with 153 additions and 7 deletions

View file

@ -1,6 +1,8 @@
package daemon
import (
"fmt"
"strconv"
"time"
log "github.com/Sirupsen/logrus"
@ -13,22 +15,63 @@ import (
const (
	// defaultDiscoveryHeartbeat is the default value for discovery heartbeat interval.
	defaultDiscoveryHeartbeat = 20 * time.Second
	// defaultDiscoveryTTL is the default TTL interval for discovery.
	defaultDiscoveryTTL = 60 * time.Second
	// defaultDiscoveryTTLFactor is the default TTL factor for discovery: when only
	// one of the two timers is configured, the TTL is this many heartbeats.
	defaultDiscoveryTTLFactor = 3
)

// discoveryOpts resolves the discovery heartbeat interval and TTL from the
// cluster store options.
//
// Recognised keys, both expressed as a whole number of seconds:
//   - "discovery.heartbeat": interval between two registrations of the node.
//   - "discovery.ttl": time after which a node without a heartbeat is dropped.
//
// Resolution rules:
//   - neither key set: defaultDiscoveryHeartbeat and
//     defaultDiscoveryTTLFactor * defaultDiscoveryHeartbeat;
//   - only the heartbeat set: TTL is defaultDiscoveryTTLFactor * heartbeat;
//   - only the TTL set: heartbeat is TTL / defaultDiscoveryTTLFactor, truncated
//     to whole seconds but never below one second;
//   - both set: used as given.
//
// It returns (heartbeat, ttl, nil) on success. On failure both durations are
// zero and the error explains which option is unparsable, non-positive, or
// inconsistent (the TTL must be strictly greater than the heartbeat).
func discoveryOpts(clusterOpts map[string]string) (time.Duration, time.Duration, error) {
	heartbeat := defaultDiscoveryHeartbeat
	ttl := defaultDiscoveryTTLFactor * defaultDiscoveryHeartbeat

	hbStr, hbSet := clusterOpts["discovery.heartbeat"]
	if hbSet {
		h, err := strconv.Atoi(hbStr)
		if err != nil {
			return 0, 0, fmt.Errorf("invalid discovery.heartbeat %q: %v", hbStr, err)
		}
		// A zero or negative interval would make time.Tick return a nil
		// channel, so the node would silently stop sending heartbeats.
		if h <= 0 {
			return 0, 0, fmt.Errorf("discovery.heartbeat must be positive")
		}
		heartbeat = time.Duration(h) * time.Second
		ttl = defaultDiscoveryTTLFactor * heartbeat
	}

	if ttlStr, ok := clusterOpts["discovery.ttl"]; ok {
		t, err := strconv.Atoi(ttlStr)
		if err != nil {
			return 0, 0, fmt.Errorf("invalid discovery.ttl %q: %v", ttlStr, err)
		}
		if t <= 0 {
			return 0, 0, fmt.Errorf("discovery.ttl must be positive")
		}
		ttl = time.Duration(t) * time.Second

		if !hbSet {
			heartbeat = time.Duration(t/defaultDiscoveryTTLFactor) * time.Second
			// Integer division yields 0 for a TTL smaller than the factor;
			// clamp so the derived heartbeat is always a usable interval.
			if heartbeat < time.Second {
				heartbeat = time.Second
			}
		}

		if ttl <= heartbeat {
			return 0, 0, fmt.Errorf("discovery.ttl timer must be greater than discovery.heartbeat")
		}
	}

	return heartbeat, ttl, nil
}
// initDiscovery initializes the node discovery subsystem by connecting to the specified backend
// and starts a registration loop that advertises the current node under the specified address.
// The heartbeat interval and TTL handed to the backend are resolved from clusterOpts by
// discoveryOpts. If either discoveryOpts or discovery.New fails, the error is returned together
// with a nil backend and no registration loop is started.
func initDiscovery(backend, address string, clusterOpts map[string]string) (discovery.Backend, error) {
discoveryBackend, err := discovery.New(backend, defaultDiscoveryHeartbeat, defaultDiscoveryTTL, clusterOpts)
heartbeat, ttl, err := discoveryOpts(clusterOpts)
if err != nil {
return nil, err
}
discoveryBackend, err := discovery.New(backend, heartbeat, ttl, clusterOpts)
if err != nil {
return nil, err
}
// We call Register() on the discovery backend in a loop for the whole lifetime of the daemon,
// but we never actually Watch() for nodes appearing and disappearing for the moment.
go registrationLoop(discoveryBackend, address)
go registrationLoop(discoveryBackend, address, heartbeat)
return discoveryBackend, nil
}
@ -41,9 +84,9 @@ func registerAddr(backend discovery.Backend, addr string) {
// registrationLoop registers the current node against the discovery backend using the specified
// address. The function never returns, as registration against the backend comes with a TTL and
// requires regular heartbeats. The node is registered once immediately and then again on every
// tick of the heartbeat interval.
func registrationLoop(discoveryBackend discovery.Backend, address string) {
func registrationLoop(discoveryBackend discovery.Backend, address string, heartbeat time.Duration) {
registerAddr(discoveryBackend, address)
for range time.Tick(defaultDiscoveryHeartbeat) {
for range time.Tick(heartbeat) {
registerAddr(discoveryBackend, address)
}
}

91
daemon/discovery_test.go Normal file
View file

@ -0,0 +1,91 @@
package daemon
import (
"testing"
"time"
)
// TestDiscoveryOpts checks that discoveryOpts rejects unparsable or
// inconsistent timer settings, and that it derives whichever timer is missing
// from the one supplied (or from the package defaults when neither is given).
func TestDiscoveryOpts(t *testing.T) {
	// Configurations that must be refused.
	invalid := []struct {
		opts   map[string]string
		reason string
	}{
		{
			map[string]string{"discovery.heartbeat": "10", "discovery.ttl": "5"},
			"discovery.ttl < discovery.heartbeat must fail",
		},
		{
			map[string]string{"discovery.heartbeat": "10", "discovery.ttl": "10"},
			"discovery.ttl == discovery.heartbeat must fail",
		},
		{
			map[string]string{"discovery.heartbeat": "invalid"},
			"invalid discovery.heartbeat must fail",
		},
		{
			map[string]string{"discovery.ttl": "invalid"},
			"invalid discovery.ttl must fail",
		},
	}
	for _, tc := range invalid {
		if _, _, err := discoveryOpts(tc.opts); err == nil {
			t.Fatal(tc.reason)
		}
	}

	// Configurations that must be accepted, with the timers they resolve to.
	valid := []struct {
		opts      map[string]string
		heartbeat time.Duration
		ttl       time.Duration
	}{
		{
			// Both timers given: used verbatim.
			map[string]string{"discovery.heartbeat": "10", "discovery.ttl": "20"},
			10 * time.Second,
			20 * time.Second,
		},
		{
			// Only the heartbeat given: TTL is derived from it.
			map[string]string{"discovery.heartbeat": "10"},
			10 * time.Second,
			10 * defaultDiscoveryTTLFactor * time.Second,
		},
		{
			// Only the TTL given: heartbeat is derived from it.
			map[string]string{"discovery.ttl": "30"},
			30 * time.Second / defaultDiscoveryTTLFactor,
			30 * time.Second,
		},
		{
			// Nothing given: package defaults.
			map[string]string{},
			defaultDiscoveryHeartbeat,
			defaultDiscoveryHeartbeat * defaultDiscoveryTTLFactor,
		},
	}
	for _, tc := range valid {
		heartbeat, ttl, err := discoveryOpts(tc.opts)
		if err != nil {
			t.Fatal(err)
		}
		if heartbeat != tc.heartbeat {
			t.Fatalf("Heartbeat - Expected : %v, Actual : %v", tc.heartbeat, heartbeat)
		}
		if ttl != tc.ttl {
			t.Fatalf("TTL - Expected : %v, Actual : %v", tc.ttl, ttl)
		}
	}
}

View file

@ -565,6 +565,18 @@ docker daemon \
The currently supported cluster store options are:
* `discovery.heartbeat`
Specifies the heartbeat timer in seconds which is used by the daemon as a
keepalive mechanism to make sure the discovery module treats the node as alive
in the cluster. If not configured, the default value is 20 seconds, or one
third of `discovery.ttl` (rounded down to whole seconds) when only that option
is set.
* `discovery.ttl`
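Specifies the ttl (time-to-live) in seconds which is used by the discovery
module to time out a node if a valid heartbeat is not received within the
configured ttl value. If not configured, the default value is three times the
heartbeat (60 seconds with the default heartbeat). When configured, the ttl
must be greater than the heartbeat.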
* `kv.cacertfile`
Specifies the path to a local file with PEM encoded CA certificates to trust