Fix race condition between swarm and libnetwork

This commit, in conjunction with a libnetwork-side commit, cleans up
the interaction with libnetwork's SetClusterProvider logic. The
previous code induced libnetwork to spawn several goroutines that
raced with each other during agent init and close.

A test has been added to verify that back-to-back swarm init and leave
operations are processed properly and do not cause crashes.

Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
Flavio Crisciani, 8 years ago
Parent commit: e2ec006797
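
The essence of the fix is visible in the diffs below: cluster construction (New) is separated from cluster startup (Start), and the untyped configEvent channel becomes a typed, buffered channel fed through a single lock-guarded SendClusterEvent entry point, so libnetwork is no longer driven by racing goroutines during agent init and close. The following stand-alone Go sketch illustrates that pattern under simplified assumptions; the type and method names mirror the diff, but everything around them (the string-backed event type, the demo main, the elided state loading) is illustrative only and not the actual moby or libnetwork code.

package main

import (
	"fmt"
	"sync"
)

// ConfigEventType stands in for github.com/docker/libnetwork/cluster.ConfigEventType.
type ConfigEventType string

// Event values mirroring the ones used in the diff (string values are illustrative).
const (
	EventSocketChange         ConfigEventType = "socket-change"
	EventNodeReady            ConfigEventType = "node-ready"
	EventNodeLeave            ConfigEventType = "node-leave"
	EventNetworkKeysAvailable ConfigEventType = "network-keys-available"
)

// Cluster is a stripped-down stand-in for daemon/cluster.Cluster.
type Cluster struct {
	mu          sync.RWMutex
	configEvent chan ConfigEventType // typed, buffered event channel
}

// New only constructs the object; it no longer starts any goroutines,
// so the daemon can register the cluster before anything runs.
func New() *Cluster {
	return &Cluster{configEvent: make(chan ConfigEventType, 10)}
}

// Start performs the work that previously ran inside New (loading the
// persisted swarm state, launching the node runner) once the daemon has
// registered the cluster, which is what removes the init/close race.
func (c *Cluster) Start() error {
	// state loading and node runner startup are elided in this sketch
	return nil
}

// SendClusterEvent is the single entry point for publishing events.
func (c *Cluster) SendClusterEvent(event ConfigEventType) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	c.configEvent <- event
}

// ListenClusterEvents exposes the receive side of the channel.
func (c *Cluster) ListenClusterEvents() <-chan ConfigEventType {
	return c.configEvent
}

func main() {
	c := New()
	// The daemon would call d.SetCluster(c) here, before Start.
	if err := c.Start(); err != nil {
		panic(err)
	}
	c.SendClusterEvent(EventNodeReady) // buffered, so this does not block
	fmt.Println("received:", <-c.ListenClusterEvents())
}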

+ 5 - 1
cmd/dockerd/daemon.go

@@ -300,6 +300,11 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) {
 	if err != nil {
 		logrus.Fatalf("Error creating cluster component: %v", err)
 	}
+	d.SetCluster(c)
+	err = c.Start()
+	if err != nil {
+		logrus.Fatalf("Error starting cluster component: %v", err)
+	}
 
 	// Restart all autostart containers which has a swarm endpoint
 	// and is not yet running now that we have successfully
@@ -316,7 +321,6 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) {
 
 	cli.d = d
 
-	d.SetCluster(c)
 	initRouter(api, d, c)
 
 	cli.setupConfigReloadTrap()

+ 2 - 0
daemon/cluster.go

@@ -2,12 +2,14 @@ package daemon
 
 import (
 	apitypes "github.com/docker/docker/api/types"
+	lncluster "github.com/docker/libnetwork/cluster"
 )
 
 // Cluster is the interface for github.com/docker/docker/daemon/cluster.(*Cluster).
 type Cluster interface {
 	ClusterStatus
 	NetworkManager
+	SendClusterEvent(event lncluster.ConfigEventType)
 }
 
 // ClusterStatus interface provides information about the Swarm status of the Cluster

+ 27 - 8
daemon/cluster/cluster.go

@@ -51,6 +51,7 @@ import (
 	types "github.com/docker/docker/api/types/swarm"
 	executorpkg "github.com/docker/docker/daemon/cluster/executor"
 	"github.com/docker/docker/pkg/signal"
+	lncluster "github.com/docker/libnetwork/cluster"
 	swarmapi "github.com/docker/swarmkit/api"
 	swarmnode "github.com/docker/swarmkit/node"
 	"github.com/pkg/errors"
@@ -115,7 +116,7 @@ type Cluster struct {
 	root         string
 	runtimeRoot  string
 	config       Config
-	configEvent  chan struct{} // todo: make this array and goroutine safe
+	configEvent  chan lncluster.ConfigEventType // todo: make this array and goroutine safe
 	attachers    map[string]*attacher
 }
 
@@ -147,22 +148,30 @@ func New(config Config) (*Cluster, error) {
 	c := &Cluster{
 		root:        root,
 		config:      config,
-		configEvent: make(chan struct{}, 10),
+		configEvent: make(chan lncluster.ConfigEventType, 10),
 		runtimeRoot: config.RuntimeRoot,
 		attachers:   make(map[string]*attacher),
 	}
+	return c, nil
+}
+
+// Start the Cluster instance
+// TODO The split between New and Start can be joined again when the SendClusterEvent
+// method is no longer required
+func (c *Cluster) Start() error {
+	root := filepath.Join(c.config.Root, swarmDirName)
 
 	nodeConfig, err := loadPersistentState(root)
 	if err != nil {
 		if os.IsNotExist(err) {
-			return c, nil
+			return nil
 		}
-		return nil, err
+		return err
 	}
 
 	nr, err := c.newNodeRunner(*nodeConfig)
 	if err != nil {
-		return nil, err
+		return err
 	}
 	c.nr = nr
 
@@ -172,10 +181,10 @@ func New(config Config) (*Cluster, error) {
 	case err := <-nr.Ready():
 		if err != nil {
 			logrus.WithError(err).Error("swarm component could not be started")
-			return c, nil
+			return nil
 		}
 	}
-	return c, nil
+	return nil
 }
 
 func (c *Cluster) newNodeRunner(conf nodeStartConfig) (*nodeRunner, error) {
@@ -308,7 +317,7 @@ func (c *Cluster) getRemoteAddressList() []string {
 // ListenClusterEvents returns a channel that receives messages on cluster
 // participation changes.
 // todo: make cancelable and accessible to multiple callers
-func (c *Cluster) ListenClusterEvents() <-chan struct{} {
+func (c *Cluster) ListenClusterEvents() <-chan lncluster.ConfigEventType {
 	return c.configEvent
 }
 
@@ -413,3 +422,13 @@ func (c *Cluster) lockedManagerAction(fn func(ctx context.Context, state nodeSta
 
 	return fn(ctx, state)
 }
+
+// SendClusterEvent allows sending cluster events on the configEvent channel
+// TODO This method should not be exposed.
+// Currently it is used to notify the network controller that the keys are
+// available
+func (c *Cluster) SendClusterEvent(event lncluster.ConfigEventType) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	c.configEvent <- event
+}
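
With the channel now typed, the daemon side can consume cluster events in a single loop and react differently per event kind; the producers of these events are added in noderunner.go and network.go below. The consumer sketch here reuses the stand-in types from the sketch after the commit message, and the action taken in each case is a hypothetical placeholder, since the actual wiring in cmd/dockerd/daemon.go is not part of this diff.

// watchClusterEvents is a hypothetical consumer loop; the prints in each
// case are placeholders for daemon-side handling, not moby APIs.
func watchClusterEvents(c *Cluster, done <-chan struct{}) {
	for {
		select {
		case ev := <-c.ListenClusterEvents():
			switch ev {
			case EventSocketChange, EventNodeReady:
				fmt.Println("agent (re)initialization:", ev)
			case EventNetworkKeysAvailable:
				fmt.Println("encryption keys set, networking agent can proceed")
			case EventNodeLeave:
				fmt.Println("agent teardown:", ev)
			}
		case <-done:
			return
		}
	}
}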

+ 4 - 2
daemon/cluster/noderunner.go

@@ -11,6 +11,7 @@ import (
 	"github.com/Sirupsen/logrus"
 	types "github.com/docker/docker/api/types/swarm"
 	"github.com/docker/docker/daemon/cluster/executor/container"
+	lncluster "github.com/docker/libnetwork/cluster"
 	swarmapi "github.com/docker/swarmkit/api"
 	swarmnode "github.com/docker/swarmkit/node"
 	"github.com/pkg/errors"
@@ -162,7 +163,7 @@ func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmn
 		}
 		n.grpcConn = conn
 		n.mu.Unlock()
-		n.cluster.configEvent <- struct{}{}
+		n.cluster.SendClusterEvent(lncluster.EventSocketChange)
 	}
 }
 
@@ -175,7 +176,7 @@ func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node,
 		close(ready)
 	case <-ctx.Done():
 	}
-	n.cluster.configEvent <- struct{}{}
+	n.cluster.SendClusterEvent(lncluster.EventNodeReady)
 }
 
 func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
@@ -217,6 +218,7 @@ func (n *nodeRunner) Stop() error {
 	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
 		return err
 	}
+	n.cluster.SendClusterEvent(lncluster.EventNodeLeave)
 	<-n.done
 	return nil
 }

+ 0 - 1
daemon/cluster/swarm.go

@@ -388,7 +388,6 @@ func (c *Cluster) Leave(force bool) error {
 		}
 	}
 
-	c.configEvent <- struct{}{}
 	// todo: cleanup optional?
 	if err := clearPersistentState(c.root); err != nil {
 		return err

+ 7 - 3
daemon/network.go

@@ -16,6 +16,7 @@ import (
 	"github.com/docker/docker/pkg/plugingetter"
 	"github.com/docker/docker/runconfig"
 	"github.com/docker/libnetwork"
+	lncluster "github.com/docker/libnetwork/cluster"
 	"github.com/docker/libnetwork/driverapi"
 	"github.com/docker/libnetwork/ipamapi"
 	networktypes "github.com/docker/libnetwork/types"
@@ -207,7 +208,6 @@ func (daemon *Daemon) setupIngress(create *clustertypes.NetworkCreateRequest, ip
 
 func (daemon *Daemon) releaseIngress(id string) {
 	controller := daemon.netController
-
 	if err := controller.SandboxDestroy("ingress-sbox"); err != nil {
 		logrus.Errorf("Failed to delete ingress sandbox: %v", err)
 	}
@@ -233,13 +233,17 @@ func (daemon *Daemon) releaseIngress(id string) {
 		logrus.Errorf("Failed to delete ingress network %s: %v", n.ID(), err)
 		return
 	}
-
 	return
 }
 
 // SetNetworkBootstrapKeys sets the bootstrap keys.
 func (daemon *Daemon) SetNetworkBootstrapKeys(keys []*networktypes.EncryptionKey) error {
-	return daemon.netController.SetKeys(keys)
+	err := daemon.netController.SetKeys(keys)
+	if err == nil {
+	// Upon successful key setting, dispatch the keys-available event
+		daemon.cluster.SendClusterEvent(lncluster.EventNetworkKeysAvailable)
+	}
+	return err
 }
 
 // UpdateAttachment notifies the attacher about the attachment config.

+ 21 - 0
integration-cli/docker_cli_swarm_test.go

@@ -1980,3 +1980,24 @@ func (s *DockerSwarmSuite) TestSwarmInitUnspecifiedDataPathAddr(c *check.C) {
 	c.Assert(err, checker.NotNil)
 	c.Assert(out, checker.Contains, "data path address must be a non-zero IP")
 }
+
+func (s *DockerSwarmSuite) TestSwarmJoinLeave(c *check.C) {
+	d := s.AddDaemon(c, true, true)
+
+	out, err := d.Cmd("swarm", "join-token", "-q", "worker")
+	c.Assert(err, checker.IsNil)
+	c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
+
+	token := strings.TrimSpace(out)
+
+	// Verify that back to back join/leave does not cause panics
+	d1 := s.AddDaemon(c, false, false)
+	for i := 0; i < 10; i++ {
+		out, err = d1.Cmd("swarm", "join", "--token", token, d.ListenAddr)
+		c.Assert(err, checker.IsNil)
+		c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
+
+		_, err = d1.Cmd("swarm", "leave")
+		c.Assert(err, checker.IsNil)
+	}
+}