Fix race condition between swarm and libnetwork

This commit, in conjunction with a libnetwork-side commit,
cleans up the libnetwork SetClusterProvider interaction logic.
The previous code induced libnetwork to spawn several
goroutines that raced with each other during agent
init and close.

A test was added to verify that back-to-back swarm init and leave
operations are properly processed and do not cause crashes.

Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
This commit is contained in:
Flavio Crisciani 2017-04-30 14:51:43 -07:00
parent 2ca41c47c4
commit e2ec006797
No known key found for this signature in database
GPG key ID: 28CAFCE754CF3A48
7 changed files with 66 additions and 15 deletions

View file

@ -300,6 +300,11 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) {
if err != nil {
logrus.Fatalf("Error creating cluster component: %v", err)
}
d.SetCluster(c)
err = c.Start()
if err != nil {
logrus.Fatalf("Error starting cluster component: %v", err)
}
// Restart all autostart containers which has a swarm endpoint
// and is not yet running now that we have successfully
@ -316,7 +321,6 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) {
cli.d = d
d.SetCluster(c)
initRouter(api, d, c)
cli.setupConfigReloadTrap()

View file

@ -2,12 +2,14 @@ package daemon
import (
apitypes "github.com/docker/docker/api/types"
lncluster "github.com/docker/libnetwork/cluster"
)
// Cluster is the interface for github.com/docker/docker/daemon/cluster.(*Cluster).
type Cluster interface {
	// ClusterStatus exposes the Swarm membership status of the daemon.
	ClusterStatus
	// NetworkManager exposes cluster-scoped network operations.
	NetworkManager
	// SendClusterEvent delivers a libnetwork cluster configuration event
	// (e.g. keys available, socket change, node leave) to the cluster.
	SendClusterEvent(event lncluster.ConfigEventType)
}
// ClusterStatus interface provides information about the Swarm status of the Cluster

View file

@ -51,6 +51,7 @@ import (
types "github.com/docker/docker/api/types/swarm"
executorpkg "github.com/docker/docker/daemon/cluster/executor"
"github.com/docker/docker/pkg/signal"
lncluster "github.com/docker/libnetwork/cluster"
swarmapi "github.com/docker/swarmkit/api"
swarmnode "github.com/docker/swarmkit/node"
"github.com/pkg/errors"
@ -115,7 +116,7 @@ type Cluster struct {
root string
runtimeRoot string
config Config
configEvent chan struct{} // todo: make this array and goroutine safe
configEvent chan lncluster.ConfigEventType // todo: make this array and goroutine safe
attachers map[string]*attacher
}
@ -147,22 +148,30 @@ func New(config Config) (*Cluster, error) {
c := &Cluster{
root: root,
config: config,
configEvent: make(chan struct{}, 10),
configEvent: make(chan lncluster.ConfigEventType, 10),
runtimeRoot: config.RuntimeRoot,
attachers: make(map[string]*attacher),
}
return c, nil
}
// Start the Cluster instance
// TODO The split between New and Start can be joined again once the SendClusterEvent
// method is no longer required
func (c *Cluster) Start() error {
root := filepath.Join(c.config.Root, swarmDirName)
nodeConfig, err := loadPersistentState(root)
if err != nil {
if os.IsNotExist(err) {
return c, nil
return nil
}
return nil, err
return err
}
nr, err := c.newNodeRunner(*nodeConfig)
if err != nil {
return nil, err
return err
}
c.nr = nr
@ -172,10 +181,10 @@ func New(config Config) (*Cluster, error) {
case err := <-nr.Ready():
if err != nil {
logrus.WithError(err).Error("swarm component could not be started")
return c, nil
return nil
}
}
return c, nil
return nil
}
func (c *Cluster) newNodeRunner(conf nodeStartConfig) (*nodeRunner, error) {
@ -308,7 +317,7 @@ func (c *Cluster) getRemoteAddressList() []string {
// ListenClusterEvents returns a channel that receives messages on cluster
// participation changes.
// todo: make cancelable and accessible to multiple callers
func (c *Cluster) ListenClusterEvents() <-chan struct{} {
func (c *Cluster) ListenClusterEvents() <-chan lncluster.ConfigEventType {
return c.configEvent
}
@ -413,3 +422,13 @@ func (c *Cluster) lockedManagerAction(fn func(ctx context.Context, state nodeSta
return fn(ctx, state)
}
// SendClusterEvent sends a cluster configuration event on the configEvent
// channel so that listeners (e.g. the network controller) can react to it.
// TODO This method should not be exposed.
// Currently it is used to notify the network controller that the keys are
// available.
func (c *Cluster) SendClusterEvent(event lncluster.ConfigEventType) {
	// Hold the read lock while sending so the event is not delivered
	// concurrently with cluster teardown/reconfiguration.
	c.mu.RLock()
	defer c.mu.RUnlock()
	// configEvent is buffered; this send blocks if the buffer is full —
	// NOTE(review): assumes listeners drain the channel promptly; confirm.
	c.configEvent <- event
}

View file

@ -11,6 +11,7 @@ import (
"github.com/Sirupsen/logrus"
types "github.com/docker/docker/api/types/swarm"
"github.com/docker/docker/daemon/cluster/executor/container"
lncluster "github.com/docker/libnetwork/cluster"
swarmapi "github.com/docker/swarmkit/api"
swarmnode "github.com/docker/swarmkit/node"
"github.com/pkg/errors"
@ -162,7 +163,7 @@ func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmn
}
n.grpcConn = conn
n.mu.Unlock()
n.cluster.configEvent <- struct{}{}
n.cluster.SendClusterEvent(lncluster.EventSocketChange)
}
}
@ -175,7 +176,7 @@ func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node,
close(ready)
case <-ctx.Done():
}
n.cluster.configEvent <- struct{}{}
n.cluster.SendClusterEvent(lncluster.EventNodeReady)
}
func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
@ -217,6 +218,7 @@ func (n *nodeRunner) Stop() error {
if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
return err
}
n.cluster.SendClusterEvent(lncluster.EventNodeLeave)
<-n.done
return nil
}

View file

@ -388,7 +388,6 @@ func (c *Cluster) Leave(force bool) error {
}
}
c.configEvent <- struct{}{}
// todo: cleanup optional?
if err := clearPersistentState(c.root); err != nil {
return err

View file

@ -16,6 +16,7 @@ import (
"github.com/docker/docker/pkg/plugingetter"
"github.com/docker/docker/runconfig"
"github.com/docker/libnetwork"
lncluster "github.com/docker/libnetwork/cluster"
"github.com/docker/libnetwork/driverapi"
"github.com/docker/libnetwork/ipamapi"
networktypes "github.com/docker/libnetwork/types"
@ -207,7 +208,6 @@ func (daemon *Daemon) setupIngress(create *clustertypes.NetworkCreateRequest, ip
func (daemon *Daemon) releaseIngress(id string) {
controller := daemon.netController
if err := controller.SandboxDestroy("ingress-sbox"); err != nil {
logrus.Errorf("Failed to delete ingress sandbox: %v", err)
}
@ -233,13 +233,17 @@ func (daemon *Daemon) releaseIngress(id string) {
logrus.Errorf("Failed to delete ingress network %s: %v", n.ID(), err)
return
}
return
}
// SetNetworkBootstrapKeys sets the bootstrap keys.
func (daemon *Daemon) SetNetworkBootstrapKeys(keys []*networktypes.EncryptionKey) error {
return daemon.netController.SetKeys(keys)
err := daemon.netController.SetKeys(keys)
if err == nil {
// Upon successful key setting dispatch the keys available event
daemon.cluster.SendClusterEvent(lncluster.EventNetworkKeysAvailable)
}
return err
}
// UpdateAttachment notifies the attacher about the attachment config.

View file

@ -1980,3 +1980,24 @@ func (s *DockerSwarmSuite) TestSwarmInitUnspecifiedDataPathAddr(c *check.C) {
c.Assert(err, checker.NotNil)
c.Assert(out, checker.Contains, "data path address must be a non-zero IP")
}
// TestSwarmJoinLeave checks that a daemon can repeatedly join and leave a
// swarm back to back without panicking.
func (s *DockerSwarmSuite) TestSwarmJoinLeave(c *check.C) {
	manager := s.AddDaemon(c, true, true)

	// Obtain a worker join token from the manager.
	tokenOut, err := manager.Cmd("swarm", "join-token", "-q", "worker")
	c.Assert(err, checker.IsNil)
	c.Assert(strings.TrimSpace(tokenOut), checker.Not(checker.Equals), "")
	token := strings.TrimSpace(tokenOut)

	// Verify that back-to-back join/leave does not cause panics.
	worker := s.AddDaemon(c, false, false)
	for i := 0; i < 10; i++ {
		joinOut, joinErr := worker.Cmd("swarm", "join", "--token", token, manager.ListenAddr)
		c.Assert(joinErr, checker.IsNil)
		c.Assert(strings.TrimSpace(joinOut), checker.Not(checker.Equals), "")

		_, leaveErr := worker.Cmd("swarm", "leave")
		c.Assert(leaveErr, checker.IsNil)
	}
}