Fix race condition between swarm and libnetwork
This commit, in conjunction with a libnetwork-side commit, cleans up the libnetwork SetClusterProvider interaction logic. The previous code induced libnetwork to spawn several goroutines that raced with each other during agent init and close. A test was added to verify that back-to-back swarm init and leave operations are processed properly and do not cause crashes. Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
This commit is contained in:
parent
2ca41c47c4
commit
e2ec006797
7 changed files with 66 additions and 15 deletions
|
@ -300,6 +300,11 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) {
|
|||
if err != nil {
|
||||
logrus.Fatalf("Error creating cluster component: %v", err)
|
||||
}
|
||||
d.SetCluster(c)
|
||||
err = c.Start()
|
||||
if err != nil {
|
||||
logrus.Fatalf("Error starting cluster component: %v", err)
|
||||
}
|
||||
|
||||
// Restart all autostart containers which has a swarm endpoint
|
||||
// and is not yet running now that we have successfully
|
||||
|
@ -316,7 +321,6 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) {
|
|||
|
||||
cli.d = d
|
||||
|
||||
d.SetCluster(c)
|
||||
initRouter(api, d, c)
|
||||
|
||||
cli.setupConfigReloadTrap()
|
||||
|
|
|
@ -2,12 +2,14 @@ package daemon
|
|||
|
||||
import (
|
||||
apitypes "github.com/docker/docker/api/types"
|
||||
lncluster "github.com/docker/libnetwork/cluster"
|
||||
)
|
||||
|
||||
// Cluster is the interface for github.com/docker/docker/daemon/cluster.(*Cluster).
|
||||
type Cluster interface {
|
||||
ClusterStatus
|
||||
NetworkManager
|
||||
SendClusterEvent(event lncluster.ConfigEventType)
|
||||
}
|
||||
|
||||
// ClusterStatus interface provides information about the Swarm status of the Cluster
|
||||
|
|
|
@ -51,6 +51,7 @@ import (
|
|||
types "github.com/docker/docker/api/types/swarm"
|
||||
executorpkg "github.com/docker/docker/daemon/cluster/executor"
|
||||
"github.com/docker/docker/pkg/signal"
|
||||
lncluster "github.com/docker/libnetwork/cluster"
|
||||
swarmapi "github.com/docker/swarmkit/api"
|
||||
swarmnode "github.com/docker/swarmkit/node"
|
||||
"github.com/pkg/errors"
|
||||
|
@ -115,7 +116,7 @@ type Cluster struct {
|
|||
root string
|
||||
runtimeRoot string
|
||||
config Config
|
||||
configEvent chan struct{} // todo: make this array and goroutine safe
|
||||
configEvent chan lncluster.ConfigEventType // todo: make this array and goroutine safe
|
||||
attachers map[string]*attacher
|
||||
}
|
||||
|
||||
|
@ -147,22 +148,30 @@ func New(config Config) (*Cluster, error) {
|
|||
c := &Cluster{
|
||||
root: root,
|
||||
config: config,
|
||||
configEvent: make(chan struct{}, 10),
|
||||
configEvent: make(chan lncluster.ConfigEventType, 10),
|
||||
runtimeRoot: config.RuntimeRoot,
|
||||
attachers: make(map[string]*attacher),
|
||||
}
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// Start the Cluster instance
|
||||
// TODO The split between New and Start can be join again when the SendClusterEvent
|
||||
// method is no longer required
|
||||
func (c *Cluster) Start() error {
|
||||
root := filepath.Join(c.config.Root, swarmDirName)
|
||||
|
||||
nodeConfig, err := loadPersistentState(root)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return c, nil
|
||||
return nil
|
||||
}
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
|
||||
nr, err := c.newNodeRunner(*nodeConfig)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return err
|
||||
}
|
||||
c.nr = nr
|
||||
|
||||
|
@ -172,10 +181,10 @@ func New(config Config) (*Cluster, error) {
|
|||
case err := <-nr.Ready():
|
||||
if err != nil {
|
||||
logrus.WithError(err).Error("swarm component could not be started")
|
||||
return c, nil
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return c, nil
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Cluster) newNodeRunner(conf nodeStartConfig) (*nodeRunner, error) {
|
||||
|
@ -308,7 +317,7 @@ func (c *Cluster) getRemoteAddressList() []string {
|
|||
// ListenClusterEvents returns a channel that receives messages on cluster
|
||||
// participation changes.
|
||||
// todo: make cancelable and accessible to multiple callers
|
||||
func (c *Cluster) ListenClusterEvents() <-chan struct{} {
|
||||
func (c *Cluster) ListenClusterEvents() <-chan lncluster.ConfigEventType {
|
||||
return c.configEvent
|
||||
}
|
||||
|
||||
|
@ -413,3 +422,13 @@ func (c *Cluster) lockedManagerAction(fn func(ctx context.Context, state nodeSta
|
|||
|
||||
return fn(ctx, state)
|
||||
}
|
||||
|
||||
// SendClusterEvent allows to send cluster events on the configEvent channel
|
||||
// TODO This method should not be exposed.
|
||||
// Currently it is used to notify the network controller that the keys are
|
||||
// available
|
||||
func (c *Cluster) SendClusterEvent(event lncluster.ConfigEventType) {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
c.configEvent <- event
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ import (
|
|||
"github.com/Sirupsen/logrus"
|
||||
types "github.com/docker/docker/api/types/swarm"
|
||||
"github.com/docker/docker/daemon/cluster/executor/container"
|
||||
lncluster "github.com/docker/libnetwork/cluster"
|
||||
swarmapi "github.com/docker/swarmkit/api"
|
||||
swarmnode "github.com/docker/swarmkit/node"
|
||||
"github.com/pkg/errors"
|
||||
|
@ -162,7 +163,7 @@ func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmn
|
|||
}
|
||||
n.grpcConn = conn
|
||||
n.mu.Unlock()
|
||||
n.cluster.configEvent <- struct{}{}
|
||||
n.cluster.SendClusterEvent(lncluster.EventSocketChange)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -175,7 +176,7 @@ func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node,
|
|||
close(ready)
|
||||
case <-ctx.Done():
|
||||
}
|
||||
n.cluster.configEvent <- struct{}{}
|
||||
n.cluster.SendClusterEvent(lncluster.EventNodeReady)
|
||||
}
|
||||
|
||||
func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
|
||||
|
@ -217,6 +218,7 @@ func (n *nodeRunner) Stop() error {
|
|||
if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
|
||||
return err
|
||||
}
|
||||
n.cluster.SendClusterEvent(lncluster.EventNodeLeave)
|
||||
<-n.done
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -388,7 +388,6 @@ func (c *Cluster) Leave(force bool) error {
|
|||
}
|
||||
}
|
||||
|
||||
c.configEvent <- struct{}{}
|
||||
// todo: cleanup optional?
|
||||
if err := clearPersistentState(c.root); err != nil {
|
||||
return err
|
||||
|
|
|
@ -16,6 +16,7 @@ import (
|
|||
"github.com/docker/docker/pkg/plugingetter"
|
||||
"github.com/docker/docker/runconfig"
|
||||
"github.com/docker/libnetwork"
|
||||
lncluster "github.com/docker/libnetwork/cluster"
|
||||
"github.com/docker/libnetwork/driverapi"
|
||||
"github.com/docker/libnetwork/ipamapi"
|
||||
networktypes "github.com/docker/libnetwork/types"
|
||||
|
@ -207,7 +208,6 @@ func (daemon *Daemon) setupIngress(create *clustertypes.NetworkCreateRequest, ip
|
|||
|
||||
func (daemon *Daemon) releaseIngress(id string) {
|
||||
controller := daemon.netController
|
||||
|
||||
if err := controller.SandboxDestroy("ingress-sbox"); err != nil {
|
||||
logrus.Errorf("Failed to delete ingress sandbox: %v", err)
|
||||
}
|
||||
|
@ -233,13 +233,17 @@ func (daemon *Daemon) releaseIngress(id string) {
|
|||
logrus.Errorf("Failed to delete ingress network %s: %v", n.ID(), err)
|
||||
return
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// SetNetworkBootstrapKeys sets the bootstrap keys.
|
||||
func (daemon *Daemon) SetNetworkBootstrapKeys(keys []*networktypes.EncryptionKey) error {
|
||||
return daemon.netController.SetKeys(keys)
|
||||
err := daemon.netController.SetKeys(keys)
|
||||
if err == nil {
|
||||
// Upon successful key setting dispatch the keys available event
|
||||
daemon.cluster.SendClusterEvent(lncluster.EventNetworkKeysAvailable)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// UpdateAttachment notifies the attacher about the attachment config.
|
||||
|
|
|
@ -1980,3 +1980,24 @@ func (s *DockerSwarmSuite) TestSwarmInitUnspecifiedDataPathAddr(c *check.C) {
|
|||
c.Assert(err, checker.NotNil)
|
||||
c.Assert(out, checker.Contains, "data path address must be a non-zero IP")
|
||||
}
|
||||
|
||||
func (s *DockerSwarmSuite) TestSwarmJoinLeave(c *check.C) {
|
||||
d := s.AddDaemon(c, true, true)
|
||||
|
||||
out, err := d.Cmd("swarm", "join-token", "-q", "worker")
|
||||
c.Assert(err, checker.IsNil)
|
||||
c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
|
||||
|
||||
token := strings.TrimSpace(out)
|
||||
|
||||
// Verify that back to back join/leave does not cause panics
|
||||
d1 := s.AddDaemon(c, false, false)
|
||||
for i := 0; i < 10; i++ {
|
||||
out, err = d1.Cmd("swarm", "join", "--token", token, d.ListenAddr)
|
||||
c.Assert(err, checker.IsNil)
|
||||
c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
|
||||
|
||||
_, err = d1.Cmd("swarm", "leave")
|
||||
c.Assert(err, checker.IsNil)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue