Browse Source

Fix autostart for swarm scope connected containers

The swarm scope network connected containers with autostart enabled
there was a dependency problem with the cluster to be initialized before
we can autostart them. With the current container restart code happening
before cluster init, these containers were not getting autostarted
properly. Added a fix to delay the container start of those containers
which has atleast one swarm scope endpoint to until after the cluster is
initialized.

Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
Jana Radhakrishnan 8 years ago
parent
commit
c9fb551d60

+ 5 - 0
cmd/dockerd/daemon.go

@@ -262,6 +262,11 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) {
 		logrus.Fatalf("Error creating cluster component: %v", err)
 	}
 
+	// Restart all autostart containers which has a swarm endpoint
+	// and is not yet running now that we have successfully
+	// initialized the cluster.
+	d.RestartSwarmContainers()
+
 	logrus.Info("Daemon has completed initialization")
 
 	logrus.WithFields(logrus.Fields{

+ 28 - 9
daemon/cluster/cluster.go

@@ -135,10 +135,11 @@ type Cluster struct {
 // helps in identifying the attachment ID via the taskID and the
 // corresponding attachment configuration obtained from the manager.
 type attacher struct {
-	taskID       string
-	config       *network.NetworkingConfig
-	attachWaitCh chan *network.NetworkingConfig
-	detachWaitCh chan struct{}
+	taskID           string
+	config           *network.NetworkingConfig
+	attachWaitCh     chan *network.NetworkingConfig
+	attachCompleteCh chan struct{}
+	detachWaitCh     chan struct{}
 }
 
 type node struct {
@@ -1262,12 +1263,24 @@ func (c *Cluster) WaitForDetachment(ctx context.Context, networkName, networkID,
 	agent := c.node.Agent()
 	c.RUnlock()
 
-	if ok && attacher != nil && attacher.detachWaitCh != nil {
+	if ok && attacher != nil &&
+		attacher.detachWaitCh != nil &&
+		attacher.attachCompleteCh != nil {
+		// Attachment may be in progress still so wait for
+		// attachment to complete.
 		select {
-		case <-attacher.detachWaitCh:
+		case <-attacher.attachCompleteCh:
 		case <-ctx.Done():
 			return ctx.Err()
 		}
+
+		if attacher.taskID == taskID {
+			select {
+			case <-attacher.detachWaitCh:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
 	}
 
 	return agent.ResourceAllocator().DetachNetwork(ctx, taskID)
@@ -1289,9 +1302,11 @@ func (c *Cluster) AttachNetwork(target string, containerID string, addresses []s
 	agent := c.node.Agent()
 	attachWaitCh := make(chan *network.NetworkingConfig)
 	detachWaitCh := make(chan struct{})
+	attachCompleteCh := make(chan struct{})
 	c.attachers[aKey] = &attacher{
-		attachWaitCh: attachWaitCh,
-		detachWaitCh: detachWaitCh,
+		attachWaitCh:     attachWaitCh,
+		attachCompleteCh: attachCompleteCh,
+		detachWaitCh:     detachWaitCh,
 	}
 	c.Unlock()
 
@@ -1306,6 +1321,11 @@ func (c *Cluster) AttachNetwork(target string, containerID string, addresses []s
 		return nil, fmt.Errorf("Could not attach to network %s: %v", target, err)
 	}
 
+	c.Lock()
+	c.attachers[aKey].taskID = taskID
+	close(attachCompleteCh)
+	c.Unlock()
+
 	logrus.Debugf("Successfully attached to network %s with tid %s", target, taskID)
 
 	var config *network.NetworkingConfig
@@ -1316,7 +1336,6 @@ func (c *Cluster) AttachNetwork(target string, containerID string, addresses []s
 	}
 
 	c.Lock()
-	c.attachers[aKey].taskID = taskID
 	c.attachers[aKey].config = config
 	c.Unlock()
 	return config, nil

+ 5 - 0
daemon/container_operations.go

@@ -384,6 +384,9 @@ func (daemon *Daemon) findAndAttachNetwork(container *container.Container, idOrN
 		return nil, nil, err
 	}
 
+	// This container has attachment to a swarm scope
+	// network. Update the container network settings accordingly.
+	container.NetworkSettings.HasSwarmEndpoint = true
 	return n, config, nil
 }
 
@@ -492,6 +495,7 @@ func (daemon *Daemon) allocateNetwork(container *container.Container) error {
 	// on first network connecting.
 	defaultNetName := runconfig.DefaultDaemonNetworkMode().NetworkName()
 	if nConf, ok := container.NetworkSettings.Networks[defaultNetName]; ok {
+		cleanOperationalData(nConf)
 		if err := daemon.connectToNetwork(container, defaultNetName, nConf.EndpointSettings, updateSettings); err != nil {
 			return err
 		}
@@ -512,6 +516,7 @@ func (daemon *Daemon) allocateNetwork(container *container.Container) error {
 	}
 
 	for i, epConf := range epConfigs {
+		cleanOperationalData(epConf)
 		if err := daemon.connectToNetwork(container, networks[i], epConf.EndpointSettings, updateSettings); err != nil {
 			return err
 		}

+ 31 - 1
daemon/daemon.go

@@ -202,7 +202,13 @@ func (daemon *Daemon) restore() error {
 			// fixme: only if not running
 			// get list of containers we need to restart
 			if !c.IsRunning() && !c.IsPaused() {
-				if daemon.configStore.AutoRestart && c.ShouldRestart() {
+				// Do not autostart containers which
+				// has endpoints in a swarm scope
+				// network yet since the cluster is
+				// not initialized yet. We will start
+				// it after the cluster is
+				// initialized.
+				if daemon.configStore.AutoRestart && c.ShouldRestart() && !c.NetworkSettings.HasSwarmEndpoint {
 					mapLock.Lock()
 					restartContainers[c] = make(chan struct{})
 					mapLock.Unlock()
@@ -346,6 +352,30 @@ func (daemon *Daemon) restore() error {
 	return nil
 }
 
+// RestartSwarmContainers restarts any autostart container which has a
+// swarm endpoint.
+func (daemon *Daemon) RestartSwarmContainers() {
+	group := sync.WaitGroup{}
+	for _, c := range daemon.List() {
+		if !c.IsRunning() && !c.IsPaused() {
+			// Autostart all the containers which has a
+			// swarm endpoint now that the cluster is
+			// initialized.
+			if daemon.configStore.AutoRestart && c.ShouldRestart() && c.NetworkSettings.HasSwarmEndpoint {
+				group.Add(1)
+				go func(c *container.Container) {
+					defer group.Done()
+					if err := daemon.containerStart(c, ""); err != nil {
+						logrus.Error(err)
+					}
+				}(c)
+			}
+		}
+
+	}
+	group.Wait()
+}
+
 // waitForNetworks is used during daemon initialization when starting up containers
 // It ensures that all of a container's networks are available before the daemon tries to start the container.
 // In practice it just makes sure the discovery service is available for containers which use a network that require discovery.

+ 1 - 0
daemon/network/settings.go

@@ -21,6 +21,7 @@ type Settings struct {
 	SecondaryIPAddresses   []networktypes.Address
 	SecondaryIPv6Addresses []networktypes.Address
 	IsAnonymousEndpoint    bool
+	HasSwarmEndpoint       bool
 }
 
 // EndpointSettings is a package local wrapper for

+ 22 - 0
integration-cli/docker_cli_swarm_test.go

@@ -242,3 +242,25 @@ func (s *DockerSwarmSuite) TestSwarmServiceWithGroup(c *check.C) {
 	c.Assert(err, checker.IsNil)
 	c.Assert(strings.TrimSpace(out), checker.Equals, "uid=0(root) gid=0(root) groups=10(wheel),29(audio),50(staff),777")
 }
+
+func (s *DockerSwarmSuite) TestSwarmContainerAutoStart(c *check.C) {
+	d := s.AddDaemon(c, true, true)
+
+	out, err := d.Cmd("network", "create", "--attachable", "-d", "overlay", "foo")
+	c.Assert(err, checker.IsNil)
+	c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
+
+	out, err = d.Cmd("run", "-id", "--restart=always", "--net=foo", "--name=test", "busybox", "top")
+	c.Assert(err, checker.IsNil)
+	c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
+
+	out, err = d.Cmd("ps", "-q")
+	c.Assert(err, checker.IsNil)
+	c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
+
+	d.Restart()
+
+	out, err = d.Cmd("ps", "-q")
+	c.Assert(err, checker.IsNil)
+	c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
+}