浏览代码

Fix autostart for swarm scope connected containers

Containers connected to swarm scope networks with autostart enabled
had a dependency problem: the cluster must be initialized before
we can autostart them. With the current container restart code running
before cluster init, these containers were not getting autostarted
properly. Added a fix to delay the start of those containers
which have at least one swarm scope endpoint until after the cluster is
initialized.

Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
Jana Radhakrishnan 8 年之前
父节点
当前提交
c9fb551d60

+ 5 - 0
cmd/dockerd/daemon.go

@@ -262,6 +262,11 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) {
 		logrus.Fatalf("Error creating cluster component: %v", err)
 		logrus.Fatalf("Error creating cluster component: %v", err)
 	}
 	}
 
 
+	// Restart all autostart containers which have a swarm endpoint
+	// and are not yet running now that we have successfully
+	// initialized the cluster.
+	d.RestartSwarmContainers()
+
 	logrus.Info("Daemon has completed initialization")
 	logrus.Info("Daemon has completed initialization")
 
 
 	logrus.WithFields(logrus.Fields{
 	logrus.WithFields(logrus.Fields{

+ 28 - 9
daemon/cluster/cluster.go

@@ -135,10 +135,11 @@ type Cluster struct {
 // helps in identifying the attachment ID via the taskID and the
 // helps in identifying the attachment ID via the taskID and the
 // corresponding attachment configuration obtained from the manager.
 // corresponding attachment configuration obtained from the manager.
 type attacher struct {
 type attacher struct {
-	taskID       string
-	config       *network.NetworkingConfig
-	attachWaitCh chan *network.NetworkingConfig
-	detachWaitCh chan struct{}
+	taskID           string
+	config           *network.NetworkingConfig
+	attachWaitCh     chan *network.NetworkingConfig
+	attachCompleteCh chan struct{}
+	detachWaitCh     chan struct{}
 }
 }
 
 
 type node struct {
 type node struct {
@@ -1262,12 +1263,24 @@ func (c *Cluster) WaitForDetachment(ctx context.Context, networkName, networkID,
 	agent := c.node.Agent()
 	agent := c.node.Agent()
 	c.RUnlock()
 	c.RUnlock()
 
 
-	if ok && attacher != nil && attacher.detachWaitCh != nil {
+	if ok && attacher != nil &&
+		attacher.detachWaitCh != nil &&
+		attacher.attachCompleteCh != nil {
+		// Attachment may be in progress still so wait for
+		// attachment to complete.
 		select {
 		select {
-		case <-attacher.detachWaitCh:
+		case <-attacher.attachCompleteCh:
 		case <-ctx.Done():
 		case <-ctx.Done():
 			return ctx.Err()
 			return ctx.Err()
 		}
 		}
+
+		if attacher.taskID == taskID {
+			select {
+			case <-attacher.detachWaitCh:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
 	}
 	}
 
 
 	return agent.ResourceAllocator().DetachNetwork(ctx, taskID)
 	return agent.ResourceAllocator().DetachNetwork(ctx, taskID)
@@ -1289,9 +1302,11 @@ func (c *Cluster) AttachNetwork(target string, containerID string, addresses []s
 	agent := c.node.Agent()
 	agent := c.node.Agent()
 	attachWaitCh := make(chan *network.NetworkingConfig)
 	attachWaitCh := make(chan *network.NetworkingConfig)
 	detachWaitCh := make(chan struct{})
 	detachWaitCh := make(chan struct{})
+	attachCompleteCh := make(chan struct{})
 	c.attachers[aKey] = &attacher{
 	c.attachers[aKey] = &attacher{
-		attachWaitCh: attachWaitCh,
-		detachWaitCh: detachWaitCh,
+		attachWaitCh:     attachWaitCh,
+		attachCompleteCh: attachCompleteCh,
+		detachWaitCh:     detachWaitCh,
 	}
 	}
 	c.Unlock()
 	c.Unlock()
 
 
@@ -1306,6 +1321,11 @@ func (c *Cluster) AttachNetwork(target string, containerID string, addresses []s
 		return nil, fmt.Errorf("Could not attach to network %s: %v", target, err)
 		return nil, fmt.Errorf("Could not attach to network %s: %v", target, err)
 	}
 	}
 
 
+	c.Lock()
+	c.attachers[aKey].taskID = taskID
+	close(attachCompleteCh)
+	c.Unlock()
+
 	logrus.Debugf("Successfully attached to network %s with tid %s", target, taskID)
 	logrus.Debugf("Successfully attached to network %s with tid %s", target, taskID)
 
 
 	var config *network.NetworkingConfig
 	var config *network.NetworkingConfig
@@ -1316,7 +1336,6 @@ func (c *Cluster) AttachNetwork(target string, containerID string, addresses []s
 	}
 	}
 
 
 	c.Lock()
 	c.Lock()
-	c.attachers[aKey].taskID = taskID
 	c.attachers[aKey].config = config
 	c.attachers[aKey].config = config
 	c.Unlock()
 	c.Unlock()
 	return config, nil
 	return config, nil

+ 5 - 0
daemon/container_operations.go

@@ -384,6 +384,9 @@ func (daemon *Daemon) findAndAttachNetwork(container *container.Container, idOrN
 		return nil, nil, err
 		return nil, nil, err
 	}
 	}
 
 
+	// This container has an attachment to a swarm scope
+	// network. Update the container network settings accordingly.
+	container.NetworkSettings.HasSwarmEndpoint = true
 	return n, config, nil
 	return n, config, nil
 }
 }
 
 
@@ -492,6 +495,7 @@ func (daemon *Daemon) allocateNetwork(container *container.Container) error {
 	// on first network connecting.
 	// on first network connecting.
 	defaultNetName := runconfig.DefaultDaemonNetworkMode().NetworkName()
 	defaultNetName := runconfig.DefaultDaemonNetworkMode().NetworkName()
 	if nConf, ok := container.NetworkSettings.Networks[defaultNetName]; ok {
 	if nConf, ok := container.NetworkSettings.Networks[defaultNetName]; ok {
+		cleanOperationalData(nConf)
 		if err := daemon.connectToNetwork(container, defaultNetName, nConf.EndpointSettings, updateSettings); err != nil {
 		if err := daemon.connectToNetwork(container, defaultNetName, nConf.EndpointSettings, updateSettings); err != nil {
 			return err
 			return err
 		}
 		}
@@ -512,6 +516,7 @@ func (daemon *Daemon) allocateNetwork(container *container.Container) error {
 	}
 	}
 
 
 	for i, epConf := range epConfigs {
 	for i, epConf := range epConfigs {
+		cleanOperationalData(epConf)
 		if err := daemon.connectToNetwork(container, networks[i], epConf.EndpointSettings, updateSettings); err != nil {
 		if err := daemon.connectToNetwork(container, networks[i], epConf.EndpointSettings, updateSettings); err != nil {
 			return err
 			return err
 		}
 		}

+ 31 - 1
daemon/daemon.go

@@ -202,7 +202,13 @@ func (daemon *Daemon) restore() error {
 			// fixme: only if not running
 			// fixme: only if not running
 			// get list of containers we need to restart
 			// get list of containers we need to restart
 			if !c.IsRunning() && !c.IsPaused() {
 			if !c.IsRunning() && !c.IsPaused() {
-				if daemon.configStore.AutoRestart && c.ShouldRestart() {
+				// Do not autostart containers which
+				// have endpoints in a swarm scope
+				// network yet, since the cluster is
+				// not initialized at this point. We
+				// will start them after the cluster
+				// is initialized.
+				if daemon.configStore.AutoRestart && c.ShouldRestart() && !c.NetworkSettings.HasSwarmEndpoint {
 					mapLock.Lock()
 					mapLock.Lock()
 					restartContainers[c] = make(chan struct{})
 					restartContainers[c] = make(chan struct{})
 					mapLock.Unlock()
 					mapLock.Unlock()
@@ -346,6 +352,30 @@ func (daemon *Daemon) restore() error {
 	return nil
 	return nil
 }
 }
 
 
+// RestartSwarmContainers concurrently starts every container from daemon.List() that is neither
+// running nor paused, should restart, and has a swarm endpoint; start errors are logged and it waits for all starts.
+func (daemon *Daemon) RestartSwarmContainers() {
+	group := sync.WaitGroup{}
+	for _, c := range daemon.List() {
+		if !c.IsRunning() && !c.IsPaused() {
+			// Autostart all the containers which have a
+			// swarm endpoint now that the cluster is
+			// initialized.
+			if daemon.configStore.AutoRestart && c.ShouldRestart() && c.NetworkSettings.HasSwarmEndpoint {
+				group.Add(1)
+				go func(c *container.Container) {
+					defer group.Done()
+					if err := daemon.containerStart(c, ""); err != nil {
+						logrus.Error(err)
+					}
+				}(c)
+			}
+		}
+
+	}
+	group.Wait()
+}
+
 // waitForNetworks is used during daemon initialization when starting up containers
 // waitForNetworks is used during daemon initialization when starting up containers
 // It ensures that all of a container's networks are available before the daemon tries to start the container.
 // It ensures that all of a container's networks are available before the daemon tries to start the container.
 // In practice it just makes sure the discovery service is available for containers which use a network that require discovery.
 // In practice it just makes sure the discovery service is available for containers which use a network that require discovery.

+ 1 - 0
daemon/network/settings.go

@@ -21,6 +21,7 @@ type Settings struct {
 	SecondaryIPAddresses   []networktypes.Address
 	SecondaryIPAddresses   []networktypes.Address
 	SecondaryIPv6Addresses []networktypes.Address
 	SecondaryIPv6Addresses []networktypes.Address
 	IsAnonymousEndpoint    bool
 	IsAnonymousEndpoint    bool
+	HasSwarmEndpoint       bool
 }
 }
 
 
 // EndpointSettings is a package local wrapper for
 // EndpointSettings is a package local wrapper for

+ 22 - 0
integration-cli/docker_cli_swarm_test.go

@@ -242,3 +242,25 @@ func (s *DockerSwarmSuite) TestSwarmServiceWithGroup(c *check.C) {
 	c.Assert(err, checker.IsNil)
 	c.Assert(err, checker.IsNil)
 	c.Assert(strings.TrimSpace(out), checker.Equals, "uid=0(root) gid=0(root) groups=10(wheel),29(audio),50(staff),777")
 	c.Assert(strings.TrimSpace(out), checker.Equals, "uid=0(root) gid=0(root) groups=10(wheel),29(audio),50(staff),777")
 }
 }
+// TestSwarmContainerAutoStart checks that a --restart=always container on an attachable overlay network is still listed by "ps -q" after a daemon restart.
+func (s *DockerSwarmSuite) TestSwarmContainerAutoStart(c *check.C) {
+	d := s.AddDaemon(c, true, true)
+
+	out, err := d.Cmd("network", "create", "--attachable", "-d", "overlay", "foo")
+	c.Assert(err, checker.IsNil)
+	c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
+
+	out, err = d.Cmd("run", "-id", "--restart=always", "--net=foo", "--name=test", "busybox", "top")
+	c.Assert(err, checker.IsNil)
+	c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
+
+	out, err = d.Cmd("ps", "-q")
+	c.Assert(err, checker.IsNil)
+	c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
+
+	d.Restart()
+
+	out, err = d.Cmd("ps", "-q")
+	c.Assert(err, checker.IsNil)
+	c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
+}