
Merge pull request #38301 from cyphar/waitgroup-limits

daemon: switch to semaphore-gated WaitGroup for startup tasks
Akihiro Suda committed 6 years ago
Commit 2cb26cfe9c
3 files changed, 170 insertions(+), 64 deletions(-)
  1. daemon/daemon.go (+130, -64)
  2. daemon/daemon_unix.go (+35, -0)
  3. daemon/daemon_windows.go (+5, -0)

daemon/daemon.go (+130, -64)

@@ -66,6 +66,7 @@ import (
 	"github.com/docker/libnetwork/cluster"
 	nwconfig "github.com/docker/libnetwork/config"
 	"github.com/pkg/errors"
+	"golang.org/x/sync/semaphore"
 )
 
 // ContainersNamespace is the name of the namespace used for users containers
@@ -197,6 +198,7 @@ func (daemon *Daemon) NewResolveOptionsFunc() resolver.ResolveOptionsFunc {
 }
 
 func (daemon *Daemon) restore() error {
+	var mapLock sync.Mutex
 	containers := make(map[string]*container.Container)
 
 	logrus.Info("Loading containers: start.")
@@ -206,68 +208,99 @@ func (daemon *Daemon) restore() error {
 		return err
 	}
 
+	// parallelLimit is the maximum number of parallel startup jobs that we
+	// allow (this is the limit used for all startup semaphores). The multiplier
+	// (128) was chosen after some fairly significant benchmarking -- don't change
+	// it unless you've tested it significantly (this value is adjusted if
+	// RLIMIT_NOFILE is small to avoid EMFILE).
+	parallelLimit := adjustParallelLimit(len(dir), 128*runtime.NumCPU())
+
+	// Re-used for all parallel startup jobs.
+	var group sync.WaitGroup
+	sem := semaphore.NewWeighted(int64(parallelLimit))
+
 	for _, v := range dir {
-		id := v.Name()
-		container, err := daemon.load(id)
-		if err != nil {
-			logrus.Errorf("Failed to load container %v: %v", id, err)
-			continue
-		}
-		if !system.IsOSSupported(container.OS) {
-			logrus.Errorf("Failed to load container %v: %s (%q)", id, system.ErrNotSupportedOperatingSystem, container.OS)
-			continue
-		}
-		// Ignore the container if it does not support the current driver being used by the graph
-		currentDriverForContainerOS := daemon.graphDrivers[container.OS]
-		if (container.Driver == "" && currentDriverForContainerOS == "aufs") || container.Driver == currentDriverForContainerOS {
-			rwlayer, err := daemon.imageService.GetLayerByID(container.ID, container.OS)
+		group.Add(1)
+		go func(id string) {
+			defer group.Done()
+			_ = sem.Acquire(context.Background(), 1)
+			defer sem.Release(1)
+
+			container, err := daemon.load(id)
 			if err != nil {
-				logrus.Errorf("Failed to load container mount %v: %v", id, err)
-				continue
+				logrus.Errorf("Failed to load container %v: %v", id, err)
+				return
 			}
-			container.RWLayer = rwlayer
-			logrus.Debugf("Loaded container %v, isRunning: %v", container.ID, container.IsRunning())
+			if !system.IsOSSupported(container.OS) {
+				logrus.Errorf("Failed to load container %v: %s (%q)", id, system.ErrNotSupportedOperatingSystem, container.OS)
+				return
+			}
+			// Ignore the container if it does not support the current driver being used by the graph
+			currentDriverForContainerOS := daemon.graphDrivers[container.OS]
+			if (container.Driver == "" && currentDriverForContainerOS == "aufs") || container.Driver == currentDriverForContainerOS {
+				rwlayer, err := daemon.imageService.GetLayerByID(container.ID, container.OS)
+				if err != nil {
+					logrus.Errorf("Failed to load container mount %v: %v", id, err)
+					return
+				}
+				container.RWLayer = rwlayer
+				logrus.Debugf("Loaded container %v, isRunning: %v", container.ID, container.IsRunning())
 
-			containers[container.ID] = container
-		} else {
-			logrus.Debugf("Cannot load container %s because it was created with another graph driver.", container.ID)
-		}
+				mapLock.Lock()
+				containers[container.ID] = container
+				mapLock.Unlock()
+			} else {
+				logrus.Debugf("Cannot load container %s because it was created with another graph driver.", container.ID)
+			}
+		}(v.Name())
 	}
+	group.Wait()
 
 	removeContainers := make(map[string]*container.Container)
 	restartContainers := make(map[*container.Container]chan struct{})
 	activeSandboxes := make(map[string]interface{})
+
 	for id, c := range containers {
-		if err := daemon.registerName(c); err != nil {
-			logrus.Errorf("Failed to register container name %s: %s", c.ID, err)
-			delete(containers, id)
-			continue
-		}
-		if err := daemon.Register(c); err != nil {
-			logrus.Errorf("Failed to register container %s: %s", c.ID, err)
-			delete(containers, id)
-			continue
-		}
+		group.Add(1)
+		go func(c *container.Container) {
+			defer group.Done()
+			_ = sem.Acquire(context.Background(), 1)
+			defer sem.Release(1)
 
-		// The LogConfig.Type is empty if the container was created before docker 1.12 with default log driver.
-		// We should rewrite it to use the daemon defaults.
-		// Fixes https://github.com/docker/docker/issues/22536
-		if c.HostConfig.LogConfig.Type == "" {
-			if err := daemon.mergeAndVerifyLogConfig(&c.HostConfig.LogConfig); err != nil {
-				logrus.Errorf("Failed to verify log config for container %s: %q", c.ID, err)
-				continue
+			if err := daemon.registerName(c); err != nil {
+				logrus.Errorf("Failed to register container name %s: %s", c.ID, err)
+				mapLock.Lock()
+				delete(containers, id)
+				mapLock.Unlock()
+				return
 			}
-		}
+			if err := daemon.Register(c); err != nil {
+				logrus.Errorf("Failed to register container %s: %s", c.ID, err)
+				mapLock.Lock()
+				delete(containers, id)
+				mapLock.Unlock()
+				return
+			}
+
+			// The LogConfig.Type is empty if the container was created before docker 1.12 with default log driver.
+			// We should rewrite it to use the daemon defaults.
+			// Fixes https://github.com/docker/docker/issues/22536
+			if c.HostConfig.LogConfig.Type == "" {
+				if err := daemon.mergeAndVerifyLogConfig(&c.HostConfig.LogConfig); err != nil {
+					logrus.Errorf("Failed to verify log config for container %s: %q", c.ID, err)
+				}
+			}
+		}(c)
 	}
+	group.Wait()
 
-	var (
-		wg      sync.WaitGroup
-		mapLock sync.Mutex
-	)
 	for _, c := range containers {
-		wg.Add(1)
+		group.Add(1)
 		go func(c *container.Container) {
-			defer wg.Done()
+			defer group.Done()
+			_ = sem.Acquire(context.Background(), 1)
+			defer sem.Release(1)
+
 			daemon.backportMountSpec(c)
 			if err := daemon.checkpointAndSave(c); err != nil {
 				logrus.WithError(err).WithField("container", c.ID).Error("error saving backported mountspec to disk")
@@ -414,7 +447,8 @@ func (daemon *Daemon) restore() error {
 			c.Unlock()
 		}(c)
 	}
-	wg.Wait()
+	group.Wait()
+
 	daemon.netController, err = daemon.initNetworkController(daemon.configStore, activeSandboxes)
 	if err != nil {
 		return fmt.Errorf("Error initializing network controller: %v", err)
@@ -422,18 +456,24 @@ func (daemon *Daemon) restore() error {
 
 	// Now that all the containers are registered, register the links
 	for _, c := range containers {
-		if err := daemon.registerLinks(c, c.HostConfig); err != nil {
-			logrus.Errorf("failed to register link for container %s: %v", c.ID, err)
-		}
+		group.Add(1)
+		go func(c *container.Container) {
+			_ = sem.Acquire(context.Background(), 1)
+
+			if err := daemon.registerLinks(c, c.HostConfig); err != nil {
+				logrus.Errorf("failed to register link for container %s: %v", c.ID, err)
+			}
+
+			sem.Release(1)
+			group.Done()
+		}(c)
 	}
+	group.Wait()
 
-	group := sync.WaitGroup{}
 	for c, notifier := range restartContainers {
 		group.Add(1)
-
 		go func(c *container.Container, chNotify chan struct{}) {
-			defer group.Done()
-
+			_ = sem.Acquire(context.Background(), 1)
 			logrus.Debugf("Starting container %s", c.ID)
 
 			// ignore errors here as this is a best effort to wait for children to be
@@ -455,22 +495,27 @@ func (daemon *Daemon) restore() error {
 				logrus.Errorf("Failed to start container %s: %s", c.ID, err)
 			}
 			close(chNotify)
-		}(c, notifier)
 
+			sem.Release(1)
+			group.Done()
+		}(c, notifier)
 	}
 	group.Wait()
 
-	removeGroup := sync.WaitGroup{}
 	for id := range removeContainers {
-		removeGroup.Add(1)
+		group.Add(1)
 		go func(cid string) {
+			_ = sem.Acquire(context.Background(), 1)
+
 			if err := daemon.ContainerRm(cid, &types.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}); err != nil {
 				logrus.Errorf("Failed to remove container %s: %s", cid, err)
 			}
-			removeGroup.Done()
+
+			sem.Release(1)
+			group.Done()
 		}(id)
 	}
-	removeGroup.Wait()
+	group.Wait()
 
 	// any containers that were started above would already have had this done,
 	// however we need to now prepare the mountpoints for the rest of the containers as well.
@@ -491,13 +536,16 @@ func (daemon *Daemon) restore() error {
 
 		group.Add(1)
 		go func(c *container.Container) {
-			defer group.Done()
+			_ = sem.Acquire(context.Background(), 1)
+
 			if err := daemon.prepareMountPoints(c); err != nil {
 				logrus.Error(err)
 			}
+
+			sem.Release(1)
+			group.Done()
 		}(c)
 	}
-
 	group.Wait()
 
 	logrus.Info("Loading containers: done.")
@@ -508,7 +556,18 @@ func (daemon *Daemon) restore() error {
 // RestartSwarmContainers restarts any autostart container which has a
 // swarm endpoint.
 func (daemon *Daemon) RestartSwarmContainers() {
-	group := sync.WaitGroup{}
+	ctx := context.Background()
+
+	// parallelLimit is the maximum number of parallel startup jobs that we
+	// allow (this is the limit used for all startup semaphores). The multiplier
+	// (128) was chosen after some fairly significant benchmarking -- don't change
+	// it unless you've tested it significantly (this value is adjusted if
+	// RLIMIT_NOFILE is small to avoid EMFILE).
+	parallelLimit := adjustParallelLimit(len(daemon.List()), 128*runtime.NumCPU())
+
+	var group sync.WaitGroup
+	sem := semaphore.NewWeighted(int64(parallelLimit))
+
 	for _, c := range daemon.List() {
 		if !c.IsRunning() && !c.IsPaused() {
 			// Autostart all the containers which has a
@@ -517,14 +576,21 @@ func (daemon *Daemon) RestartSwarmContainers() {
 			if daemon.configStore.AutoRestart && c.ShouldRestart() && c.NetworkSettings.HasSwarmEndpoint && c.HasBeenStartedBefore {
 				group.Add(1)
 				go func(c *container.Container) {
-					defer group.Done()
+					if err := sem.Acquire(ctx, 1); err != nil {
+						// ctx is done.
+						group.Done()
+						return
+					}
+
 					if err := daemon.containerStart(c, "", "", true); err != nil {
 						logrus.Error(err)
 					}
+
+					sem.Release(1)
+					group.Done()
 				}(c)
 			}
 		}
-
 	}
 	group.Wait()
 }
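
Every startup loop in the new restore() follows the same shape: each goroutine still joins a single sync.WaitGroup, but it must first acquire a slot from a shared golang.org/x/sync/semaphore.Weighted, so at most parallelLimit jobs run concurrently. Below is a minimal, self-contained sketch of that pattern; the loadOne function and the job count are placeholders for illustration, not daemon code.

package main

import (
	"context"
	"fmt"
	"runtime"
	"sync"

	"golang.org/x/sync/semaphore"
)

// loadOne stands in for the per-container work (load, register, start, ...).
func loadOne(id int) {
	fmt.Println("loaded", id)
}

func main() {
	// Cap parallelism at a multiple of NumCPU, mirroring restore().
	parallelLimit := 128 * runtime.NumCPU()

	var group sync.WaitGroup
	sem := semaphore.NewWeighted(int64(parallelLimit))

	for id := 0; id < 1000; id++ {
		group.Add(1)
		go func(id int) {
			defer group.Done()
			// With context.Background() Acquire cannot fail; it simply
			// blocks until one of the parallelLimit slots frees up.
			_ = sem.Acquire(context.Background(), 1)
			defer sem.Release(1)

			loadOne(id)
		}(id)
	}
	group.Wait()
}

The sketch defers both Release and Done; several loops in the diff instead call them explicitly at the end of the goroutine, which is equivalent as long as every path through the goroutine reaches those calls.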

daemon/daemon_unix.go (+35, -0)

@@ -257,6 +257,41 @@ func getBlkioThrottleDevices(devs []*blkiodev.ThrottleDevice) ([]specs.LinuxThro
 	return throttleDevices, nil
 }
 
+// adjustParallelLimit takes a number of objects and a proposed limit and
+// figures out if it's reasonable (and adjusts it accordingly). This is only
+// used for daemon startup, which does a lot of parallel loading of containers
+// (and if we exceed RLIMIT_NOFILE then we're in trouble).
+func adjustParallelLimit(n int, limit int) int {
+	// Rule-of-thumb overhead factor (how many files will each goroutine open
+	// simultaneously). Yes, this is ugly but to be frank this whole thing is
+	// ugly.
+	const overhead = 2
+
+	// On Linux, we need to ensure that parallelStartupJobs doesn't cause us to
+	// exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it
+	// and give a warning (since in theory the user should increase their
+	// ulimits to the largest possible value for dockerd).
+	var rlim unix.Rlimit
+	if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil {
+		logrus.Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err)
+		return limit
+	}
+	softRlimit := int(rlim.Cur)
+
+	// Far fewer containers than RLIMIT_NOFILE. No need to adjust anything.
+	if softRlimit > overhead*n {
+		return limit
+	}
+
+	// RLIMIT_NOFILE big enough, no need to adjust anything.
+	if softRlimit > overhead*limit {
+		return limit
+	}
+
+	logrus.Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit)
+	return softRlimit / overhead
+}
+
 func checkKernel() error {
 	// Check for unsupported kernel versions
 	// FIXME: it would be cleaner to not test for specific versions, but rather
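
adjustParallelLimit is plain arithmetic around the process's soft RLIMIT_NOFILE: with the overhead factor of 2, the proposed limit survives unless the soft limit is at or below both 2*n and 2*limit, in which case the function falls back to softRlimit/2. For example, on an 8-CPU machine the proposed limit is 128*8 = 1024; with 10000 containers and a soft limit of 1024 open files, the adjusted limit would be 512. The standalone program below mirrors that calculation so the behaviour can be checked on a given machine; the container count here is an arbitrary example, not something read from the daemon.

package main

import (
	"fmt"
	"runtime"

	"golang.org/x/sys/unix"
)

// adjustParallelLimit is a condensed copy of the function added in
// daemon_unix.go above, kept standalone for experimentation.
func adjustParallelLimit(n, limit int) int {
	const overhead = 2

	var rlim unix.Rlimit
	if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil {
		// Can't read the limit; keep the proposed value.
		return limit
	}
	softRlimit := int(rlim.Cur)

	// Enough headroom for either the container count or the full
	// parallelism limit: keep the proposed value.
	if softRlimit > overhead*n || softRlimit > overhead*limit {
		return limit
	}
	// Otherwise scale down so overhead*limit fits under the soft rlimit.
	return softRlimit / overhead
}

func main() {
	n := 10000 // example container count, not read from the daemon
	proposed := 128 * runtime.NumCPU()
	fmt.Printf("adjusted startup limit for %d containers: %d (proposed %d)\n",
		n, adjustParallelLimit(n, proposed), proposed)
}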

daemon/daemon_windows.go (+5, -0)

@@ -40,6 +40,11 @@ const (
 	windowsMaxCPUPercent = 100
 )
 
+// Windows doesn't really have rlimits.
+func adjustParallelLimit(n int, limit int) int {
+	return limit
+}
+
 // Windows has no concept of an execution state directory. So use config.Root here.
 func getPluginExecRoot(root string) string {
 	return filepath.Join(root, "plugins")
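
The Windows stub exists so the call sites in restore() and RestartSwarmContainers() compile on every platform: daemon_windows.go is limited to Windows builds by its _windows filename suffix, while daemon_unix.go relies on a build constraint that is not visible in this diff. A minimal sketch of the same convention, using a hypothetical package and file name:

// limits_windows.go (hypothetical name). The _windows suffix already
// restricts this file to GOOS=windows; the explicit constraint below is
// included only to make the selection visible in this sketch.
//go:build windows

package limits

// adjustParallelLimit keeps the proposed limit unchanged: Windows has no
// RLIMIT_NOFILE to compare it against.
func adjustParallelLimit(n, limit int) int {
	return limit
}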