فهرست منبع

Export all spec generation opts

Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
Michael Crosby 6 سال پیش
والد
کامیت
c478553640
6 فایلهای تغییر یافته به همراه 520 افزوده شده و 488 حذف شده
  1. 0 51
      daemon/daemon_linux_test.go
  2. 4 2
      daemon/exec_linux.go
  3. 457 402
      daemon/oci_linux.go
  4. 10 5
      daemon/seccomp_disabled.go
  5. 34 28
      daemon/seccomp_linux.go
  6. 15 0
      daemon/seccomp_unsupported.go

+ 0 - 51
daemon/daemon_linux_test.go

@@ -10,10 +10,7 @@ import (
 	"testing"
 
 	containertypes "github.com/docker/docker/api/types/container"
-	"github.com/docker/docker/container"
 	"github.com/docker/docker/daemon/config"
-	"github.com/docker/docker/oci"
-	"github.com/docker/docker/pkg/idtools"
 	"github.com/docker/docker/pkg/mount"
 	"gotest.tools/assert"
 	is "gotest.tools/assert/cmp"
@@ -115,54 +112,6 @@ func TestNotCleanupMounts(t *testing.T) {
 	}
 }
 
-// TestTmpfsDevShmSizeOverride checks that user-specified /dev/tmpfs mount
-// size is not overridden by the default shmsize (that should only be used
-// for default /dev/shm (as in "shareable" and "private" ipc modes).
-// https://github.com/moby/moby/issues/35271
-func TestTmpfsDevShmSizeOverride(t *testing.T) {
-	size := "777m"
-	mnt := "/dev/shm"
-
-	d := Daemon{
-		idMapping: &idtools.IdentityMapping{},
-	}
-	c := &container.Container{
-		HostConfig: &containertypes.HostConfig{
-			ShmSize: 48 * 1024, // size we should NOT end up with
-		},
-	}
-	ms := []container.Mount{
-		{
-			Source:      "tmpfs",
-			Destination: mnt,
-			Data:        "size=" + size,
-		},
-	}
-
-	// convert ms to spec
-	spec := oci.DefaultSpec()
-	err := setMounts(&d, &spec, c, ms)
-	assert.Check(t, err)
-
-	// Check the resulting spec for the correct size
-	found := false
-	for _, m := range spec.Mounts {
-		if m.Destination == mnt {
-			for _, o := range m.Options {
-				if !strings.HasPrefix(o, "size=") {
-					continue
-				}
-				t.Logf("%+v\n", m.Options)
-				assert.Check(t, is.Equal("size="+size, o))
-				found = true
-			}
-		}
-	}
-	if !found {
-		t.Fatal("/dev/shm not found in spec, or size option missing")
-	}
-}
-
 func TestValidateContainerIsolationLinux(t *testing.T) {
 	d := Daemon{}
 

+ 4 - 2
daemon/exec_linux.go

@@ -1,6 +1,8 @@
 package daemon // import "github.com/docker/docker/daemon"
 
 import (
+	"context"
+
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/daemon/exec"
 	"github.com/docker/docker/oci/caps"
@@ -54,6 +56,6 @@ func (daemon *Daemon) execSetPlatformOpt(c *container.Container, ec *exec.Config
 		}
 		p.ApparmorProfile = appArmorProfile
 	}
-	daemon.setRlimits(&specs.Spec{Process: p}, c)
-	return nil
+	s := &specs.Spec{Process: p}
+	return WithRlimits(daemon, c)(context.Background(), nil, nil, s)
 }

+ 457 - 402
daemon/oci_linux.go

@@ -33,27 +33,121 @@ import (
 	"golang.org/x/sys/unix"
 )
 
-const (
-	inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary
-)
+const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary
 
-func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error {
-	var rlimits []specs.POSIXRlimit
-
-	// We want to leave the original HostConfig alone so make a copy here
-	hostConfig := *c.HostConfig
-	// Merge with the daemon defaults
-	daemon.mergeUlimits(&hostConfig)
-	for _, ul := range hostConfig.Ulimits {
-		rlimits = append(rlimits, specs.POSIXRlimit{
-			Type: "RLIMIT_" + strings.ToUpper(ul.Name),
-			Soft: uint64(ul.Soft),
-			Hard: uint64(ul.Hard),
-		})
+// WithRlimits sets the container's rlimits along with merging the daemon's rlimits
+func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+		var rlimits []specs.POSIXRlimit
+
+		// We want to leave the original HostConfig alone so make a copy here
+		hostConfig := *c.HostConfig
+		// Merge with the daemon defaults
+		daemon.mergeUlimits(&hostConfig)
+		for _, ul := range hostConfig.Ulimits {
+			rlimits = append(rlimits, specs.POSIXRlimit{
+				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
+				Soft: uint64(ul.Soft),
+				Hard: uint64(ul.Hard),
+			})
+		}
+
+		s.Process.Rlimits = rlimits
+		return nil
 	}
+}
 
-	s.Process.Rlimits = rlimits
-	return nil
+// WithLibnetwork sets the libnetwork hook
+func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+		if s.Hooks == nil {
+			s.Hooks = &specs.Hooks{}
+		}
+		for _, ns := range s.Linux.Namespaces {
+			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
+				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
+				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
+					Path: target,
+					Args: []string{
+						"libnetwork-setkey",
+						"-exec-root=" + daemon.configStore.GetExecRoot(),
+						c.ID,
+						daemon.netController.ID(),
+					},
+				})
+			}
+		}
+		return nil
+	}
+}
+
+// WithRootless sets the spec to the rootless configuration
+func WithRootless(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+	return specconv.ToRootless(s)
+}
+
+// WithOOMScore sets the oom score
+func WithOOMScore(score *int) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+		s.Process.OOMScoreAdj = score
+		return nil
+	}
+}
+
+// WithSelinux sets the selinux labels
+func WithSelinux(c *container.Container) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+		s.Process.SelinuxLabel = c.GetProcessLabel()
+		s.Linux.MountLabel = c.MountLabel
+		return nil
+	}
+}
+
+// WithApparmor sets the apparmor profile
+func WithApparmor(c *container.Container) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+		if apparmor.IsEnabled() {
+			var appArmorProfile string
+			if c.AppArmorProfile != "" {
+				appArmorProfile = c.AppArmorProfile
+			} else if c.HostConfig.Privileged {
+				appArmorProfile = "unconfined"
+			} else {
+				appArmorProfile = "docker-default"
+			}
+
+			if appArmorProfile == "docker-default" {
+				// Unattended upgrades and other fun services can unload AppArmor
+				// profiles inadvertently. Since we cannot store our profile in
+				// /etc/apparmor.d, nor can we practically add other ways of
+				// telling the system to keep our profile loaded, in order to make
+				// sure that we keep the default profile enabled we dynamically
+				// reload it if necessary.
+				if err := ensureDefaultAppArmorProfile(); err != nil {
+					return err
+				}
+			}
+			s.Process.ApparmorProfile = appArmorProfile
+		}
+		return nil
+	}
+}
+
+// WithCapabilities sets the container's capabilities
+func WithCapabilities(c *container.Container) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+		capabilities, err := caps.TweakCapabilities(
+			oci.DefaultCapabilities(),
+			c.HostConfig.CapAdd,
+			c.HostConfig.CapDrop,
+			c.HostConfig.Capabilities,
+			c.HostConfig.Privileged,
+		)
+		if err != nil {
+			return err
+		}
+		return oci.SetCapabilities(s, capabilities)
+	}
 }
 
 func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
@@ -119,99 +213,102 @@ func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
 	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
 }
 
-func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error {
-	userNS := false
-	// user
-	if c.HostConfig.UsernsMode.IsPrivate() {
-		uidMap := daemon.idMapping.UIDs()
-		if uidMap != nil {
-			userNS = true
-			ns := specs.LinuxNamespace{Type: "user"}
+// WithNamespaces sets the container's namespaces
+func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+		userNS := false
+		// user
+		if c.HostConfig.UsernsMode.IsPrivate() {
+			uidMap := daemon.idMapping.UIDs()
+			if uidMap != nil {
+				userNS = true
+				ns := specs.LinuxNamespace{Type: "user"}
+				setNamespace(s, ns)
+				s.Linux.UIDMappings = specMapping(uidMap)
+				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
+			}
+		}
+		// network
+		if !c.Config.NetworkDisabled {
+			ns := specs.LinuxNamespace{Type: "network"}
+			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
+			if parts[0] == "container" {
+				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
+				if err != nil {
+					return err
+				}
+				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
+				if userNS {
+					// to share a net namespace, they must also share a user namespace
+					nsUser := specs.LinuxNamespace{Type: "user"}
+					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
+					setNamespace(s, nsUser)
+				}
+			} else if c.HostConfig.NetworkMode.IsHost() {
+				ns.Path = c.NetworkSettings.SandboxKey
+			}
 			setNamespace(s, ns)
-			s.Linux.UIDMappings = specMapping(uidMap)
-			s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
 		}
-	}
-	// network
-	if !c.Config.NetworkDisabled {
-		ns := specs.LinuxNamespace{Type: "network"}
-		parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
-		if parts[0] == "container" {
-			nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
+
+		// ipc
+		ipcMode := c.HostConfig.IpcMode
+		switch {
+		case ipcMode.IsContainer():
+			ns := specs.LinuxNamespace{Type: "ipc"}
+			ic, err := daemon.getIpcContainer(ipcMode.Container())
 			if err != nil {
 				return err
 			}
-			ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
+			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
+			setNamespace(s, ns)
 			if userNS {
-				// to share a net namespace, they must also share a user namespace
+				// to share an IPC namespace, they must also share a user namespace
 				nsUser := specs.LinuxNamespace{Type: "user"}
-				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
+				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
 				setNamespace(s, nsUser)
 			}
-		} else if c.HostConfig.NetworkMode.IsHost() {
-			ns.Path = c.NetworkSettings.SandboxKey
+		case ipcMode.IsHost():
+			oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
+		case ipcMode.IsEmpty():
+			// A container was created by an older version of the daemon.
+			// The default behavior used to be what is now called "shareable".
+			fallthrough
+		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
+			ns := specs.LinuxNamespace{Type: "ipc"}
+			setNamespace(s, ns)
+		default:
+			return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
 		}
-		setNamespace(s, ns)
-	}
 
-	// ipc
-	ipcMode := c.HostConfig.IpcMode
-	switch {
-	case ipcMode.IsContainer():
-		ns := specs.LinuxNamespace{Type: "ipc"}
-		ic, err := daemon.getIpcContainer(ipcMode.Container())
-		if err != nil {
-			return err
-		}
-		ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
-		setNamespace(s, ns)
-		if userNS {
-			// to share an IPC namespace, they must also share a user namespace
-			nsUser := specs.LinuxNamespace{Type: "user"}
-			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
-			setNamespace(s, nsUser)
-		}
-	case ipcMode.IsHost():
-		oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
-	case ipcMode.IsEmpty():
-		// A container was created by an older version of the daemon.
-		// The default behavior used to be what is now called "shareable".
-		fallthrough
-	case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
-		ns := specs.LinuxNamespace{Type: "ipc"}
-		setNamespace(s, ns)
-	default:
-		return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
-	}
-
-	// pid
-	if c.HostConfig.PidMode.IsContainer() {
-		ns := specs.LinuxNamespace{Type: "pid"}
-		pc, err := daemon.getPidContainer(c)
-		if err != nil {
-			return err
+		// pid
+		if c.HostConfig.PidMode.IsContainer() {
+			ns := specs.LinuxNamespace{Type: "pid"}
+			pc, err := daemon.getPidContainer(c)
+			if err != nil {
+				return err
+			}
+			ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
+			setNamespace(s, ns)
+			if userNS {
+				// to share a PID namespace, they must also share a user namespace
+				nsUser := specs.LinuxNamespace{Type: "user"}
+				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
+				setNamespace(s, nsUser)
+			}
+		} else if c.HostConfig.PidMode.IsHost() {
+			oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
+		} else {
+			ns := specs.LinuxNamespace{Type: "pid"}
+			setNamespace(s, ns)
 		}
-		ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
-		setNamespace(s, ns)
-		if userNS {
-			// to share a PID namespace, they must also share a user namespace
-			nsUser := specs.LinuxNamespace{Type: "user"}
-			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
-			setNamespace(s, nsUser)
+		// uts
+		if c.HostConfig.UTSMode.IsHost() {
+			oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
+			s.Hostname = ""
 		}
-	} else if c.HostConfig.PidMode.IsHost() {
-		oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
-	} else {
-		ns := specs.LinuxNamespace{Type: "pid"}
-		setNamespace(s, ns)
-	}
-	// uts
-	if c.HostConfig.UTSMode.IsHost() {
-		oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
-		s.Hostname = ""
-	}
 
-	return nil
+		return nil
+	}
 }
 
 func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
@@ -361,233 +458,284 @@ func inSlice(slice []string, s string) bool {
 	return false
 }
 
-func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
-	userMounts := make(map[string]struct{})
-	for _, m := range mounts {
-		userMounts[m.Destination] = struct{}{}
-	}
-
-	// Copy all mounts from spec to defaultMounts, except for
-	//  - mounts overridden by a user supplied mount;
-	//  - all mounts under /dev if a user supplied /dev is present;
-	//  - /dev/shm, in case IpcMode is none.
-	// While at it, also
-	//  - set size for /dev/shm from shmsize.
-	defaultMounts := s.Mounts[:0]
-	_, mountDev := userMounts["/dev"]
-	for _, m := range s.Mounts {
-		if _, ok := userMounts[m.Destination]; ok {
-			// filter out mount overridden by a user supplied mount
-			continue
-		}
-		if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
-			// filter out everything under /dev if /dev is user-mounted
-			continue
-		}
-
-		if m.Destination == "/dev/shm" {
-			if c.HostConfig.IpcMode.IsNone() {
-				// filter out /dev/shm for "none" IpcMode
-				continue
-			}
-			// set size for /dev/shm mount from spec
-			sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
-			m.Options = append(m.Options, sizeOpt)
+// WithMounts sets the container's mounts
+func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
+		if err := daemon.setupContainerMountsRoot(c); err != nil {
+			return err
 		}
 
-		defaultMounts = append(defaultMounts, m)
-	}
-
-	s.Mounts = defaultMounts
-	for _, m := range mounts {
-		if m.Source == "tmpfs" {
-			data := m.Data
-			parser := volumemounts.NewParser("linux")
-			options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
-			if data != "" {
-				options = append(options, strings.Split(data, ",")...)
-			}
+		if err := daemon.setupIpcDirs(c); err != nil {
+			return err
+		}
 
-			merged, err := mount.MergeTmpfsOptions(options)
+		defer func() {
 			if err != nil {
-				return err
+				daemon.cleanupSecretDir(c)
 			}
+		}()
 
-			s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
-			continue
+		if err := daemon.setupSecretDir(c); err != nil {
+			return err
 		}
 
-		mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
+		ms, err := daemon.setupMounts(c)
+		if err != nil {
+			return err
+		}
 
-		// Determine property of RootPropagation based on volume
-		// properties. If a volume is shared, then keep root propagation
-		// shared. This should work for slave and private volumes too.
-		//
-		// For slave volumes, it can be either [r]shared/[r]slave.
-		//
-		// For private volumes any root propagation value should work.
-		pFlag := mountPropagationMap[m.Propagation]
-		switch pFlag {
-		case mount.SHARED, mount.RSHARED:
-			if err := ensureShared(m.Source); err != nil {
-				return err
+		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
+			ms = append(ms, c.IpcMounts()...)
+		}
+
+		tmpfsMounts, err := c.TmpfsMounts()
+		if err != nil {
+			return err
+		}
+		ms = append(ms, tmpfsMounts...)
+
+		secretMounts, err := c.SecretMounts()
+		if err != nil {
+			return err
+		}
+		ms = append(ms, secretMounts...)
+
+		sort.Sort(mounts(ms))
+
+		mounts := ms
+
+		userMounts := make(map[string]struct{})
+		for _, m := range mounts {
+			userMounts[m.Destination] = struct{}{}
+		}
+
+		// Copy all mounts from spec to defaultMounts, except for
+		//  - mounts overridden by a user supplied mount;
+		//  - all mounts under /dev if a user supplied /dev is present;
+		//  - /dev/shm, in case IpcMode is none.
+		// While at it, also
+		//  - set size for /dev/shm from shmsize.
+		defaultMounts := s.Mounts[:0]
+		_, mountDev := userMounts["/dev"]
+		for _, m := range s.Mounts {
+			if _, ok := userMounts[m.Destination]; ok {
+				// filter out mount overridden by a user supplied mount
+				continue
 			}
-			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
-			if rootpg != mount.SHARED && rootpg != mount.RSHARED {
-				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
-			}
-		case mount.SLAVE, mount.RSLAVE:
-			var fallback bool
-			if err := ensureSharedOrSlave(m.Source); err != nil {
-				// For backwards compatibility purposes, treat mounts from the daemon root
-				// as special since we automatically add rslave propagation to these mounts
-				// when the user did not set anything, so we should fallback to the old
-				// behavior which is to use private propagation which is normally the
-				// default.
-				if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
-					return err
+			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
+				// filter out everything under /dev if /dev is user-mounted
+				continue
+			}
+
+			if m.Destination == "/dev/shm" {
+				if c.HostConfig.IpcMode.IsNone() {
+					// filter out /dev/shm for "none" IpcMode
+					continue
 				}
+				// set size for /dev/shm mount from spec
+				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
+				m.Options = append(m.Options, sizeOpt)
+			}
 
-				cm, ok := c.MountPoints[m.Destination]
-				if !ok {
-					return err
+			defaultMounts = append(defaultMounts, m)
+		}
+
+		s.Mounts = defaultMounts
+		for _, m := range mounts {
+			if m.Source == "tmpfs" {
+				data := m.Data
+				parser := volumemounts.NewParser("linux")
+				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
+				if data != "" {
+					options = append(options, strings.Split(data, ",")...)
 				}
-				if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
-					// This means the user explicitly set a propagation, do not fallback in that case.
+
+				merged, err := mount.MergeTmpfsOptions(options)
+				if err != nil {
 					return err
 				}
-				fallback = true
-				logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
+
+				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
+				continue
 			}
-			if !fallback {
+
+			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
+
+			// Determine property of RootPropagation based on volume
+			// properties. If a volume is shared, then keep root propagation
+			// shared. This should work for slave and private volumes too.
+			//
+			// For slave volumes, it can be either [r]shared/[r]slave.
+			//
+			// For private volumes any root propagation value should work.
+			pFlag := mountPropagationMap[m.Propagation]
+			switch pFlag {
+			case mount.SHARED, mount.RSHARED:
+				if err := ensureShared(m.Source); err != nil {
+					return err
+				}
 				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
-				if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
-					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
+				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
+					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
+				}
+			case mount.SLAVE, mount.RSLAVE:
+				var fallback bool
+				if err := ensureSharedOrSlave(m.Source); err != nil {
+					// For backwards compatibility purposes, treat mounts from the daemon root
+					// as special since we automatically add rslave propagation to these mounts
+					// when the user did not set anything, so we should fallback to the old
+					// behavior which is to use private propagation which is normally the
+					// default.
+					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
+						return err
+					}
+
+					cm, ok := c.MountPoints[m.Destination]
+					if !ok {
+						return err
+					}
+					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
+						// This means the user explicitly set a propagation, do not fallback in that case.
+						return err
+					}
+					fallback = true
+					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
+				}
+				if !fallback {
+					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
+					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
+						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
+					}
 				}
 			}
-		}
 
-		bindMode := "rbind"
-		if m.NonRecursive {
-			bindMode = "bind"
-		}
-		opts := []string{bindMode}
-		if !m.Writable {
-			opts = append(opts, "ro")
-		}
-		if pFlag != 0 {
-			opts = append(opts, mountPropagationReverseMap[pFlag])
-		}
+			bindMode := "rbind"
+			if m.NonRecursive {
+				bindMode = "bind"
+			}
+			opts := []string{bindMode}
+			if !m.Writable {
+				opts = append(opts, "ro")
+			}
+			if pFlag != 0 {
+				opts = append(opts, mountPropagationReverseMap[pFlag])
+			}
 
-		// If we are using user namespaces, then we must make sure that we
-		// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
-		// "mount" when we bind-mount. The reason for this is that at the point
-		// when runc sets up the root filesystem, it is already inside a user
-		// namespace, and thus cannot change any flags that are locked.
-		if daemon.configStore.RemappedRoot != "" {
-			unprivOpts, err := getUnprivilegedMountFlags(m.Source)
-			if err != nil {
-				return err
+			// If we are using user namespaces, then we must make sure that we
+			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
+			// "mount" when we bind-mount. The reason for this is that at the point
+			// when runc sets up the root filesystem, it is already inside a user
+			// namespace, and thus cannot change any flags that are locked.
+			if daemon.configStore.RemappedRoot != "" {
+				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
+				if err != nil {
+					return err
+				}
+				opts = append(opts, unprivOpts...)
 			}
-			opts = append(opts, unprivOpts...)
-		}
 
-		mt.Options = opts
-		s.Mounts = append(s.Mounts, mt)
-	}
+			mt.Options = opts
+			s.Mounts = append(s.Mounts, mt)
+		}
 
-	if s.Root.Readonly {
-		for i, m := range s.Mounts {
-			switch m.Destination {
-			case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
-				continue
-			}
-			if _, ok := userMounts[m.Destination]; !ok {
-				if !inSlice(m.Options, "ro") {
-					s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
+		if s.Root.Readonly {
+			for i, m := range s.Mounts {
+				switch m.Destination {
+				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
+					continue
+				}
+				if _, ok := userMounts[m.Destination]; !ok {
+					if !inSlice(m.Options, "ro") {
+						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
+					}
 				}
 			}
 		}
-	}
 
-	if c.HostConfig.Privileged {
-		// clear readonly for /sys
-		for i := range s.Mounts {
-			if s.Mounts[i].Destination == "/sys" {
-				clearReadOnly(&s.Mounts[i])
+		if c.HostConfig.Privileged {
+			// clear readonly for /sys
+			for i := range s.Mounts {
+				if s.Mounts[i].Destination == "/sys" {
+					clearReadOnly(&s.Mounts[i])
+				}
 			}
+			s.Linux.ReadonlyPaths = nil
+			s.Linux.MaskedPaths = nil
 		}
-		s.Linux.ReadonlyPaths = nil
-		s.Linux.MaskedPaths = nil
-	}
 
-	// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
-	// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
-	if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
-		for i, m := range s.Mounts {
-			if m.Type == "cgroup" {
-				clearReadOnly(&s.Mounts[i])
+		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
+		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
+		if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
+			for i, m := range s.Mounts {
+				if m.Type == "cgroup" {
+					clearReadOnly(&s.Mounts[i])
+				}
 			}
 		}
-	}
 
-	return nil
-}
+		return nil
 
-func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error {
-	if c.BaseFS == nil {
-		return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
-	}
-	linkedEnv, err := daemon.setupLinkedContainers(c)
-	if err != nil {
-		return err
-	}
-	s.Root = &specs.Root{
-		Path:     c.BaseFS.Path(),
-		Readonly: c.HostConfig.ReadonlyRootfs,
-	}
-	if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
-		return err
 	}
-	cwd := c.Config.WorkingDir
-	if len(cwd) == 0 {
-		cwd = "/"
-	}
-	s.Process.Args = append([]string{c.Path}, c.Args...)
-
-	// only add the custom init if it is specified and the container is running in its
-	// own private pid namespace.  It does not make sense to add if it is running in the
-	// host namespace or another container's pid namespace where we already have an init
-	if c.HostConfig.PidMode.IsPrivate() {
-		if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
-			(c.HostConfig.Init == nil && daemon.configStore.Init) {
-			s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
-			path := daemon.configStore.InitPath
-			if path == "" {
-				path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
-				if err != nil {
-					return err
+}
+
+// WithCommonOptions sets common docker options
+func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+		if c.BaseFS == nil {
+			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
+		}
+		linkedEnv, err := daemon.setupLinkedContainers(c)
+		if err != nil {
+			return err
+		}
+		s.Root = &specs.Root{
+			Path:     c.BaseFS.Path(),
+			Readonly: c.HostConfig.ReadonlyRootfs,
+		}
+		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
+			return err
+		}
+		cwd := c.Config.WorkingDir
+		if len(cwd) == 0 {
+			cwd = "/"
+		}
+		s.Process.Args = append([]string{c.Path}, c.Args...)
+
+		// only add the custom init if it is specified and the container is running in its
+		// own private pid namespace.  It does not make sense to add if it is running in the
+		// host namespace or another container's pid namespace where we already have an init
+		if c.HostConfig.PidMode.IsPrivate() {
+			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
+				(c.HostConfig.Init == nil && daemon.configStore.Init) {
+				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
+				path := daemon.configStore.InitPath
+				if path == "" {
+					path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
+					if err != nil {
+						return err
+					}
 				}
+				s.Mounts = append(s.Mounts, specs.Mount{
+					Destination: inContainerInitPath,
+					Type:        "bind",
+					Source:      path,
+					Options:     []string{"bind", "ro"},
+				})
 			}
-			s.Mounts = append(s.Mounts, specs.Mount{
-				Destination: inContainerInitPath,
-				Type:        "bind",
-				Source:      path,
-				Options:     []string{"bind", "ro"},
-			})
 		}
-	}
-	s.Process.Cwd = cwd
-	s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
-	s.Process.Terminal = c.Config.Tty
+		s.Process.Cwd = cwd
+		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
+		s.Process.Terminal = c.Config.Tty
 
-	s.Hostname = c.Config.Hostname
-	setLinuxDomainname(c, s)
+		s.Hostname = c.Config.Hostname
+		setLinuxDomainname(c, s)
 
-	return nil
+		return nil
+	}
 }
 
-func withCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
+// WithCgroups sets the container's cgroups
+func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
 		var cgroupsPath string
 		scopePrefix := "docker"
@@ -636,7 +784,8 @@ func withCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
 	}
 }
 
-func withContainerDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
+// WithDevices sets the container's devices
+func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
 		// Build lists of devices allowed and created within the container.
 		var devs []specs.LinuxDevice
@@ -684,7 +833,8 @@ func withContainerDevices(daemon *Daemon, c *container.Container) coci.SpecOpts
 	}
 }
 
-func withResources(c *container.Container) coci.SpecOpts {
+// WithResources applies the container resources
+func WithResources(c *container.Container) coci.SpecOpts {
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
 		r := c.HostConfig.Resources
 		weightDevices, err := getBlkioWeightDevices(r)
@@ -738,7 +888,8 @@ func withResources(c *container.Container) coci.SpecOpts {
 	}
 }
 
-func withSysctls(c *container.Container) coci.SpecOpts {
+// WithSysctls sets the container's sysctls
+func WithSysctls(c *container.Container) coci.SpecOpts {
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
 		// We merge the sysctls injected above with the HostConfig (latter takes
 		// precedence for backwards-compatibility reasons).
@@ -749,7 +900,8 @@ func withSysctls(c *container.Container) coci.SpecOpts {
 	}
 }
 
-func withUser(c *container.Container) coci.SpecOpts {
+// WithUser sets the container's user
+func WithUser(c *container.Container) coci.SpecOpts {
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
 		uid, gid, additionalGids, err := getUser(c, c.Config.User)
 		if err != nil {
@@ -767,133 +919,36 @@ func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, e
 		opts []coci.SpecOpts
 		s    = oci.DefaultSpec()
 	)
-	if err := daemon.populateCommonSpec(&s, c); err != nil {
-		return nil, err
-	}
-
 	opts = append(opts,
-		withCgroups(daemon, c),
-		withResources(c),
-		withSysctls(c),
-		withContainerDevices(daemon, c),
-		withUser(c),
+		WithCommonOptions(daemon, c),
+		WithCgroups(daemon, c),
+		WithResources(c),
+		WithSysctls(c),
+		WithDevices(daemon, c),
+		WithUser(c),
+		WithRlimits(daemon, c),
+		WithNamespaces(daemon, c),
+		WithCapabilities(c),
+		WithSeccomp(daemon, c),
+		WithMounts(daemon, c),
+		WithLibnetwork(daemon, c),
+		WithApparmor(c),
+		WithSelinux(c),
+		WithOOMScore(&c.HostConfig.OomScoreAdj),
 	)
-
-	if err := daemon.setRlimits(&s, c); err != nil {
-		return nil, fmt.Errorf("linux runtime spec rlimits: %v", err)
-	}
-	if err := setNamespaces(daemon, &s, c); err != nil {
-		return nil, fmt.Errorf("linux spec namespaces: %v", err)
-	}
-	capabilities, err := caps.TweakCapabilities(oci.DefaultCapabilities(), c.HostConfig.CapAdd, c.HostConfig.CapDrop, c.HostConfig.Capabilities, c.HostConfig.Privileged)
-	if err != nil {
-		return nil, fmt.Errorf("linux spec capabilities: %v", err)
-	}
-	if err := oci.SetCapabilities(&s, capabilities); err != nil {
-		return nil, fmt.Errorf("linux spec capabilities: %v", err)
-	}
-	if err := setSeccomp(daemon, &s, c); err != nil {
-		return nil, fmt.Errorf("linux seccomp: %v", err)
-	}
-
-	if err := daemon.setupContainerMountsRoot(c); err != nil {
-		return nil, err
-	}
-
-	if err := daemon.setupIpcDirs(c); err != nil {
-		return nil, err
-	}
-
-	defer func() {
-		if err != nil {
-			daemon.cleanupSecretDir(c)
-		}
-	}()
-
-	if err := daemon.setupSecretDir(c); err != nil {
-		return nil, err
-	}
-
-	ms, err := daemon.setupMounts(c)
-	if err != nil {
-		return nil, err
-	}
-
-	if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
-		ms = append(ms, c.IpcMounts()...)
-	}
-
-	tmpfsMounts, err := c.TmpfsMounts()
-	if err != nil {
-		return nil, err
-	}
-	ms = append(ms, tmpfsMounts...)
-
-	secretMounts, err := c.SecretMounts()
-	if err != nil {
-		return nil, err
-	}
-	ms = append(ms, secretMounts...)
-
-	sort.Sort(mounts(ms))
-	if err := setMounts(daemon, &s, c, ms); err != nil {
-		return nil, fmt.Errorf("linux mounts: %v", err)
-	}
-
-	if s.Hooks == nil {
-		s.Hooks = &specs.Hooks{}
-	}
-	for _, ns := range s.Linux.Namespaces {
-		if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
-			target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
-			s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
-				Path: target,
-				Args: []string{"libnetwork-setkey", "-exec-root=" + daemon.configStore.GetExecRoot(), c.ID, daemon.netController.ID()},
-			})
-		}
-	}
-
-	if apparmor.IsEnabled() {
-		var appArmorProfile string
-		if c.AppArmorProfile != "" {
-			appArmorProfile = c.AppArmorProfile
-		} else if c.HostConfig.Privileged {
-			appArmorProfile = "unconfined"
-		} else {
-			appArmorProfile = "docker-default"
-		}
-
-		if appArmorProfile == "docker-default" {
-			// Unattended upgrades and other fun services can unload AppArmor
-			// profiles inadvertently. Since we cannot store our profile in
-			// /etc/apparmor.d, nor can we practically add other ways of
-			// telling the system to keep our profile loaded, in order to make
-			// sure that we keep the default profile enabled we dynamically
-			// reload it if necessary.
-			if err := ensureDefaultAppArmorProfile(); err != nil {
-				return nil, err
-			}
-		}
-
-		s.Process.ApparmorProfile = appArmorProfile
+	if c.NoNewPrivileges {
+		opts = append(opts, coci.WithNoNewPrivileges)
 	}
-	s.Process.SelinuxLabel = c.GetProcessLabel()
-	s.Process.NoNewPrivileges = c.NoNewPrivileges
-	s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj
-	s.Linux.MountLabel = c.MountLabel
 
 	// Set the masked and readonly paths with regard to the host config options if they are set.
 	if c.HostConfig.MaskedPaths != nil {
-		s.Linux.MaskedPaths = c.HostConfig.MaskedPaths
+		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
 	}
 	if c.HostConfig.ReadonlyPaths != nil {
-		s.Linux.ReadonlyPaths = c.HostConfig.ReadonlyPaths
+		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
 	}
-
 	if daemon.configStore.Rootless {
-		if err := specconv.ToRootless(&s); err != nil {
-			return nil, err
-		}
+		opts = append(opts, WithRootless)
 	}
 	return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
 		ID: c.ID,

+ 10 - 5
daemon/seccomp_disabled.go

@@ -3,17 +3,22 @@
 package daemon // import "github.com/docker/docker/daemon"
 
 import (
+	"context"
 	"fmt"
 
+	"github.com/containerd/containerd/containers"
+	coci "github.com/containerd/containerd/oci"
 	"github.com/docker/docker/container"
-	"github.com/opencontainers/runtime-spec/specs-go"
 )
 
 var supportsSeccomp = false
 
-func setSeccomp(daemon *Daemon, rs *specs.Spec, c *container.Container) error {
-	if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" {
-		return fmt.Errorf("seccomp profiles are not supported on this daemon, you cannot specify a custom seccomp profile")
+// WithSeccomp returns a SpecOpts for daemons where seccomp is not supported (supportsSeccomp is false): it leaves the spec untouched and returns an error if the container requests a profile other than "" or "unconfined".
+func WithSeccomp(daemon *Daemon, c *container.Container) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+		if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" {
+			return fmt.Errorf("seccomp profiles are not supported on this daemon, you cannot specify a custom seccomp profile")
+		}
+		return nil
 	}
-	return nil
 }

+ 34 - 28
daemon/seccomp_linux.go

@@ -3,8 +3,11 @@
 package daemon // import "github.com/docker/docker/daemon"
 
 import (
+	"context"
 	"fmt"
 
+	"github.com/containerd/containerd/containers"
+	coci "github.com/containerd/containerd/oci"
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/profiles/seccomp"
 	"github.com/opencontainers/runtime-spec/specs-go"
@@ -13,43 +16,46 @@ import (
 
 var supportsSeccomp = true
 
-func setSeccomp(daemon *Daemon, rs *specs.Spec, c *container.Container) error {
-	var profile *specs.LinuxSeccomp
-	var err error
+// WithSeccomp returns a SpecOpts that sets s.Linux.Seccomp: privileged or "unconfined" containers get no profile (a kernel without seccomp forces "unconfined" and rejects custom profiles); otherwise the container's profile, else the daemon's profile, else the default profile is loaded.
+func WithSeccomp(daemon *Daemon, c *container.Container) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+		var profile *specs.LinuxSeccomp
+		var err error
 
-	if c.HostConfig.Privileged {
-		return nil
-	}
+		if c.HostConfig.Privileged {
+			return nil
+		}
 
-	if !daemon.seccompEnabled {
-		if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" {
-			return fmt.Errorf("Seccomp is not enabled in your kernel, cannot run a custom seccomp profile.")
+		if !daemon.seccompEnabled {
+			if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" {
+				return fmt.Errorf("Seccomp is not enabled in your kernel, cannot run a custom seccomp profile.")
+			}
+			logrus.Warn("Seccomp is not enabled in your kernel, running container without default profile.")
+			c.SeccompProfile = "unconfined"
 		}
-		logrus.Warn("Seccomp is not enabled in your kernel, running container without default profile.")
-		c.SeccompProfile = "unconfined"
-	}
-	if c.SeccompProfile == "unconfined" {
-		return nil
-	}
-	if c.SeccompProfile != "" {
-		profile, err = seccomp.LoadProfile(c.SeccompProfile, rs)
-		if err != nil {
-			return err
+		if c.SeccompProfile == "unconfined" {
+			return nil
 		}
-	} else {
-		if daemon.seccompProfile != nil {
-			profile, err = seccomp.LoadProfile(string(daemon.seccompProfile), rs)
+		if c.SeccompProfile != "" {
+			profile, err = seccomp.LoadProfile(c.SeccompProfile, s)
 			if err != nil {
 				return err
 			}
 		} else {
-			profile, err = seccomp.GetDefaultProfile(rs)
-			if err != nil {
-				return err
+			if daemon.seccompProfile != nil {
+				profile, err = seccomp.LoadProfile(string(daemon.seccompProfile), s)
+				if err != nil {
+					return err
+				}
+			} else {
+				profile, err = seccomp.GetDefaultProfile(s)
+				if err != nil {
+					return err
+				}
 			}
 		}
-	}
 
-	rs.Linux.Seccomp = profile
-	return nil
+		s.Linux.Seccomp = profile
+		return nil
+	}
 }

+ 15 - 0
daemon/seccomp_unsupported.go

@@ -2,4 +2,19 @@
 
 package daemon // import "github.com/docker/docker/daemon"
 
+import (
+	"context"
+
+	"github.com/containerd/containerd/containers"
+	coci "github.com/containerd/containerd/oci"
+	"github.com/docker/docker/container"
+)
+
 var supportsSeccomp = false
+
+// WithSeccomp returns a SpecOpts that is a no-op in this file (supportsSeccomp is false): it does not modify the spec and always returns nil.
+func WithSeccomp(daemon *Daemon, c *container.Container) coci.SpecOpts {
+	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
+		return nil
+	}
+}