moby/daemon/oci_linux.go

package daemon // import "github.com/docker/docker/daemon"
import (
"context"
"fmt"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
cdcgroups "github.com/containerd/cgroups/v3"
"github.com/containerd/containerd/containers"
coci "github.com/containerd/containerd/oci"
"github.com/containerd/containerd/pkg/apparmor"
"github.com/containerd/containerd/pkg/userns"
"github.com/containerd/log"
containertypes "github.com/docker/docker/api/types/container"
"github.com/docker/docker/container"
dconfig "github.com/docker/docker/daemon/config"
Fix validation of IpcMode, PidMode, UTSMode, CgroupnsMode These HostConfig properties were not validated until the OCI spec for the container was created, which meant that `container run` and `docker create` would accept invalid values, and the invalid value would not be detected until `start` was called, returning a 500 "internal server error", as well as errors from containerd ("cleanup: failed to delete container from containerd: no such container") in the daemon logs. As a result, a faulty container was created, and the container state remained in the `created` state. This patch: - Updates `oci.WithNamespaces()` to return the correct `errdefs.InvalidParameter` - Updates `verifyPlatformContainerSettings()` to validate these settings, so that an error is returned when _creating_ the container. Before this patch: docker run -dit --ipc=shared --name foo busybox 2a00d74e9fbb7960c4718def8f6c74fa8ee754030eeb93ee26a516e27d4d029f docker: Error response from daemon: Invalid IPC mode: shared. docker ps -a --filter name=foo CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 2a00d74e9fbb busybox "sh" About a minute ago Created foo After this patch: docker run -dit --ipc=shared --name foo busybox docker: Error response from daemon: invalid IPC mode: shared. docker ps -a --filter name=foo CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES An integration test was added to verify the new validation, which can be run with: make BIND_DIR=. TEST_FILTER=TestCreateInvalidHostConfig DOCKER_GRAPHDRIVER=vfs test-integration Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-05-25 11:17:16 +00:00
"github.com/docker/docker/errdefs"
"github.com/docker/docker/oci"
"github.com/docker/docker/oci/caps"
"github.com/docker/docker/pkg/idtools"
"github.com/docker/docker/pkg/rootless/specconv"
volumemounts "github.com/docker/docker/volume/mounts"
"github.com/moby/sys/mount"
"github.com/moby/sys/mountinfo"
"github.com/moby/sys/user"
"github.com/opencontainers/runc/libcontainer/cgroups"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
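// inContainerInitPath is the path at which the init binary is bind-mounted
// inside the container. With the stock daemon config, where DefaultInitBinary
// is "docker-init", this resolves to "/sbin/docker-init".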
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
// withRlimits sets the container's rlimits, merging them with the daemon's defaults
func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
var rlimits []specs.POSIXRlimit
// We want to leave the original HostConfig alone so make a copy here
hostConfig := *c.HostConfig
// Merge with the daemon defaults
daemon.mergeUlimits(&hostConfig, daemonCfg)
for _, ul := range hostConfig.Ulimits {
rlimits = append(rlimits, specs.POSIXRlimit{
Type: "RLIMIT_" + strings.ToUpper(ul.Name),
Soft: uint64(ul.Soft),
Hard: uint64(ul.Hard),
})
}
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.Rlimits = rlimits
return nil
}
}
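// For example, a HostConfig ulimit of {Name: "nofile", Soft: 1024, Hard: 2048}
// is translated by withRlimits into:
//
//	specs.POSIXRlimit{Type: "RLIMIT_NOFILE", Soft: 1024, Hard: 2048}
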
// withRootless sets the spec to the rootless configuration
func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
var v2Controllers []string
if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
if cdcgroups.Mode() != cdcgroups.Unified {
return errors.New("rootless systemd driver doesn't support cgroup v1")
}
rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
if rootlesskitParentEUID == "" {
return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
}
euid, err := strconv.Atoi(rootlesskitParentEUID)
if err != nil {
return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
}
controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
controllersFile, err := os.ReadFile(controllersPath)
if err != nil {
return err
}
v2Controllers = strings.Fields(string(controllersFile))
}
return specconv.ToRootless(s, v2Controllers)
}
}
// withRootfulInRootless is used for "rootful-in-rootless" dind;
// the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc.
func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
specconv.ToRootfulInRootless(s)
return nil
}
}
// WithOOMScore sets the oom score
func WithOOMScore(score *int) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.OOMScoreAdj = score
return nil
}
}
// WithSelinux sets the selinux labels
func WithSelinux(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if s.Process == nil {
s.Process = &specs.Process{}
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Process.SelinuxLabel = c.GetProcessLabel()
s.Linux.MountLabel = c.MountLabel
return nil
}
}
// WithApparmor sets the apparmor profile
func WithApparmor(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if apparmor.HostSupports() {
var appArmorProfile string
if c.AppArmorProfile != "" {
appArmorProfile = c.AppArmorProfile
} else if c.HostConfig.Privileged {
appArmorProfile = unconfinedAppArmorProfile
} else {
appArmorProfile = defaultAppArmorProfile
}
if appArmorProfile == defaultAppArmorProfile {
// Unattended upgrades and other fun services can unload AppArmor
// profiles inadvertently. Since we cannot store our profile in
// /etc/apparmor.d, nor can we practically add other ways of
// telling the system to keep our profile loaded, in order to make
// sure that we keep the default profile enabled we dynamically
// reload it if necessary.
if err := ensureDefaultAppArmorProfile(); err != nil {
return err
}
}
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.ApparmorProfile = appArmorProfile
}
return nil
}
}
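// As a rough sketch of the selection above, assuming the usual values of the
// profile constants defined elsewhere in the daemon ("docker-default" and
// "unconfined"):
//
//	c.AppArmorProfile set -> that profile, verbatim
//	privileged container  -> "unconfined"
//	otherwise             -> "docker-default" (reloaded if it was unloaded)
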
// WithCapabilities sets the container's capabilities
func WithCapabilities(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
capabilities, err := caps.TweakCapabilities(
caps.DefaultCapabilities(),
c.HostConfig.CapAdd,
c.HostConfig.CapDrop,
c.HostConfig.Privileged,
)
if err != nil {
return err
}
return oci.SetCapabilities(s, capabilities)
}
}
func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
p, err := getPath()
if err != nil {
return "", err
}
return c.GetResourcePath(p)
}
func getUser(c *container.Container, username string) (specs.User, error) {
var usr specs.User
passwdPath, err := resourcePath(c, user.GetPasswdPath)
if err != nil {
return usr, err
}
groupPath, err := resourcePath(c, user.GetGroupPath)
if err != nil {
return usr, err
}
execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
if err != nil {
return usr, err
}
usr.UID = uint32(execUser.Uid)
usr.GID = uint32(execUser.Gid)
usr.AdditionalGids = []uint32{usr.GID}
var addGroups []int
if len(c.HostConfig.GroupAdd) > 0 {
addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
if err != nil {
return usr, err
}
}
for _, g := range append(execUser.Sgids, addGroups...) {
usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
}
return usr, nil
}
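// As an illustration (the concrete IDs are hypothetical): for a container user
// with UID 1001, primary GID 1000, supplementary groups 27 and 100, and one
// extra group 29 from HostConfig.GroupAdd, getUser returns roughly:
//
//	specs.User{UID: 1001, GID: 1000, AdditionalGids: []uint32{1000, 27, 100, 29}}
//
// Note that the primary GID is always the first entry in AdditionalGids.
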
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
for i, n := range s.Linux.Namespaces {
if n.Type == ns.Type {
s.Linux.Namespaces[i] = ns
return
}
}
s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
}
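// setNamespace is an upsert: calling it again with the same namespace type
// replaces the earlier entry instead of appending a duplicate. For example,
//
//	setNamespace(s, specs.LinuxNamespace{Type: specs.PIDNamespace})
//	setNamespace(s, specs.LinuxNamespace{Type: specs.PIDNamespace, Path: "/proc/42/ns/pid"})
//
// leaves a single PID namespace entry with Path "/proc/42/ns/pid".
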
// WithNamespaces sets the container's namespaces
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
userNS := false
// user
if c.HostConfig.UsernsMode.IsPrivate() {
if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
userNS = true
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
})
s.Linux.UIDMappings = specMapping(uidMap)
s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
}
}
// network
if !c.Config.NetworkDisabled {
networkMode := c.HostConfig.NetworkMode
switch {
case networkMode.IsContainer():
nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer())
if err != nil {
return err
}
setNamespace(s, specs.LinuxNamespace{
Type: specs.NetworkNamespace,
Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
})
if userNS {
// to share a net namespace, the containers must also share a user namespace.
//
// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
})
}
case networkMode.IsHost():
oci.RemoveNamespace(s, specs.NetworkNamespace)
default:
setNamespace(s, specs.LinuxNamespace{
Type: specs.NetworkNamespace,
})
}
}
// ipc
ipcMode := c.HostConfig.IpcMode
if !ipcMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
}
switch {
case ipcMode.IsContainer():
ic, err := daemon.getIPCContainer(ipcMode.Container())
if err != nil {
return errors.Wrap(err, "failed to join IPC namespace")
}
setNamespace(s, specs.LinuxNamespace{
Type: specs.IPCNamespace,
Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
})
if userNS {
// to share a IPC namespace, the containers must also share a user namespace.
//
// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
})
}
case ipcMode.IsHost():
oci.RemoveNamespace(s, specs.IPCNamespace)
case ipcMode.IsEmpty():
// A container was created by an older version of the daemon.
// The default behavior used to be what is now called "shareable".
fallthrough
case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
setNamespace(s, specs.LinuxNamespace{
Type: specs.IPCNamespace,
})
}
// pid
pidMode := c.HostConfig.PidMode
if !pidMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode))
}
switch {
case pidMode.IsContainer():
pc, err := daemon.getPIDContainer(pidMode.Container())
if err != nil {
return errors.Wrap(err, "failed to join PID namespace")
}
setNamespace(s, specs.LinuxNamespace{
Type: specs.PIDNamespace,
Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
})
if userNS {
// to share a PID namespace, the containers must also share a user namespace.
//
// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
})
}
case pidMode.IsHost():
oci.RemoveNamespace(s, specs.PIDNamespace)
default:
setNamespace(s, specs.LinuxNamespace{
Type: specs.PIDNamespace,
})
}
// uts
if !c.HostConfig.UTSMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
}
if c.HostConfig.UTSMode.IsHost() {
oci.RemoveNamespace(s, specs.UTSNamespace)
s.Hostname = ""
}
// cgroup
if !c.HostConfig.CgroupnsMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
}
if c.HostConfig.CgroupnsMode.IsPrivate() {
setNamespace(s, specs.LinuxNamespace{
Type: specs.CgroupNamespace,
})
}
return nil
}
}
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
var ids []specs.LinuxIDMapping
for _, item := range s {
ids = append(ids, specs.LinuxIDMapping{
HostID: uint32(item.HostID),
ContainerID: uint32(item.ContainerID),
Size: uint32(item.Size),
})
}
return ids
}
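// For instance, an idtools.IDMap of {ContainerID: 0, HostID: 100000, Size: 65536}
// becomes specs.LinuxIDMapping{ContainerID: 0, HostID: 100000, Size: 65536},
// i.e. container root is mapped to host UID/GID 100000.
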
// Get the source mount point of the directory passed in as an argument. Also
// return the mount's optional fields.
func getSourceMount(source string) (string, string, error) {
// Ensure any symlinks are resolved.
sourcePath, err := filepath.EvalSymlinks(source)
if err != nil {
return "", "", err
}
mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
if err != nil {
return "", "", err
}
if len(mi) < 1 {
return "", "", fmt.Errorf("Can't find mount point of %s", source)
}
// find the longest mount point
var idx, maxlen int
for i := range mi {
if len(mi[i].Mountpoint) > maxlen {
maxlen = len(mi[i].Mountpoint)
idx = i
}
}
return mi[idx].Mountpoint, mi[idx].Optional, nil
}
const (
sharedPropagationOption = "shared:"
slavePropagationOption = "master:"
)
// hasMountInfoOption checks if any of the given option values are set in the
// passed-in options string.
func hasMountInfoOption(opts string, vals ...string) bool {
for _, opt := range strings.Split(opts, " ") {
for _, val := range vals {
if strings.HasPrefix(opt, val) {
return true
}
}
}
return false
}
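// For example, for a mountinfo optional-fields string of "shared:1 master:2",
// both hasMountInfoOption(opts, sharedPropagationOption) and
// hasMountInfoOption(opts, slavePropagationOption) return true.
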
// Ensure that the mount point on which path is mounted is shared.
func ensureShared(path string) error {
sourceMount, optionalOpts, err := getSourceMount(path)
if err != nil {
return err
}
// Make sure source mount point is shared.
if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
}
return nil
}
// Ensure that the mount point on which path is mounted is either shared or slave.
func ensureSharedOrSlave(path string) error {
sourceMount, optionalOpts, err := getSourceMount(path)
if err != nil {
return err
}
if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
}
return nil
}
// Get the set of mount flags that are set on the mount that contains the given
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
// bind-mounting "with options" will not fail with user namespaces, due to
// kernel restrictions that require user namespace mounts to preserve
// CL_UNPRIVILEGED locked flags.
func getUnprivilegedMountFlags(path string) ([]string, error) {
var statfs unix.Statfs_t
if err := unix.Statfs(path, &statfs); err != nil {
return nil, err
}
// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
unprivilegedFlags := map[uint64]string{
unix.MS_RDONLY: "ro",
unix.MS_NODEV: "nodev",
unix.MS_NOEXEC: "noexec",
unix.MS_NOSUID: "nosuid",
unix.MS_NOATIME: "noatime",
unix.MS_RELATIME: "relatime",
unix.MS_NODIRATIME: "nodiratime",
}
var flags []string
for mask, flag := range unprivilegedFlags {
if uint64(statfs.Flags)&mask == mask {
flags = append(flags, flag)
}
}
return flags, nil
}
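// As a sketch of the behaviour: if the source path lives on a filesystem
// mounted with "nosuid,nodev", getUnprivilegedMountFlags returns "nodev" and
// "nosuid" (in no particular order, as they come from map iteration), and the
// caller re-applies those flags to the bind mount so runc can satisfy the
// kernel's CL_UNPRIVILEGED restrictions.
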
var (
mountPropagationMap = map[string]int{
"private": mount.PRIVATE,
"rprivate": mount.RPRIVATE,
"shared": mount.SHARED,
"rshared": mount.RSHARED,
"slave": mount.SLAVE,
"rslave": mount.RSLAVE,
}
mountPropagationReverseMap = map[int]string{
mount.PRIVATE: "private",
mount.RPRIVATE: "rprivate",
mount.SHARED: "shared",
mount.RSHARED: "rshared",
mount.SLAVE: "slave",
mount.RSLAVE: "rslave",
}
)
// inSlice tests whether a string is contained in a slice of strings.
// The comparison is case-sensitive.
func inSlice(slice []string, s string) bool {
for _, ss := range slice {
if s == ss {
return true
}
}
return false
}
// withMounts sets the container's mounts
func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container, ms []container.Mount) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
sort.Sort(mounts(ms))
mounts := ms
userMounts := make(map[string]struct{})
for _, m := range mounts {
userMounts[m.Destination] = struct{}{}
}
// Copy all mounts from spec to defaultMounts, except for
// - mounts overridden by a user supplied mount;
// - all mounts under /dev if a user supplied /dev is present;
// - /dev/shm, in case IpcMode is none.
// While at it, also
// - set size for /dev/shm from shmsize.
defaultMounts := s.Mounts[:0]
_, mountDev := userMounts["/dev"]
for _, m := range s.Mounts {
if _, ok := userMounts[m.Destination]; ok {
// filter out mount overridden by a user supplied mount
continue
}
if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
// filter out everything under /dev if /dev is user-mounted
continue
}
if m.Destination == "/dev/shm" {
if c.HostConfig.IpcMode.IsNone() {
// filter out /dev/shm for "none" IpcMode
continue
}
// set size for /dev/shm mount from spec
sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
m.Options = append(m.Options, sizeOpt)
}
defaultMounts = append(defaultMounts, m)
}
s.Mounts = defaultMounts
for _, m := range mounts {
if m.Source == "tmpfs" {
data := m.Data
parser := volumemounts.NewParser()
options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
if data != "" {
options = append(options, strings.Split(data, ",")...)
}
merged, err := mount.MergeTmpfsOptions(options)
if err != nil {
return err
}
s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
continue
}
mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
// Determine property of RootPropagation based on volume
// properties. If a volume is shared, then keep root propagation
// shared. This should work for slave and private volumes too.
//
// For slave volumes, it can be either [r]shared/[r]slave.
//
// For private volumes any root propagation value should work.
pFlag := mountPropagationMap[m.Propagation]
switch pFlag {
case mount.SHARED, mount.RSHARED:
if err := ensureShared(m.Source); err != nil {
return err
}
rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
if rootpg != mount.SHARED && rootpg != mount.RSHARED {
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
}
case mount.SLAVE, mount.RSLAVE:
var fallback bool
if err := ensureSharedOrSlave(m.Source); err != nil {
// For backwards-compatibility purposes, treat mounts from the daemon root
// as special: we automatically add rslave propagation to these mounts
// when the user did not set anything, so we should fall back to the old
// behavior, which is to use private propagation (normally the default).
if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
return err
}
cm, ok := c.MountPoints[m.Destination]
if !ok {
return err
}
if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
// The user explicitly set a propagation; do not fall back in that case.
return err
}
fallback = true
log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
}
if !fallback {
rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
}
}
}
bindMode := "rbind"
if m.NonRecursive {
bindMode = "bind"
}
opts := []string{bindMode}
if !m.Writable {
rro := true
if m.ReadOnlyNonRecursive {
rro = false
if m.ReadOnlyForceRecursive {
return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
}
}
if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil {
rro = false
if m.ReadOnlyForceRecursive {
return rroErr
}
}
if rro {
opts = append(opts, "rro")
} else {
opts = append(opts, "ro")
}
}
if pFlag != 0 {
opts = append(opts, mountPropagationReverseMap[pFlag])
}
// If we are using user namespaces, then we must make sure that we
// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
// "mount" when we bind-mount. The reason for this is that at the point
// when runc sets up the root filesystem, it is already inside a user
// namespace, and thus cannot change any flags that are locked.
if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() {
unprivOpts, err := getUnprivilegedMountFlags(m.Source)
if err != nil {
return err
}
opts = append(opts, unprivOpts...)
}
mt.Options = opts
s.Mounts = append(s.Mounts, mt)
}
if s.Root.Readonly {
for i, m := range s.Mounts {
switch m.Destination {
case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
continue
}
if _, ok := userMounts[m.Destination]; !ok {
if !inSlice(m.Options, "ro") {
s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
}
}
}
}
if c.HostConfig.Privileged {
// clear readonly for /sys
for i := range s.Mounts {
if s.Mounts[i].Destination == "/sys" {
clearReadOnly(&s.Mounts[i])
}
}
if s.Linux != nil {
s.Linux.ReadonlyPaths = nil
s.Linux.MaskedPaths = nil
}
}
// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
for i, m := range s.Mounts {
if m.Type == "cgroup" {
clearReadOnly(&s.Mounts[i])
}
}
}
return nil
}
}
// sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
// exist, so do not add the default ones if running on an old kernel.
func sysctlExists(s string) bool {
f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
_, err := os.Stat(f)
return err == nil
}
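// For example, sysctlExists("net.ipv4.ip_unprivileged_port_start") checks for
// the existence of /proc/sys/net/ipv4/ip_unprivileged_port_start.
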
// withCommonOptions sets common docker options
func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if c.BaseFS == "" {
return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
}
linkedEnv, err := daemon.setupLinkedContainers(c)
if err != nil {
return err
}
s.Root = &specs.Root{
Path: c.BaseFS,
Readonly: c.HostConfig.ReadonlyRootfs,
}
if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
return err
}
cwd := c.Config.WorkingDir
if len(cwd) == 0 {
cwd = "/"
}
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.Args = append([]string{c.Path}, c.Args...)
// Only add the custom init if it is specified and the container is running in its
// own private PID namespace. It does not make sense to add one if the container is
// running in the host PID namespace or another container's PID namespace, where an
// init already exists.
if c.HostConfig.PidMode.IsPrivate() {
if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
(c.HostConfig.Init == nil && daemonCfg.Init) {
s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
if err != nil {
return err
}
s.Mounts = append(s.Mounts, specs.Mount{
Destination: inContainerInitPath,
Type: "bind",
Source: path,
Options: []string{"bind", "ro"},
})
}
}
s.Process.Cwd = cwd
s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
s.Process.Terminal = c.Config.Tty
s.Hostname = c.Config.Hostname
setLinuxDomainname(c, s)
// Add default sysctls that are generally safe and useful; currently we
// grant the capabilities to allow these anyway. You can override if
// you want to restore the original behaviour.
// We do not set network sysctls if network namespace is host, or if we are
// joining an existing namespace, only if we create a new net namespace.
if c.HostConfig.NetworkMode.IsPrivate() {
// We cannot set up ping socket support in a user namespace
userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
// allow unprivileged ICMP echo sockets without CAP_NET_RAW
s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
}
// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
}
}
return nil
}
}
// withCgroups sets the container's cgroups
func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
var cgroupsPath string
scopePrefix := "docker"
parent := "/docker"
useSystemd := UsingSystemd(daemonCfg)
if useSystemd {
parent = "system.slice"
if daemonCfg.Rootless {
parent = "user.slice"
}
}
if c.HostConfig.CgroupParent != "" {
parent = c.HostConfig.CgroupParent
} else if daemonCfg.CgroupParent != "" {
parent = daemonCfg.CgroupParent
}
if useSystemd {
cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
} else {
cgroupsPath = filepath.Join(parent, c.ID)
}
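// With the default parents, the systemd driver yields a cgroupsPath of the
// form "system.slice:docker:<container-id>" (or "user.slice:docker:<id>" when
// rootless), while the cgroupfs driver yields "/docker/<container-id>".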
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Linux.CgroupsPath = cgroupsPath
// the rest is only needed for CPU RT controller
if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
return nil
}
p := cgroupsPath
if useSystemd {
initPath, err := cgroups.GetInitCgroup("cpu")
if err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
_, err = cgroups.GetOwnCgroup("cpu")
if err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
p = filepath.Join(initPath, s.Linux.CgroupsPath)
}
// Clean path to guard against things like ../../../BAD
parentPath := filepath.Dir(p)
if !filepath.IsAbs(parentPath) {
parentPath = filepath.Clean("/" + parentPath)
}
mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
if err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
// When docker is run inside docker, the root is based on the host cgroup.
// Should this be handled in runc/libcontainer/cgroups?
if strings.HasPrefix(root, "/docker/") {
root = "/"
}
mnt = filepath.Join(mnt, root)
if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
return nil
}
}
// WithDevices sets the container's devices
func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
// Build lists of devices allowed and created within the container.
var devs []specs.LinuxDevice
devPermissions := s.Linux.Resources.Devices
if c.HostConfig.Privileged {
hostDevices, err := coci.HostDevices()
if err != nil {
return err
}
devs = append(devs, hostDevices...)
// adding device mappings in privileged containers
for _, deviceMapping := range c.HostConfig.Devices {
// issue a warning that custom cgroup permissions are ignored in privileged mode
if deviceMapping.CgroupPermissions != "rwm" {
log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
}
// issue a warning that the device path already exists via /dev mounting in privileged mode
if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
continue
}
d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
if err != nil {
return err
}
devs = append(devs, d...)
}
devPermissions = []specs.LinuxDeviceCgroup{
{
Allow: true,
Access: "rwm",
},
}
} else {
for _, deviceMapping := range c.HostConfig.Devices {
d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
if err != nil {
return err
}
devs = append(devs, d...)
devPermissions = append(devPermissions, dPermissions...)
}
var err error
devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
if err != nil {
return err
}
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &specs.LinuxResources{}
}
s.Linux.Devices = append(s.Linux.Devices, devs...)
s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
for _, req := range c.HostConfig.DeviceRequests {
if err := daemon.handleDevice(req, s); err != nil {
return err
}
}
return nil
}
}
// WithResources applies the container resources
func WithResources(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
r := c.HostConfig.Resources
weightDevices, err := getBlkioWeightDevices(r)
if err != nil {
return err
}
readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
if err != nil {
return err
}
writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
if err != nil {
return err
}
readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
if err != nil {
return err
}
writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
if err != nil {
return err
}
memoryRes := getMemoryResources(r)
cpuRes, err := getCPUResources(r)
if err != nil {
return err
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &specs.LinuxResources{}
}
s.Linux.Resources.Memory = memoryRes
s.Linux.Resources.CPU = cpuRes
s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
WeightDevice: weightDevices,
ThrottleReadBpsDevice: readBpsDevice,
ThrottleWriteBpsDevice: writeBpsDevice,
ThrottleReadIOPSDevice: readIOpsDevice,
ThrottleWriteIOPSDevice: writeIOpsDevice,
}
if r.BlkioWeight != 0 {
w := r.BlkioWeight
s.Linux.Resources.BlockIO.Weight = &w
}
s.Linux.Resources.Pids = getPidsLimit(r)
return nil
}
}
// WithSysctls sets the container's sysctls
func WithSysctls(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if len(c.HostConfig.Sysctls) == 0 {
return nil
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
if s.Linux.Sysctl == nil {
s.Linux.Sysctl = make(map[string]string)
}
// We merge the sysctls injected above with the HostConfig (latter takes
// precedence for backwards-compatibility reasons).
for k, v := range c.HostConfig.Sysctls {
s.Linux.Sysctl[k] = v
}
return nil
}
}
// WithUser sets the container's user
func WithUser(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if s.Process == nil {
s.Process = &specs.Process{}
}
var err error
s.Process.User, err = getUser(c, c.Config.User)
return err
}
}
func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (retSpec *specs.Spec, err error) {
var (
opts []coci.SpecOpts
s = oci.DefaultSpec()
)
opts = append(opts,
withCommonOptions(daemon, &daemonCfg.Config, c),
withCgroups(daemon, &daemonCfg.Config, c),
WithResources(c),
WithSysctls(c),
WithDevices(daemon, c),
withRlimits(daemon, &daemonCfg.Config, c),
WithNamespaces(daemon, c),
WithCapabilities(c),
WithSeccomp(daemon, c),
withMounts(daemon, daemonCfg, c, mounts),
WithApparmor(c),
WithSelinux(c),
WithOOMScore(&c.HostConfig.OomScoreAdj),
coci.WithAnnotations(c.HostConfig.Annotations),
WithUser(c),
)
if c.NoNewPrivileges {
opts = append(opts, coci.WithNoNewPrivileges)
}
if c.Config.Tty {
opts = append(opts, WithConsoleSize(c))
}
// Set the masked and readonly paths with regard to the host config options if they are set.
if c.HostConfig.MaskedPaths != nil {
opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
}
if c.HostConfig.ReadonlyPaths != nil {
opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
}
if daemonCfg.Rootless {
opts = append(opts, withRootless(daemon, &daemonCfg.Config))
} else if userns.RunningInUserNS() {
opts = append(opts, withRootfulInRootless(daemon, &daemonCfg.Config))
}
var snapshotter, snapshotKey string
if daemon.UsesSnapshotter() {
snapshotter = daemon.imageService.StorageDriver()
snapshotKey = c.ID
}
return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{
ID: c.ID,
Snapshotter: snapshotter,
SnapshotKey: snapshotKey,
}, &s, opts...)
}
func clearReadOnly(m *specs.Mount) {
var opt []string
for _, o := range m.Options {
if o != "ro" {
opt = append(opt, o)
}
}
m.Options = opt
}
// mergeUlimits merges the Ulimits from HostConfig with the daemon defaults, and updates HostConfig
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) {
ulimits := c.Ulimits
// Merge ulimits with daemon defaults
ulIdx := make(map[string]struct{})
for _, ul := range ulimits {
ulIdx[ul.Name] = struct{}{}
}
for name, ul := range daemonCfg.Ulimits {
if _, exists := ulIdx[name]; !exists {
ulimits = append(ulimits, ul)
}
}
c.Ulimits = ulimits
}
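
// For example (hypothetical values): with a daemon default of nofile=65536 and
// a container started with only --ulimit nproc=4096, the merged
// HostConfig.Ulimits contains both nproc=4096 and nofile=65536; if the
// container had also set nofile, the container's value would take precedence.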