moby/daemon/oci_linux.go
Sebastiaan van Stijn 7b414f5703
daemon: move getUnprivilegedMountFlags to internal package
This code is currently only used in the daemon, but is also needed in other
places. We should consider moving this code to github.com/moby/sys, so that
BuildKit can also use the same implementation instead of maintaining a fork;
moving it to internal allows us to reuse this code inside the repository, but
does not allow external consumers to depend on it (which we don't want as
it's not a permanent location).

As our code only uses this in linux files, I did not add a stub for other
platforms (but we may decide to do that in the moby/sys repository).

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2024-03-15 12:55:09 +01:00

1081 lines
33 KiB
Go

package daemon // import "github.com/docker/docker/daemon"
import (
"context"
"fmt"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
cdcgroups "github.com/containerd/cgroups/v3"
"github.com/containerd/containerd/containers"
coci "github.com/containerd/containerd/oci"
"github.com/containerd/containerd/pkg/apparmor"
"github.com/containerd/containerd/pkg/userns"
"github.com/containerd/log"
containertypes "github.com/docker/docker/api/types/container"
"github.com/docker/docker/container"
dconfig "github.com/docker/docker/daemon/config"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/internal/rootless/mountopts"
"github.com/docker/docker/oci"
"github.com/docker/docker/oci/caps"
"github.com/docker/docker/pkg/idtools"
"github.com/docker/docker/pkg/rootless/specconv"
volumemounts "github.com/docker/docker/volume/mounts"
"github.com/moby/sys/mount"
"github.com/moby/sys/mountinfo"
"github.com/moby/sys/user"
"github.com/opencontainers/runc/libcontainer/cgroups"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
)
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
// withRlimits sets the container's rlimits along with merging the daemon's rlimits
func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
var rlimits []specs.POSIXRlimit
// We want to leave the original HostConfig alone so make a copy here
hostConfig := *c.HostConfig
// Merge with the daemon defaults
daemon.mergeUlimits(&hostConfig, daemonCfg)
for _, ul := range hostConfig.Ulimits {
rlimits = append(rlimits, specs.POSIXRlimit{
Type: "RLIMIT_" + strings.ToUpper(ul.Name),
Soft: uint64(ul.Soft),
Hard: uint64(ul.Hard),
})
}
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.Rlimits = rlimits
return nil
}
}
// withRootless sets the spec to the rootless configuration
func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
var v2Controllers []string
if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
if cdcgroups.Mode() != cdcgroups.Unified {
return errors.New("rootless systemd driver doesn't support cgroup v1")
}
rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
if rootlesskitParentEUID == "" {
return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
}
euid, err := strconv.Atoi(rootlesskitParentEUID)
if err != nil {
return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
}
controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
controllersFile, err := os.ReadFile(controllersPath)
if err != nil {
return err
}
v2Controllers = strings.Fields(string(controllersFile))
}
return specconv.ToRootless(s, v2Controllers)
}
}
// withRootfulInRootless is used for "rootful-in-rootless" dind;
// the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc.
func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
specconv.ToRootfulInRootless(s)
return nil
}
}
// WithOOMScore sets the oom score
func WithOOMScore(score *int) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.OOMScoreAdj = score
return nil
}
}
// WithSelinux sets the selinux labels
func WithSelinux(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if s.Process == nil {
s.Process = &specs.Process{}
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Process.SelinuxLabel = c.GetProcessLabel()
s.Linux.MountLabel = c.MountLabel
return nil
}
}
// WithApparmor sets the apparmor profile
func WithApparmor(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if apparmor.HostSupports() {
var appArmorProfile string
if c.AppArmorProfile != "" {
appArmorProfile = c.AppArmorProfile
} else if c.HostConfig.Privileged {
appArmorProfile = unconfinedAppArmorProfile
} else {
appArmorProfile = defaultAppArmorProfile
}
if appArmorProfile == defaultAppArmorProfile {
// Unattended upgrades and other fun services can unload AppArmor
// profiles inadvertently. Since we cannot store our profile in
// /etc/apparmor.d, nor can we practically add other ways of
// telling the system to keep our profile loaded, in order to make
// sure that we keep the default profile enabled we dynamically
// reload it if necessary.
if err := ensureDefaultAppArmorProfile(); err != nil {
return err
}
}
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.ApparmorProfile = appArmorProfile
}
return nil
}
}
// WithCapabilities sets the container's capabilties
func WithCapabilities(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
capabilities, err := caps.TweakCapabilities(
caps.DefaultCapabilities(),
c.HostConfig.CapAdd,
c.HostConfig.CapDrop,
c.HostConfig.Privileged,
)
if err != nil {
return err
}
return oci.SetCapabilities(s, capabilities)
}
}
func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
p, err := getPath()
if err != nil {
return "", err
}
return c.GetResourcePath(p)
}
func getUser(c *container.Container, username string) (specs.User, error) {
var usr specs.User
passwdPath, err := resourcePath(c, user.GetPasswdPath)
if err != nil {
return usr, err
}
groupPath, err := resourcePath(c, user.GetGroupPath)
if err != nil {
return usr, err
}
execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
if err != nil {
return usr, err
}
usr.UID = uint32(execUser.Uid)
usr.GID = uint32(execUser.Gid)
usr.AdditionalGids = []uint32{usr.GID}
var addGroups []int
if len(c.HostConfig.GroupAdd) > 0 {
addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
if err != nil {
return usr, err
}
}
for _, g := range append(execUser.Sgids, addGroups...) {
usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
}
return usr, nil
}
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
for i, n := range s.Linux.Namespaces {
if n.Type == ns.Type {
s.Linux.Namespaces[i] = ns
return
}
}
s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
}
// WithNamespaces sets the container's namespaces
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
userNS := false
// user
if c.HostConfig.UsernsMode.IsPrivate() {
if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
userNS = true
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
})
s.Linux.UIDMappings = specMapping(uidMap)
s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
}
}
// network
if !c.Config.NetworkDisabled {
networkMode := c.HostConfig.NetworkMode
switch {
case networkMode.IsContainer():
nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer())
if err != nil {
return err
}
setNamespace(s, specs.LinuxNamespace{
Type: specs.NetworkNamespace,
Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
})
if userNS {
// to share a net namespace, the containers must also share a user namespace.
//
// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
})
}
case networkMode.IsHost():
oci.RemoveNamespace(s, specs.NetworkNamespace)
default:
setNamespace(s, specs.LinuxNamespace{
Type: specs.NetworkNamespace,
})
}
}
// ipc
ipcMode := c.HostConfig.IpcMode
if !ipcMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
}
switch {
case ipcMode.IsContainer():
ic, err := daemon.getIPCContainer(ipcMode.Container())
if err != nil {
return errors.Wrap(err, "failed to join IPC namespace")
}
setNamespace(s, specs.LinuxNamespace{
Type: specs.IPCNamespace,
Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
})
if userNS {
// to share a IPC namespace, the containers must also share a user namespace.
//
// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
})
}
case ipcMode.IsHost():
oci.RemoveNamespace(s, specs.IPCNamespace)
case ipcMode.IsEmpty():
// A container was created by an older version of the daemon.
// The default behavior used to be what is now called "shareable".
fallthrough
case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
setNamespace(s, specs.LinuxNamespace{
Type: specs.IPCNamespace,
})
}
// pid
pidMode := c.HostConfig.PidMode
if !pidMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode))
}
switch {
case pidMode.IsContainer():
pc, err := daemon.getPIDContainer(pidMode.Container())
if err != nil {
return errors.Wrap(err, "failed to join PID namespace")
}
setNamespace(s, specs.LinuxNamespace{
Type: specs.PIDNamespace,
Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
})
if userNS {
// to share a PID namespace, the containers must also share a user namespace.
//
// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
})
}
case pidMode.IsHost():
oci.RemoveNamespace(s, specs.PIDNamespace)
default:
setNamespace(s, specs.LinuxNamespace{
Type: specs.PIDNamespace,
})
}
// uts
if !c.HostConfig.UTSMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
}
if c.HostConfig.UTSMode.IsHost() {
oci.RemoveNamespace(s, specs.UTSNamespace)
s.Hostname = ""
}
// cgroup
if !c.HostConfig.CgroupnsMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
}
if c.HostConfig.CgroupnsMode.IsPrivate() {
setNamespace(s, specs.LinuxNamespace{
Type: specs.CgroupNamespace,
})
}
return nil
}
}
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
var ids []specs.LinuxIDMapping
for _, item := range s {
ids = append(ids, specs.LinuxIDMapping{
HostID: uint32(item.HostID),
ContainerID: uint32(item.ContainerID),
Size: uint32(item.Size),
})
}
return ids
}
// Get the source mount point of directory passed in as argument. Also return
// optional fields.
func getSourceMount(source string) (string, string, error) {
// Ensure any symlinks are resolved.
sourcePath, err := filepath.EvalSymlinks(source)
if err != nil {
return "", "", err
}
mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
if err != nil {
return "", "", err
}
if len(mi) < 1 {
return "", "", fmt.Errorf("Can't find mount point of %s", source)
}
// find the longest mount point
var idx, maxlen int
for i := range mi {
if len(mi[i].Mountpoint) > maxlen {
maxlen = len(mi[i].Mountpoint)
idx = i
}
}
return mi[idx].Mountpoint, mi[idx].Optional, nil
}
const (
sharedPropagationOption = "shared:"
slavePropagationOption = "master:"
)
// hasMountInfoOption checks if any of the passed any of the given option values
// are set in the passed in option string.
func hasMountInfoOption(opts string, vals ...string) bool {
for _, opt := range strings.Split(opts, " ") {
for _, val := range vals {
if strings.HasPrefix(opt, val) {
return true
}
}
}
return false
}
// Ensure mount point on which path is mounted, is shared.
func ensureShared(path string) error {
sourceMount, optionalOpts, err := getSourceMount(path)
if err != nil {
return err
}
// Make sure source mount point is shared.
if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
}
return nil
}
// Ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave(path string) error {
sourceMount, optionalOpts, err := getSourceMount(path)
if err != nil {
return err
}
if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
}
return nil
}
var (
mountPropagationMap = map[string]int{
"private": mount.PRIVATE,
"rprivate": mount.RPRIVATE,
"shared": mount.SHARED,
"rshared": mount.RSHARED,
"slave": mount.SLAVE,
"rslave": mount.RSLAVE,
}
mountPropagationReverseMap = map[int]string{
mount.PRIVATE: "private",
mount.RPRIVATE: "rprivate",
mount.SHARED: "shared",
mount.RSHARED: "rshared",
mount.SLAVE: "slave",
mount.RSLAVE: "rslave",
}
)
// inSlice tests whether a string is contained in a slice of strings or not.
// Comparison is case sensitive
func inSlice(slice []string, s string) bool {
for _, ss := range slice {
if s == ss {
return true
}
}
return false
}
// withMounts sets the container's mounts
func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container, ms []container.Mount) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
sort.Sort(mounts(ms))
mounts := ms
userMounts := make(map[string]struct{})
for _, m := range mounts {
userMounts[m.Destination] = struct{}{}
}
// Copy all mounts from spec to defaultMounts, except for
// - mounts overridden by a user supplied mount;
// - all mounts under /dev if a user supplied /dev is present;
// - /dev/shm, in case IpcMode is none.
// While at it, also
// - set size for /dev/shm from shmsize.
defaultMounts := s.Mounts[:0]
_, mountDev := userMounts["/dev"]
for _, m := range s.Mounts {
if _, ok := userMounts[m.Destination]; ok {
// filter out mount overridden by a user supplied mount
continue
}
if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
// filter out everything under /dev if /dev is user-mounted
continue
}
if m.Destination == "/dev/shm" {
if c.HostConfig.IpcMode.IsNone() {
// filter out /dev/shm for "none" IpcMode
continue
}
// set size for /dev/shm mount from spec
sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
m.Options = append(m.Options, sizeOpt)
}
defaultMounts = append(defaultMounts, m)
}
s.Mounts = defaultMounts
for _, m := range mounts {
if m.Source == "tmpfs" {
data := m.Data
parser := volumemounts.NewParser()
options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
if data != "" {
options = append(options, strings.Split(data, ",")...)
}
merged, err := mount.MergeTmpfsOptions(options)
if err != nil {
return err
}
s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
continue
}
mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
// Determine property of RootPropagation based on volume
// properties. If a volume is shared, then keep root propagation
// shared. This should work for slave and private volumes too.
//
// For slave volumes, it can be either [r]shared/[r]slave.
//
// For private volumes any root propagation value should work.
pFlag := mountPropagationMap[m.Propagation]
switch pFlag {
case mount.SHARED, mount.RSHARED:
if err := ensureShared(m.Source); err != nil {
return err
}
rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
if rootpg != mount.SHARED && rootpg != mount.RSHARED {
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
}
case mount.SLAVE, mount.RSLAVE:
var fallback bool
if err := ensureSharedOrSlave(m.Source); err != nil {
// For backwards compatibility purposes, treat mounts from the daemon root
// as special since we automatically add rslave propagation to these mounts
// when the user did not set anything, so we should fallback to the old
// behavior which is to use private propagation which is normally the
// default.
if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
return err
}
cm, ok := c.MountPoints[m.Destination]
if !ok {
return err
}
if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
// This means the user explicitly set a propagation, do not fallback in that case.
return err
}
fallback = true
log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
}
if !fallback {
rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
}
}
}
bindMode := "rbind"
if m.NonRecursive {
bindMode = "bind"
}
opts := []string{bindMode}
if !m.Writable {
rro := true
if m.ReadOnlyNonRecursive {
rro = false
if m.ReadOnlyForceRecursive {
return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
}
}
if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil {
rro = false
if m.ReadOnlyForceRecursive {
return rroErr
}
}
if rro {
opts = append(opts, "rro")
} else {
opts = append(opts, "ro")
}
}
if pFlag != 0 {
opts = append(opts, mountPropagationReverseMap[pFlag])
}
// If we are using user namespaces, then we must make sure that we
// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
// "mount" when we bind-mount. The reason for this is that at the point
// when runc sets up the root filesystem, it is already inside a user
// namespace, and thus cannot change any flags that are locked.
if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() {
unprivOpts, err := mountopts.UnprivilegedMountFlags(m.Source)
if err != nil {
return err
}
opts = append(opts, unprivOpts...)
}
mt.Options = opts
s.Mounts = append(s.Mounts, mt)
}
if s.Root.Readonly {
for i, m := range s.Mounts {
switch m.Destination {
case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
continue
}
if _, ok := userMounts[m.Destination]; !ok {
if !inSlice(m.Options, "ro") {
s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
}
}
}
}
if c.HostConfig.Privileged {
// clear readonly for /sys
for i := range s.Mounts {
if s.Mounts[i].Destination == "/sys" {
clearReadOnly(&s.Mounts[i])
}
}
if s.Linux != nil {
s.Linux.ReadonlyPaths = nil
s.Linux.MaskedPaths = nil
}
}
// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
for i, m := range s.Mounts {
if m.Type == "cgroup" {
clearReadOnly(&s.Mounts[i])
}
}
}
return nil
}
}
// sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
// exist, so do not add the default ones if running on an old kernel.
func sysctlExists(s string) bool {
f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
_, err := os.Stat(f)
return err == nil
}
// withCommonOptions sets common docker options
func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if c.BaseFS == "" {
return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
}
linkedEnv, err := daemon.setupLinkedContainers(c)
if err != nil {
return err
}
s.Root = &specs.Root{
Path: c.BaseFS,
Readonly: c.HostConfig.ReadonlyRootfs,
}
if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
return err
}
cwd := c.Config.WorkingDir
if len(cwd) == 0 {
cwd = "/"
}
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.Args = append([]string{c.Path}, c.Args...)
// only add the custom init if it is specified and the container is running in its
// own private pid namespace. It does not make sense to add if it is running in the
// host namespace or another container's pid namespace where we already have an init
if c.HostConfig.PidMode.IsPrivate() {
if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
(c.HostConfig.Init == nil && daemonCfg.Init) {
s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
if err != nil {
return err
}
s.Mounts = append(s.Mounts, specs.Mount{
Destination: inContainerInitPath,
Type: "bind",
Source: path,
Options: []string{"bind", "ro"},
})
}
}
s.Process.Cwd = cwd
s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
s.Process.Terminal = c.Config.Tty
s.Hostname = c.Config.Hostname
setLinuxDomainname(c, s)
// Add default sysctls that are generally safe and useful; currently we
// grant the capabilities to allow these anyway. You can override if
// you want to restore the original behaviour.
// We do not set network sysctls if network namespace is host, or if we are
// joining an existing namespace, only if we create a new net namespace.
if c.HostConfig.NetworkMode.IsPrivate() {
// We cannot set up ping socket support in a user namespace
userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
// allow unprivileged ICMP echo sockets without CAP_NET_RAW
s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
}
// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
}
}
return nil
}
}
// withCgroups sets the container's cgroups
func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
var cgroupsPath string
scopePrefix := "docker"
parent := "/docker"
useSystemd := UsingSystemd(daemonCfg)
if useSystemd {
parent = "system.slice"
if daemonCfg.Rootless {
parent = "user.slice"
}
}
if c.HostConfig.CgroupParent != "" {
parent = c.HostConfig.CgroupParent
} else if daemonCfg.CgroupParent != "" {
parent = daemonCfg.CgroupParent
}
if useSystemd {
cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
} else {
cgroupsPath = filepath.Join(parent, c.ID)
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Linux.CgroupsPath = cgroupsPath
// the rest is only needed for CPU RT controller
if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
return nil
}
p := cgroupsPath
if useSystemd {
initPath, err := cgroups.GetInitCgroup("cpu")
if err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
_, err = cgroups.GetOwnCgroup("cpu")
if err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
p = filepath.Join(initPath, s.Linux.CgroupsPath)
}
// Clean path to guard against things like ../../../BAD
parentPath := filepath.Dir(p)
if !filepath.IsAbs(parentPath) {
parentPath = filepath.Clean("/" + parentPath)
}
mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
if err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
// When docker is run inside docker, the root is based of the host cgroup.
// Should this be handled in runc/libcontainer/cgroups ?
if strings.HasPrefix(root, "/docker/") {
root = "/"
}
mnt = filepath.Join(mnt, root)
if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
return nil
}
}
// WithDevices sets the container's devices
func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
// Build lists of devices allowed and created within the container.
var devs []specs.LinuxDevice
devPermissions := s.Linux.Resources.Devices
if c.HostConfig.Privileged {
hostDevices, err := coci.HostDevices()
if err != nil {
return err
}
devs = append(devs, hostDevices...)
// adding device mappings in privileged containers
for _, deviceMapping := range c.HostConfig.Devices {
// issue a warning that custom cgroup permissions are ignored in privileged mode
if deviceMapping.CgroupPermissions != "rwm" {
log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
}
// issue a warning that the device path already exists via /dev mounting in privileged mode
if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
continue
}
d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
if err != nil {
return err
}
devs = append(devs, d...)
}
devPermissions = []specs.LinuxDeviceCgroup{
{
Allow: true,
Access: "rwm",
},
}
} else {
for _, deviceMapping := range c.HostConfig.Devices {
d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
if err != nil {
return err
}
devs = append(devs, d...)
devPermissions = append(devPermissions, dPermissions...)
}
var err error
devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
if err != nil {
return err
}
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &specs.LinuxResources{}
}
s.Linux.Devices = append(s.Linux.Devices, devs...)
s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
for _, req := range c.HostConfig.DeviceRequests {
if err := daemon.handleDevice(req, s); err != nil {
return err
}
}
return nil
}
}
// WithResources applies the container resources
func WithResources(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
r := c.HostConfig.Resources
weightDevices, err := getBlkioWeightDevices(r)
if err != nil {
return err
}
readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
if err != nil {
return err
}
writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
if err != nil {
return err
}
readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
if err != nil {
return err
}
writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
if err != nil {
return err
}
memoryRes := getMemoryResources(r)
cpuRes, err := getCPUResources(r)
if err != nil {
return err
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &specs.LinuxResources{}
}
s.Linux.Resources.Memory = memoryRes
s.Linux.Resources.CPU = cpuRes
s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
WeightDevice: weightDevices,
ThrottleReadBpsDevice: readBpsDevice,
ThrottleWriteBpsDevice: writeBpsDevice,
ThrottleReadIOPSDevice: readIOpsDevice,
ThrottleWriteIOPSDevice: writeIOpsDevice,
}
if r.BlkioWeight != 0 {
w := r.BlkioWeight
s.Linux.Resources.BlockIO.Weight = &w
}
s.Linux.Resources.Pids = getPidsLimit(r)
return nil
}
}
// WithSysctls sets the container's sysctls
func WithSysctls(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if len(c.HostConfig.Sysctls) == 0 {
return nil
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
if s.Linux.Sysctl == nil {
s.Linux.Sysctl = make(map[string]string)
}
// We merge the sysctls injected above with the HostConfig (latter takes
// precedence for backwards-compatibility reasons).
for k, v := range c.HostConfig.Sysctls {
s.Linux.Sysctl[k] = v
}
return nil
}
}
// WithUser sets the container's user
func WithUser(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if s.Process == nil {
s.Process = &specs.Process{}
}
var err error
s.Process.User, err = getUser(c, c.Config.User)
return err
}
}
func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (retSpec *specs.Spec, err error) {
var (
opts []coci.SpecOpts
s = oci.DefaultSpec()
)
opts = append(opts,
withCommonOptions(daemon, &daemonCfg.Config, c),
withCgroups(daemon, &daemonCfg.Config, c),
WithResources(c),
WithSysctls(c),
WithDevices(daemon, c),
withRlimits(daemon, &daemonCfg.Config, c),
WithNamespaces(daemon, c),
WithCapabilities(c),
WithSeccomp(daemon, c),
withMounts(daemon, daemonCfg, c, mounts),
WithApparmor(c),
WithSelinux(c),
WithOOMScore(&c.HostConfig.OomScoreAdj),
coci.WithAnnotations(c.HostConfig.Annotations),
WithUser(c),
)
if c.NoNewPrivileges {
opts = append(opts, coci.WithNoNewPrivileges)
}
if c.Config.Tty {
opts = append(opts, WithConsoleSize(c))
}
// Set the masked and readonly paths with regard to the host config options if they are set.
if c.HostConfig.MaskedPaths != nil {
opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
}
if c.HostConfig.ReadonlyPaths != nil {
opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
}
if daemonCfg.Rootless {
opts = append(opts, withRootless(daemon, &daemonCfg.Config))
} else if userns.RunningInUserNS() {
opts = append(opts, withRootfulInRootless(daemon, &daemonCfg.Config))
}
var snapshotter, snapshotKey string
if daemon.UsesSnapshotter() {
snapshotter = daemon.imageService.StorageDriver()
snapshotKey = c.ID
}
return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{
ID: c.ID,
Snapshotter: snapshotter,
SnapshotKey: snapshotKey,
}, &s, opts...)
}
func clearReadOnly(m *specs.Mount) {
var opt []string
for _, o := range m.Options {
if o != "ro" {
opt = append(opt, o)
}
}
m.Options = opt
}
// mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) {
ulimits := c.Ulimits
// Merge ulimits with daemon defaults
ulIdx := make(map[string]struct{})
for _, ul := range ulimits {
ulIdx[ul.Name] = struct{}{}
}
for name, ul := range daemonCfg.Ulimits {
if _, exists := ulIdx[name]; !exists {
ulimits = append(ulimits, ul)
}
}
c.Ulimits = ulimits
}