moby/daemon/oci_linux.go

package daemon // import "github.com/docker/docker/daemon"
import (
"context"
"fmt"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
cdcgroups "github.com/containerd/cgroups/v3"
"github.com/containerd/containerd/containers"
coci "github.com/containerd/containerd/oci"
"github.com/containerd/containerd/pkg/apparmor"
"github.com/containerd/containerd/pkg/userns"
"github.com/containerd/log"
containertypes "github.com/docker/docker/api/types/container"
"github.com/docker/docker/container"
dconfig "github.com/docker/docker/daemon/config"
Fix validation of IpcMode, PidMode, UTSMode, CgroupnsMode These HostConfig properties were not validated until the OCI spec for the container was created, which meant that `container run` and `docker create` would accept invalid values, and the invalid value would not be detected until `start` was called, returning a 500 "internal server error", as well as errors from containerd ("cleanup: failed to delete container from containerd: no such container") in the daemon logs. As a result, a faulty container was created, and the container state remained in the `created` state. This patch: - Updates `oci.WithNamespaces()` to return the correct `errdefs.InvalidParameter` - Updates `verifyPlatformContainerSettings()` to validate these settings, so that an error is returned when _creating_ the container. Before this patch: docker run -dit --ipc=shared --name foo busybox 2a00d74e9fbb7960c4718def8f6c74fa8ee754030eeb93ee26a516e27d4d029f docker: Error response from daemon: Invalid IPC mode: shared. docker ps -a --filter name=foo CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 2a00d74e9fbb busybox "sh" About a minute ago Created foo After this patch: docker run -dit --ipc=shared --name foo busybox docker: Error response from daemon: invalid IPC mode: shared. docker ps -a --filter name=foo CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES An integration test was added to verify the new validation, which can be run with: make BIND_DIR=. TEST_FILTER=TestCreateInvalidHostConfig DOCKER_GRAPHDRIVER=vfs test-integration Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-05-25 11:17:16 +00:00
"github.com/docker/docker/errdefs"
"github.com/docker/docker/oci"
"github.com/docker/docker/oci/caps"
"github.com/docker/docker/pkg/idtools"
"github.com/docker/docker/pkg/rootless/specconv"
volumemounts "github.com/docker/docker/volume/mounts"
"github.com/moby/sys/mount"
"github.com/moby/sys/mountinfo"
"github.com/moby/sys/user"
"github.com/opencontainers/runc/libcontainer/cgroups"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
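// inContainerInitPath is the path at which the init binary is bind-mounted
// inside the container. With the stock daemon config, where DefaultInitBinary
// is "docker-init", this resolves to "/sbin/docker-init".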
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
// withRlimits sets the container's rlimits, merging them with the daemon's defaults
func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
var rlimits []specs.POSIXRlimit
// We want to leave the original HostConfig alone so make a copy here
hostConfig := *c.HostConfig
// Merge with the daemon defaults
daemon.mergeUlimits(&hostConfig, daemonCfg)
for _, ul := range hostConfig.Ulimits {
rlimits = append(rlimits, specs.POSIXRlimit{
Type: "RLIMIT_" + strings.ToUpper(ul.Name),
Soft: uint64(ul.Soft),
Hard: uint64(ul.Hard),
})
}
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.Rlimits = rlimits
return nil
}
}
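// For example, a HostConfig ulimit of {Name: "nofile", Soft: 1024, Hard: 2048}
// is translated by withRlimits into:
//
//	specs.POSIXRlimit{Type: "RLIMIT_NOFILE", Soft: 1024, Hard: 2048}
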
// withRootless sets the spec to the rootless configuration
func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
var v2Controllers []string
if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
if cdcgroups.Mode() != cdcgroups.Unified {
return errors.New("rootless systemd driver doesn't support cgroup v1")
}
rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
if rootlesskitParentEUID == "" {
return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
}
euid, err := strconv.Atoi(rootlesskitParentEUID)
if err != nil {
return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
}
controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
controllersFile, err := os.ReadFile(controllersPath)
if err != nil {
return err
}
v2Controllers = strings.Fields(string(controllersFile))
}
return specconv.ToRootless(s, v2Controllers)
}
}
// withRootfulInRootless is used for "rootful-in-rootless" dind;
// the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc.
func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
specconv.ToRootfulInRootless(s)
return nil
}
}
// WithOOMScore sets the oom score
func WithOOMScore(score *int) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.OOMScoreAdj = score
return nil
}
}
// WithSelinux sets the selinux labels
func WithSelinux(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if s.Process == nil {
s.Process = &specs.Process{}
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Process.SelinuxLabel = c.GetProcessLabel()
s.Linux.MountLabel = c.MountLabel
return nil
}
}
// WithApparmor sets the apparmor profile
func WithApparmor(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if apparmor.HostSupports() {
var appArmorProfile string
if c.AppArmorProfile != "" {
appArmorProfile = c.AppArmorProfile
} else if c.HostConfig.Privileged {
appArmorProfile = unconfinedAppArmorProfile
} else {
appArmorProfile = defaultAppArmorProfile
}
if appArmorProfile == defaultAppArmorProfile {
// Unattended upgrades and other fun services can unload AppArmor
// profiles inadvertently. Since we cannot store our profile in
// /etc/apparmor.d, nor can we practically add other ways of
// telling the system to keep our profile loaded, in order to make
// sure that we keep the default profile enabled we dynamically
// reload it if necessary.
if err := ensureDefaultAppArmorProfile(); err != nil {
return err
}
}
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.ApparmorProfile = appArmorProfile
}
return nil
}
}
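// As a rough sketch of the selection above, assuming the usual values of the
// profile constants defined elsewhere in the daemon ("docker-default" and
// "unconfined"):
//
//	c.AppArmorProfile set -> that profile, verbatim
//	privileged container  -> "unconfined"
//	otherwise             -> "docker-default" (reloaded if it was unloaded)
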
// WithCapabilities sets the container's capabilities
func WithCapabilities(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
capabilities, err := caps.TweakCapabilities(
caps.DefaultCapabilities(),
c.HostConfig.CapAdd,
c.HostConfig.CapDrop,
c.HostConfig.Privileged,
)
if err != nil {
return err
}
return oci.SetCapabilities(s, capabilities)
}
}
func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
p, err := getPath()
if err != nil {
return "", err
}
return c.GetResourcePath(p)
}
func getUser(c *container.Container, username string) (specs.User, error) {
var usr specs.User
passwdPath, err := resourcePath(c, user.GetPasswdPath)
if err != nil {
return usr, err
}
groupPath, err := resourcePath(c, user.GetGroupPath)
if err != nil {
return usr, err
}
execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
if err != nil {
return usr, err
}
usr.UID = uint32(execUser.Uid)
usr.GID = uint32(execUser.Gid)
usr.AdditionalGids = []uint32{usr.GID}
var addGroups []int
if len(c.HostConfig.GroupAdd) > 0 {
addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
if err != nil {
return usr, err
}
}
for _, g := range append(execUser.Sgids, addGroups...) {
usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
}
return usr, nil
}
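// As an illustration (the concrete IDs are hypothetical): for a container user
// with UID 1001, primary GID 1000, supplementary groups 27 and 100, and one
// extra group 29 from HostConfig.GroupAdd, getUser returns roughly:
//
//	specs.User{UID: 1001, GID: 1000, AdditionalGids: []uint32{1000, 27, 100, 29}}
//
// Note that the primary GID is always the first entry in AdditionalGids.
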
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
for i, n := range s.Linux.Namespaces {
if n.Type == ns.Type {
s.Linux.Namespaces[i] = ns
return
}
}
s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
}
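// setNamespace is an upsert: calling it again with the same namespace type
// replaces the earlier entry instead of appending a duplicate. For example,
//
//	setNamespace(s, specs.LinuxNamespace{Type: specs.PIDNamespace})
//	setNamespace(s, specs.LinuxNamespace{Type: specs.PIDNamespace, Path: "/proc/42/ns/pid"})
//
// leaves a single PID namespace entry with Path "/proc/42/ns/pid".
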
// WithNamespaces sets the container's namespaces
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
userNS := false
// user
if c.HostConfig.UsernsMode.IsPrivate() {
if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
userNS = true
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
})
s.Linux.UIDMappings = specMapping(uidMap)
s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
}
}
// network
if !c.Config.NetworkDisabled {
networkMode := c.HostConfig.NetworkMode
switch {
case networkMode.IsContainer():
nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer())
if err != nil {
return err
}
setNamespace(s, specs.LinuxNamespace{
Type: specs.NetworkNamespace,
Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
})
if userNS {
// to share a net namespace, the containers must also share a user namespace.
//
// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
})
}
case networkMode.IsHost():
oci.RemoveNamespace(s, specs.NetworkNamespace)
default:
setNamespace(s, specs.LinuxNamespace{
Type: specs.NetworkNamespace,
})
}
}
// ipc
ipcMode := c.HostConfig.IpcMode
if !ipcMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
}
switch {
case ipcMode.IsContainer():
ic, err := daemon.getIPCContainer(ipcMode.Container())
if err != nil {
return errors.Wrap(err, "failed to join IPC namespace")
}
setNamespace(s, specs.LinuxNamespace{
Type: specs.IPCNamespace,
Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
})
if userNS {
// to share a IPC namespace, the containers must also share a user namespace.
//
// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
})
}
case ipcMode.IsHost():
oci.RemoveNamespace(s, specs.IPCNamespace)
case ipcMode.IsEmpty():
// A container was created by an older version of the daemon.
// The default behavior used to be what is now called "shareable".
fallthrough
case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
setNamespace(s, specs.LinuxNamespace{
Type: specs.IPCNamespace,
})
}
// pid
pidMode := c.HostConfig.PidMode
if !pidMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode))
}
switch {
case pidMode.IsContainer():
pc, err := daemon.getPIDContainer(pidMode.Container())
if err != nil {
return errors.Wrap(err, "failed to join PID namespace")
}
setNamespace(s, specs.LinuxNamespace{
Type: specs.PIDNamespace,
Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
})
if userNS {
// to share a PID namespace, the containers must also share a user namespace.
//
// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
})
}
case pidMode.IsHost():
oci.RemoveNamespace(s, specs.PIDNamespace)
default:
setNamespace(s, specs.LinuxNamespace{
Type: specs.PIDNamespace,
})
}
// uts
if !c.HostConfig.UTSMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
}
if c.HostConfig.UTSMode.IsHost() {
oci.RemoveNamespace(s, specs.UTSNamespace)
s.Hostname = ""
}
// cgroup
if !c.HostConfig.CgroupnsMode.Valid() {
return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
}
if c.HostConfig.CgroupnsMode.IsPrivate() {
setNamespace(s, specs.LinuxNamespace{
Type: specs.CgroupNamespace,
})
}
return nil
}
}
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
var ids []specs.LinuxIDMapping
for _, item := range s {
ids = append(ids, specs.LinuxIDMapping{
HostID: uint32(item.HostID),
ContainerID: uint32(item.ContainerID),
Size: uint32(item.Size),
})
}
return ids
}
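// For instance, an idtools.IDMap of {ContainerID: 0, HostID: 100000, Size: 65536}
// becomes specs.LinuxIDMapping{ContainerID: 0, HostID: 100000, Size: 65536},
// i.e. container root is mapped to host UID/GID 100000.
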
// Get the source mount point of the directory passed in as an argument. Also
// return the mount's optional fields.
func getSourceMount(source string) (string, string, error) {
// Ensure any symlinks are resolved.
sourcePath, err := filepath.EvalSymlinks(source)
if err != nil {
return "", "", err
}
mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
if err != nil {
return "", "", err
}
if len(mi) < 1 {
return "", "", fmt.Errorf("Can't find mount point of %s", source)
}
// find the longest mount point
var idx, maxlen int
for i := range mi {
if len(mi[i].Mountpoint) > maxlen {
maxlen = len(mi[i].Mountpoint)
idx = i
}
}
return mi[idx].Mountpoint, mi[idx].Optional, nil
}
const (
sharedPropagationOption = "shared:"
slavePropagationOption = "master:"
)
// hasMountInfoOption checks if any of the given option values are set in the
// passed-in options string.
func hasMountInfoOption(opts string, vals ...string) bool {
for _, opt := range strings.Split(opts, " ") {
for _, val := range vals {
if strings.HasPrefix(opt, val) {
return true
}
}
}
return false
}
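// For example, for a mountinfo optional-fields string of "shared:1 master:2",
// both hasMountInfoOption(opts, sharedPropagationOption) and
// hasMountInfoOption(opts, slavePropagationOption) return true.
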
// Ensure that the mount point on which path is mounted is shared.
func ensureShared(path string) error {
sourceMount, optionalOpts, err := getSourceMount(path)
if err != nil {
return err
}
// Make sure source mount point is shared.
if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
}
return nil
}
// Ensure that the mount point on which path is mounted is either shared or slave.
func ensureSharedOrSlave(path string) error {
sourceMount, optionalOpts, err := getSourceMount(path)
if err != nil {
return err
}
if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
}
return nil
}
// Get the set of mount flags that are set on the mount that contains the given
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
// bind-mounting "with options" will not fail with user namespaces, due to
// kernel restrictions that require user namespace mounts to preserve
// CL_UNPRIVILEGED locked flags.
func getUnprivilegedMountFlags(path string) ([]string, error) {
var statfs unix.Statfs_t
if err := unix.Statfs(path, &statfs); err != nil {
return nil, err
}
// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
unprivilegedFlags := map[uint64]string{
unix.MS_RDONLY: "ro",
unix.MS_NODEV: "nodev",
unix.MS_NOEXEC: "noexec",
unix.MS_NOSUID: "nosuid",
unix.MS_NOATIME: "noatime",
unix.MS_RELATIME: "relatime",
unix.MS_NODIRATIME: "nodiratime",
}
var flags []string
for mask, flag := range unprivilegedFlags {
if uint64(statfs.Flags)&mask == mask {
flags = append(flags, flag)
}
}
return flags, nil
}
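// As a sketch of the behaviour: if the source path lives on a filesystem
// mounted with "nosuid,nodev", getUnprivilegedMountFlags returns "nodev" and
// "nosuid" (in no particular order, as they come from map iteration), and the
// caller re-applies those flags to the bind mount so runc can satisfy the
// kernel's CL_UNPRIVILEGED restrictions.
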
var (
mountPropagationMap = map[string]int{
"private": mount.PRIVATE,
"rprivate": mount.RPRIVATE,
"shared": mount.SHARED,
"rshared": mount.RSHARED,
"slave": mount.SLAVE,
"rslave": mount.RSLAVE,
}
mountPropagationReverseMap = map[int]string{
mount.PRIVATE: "private",
mount.RPRIVATE: "rprivate",
mount.SHARED: "shared",
mount.RSHARED: "rshared",
mount.SLAVE: "slave",
mount.RSLAVE: "rslave",
}
)
// inSlice tests whether a string is contained in a slice of strings.
// The comparison is case-sensitive.
func inSlice(slice []string, s string) bool {
for _, ss := range slice {
if s == ss {
return true
}
}
return false
}
// withMounts sets the container's mounts
func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container, ms []container.Mount) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
sort.Sort(mounts(ms))
mounts := ms
userMounts := make(map[string]struct{})
for _, m := range mounts {
userMounts[m.Destination] = struct{}{}
}
// Copy all mounts from spec to defaultMounts, except for
// - mounts overridden by a user supplied mount;
// - all mounts under /dev if a user supplied /dev is present;
// - /dev/shm, in case IpcMode is none.
// While at it, also
// - set size for /dev/shm from shmsize.
defaultMounts := s.Mounts[:0]
_, mountDev := userMounts["/dev"]
for _, m := range s.Mounts {
if _, ok := userMounts[m.Destination]; ok {
// filter out mount overridden by a user supplied mount
continue
}
if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
// filter out everything under /dev if /dev is user-mounted
continue
}
if m.Destination == "/dev/shm" {
if c.HostConfig.IpcMode.IsNone() {
// filter out /dev/shm for "none" IpcMode
continue
}
// set size for /dev/shm mount from spec
sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
m.Options = append(m.Options, sizeOpt)
}
defaultMounts = append(defaultMounts, m)
}
s.Mounts = defaultMounts
for _, m := range mounts {
if m.Source == "tmpfs" {
data := m.Data
parser := volumemounts.NewParser()
options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
if data != "" {
options = append(options, strings.Split(data, ",")...)
}
merged, err := mount.MergeTmpfsOptions(options)
if err != nil {
return err
}
s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
continue
}
mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
// Determine property of RootPropagation based on volume
// properties. If a volume is shared, then keep root propagation
// shared. This should work for slave and private volumes too.
//
// For slave volumes, it can be either [r]shared/[r]slave.
//
// For private volumes any root propagation value should work.
pFlag := mountPropagationMap[m.Propagation]
switch pFlag {
case mount.SHARED, mount.RSHARED:
if err := ensureShared(m.Source); err != nil {
return err
}
rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
if rootpg != mount.SHARED && rootpg != mount.RSHARED {
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
}
case mount.SLAVE, mount.RSLAVE:
var fallback bool
if err := ensureSharedOrSlave(m.Source); err != nil {
// For backwards-compatibility purposes, treat mounts from the daemon root
// as special: we automatically add rslave propagation to these mounts
// when the user did not set anything, so we should fall back to the old
// behavior, which is to use private propagation (normally the default).
if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
return err
}
cm, ok := c.MountPoints[m.Destination]
if !ok {
return err
}
if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
// The user explicitly set a propagation; do not fall back in that case.
return err
}
fallback = true
log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
}
if !fallback {
rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
}
}
}
bindMode := "rbind"
if m.NonRecursive {
bindMode = "bind"
}
opts := []string{bindMode}
if !m.Writable {
rro := true
if m.ReadOnlyNonRecursive {
rro = false
if m.ReadOnlyForceRecursive {
return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
}
}
if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil {
rro = false
if m.ReadOnlyForceRecursive {
return rroErr
}
}
if rro {
opts = append(opts, "rro")
} else {
opts = append(opts, "ro")
}
}
if pFlag != 0 {
opts = append(opts, mountPropagationReverseMap[pFlag])
}
// If we are using user namespaces, then we must make sure that we
// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
// "mount" when we bind-mount. The reason for this is that at the point
// when runc sets up the root filesystem, it is already inside a user
// namespace, and thus cannot change any flags that are locked.
if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() {
unprivOpts, err := getUnprivilegedMountFlags(m.Source)
if err != nil {
return err
}
opts = append(opts, unprivOpts...)
}
mt.Options = opts
s.Mounts = append(s.Mounts, mt)
}
if s.Root.Readonly {
for i, m := range s.Mounts {
switch m.Destination {
case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
continue
}
if _, ok := userMounts[m.Destination]; !ok {
if !inSlice(m.Options, "ro") {
s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
}
}
}
}
if c.HostConfig.Privileged {
// clear readonly for /sys
for i := range s.Mounts {
if s.Mounts[i].Destination == "/sys" {
clearReadOnly(&s.Mounts[i])
}
}
if s.Linux != nil {
s.Linux.ReadonlyPaths = nil
s.Linux.MaskedPaths = nil
}
}
// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
for i, m := range s.Mounts {
if m.Type == "cgroup" {
clearReadOnly(&s.Mounts[i])
}
}
}
return nil
}
}
// sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
// exist, so do not add the default ones if running on an old kernel.
func sysctlExists(s string) bool {
f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
_, err := os.Stat(f)
return err == nil
}
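// For example, sysctlExists("net.ipv4.ip_unprivileged_port_start") checks for
// the existence of /proc/sys/net/ipv4/ip_unprivileged_port_start.
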
// withCommonOptions sets common docker options
func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if c.BaseFS == "" {
return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
}
linkedEnv, err := daemon.setupLinkedContainers(c)
if err != nil {
return err
}
s.Root = &specs.Root{
Path: c.BaseFS,
Readonly: c.HostConfig.ReadonlyRootfs,
}
if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
return err
}
cwd := c.Config.WorkingDir
if len(cwd) == 0 {
cwd = "/"
}
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.Args = append([]string{c.Path}, c.Args...)
// Only add the custom init if it is specified and the container is running in its
// own private PID namespace. It does not make sense to add one if the container is
// running in the host PID namespace or another container's PID namespace, where an
// init already exists.
if c.HostConfig.PidMode.IsPrivate() {
if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
(c.HostConfig.Init == nil && daemonCfg.Init) {
s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
if err != nil {
return err
}
s.Mounts = append(s.Mounts, specs.Mount{
Destination: inContainerInitPath,
Type: "bind",
Source: path,
Options: []string{"bind", "ro"},
})
}
}
s.Process.Cwd = cwd
s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
s.Process.Terminal = c.Config.Tty
s.Hostname = c.Config.Hostname
setLinuxDomainname(c, s)
// Add default sysctls that are generally safe and useful; currently we
// grant the capabilities to allow these anyway. You can override if
// you want to restore the original behaviour.
// We do not set network sysctls if network namespace is host, or if we are
// joining an existing namespace, only if we create a new net namespace.
if c.HostConfig.NetworkMode.IsPrivate() {
// We cannot set up ping socket support in a user namespace
userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
// allow unprivileged ICMP echo sockets without CAP_NET_RAW
s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
}
// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
}
}
return nil
}
}
// withCgroups sets the container's cgroups
func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
var cgroupsPath string
scopePrefix := "docker"
parent := "/docker"
useSystemd := UsingSystemd(daemonCfg)
if useSystemd {
parent = "system.slice"
if daemonCfg.Rootless {
parent = "user.slice"
}
}
if c.HostConfig.CgroupParent != "" {
parent = c.HostConfig.CgroupParent
} else if daemonCfg.CgroupParent != "" {
parent = daemonCfg.CgroupParent
}
if useSystemd {
cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
} else {
cgroupsPath = filepath.Join(parent, c.ID)
}
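// With the default parents, the systemd driver yields a cgroupsPath of the
// form "system.slice:docker:<container-id>" (or "user.slice:docker:<id>" when
// rootless), while the cgroupfs driver yields "/docker/<container-id>".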
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
s.Linux.CgroupsPath = cgroupsPath
// the rest is only needed for CPU RT controller
if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
return nil
}
p := cgroupsPath
if useSystemd {
initPath, err := cgroups.GetInitCgroup("cpu")
if err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
_, err = cgroups.GetOwnCgroup("cpu")
if err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
p = filepath.Join(initPath, s.Linux.CgroupsPath)
}
// Clean path to guard against things like ../../../BAD
parentPath := filepath.Dir(p)
if !filepath.IsAbs(parentPath) {
parentPath = filepath.Clean("/" + parentPath)
}
mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
if err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
// When docker is run inside docker, the root is based on the host cgroup.
// Should this be handled in runc/libcontainer/cgroups?
if strings.HasPrefix(root, "/docker/") {
root = "/"
}
mnt = filepath.Join(mnt, root)
if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
return errors.Wrap(err, "unable to init CPU RT controller")
}
return nil
}
}
// WithDevices sets the container's devices
func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
// Build lists of devices allowed and created within the container.
var devs []specs.LinuxDevice
devPermissions := s.Linux.Resources.Devices
if c.HostConfig.Privileged {
hostDevices, err := coci.HostDevices()
if err != nil {
return err
}
devs = append(devs, hostDevices...)
// adding device mappings in privileged containers
for _, deviceMapping := range c.HostConfig.Devices {
// issue a warning that custom cgroup permissions are ignored in privileged mode
if deviceMapping.CgroupPermissions != "rwm" {
log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
}
// issue a warning that the device path already exists via /dev mounting in privileged mode
if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
continue
}
d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
if err != nil {
return err
}
devs = append(devs, d...)
}
devPermissions = []specs.LinuxDeviceCgroup{
{
Allow: true,
Access: "rwm",
},
}
} else {
for _, deviceMapping := range c.HostConfig.Devices {
d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
if err != nil {
return err
}
devs = append(devs, d...)
devPermissions = append(devPermissions, dPermissions...)
}
var err error
devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
if err != nil {
return err
}
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &specs.LinuxResources{}
}
s.Linux.Devices = append(s.Linux.Devices, devs...)
s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
for _, req := range c.HostConfig.DeviceRequests {
if err := daemon.handleDevice(req, s); err != nil {
return err
}
}
return nil
}
}
// WithResources applies the container resources
func WithResources(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
r := c.HostConfig.Resources
weightDevices, err := getBlkioWeightDevices(r)
if err != nil {
return err
}
readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
if err != nil {
return err
}
writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
if err != nil {
return err
}
readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
if err != nil {
return err
}
writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
if err != nil {
return err
}
memoryRes := getMemoryResources(r)
cpuRes, err := getCPUResources(r)
if err != nil {
return err
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
if s.Linux.Resources == nil {
s.Linux.Resources = &specs.LinuxResources{}
}
s.Linux.Resources.Memory = memoryRes
s.Linux.Resources.CPU = cpuRes
s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
WeightDevice: weightDevices,
ThrottleReadBpsDevice: readBpsDevice,
ThrottleWriteBpsDevice: writeBpsDevice,
ThrottleReadIOPSDevice: readIOpsDevice,
ThrottleWriteIOPSDevice: writeIOpsDevice,
}
if r.BlkioWeight != 0 {
w := r.BlkioWeight
s.Linux.Resources.BlockIO.Weight = &w
}
s.Linux.Resources.Pids = getPidsLimit(r)
return nil
}
}
// WithSysctls sets the container's sysctls
func WithSysctls(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if len(c.HostConfig.Sysctls) == 0 {
return nil
}
if s.Linux == nil {
s.Linux = &specs.Linux{}
}
if s.Linux.Sysctl == nil {
s.Linux.Sysctl = make(map[string]string)
}
// We merge the sysctls injected above with the HostConfig (latter takes
// precedence for backwards-compatibility reasons).
for k, v := range c.HostConfig.Sysctls {
s.Linux.Sysctl[k] = v
}
return nil
}
}
// WithUser sets the container's user
func WithUser(c *container.Container) coci.SpecOpts {
return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
if s.Process == nil {
s.Process = &specs.Process{}
}
var err error
s.Process.User, err = getUser(c, c.Config.User)
return err
}
}
func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (retSpec *specs.Spec, err error) {
var (
opts []coci.SpecOpts
s = oci.DefaultSpec()
)
opts = append(opts,
withCommonOptions(daemon, &daemonCfg.Config, c),
withCgroups(daemon, &daemonCfg.Config, c),
WithResources(c),
WithSysctls(c),
WithDevices(daemon, c),
withRlimits(daemon, &daemonCfg.Config, c),
WithNamespaces(daemon, c),
WithCapabilities(c),
WithSeccomp(daemon, c),
withMounts(daemon, daemonCfg, c, mounts),
WithApparmor(c),
WithSelinux(c),
WithOOMScore(&c.HostConfig.OomScoreAdj),
coci.WithAnnotations(c.HostConfig.Annotations),
WithUser(c),
)
if c.NoNewPrivileges {
opts = append(opts, coci.WithNoNewPrivileges)
}
if c.Config.Tty {
opts = append(opts, WithConsoleSize(c))
}
// Set the masked and readonly paths with regard to the host config options if they are set.
if c.HostConfig.MaskedPaths != nil {
opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
}
if c.HostConfig.ReadonlyPaths != nil {
opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
}
if daemonCfg.Rootless {
opts = append(opts, withRootless(daemon, &daemonCfg.Config))
} else if userns.RunningInUserNS() {
opts = append(opts, withRootfulInRootless(daemon, &daemonCfg.Config))
}
var snapshotter, snapshotKey string
if daemon.UsesSnapshotter() {
snapshotter = daemon.imageService.StorageDriver()
snapshotKey = c.ID
}
return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{
ID: c.ID,
Snapshotter: snapshotter,
SnapshotKey: snapshotKey,
}, &s, opts...)
}
func clearReadOnly(m *specs.Mount) {
var opt []string
for _, o := range m.Options {
if o != "ro" {
opt = append(opt, o)
}
}
m.Options = opt
}
// mergeUlimits merges the Ulimits from HostConfig with the daemon defaults, and updates HostConfig
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) {
ulimits := c.Ulimits
// Merge ulimits with daemon defaults
ulIdx := make(map[string]struct{})
for _, ul := range ulimits {
ulIdx[ul.Name] = struct{}{}
}
for name, ul := range daemonCfg.Ulimits {
if _, exists := ulIdx[name]; !exists {
ulimits = append(ulimits, ul)
}
}
c.Ulimits = ulimits
}
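
// For example (hypothetical values): with a daemon default of nofile=65536 and
// a container started with only --ulimit nproc=4096, the merged
// HostConfig.Ulimits contains both nproc=4096 and nofile=65536; if the
// container had also set nofile, the container's value would take precedence.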