2018-02-05 21:05:59 +00:00
package daemon // import "github.com/docker/docker/daemon"
2016-03-18 18:50:19 +00:00
import (
2019-04-09 20:51:40 +00:00
"context"
2016-03-18 18:50:19 +00:00
"fmt"
"os"
"path/filepath"
2016-04-26 08:20:17 +00:00
"sort"
2016-03-18 18:50:19 +00:00
"strconv"
"strings"
2023-01-30 14:43:31 +00:00
cdcgroups "github.com/containerd/cgroups/v3"
2019-04-09 20:51:40 +00:00
"github.com/containerd/containerd/containers"
coci "github.com/containerd/containerd/oci"
2021-04-08 13:37:13 +00:00
"github.com/containerd/containerd/pkg/apparmor"
2021-06-18 09:01:24 +00:00
"github.com/containerd/containerd/pkg/userns"
2023-09-13 15:41:45 +00:00
"github.com/containerd/log"
2016-09-06 18:18:12 +00:00
containertypes "github.com/docker/docker/api/types/container"
2016-03-18 18:50:19 +00:00
"github.com/docker/docker/container"
2022-02-18 17:07:40 +00:00
dconfig "github.com/docker/docker/daemon/config"
Fix validation of IpcMode, PidMode, UTSMode, CgroupnsMode
These HostConfig properties were not validated until the OCI spec for the container
was created, which meant that `container run` and `docker create` would accept
invalid values, and the invalid value would not be detected until `start` was
called, returning a 500 "internal server error", as well as errors from containerd
("cleanup: failed to delete container from containerd: no such container") in the
daemon logs.
As a result, a faulty container was created, and the container state remained
in the `created` state.
This patch:
- Updates `oci.WithNamespaces()` to return the correct `errdefs.InvalidParameter`
- Updates `verifyPlatformContainerSettings()` to validate these settings, so that
an error is returned when _creating_ the container.
Before this patch:
docker run -dit --ipc=shared --name foo busybox
2a00d74e9fbb7960c4718def8f6c74fa8ee754030eeb93ee26a516e27d4d029f
docker: Error response from daemon: Invalid IPC mode: shared.
docker ps -a --filter name=foo
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
2a00d74e9fbb busybox "sh" About a minute ago Created foo
After this patch:
docker run -dit --ipc=shared --name foo busybox
docker: Error response from daemon: invalid IPC mode: shared.
docker ps -a --filter name=foo
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
An integration test was added to verify the new validation, which can be run with:
make BIND_DIR=. TEST_FILTER=TestCreateInvalidHostConfig DOCKER_GRAPHDRIVER=vfs test-integration
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-05-25 11:17:16 +00:00
"github.com/docker/docker/errdefs"
2016-03-18 18:50:19 +00:00
"github.com/docker/docker/oci"
2018-12-16 15:11:37 +00:00
"github.com/docker/docker/oci/caps"
2016-03-18 18:50:19 +00:00
"github.com/docker/docker/pkg/idtools"
2023-01-03 12:08:40 +00:00
"github.com/docker/docker/pkg/rootless/specconv"
2018-04-17 20:50:28 +00:00
volumemounts "github.com/docker/docker/volume/mounts"
2020-03-13 23:38:24 +00:00
"github.com/moby/sys/mount"
"github.com/moby/sys/mountinfo"
2023-10-24 13:45:02 +00:00
"github.com/moby/sys/user"
2016-06-07 19:05:43 +00:00
"github.com/opencontainers/runc/libcontainer/cgroups"
2019-08-05 14:37:47 +00:00
specs "github.com/opencontainers/runtime-spec/specs-go"
2018-01-24 23:10:01 +00:00
"github.com/pkg/errors"
2017-10-15 06:06:20 +00:00
"golang.org/x/sys/unix"
2016-03-18 18:50:19 +00:00
)
2022-02-18 17:07:40 +00:00
// inContainerInitPath is the path inside the container at which the
// docker-init binary (dconfig.DefaultInitBinary) is made available when
// the container is started with an init process.
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
2022-08-17 21:13:49 +00:00
// withRlimits sets the container's rlimits along with merging the daemon's rlimits
func withRlimits ( daemon * Daemon , daemonCfg * dconfig . Config , c * container . Container ) coci . SpecOpts {
2019-04-10 18:45:14 +00:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
var rlimits [ ] specs . POSIXRlimit
// We want to leave the original HostConfig alone so make a copy here
hostConfig := * c . HostConfig
// Merge with the daemon defaults
2022-08-17 21:13:49 +00:00
daemon . mergeUlimits ( & hostConfig , daemonCfg )
2019-04-10 18:45:14 +00:00
for _ , ul := range hostConfig . Ulimits {
rlimits = append ( rlimits , specs . POSIXRlimit {
Type : "RLIMIT_" + strings . ToUpper ( ul . Name ) ,
Soft : uint64 ( ul . Soft ) ,
Hard : uint64 ( ul . Hard ) ,
} )
}
2023-06-06 16:57:38 +00:00
if s . Process == nil {
s . Process = & specs . Process { }
}
2019-04-10 18:45:14 +00:00
s . Process . Rlimits = rlimits
return nil
2016-03-18 18:50:19 +00:00
}
2019-04-10 18:45:14 +00:00
}
2016-03-18 18:50:19 +00:00
2022-08-17 21:13:49 +00:00
// withRootless sets the spec to the rootless configuration
func withRootless ( daemon * Daemon , daemonCfg * dconfig . Config ) coci . SpecOpts {
2020-02-10 05:37:22 +00:00
return func ( _ context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
var v2Controllers [ ] string
2022-08-17 21:13:49 +00:00
if cgroupDriver ( daemonCfg ) == cgroupSystemdDriver {
2020-11-09 14:00:32 +00:00
if cdcgroups . Mode ( ) != cdcgroups . Unified {
2020-02-10 05:37:22 +00:00
return errors . New ( "rootless systemd driver doesn't support cgroup v1" )
}
rootlesskitParentEUID := os . Getenv ( "ROOTLESSKIT_PARENT_EUID" )
if rootlesskitParentEUID == "" {
return errors . New ( "$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)" )
}
2021-06-05 19:09:59 +00:00
euid , err := strconv . Atoi ( rootlesskitParentEUID )
if err != nil {
return errors . Wrap ( err , "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value" )
}
controllersPath := fmt . Sprintf ( "/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers" , euid )
2021-08-24 10:10:50 +00:00
controllersFile , err := os . ReadFile ( controllersPath )
2020-02-10 05:37:22 +00:00
if err != nil {
return err
}
v2Controllers = strings . Fields ( string ( controllersFile ) )
}
return specconv . ToRootless ( s , v2Controllers )
}
2019-04-10 18:45:14 +00:00
}
2023-09-29 11:31:22 +00:00
// withRootfulInRootless is used for "rootful-in-rootless" dind;
// the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc.
// It delegates the actual spec adjustments to specconv.ToRootfulInRootless;
// the daemon and daemonCfg parameters are unused but kept for signature
// consistency with the other with* SpecOpts constructors in this file.
func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		specconv.ToRootfulInRootless(s)
		return nil
	}
}
2019-04-10 18:45:14 +00:00
// WithOOMScore sets the oom score adjustment (oom_score_adj) on the spec's
// process. A nil score leaves the kernel default in place.
func WithOOMScore(score *int) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// The spec's Process section may not have been initialized yet.
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		s.Process.OOMScoreAdj = score
		return nil
	}
}
// WithSelinux sets the selinux labels on the spec: the container's process
// label on Process.SelinuxLabel and its mount label on Linux.MountLabel.
func WithSelinux(c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// Both the Process and Linux sections may be unset at this point.
		if s.Process == nil {
			s.Process = &specs.Process{}
		}
		if s.Linux == nil {
			s.Linux = &specs.Linux{}
		}
		s.Process.SelinuxLabel = c.GetProcessLabel()
		s.Linux.MountLabel = c.MountLabel
		return nil
	}
}
// WithApparmor sets the apparmor profile
func WithApparmor ( c * container . Container ) coci . SpecOpts {
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
2021-04-08 13:37:13 +00:00
if apparmor . HostSupports ( ) {
2019-04-10 18:45:14 +00:00
var appArmorProfile string
if c . AppArmorProfile != "" {
appArmorProfile = c . AppArmorProfile
} else if c . HostConfig . Privileged {
2019-10-12 22:04:44 +00:00
appArmorProfile = unconfinedAppArmorProfile
2019-04-10 18:45:14 +00:00
} else {
2019-08-09 10:33:15 +00:00
appArmorProfile = defaultAppArmorProfile
2019-04-10 18:45:14 +00:00
}
2019-08-09 10:33:15 +00:00
if appArmorProfile == defaultAppArmorProfile {
2019-04-10 18:45:14 +00:00
// Unattended upgrades and other fun services can unload AppArmor
// profiles inadvertently. Since we cannot store our profile in
// /etc/apparmor.d, nor can we practically add other ways of
// telling the system to keep our profile loaded, in order to make
// sure that we keep the default profile enabled we dynamically
// reload it if necessary.
if err := ensureDefaultAppArmorProfile ( ) ; err != nil {
return err
}
}
2023-06-06 16:57:38 +00:00
if s . Process == nil {
s . Process = & specs . Process { }
}
2019-04-10 18:45:14 +00:00
s . Process . ApparmorProfile = appArmorProfile
}
return nil
}
}
// WithCapabilities sets the container's capabilties
func WithCapabilities ( c * container . Container ) coci . SpecOpts {
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
capabilities , err := caps . TweakCapabilities (
2019-11-14 17:53:52 +00:00
caps . DefaultCapabilities ( ) ,
2019-04-10 18:45:14 +00:00
c . HostConfig . CapAdd ,
c . HostConfig . CapDrop ,
c . HostConfig . Privileged ,
)
if err != nil {
return err
}
return oci . SetCapabilities ( s , capabilities )
}
2016-03-18 18:50:19 +00:00
}
2020-07-29 12:26:05 +00:00
func resourcePath ( c * container . Container , getPath func ( ) ( string , error ) ) ( string , error ) {
p , err := getPath ( )
2016-03-18 18:50:19 +00:00
if err != nil {
2020-07-29 12:26:05 +00:00
return "" , err
2020-07-29 02:43:43 +00:00
}
2020-07-29 12:26:05 +00:00
return c . GetResourcePath ( p )
2016-03-18 18:50:19 +00:00
}
2020-07-29 12:26:05 +00:00
func getUser ( c * container . Container , username string ) ( specs . User , error ) {
var usr specs . User
passwdPath , err := resourcePath ( c , user . GetPasswdPath )
2016-03-18 18:50:19 +00:00
if err != nil {
2020-07-29 12:26:05 +00:00
return usr , err
2016-03-18 18:50:19 +00:00
}
2020-07-29 12:26:05 +00:00
groupPath , err := resourcePath ( c , user . GetGroupPath )
2016-03-18 18:50:19 +00:00
if err != nil {
2020-07-29 12:26:05 +00:00
return usr , err
2016-03-18 18:50:19 +00:00
}
2020-07-29 12:26:05 +00:00
execUser , err := user . GetExecUserPath ( username , nil , passwdPath , groupPath )
2016-03-18 18:50:19 +00:00
if err != nil {
2020-07-29 12:26:05 +00:00
return usr , err
2016-03-18 18:50:19 +00:00
}
2020-07-29 12:26:05 +00:00
usr . UID = uint32 ( execUser . Uid )
usr . GID = uint32 ( execUser . Gid )
2022-06-02 09:30:15 +00:00
usr . AdditionalGids = [ ] uint32 { usr . GID }
2016-03-18 18:50:19 +00:00
var addGroups [ ] int
if len ( c . HostConfig . GroupAdd ) > 0 {
2020-07-29 12:26:05 +00:00
addGroups , err = user . GetAdditionalGroupsPath ( c . HostConfig . GroupAdd , groupPath )
2016-03-18 18:50:19 +00:00
if err != nil {
2020-07-29 12:26:05 +00:00
return usr , err
2016-03-18 18:50:19 +00:00
}
}
2020-07-29 12:26:05 +00:00
for _ , g := range append ( execUser . Sgids , addGroups ... ) {
usr . AdditionalGids = append ( usr . AdditionalGids , uint32 ( g ) )
2016-03-18 18:50:19 +00:00
}
2020-07-29 12:26:05 +00:00
return usr , nil
2016-03-18 18:50:19 +00:00
}
2017-04-27 21:52:47 +00:00
func setNamespace ( s * specs . Spec , ns specs . LinuxNamespace ) {
2023-06-06 16:57:38 +00:00
if s . Linux == nil {
s . Linux = & specs . Linux { }
}
2016-03-18 18:50:19 +00:00
for i , n := range s . Linux . Namespaces {
if n . Type == ns . Type {
s . Linux . Namespaces [ i ] = ns
return
}
}
s . Linux . Namespaces = append ( s . Linux . Namespaces , ns )
}
2019-04-10 18:45:14 +00:00
// WithNamespaces sets the container's namespaces (user, network, IPC, PID,
// UTS, and cgroup) on the spec according to the container's HostConfig.
// IPC, PID, UTS, and cgroup namespace modes are validated here and produce
// an errdefs.InvalidParameter error when invalid.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
				userNS = true
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
				})
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			networkMode := c.HostConfig.NetworkMode
			switch {
			case networkMode.IsContainer():
				// Join the network namespace of another (running) container.
				nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
				})
				if userNS {
					// to share a net namespace, the containers must also share a user namespace.
					//
					// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
					setNamespace(s, specs.LinuxNamespace{
						Type: specs.UserNamespace,
						Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
					})
				}
			case networkMode.IsHost():
				oci.RemoveNamespace(s, specs.NetworkNamespace)
			default:
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.NetworkNamespace,
				})
			}
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		if !ipcMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
		}
		switch {
		case ipcMode.IsContainer():
			ic, err := daemon.getIPCContainer(ipcMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join IPC namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
			})
			if userNS {
				// to share a IPC namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
				})
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, specs.IPCNamespace)
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.IPCNamespace,
			})
		}

		// pid
		pidMode := c.HostConfig.PidMode
		if !pidMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode))
		}
		switch {
		case pidMode.IsContainer():
			pc, err := daemon.getPIDContainer(pidMode.Container())
			if err != nil {
				return errors.Wrap(err, "failed to join PID namespace")
			}
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			})
			if userNS {
				// to share a PID namespace, the containers must also share a user namespace.
				//
				// FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
				setNamespace(s, specs.LinuxNamespace{
					Type: specs.UserNamespace,
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				})
			}
		case pidMode.IsHost():
			oci.RemoveNamespace(s, specs.PIDNamespace)
		default:
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.PIDNamespace,
			})
		}

		// uts
		if !c.HostConfig.UTSMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
		}
		if c.HostConfig.UTSMode.IsHost() {
			// Host UTS namespace: drop the namespace entry and the hostname,
			// so the container inherits the host's hostname.
			oci.RemoveNamespace(s, specs.UTSNamespace)
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
		}
		if c.HostConfig.CgroupnsMode.IsPrivate() {
			setNamespace(s, specs.LinuxNamespace{
				Type: specs.CgroupNamespace,
			})
		}
		return nil
	}
}
2017-04-27 21:52:47 +00:00
func specMapping ( s [ ] idtools . IDMap ) [ ] specs . LinuxIDMapping {
var ids [ ] specs . LinuxIDMapping
2016-03-18 18:50:19 +00:00
for _ , item := range s {
2017-04-27 21:52:47 +00:00
ids = append ( ids , specs . LinuxIDMapping {
2016-03-18 18:50:19 +00:00
HostID : uint32 ( item . HostID ) ,
ContainerID : uint32 ( item . ContainerID ) ,
Size : uint32 ( item . Size ) ,
} )
}
return ids
}
// Get the source mount point of directory passed in as argument. Also return
// optional fields.
func getSourceMount ( source string ) ( string , string , error ) {
// Ensure any symlinks are resolved.
sourcePath , err := filepath . EvalSymlinks ( source )
if err != nil {
return "" , "" , err
}
2020-03-13 23:38:24 +00:00
mi , err := mountinfo . GetMounts ( mountinfo . ParentsFilter ( sourcePath ) )
2016-03-18 18:50:19 +00:00
if err != nil {
return "" , "" , err
}
getSourceMount(): simplify
The flow of getSourceMount was:
1 get all entries from /proc/self/mountinfo
2 do a linear search for the `source` directory
3 if found, return its data
4 get the parent directory of `source`, goto 2
The repeated linear search through the whole mountinfo (which can have
thousands of records) is inefficient. Instead, let's just
1 collect all the relevant records (only those mount points
that can be a parent of `source`)
2 find the record with the longest mountpath, return its data
This was tested manually with something like
```go
func TestGetSourceMount(t *testing.T) {
mnt, flags, err := getSourceMount("/sys/devices/msr/")
assert.NoError(t, err)
t.Logf("mnt: %v, flags: %v", mnt, flags)
}
```
...but it relies on having a specific mount points on the system
being used for testing.
[v2: add unit tests for ParentsFilter]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2018-01-26 04:13:46 +00:00
if len ( mi ) < 1 {
return "" , "" , fmt . Errorf ( "Can't find mount point of %s" , source )
2016-03-18 18:50:19 +00:00
}
getSourceMount(): simplify
The flow of getSourceMount was:
1 get all entries from /proc/self/mountinfo
2 do a linear search for the `source` directory
3 if found, return its data
4 get the parent directory of `source`, goto 2
The repeated linear search through the whole mountinfo (which can have
thousands of records) is inefficient. Instead, let's just
1 collect all the relevant records (only those mount points
that can be a parent of `source`)
2 find the record with the longest mountpath, return its data
This was tested manually with something like
```go
func TestGetSourceMount(t *testing.T) {
mnt, flags, err := getSourceMount("/sys/devices/msr/")
assert.NoError(t, err)
t.Logf("mnt: %v, flags: %v", mnt, flags)
}
```
...but it relies on having a specific mount points on the system
being used for testing.
[v2: add unit tests for ParentsFilter]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2018-01-26 04:13:46 +00:00
// find the longest mount point
var idx , maxlen int
for i := range mi {
if len ( mi [ i ] . Mountpoint ) > maxlen {
maxlen = len ( mi [ i ] . Mountpoint )
idx = i
2016-03-18 18:50:19 +00:00
}
}
2018-05-10 19:01:50 +00:00
return mi [ idx ] . Mountpoint , mi [ idx ] . Optional , nil
2016-03-18 18:50:19 +00:00
}
2018-01-24 23:10:01 +00:00
const (
	// sharedPropagationOption is the prefix of the mountinfo optional field
	// ("shared:N") that marks a mount as a member of a shared peer group.
	sharedPropagationOption = "shared:"
	// slavePropagationOption is the prefix of the mountinfo optional field
	// ("master:N") that marks a mount as a slave of a peer group.
	slavePropagationOption = "master:"
)
2019-08-09 10:33:15 +00:00
// hasMountInfoOption checks whether any of the given option values are set
// in the passed-in mountinfo option string (a space-separated list of
// optional fields); each value is matched as a prefix of a field.
func hasMountInfoOption(opts string, vals ...string) bool {
	for _, val := range vals {
		for _, field := range strings.Split(opts, " ") {
			if strings.HasPrefix(field, val) {
				return true
			}
		}
	}
	return false
}
2016-03-18 18:50:19 +00:00
// Ensure mount point on which path is mounted, is shared.
func ensureShared ( path string ) error {
sourceMount , optionalOpts , err := getSourceMount ( path )
if err != nil {
return err
}
// Make sure source mount point is shared.
2019-08-09 10:33:15 +00:00
if ! hasMountInfoOption ( optionalOpts , sharedPropagationOption ) {
2018-01-24 23:10:01 +00:00
return errors . Errorf ( "path %s is mounted on %s but it is not a shared mount" , path , sourceMount )
2016-03-18 18:50:19 +00:00
}
return nil
}
// Ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave ( path string ) error {
sourceMount , optionalOpts , err := getSourceMount ( path )
if err != nil {
return err
}
2019-08-09 10:33:15 +00:00
if ! hasMountInfoOption ( optionalOpts , sharedPropagationOption , slavePropagationOption ) {
2018-01-24 23:10:01 +00:00
return errors . Errorf ( "path %s is mounted on %s but it is not a shared or slave mount" , path , sourceMount )
2016-03-18 18:50:19 +00:00
}
return nil
}
2017-10-15 06:06:20 +00:00
// getUnprivilegedMountFlags returns the set of mount flags that are set on
// the mount containing the given path and that are locked by CL_UNPRIVILEGED.
// This is necessary to ensure that bind-mounting "with options" will not fail
// with user namespaces, due to kernel restrictions that require user namespace
// mounts to preserve CL_UNPRIVILEGED locked flags.
func getUnprivilegedMountFlags(path string) ([]string, error) {
	var st unix.Statfs_t
	if err := unix.Statfs(path, &st); err != nil {
		return nil, err
	}

	// The set of keys come from
	// https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
	lockedFlags := map[uint64]string{
		unix.MS_RDONLY:     "ro",
		unix.MS_NODEV:      "nodev",
		unix.MS_NOEXEC:     "noexec",
		unix.MS_NOSUID:     "nosuid",
		unix.MS_NOATIME:    "noatime",
		unix.MS_RELATIME:   "relatime",
		unix.MS_NODIRATIME: "nodiratime",
	}

	var out []string
	for mask, name := range lockedFlags {
		if uint64(st.Flags)&mask == mask {
			out = append(out, name)
		}
	}
	return out, nil
}
2016-03-18 18:50:19 +00:00
var (
	// mountPropagationMap translates the Docker API mount-propagation mode
	// strings to the corresponding mount-package flag constants.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap, used
	// to render a propagation flag back into its API string form.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
2017-11-10 05:18:48 +00:00
// inSlice tests whether a string is contained in a slice of strings or not.
// Comparison is case sensitive.
func inSlice(slice []string, s string) bool {
	for i := range slice {
		if slice[i] == s {
			return true
		}
	}
	return false
}
2022-08-17 21:13:49 +00:00
// withMounts sets the container's mounts
2023-06-27 10:17:49 +00:00
func withMounts ( daemon * Daemon , daemonCfg * configStore , c * container . Container , ms [ ] container . Mount ) coci . SpecOpts {
2019-04-10 18:45:14 +00:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) ( err error ) {
sort . Sort ( mounts ( ms ) )
mounts := ms
userMounts := make ( map [ string ] struct { } )
for _ , m := range mounts {
userMounts [ m . Destination ] = struct { } { }
}
// Copy all mounts from spec to defaultMounts, except for
// - mounts overridden by a user supplied mount;
// - all mounts under /dev if a user supplied /dev is present;
// - /dev/shm, in case IpcMode is none.
// While at it, also
// - set size for /dev/shm from shmsize.
defaultMounts := s . Mounts [ : 0 ]
_ , mountDev := userMounts [ "/dev" ]
for _ , m := range s . Mounts {
if _ , ok := userMounts [ m . Destination ] ; ok {
// filter out mount overridden by a user supplied mount
continue
2016-03-18 18:50:19 +00:00
}
2019-04-10 18:45:14 +00:00
if mountDev && strings . HasPrefix ( m . Destination , "/dev/" ) {
// filter out everything under /dev if /dev is user-mounted
continue
}
if m . Destination == "/dev/shm" {
if c . HostConfig . IpcMode . IsNone ( ) {
// filter out /dev/shm for "none" IpcMode
continue
2018-01-18 21:55:27 +00:00
}
2019-04-10 18:45:14 +00:00
// set size for /dev/shm mount from spec
sizeOpt := "size=" + strconv . FormatInt ( c . HostConfig . ShmSize , 10 )
m . Options = append ( m . Options , sizeOpt )
}
2018-01-18 21:55:27 +00:00
2019-04-10 18:45:14 +00:00
defaultMounts = append ( defaultMounts , m )
}
s . Mounts = defaultMounts
for _ , m := range mounts {
if m . Source == "tmpfs" {
data := m . Data
2021-06-11 19:01:18 +00:00
parser := volumemounts . NewParser ( )
2019-04-10 18:45:14 +00:00
options := [ ] string { "noexec" , "nosuid" , "nodev" , string ( parser . DefaultPropagationMode ( ) ) }
if data != "" {
options = append ( options , strings . Split ( data , "," ) ... )
2018-01-18 21:55:27 +00:00
}
2019-04-10 18:45:14 +00:00
merged , err := mount . MergeTmpfsOptions ( options )
if err != nil {
2018-01-18 21:55:27 +00:00
return err
}
2019-04-10 18:45:14 +00:00
s . Mounts = append ( s . Mounts , specs . Mount { Destination : m . Destination , Source : m . Source , Type : "tmpfs" , Options : merged } )
continue
2016-03-18 18:50:19 +00:00
}
2019-04-10 18:45:14 +00:00
mt := specs . Mount { Destination : m . Destination , Source : m . Source , Type : "bind" }
// Determine property of RootPropagation based on volume
// properties. If a volume is shared, then keep root propagation
// shared. This should work for slave and private volumes too.
//
// For slave volumes, it can be either [r]shared/[r]slave.
//
// For private volumes any root propagation value should work.
pFlag := mountPropagationMap [ m . Propagation ]
switch pFlag {
case mount . SHARED , mount . RSHARED :
if err := ensureShared ( m . Source ) ; err != nil {
return err
}
2018-01-18 21:55:27 +00:00
rootpg := mountPropagationMap [ s . Linux . RootfsPropagation ]
2019-04-10 18:45:14 +00:00
if rootpg != mount . SHARED && rootpg != mount . RSHARED {
2023-06-06 16:57:38 +00:00
if s . Linux == nil {
s . Linux = & specs . Linux { }
}
2019-04-10 18:45:14 +00:00
s . Linux . RootfsPropagation = mountPropagationReverseMap [ mount . SHARED ]
}
case mount . SLAVE , mount . RSLAVE :
var fallback bool
if err := ensureSharedOrSlave ( m . Source ) ; err != nil {
// For backwards compatibility purposes, treat mounts from the daemon root
// as special since we automatically add rslave propagation to these mounts
// when the user did not set anything, so we should fallback to the old
// behavior which is to use private propagation which is normally the
// default.
if ! strings . HasPrefix ( m . Source , daemon . root ) && ! strings . HasPrefix ( daemon . root , m . Source ) {
return err
}
cm , ok := c . MountPoints [ m . Destination ]
if ! ok {
return err
}
if cm . Spec . BindOptions != nil && cm . Spec . BindOptions . Propagation != "" {
// This means the user explicitly set a propagation, do not fallback in that case.
return err
}
fallback = true
2023-06-23 00:33:17 +00:00
log . G ( ctx ) . WithField ( "container" , c . ID ) . WithField ( "source" , m . Source ) . Warn ( "Falling back to default propagation for bind source in daemon root" )
2019-04-10 18:45:14 +00:00
}
if ! fallback {
rootpg := mountPropagationMap [ s . Linux . RootfsPropagation ]
if rootpg != mount . SHARED && rootpg != mount . RSHARED && rootpg != mount . SLAVE && rootpg != mount . RSLAVE {
2023-06-06 16:57:38 +00:00
if s . Linux == nil {
s . Linux = & specs . Linux { }
}
2019-04-10 18:45:14 +00:00
s . Linux . RootfsPropagation = mountPropagationReverseMap [ mount . RSLAVE ]
}
2018-01-18 21:55:27 +00:00
}
2016-03-18 18:50:19 +00:00
}
2019-04-10 18:45:14 +00:00
bindMode := "rbind"
if m . NonRecursive {
bindMode = "bind"
}
opts := [ ] string { bindMode }
if ! m . Writable {
2023-04-05 11:32:03 +00:00
rro := true
if m . ReadOnlyNonRecursive {
rro = false
if m . ReadOnlyForceRecursive {
return errors . New ( "mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive" )
}
}
2022-08-17 21:13:49 +00:00
if rroErr := supportsRecursivelyReadOnly ( daemonCfg , c . HostConfig . Runtime ) ; rroErr != nil {
2023-04-05 11:32:03 +00:00
rro = false
if m . ReadOnlyForceRecursive {
return rroErr
}
}
if rro {
opts = append ( opts , "rro" )
} else {
opts = append ( opts , "ro" )
}
2019-04-10 18:45:14 +00:00
}
if pFlag != 0 {
opts = append ( opts , mountPropagationReverseMap [ pFlag ] )
}
2016-03-18 18:50:19 +00:00
2019-04-10 18:45:14 +00:00
// If we are using user namespaces, then we must make sure that we
// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
// "mount" when we bind-mount. The reason for this is that at the point
// when runc sets up the root filesystem, it is already inside a user
// namespace, and thus cannot change any flags that are locked.
2022-08-17 21:13:49 +00:00
if daemonCfg . RemappedRoot != "" || userns . RunningInUserNS ( ) {
2019-04-10 18:45:14 +00:00
unprivOpts , err := getUnprivilegedMountFlags ( m . Source )
if err != nil {
return err
}
opts = append ( opts , unprivOpts ... )
2017-10-15 06:06:20 +00:00
}
2019-04-10 18:45:14 +00:00
mt . Options = opts
s . Mounts = append ( s . Mounts , mt )
}
2016-03-18 18:50:19 +00:00
2019-04-10 18:45:14 +00:00
if s . Root . Readonly {
for i , m := range s . Mounts {
switch m . Destination {
case "/proc" , "/dev/pts" , "/dev/shm" , "/dev/mqueue" , "/dev" :
continue
}
if _ , ok := userMounts [ m . Destination ] ; ! ok {
if ! inSlice ( m . Options , "ro" ) {
s . Mounts [ i ] . Options = append ( s . Mounts [ i ] . Options , "ro" )
}
2016-03-18 18:50:19 +00:00
}
}
}
2019-04-10 18:45:14 +00:00
if c . HostConfig . Privileged {
// clear readonly for /sys
for i := range s . Mounts {
if s . Mounts [ i ] . Destination == "/sys" {
clearReadOnly ( & s . Mounts [ i ] )
}
2016-03-18 18:50:19 +00:00
}
2023-06-06 16:57:38 +00:00
if s . Linux != nil {
s . Linux . ReadonlyPaths = nil
s . Linux . MaskedPaths = nil
}
2016-03-18 18:50:19 +00:00
}
2019-04-10 18:45:14 +00:00
// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
2022-03-14 19:24:29 +00:00
if uidMap := daemon . idMapping . UIDMaps ; uidMap != nil || c . HostConfig . Privileged {
2019-04-10 18:45:14 +00:00
for i , m := range s . Mounts {
if m . Type == "cgroup" {
clearReadOnly ( & s . Mounts [ i ] )
}
2016-03-18 18:50:19 +00:00
}
}
2019-04-10 18:45:14 +00:00
return nil
2016-03-18 18:50:19 +00:00
}
2019-04-10 18:45:14 +00:00
}
2020-05-26 14:58:24 +00:00
// sysctlExists checks if a sysctl exists; runc will error if we add any that
// do not actually exist, so do not add the default ones if running on an old
// kernel.
func sysctlExists(name string) bool {
	// Sysctl "a.b.c" lives at /proc/sys/a/b/c.
	p := filepath.Join("/proc/sys", strings.ReplaceAll(name, ".", "/"))
	if _, err := os.Stat(p); err != nil {
		return false
	}
	return true
}
2022-08-17 21:13:49 +00:00
// withCommonOptions sets common docker options
func withCommonOptions ( daemon * Daemon , daemonCfg * dconfig . Config , c * container . Container ) coci . SpecOpts {
2019-04-10 18:45:14 +00:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
2023-05-15 15:42:37 +00:00
if c . BaseFS == "" {
2022-09-23 16:25:19 +00:00
return errors . New ( "populateCommonSpec: BaseFS of container " + c . ID + " is unexpectedly empty" )
2019-04-10 18:45:14 +00:00
}
linkedEnv , err := daemon . setupLinkedContainers ( c )
if err != nil {
return err
}
2023-05-15 15:42:37 +00:00
s . Root = & specs . Root {
Path : c . BaseFS ,
Readonly : c . HostConfig . ReadonlyRootfs ,
}
if err := c . SetupWorkingDirectory ( daemon . idMapping . RootPair ( ) ) ; err != nil {
return err
2019-04-10 18:45:14 +00:00
}
cwd := c . Config . WorkingDir
if len ( cwd ) == 0 {
cwd = "/"
}
2023-06-06 16:57:38 +00:00
if s . Process == nil {
s . Process = & specs . Process { }
}
2019-04-10 18:45:14 +00:00
s . Process . Args = append ( [ ] string { c . Path } , c . Args ... )
// only add the custom init if it is specified and the container is running in its
// own private pid namespace. It does not make sense to add if it is running in the
// host namespace or another container's pid namespace where we already have an init
if c . HostConfig . PidMode . IsPrivate ( ) {
if ( c . HostConfig . Init != nil && * c . HostConfig . Init ) ||
2022-08-17 21:13:49 +00:00
( c . HostConfig . Init == nil && daemonCfg . Init ) {
2019-04-10 18:45:14 +00:00
s . Process . Args = append ( [ ] string { inContainerInitPath , "--" , c . Path } , c . Args ... )
2022-08-17 21:13:49 +00:00
path , err := daemonCfg . LookupInitPath ( ) // this will fall back to DefaultInitBinary and return an absolute path
2023-03-22 20:26:43 +00:00
if err != nil {
return err
2016-09-27 10:51:42 +00:00
}
2019-04-10 18:45:14 +00:00
s . Mounts = append ( s . Mounts , specs . Mount {
Destination : inContainerInitPath ,
Type : "bind" ,
Source : path ,
Options : [ ] string { "bind" , "ro" } ,
} )
2016-09-27 10:51:42 +00:00
}
2016-06-27 21:38:47 +00:00
}
2019-04-10 18:45:14 +00:00
s . Process . Cwd = cwd
s . Process . Env = c . CreateDaemonEnvironment ( c . Config . Tty , linkedEnv )
s . Process . Terminal = c . Config . Tty
2018-06-17 07:05:54 +00:00
2019-04-10 18:45:14 +00:00
s . Hostname = c . Config . Hostname
setLinuxDomainname ( c , s )
2016-03-18 18:50:19 +00:00
2020-05-26 14:58:24 +00:00
// Add default sysctls that are generally safe and useful; currently we
// grant the capabilities to allow these anyway. You can override if
// you want to restore the original behaviour.
// We do not set network sysctls if network namespace is host, or if we are
// joining an existing namespace, only if we create a new net namespace.
if c . HostConfig . NetworkMode . IsPrivate ( ) {
// We cannot set up ping socket support in a user namespace
2022-08-17 21:13:49 +00:00
userNS := daemonCfg . RemappedRoot != "" && c . HostConfig . UsernsMode . IsPrivate ( )
2021-08-11 18:43:30 +00:00
if ! userNS && ! userns . RunningInUserNS ( ) && sysctlExists ( "net.ipv4.ping_group_range" ) {
2020-05-26 14:58:24 +00:00
// allow unprivileged ICMP echo sockets without CAP_NET_RAW
s . Linux . Sysctl [ "net.ipv4.ping_group_range" ] = "0 2147483647"
}
// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
if sysctlExists ( "net.ipv4.ip_unprivileged_port_start" ) {
s . Linux . Sysctl [ "net.ipv4.ip_unprivileged_port_start" ] = "0"
}
}
2019-04-10 18:45:14 +00:00
return nil
}
2016-03-18 18:50:19 +00:00
}
2022-08-17 21:13:49 +00:00
// withCgroups sets the container's cgroups
func withCgroups ( daemon * Daemon , daemonCfg * dconfig . Config , c * container . Container ) coci . SpecOpts {
2019-04-09 20:51:40 +00:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
var cgroupsPath string
scopePrefix := "docker"
parent := "/docker"
2022-08-17 21:13:49 +00:00
useSystemd := UsingSystemd ( daemonCfg )
2019-04-09 20:51:40 +00:00
if useSystemd {
parent = "system.slice"
2022-08-17 21:13:49 +00:00
if daemonCfg . Rootless {
2020-02-10 05:37:22 +00:00
parent = "user.slice"
}
2019-04-09 20:51:40 +00:00
}
2016-03-18 18:50:19 +00:00
2019-04-09 20:51:40 +00:00
if c . HostConfig . CgroupParent != "" {
parent = c . HostConfig . CgroupParent
2022-08-17 21:13:49 +00:00
} else if daemonCfg . CgroupParent != "" {
parent = daemonCfg . CgroupParent
2019-04-09 20:51:40 +00:00
}
2016-03-24 16:18:03 +00:00
2019-04-09 20:51:40 +00:00
if useSystemd {
cgroupsPath = parent + ":" + scopePrefix + ":" + c . ID
2023-06-23 00:33:17 +00:00
log . G ( ctx ) . Debugf ( "createSpec: cgroupsPath: %s" , cgroupsPath )
2019-04-09 20:51:40 +00:00
} else {
cgroupsPath = filepath . Join ( parent , c . ID )
}
2023-06-06 16:57:38 +00:00
if s . Linux == nil {
s . Linux = & specs . Linux { }
}
2019-04-09 20:51:40 +00:00
s . Linux . CgroupsPath = cgroupsPath
2020-05-22 22:05:13 +00:00
// the rest is only needed for CPU RT controller
2022-08-17 21:13:49 +00:00
if daemonCfg . CPURealtimePeriod == 0 && daemonCfg . CPURealtimeRuntime == 0 {
2020-05-22 22:05:13 +00:00
return nil
}
2019-04-09 20:51:40 +00:00
p := cgroupsPath
if useSystemd {
initPath , err := cgroups . GetInitCgroup ( "cpu" )
if err != nil {
2020-05-22 22:05:13 +00:00
return errors . Wrap ( err , "unable to init CPU RT controller" )
2019-04-09 20:51:40 +00:00
}
_ , err = cgroups . GetOwnCgroup ( "cpu" )
if err != nil {
2020-05-22 22:05:13 +00:00
return errors . Wrap ( err , "unable to init CPU RT controller" )
2019-04-09 20:51:40 +00:00
}
p = filepath . Join ( initPath , s . Linux . CgroupsPath )
}
2016-03-24 16:18:03 +00:00
2019-04-09 20:51:40 +00:00
// Clean path to guard against things like ../../../BAD
parentPath := filepath . Dir ( p )
if ! filepath . IsAbs ( parentPath ) {
parentPath = filepath . Clean ( "/" + parentPath )
}
2016-03-18 18:50:19 +00:00
2020-05-22 22:05:13 +00:00
mnt , root , err := cgroups . FindCgroupMountpointAndRoot ( "" , "cpu" )
if err != nil {
return errors . Wrap ( err , "unable to init CPU RT controller" )
}
// When docker is run inside docker, the root is based of the host cgroup.
// Should this be handled in runc/libcontainer/cgroups ?
if strings . HasPrefix ( root , "/docker/" ) {
root = "/"
}
mnt = filepath . Join ( mnt , root )
2022-08-17 21:13:49 +00:00
if err := daemon . initCPURtController ( daemonCfg , mnt , parentPath ) ; err != nil {
2020-05-22 22:05:13 +00:00
return errors . Wrap ( err , "unable to init CPU RT controller" )
2019-04-09 20:51:40 +00:00
}
return nil
2016-03-18 18:50:19 +00:00
}
2019-04-09 20:51:40 +00:00
}
2019-04-10 18:45:14 +00:00
// WithDevices sets the container's devices
func WithDevices ( daemon * Daemon , c * container . Container ) coci . SpecOpts {
2019-04-09 20:51:40 +00:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
// Build lists of devices allowed and created within the container.
var devs [ ] specs . LinuxDevice
devPermissions := s . Linux . Resources . Devices
2019-12-06 12:49:55 +00:00
2021-12-15 08:14:37 +00:00
if c . HostConfig . Privileged {
2021-12-01 11:06:59 +00:00
hostDevices , err := coci . HostDevices ( )
2019-04-09 20:51:40 +00:00
if err != nil {
return err
}
2021-12-01 11:06:59 +00:00
devs = append ( devs , hostDevices ... )
2019-12-06 12:49:55 +00:00
// adding device mappings in privileged containers
for _ , deviceMapping := range c . HostConfig . Devices {
// issue a warning that custom cgroup permissions are ignored in privileged mode
if deviceMapping . CgroupPermissions != "rwm" {
2023-06-23 00:33:17 +00:00
log . G ( ctx ) . WithField ( "container" , c . ID ) . Warnf ( "custom %s permissions for device %s are ignored in privileged mode" , deviceMapping . CgroupPermissions , deviceMapping . PathOnHost )
2019-12-06 12:49:55 +00:00
}
// issue a warning that the device path already exists via /dev mounting in privileged mode
if deviceMapping . PathOnHost == deviceMapping . PathInContainer {
2023-06-23 00:33:17 +00:00
log . G ( ctx ) . WithField ( "container" , c . ID ) . Warnf ( "path in container %s already exists in privileged mode" , deviceMapping . PathInContainer )
2019-12-06 12:49:55 +00:00
continue
}
d , _ , err := oci . DevicesFromPath ( deviceMapping . PathOnHost , deviceMapping . PathInContainer , "rwm" )
if err != nil {
return err
}
devs = append ( devs , d ... )
}
2019-04-09 20:51:40 +00:00
devPermissions = [ ] specs . LinuxDeviceCgroup {
{
Allow : true ,
Access : "rwm" ,
} ,
}
} else {
for _ , deviceMapping := range c . HostConfig . Devices {
d , dPermissions , err := oci . DevicesFromPath ( deviceMapping . PathOnHost , deviceMapping . PathInContainer , deviceMapping . CgroupPermissions )
if err != nil {
return err
}
devs = append ( devs , d ... )
devPermissions = append ( devPermissions , dPermissions ... )
}
var err error
devPermissions , err = oci . AppendDevicePermissionsFromCgroupRules ( devPermissions , c . HostConfig . DeviceCgroupRules )
if err != nil {
return err
}
}
2023-06-06 16:57:38 +00:00
if s . Linux == nil {
s . Linux = & specs . Linux { }
}
if s . Linux . Resources == nil {
s . Linux . Resources = & specs . LinuxResources { }
}
2019-04-09 20:51:40 +00:00
s . Linux . Devices = append ( s . Linux . Devices , devs ... )
2023-06-06 16:57:38 +00:00
s . Linux . Resources . Devices = append ( s . Linux . Resources . Devices , devPermissions ... )
2019-04-09 20:51:40 +00:00
for _ , req := range c . HostConfig . DeviceRequests {
if err := daemon . handleDevice ( req , s ) ; err != nil {
return err
}
}
return nil
2018-06-17 07:05:54 +00:00
}
2019-04-09 20:51:40 +00:00
}
2016-06-07 19:05:43 +00:00
2019-04-10 18:45:14 +00:00
// WithResources applies the container resources
func WithResources ( c * container . Container ) coci . SpecOpts {
2019-04-09 20:51:40 +00:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
r := c . HostConfig . Resources
weightDevices , err := getBlkioWeightDevices ( r )
2016-06-07 19:05:43 +00:00
if err != nil {
2019-04-09 20:51:40 +00:00
return err
2016-06-07 19:05:43 +00:00
}
2019-04-09 20:51:40 +00:00
readBpsDevice , err := getBlkioThrottleDevices ( r . BlkioDeviceReadBps )
2016-06-07 19:05:43 +00:00
if err != nil {
2019-04-09 20:51:40 +00:00
return err
}
writeBpsDevice , err := getBlkioThrottleDevices ( r . BlkioDeviceWriteBps )
if err != nil {
return err
}
readIOpsDevice , err := getBlkioThrottleDevices ( r . BlkioDeviceReadIOps )
if err != nil {
return err
}
writeIOpsDevice , err := getBlkioThrottleDevices ( r . BlkioDeviceWriteIOps )
if err != nil {
return err
}
memoryRes := getMemoryResources ( r )
cpuRes , err := getCPUResources ( r )
if err != nil {
return err
}
2023-06-06 16:57:38 +00:00
if s . Linux == nil {
s . Linux = & specs . Linux { }
2016-06-07 19:05:43 +00:00
}
2023-06-06 16:57:38 +00:00
if s . Linux . Resources == nil {
s . Linux . Resources = & specs . LinuxResources { }
}
s . Linux . Resources . Memory = memoryRes
s . Linux . Resources . CPU = cpuRes
s . Linux . Resources . BlockIO = & specs . LinuxBlockIO {
WeightDevice : weightDevices ,
ThrottleReadBpsDevice : readBpsDevice ,
ThrottleWriteBpsDevice : writeBpsDevice ,
ThrottleReadIOPSDevice : readIOpsDevice ,
ThrottleWriteIOPSDevice : writeIOpsDevice ,
2016-06-07 19:05:43 +00:00
}
daemon: stop setting container resources to zero
Many of the fields in LinuxResources struct are pointers to scalars for
some reason, presumably to differentiate between set-to-zero and unset
when unmarshaling from JSON, despite zero being outside the acceptable
range for the corresponding kernel tunables. When creating the OCI spec
for a container, the daemon sets the container's OCI spec CPUShares and
BlkioWeight parameters to zero when the corresponding Docker container
configuration values are zero, signifying unset, despite the minimum
acceptable value for CPUShares being two, and BlkioWeight ten. This has
gone unnoticed as runC does not distingiush set-to-zero from unset as it
also uses zero internally to represent unset for those fields. However,
kata-containers v3.2.0-alpha.3 tries to apply the explicit-zero resource
parameters to the container, exactly as instructed, and fails loudly.
The OCI runtime-spec is silent on how the runtime should handle the case
when those parameters are explicitly set to out-of-range values and
kata's behaviour is not unreasonable, so the daemon must therefore be in
the wrong.
Translate unset values in the Docker container's resources HostConfig to
omit the corresponding fields in the container's OCI spec when starting
and updating a container in order to maximize compatibility with
runtimes.
Signed-off-by: Cory Snider <csnider@mirantis.com>
2023-06-05 22:44:51 +00:00
if r . BlkioWeight != 0 {
w := r . BlkioWeight
2023-06-06 16:57:38 +00:00
s . Linux . Resources . BlockIO . Weight = & w
2019-04-09 20:51:40 +00:00
}
2023-06-06 16:57:38 +00:00
s . Linux . Resources . Pids = getPidsLimit ( r )
2019-04-09 20:51:40 +00:00
return nil
2016-06-07 19:05:43 +00:00
}
2019-04-09 20:51:40 +00:00
}
2016-06-07 19:05:43 +00:00
2019-04-10 18:45:14 +00:00
// WithSysctls sets the container's sysctls
func WithSysctls ( c * container . Container ) coci . SpecOpts {
2019-04-09 20:51:40 +00:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
2023-06-06 16:57:38 +00:00
if len ( c . HostConfig . Sysctls ) == 0 {
return nil
}
if s . Linux == nil {
s . Linux = & specs . Linux { }
}
if s . Linux . Sysctl == nil {
s . Linux . Sysctl = make ( map [ string ] string )
}
2019-04-09 20:51:40 +00:00
// We merge the sysctls injected above with the HostConfig (latter takes
// precedence for backwards-compatibility reasons).
for k , v := range c . HostConfig . Sysctls {
s . Linux . Sysctl [ k ] = v
}
return nil
2016-06-07 19:05:43 +00:00
}
2019-04-09 20:51:40 +00:00
}
2016-06-07 19:05:43 +00:00
2019-04-10 18:45:14 +00:00
// WithUser sets the container's user
func WithUser ( c * container . Container ) coci . SpecOpts {
2019-04-09 20:51:40 +00:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
2023-06-06 16:57:38 +00:00
if s . Process == nil {
s . Process = & specs . Process { }
}
2020-07-29 12:26:05 +00:00
var err error
s . Process . User , err = getUser ( c , c . Config . User )
return err
2016-06-07 19:05:43 +00:00
}
2019-04-09 20:51:40 +00:00
}
2023-06-27 10:17:49 +00:00
func ( daemon * Daemon ) createSpec ( ctx context . Context , daemonCfg * configStore , c * container . Container , mounts [ ] container . Mount ) ( retSpec * specs . Spec , err error ) {
2019-04-09 20:51:40 +00:00
var (
opts [ ] coci . SpecOpts
s = oci . DefaultSpec ( )
)
opts = append ( opts ,
2022-08-31 20:12:30 +00:00
withCommonOptions ( daemon , & daemonCfg . Config , c ) ,
withCgroups ( daemon , & daemonCfg . Config , c ) ,
2019-04-10 18:45:14 +00:00
WithResources ( c ) ,
WithSysctls ( c ) ,
WithDevices ( daemon , c ) ,
2022-08-31 20:12:30 +00:00
withRlimits ( daemon , & daemonCfg . Config , c ) ,
2019-04-10 18:45:14 +00:00
WithNamespaces ( daemon , c ) ,
WithCapabilities ( c ) ,
WithSeccomp ( daemon , c ) ,
2023-06-27 10:17:49 +00:00
withMounts ( daemon , daemonCfg , c , mounts ) ,
2019-04-10 18:45:14 +00:00
WithApparmor ( c ) ,
WithSelinux ( c ) ,
WithOOMScore ( & c . HostConfig . OomScoreAdj ) ,
2023-02-17 20:06:19 +00:00
coci . WithAnnotations ( c . HostConfig . Annotations ) ,
2023-05-15 15:42:37 +00:00
WithUser ( c ) ,
2019-04-09 20:51:40 +00:00
)
2022-07-25 12:22:05 +00:00
2019-04-10 18:45:14 +00:00
if c . NoNewPrivileges {
opts = append ( opts , coci . WithNoNewPrivileges )
2016-03-18 18:50:19 +00:00
}
2022-05-12 12:54:44 +00:00
if c . Config . Tty {
opts = append ( opts , WithConsoleSize ( c ) )
}
2018-03-20 17:29:18 +00:00
// Set the masked and readonly paths with regard to the host config options if they are set.
if c . HostConfig . MaskedPaths != nil {
2019-04-10 18:45:14 +00:00
opts = append ( opts , coci . WithMaskedPaths ( c . HostConfig . MaskedPaths ) )
2018-03-20 17:29:18 +00:00
}
if c . HostConfig . ReadonlyPaths != nil {
2019-04-10 18:45:14 +00:00
opts = append ( opts , coci . WithReadonlyPaths ( c . HostConfig . ReadonlyPaths ) )
2018-03-20 17:29:18 +00:00
}
2022-08-17 21:13:49 +00:00
if daemonCfg . Rootless {
2022-08-31 20:12:30 +00:00
opts = append ( opts , withRootless ( daemon , & daemonCfg . Config ) )
2023-09-29 11:31:22 +00:00
} else if userns . RunningInUserNS ( ) {
opts = append ( opts , withRootfulInRootless ( daemon , & daemonCfg . Config ) )
2018-10-15 07:52:53 +00:00
}
2022-08-03 09:20:54 +00:00
var snapshotter , snapshotKey string
if daemon . UsesSnapshotter ( ) {
snapshotter = daemon . imageService . StorageDriver ( )
snapshotKey = c . ID
}
2023-07-18 11:57:27 +00:00
return & s , coci . ApplyOpts ( ctx , daemon . containerdClient , & containers . Container {
2022-08-03 09:20:54 +00:00
ID : c . ID ,
Snapshotter : snapshotter ,
SnapshotKey : snapshotKey ,
2019-04-09 20:51:40 +00:00
} , & s , opts ... )
2016-03-18 18:50:19 +00:00
}
// clearReadOnly removes the "ro" option from a mount's option list.
func clearReadOnly(m *specs.Mount) {
	var kept []string
	for _, o := range m.Options {
		if o == "ro" {
			continue
		}
		kept = append(kept, o)
	}
	m.Options = kept
}
2016-09-08 04:23:56 +00:00
// mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
2022-08-17 21:13:49 +00:00
func ( daemon * Daemon ) mergeUlimits ( c * containertypes . HostConfig , daemonCfg * dconfig . Config ) {
2016-09-08 04:23:56 +00:00
ulimits := c . Ulimits
// Merge ulimits with daemon defaults
ulIdx := make ( map [ string ] struct { } )
for _ , ul := range ulimits {
ulIdx [ ul . Name ] = struct { } { }
}
2022-08-17 21:13:49 +00:00
for name , ul := range daemonCfg . Ulimits {
2016-09-08 04:23:56 +00:00
if _ , exists := ulIdx [ name ] ; ! exists {
ulimits = append ( ulimits , ul )
}
}
c . Ulimits = ulimits
}