12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179 |
- package daemon // import "github.com/docker/docker/daemon"
- import (
- "context"
- "fmt"
- "os"
- "path/filepath"
- "sort"
- "strconv"
- "strings"
- cdcgroups "github.com/containerd/cgroups/v3"
- "github.com/containerd/containerd/containers"
- coci "github.com/containerd/containerd/oci"
- "github.com/containerd/containerd/pkg/apparmor"
- "github.com/containerd/containerd/pkg/userns"
- "github.com/containerd/log"
- containertypes "github.com/docker/docker/api/types/container"
- "github.com/docker/docker/container"
- dconfig "github.com/docker/docker/daemon/config"
- "github.com/docker/docker/errdefs"
- "github.com/docker/docker/oci"
- "github.com/docker/docker/oci/caps"
- "github.com/docker/docker/pkg/idtools"
- "github.com/docker/docker/pkg/rootless/specconv"
- "github.com/docker/docker/pkg/stringid"
- volumemounts "github.com/docker/docker/volume/mounts"
- "github.com/moby/sys/mount"
- "github.com/moby/sys/mountinfo"
- "github.com/moby/sys/user"
- "github.com/opencontainers/runc/libcontainer/cgroups"
- specs "github.com/opencontainers/runtime-spec/specs-go"
- "github.com/pkg/errors"
- "golang.org/x/sys/unix"
- )
- const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
- // withRlimits sets the container's rlimits along with merging the daemon's rlimits
- func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- var rlimits []specs.POSIXRlimit
- // We want to leave the original HostConfig alone so make a copy here
- hostConfig := *c.HostConfig
- // Merge with the daemon defaults
- daemon.mergeUlimits(&hostConfig, daemonCfg)
- for _, ul := range hostConfig.Ulimits {
- rlimits = append(rlimits, specs.POSIXRlimit{
- Type: "RLIMIT_" + strings.ToUpper(ul.Name),
- Soft: uint64(ul.Soft),
- Hard: uint64(ul.Hard),
- })
- }
- if s.Process == nil {
- s.Process = &specs.Process{}
- }
- s.Process.Rlimits = rlimits
- return nil
- }
- }
- // withLibnetwork sets the libnetwork hook
- func withLibnetwork(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- if c.Config.NetworkDisabled {
- return nil
- }
- for _, ns := range s.Linux.Namespaces {
- if ns.Type == specs.NetworkNamespace && ns.Path == "" {
- if s.Hooks == nil {
- s.Hooks = &specs.Hooks{}
- }
- shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
- s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
- Path: filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"),
- Args: []string{"libnetwork-setkey", "-exec-root=" + daemonCfg.GetExecRoot(), c.ID, shortNetCtlrID},
- })
- }
- }
- return nil
- }
- }
- // withRootless sets the spec to the rootless configuration
- func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
- return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- var v2Controllers []string
- if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
- if cdcgroups.Mode() != cdcgroups.Unified {
- return errors.New("rootless systemd driver doesn't support cgroup v1")
- }
- rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
- if rootlesskitParentEUID == "" {
- return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
- }
- euid, err := strconv.Atoi(rootlesskitParentEUID)
- if err != nil {
- return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
- }
- controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
- controllersFile, err := os.ReadFile(controllersPath)
- if err != nil {
- return err
- }
- v2Controllers = strings.Fields(string(controllersFile))
- }
- return specconv.ToRootless(s, v2Controllers)
- }
- }
- // withRootfulInRootless is used for "rootful-in-rootless" dind;
- // the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc.
- func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
- return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- specconv.ToRootfulInRootless(s)
- return nil
- }
- }
- // WithOOMScore sets the oom score
- func WithOOMScore(score *int) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- if s.Process == nil {
- s.Process = &specs.Process{}
- }
- s.Process.OOMScoreAdj = score
- return nil
- }
- }
- // WithSelinux sets the selinux labels
- func WithSelinux(c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- if s.Process == nil {
- s.Process = &specs.Process{}
- }
- if s.Linux == nil {
- s.Linux = &specs.Linux{}
- }
- s.Process.SelinuxLabel = c.GetProcessLabel()
- s.Linux.MountLabel = c.MountLabel
- return nil
- }
- }
- // WithApparmor sets the apparmor profile
- func WithApparmor(c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- if apparmor.HostSupports() {
- var appArmorProfile string
- if c.AppArmorProfile != "" {
- appArmorProfile = c.AppArmorProfile
- } else if c.HostConfig.Privileged {
- appArmorProfile = unconfinedAppArmorProfile
- } else {
- appArmorProfile = defaultAppArmorProfile
- }
- if appArmorProfile == defaultAppArmorProfile {
- // Unattended upgrades and other fun services can unload AppArmor
- // profiles inadvertently. Since we cannot store our profile in
- // /etc/apparmor.d, nor can we practically add other ways of
- // telling the system to keep our profile loaded, in order to make
- // sure that we keep the default profile enabled we dynamically
- // reload it if necessary.
- if err := ensureDefaultAppArmorProfile(); err != nil {
- return err
- }
- }
- if s.Process == nil {
- s.Process = &specs.Process{}
- }
- s.Process.ApparmorProfile = appArmorProfile
- }
- return nil
- }
- }
- // WithCapabilities sets the container's capabilties
- func WithCapabilities(c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- capabilities, err := caps.TweakCapabilities(
- caps.DefaultCapabilities(),
- c.HostConfig.CapAdd,
- c.HostConfig.CapDrop,
- c.HostConfig.Privileged,
- )
- if err != nil {
- return err
- }
- return oci.SetCapabilities(s, capabilities)
- }
- }
- func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
- p, err := getPath()
- if err != nil {
- return "", err
- }
- return c.GetResourcePath(p)
- }
- func getUser(c *container.Container, username string) (specs.User, error) {
- var usr specs.User
- passwdPath, err := resourcePath(c, user.GetPasswdPath)
- if err != nil {
- return usr, err
- }
- groupPath, err := resourcePath(c, user.GetGroupPath)
- if err != nil {
- return usr, err
- }
- execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
- if err != nil {
- return usr, err
- }
- usr.UID = uint32(execUser.Uid)
- usr.GID = uint32(execUser.Gid)
- usr.AdditionalGids = []uint32{usr.GID}
- var addGroups []int
- if len(c.HostConfig.GroupAdd) > 0 {
- addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
- if err != nil {
- return usr, err
- }
- }
- for _, g := range append(execUser.Sgids, addGroups...) {
- usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
- }
- return usr, nil
- }
- func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
- if s.Linux == nil {
- s.Linux = &specs.Linux{}
- }
- for i, n := range s.Linux.Namespaces {
- if n.Type == ns.Type {
- s.Linux.Namespaces[i] = ns
- return
- }
- }
- s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
- }
- // WithNamespaces sets the container's namespaces
- func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- userNS := false
- // user
- if c.HostConfig.UsernsMode.IsPrivate() {
- if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
- userNS = true
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.UserNamespace,
- })
- s.Linux.UIDMappings = specMapping(uidMap)
- s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
- }
- }
- // network
- if !c.Config.NetworkDisabled {
- networkMode := c.HostConfig.NetworkMode
- switch {
- case networkMode.IsContainer():
- nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer())
- if err != nil {
- return err
- }
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.NetworkNamespace,
- Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
- })
- if userNS {
- // to share a net namespace, the containers must also share a user namespace.
- //
- // FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.UserNamespace,
- Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
- })
- }
- case networkMode.IsHost():
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.NetworkNamespace,
- Path: c.NetworkSettings.SandboxKey,
- })
- default:
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.NetworkNamespace,
- })
- }
- }
- // ipc
- ipcMode := c.HostConfig.IpcMode
- if !ipcMode.Valid() {
- return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
- }
- switch {
- case ipcMode.IsContainer():
- ic, err := daemon.getIPCContainer(ipcMode.Container())
- if err != nil {
- return errors.Wrap(err, "failed to join IPC namespace")
- }
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.IPCNamespace,
- Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
- })
- if userNS {
- // to share a IPC namespace, the containers must also share a user namespace.
- //
- // FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.UserNamespace,
- Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
- })
- }
- case ipcMode.IsHost():
- oci.RemoveNamespace(s, specs.IPCNamespace)
- case ipcMode.IsEmpty():
- // A container was created by an older version of the daemon.
- // The default behavior used to be what is now called "shareable".
- fallthrough
- case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.IPCNamespace,
- })
- }
- // pid
- pidMode := c.HostConfig.PidMode
- if !pidMode.Valid() {
- return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode))
- }
- switch {
- case pidMode.IsContainer():
- pc, err := daemon.getPIDContainer(pidMode.Container())
- if err != nil {
- return errors.Wrap(err, "failed to join PID namespace")
- }
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.PIDNamespace,
- Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
- })
- if userNS {
- // to share a PID namespace, the containers must also share a user namespace.
- //
- // FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.UserNamespace,
- Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
- })
- }
- case pidMode.IsHost():
- oci.RemoveNamespace(s, specs.PIDNamespace)
- default:
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.PIDNamespace,
- })
- }
- // uts
- if !c.HostConfig.UTSMode.Valid() {
- return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
- }
- if c.HostConfig.UTSMode.IsHost() {
- oci.RemoveNamespace(s, specs.UTSNamespace)
- s.Hostname = ""
- }
- // cgroup
- if !c.HostConfig.CgroupnsMode.Valid() {
- return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
- }
- if c.HostConfig.CgroupnsMode.IsPrivate() {
- setNamespace(s, specs.LinuxNamespace{
- Type: specs.CgroupNamespace,
- })
- }
- return nil
- }
- }
- func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
- var ids []specs.LinuxIDMapping
- for _, item := range s {
- ids = append(ids, specs.LinuxIDMapping{
- HostID: uint32(item.HostID),
- ContainerID: uint32(item.ContainerID),
- Size: uint32(item.Size),
- })
- }
- return ids
- }
- // Get the source mount point of directory passed in as argument. Also return
- // optional fields.
- func getSourceMount(source string) (string, string, error) {
- // Ensure any symlinks are resolved.
- sourcePath, err := filepath.EvalSymlinks(source)
- if err != nil {
- return "", "", err
- }
- mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
- if err != nil {
- return "", "", err
- }
- if len(mi) < 1 {
- return "", "", fmt.Errorf("Can't find mount point of %s", source)
- }
- // find the longest mount point
- var idx, maxlen int
- for i := range mi {
- if len(mi[i].Mountpoint) > maxlen {
- maxlen = len(mi[i].Mountpoint)
- idx = i
- }
- }
- return mi[idx].Mountpoint, mi[idx].Optional, nil
- }
// Prefixes looked for in a mount's optional fields to decide how the mount
// propagates.
const (
	// sharedPropagationOption is the prefix that marks a shared mount.
	sharedPropagationOption = "shared:"
	// slavePropagationOption is the prefix that marks a slave mount.
	slavePropagationOption = "master:"
)
// hasMountInfoOption reports whether any space-separated field of opts
// starts with one of the given vals.
func hasMountInfoOption(opts string, vals ...string) bool {
	fields := strings.Split(opts, " ")
	for i := range fields {
		for j := range vals {
			if strings.HasPrefix(fields[i], vals[j]) {
				return true
			}
		}
	}
	return false
}
- // Ensure mount point on which path is mounted, is shared.
- func ensureShared(path string) error {
- sourceMount, optionalOpts, err := getSourceMount(path)
- if err != nil {
- return err
- }
- // Make sure source mount point is shared.
- if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
- return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
- }
- return nil
- }
- // Ensure mount point on which path is mounted, is either shared or slave.
- func ensureSharedOrSlave(path string) error {
- sourceMount, optionalOpts, err := getSourceMount(path)
- if err != nil {
- return err
- }
- if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
- return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
- }
- return nil
- }
- // Get the set of mount flags that are set on the mount that contains the given
- // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
- // bind-mounting "with options" will not fail with user namespaces, due to
- // kernel restrictions that require user namespace mounts to preserve
- // CL_UNPRIVILEGED locked flags.
- func getUnprivilegedMountFlags(path string) ([]string, error) {
- var statfs unix.Statfs_t
- if err := unix.Statfs(path, &statfs); err != nil {
- return nil, err
- }
- // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
- unprivilegedFlags := map[uint64]string{
- unix.MS_RDONLY: "ro",
- unix.MS_NODEV: "nodev",
- unix.MS_NOEXEC: "noexec",
- unix.MS_NOSUID: "nosuid",
- unix.MS_NOATIME: "noatime",
- unix.MS_RELATIME: "relatime",
- unix.MS_NODIRATIME: "nodiratime",
- }
- var flags []string
- for mask, flag := range unprivilegedFlags {
- if uint64(statfs.Flags)&mask == mask {
- flags = append(flags, flag)
- }
- }
- return flags, nil
- }
- var (
- mountPropagationMap = map[string]int{
- "private": mount.PRIVATE,
- "rprivate": mount.RPRIVATE,
- "shared": mount.SHARED,
- "rshared": mount.RSHARED,
- "slave": mount.SLAVE,
- "rslave": mount.RSLAVE,
- }
- mountPropagationReverseMap = map[int]string{
- mount.PRIVATE: "private",
- mount.RPRIVATE: "rprivate",
- mount.SHARED: "shared",
- mount.RSHARED: "rshared",
- mount.SLAVE: "slave",
- mount.RSLAVE: "rslave",
- }
- )
// inSlice reports whether s is an element of slice. The comparison is exact
// and therefore case sensitive.
func inSlice(slice []string, s string) bool {
	for i := range slice {
		if slice[i] == s {
			return true
		}
	}
	return false
}
- // withMounts sets the container's mounts
- func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
- if err := daemon.setupContainerMountsRoot(c); err != nil {
- return err
- }
- if err := daemon.setupIPCDirs(c); err != nil {
- return err
- }
- defer func() {
- if err != nil {
- daemon.cleanupSecretDir(c)
- }
- }()
- if err := daemon.setupSecretDir(c); err != nil {
- return err
- }
- ms, err := daemon.setupMounts(c)
- if err != nil {
- return err
- }
- if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
- ms = append(ms, c.IpcMounts()...)
- }
- tmpfsMounts, err := c.TmpfsMounts()
- if err != nil {
- return err
- }
- ms = append(ms, tmpfsMounts...)
- secretMounts, err := c.SecretMounts()
- if err != nil {
- return err
- }
- ms = append(ms, secretMounts...)
- sort.Sort(mounts(ms))
- mounts := ms
- userMounts := make(map[string]struct{})
- for _, m := range mounts {
- userMounts[m.Destination] = struct{}{}
- }
- // Copy all mounts from spec to defaultMounts, except for
- // - mounts overridden by a user supplied mount;
- // - all mounts under /dev if a user supplied /dev is present;
- // - /dev/shm, in case IpcMode is none.
- // While at it, also
- // - set size for /dev/shm from shmsize.
- defaultMounts := s.Mounts[:0]
- _, mountDev := userMounts["/dev"]
- for _, m := range s.Mounts {
- if _, ok := userMounts[m.Destination]; ok {
- // filter out mount overridden by a user supplied mount
- continue
- }
- if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
- // filter out everything under /dev if /dev is user-mounted
- continue
- }
- if m.Destination == "/dev/shm" {
- if c.HostConfig.IpcMode.IsNone() {
- // filter out /dev/shm for "none" IpcMode
- continue
- }
- // set size for /dev/shm mount from spec
- sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
- m.Options = append(m.Options, sizeOpt)
- }
- defaultMounts = append(defaultMounts, m)
- }
- s.Mounts = defaultMounts
- for _, m := range mounts {
- if m.Source == "tmpfs" {
- data := m.Data
- parser := volumemounts.NewParser()
- options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
- if data != "" {
- options = append(options, strings.Split(data, ",")...)
- }
- merged, err := mount.MergeTmpfsOptions(options)
- if err != nil {
- return err
- }
- s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
- continue
- }
- mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
- // Determine property of RootPropagation based on volume
- // properties. If a volume is shared, then keep root propagation
- // shared. This should work for slave and private volumes too.
- //
- // For slave volumes, it can be either [r]shared/[r]slave.
- //
- // For private volumes any root propagation value should work.
- pFlag := mountPropagationMap[m.Propagation]
- switch pFlag {
- case mount.SHARED, mount.RSHARED:
- if err := ensureShared(m.Source); err != nil {
- return err
- }
- rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
- if rootpg != mount.SHARED && rootpg != mount.RSHARED {
- if s.Linux == nil {
- s.Linux = &specs.Linux{}
- }
- s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
- }
- case mount.SLAVE, mount.RSLAVE:
- var fallback bool
- if err := ensureSharedOrSlave(m.Source); err != nil {
- // For backwards compatibility purposes, treat mounts from the daemon root
- // as special since we automatically add rslave propagation to these mounts
- // when the user did not set anything, so we should fallback to the old
- // behavior which is to use private propagation which is normally the
- // default.
- if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
- return err
- }
- cm, ok := c.MountPoints[m.Destination]
- if !ok {
- return err
- }
- if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
- // This means the user explicitly set a propagation, do not fallback in that case.
- return err
- }
- fallback = true
- log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
- }
- if !fallback {
- rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
- if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
- if s.Linux == nil {
- s.Linux = &specs.Linux{}
- }
- s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
- }
- }
- }
- bindMode := "rbind"
- if m.NonRecursive {
- bindMode = "bind"
- }
- opts := []string{bindMode}
- if !m.Writable {
- rro := true
- if m.ReadOnlyNonRecursive {
- rro = false
- if m.ReadOnlyForceRecursive {
- return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
- }
- }
- if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil {
- rro = false
- if m.ReadOnlyForceRecursive {
- return rroErr
- }
- }
- if rro {
- opts = append(opts, "rro")
- } else {
- opts = append(opts, "ro")
- }
- }
- if pFlag != 0 {
- opts = append(opts, mountPropagationReverseMap[pFlag])
- }
- // If we are using user namespaces, then we must make sure that we
- // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
- // "mount" when we bind-mount. The reason for this is that at the point
- // when runc sets up the root filesystem, it is already inside a user
- // namespace, and thus cannot change any flags that are locked.
- if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() {
- unprivOpts, err := getUnprivilegedMountFlags(m.Source)
- if err != nil {
- return err
- }
- opts = append(opts, unprivOpts...)
- }
- mt.Options = opts
- s.Mounts = append(s.Mounts, mt)
- }
- if s.Root.Readonly {
- for i, m := range s.Mounts {
- switch m.Destination {
- case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
- continue
- }
- if _, ok := userMounts[m.Destination]; !ok {
- if !inSlice(m.Options, "ro") {
- s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
- }
- }
- }
- }
- if c.HostConfig.Privileged {
- // clear readonly for /sys
- for i := range s.Mounts {
- if s.Mounts[i].Destination == "/sys" {
- clearReadOnly(&s.Mounts[i])
- }
- }
- if s.Linux != nil {
- s.Linux.ReadonlyPaths = nil
- s.Linux.MaskedPaths = nil
- }
- }
- // TODO: until a kernel/mount solution exists for handling remount in a user namespace,
- // we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
- if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
- for i, m := range s.Mounts {
- if m.Type == "cgroup" {
- clearReadOnly(&s.Mounts[i])
- }
- }
- }
- return nil
- }
- }
// sysctlExists reports whether the sysctl named s exists, by checking for the
// corresponding file under /proc/sys (dots in the name become path
// separators). runc will error if we add any sysctl that does not actually
// exist, so the default ones must not be added when running on an old kernel.
func sysctlExists(s string) bool {
	rel := strings.ReplaceAll(s, ".", "/")
	if _, err := os.Stat(filepath.Join("/proc", "sys", rel)); err != nil {
		return false
	}
	return true
}
- // withCommonOptions sets common docker options
- func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- if c.BaseFS == "" {
- return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
- }
- linkedEnv, err := daemon.setupLinkedContainers(c)
- if err != nil {
- return err
- }
- s.Root = &specs.Root{
- Path: c.BaseFS,
- Readonly: c.HostConfig.ReadonlyRootfs,
- }
- if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
- return err
- }
- cwd := c.Config.WorkingDir
- if len(cwd) == 0 {
- cwd = "/"
- }
- if s.Process == nil {
- s.Process = &specs.Process{}
- }
- s.Process.Args = append([]string{c.Path}, c.Args...)
- // only add the custom init if it is specified and the container is running in its
- // own private pid namespace. It does not make sense to add if it is running in the
- // host namespace or another container's pid namespace where we already have an init
- if c.HostConfig.PidMode.IsPrivate() {
- if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
- (c.HostConfig.Init == nil && daemonCfg.Init) {
- s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
- path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
- if err != nil {
- return err
- }
- s.Mounts = append(s.Mounts, specs.Mount{
- Destination: inContainerInitPath,
- Type: "bind",
- Source: path,
- Options: []string{"bind", "ro"},
- })
- }
- }
- s.Process.Cwd = cwd
- s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
- s.Process.Terminal = c.Config.Tty
- s.Hostname = c.Config.Hostname
- setLinuxDomainname(c, s)
- // Add default sysctls that are generally safe and useful; currently we
- // grant the capabilities to allow these anyway. You can override if
- // you want to restore the original behaviour.
- // We do not set network sysctls if network namespace is host, or if we are
- // joining an existing namespace, only if we create a new net namespace.
- if c.HostConfig.NetworkMode.IsPrivate() {
- // We cannot set up ping socket support in a user namespace
- userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
- if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
- // allow unprivileged ICMP echo sockets without CAP_NET_RAW
- s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
- }
- // allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
- if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
- s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
- }
- }
- return nil
- }
- }
- // withCgroups sets the container's cgroups
- func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- var cgroupsPath string
- scopePrefix := "docker"
- parent := "/docker"
- useSystemd := UsingSystemd(daemonCfg)
- if useSystemd {
- parent = "system.slice"
- if daemonCfg.Rootless {
- parent = "user.slice"
- }
- }
- if c.HostConfig.CgroupParent != "" {
- parent = c.HostConfig.CgroupParent
- } else if daemonCfg.CgroupParent != "" {
- parent = daemonCfg.CgroupParent
- }
- if useSystemd {
- cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
- log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
- } else {
- cgroupsPath = filepath.Join(parent, c.ID)
- }
- if s.Linux == nil {
- s.Linux = &specs.Linux{}
- }
- s.Linux.CgroupsPath = cgroupsPath
- // the rest is only needed for CPU RT controller
- if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
- return nil
- }
- p := cgroupsPath
- if useSystemd {
- initPath, err := cgroups.GetInitCgroup("cpu")
- if err != nil {
- return errors.Wrap(err, "unable to init CPU RT controller")
- }
- _, err = cgroups.GetOwnCgroup("cpu")
- if err != nil {
- return errors.Wrap(err, "unable to init CPU RT controller")
- }
- p = filepath.Join(initPath, s.Linux.CgroupsPath)
- }
- // Clean path to guard against things like ../../../BAD
- parentPath := filepath.Dir(p)
- if !filepath.IsAbs(parentPath) {
- parentPath = filepath.Clean("/" + parentPath)
- }
- mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
- if err != nil {
- return errors.Wrap(err, "unable to init CPU RT controller")
- }
- // When docker is run inside docker, the root is based of the host cgroup.
- // Should this be handled in runc/libcontainer/cgroups ?
- if strings.HasPrefix(root, "/docker/") {
- root = "/"
- }
- mnt = filepath.Join(mnt, root)
- if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
- return errors.Wrap(err, "unable to init CPU RT controller")
- }
- return nil
- }
- }
- // WithDevices sets the container's devices
- func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- // Build lists of devices allowed and created within the container.
- var devs []specs.LinuxDevice
- devPermissions := s.Linux.Resources.Devices
- if c.HostConfig.Privileged {
- hostDevices, err := coci.HostDevices()
- if err != nil {
- return err
- }
- devs = append(devs, hostDevices...)
- // adding device mappings in privileged containers
- for _, deviceMapping := range c.HostConfig.Devices {
- // issue a warning that custom cgroup permissions are ignored in privileged mode
- if deviceMapping.CgroupPermissions != "rwm" {
- log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
- }
- // issue a warning that the device path already exists via /dev mounting in privileged mode
- if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
- log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
- continue
- }
- d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
- if err != nil {
- return err
- }
- devs = append(devs, d...)
- }
- devPermissions = []specs.LinuxDeviceCgroup{
- {
- Allow: true,
- Access: "rwm",
- },
- }
- } else {
- for _, deviceMapping := range c.HostConfig.Devices {
- d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
- if err != nil {
- return err
- }
- devs = append(devs, d...)
- devPermissions = append(devPermissions, dPermissions...)
- }
- var err error
- devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
- if err != nil {
- return err
- }
- }
- if s.Linux == nil {
- s.Linux = &specs.Linux{}
- }
- if s.Linux.Resources == nil {
- s.Linux.Resources = &specs.LinuxResources{}
- }
- s.Linux.Devices = append(s.Linux.Devices, devs...)
- s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
- for _, req := range c.HostConfig.DeviceRequests {
- if err := daemon.handleDevice(req, s); err != nil {
- return err
- }
- }
- return nil
- }
- }
- // WithResources applies the container resources
- func WithResources(c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- r := c.HostConfig.Resources
- weightDevices, err := getBlkioWeightDevices(r)
- if err != nil {
- return err
- }
- readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
- if err != nil {
- return err
- }
- writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
- if err != nil {
- return err
- }
- readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
- if err != nil {
- return err
- }
- writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
- if err != nil {
- return err
- }
- memoryRes := getMemoryResources(r)
- cpuRes, err := getCPUResources(r)
- if err != nil {
- return err
- }
- if s.Linux == nil {
- s.Linux = &specs.Linux{}
- }
- if s.Linux.Resources == nil {
- s.Linux.Resources = &specs.LinuxResources{}
- }
- s.Linux.Resources.Memory = memoryRes
- s.Linux.Resources.CPU = cpuRes
- s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
- WeightDevice: weightDevices,
- ThrottleReadBpsDevice: readBpsDevice,
- ThrottleWriteBpsDevice: writeBpsDevice,
- ThrottleReadIOPSDevice: readIOpsDevice,
- ThrottleWriteIOPSDevice: writeIOpsDevice,
- }
- if r.BlkioWeight != 0 {
- w := r.BlkioWeight
- s.Linux.Resources.BlockIO.Weight = &w
- }
- s.Linux.Resources.Pids = getPidsLimit(r)
- return nil
- }
- }
- // WithSysctls sets the container's sysctls
- func WithSysctls(c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- if len(c.HostConfig.Sysctls) == 0 {
- return nil
- }
- if s.Linux == nil {
- s.Linux = &specs.Linux{}
- }
- if s.Linux.Sysctl == nil {
- s.Linux.Sysctl = make(map[string]string)
- }
- // We merge the sysctls injected above with the HostConfig (latter takes
- // precedence for backwards-compatibility reasons).
- for k, v := range c.HostConfig.Sysctls {
- s.Linux.Sysctl[k] = v
- }
- return nil
- }
- }
- // WithUser sets the container's user
- func WithUser(c *container.Container) coci.SpecOpts {
- return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
- if s.Process == nil {
- s.Process = &specs.Process{}
- }
- var err error
- s.Process.User, err = getUser(c, c.Config.User)
- return err
- }
- }
- func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container) (retSpec *specs.Spec, err error) {
- var (
- opts []coci.SpecOpts
- s = oci.DefaultSpec()
- )
- opts = append(opts,
- withCommonOptions(daemon, &daemonCfg.Config, c),
- withCgroups(daemon, &daemonCfg.Config, c),
- WithResources(c),
- WithSysctls(c),
- WithDevices(daemon, c),
- withRlimits(daemon, &daemonCfg.Config, c),
- WithNamespaces(daemon, c),
- WithCapabilities(c),
- WithSeccomp(daemon, c),
- withMounts(daemon, daemonCfg, c),
- withLibnetwork(daemon, &daemonCfg.Config, c),
- WithApparmor(c),
- WithSelinux(c),
- WithOOMScore(&c.HostConfig.OomScoreAdj),
- coci.WithAnnotations(c.HostConfig.Annotations),
- WithUser(c),
- )
- if c.NoNewPrivileges {
- opts = append(opts, coci.WithNoNewPrivileges)
- }
- if c.Config.Tty {
- opts = append(opts, WithConsoleSize(c))
- }
- // Set the masked and readonly paths with regard to the host config options if they are set.
- if c.HostConfig.MaskedPaths != nil {
- opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
- }
- if c.HostConfig.ReadonlyPaths != nil {
- opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
- }
- if daemonCfg.Rootless {
- opts = append(opts, withRootless(daemon, &daemonCfg.Config))
- } else if userns.RunningInUserNS() {
- opts = append(opts, withRootfulInRootless(daemon, &daemonCfg.Config))
- }
- var snapshotter, snapshotKey string
- if daemon.UsesSnapshotter() {
- snapshotter = daemon.imageService.StorageDriver()
- snapshotKey = c.ID
- }
- return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{
- ID: c.ID,
- Snapshotter: snapshotter,
- SnapshotKey: snapshotKey,
- }, &s, opts...)
- }
- func clearReadOnly(m *specs.Mount) {
- var opt []string
- for _, o := range m.Options {
- if o != "ro" {
- opt = append(opt, o)
- }
- }
- m.Options = opt
- }
- // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
- func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) {
- ulimits := c.Ulimits
- // Merge ulimits with daemon defaults
- ulIdx := make(map[string]struct{})
- for _, ul := range ulimits {
- ulIdx[ul.Name] = struct{}{}
- }
- for name, ul := range daemonCfg.Ulimits {
- if _, exists := ulIdx[name]; !exists {
- ulimits = append(ulimits, ul)
- }
- }
- c.Ulimits = ulimits
- }
|