2016-03-18 18:50:19 +00:00
|
|
|
package daemon
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"os"
|
2016-06-27 21:38:47 +00:00
|
|
|
"os/exec"
|
2016-03-18 18:50:19 +00:00
|
|
|
"path/filepath"
|
2016-05-06 22:09:46 +00:00
|
|
|
"regexp"
|
2016-04-26 08:20:17 +00:00
|
|
|
"sort"
|
2016-03-18 18:50:19 +00:00
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
|
2016-09-06 18:18:12 +00:00
|
|
|
containertypes "github.com/docker/docker/api/types/container"
|
2016-03-18 18:50:19 +00:00
|
|
|
"github.com/docker/docker/container"
|
|
|
|
"github.com/docker/docker/daemon/caps"
|
2017-04-10 09:25:15 +00:00
|
|
|
daemonconfig "github.com/docker/docker/daemon/config"
|
2016-03-18 18:50:19 +00:00
|
|
|
"github.com/docker/docker/oci"
|
|
|
|
"github.com/docker/docker/pkg/idtools"
|
|
|
|
"github.com/docker/docker/pkg/mount"
|
|
|
|
"github.com/docker/docker/volume"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/apparmor"
|
2016-06-07 19:05:43 +00:00
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
2016-03-18 18:50:19 +00:00
|
|
|
"github.com/opencontainers/runc/libcontainer/devices"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/user"
|
2016-06-07 19:05:43 +00:00
|
|
|
specs "github.com/opencontainers/runtime-spec/specs-go"
|
2017-07-26 21:42:13 +00:00
|
|
|
"github.com/sirupsen/logrus"
|
2017-10-15 06:06:20 +00:00
|
|
|
"golang.org/x/sys/unix"
|
2016-03-18 18:50:19 +00:00
|
|
|
)
|
|
|
|
|
2017-09-11 18:55:05 +00:00
|
|
|
// deviceCgroupRuleRegex matches a device cgroup rule of the form
// "<type> <major>:<minor> <access>", where type is one of a, c or b, major
// and minor are decimal numbers or "*", and access is one to three characters
// drawn from r, w and m. The four parts are exposed as submatches 1 to 4.
var deviceCgroupRuleRegex = regexp.MustCompile("^([acb]) ([0-9]+|\\*):([0-9]+|\\*) ([rwm]{1,3})$") // nolint: gosimple
|
|
|
|
|
2016-03-18 18:50:19 +00:00
|
|
|
func setResources(s *specs.Spec, r containertypes.Resources) error {
|
|
|
|
weightDevices, err := getBlkioWeightDevices(r)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-04-29 20:39:04 +00:00
|
|
|
readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
|
2016-03-18 18:50:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-04-29 20:39:04 +00:00
|
|
|
writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
|
2016-03-18 18:50:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-04-29 20:39:04 +00:00
|
|
|
readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
|
2016-03-18 18:50:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-04-29 20:39:04 +00:00
|
|
|
writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
|
2016-03-18 18:50:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
memoryRes := getMemoryResources(r)
|
2017-04-27 21:52:47 +00:00
|
|
|
cpuRes, err := getCPUResources(r)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-03-18 18:50:19 +00:00
|
|
|
blkioWeight := r.BlkioWeight
|
|
|
|
|
2017-04-27 21:52:47 +00:00
|
|
|
specResources := &specs.LinuxResources{
|
2016-03-18 18:50:19 +00:00
|
|
|
Memory: memoryRes,
|
|
|
|
CPU: cpuRes,
|
2017-04-27 21:52:47 +00:00
|
|
|
BlockIO: &specs.LinuxBlockIO{
|
2016-03-18 18:50:19 +00:00
|
|
|
Weight: &blkioWeight,
|
|
|
|
WeightDevice: weightDevices,
|
|
|
|
ThrottleReadBpsDevice: readBpsDevice,
|
|
|
|
ThrottleWriteBpsDevice: writeBpsDevice,
|
|
|
|
ThrottleReadIOPSDevice: readIOpsDevice,
|
|
|
|
ThrottleWriteIOPSDevice: writeIOpsDevice,
|
|
|
|
},
|
2017-04-27 21:52:47 +00:00
|
|
|
Pids: &specs.LinuxPids{
|
|
|
|
Limit: r.PidsLimit,
|
2016-03-18 18:50:19 +00:00
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
|
|
|
|
specResources.Devices = s.Linux.Resources.Devices
|
|
|
|
}
|
|
|
|
|
|
|
|
s.Linux.Resources = specResources
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func setDevices(s *specs.Spec, c *container.Container) error {
|
|
|
|
// Build lists of devices allowed and created within the container.
|
2017-04-27 21:52:47 +00:00
|
|
|
var devs []specs.LinuxDevice
|
2016-03-24 19:01:12 +00:00
|
|
|
devPermissions := s.Linux.Resources.Devices
|
2016-03-18 18:50:19 +00:00
|
|
|
if c.HostConfig.Privileged {
|
|
|
|
hostDevices, err := devices.HostDevices()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
for _, d := range hostDevices {
|
2016-11-17 00:18:43 +00:00
|
|
|
devs = append(devs, oci.Device(d))
|
2016-03-18 18:50:19 +00:00
|
|
|
}
|
2017-04-27 21:52:47 +00:00
|
|
|
devPermissions = []specs.LinuxDeviceCgroup{
|
2016-03-24 19:01:12 +00:00
|
|
|
{
|
|
|
|
Allow: true,
|
2017-04-27 21:52:47 +00:00
|
|
|
Access: "rwm",
|
2016-03-24 19:01:12 +00:00
|
|
|
},
|
|
|
|
}
|
2016-03-18 18:50:19 +00:00
|
|
|
} else {
|
|
|
|
for _, deviceMapping := range c.HostConfig.Devices {
|
2016-11-17 00:18:43 +00:00
|
|
|
d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
|
2016-03-18 18:50:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
devs = append(devs, d...)
|
2016-03-24 19:01:12 +00:00
|
|
|
devPermissions = append(devPermissions, dPermissions...)
|
2016-03-18 18:50:19 +00:00
|
|
|
}
|
2016-05-06 22:09:46 +00:00
|
|
|
|
|
|
|
for _, deviceCgroupRule := range c.HostConfig.DeviceCgroupRules {
|
|
|
|
ss := deviceCgroupRuleRegex.FindAllStringSubmatch(deviceCgroupRule, -1)
|
|
|
|
if len(ss[0]) != 5 {
|
|
|
|
return fmt.Errorf("invalid device cgroup rule format: '%s'", deviceCgroupRule)
|
|
|
|
}
|
|
|
|
matches := ss[0]
|
|
|
|
|
2017-04-27 21:52:47 +00:00
|
|
|
dPermissions := specs.LinuxDeviceCgroup{
|
2016-05-06 22:09:46 +00:00
|
|
|
Allow: true,
|
2017-04-27 21:52:47 +00:00
|
|
|
Type: matches[1],
|
|
|
|
Access: matches[4],
|
2016-05-06 22:09:46 +00:00
|
|
|
}
|
|
|
|
if matches[2] == "*" {
|
|
|
|
major := int64(-1)
|
|
|
|
dPermissions.Major = &major
|
|
|
|
} else {
|
|
|
|
major, err := strconv.ParseInt(matches[2], 10, 64)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("invalid major value in device cgroup rule format: '%s'", deviceCgroupRule)
|
|
|
|
}
|
|
|
|
dPermissions.Major = &major
|
|
|
|
}
|
|
|
|
if matches[3] == "*" {
|
|
|
|
minor := int64(-1)
|
|
|
|
dPermissions.Minor = &minor
|
|
|
|
} else {
|
|
|
|
minor, err := strconv.ParseInt(matches[3], 10, 64)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("invalid minor value in device cgroup rule format: '%s'", deviceCgroupRule)
|
|
|
|
}
|
|
|
|
dPermissions.Minor = &minor
|
|
|
|
}
|
|
|
|
devPermissions = append(devPermissions, dPermissions)
|
|
|
|
}
|
2016-03-18 18:50:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
s.Linux.Devices = append(s.Linux.Devices, devs...)
|
2016-03-24 19:01:12 +00:00
|
|
|
s.Linux.Resources.Devices = devPermissions
|
2016-03-18 18:50:19 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-09-22 13:52:41 +00:00
|
|
|
func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error {
|
2017-08-01 15:51:24 +00:00
|
|
|
var rlimits []specs.POSIXRlimit
|
2016-03-18 18:50:19 +00:00
|
|
|
|
2016-09-08 04:23:56 +00:00
|
|
|
// We want to leave the original HostConfig alone so make a copy here
|
|
|
|
hostConfig := *c.HostConfig
|
|
|
|
// Merge with the daemon defaults
|
|
|
|
daemon.mergeUlimits(&hostConfig)
|
|
|
|
for _, ul := range hostConfig.Ulimits {
|
2017-08-01 15:51:24 +00:00
|
|
|
rlimits = append(rlimits, specs.POSIXRlimit{
|
2016-03-18 18:50:19 +00:00
|
|
|
Type: "RLIMIT_" + strings.ToUpper(ul.Name),
|
|
|
|
Soft: uint64(ul.Soft),
|
|
|
|
Hard: uint64(ul.Hard),
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
s.Process.Rlimits = rlimits
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func setUser(s *specs.Spec, c *container.Container) error {
|
|
|
|
uid, gid, additionalGids, err := getUser(c, c.Config.User)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
s.Process.User.UID = uid
|
|
|
|
s.Process.User.GID = gid
|
|
|
|
s.Process.User.AdditionalGids = additionalGids
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
|
2017-08-04 00:22:00 +00:00
|
|
|
fp, err := c.GetResourcePath(p)
|
2016-03-18 18:50:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return os.Open(fp)
|
|
|
|
}
|
|
|
|
|
|
|
|
func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
|
|
|
|
passwdPath, err := user.GetPasswdPath()
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, nil, err
|
|
|
|
}
|
|
|
|
groupPath, err := user.GetGroupPath()
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, nil, err
|
|
|
|
}
|
|
|
|
passwdFile, err := readUserFile(c, passwdPath)
|
|
|
|
if err == nil {
|
|
|
|
defer passwdFile.Close()
|
|
|
|
}
|
|
|
|
groupFile, err := readUserFile(c, groupPath)
|
|
|
|
if err == nil {
|
|
|
|
defer groupFile.Close()
|
|
|
|
}
|
|
|
|
|
|
|
|
execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// todo: fix this double read by a change to libcontainer/user pkg
|
|
|
|
groupFile, err = readUserFile(c, groupPath)
|
|
|
|
if err == nil {
|
|
|
|
defer groupFile.Close()
|
|
|
|
}
|
|
|
|
var addGroups []int
|
|
|
|
if len(c.HostConfig.GroupAdd) > 0 {
|
|
|
|
addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
uid := uint32(execUser.Uid)
|
|
|
|
gid := uint32(execUser.Gid)
|
|
|
|
sgids := append(execUser.Sgids, addGroups...)
|
|
|
|
var additionalGids []uint32
|
|
|
|
for _, g := range sgids {
|
|
|
|
additionalGids = append(additionalGids, uint32(g))
|
|
|
|
}
|
|
|
|
return uid, gid, additionalGids, nil
|
|
|
|
}
|
|
|
|
|
2017-04-27 21:52:47 +00:00
|
|
|
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
|
2016-03-18 18:50:19 +00:00
|
|
|
for i, n := range s.Linux.Namespaces {
|
|
|
|
if n.Type == ns.Type {
|
|
|
|
s.Linux.Namespaces[i] = ns
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
|
|
|
|
}
|
|
|
|
|
|
|
|
func setCapabilities(s *specs.Spec, c *container.Container) error {
|
|
|
|
var caplist []string
|
|
|
|
var err error
|
|
|
|
if c.HostConfig.Privileged {
|
|
|
|
caplist = caps.GetAllCapabilities()
|
|
|
|
} else {
|
2017-04-27 21:52:47 +00:00
|
|
|
caplist, err = caps.TweakCapabilities(s.Process.Capabilities.Effective, c.HostConfig.CapAdd, c.HostConfig.CapDrop)
|
2016-03-18 18:50:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2017-04-27 21:52:47 +00:00
|
|
|
s.Process.Capabilities.Effective = caplist
|
|
|
|
s.Process.Capabilities.Bounding = caplist
|
|
|
|
s.Process.Capabilities.Permitted = caplist
|
|
|
|
s.Process.Capabilities.Inheritable = caplist
|
2016-03-18 18:50:19 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error {
|
2016-03-22 01:30:21 +00:00
|
|
|
userNS := false
|
|
|
|
// user
|
|
|
|
if c.HostConfig.UsernsMode.IsPrivate() {
|
2017-05-19 22:06:46 +00:00
|
|
|
uidMap := daemon.idMappings.UIDs()
|
2016-03-22 01:30:21 +00:00
|
|
|
if uidMap != nil {
|
|
|
|
userNS = true
|
2017-04-27 21:52:47 +00:00
|
|
|
ns := specs.LinuxNamespace{Type: "user"}
|
2016-03-22 01:30:21 +00:00
|
|
|
setNamespace(s, ns)
|
|
|
|
s.Linux.UIDMappings = specMapping(uidMap)
|
2017-05-19 22:06:46 +00:00
|
|
|
s.Linux.GIDMappings = specMapping(daemon.idMappings.GIDs())
|
2016-03-22 01:30:21 +00:00
|
|
|
}
|
|
|
|
}
|
2016-03-18 18:50:19 +00:00
|
|
|
// network
|
|
|
|
if !c.Config.NetworkDisabled {
|
2017-04-27 21:52:47 +00:00
|
|
|
ns := specs.LinuxNamespace{Type: "network"}
|
2016-03-18 18:50:19 +00:00
|
|
|
parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
|
|
|
|
if parts[0] == "container" {
|
|
|
|
nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
|
2016-03-22 01:30:21 +00:00
|
|
|
if userNS {
|
|
|
|
// to share a net namespace, they must also share a user namespace
|
2017-04-27 21:52:47 +00:00
|
|
|
nsUser := specs.LinuxNamespace{Type: "user"}
|
2016-03-22 01:30:21 +00:00
|
|
|
nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
|
|
|
|
setNamespace(s, nsUser)
|
|
|
|
}
|
2016-03-18 18:50:19 +00:00
|
|
|
} else if c.HostConfig.NetworkMode.IsHost() {
|
|
|
|
ns.Path = c.NetworkSettings.SandboxKey
|
|
|
|
}
|
|
|
|
setNamespace(s, ns)
|
|
|
|
}
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents are not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
4. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 21:58:50 +00:00
|
|
|
|
2016-03-18 18:50:19 +00:00
|
|
|
// ipc
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents are not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
4. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 21:58:50 +00:00
|
|
|
ipcMode := c.HostConfig.IpcMode
|
|
|
|
switch {
|
|
|
|
case ipcMode.IsContainer():
|
2017-04-27 21:52:47 +00:00
|
|
|
ns := specs.LinuxNamespace{Type: "ipc"}
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents are not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
4. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 21:58:50 +00:00
|
|
|
ic, err := daemon.getIpcContainer(ipcMode.Container())
|
2016-03-18 18:50:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
|
|
|
|
setNamespace(s, ns)
|
2016-03-22 01:30:21 +00:00
|
|
|
if userNS {
|
|
|
|
// to share an IPC namespace, they must also share a user namespace
|
2017-04-27 21:52:47 +00:00
|
|
|
nsUser := specs.LinuxNamespace{Type: "user"}
|
2016-03-22 01:30:21 +00:00
|
|
|
nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
|
|
|
|
setNamespace(s, nsUser)
|
|
|
|
}
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents are not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
4. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 21:58:50 +00:00
|
|
|
case ipcMode.IsHost():
|
2017-04-27 21:52:47 +00:00
|
|
|
oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents are not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
4. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 21:58:50 +00:00
|
|
|
case ipcMode.IsEmpty():
|
|
|
|
// A container was created by an older version of the daemon.
|
|
|
|
// The default behavior used to be what is now called "shareable".
|
|
|
|
fallthrough
|
|
|
|
case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
|
2017-04-27 21:52:47 +00:00
|
|
|
ns := specs.LinuxNamespace{Type: "ipc"}
|
2016-03-18 18:50:19 +00:00
|
|
|
setNamespace(s, ns)
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents are not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
4. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 21:58:50 +00:00
|
|
|
default:
|
|
|
|
return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
|
2016-03-18 18:50:19 +00:00
|
|
|
}
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents are not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
4. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 21:58:50 +00:00
|
|
|
|
2016-03-18 18:50:19 +00:00
|
|
|
// pid
|
2016-05-06 18:56:03 +00:00
|
|
|
if c.HostConfig.PidMode.IsContainer() {
|
2017-04-27 21:52:47 +00:00
|
|
|
ns := specs.LinuxNamespace{Type: "pid"}
|
2016-05-06 18:56:03 +00:00
|
|
|
pc, err := daemon.getPidContainer(c)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
|
|
|
|
setNamespace(s, ns)
|
|
|
|
if userNS {
|
2016-05-08 01:36:10 +00:00
|
|
|
// to share a PID namespace, they must also share a user namespace
|
2017-04-27 21:52:47 +00:00
|
|
|
nsUser := specs.LinuxNamespace{Type: "user"}
|
2016-05-06 18:56:03 +00:00
|
|
|
nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
|
|
|
|
setNamespace(s, nsUser)
|
|
|
|
}
|
|
|
|
} else if c.HostConfig.PidMode.IsHost() {
|
2017-04-27 21:52:47 +00:00
|
|
|
oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
|
2016-05-06 18:56:03 +00:00
|
|
|
} else {
|
2017-04-27 21:52:47 +00:00
|
|
|
ns := specs.LinuxNamespace{Type: "pid"}
|
2016-05-06 18:56:03 +00:00
|
|
|
setNamespace(s, ns)
|
2016-03-18 18:50:19 +00:00
|
|
|
}
|
|
|
|
// uts
|
|
|
|
if c.HostConfig.UTSMode.IsHost() {
|
2017-04-27 21:52:47 +00:00
|
|
|
oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
|
2016-03-18 18:50:19 +00:00
|
|
|
s.Hostname = ""
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-04-27 21:52:47 +00:00
|
|
|
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
|
|
|
|
var ids []specs.LinuxIDMapping
|
2016-03-18 18:50:19 +00:00
|
|
|
for _, item := range s {
|
2017-04-27 21:52:47 +00:00
|
|
|
ids = append(ids, specs.LinuxIDMapping{
|
2016-03-18 18:50:19 +00:00
|
|
|
HostID: uint32(item.HostID),
|
|
|
|
ContainerID: uint32(item.ContainerID),
|
|
|
|
Size: uint32(item.Size),
|
|
|
|
})
|
|
|
|
}
|
|
|
|
return ids
|
|
|
|
}
|
|
|
|
|
|
|
|
func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
|
|
|
|
for _, m := range mountinfo {
|
|
|
|
if m.Mountpoint == dir {
|
|
|
|
return m
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get the source mount point of directory passed in as argument. Also return
|
|
|
|
// optional fields.
|
|
|
|
func getSourceMount(source string) (string, string, error) {
|
|
|
|
// Ensure any symlinks are resolved.
|
|
|
|
sourcePath, err := filepath.EvalSymlinks(source)
|
|
|
|
if err != nil {
|
|
|
|
return "", "", err
|
|
|
|
}
|
|
|
|
|
|
|
|
mountinfos, err := mount.GetMounts()
|
|
|
|
if err != nil {
|
|
|
|
return "", "", err
|
|
|
|
}
|
|
|
|
|
|
|
|
mountinfo := getMountInfo(mountinfos, sourcePath)
|
|
|
|
if mountinfo != nil {
|
|
|
|
return sourcePath, mountinfo.Optional, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
path := sourcePath
|
|
|
|
for {
|
|
|
|
path = filepath.Dir(path)
|
|
|
|
|
|
|
|
mountinfo = getMountInfo(mountinfos, path)
|
|
|
|
if mountinfo != nil {
|
|
|
|
return path, mountinfo.Optional, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if path == "/" {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we are here, we did not find parent mount. Something is wrong.
|
|
|
|
return "", "", fmt.Errorf("Could not find source mount of %s", source)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ensure mount point on which path is mounted, is shared.
|
|
|
|
func ensureShared(path string) error {
|
|
|
|
sharedMount := false
|
|
|
|
|
|
|
|
sourceMount, optionalOpts, err := getSourceMount(path)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
// Make sure source mount point is shared.
|
|
|
|
optsSplit := strings.Split(optionalOpts, " ")
|
|
|
|
for _, opt := range optsSplit {
|
|
|
|
if strings.HasPrefix(opt, "shared:") {
|
|
|
|
sharedMount = true
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if !sharedMount {
|
2017-08-17 19:16:30 +00:00
|
|
|
return fmt.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
|
2016-03-18 18:50:19 +00:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ensure mount point on which path is mounted, is either shared or slave.
|
|
|
|
func ensureSharedOrSlave(path string) error {
|
|
|
|
sharedMount := false
|
|
|
|
slaveMount := false
|
|
|
|
|
|
|
|
sourceMount, optionalOpts, err := getSourceMount(path)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
// Make sure source mount point is shared.
|
|
|
|
optsSplit := strings.Split(optionalOpts, " ")
|
|
|
|
for _, opt := range optsSplit {
|
|
|
|
if strings.HasPrefix(opt, "shared:") {
|
|
|
|
sharedMount = true
|
|
|
|
break
|
|
|
|
} else if strings.HasPrefix(opt, "master:") {
|
|
|
|
slaveMount = true
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if !sharedMount && !slaveMount {
|
2017-08-17 19:16:30 +00:00
|
|
|
return fmt.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
|
2016-03-18 18:50:19 +00:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-10-15 06:06:20 +00:00
|
|
|
// Get the set of mount flags that are set on the mount that contains the given
|
|
|
|
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
|
|
|
|
// bind-mounting "with options" will not fail with user namespaces, due to
|
|
|
|
// kernel restrictions that require user namespace mounts to preserve
|
|
|
|
// CL_UNPRIVILEGED locked flags.
|
|
|
|
func getUnprivilegedMountFlags(path string) ([]string, error) {
|
|
|
|
var statfs unix.Statfs_t
|
|
|
|
if err := unix.Statfs(path, &statfs); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
|
|
|
|
unprivilegedFlags := map[uint64]string{
|
|
|
|
unix.MS_RDONLY: "ro",
|
|
|
|
unix.MS_NODEV: "nodev",
|
|
|
|
unix.MS_NOEXEC: "noexec",
|
|
|
|
unix.MS_NOSUID: "nosuid",
|
|
|
|
unix.MS_NOATIME: "noatime",
|
|
|
|
unix.MS_RELATIME: "relatime",
|
|
|
|
unix.MS_NODIRATIME: "nodiratime",
|
|
|
|
}
|
|
|
|
|
|
|
|
var flags []string
|
|
|
|
for mask, flag := range unprivilegedFlags {
|
|
|
|
if uint64(statfs.Flags)&mask == mask {
|
|
|
|
flags = append(flags, flag)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return flags, nil
|
|
|
|
}
|
|
|
|
|
2016-03-18 18:50:19 +00:00
|
|
|
var (
|
|
|
|
mountPropagationMap = map[string]int{
|
|
|
|
"private": mount.PRIVATE,
|
|
|
|
"rprivate": mount.RPRIVATE,
|
|
|
|
"shared": mount.SHARED,
|
|
|
|
"rshared": mount.RSHARED,
|
|
|
|
"slave": mount.SLAVE,
|
|
|
|
"rslave": mount.RSLAVE,
|
|
|
|
}
|
|
|
|
|
|
|
|
mountPropagationReverseMap = map[int]string{
|
|
|
|
mount.PRIVATE: "private",
|
|
|
|
mount.RPRIVATE: "rprivate",
|
|
|
|
mount.SHARED: "shared",
|
|
|
|
mount.RSHARED: "rshared",
|
|
|
|
mount.SLAVE: "slave",
|
|
|
|
mount.RSLAVE: "rslave",
|
|
|
|
}
|
|
|
|
)
|
|
|
|
|
2017-11-10 05:18:48 +00:00
|
|
|
// inSlice reports whether s occurs in slice. Elements are compared with ==,
// so the match is exact and case sensitive. A nil or empty slice never
// contains s.
func inSlice(slice []string, s string) bool {
	for i := 0; i < len(slice); i++ {
		if slice[i] == s {
			return true
		}
	}
	return false
}
|
|
|
|
|
2016-03-18 18:50:19 +00:00
|
|
|
// setMounts rebuilds s.Mounts for container c.
//
// It first filters the mounts already present in the spec against the
// user-supplied mounts, then appends one spec mount for every entry of mounts
// (tmpfs or bind), and finally adjusts read-only options depending on
// s.Root.Readonly, c.HostConfig.Privileged and the daemon's UID mappings.
// s.Linux.RootfsPropagation may be changed as a side effect of shared or
// slave bind mounts.
//
// An error is returned when the destination of a user-supplied mount is
// already present in s.Mounts, when tmpfs options cannot be merged, when the
// source of a shared/slave mount fails the propagation check, or when the
// mount flags of a source cannot be read while RemappedRoot is configured.
func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
	// Set of destinations covered by user-supplied mounts.
	userMounts := make(map[string]struct{})
	for _, m := range mounts {
		userMounts[m.Destination] = struct{}{}
	}

	// Copy all mounts from spec to defaultMounts, except for
	//  - mounts overridden by a user supplied mount;
	//  - all mounts under /dev if a user supplied /dev is present;
	//  - /dev/shm, in case IpcMode is none.
	// While at it, also
	//  - set size for /dev/shm from shmsize.
	var defaultMounts []specs.Mount
	_, mountDev := userMounts["/dev"]
	for _, m := range s.Mounts {
		if _, ok := userMounts[m.Destination]; ok {
			// filter out mount overridden by a user supplied mount
			continue
		}
		if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
			// filter out everything under /dev if /dev is user-mounted
			continue
		}

		if m.Destination == "/dev/shm" {
			if c.HostConfig.IpcMode.IsNone() {
				// filter out /dev/shm for "none" IpcMode
				continue
			}
			// set size for /dev/shm mount from spec
			sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
			m.Options = append(m.Options, sizeOpt)
		}

		defaultMounts = append(defaultMounts, m)
	}

	s.Mounts = defaultMounts
	for _, m := range mounts {
		// s.Mounts grows as this loop appends to it, so the scan below also
		// sees the user mounts added by earlier iterations.
		for _, cm := range s.Mounts {
			if cm.Destination == m.Destination {
				return duplicateMountPointError(m.Destination)
			}
		}

		if m.Source == "tmpfs" {
			data := m.Data
			parser := volume.NewParser("linux")
			// Default options come first; any options from m.Data are
			// appended after them before the list is merged.
			options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
			if data != "" {
				options = append(options, strings.Split(data, ",")...)
			}

			merged, err := mount.MergeTmpfsOptions(options)
			if err != nil {
				return err
			}

			s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
			continue
		}

		// Everything that is not tmpfs becomes a bind mount.
		mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

		// Determine property of RootPropagation based on volume
		// properties. If a volume is shared, then keep root propagation
		// shared. This should work for slave and private volumes too.
		//
		// For slave volumes, it can be either [r]shared/[r]slave.
		//
		// For private volumes any root propagation value should work.
		pFlag := mountPropagationMap[m.Propagation]
		if pFlag == mount.SHARED || pFlag == mount.RSHARED {
			if err := ensureShared(m.Source); err != nil {
				return err
			}
			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
			if rootpg != mount.SHARED && rootpg != mount.RSHARED {
				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
			}
		} else if pFlag == mount.SLAVE || pFlag == mount.RSLAVE {
			if err := ensureSharedOrSlave(m.Source); err != nil {
				return err
			}
			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
			if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
			}
		}

		opts := []string{"rbind"}
		if !m.Writable {
			opts = append(opts, "ro")
		}
		// pFlag is 0 when m.Propagation is not a key of mountPropagationMap;
		// in that case no propagation option is added.
		if pFlag != 0 {
			opts = append(opts, mountPropagationReverseMap[pFlag])
		}

		// If we are using user namespaces, then we must make sure that we
		// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
		// "mount" when we bind-mount. The reason for this is that at the point
		// when runc sets up the root filesystem, it is already inside a user
		// namespace, and thus cannot change any flags that are locked.
		if daemon.configStore.RemappedRoot != "" {
			unprivOpts, err := getUnprivilegedMountFlags(m.Source)
			if err != nil {
				return err
			}
			opts = append(opts, unprivOpts...)
		}

		mt.Options = opts
		s.Mounts = append(s.Mounts, mt)
	}

	if s.Root.Readonly {
		// With a read-only root, add "ro" to every mount that is neither one
		// of the destinations listed below nor user-supplied, unless the
		// option is already present.
		for i, m := range s.Mounts {
			switch m.Destination {
			case "/proc", "/dev/pts", "/dev/mqueue", "/dev":
				continue
			}
			if _, ok := userMounts[m.Destination]; !ok {
				if !inSlice(m.Options, "ro") {
					s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
				}
			}
		}
	}

	if c.HostConfig.Privileged {
		if !s.Root.Readonly {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}
		// Privileged containers get no read-only or masked paths.
		s.Linux.ReadonlyPaths = nil
		s.Linux.MaskedPaths = nil
	}

	// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
	// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
	if uidMap := daemon.idMappings.UIDs(); uidMap != nil || c.HostConfig.Privileged {
		for i, m := range s.Mounts {
			if m.Type == "cgroup" {
				clearReadOnly(&s.Mounts[i])
			}
		}
	}

	return nil
}
|
|
|
|
|
|
|
|
func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error {
|
|
|
|
linkedEnv, err := daemon.setupLinkedContainers(c)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-08-01 15:51:24 +00:00
|
|
|
s.Root = &specs.Root{
|
2017-08-04 00:22:00 +00:00
|
|
|
Path: c.BaseFS.Path(),
|
2016-03-18 18:50:19 +00:00
|
|
|
Readonly: c.HostConfig.ReadonlyRootfs,
|
|
|
|
}
|
2017-05-31 21:56:23 +00:00
|
|
|
if err := c.SetupWorkingDirectory(daemon.idMappings.RootPair()); err != nil {
|
2016-03-18 18:50:19 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
cwd := c.Config.WorkingDir
|
|
|
|
if len(cwd) == 0 {
|
|
|
|
cwd = "/"
|
|
|
|
}
|
|
|
|
s.Process.Args = append([]string{c.Path}, c.Args...)
|
2016-06-27 21:38:47 +00:00
|
|
|
|
|
|
|
// only add the custom init if it is specified and the container is running in its
|
|
|
|
// own private pid namespace. It does not make sense to add if it is running in the
|
|
|
|
// host namespace or another container's pid namespace where we already have an init
|
|
|
|
if c.HostConfig.PidMode.IsPrivate() {
|
|
|
|
if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
|
|
|
|
(c.HostConfig.Init == nil && daemon.configStore.Init) {
|
2016-11-15 22:07:13 +00:00
|
|
|
s.Process.Args = append([]string{"/dev/init", "--", c.Path}, c.Args...)
|
2016-09-27 10:51:42 +00:00
|
|
|
var path string
|
2017-04-10 11:11:58 +00:00
|
|
|
if daemon.configStore.InitPath == "" {
|
2017-04-10 09:25:15 +00:00
|
|
|
path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
|
2016-09-27 10:51:42 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if daemon.configStore.InitPath != "" {
|
|
|
|
path = daemon.configStore.InitPath
|
|
|
|
}
|
2016-06-27 21:38:47 +00:00
|
|
|
s.Mounts = append(s.Mounts, specs.Mount{
|
|
|
|
Destination: "/dev/init",
|
|
|
|
Type: "bind",
|
|
|
|
Source: path,
|
|
|
|
Options: []string{"bind", "ro"},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
2016-03-18 18:50:19 +00:00
|
|
|
s.Process.Cwd = cwd
|
2016-09-28 22:21:33 +00:00
|
|
|
s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
|
2016-03-18 18:50:19 +00:00
|
|
|
s.Process.Terminal = c.Config.Tty
|
|
|
|
s.Hostname = c.FullHostname()
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-09-27 17:26:59 +00:00
|
|
|
func (daemon *Daemon) createSpec(c *container.Container) (*specs.Spec, error) {
|
2016-03-18 18:50:19 +00:00
|
|
|
s := oci.DefaultSpec()
|
|
|
|
if err := daemon.populateCommonSpec(&s, c); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
var cgroupsPath string
|
2016-03-24 16:18:03 +00:00
|
|
|
scopePrefix := "docker"
|
|
|
|
parent := "/docker"
|
|
|
|
useSystemd := UsingSystemd(daemon.configStore)
|
|
|
|
if useSystemd {
|
|
|
|
parent = "system.slice"
|
|
|
|
}
|
|
|
|
|
2016-03-18 18:50:19 +00:00
|
|
|
if c.HostConfig.CgroupParent != "" {
|
2016-03-24 16:18:03 +00:00
|
|
|
parent = c.HostConfig.CgroupParent
|
|
|
|
} else if daemon.configStore.CgroupParent != "" {
|
|
|
|
parent = daemon.configStore.CgroupParent
|
|
|
|
}
|
|
|
|
|
|
|
|
if useSystemd {
|
|
|
|
cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
|
|
|
|
logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
|
2016-03-18 18:50:19 +00:00
|
|
|
} else {
|
2016-03-24 16:18:03 +00:00
|
|
|
cgroupsPath = filepath.Join(parent, c.ID)
|
2016-03-18 18:50:19 +00:00
|
|
|
}
|
2017-04-27 21:52:47 +00:00
|
|
|
s.Linux.CgroupsPath = cgroupsPath
|
2016-03-18 18:50:19 +00:00
|
|
|
|
|
|
|
if err := setResources(&s, c.HostConfig.Resources); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux runtime spec resources: %v", err)
|
|
|
|
}
|
2016-03-29 12:24:28 +00:00
|
|
|
s.Linux.Sysctl = c.HostConfig.Sysctls
|
2016-06-07 19:05:43 +00:00
|
|
|
|
2017-04-27 21:52:47 +00:00
|
|
|
p := s.Linux.CgroupsPath
|
2016-06-07 19:05:43 +00:00
|
|
|
if useSystemd {
|
2017-04-27 21:52:47 +00:00
|
|
|
initPath, err := cgroups.GetInitCgroup("cpu")
|
2016-06-07 19:05:43 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2017-09-22 13:52:41 +00:00
|
|
|
_, err = cgroups.GetOwnCgroup("cpu")
|
2016-06-07 19:05:43 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2017-09-22 13:52:41 +00:00
|
|
|
p = filepath.Join(initPath, s.Linux.CgroupsPath)
|
2016-06-07 19:05:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Clean path to guard against things like ../../../BAD
|
|
|
|
parentPath := filepath.Dir(p)
|
|
|
|
if !filepath.IsAbs(parentPath) {
|
|
|
|
parentPath = filepath.Clean("/" + parentPath)
|
|
|
|
}
|
|
|
|
|
|
|
|
if err := daemon.initCgroupsPath(parentPath); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux init cgroups path: %v", err)
|
|
|
|
}
|
2016-03-18 18:50:19 +00:00
|
|
|
if err := setDevices(&s, c); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux runtime spec devices: %v", err)
|
|
|
|
}
|
2017-09-22 13:52:41 +00:00
|
|
|
if err := daemon.setRlimits(&s, c); err != nil {
|
2016-03-18 18:50:19 +00:00
|
|
|
return nil, fmt.Errorf("linux runtime spec rlimits: %v", err)
|
|
|
|
}
|
|
|
|
if err := setUser(&s, c); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux spec user: %v", err)
|
|
|
|
}
|
|
|
|
if err := setNamespaces(daemon, &s, c); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux spec namespaces: %v", err)
|
|
|
|
}
|
|
|
|
if err := setCapabilities(&s, c); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux spec capabilities: %v", err)
|
|
|
|
}
|
|
|
|
if err := setSeccomp(daemon, &s, c); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux seccomp: %v", err)
|
|
|
|
}
|
|
|
|
|
2017-12-18 21:02:23 +00:00
|
|
|
if err := daemon.setupContainerMountsRoot(c); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2016-03-18 18:50:19 +00:00
|
|
|
if err := daemon.setupIpcDirs(c); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2016-10-19 16:22:02 +00:00
|
|
|
if err := daemon.setupSecretDir(c); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2017-03-16 21:23:33 +00:00
|
|
|
if err := daemon.setupConfigDir(c); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2016-04-26 08:20:17 +00:00
|
|
|
ms, err := daemon.setupMounts(c)
|
2016-03-18 18:50:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-10-27 07:41:32 +00:00
|
|
|
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 21:58:50 +00:00
|
|
|
if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
|
|
|
|
ms = append(ms, c.IpcMounts()...)
|
|
|
|
}
|
2016-10-19 16:22:02 +00:00
|
|
|
|
2016-09-22 20:14:15 +00:00
|
|
|
tmpfsMounts, err := c.TmpfsMounts()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
ms = append(ms, tmpfsMounts...)
|
2016-10-19 16:22:02 +00:00
|
|
|
|
2017-12-18 21:02:23 +00:00
|
|
|
secretMounts, err := c.SecretMounts()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
2016-10-27 07:41:32 +00:00
|
|
|
}
|
2017-12-18 21:02:23 +00:00
|
|
|
ms = append(ms, secretMounts...)
|
2016-10-26 20:30:53 +00:00
|
|
|
|
2017-12-18 21:02:23 +00:00
|
|
|
configMounts, err := c.ConfigMounts()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
ms = append(ms, configMounts...)
|
2017-03-16 21:23:33 +00:00
|
|
|
|
2016-04-26 08:20:17 +00:00
|
|
|
sort.Sort(mounts(ms))
|
|
|
|
if err := setMounts(daemon, &s, c, ms); err != nil {
|
2016-03-18 18:50:19 +00:00
|
|
|
return nil, fmt.Errorf("linux mounts: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, ns := range s.Linux.Namespaces {
|
|
|
|
if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
|
2018-01-26 18:40:32 +00:00
|
|
|
target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
|
2017-04-27 21:52:47 +00:00
|
|
|
s.Hooks = &specs.Hooks{
|
2016-03-18 18:50:19 +00:00
|
|
|
Prestart: []specs.Hook{{
|
2018-01-26 18:40:32 +00:00
|
|
|
Path: target,
|
2016-03-18 18:50:19 +00:00
|
|
|
Args: []string{"libnetwork-setkey", c.ID, daemon.netController.ID()},
|
|
|
|
}},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if apparmor.IsEnabled() {
|
2016-12-05 13:12:17 +00:00
|
|
|
var appArmorProfile string
|
|
|
|
if c.AppArmorProfile != "" {
|
2016-03-18 18:50:19 +00:00
|
|
|
appArmorProfile = c.AppArmorProfile
|
2016-04-04 21:27:44 +00:00
|
|
|
} else if c.HostConfig.Privileged {
|
|
|
|
appArmorProfile = "unconfined"
|
2016-12-05 13:12:17 +00:00
|
|
|
} else {
|
|
|
|
appArmorProfile = "docker-default"
|
|
|
|
}
|
|
|
|
|
|
|
|
if appArmorProfile == "docker-default" {
|
|
|
|
// Unattended upgrades and other fun services can unload AppArmor
|
|
|
|
// profiles inadvertently. Since we cannot store our profile in
|
|
|
|
// /etc/apparmor.d, nor can we practically add other ways of
|
|
|
|
// telling the system to keep our profile loaded, in order to make
|
|
|
|
// sure that we keep the default profile enabled we dynamically
|
|
|
|
// reload it if necessary.
|
|
|
|
if err := ensureDefaultAppArmorProfile(); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-03-18 18:50:19 +00:00
|
|
|
}
|
2016-12-05 13:12:17 +00:00
|
|
|
|
2016-03-18 18:50:19 +00:00
|
|
|
s.Process.ApparmorProfile = appArmorProfile
|
|
|
|
}
|
|
|
|
s.Process.SelinuxLabel = c.GetProcessLabel()
|
|
|
|
s.Process.NoNewPrivileges = c.NoNewPrivileges
|
2017-08-01 15:51:24 +00:00
|
|
|
s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj
|
2016-04-25 19:55:28 +00:00
|
|
|
s.Linux.MountLabel = c.MountLabel
|
2016-03-18 18:50:19 +00:00
|
|
|
|
2017-08-24 17:11:44 +00:00
|
|
|
return &s, nil
|
2016-03-18 18:50:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func clearReadOnly(m *specs.Mount) {
|
|
|
|
var opt []string
|
|
|
|
for _, o := range m.Options {
|
|
|
|
if o != "ro" {
|
|
|
|
opt = append(opt, o)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
m.Options = opt
|
|
|
|
}
|
2016-09-08 04:23:56 +00:00
|
|
|
|
|
|
|
// mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
|
|
|
|
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
|
|
|
|
ulimits := c.Ulimits
|
|
|
|
// Merge ulimits with daemon defaults
|
|
|
|
ulIdx := make(map[string]struct{})
|
|
|
|
for _, ul := range ulimits {
|
|
|
|
ulIdx[ul.Name] = struct{}{}
|
|
|
|
}
|
|
|
|
for name, ul := range daemon.configStore.Ulimits {
|
|
|
|
if _, exists := ulIdx[name]; !exists {
|
|
|
|
ulimits = append(ulimits, ul)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
c.Ulimits = ulimits
|
|
|
|
}
|