moby/daemon/container_operations_unix.go
Kir Kolyshkin 7120976d74 Implement none, private, and shareable ipc modes
Since the commit d88fe447df ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.

Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).

This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:

 - 'shareable':	enables sharing this container's IPC with others
		(this used to be the implicit default);

 - 'private':	disables sharing this container's IPC.

In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.

While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:

> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...

...so here's yet yet another mode:

 - 'none':	no /dev/shm mount inside the container (though it still
		has its own private IPC namespace).

Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.

Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).

Some other changes this patch introduces are:

1. A mount for /dev/shm is added to default OCI Linux spec.

2. IpcMode.Valid() is simplified to remove duplicated code that parsed
   'container:ID' form. Note the old version used to check that ID does
   not contain a semicolon -- this is no longer the case (tests are
   modified accordingly). The motivation is we should either do a
   proper check for container ID validity, or don't check it at all
   (since it is checked in other places anyway). I chose the latter.

3. IpcMode.Container() is modified to not return container ID if the
   mode value does not start with "container:", unifying the check to
   be the same as in IpcMode.IsContainer().

3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
   to add checks for newly added values.

[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
     container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-08-14 10:50:39 +03:00

378 lines
11 KiB
Go

// +build linux freebsd
package daemon
import (
"context"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"time"
"github.com/docker/docker/container"
"github.com/docker/docker/daemon/links"
"github.com/docker/docker/pkg/idtools"
"github.com/docker/docker/pkg/mount"
"github.com/docker/docker/pkg/stringid"
"github.com/docker/docker/runconfig"
"github.com/docker/libnetwork"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func (daemon *Daemon) setupLinkedContainers(container *container.Container) ([]string, error) {
var env []string
children := daemon.children(container)
bridgeSettings := container.NetworkSettings.Networks[runconfig.DefaultDaemonNetworkMode().NetworkName()]
if bridgeSettings == nil || bridgeSettings.EndpointSettings == nil {
return nil, nil
}
for linkAlias, child := range children {
if !child.IsRunning() {
return nil, fmt.Errorf("Cannot link to a non running container: %s AS %s", child.Name, linkAlias)
}
childBridgeSettings := child.NetworkSettings.Networks[runconfig.DefaultDaemonNetworkMode().NetworkName()]
if childBridgeSettings == nil || childBridgeSettings.EndpointSettings == nil {
return nil, fmt.Errorf("container %s not attached to default bridge network", child.ID)
}
link := links.NewLink(
bridgeSettings.IPAddress,
childBridgeSettings.IPAddress,
linkAlias,
child.Config.Env,
child.Config.ExposedPorts,
)
env = append(env, link.ToEnv()...)
}
return env, nil
}
func (daemon *Daemon) getIpcContainer(id string) (*container.Container, error) {
errMsg := "can't join IPC of container " + id
// Check the container exists
container, err := daemon.GetContainer(id)
if err != nil {
return nil, errors.Wrap(err, errMsg)
}
// Check the container is running and not restarting
if err := daemon.checkContainer(container, containerIsRunning, containerIsNotRestarting); err != nil {
return nil, errors.Wrap(err, errMsg)
}
// Check the container ipc is shareable
if st, err := os.Stat(container.ShmPath); err != nil || !st.IsDir() {
if err == nil || os.IsNotExist(err) {
return nil, errors.New(errMsg + ": non-shareable IPC")
}
// stat() failed?
return nil, errors.Wrap(err, errMsg+": unexpected error from stat "+container.ShmPath)
}
return container, nil
}
func (daemon *Daemon) getPidContainer(container *container.Container) (*container.Container, error) {
containerID := container.HostConfig.PidMode.Container()
container, err := daemon.GetContainer(containerID)
if err != nil {
return nil, errors.Wrapf(err, "cannot join PID of a non running container: %s", container.ID)
}
return container, daemon.checkContainer(container, containerIsRunning, containerIsNotRestarting)
}
func containerIsRunning(c *container.Container) error {
if !c.IsRunning() {
return errors.Errorf("container %s is not running", c.ID)
}
return nil
}
func containerIsNotRestarting(c *container.Container) error {
if c.IsRestarting() {
return errContainerIsRestarting(c.ID)
}
return nil
}
func (daemon *Daemon) setupIpcDirs(c *container.Container) error {
ipcMode := c.HostConfig.IpcMode
switch {
case ipcMode.IsContainer():
ic, err := daemon.getIpcContainer(ipcMode.Container())
if err != nil {
return err
}
c.ShmPath = ic.ShmPath
case ipcMode.IsHost():
if _, err := os.Stat("/dev/shm"); err != nil {
return fmt.Errorf("/dev/shm is not mounted, but must be for --ipc=host")
}
c.ShmPath = "/dev/shm"
case ipcMode.IsPrivate(), ipcMode.IsNone():
// c.ShmPath will/should not be used, so make it empty.
// Container's /dev/shm mount comes from OCI spec.
c.ShmPath = ""
case ipcMode.IsEmpty():
// A container was created by an older version of the daemon.
// The default behavior used to be what is now called "shareable".
fallthrough
case ipcMode.IsShareable():
rootIDs := daemon.idMappings.RootPair()
if !c.HasMountFor("/dev/shm") {
shmPath, err := c.ShmResourcePath()
if err != nil {
return err
}
if err := idtools.MkdirAllAndChown(shmPath, 0700, rootIDs); err != nil {
return err
}
shmproperty := "mode=1777,size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
if err := unix.Mount("shm", shmPath, "tmpfs", uintptr(unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV), label.FormatMountLabel(shmproperty, c.GetMountLabel())); err != nil {
return fmt.Errorf("mounting shm tmpfs: %s", err)
}
if err := os.Chown(shmPath, rootIDs.UID, rootIDs.GID); err != nil {
return err
}
c.ShmPath = shmPath
}
default:
return fmt.Errorf("invalid IPC mode: %v", ipcMode)
}
return nil
}
func (daemon *Daemon) setupSecretDir(c *container.Container) (setupErr error) {
if len(c.SecretReferences) == 0 {
return nil
}
localMountPath := c.SecretMountPath()
logrus.Debugf("secrets: setting up secret dir: %s", localMountPath)
// retrieve possible remapped range start for root UID, GID
rootIDs := daemon.idMappings.RootPair()
// create tmpfs
if err := idtools.MkdirAllAndChown(localMountPath, 0700, rootIDs); err != nil {
return errors.Wrap(err, "error creating secret local mount path")
}
defer func() {
if setupErr != nil {
// cleanup
_ = detachMounted(localMountPath)
if err := os.RemoveAll(localMountPath); err != nil {
logrus.Errorf("error cleaning up secret mount: %s", err)
}
}
}()
tmpfsOwnership := fmt.Sprintf("uid=%d,gid=%d", rootIDs.UID, rootIDs.GID)
if err := mount.Mount("tmpfs", localMountPath, "tmpfs", "nodev,nosuid,noexec,"+tmpfsOwnership); err != nil {
return errors.Wrap(err, "unable to setup secret mount")
}
if c.DependencyStore == nil {
return fmt.Errorf("secret store is not initialized")
}
for _, s := range c.SecretReferences {
// TODO (ehazlett): use type switch when more are supported
if s.File == nil {
logrus.Error("secret target type is not a file target")
continue
}
// secrets are created in the SecretMountPath on the host, at a
// single level
fPath := c.SecretFilePath(*s)
if err := idtools.MkdirAllAndChown(filepath.Dir(fPath), 0700, rootIDs); err != nil {
return errors.Wrap(err, "error creating secret mount path")
}
logrus.WithFields(logrus.Fields{
"name": s.File.Name,
"path": fPath,
}).Debug("injecting secret")
secret, err := c.DependencyStore.Secrets().Get(s.SecretID)
if err != nil {
return errors.Wrap(err, "unable to get secret from secret store")
}
if err := ioutil.WriteFile(fPath, secret.Spec.Data, s.File.Mode); err != nil {
return errors.Wrap(err, "error injecting secret")
}
uid, err := strconv.Atoi(s.File.UID)
if err != nil {
return err
}
gid, err := strconv.Atoi(s.File.GID)
if err != nil {
return err
}
if err := os.Chown(fPath, rootIDs.UID+uid, rootIDs.GID+gid); err != nil {
return errors.Wrap(err, "error setting ownership for secret")
}
}
label.Relabel(localMountPath, c.MountLabel, false)
// remount secrets ro
if err := mount.Mount("tmpfs", localMountPath, "tmpfs", "remount,ro,"+tmpfsOwnership); err != nil {
return errors.Wrap(err, "unable to remount secret dir as readonly")
}
return nil
}
func (daemon *Daemon) setupConfigDir(c *container.Container) (setupErr error) {
if len(c.ConfigReferences) == 0 {
return nil
}
localPath := c.ConfigsDirPath()
logrus.Debugf("configs: setting up config dir: %s", localPath)
// retrieve possible remapped range start for root UID, GID
rootIDs := daemon.idMappings.RootPair()
// create tmpfs
if err := idtools.MkdirAllAndChown(localPath, 0700, rootIDs); err != nil {
return errors.Wrap(err, "error creating config dir")
}
defer func() {
if setupErr != nil {
if err := os.RemoveAll(localPath); err != nil {
logrus.Errorf("error cleaning up config dir: %s", err)
}
}
}()
if c.DependencyStore == nil {
return fmt.Errorf("config store is not initialized")
}
for _, configRef := range c.ConfigReferences {
// TODO (ehazlett): use type switch when more are supported
if configRef.File == nil {
logrus.Error("config target type is not a file target")
continue
}
fPath := c.ConfigFilePath(*configRef)
log := logrus.WithFields(logrus.Fields{"name": configRef.File.Name, "path": fPath})
if err := idtools.MkdirAllAndChown(filepath.Dir(fPath), 0700, rootIDs); err != nil {
return errors.Wrap(err, "error creating config path")
}
log.Debug("injecting config")
config, err := c.DependencyStore.Configs().Get(configRef.ConfigID)
if err != nil {
return errors.Wrap(err, "unable to get config from config store")
}
if err := ioutil.WriteFile(fPath, config.Spec.Data, configRef.File.Mode); err != nil {
return errors.Wrap(err, "error injecting config")
}
uid, err := strconv.Atoi(configRef.File.UID)
if err != nil {
return err
}
gid, err := strconv.Atoi(configRef.File.GID)
if err != nil {
return err
}
if err := os.Chown(fPath, rootIDs.UID+uid, rootIDs.GID+gid); err != nil {
return errors.Wrap(err, "error setting ownership for config")
}
}
return nil
}
func killProcessDirectly(cntr *container.Container) error {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
// Block until the container to stops or timeout.
status := <-cntr.Wait(ctx, container.WaitConditionNotRunning)
if status.Err() != nil {
// Ensure that we don't kill ourselves
if pid := cntr.GetPID(); pid != 0 {
logrus.Infof("Container %s failed to exit within 10 seconds of kill - trying direct SIGKILL", stringid.TruncateID(cntr.ID))
if err := unix.Kill(pid, 9); err != nil {
if err != unix.ESRCH {
return err
}
e := errNoSuchProcess{pid, 9}
logrus.Debug(e)
return e
}
}
}
return nil
}
func detachMounted(path string) error {
return unix.Unmount(path, unix.MNT_DETACH)
}
func isLinkable(child *container.Container) bool {
// A container is linkable only if it belongs to the default network
_, ok := child.NetworkSettings.Networks[runconfig.DefaultDaemonNetworkMode().NetworkName()]
return ok
}
func enableIPOnPredefinedNetwork() bool {
return false
}
func (daemon *Daemon) isNetworkHotPluggable() bool {
return true
}
func setupPathsAndSandboxOptions(container *container.Container, sboxOptions *[]libnetwork.SandboxOption) error {
var err error
container.HostsPath, err = container.GetRootResourcePath("hosts")
if err != nil {
return err
}
*sboxOptions = append(*sboxOptions, libnetwork.OptionHostsPath(container.HostsPath))
container.ResolvConfPath, err = container.GetRootResourcePath("resolv.conf")
if err != nil {
return err
}
*sboxOptions = append(*sboxOptions, libnetwork.OptionResolvConfPath(container.ResolvConfPath))
return nil
}
func (daemon *Daemon) initializeNetworkingPaths(container *container.Container, nc *container.Container) error {
container.HostnamePath = nc.HostnamePath
container.HostsPath = nc.HostsPath
container.ResolvConfPath = nc.ResolvConfPath
return nil
}