7120976d74
Since the commit d88fe447df
("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
378 lines
11 KiB
Go
378 lines
11 KiB
Go
// +build linux freebsd
|
|
|
|
package daemon
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/docker/docker/container"
|
|
"github.com/docker/docker/daemon/links"
|
|
"github.com/docker/docker/pkg/idtools"
|
|
"github.com/docker/docker/pkg/mount"
|
|
"github.com/docker/docker/pkg/stringid"
|
|
"github.com/docker/docker/runconfig"
|
|
"github.com/docker/libnetwork"
|
|
"github.com/opencontainers/selinux/go-selinux/label"
|
|
"github.com/pkg/errors"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
func (daemon *Daemon) setupLinkedContainers(container *container.Container) ([]string, error) {
|
|
var env []string
|
|
children := daemon.children(container)
|
|
|
|
bridgeSettings := container.NetworkSettings.Networks[runconfig.DefaultDaemonNetworkMode().NetworkName()]
|
|
if bridgeSettings == nil || bridgeSettings.EndpointSettings == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
for linkAlias, child := range children {
|
|
if !child.IsRunning() {
|
|
return nil, fmt.Errorf("Cannot link to a non running container: %s AS %s", child.Name, linkAlias)
|
|
}
|
|
|
|
childBridgeSettings := child.NetworkSettings.Networks[runconfig.DefaultDaemonNetworkMode().NetworkName()]
|
|
if childBridgeSettings == nil || childBridgeSettings.EndpointSettings == nil {
|
|
return nil, fmt.Errorf("container %s not attached to default bridge network", child.ID)
|
|
}
|
|
|
|
link := links.NewLink(
|
|
bridgeSettings.IPAddress,
|
|
childBridgeSettings.IPAddress,
|
|
linkAlias,
|
|
child.Config.Env,
|
|
child.Config.ExposedPorts,
|
|
)
|
|
|
|
env = append(env, link.ToEnv()...)
|
|
}
|
|
|
|
return env, nil
|
|
}
|
|
|
|
func (daemon *Daemon) getIpcContainer(id string) (*container.Container, error) {
|
|
errMsg := "can't join IPC of container " + id
|
|
// Check the container exists
|
|
container, err := daemon.GetContainer(id)
|
|
if err != nil {
|
|
return nil, errors.Wrap(err, errMsg)
|
|
}
|
|
// Check the container is running and not restarting
|
|
if err := daemon.checkContainer(container, containerIsRunning, containerIsNotRestarting); err != nil {
|
|
return nil, errors.Wrap(err, errMsg)
|
|
}
|
|
// Check the container ipc is shareable
|
|
if st, err := os.Stat(container.ShmPath); err != nil || !st.IsDir() {
|
|
if err == nil || os.IsNotExist(err) {
|
|
return nil, errors.New(errMsg + ": non-shareable IPC")
|
|
}
|
|
// stat() failed?
|
|
return nil, errors.Wrap(err, errMsg+": unexpected error from stat "+container.ShmPath)
|
|
}
|
|
|
|
return container, nil
|
|
}
|
|
|
|
func (daemon *Daemon) getPidContainer(container *container.Container) (*container.Container, error) {
|
|
containerID := container.HostConfig.PidMode.Container()
|
|
container, err := daemon.GetContainer(containerID)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "cannot join PID of a non running container: %s", container.ID)
|
|
}
|
|
return container, daemon.checkContainer(container, containerIsRunning, containerIsNotRestarting)
|
|
}
|
|
|
|
func containerIsRunning(c *container.Container) error {
|
|
if !c.IsRunning() {
|
|
return errors.Errorf("container %s is not running", c.ID)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func containerIsNotRestarting(c *container.Container) error {
|
|
if c.IsRestarting() {
|
|
return errContainerIsRestarting(c.ID)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (daemon *Daemon) setupIpcDirs(c *container.Container) error {
|
|
ipcMode := c.HostConfig.IpcMode
|
|
|
|
switch {
|
|
case ipcMode.IsContainer():
|
|
ic, err := daemon.getIpcContainer(ipcMode.Container())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
c.ShmPath = ic.ShmPath
|
|
|
|
case ipcMode.IsHost():
|
|
if _, err := os.Stat("/dev/shm"); err != nil {
|
|
return fmt.Errorf("/dev/shm is not mounted, but must be for --ipc=host")
|
|
}
|
|
c.ShmPath = "/dev/shm"
|
|
|
|
case ipcMode.IsPrivate(), ipcMode.IsNone():
|
|
// c.ShmPath will/should not be used, so make it empty.
|
|
// Container's /dev/shm mount comes from OCI spec.
|
|
c.ShmPath = ""
|
|
|
|
case ipcMode.IsEmpty():
|
|
// A container was created by an older version of the daemon.
|
|
// The default behavior used to be what is now called "shareable".
|
|
fallthrough
|
|
|
|
case ipcMode.IsShareable():
|
|
rootIDs := daemon.idMappings.RootPair()
|
|
if !c.HasMountFor("/dev/shm") {
|
|
shmPath, err := c.ShmResourcePath()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := idtools.MkdirAllAndChown(shmPath, 0700, rootIDs); err != nil {
|
|
return err
|
|
}
|
|
|
|
shmproperty := "mode=1777,size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
|
|
if err := unix.Mount("shm", shmPath, "tmpfs", uintptr(unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV), label.FormatMountLabel(shmproperty, c.GetMountLabel())); err != nil {
|
|
return fmt.Errorf("mounting shm tmpfs: %s", err)
|
|
}
|
|
if err := os.Chown(shmPath, rootIDs.UID, rootIDs.GID); err != nil {
|
|
return err
|
|
}
|
|
c.ShmPath = shmPath
|
|
}
|
|
|
|
default:
|
|
return fmt.Errorf("invalid IPC mode: %v", ipcMode)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (daemon *Daemon) setupSecretDir(c *container.Container) (setupErr error) {
|
|
if len(c.SecretReferences) == 0 {
|
|
return nil
|
|
}
|
|
|
|
localMountPath := c.SecretMountPath()
|
|
logrus.Debugf("secrets: setting up secret dir: %s", localMountPath)
|
|
|
|
// retrieve possible remapped range start for root UID, GID
|
|
rootIDs := daemon.idMappings.RootPair()
|
|
// create tmpfs
|
|
if err := idtools.MkdirAllAndChown(localMountPath, 0700, rootIDs); err != nil {
|
|
return errors.Wrap(err, "error creating secret local mount path")
|
|
}
|
|
|
|
defer func() {
|
|
if setupErr != nil {
|
|
// cleanup
|
|
_ = detachMounted(localMountPath)
|
|
|
|
if err := os.RemoveAll(localMountPath); err != nil {
|
|
logrus.Errorf("error cleaning up secret mount: %s", err)
|
|
}
|
|
}
|
|
}()
|
|
|
|
tmpfsOwnership := fmt.Sprintf("uid=%d,gid=%d", rootIDs.UID, rootIDs.GID)
|
|
if err := mount.Mount("tmpfs", localMountPath, "tmpfs", "nodev,nosuid,noexec,"+tmpfsOwnership); err != nil {
|
|
return errors.Wrap(err, "unable to setup secret mount")
|
|
}
|
|
|
|
if c.DependencyStore == nil {
|
|
return fmt.Errorf("secret store is not initialized")
|
|
}
|
|
|
|
for _, s := range c.SecretReferences {
|
|
// TODO (ehazlett): use type switch when more are supported
|
|
if s.File == nil {
|
|
logrus.Error("secret target type is not a file target")
|
|
continue
|
|
}
|
|
|
|
// secrets are created in the SecretMountPath on the host, at a
|
|
// single level
|
|
fPath := c.SecretFilePath(*s)
|
|
if err := idtools.MkdirAllAndChown(filepath.Dir(fPath), 0700, rootIDs); err != nil {
|
|
return errors.Wrap(err, "error creating secret mount path")
|
|
}
|
|
|
|
logrus.WithFields(logrus.Fields{
|
|
"name": s.File.Name,
|
|
"path": fPath,
|
|
}).Debug("injecting secret")
|
|
secret, err := c.DependencyStore.Secrets().Get(s.SecretID)
|
|
if err != nil {
|
|
return errors.Wrap(err, "unable to get secret from secret store")
|
|
}
|
|
if err := ioutil.WriteFile(fPath, secret.Spec.Data, s.File.Mode); err != nil {
|
|
return errors.Wrap(err, "error injecting secret")
|
|
}
|
|
|
|
uid, err := strconv.Atoi(s.File.UID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
gid, err := strconv.Atoi(s.File.GID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := os.Chown(fPath, rootIDs.UID+uid, rootIDs.GID+gid); err != nil {
|
|
return errors.Wrap(err, "error setting ownership for secret")
|
|
}
|
|
}
|
|
|
|
label.Relabel(localMountPath, c.MountLabel, false)
|
|
|
|
// remount secrets ro
|
|
if err := mount.Mount("tmpfs", localMountPath, "tmpfs", "remount,ro,"+tmpfsOwnership); err != nil {
|
|
return errors.Wrap(err, "unable to remount secret dir as readonly")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (daemon *Daemon) setupConfigDir(c *container.Container) (setupErr error) {
|
|
if len(c.ConfigReferences) == 0 {
|
|
return nil
|
|
}
|
|
|
|
localPath := c.ConfigsDirPath()
|
|
logrus.Debugf("configs: setting up config dir: %s", localPath)
|
|
|
|
// retrieve possible remapped range start for root UID, GID
|
|
rootIDs := daemon.idMappings.RootPair()
|
|
// create tmpfs
|
|
if err := idtools.MkdirAllAndChown(localPath, 0700, rootIDs); err != nil {
|
|
return errors.Wrap(err, "error creating config dir")
|
|
}
|
|
|
|
defer func() {
|
|
if setupErr != nil {
|
|
if err := os.RemoveAll(localPath); err != nil {
|
|
logrus.Errorf("error cleaning up config dir: %s", err)
|
|
}
|
|
}
|
|
}()
|
|
|
|
if c.DependencyStore == nil {
|
|
return fmt.Errorf("config store is not initialized")
|
|
}
|
|
|
|
for _, configRef := range c.ConfigReferences {
|
|
// TODO (ehazlett): use type switch when more are supported
|
|
if configRef.File == nil {
|
|
logrus.Error("config target type is not a file target")
|
|
continue
|
|
}
|
|
|
|
fPath := c.ConfigFilePath(*configRef)
|
|
|
|
log := logrus.WithFields(logrus.Fields{"name": configRef.File.Name, "path": fPath})
|
|
|
|
if err := idtools.MkdirAllAndChown(filepath.Dir(fPath), 0700, rootIDs); err != nil {
|
|
return errors.Wrap(err, "error creating config path")
|
|
}
|
|
|
|
log.Debug("injecting config")
|
|
config, err := c.DependencyStore.Configs().Get(configRef.ConfigID)
|
|
if err != nil {
|
|
return errors.Wrap(err, "unable to get config from config store")
|
|
}
|
|
if err := ioutil.WriteFile(fPath, config.Spec.Data, configRef.File.Mode); err != nil {
|
|
return errors.Wrap(err, "error injecting config")
|
|
}
|
|
|
|
uid, err := strconv.Atoi(configRef.File.UID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
gid, err := strconv.Atoi(configRef.File.GID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := os.Chown(fPath, rootIDs.UID+uid, rootIDs.GID+gid); err != nil {
|
|
return errors.Wrap(err, "error setting ownership for config")
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func killProcessDirectly(cntr *container.Container) error {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
|
|
// Block until the container to stops or timeout.
|
|
status := <-cntr.Wait(ctx, container.WaitConditionNotRunning)
|
|
if status.Err() != nil {
|
|
// Ensure that we don't kill ourselves
|
|
if pid := cntr.GetPID(); pid != 0 {
|
|
logrus.Infof("Container %s failed to exit within 10 seconds of kill - trying direct SIGKILL", stringid.TruncateID(cntr.ID))
|
|
if err := unix.Kill(pid, 9); err != nil {
|
|
if err != unix.ESRCH {
|
|
return err
|
|
}
|
|
e := errNoSuchProcess{pid, 9}
|
|
logrus.Debug(e)
|
|
return e
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func detachMounted(path string) error {
|
|
return unix.Unmount(path, unix.MNT_DETACH)
|
|
}
|
|
|
|
func isLinkable(child *container.Container) bool {
|
|
// A container is linkable only if it belongs to the default network
|
|
_, ok := child.NetworkSettings.Networks[runconfig.DefaultDaemonNetworkMode().NetworkName()]
|
|
return ok
|
|
}
|
|
|
|
func enableIPOnPredefinedNetwork() bool {
|
|
return false
|
|
}
|
|
|
|
func (daemon *Daemon) isNetworkHotPluggable() bool {
|
|
return true
|
|
}
|
|
|
|
func setupPathsAndSandboxOptions(container *container.Container, sboxOptions *[]libnetwork.SandboxOption) error {
|
|
var err error
|
|
|
|
container.HostsPath, err = container.GetRootResourcePath("hosts")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
*sboxOptions = append(*sboxOptions, libnetwork.OptionHostsPath(container.HostsPath))
|
|
|
|
container.ResolvConfPath, err = container.GetRootResourcePath("resolv.conf")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
*sboxOptions = append(*sboxOptions, libnetwork.OptionResolvConfPath(container.ResolvConfPath))
|
|
return nil
|
|
}
|
|
|
|
func (daemon *Daemon) initializeNetworkingPaths(container *container.Container, nc *container.Container) error {
|
|
container.HostnamePath = nc.HostnamePath
|
|
container.HostsPath = nc.HostsPath
|
|
container.ResolvConfPath = nc.ResolvConfPath
|
|
return nil
|
|
}
|