moby/daemon/start.go
Kir Kolyshkin 7120976d74 Implement none, private, and shareable ipc modes
Since commit d88fe447df ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers"), a container's /dev/shm is mounted on
the host first and then bind-mounted inside the container. It is done
that way so that this container's IPC namespace (and the /dev/shm
mount point) can be shared with another container.

Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents are not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).

This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:

 - 'shareable':	enables sharing this container's IPC with others
		(this used to be the implicit default);

 - 'private':	disables sharing this container's IPC.

In 'private' mode, the container's /dev/shm is mounted directly inside
the container, without any bind-mounting from the host, which solves
the issue.

While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:

> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...

...so here's yet another mode:

 - 'none':	no /dev/shm mount inside the container (though it still
		has its own private IPC namespace).
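
As an illustration (not part of this patch), here is a minimal Go
sketch of requesting one of these modes through the engine's own API
types; the HostConfig/IpcMode names come from api/types/container,
while the surrounding function is hypothetical:

    package main

    import containertypes "github.com/docker/docker/api/types/container"

    // exampleHostConfig asks for a private (unshared) IPC namespace;
    // "shareable", "none", "host", and "container:<id>" are the other
    // accepted values.
    func exampleHostConfig() *containertypes.HostConfig {
        return &containertypes.HostConfig{
            IpcMode: containertypes.IpcMode("private"),
        }
    }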

Now, to ultimately solve the above-mentioned checkpoint/restore issue,
we'd need to make 'private' the default mode, but unfortunately that
breaks backward compatibility. So, let's make the default container IPC
mode configurable per daemon (with the built-in default set to
'shareable' for now). The default can be changed either via a daemon
CLI option (--default-ipc-mode) or a daemon.json configuration file
parameter of the same name.

Note that only the 'shareable' and 'private' IPC modes can be set as
the daemon default (in this context, 'host', 'container', or 'none'
do not make much sense).
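
For example, the daemon default could then be switched with
`dockerd --default-ipc-mode=private` (or the equivalent daemon.json
key). A minimal sketch of the corresponding restriction, assuming the
IsPrivate() and IsShareable() helpers on IpcMode (the function name
itself is hypothetical):

    // validDaemonDefaultIpcMode reports whether a mode is acceptable
    // as the per-daemon default: only 'private' and 'shareable' qualify.
    func validDaemonDefaultIpcMode(m containertypes.IpcMode) bool {
        return m.IsPrivate() || m.IsShareable()
    }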

Some other changes this patch introduces are:

1. A mount for /dev/shm is added to the default OCI Linux spec.

2. IpcMode.Valid() is simplified to remove duplicated code that parsed
   the 'container:ID' form. Note that the old version used to check that
   the ID does not contain a semicolon -- this is no longer the case
   (tests are modified accordingly). The motivation is that we should
   either do a proper check for container ID validity, or not check it
   at all (since it is checked in other places anyway). I chose the latter.

3. IpcMode.Container() is modified to not return a container ID if the
   mode value does not start with "container:", unifying the check to
   be the same as in IpcMode.IsContainer() (see the sketch after this
   list).

4. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
   to add checks for the newly added values.
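
To make items 2 and 3 concrete, here is a sketch of the unified
"container:ID" handling (close to, but not necessarily identical to,
the code in this patch):

    // Container returns the ID of the container whose IPC namespace is
    // to be used, or "" if the mode is not of the "container:ID" form --
    // the same prefix test that IsContainer() performs.
    func (n IpcMode) Container() string {
        parts := strings.SplitN(string(n), ":", 2)
        if len(parts) > 1 && parts[0] == "container" {
            return parts[1]
        }
        return ""
    }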

[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
     container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-08-14 10:50:39 +03:00


package daemon

import (
	"fmt"
	"net/http"
	"runtime"
	"strings"
	"syscall"
	"time"

	"google.golang.org/grpc"

	apierrors "github.com/docker/docker/api/errors"
	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/container"
	"github.com/sirupsen/logrus"
)

// ContainerStart starts a container.
func (daemon *Daemon) ContainerStart(name string, hostConfig *containertypes.HostConfig, checkpoint string, checkpointDir string) error {
	if checkpoint != "" && !daemon.HasExperimental() {
		return apierrors.NewBadRequestError(fmt.Errorf("checkpoint is only supported in experimental mode"))
	}

	container, err := daemon.GetContainer(name)
	if err != nil {
		return err
	}

	if container.IsPaused() {
		return fmt.Errorf("Cannot start a paused container, try unpause instead.")
	}

	if container.IsRunning() {
		err := fmt.Errorf("Container already started")
		return apierrors.NewErrorWithStatusCode(err, http.StatusNotModified)
	}

	// Windows does not have the backwards compatibility issue here.
	if runtime.GOOS != "windows" {
		// This is kept for backward compatibility - hostconfig should be passed when
		// creating a container, not during start.
		if hostConfig != nil {
			logrus.Warn("DEPRECATED: Setting host configuration options when the container starts is deprecated and has been removed in Docker 1.12")
			oldNetworkMode := container.HostConfig.NetworkMode
			if err := daemon.setSecurityOptions(container, hostConfig); err != nil {
				return err
			}
			if err := daemon.mergeAndVerifyLogConfig(&hostConfig.LogConfig); err != nil {
				return err
			}
			if err := daemon.setHostConfig(container, hostConfig); err != nil {
				return err
			}
			newNetworkMode := container.HostConfig.NetworkMode
			if string(oldNetworkMode) != string(newNetworkMode) {
				// if the user has changed the network mode on start, clean up the
				// old networks. It is a deprecated feature and has been removed in Docker 1.12.
				container.NetworkSettings.Networks = nil
				if err := container.CheckpointTo(daemon.containersReplica); err != nil {
					return err
				}
			}
			container.InitDNSHostConfig()
		}
	} else {
		if hostConfig != nil {
			return fmt.Errorf("Supplying a hostconfig on start is not supported. It should be supplied on create")
		}
	}

	// check if hostConfig is in line with the current system settings.
	// It may happen that cgroups are unmounted or the like.
	if _, err = daemon.verifyContainerSettings(container.HostConfig, nil, false); err != nil {
		return err
	}
	// Adapt for old containers in case we have updates in this function and
	// old containers never had the chance to call the new function in the create stage.
	if hostConfig != nil {
		if err := daemon.adaptContainerSettings(container.HostConfig, false); err != nil {
			return err
		}
	}

	return daemon.containerStart(container, checkpoint, checkpointDir, true)
}

// containerStart prepares the container to run by setting up everything the
// container needs, such as storage and networking, as well as links
// between containers. The container is left waiting for a signal to
// begin running.
func (daemon *Daemon) containerStart(container *container.Container, checkpoint string, checkpointDir string, resetRestartManager bool) (err error) {
	start := time.Now()
	container.Lock()
	defer container.Unlock()

	if resetRestartManager && container.Running { // skip this check if already in restarting step and resetRestartManager==false
		return nil
	}

	if container.RemovalInProgress || container.Dead {
		return fmt.Errorf("Container is marked for removal and cannot be started.")
	}

	// if we encounter an error during start we need to ensure that any other
	// setup has been cleaned up properly
	defer func() {
		if err != nil {
			container.SetError(err)
			// if no one else has set it, make sure we don't leave it at zero
			if container.ExitCode() == 0 {
				container.SetExitCode(128)
			}
			if err := container.CheckpointTo(daemon.containersReplica); err != nil {
				logrus.Errorf("%s: failed saving state on start failure: %v", container.ID, err)
			}
			container.Reset(false)

			daemon.Cleanup(container)
			// if the container's AutoRemove flag is set, remove it after cleanup
			if container.HostConfig.AutoRemove {
				container.Unlock()
				if err := daemon.ContainerRm(container.ID, &types.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}); err != nil {
					logrus.Errorf("can't remove container %s: %v", container.ID, err)
				}
				container.Lock()
			}
		}
	}()

	if err := daemon.conditionalMountOnStart(container); err != nil {
		return err
	}

	if err := daemon.initializeNetworking(container); err != nil {
		return err
	}

	spec, err := daemon.createSpec(container)
	if err != nil {
		return err
	}

	createOptions, err := daemon.getLibcontainerdCreateOptions(container)
	if err != nil {
		return err
	}

	if resetRestartManager {
		container.ResetRestartManager(true)
	}

	if checkpointDir == "" {
		checkpointDir = container.CheckpointDir()
	}

	if err := daemon.saveApparmorConfig(container); err != nil {
		return err
	}

	if err := daemon.containerd.Create(container.ID, checkpoint, checkpointDir, *spec, container.InitializeStdio, createOptions...); err != nil {
		errDesc := grpc.ErrorDesc(err)
		contains := func(s1, s2 string) bool {
			return strings.Contains(strings.ToLower(s1), s2)
		}
		logrus.Errorf("Create container failed with error: %s", errDesc)
		// if we receive an internal error from the initial start of a container then let's
		// return it instead of entering the restart loop
		// set to 127 for container cmd not found/does not exist
		if contains(errDesc, container.Path) &&
			(contains(errDesc, "executable file not found") ||
				contains(errDesc, "no such file or directory") ||
				contains(errDesc, "system cannot find the file specified")) {
			container.SetExitCode(127)
		}
		// set to 126 for container cmd can't be invoked errors
		if contains(errDesc, syscall.EACCES.Error()) {
			container.SetExitCode(126)
		}
		// attempted to mount a file onto a directory, or a directory onto a file, maybe from user specified bind mounts
		if contains(errDesc, syscall.ENOTDIR.Error()) {
			errDesc += ": Are you trying to mount a directory onto a file (or vice-versa)? Check if the specified host path exists and is the expected type"
			container.SetExitCode(127)
		}
		return fmt.Errorf("%s", errDesc)
	}

	containerActions.WithValues("start").UpdateSince(start)

	return nil
}

// Cleanup releases any network resources allocated to the container along with any rules
// around how containers are linked together. It also unmounts the container's root filesystem.
func (daemon *Daemon) Cleanup(container *container.Container) {
	daemon.releaseNetwork(container)

	if err := container.UnmountIpcMount(detachMounted); err != nil {
		logrus.Warnf("%s cleanup: failed to unmount IPC: %s", container.ID, err)
	}

	if err := daemon.conditionalUnmountOnCleanup(container); err != nil {
		// FIXME: remove once reference counting for graphdrivers has been refactored
		// Ensure that all the mounts are gone
		if mountid, err := daemon.stores[container.Platform].layerStore.GetMountID(container.ID); err == nil {
			daemon.cleanupMountsByID(mountid)
		}
	}

	if err := container.UnmountSecrets(); err != nil {
		logrus.Warnf("%s cleanup: failed to unmount secrets: %s", container.ID, err)
	}

	for _, eConfig := range container.ExecCommands.Commands() {
		daemon.unregisterExecCommand(container, eConfig)
	}

	if container.BaseFS != "" {
		if err := container.UnmountVolumes(daemon.LogVolumeEvent); err != nil {
			logrus.Warnf("%s cleanup: failed to unmount volumes: %v", container.ID, err)
		}
	}

	container.CancelAttachContext()
}