7120976d74
Since the commit d88fe447df
("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
227 lines
7.6 KiB
Go
227 lines
7.6 KiB
Go
package daemon
|
|
|
|
import (
|
|
"fmt"
|
|
"net/http"
|
|
"runtime"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
|
|
"google.golang.org/grpc"
|
|
|
|
apierrors "github.com/docker/docker/api/errors"
|
|
"github.com/docker/docker/api/types"
|
|
containertypes "github.com/docker/docker/api/types/container"
|
|
"github.com/docker/docker/container"
|
|
"github.com/sirupsen/logrus"
|
|
)
|
|
|
|
// ContainerStart starts a container.
|
|
func (daemon *Daemon) ContainerStart(name string, hostConfig *containertypes.HostConfig, checkpoint string, checkpointDir string) error {
|
|
if checkpoint != "" && !daemon.HasExperimental() {
|
|
return apierrors.NewBadRequestError(fmt.Errorf("checkpoint is only supported in experimental mode"))
|
|
}
|
|
|
|
container, err := daemon.GetContainer(name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if container.IsPaused() {
|
|
return fmt.Errorf("Cannot start a paused container, try unpause instead.")
|
|
}
|
|
|
|
if container.IsRunning() {
|
|
err := fmt.Errorf("Container already started")
|
|
return apierrors.NewErrorWithStatusCode(err, http.StatusNotModified)
|
|
}
|
|
|
|
// Windows does not have the backwards compatibility issue here.
|
|
if runtime.GOOS != "windows" {
|
|
// This is kept for backward compatibility - hostconfig should be passed when
|
|
// creating a container, not during start.
|
|
if hostConfig != nil {
|
|
logrus.Warn("DEPRECATED: Setting host configuration options when the container starts is deprecated and has been removed in Docker 1.12")
|
|
oldNetworkMode := container.HostConfig.NetworkMode
|
|
if err := daemon.setSecurityOptions(container, hostConfig); err != nil {
|
|
return err
|
|
}
|
|
if err := daemon.mergeAndVerifyLogConfig(&hostConfig.LogConfig); err != nil {
|
|
return err
|
|
}
|
|
if err := daemon.setHostConfig(container, hostConfig); err != nil {
|
|
return err
|
|
}
|
|
newNetworkMode := container.HostConfig.NetworkMode
|
|
if string(oldNetworkMode) != string(newNetworkMode) {
|
|
// if user has change the network mode on starting, clean up the
|
|
// old networks. It is a deprecated feature and has been removed in Docker 1.12
|
|
container.NetworkSettings.Networks = nil
|
|
if err := container.CheckpointTo(daemon.containersReplica); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
container.InitDNSHostConfig()
|
|
}
|
|
} else {
|
|
if hostConfig != nil {
|
|
return fmt.Errorf("Supplying a hostconfig on start is not supported. It should be supplied on create")
|
|
}
|
|
}
|
|
|
|
// check if hostConfig is in line with the current system settings.
|
|
// It may happen cgroups are umounted or the like.
|
|
if _, err = daemon.verifyContainerSettings(container.HostConfig, nil, false); err != nil {
|
|
return err
|
|
}
|
|
// Adapt for old containers in case we have updates in this function and
|
|
// old containers never have chance to call the new function in create stage.
|
|
if hostConfig != nil {
|
|
if err := daemon.adaptContainerSettings(container.HostConfig, false); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return daemon.containerStart(container, checkpoint, checkpointDir, true)
|
|
}
|
|
|
|
// containerStart prepares the container to run by setting up everything the
|
|
// container needs, such as storage and networking, as well as links
|
|
// between containers. The container is left waiting for a signal to
|
|
// begin running.
|
|
func (daemon *Daemon) containerStart(container *container.Container, checkpoint string, checkpointDir string, resetRestartManager bool) (err error) {
|
|
start := time.Now()
|
|
container.Lock()
|
|
defer container.Unlock()
|
|
|
|
if resetRestartManager && container.Running { // skip this check if already in restarting step and resetRestartManager==false
|
|
return nil
|
|
}
|
|
|
|
if container.RemovalInProgress || container.Dead {
|
|
return fmt.Errorf("Container is marked for removal and cannot be started.")
|
|
}
|
|
|
|
// if we encounter an error during start we need to ensure that any other
|
|
// setup has been cleaned up properly
|
|
defer func() {
|
|
if err != nil {
|
|
container.SetError(err)
|
|
// if no one else has set it, make sure we don't leave it at zero
|
|
if container.ExitCode() == 0 {
|
|
container.SetExitCode(128)
|
|
}
|
|
if err := container.CheckpointTo(daemon.containersReplica); err != nil {
|
|
logrus.Errorf("%s: failed saving state on start failure: %v", container.ID, err)
|
|
}
|
|
container.Reset(false)
|
|
|
|
daemon.Cleanup(container)
|
|
// if containers AutoRemove flag is set, remove it after clean up
|
|
if container.HostConfig.AutoRemove {
|
|
container.Unlock()
|
|
if err := daemon.ContainerRm(container.ID, &types.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}); err != nil {
|
|
logrus.Errorf("can't remove container %s: %v", container.ID, err)
|
|
}
|
|
container.Lock()
|
|
}
|
|
}
|
|
}()
|
|
|
|
if err := daemon.conditionalMountOnStart(container); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := daemon.initializeNetworking(container); err != nil {
|
|
return err
|
|
}
|
|
|
|
spec, err := daemon.createSpec(container)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
createOptions, err := daemon.getLibcontainerdCreateOptions(container)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if resetRestartManager {
|
|
container.ResetRestartManager(true)
|
|
}
|
|
|
|
if checkpointDir == "" {
|
|
checkpointDir = container.CheckpointDir()
|
|
}
|
|
|
|
if daemon.saveApparmorConfig(container); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := daemon.containerd.Create(container.ID, checkpoint, checkpointDir, *spec, container.InitializeStdio, createOptions...); err != nil {
|
|
errDesc := grpc.ErrorDesc(err)
|
|
contains := func(s1, s2 string) bool {
|
|
return strings.Contains(strings.ToLower(s1), s2)
|
|
}
|
|
logrus.Errorf("Create container failed with error: %s", errDesc)
|
|
// if we receive an internal error from the initial start of a container then lets
|
|
// return it instead of entering the restart loop
|
|
// set to 127 for container cmd not found/does not exist)
|
|
if contains(errDesc, container.Path) &&
|
|
(contains(errDesc, "executable file not found") ||
|
|
contains(errDesc, "no such file or directory") ||
|
|
contains(errDesc, "system cannot find the file specified")) {
|
|
container.SetExitCode(127)
|
|
}
|
|
// set to 126 for container cmd can't be invoked errors
|
|
if contains(errDesc, syscall.EACCES.Error()) {
|
|
container.SetExitCode(126)
|
|
}
|
|
|
|
// attempted to mount a file onto a directory, or a directory onto a file, maybe from user specified bind mounts
|
|
if contains(errDesc, syscall.ENOTDIR.Error()) {
|
|
errDesc += ": Are you trying to mount a directory onto a file (or vice-versa)? Check if the specified host path exists and is the expected type"
|
|
container.SetExitCode(127)
|
|
}
|
|
|
|
return fmt.Errorf("%s", errDesc)
|
|
}
|
|
|
|
containerActions.WithValues("start").UpdateSince(start)
|
|
|
|
return nil
|
|
}
|
|
|
|
// Cleanup releases any network resources allocated to the container along with any rules
|
|
// around how containers are linked together. It also unmounts the container's root filesystem.
|
|
func (daemon *Daemon) Cleanup(container *container.Container) {
|
|
daemon.releaseNetwork(container)
|
|
|
|
if err := container.UnmountIpcMount(detachMounted); err != nil {
|
|
logrus.Warnf("%s cleanup: failed to unmount IPC: %s", container.ID, err)
|
|
}
|
|
|
|
if err := daemon.conditionalUnmountOnCleanup(container); err != nil {
|
|
// FIXME: remove once reference counting for graphdrivers has been refactored
|
|
// Ensure that all the mounts are gone
|
|
if mountid, err := daemon.stores[container.Platform].layerStore.GetMountID(container.ID); err == nil {
|
|
daemon.cleanupMountsByID(mountid)
|
|
}
|
|
}
|
|
|
|
if err := container.UnmountSecrets(); err != nil {
|
|
logrus.Warnf("%s cleanup: failed to unmount secrets: %s", container.ID, err)
|
|
}
|
|
|
|
for _, eConfig := range container.ExecCommands.Commands() {
|
|
daemon.unregisterExecCommand(container, eConfig)
|
|
}
|
|
|
|
if container.BaseFS != "" {
|
|
if err := container.UnmountVolumes(daemon.LogVolumeEvent); err != nil {
|
|
logrus.Warnf("%s cleanup: Failed to umount volumes: %v", container.ID, err)
|
|
}
|
|
}
|
|
container.CancelAttachContext()
|
|
}
|