moby/daemon/container.go
Cory Snider d222bf097c daemon: reload runtimes w/o breaking containers
The existing runtimes reload logic went to great lengths to replace the
directory containing runtime wrapper scripts as atomically as possible
within the limitations of the Linux filesystem ABI. Trouble is,
atomically swapping the wrapper scripts directory solves the wrong
problem! The runtime configuration is "locked in" when a container is
started, including the path to the runC binary. If a container is
started with a runtime which requires a daemon-managed wrapper script
and then the daemon is reloaded with a config which no longer requires
the wrapper script (i.e. some args -> no args, or the runtime is dropped
from the config), that container would become unmanageable. Any attempts
to stop, exec or otherwise perform lifecycle management operations on
the container are likely to fail due to the wrapper script no longer
existing at its original path.

Atomically swapping the wrapper scripts is also incompatible with the
read-copy-update paradigm for reloading configuration. A handler in the
daemon could retain a reference to the pre-reload configuration for an
indeterminate amount of time after the daemon configuration has been
reloaded and updated. It is possible for the daemon to attempt to start
a container using a deleted wrapper script if a request to run a
container races a reload.
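
To make the hazard concrete, here is a minimal, sequentialized sketch
of the race (all names are illustrative, not actual daemon internals):
a handler loads a snapshot of the config, a reload publishes a
replacement and deletes the wrapper script the old snapshot still
references, and the handler's subsequent use of the stale path fails.

    package main

    import (
        "fmt"
        "os"
        "path/filepath"
        "sync/atomic"
    )

    type snapshot struct{ wrapperPath string }

    func main() {
        dir, _ := os.MkdirTemp("", "reload-race")
        oldScript := filepath.Join(dir, "runc-wrapper")
        _ = os.WriteFile(oldScript, []byte("#!/bin/sh\n"), 0o700)

        var current atomic.Pointer[snapshot]
        current.Store(&snapshot{wrapperPath: oldScript})

        // Handler: take a snapshot of the config and hold on to it.
        cfg := current.Load()

        // Reload: publish a new snapshot, then delete the old wrapper.
        current.Store(&snapshot{wrapperPath: filepath.Join(dir, "new-wrapper")})
        _ = os.Remove(oldScript)

        // The retained snapshot now points at a path that no longer exists.
        if _, err := os.Stat(cfg.wrapperPath); err != nil {
            fmt.Println("stale snapshot:", err)
        }
    }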

Solve the problem of deleting referenced wrapper scripts by ensuring
that all wrapper scripts are *immutable* for the lifetime of the daemon
process. Any given runtime wrapper script must always exist with the
same contents, no matter how many times the daemon config is reloaded,
or what changes are made to the config. This is accomplished by using
everyone's favourite design pattern: content-addressable storage. Each
wrapper script file name is suffixed with the SHA-256 digest of its
contents to (probabilistically) guarantee immutability without needing
any concurrency control. Stale runtime wrapper scripts are only cleaned
up on the next daemon restart.
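
A minimal sketch of the scheme (the helper name, directory layout, and
file mode are illustrative, not the daemon's actual code):

    package wrapperscripts

    import (
        "crypto/sha256"
        "fmt"
        "os"
        "path/filepath"
    )

    // writeWrapperScript stores a wrapper script at a path derived from
    // the SHA-256 digest of its contents. The same contents always map to
    // the same path, so a reload can only ever add new scripts; it never
    // mutates or invalidates one that a running container still uses.
    func writeWrapperScript(scriptsDir, runtimeName, contents string) (string, error) {
        digest := sha256.Sum256([]byte(contents))
        path := filepath.Join(scriptsDir, fmt.Sprintf("%s.%x", runtimeName, digest))
        if _, err := os.Stat(path); err == nil {
            return path, nil // identical script already on disk; reuse it
        }
        if err := os.WriteFile(path, []byte(contents), 0o700); err != nil {
            return "", err
        }
        return path, nil
    }

Distinct contents collide on a path only with negligible probability,
so no locking is needed between concurrent reloads, and sweeping
unreferenced scripts can safely wait until the next daemon start.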

Split the derived runtimes configuration from the user-supplied
configuration to have a place to store derived state without mutating
the user-supplied configuration or exposing daemon internals in API
struct types. Hold the derived state and the user-supplied configuration
in a single struct value so that they can be updated as an atomic unit.
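
The shape of that arrangement, sketched with simplified and partly
hypothetical types: readers take one atomic load and use that snapshot
for the whole operation, while a reload publishes a freshly built value
rather than mutating the current one in place.

    package config

    import "sync/atomic"

    // Config is the user-supplied daemon configuration (API-visible).
    type Config struct{ /* ... */ }

    // runtimesState is derived, daemon-internal state, e.g. the resolved
    // content-addressed wrapper-script path for each configured runtime.
    type runtimesState struct{ /* ... */ }

    // configStore pairs the user-supplied configuration with the state
    // derived from it, so both can be swapped as a single atomic unit.
    type configStore struct {
        Config
        Runtimes runtimesState
    }

    type daemon struct{ cfg atomic.Pointer[configStore] }

    // config returns an immutable snapshot; callers must not mutate it
    // and should keep using the same snapshot for one whole operation.
    func (d *daemon) config() *configStore { return d.cfg.Load() }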

Signed-off-by: Cory Snider <csnider@mirantis.com>
2023-06-01 14:45:25 -04:00

package daemon // import "github.com/docker/docker/daemon"
import (
	"fmt"
	"os"
	"path/filepath"
	"runtime"
	"time"

	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/config"
	"github.com/docker/docker/daemon/network"
	"github.com/docker/docker/errdefs"
	"github.com/docker/docker/image"
	"github.com/docker/docker/oci/caps"
	"github.com/docker/docker/opts"
	"github.com/docker/docker/pkg/system"
	"github.com/docker/docker/runconfig"
	volumemounts "github.com/docker/docker/volume/mounts"
	"github.com/moby/sys/signal"
	"github.com/opencontainers/selinux/go-selinux"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
)
// GetContainer looks for a container using the provided information, which could be
// one of the following inputs from the caller:
//  - A full container ID, which will exactly match a container in the daemon's list
//  - A container name, which will only exactly match via the GetByName() function
//  - A partial container ID prefix (e.g. short ID) of any length that is
//    unique enough to only return a single container object
// If none of these searches succeed, an error is returned
func (daemon *Daemon) GetContainer(prefixOrName string) (*container.Container, error) {
if len(prefixOrName) == 0 {
return nil, errors.WithStack(invalidIdentifier(prefixOrName))
}
if containerByID := daemon.containers.Get(prefixOrName); containerByID != nil {
// prefix is an exact match to a full container ID
return containerByID, nil
}
// GetByName will match only an exact name provided; we ignore errors
if containerByName, _ := daemon.GetByName(prefixOrName); containerByName != nil {
// prefix is an exact match to a full container Name
return containerByName, nil
}
containerID, err := daemon.containersReplica.GetByPrefix(prefixOrName)
if err != nil {
return nil, err
}
ctr := daemon.containers.Get(containerID)
if ctr == nil {
// Updates to the daemon.containersReplica ViewDB are not atomic
// or consistent w.r.t. the live daemon.containers Store so
// while reaching this code path may be indicative of a bug,
// it is not _necessarily_ the case.
logrus.WithField("prefixOrName", prefixOrName).
WithField("id", containerID).
Debugf("daemon.GetContainer: container is known to daemon.containersReplica but not daemon.containers")
return nil, containerNotFound(prefixOrName)
}
return ctr, nil
}
// checkContainer makes sure the specified container satisfies the given conditions
func (daemon *Daemon) checkContainer(container *container.Container, conditions ...func(*container.Container) error) error {
for _, condition := range conditions {
if err := condition(container); err != nil {
return err
}
}
return nil
}
// Exists returns true if a container with the specified ID or name exists,
// false otherwise.
func (daemon *Daemon) Exists(id string) bool {
c, _ := daemon.GetContainer(id)
return c != nil
}
// IsPaused returns a bool indicating whether the specified container is paused.
func (daemon *Daemon) IsPaused(id string) bool {
	c, _ := daemon.GetContainer(id)
	if c == nil {
		// Don't dereference a nil container when the ID or name is unknown.
		return false
	}
	return c.State.IsPaused()
}
func (daemon *Daemon) containerRoot(id string) string {
return filepath.Join(daemon.repository, id)
}
// load reads the contents of a container from disk.
// This is typically done at startup.
func (daemon *Daemon) load(id string) (*container.Container, error) {
ctr := daemon.newBaseContainer(id)
if err := ctr.FromDisk(); err != nil {
return nil, err
}
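	// Reserve the container's SELinux process label so its MCS categories
	// are not handed out to another container.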
selinux.ReserveLabel(ctr.ProcessLabel)
if ctr.ID != id {
return ctr, fmt.Errorf("Container %s is stored at %s", ctr.ID, id)
}
return ctr, nil
}
// Register makes a container object usable by the daemon as <container.ID>
func (daemon *Daemon) Register(c *container.Container) error {
// Attach to stdout and stderr
if c.Config.OpenStdin {
c.StreamConfig.NewInputPipes()
} else {
c.StreamConfig.NewNopInputPipe()
}
// once in the memory store it is visible to other goroutines
// grab a Lock until it has been checkpointed to avoid races
c.Lock()
defer c.Unlock()
daemon.containers.Add(c.ID, c)
return c.CheckpointTo(daemon.containersReplica)
}
func (daemon *Daemon) newContainer(name string, operatingSystem string, config *containertypes.Config, hostConfig *containertypes.HostConfig, imgID image.ID, managed bool) (*container.Container, error) {
var (
id string
err error
noExplicitName = name == ""
)
id, name, err = daemon.generateIDAndName(name)
if err != nil {
return nil, err
}
if hostConfig.NetworkMode.IsHost() {
if config.Hostname == "" {
config.Hostname, err = os.Hostname()
if err != nil {
return nil, errdefs.System(err)
}
}
} else {
daemon.generateHostname(id, config)
}
entrypoint, args := daemon.getEntrypointAndArgs(config.Entrypoint, config.Cmd)
base := daemon.newBaseContainer(id)
base.Created = time.Now().UTC()
base.Managed = managed
base.Path = entrypoint
base.Args = args // FIXME: de-duplicate from config
base.Config = config
base.HostConfig = &containertypes.HostConfig{}
base.ImageID = imgID
base.NetworkSettings = &network.Settings{IsAnonymousEndpoint: noExplicitName}
base.Name = name
base.Driver = daemon.imageService.StorageDriver()
base.OS = operatingSystem
	return base, nil
}
// GetByName returns a container given a name.
func (daemon *Daemon) GetByName(name string) (*container.Container, error) {
if len(name) == 0 {
return nil, fmt.Errorf("No container name supplied")
}
fullName := name
if name[0] != '/' {
fullName = "/" + name
}
id, err := daemon.containersReplica.Snapshot().GetID(fullName)
if err != nil {
return nil, fmt.Errorf("Could not find entity for %s", name)
}
e := daemon.containers.Get(id)
if e == nil {
return nil, fmt.Errorf("Could not find container for entity id %s", id)
}
return e, nil
}
// newBaseContainer creates a new container with its initial
// configuration based on the root storage from the daemon.
func (daemon *Daemon) newBaseContainer(id string) *container.Container {
return container.NewBaseContainer(id, daemon.containerRoot(id))
}
func (daemon *Daemon) getEntrypointAndArgs(configEntrypoint strslice.StrSlice, configCmd strslice.StrSlice) (string, []string) {
if len(configEntrypoint) != 0 {
return configEntrypoint[0], append(configEntrypoint[1:], configCmd...)
}
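	// Container creation is rejected earlier when both Entrypoint and Cmd
	// are empty, so configCmd is guaranteed to be non-empty here.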
return configCmd[0], configCmd[1:]
}
func (daemon *Daemon) generateHostname(id string, config *containertypes.Config) {
// Generate default hostname
if config.Hostname == "" {
config.Hostname = id[:12]
}
}
func (daemon *Daemon) setSecurityOptions(cfg *config.Config, container *container.Container, hostConfig *containertypes.HostConfig) error {
container.Lock()
defer container.Unlock()
return daemon.parseSecurityOpt(cfg, &container.SecurityOptions, hostConfig)
}
func (daemon *Daemon) setHostConfig(container *container.Container, hostConfig *containertypes.HostConfig) error {
// Do not lock while creating volumes since this could be calling out to external plugins
// Don't want to block other actions, like `docker ps`, because we're waiting on an external plugin
if err := daemon.registerMountPoints(container, hostConfig); err != nil {
return err
}
container.Lock()
defer container.Unlock()
// Register any links from the host config before starting the container
if err := daemon.registerLinks(container, hostConfig); err != nil {
return err
}
runconfig.SetDefaultNetModeIfBlank(hostConfig)
container.HostConfig = hostConfig
return nil
}
// verifyContainerSettings performs validation of the hostconfig and config
// structures.
func (daemon *Daemon) verifyContainerSettings(daemonCfg *configStore, hostConfig *containertypes.HostConfig, config *containertypes.Config, update bool) (warnings []string, err error) {
// First perform verification of settings common across all platforms.
if err = validateContainerConfig(config); err != nil {
return warnings, err
}
if err := validateHostConfig(hostConfig); err != nil {
return warnings, err
}
// Now do platform-specific verification
warnings, err = verifyPlatformContainerSettings(daemon, daemonCfg, hostConfig, update)
for _, w := range warnings {
logrus.Warn(w)
}
return warnings, err
}
func validateContainerConfig(config *containertypes.Config) error {
if config == nil {
return nil
}
if err := translateWorkingDir(config); err != nil {
return err
}
if len(config.StopSignal) > 0 {
if _, err := signal.ParseSignal(config.StopSignal); err != nil {
return err
}
}
// Validate that Env doesn't contain empty variables (e.g., ``, `=foo`)
for _, env := range config.Env {
if _, err := opts.ValidateEnv(env); err != nil {
return err
}
}
return validateHealthCheck(config.Healthcheck)
}
func validateHostConfig(hostConfig *containertypes.HostConfig) error {
if hostConfig == nil {
return nil
}
if hostConfig.AutoRemove && !hostConfig.RestartPolicy.IsNone() {
return errors.Errorf("can't create 'AutoRemove' container with restart policy")
}
// Validate mounts; check if host directories still exist
parser := volumemounts.NewParser()
for _, c := range hostConfig.Mounts {
cfg := c
if err := parser.ValidateMountConfig(&cfg); err != nil {
return err
}
}
for _, extraHost := range hostConfig.ExtraHosts {
if _, err := opts.ValidateExtraHost(extraHost); err != nil {
return err
}
}
if err := validatePortBindings(hostConfig.PortBindings); err != nil {
return err
}
if err := validateRestartPolicy(hostConfig.RestartPolicy); err != nil {
return err
}
if err := validateCapabilities(hostConfig); err != nil {
return err
}
if !hostConfig.Isolation.IsValid() {
return errors.Errorf("invalid isolation '%s' on %s", hostConfig.Isolation, runtime.GOOS)
}
for k := range hostConfig.Annotations {
if k == "" {
return errors.Errorf("invalid Annotations: the empty string is not permitted as an annotation key")
}
}
return nil
}
func validateCapabilities(hostConfig *containertypes.HostConfig) error {
if _, err := caps.NormalizeLegacyCapabilities(hostConfig.CapAdd); err != nil {
return errors.Wrap(err, "invalid CapAdd")
}
if _, err := caps.NormalizeLegacyCapabilities(hostConfig.CapDrop); err != nil {
return errors.Wrap(err, "invalid CapDrop")
}
// TODO consider returning warnings if "Privileged" is combined with Capabilities, CapAdd and/or CapDrop
return nil
}
// validateHealthCheck validates the healthcheck params of Config
func validateHealthCheck(healthConfig *containertypes.HealthConfig) error {
if healthConfig == nil {
return nil
}
if healthConfig.Interval != 0 && healthConfig.Interval < containertypes.MinimumDuration {
return errors.Errorf("Interval in Healthcheck cannot be less than %s", containertypes.MinimumDuration)
}
if healthConfig.Timeout != 0 && healthConfig.Timeout < containertypes.MinimumDuration {
return errors.Errorf("Timeout in Healthcheck cannot be less than %s", containertypes.MinimumDuration)
}
if healthConfig.Retries < 0 {
return errors.Errorf("Retries in Healthcheck cannot be negative")
}
if healthConfig.StartPeriod != 0 && healthConfig.StartPeriod < containertypes.MinimumDuration {
return errors.Errorf("StartPeriod in Healthcheck cannot be less than %s", containertypes.MinimumDuration)
}
return nil
}
func validatePortBindings(ports nat.PortMap) error {
for port := range ports {
_, portStr := nat.SplitProtoPort(string(port))
if _, err := nat.ParsePort(portStr); err != nil {
return errors.Errorf("invalid port specification: %q", portStr)
}
for _, pb := range ports[port] {
_, err := nat.NewPort(nat.SplitProtoPort(pb.HostPort))
if err != nil {
return errors.Errorf("invalid port specification: %q", pb.HostPort)
}
}
}
return nil
}
func validateRestartPolicy(policy containertypes.RestartPolicy) error {
switch policy.Name {
case "always", "unless-stopped", "no":
if policy.MaximumRetryCount != 0 {
return errors.Errorf("maximum retry count cannot be used with restart policy '%s'", policy.Name)
}
case "on-failure":
if policy.MaximumRetryCount < 0 {
return errors.Errorf("maximum retry count cannot be negative")
}
case "":
// do nothing
return nil
default:
return errors.Errorf("invalid restart policy '%s'", policy.Name)
}
return nil
}
// translateWorkingDir translates the working-dir for the target platform,
// and returns an error if the given path is not an absolute path.
func translateWorkingDir(config *containertypes.Config) error {
if config.WorkingDir == "" {
return nil
}
wd := filepath.FromSlash(config.WorkingDir) // Ensure in platform semantics
if !system.IsAbs(wd) {
return fmt.Errorf("the working directory '%s' is invalid, it needs to be an absolute path", config.WorkingDir)
}
config.WorkingDir = wd
return nil
}