57d2d6ef62
The OOMKilled flag on a container's state has historically behaved rather unintuitively: it is only updated on container exit, to reflect whether any process within the container was OOM-killed during the preceding run. The flag is set to true when the container exits if any process within it (including execs) was OOM-killed at any time while the container was running, whether or not the OOM kill caused the container to exit. The flag is also "sticky": it persists through the next start of the container and is only cleared once the container exits a run during which no process was OOM-killed.

Alter the behavior of the OOMKilled flag so that it signals whether any process in the container has been OOM-killed since the most recent start of the container. Set the flag immediately upon any process being OOM-killed, and clear it when the container transitions to the "running" state.

There is an ulterior motive for this change: it reduces the amount of state the libcontainerd client needs to track and clean up on container exit, and it is one less place the client could leak memory if a container were deleted without going through libcontainerd.

Signed-off-by: Cory Snider <csnider@mirantis.com>
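The setting side of the new behavior is visible in ProcessEvent below: the EventOOM case marks the container as OOM-killed as soon as libcontainerd reports the kill. The clearing side happens in the container state transition rather than in this file. The following is a minimal sketch (not the actual moby code) of what clearing on the "running" transition could look like; everything except the OOMKilled field and the "clear on transition to running" rule stated in the commit message is an assumption for illustration.

package statesketch

import "time"

// Sketch only: a pared-down container state with just enough fields to
// illustrate the per-run OOMKilled semantics. The real container.State in
// moby has many more fields and methods.
type State struct {
	Running   bool
	Paused    bool
	Pid       int
	OOMKilled bool
	StartedAt time.Time
}

// SetRunning (sketch) begins a new run of the container. Clearing OOMKilled
// here means the flag only ever reports OOM kills from the current run.
func (s *State) SetRunning(pid int, initial bool) {
	s.Running = true
	s.Paused = false
	s.Pid = pid
	s.OOMKilled = false // cleared on every transition to "running"
	if initial {
		s.StartedAt = time.Now().UTC()
	}
}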
package daemon // import "github.com/docker/docker/daemon"

import (
	"context"
	"strconv"
	"time"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/container"
	libcontainerdtypes "github.com/docker/docker/libcontainerd/types"
	"github.com/docker/docker/restartmanager"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
)

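// setStateCounter records the container's coarse state (paused, running, or
// stopped) in the daemon's per-container state counter.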
func (daemon *Daemon) setStateCounter(c *container.Container) {
	switch c.StateString() {
	case "paused":
		stateCtr.set(c.ID, "paused")
	case "running":
		stateCtr.set(c.ID, "running")
	default:
		stateCtr.set(c.ID, "stopped")
	}
}

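// handleContainerExit handles the exit of the container's init process: it
// reaps the containerd task, consults the restart policy to decide whether
// the container should be restarted, updates and checkpoints the container's
// state, and emits the "die" event.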
func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontainerdtypes.EventInfo) error {
	c.Lock()
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	ec, et, err := daemon.containerd.DeleteTask(ctx, c.ID)
	cancel()
	if err != nil {
		logrus.WithError(err).WithField("container", c.ID).Warnf("failed to delete container from containerd")
	}

	ctx, cancel = context.WithTimeout(context.Background(), 2*time.Second)
	c.StreamConfig.Wait(ctx)
	cancel()

	c.Reset(false)

	exitStatus := container.ExitStatus{
		ExitCode: int(ec),
		ExitedAt: et,
	}
	if e != nil {
		exitStatus.ExitCode = int(e.ExitCode)
		exitStatus.ExitedAt = e.ExitedAt
		if e.Error != nil {
			c.SetError(e.Error)
		}
	}

	daemonShutdown := daemon.IsShuttingDown()
	execDuration := time.Since(c.StartedAt)
	restart, wait, err := c.RestartManager().ShouldRestart(ec, daemonShutdown || c.HasBeenManuallyStopped, execDuration)
	if err != nil {
		logrus.WithError(err).
			WithField("container", c.ID).
			WithField("restartCount", c.RestartCount).
			WithField("exitStatus", exitStatus).
			WithField("daemonShuttingDown", daemonShutdown).
			WithField("hasBeenManuallyStopped", c.HasBeenManuallyStopped).
			WithField("execDuration", execDuration).
			Warn("ShouldRestart failed, container will not be restarted")
		restart = false
	}

	// cancel healthchecks here; they will be automatically
	// restarted if/when the container is started again
	daemon.stopHealthchecks(c)
	attributes := map[string]string{
		"exitCode": strconv.Itoa(int(ec)),
	}
	daemon.Cleanup(c)

	if restart {
		c.RestartCount++
		logrus.WithField("container", c.ID).
			WithField("restartCount", c.RestartCount).
			WithField("exitStatus", exitStatus).
			WithField("manualRestart", c.HasBeenManuallyRestarted).
			Debug("Restarting container")
		c.SetRestarting(&exitStatus)
	} else {
		c.SetStopped(&exitStatus)
		if !c.HasBeenManuallyRestarted {
			defer daemon.autoRemove(c)
		}
	}
	defer c.Unlock() // needs to be called before autoRemove

	daemon.setStateCounter(c)
	cpErr := c.CheckpointTo(daemon.containersReplica)

	daemon.LogContainerEventWithAttributes(c, "die", attributes)

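	// Restart asynchronously: the goroutine below first waits on the channel
	// returned by ShouldRestart, which fires after the restart policy's
	// backoff delay (or yields an error if the restart is canceled).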
	if restart {
		go func() {
			err := <-wait
			if err == nil {
				// daemon.netController is initialized when the daemon is restoring
				// containers, but containerStart needs daemon.netController. To avoid
				// a panic during startup, wait here until the daemon has finished
				// restoring containers.
				daemon.waitForStartupDone()
				if err = daemon.containerStart(c, "", "", false); err != nil {
					logrus.Debugf("failed to restart container: %+v", err)
				}
			}
			if err != nil {
				c.Lock()
				c.SetStopped(&exitStatus)
				daemon.setStateCounter(c)
				c.CheckpointTo(daemon.containersReplica)
				c.Unlock()
				defer daemon.autoRemove(c)
				if err != restartmanager.ErrRestartCanceled {
					logrus.Errorf("restartmanager wait error: %+v", err)
				}
			}
		}()
	}

	return cpErr
}

// ProcessEvent is called by libcontainerd whenever an event occurs
func (daemon *Daemon) ProcessEvent(id string, e libcontainerdtypes.EventType, ei libcontainerdtypes.EventInfo) error {
	c, err := daemon.GetContainer(id)
	if err != nil {
		return errors.Wrapf(err, "could not find container %s", id)
	}

	switch e {
	case libcontainerdtypes.EventOOM:
		// StateOOM is Linux specific and should never be hit on Windows
		if isWindows {
			return errors.New("received StateOOM from libcontainerd on Windows. This should never happen")
		}

		c.Lock()
		defer c.Unlock()
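		// With the reworked OOMKilled semantics the flag is set immediately,
		// as soon as any process in the container is OOM-killed, rather than
		// on container exit; it is cleared again on the next transition to
		// the "running" state.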
		c.OOMKilled = true
		daemon.updateHealthMonitor(c)
		if err := c.CheckpointTo(daemon.containersReplica); err != nil {
			return err
		}

		daemon.LogContainerEvent(c, "oom")
	case libcontainerdtypes.EventExit:
		if int(ei.Pid) == c.Pid {
			return daemon.handleContainerExit(c, &ei)
		}

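		// The exit event is for an exec'd process rather than the container's
		// init process: record the exit code on the exec config and emit an
		// exec_die event instead of tearing down the container.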
		exitCode := 127
		if execConfig := c.ExecCommands.Get(ei.ProcessID); execConfig != nil {
			ec := int(ei.ExitCode)
			execConfig.Lock()
			defer execConfig.Unlock()
			execConfig.ExitCode = &ec
			execConfig.Running = false

			ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
			execConfig.StreamConfig.Wait(ctx)
			cancel()

			if err := execConfig.CloseStreams(); err != nil {
				logrus.Errorf("failed to cleanup exec %s streams: %s", c.ID, err)
			}

			// remove the exec command from the container's store only and not the
			// daemon's store so that the exec command can be inspected.
			c.ExecCommands.Delete(execConfig.ID, execConfig.Pid)

			exitCode = ec
		}
		attributes := map[string]string{
			"execID":   ei.ProcessID,
			"exitCode": strconv.Itoa(exitCode),
		}
		daemon.LogContainerEventWithAttributes(c, "exec_die", attributes)
	case libcontainerdtypes.EventStart:
		c.Lock()
		defer c.Unlock()

		// This is here to handle a start not generated by docker
		if !c.Running {
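			// SetRunning begins a fresh run of the container; per the new
			// OOMKilled semantics, the flag is cleared on this transition
			// to "running".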
			c.SetRunning(int(ei.Pid), false)
			c.HasBeenManuallyStopped = false
			c.HasBeenStartedBefore = true
			daemon.setStateCounter(c)

			daemon.initHealthMonitor(c)

			if err := c.CheckpointTo(daemon.containersReplica); err != nil {
				return err
			}
			daemon.LogContainerEvent(c, "start")
		}

	case libcontainerdtypes.EventPaused:
		c.Lock()
		defer c.Unlock()

		if !c.Paused {
			c.Paused = true
			daemon.setStateCounter(c)
			daemon.updateHealthMonitor(c)
			if err := c.CheckpointTo(daemon.containersReplica); err != nil {
				return err
			}
			daemon.LogContainerEvent(c, "pause")
		}
	case libcontainerdtypes.EventResumed:
		c.Lock()
		defer c.Unlock()

		if c.Paused {
			c.Paused = false
			daemon.setStateCounter(c)
			daemon.updateHealthMonitor(c)

			if err := c.CheckpointTo(daemon.containersReplica); err != nil {
				return err
			}
			daemon.LogContainerEvent(c, "unpause")
		}
	}
	return nil
}

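// autoRemove removes the container if it was created with AutoRemove, forcing
// removal and removing associated volumes. A removal error is only logged if
// the container is still present in the daemon's store.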
func (daemon *Daemon) autoRemove(c *container.Container) {
	c.Lock()
	ar := c.HostConfig.AutoRemove
	c.Unlock()
	if !ar {
		return
	}

	err := daemon.ContainerRm(c.ID, &types.ContainerRmConfig{ForceRemove: true, RemoveVolume: true})
	if err == nil {
		return
	}
	if c := daemon.containers.Get(c.ID); c == nil {
		return
	}

	logrus.WithError(err).WithField("container", c.ID).Error("error removing container")
}