moby/daemon/kill.go
Brian Goff c458bca6dc Handle missing c8d task on stop
In this case, we are sending a signal to the container (typically this
would be SIGKILL or SIGTERM, but could be any signal), but containerd
reports that the process does not exist.

At the point this code is happening, dockerd thinks that the container
is running, but containerd reports that it is not.

Since containerd reports that it is not running, try to collect the exit
status of the container from containerd, and mark the container as
stopped in dockerd.

Repro this problem like so:

```
id=$(docker run -d busybox top)
pkill containerd && pkill top
docker stop $id
```

Without this change, `docker stop $id` will first try to send SIGTERM,
wait for exit, then try SIGKILL.
Because the process doesn't exist to begin with, no signal is sent, and
so nothing happens.
Since we won't receive any event here to process, the container can
never be marked as stopped until the daemon is restarted.

With the change `docker stop` succeeds immediately (since the process is
already stopped) and we mark the container as stopped. We handle the
case as if we missed an exit event.

There are definitely some other places in the stack that could use some
improvement here, but this helps people get out of a sticky situation.

With io.containerd.runc.v2, no event is ever received by docker because
the shim quits trying to send the event.

With io.containerd.runtime.v1.linux the TaskExit event is sent before
dockerd can reconnect to the event stream and we miss the event.

No matter what, we shouldn't be reliant on the shim doing the right
thing here, nor can we rely on a steady event stream.

Signed-off-by: Brian Goff <cpuguy83@gmail.com>
2020-07-28 10:09:25 -07:00

180 lines
5.7 KiB
Go

package daemon // import "github.com/docker/docker/daemon"
import (
"context"
"fmt"
"runtime"
"syscall"
"time"
containerpkg "github.com/docker/docker/container"
"github.com/docker/docker/errdefs"
libcontainerdtypes "github.com/docker/docker/libcontainerd/types"
"github.com/docker/docker/pkg/signal"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
// errNoSuchProcess records a signal that could not be delivered because the
// target process does not exist.
type errNoSuchProcess struct {
	pid    int // process ID the signal was aimed at
	signal int // number of the signal that was being sent
}

// Error implements the error interface, reporting the pid and signal number.
func (e errNoSuchProcess) Error() string {
	const format = "Cannot kill process (pid=%d) with signal %d: no such process."
	return fmt.Sprintf(format, e.pid, e.signal)
}

// NotFound is an empty marker method.
// NOTE(review): presumably this is what lets errdefs.IsNotFound recognise the
// error as a "not found" condition — confirm against the errdefs package.
func (errNoSuchProcess) NotFound() {}
// isErrNoSuchProcess reports whether err is an errNoSuchProcess value.
// Only the error itself is inspected; wrapped errors are not unwrapped, and
// a nil err yields false.
func isErrNoSuchProcess(err error) bool {
	switch err.(type) {
	case errNoSuchProcess:
		return true
	default:
		return false
	}
}
// ContainerKill delivers a signal to the container identified by name.
//
// A sig of 0 (no signal given) or SIGKILL is routed through Kill, which sends
// SIGKILL and waits for the container to exit. Any other signal is sent
// with killWithSignal and the call returns without waiting.
//
// An error is returned when the container lookup fails, or when a non-zero
// sig is not a valid signal for the platform the daemon runs on.
func (daemon *Daemon) ContainerKill(name string, sig uint64) error {
	container, err := daemon.GetContainer(name)
	if err != nil {
		return err
	}

	requested := syscall.Signal(sig)
	switch {
	case sig != 0 && !signal.ValidSignalForPlatform(requested):
		return fmt.Errorf("The %s daemon does not support signal %d", runtime.GOOS, sig)
	case sig == 0 || requested == syscall.SIGKILL:
		// No explicit signal, or SIGKILL: regular Kill (SIGKILL + wait).
		return daemon.Kill(container)
	default:
		return daemon.killWithSignal(container, int(sig))
	}
}
// killWithSignal sends sig to the container, preparing the container state
// before the signal is delivered. The container lock is held for the whole
// call.
//
// It returns an error if the container is not running, if the configured
// stop signal cannot be parsed, or if the underlying kill fails for a reason
// other than "not found". A paused container is not rejected: when the
// signal is expected to stop it, the container is resumed after the signal
// has been queued.
func (daemon *Daemon) killWithSignal(container *containerpkg.Container, sig int) error {
	logrus.Debugf("Sending kill signal %d to container %s", sig, container.ID)
	container.Lock()
	defer container.Unlock()

	if !container.Running {
		return errNotRunning(container.ID)
	}

	// The signal counts as a stop request unless the container configures a
	// custom stop signal and sig is neither that signal nor SIGKILL.
	stopRequested := true
	if container.Config.StopSignal != "" && syscall.Signal(sig) != syscall.SIGKILL {
		containerStopSignal, err := signal.ParseSignal(container.Config.StopSignal)
		if err != nil {
			return err
		}
		stopRequested = containerStopSignal == syscall.Signal(sig)
	}

	var unpause bool
	if stopRequested {
		container.ExitOnNext()
		unpause = container.Paused
	}

	if !daemon.IsShuttingDown() {
		container.HasBeenManuallyStopped = true
		// A failed checkpoint must not abort the kill, but it should not go
		// unnoticed either: the manually-stopped flag may not be persisted.
		if err := container.CheckpointTo(daemon.containersReplica); err != nil {
			logrus.WithError(err).WithField("container", container.ID).WithField("action", "kill").Warn("error checkpointing container state")
		}
	}

	// if the container is currently restarting we do not need to send the signal
	// to the process. Telling the monitor that it should exit on its next event
	// loop is enough
	if container.Restarting {
		return nil
	}

	if err := daemon.kill(container, sig); err != nil {
		if !errdefs.IsNotFound(err) {
			return errors.Wrapf(err, "Cannot kill container %s", container.ID)
		}
		// The daemon believes the container is running but the kill reported
		// "not found": there is nothing to resume, so treat it as a missed
		// exit event and process the exit.
		// NOTE(review): run in a goroutine presumably because the exit
		// handler needs the container lock held here — confirm.
		unpause = false
		logrus.WithError(err).WithField("container", container.ID).WithField("action", "kill").Debug("container kill failed because of 'container not found' or 'no such process'")
		go daemon.handleContainerExit(container, nil)
	}

	if unpause {
		// above kill signal will be sent once resume is finished
		if err := daemon.containerd.Resume(context.Background(), container.ID); err != nil {
			logrus.Warnf("Cannot unpause container %s: %s", container.ID, err)
		}
	}

	attributes := map[string]string{
		"signal": fmt.Sprintf("%d", sig),
	}
	daemon.LogContainerEventWithAttributes(container, "kill", attributes)
	return nil
}
// Kill forcefully terminates a container: it sends SIGKILL and then blocks
// until the container is no longer running.
//
// It returns an error if the container is not running when called, if the
// SIGKILL fails and the container is still running after a short grace
// period, or if the direct kill fallback fails. A "no such process" result
// from either kill attempt is treated as success.
func (daemon *Daemon) Kill(container *containerpkg.Container) error {
	if !container.IsRunning() {
		return errNotRunning(container.ID)
	}

	// 1. Send SIGKILL
	sigErr := daemon.killPossiblyDeadProcess(container, int(syscall.SIGKILL))
	if sigErr != nil {
		if isErrNoSuchProcess(sigErr) {
			return nil
		}

		// Normally the error would be returned right away, but a failure at
		// this point most likely means the container stopped between the
		// IsRunning() check above and the kill. The error is environment
		// specific, so "already dead" cannot be told apart from a real
		// failure by inspecting it. Instead, allow up to 2 more seconds for
		// the container to stop; only if it is still running by then is the
		// original error considered valid and returned to the caller.
		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		defer cancel()

		status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning)
		if status.Err() != nil {
			return sigErr
		}
	}

	// 2. Wait for the process to die, in last resort, try to kill the process directly
	directErr := killProcessDirectly(container)
	switch {
	case directErr == nil:
		// Fall through to the final wait below.
	case isErrNoSuchProcess(directErr):
		return nil
	default:
		return directErr
	}

	// Block with no timeout until the container is not running; the
	// returned status is intentionally discarded.
	<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning)
	return nil
}
// killPossiblyDeadProcess wraps killWithSignal and normalises a "not found"
// failure into an errNoSuchProcess carrying the container's PID and the
// signal, so callers can detect it with isErrNoSuchProcess. The converted
// error is logged at debug level. Any other result, including nil, is
// returned unchanged.
func (daemon *Daemon) killPossiblyDeadProcess(container *containerpkg.Container, sig int) error {
	killErr := daemon.killWithSignal(container, sig)
	if !errdefs.IsNotFound(killErr) {
		return killErr
	}

	noProc := errNoSuchProcess{pid: container.GetPID(), signal: sig}
	logrus.Debug(noProc)
	return noProc
}
// kill asks the daemon's containerd client to signal the process named
// libcontainerdtypes.InitProcessName in container c with sig, using a
// background context. The client's error is returned as-is.
func (daemon *Daemon) kill(c *containerpkg.Container, sig int) error {
	ctx := context.Background()
	return daemon.containerd.SignalProcess(ctx, c.ID, libcontainerdtypes.InitProcessName, sig)
}