Merge pull request #42956 from cpuguy83/20.10-backport-kill-fixes
[20.10] Backport fixes for kill/stop handling
This commit is contained in:
commit
52f5b613ba
3 changed files with 93 additions and 93 deletions
|
@ -3,13 +3,11 @@
|
|||
package daemon // import "github.com/docker/docker/daemon"
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/container"
|
||||
"github.com/docker/docker/daemon/links"
|
||||
|
@ -336,38 +334,32 @@ func (daemon *Daemon) cleanupSecretDir(c *container.Container) {
|
|||
}
|
||||
}
|
||||
|
||||
func killProcessDirectly(cntr *container.Container) error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
func killProcessDirectly(container *container.Container) error {
|
||||
pid := container.GetPID()
|
||||
// Ensure that we don't kill ourselves
|
||||
if pid == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Block until the container to stops or timeout.
|
||||
status := <-cntr.Wait(ctx, container.WaitConditionNotRunning)
|
||||
if status.Err() != nil {
|
||||
// Ensure that we don't kill ourselves
|
||||
if pid := cntr.GetPID(); pid != 0 {
|
||||
logrus.Infof("Container %s failed to exit within 10 seconds of kill - trying direct SIGKILL", stringid.TruncateID(cntr.ID))
|
||||
if err := unix.Kill(pid, 9); err != nil {
|
||||
if err != unix.ESRCH {
|
||||
return err
|
||||
}
|
||||
e := errNoSuchProcess{pid, 9}
|
||||
logrus.Debug(e)
|
||||
return e
|
||||
}
|
||||
if err := unix.Kill(pid, 9); err != nil {
|
||||
if err != unix.ESRCH {
|
||||
return err
|
||||
}
|
||||
e := errNoSuchProcess{pid, 9}
|
||||
logrus.WithError(e).WithField("container", container.ID).Debug("no such process")
|
||||
return e
|
||||
}
|
||||
|
||||
// In case there were some exceptions(e.g., state of zombie and D)
|
||||
if system.IsProcessAlive(pid) {
|
||||
|
||||
// Since we can not kill a zombie pid, add zombie check here
|
||||
isZombie, err := system.IsProcessZombie(pid)
|
||||
if err != nil {
|
||||
logrus.Warnf("Container %s state is invalid", stringid.TruncateID(cntr.ID))
|
||||
return err
|
||||
}
|
||||
if isZombie {
|
||||
return errdefs.System(errors.Errorf("container %s PID %d is zombie and can not be killed. Use the --init option when creating containers to run an init inside the container that forwards signals and reaps processes", stringid.TruncateID(cntr.ID), pid))
|
||||
}
|
||||
}
|
||||
// In case there were some exceptions(e.g., state of zombie and D)
|
||||
if system.IsProcessAlive(pid) {
|
||||
// Since we can not kill a zombie pid, add zombie check here
|
||||
isZombie, err := system.IsProcessZombie(pid)
|
||||
if err != nil {
|
||||
logrus.WithError(err).WithField("container", container.ID).Warn("Container state is invalid")
|
||||
return err
|
||||
}
|
||||
if isZombie {
|
||||
return errdefs.System(errors.Errorf("container %s PID %d is zombie and can not be killed. Use the --init option when creating containers to run an init inside the container that forwards signals and reaps processes", stringid.TruncateID(container.ID), pid))
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
|
|
@ -139,29 +139,22 @@ func (daemon *Daemon) Kill(container *containerpkg.Container) error {
|
|||
|
||||
// 1. Send SIGKILL
|
||||
if err := daemon.killPossiblyDeadProcess(container, int(syscall.SIGKILL)); err != nil {
|
||||
// While normally we might "return err" here we're not going to
|
||||
// because if we can't stop the container by this point then
|
||||
// it's probably because it's already stopped. Meaning, between
|
||||
// the time of the IsRunning() call above and now it stopped.
|
||||
// Also, since the err return will be environment specific we can't
|
||||
// look for any particular (common) error that would indicate
|
||||
// that the process is already dead vs something else going wrong.
|
||||
// So, instead we'll give it up to 2 more seconds to complete and if
|
||||
// by that time the container is still running, then the error
|
||||
// we got is probably valid and so we return it to the caller.
|
||||
// kill failed, check if process is no longer running.
|
||||
if isErrNoSuchProcess(err) {
|
||||
return nil
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Wait for the process to die, in last resort, try to kill the process directly
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning)
|
||||
if status.Err() == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
logrus.WithError(status.Err()).WithField("container", container.ID).Error("Container failed to exit within 10 seconds of kill - trying direct SIGKILL")
|
||||
|
||||
if err := killProcessDirectly(container); err != nil {
|
||||
if isErrNoSuchProcess(err) {
|
||||
return nil
|
||||
|
@ -169,10 +162,13 @@ func (daemon *Daemon) Kill(container *containerpkg.Container) error {
|
|||
return err
|
||||
}
|
||||
|
||||
// Wait for exit with no timeout.
|
||||
// Ignore returned status.
|
||||
<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning)
|
||||
// wait for container to exit one last time, if it doesn't then kill didnt work, so return error
|
||||
ctx2, cancel2 := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel2()
|
||||
|
||||
if status := <-container.Wait(ctx2, containerpkg.WaitConditionNotRunning); status.Err() != nil {
|
||||
return errors.New("tried to kill container, but did not receive an exit event")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
|
@ -38,52 +38,64 @@ func (daemon *Daemon) ContainerStop(name string, timeout *int) error {
|
|||
|
||||
// containerStop sends a stop signal, waits, sends a kill signal.
|
||||
func (daemon *Daemon) containerStop(container *containerpkg.Container, seconds int) error {
|
||||
// TODO propagate a context down to this function
|
||||
ctx := context.TODO()
|
||||
if !container.IsRunning() {
|
||||
return nil
|
||||
}
|
||||
|
||||
stopSignal := container.StopSignal()
|
||||
// 1. Send a stop signal
|
||||
if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
|
||||
// While normally we might "return err" here we're not going to
|
||||
// because if we can't stop the container by this point then
|
||||
// it's probably because it's already stopped. Meaning, between
|
||||
// the time of the IsRunning() call above and now it stopped.
|
||||
// Also, since the err return will be environment specific we can't
|
||||
// look for any particular (common) error that would indicate
|
||||
// that the process is already dead vs something else going wrong.
|
||||
// So, instead we'll give it up to 2 more seconds to complete and if
|
||||
// by that time the container is still running, then the error
|
||||
// we got is probably valid and so we force kill it.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
|
||||
logrus.Infof("Container failed to stop after sending signal %d to the process, force killing", stopSignal)
|
||||
if err := daemon.killPossiblyDeadProcess(container, 9); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Wait for the process to exit on its own
|
||||
ctx := context.Background()
|
||||
var wait time.Duration
|
||||
if seconds >= 0 {
|
||||
var cancel context.CancelFunc
|
||||
ctx, cancel = context.WithTimeout(ctx, time.Duration(seconds)*time.Second)
|
||||
wait = time.Duration(seconds) * time.Second
|
||||
}
|
||||
success := func() error {
|
||||
daemon.LogContainerEvent(container, "stop")
|
||||
return nil
|
||||
}
|
||||
stopSignal := container.StopSignal()
|
||||
|
||||
// 1. Send a stop signal
|
||||
err := daemon.killPossiblyDeadProcess(container, stopSignal)
|
||||
if err != nil {
|
||||
wait = 2 * time.Second
|
||||
}
|
||||
|
||||
var subCtx context.Context
|
||||
var cancel context.CancelFunc
|
||||
if seconds >= 0 {
|
||||
subCtx, cancel = context.WithTimeout(ctx, wait)
|
||||
} else {
|
||||
subCtx, cancel = context.WithCancel(ctx)
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
|
||||
// container did exit, so ignore any previous errors and return
|
||||
return success()
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
// the container has still not exited, and the kill function errored, so log the error here:
|
||||
logrus.WithError(err).WithField("container", container.ID).Errorf("Error sending stop (signal %d) to container", stopSignal)
|
||||
}
|
||||
if seconds < 0 {
|
||||
// if the client requested that we never kill / wait forever, but container.Wait was still
|
||||
// interrupted (parent context cancelled, for example), we should propagate the signal failure
|
||||
return err
|
||||
}
|
||||
|
||||
logrus.WithField("container", container.ID).Infof("Container failed to exit within %s of signal %d - using the force", wait, stopSignal)
|
||||
// Stop either failed or container didnt exit, so fallback to kill.
|
||||
if err := daemon.Kill(container); err != nil {
|
||||
// got a kill error, but give container 2 more seconds to exit just in case
|
||||
subCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
||||
defer cancel()
|
||||
}
|
||||
|
||||
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
|
||||
logrus.Infof("Container %v failed to exit within %d seconds of signal %d - using the force", container.ID, seconds, stopSignal)
|
||||
// 3. If it doesn't, then send SIGKILL
|
||||
if err := daemon.Kill(container); err != nil {
|
||||
// Wait without a timeout, ignore result.
|
||||
<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning)
|
||||
logrus.Warn(err) // Don't return error because we only care that container is stopped, not what function stopped it
|
||||
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
|
||||
// container did exit, so ignore error and return
|
||||
return success()
|
||||
}
|
||||
logrus.WithError(err).WithField("container", container.ID).Error("Error killing the container")
|
||||
return err
|
||||
}
|
||||
|
||||
daemon.LogContainerEvent(container, "stop")
|
||||
return nil
|
||||
return success()
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue