Merge pull request #41586 from sparrc/stop-refactor
Fix hung docker stop if stop signal and daemon.Kill both fail
This commit is contained in:
commit
6110ba3d7c
1 changed files with 52 additions and 40 deletions
|
@ -38,52 +38,64 @@ func (daemon *Daemon) ContainerStop(name string, timeout *int) error {
|
|||
|
||||
// containerStop sends a stop signal, waits, sends a kill signal.
|
||||
func (daemon *Daemon) containerStop(container *containerpkg.Container, seconds int) error {
|
||||
// TODO propagate a context down to this function
|
||||
ctx := context.TODO()
|
||||
if !container.IsRunning() {
|
||||
return nil
|
||||
}
|
||||
|
||||
stopSignal := container.StopSignal()
|
||||
// 1. Send a stop signal
|
||||
if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
|
||||
// While normally we might "return err" here we're not going to
|
||||
// because if we can't stop the container by this point then
|
||||
// it's probably because it's already stopped. Meaning, between
|
||||
// the time of the IsRunning() call above and now it stopped.
|
||||
// Also, since the err return will be environment specific we can't
|
||||
// look for any particular (common) error that would indicate
|
||||
// that the process is already dead vs something else going wrong.
|
||||
// So, instead we'll give it up to 2 more seconds to complete and if
|
||||
// by that time the container is still running, then the error
|
||||
// we got is probably valid and so we force kill it.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
|
||||
logrus.Infof("Container failed to stop after sending signal %d to the process, force killing", stopSignal)
|
||||
if err := daemon.killPossiblyDeadProcess(container, 9); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Wait for the process to exit on its own
|
||||
ctx := context.Background()
|
||||
var wait time.Duration
|
||||
if seconds >= 0 {
|
||||
var cancel context.CancelFunc
|
||||
ctx, cancel = context.WithTimeout(ctx, time.Duration(seconds)*time.Second)
|
||||
wait = time.Duration(seconds) * time.Second
|
||||
}
|
||||
success := func() error {
|
||||
daemon.LogContainerEvent(container, "stop")
|
||||
return nil
|
||||
}
|
||||
stopSignal := container.StopSignal()
|
||||
|
||||
// 1. Send a stop signal
|
||||
err := daemon.killPossiblyDeadProcess(container, stopSignal)
|
||||
if err != nil {
|
||||
wait = 2 * time.Second
|
||||
}
|
||||
|
||||
var subCtx context.Context
|
||||
var cancel context.CancelFunc
|
||||
if seconds >= 0 {
|
||||
subCtx, cancel = context.WithTimeout(ctx, wait)
|
||||
} else {
|
||||
subCtx, cancel = context.WithCancel(ctx)
|
||||
}
|
||||
defer cancel()
|
||||
|
||||
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
|
||||
// container did exit, so ignore any previous errors and return
|
||||
return success()
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
// the container has still not exited, and the kill function errored, so log the error here:
|
||||
logrus.WithError(err).WithField("container", container.ID).Errorf("Error sending stop (signal %d) to container", stopSignal)
|
||||
}
|
||||
if seconds < 0 {
|
||||
// if the client requested that we never kill / wait forever, but container.Wait was still
|
||||
// interrupted (parent context cancelled, for example), we should propagate the signal failure
|
||||
return err
|
||||
}
|
||||
|
||||
logrus.WithField("container", container.ID).Infof("Container failed to exit within %d seconds of signal %d - using the force", seconds, stopSignal)
|
||||
// Stop either failed or container didnt exit, so fallback to kill.
|
||||
if err := daemon.Kill(container); err != nil {
|
||||
// got a kill error, but give container 2 more seconds to exit just in case
|
||||
subCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
||||
defer cancel()
|
||||
}
|
||||
|
||||
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
|
||||
logrus.Infof("Container %v failed to exit within %d seconds of signal %d - using the force", container.ID, seconds, stopSignal)
|
||||
// 3. If it doesn't, then send SIGKILL
|
||||
if err := daemon.Kill(container); err != nil {
|
||||
// Wait without a timeout, ignore result.
|
||||
<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning)
|
||||
logrus.Warn(err) // Don't return error because we only care that container is stopped, not what function stopped it
|
||||
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
|
||||
// container did exit, so ignore error and return
|
||||
return success()
|
||||
}
|
||||
logrus.WithError(err).WithField("container", container.ID).Error("Error killing the container")
|
||||
return err
|
||||
}
|
||||
|
||||
daemon.LogContainerEvent(container, "stop")
|
||||
return nil
|
||||
return success()
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue