Просмотр исходного кода

docker kill: fix bug where failed kills didnt fallback to unix kill

1. fixes #41587
2. removes potential infinite Wait and goroutine leak at end of kill
function

fixes #41587

Signed-off-by: Cam <gh@sparr.email>
(cherry picked from commit e57a365ab1a1b5f30b9cc04e2365448868ae5b4f)
Signed-off-by: Brian Goff <cpuguy83@gmail.com>
Cam 4 лет назад
Родитель
Сommit
bed624fdc9
2 измененных файлов с 40 добавлено и 52 удалено
  1. 25 33
      daemon/container_operations_unix.go
  2. 15 19
      daemon/kill.go

+ 25 - 33
daemon/container_operations_unix.go

@@ -3,13 +3,11 @@
 package daemon // import "github.com/docker/docker/daemon"
 
 import (
-	"context"
 	"fmt"
 	"io/ioutil"
 	"os"
 	"path/filepath"
 	"strconv"
-	"time"
 
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/daemon/links"
@@ -336,38 +334,32 @@ func (daemon *Daemon) cleanupSecretDir(c *container.Container) {
 	}
 }
 
-func killProcessDirectly(cntr *container.Container) error {
-	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
-	defer cancel()
-
-	// Block until the container to stops or timeout.
-	status := <-cntr.Wait(ctx, container.WaitConditionNotRunning)
-	if status.Err() != nil {
-		// Ensure that we don't kill ourselves
-		if pid := cntr.GetPID(); pid != 0 {
-			logrus.Infof("Container %s failed to exit within 10 seconds of kill - trying direct SIGKILL", stringid.TruncateID(cntr.ID))
-			if err := unix.Kill(pid, 9); err != nil {
-				if err != unix.ESRCH {
-					return err
-				}
-				e := errNoSuchProcess{pid, 9}
-				logrus.Debug(e)
-				return e
-			}
+func killProcessDirectly(container *container.Container) error {
+	pid := container.GetPID()
+	// Ensure that we don't kill ourselves
+	if pid == 0 {
+		return nil
+	}
 
-			// In case there were some exceptions(e.g., state of zombie and D)
-			if system.IsProcessAlive(pid) {
-
-				// Since we can not kill a zombie pid, add zombie check here
-				isZombie, err := system.IsProcessZombie(pid)
-				if err != nil {
-					logrus.Warnf("Container %s state is invalid", stringid.TruncateID(cntr.ID))
-					return err
-				}
-				if isZombie {
-					return errdefs.System(errors.Errorf("container %s PID %d is zombie and can not be killed. Use the --init option when creating containers to run an init inside the container that forwards signals and reaps processes", stringid.TruncateID(cntr.ID), pid))
-				}
-			}
+	if err := unix.Kill(pid, 9); err != nil {
+		if err != unix.ESRCH {
+			return err
+		}
+		e := errNoSuchProcess{pid, 9}
+		logrus.WithError(e).WithField("container", container.ID).Debug("no such process")
+		return e
+	}
+
+	// In case there were some exceptions(e.g., state of zombie and D)
+	if system.IsProcessAlive(pid) {
+		// Since we can not kill a zombie pid, add zombie check here
+		isZombie, err := system.IsProcessZombie(pid)
+		if err != nil {
+			logrus.WithError(err).WithField("container", container.ID).Warn("Container state is invalid")
+			return err
+		}
+		if isZombie {
+			return errdefs.System(errors.Errorf("container %s PID %d is zombie and can not be killed. Use the --init option when creating containers to run an init inside the container that forwards signals and reaps processes", stringid.TruncateID(container.ID), pid))
 		}
 	}
 	return nil

+ 15 - 19
daemon/kill.go

@@ -139,29 +139,22 @@ func (daemon *Daemon) Kill(container *containerpkg.Container) error {
 
 	// 1. Send SIGKILL
 	if err := daemon.killPossiblyDeadProcess(container, int(syscall.SIGKILL)); err != nil {
-		// While normally we might "return err" here we're not going to
-		// because if we can't stop the container by this point then
-		// it's probably because it's already stopped. Meaning, between
-		// the time of the IsRunning() call above and now it stopped.
-		// Also, since the err return will be environment specific we can't
-		// look for any particular (common) error that would indicate
-		// that the process is already dead vs something else going wrong.
-		// So, instead we'll give it up to 2 more seconds to complete and if
-		// by that time the container is still running, then the error
-		// we got is probably valid and so we return it to the caller.
+		// kill failed, check if process is no longer running.
 		if isErrNoSuchProcess(err) {
 			return nil
 		}
+	}
 
-		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
-		defer cancel()
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
 
-		if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
-			return err
-		}
+	status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning)
+	if status.Err() == nil {
+		return nil
 	}
 
-	// 2. Wait for the process to die, in last resort, try to kill the process directly
+	logrus.WithError(status.Err()).WithField("container", container.ID).Error("Container failed to exit within 10 seconds of kill - trying direct SIGKILL")
+
 	if err := killProcessDirectly(container); err != nil {
 		if isErrNoSuchProcess(err) {
 			return nil
@@ -169,10 +162,13 @@ func (daemon *Daemon) Kill(container *containerpkg.Container) error {
 		return err
 	}
 
-	// Wait for exit with no timeout.
-	// Ignore returned status.
-	<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning)
+	// wait for container to exit one last time, if it doesn't then kill didnt work, so return error
+	ctx2, cancel2 := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel2()
 
+	if status := <-container.Wait(ctx2, containerpkg.WaitConditionNotRunning); status.Err() != nil {
+		return errors.New("tried to kill container, but did not receive an exit event")
+	}
 	return nil
 }