Forráskód Böngészése

Merge pull request #43739 from ndeloof/healthcheck_timeout

don't use canceled context to send KILL signal to healthcheck process
Sebastiaan van Stijn 2 éve
szülő
commit
6b7974cf16
3 módosított fájl, 52 hozzáadás és 18 törlés
  1. 13 16
      daemon/exec.go
  2. 2 2
      daemon/health.go
  3. 37 0
      integration/container/health_test.go

+ 13 - 16
daemon/exec.go

@@ -23,9 +23,6 @@ import (
 	"github.com/sirupsen/logrus"
 )
 
-// Seconds to wait after sending TERM before trying KILL
-const termProcessTimeout = 10 * time.Second
-
 func (daemon *Daemon) registerExecCommand(container *container.Container, config *exec.Config) {
 	// Storing execs in container in order to kill them gracefully whenever the container is stopped or removed.
 	container.ExecCommands.Add(config.ID, config)
@@ -272,7 +269,10 @@ func (daemon *Daemon) ContainerExecStart(ctx context.Context, name string, optio
 		CloseStdin: true,
 	}
 	ec.StreamConfig.AttachStreams(&attachConfig)
-	attachErr := ec.StreamConfig.CopyStreams(ctx, &attachConfig)
+	// using context.Background() so that attachErr does not race ctx.Done().
+	copyCtx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	attachErr := ec.StreamConfig.CopyStreams(copyCtx, &attachConfig)
 
 	// Synchronize with libcontainerd event loop
 	ec.Lock()
@@ -292,18 +292,15 @@ func (daemon *Daemon) ContainerExecStart(ctx context.Context, name string, optio
 
 	select {
 	case <-ctx.Done():
-		logrus.Debugf("Sending TERM signal to process %v in container %v", name, c.ID)
-		daemon.containerd.SignalProcess(ctx, c.ID, name, signal.SignalMap["TERM"])
-
-		timeout := time.NewTimer(termProcessTimeout)
-		defer timeout.Stop()
-
-		select {
-		case <-timeout.C:
-			logrus.Infof("Container %v, process %v failed to exit within %v of signal TERM - using the force", c.ID, name, termProcessTimeout)
-			daemon.containerd.SignalProcess(ctx, c.ID, name, signal.SignalMap["KILL"])
-		case <-attachErr:
-			// TERM signal worked
+		log := logrus.
+			WithField("container", c.ID).
+			WithField("exec", name)
+		log.Debug("Sending KILL signal to container process")
+		sigCtx, cancelFunc := context.WithTimeout(context.Background(), 30*time.Second)
+		defer cancelFunc()
+		err := daemon.containerd.SignalProcess(sigCtx, c.ID, name, signal.SignalMap["KILL"])
+		if err != nil {
+			log.WithError(err).Error("Could not send KILL signal to container process")
 		}
 		return ctx.Err()
 	case err := <-attachErr:

+ 2 - 2
daemon/health.go

@@ -133,8 +133,8 @@ func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container
 	case <-tm.C:
 		cancelProbe()
 		logrus.WithContext(ctx).Debugf("Health check for container %s taking too long", cntr.ID)
-		// Wait for probe to exit (it might take a while to respond to the TERM
-		// signal and we don't want dying probes to pile up).
+		// Wait for probe to exit (it might take some time to call containerd to kill
+		// the process and we don't want dying probes to pile up).
 		<-execErr
 		return &types.HealthcheckResult{
 			ExitCode: -1,

+ 37 - 0
integration/container/health_test.go

@@ -2,6 +2,7 @@ package container // import "github.com/docker/docker/integration/container"
 
 import (
 	"context"
+	"fmt"
 	"testing"
 	"time"
 
@@ -93,6 +94,42 @@ while true; do sleep 1; done
 	poll.WaitOn(t, pollForHealthStatus(ctxPoll, client, id, "healthy"), poll.WithDelay(100*time.Millisecond))
 }
 
+// TestHealthCheckProcessKilled verifies that health-checks exec get killed on time-out.
+func TestHealthCheckProcessKilled(t *testing.T) {
+	skip.If(t, testEnv.RuntimeIsWindowsContainerd(), "FIXME: Broken on Windows + containerd combination")
+	defer setupTest(t)()
+	ctx := context.Background()
+	apiClient := testEnv.APIClient()
+
+	cID := container.Run(ctx, t, apiClient, func(c *container.TestContainerConfig) {
+		c.Config.Healthcheck = &containertypes.HealthConfig{
+			Test:     []string{"CMD", "sh", "-c", "sleep 60"},
+			Interval: 100 * time.Millisecond,
+			Timeout:  50 * time.Millisecond,
+			Retries:  1,
+		}
+	})
+	poll.WaitOn(t, pollForHealthCheckLog(ctx, apiClient, cID, "Health check exceeded timeout (50ms)"))
+}
+
+func pollForHealthCheckLog(ctx context.Context, client client.APIClient, containerID string, expected string) func(log poll.LogT) poll.Result {
+	return func(log poll.LogT) poll.Result {
+		inspect, err := client.ContainerInspect(ctx, containerID)
+		if err != nil {
+			return poll.Error(err)
+		}
+		healthChecksTotal := len(inspect.State.Health.Log)
+		if healthChecksTotal > 0 {
+			output := inspect.State.Health.Log[healthChecksTotal-1].Output
+			if output == expected {
+				return poll.Success()
+			}
+			return poll.Error(fmt.Errorf("expected %q, got %q", expected, output))
+		}
+		return poll.Continue("waiting for container healthcheck logs")
+	}
+}
+
 func pollForHealthStatus(ctx context.Context, client client.APIClient, containerID string, healthStatus string) func(log poll.LogT) poll.Result {
 	return func(log poll.LogT) poll.Result {
 		inspect, err := client.ContainerInspect(ctx, containerID)