moby/integration/container/restart_test.go
Sebastiaan van Stijn aeb8972281
daemon: daemon.containerRestart: don't cancel restart on context cancel
commit def549c8f6 passed through the context
to the daemon.ContainerStart function. As a result, restarting containers
is no longer an atomic operation, because a context cancellation could
interrupt the restart (between "stopping" and "(re)starting"), resulting
in the container being stopped, but not restarted.

Restarting a container, or, more precisely, making a successful request on
the `/containers/{id}/restart` endpoint, should be an atomic operation.

This patch uses a context.WithoutCancel for restart requests.

It's worth noting that daemon.containerStop already uses context.WithoutCancel,
so in that function we'll be wrapping the context twice. This should not cause
issues; it is merely redundant for this code path.
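
As a self-contained illustration of the mechanism (Go 1.21+; the function below
is a made-up stand-in, not the moby code), a context wrapped with
context.WithoutCancel is never cancelled through its parent, so work started
under it runs to completion even after the request context is cancelled:

    package main

    import (
        "context"
        "fmt"
    )

    // restart mimics the shape of the restart path: the stop+start sequence
    // must complete even if the incoming request context is cancelled.
    // Purely illustrative; this is not the moby implementation.
    func restart(reqCtx context.Context) {
        ctx := context.WithoutCancel(reqCtx) // detached: never cancelled via reqCtx
        fmt.Println("stopping container")
        if ctx.Err() == nil { // still nil, even though reqCtx is cancelled
            fmt.Println("starting container again")
        }
    }

    func main() {
        reqCtx, cancel := context.WithCancel(context.Background())
        cancel()        // the request context is already cancelled
        restart(reqCtx) // prints both "stopping" and "starting"
    }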

Before this patch, starting a container that bind-mounts the docker socket,
then having it restart itself from within the container, would cancel the
restart operation. The container would be stopped, but not started after that:

    docker run -dit --name myself -v /var/run/docker.sock:/var/run/docker.sock docker:cli sh
    docker exec myself sh -c 'docker restart myself'

    docker ps -a
    CONTAINER ID   IMAGE         COMMAND                  CREATED          STATUS                       PORTS     NAMES
    3a2a741c65ff   docker:cli    "docker-entrypoint.s…"   26 seconds ago   Exited (128) 7 seconds ago             myself

With this patch, the stop still cancels the exec, but does not cancel the
restart operation, and the container is started again:

    docker run -dit --name myself -v /var/run/docker.sock:/var/run/docker.sock docker:cli sh
    docker exec myself sh -c 'docker restart myself'
    docker ps
    CONTAINER ID   IMAGE        COMMAND                  CREATED              STATUS         PORTS     NAMES
    4393a01f7c75   docker:cli   "docker-entrypoint.s…"   About a minute ago   Up 4 seconds             myself

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2023-10-24 12:56:45 +02:00

package container // import "github.com/docker/docker/integration/container"

import (
	"context"
	"fmt"
	"runtime"
	"testing"
	"time"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/events"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/client"
	testContainer "github.com/docker/docker/integration/internal/container"
	"github.com/docker/docker/testutil"
	"github.com/docker/docker/testutil/daemon"
	"gotest.tools/v3/assert"
	is "gotest.tools/v3/assert/cmp"
	"gotest.tools/v3/poll"
	"gotest.tools/v3/skip"
)

func TestDaemonRestartKillContainers(t *testing.T) {
	skip.If(t, testEnv.IsRemoteDaemon, "cannot start daemon on remote test run")
	skip.If(t, testEnv.DaemonInfo.OSType == "windows")
	skip.If(t, testEnv.IsRootless, "rootless mode doesn't support live-restore")

	ctx := testutil.StartSpan(baseContext, t)

	type testCase struct {
		desc                string
		config              *container.Config
		hostConfig          *container.HostConfig
		xRunning            bool
		xRunningLiveRestore bool
		xStart              bool
		xHealthCheck        bool
	}

	for _, tc := range []testCase{
		{
			desc:                "container without restart policy",
			config:              &container.Config{Image: "busybox", Cmd: []string{"top"}},
			xRunningLiveRestore: true,
			xStart:              true,
		},
		{
			desc:                "container with restart=always",
			config:              &container.Config{Image: "busybox", Cmd: []string{"top"}},
			hostConfig:          &container.HostConfig{RestartPolicy: container.RestartPolicy{Name: "always"}},
			xRunning:            true,
			xRunningLiveRestore: true,
			xStart:              true,
		},
		{
			desc: "container with restart=always and with healthcheck",
			config: &container.Config{
				Image: "busybox", Cmd: []string{"top"},
				Healthcheck: &container.HealthConfig{
					Test:     []string{"CMD-SHELL", "sleep 1"},
					Interval: time.Second,
				},
			},
			hostConfig:          &container.HostConfig{RestartPolicy: container.RestartPolicy{Name: "always"}},
			xRunning:            true,
			xRunningLiveRestore: true,
			xStart:              true,
			xHealthCheck:        true,
		},
		{
			desc:       "container created should not be restarted",
			config:     &container.Config{Image: "busybox", Cmd: []string{"top"}},
			hostConfig: &container.HostConfig{RestartPolicy: container.RestartPolicy{Name: "always"}},
		},
	} {
		for _, liveRestoreEnabled := range []bool{false, true} {
			for fnName, stopDaemon := range map[string]func(*testing.T, *daemon.Daemon){
				"kill-daemon": func(t *testing.T, d *daemon.Daemon) {
					err := d.Kill()
					assert.NilError(t, err)
				},
				"stop-daemon": func(t *testing.T, d *daemon.Daemon) {
					d.Stop(t)
				},
			} {
				tc := tc
				liveRestoreEnabled := liveRestoreEnabled
				stopDaemon := stopDaemon
				t.Run(fmt.Sprintf("live-restore=%v/%s/%s", liveRestoreEnabled, tc.desc, fnName), func(t *testing.T) {
					t.Parallel()

					ctx := testutil.StartSpan(ctx, t)

					d := daemon.New(t)
					apiClient := d.NewClientT(t)

					args := []string{"--iptables=false"}
					if liveRestoreEnabled {
						args = append(args, "--live-restore")
					}

					d.StartWithBusybox(ctx, t, args...)
					defer d.Stop(t)

					resp, err := apiClient.ContainerCreate(ctx, tc.config, tc.hostConfig, nil, nil, "")
					assert.NilError(t, err)
					defer apiClient.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true})

					if tc.xStart {
						err = apiClient.ContainerStart(ctx, resp.ID, container.StartOptions{})
						assert.NilError(t, err)
					}

					stopDaemon(t, d)
					d.Start(t, args...)

					expected := tc.xRunning
					if liveRestoreEnabled {
						expected = tc.xRunningLiveRestore
					}

					var running bool
					for i := 0; i < 30; i++ {
						inspect, err := apiClient.ContainerInspect(ctx, resp.ID)
						assert.NilError(t, err)

						running = inspect.State.Running
						if running == expected {
							break
						}
						time.Sleep(2 * time.Second)
					}
					assert.Equal(t, expected, running, "got unexpected running state, expected %v, got: %v", expected, running)

					if tc.xHealthCheck {
						startTime := time.Now()
						ctxPoll, cancel := context.WithTimeout(ctx, 30*time.Second)
						defer cancel()
						poll.WaitOn(t, pollForNewHealthCheck(ctxPoll, apiClient, startTime, resp.ID), poll.WithDelay(100*time.Millisecond))
					}
					// TODO(cpuguy83): test pause states... this seems to be rather undefined currently
				})
			}
		}
	}
}

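// pollForNewHealthCheck returns a poll check that succeeds once the most
// recent entry in the container's healthcheck log started after startTime.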
func pollForNewHealthCheck(ctx context.Context, client *client.Client, startTime time.Time, containerID string) func(log poll.LogT) poll.Result {
	return func(log poll.LogT) poll.Result {
		inspect, err := client.ContainerInspect(ctx, containerID)
		if err != nil {
			return poll.Error(err)
		}
		healthChecksTotal := len(inspect.State.Health.Log)
		if healthChecksTotal > 0 {
			if inspect.State.Health.Log[healthChecksTotal-1].Start.After(startTime) {
				return poll.Success()
			}
		}
		return poll.Continue("waiting for a new container healthcheck")
	}
}

// Container started with --rm should be able to be restarted.
// It should be removed only if killed or stopped.
func TestContainerWithAutoRemoveCanBeRestarted(t *testing.T) {
	ctx := setupTest(t)
	apiClient := testEnv.APIClient()

	noWaitTimeout := 0

	for _, tc := range []struct {
		desc  string
		doSth func(ctx context.Context, containerID string) error
	}{
		{
			desc: "kill",
			doSth: func(ctx context.Context, containerID string) error {
				return apiClient.ContainerKill(ctx, containerID, "SIGKILL")
			},
		},
		{
			desc: "stop",
			doSth: func(ctx context.Context, containerID string) error {
				return apiClient.ContainerStop(ctx, containerID, container.StopOptions{Timeout: &noWaitTimeout})
			},
		},
	} {
		tc := tc
		t.Run(tc.desc, func(t *testing.T) {
			testutil.StartSpan(ctx, t)
			cID := testContainer.Run(ctx, t, apiClient,
				testContainer.WithName("autoremove-restart-and-"+tc.desc),
				testContainer.WithAutoRemove,
			)
			defer func() {
				err := apiClient.ContainerRemove(ctx, cID, container.RemoveOptions{Force: true})
				if t.Failed() && err != nil {
					t.Logf("Cleaning up test container failed with error: %v", err)
				}
			}()

			err := apiClient.ContainerRestart(ctx, cID, container.StopOptions{Timeout: &noWaitTimeout})
			assert.NilError(t, err)

			inspect, err := apiClient.ContainerInspect(ctx, cID)
			assert.NilError(t, err)
			assert.Assert(t, inspect.State.Status != "removing", "Container should not be removing yet")

			poll.WaitOn(t, testContainer.IsInState(ctx, apiClient, cID, "running"))

			err = tc.doSth(ctx, cID)
			assert.NilError(t, err)

			poll.WaitOn(t, testContainer.IsRemoved(ctx, apiClient, cID))
		})
	}
}

// TestContainerRestartWithCancelledRequest verifies that cancelling a restart
// request does not cancel the restart operation, and still starts the container
// after it was stopped.
//
// Regression test for https://github.com/moby/moby/discussions/46682
func TestContainerRestartWithCancelledRequest(t *testing.T) {
	ctx := setupTest(t)
	apiClient := testEnv.APIClient()

	testutil.StartSpan(ctx, t)

	// Create a container that ignores SIGTERM and doesn't stop immediately,
	// giving us time to cancel the request.
	//
	// Restarting a container is "stop" (and, if needed, "kill"), then "start"
	// the container. We're trying to create the scenario where the "stop" is
	// handled, but the request was cancelled and therefore the "start" not
	// taking place.
	cID := testContainer.Run(ctx, t, apiClient, testContainer.WithCmd("sh", "-c", "trap 'echo received TERM' TERM; while true; do usleep 10; done"))
	defer func() {
		err := apiClient.ContainerRemove(ctx, cID, container.RemoveOptions{Force: true})
		if t.Failed() && err != nil {
			t.Logf("Cleaning up test container failed with error: %v", err)
		}
	}()

	// Start listening for events.
	messages, errs := apiClient.Events(ctx, types.EventsOptions{
		Filters: filters.NewArgs(
			filters.Arg("container", cID),
			filters.Arg("event", string(events.ActionRestart)),
		),
	})

	// Make restart request, but cancel the request before the container
	// is (forcibly) killed.
	ctx2, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
	stopTimeout := 1
	err := apiClient.ContainerRestart(ctx2, cID, container.StopOptions{
		Timeout: &stopTimeout,
	})
	assert.Check(t, is.ErrorIs(err, context.DeadlineExceeded))
	cancel()

	// Validate that the restart event occurred, which is emitted
	// after the restart (stop (kill) start) finished.
	//
	// Note that we cannot use RestartCount for this, as that's only
	// used for restart-policies.
	restartTimeout := 2 * time.Second
	if runtime.GOOS == "windows" {
		// hcs can sometimes take a long time to stop container.
		restartTimeout = StopContainerWindowsPollTimeout
	}
	select {
	case m := <-messages:
		assert.Check(t, is.Equal(m.Actor.ID, cID))
		assert.Check(t, is.Equal(m.Action, events.ActionRestart))
	case err := <-errs:
		assert.NilError(t, err)
	case <-time.After(restartTimeout):
		t.Errorf("timeout waiting for restart event")
	}

	// Container should be restarted (running).
	inspect, err := apiClient.ContainerInspect(ctx, cID)
	assert.NilError(t, err)
	assert.Check(t, is.Equal(inspect.State.Status, "running"))
}