moby/integration/container/restart_test.go
Sebastiaan van Stijn aeb8972281
daemon: daemon.containerRestart: don't cancel restart on context cancel
commit def549c8f6 passed through the context
to the daemon.ContainerStart function. As a result, restarting containers
is no longer an atomic operation, because a context cancellation could
interrupt the restart (between "stopping" and "(re)starting"), resulting
in the container being stopped, but not restarted.

Restarting a container, or, more precisely, making a successful request on
the `/containers/{id}/restart` endpoint, should be an atomic operation.

This patch uses a context.WithoutCancel for restart requests.

It's worth noting that daemon.containerStop already uses context.WithoutCancel,
so in that function we'll be wrapping the context twice. This should not cause
issues; it is merely redundant for this code path.
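
As a self-contained illustration of the mechanism (Go 1.21+; the function below
is a made-up stand-in, not the moby code), a context wrapped with
context.WithoutCancel is never cancelled through its parent, so work started
under it runs to completion even after the request context is cancelled:

    package main

    import (
        "context"
        "fmt"
    )

    // restart mimics the shape of the restart path: the stop+start sequence
    // must complete even if the incoming request context is cancelled.
    // Purely illustrative; this is not the moby implementation.
    func restart(reqCtx context.Context) {
        ctx := context.WithoutCancel(reqCtx) // detached: never cancelled via reqCtx
        fmt.Println("stopping container")
        if ctx.Err() == nil { // still nil, even though reqCtx is cancelled
            fmt.Println("starting container again")
        }
    }

    func main() {
        reqCtx, cancel := context.WithCancel(context.Background())
        cancel()        // the request context is already cancelled
        restart(reqCtx) // prints both "stopping" and "starting"
    }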

Before this patch, starting a container that bind-mounts the docker socket,
then having it restart itself from within the container, would cancel the
restart operation. The container would be stopped, but not started after that:

    docker run -dit --name myself -v /var/run/docker.sock:/var/run/docker.sock docker:cli sh
    docker exec myself sh -c 'docker restart myself'

    docker ps -a
    CONTAINER ID   IMAGE         COMMAND                  CREATED          STATUS                       PORTS     NAMES
    3a2a741c65ff   docker:cli    "docker-entrypoint.s…"   26 seconds ago   Exited (128) 7 seconds ago             myself

With this patch, the stop still cancels the exec, but does not cancel the
restart operation, and the container is started again:

    docker run -dit --name myself -v /var/run/docker.sock:/var/run/docker.sock docker:cli sh
    docker exec myself sh -c 'docker restart myself'
    docker ps
    CONTAINER ID   IMAGE        COMMAND                  CREATED              STATUS         PORTS     NAMES
    4393a01f7c75   docker:cli   "docker-entrypoint.s…"   About a minute ago   Up 4 seconds             myself

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2023-10-24 12:56:45 +02:00

package container // import "github.com/docker/docker/integration/container"

import (
	"context"
	"fmt"
	"runtime"
	"testing"
	"time"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/events"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/client"
	testContainer "github.com/docker/docker/integration/internal/container"
	"github.com/docker/docker/testutil"
	"github.com/docker/docker/testutil/daemon"
	"gotest.tools/v3/assert"
	is "gotest.tools/v3/assert/cmp"
	"gotest.tools/v3/poll"
	"gotest.tools/v3/skip"
)

func TestDaemonRestartKillContainers(t *testing.T) {
	skip.If(t, testEnv.IsRemoteDaemon, "cannot start daemon on remote test run")
	skip.If(t, testEnv.DaemonInfo.OSType == "windows")
	skip.If(t, testEnv.IsRootless, "rootless mode doesn't support live-restore")

	ctx := testutil.StartSpan(baseContext, t)

	type testCase struct {
		desc                string
		config              *container.Config
		hostConfig          *container.HostConfig
		xRunning            bool
		xRunningLiveRestore bool
		xStart              bool
		xHealthCheck        bool
	}

	for _, tc := range []testCase{
		{
			desc:                "container without restart policy",
			config:              &container.Config{Image: "busybox", Cmd: []string{"top"}},
			xRunningLiveRestore: true,
			xStart:              true,
		},
		{
			desc:                "container with restart=always",
			config:              &container.Config{Image: "busybox", Cmd: []string{"top"}},
			hostConfig:          &container.HostConfig{RestartPolicy: container.RestartPolicy{Name: "always"}},
			xRunning:            true,
			xRunningLiveRestore: true,
			xStart:              true,
		},
		{
			desc: "container with restart=always and with healthcheck",
			config: &container.Config{
				Image: "busybox", Cmd: []string{"top"},
				Healthcheck: &container.HealthConfig{
					Test:     []string{"CMD-SHELL", "sleep 1"},
					Interval: time.Second,
				},
			},
			hostConfig:          &container.HostConfig{RestartPolicy: container.RestartPolicy{Name: "always"}},
			xRunning:            true,
			xRunningLiveRestore: true,
			xStart:              true,
			xHealthCheck:        true,
		},
		{
			desc:       "container created should not be restarted",
			config:     &container.Config{Image: "busybox", Cmd: []string{"top"}},
			hostConfig: &container.HostConfig{RestartPolicy: container.RestartPolicy{Name: "always"}},
		},
	} {
		for _, liveRestoreEnabled := range []bool{false, true} {
			for fnName, stopDaemon := range map[string]func(*testing.T, *daemon.Daemon){
				"kill-daemon": func(t *testing.T, d *daemon.Daemon) {
					err := d.Kill()
					assert.NilError(t, err)
				},
				"stop-daemon": func(t *testing.T, d *daemon.Daemon) {
					d.Stop(t)
				},
			} {
				tc := tc
				liveRestoreEnabled := liveRestoreEnabled
				stopDaemon := stopDaemon
				t.Run(fmt.Sprintf("live-restore=%v/%s/%s", liveRestoreEnabled, tc.desc, fnName), func(t *testing.T) {
					t.Parallel()

					ctx := testutil.StartSpan(ctx, t)

					d := daemon.New(t)
					apiClient := d.NewClientT(t)

					args := []string{"--iptables=false"}
					if liveRestoreEnabled {
						args = append(args, "--live-restore")
					}

					d.StartWithBusybox(ctx, t, args...)
					defer d.Stop(t)

					resp, err := apiClient.ContainerCreate(ctx, tc.config, tc.hostConfig, nil, nil, "")
					assert.NilError(t, err)
					defer apiClient.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true})

					if tc.xStart {
						err = apiClient.ContainerStart(ctx, resp.ID, container.StartOptions{})
						assert.NilError(t, err)
					}

					stopDaemon(t, d)
					d.Start(t, args...)

					expected := tc.xRunning
					if liveRestoreEnabled {
						expected = tc.xRunningLiveRestore
					}

					var running bool
					for i := 0; i < 30; i++ {
						inspect, err := apiClient.ContainerInspect(ctx, resp.ID)
						assert.NilError(t, err)

						running = inspect.State.Running
						if running == expected {
							break
						}
						time.Sleep(2 * time.Second)
					}
					assert.Equal(t, expected, running, "got unexpected running state, expected %v, got: %v", expected, running)

					if tc.xHealthCheck {
						startTime := time.Now()
						ctxPoll, cancel := context.WithTimeout(ctx, 30*time.Second)
						defer cancel()
						poll.WaitOn(t, pollForNewHealthCheck(ctxPoll, apiClient, startTime, resp.ID), poll.WithDelay(100*time.Millisecond))
					}
					// TODO(cpuguy83): test pause states... this seems to be rather undefined currently
				})
			}
		}
	}
}

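// pollForNewHealthCheck returns a poll check that succeeds once the most
// recent entry in the container's healthcheck log started after startTime.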
func pollForNewHealthCheck(ctx context.Context, client *client.Client, startTime time.Time, containerID string) func(log poll.LogT) poll.Result {
	return func(log poll.LogT) poll.Result {
		inspect, err := client.ContainerInspect(ctx, containerID)
		if err != nil {
			return poll.Error(err)
		}
		healthChecksTotal := len(inspect.State.Health.Log)
		if healthChecksTotal > 0 {
			if inspect.State.Health.Log[healthChecksTotal-1].Start.After(startTime) {
				return poll.Success()
			}
		}
		return poll.Continue("waiting for a new container healthcheck")
	}
}

// Container started with --rm should be able to be restarted.
// It should be removed only if killed or stopped.
func TestContainerWithAutoRemoveCanBeRestarted(t *testing.T) {
	ctx := setupTest(t)
	apiClient := testEnv.APIClient()

	noWaitTimeout := 0

	for _, tc := range []struct {
		desc  string
		doSth func(ctx context.Context, containerID string) error
	}{
		{
			desc: "kill",
			doSth: func(ctx context.Context, containerID string) error {
				return apiClient.ContainerKill(ctx, containerID, "SIGKILL")
			},
		},
		{
			desc: "stop",
			doSth: func(ctx context.Context, containerID string) error {
				return apiClient.ContainerStop(ctx, containerID, container.StopOptions{Timeout: &noWaitTimeout})
			},
		},
	} {
		tc := tc
		t.Run(tc.desc, func(t *testing.T) {
			testutil.StartSpan(ctx, t)
			cID := testContainer.Run(ctx, t, apiClient,
				testContainer.WithName("autoremove-restart-and-"+tc.desc),
				testContainer.WithAutoRemove,
			)
			defer func() {
				err := apiClient.ContainerRemove(ctx, cID, container.RemoveOptions{Force: true})
				if t.Failed() && err != nil {
					t.Logf("Cleaning up test container failed with error: %v", err)
				}
			}()

			err := apiClient.ContainerRestart(ctx, cID, container.StopOptions{Timeout: &noWaitTimeout})
			assert.NilError(t, err)

			inspect, err := apiClient.ContainerInspect(ctx, cID)
			assert.NilError(t, err)
			assert.Assert(t, inspect.State.Status != "removing", "Container should not be removing yet")

			poll.WaitOn(t, testContainer.IsInState(ctx, apiClient, cID, "running"))

			err = tc.doSth(ctx, cID)
			assert.NilError(t, err)

			poll.WaitOn(t, testContainer.IsRemoved(ctx, apiClient, cID))
		})
	}
}

// TestContainerRestartWithCancelledRequest verifies that cancelling a restart
// request does not cancel the restart operation, and still starts the container
// after it was stopped.
//
// Regression test for https://github.com/moby/moby/discussions/46682
func TestContainerRestartWithCancelledRequest(t *testing.T) {
	ctx := setupTest(t)
	apiClient := testEnv.APIClient()

	testutil.StartSpan(ctx, t)

	// Create a container that ignores SIGTERM and doesn't stop immediately,
	// giving us time to cancel the request.
	//
	// Restarting a container is "stop" (and, if needed, "kill"), then "start"
	// the container. We're trying to create the scenario where the "stop" is
	// handled, but the request was cancelled and therefore the "start" not
	// taking place.
	cID := testContainer.Run(ctx, t, apiClient, testContainer.WithCmd("sh", "-c", "trap 'echo received TERM' TERM; while true; do usleep 10; done"))
	defer func() {
		err := apiClient.ContainerRemove(ctx, cID, container.RemoveOptions{Force: true})
		if t.Failed() && err != nil {
			t.Logf("Cleaning up test container failed with error: %v", err)
		}
	}()

	// Start listening for events.
	messages, errs := apiClient.Events(ctx, types.EventsOptions{
		Filters: filters.NewArgs(
			filters.Arg("container", cID),
			filters.Arg("event", string(events.ActionRestart)),
		),
	})

	// Make restart request, but cancel the request before the container
	// is (forcibly) killed.
	ctx2, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
	stopTimeout := 1
	err := apiClient.ContainerRestart(ctx2, cID, container.StopOptions{
		Timeout: &stopTimeout,
	})
	assert.Check(t, is.ErrorIs(err, context.DeadlineExceeded))
	cancel()

	// Validate that the restart event occurred, which is emitted
	// after the restart (stop (kill) start) finished.
	//
	// Note that we cannot use RestartCount for this, as that's only
	// used for restart-policies.
	restartTimeout := 2 * time.Second
	if runtime.GOOS == "windows" {
		// hcs can sometimes take a long time to stop container.
		restartTimeout = StopContainerWindowsPollTimeout
	}
	select {
	case m := <-messages:
		assert.Check(t, is.Equal(m.Actor.ID, cID))
		assert.Check(t, is.Equal(m.Action, events.ActionRestart))
	case err := <-errs:
		assert.NilError(t, err)
	case <-time.After(restartTimeout):
		t.Errorf("timeout waiting for restart event")
	}

	// Container should be restarted (running).
	inspect, err := apiClient.ContainerInspect(ctx, cID)
	assert.NilError(t, err)
	assert.Check(t, is.Equal(inspect.State.Status, "running"))
}