Commit def549c8f6 passed the context through to the daemon.ContainerStart
function. As a result, restarting containers is no longer an atomic
operation, because a context cancellation could interrupt the restart
(between "stopping" and "(re)starting"), resulting in the container being
stopped, but not restarted.

Restarting a container, or, more precisely, making a successful request on
the `/containers/{id}/restart` endpoint, should be an atomic operation.

This patch uses context.WithoutCancel for restart requests. It's worth
noting that daemon.containerStop already uses context.WithoutCancel, so in
that function we'll be wrapping the context twice, but this should likely
not cause issues (just redundant for this code path).
Before this patch, starting a container that bind-mounts the docker socket
and then restarting that container from within itself would cancel the
restart operation. The container would be stopped, but not started after
that:
docker run -dit --name myself -v /var/run/docker.sock:/var/run/docker.sock docker:cli sh
docker exec myself sh -c 'docker restart myself'
docker ps -a
CONTAINER ID   IMAGE        COMMAND                  CREATED          STATUS                       PORTS     NAMES
3a2a741c65ff   docker:cli   "docker-entrypoint.s…"   26 seconds ago   Exited (128) 7 seconds ago             myself
With this patch, the stop still cancels the exec, but does not cancel the
restart operation, and the container is started again:
docker run -dit --name myself -v /var/run/docker.sock:/var/run/docker.sock docker:cli sh
docker exec myself sh -c 'docker restart myself'
docker ps
CONTAINER ID   IMAGE        COMMAND                  CREATED              STATUS         PORTS     NAMES
4393a01f7c75   docker:cli   "docker-entrypoint.s…"   About a minute ago   Up 4 seconds             myself
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>

package container // import "github.com/docker/docker/integration/container"

import (
	"context"
	"fmt"
	"runtime"
	"testing"
	"time"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/events"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/client"
	testContainer "github.com/docker/docker/integration/internal/container"
	"github.com/docker/docker/testutil"
	"github.com/docker/docker/testutil/daemon"
	"gotest.tools/v3/assert"
	is "gotest.tools/v3/assert/cmp"
	"gotest.tools/v3/poll"
	"gotest.tools/v3/skip"
)

func TestDaemonRestartKillContainers(t *testing.T) {
	skip.If(t, testEnv.IsRemoteDaemon, "cannot start daemon on remote test run")
	skip.If(t, testEnv.DaemonInfo.OSType == "windows")
	skip.If(t, testEnv.IsRootless, "rootless mode doesn't support live-restore")

	ctx := testutil.StartSpan(baseContext, t)

	type testCase struct {
		desc       string
		config     *container.Config
		hostConfig *container.HostConfig

		xRunning            bool
		xRunningLiveRestore bool
		xStart              bool
		xHealthCheck        bool
	}

	for _, tc := range []testCase{
		{
			desc:                "container without restart policy",
			config:              &container.Config{Image: "busybox", Cmd: []string{"top"}},
			xRunningLiveRestore: true,
			xStart:              true,
		},
		{
			desc:                "container with restart=always",
			config:              &container.Config{Image: "busybox", Cmd: []string{"top"}},
			hostConfig:          &container.HostConfig{RestartPolicy: container.RestartPolicy{Name: "always"}},
			xRunning:            true,
			xRunningLiveRestore: true,
			xStart:              true,
		},
		{
			desc: "container with restart=always and with healthcheck",
			config: &container.Config{
				Image: "busybox", Cmd: []string{"top"},
				Healthcheck: &container.HealthConfig{
					Test:     []string{"CMD-SHELL", "sleep 1"},
					Interval: time.Second,
				},
			},
			hostConfig:          &container.HostConfig{RestartPolicy: container.RestartPolicy{Name: "always"}},
			xRunning:            true,
			xRunningLiveRestore: true,
			xStart:              true,
			xHealthCheck:        true,
		},
		{
			desc:       "container created should not be restarted",
			config:     &container.Config{Image: "busybox", Cmd: []string{"top"}},
			hostConfig: &container.HostConfig{RestartPolicy: container.RestartPolicy{Name: "always"}},
		},
	} {
		for _, liveRestoreEnabled := range []bool{false, true} {
			for fnName, stopDaemon := range map[string]func(*testing.T, *daemon.Daemon){
				"kill-daemon": func(t *testing.T, d *daemon.Daemon) {
					err := d.Kill()
					assert.NilError(t, err)
				},
				"stop-daemon": func(t *testing.T, d *daemon.Daemon) {
					d.Stop(t)
				},
			} {
				tc := tc
				liveRestoreEnabled := liveRestoreEnabled
				stopDaemon := stopDaemon
				t.Run(fmt.Sprintf("live-restore=%v/%s/%s", liveRestoreEnabled, tc.desc, fnName), func(t *testing.T) {
					t.Parallel()

					ctx := testutil.StartSpan(ctx, t)

					d := daemon.New(t)
					apiClient := d.NewClientT(t)

					args := []string{"--iptables=false"}
					if liveRestoreEnabled {
						args = append(args, "--live-restore")
					}

					d.StartWithBusybox(ctx, t, args...)
					defer d.Stop(t)

					resp, err := apiClient.ContainerCreate(ctx, tc.config, tc.hostConfig, nil, nil, "")
					assert.NilError(t, err)
					defer apiClient.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true})

					if tc.xStart {
						err = apiClient.ContainerStart(ctx, resp.ID, container.StartOptions{})
						assert.NilError(t, err)
					}

					stopDaemon(t, d)
					d.Start(t, args...)

					expected := tc.xRunning
					if liveRestoreEnabled {
						expected = tc.xRunningLiveRestore
					}

					var running bool
					for i := 0; i < 30; i++ {
						inspect, err := apiClient.ContainerInspect(ctx, resp.ID)
						assert.NilError(t, err)

						running = inspect.State.Running
						if running == expected {
							break
						}
						time.Sleep(2 * time.Second)
					}
					assert.Equal(t, expected, running, "got unexpected running state, expected %v, got: %v", expected, running)

					if tc.xHealthCheck {
						startTime := time.Now()
						ctxPoll, cancel := context.WithTimeout(ctx, 30*time.Second)
						defer cancel()
						poll.WaitOn(t, pollForNewHealthCheck(ctxPoll, apiClient, startTime, resp.ID), poll.WithDelay(100*time.Millisecond))
					}
					// TODO(cpuguy83): test pause states... this seems to be rather undefined currently
				})
			}
		}
	}
}

func pollForNewHealthCheck(ctx context.Context, client *client.Client, startTime time.Time, containerID string) func(log poll.LogT) poll.Result {
	return func(log poll.LogT) poll.Result {
		inspect, err := client.ContainerInspect(ctx, containerID)
		if err != nil {
			return poll.Error(err)
		}
		healthChecksTotal := len(inspect.State.Health.Log)
		if healthChecksTotal > 0 {
			if inspect.State.Health.Log[healthChecksTotal-1].Start.After(startTime) {
				return poll.Success()
			}
		}
		return poll.Continue("waiting for a new container healthcheck")
	}
}

// Container started with --rm should be able to be restarted.
// It should be removed only if killed or stopped
func TestContainerWithAutoRemoveCanBeRestarted(t *testing.T) {
	ctx := setupTest(t)
	apiClient := testEnv.APIClient()

	noWaitTimeout := 0

	for _, tc := range []struct {
		desc  string
		doSth func(ctx context.Context, containerID string) error
	}{
		{
			desc: "kill",
			doSth: func(ctx context.Context, containerID string) error {
				return apiClient.ContainerKill(ctx, containerID, "SIGKILL")
			},
		},
		{
			desc: "stop",
			doSth: func(ctx context.Context, containerID string) error {
				return apiClient.ContainerStop(ctx, containerID, container.StopOptions{Timeout: &noWaitTimeout})
			},
		},
	} {
		tc := tc
		t.Run(tc.desc, func(t *testing.T) {
			testutil.StartSpan(ctx, t)
			cID := testContainer.Run(ctx, t, apiClient,
				testContainer.WithName("autoremove-restart-and-"+tc.desc),
				testContainer.WithAutoRemove,
			)
			defer func() {
				err := apiClient.ContainerRemove(ctx, cID, container.RemoveOptions{Force: true})
				if t.Failed() && err != nil {
					t.Logf("Cleaning up test container failed with error: %v", err)
				}
			}()

			err := apiClient.ContainerRestart(ctx, cID, container.StopOptions{Timeout: &noWaitTimeout})
			assert.NilError(t, err)

			inspect, err := apiClient.ContainerInspect(ctx, cID)
			assert.NilError(t, err)
			assert.Assert(t, inspect.State.Status != "removing", "Container should not be removing yet")

			poll.WaitOn(t, testContainer.IsInState(ctx, apiClient, cID, "running"))

			err = tc.doSth(ctx, cID)
			assert.NilError(t, err)

			poll.WaitOn(t, testContainer.IsRemoved(ctx, apiClient, cID))
		})
	}
}

// TestContainerRestartWithCancelledRequest verifies that cancelling a restart
// request does not cancel the restart operation, and still starts the container
// after it was stopped.
//
// Regression test for https://github.com/moby/moby/discussions/46682
func TestContainerRestartWithCancelledRequest(t *testing.T) {
	ctx := setupTest(t)
	apiClient := testEnv.APIClient()

	testutil.StartSpan(ctx, t)

	// Create a container that ignores SIGTERM and doesn't stop immediately,
	// giving us time to cancel the request.
	//
	// Restarting a container is "stop" (and, if needed, "kill"), then "start"
	// the container. We're trying to create the scenario where the "stop" is
	// handled, but the request was cancelled and therefore the "start" not
	// taking place.
	cID := testContainer.Run(ctx, t, apiClient, testContainer.WithCmd("sh", "-c", "trap 'echo received TERM' TERM; while true; do usleep 10; done"))
	defer func() {
		err := apiClient.ContainerRemove(ctx, cID, container.RemoveOptions{Force: true})
		if t.Failed() && err != nil {
			t.Logf("Cleaning up test container failed with error: %v", err)
		}
	}()

	// Start listening for events.
	messages, errs := apiClient.Events(ctx, types.EventsOptions{
		Filters: filters.NewArgs(
			filters.Arg("container", cID),
			filters.Arg("event", string(events.ActionRestart)),
		),
	})

	// Make restart request, but cancel the request before the container
	// is (forcibly) killed.
	ctx2, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
	stopTimeout := 1
	err := apiClient.ContainerRestart(ctx2, cID, container.StopOptions{
		Timeout: &stopTimeout,
	})
	assert.Check(t, is.ErrorIs(err, context.DeadlineExceeded))
	cancel()

	// Validate that the restart event occurred, which is emitted
	// after the restart (stop (kill) start) finished.
	//
	// Note that we cannot use RestartCount for this, as that's only
	// used for restart-policies.
	restartTimeout := 2 * time.Second
	if runtime.GOOS == "windows" {
		// hcs can sometimes take a long time to stop container.
		restartTimeout = StopContainerWindowsPollTimeout
	}
	select {
	case m := <-messages:
		assert.Check(t, is.Equal(m.Actor.ID, cID))
		assert.Check(t, is.Equal(m.Action, events.ActionRestart))
	case err := <-errs:
		assert.NilError(t, err)
	case <-time.After(restartTimeout):
		t.Errorf("timeout waiting for restart event")
	}

	// Container should be restarted (running).
	inspect, err := apiClient.ContainerInspect(ctx, cID)
	assert.NilError(t, err)
	assert.Check(t, is.Equal(inspect.State.Status, "running"))
}