daemon: daemon.containerRestart: don't cancel restart on context cancel

commit def549c8f6 passed through the context
to the daemon.ContainerStart function. As a result, restarting containers
no longer is an atomic operation, because a context cancellation could
interrupt the restart (between "stopping" and "(re)starting"), resulting
in the container being stopped, but not restarted.

Restarting a container, or more factually; making a successful request on
the `/containers/{id]/restart` endpoint, should be an atomic operation.

This patch uses a context.WithoutCancel for restart requests.

It's worth noting that daemon.containerStop already uses context.WithoutCancel,
so in that function, we'll be wrapping the context twice, but this should
likely not cause issues (just redundant for this code-path).

Before this patch, starting a container that bind-mounts the docker socket,
then restarting itself from within the container would cancel the restart
operation. The container would be stopped, but not started after that:

    docker run -dit --name myself -v /var/run/docker.sock:/var/run/docker.sock docker:cli sh
    docker exec myself sh -c 'docker restart myself'

    docker ps -a
    CONTAINER ID   IMAGE         COMMAND                  CREATED          STATUS                       PORTS     NAMES
    3a2a741c65ff   docker:cli    "docker-entrypoint.s…"   26 seconds ago   Exited (128) 7 seconds ago             myself

With this patch: the stop still cancels the exec, but does not cancel the
restart operation, and the container is started again:

    docker run -dit --name myself -v /var/run/docker.sock:/var/run/docker.sock docker:cli sh
    docker exec myself sh -c 'docker restart myself'
    docker ps
    CONTAINER ID   IMAGE        COMMAND                  CREATED              STATUS         PORTS     NAMES
    4393a01f7c75   docker:cli   "docker-entrypoint.s…"   About a minute ago   Up 4 seconds             myself

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
This commit is contained in:
Sebastiaan van Stijn 2023-10-20 18:00:16 +02:00
parent b4a08b3b7c
commit aeb8972281
No known key found for this signature in database
GPG key ID: 76698F39D527CE8C
2 changed files with 80 additions and 0 deletions

View file

@ -7,6 +7,7 @@ import (
containertypes "github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/events"
"github.com/docker/docker/container"
"github.com/docker/docker/internal/compatcontext"
)
// ContainerRestart stops and starts a container. It attempts to
@ -32,6 +33,10 @@ func (daemon *Daemon) ContainerRestart(ctx context.Context, name string, options
// gracefully stop, before forcefully terminating the container. If
// given a negative duration, wait forever for a graceful stop.
func (daemon *Daemon) containerRestart(ctx context.Context, daemonCfg *configStore, container *container.Container, options containertypes.StopOptions) error {
// Restarting is expected to be an atomic operation, and cancelling
// the request should not cancel the stop -> start sequence.
ctx = compatcontext.WithoutCancel(ctx)
// Determine isolation. If not specified in the hostconfig, use daemon default.
actualIsolation := container.HostConfig.Isolation
if containertypes.Isolation.IsDefault(actualIsolation) {

View file

@ -3,15 +3,20 @@ package container // import "github.com/docker/docker/integration/container"
import (
"context"
"fmt"
"runtime"
"testing"
"time"
"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/events"
"github.com/docker/docker/api/types/filters"
"github.com/docker/docker/client"
testContainer "github.com/docker/docker/integration/internal/container"
"github.com/docker/docker/testutil"
"github.com/docker/docker/testutil/daemon"
"gotest.tools/v3/assert"
is "gotest.tools/v3/assert/cmp"
"gotest.tools/v3/poll"
"gotest.tools/v3/skip"
)
@ -213,3 +218,73 @@ func TestContainerWithAutoRemoveCanBeRestarted(t *testing.T) {
})
}
}
// TestContainerRestartWithCancelledRequest verifies that cancelling a restart
// request does not cancel the restart operation, and still starts the container
// after it was stopped.
//
// Regression test for https://github.com/moby/moby/discussions/46682
func TestContainerRestartWithCancelledRequest(t *testing.T) {
ctx := setupTest(t)
apiClient := testEnv.APIClient()
testutil.StartSpan(ctx, t)
// Create a container that ignores SIGTERM and doesn't stop immediately,
// giving us time to cancel the request.
//
// Restarting a container is "stop" (and, if needed, "kill"), then "start"
// the container. We're trying to create the scenario where the "stop" is
// handled, but the request was cancelled and therefore the "start" not
// taking place.
cID := testContainer.Run(ctx, t, apiClient, testContainer.WithCmd("sh", "-c", "trap 'echo received TERM' TERM; while true; do usleep 10; done"))
defer func() {
err := apiClient.ContainerRemove(ctx, cID, container.RemoveOptions{Force: true})
if t.Failed() && err != nil {
t.Logf("Cleaning up test container failed with error: %v", err)
}
}()
// Start listening for events.
messages, errs := apiClient.Events(ctx, types.EventsOptions{
Filters: filters.NewArgs(
filters.Arg("container", cID),
filters.Arg("event", string(events.ActionRestart)),
),
})
// Make restart request, but cancel the request before the container
// is (forcibly) killed.
ctx2, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
stopTimeout := 1
err := apiClient.ContainerRestart(ctx2, cID, container.StopOptions{
Timeout: &stopTimeout,
})
assert.Check(t, is.ErrorIs(err, context.DeadlineExceeded))
cancel()
// Validate that the restart event occurred, which is emitted
// after the restart (stop (kill) start) finished.
//
// Note that we cannot use RestartCount for this, as that's only
// used for restart-policies.
restartTimeout := 2 * time.Second
if runtime.GOOS == "windows" {
// hcs can sometimes take a long time to stop container.
restartTimeout = StopContainerWindowsPollTimeout
}
select {
case m := <-messages:
assert.Check(t, is.Equal(m.Actor.ID, cID))
assert.Check(t, is.Equal(m.Action, events.ActionRestart))
case err := <-errs:
assert.NilError(t, err)
case <-time.After(restartTimeout):
t.Errorf("timeout waiting for restart event")
}
// Container should be restarted (running).
inspect, err := apiClient.ContainerInspect(ctx, cID)
assert.NilError(t, err)
assert.Check(t, is.Equal(inspect.State.Status, "running"))
}