Fix race in TestCreateServiceSecretFileMode, TestCreateServiceConfigFileMode

Looks like this test was broken from the start, and fully relied on a race
condition. (Test was added in 65ee7fff02)

The problem is in the service's command: `ls -l /etc/config || /bin/top`, which
will either:

- exit immediately if the secret is mounted correctly at `/etc/config` (which it should)
- keep running with `/bin/top` if the above failed

After the service is created, the test enters a race-condition, checking for 1
task to be running (which it ocassionally is), after which it proceeds, and looks
up the list of tasks of the service, to get the log output of `ls -l /etc/config`.

This is another race: first of all, the original filter for that task lookup did
not filter by `running`, so it would pick "any" task of the service (either failed,
running, or "completed" (successfully exited) tasks).

In the meantime though, SwarmKit kept reconciling the service, and creating new
tasks, so even if the test was able to get the ID of the correct task, that task
may already have been exited, and removed (task-limit is 5 by default), so only
if the test was "lucky", it would be able to get the logs, but of course, chances
were likely that it would be "too late", and the task already gone.

The problem can be easily reproduced when running the steps manually:

    echo 'CONFIG' | docker config create myconfig -

    docker service create --config source=myconfig,target=/etc/config,mode=0777 --name myservice busybox sh -c 'ls -l /etc/config || /bin/top'

The above creates the service, but it keeps retrying, because each task exits
immediately (followed by SwarmKit reconciling and starting a new task);

    mjntpfkkyuuc1dpay4h00c4oo
    overall progress: 0 out of 1 tasks
    1/1: ready     [======================================>            ]
    verify: Detected task failure
    ^COperation continuing in background.
    Use `docker service ps mjntpfkkyuuc1dpay4h00c4oo` to check progress.

And checking the tasks for the service reveals that tasks exit cleanly (no error),
but _do exit_, so swarm just keeps up reconciling, and spinning up new tasks;

    docker service ps myservice --no-trunc
    ID                          NAME              IMAGE                                                                                    NODE             DESIRED STATE   CURRENT STATE                     ERROR     PORTS
    2wmcuv4vffnet8nybg3he4v9n   myservice.1       busybox:latest@sha256:f7ca5a32c10d51aeda3b4d01c61c6061f497893d7f6628b92f822f7117182a57   docker-desktop   Ready           Ready less than a second ago
    5p8b006uec125iq2892lxay64    \_ myservice.1   busybox:latest@sha256:f7ca5a32c10d51aeda3b4d01c61c6061f497893d7f6628b92f822f7117182a57   docker-desktop   Shutdown        Complete less than a second ago
    k8lpsvlak4b3nil0zfkexw61p    \_ myservice.1   busybox:latest@sha256:f7ca5a32c10d51aeda3b4d01c61c6061f497893d7f6628b92f822f7117182a57   docker-desktop   Shutdown        Complete 6 seconds ago
    vsunl5pi7e2n9ol3p89kvj6pn    \_ myservice.1   busybox:latest@sha256:f7ca5a32c10d51aeda3b4d01c61c6061f497893d7f6628b92f822f7117182a57   docker-desktop   Shutdown        Complete 11 seconds ago
    orxl8b6kt2l6dfznzzd4lij4s    \_ myservice.1   busybox:latest@sha256:f7ca5a32c10d51aeda3b4d01c61c6061f497893d7f6628b92f822f7117182a57   docker-desktop   Shutdown        Complete 17 seconds ago

This patch changes the service's command to `sleep`, so that a successful task
(after successfully performing `ls -l /etc/config`) continues to be running until
the service is deleted. With that change, the service should (usually) reconcile
immediately, which removes the race condition, and should also make it faster :)

This patch changes the tests to use client.ServiceLogs() instead of using the
service's tasklist to directly access container logs. This should also fix some
failures that happened if some tasks failed to start before reconciling, in which
case client.TaskList() (with the current filters), could return more tasks than
anticipated (as it also contained the exited tasks);

    === RUN   TestCreateServiceSecretFileMode
        create_test.go:291: assertion failed: 2 (int) != 1 (int)
    --- FAIL: TestCreateServiceSecretFileMode (7.88s)
    === RUN   TestCreateServiceConfigFileMode
        create_test.go:355: assertion failed: 2 (int) != 1 (int)
    --- FAIL: TestCreateServiceConfigFileMode (7.87s)

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
This commit is contained in:
Sebastiaan van Stijn 2021-10-22 15:33:22 +02:00
parent 9aa0b24dcf
commit 13cff6d583
No known key found for this signature in database
GPG key ID: 76698F39D527CE8C

View file

@ -267,7 +267,7 @@ func TestCreateServiceSecretFileMode(t *testing.T) {
serviceID := swarm.CreateService(t, d,
swarm.ServiceWithReplicas(instances),
swarm.ServiceWithName(serviceName),
swarm.ServiceWithCommand([]string{"/bin/sh", "-c", "ls -l /etc/secret || /bin/top"}),
swarm.ServiceWithCommand([]string{"/bin/sh", "-c", "ls -l /etc/secret && sleep inf"}),
swarm.ServiceWithSecret(&swarmtypes.SecretReference{
File: &swarmtypes.SecretReferenceFileTarget{
Name: "/etc/secret",
@ -282,15 +282,8 @@ func TestCreateServiceSecretFileMode(t *testing.T) {
poll.WaitOn(t, swarm.RunningTasksCount(client, serviceID, instances), swarm.ServicePoll)
filter := filters.NewArgs()
filter.Add("service", serviceID)
tasks, err := client.TaskList(ctx, types.TaskListOptions{
Filters: filter,
})
assert.NilError(t, err)
assert.Check(t, is.Equal(len(tasks), 1))
body, err := client.ContainerLogs(ctx, tasks[0].Status.ContainerStatus.ContainerID, types.ContainerLogsOptions{
body, err := client.ServiceLogs(ctx, serviceID, types.ContainerLogsOptions{
Tail: "1",
ShowStdout: true,
})
assert.NilError(t, err)
@ -330,7 +323,7 @@ func TestCreateServiceConfigFileMode(t *testing.T) {
serviceName := "TestService_" + t.Name()
serviceID := swarm.CreateService(t, d,
swarm.ServiceWithName(serviceName),
swarm.ServiceWithCommand([]string{"/bin/sh", "-c", "ls -l /etc/config || /bin/top"}),
swarm.ServiceWithCommand([]string{"/bin/sh", "-c", "ls -l /etc/config && sleep inf"}),
swarm.ServiceWithReplicas(instances),
swarm.ServiceWithConfig(&swarmtypes.ConfigReference{
File: &swarmtypes.ConfigReferenceFileTarget{
@ -346,15 +339,8 @@ func TestCreateServiceConfigFileMode(t *testing.T) {
poll.WaitOn(t, swarm.RunningTasksCount(client, serviceID, instances))
filter := filters.NewArgs()
filter.Add("service", serviceID)
tasks, err := client.TaskList(ctx, types.TaskListOptions{
Filters: filter,
})
assert.NilError(t, err)
assert.Check(t, is.Equal(len(tasks), 1))
body, err := client.ContainerLogs(ctx, tasks[0].Status.ContainerStatus.ContainerID, types.ContainerLogsOptions{
body, err := client.ServiceLogs(ctx, serviceID, types.ContainerLogsOptions{
Tail: "1",
ShowStdout: true,
})
assert.NilError(t, err)