Cleanup container shutdown check and add test

Adds a test case for the scenario where dockerd gets stuck on startup
due to a hanging `daemon.shutdownContainer`.

Signed-off-by: Brian Goff <cpuguy83@gmail.com>
Brian Goff authored on 2020-12-19 22:04:06 +00:00; committed by Tibor Vass
commit 4a175fd050 (parent af0665861b)
3 changed files with 102 additions and 18 deletions
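For orientation before the diff: daemon.restore() decides, per container found on disk, what to do based on whether containerd still reports the container alive. Below is a minimal standalone sketch of that decision under the new structure; every name in it is an illustrative stand-in, not moby's API:

    package main

    import "fmt"

    // restoreAction mirrors the branch structure after this commit: a container
    // that is not alive gets its dead process cleaned up (if containerd still
    // has a record of it) and is later marked stopped; a live container is shut
    // down unless live-restore is enabled.
    func restoreAction(alive, haveProcess, liveRestore bool) string {
        if !alive {
            if haveProcess {
                return "delete dead process from containerd, then SetStopped"
            }
            // No process means containerd returned "not found";
            // SetStopped further down reconciles docker's view.
            return "SetStopped only"
        }
        if !liveRestore {
            return "shutdownContainer"
        }
        return "leave running (live-restore)"
    }

    func main() {
        fmt.Println(restoreAction(false, false, false)) // the restarting-container case from the new test
        fmt.Println(restoreAction(true, true, false))
        fmt.Println(restoreAction(true, true, true))
    }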

diff --git a/daemon/daemon.go b/daemon/daemon.go

@@ -330,10 +330,15 @@ func (daemon *Daemon) restore() error {
 			daemon.setStateCounter(c)
 
-			log.WithFields(logrus.Fields{
-				"running": c.IsRunning(),
-				"paused":  c.IsPaused(),
-			}).Debug("restoring container")
+			logger := func(c *container.Container) *logrus.Entry {
+				return log.WithFields(logrus.Fields{
+					"running":    c.IsRunning(),
+					"paused":     c.IsPaused(),
+					"restarting": c.IsRestarting(),
+				})
+			}
+
+			logger(c).Debug("restoring container")
 
 			var (
 				err error
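This first hunk replaces repeated log.WithFields calls with a small closure so that every log line in restore() carries the same container fields, including the new "restarting" field. A standalone sketch of the pattern follows; logrus is the real library, but the Container stub replaces moby's container.Container:

    package main

    import "github.com/sirupsen/logrus"

    // Container is a stub standing in for moby's container.Container type.
    type Container struct {
        ID                          string
        Running, Paused, Restarting bool
    }

    func main() {
        logrus.SetLevel(logrus.DebugLevel)
        log := logrus.WithField("container", "abc123")

        // The closure re-reads the state fields at every call site, so lines
        // logged after a state change carry the current values.
        logger := func(c *Container) *logrus.Entry {
            return log.WithFields(logrus.Fields{
                "running":    c.Running,
                "paused":     c.Paused,
                "restarting": c.Restarting,
            })
        }

        c := &Container{ID: "abc123", Restarting: true}
        logger(c).Debug("restoring container")
        c.Restarting = false
        logger(c).Debug("done restoring container")
    }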
@@ -345,17 +350,24 @@ func (daemon *Daemon) restore() error {
 			alive, _, process, err = daemon.containerd.Restore(context.Background(), c.ID, c.InitializeStdio)
 			if err != nil && !errdefs.IsNotFound(err) {
-				log.WithError(err).Error("failed to restore container with containerd")
+				logger(c).WithError(err).Error("failed to restore container with containerd")
 				return
 			}
-			log.Debugf("alive: %v", alive)
-			if !alive && process != nil {
-				ec, exitedAt, err = process.Delete(context.Background())
-				if err != nil && !errdefs.IsNotFound(err) {
-					log.WithError(err).Error("failed to delete container from containerd")
-					return
+			logger(c).Debugf("alive: %v", alive)
+			if !alive {
+				// If process is not nil, clean up the dead container from containerd.
+				// If process is nil then the above `containerd.Restore` returned an errdefs.NotFound error,
+				// and docker's view of the container state will be updated accordingly via SetStopped further down.
+				if process != nil {
+					logger(c).Debug("cleaning up dead container process")
+					ec, exitedAt, err = process.Delete(context.Background())
+					if err != nil && !errdefs.IsNotFound(err) {
+						logger(c).WithError(err).Error("failed to delete container from containerd")
+						return
+					}
 				}
-			} else if alive && !daemon.configStore.LiveRestoreEnabled {
+			} else if !daemon.configStore.LiveRestoreEnabled {
+				logger(c).Debug("shutting down container considered alive by containerd")
 				if err := daemon.shutdownContainer(c); err != nil && !errdefs.IsNotFound(err) {
 					log.WithError(err).Error("error shutting down container")
 					return
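Note how both the delete path and the shutdown path treat "not found" as success rather than a reason to abort. A minimal sketch of that tolerance pattern using only the standard library; errNotFound stands in for the containerd/moby errdefs helpers:

    package main

    import (
        "errors"
        "fmt"
    )

    var errNotFound = errors.New("not found")

    // isNotFound is a stand-in for errdefs.IsNotFound: "already gone" is
    // treated as success so a half-cleaned container cannot wedge startup.
    func isNotFound(err error) bool { return errors.Is(err, errNotFound) }

    func cleanup(del func() error) error {
        if err := del(); err != nil && !isNotFound(err) {
            return err // a real failure aborts restore for this container
        }
        return nil // deleted, or already gone: both are fine
    }

    func main() {
        fmt.Println(cleanup(func() error { return nil }))
        fmt.Println(cleanup(func() error { return fmt.Errorf("task: %w", errNotFound) }))
    }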
@@ -364,14 +376,16 @@ func (daemon *Daemon) restore() error {
 			}
 
 			if c.IsRunning() || c.IsPaused() {
+				logger(c).Debug("syncing container on disk state with real state")
+
 				c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking
 
 				if c.IsPaused() && alive {
 					s, err := daemon.containerd.Status(context.Background(), c.ID)
 					if err != nil {
-						log.WithError(err).Error("failed to get container status")
+						logger(c).WithError(err).Error("failed to get container status")
 					} else {
-						log.WithField("state", s).Info("restored container paused")
+						logger(c).WithField("state", s).Info("restored container paused")
 						switch s {
 						case containerd.Paused, containerd.Pausing:
 							// nothing to do
@@ -393,14 +407,15 @@ func (daemon *Daemon) restore() error {
 			}
 
 			if !alive {
+				logger(c).Debug("setting stopped state")
 				c.Lock()
-				log.Debug("setting stopped state")
 				c.SetStopped(&container.ExitStatus{ExitCode: int(ec), ExitedAt: exitedAt})
 				daemon.Cleanup(c)
 				if err := c.CheckpointTo(daemon.containersReplica); err != nil {
 					log.WithError(err).Error("failed to update stopped container state")
 				}
 				c.Unlock()
+				logger(c).Debug("set stopped state")
 			}
 
 			// we call Mount and then Unmount to get BaseFs of the container
@@ -411,10 +426,10 @@ func (daemon *Daemon) restore() error {
 				// stopped/restarted/removed.
 				// See #29365 for related information.
 				// The error is only logged here.
-				log.WithError(err).Warn("failed to mount container to get BaseFs path")
+				logger(c).WithError(err).Warn("failed to mount container to get BaseFs path")
 			} else {
 				if err := daemon.Unmount(c); err != nil {
-					log.WithError(err).Warn("failed to umount container to get BaseFs path")
+					logger(c).WithError(err).Warn("failed to umount container to get BaseFs path")
 				}
 			}
 
@@ -422,7 +437,7 @@ func (daemon *Daemon) restore() error {
 			if !c.HostConfig.NetworkMode.IsContainer() && c.IsRunning() {
 				options, err := daemon.buildSandboxOptions(c)
 				if err != nil {
-					log.WithError(err).Warn("failed to build sandbox option to restore container")
+					logger(c).WithError(err).Warn("failed to build sandbox option to restore container")
 				}
 				mapLock.Lock()
 				activeSandboxes[c.NetworkSettings.SandboxID] = options
@@ -467,6 +482,7 @@ func (daemon *Daemon) restore() error {
 				}
 			}
 			c.Unlock()
+			logger(c).Debug("done restoring container")
 		}(c)
 	}
 	group.Wait()
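All of the hunks above run inside a per-container goroutine, and the group.Wait() here is why a single hung shutdownContainer call could block daemon startup indefinitely, which is the bug the new test exercises. A minimal sketch of that fan-out/wait shape, with the restore work reduced to a print:

    package main

    import (
        "fmt"
        "sync"
    )

    func main() {
        containers := []string{"c1", "c2", "c3"}

        var group sync.WaitGroup
        for _, id := range containers {
            group.Add(1)
            go func(id string) {
                defer group.Done()
                // Restore work happens here; if this blocks forever
                // (e.g. a hung shutdownContainer), Wait below never
                // returns and the daemon never finishes starting.
                fmt.Println("restored", id)
            }(id)
        }
        group.Wait() // startup proceeds only once every container is restored
    }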

diff --git a/integration/container/daemon_linux_test.go b/integration/container/daemon_linux_test.go

@@ -2,13 +2,18 @@ package container // import "github.com/docker/docker/integration/container"
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
+	"io/ioutil"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/docker/docker/api/types"
+	containerapi "github.com/docker/docker/api/types/container"
+	realcontainer "github.com/docker/docker/container"
 	"github.com/docker/docker/integration/internal/container"
 	"github.com/docker/docker/testutil/daemon"
 	"golang.org/x/sys/unix"
@@ -166,3 +171,65 @@ func TestDaemonHostGatewayIP(t *testing.T) {
 	d.Stop(t)
 }
 
+// TestRestartDaemonWithRestartingContainer simulates a case where a container is in "restarting" state when
+// dockerd is killed (due to machine reset or something else).
+//
+// Related to moby/moby#41817
+//
+// In this test we'll change the container state to "restarting".
+// This means that the container will not be 'alive' when we attempt to restore it on daemon startup.
+//
+// We could do the same with `docker run -d --restart=always busybox:latest exit 1`, and then
+// `kill -9` dockerd while the container is in "restarting" state. This is difficult to reproduce reliably
+// in an automated test, so we manipulate on-disk state instead.
+func TestRestartDaemonWithRestartingContainer(t *testing.T) {
+	skip.If(t, testEnv.IsRemoteDaemon, "cannot start daemon on remote test run")
+	skip.If(t, testEnv.DaemonInfo.OSType == "windows")
+
+	t.Parallel()
+
+	d := daemon.New(t)
+	defer d.Cleanup(t)
+
+	d.StartWithBusybox(t, "--iptables=false")
+	defer d.Kill()
+
+	ctx := context.Background()
+	client := d.NewClientT(t)
+
+	// Just create the container; it does not need to be started.
+	// We really want to make sure there is no process running when docker starts back up.
+	// We will manipulate the on-disk state later.
+	id := container.Create(ctx, t, client, container.WithRestartPolicy("always"), container.WithCmd("/bin/sh", "-c", "exit 1"))
+
+	// SIGKILL the daemon
+	assert.NilError(t, d.Kill())
+
+	configPath := filepath.Join(d.Root, "containers", id, "config.v2.json")
+	configBytes, err := ioutil.ReadFile(configPath)
+	assert.NilError(t, err)
+
+	var c realcontainer.Container
+	assert.NilError(t, json.Unmarshal(configBytes, &c))
+
+	c.State = realcontainer.NewState()
+	c.SetRestarting(&realcontainer.ExitStatus{ExitCode: 1})
+	c.HasBeenStartedBefore = true
+
+	configBytes, err = json.Marshal(&c)
+	assert.NilError(t, err)
+	assert.NilError(t, ioutil.WriteFile(configPath, configBytes, 0600))
+
+	d.Start(t)
+
+	ctxTimeout, cancel := context.WithTimeout(ctx, 30*time.Second)
+	defer cancel()
+	chOk, chErr := client.ContainerWait(ctxTimeout, id, containerapi.WaitConditionNextExit)
+	select {
+	case <-chOk:
+	case err := <-chErr:
+		assert.NilError(t, err)
+	}
+}
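The test works because dockerd re-reads each container's config.v2.json on startup, so patching the file while the daemon is down is equivalent to having crashed in that state. A generic sketch of the same read-patch-write round trip; the path and field names here are illustrative, not moby's exact schema:

    package main

    import (
        "encoding/json"
        "fmt"
        "os"
    )

    func main() {
        path := "config.json" // illustrative; the test patches containers/<id>/config.v2.json

        // Seed a file to patch.
        if err := os.WriteFile(path, []byte(`{"State":{"Restarting":false}}`), 0o600); err != nil {
            panic(err)
        }

        raw, err := os.ReadFile(path)
        if err != nil {
            panic(err)
        }

        // Decode into a generic map so unknown fields survive the round trip.
        var doc map[string]interface{}
        if err := json.Unmarshal(raw, &doc); err != nil {
            panic(err)
        }
        doc["State"].(map[string]interface{})["Restarting"] = true

        patched, err := json.Marshal(doc)
        if err != nil {
            panic(err)
        }
        if err := os.WriteFile(path, patched, 0o600); err != nil {
            panic(err)
        }
        fmt.Println(string(patched))
    }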

diff --git a/testutil/daemon/daemon.go b/testutil/daemon/daemon.go

@@ -288,6 +288,7 @@ func (d *Daemon) Cleanup(t testing.TB) {
 func (d *Daemon) Start(t testing.TB, args ...string) {
 	t.Helper()
 	if err := d.StartWithError(args...); err != nil {
+		d.DumpStackAndQuit() // in case the daemon is stuck
 		t.Fatalf("[%s] failed to start daemon with arguments %v : %v", d.id, d.args, err)
 	}
 }
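d.DumpStackAndQuit() signals the stuck daemon so the Go runtime prints every goroutine's stack before the test fails, turning a silent hang into a diagnosable trace. A minimal unix-only sketch of the signaling side; dumpStack is a hypothetical helper, not the testutil API:

    package main

    import (
        "os"
        "syscall"
        "time"
    )

    // dumpStack asks a Go process to print all goroutine stacks: the Go
    // runtime's default SIGQUIT handler dumps every stack and then exits,
    // which is exactly what you want from a daemon stuck during restore.
    func dumpStack(pid int) error {
        proc, err := os.FindProcess(pid)
        if err != nil {
            return err
        }
        return proc.Signal(syscall.SIGQUIT)
    }

    func main() {
        // Demo: signal ourselves, then wait for the runtime to dump stacks.
        if err := dumpStack(os.Getpid()); err != nil {
            panic(err)
        }
        time.Sleep(time.Second)
    }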