From af0665861bcf6ba67d1e2e07755f9140c64ac468 Mon Sep 17 00:00:00 2001 From: Simon Ferquel Date: Fri, 18 Dec 2020 10:20:56 +0100 Subject: [PATCH 1/2] Fix a potential hang when starting after a non-clean shutdown Previous startup sequence used to call "containerStop" on containers that were persisted with a running state but are not alive when restarting (can happen on non-clean shutdown). This call was made before fixing-up the RunningState of the container, and tricked the daemon into trying to kill a non-existing process and ultimately hang. The fix is very simple - just add a condition on calling containerStop. Signed-off-by: Simon Ferquel --- daemon/daemon.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/daemon/daemon.go b/daemon/daemon.go index 1a382a4eef..2759781842 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -348,13 +348,14 @@ func (daemon *Daemon) restore() error { log.WithError(err).Error("failed to restore container with containerd") return } + log.Debugf("alive: %v", alive) if !alive && process != nil { ec, exitedAt, err = process.Delete(context.Background()) if err != nil && !errdefs.IsNotFound(err) { log.WithError(err).Error("failed to delete container from containerd") return } - } else if !daemon.configStore.LiveRestoreEnabled { + } else if alive && !daemon.configStore.LiveRestoreEnabled { if err := daemon.shutdownContainer(c); err != nil && !errdefs.IsNotFound(err) { log.WithError(err).Error("error shutting down container") return @@ -393,6 +394,7 @@ func (daemon *Daemon) restore() error { if !alive { c.Lock() + log.Debug("setting stopped state") c.SetStopped(&container.ExitStatus{ExitCode: int(ec), ExitedAt: exitedAt}) daemon.Cleanup(c) if err := c.CheckpointTo(daemon.containersReplica); err != nil { From 4a175fd0509f226064003b39f212be7d4c410cf8 Mon Sep 17 00:00:00 2001 From: Brian Goff Date: Sat, 19 Dec 2020 22:04:06 +0000 Subject: [PATCH 2/2] Cleanup container shutdown check and add test Adds a test case for the 
case where dockerd gets stuck on startup due to hanging `daemon.shutdownContainer` Signed-off-by: Brian Goff --- daemon/daemon.go | 52 +++++++++++------ integration/container/daemon_linux_test.go | 67 ++++++++++++++++++++++ testutil/daemon/daemon.go | 1 + 3 files changed, 102 insertions(+), 18 deletions(-) diff --git a/daemon/daemon.go b/daemon/daemon.go index 2759781842..3e86ab5c87 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -330,10 +330,15 @@ func (daemon *Daemon) restore() error { daemon.setStateCounter(c) - log.WithFields(logrus.Fields{ - "running": c.IsRunning(), - "paused": c.IsPaused(), - }).Debug("restoring container") + logger := func(c *container.Container) *logrus.Entry { + return log.WithFields(logrus.Fields{ + "running": c.IsRunning(), + "paused": c.IsPaused(), + "restarting": c.IsRestarting(), + }) + } + + logger(c).Debug("restoring container") var ( err error @@ -345,17 +350,24 @@ func (daemon *Daemon) restore() error { alive, _, process, err = daemon.containerd.Restore(context.Background(), c.ID, c.InitializeStdio) if err != nil && !errdefs.IsNotFound(err) { - log.WithError(err).Error("failed to restore container with containerd") + logger(c).WithError(err).Error("failed to restore container with containerd") return } - log.Debugf("alive: %v", alive) - if !alive && process != nil { - ec, exitedAt, err = process.Delete(context.Background()) - if err != nil && !errdefs.IsNotFound(err) { - log.WithError(err).Error("failed to delete container from containerd") - return + logger(c).Debugf("alive: %v", alive) + if !alive { + // If process is not nil, cleanup dead container from containerd. + // If process is nil then the above `containerd.Restore` returned an errdefs.NotFoundError, + // and docker's view of the container state will be updated accordingly via SetStopped further down. 
+ if process != nil { + logger(c).Debug("cleaning up dead container process") + ec, exitedAt, err = process.Delete(context.Background()) + if err != nil && !errdefs.IsNotFound(err) { + logger(c).WithError(err).Error("failed to delete container from containerd") + return + } } - } else if alive && !daemon.configStore.LiveRestoreEnabled { + } else if !daemon.configStore.LiveRestoreEnabled { + logger(c).Debug("shutting down container considered alive by containerd") if err := daemon.shutdownContainer(c); err != nil && !errdefs.IsNotFound(err) { log.WithError(err).Error("error shutting down container") return @@ -364,14 +376,16 @@ func (daemon *Daemon) restore() error { } if c.IsRunning() || c.IsPaused() { + logger(c).Debug("syncing container on disk state with real state") + c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking if c.IsPaused() && alive { s, err := daemon.containerd.Status(context.Background(), c.ID) if err != nil { - log.WithError(err).Error("failed to get container status") + logger(c).WithError(err).Error("failed to get container status") } else { - log.WithField("state", s).Info("restored container paused") + logger(c).WithField("state", s).Info("restored container paused") switch s { case containerd.Paused, containerd.Pausing: // nothing to do @@ -393,14 +407,15 @@ func (daemon *Daemon) restore() error { } if !alive { + logger(c).Debug("setting stopped state") c.Lock() - log.Debug("setting stopped state") c.SetStopped(&container.ExitStatus{ExitCode: int(ec), ExitedAt: exitedAt}) daemon.Cleanup(c) if err := c.CheckpointTo(daemon.containersReplica); err != nil { log.WithError(err).Error("failed to update stopped container state") } c.Unlock() + logger(c).Debug("set stopped state") } // we call Mount and then Unmount to get BaseFs of the container @@ -411,10 +426,10 @@ func (daemon *Daemon) restore() error { // stopped/restarted/removed. // See #29365 for related information. 
// The error is only logged here. - log.WithError(err).Warn("failed to mount container to get BaseFs path") + logger(c).WithError(err).Warn("failed to mount container to get BaseFs path") } else { if err := daemon.Unmount(c); err != nil { - log.WithError(err).Warn("failed to umount container to get BaseFs path") + logger(c).WithError(err).Warn("failed to umount container to get BaseFs path") } } @@ -422,7 +437,7 @@ func (daemon *Daemon) restore() error { if !c.HostConfig.NetworkMode.IsContainer() && c.IsRunning() { options, err := daemon.buildSandboxOptions(c) if err != nil { - log.WithError(err).Warn("failed to build sandbox option to restore container") + logger(c).WithError(err).Warn("failed to build sandbox option to restore container") } mapLock.Lock() activeSandboxes[c.NetworkSettings.SandboxID] = options @@ -467,6 +482,7 @@ func (daemon *Daemon) restore() error { } } c.Unlock() + logger(c).Debug("done restoring container") }(c) } group.Wait() diff --git a/integration/container/daemon_linux_test.go b/integration/container/daemon_linux_test.go index 328bd12caa..ea8fde2fd3 100644 --- a/integration/container/daemon_linux_test.go +++ b/integration/container/daemon_linux_test.go @@ -2,13 +2,18 @@ package container // import "github.com/docker/docker/integration/container" import ( "context" + "encoding/json" "fmt" "io/ioutil" + "path/filepath" "strconv" "strings" "testing" + "time" "github.com/docker/docker/api/types" + containerapi "github.com/docker/docker/api/types/container" + realcontainer "github.com/docker/docker/container" "github.com/docker/docker/integration/internal/container" "github.com/docker/docker/testutil/daemon" "golang.org/x/sys/unix" @@ -166,3 +171,65 @@ func TestDaemonHostGatewayIP(t *testing.T) { d.Stop(t) } + +// TestRestartDaemonWithRestartingContainer simulates a case where a container is in "restarting" state when +// dockerd is killed (due to machine reset or something else). 
+ // + // Related to moby/moby#41817 + // + // In this test we'll change the container state to "restarting". + // This means that the container will not be 'alive' when we attempt to restore it on daemon startup. + // + // We could do the same with `docker run -d --restart=always busybox:latest exit 1`, and then + // `kill -9` dockerd while the container is in "restarting" state. This is difficult to reproduce reliably + // in an automated test, so we manipulate on disk state instead. +func TestRestartDaemonWithRestartingContainer(t *testing.T) { + skip.If(t, testEnv.IsRemoteDaemon, "cannot start daemon on remote test run") + skip.If(t, testEnv.DaemonInfo.OSType == "windows") + + t.Parallel() + + d := daemon.New(t) + defer d.Cleanup(t) + + d.StartWithBusybox(t, "--iptables=false") + defer d.Kill() + + ctx := context.Background() + client := d.NewClientT(t) + + // Just create the container, no need for it to be started. + // We really want to make sure there is no process running when docker starts back up. 
+ // We will manipulate the on disk state later + id := container.Create(ctx, t, client, container.WithRestartPolicy("always"), container.WithCmd("/bin/sh", "-c", "exit 1")) + + // SIGKILL the daemon + assert.NilError(t, d.Kill()) + + configPath := filepath.Join(d.Root, "containers", id, "config.v2.json") + configBytes, err := ioutil.ReadFile(configPath) + assert.NilError(t, err) + + var c realcontainer.Container + + assert.NilError(t, json.Unmarshal(configBytes, &c)) + + c.State = realcontainer.NewState() + c.SetRestarting(&realcontainer.ExitStatus{ExitCode: 1}) + c.HasBeenStartedBefore = true + + configBytes, err = json.Marshal(&c) + assert.NilError(t, err) + assert.NilError(t, ioutil.WriteFile(configPath, configBytes, 0600)) + + d.Start(t) + + ctxTimeout, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + chOk, chErr := client.ContainerWait(ctxTimeout, id, containerapi.WaitConditionNextExit) + select { + case <-chOk: + case err := <-chErr: + assert.NilError(t, err) + } +} diff --git a/testutil/daemon/daemon.go b/testutil/daemon/daemon.go index 1b29654f6c..52882e4dcb 100644 --- a/testutil/daemon/daemon.go +++ b/testutil/daemon/daemon.go @@ -288,6 +288,7 @@ func (d *Daemon) Cleanup(t testing.TB) { func (d *Daemon) Start(t testing.TB, args ...string) { t.Helper() if err := d.StartWithError(args...); err != nil { + d.DumpStackAndQuit() // in case the daemon is stuck t.Fatalf("[%s] failed to start daemon with arguments %v : %v", d.id, d.args, err) } }