Merge pull request #41817 from simonferquel/desktop-startup-hang

Fix a potential hang when starting after a non-clean shutdown
This commit is contained in:
Tibor Vass 2020-12-23 23:22:00 -08:00 committed by GitHub
commit ffc4dc9aec
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 101 additions and 15 deletions

View file

@ -330,10 +330,15 @@ func (daemon *Daemon) restore() error {
daemon.setStateCounter(c) daemon.setStateCounter(c)
log.WithFields(logrus.Fields{ logger := func(c *container.Container) *logrus.Entry {
"running": c.IsRunning(), return log.WithFields(logrus.Fields{
"paused": c.IsPaused(), "running": c.IsRunning(),
}).Debug("restoring container") "paused": c.IsPaused(),
"restarting": c.IsRestarting(),
})
}
logger(c).Debug("restoring container")
var ( var (
err error err error
@ -345,16 +350,24 @@ func (daemon *Daemon) restore() error {
alive, _, process, err = daemon.containerd.Restore(context.Background(), c.ID, c.InitializeStdio) alive, _, process, err = daemon.containerd.Restore(context.Background(), c.ID, c.InitializeStdio)
if err != nil && !errdefs.IsNotFound(err) { if err != nil && !errdefs.IsNotFound(err) {
log.WithError(err).Error("failed to restore container with containerd") logger(c).WithError(err).Error("failed to restore container with containerd")
return return
} }
if !alive && process != nil { logger(c).Debugf("alive: %v", alive)
ec, exitedAt, err = process.Delete(context.Background()) if !alive {
if err != nil && !errdefs.IsNotFound(err) { // If process is not nil, cleanup dead container from containerd.
log.WithError(err).Error("failed to delete container from containerd") // If process is nil then the above `containerd.Restore` returned an errdefs.NotFoundError,
return // and docker's view of the container state will be updated accordingly via SetStopped further down.
if process != nil {
logger(c).Debug("cleaning up dead container process")
ec, exitedAt, err = process.Delete(context.Background())
if err != nil && !errdefs.IsNotFound(err) {
logger(c).WithError(err).Error("failed to delete container from containerd")
return
}
} }
} else if !daemon.configStore.LiveRestoreEnabled { } else if !daemon.configStore.LiveRestoreEnabled {
logger(c).Debug("shutting down container considered alive by containerd")
if err := daemon.shutdownContainer(c); err != nil && !errdefs.IsNotFound(err) { if err := daemon.shutdownContainer(c); err != nil && !errdefs.IsNotFound(err) {
log.WithError(err).Error("error shutting down container") log.WithError(err).Error("error shutting down container")
return return
@ -363,14 +376,16 @@ func (daemon *Daemon) restore() error {
} }
if c.IsRunning() || c.IsPaused() { if c.IsRunning() || c.IsPaused() {
logger(c).Debug("syncing container on disk state with real state")
c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking
if c.IsPaused() && alive { if c.IsPaused() && alive {
s, err := daemon.containerd.Status(context.Background(), c.ID) s, err := daemon.containerd.Status(context.Background(), c.ID)
if err != nil { if err != nil {
log.WithError(err).Error("failed to get container status") logger(c).WithError(err).Error("failed to get container status")
} else { } else {
log.WithField("state", s).Info("restored container paused") logger(c).WithField("state", s).Info("restored container paused")
switch s { switch s {
case containerd.Paused, containerd.Pausing: case containerd.Paused, containerd.Pausing:
// nothing to do // nothing to do
@ -392,6 +407,7 @@ func (daemon *Daemon) restore() error {
} }
if !alive { if !alive {
logger(c).Debug("setting stopped state")
c.Lock() c.Lock()
c.SetStopped(&container.ExitStatus{ExitCode: int(ec), ExitedAt: exitedAt}) c.SetStopped(&container.ExitStatus{ExitCode: int(ec), ExitedAt: exitedAt})
daemon.Cleanup(c) daemon.Cleanup(c)
@ -399,6 +415,7 @@ func (daemon *Daemon) restore() error {
log.WithError(err).Error("failed to update stopped container state") log.WithError(err).Error("failed to update stopped container state")
} }
c.Unlock() c.Unlock()
logger(c).Debug("set stopped state")
} }
// we call Mount and then Unmount to get BaseFs of the container // we call Mount and then Unmount to get BaseFs of the container
@ -409,10 +426,10 @@ func (daemon *Daemon) restore() error {
// stopped/restarted/removed. // stopped/restarted/removed.
// See #29365 for related information. // See #29365 for related information.
// The error is only logged here. // The error is only logged here.
log.WithError(err).Warn("failed to mount container to get BaseFs path") logger(c).WithError(err).Warn("failed to mount container to get BaseFs path")
} else { } else {
if err := daemon.Unmount(c); err != nil { if err := daemon.Unmount(c); err != nil {
log.WithError(err).Warn("failed to umount container to get BaseFs path") logger(c).WithError(err).Warn("failed to umount container to get BaseFs path")
} }
} }
@ -420,7 +437,7 @@ func (daemon *Daemon) restore() error {
if !c.HostConfig.NetworkMode.IsContainer() && c.IsRunning() { if !c.HostConfig.NetworkMode.IsContainer() && c.IsRunning() {
options, err := daemon.buildSandboxOptions(c) options, err := daemon.buildSandboxOptions(c)
if err != nil { if err != nil {
log.WithError(err).Warn("failed to build sandbox option to restore container") logger(c).WithError(err).Warn("failed to build sandbox option to restore container")
} }
mapLock.Lock() mapLock.Lock()
activeSandboxes[c.NetworkSettings.SandboxID] = options activeSandboxes[c.NetworkSettings.SandboxID] = options
@ -465,6 +482,7 @@ func (daemon *Daemon) restore() error {
} }
} }
c.Unlock() c.Unlock()
logger(c).Debug("done restoring container")
}(c) }(c)
} }
group.Wait() group.Wait()

View file

@ -2,13 +2,18 @@ package container // import "github.com/docker/docker/integration/container"
import ( import (
"context" "context"
"encoding/json"
"fmt" "fmt"
"io/ioutil" "io/ioutil"
"path/filepath"
"strconv" "strconv"
"strings" "strings"
"testing" "testing"
"time"
"github.com/docker/docker/api/types" "github.com/docker/docker/api/types"
containerapi "github.com/docker/docker/api/types/container"
realcontainer "github.com/docker/docker/container"
"github.com/docker/docker/integration/internal/container" "github.com/docker/docker/integration/internal/container"
"github.com/docker/docker/testutil/daemon" "github.com/docker/docker/testutil/daemon"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
@ -166,3 +171,65 @@ func TestDaemonHostGatewayIP(t *testing.T) {
d.Stop(t) d.Stop(t)
} }
// TestRestartDaemonWithRestartingContainer simulates a case where a container is in "restarting" state when
// dockerd is killed (due to machine reset or something else).
//
// Related to moby/moby#41817
//
// In this test we'll change the container state to "restarting".
// This means that the container will not be 'alive' when we attempt to restore it on daemon startup.
//
// We could do the same with `docker run -d --restart=always busybox:latest exit 1`, and then
// `kill -9` dockerd while the container is in "restarting" state. This is difficult to reproduce reliably
// in an automated test, so we manipulate on disk state instead.
func TestRestartDaemonWithRestartingContainer(t *testing.T) {
skip.If(t, testEnv.IsRemoteDaemon, "cannot start daemon on remote test run")
skip.If(t, testEnv.DaemonInfo.OSType == "windows")
t.Parallel()
d := daemon.New(t)
defer d.Cleanup(t)
d.StartWithBusybox(t, "--iptables=false")
defer d.Kill()
ctx := context.Background()
client := d.NewClientT(t)
// Just create the container; it does not need to be started.
// We really want to make sure there is no process running when docker starts back up.
// We will manipulate the on-disk state later.
id := container.Create(ctx, t, client, container.WithRestartPolicy("always"), container.WithCmd("/bin/sh", "-c", "exit 1"))
// SIGKILL the daemon
assert.NilError(t, d.Kill())
configPath := filepath.Join(d.Root, "containers", id, "config.v2.json")
configBytes, err := ioutil.ReadFile(configPath)
assert.NilError(t, err)
var c realcontainer.Container
assert.NilError(t, json.Unmarshal(configBytes, &c))
c.State = realcontainer.NewState()
c.SetRestarting(&realcontainer.ExitStatus{ExitCode: 1})
c.HasBeenStartedBefore = true
configBytes, err = json.Marshal(&c)
assert.NilError(t, err)
assert.NilError(t, ioutil.WriteFile(configPath, configBytes, 0600))
d.Start(t)
ctxTimeout, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
chOk, chErr := client.ContainerWait(ctxTimeout, id, containerapi.WaitConditionNextExit)
select {
case <-chOk:
case err := <-chErr:
assert.NilError(t, err)
}
}

View file

@ -288,6 +288,7 @@ func (d *Daemon) Cleanup(t testing.TB) {
func (d *Daemon) Start(t testing.TB, args ...string) { func (d *Daemon) Start(t testing.TB, args ...string) {
t.Helper() t.Helper()
if err := d.StartWithError(args...); err != nil { if err := d.StartWithError(args...); err != nil {
d.DumpStackAndQuit() // in case the daemon is stuck
t.Fatalf("[%s] failed to start daemon with arguments %v : %v", d.id, d.args, err) t.Fatalf("[%s] failed to start daemon with arguments %v : %v", d.id, d.args, err)
} }
} }