Merge pull request #41817 from simonferquel/desktop-startup-hang
Fix a potential hang when starting after a non-clean shutdown
This commit is contained in:
commit
ffc4dc9aec
3 changed files with 101 additions and 15 deletions
|
@ -330,10 +330,15 @@ func (daemon *Daemon) restore() error {
|
|||
|
||||
daemon.setStateCounter(c)
|
||||
|
||||
log.WithFields(logrus.Fields{
|
||||
"running": c.IsRunning(),
|
||||
"paused": c.IsPaused(),
|
||||
}).Debug("restoring container")
|
||||
logger := func(c *container.Container) *logrus.Entry {
|
||||
return log.WithFields(logrus.Fields{
|
||||
"running": c.IsRunning(),
|
||||
"paused": c.IsPaused(),
|
||||
"restarting": c.IsRestarting(),
|
||||
})
|
||||
}
|
||||
|
||||
logger(c).Debug("restoring container")
|
||||
|
||||
var (
|
||||
err error
|
||||
|
@ -345,16 +350,24 @@ func (daemon *Daemon) restore() error {
|
|||
|
||||
alive, _, process, err = daemon.containerd.Restore(context.Background(), c.ID, c.InitializeStdio)
|
||||
if err != nil && !errdefs.IsNotFound(err) {
|
||||
log.WithError(err).Error("failed to restore container with containerd")
|
||||
logger(c).WithError(err).Error("failed to restore container with containerd")
|
||||
return
|
||||
}
|
||||
if !alive && process != nil {
|
||||
ec, exitedAt, err = process.Delete(context.Background())
|
||||
if err != nil && !errdefs.IsNotFound(err) {
|
||||
log.WithError(err).Error("failed to delete container from containerd")
|
||||
return
|
||||
logger(c).Debugf("alive: %v", alive)
|
||||
if !alive {
|
||||
// If process is not nil, cleanup dead container from containerd.
|
||||
// If process is nil then the above `containerd.Restore` returned an errdefs.NotFoundError,
|
||||
// and docker's view of the container state will be updated accorrdingly via SetStopped further down.
|
||||
if process != nil {
|
||||
logger(c).Debug("cleaning up dead container process")
|
||||
ec, exitedAt, err = process.Delete(context.Background())
|
||||
if err != nil && !errdefs.IsNotFound(err) {
|
||||
logger(c).WithError(err).Error("failed to delete container from containerd")
|
||||
return
|
||||
}
|
||||
}
|
||||
} else if !daemon.configStore.LiveRestoreEnabled {
|
||||
logger(c).Debug("shutting down container considered alive by containerd")
|
||||
if err := daemon.shutdownContainer(c); err != nil && !errdefs.IsNotFound(err) {
|
||||
log.WithError(err).Error("error shutting down container")
|
||||
return
|
||||
|
@ -363,14 +376,16 @@ func (daemon *Daemon) restore() error {
|
|||
}
|
||||
|
||||
if c.IsRunning() || c.IsPaused() {
|
||||
logger(c).Debug("syncing container on disk state with real state")
|
||||
|
||||
c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking
|
||||
|
||||
if c.IsPaused() && alive {
|
||||
s, err := daemon.containerd.Status(context.Background(), c.ID)
|
||||
if err != nil {
|
||||
log.WithError(err).Error("failed to get container status")
|
||||
logger(c).WithError(err).Error("failed to get container status")
|
||||
} else {
|
||||
log.WithField("state", s).Info("restored container paused")
|
||||
logger(c).WithField("state", s).Info("restored container paused")
|
||||
switch s {
|
||||
case containerd.Paused, containerd.Pausing:
|
||||
// nothing to do
|
||||
|
@ -392,6 +407,7 @@ func (daemon *Daemon) restore() error {
|
|||
}
|
||||
|
||||
if !alive {
|
||||
logger(c).Debug("setting stopped state")
|
||||
c.Lock()
|
||||
c.SetStopped(&container.ExitStatus{ExitCode: int(ec), ExitedAt: exitedAt})
|
||||
daemon.Cleanup(c)
|
||||
|
@ -399,6 +415,7 @@ func (daemon *Daemon) restore() error {
|
|||
log.WithError(err).Error("failed to update stopped container state")
|
||||
}
|
||||
c.Unlock()
|
||||
logger(c).Debug("set stopped state")
|
||||
}
|
||||
|
||||
// we call Mount and then Unmount to get BaseFs of the container
|
||||
|
@ -409,10 +426,10 @@ func (daemon *Daemon) restore() error {
|
|||
// stopped/restarted/removed.
|
||||
// See #29365 for related information.
|
||||
// The error is only logged here.
|
||||
log.WithError(err).Warn("failed to mount container to get BaseFs path")
|
||||
logger(c).WithError(err).Warn("failed to mount container to get BaseFs path")
|
||||
} else {
|
||||
if err := daemon.Unmount(c); err != nil {
|
||||
log.WithError(err).Warn("failed to umount container to get BaseFs path")
|
||||
logger(c).WithError(err).Warn("failed to umount container to get BaseFs path")
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -420,7 +437,7 @@ func (daemon *Daemon) restore() error {
|
|||
if !c.HostConfig.NetworkMode.IsContainer() && c.IsRunning() {
|
||||
options, err := daemon.buildSandboxOptions(c)
|
||||
if err != nil {
|
||||
log.WithError(err).Warn("failed to build sandbox option to restore container")
|
||||
logger(c).WithError(err).Warn("failed to build sandbox option to restore container")
|
||||
}
|
||||
mapLock.Lock()
|
||||
activeSandboxes[c.NetworkSettings.SandboxID] = options
|
||||
|
@ -465,6 +482,7 @@ func (daemon *Daemon) restore() error {
|
|||
}
|
||||
}
|
||||
c.Unlock()
|
||||
logger(c).Debug("done restoring container")
|
||||
}(c)
|
||||
}
|
||||
group.Wait()
|
||||
|
|
|
@ -2,13 +2,18 @@ package container // import "github.com/docker/docker/integration/container"
|
|||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/api/types"
|
||||
containerapi "github.com/docker/docker/api/types/container"
|
||||
realcontainer "github.com/docker/docker/container"
|
||||
"github.com/docker/docker/integration/internal/container"
|
||||
"github.com/docker/docker/testutil/daemon"
|
||||
"golang.org/x/sys/unix"
|
||||
|
@ -166,3 +171,65 @@ func TestDaemonHostGatewayIP(t *testing.T) {
|
|||
d.Stop(t)
|
||||
|
||||
}
|
||||
|
||||
// TestRestartDaemonWithRestartingContainer simulates a case where a container is in "restarting" state when
|
||||
// dockerd is killed (due to machine reset or something else).
|
||||
//
|
||||
// Related to moby/moby#41817
|
||||
//
|
||||
// In this test we'll change the container state to "restarting".
|
||||
// This means that the container will not be 'alive' when we attempt to restore in on daemon startup.
|
||||
//
|
||||
// We could do the same with `docker run -d --resetart=always busybox:latest exit 1`, and then
|
||||
// `kill -9` dockerd while the container is in "restarting" state. This is difficult to reproduce reliably
|
||||
// in an automated test, so we manipulate on disk state instead.
|
||||
func TestRestartDaemonWithRestartingContainer(t *testing.T) {
|
||||
skip.If(t, testEnv.IsRemoteDaemon, "cannot start daemon on remote test run")
|
||||
skip.If(t, testEnv.DaemonInfo.OSType == "windows")
|
||||
|
||||
t.Parallel()
|
||||
|
||||
d := daemon.New(t)
|
||||
defer d.Cleanup(t)
|
||||
|
||||
d.StartWithBusybox(t, "--iptables=false")
|
||||
defer d.Kill()
|
||||
|
||||
ctx := context.Background()
|
||||
client := d.NewClientT(t)
|
||||
|
||||
// Just create the container, no need to start it to be started.
|
||||
// We really want to make sure there is no process running when docker starts back up.
|
||||
// We will manipulate the on disk state later
|
||||
id := container.Create(ctx, t, client, container.WithRestartPolicy("always"), container.WithCmd("/bin/sh", "-c", "exit 1"))
|
||||
|
||||
// SIGKILL the daemon
|
||||
assert.NilError(t, d.Kill())
|
||||
|
||||
configPath := filepath.Join(d.Root, "containers", id, "config.v2.json")
|
||||
configBytes, err := ioutil.ReadFile(configPath)
|
||||
assert.NilError(t, err)
|
||||
|
||||
var c realcontainer.Container
|
||||
|
||||
assert.NilError(t, json.Unmarshal(configBytes, &c))
|
||||
|
||||
c.State = realcontainer.NewState()
|
||||
c.SetRestarting(&realcontainer.ExitStatus{ExitCode: 1})
|
||||
c.HasBeenStartedBefore = true
|
||||
|
||||
configBytes, err = json.Marshal(&c)
|
||||
assert.NilError(t, err)
|
||||
assert.NilError(t, ioutil.WriteFile(configPath, configBytes, 0600))
|
||||
|
||||
d.Start(t)
|
||||
|
||||
ctxTimeout, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
chOk, chErr := client.ContainerWait(ctxTimeout, id, containerapi.WaitConditionNextExit)
|
||||
select {
|
||||
case <-chOk:
|
||||
case err := <-chErr:
|
||||
assert.NilError(t, err)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -288,6 +288,7 @@ func (d *Daemon) Cleanup(t testing.TB) {
|
|||
func (d *Daemon) Start(t testing.TB, args ...string) {
|
||||
t.Helper()
|
||||
if err := d.StartWithError(args...); err != nil {
|
||||
d.DumpStackAndQuit() // in case the daemon is stuck
|
||||
t.Fatalf("[%s] failed to start daemon with arguments %v : %v", d.id, d.args, err)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue