Merge pull request #41817 from simonferquel/desktop-startup-hang
Fix a potential hang when starting after a non-clean shutdown
Commit: ffc4dc9aec

3 changed files with 101 additions and 15 deletions
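In short: after a non-clean shutdown (machine reset, `kill -9` of dockerd), a container can be persisted on disk in "restarting" state while containerd no longer has a live task for it. On the next startup, the old restore logic could route such a container into `daemon.shutdownContainer`, where startup could hang. The diff below reworks that branch so shutdown is only attempted for containers containerd actually reports alive, adds per-container debug logging around each restore step, adds a regression test that fabricates the bad on-disk state, and makes the test harness dump stacks when the daemon fails to start. File paths are inferred from the hunk contexts and should be daemon/daemon.go, integration/container/daemon_linux_test.go, and testutil/daemon/daemon.go.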
daemon/daemon.go

@@ -330,10 +330,15 @@ func (daemon *Daemon) restore() error {

 			daemon.setStateCounter(c)

-			log.WithFields(logrus.Fields{
-				"running": c.IsRunning(),
-				"paused":  c.IsPaused(),
-			}).Debug("restoring container")
+			logger := func(c *container.Container) *logrus.Entry {
+				return log.WithFields(logrus.Fields{
+					"running":    c.IsRunning(),
+					"paused":     c.IsPaused(),
+					"restarting": c.IsRestarting(),
+				})
+			}
+
+			logger(c).Debug("restoring container")

 			var (
 				err error
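The first hunk swaps the one-off `log.WithFields(...)` call for a small helper so every subsequent log line in `restore()` can carry fresh per-container state, including the new `restarting` field. A standalone sketch of the pattern (simplified types, not the moby code):

package main

import "github.com/sirupsen/logrus"

// ctr stands in for moby's *container.Container in this sketch.
type ctr struct {
	running, paused, restarting bool
}

func main() {
	logrus.SetLevel(logrus.DebugLevel)
	c := &ctr{restarting: true}

	// Build the entry at each call site instead of once up front, so the
	// fields reflect the container's state at the moment of logging.
	logger := func(c *ctr) *logrus.Entry {
		return logrus.WithFields(logrus.Fields{
			"running":    c.running,
			"paused":     c.paused,
			"restarting": c.restarting,
		})
	}

	logger(c).Debug("restoring container")
	c.restarting = false
	logger(c).Debug("done restoring container") // fields re-evaluated: restarting=false
}

Because the entry is rebuilt per call, the messages added later in this diff ("done restoring container", "setting stopped state", ...) reflect state transitions rather than a snapshot taken at the top of the function.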
@@ -345,16 +350,24 @@ func (daemon *Daemon) restore() error {

 			alive, _, process, err = daemon.containerd.Restore(context.Background(), c.ID, c.InitializeStdio)
 			if err != nil && !errdefs.IsNotFound(err) {
-				log.WithError(err).Error("failed to restore container with containerd")
+				logger(c).WithError(err).Error("failed to restore container with containerd")
 				return
 			}
-			if !alive && process != nil {
-				ec, exitedAt, err = process.Delete(context.Background())
-				if err != nil && !errdefs.IsNotFound(err) {
-					log.WithError(err).Error("failed to delete container from containerd")
-					return
+			logger(c).Debugf("alive: %v", alive)
+			if !alive {
+				// If process is not nil, clean up the dead container from containerd.
+				// If process is nil then the above `containerd.Restore` returned an errdefs.NotFound error,
+				// and docker's view of the container state will be updated accordingly via SetStopped further down.
+				if process != nil {
+					logger(c).Debug("cleaning up dead container process")
+					ec, exitedAt, err = process.Delete(context.Background())
+					if err != nil && !errdefs.IsNotFound(err) {
+						logger(c).WithError(err).Error("failed to delete container from containerd")
+						return
+					}
 				}
 			} else if !daemon.configStore.LiveRestoreEnabled {
+				logger(c).Debug("shutting down container considered alive by containerd")
 				if err := daemon.shutdownContainer(c); err != nil && !errdefs.IsNotFound(err) {
 					log.WithError(err).Error("error shutting down container")
 					return
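This hunk carries the functional fix. Previously the `else if` was attached to `if !alive && process != nil`, so a container that was not alive but had no containerd process — exactly the post-reset "restarting" case the new test constructs — skipped the first branch and, with live restore disabled, fell into `daemon.shutdownContainer` even though there was no process left to shut down, a path consistent with the startup hang this PR fixes. Attaching the `else if` to plain `if !alive` keeps every not-alive container out of the shutdown path. A condensed before/after sketch of just the branching (hypothetical helpers, not the moby code):

package main

import "fmt"

type proc struct{}

// restoreOld mirrors the removed branching: not-alive with a nil process
// falls through to the shutdown branch.
func restoreOld(alive bool, process *proc, liveRestore bool) string {
	if !alive && process != nil {
		return "delete dead process"
	} else if !liveRestore {
		return "shutdown container" // reached when alive==false, process==nil
	}
	return "leave running"
}

// restoreNew mirrors the added branching: anything not alive is handled in
// one place, and shutdown is reserved for containers that are alive.
func restoreNew(alive bool, process *proc, liveRestore bool) string {
	if !alive {
		if process != nil {
			return "delete dead process"
		}
		return "mark stopped later" // NotFound case; SetStopped runs further down
	} else if !liveRestore {
		return "shutdown container"
	}
	return "leave running"
}

func main() {
	// A "restarting" container after a hard reset: not alive, no process.
	fmt.Println("old:", restoreOld(false, nil, false))
	fmt.Println("new:", restoreNew(false, nil, false))
}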
@@ -363,14 +376,16 @@ func (daemon *Daemon) restore() error {
 			}

 			if c.IsRunning() || c.IsPaused() {
+				logger(c).Debug("syncing container on disk state with real state")
+
 				c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking

 				if c.IsPaused() && alive {
 					s, err := daemon.containerd.Status(context.Background(), c.ID)
 					if err != nil {
-						log.WithError(err).Error("failed to get container status")
+						logger(c).WithError(err).Error("failed to get container status")
 					} else {
-						log.WithField("state", s).Info("restored container paused")
+						logger(c).WithField("state", s).Info("restored container paused")
 						switch s {
 						case containerd.Paused, containerd.Pausing:
 							// nothing to do
@@ -392,6 +407,7 @@ func (daemon *Daemon) restore() error {
 				}

 				if !alive {
+					logger(c).Debug("setting stopped state")
 					c.Lock()
 					c.SetStopped(&container.ExitStatus{ExitCode: int(ec), ExitedAt: exitedAt})
 					daemon.Cleanup(c)
@@ -399,6 +415,7 @@ func (daemon *Daemon) restore() error {
 						log.WithError(err).Error("failed to update stopped container state")
 					}
 					c.Unlock()
+					logger(c).Debug("set stopped state")
 				}

 				// we call Mount and then Unmount to get BaseFs of the container
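Note how the two Debug lines added in this and the previous hunk bracket the critical section (Lock, SetStopped, Cleanup, CheckpointTo, Unlock): "setting stopped state" goes in before the lock and "set stopped state" after the unlock. If a hang recurs here, the daemon log would show a container that entered the block but never left it.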
@@ -409,10 +426,10 @@ func (daemon *Daemon) restore() error {
 					// stopped/restarted/removed.
 					// See #29365 for related information.
 					// The error is only logged here.
-					log.WithError(err).Warn("failed to mount container to get BaseFs path")
+					logger(c).WithError(err).Warn("failed to mount container to get BaseFs path")
 				} else {
 					if err := daemon.Unmount(c); err != nil {
-						log.WithError(err).Warn("failed to umount container to get BaseFs path")
+						logger(c).WithError(err).Warn("failed to umount container to get BaseFs path")
 					}
 				}

@@ -420,7 +437,7 @@ func (daemon *Daemon) restore() error {
 			if !c.HostConfig.NetworkMode.IsContainer() && c.IsRunning() {
 				options, err := daemon.buildSandboxOptions(c)
 				if err != nil {
-					log.WithError(err).Warn("failed to build sandbox option to restore container")
+					logger(c).WithError(err).Warn("failed to build sandbox option to restore container")
 				}
 				mapLock.Lock()
 				activeSandboxes[c.NetworkSettings.SandboxID] = options
@@ -465,6 +482,7 @@ func (daemon *Daemon) restore() error {
 				}
 			}
 			c.Unlock()
+			logger(c).Debug("done restoring container")
 		}(c)
 	}
 	group.Wait()

integration/container/daemon_linux_test.go

@@ -2,13 +2,18 @@ package container // import "github.com/docker/docker/integration/container"

 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"io/ioutil"
+	"path/filepath"
 	"strconv"
 	"strings"
 	"testing"
+	"time"

 	"github.com/docker/docker/api/types"
+	containerapi "github.com/docker/docker/api/types/container"
+	realcontainer "github.com/docker/docker/container"
 	"github.com/docker/docker/integration/internal/container"
 	"github.com/docker/docker/testutil/daemon"
 	"golang.org/x/sys/unix"
@@ -166,3 +171,65 @@ func TestDaemonHostGatewayIP(t *testing.T) {
 	d.Stop(t)

 }
+
+// TestRestartDaemonWithRestartingContainer simulates a case where a container is in "restarting" state when
+// dockerd is killed (due to machine reset or something else).
+//
+// Related to moby/moby#41817
+//
+// In this test we'll change the container state to "restarting".
+// This means that the container will not be 'alive' when we attempt to restore it on daemon startup.
+//
+// We could do the same with `docker run -d --restart=always busybox:latest exit 1`, and then
+// `kill -9` dockerd while the container is in "restarting" state. This is difficult to reproduce reliably
+// in an automated test, so we manipulate the on-disk state instead.
+func TestRestartDaemonWithRestartingContainer(t *testing.T) {
+	skip.If(t, testEnv.IsRemoteDaemon, "cannot start daemon on remote test run")
+	skip.If(t, testEnv.DaemonInfo.OSType == "windows")
+
+	t.Parallel()
+
+	d := daemon.New(t)
+	defer d.Cleanup(t)
+
+	d.StartWithBusybox(t, "--iptables=false")
+	defer d.Kill()
+
+	ctx := context.Background()
+	client := d.NewClientT(t)
+
+	// Just create the container, no need to start it.
+	// We really want to make sure there is no process running when docker starts back up.
+	// We will manipulate the on-disk state later.
+	id := container.Create(ctx, t, client, container.WithRestartPolicy("always"), container.WithCmd("/bin/sh", "-c", "exit 1"))
+
+	// SIGKILL the daemon
+	assert.NilError(t, d.Kill())
+
+	configPath := filepath.Join(d.Root, "containers", id, "config.v2.json")
+	configBytes, err := ioutil.ReadFile(configPath)
+	assert.NilError(t, err)
+
+	var c realcontainer.Container
+
+	assert.NilError(t, json.Unmarshal(configBytes, &c))
+
+	c.State = realcontainer.NewState()
+	c.SetRestarting(&realcontainer.ExitStatus{ExitCode: 1})
+	c.HasBeenStartedBefore = true
+
+	configBytes, err = json.Marshal(&c)
+	assert.NilError(t, err)
+	assert.NilError(t, ioutil.WriteFile(configPath, configBytes, 0600))
+
+	d.Start(t)
+
+	ctxTimeout, cancel := context.WithTimeout(ctx, 30*time.Second)
+	defer cancel()
+	chOk, chErr := client.ContainerWait(ctxTimeout, id, containerapi.WaitConditionNextExit)
+	select {
+	case <-chOk:
+	case err := <-chErr:
+		assert.NilError(t, err)
+	}
+}
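The test fabricates the crash artifact directly: it kills the daemon, rewrites the container's config.v2.json so the persisted state is "restarting" (with HasBeenStartedBefore set, as a previously started container would have), and starts the daemon again. Success means restore marks the container stopped, the `always` restart policy starts it, it exits, and ContainerWait observes that next exit within the 30-second timeout; a daemon that hangs during restore instead fails fast in d.Start(t). With the usual moby integration-test environment in place, it should be runnable in isolation with something like `go test ./integration/container -run TestRestartDaemonWithRestartingContainer`.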
testutil/daemon/daemon.go

@@ -288,6 +288,7 @@ func (d *Daemon) Cleanup(t testing.TB) {
 func (d *Daemon) Start(t testing.TB, args ...string) {
 	t.Helper()
 	if err := d.StartWithError(args...); err != nil {
+		d.DumpStackAndQuit() // in case the daemon is stuck
 		t.Fatalf("[%s] failed to start daemon with arguments %v : %v", d.id, d.args, err)
 	}
 }
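This one-line change to the test harness is what turns a future regression from a silent CI timeout into something debuggable: if the daemon never finishes starting, DumpStackAndQuit grabs goroutine stack traces from the stuck process before the test is failed (on Unix this kind of dump is conventionally triggered with SIGQUIT). For reference, the same view can be produced in-process with the Go runtime's goroutine profile — a minimal sketch, not moby code:

package main

import (
	"os"
	"runtime/pprof"
)

func main() {
	// debug=2 prints every goroutine's full stack: which goroutine is
	// blocked, and on what. This is the view you need for a hung daemon.
	pprof.Lookup("goroutine").WriteTo(os.Stderr, 2)
}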