state/Wait: Fix race when reading exit status
Before this change there was a race condition between State.Wait reading the exit code from State and the State being changed instantly after the change which ended the State.Wait. Now, each State.Wait has its own channel which is used to transmit the desired StateStatus at the time the state transitions to the awaited one. Wait no longer reads the status by itself so there is no race. The issue caused the `docker run --restart=always ...' to sometimes exit with 0 exit code, because the process was already restarted by the time State.Wait got the chance to read the exit code. Test run -------- Before: ``` $ go test -count 1 -run TestCorrectStateWaitResultAfterRestart . --- FAIL: TestCorrectStateWaitResultAfterRestart (0.00s) state_test.go:198: expected exit code 10, got 0 FAIL FAIL github.com/docker/docker/container 0.011s FAIL ``` After: ``` $ go test -count 1 -run TestCorrectStateWaitResultAfterRestart . ok github.com/docker/docker/container 0.011s ``` Signed-off-by: Paweł Gronowski <pawel.gronowski@docker.com>
This commit is contained in:
parent
826003ecae
commit
85b9568d0e
3 changed files with 88 additions and 57 deletions
|
@ -81,17 +81,18 @@ type Container struct {
|
|||
Driver string
|
||||
OS string
|
||||
// MountLabel contains the options for the 'mount' command
|
||||
MountLabel string
|
||||
ProcessLabel string
|
||||
RestartCount int
|
||||
HasBeenStartedBefore bool
|
||||
HasBeenManuallyStopped bool // used for unless-stopped restart policy
|
||||
MountPoints map[string]*volumemounts.MountPoint
|
||||
HostConfig *containertypes.HostConfig `json:"-"` // do not serialize the host config in the json, otherwise we'll make the container unportable
|
||||
ExecCommands *exec.Store `json:"-"`
|
||||
DependencyStore agentexec.DependencyGetter `json:"-"`
|
||||
SecretReferences []*swarmtypes.SecretReference
|
||||
ConfigReferences []*swarmtypes.ConfigReference
|
||||
MountLabel string
|
||||
ProcessLabel string
|
||||
RestartCount int
|
||||
HasBeenStartedBefore bool
|
||||
HasBeenManuallyStopped bool // used for unless-stopped restart policy
|
||||
HasBeenManuallyRestarted bool `json:"-"` // used to distinguish restart caused by restart policy from the manual one
|
||||
MountPoints map[string]*volumemounts.MountPoint
|
||||
HostConfig *containertypes.HostConfig `json:"-"` // do not serialize the host config in the json, otherwise we'll make the container unportable
|
||||
ExecCommands *exec.Store `json:"-"`
|
||||
DependencyStore agentexec.DependencyGetter `json:"-"`
|
||||
SecretReferences []*swarmtypes.SecretReference
|
||||
ConfigReferences []*swarmtypes.ConfigReference
|
||||
// logDriver for closing
|
||||
LogDriver logger.Logger `json:"-"`
|
||||
LogCopier *logger.Copier `json:"-"`
|
||||
|
|
|
@ -32,9 +32,10 @@ type State struct {
|
|||
StartedAt time.Time
|
||||
FinishedAt time.Time
|
||||
Health *Health
|
||||
Removed bool `json:"-"`
|
||||
|
||||
waitStop chan struct{}
|
||||
waitRemove chan struct{}
|
||||
stopWaiters []chan<- StateStatus
|
||||
removeOnlyWaiters []chan<- StateStatus
|
||||
}
|
||||
|
||||
// StateStatus is used to return container wait results.
|
||||
|
@ -57,12 +58,9 @@ func (s StateStatus) Err() error {
|
|||
return s.err
|
||||
}
|
||||
|
||||
// NewState creates a default state object with a fresh channel for state changes.
|
||||
// NewState creates a default state object.
|
||||
func NewState() *State {
|
||||
return &State{
|
||||
waitStop: make(chan struct{}),
|
||||
waitRemove: make(chan struct{}),
|
||||
}
|
||||
return &State{}
|
||||
}
|
||||
|
||||
// String returns a human-readable description of the state
|
||||
|
@ -182,11 +180,10 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS
|
|||
s.Lock()
|
||||
defer s.Unlock()
|
||||
|
||||
if condition == WaitConditionNotRunning && !s.Running {
|
||||
// Buffer so we can put it in the channel now.
|
||||
resultC := make(chan StateStatus, 1)
|
||||
// Buffer so we can put status and finish even nobody receives it.
|
||||
resultC := make(chan StateStatus, 1)
|
||||
|
||||
// Send the current status.
|
||||
if s.conditionAlreadyMet(condition) {
|
||||
resultC <- StateStatus{
|
||||
exitCode: s.ExitCode(),
|
||||
err: s.Err(),
|
||||
|
@ -195,20 +192,17 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS
|
|||
return resultC
|
||||
}
|
||||
|
||||
// If we are waiting only for removal, the waitStop channel should
|
||||
// remain nil and block forever.
|
||||
var waitStop chan struct{}
|
||||
if condition < WaitConditionRemoved {
|
||||
waitStop = s.waitStop
|
||||
waitC := make(chan StateStatus, 1)
|
||||
|
||||
// Removal wakes up both removeOnlyWaiters and stopWaiters
|
||||
// Container could be removed while still in "created" state
|
||||
// in which case it is never actually stopped
|
||||
if condition == WaitConditionRemoved {
|
||||
s.removeOnlyWaiters = append(s.removeOnlyWaiters, waitC)
|
||||
} else {
|
||||
s.stopWaiters = append(s.stopWaiters, waitC)
|
||||
}
|
||||
|
||||
// Always wait for removal, just in case the container gets removed
|
||||
// while it is still in a "created" state, in which case it is never
|
||||
// actually stopped.
|
||||
waitRemove := s.waitRemove
|
||||
|
||||
resultC := make(chan StateStatus, 1)
|
||||
|
||||
go func() {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
|
@ -218,23 +212,25 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS
|
|||
err: ctx.Err(),
|
||||
}
|
||||
return
|
||||
case <-waitStop:
|
||||
case <-waitRemove:
|
||||
case status := <-waitC:
|
||||
resultC <- status
|
||||
}
|
||||
|
||||
s.Lock()
|
||||
result := StateStatus{
|
||||
exitCode: s.ExitCode(),
|
||||
err: s.Err(),
|
||||
}
|
||||
s.Unlock()
|
||||
|
||||
resultC <- result
|
||||
}()
|
||||
|
||||
return resultC
|
||||
}
|
||||
|
||||
func (s *State) conditionAlreadyMet(condition WaitCondition) bool {
|
||||
switch condition {
|
||||
case WaitConditionNotRunning:
|
||||
return !s.Running
|
||||
case WaitConditionRemoved:
|
||||
return s.Removed
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// IsRunning returns whether the running flag is set. Used by Container to check whether a container is running.
|
||||
func (s *State) IsRunning() bool {
|
||||
s.Lock()
|
||||
|
@ -292,8 +288,8 @@ func (s *State) SetStopped(exitStatus *ExitStatus) {
|
|||
}
|
||||
s.ExitCodeValue = exitStatus.ExitCode
|
||||
s.OOMKilled = exitStatus.OOMKilled
|
||||
close(s.waitStop) // fire waiters for stop
|
||||
s.waitStop = make(chan struct{})
|
||||
|
||||
s.notifyAndClear(&s.stopWaiters)
|
||||
}
|
||||
|
||||
// SetRestarting sets the container state to "restarting" without locking.
|
||||
|
@ -308,8 +304,8 @@ func (s *State) SetRestarting(exitStatus *ExitStatus) {
|
|||
s.FinishedAt = time.Now().UTC()
|
||||
s.ExitCodeValue = exitStatus.ExitCode
|
||||
s.OOMKilled = exitStatus.OOMKilled
|
||||
close(s.waitStop) // fire waiters for stop
|
||||
s.waitStop = make(chan struct{})
|
||||
|
||||
s.notifyAndClear(&s.stopWaiters)
|
||||
}
|
||||
|
||||
// SetError sets the container's error state. This is useful when we want to
|
||||
|
@ -374,22 +370,19 @@ func (s *State) IsDead() bool {
|
|||
return res
|
||||
}
|
||||
|
||||
// SetRemoved assumes this container is already in the "dead" state and
|
||||
// closes the internal waitRemove channel to unblock callers waiting for a
|
||||
// container to be removed.
|
||||
// SetRemoved assumes this container is already in the "dead" state and notifies all waiters.
|
||||
func (s *State) SetRemoved() {
|
||||
s.SetRemovalError(nil)
|
||||
}
|
||||
|
||||
// SetRemovalError is to be called in case a container remove failed.
|
||||
// It sets an error and closes the internal waitRemove channel to unblock
|
||||
// callers waiting for the container to be removed.
|
||||
// It sets an error and notifies all waiters.
|
||||
func (s *State) SetRemovalError(err error) {
|
||||
s.SetError(err)
|
||||
s.Lock()
|
||||
close(s.waitRemove) // Unblock those waiting on remove.
|
||||
// Recreate the channel so next ContainerWait will work
|
||||
s.waitRemove = make(chan struct{})
|
||||
s.Removed = true
|
||||
s.notifyAndClear(&s.removeOnlyWaiters)
|
||||
s.notifyAndClear(&s.stopWaiters)
|
||||
s.Unlock()
|
||||
}
|
||||
|
||||
|
@ -400,3 +393,15 @@ func (s *State) Err() error {
|
|||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *State) notifyAndClear(waiters *[]chan<- StateStatus) {
|
||||
result := StateStatus{
|
||||
exitCode: s.ExitCodeValue,
|
||||
err: s.Err(),
|
||||
}
|
||||
|
||||
for _, c := range *waiters {
|
||||
c <- result
|
||||
}
|
||||
*waiters = nil
|
||||
}
|
||||
|
|
|
@ -169,6 +169,31 @@ func TestStateTimeoutWait(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// Related issue: #39352
|
||||
func TestCorrectStateWaitResultAfterRestart(t *testing.T) {
|
||||
s := NewState()
|
||||
|
||||
s.Lock()
|
||||
s.SetRunning(0, true)
|
||||
s.Unlock()
|
||||
|
||||
waitC := s.Wait(context.Background(), WaitConditionNotRunning)
|
||||
want := ExitStatus{ExitCode: 10, ExitedAt: time.Now()}
|
||||
|
||||
s.Lock()
|
||||
s.SetRestarting(&want)
|
||||
s.Unlock()
|
||||
|
||||
s.Lock()
|
||||
s.SetRunning(0, true)
|
||||
s.Unlock()
|
||||
|
||||
got := <-waitC
|
||||
if got.exitCode != want.ExitCode {
|
||||
t.Fatalf("expected exit code %v, got %v", want.ExitCode, got.exitCode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsValidStateString(t *testing.T) {
|
||||
states := []struct {
|
||||
state string
|
||||
|
|
Loading…
Reference in a new issue