Windows: Retry loop to fix HCS race condition enumerating containers
Signed-off-by: John Howard <jhoward@microsoft.com> This fixes https://github.com/docker/docker/issues/30278 where there is a race condition in HCS for RS1 and RS2 builds, and enumeration of compute systems can return access is denied if a silo is being torn down in the kernel while HCS is attempting to enumerate them.
This commit is contained in:
parent
9983a7b88f
commit
83a2db2097
1 changed files with 24 additions and 7 deletions
|
@ -31,6 +31,7 @@ import (
|
|||
"github.com/docker/docker/pkg/ioutils"
|
||||
"github.com/docker/docker/pkg/longpath"
|
||||
"github.com/docker/docker/pkg/reexec"
|
||||
"github.com/docker/docker/pkg/system"
|
||||
units "github.com/docker/go-units"
|
||||
"golang.org/x/sys/windows"
|
||||
)
|
||||
|
@ -265,19 +266,35 @@ func (d *Driver) Remove(id string) error {
|
|||
// it is a transient error. Retry until it succeeds.
|
||||
var computeSystems []hcsshim.ContainerProperties
|
||||
retryCount := 0
|
||||
osv := system.GetOSVersion()
|
||||
for {
|
||||
// Get and terminate any template VMs that are currently using the layer
|
||||
// Get and terminate any template VMs that are currently using the layer.
|
||||
// Note: It is unfortunate that we end up in the graphdrivers Remove() call
|
||||
// for both containers and images, but the logic for template VMs is only
|
||||
// needed for images - specifically we are looking to see if a base layer
|
||||
// is in use by a template VM as a result of having started a Hyper-V
|
||||
// container at some point.
|
||||
//
|
||||
// We have a retry loop for ErrVmcomputeOperationInvalidState and
|
||||
// ErrVmcomputeOperationAccessIsDenied as there is a race condition
|
||||
// in RS1 and RS2 building during enumeration when a silo is going away
|
||||
// for example under it, in HCS. AccessIsDenied added to fix 30278.
|
||||
//
|
||||
// TODO @jhowardmsft - For RS3, we can remove the retries. Also consider
|
||||
// using platform APIs (if available) to get this more succinctly. Also
|
||||
// consider enlighting the Remove() interface to have context of why
|
||||
// the remove is being called - that could improve efficiency by not
|
||||
// enumerating compute systems during a remove of a container as it's
|
||||
// not required.
|
||||
computeSystems, err = hcsshim.GetContainers(hcsshim.ComputeSystemQuery{})
|
||||
if err != nil {
|
||||
if err == hcsshim.ErrVmcomputeOperationInvalidState {
|
||||
if retryCount >= 5 {
|
||||
// If we are unable to get the list of containers
|
||||
// go ahead and attempt to delete the layer anyway
|
||||
// as it will most likely work.
|
||||
if (osv.Build < 15139) &&
|
||||
((err == hcsshim.ErrVmcomputeOperationInvalidState) || (err == hcsshim.ErrVmcomputeOperationAccessIsDenied)) {
|
||||
if retryCount >= 500 {
|
||||
break
|
||||
}
|
||||
retryCount++
|
||||
time.Sleep(2 * time.Second)
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
continue
|
||||
}
|
||||
return err
|
||||
|
|
Loading…
Add table
Reference in a new issue