aufs: retry unmount on EBUSY
For some reason, retrying to unmount in case of getting EBUSY error was only performed in Remove(), but not Put(). I have done some testing on Ubuntu 16.04 and 18.04 with aufs, performing massively parallel container creation using this script: ``` NUMCTS=5000 PARALLEL=100 IMAGE=busybox docker pull $IMAGE >/dev/null seq $NUMCTS | parallel -j$PARALLEL docker create $IMAGE true > /dev/null docker ps -qa | shuf | tail -n $NUMCTS | parallel -j$PARALLEL docker rm -f '{}' > /dev/null ``` Sometimes (1 to 5 times per 10000 `docker create`), aufs.Put() fails on Unmount syscall with EBUSY during container creation: > Error response from daemon: device or resource busy and in docker log, with debug turned on: > level=debug msg="Failed to unmount ID-init aufs: device or resource busy" > level=error msg="Handler for POST /v1.30/containers/create returned error: device or resource busy" I did some debugging by running fuser -v -M -m $MOUNT_POINT but that reveals nothing. This commit: * implements retry on EBUSY in Unmount() * calls Unmount() from Remove() * increases the number of retries from 3 to 5 Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
parent
72ceac6a74
commit
57f06409b1
2 changed files with 27 additions and 34 deletions
|
@ -34,7 +34,6 @@ import (
|
|||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/daemon/graphdriver"
|
||||
"github.com/docker/docker/pkg/archive"
|
||||
|
@ -307,33 +306,9 @@ func (a *Driver) Remove(id string) error {
|
|||
mountpoint = a.getMountpoint(id)
|
||||
}
|
||||
|
||||
logger := logger.WithField("layer", id)
|
||||
|
||||
var retries int
|
||||
for {
|
||||
mounted, err := a.mounted(mountpoint)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !mounted {
|
||||
break
|
||||
}
|
||||
|
||||
err = a.unmount(mountpoint)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
|
||||
if errors.Cause(err) != unix.EBUSY {
|
||||
return errors.Wrap(err, "aufs: unmount error")
|
||||
}
|
||||
if retries >= 5 {
|
||||
return errors.Wrap(err, "aufs: unmount error after retries")
|
||||
}
|
||||
// If unmount returns EBUSY, it could be a transient error. Sleep and retry.
|
||||
retries++
|
||||
logger.Warnf("unmount failed due to EBUSY: retry count: %d", retries)
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
if err := a.unmount(mountpoint); err != nil {
|
||||
logger.WithError(err).WithField("method", "Remove()").Warn()
|
||||
return err
|
||||
}
|
||||
|
||||
// Remove the layers file for the id
|
||||
|
|
|
@ -5,17 +5,18 @@ package aufs // import "github.com/docker/docker/daemon/graphdriver/aufs"
|
|||
import (
|
||||
"os/exec"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/pkg/mount"
|
||||
"github.com/pkg/errors"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// Unmount the target specified.
|
||||
func Unmount(target string) error {
|
||||
const (
|
||||
EINVAL = 22 // if auplink returns this,
|
||||
retries = 3 // retry a few times
|
||||
)
|
||||
const retries = 5
|
||||
|
||||
// auplink flush
|
||||
for i := 0; ; i++ {
|
||||
out, err := exec.Command("auplink", target, "flush").CombinedOutput()
|
||||
if err == nil {
|
||||
|
@ -27,7 +28,7 @@ func Unmount(target string) error {
|
|||
rc = status.ExitStatus()
|
||||
}
|
||||
}
|
||||
if i >= retries || rc != EINVAL {
|
||||
if i >= retries || rc != int(unix.EINVAL) {
|
||||
logger.WithError(err).WithField("method", "Unmount").Warnf("auplink flush failed: %s", out)
|
||||
break
|
||||
}
|
||||
|
@ -37,5 +38,22 @@ func Unmount(target string) error {
|
|||
logger.Debugf("auplink flush error (retrying %d/%d): %s", i+1, retries, out)
|
||||
}
|
||||
|
||||
return mount.Unmount(target)
|
||||
// unmount
|
||||
var err error
|
||||
for i := 0; i < retries; i++ {
|
||||
err = mount.Unmount(target)
|
||||
switch errors.Cause(err) {
|
||||
case nil:
|
||||
return nil
|
||||
case unix.EBUSY:
|
||||
logger.Debugf("aufs unmount %s failed with EBUSY (retrying %d/%d)", target, i+1, retries)
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
continue // try again
|
||||
default:
|
||||
// any other error is fatal
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue