aufs: retry unmount on EBUSY

For some reason, retrying to unmount in case of getting EBUSY error
was only performed in Remove(), but not Put().

I have done some testing on Ubuntu 16.04 and 18.04 with aufs,
performing massively parallel container creation using this script:

```
NUMCTS=5000
PARALLEL=100
IMAGE=busybox

docker pull $IMAGE >/dev/null
seq $NUMCTS | parallel -j$PARALLEL docker create $IMAGE true > /dev/null
docker ps -qa | shuf | tail -n $NUMCTS | parallel -j$PARALLEL docker rm -f '{}' > /dev/null
```

Sometimes (1 to 5 times per 10000 `docker create`), aufs.Put() fails on Unmount syscall
with EBUSY during container creation:

> Error response from daemon: device or resource busy

and in docker log, with debug turned on:

> level=debug msg="Failed to unmount ID-init aufs: device or resource busy"
> level=error msg="Handler for POST /v1.30/containers/create returned error: device or resource busy"

I did some debugging by running fuser -v -M -m $MOUNT_POINT but
that reveals nothing.

This commit:

 * implements retry on EBUSY in Unmount()
 * calls Unmount() from Remove()
 * increases the number of retries from 3 to 5

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
Kir Kolyshkin 2019-05-24 15:22:10 -07:00
parent 72ceac6a74
commit 57f06409b1
2 changed files with 27 additions and 34 deletions

View file

@ -34,7 +34,6 @@ import (
"path/filepath"
"strings"
"sync"
"time"
"github.com/docker/docker/daemon/graphdriver"
"github.com/docker/docker/pkg/archive"
@ -307,33 +306,9 @@ func (a *Driver) Remove(id string) error {
mountpoint = a.getMountpoint(id)
}
logger := logger.WithField("layer", id)
var retries int
for {
mounted, err := a.mounted(mountpoint)
if err != nil {
return err
}
if !mounted {
break
}
err = a.unmount(mountpoint)
if err == nil {
break
}
if errors.Cause(err) != unix.EBUSY {
return errors.Wrap(err, "aufs: unmount error")
}
if retries >= 5 {
return errors.Wrap(err, "aufs: unmount error after retries")
}
// If unmount returns EBUSY, it could be a transient error. Sleep and retry.
retries++
logger.Warnf("unmount failed due to EBUSY: retry count: %d", retries)
time.Sleep(100 * time.Millisecond)
if err := a.unmount(mountpoint); err != nil {
logger.WithError(err).WithField("method", "Remove()").Warn()
return err
}
// Remove the layers file for the id

View file

@ -5,17 +5,18 @@ package aufs // import "github.com/docker/docker/daemon/graphdriver/aufs"
import (
"os/exec"
"syscall"
"time"
"github.com/docker/docker/pkg/mount"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// Unmount the target specified.
func Unmount(target string) error {
const (
EINVAL = 22 // if auplink returns this,
retries = 3 // retry a few times
)
const retries = 5
// auplink flush
for i := 0; ; i++ {
out, err := exec.Command("auplink", target, "flush").CombinedOutput()
if err == nil {
@ -27,7 +28,7 @@ func Unmount(target string) error {
rc = status.ExitStatus()
}
}
if i >= retries || rc != EINVAL {
if i >= retries || rc != int(unix.EINVAL) {
logger.WithError(err).WithField("method", "Unmount").Warnf("auplink flush failed: %s", out)
break
}
@ -37,5 +38,22 @@ func Unmount(target string) error {
logger.Debugf("auplink flush error (retrying %d/%d): %s", i+1, retries, out)
}
return mount.Unmount(target)
// unmount
var err error
for i := 0; i < retries; i++ {
err = mount.Unmount(target)
switch errors.Cause(err) {
case nil:
return nil
case unix.EBUSY:
logger.Debugf("aufs unmount %s failed with EBUSY (retrying %d/%d)", target, i+1, retries)
time.Sleep(100 * time.Millisecond)
continue // try again
default:
// any other error is fatal
break
}
}
return err
}