volumes: Implement subpath mount

`VolumeOptions` now has a `Subpath` field which allows to specify a path relative to the volume that should be mounted as a destination. Symlinks are supported, but they cannot escape the base volume directory. Signed-off-by: Paweł Gronowski <pawel.gronowski@docker.com>
2023-06-09 20:43:45 +02:00 · 2023-06-09 20:43:45 +02:00 · bfb810445c
commit bfb810445c
parent f2e1105056
20 changed files with 727 additions and 30 deletions
--- a/api/server/router/container/container_routes.go
+++ b/api/server/router/container/container_routes.go
@ -628,6 +628,14 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo
 		}
 	}
 	if versions.LessThan(version, "1.45") {
 		for _, m := range hostConfig.Mounts {
 			if m.VolumeOptions != nil && m.VolumeOptions.Subpath != "" {
 				return errdefs.InvalidParameter(errors.New("VolumeOptions.Subpath needs API v1.45 or newer"))
 			}
 		}
 	}
 	var warnings []string
 	if warn, err := handleMACAddressBC(config, hostConfig, networkingConfig, version); err != nil {
 		return err
--- a/api/swagger.yaml
+++ b/api/swagger.yaml
@ -423,6 +423,10 @@ definitions:
                type: "object"
                additionalProperties:
                  type: "string"
          Subpath:
            description: "Source path inside the volume. Must be relative without any back traversals."
            type: "string"
            example: "dir-inside-volume/subdirectory"
      TmpfsOptions:
        description: "Optional configuration for the `tmpfs` type."
        type: "object"
--- a/api/types/mount/mount.go
+++ b/api/types/mount/mount.go
@ -96,6 +96,7 @@ type BindOptions struct {
 type VolumeOptions struct {
 	NoCopy       bool              `json:",omitempty"`
 	Labels       map[string]string `json:",omitempty"`
 	Subpath      string            `json:",omitempty"`
 	DriverConfig *Driver           `json:",omitempty"`
 }
--- a/daemon/containerfs_linux.go
+++ b/daemon/containerfs_linux.go
@ -63,11 +63,12 @@ func (daemon *Daemon) openContainerFS(container *container.Container) (_ *contai
 		}
 	}()
-	mounts, err := daemon.setupMounts(container)
+	mounts, cleanup, err := daemon.setupMounts(container)
 	if err != nil {
 		return nil, err
 	}
 	defer func() {
 		cleanup()
 		if err != nil {
 			_ = container.UnmountVolumes(daemon.LogVolumeEvent)
 		}
--- a/daemon/start.go
+++ b/daemon/start.go
@ -164,11 +164,12 @@ func (daemon *Daemon) containerStart(ctx context.Context, daemonCfg *configStore
 		return err
 	}
-	m, err := daemon.setupMounts(container)
+	m, cleanup, err := daemon.setupMounts(container)
 	if err != nil {
 		return err
 	}
 	mnts = append(mnts, m...)
 	defer cleanup()
 	spec, err := daemon.createSpec(ctx, daemonCfg, container, mnts)
 	if err != nil {
--- a/daemon/volumes_unix.go
+++ b/daemon/volumes_unix.go
@ -3,15 +3,18 @@
 package daemon // import "github.com/docker/docker/daemon"
 import (
 	"context"
 	"fmt"
 	"os"
 	"sort"
 	"strconv"
 	"strings"
 	"github.com/containerd/log"
 	"github.com/docker/docker/api/types/events"
 	mounttypes "github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/internal/cleanups"
 	volumemounts "github.com/docker/docker/volume/mounts"
 	"github.com/pkg/errors"
 )
@ -19,23 +22,34 @@ import (
 // setupMounts iterates through each of the mount points for a container and
 // calls Setup() on each. It also looks to see if is a network mount such as
 // /etc/resolv.conf, and if it is not, appends it to the array of mounts.
-func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, error) {
+//
 // The cleanup function should be called as soon as the container has been
 // started.
 func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, func() error, error) {
 	var mounts []container.Mount
 	// TODO: tmpfs mounts should be part of Mountpoints
 	tmpfsMounts := make(map[string]bool)
 	tmpfsMountInfo, err := c.TmpfsMounts()
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	for _, m := range tmpfsMountInfo {
 		tmpfsMounts[m.Destination] = true
 	}
 	cleanups := cleanups.Composite{}
 	defer func() {
 		if err := cleanups.Call(); err != nil {
 			log.G(context.TODO()).WithError(err).Warn("failed to cleanup temporary mounts created by MountPoint.Setup")
 		}
 	}()
 	for _, m := range c.MountPoints {
 		if tmpfsMounts[m.Destination] {
 			continue
 		}
 		if err := daemon.lazyInitializeVolume(c.ID, m); err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 		// If the daemon is being shutdown, we should not let a container start if it is trying to
 		// mount the socket the daemon is listening on. During daemon shutdown, the socket
@ -48,10 +62,12 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 			return nil
 		}
-		path, err := m.Setup(c.MountLabel, daemon.idMapping.RootPair(), checkfunc)
+		path, clean, err := m.Setup(c.MountLabel, daemon.idMapping.RootPair(), checkfunc)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 		cleanups.Add(clean)
 		if !c.TrySetNetworkMount(m.Destination, path) {
 			mnt := container.Mount{
 				Source:      path,
@ -61,13 +77,13 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 			}
 			if m.Spec.Type == mounttypes.TypeBind && m.Spec.BindOptions != nil {
 				if !m.Spec.ReadOnly && m.Spec.BindOptions.ReadOnlyNonRecursive {
-					return nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyNonRecursive")
+					return nil, nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyNonRecursive")
 				}
 				if !m.Spec.ReadOnly && m.Spec.BindOptions.ReadOnlyForceRecursive {
-					return nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyForceRecursive")
+					return nil, nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyForceRecursive")
 				}
 				if m.Spec.BindOptions.ReadOnlyNonRecursive && m.Spec.BindOptions.ReadOnlyForceRecursive {
-					return nil, errors.New("mount options conflict: ReadOnlyNonRecursive && BindOptions.ReadOnlyForceRecursive")
+					return nil, nil, errors.New("mount options conflict: ReadOnlyNonRecursive && BindOptions.ReadOnlyForceRecursive")
 				}
 				mnt.NonRecursive = m.Spec.BindOptions.NonRecursive
 				mnt.ReadOnlyNonRecursive = m.Spec.BindOptions.ReadOnlyNonRecursive
@ -98,11 +114,11 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 		// up to the user to make sure the file has proper ownership for userns
 		if strings.Index(mnt.Source, daemon.repository) == 0 {
 			if err := os.Chown(mnt.Source, rootIDs.UID, rootIDs.GID); err != nil {
-				return nil, err
+				return nil, nil, err
 			}
 		}
 	}
-	return append(mounts, netMounts...), nil
+	return append(mounts, netMounts...), cleanups.Release(), nil
 }
 // sortMounts sorts an array of mounts in lexicographic order. This ensure that
--- a/daemon/volumes_windows.go
+++ b/daemon/volumes_windows.go
@ -1,10 +1,13 @@
 package daemon // import "github.com/docker/docker/daemon"
 import (
 	"context"
 	"sort"
 	"github.com/containerd/log"
 	"github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/internal/cleanups"
 	"github.com/docker/docker/pkg/idtools"
 	volumemounts "github.com/docker/docker/volume/mounts"
 )
@ -13,21 +16,31 @@ import (
 // of the configured mounts on the container to the OCI mount structure
 // which will ultimately be passed into the oci runtime during container creation.
 // It also ensures each of the mounts are lexicographically sorted.
-
+//
 // The cleanup function should be called as soon as the container has been
 // started.
 //
 // BUGBUG TODO Windows containerd. This would be much better if it returned
 // an array of runtime spec mounts, not container mounts. Then no need to
 // do multiple transitions.
 func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, func() error, error) {
 	cleanups := cleanups.Composite{}
 	defer func() {
 		if err := cleanups.Call(); err != nil {
 			log.G(context.TODO()).WithError(err).Warn("failed to cleanup temporary mounts created by MountPoint.Setup")
 		}
 	}()
 func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, error) {
 	var mnts []container.Mount
 	for _, mount := range c.MountPoints { // type is volumemounts.MountPoint
 		if err := daemon.lazyInitializeVolume(c.ID, mount); err != nil {
-			return nil, err
+			return nil, nil, err
 		}
-		s, err := mount.Setup(c.MountLabel, idtools.Identity{}, nil)
+		s, c, err := mount.Setup(c.MountLabel, idtools.Identity{}, nil)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 		cleanups.Add(c)
 		mnts = append(mnts, container.Mount{
 			Source:      s,
@ -37,7 +50,7 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 	}
 	sort.Sort(mounts(mnts))
-	return mnts, nil
+	return mnts, cleanups.Release(), nil
 }
 // setBindModeIfNull is platform specific processing which is a no-op on
--- a/docs/api/version-history.md
+++ b/docs/api/version-history.md
@ -17,6 +17,9 @@ keywords: "API, Docker, rcli, REST, documentation"
 [Docker Engine API v1.45](https://docs.docker.com/engine/api/v1.45/) documentation
 * `POST /containers/create` now supports `VolumeOptions.Subpath` which allows a
  subpath of a named volume to be mounted.
 ## v1.44 API changes
 [Docker Engine API v1.44](https://docs.docker.com/engine/api/v1.44/) documentation
--- a/integration/internal/container/container.go
+++ b/integration/internal/container/container.go
@ -170,3 +170,28 @@ func Inspect(ctx context.Context, t *testing.T, apiClient client.APIClient, cont
 	return c
 }
 type ContainerOutput struct {
 	Stdout, Stderr string
 }
 // Output waits for the container to end running and returns its output.
 func Output(ctx context.Context, client client.APIClient, id string) (ContainerOutput, error) {
 	logs, err := client.ContainerLogs(ctx, id, container.LogsOptions{Follow: true, ShowStdout: true, ShowStderr: true})
 	if err != nil {
 		return ContainerOutput{}, err
 	}
 	defer logs.Close()
 	var stdoutBuf, stderrBuf bytes.Buffer
 	_, err = stdcopy.StdCopy(&stdoutBuf, &stderrBuf, logs)
 	if err != nil {
 		return ContainerOutput{}, err
 	}
 	return ContainerOutput{
 		Stdout: stdoutBuf.String(),
 		Stderr: stderrBuf.String(),
 	}, nil
 }
--- a/integration/volume/mount_test.go
+++ b/integration/volume/mount_test.go
@ -0,0 +1,185 @@
 package volume
 import (
 	"context"
 	"path/filepath"
 	"strings"
 	"testing"
 	containertypes "github.com/docker/docker/api/types/container"
 	"github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/api/types/network"
 	"github.com/docker/docker/api/types/versions"
 	"github.com/docker/docker/api/types/volume"
 	"github.com/docker/docker/client"
 	"github.com/docker/docker/integration/internal/container"
 	"github.com/docker/docker/internal/safepath"
 	"gotest.tools/v3/assert"
 	is "gotest.tools/v3/assert/cmp"
 	"gotest.tools/v3/skip"
 )
 func TestRunMountVolumeSubdir(t *testing.T) {
 	skip.If(t, versions.LessThan(testEnv.DaemonAPIVersion(), "1.45"), "skip test from new feature")
 	ctx := setupTest(t)
 	apiClient := testEnv.APIClient()
 	testVolumeName := setupTestVolume(t, apiClient)
 	for _, tc := range []struct {
 		name         string
 		opts         mount.VolumeOptions
 		cmd          []string
 		volumeTarget string
 		createErr    string
 		startErr     string
 		expected     string
 		skipPlatform string
 	}{
 		{name: "subdir", opts: mount.VolumeOptions{Subpath: "subdir"}, cmd: []string{"ls", "/volume"}, expected: "hello.txt"},
 		{name: "subdir link", opts: mount.VolumeOptions{Subpath: "hack/good"}, cmd: []string{"ls", "/volume"}, expected: "hello.txt"},
 		{name: "subdir with copy data", opts: mount.VolumeOptions{Subpath: "bin"}, volumeTarget: "/bin", cmd: []string{"ls", "/bin/busybox"}, expected: "/bin/busybox", skipPlatform: "windows:copy not supported on Windows"},
 		{name: "file", opts: mount.VolumeOptions{Subpath: "bar.txt"}, cmd: []string{"cat", "/volume"}, expected: "foo", skipPlatform: "windows:file bind mounts not supported on Windows"},
 		{name: "relative with backtracks", opts: mount.VolumeOptions{Subpath: "../../../../../../etc/passwd"}, cmd: []string{"cat", "/volume"}, createErr: "subpath must be a relative path within the volume"},
 		{name: "not existing", opts: mount.VolumeOptions{Subpath: "not-existing-path"}, cmd: []string{"cat", "/volume"}, startErr: (&safepath.ErrNotAccessible{}).Error()},
 		{name: "mount link", opts: mount.VolumeOptions{Subpath: filepath.Join("hack", "root")}, cmd: []string{"ls", "/volume"}, startErr: (&safepath.ErrEscapesBase{}).Error()},
 		{name: "mount link link", opts: mount.VolumeOptions{Subpath: filepath.Join("hack", "bad")}, cmd: []string{"ls", "/volume"}, startErr: (&safepath.ErrEscapesBase{}).Error()},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			if tc.skipPlatform != "" {
 				platform, reason, _ := strings.Cut(tc.skipPlatform, ":")
 				if testEnv.DaemonInfo.OSType == platform {
 					t.Skip(reason)
 				}
 			}
 			cfg := containertypes.Config{
 				Image: "busybox",
 				Cmd:   tc.cmd,
 			}
 			hostCfg := containertypes.HostConfig{
 				Mounts: []mount.Mount{
 					{
 						Type:          mount.TypeVolume,
 						Source:        testVolumeName,
 						Target:        "/volume",
 						VolumeOptions: &tc.opts,
 					},
 				},
 			}
 			if testEnv.DaemonInfo.OSType == "windows" {
 				hostCfg.Mounts[0].Target = `C:\volume`
 			}
 			if tc.volumeTarget != "" {
 				hostCfg.Mounts[0].Target = tc.volumeTarget
 			}
 			ctrName := strings.ReplaceAll(t.Name(), "/", "_")
 			create, creatErr := apiClient.ContainerCreate(ctx, &cfg, &hostCfg, &network.NetworkingConfig{}, nil, ctrName)
 			id := create.ID
 			if id != "" {
 				defer apiClient.ContainerRemove(ctx, id, containertypes.RemoveOptions{Force: true})
 			}
 			if tc.createErr != "" {
 				assert.ErrorContains(t, creatErr, tc.createErr)
 				return
 			}
 			assert.NilError(t, creatErr, "container creation failed")
 			startErr := apiClient.ContainerStart(ctx, id, containertypes.StartOptions{})
 			if tc.startErr != "" {
 				assert.ErrorContains(t, startErr, tc.startErr)
 				return
 			}
 			assert.NilError(t, startErr)
 			output, err := container.Output(ctx, apiClient, id)
 			assert.Check(t, err)
 			t.Logf("stdout:\n%s", output.Stdout)
 			t.Logf("stderr:\n%s", output.Stderr)
 			inspect, err := apiClient.ContainerInspect(ctx, id)
 			if assert.Check(t, err) {
 				assert.Check(t, is.Equal(inspect.State.ExitCode, 0))
 			}
 			assert.Check(t, is.Equal(strings.TrimSpace(output.Stderr), ""))
 			assert.Check(t, is.Equal(strings.TrimSpace(output.Stdout), tc.expected))
 		})
 	}
 }
 // setupTestVolume sets up a volume with:
 // .
 // |-- bar.txt                        (file with "foo")
 // |-- bin                            (directory)
 // |-- subdir                         (directory)
 // |   |-- hello.txt                  (file with "world")
 // |-- hack                           (directory)
 // |   |-- root                       (symlink to /)
 // |   |-- good                       (symlink to ../subdir)
 // |   |-- bad                        (symlink to root)
 func setupTestVolume(t *testing.T, client client.APIClient) string {
 	t.Helper()
 	ctx := context.Background()
 	volumeName := t.Name() + "-volume"
 	err := client.VolumeRemove(ctx, volumeName, true)
 	assert.NilError(t, err, "failed to clean volume")
 	_, err = client.VolumeCreate(ctx, volume.CreateOptions{
 		Name: volumeName,
 	})
 	assert.NilError(t, err, "failed to setup volume")
 	mount := mount.Mount{
 		Type:   mount.TypeVolume,
 		Source: volumeName,
 		Target: "/volume",
 	}
 	rootFs := "/"
 	if testEnv.DaemonInfo.OSType == "windows" {
 		mount.Target = `C:\volume`
 		rootFs = `C:`
 	}
 	initCmd := "echo foo > /volume/bar.txt && " +
 		"mkdir /volume/bin && " +
 		"mkdir /volume/subdir && " +
 		"echo world > /volume/subdir/hello.txt && " +
 		"mkdir /volume/hack && " +
 		"ln -s " + rootFs + " /volume/hack/root && " +
 		"ln -s ../subdir /volume/hack/good && " +
 		"ln -s root /volume/hack/bad &&" +
 		"mkdir /volume/hack/iwanttobehackedwithtoctou"
 	opts := []func(*container.TestContainerConfig){
 		container.WithMount(mount),
 		container.WithCmd("sh", "-c", initCmd+"; ls -lah /volume /volume/hack/"),
 	}
 	if testEnv.DaemonInfo.OSType == "windows" {
 		// Can't create symlinks under HyperV isolation
 		opts = append(opts, container.WithIsolation(containertypes.IsolationProcess))
 	}
 	cid := container.Run(ctx, t, client, opts...)
 	defer client.ContainerRemove(ctx, cid, containertypes.RemoveOptions{Force: true})
 	output, err := container.Output(ctx, client, cid)
 	t.Logf("Setup stderr:\n%s", output.Stderr)
 	t.Logf("Setup stdout:\n%s", output.Stdout)
 	assert.NilError(t, err)
 	assert.Assert(t, is.Equal(output.Stderr, ""))
 	inspect, err := client.ContainerInspect(ctx, cid)
 	assert.NilError(t, err)
 	assert.Assert(t, is.Equal(inspect.State.ExitCode, 0))
 	return volumeName
 }
--- a/internal/safepath/common.go
+++ b/internal/safepath/common.go
@ -0,0 +1,66 @@
 package safepath
 import (
 	"os"
 	"path/filepath"
 	"github.com/pkg/errors"
 )
 // evaluatePath evaluates symlinks in the concatenation of path and subpath. If
 // err is nil, resolvedBasePath will contain result of resolving all symlinks
 // in the given path, and resolvedSubpath will contain a relative path rooted
 // at the resolvedBasePath pointing to the concatenation after resolving all
 // symlinks.
 func evaluatePath(path, subpath string) (resolvedBasePath string, resolvedSubpath string, err error) {
 	baseResolved, err := filepath.EvalSymlinks(path)
 	if err != nil {
 		if errors.Is(err, os.ErrNotExist) {
 			return "", "", &ErrNotAccessible{Path: path, Cause: err}
 		}
 		return "", "", errors.Wrapf(err, "error while resolving symlinks in base directory %q", path)
 	}
 	combinedPath := filepath.Join(baseResolved, subpath)
 	combinedResolved, err := filepath.EvalSymlinks(combinedPath)
 	if err != nil {
 		if errors.Is(err, os.ErrNotExist) {
 			return "", "", &ErrNotAccessible{Path: combinedPath, Cause: err}
 		}
 		return "", "", errors.Wrapf(err, "error while resolving symlinks in combined path %q", combinedPath)
 	}
 	subpart, err := filepath.Rel(baseResolved, combinedResolved)
 	if err != nil {
 		return "", "", &ErrEscapesBase{Base: baseResolved, Subpath: subpath}
 	}
 	if !filepath.IsLocal(subpart) {
 		return "", "", &ErrEscapesBase{Base: baseResolved, Subpath: subpath}
 	}
 	return baseResolved, subpart, nil
 }
 // isLocalTo reports whether path, using lexical analysis only, has all of these properties:
 //   - is within the subtree rooted at basepath
 //   - is not empty
 //   - on Windows, is not a reserved name such as "NUL"
 //
 // If isLocalTo(path, basepath) returns true, then
 //
 //	filepath.Rel(basepath, path)
 //
 // will always produce an unrooted path with no `..` elements.
 //
 // isLocalTo is a purely lexical operation. In particular, it does not account for the effect of any symbolic links that may exist in the filesystem.
 //
 // Both path and basepath are expected to be absolute paths.
 func isLocalTo(path, basepath string) bool {
 	rel, err := filepath.Rel(basepath, path)
 	if err != nil {
 		return false
 	}
 	return filepath.IsLocal(rel)
 }
--- a/internal/safepath/common_test.go
+++ b/internal/safepath/common_test.go
@ -0,0 +1,31 @@
 package safepath
 import (
 	"testing"
 	"gotest.tools/v3/assert"
 	is "gotest.tools/v3/assert/cmp"
 )
 func TestIsLocalTo(t *testing.T) {
 	for _, tc := range []struct {
 		name    string
 		subpath string
 		result  bool
 	}{
 		{name: "same", subpath: "/volume", result: true},
 		{name: "1 level subpath", subpath: "/volume/sub", result: true},
 		{name: "2 level subpath", subpath: "/volume/sub/path", result: true},
 		{name: "absolute", subpath: "/etc/passwd", result: false},
 		{name: "backtrack", subpath: "/volume/../", result: false},
 		{name: "backtrack inside", subpath: "/volume/sub/../", result: true},
 		{name: "relative path", subpath: "./rel", result: false},
 		{name: "file with dots", subpath: "/volume/file..with.dots", result: true},
 		{name: "file starting with dots", subpath: "/volume/..file", result: true},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			result := isLocalTo(tc.subpath, "/volume")
 			assert.Check(t, is.Equal(result, tc.result))
 		})
 	}
 }
--- a/internal/safepath/errors.go
+++ b/internal/safepath/errors.go
@ -0,0 +1,42 @@
 package safepath
 // ErrNotAccessible is returned by Join when the resulting path doesn't exist,
 // is not accessible, or any of the path components was replaced with a symlink
 // during the path traversal.
 type ErrNotAccessible struct {
 	Path  string
 	Cause error
 }
 func (*ErrNotAccessible) NotFound() {}
 func (e *ErrNotAccessible) Unwrap() error {
 	return e.Cause
 }
 func (e *ErrNotAccessible) Error() string {
 	msg := "cannot access path " + e.Path
 	if e.Cause != nil {
 		msg += ": " + e.Cause.Error()
 	}
 	return msg
 }
 // ErrEscapesBase is returned by Join when the resulting concatenation would
 // point outside of the specified base directory.
 type ErrEscapesBase struct {
 	Base, Subpath string
 }
 func (*ErrEscapesBase) InvalidParameter() {}
 func (e *ErrEscapesBase) Error() string {
 	msg := "path concatenation escapes the base directory"
 	if e.Base != "" {
 		msg += ", base: " + e.Base
 	}
 	if e.Subpath != "" {
 		msg += ", subpath: " + e.Subpath
 	}
 	return msg
 }
--- a/internal/safepath/join_test.go
+++ b/internal/safepath/join_test.go
@ -0,0 +1,144 @@
 package safepath
 import (
 	"os"
 	"path/filepath"
 	"runtime"
 	"strings"
 	"testing"
 	"gotest.tools/v3/assert"
 	is "gotest.tools/v3/assert/cmp"
 )
 func TestJoinEscapingSymlink(t *testing.T) {
 	type testCase struct {
 		name   string
 		target string
 	}
 	var cases []testCase
 	if runtime.GOOS == "windows" {
 		cases = []testCase{
 			{name: "root", target: `C:\`},
 			{name: "absolute file", target: `C:\Windows\System32\cmd.exe`},
 		}
 	} else {
 		cases = []testCase{
 			{name: "root", target: "/"},
 			{name: "absolute file", target: "/etc/passwd"},
 		}
 	}
 	cases = append(cases, testCase{name: "relative", target: "../../"})
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			tempDir := t.TempDir()
 			dir, err := filepath.EvalSymlinks(tempDir)
 			assert.NilError(t, err, "filepath.EvalSymlinks failed for temporary directory %s", tempDir)
 			err = os.Symlink(tc.target, filepath.Join(dir, "link"))
 			assert.NilError(t, err, "failed to create symlink to %s", tc.target)
 			safe, err := Join(dir, "link")
 			if err == nil {
 				safe.Close()
 			}
 			assert.ErrorType(t, err, &ErrEscapesBase{})
 		})
 	}
 }
 func TestJoinGoodSymlink(t *testing.T) {
 	tempDir := t.TempDir()
 	dir, err := filepath.EvalSymlinks(tempDir)
 	assert.NilError(t, err, "filepath.EvalSymlinks failed for temporary directory %s", tempDir)
 	assert.Assert(t, os.WriteFile(filepath.Join(dir, "foo"), []byte("bar"), 0o744), "failed to create file 'foo'")
 	assert.Assert(t, os.Mkdir(filepath.Join(dir, "subdir"), 0o744), "failed to create directory 'subdir'")
 	assert.Assert(t, os.WriteFile(filepath.Join(dir, "subdir/hello.txt"), []byte("world"), 0o744), "failed to create file 'subdir/hello.txt'")
 	assert.Assert(t, os.Symlink(filepath.Join(dir, "subdir"), filepath.Join(dir, "subdir_link_absolute")), "failed to create absolute symlink to directory 'subdir'")
 	assert.Assert(t, os.Symlink("subdir", filepath.Join(dir, "subdir_link_relative")), "failed to create relative symlink to directory 'subdir'")
 	assert.Assert(t, os.Symlink(filepath.Join(dir, "foo"), filepath.Join(dir, "foo_link_absolute")), "failed to create absolute symlink to file 'foo'")
 	assert.Assert(t, os.Symlink("foo", filepath.Join(dir, "foo_link_relative")), "failed to create relative symlink to file 'foo'")
 	for _, target := range []string{
 		"foo", "subdir",
 		"subdir_link_absolute", "foo_link_absolute",
 		"subdir_link_relative", "foo_link_relative",
 	} {
 		t.Run(target, func(t *testing.T) {
 			safe, err := Join(dir, target)
 			assert.NilError(t, err)
 			defer safe.Close()
 			if strings.HasPrefix(target, "subdir") {
 				data, err := os.ReadFile(filepath.Join(safe.Path(), "hello.txt"))
 				assert.NilError(t, err)
 				assert.Assert(t, is.Equal(string(data), "world"))
 			}
 		})
 	}
 }
 func TestJoinWithSymlinkReplace(t *testing.T) {
 	tempDir := t.TempDir()
 	dir, err := filepath.EvalSymlinks(tempDir)
 	assert.NilError(t, err, "filepath.EvalSymlinks failed for temporary directory %s", tempDir)
 	link := filepath.Join(dir, "link")
 	target := filepath.Join(dir, "foo")
 	err = os.WriteFile(target, []byte("bar"), 0o744)
 	assert.NilError(t, err, "failed to create test file")
 	err = os.Symlink(target, link)
 	assert.Check(t, err, "failed to create symlink to foo")
 	safe, err := Join(dir, "link")
 	assert.NilError(t, err)
 	defer safe.Close()
 	// Delete the link target.
 	err = os.Remove(target)
 	if runtime.GOOS == "windows" {
 		// On Windows it shouldn't be possible.
 		assert.Assert(t, is.ErrorType(err, &os.PathError{}), "link shouldn't be deletable before cleanup")
 	} else {
 		// On Linux we can delete it just fine.
 		assert.NilError(t, err, "failed to remove symlink")
 		// Replace target with a symlink to /etc/paswd
 		err = os.Symlink("/etc/passwd", target)
 		assert.NilError(t, err, "failed to create symlink")
 	}
 	// The returned safe path should still point to the old file.
 	data, err := os.ReadFile(safe.Path())
 	assert.NilError(t, err, "failed to read file")
 	assert.Check(t, is.Equal(string(data), "bar"))
 }
 func TestJoinCloseInvalidates(t *testing.T) {
 	tempDir := t.TempDir()
 	dir, err := filepath.EvalSymlinks(tempDir)
 	assert.NilError(t, err)
 	foo := filepath.Join(dir, "foo")
 	err = os.WriteFile(foo, []byte("bar"), 0o744)
 	assert.NilError(t, err, "failed to create test file")
 	safe, err := Join(dir, "foo")
 	assert.NilError(t, err)
 	assert.Check(t, safe.IsValid())
 	assert.NilError(t, safe.Close())
 	assert.Check(t, !safe.IsValid())
 }
--- a/internal/safepath/safepath.go
+++ b/internal/safepath/safepath.go
@ -0,0 +1,63 @@
 package safepath
 import (
 	"context"
 	"fmt"
 	"sync"
 	"github.com/containerd/log"
 )
 type SafePath struct {
 	path    string
 	cleanup func() error
 	mutex   sync.Mutex
 	// Immutable fields
 	sourceBase, sourceSubpath string
 }
 // Close releases the resources used by the path.
 func (s *SafePath) Close() error {
 	s.mutex.Lock()
 	defer s.mutex.Unlock()
 	if s.path == "" {
 		base, sub := s.SourcePath()
 		log.G(context.TODO()).WithFields(log.Fields{
 			"path":          s.Path(),
 			"sourceBase":    base,
 			"sourceSubpath": sub,
 		}).Warn("an attempt to close an already closed SafePath")
 		return nil
 	}
 	s.path = ""
 	if s.cleanup != nil {
 		return s.cleanup()
 	}
 	return nil
 }
 // IsValid return true when path can still be used and wasn't cleaned up by Close.
 func (s *SafePath) IsValid() bool {
 	s.mutex.Lock()
 	defer s.mutex.Unlock()
 	return s.path != ""
 }
 // Path returns a safe, temporary path that can be used to access the original path.
 func (s *SafePath) Path() string {
 	s.mutex.Lock()
 	defer s.mutex.Unlock()
 	if s.path == "" {
 		panic(fmt.Sprintf("use-after-close attempted for safepath with source [%s, %s]", s.sourceBase, s.sourceSubpath))
 	}
 	return s.path
 }
 // SourcePath returns the source path the safepath points to.
 func (s *SafePath) SourcePath() (string, string) {
 	// No mutex lock because these are immutable.
 	return s.sourceBase, s.sourceSubpath
 }
--- a/volume/mounts/linux_parser.go
+++ b/volume/mounts/linux_parser.go
@ -94,8 +94,18 @@ func (p *linuxParser) validateMountConfigImpl(mnt *mount.Mount, validateBindSour
 		if mnt.BindOptions != nil {
 			return &errMountConfig{mnt, errExtraField("BindOptions")}
 		}
 		anonymousVolume := len(mnt.Source) == 0
-		if len(mnt.Source) == 0 && mnt.ReadOnly {
+		if mnt.VolumeOptions != nil && mnt.VolumeOptions.Subpath != "" {
 			if anonymousVolume {
 				return &errMountConfig{mnt, errAnonymousVolumeWithSubpath}
 			}
 			if !filepath.IsLocal(mnt.VolumeOptions.Subpath) {
 				return &errMountConfig{mnt, errInvalidSubpath}
 			}
 		}
 		if mnt.ReadOnly && anonymousVolume {
 			return &errMountConfig{mnt, fmt.Errorf("must not set ReadOnly mode when using anonymous volumes")}
 		}
 	case mount.TypeTmpfs:
--- a/volume/mounts/mounts.go
+++ b/volume/mounts/mounts.go
@ -9,6 +9,7 @@ import (
 	"github.com/containerd/log"
 	mounttypes "github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/internal/safepath"
 	"github.com/docker/docker/pkg/idtools"
 	"github.com/docker/docker/pkg/stringid"
 	"github.com/docker/docker/volume"
@ -74,14 +75,34 @@ type MountPoint struct {
 	// Specifically needed for containers which are running and calls to `docker cp`
 	// because both these actions require mounting the volumes.
 	active int
 	// SafePaths created by Setup that should be cleaned up before unmounting
 	// the volume.
 	safePaths []*safepath.SafePath
 }
-// Cleanup frees resources used by the mountpoint
+// Cleanup frees resources used by the mountpoint and cleans up all the paths
 // returned by Setup that hasn't been cleaned up by the caller.
 func (m *MountPoint) Cleanup() error {
 	if m.Volume == nil || m.ID == "" {
 		return nil
 	}
 	for _, p := range m.safePaths {
 		if !p.IsValid() {
 			continue
 		}
 		err := p.Close()
 		base, sub := p.SourcePath()
 		log.G(context.TODO()).WithFields(log.Fields{
 			"error":         err,
 			"path":          p.Path(),
 			"sourceBase":    base,
 			"sourceSubpath": sub,
 		}).Warn("cleaning up SafePath that hasn't been cleaned up by the caller")
 	}
 	if err := m.Volume.Unmount(m.ID); err != nil {
 		return errors.Wrapf(err, "error unmounting volume %s", m.Volume.Name())
 	}
@ -97,9 +118,17 @@ func (m *MountPoint) Cleanup() error {
 // configured, or creating the source directory if supplied.
 // The, optional, checkFun parameter allows doing additional checking
 // before creating the source directory on the host.
-func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun func(m *MountPoint) error) (path string, retErr error) {
+//
 // The returned path can be a temporary path, caller is responsible to
 // call the returned cleanup function as soon as the path is not needed.
 // Cleanup doesn't unmount the underlying volumes (if any), it only
 // frees up the resources that were needed to guarantee that the path
 // still points to the same target (to avoid TOCTOU attack).
 //
 // Cleanup function doesn't need to be called when error is returned.
 func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun func(m *MountPoint) error) (path string, cleanup func() error, retErr error) {
 	if m.SkipMountpointCreation {
-		return m.Source, nil
+		return m.Source, noCleanup, nil
 	}
 	defer func() {
@ -107,16 +136,24 @@ func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun
 			return
 		}
-		sourcePath, err := filepath.EvalSymlinks(m.Source)
+		sourcePath, err := filepath.EvalSymlinks(path)
 		if err != nil {
 			path = ""
 			retErr = errors.Wrapf(err, "error evaluating symlinks from mount source %q", m.Source)
 			if cleanupErr := cleanup(); cleanupErr != nil {
 				log.G(context.TODO()).WithError(cleanupErr).Warn("failed to cleanup after error")
 			}
 			cleanup = noCleanup
 			return
 		}
 		err = label.Relabel(sourcePath, mountLabel, label.IsShared(m.Mode))
 		if err != nil && !errors.Is(err, syscall.ENOTSUP) {
 			path = ""
 			retErr = errors.Wrapf(err, "error setting label on mount source '%s'", sourcePath)
 			if cleanupErr := cleanup(); cleanupErr != nil {
 				log.G(context.TODO()).WithError(cleanupErr).Warn("failed to cleanup after error")
 			}
 			cleanup = noCleanup
 		}
 	}()
@ -127,16 +164,34 @@ func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun
 		}
 		volumePath, err := m.Volume.Mount(id)
 		if err != nil {
-			return "", errors.Wrapf(err, "error while mounting volume '%s'", m.Source)
+			return "", noCleanup, errors.Wrapf(err, "error while mounting volume '%s'", m.Source)
 		}
 		m.ID = id
 		clean := noCleanup
 		if m.Spec.VolumeOptions != nil && m.Spec.VolumeOptions.Subpath != "" {
 			subpath := m.Spec.VolumeOptions.Subpath
 			safePath, err := safepath.Join(volumePath, subpath)
 			if err != nil {
 				if err := m.Volume.Unmount(id); err != nil {
 					log.G(context.TODO()).WithError(err).Error("failed to unmount after safepath.Join failed")
 				}
 				return "", noCleanup, err
 			}
 			m.safePaths = append(m.safePaths, safePath)
 			log.G(context.TODO()).Debugf("mounting (%s|%s) via %s", volumePath, subpath, safePath.Path())
 			clean = safePath.Close
 			volumePath = safePath.Path()
 		}
 		m.active++
-		return volumePath, nil
+		return volumePath, clean, nil
 	}
 	if len(m.Source) == 0 {
-		return "", fmt.Errorf("Unable to setup mount point, neither source nor volume defined")
+		return "", noCleanup, fmt.Errorf("Unable to setup mount point, neither source nor volume defined")
 	}
 	if m.Type == mounttypes.TypeBind {
@ -145,7 +200,7 @@ func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun
 		// the process of shutting down.
 		if checkFun != nil {
 			if err := checkFun(m); err != nil {
-				return "", err
+				return "", noCleanup, err
 			}
 		}
@ -154,12 +209,12 @@ func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun
 		if err := idtools.MkdirAllAndChownNew(m.Source, 0o755, rootIDs); err != nil {
 			if perr, ok := err.(*os.PathError); ok {
 				if perr.Err != syscall.ENOTDIR {
-					return "", errors.Wrapf(err, "error while creating mount source path '%s'", m.Source)
+					return "", noCleanup, errors.Wrapf(err, "error while creating mount source path '%s'", m.Source)
 				}
 			}
 		}
 	}
-	return m.Source, nil
+	return m.Source, noCleanup, nil
 }
 func (m *MountPoint) LiveRestore(ctx context.Context) error {
@ -203,3 +258,8 @@ func errInvalidMode(mode string) error {
 func errInvalidSpec(spec string) error {
 	return errors.Errorf("invalid volume specification: '%s'", spec)
 }
 // noCleanup is a no-op cleanup function.
 func noCleanup() error {
 	return nil
 }
--- a/volume/mounts/parser.go
+++ b/volume/mounts/parser.go
@ -11,6 +11,14 @@ import (
 // It's used by both LCOW and Linux parsers.
 var ErrVolumeTargetIsRoot = errors.New("invalid specification: destination can't be '/'")
 // errAnonymousVolumeWithSubpath is returned when Subpath is specified for
 // anonymous volume.
 var errAnonymousVolumeWithSubpath = errors.New("must not set Subpath when using anonymous volumes")
 // errInvalidSubpath is returned when the provided Subpath is not lexically an
 // relative path within volume.
 var errInvalidSubpath = errors.New("subpath must be a relative path within the volume")
 // read-write modes
 var rwModes = map[string]bool{
 	"rw": true,
--- a/volume/mounts/validate_test.go
+++ b/volume/mounts/validate_test.go
@ -28,6 +28,9 @@ func TestValidateMount(t *testing.T) {
 		{
 			input: mount.Mount{Type: mount.TypeVolume, Target: testDestinationPath},
 		},
 		{
 			input: mount.Mount{Type: mount.TypeVolume, Target: testDestinationPath, Source: "hello", VolumeOptions: &mount.VolumeOptions{Subpath: "world"}},
 		},
 		{
 			input:    mount.Mount{Type: mount.TypeBind},
 			expected: errMissingField("Target"),
--- a/volume/mounts/windows_parser.go
+++ b/volume/mounts/windows_parser.go
@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
 	"regexp"
 	"runtime"
 	"strings"
@ -258,7 +259,19 @@ func (p *windowsParser) validateMountConfigReg(mnt *mount.Mount, additionalValid
 			return &errMountConfig{mnt, errExtraField("BindOptions")}
 		}
-		if len(mnt.Source) == 0 && mnt.ReadOnly {
+		anonymousVolume := len(mnt.Source) == 0
 		if mnt.VolumeOptions != nil && mnt.VolumeOptions.Subpath != "" {
 			if anonymousVolume {
 				return errAnonymousVolumeWithSubpath
 			}
 			// Check if path is relative but without any back traversals
 			if !filepath.IsLocal(mnt.VolumeOptions.Subpath) {
 				return &errMountConfig{mnt, errInvalidSubpath}
 			}
 		}
 		if anonymousVolume && mnt.ReadOnly {
 			return &errMountConfig{mnt, fmt.Errorf("must not set ReadOnly mode when using anonymous volumes")}
 		}