Quellcode durchsuchen

volumes: Implement subpath mount

`VolumeOptions` now has a `Subpath` field which allows to specify a path
relative to the volume that should be mounted as a destination.

Symlinks are supported, but they cannot escape the base volume
directory.

Signed-off-by: Paweł Gronowski <pawel.gronowski@docker.com>
Paweł Gronowski vor 2 Jahren
Ursprung
Commit
bfb810445c

+ 8 - 0
api/server/router/container/container_routes.go

@@ -628,6 +628,14 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo
 		}
 	}
 
+	if versions.LessThan(version, "1.45") {
+		for _, m := range hostConfig.Mounts {
+			if m.VolumeOptions != nil && m.VolumeOptions.Subpath != "" {
+				return errdefs.InvalidParameter(errors.New("VolumeOptions.Subpath needs API v1.45 or newer"))
+			}
+		}
+	}
+
 	var warnings []string
 	if warn, err := handleMACAddressBC(config, hostConfig, networkingConfig, version); err != nil {
 		return err

+ 4 - 0
api/swagger.yaml

@@ -423,6 +423,10 @@ definitions:
                 type: "object"
                 additionalProperties:
                   type: "string"
+          Subpath:
+            description: "Source path inside the volume. Must be relative without any back traversals."
+            type: "string"
+            example: "dir-inside-volume/subdirectory"
       TmpfsOptions:
         description: "Optional configuration for the `tmpfs` type."
         type: "object"

+ 1 - 0
api/types/mount/mount.go

@@ -96,6 +96,7 @@ type BindOptions struct {
 type VolumeOptions struct {
 	NoCopy       bool              `json:",omitempty"`
 	Labels       map[string]string `json:",omitempty"`
+	Subpath      string            `json:",omitempty"`
 	DriverConfig *Driver           `json:",omitempty"`
 }
 

+ 2 - 1
daemon/containerfs_linux.go

@@ -63,11 +63,12 @@ func (daemon *Daemon) openContainerFS(container *container.Container) (_ *contai
 		}
 	}()
 
-	mounts, err := daemon.setupMounts(container)
+	mounts, cleanup, err := daemon.setupMounts(container)
 	if err != nil {
 		return nil, err
 	}
 	defer func() {
+		cleanup()
 		if err != nil {
 			_ = container.UnmountVolumes(daemon.LogVolumeEvent)
 		}

+ 2 - 1
daemon/start.go

@@ -164,11 +164,12 @@ func (daemon *Daemon) containerStart(ctx context.Context, daemonCfg *configStore
 		return err
 	}
 
-	m, err := daemon.setupMounts(container)
+	m, cleanup, err := daemon.setupMounts(container)
 	if err != nil {
 		return err
 	}
 	mnts = append(mnts, m...)
+	defer cleanup()
 
 	spec, err := daemon.createSpec(ctx, daemonCfg, container, mnts)
 	if err != nil {

+ 26 - 10
daemon/volumes_unix.go

@@ -3,15 +3,18 @@
 package daemon // import "github.com/docker/docker/daemon"
 
 import (
+	"context"
 	"fmt"
 	"os"
 	"sort"
 	"strconv"
 	"strings"
 
+	"github.com/containerd/log"
 	"github.com/docker/docker/api/types/events"
 	mounttypes "github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/container"
+	"github.com/docker/docker/internal/cleanups"
 	volumemounts "github.com/docker/docker/volume/mounts"
 	"github.com/pkg/errors"
 )
@@ -19,23 +22,34 @@ import (
 // setupMounts iterates through each of the mount points for a container and
 // calls Setup() on each. It also looks to see if is a network mount such as
 // /etc/resolv.conf, and if it is not, appends it to the array of mounts.
-func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, error) {
+//
+// The cleanup function should be called as soon as the container has been
+// started.
+func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, func() error, error) {
 	var mounts []container.Mount
 	// TODO: tmpfs mounts should be part of Mountpoints
 	tmpfsMounts := make(map[string]bool)
 	tmpfsMountInfo, err := c.TmpfsMounts()
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	for _, m := range tmpfsMountInfo {
 		tmpfsMounts[m.Destination] = true
 	}
+
+	cleanups := cleanups.Composite{}
+	defer func() {
+		if err := cleanups.Call(); err != nil {
+			log.G(context.TODO()).WithError(err).Warn("failed to cleanup temporary mounts created by MountPoint.Setup")
+		}
+	}()
+
 	for _, m := range c.MountPoints {
 		if tmpfsMounts[m.Destination] {
 			continue
 		}
 		if err := daemon.lazyInitializeVolume(c.ID, m); err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 		// If the daemon is being shutdown, we should not let a container start if it is trying to
 		// mount the socket the daemon is listening on. During daemon shutdown, the socket
@@ -48,10 +62,12 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 			return nil
 		}
 
-		path, err := m.Setup(c.MountLabel, daemon.idMapping.RootPair(), checkfunc)
+		path, clean, err := m.Setup(c.MountLabel, daemon.idMapping.RootPair(), checkfunc)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
+		cleanups.Add(clean)
+
 		if !c.TrySetNetworkMount(m.Destination, path) {
 			mnt := container.Mount{
 				Source:      path,
@@ -61,13 +77,13 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 			}
 			if m.Spec.Type == mounttypes.TypeBind && m.Spec.BindOptions != nil {
 				if !m.Spec.ReadOnly && m.Spec.BindOptions.ReadOnlyNonRecursive {
-					return nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyNonRecursive")
+					return nil, nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyNonRecursive")
 				}
 				if !m.Spec.ReadOnly && m.Spec.BindOptions.ReadOnlyForceRecursive {
-					return nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyForceRecursive")
+					return nil, nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyForceRecursive")
 				}
 				if m.Spec.BindOptions.ReadOnlyNonRecursive && m.Spec.BindOptions.ReadOnlyForceRecursive {
-					return nil, errors.New("mount options conflict: ReadOnlyNonRecursive && BindOptions.ReadOnlyForceRecursive")
+					return nil, nil, errors.New("mount options conflict: ReadOnlyNonRecursive && BindOptions.ReadOnlyForceRecursive")
 				}
 				mnt.NonRecursive = m.Spec.BindOptions.NonRecursive
 				mnt.ReadOnlyNonRecursive = m.Spec.BindOptions.ReadOnlyNonRecursive
@@ -98,11 +114,11 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 		// up to the user to make sure the file has proper ownership for userns
 		if strings.Index(mnt.Source, daemon.repository) == 0 {
 			if err := os.Chown(mnt.Source, rootIDs.UID, rootIDs.GID); err != nil {
-				return nil, err
+				return nil, nil, err
 			}
 		}
 	}
-	return append(mounts, netMounts...), nil
+	return append(mounts, netMounts...), cleanups.Release(), nil
 }
 
 // sortMounts sorts an array of mounts in lexicographic order. This ensure that

+ 19 - 6
daemon/volumes_windows.go

@@ -1,10 +1,13 @@
 package daemon // import "github.com/docker/docker/daemon"
 
 import (
+	"context"
 	"sort"
 
+	"github.com/containerd/log"
 	"github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/container"
+	"github.com/docker/docker/internal/cleanups"
 	"github.com/docker/docker/pkg/idtools"
 	volumemounts "github.com/docker/docker/volume/mounts"
 )
@@ -13,21 +16,31 @@ import (
 // of the configured mounts on the container to the OCI mount structure
 // which will ultimately be passed into the oci runtime during container creation.
 // It also ensures each of the mounts are lexicographically sorted.
-
+//
+// The cleanup function should be called as soon as the container has been
+// started.
+//
 // BUGBUG TODO Windows containerd. This would be much better if it returned
 // an array of runtime spec mounts, not container mounts. Then no need to
 // do multiple transitions.
+func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, func() error, error) {
+	cleanups := cleanups.Composite{}
+	defer func() {
+		if err := cleanups.Call(); err != nil {
+			log.G(context.TODO()).WithError(err).Warn("failed to cleanup temporary mounts created by MountPoint.Setup")
+		}
+	}()
 
-func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, error) {
 	var mnts []container.Mount
 	for _, mount := range c.MountPoints { // type is volumemounts.MountPoint
 		if err := daemon.lazyInitializeVolume(c.ID, mount); err != nil {
-			return nil, err
+			return nil, nil, err
 		}
-		s, err := mount.Setup(c.MountLabel, idtools.Identity{}, nil)
+		s, c, err := mount.Setup(c.MountLabel, idtools.Identity{}, nil)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
+		cleanups.Add(c)
 
 		mnts = append(mnts, container.Mount{
 			Source:      s,
@@ -37,7 +50,7 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 	}
 
 	sort.Sort(mounts(mnts))
-	return mnts, nil
+	return mnts, cleanups.Release(), nil
 }
 
 // setBindModeIfNull is platform specific processing which is a no-op on

+ 3 - 0
docs/api/version-history.md

@@ -17,6 +17,9 @@ keywords: "API, Docker, rcli, REST, documentation"
 
 [Docker Engine API v1.45](https://docs.docker.com/engine/api/v1.45/) documentation
 
+* `POST /containers/create` now supports `VolumeOptions.Subpath` which allows a
+  subpath of a named volume to be mounted.
+
 ## v1.44 API changes
 
 [Docker Engine API v1.44](https://docs.docker.com/engine/api/v1.44/) documentation

+ 25 - 0
integration/internal/container/container.go

@@ -170,3 +170,28 @@ func Inspect(ctx context.Context, t *testing.T, apiClient client.APIClient, cont
 
 	return c
 }
+
+type ContainerOutput struct {
+	Stdout, Stderr string
+}
+
+// Output waits for the container to end running and returns its output.
+func Output(ctx context.Context, client client.APIClient, id string) (ContainerOutput, error) {
+	logs, err := client.ContainerLogs(ctx, id, container.LogsOptions{Follow: true, ShowStdout: true, ShowStderr: true})
+	if err != nil {
+		return ContainerOutput{}, err
+	}
+
+	defer logs.Close()
+
+	var stdoutBuf, stderrBuf bytes.Buffer
+	_, err = stdcopy.StdCopy(&stdoutBuf, &stderrBuf, logs)
+	if err != nil {
+		return ContainerOutput{}, err
+	}
+
+	return ContainerOutput{
+		Stdout: stdoutBuf.String(),
+		Stderr: stderrBuf.String(),
+	}, nil
+}

+ 185 - 0
integration/volume/mount_test.go

@@ -0,0 +1,185 @@
+package volume
+
+import (
+	"context"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	containertypes "github.com/docker/docker/api/types/container"
+	"github.com/docker/docker/api/types/mount"
+	"github.com/docker/docker/api/types/network"
+	"github.com/docker/docker/api/types/versions"
+	"github.com/docker/docker/api/types/volume"
+	"github.com/docker/docker/client"
+	"github.com/docker/docker/integration/internal/container"
+	"github.com/docker/docker/internal/safepath"
+	"gotest.tools/v3/assert"
+	is "gotest.tools/v3/assert/cmp"
+	"gotest.tools/v3/skip"
+)
+
+func TestRunMountVolumeSubdir(t *testing.T) {
+	skip.If(t, versions.LessThan(testEnv.DaemonAPIVersion(), "1.45"), "skip test from new feature")
+
+	ctx := setupTest(t)
+	apiClient := testEnv.APIClient()
+
+	testVolumeName := setupTestVolume(t, apiClient)
+
+	for _, tc := range []struct {
+		name         string
+		opts         mount.VolumeOptions
+		cmd          []string
+		volumeTarget string
+		createErr    string
+		startErr     string
+		expected     string
+		skipPlatform string
+	}{
+		{name: "subdir", opts: mount.VolumeOptions{Subpath: "subdir"}, cmd: []string{"ls", "/volume"}, expected: "hello.txt"},
+		{name: "subdir link", opts: mount.VolumeOptions{Subpath: "hack/good"}, cmd: []string{"ls", "/volume"}, expected: "hello.txt"},
+		{name: "subdir with copy data", opts: mount.VolumeOptions{Subpath: "bin"}, volumeTarget: "/bin", cmd: []string{"ls", "/bin/busybox"}, expected: "/bin/busybox", skipPlatform: "windows:copy not supported on Windows"},
+		{name: "file", opts: mount.VolumeOptions{Subpath: "bar.txt"}, cmd: []string{"cat", "/volume"}, expected: "foo", skipPlatform: "windows:file bind mounts not supported on Windows"},
+		{name: "relative with backtracks", opts: mount.VolumeOptions{Subpath: "../../../../../../etc/passwd"}, cmd: []string{"cat", "/volume"}, createErr: "subpath must be a relative path within the volume"},
+		{name: "not existing", opts: mount.VolumeOptions{Subpath: "not-existing-path"}, cmd: []string{"cat", "/volume"}, startErr: (&safepath.ErrNotAccessible{}).Error()},
+
+		{name: "mount link", opts: mount.VolumeOptions{Subpath: filepath.Join("hack", "root")}, cmd: []string{"ls", "/volume"}, startErr: (&safepath.ErrEscapesBase{}).Error()},
+		{name: "mount link link", opts: mount.VolumeOptions{Subpath: filepath.Join("hack", "bad")}, cmd: []string{"ls", "/volume"}, startErr: (&safepath.ErrEscapesBase{}).Error()},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			if tc.skipPlatform != "" {
+				platform, reason, _ := strings.Cut(tc.skipPlatform, ":")
+				if testEnv.DaemonInfo.OSType == platform {
+					t.Skip(reason)
+				}
+			}
+
+			cfg := containertypes.Config{
+				Image: "busybox",
+				Cmd:   tc.cmd,
+			}
+			hostCfg := containertypes.HostConfig{
+				Mounts: []mount.Mount{
+					{
+						Type:          mount.TypeVolume,
+						Source:        testVolumeName,
+						Target:        "/volume",
+						VolumeOptions: &tc.opts,
+					},
+				},
+			}
+			if testEnv.DaemonInfo.OSType == "windows" {
+				hostCfg.Mounts[0].Target = `C:\volume`
+			}
+			if tc.volumeTarget != "" {
+				hostCfg.Mounts[0].Target = tc.volumeTarget
+			}
+
+			ctrName := strings.ReplaceAll(t.Name(), "/", "_")
+			create, creatErr := apiClient.ContainerCreate(ctx, &cfg, &hostCfg, &network.NetworkingConfig{}, nil, ctrName)
+			id := create.ID
+			if id != "" {
+				defer apiClient.ContainerRemove(ctx, id, containertypes.RemoveOptions{Force: true})
+			}
+
+			if tc.createErr != "" {
+				assert.ErrorContains(t, creatErr, tc.createErr)
+				return
+			}
+			assert.NilError(t, creatErr, "container creation failed")
+
+			startErr := apiClient.ContainerStart(ctx, id, containertypes.StartOptions{})
+			if tc.startErr != "" {
+				assert.ErrorContains(t, startErr, tc.startErr)
+				return
+			}
+			assert.NilError(t, startErr)
+
+			output, err := container.Output(ctx, apiClient, id)
+			assert.Check(t, err)
+			t.Logf("stdout:\n%s", output.Stdout)
+			t.Logf("stderr:\n%s", output.Stderr)
+
+			inspect, err := apiClient.ContainerInspect(ctx, id)
+			if assert.Check(t, err) {
+				assert.Check(t, is.Equal(inspect.State.ExitCode, 0))
+			}
+
+			assert.Check(t, is.Equal(strings.TrimSpace(output.Stderr), ""))
+			assert.Check(t, is.Equal(strings.TrimSpace(output.Stdout), tc.expected))
+		})
+	}
+}
+
+// setupTestVolume sets up a volume with:
+// .
+// |-- bar.txt                        (file with "foo")
+// |-- bin                            (directory)
+// |-- subdir                         (directory)
+// |   |-- hello.txt                  (file with "world")
+// |-- hack                           (directory)
+// |   |-- root                       (symlink to /)
+// |   |-- good                       (symlink to ../subdir)
+// |   |-- bad                        (symlink to root)
+func setupTestVolume(t *testing.T, client client.APIClient) string {
+	t.Helper()
+	ctx := context.Background()
+
+	volumeName := t.Name() + "-volume"
+
+	err := client.VolumeRemove(ctx, volumeName, true)
+	assert.NilError(t, err, "failed to clean volume")
+
+	_, err = client.VolumeCreate(ctx, volume.CreateOptions{
+		Name: volumeName,
+	})
+	assert.NilError(t, err, "failed to setup volume")
+
+	mount := mount.Mount{
+		Type:   mount.TypeVolume,
+		Source: volumeName,
+		Target: "/volume",
+	}
+
+	rootFs := "/"
+	if testEnv.DaemonInfo.OSType == "windows" {
+		mount.Target = `C:\volume`
+		rootFs = `C:`
+	}
+
+	initCmd := "echo foo > /volume/bar.txt && " +
+		"mkdir /volume/bin && " +
+		"mkdir /volume/subdir && " +
+		"echo world > /volume/subdir/hello.txt && " +
+		"mkdir /volume/hack && " +
+		"ln -s " + rootFs + " /volume/hack/root && " +
+		"ln -s ../subdir /volume/hack/good && " +
+		"ln -s root /volume/hack/bad &&" +
+		"mkdir /volume/hack/iwanttobehackedwithtoctou"
+
+	opts := []func(*container.TestContainerConfig){
+		container.WithMount(mount),
+		container.WithCmd("sh", "-c", initCmd+"; ls -lah /volume /volume/hack/"),
+	}
+	if testEnv.DaemonInfo.OSType == "windows" {
+		// Can't create symlinks under HyperV isolation
+		opts = append(opts, container.WithIsolation(containertypes.IsolationProcess))
+	}
+
+	cid := container.Run(ctx, t, client, opts...)
+	defer client.ContainerRemove(ctx, cid, containertypes.RemoveOptions{Force: true})
+	output, err := container.Output(ctx, client, cid)
+
+	t.Logf("Setup stderr:\n%s", output.Stderr)
+	t.Logf("Setup stdout:\n%s", output.Stdout)
+
+	assert.NilError(t, err)
+	assert.Assert(t, is.Equal(output.Stderr, ""))
+
+	inspect, err := client.ContainerInspect(ctx, cid)
+	assert.NilError(t, err)
+	assert.Assert(t, is.Equal(inspect.State.ExitCode, 0))
+
+	return volumeName
+}

+ 66 - 0
internal/safepath/common.go

@@ -0,0 +1,66 @@
+package safepath
+
+import (
+	"os"
+	"path/filepath"
+
+	"github.com/pkg/errors"
+)
+
+// evaluatePath evaluates symlinks in the concatenation of path and subpath. If
+// err is nil, resolvedBasePath will contain result of resolving all symlinks
+// in the given path, and resolvedSubpath will contain a relative path rooted
+// at the resolvedBasePath pointing to the concatenation after resolving all
+// symlinks.
+func evaluatePath(path, subpath string) (resolvedBasePath string, resolvedSubpath string, err error) {
+	baseResolved, err := filepath.EvalSymlinks(path)
+	if err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			return "", "", &ErrNotAccessible{Path: path, Cause: err}
+		}
+		return "", "", errors.Wrapf(err, "error while resolving symlinks in base directory %q", path)
+	}
+
+	combinedPath := filepath.Join(baseResolved, subpath)
+	combinedResolved, err := filepath.EvalSymlinks(combinedPath)
+	if err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			return "", "", &ErrNotAccessible{Path: combinedPath, Cause: err}
+		}
+		return "", "", errors.Wrapf(err, "error while resolving symlinks in combined path %q", combinedPath)
+	}
+
+	subpart, err := filepath.Rel(baseResolved, combinedResolved)
+	if err != nil {
+		return "", "", &ErrEscapesBase{Base: baseResolved, Subpath: subpath}
+	}
+
+	if !filepath.IsLocal(subpart) {
+		return "", "", &ErrEscapesBase{Base: baseResolved, Subpath: subpath}
+	}
+
+	return baseResolved, subpart, nil
+}
+
+// isLocalTo reports whether path, using lexical analysis only, has all of these properties:
+//   - is within the subtree rooted at basepath
+//   - is not empty
+//   - on Windows, is not a reserved name such as "NUL"
+//
+// If isLocalTo(path, basepath) returns true, then
+//
+//	filepath.Rel(basepath, path)
+//
+// will always produce an unrooted path with no `..` elements.
+//
+// isLocalTo is a purely lexical operation. In particular, it does not account for the effect of any symbolic links that may exist in the filesystem.
+//
+// Both path and basepath are expected to be absolute paths.
+func isLocalTo(path, basepath string) bool {
+	rel, err := filepath.Rel(basepath, path)
+	if err != nil {
+		return false
+	}
+
+	return filepath.IsLocal(rel)
+}

+ 31 - 0
internal/safepath/common_test.go

@@ -0,0 +1,31 @@
+package safepath
+
+import (
+	"testing"
+
+	"gotest.tools/v3/assert"
+	is "gotest.tools/v3/assert/cmp"
+)
+
+func TestIsLocalTo(t *testing.T) {
+	for _, tc := range []struct {
+		name    string
+		subpath string
+		result  bool
+	}{
+		{name: "same", subpath: "/volume", result: true},
+		{name: "1 level subpath", subpath: "/volume/sub", result: true},
+		{name: "2 level subpath", subpath: "/volume/sub/path", result: true},
+		{name: "absolute", subpath: "/etc/passwd", result: false},
+		{name: "backtrack", subpath: "/volume/../", result: false},
+		{name: "backtrack inside", subpath: "/volume/sub/../", result: true},
+		{name: "relative path", subpath: "./rel", result: false},
+		{name: "file with dots", subpath: "/volume/file..with.dots", result: true},
+		{name: "file starting with dots", subpath: "/volume/..file", result: true},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			result := isLocalTo(tc.subpath, "/volume")
+			assert.Check(t, is.Equal(result, tc.result))
+		})
+	}
+}

+ 42 - 0
internal/safepath/errors.go

@@ -0,0 +1,42 @@
+package safepath
+
+// ErrNotAccessible is returned by Join when the resulting path doesn't exist,
+// is not accessible, or any of the path components was replaced with a symlink
+// during the path traversal.
+type ErrNotAccessible struct {
+	Path  string
+	Cause error
+}
+
+func (*ErrNotAccessible) NotFound() {}
+
+func (e *ErrNotAccessible) Unwrap() error {
+	return e.Cause
+}
+
+func (e *ErrNotAccessible) Error() string {
+	msg := "cannot access path " + e.Path
+	if e.Cause != nil {
+		msg += ": " + e.Cause.Error()
+	}
+	return msg
+}
+
+// ErrEscapesBase is returned by Join when the resulting concatenation would
+// point outside of the specified base directory.
+type ErrEscapesBase struct {
+	Base, Subpath string
+}
+
+func (*ErrEscapesBase) InvalidParameter() {}
+
+func (e *ErrEscapesBase) Error() string {
+	msg := "path concatenation escapes the base directory"
+	if e.Base != "" {
+		msg += ", base: " + e.Base
+	}
+	if e.Subpath != "" {
+		msg += ", subpath: " + e.Subpath
+	}
+	return msg
+}

+ 144 - 0
internal/safepath/join_test.go

@@ -0,0 +1,144 @@
+package safepath
+
+import (
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"testing"
+
+	"gotest.tools/v3/assert"
+	is "gotest.tools/v3/assert/cmp"
+)
+
+func TestJoinEscapingSymlink(t *testing.T) {
+	type testCase struct {
+		name   string
+		target string
+	}
+	var cases []testCase
+
+	if runtime.GOOS == "windows" {
+		cases = []testCase{
+			{name: "root", target: `C:\`},
+			{name: "absolute file", target: `C:\Windows\System32\cmd.exe`},
+		}
+	} else {
+		cases = []testCase{
+			{name: "root", target: "/"},
+			{name: "absolute file", target: "/etc/passwd"},
+		}
+	}
+	cases = append(cases, testCase{name: "relative", target: "../../"})
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			tempDir := t.TempDir()
+			dir, err := filepath.EvalSymlinks(tempDir)
+			assert.NilError(t, err, "filepath.EvalSymlinks failed for temporary directory %s", tempDir)
+
+			err = os.Symlink(tc.target, filepath.Join(dir, "link"))
+			assert.NilError(t, err, "failed to create symlink to %s", tc.target)
+
+			safe, err := Join(dir, "link")
+			if err == nil {
+				safe.Close()
+			}
+			assert.ErrorType(t, err, &ErrEscapesBase{})
+		})
+	}
+}
+
+func TestJoinGoodSymlink(t *testing.T) {
+	tempDir := t.TempDir()
+	dir, err := filepath.EvalSymlinks(tempDir)
+	assert.NilError(t, err, "filepath.EvalSymlinks failed for temporary directory %s", tempDir)
+
+	assert.Assert(t, os.WriteFile(filepath.Join(dir, "foo"), []byte("bar"), 0o744), "failed to create file 'foo'")
+	assert.Assert(t, os.Mkdir(filepath.Join(dir, "subdir"), 0o744), "failed to create directory 'subdir'")
+	assert.Assert(t, os.WriteFile(filepath.Join(dir, "subdir/hello.txt"), []byte("world"), 0o744), "failed to create file 'subdir/hello.txt'")
+
+	assert.Assert(t, os.Symlink(filepath.Join(dir, "subdir"), filepath.Join(dir, "subdir_link_absolute")), "failed to create absolute symlink to directory 'subdir'")
+	assert.Assert(t, os.Symlink("subdir", filepath.Join(dir, "subdir_link_relative")), "failed to create relative symlink to directory 'subdir'")
+
+	assert.Assert(t, os.Symlink(filepath.Join(dir, "foo"), filepath.Join(dir, "foo_link_absolute")), "failed to create absolute symlink to file 'foo'")
+	assert.Assert(t, os.Symlink("foo", filepath.Join(dir, "foo_link_relative")), "failed to create relative symlink to file 'foo'")
+
+	for _, target := range []string{
+		"foo", "subdir",
+		"subdir_link_absolute", "foo_link_absolute",
+		"subdir_link_relative", "foo_link_relative",
+	} {
+		t.Run(target, func(t *testing.T) {
+			safe, err := Join(dir, target)
+			assert.NilError(t, err)
+
+			defer safe.Close()
+			if strings.HasPrefix(target, "subdir") {
+				data, err := os.ReadFile(filepath.Join(safe.Path(), "hello.txt"))
+				assert.NilError(t, err)
+				assert.Assert(t, is.Equal(string(data), "world"))
+			}
+		})
+	}
+}
+
+func TestJoinWithSymlinkReplace(t *testing.T) {
+	tempDir := t.TempDir()
+	dir, err := filepath.EvalSymlinks(tempDir)
+	assert.NilError(t, err, "filepath.EvalSymlinks failed for temporary directory %s", tempDir)
+
+	link := filepath.Join(dir, "link")
+	target := filepath.Join(dir, "foo")
+
+	err = os.WriteFile(target, []byte("bar"), 0o744)
+	assert.NilError(t, err, "failed to create test file")
+
+	err = os.Symlink(target, link)
+	assert.Check(t, err, "failed to create symlink to foo")
+
+	safe, err := Join(dir, "link")
+	assert.NilError(t, err)
+
+	defer safe.Close()
+
+	// Delete the link target.
+	err = os.Remove(target)
+	if runtime.GOOS == "windows" {
+		// On Windows it shouldn't be possible.
+		assert.Assert(t, is.ErrorType(err, &os.PathError{}), "link shouldn't be deletable before cleanup")
+	} else {
+		// On Linux we can delete it just fine.
+		assert.NilError(t, err, "failed to remove symlink")
+
+		// Replace target with a symlink to /etc/paswd
+		err = os.Symlink("/etc/passwd", target)
+		assert.NilError(t, err, "failed to create symlink")
+	}
+
+	// The returned safe path should still point to the old file.
+	data, err := os.ReadFile(safe.Path())
+	assert.NilError(t, err, "failed to read file")
+
+	assert.Check(t, is.Equal(string(data), "bar"))
+
+}
+
+func TestJoinCloseInvalidates(t *testing.T) {
+	tempDir := t.TempDir()
+	dir, err := filepath.EvalSymlinks(tempDir)
+	assert.NilError(t, err)
+
+	foo := filepath.Join(dir, "foo")
+	err = os.WriteFile(foo, []byte("bar"), 0o744)
+	assert.NilError(t, err, "failed to create test file")
+
+	safe, err := Join(dir, "foo")
+	assert.NilError(t, err)
+
+	assert.Check(t, safe.IsValid())
+
+	assert.NilError(t, safe.Close())
+
+	assert.Check(t, !safe.IsValid())
+}

+ 63 - 0
internal/safepath/safepath.go

@@ -0,0 +1,63 @@
+package safepath
+
+import (
+	"context"
+	"fmt"
+	"sync"
+
+	"github.com/containerd/log"
+)
+
+type SafePath struct {
+	path    string
+	cleanup func() error
+	mutex   sync.Mutex
+
+	// Immutable fields
+	sourceBase, sourceSubpath string
+}
+
+// Close releases the resources used by the path.
+func (s *SafePath) Close() error {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+
+	if s.path == "" {
+		base, sub := s.SourcePath()
+		log.G(context.TODO()).WithFields(log.Fields{
+			"path":          s.Path(),
+			"sourceBase":    base,
+			"sourceSubpath": sub,
+		}).Warn("an attempt to close an already closed SafePath")
+		return nil
+	}
+
+	s.path = ""
+	if s.cleanup != nil {
+		return s.cleanup()
+	}
+	return nil
+}
+
+// IsValid return true when path can still be used and wasn't cleaned up by Close.
+func (s *SafePath) IsValid() bool {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return s.path != ""
+}
+
+// Path returns a safe, temporary path that can be used to access the original path.
+func (s *SafePath) Path() string {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	if s.path == "" {
+		panic(fmt.Sprintf("use-after-close attempted for safepath with source [%s, %s]", s.sourceBase, s.sourceSubpath))
+	}
+	return s.path
+}
+
+// SourcePath returns the source path the safepath points to.
+func (s *SafePath) SourcePath() (string, string) {
+	// No mutex lock because these are immutable.
+	return s.sourceBase, s.sourceSubpath
+}

+ 11 - 1
volume/mounts/linux_parser.go

@@ -94,8 +94,18 @@ func (p *linuxParser) validateMountConfigImpl(mnt *mount.Mount, validateBindSour
 		if mnt.BindOptions != nil {
 			return &errMountConfig{mnt, errExtraField("BindOptions")}
 		}
+		anonymousVolume := len(mnt.Source) == 0
 
-		if len(mnt.Source) == 0 && mnt.ReadOnly {
+		if mnt.VolumeOptions != nil && mnt.VolumeOptions.Subpath != "" {
+			if anonymousVolume {
+				return &errMountConfig{mnt, errAnonymousVolumeWithSubpath}
+			}
+
+			if !filepath.IsLocal(mnt.VolumeOptions.Subpath) {
+				return &errMountConfig{mnt, errInvalidSubpath}
+			}
+		}
+		if mnt.ReadOnly && anonymousVolume {
 			return &errMountConfig{mnt, fmt.Errorf("must not set ReadOnly mode when using anonymous volumes")}
 		}
 	case mount.TypeTmpfs:

+ 70 - 10
volume/mounts/mounts.go

@@ -9,6 +9,7 @@ import (
 
 	"github.com/containerd/log"
 	mounttypes "github.com/docker/docker/api/types/mount"
+	"github.com/docker/docker/internal/safepath"
 	"github.com/docker/docker/pkg/idtools"
 	"github.com/docker/docker/pkg/stringid"
 	"github.com/docker/docker/volume"
@@ -74,14 +75,34 @@ type MountPoint struct {
 	// Specifically needed for containers which are running and calls to `docker cp`
 	// because both these actions require mounting the volumes.
 	active int
+
+	// SafePaths created by Setup that should be cleaned up before unmounting
+	// the volume.
+	safePaths []*safepath.SafePath
 }
 
-// Cleanup frees resources used by the mountpoint
+// Cleanup frees resources used by the mountpoint and cleans up all the paths
+// returned by Setup that hasn't been cleaned up by the caller.
 func (m *MountPoint) Cleanup() error {
 	if m.Volume == nil || m.ID == "" {
 		return nil
 	}
 
+	for _, p := range m.safePaths {
+		if !p.IsValid() {
+			continue
+		}
+
+		err := p.Close()
+		base, sub := p.SourcePath()
+		log.G(context.TODO()).WithFields(log.Fields{
+			"error":         err,
+			"path":          p.Path(),
+			"sourceBase":    base,
+			"sourceSubpath": sub,
+		}).Warn("cleaning up SafePath that hasn't been cleaned up by the caller")
+	}
+
 	if err := m.Volume.Unmount(m.ID); err != nil {
 		return errors.Wrapf(err, "error unmounting volume %s", m.Volume.Name())
 	}
@@ -97,9 +118,17 @@ func (m *MountPoint) Cleanup() error {
 // configured, or creating the source directory if supplied.
 // The, optional, checkFun parameter allows doing additional checking
 // before creating the source directory on the host.
-func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun func(m *MountPoint) error) (path string, retErr error) {
+//
+// The returned path can be a temporary path, caller is responsible to
+// call the returned cleanup function as soon as the path is not needed.
+// Cleanup doesn't unmount the underlying volumes (if any), it only
+// frees up the resources that were needed to guarantee that the path
+// still points to the same target (to avoid TOCTOU attack).
+//
+// Cleanup function doesn't need to be called when error is returned.
+func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun func(m *MountPoint) error) (path string, cleanup func() error, retErr error) {
 	if m.SkipMountpointCreation {
-		return m.Source, nil
+		return m.Source, noCleanup, nil
 	}
 
 	defer func() {
@@ -107,16 +136,24 @@ func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun
 			return
 		}
 
-		sourcePath, err := filepath.EvalSymlinks(m.Source)
+		sourcePath, err := filepath.EvalSymlinks(path)
 		if err != nil {
 			path = ""
 			retErr = errors.Wrapf(err, "error evaluating symlinks from mount source %q", m.Source)
+			if cleanupErr := cleanup(); cleanupErr != nil {
+				log.G(context.TODO()).WithError(cleanupErr).Warn("failed to cleanup after error")
+			}
+			cleanup = noCleanup
 			return
 		}
 		err = label.Relabel(sourcePath, mountLabel, label.IsShared(m.Mode))
 		if err != nil && !errors.Is(err, syscall.ENOTSUP) {
 			path = ""
 			retErr = errors.Wrapf(err, "error setting label on mount source '%s'", sourcePath)
+			if cleanupErr := cleanup(); cleanupErr != nil {
+				log.G(context.TODO()).WithError(cleanupErr).Warn("failed to cleanup after error")
+			}
+			cleanup = noCleanup
 		}
 	}()
 
@@ -127,16 +164,34 @@ func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun
 		}
 		volumePath, err := m.Volume.Mount(id)
 		if err != nil {
-			return "", errors.Wrapf(err, "error while mounting volume '%s'", m.Source)
+			return "", noCleanup, errors.Wrapf(err, "error while mounting volume '%s'", m.Source)
 		}
 
 		m.ID = id
+		clean := noCleanup
+		if m.Spec.VolumeOptions != nil && m.Spec.VolumeOptions.Subpath != "" {
+			subpath := m.Spec.VolumeOptions.Subpath
+
+			safePath, err := safepath.Join(volumePath, subpath)
+			if err != nil {
+				if err := m.Volume.Unmount(id); err != nil {
+					log.G(context.TODO()).WithError(err).Error("failed to unmount after safepath.Join failed")
+				}
+				return "", noCleanup, err
+			}
+			m.safePaths = append(m.safePaths, safePath)
+			log.G(context.TODO()).Debugf("mounting (%s|%s) via %s", volumePath, subpath, safePath.Path())
+
+			clean = safePath.Close
+			volumePath = safePath.Path()
+		}
+
 		m.active++
-		return volumePath, nil
+		return volumePath, clean, nil
 	}
 
 	if len(m.Source) == 0 {
-		return "", fmt.Errorf("Unable to setup mount point, neither source nor volume defined")
+		return "", noCleanup, fmt.Errorf("Unable to setup mount point, neither source nor volume defined")
 	}
 
 	if m.Type == mounttypes.TypeBind {
@@ -145,7 +200,7 @@ func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun
 		// the process of shutting down.
 		if checkFun != nil {
 			if err := checkFun(m); err != nil {
-				return "", err
+				return "", noCleanup, err
 			}
 		}
 
@@ -154,12 +209,12 @@ func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun
 		if err := idtools.MkdirAllAndChownNew(m.Source, 0o755, rootIDs); err != nil {
 			if perr, ok := err.(*os.PathError); ok {
 				if perr.Err != syscall.ENOTDIR {
-					return "", errors.Wrapf(err, "error while creating mount source path '%s'", m.Source)
+					return "", noCleanup, errors.Wrapf(err, "error while creating mount source path '%s'", m.Source)
 				}
 			}
 		}
 	}
-	return m.Source, nil
+	return m.Source, noCleanup, nil
 }
 
 func (m *MountPoint) LiveRestore(ctx context.Context) error {
@@ -203,3 +258,8 @@ func errInvalidMode(mode string) error {
 func errInvalidSpec(spec string) error {
 	return errors.Errorf("invalid volume specification: '%s'", spec)
 }
+
+// noCleanup is a no-op cleanup function.
+func noCleanup() error {
+	return nil
+}

+ 8 - 0
volume/mounts/parser.go

@@ -11,6 +11,14 @@ import (
 // It's used by both LCOW and Linux parsers.
 var ErrVolumeTargetIsRoot = errors.New("invalid specification: destination can't be '/'")
 
+// errAnonymousVolumeWithSubpath is returned when Subpath is specified for
+// anonymous volume.
+var errAnonymousVolumeWithSubpath = errors.New("must not set Subpath when using anonymous volumes")
+
+// errInvalidSubpath is returned when the provided Subpath is not lexically an
+// relative path within volume.
+var errInvalidSubpath = errors.New("subpath must be a relative path within the volume")
+
 // read-write modes
 var rwModes = map[string]bool{
 	"rw": true,

+ 3 - 0
volume/mounts/validate_test.go

@@ -28,6 +28,9 @@ func TestValidateMount(t *testing.T) {
 		{
 			input: mount.Mount{Type: mount.TypeVolume, Target: testDestinationPath},
 		},
+		{
+			input: mount.Mount{Type: mount.TypeVolume, Target: testDestinationPath, Source: "hello", VolumeOptions: &mount.VolumeOptions{Subpath: "world"}},
+		},
 		{
 			input:    mount.Mount{Type: mount.TypeBind},
 			expected: errMissingField("Target"),

+ 14 - 1
volume/mounts/windows_parser.go

@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"path/filepath"
 	"regexp"
 	"runtime"
 	"strings"
@@ -258,7 +259,19 @@ func (p *windowsParser) validateMountConfigReg(mnt *mount.Mount, additionalValid
 			return &errMountConfig{mnt, errExtraField("BindOptions")}
 		}
 
-		if len(mnt.Source) == 0 && mnt.ReadOnly {
+		anonymousVolume := len(mnt.Source) == 0
+		if mnt.VolumeOptions != nil && mnt.VolumeOptions.Subpath != "" {
+			if anonymousVolume {
+				return errAnonymousVolumeWithSubpath
+			}
+
+			// Check if path is relative but without any back traversals
+			if !filepath.IsLocal(mnt.VolumeOptions.Subpath) {
+				return &errMountConfig{mnt, errInvalidSubpath}
+			}
+		}
+
+		if anonymousVolume && mnt.ReadOnly {
 			return &errMountConfig{mnt, fmt.Errorf("must not set ReadOnly mode when using anonymous volumes")}
 		}