瀏覽代碼

Merge pull request #45687 from vvoland/volume-mount-subpath

volumes: Implement subpath mount
Sebastiaan van Stijn 1 年之前
父節點
當前提交
31ccdbb7a8

+ 8 - 0
api/server/router/container/container_routes.go

@@ -628,6 +628,14 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo
 		}
 	}
 
+	if versions.LessThan(version, "1.45") {
+		for _, m := range hostConfig.Mounts {
+			if m.VolumeOptions != nil && m.VolumeOptions.Subpath != "" {
+				return errdefs.InvalidParameter(errors.New("VolumeOptions.Subpath needs API v1.45 or newer"))
+			}
+		}
+	}
+
 	var warnings []string
 	if warn, err := handleMACAddressBC(config, hostConfig, networkingConfig, version); err != nil {
 		return err

+ 4 - 0
api/swagger.yaml

@@ -423,6 +423,10 @@ definitions:
                 type: "object"
                 additionalProperties:
                   type: "string"
+          Subpath:
+            description: "Source path inside the volume. Must be relative without any back traversals."
+            type: "string"
+            example: "dir-inside-volume/subdirectory"
       TmpfsOptions:
         description: "Optional configuration for the `tmpfs` type."
         type: "object"

+ 1 - 0
api/types/mount/mount.go

@@ -96,6 +96,7 @@ type BindOptions struct {
 type VolumeOptions struct {
 	NoCopy       bool              `json:",omitempty"`
 	Labels       map[string]string `json:",omitempty"`
+	Subpath      string            `json:",omitempty"`
 	DriverConfig *Driver           `json:",omitempty"`
 }
 

+ 2 - 2
container/container.go

@@ -514,14 +514,14 @@ func (container *Container) AddMountPointWithVolume(destination string, vol volu
 }
 
 // UnmountVolumes unmounts all volumes
-func (container *Container) UnmountVolumes(volumeEventLog func(name string, action events.Action, attributes map[string]string)) error {
+func (container *Container) UnmountVolumes(ctx context.Context, volumeEventLog func(name string, action events.Action, attributes map[string]string)) error {
 	var errs []string
 	for _, volumeMount := range container.MountPoints {
 		if volumeMount.Volume == nil {
 			continue
 		}
 
-		if err := volumeMount.Cleanup(); err != nil {
+		if err := volumeMount.Cleanup(ctx); err != nil {
 			errs = append(errs, err.Error())
 			continue
 		}

+ 9 - 30
container/container_unix.go

@@ -15,8 +15,6 @@ import (
 	"github.com/docker/docker/api/types/events"
 	mounttypes "github.com/docker/docker/api/types/mount"
 	swarmtypes "github.com/docker/docker/api/types/swarm"
-	"github.com/docker/docker/pkg/stringid"
-	"github.com/docker/docker/volume"
 	volumemounts "github.com/docker/docker/volume/mounts"
 	"github.com/moby/sys/mount"
 	"github.com/opencontainers/selinux/go-selinux/label"
@@ -129,34 +127,11 @@ func (container *Container) NetworkMounts() []Mount {
 }
 
 // CopyImagePathContent copies files in destination to the volume.
-func (container *Container) CopyImagePathContent(v volume.Volume, destination string) error {
-	rootfs, err := container.GetResourcePath(destination)
-	if err != nil {
-		return err
-	}
-
-	if _, err := os.Stat(rootfs); err != nil {
-		if os.IsNotExist(err) {
-			return nil
-		}
-		return err
-	}
-
-	id := stringid.GenerateRandomID()
-	path, err := v.Mount(id)
-	if err != nil {
-		return err
-	}
-
-	defer func() {
-		if err := v.Unmount(id); err != nil {
-			log.G(context.TODO()).Warnf("error while unmounting volume %s: %v", v.Name(), err)
-		}
-	}()
-	if err := label.Relabel(path, container.MountLabel, true); err != nil && !errors.Is(err, syscall.ENOTSUP) {
+func (container *Container) CopyImagePathContent(volumePath, destination string) error {
+	if err := label.Relabel(volumePath, container.MountLabel, true); err != nil && !errors.Is(err, syscall.ENOTSUP) {
 		return err
 	}
-	return copyExistingContents(rootfs, path)
+	return copyExistingContents(destination, volumePath)
 }
 
 // ShmResourcePath returns path to shm
@@ -396,7 +371,7 @@ func (container *Container) DetachAndUnmount(volumeEventLog func(name string, ac
 				Warn("Unable to unmount")
 		}
 	}
-	return container.UnmountVolumes(volumeEventLog)
+	return container.UnmountVolumes(ctx, volumeEventLog)
 }
 
 // ignoreUnsupportedXAttrs ignores errors when extended attributes
@@ -419,9 +394,13 @@ func copyExistingContents(source, destination string) error {
 		return err
 	}
 	if len(dstList) != 0 {
-		// destination is not empty, do not copy
+		log.G(context.TODO()).WithFields(log.Fields{
+			"source":      source,
+			"destination": destination,
+		}).Debug("destination is not empty, do not copy")
 		return nil
 	}
+
 	return fs.CopyDir(destination, source, ignoreUnsupportedXAttrs())
 }
 

+ 2 - 1
container/container_windows.go

@@ -1,6 +1,7 @@
 package container // import "github.com/docker/docker/container"
 
 import (
+	"context"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -128,7 +129,7 @@ func (container *Container) ConfigMounts() []Mount {
 // On Windows it only delegates to `UnmountVolumes` since there is nothing to
 // force unmount.
 func (container *Container) DetachAndUnmount(volumeEventLog func(name string, action events.Action, attributes map[string]string)) error {
-	return container.UnmountVolumes(volumeEventLog)
+	return container.UnmountVolumes(context.TODO(), volumeEventLog)
 }
 
 // TmpfsMounts returns the list of tmpfs mounts

+ 39 - 0
daemon/container_operations_unix.go

@@ -99,6 +99,45 @@ func (daemon *Daemon) getPIDContainer(id string) (*container.Container, error) {
 	return ctr, nil
 }
 
+// setupContainerDirs sets up base container directories (root, ipc, tmpfs and secrets).
+func (daemon *Daemon) setupContainerDirs(c *container.Container) (_ []container.Mount, err error) {
+	if err := daemon.setupContainerMountsRoot(c); err != nil {
+		return nil, err
+	}
+
+	if err := daemon.setupIPCDirs(c); err != nil {
+		return nil, err
+	}
+
+	if err := daemon.setupSecretDir(c); err != nil {
+		return nil, err
+	}
+	defer func() {
+		if err != nil {
+			daemon.cleanupSecretDir(c)
+		}
+	}()
+
+	var ms []container.Mount
+	if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
+		ms = append(ms, c.IpcMounts()...)
+	}
+
+	tmpfsMounts, err := c.TmpfsMounts()
+	if err != nil {
+		return nil, err
+	}
+	ms = append(ms, tmpfsMounts...)
+
+	secretMounts, err := c.SecretMounts()
+	if err != nil {
+		return nil, err
+	}
+	ms = append(ms, secretMounts...)
+
+	return ms, nil
+}
+
 func (daemon *Daemon) setupIPCDirs(c *container.Container) error {
 	ipcMode := c.HostConfig.IpcMode
 

+ 8 - 3
daemon/containerfs_linux.go

@@ -17,6 +17,7 @@ import (
 
 	"github.com/docker/docker/api/types"
 	"github.com/docker/docker/container"
+	"github.com/docker/docker/internal/compatcontext"
 	"github.com/docker/docker/internal/mounttree"
 	"github.com/docker/docker/internal/unshare"
 	"github.com/docker/docker/pkg/fileutils"
@@ -54,6 +55,8 @@ type containerFSView struct {
 
 // openContainerFS opens a new view of the container's filesystem.
 func (daemon *Daemon) openContainerFS(container *container.Container) (_ *containerFSView, err error) {
+	ctx := context.TODO()
+
 	if err := daemon.Mount(container); err != nil {
 		return nil, err
 	}
@@ -63,13 +66,15 @@ func (daemon *Daemon) openContainerFS(container *container.Container) (_ *contai
 		}
 	}()
 
-	mounts, err := daemon.setupMounts(container)
+	mounts, cleanup, err := daemon.setupMounts(ctx, container)
 	if err != nil {
 		return nil, err
 	}
 	defer func() {
+		ctx := compatcontext.WithoutCancel(ctx)
+		cleanup(ctx)
 		if err != nil {
-			_ = container.UnmountVolumes(daemon.LogVolumeEvent)
+			_ = container.UnmountVolumes(ctx, daemon.LogVolumeEvent)
 		}
 	}()
 
@@ -207,7 +212,7 @@ func (vw *containerFSView) Close() error {
 	runtime.SetFinalizer(vw, nil)
 	close(vw.todo)
 	err := multierror.Append(nil, <-vw.done)
-	err = multierror.Append(err, vw.ctr.UnmountVolumes(vw.d.LogVolumeEvent))
+	err = multierror.Append(err, vw.ctr.UnmountVolumes(context.TODO(), vw.d.LogVolumeEvent))
 	err = multierror.Append(err, vw.d.Unmount(vw.ctr))
 	return err.ErrorOrNil()
 }

+ 1 - 1
daemon/create.go

@@ -222,7 +222,7 @@ func (daemon *Daemon) create(ctx context.Context, daemonCfg *config.Config, opts
 		return nil, err
 	}
 
-	if err := daemon.createContainerOSSpecificSettings(ctr, opts.params.Config, opts.params.HostConfig); err != nil {
+	if err := daemon.createContainerOSSpecificSettings(ctx, ctr, opts.params.Config, opts.params.HostConfig); err != nil {
 		return nil, err
 	}
 

+ 41 - 6
daemon/create_unix.go

@@ -12,13 +12,17 @@ import (
 	containertypes "github.com/docker/docker/api/types/container"
 	mounttypes "github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/container"
+	"github.com/docker/docker/errdefs"
+	"github.com/docker/docker/internal/compatcontext"
 	"github.com/docker/docker/oci"
+	volumemounts "github.com/docker/docker/volume/mounts"
 	volumeopts "github.com/docker/docker/volume/service/opts"
 	"github.com/opencontainers/selinux/go-selinux/label"
+	"github.com/pkg/errors"
 )
 
 // createContainerOSSpecificSettings performs host-OS specific container create functionality
-func (daemon *Daemon) createContainerOSSpecificSettings(container *container.Container, config *containertypes.Config, hostConfig *containertypes.HostConfig) error {
+func (daemon *Daemon) createContainerOSSpecificSettings(ctx context.Context, container *container.Container, config *containertypes.Config, hostConfig *containertypes.HostConfig) error {
 	if err := daemon.Mount(container); err != nil {
 		return err
 	}
@@ -45,7 +49,7 @@ func (daemon *Daemon) createContainerOSSpecificSettings(container *container.Con
 		// Skip volumes for which we already have something mounted on that
 		// destination because of a --volume-from.
 		if container.HasMountFor(destination) {
-			log.G(context.TODO()).WithField("container", container.ID).WithField("destination", spec).Debug("mountpoint already exists, skipping anonymous volume")
+			log.G(ctx).WithField("container", container.ID).WithField("destination", spec).Debug("mountpoint already exists, skipping anonymous volume")
 			// Not an error, this could easily have come from the image config.
 			continue
 		}
@@ -70,12 +74,12 @@ func (daemon *Daemon) createContainerOSSpecificSettings(container *container.Con
 
 		container.AddMountPointWithVolume(destination, &volumeWrapper{v: v, s: daemon.volumes}, true)
 	}
-	return daemon.populateVolumes(container)
+	return daemon.populateVolumes(ctx, container)
 }
 
 // populateVolumes copies data from the container's rootfs into the volume for non-binds.
 // this is only called when the container is created.
-func (daemon *Daemon) populateVolumes(c *container.Container) error {
+func (daemon *Daemon) populateVolumes(ctx context.Context, c *container.Container) error {
 	for _, mnt := range c.MountPoints {
 		if mnt.Volume == nil {
 			continue
@@ -85,10 +89,41 @@ func (daemon *Daemon) populateVolumes(c *container.Container) error {
 			continue
 		}
 
-		log.G(context.TODO()).Debugf("copying image data from %s:%s, to %s", c.ID, mnt.Destination, mnt.Name)
-		if err := c.CopyImagePathContent(mnt.Volume, mnt.Destination); err != nil {
+		if err := daemon.populateVolume(ctx, c, mnt); err != nil {
 			return err
 		}
 	}
 	return nil
 }
+
+func (daemon *Daemon) populateVolume(ctx context.Context, c *container.Container, mnt *volumemounts.MountPoint) error {
+	ctrDestPath, err := c.GetResourcePath(mnt.Destination)
+	if err != nil {
+		return err
+	}
+
+	if _, err := os.Stat(ctrDestPath); err != nil {
+		if os.IsNotExist(err) {
+			return nil
+		}
+		return err
+	}
+
+	volumePath, cleanup, err := mnt.Setup(ctx, c.MountLabel, daemon.idMapping.RootPair(), nil)
+	if err != nil {
+		if errdefs.IsNotFound(err) {
+			return nil
+		}
+		log.G(ctx).WithError(err).Debugf("can't copy data from %s:%s, to %s", c.ID, mnt.Destination, volumePath)
+		return errors.Wrapf(err, "failed to populate volume")
+	}
+	defer mnt.Cleanup(compatcontext.WithoutCancel(ctx))
+	defer cleanup(compatcontext.WithoutCancel(ctx))
+
+	log.G(ctx).Debugf("copying image data from %s:%s, to %s", c.ID, mnt.Destination, volumePath)
+	if err := c.CopyImagePathContent(volumePath, ctrDestPath); err != nil {
+		return err
+	}
+
+	return nil
+}

+ 2 - 2
daemon/create_windows.go

@@ -11,7 +11,7 @@ import (
 )
 
 // createContainerOSSpecificSettings performs host-OS specific container create functionality
-func (daemon *Daemon) createContainerOSSpecificSettings(container *container.Container, config *containertypes.Config, hostConfig *containertypes.HostConfig) error {
+func (daemon *Daemon) createContainerOSSpecificSettings(ctx context.Context, container *container.Container, config *containertypes.Config, hostConfig *containertypes.HostConfig) error {
 	if containertypes.Isolation.IsDefault(hostConfig.Isolation) {
 		// Make sure the host config has the default daemon isolation if not specified by caller.
 		hostConfig.Isolation = daemon.defaultIsolation
@@ -34,7 +34,7 @@ func (daemon *Daemon) createContainerOSSpecificSettings(container *container.Con
 
 		// Create the volume in the volume driver. If it doesn't exist,
 		// a new one will be created.
-		v, err := daemon.volumes.Create(context.TODO(), "", volumeDriver, volumeopts.WithCreateReference(container.ID))
+		v, err := daemon.volumes.Create(ctx, "", volumeDriver, volumeopts.WithCreateReference(container.ID))
 		if err != nil {
 			return err
 		}

+ 1 - 1
daemon/daemon.go

@@ -466,7 +466,7 @@ func (daemon *Daemon) restore(cfg *configStore) error {
 						ces.ExitCode = 255
 					}
 					c.SetStopped(&ces)
-					daemon.Cleanup(c)
+					daemon.Cleanup(context.TODO(), c)
 					if err := c.CheckpointTo(daemon.containersReplica); err != nil {
 						baseLogger.WithError(err).Error("failed to update stopped container state")
 					}

+ 1 - 1
daemon/monitor.go

@@ -89,7 +89,7 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontaine
 		"exitCode":     strconv.Itoa(exitStatus.ExitCode),
 		"execDuration": strconv.Itoa(int(execDuration.Seconds())),
 	}
-	daemon.Cleanup(c)
+	daemon.Cleanup(context.TODO(), c)
 
 	if restart {
 		c.RestartCount++

+ 3 - 42
daemon/oci_linux.go

@@ -532,47 +532,8 @@ func inSlice(slice []string, s string) bool {
 }
 
 // withMounts sets the container's mounts
-func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container) coci.SpecOpts {
+func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container, ms []container.Mount) coci.SpecOpts {
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
-		if err := daemon.setupContainerMountsRoot(c); err != nil {
-			return err
-		}
-
-		if err := daemon.setupIPCDirs(c); err != nil {
-			return err
-		}
-
-		defer func() {
-			if err != nil {
-				daemon.cleanupSecretDir(c)
-			}
-		}()
-
-		if err := daemon.setupSecretDir(c); err != nil {
-			return err
-		}
-
-		ms, err := daemon.setupMounts(c)
-		if err != nil {
-			return err
-		}
-
-		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
-			ms = append(ms, c.IpcMounts()...)
-		}
-
-		tmpfsMounts, err := c.TmpfsMounts()
-		if err != nil {
-			return err
-		}
-		ms = append(ms, tmpfsMounts...)
-
-		secretMounts, err := c.SecretMounts()
-		if err != nil {
-			return err
-		}
-		ms = append(ms, secretMounts...)
-
 		sort.Sort(mounts(ms))
 
 		mounts := ms
@@ -1093,7 +1054,7 @@ func WithUser(c *container.Container) coci.SpecOpts {
 	}
 }
 
-func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container) (retSpec *specs.Spec, err error) {
+func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (retSpec *specs.Spec, err error) {
 	var (
 		opts []coci.SpecOpts
 		s    = oci.DefaultSpec()
@@ -1108,7 +1069,7 @@ func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c
 		WithNamespaces(daemon, c),
 		WithCapabilities(c),
 		WithSeccomp(daemon, c),
-		withMounts(daemon, daemonCfg, c),
+		withMounts(daemon, daemonCfg, c, mounts),
 		withLibnetwork(daemon, &daemonCfg.Config, c),
 		WithApparmor(c),
 		WithSelinux(c),

+ 9 - 9
daemon/oci_linux_test.go

@@ -92,7 +92,7 @@ func TestTmpfsDevShmNoDupMount(t *testing.T) {
 	}
 	d := setupFakeDaemon(t, c)
 
-	_, err := d.createSpec(context.TODO(), &configStore{}, c)
+	_, err := d.createSpec(context.TODO(), &configStore{}, c, nil)
 	assert.Check(t, err)
 }
 
@@ -110,7 +110,7 @@ func TestIpcPrivateVsReadonly(t *testing.T) {
 	}
 	d := setupFakeDaemon(t, c)
 
-	s, err := d.createSpec(context.TODO(), &configStore{}, c)
+	s, err := d.createSpec(context.TODO(), &configStore{}, c, nil)
 	assert.Check(t, err)
 
 	// Find the /dev/shm mount in ms, check it does not have ro
@@ -139,7 +139,7 @@ func TestSysctlOverride(t *testing.T) {
 	d := setupFakeDaemon(t, c)
 
 	// Ensure that the implicit sysctl is set correctly.
-	s, err := d.createSpec(context.TODO(), &configStore{}, c)
+	s, err := d.createSpec(context.TODO(), &configStore{}, c, nil)
 	assert.NilError(t, err)
 	assert.Equal(t, s.Hostname, "foobar")
 	assert.Equal(t, s.Linux.Sysctl["kernel.domainname"], c.Config.Domainname)
@@ -155,14 +155,14 @@ func TestSysctlOverride(t *testing.T) {
 	assert.Assert(t, c.HostConfig.Sysctls["kernel.domainname"] != c.Config.Domainname)
 	c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"] = "1024"
 
-	s, err = d.createSpec(context.TODO(), &configStore{}, c)
+	s, err = d.createSpec(context.TODO(), &configStore{}, c, nil)
 	assert.NilError(t, err)
 	assert.Equal(t, s.Hostname, "foobar")
 	assert.Equal(t, s.Linux.Sysctl["kernel.domainname"], c.HostConfig.Sysctls["kernel.domainname"])
 	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"])
 
 	// Ensure the ping_group_range is not set on a daemon with user-namespaces enabled
-	s, err = d.createSpec(context.TODO(), &configStore{Config: config.Config{RemappedRoot: "dummy:dummy"}}, c)
+	s, err = d.createSpec(context.TODO(), &configStore{Config: config.Config{RemappedRoot: "dummy:dummy"}}, c, nil)
 	assert.NilError(t, err)
 	_, ok := s.Linux.Sysctl["net.ipv4.ping_group_range"]
 	assert.Assert(t, !ok)
@@ -170,7 +170,7 @@ func TestSysctlOverride(t *testing.T) {
 	// Ensure the ping_group_range is set on a container in "host" userns mode
 	// on a daemon with user-namespaces enabled
 	c.HostConfig.UsernsMode = "host"
-	s, err = d.createSpec(context.TODO(), &configStore{Config: config.Config{RemappedRoot: "dummy:dummy"}}, c)
+	s, err = d.createSpec(context.TODO(), &configStore{Config: config.Config{RemappedRoot: "dummy:dummy"}}, c, nil)
 	assert.NilError(t, err)
 	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
 }
@@ -189,7 +189,7 @@ func TestSysctlOverrideHost(t *testing.T) {
 	d := setupFakeDaemon(t, c)
 
 	// Ensure that the implicit sysctl is not set
-	s, err := d.createSpec(context.TODO(), &configStore{}, c)
+	s, err := d.createSpec(context.TODO(), &configStore{}, c, nil)
 	assert.NilError(t, err)
 	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "")
 	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ping_group_range"], "")
@@ -197,7 +197,7 @@ func TestSysctlOverrideHost(t *testing.T) {
 	// Set an explicit sysctl.
 	c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"] = "1024"
 
-	s, err = d.createSpec(context.TODO(), &configStore{}, c)
+	s, err = d.createSpec(context.TODO(), &configStore{}, c, nil)
 	assert.NilError(t, err)
 	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"])
 }
@@ -225,7 +225,7 @@ func TestDefaultResources(t *testing.T) {
 	}
 	d := setupFakeDaemon(t, c)
 
-	s, err := d.createSpec(context.Background(), &configStore{}, c)
+	s, err := d.createSpec(context.Background(), &configStore{}, c, nil)
 	assert.NilError(t, err)
 	checkResourcesAreUnset(t, s.Linux.Resources)
 }

+ 42 - 41
daemon/oci_windows.go

@@ -30,30 +30,11 @@ const (
 	credentialSpecFileLocation     = "CredentialSpecs"
 )
 
-func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container) (*specs.Spec, error) {
-	img, err := daemon.imageService.GetImage(ctx, string(c.ImageID), imagetypes.GetImageOpts{})
-	if err != nil {
-		return nil, err
-	}
-	if err := image.CheckOS(img.OperatingSystem()); err != nil {
-		return nil, err
-	}
-
-	s := oci.DefaultSpec()
-
-	if err := coci.WithAnnotations(c.HostConfig.Annotations)(ctx, nil, nil, &s); err != nil {
-		return nil, err
-	}
-
-	linkedEnv, err := daemon.setupLinkedContainers(c)
-	if err != nil {
-		return nil, err
-	}
-
+// setupContainerDirs sets up base container directories (root, ipc, tmpfs and secrets).
+func (daemon *Daemon) setupContainerDirs(c *container.Container) ([]container.Mount, error) {
 	// Note, unlike Unix, we do NOT call into SetupWorkingDirectory as
 	// this is done in VMCompute. Further, we couldn't do it for Hyper-V
 	// containers anyway.
-
 	if err := daemon.setupSecretDir(c); err != nil {
 		return nil, err
 	}
@@ -62,25 +43,6 @@ func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c
 		return nil, err
 	}
 
-	// In s.Mounts
-	mounts, err := daemon.setupMounts(c)
-	if err != nil {
-		return nil, err
-	}
-
-	var isHyperV bool
-	if c.HostConfig.Isolation.IsDefault() {
-		// Container using default isolation, so take the default from the daemon configuration
-		isHyperV = daemon.defaultIsolation.IsHyperV()
-	} else {
-		// Container may be requesting an explicit isolation mode.
-		isHyperV = c.HostConfig.Isolation.IsHyperV()
-	}
-
-	if isHyperV {
-		s.Windows.HyperV = &specs.WindowsHyperV{}
-	}
-
 	// If the container has not been started, and has configs or secrets
 	// secrets, create symlinks to each config and secret. If it has been
 	// started before, the symlinks should have already been created. Also, it
@@ -90,7 +52,7 @@ func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c
 	if !c.HasBeenStartedBefore && (len(c.SecretReferences) > 0 || len(c.ConfigReferences) > 0) {
 		// The container file system is mounted before this function is called,
 		// except for Hyper-V containers, so mount it here in that case.
-		if isHyperV {
+		if daemon.isHyperV(c) {
 			if err := daemon.Mount(c); err != nil {
 				return nil, err
 			}
@@ -108,6 +70,8 @@ func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c
 	if err != nil {
 		return nil, err
 	}
+
+	var mounts []container.Mount
 	if secretMounts != nil {
 		mounts = append(mounts, secretMounts...)
 	}
@@ -116,6 +80,33 @@ func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c
 		mounts = append(mounts, configMounts...)
 	}
 
+	return mounts, nil
+}
+
+func (daemon *Daemon) isHyperV(c *container.Container) bool {
+	if c.HostConfig.Isolation.IsDefault() {
+		// Container using default isolation, so take the default from the daemon configuration
+		return daemon.defaultIsolation.IsHyperV()
+	}
+	// Container may be requesting an explicit isolation mode.
+	return c.HostConfig.Isolation.IsHyperV()
+}
+
+func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (*specs.Spec, error) {
+	img, err := daemon.imageService.GetImage(ctx, string(c.ImageID), imagetypes.GetImageOpts{})
+	if err != nil {
+		return nil, err
+	}
+	if err := image.CheckOS(img.OperatingSystem()); err != nil {
+		return nil, err
+	}
+
+	s := oci.DefaultSpec()
+
+	if err := coci.WithAnnotations(c.HostConfig.Annotations)(ctx, nil, nil, &s); err != nil {
+		return nil, err
+	}
+
 	for _, mount := range mounts {
 		m := specs.Mount{
 			Source:      mount.Source,
@@ -127,6 +118,16 @@ func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c
 		s.Mounts = append(s.Mounts, m)
 	}
 
+	linkedEnv, err := daemon.setupLinkedContainers(c)
+	if err != nil {
+		return nil, err
+	}
+
+	isHyperV := daemon.isHyperV(c)
+	if isHyperV {
+		s.Windows.HyperV = &specs.WindowsHyperV{}
+	}
+
 	// In s.Process
 	s.Process.Cwd = c.Config.WorkingDir
 	s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)

+ 21 - 9
daemon/start.go

@@ -139,7 +139,7 @@ func (daemon *Daemon) containerStart(ctx context.Context, daemonCfg *configStore
 			}
 			container.Reset(false)
 
-			daemon.Cleanup(container)
+			daemon.Cleanup(compatcontext.WithoutCancel(ctx), container)
 			// if containers AutoRemove flag is set, remove it after clean up
 			if container.HostConfig.AutoRemove {
 				container.Unlock()
@@ -159,7 +159,19 @@ func (daemon *Daemon) containerStart(ctx context.Context, daemonCfg *configStore
 		return err
 	}
 
-	spec, err := daemon.createSpec(ctx, daemonCfg, container)
+	mnts, err := daemon.setupContainerDirs(container)
+	if err != nil {
+		return err
+	}
+
+	m, cleanup, err := daemon.setupMounts(ctx, container)
+	if err != nil {
+		return err
+	}
+	mnts = append(mnts, m...)
+	defer cleanup(compatcontext.WithoutCancel(ctx))
+
+	spec, err := daemon.createSpec(ctx, daemonCfg, container, mnts)
 	if err != nil {
 		// Any error that occurs while creating the spec, even if it's the
 		// result of an invalid container config, must be considered a System
@@ -248,19 +260,19 @@ func (daemon *Daemon) containerStart(ctx context.Context, daemonCfg *configStore
 
 // Cleanup releases any network resources allocated to the container along with any rules
 // around how containers are linked together.  It also unmounts the container's root filesystem.
-func (daemon *Daemon) Cleanup(container *container.Container) {
+func (daemon *Daemon) Cleanup(ctx context.Context, container *container.Container) {
 	// Microsoft HCS containers get in a bad state if host resources are
 	// released while the container still exists.
 	if ctr, ok := container.C8dContainer(); ok {
 		if err := ctr.Delete(context.Background()); err != nil {
-			log.G(context.TODO()).Errorf("%s cleanup: failed to delete container from containerd: %v", container.ID, err)
+			log.G(ctx).Errorf("%s cleanup: failed to delete container from containerd: %v", container.ID, err)
 		}
 	}
 
 	daemon.releaseNetwork(container)
 
 	if err := container.UnmountIpcMount(); err != nil {
-		log.G(context.TODO()).Warnf("%s cleanup: failed to unmount IPC: %s", container.ID, err)
+		log.G(ctx).Warnf("%s cleanup: failed to unmount IPC: %s", container.ID, err)
 	}
 
 	if err := daemon.conditionalUnmountOnCleanup(container); err != nil {
@@ -272,11 +284,11 @@ func (daemon *Daemon) Cleanup(container *container.Container) {
 	}
 
 	if err := container.UnmountSecrets(); err != nil {
-		log.G(context.TODO()).Warnf("%s cleanup: failed to unmount secrets: %s", container.ID, err)
+		log.G(ctx).Warnf("%s cleanup: failed to unmount secrets: %s", container.ID, err)
 	}
 
 	if err := recursiveUnmount(container.Root); err != nil {
-		log.G(context.TODO()).WithError(err).WithField("container", container.ID).Warn("Error while cleaning up container resource mounts.")
+		log.G(ctx).WithError(err).WithField("container", container.ID).Warn("Error while cleaning up container resource mounts.")
 	}
 
 	for _, eConfig := range container.ExecCommands.Commands() {
@@ -284,8 +296,8 @@ func (daemon *Daemon) Cleanup(container *container.Container) {
 	}
 
 	if container.BaseFS != "" {
-		if err := container.UnmountVolumes(daemon.LogVolumeEvent); err != nil {
-			log.G(context.TODO()).Warnf("%s cleanup: Failed to umount volumes: %v", container.ID, err)
+		if err := container.UnmountVolumes(ctx, daemon.LogVolumeEvent); err != nil {
+			log.G(ctx).Warnf("%s cleanup: Failed to umount volumes: %v", container.ID, err)
 		}
 	}
 

+ 27 - 10
daemon/volumes_unix.go

@@ -3,15 +3,19 @@
 package daemon // import "github.com/docker/docker/daemon"
 
 import (
+	"context"
 	"fmt"
 	"os"
 	"sort"
 	"strconv"
 	"strings"
 
+	"github.com/containerd/log"
 	"github.com/docker/docker/api/types/events"
 	mounttypes "github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/container"
+	"github.com/docker/docker/internal/cleanups"
+	"github.com/docker/docker/internal/compatcontext"
 	volumemounts "github.com/docker/docker/volume/mounts"
 	"github.com/pkg/errors"
 )
@@ -19,23 +23,34 @@ import (
 // setupMounts iterates through each of the mount points for a container and
 // calls Setup() on each. It also looks to see if is a network mount such as
 // /etc/resolv.conf, and if it is not, appends it to the array of mounts.
-func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, error) {
+//
+// The cleanup function should be called as soon as the container has been
+// started.
+func (daemon *Daemon) setupMounts(ctx context.Context, c *container.Container) ([]container.Mount, func(context.Context) error, error) {
 	var mounts []container.Mount
 	// TODO: tmpfs mounts should be part of Mountpoints
 	tmpfsMounts := make(map[string]bool)
 	tmpfsMountInfo, err := c.TmpfsMounts()
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	for _, m := range tmpfsMountInfo {
 		tmpfsMounts[m.Destination] = true
 	}
+
+	cleanups := cleanups.Composite{}
+	defer func() {
+		if err := cleanups.Call(compatcontext.WithoutCancel(ctx)); err != nil {
+			log.G(ctx).WithError(err).Warn("failed to cleanup temporary mounts created by MountPoint.Setup")
+		}
+	}()
+
 	for _, m := range c.MountPoints {
 		if tmpfsMounts[m.Destination] {
 			continue
 		}
 		if err := daemon.lazyInitializeVolume(c.ID, m); err != nil {
-			return nil, err
+			return nil, nil, err
 		}
 		// If the daemon is being shutdown, we should not let a container start if it is trying to
 		// mount the socket the daemon is listening on. During daemon shutdown, the socket
@@ -48,10 +63,12 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 			return nil
 		}
 
-		path, err := m.Setup(c.MountLabel, daemon.idMapping.RootPair(), checkfunc)
+		path, clean, err := m.Setup(ctx, c.MountLabel, daemon.idMapping.RootPair(), checkfunc)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
+		cleanups.Add(clean)
+
 		if !c.TrySetNetworkMount(m.Destination, path) {
 			mnt := container.Mount{
 				Source:      path,
@@ -61,13 +78,13 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 			}
 			if m.Spec.Type == mounttypes.TypeBind && m.Spec.BindOptions != nil {
 				if !m.Spec.ReadOnly && m.Spec.BindOptions.ReadOnlyNonRecursive {
-					return nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyNonRecursive")
+					return nil, nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyNonRecursive")
 				}
 				if !m.Spec.ReadOnly && m.Spec.BindOptions.ReadOnlyForceRecursive {
-					return nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyForceRecursive")
+					return nil, nil, errors.New("mount options conflict: !ReadOnly && BindOptions.ReadOnlyForceRecursive")
 				}
 				if m.Spec.BindOptions.ReadOnlyNonRecursive && m.Spec.BindOptions.ReadOnlyForceRecursive {
-					return nil, errors.New("mount options conflict: ReadOnlyNonRecursive && BindOptions.ReadOnlyForceRecursive")
+					return nil, nil, errors.New("mount options conflict: ReadOnlyNonRecursive && BindOptions.ReadOnlyForceRecursive")
 				}
 				mnt.NonRecursive = m.Spec.BindOptions.NonRecursive
 				mnt.ReadOnlyNonRecursive = m.Spec.BindOptions.ReadOnlyNonRecursive
@@ -98,11 +115,11 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 		// up to the user to make sure the file has proper ownership for userns
 		if strings.Index(mnt.Source, daemon.repository) == 0 {
 			if err := os.Chown(mnt.Source, rootIDs.UID, rootIDs.GID); err != nil {
-				return nil, err
+				return nil, nil, err
 			}
 		}
 	}
-	return append(mounts, netMounts...), nil
+	return append(mounts, netMounts...), cleanups.Release(), nil
 }
 
 // sortMounts sorts an array of mounts in lexicographic order. This ensure that

+ 20 - 6
daemon/volumes_windows.go

@@ -1,10 +1,14 @@
 package daemon // import "github.com/docker/docker/daemon"
 
 import (
+	"context"
 	"sort"
 
+	"github.com/containerd/log"
 	"github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/container"
+	"github.com/docker/docker/internal/cleanups"
+	"github.com/docker/docker/internal/compatcontext"
 	"github.com/docker/docker/pkg/idtools"
 	volumemounts "github.com/docker/docker/volume/mounts"
 )
@@ -13,21 +17,31 @@ import (
 // of the configured mounts on the container to the OCI mount structure
 // which will ultimately be passed into the oci runtime during container creation.
 // It also ensures each of the mounts are lexicographically sorted.
-
+//
+// The cleanup function should be called as soon as the container has been
+// started.
+//
 // BUGBUG TODO Windows containerd. This would be much better if it returned
 // an array of runtime spec mounts, not container mounts. Then no need to
 // do multiple transitions.
+func (daemon *Daemon) setupMounts(ctx context.Context, c *container.Container) ([]container.Mount, func(context.Context) error, error) {
+	cleanups := cleanups.Composite{}
+	defer func() {
+		if err := cleanups.Call(compatcontext.WithoutCancel(ctx)); err != nil {
+			log.G(ctx).WithError(err).Warn("failed to cleanup temporary mounts created by MountPoint.Setup")
+		}
+	}()
 
-func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, error) {
 	var mnts []container.Mount
 	for _, mount := range c.MountPoints { // type is volumemounts.MountPoint
 		if err := daemon.lazyInitializeVolume(c.ID, mount); err != nil {
-			return nil, err
+			return nil, nil, err
 		}
-		s, err := mount.Setup(c.MountLabel, idtools.Identity{}, nil)
+		s, c, err := mount.Setup(ctx, c.MountLabel, idtools.Identity{}, nil)
 		if err != nil {
-			return nil, err
+			return nil, nil, err
 		}
+		cleanups.Add(c)
 
 		mnts = append(mnts, container.Mount{
 			Source:      s,
@@ -37,7 +51,7 @@ func (daemon *Daemon) setupMounts(c *container.Container) ([]container.Mount, er
 	}
 
 	sort.Sort(mounts(mnts))
-	return mnts, nil
+	return mnts, cleanups.Release(), nil
 }
 
 // setBindModeIfNull is platform specific processing which is a no-op on

+ 3 - 0
docs/api/version-history.md

@@ -17,6 +17,9 @@ keywords: "API, Docker, rcli, REST, documentation"
 
 [Docker Engine API v1.45](https://docs.docker.com/engine/api/v1.45/) documentation
 
+* `POST /containers/create` now supports `VolumeOptions.Subpath` which allows a
+  subpath of a named volume to be mounted.
+
 ## v1.44 API changes
 
 [Docker Engine API v1.44](https://docs.docker.com/engine/api/v1.44/) documentation

+ 25 - 0
integration/internal/container/container.go

@@ -170,3 +170,28 @@ func Inspect(ctx context.Context, t *testing.T, apiClient client.APIClient, cont
 
 	return c
 }
+
+type ContainerOutput struct {
+	Stdout, Stderr string
+}
+
+// Output waits for the container to end running and returns its output.
+func Output(ctx context.Context, client client.APIClient, id string) (ContainerOutput, error) {
+	logs, err := client.ContainerLogs(ctx, id, container.LogsOptions{Follow: true, ShowStdout: true, ShowStderr: true})
+	if err != nil {
+		return ContainerOutput{}, err
+	}
+
+	defer logs.Close()
+
+	var stdoutBuf, stderrBuf bytes.Buffer
+	_, err = stdcopy.StdCopy(&stdoutBuf, &stderrBuf, logs)
+	if err != nil {
+		return ContainerOutput{}, err
+	}
+
+	return ContainerOutput{
+		Stdout: stdoutBuf.String(),
+		Stderr: stderrBuf.String(),
+	}, nil
+}

+ 185 - 0
integration/volume/mount_test.go

@@ -0,0 +1,185 @@
+package volume
+
+import (
+	"context"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	containertypes "github.com/docker/docker/api/types/container"
+	"github.com/docker/docker/api/types/mount"
+	"github.com/docker/docker/api/types/network"
+	"github.com/docker/docker/api/types/versions"
+	"github.com/docker/docker/api/types/volume"
+	"github.com/docker/docker/client"
+	"github.com/docker/docker/integration/internal/container"
+	"github.com/docker/docker/internal/safepath"
+	"gotest.tools/v3/assert"
+	is "gotest.tools/v3/assert/cmp"
+	"gotest.tools/v3/skip"
+)
+
+func TestRunMountVolumeSubdir(t *testing.T) {
+	skip.If(t, versions.LessThan(testEnv.DaemonAPIVersion(), "1.45"), "skip test from new feature")
+
+	ctx := setupTest(t)
+	apiClient := testEnv.APIClient()
+
+	testVolumeName := setupTestVolume(t, apiClient)
+
+	for _, tc := range []struct {
+		name         string
+		opts         mount.VolumeOptions
+		cmd          []string
+		volumeTarget string
+		createErr    string
+		startErr     string
+		expected     string
+		skipPlatform string
+	}{
+		{name: "subdir", opts: mount.VolumeOptions{Subpath: "subdir"}, cmd: []string{"ls", "/volume"}, expected: "hello.txt"},
+		{name: "subdir link", opts: mount.VolumeOptions{Subpath: "hack/good"}, cmd: []string{"ls", "/volume"}, expected: "hello.txt"},
+		{name: "subdir with copy data", opts: mount.VolumeOptions{Subpath: "bin"}, volumeTarget: "/bin", cmd: []string{"ls", "/bin/busybox"}, expected: "/bin/busybox", skipPlatform: "windows:copy not supported on Windows"},
+		{name: "file", opts: mount.VolumeOptions{Subpath: "bar.txt"}, cmd: []string{"cat", "/volume"}, expected: "foo", skipPlatform: "windows:file bind mounts not supported on Windows"},
+		{name: "relative with backtracks", opts: mount.VolumeOptions{Subpath: "../../../../../../etc/passwd"}, cmd: []string{"cat", "/volume"}, createErr: "subpath must be a relative path within the volume"},
+		{name: "not existing", opts: mount.VolumeOptions{Subpath: "not-existing-path"}, cmd: []string{"cat", "/volume"}, startErr: (&safepath.ErrNotAccessible{}).Error()},
+
+		{name: "mount link", opts: mount.VolumeOptions{Subpath: filepath.Join("hack", "root")}, cmd: []string{"ls", "/volume"}, startErr: (&safepath.ErrEscapesBase{}).Error()},
+		{name: "mount link link", opts: mount.VolumeOptions{Subpath: filepath.Join("hack", "bad")}, cmd: []string{"ls", "/volume"}, startErr: (&safepath.ErrEscapesBase{}).Error()},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			if tc.skipPlatform != "" {
+				platform, reason, _ := strings.Cut(tc.skipPlatform, ":")
+				if testEnv.DaemonInfo.OSType == platform {
+					t.Skip(reason)
+				}
+			}
+
+			cfg := containertypes.Config{
+				Image: "busybox",
+				Cmd:   tc.cmd,
+			}
+			hostCfg := containertypes.HostConfig{
+				Mounts: []mount.Mount{
+					{
+						Type:          mount.TypeVolume,
+						Source:        testVolumeName,
+						Target:        "/volume",
+						VolumeOptions: &tc.opts,
+					},
+				},
+			}
+			if testEnv.DaemonInfo.OSType == "windows" {
+				hostCfg.Mounts[0].Target = `C:\volume`
+			}
+			if tc.volumeTarget != "" {
+				hostCfg.Mounts[0].Target = tc.volumeTarget
+			}
+
+			ctrName := strings.ReplaceAll(t.Name(), "/", "_")
+			create, creatErr := apiClient.ContainerCreate(ctx, &cfg, &hostCfg, &network.NetworkingConfig{}, nil, ctrName)
+			id := create.ID
+			if id != "" {
+				defer apiClient.ContainerRemove(ctx, id, containertypes.RemoveOptions{Force: true})
+			}
+
+			if tc.createErr != "" {
+				assert.ErrorContains(t, creatErr, tc.createErr)
+				return
+			}
+			assert.NilError(t, creatErr, "container creation failed")
+
+			startErr := apiClient.ContainerStart(ctx, id, containertypes.StartOptions{})
+			if tc.startErr != "" {
+				assert.ErrorContains(t, startErr, tc.startErr)
+				return
+			}
+			assert.NilError(t, startErr)
+
+			output, err := container.Output(ctx, apiClient, id)
+			assert.Check(t, err)
+			t.Logf("stdout:\n%s", output.Stdout)
+			t.Logf("stderr:\n%s", output.Stderr)
+
+			inspect, err := apiClient.ContainerInspect(ctx, id)
+			if assert.Check(t, err) {
+				assert.Check(t, is.Equal(inspect.State.ExitCode, 0))
+			}
+
+			assert.Check(t, is.Equal(strings.TrimSpace(output.Stderr), ""))
+			assert.Check(t, is.Equal(strings.TrimSpace(output.Stdout), tc.expected))
+		})
+	}
+}
+
+// setupTestVolume sets up a volume with:
+// .
+// |-- bar.txt                        (file with "foo")
+// |-- bin                            (directory)
+// |-- subdir                         (directory)
+// |   |-- hello.txt                  (file with "world")
+// |-- hack                           (directory)
+// |   |-- root                       (symlink to /)
+// |   |-- good                       (symlink to ../subdir)
+// |   |-- bad                        (symlink to root)
+func setupTestVolume(t *testing.T, client client.APIClient) string {
+	t.Helper()
+	ctx := context.Background()
+
+	volumeName := t.Name() + "-volume"
+
+	err := client.VolumeRemove(ctx, volumeName, true)
+	assert.NilError(t, err, "failed to clean volume")
+
+	_, err = client.VolumeCreate(ctx, volume.CreateOptions{
+		Name: volumeName,
+	})
+	assert.NilError(t, err, "failed to setup volume")
+
+	mount := mount.Mount{
+		Type:   mount.TypeVolume,
+		Source: volumeName,
+		Target: "/volume",
+	}
+
+	rootFs := "/"
+	if testEnv.DaemonInfo.OSType == "windows" {
+		mount.Target = `C:\volume`
+		rootFs = `C:`
+	}
+
+	initCmd := "echo foo > /volume/bar.txt && " +
+		"mkdir /volume/bin && " +
+		"mkdir /volume/subdir && " +
+		"echo world > /volume/subdir/hello.txt && " +
+		"mkdir /volume/hack && " +
+		"ln -s " + rootFs + " /volume/hack/root && " +
+		"ln -s ../subdir /volume/hack/good && " +
+		"ln -s root /volume/hack/bad &&" +
+		"mkdir /volume/hack/iwanttobehackedwithtoctou"
+
+	opts := []func(*container.TestContainerConfig){
+		container.WithMount(mount),
+		container.WithCmd("sh", "-c", initCmd+"; ls -lah /volume /volume/hack/"),
+	}
+	if testEnv.DaemonInfo.OSType == "windows" {
+		// Can't create symlinks under HyperV isolation
+		opts = append(opts, container.WithIsolation(containertypes.IsolationProcess))
+	}
+
+	cid := container.Run(ctx, t, client, opts...)
+	defer client.ContainerRemove(ctx, cid, containertypes.RemoveOptions{Force: true})
+	output, err := container.Output(ctx, client, cid)
+
+	t.Logf("Setup stderr:\n%s", output.Stderr)
+	t.Logf("Setup stdout:\n%s", output.Stdout)
+
+	assert.NilError(t, err)
+	assert.Assert(t, is.Equal(output.Stderr, ""))
+
+	inspect, err := client.ContainerInspect(ctx, cid)
+	assert.NilError(t, err)
+	assert.Assert(t, is.Equal(inspect.State.ExitCode, 0))
+
+	return volumeName
+}

+ 44 - 0
internal/cleanups/composite.go

@@ -0,0 +1,44 @@
+package cleanups
+
+import (
+	"context"
+
+	"github.com/docker/docker/internal/multierror"
+)
+
+type Composite struct {
+	cleanups []func(context.Context) error
+}
+
+// Add adds a cleanup to be called.
+func (c *Composite) Add(f func(context.Context) error) {
+	c.cleanups = append(c.cleanups, f)
+}
+
+// Call calls all cleanups in reverse order and returns an error combining all
+// non-nil errors.
+func (c *Composite) Call(ctx context.Context) error {
+	err := call(ctx, c.cleanups)
+	c.cleanups = nil
+	return err
+}
+
+// Release removes all cleanups, turning Call into a no-op.
+// Caller still can call the cleanups by calling the returned function
+// which is equivalent to calling the Call before Release was called.
+func (c *Composite) Release() func(context.Context) error {
+	cleanups := c.cleanups
+	c.cleanups = nil
+	return func(ctx context.Context) error {
+		return call(ctx, cleanups)
+	}
+}
+
+func call(ctx context.Context, cleanups []func(context.Context) error) error {
+	var errs []error
+	for idx := len(cleanups) - 1; idx >= 0; idx-- {
+		c := cleanups[idx]
+		errs = append(errs, c(ctx))
+	}
+	return multierror.Join(errs...)
+}

+ 54 - 0
internal/cleanups/composite_test.go

@@ -0,0 +1,54 @@
+package cleanups
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"testing"
+
+	"gotest.tools/v3/assert"
+	is "gotest.tools/v3/assert/cmp"
+)
+
+func TestCall(t *testing.T) {
+	c := Composite{}
+	var err1 = errors.New("error1")
+	var err2 = errors.New("error2")
+	var errX = errors.New("errorX")
+	var errY = errors.New("errorY")
+	var errZ = errors.New("errorZ")
+	var errYZ = errors.Join(errY, errZ)
+
+	c.Add(func(ctx context.Context) error {
+		return err1
+	})
+	c.Add(func(ctx context.Context) error {
+		return nil
+	})
+	c.Add(func(ctx context.Context) error {
+		return fmt.Errorf("something happened: %w", err2)
+	})
+	c.Add(func(ctx context.Context) error {
+		return errors.Join(errX, fmt.Errorf("joined: %w", errYZ))
+	})
+
+	err := c.Call(context.Background())
+
+	errs := err.(interface{ Unwrap() []error }).Unwrap()
+
+	assert.Check(t, is.ErrorContains(err, err1.Error()))
+	assert.Check(t, is.ErrorContains(err, err2.Error()))
+	assert.Check(t, is.ErrorContains(err, errX.Error()))
+	assert.Check(t, is.ErrorContains(err, errY.Error()))
+	assert.Check(t, is.ErrorContains(err, errZ.Error()))
+	assert.Check(t, is.ErrorContains(err, "something happened: "+err2.Error()))
+
+	t.Logf(err.Error())
+	assert.Assert(t, is.Len(errs, 3))
+
+	// Cleanups executed in reverse order.
+	assert.Check(t, is.ErrorIs(errs[2], err1))
+	assert.Check(t, is.ErrorIs(errs[1], err2))
+	assert.Check(t, is.ErrorIs(errs[0], errX))
+	assert.Check(t, is.ErrorIs(errs[0], errYZ))
+}

+ 66 - 0
internal/safepath/common.go

@@ -0,0 +1,66 @@
+package safepath
+
+import (
+	"os"
+	"path/filepath"
+
+	"github.com/pkg/errors"
+)
+
+// evaluatePath evaluates symlinks in the concatenation of path and subpath. If
+// err is nil, resolvedBasePath will contain result of resolving all symlinks
+// in the given path, and resolvedSubpath will contain a relative path rooted
+// at the resolvedBasePath pointing to the concatenation after resolving all
+// symlinks.
+func evaluatePath(path, subpath string) (resolvedBasePath string, resolvedSubpath string, err error) {
+	baseResolved, err := filepath.EvalSymlinks(path)
+	if err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			return "", "", &ErrNotAccessible{Path: path, Cause: err}
+		}
+		return "", "", errors.Wrapf(err, "error while resolving symlinks in base directory %q", path)
+	}
+
+	combinedPath := filepath.Join(baseResolved, subpath)
+	combinedResolved, err := filepath.EvalSymlinks(combinedPath)
+	if err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			return "", "", &ErrNotAccessible{Path: combinedPath, Cause: err}
+		}
+		return "", "", errors.Wrapf(err, "error while resolving symlinks in combined path %q", combinedPath)
+	}
+
+	subpart, err := filepath.Rel(baseResolved, combinedResolved)
+	if err != nil {
+		return "", "", &ErrEscapesBase{Base: baseResolved, Subpath: subpath}
+	}
+
+	if !filepath.IsLocal(subpart) {
+		return "", "", &ErrEscapesBase{Base: baseResolved, Subpath: subpath}
+	}
+
+	return baseResolved, subpart, nil
+}
+
+// isLocalTo reports whether path, using lexical analysis only, has all of these properties:
+//   - is within the subtree rooted at basepath
+//   - is not empty
+//   - on Windows, is not a reserved name such as "NUL"
+//
+// If isLocalTo(path, basepath) returns true, then
+//
+//	filepath.Rel(basepath, path)
+//
+// will always produce an unrooted path with no `..` elements.
+//
+// isLocalTo is a purely lexical operation. In particular, it does not account for the effect of any symbolic links that may exist in the filesystem.
+//
+// Both path and basepath are expected to be absolute paths.
+func isLocalTo(path, basepath string) bool {
+	rel, err := filepath.Rel(basepath, path)
+	if err != nil {
+		return false
+	}
+
+	return filepath.IsLocal(rel)
+}

+ 31 - 0
internal/safepath/common_test.go

@@ -0,0 +1,31 @@
+package safepath
+
+import (
+	"testing"
+
+	"gotest.tools/v3/assert"
+	is "gotest.tools/v3/assert/cmp"
+)
+
+func TestIsLocalTo(t *testing.T) {
+	for _, tc := range []struct {
+		name    string
+		subpath string
+		result  bool
+	}{
+		{name: "same", subpath: "/volume", result: true},
+		{name: "1 level subpath", subpath: "/volume/sub", result: true},
+		{name: "2 level subpath", subpath: "/volume/sub/path", result: true},
+		{name: "absolute", subpath: "/etc/passwd", result: false},
+		{name: "backtrack", subpath: "/volume/../", result: false},
+		{name: "backtrack inside", subpath: "/volume/sub/../", result: true},
+		{name: "relative path", subpath: "./rel", result: false},
+		{name: "file with dots", subpath: "/volume/file..with.dots", result: true},
+		{name: "file starting with dots", subpath: "/volume/..file", result: true},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			result := isLocalTo(tc.subpath, "/volume")
+			assert.Check(t, is.Equal(result, tc.result))
+		})
+	}
+}

+ 42 - 0
internal/safepath/errors.go

@@ -0,0 +1,42 @@
+package safepath
+
+// ErrNotAccessible is returned by Join when the resulting path doesn't exist,
+// is not accessible, or any of the path components was replaced with a symlink
+// during the path traversal.
+type ErrNotAccessible struct {
+	Path  string
+	Cause error
+}
+
+func (*ErrNotAccessible) NotFound() {}
+
+func (e *ErrNotAccessible) Unwrap() error {
+	return e.Cause
+}
+
+func (e *ErrNotAccessible) Error() string {
+	msg := "cannot access path " + e.Path
+	if e.Cause != nil {
+		msg += ": " + e.Cause.Error()
+	}
+	return msg
+}
+
+// ErrEscapesBase is returned by Join when the resulting concatenation would
+// point outside of the specified base directory.
+type ErrEscapesBase struct {
+	Base, Subpath string
+}
+
+func (*ErrEscapesBase) InvalidParameter() {}
+
+func (e *ErrEscapesBase) Error() string {
+	msg := "path concatenation escapes the base directory"
+	if e.Base != "" {
+		msg += ", base: " + e.Base
+	}
+	if e.Subpath != "" {
+		msg += ", subpath: " + e.Subpath
+	}
+	return msg
+}

+ 150 - 0
internal/safepath/join_linux.go

@@ -0,0 +1,150 @@
+package safepath
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strconv"
+
+	"github.com/containerd/log"
+	"github.com/docker/docker/internal/unix_noeintr"
+	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
+)
+
+// Join makes sure that the concatenation of path and subpath doesn't
+// resolve to a path outside of path and returns a path to a temporary file that is
+// a bind mount to the the exact same file/directory that was validated.
+//
+// After use, it is the caller's responsibility to call Close on the returned
+// SafePath object, which will unmount the temporary file/directory
+// and remove it.
+func Join(_ context.Context, path, subpath string) (*SafePath, error) {
+	base, subpart, err := evaluatePath(path, subpath)
+	if err != nil {
+		return nil, err
+	}
+
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+	fd, err := safeOpenFd(base, subpart)
+	if err != nil {
+		return nil, err
+	}
+
+	defer unix_noeintr.Close(fd)
+
+	tmpMount, err := tempMountPoint(fd)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to create temporary file for safe mount")
+	}
+
+	pid := strconv.Itoa(unix.Gettid())
+	// Using explicit pid path, because /proc/self/fd/<fd> fails with EACCES
+	// when running under "Enhanced Container Isolation" in Docker Desktop
+	// which uses sysbox runtime under the hood.
+	// TODO(vvoland): Investigate.
+	mountSource := "/proc/" + pid + "/fd/" + strconv.Itoa(fd)
+
+	if err := unix_noeintr.Mount(mountSource, tmpMount, "none", unix.MS_BIND, ""); err != nil {
+		os.Remove(tmpMount)
+		return nil, errors.Wrap(err, "failed to mount resolved path")
+	}
+
+	return &SafePath{
+		path:          tmpMount,
+		sourceBase:    base,
+		sourceSubpath: subpart,
+		cleanup:       cleanupSafePath(tmpMount),
+	}, nil
+}
+
+// safeOpenFd opens the file at filepath.Join(path, subpath) in O_PATH
+// mode and returns the file descriptor if subpath is within the subtree
+// rooted at path. It is an error if any of components of path or subpath
+// are symbolic links.
+//
+// It is a caller's responsibility to close the returned file descriptor, if no
+// error was returned.
+func safeOpenFd(path, subpath string) (int, error) {
+	// Open base volume path (_data directory).
+	prevFd, err := unix_noeintr.Open(path, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC|unix.O_NOFOLLOW, 0)
+	if err != nil {
+		return -1, &ErrNotAccessible{Path: path, Cause: err}
+	}
+	defer unix_noeintr.Close(prevFd)
+
+	// Try to use the Openat2 syscall first (available on Linux 5.6+).
+	fd, err := unix_noeintr.Openat2(prevFd, subpath, &unix.OpenHow{
+		Flags:   unix.O_PATH | unix.O_CLOEXEC,
+		Mode:    0,
+		Resolve: unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS | unix.RESOLVE_NO_SYMLINKS,
+	})
+
+	switch {
+	case errors.Is(err, unix.ENOSYS):
+		// Openat2 is not available, fallback to Openat loop.
+		return kubernetesSafeOpen(path, subpath)
+	case errors.Is(err, unix.EXDEV):
+		return -1, &ErrEscapesBase{Base: path, Subpath: subpath}
+	case errors.Is(err, unix.ENOENT), errors.Is(err, unix.ELOOP):
+		return -1, &ErrNotAccessible{Path: filepath.Join(path, subpath), Cause: err}
+	case err != nil:
+		return -1, &os.PathError{Op: "openat2", Path: subpath, Err: err}
+	}
+
+	// Openat2 is available and succeeded.
+	return fd, nil
+}
+
+// tempMountPoint creates a temporary file/directory to act as mount
+// point for the file descriptor.
+func tempMountPoint(sourceFd int) (string, error) {
+	var stat unix.Stat_t
+	err := unix_noeintr.Fstat(sourceFd, &stat)
+	if err != nil {
+		return "", errors.Wrap(err, "failed to Fstat mount source fd")
+	}
+
+	isDir := (stat.Mode & unix.S_IFMT) == unix.S_IFDIR
+	if isDir {
+		return os.MkdirTemp("", "safe-mount")
+	}
+
+	f, err := os.CreateTemp("", "safe-mount")
+	if err != nil {
+		return "", err
+	}
+
+	p := f.Name()
+	if err := f.Close(); err != nil {
+		return "", err
+	}
+	return p, nil
+}
+
+// cleanupSafePaths returns a function that unmounts the path and removes the
+// mountpoint.
+func cleanupSafePath(path string) func(context.Context) error {
+	return func(ctx context.Context) error {
+		log.G(ctx).WithField("path", path).Debug("removing safe temp mount")
+
+		if err := unix_noeintr.Unmount(path, unix.MNT_DETACH); err != nil {
+			if errors.Is(err, unix.EINVAL) {
+				log.G(ctx).WithField("path", path).Warn("safe temp mount no longer exists?")
+				return nil
+			}
+			return errors.Wrapf(err, "error unmounting safe mount %s", path)
+		}
+		if err := os.Remove(path); err != nil {
+			if errors.Is(err, os.ErrNotExist) {
+				log.G(ctx).WithField("path", path).Warn("safe temp mount no longer exists?")
+				return nil
+			}
+			return errors.Wrapf(err, "failed to delete temporary safe mount")
+		}
+
+		return nil
+	}
+}

+ 145 - 0
internal/safepath/join_test.go

@@ -0,0 +1,145 @@
+package safepath
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"testing"
+
+	"gotest.tools/v3/assert"
+	is "gotest.tools/v3/assert/cmp"
+)
+
+func TestJoinEscapingSymlink(t *testing.T) {
+	type testCase struct {
+		name   string
+		target string
+	}
+	var cases []testCase
+
+	if runtime.GOOS == "windows" {
+		cases = []testCase{
+			{name: "root", target: `C:\`},
+			{name: "absolute file", target: `C:\Windows\System32\cmd.exe`},
+		}
+	} else {
+		cases = []testCase{
+			{name: "root", target: "/"},
+			{name: "absolute file", target: "/etc/passwd"},
+		}
+	}
+	cases = append(cases, testCase{name: "relative", target: "../../"})
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			tempDir := t.TempDir()
+			dir, err := filepath.EvalSymlinks(tempDir)
+			assert.NilError(t, err, "filepath.EvalSymlinks failed for temporary directory %s", tempDir)
+
+			err = os.Symlink(tc.target, filepath.Join(dir, "link"))
+			assert.NilError(t, err, "failed to create symlink to %s", tc.target)
+
+			safe, err := Join(context.Background(), dir, "link")
+			if err == nil {
+				safe.Close(context.Background())
+			}
+			assert.ErrorType(t, err, &ErrEscapesBase{})
+		})
+	}
+}
+
+func TestJoinGoodSymlink(t *testing.T) {
+	tempDir := t.TempDir()
+	dir, err := filepath.EvalSymlinks(tempDir)
+	assert.NilError(t, err, "filepath.EvalSymlinks failed for temporary directory %s", tempDir)
+
+	assert.Assert(t, os.WriteFile(filepath.Join(dir, "foo"), []byte("bar"), 0o744), "failed to create file 'foo'")
+	assert.Assert(t, os.Mkdir(filepath.Join(dir, "subdir"), 0o744), "failed to create directory 'subdir'")
+	assert.Assert(t, os.WriteFile(filepath.Join(dir, "subdir/hello.txt"), []byte("world"), 0o744), "failed to create file 'subdir/hello.txt'")
+
+	assert.Assert(t, os.Symlink(filepath.Join(dir, "subdir"), filepath.Join(dir, "subdir_link_absolute")), "failed to create absolute symlink to directory 'subdir'")
+	assert.Assert(t, os.Symlink("subdir", filepath.Join(dir, "subdir_link_relative")), "failed to create relative symlink to directory 'subdir'")
+
+	assert.Assert(t, os.Symlink(filepath.Join(dir, "foo"), filepath.Join(dir, "foo_link_absolute")), "failed to create absolute symlink to file 'foo'")
+	assert.Assert(t, os.Symlink("foo", filepath.Join(dir, "foo_link_relative")), "failed to create relative symlink to file 'foo'")
+
+	for _, target := range []string{
+		"foo", "subdir",
+		"subdir_link_absolute", "foo_link_absolute",
+		"subdir_link_relative", "foo_link_relative",
+	} {
+		t.Run(target, func(t *testing.T) {
+			safe, err := Join(context.Background(), dir, target)
+			assert.NilError(t, err)
+
+			defer safe.Close(context.Background())
+			if strings.HasPrefix(target, "subdir") {
+				data, err := os.ReadFile(filepath.Join(safe.Path(), "hello.txt"))
+				assert.NilError(t, err)
+				assert.Assert(t, is.Equal(string(data), "world"))
+			}
+		})
+	}
+}
+
+func TestJoinWithSymlinkReplace(t *testing.T) {
+	tempDir := t.TempDir()
+	dir, err := filepath.EvalSymlinks(tempDir)
+	assert.NilError(t, err, "filepath.EvalSymlinks failed for temporary directory %s", tempDir)
+
+	link := filepath.Join(dir, "link")
+	target := filepath.Join(dir, "foo")
+
+	err = os.WriteFile(target, []byte("bar"), 0o744)
+	assert.NilError(t, err, "failed to create test file")
+
+	err = os.Symlink(target, link)
+	assert.Check(t, err, "failed to create symlink to foo")
+
+	safe, err := Join(context.Background(), dir, "link")
+	assert.NilError(t, err)
+
+	defer safe.Close(context.Background())
+
+	// Delete the link target.
+	err = os.Remove(target)
+	if runtime.GOOS == "windows" {
+		// On Windows it shouldn't be possible.
+		assert.Assert(t, is.ErrorType(err, &os.PathError{}), "link shouldn't be deletable before cleanup")
+	} else {
+		// On Linux we can delete it just fine.
+		assert.NilError(t, err, "failed to remove symlink")
+
+		// Replace target with a symlink to /etc/paswd
+		err = os.Symlink("/etc/passwd", target)
+		assert.NilError(t, err, "failed to create symlink")
+	}
+
+	// The returned safe path should still point to the old file.
+	data, err := os.ReadFile(safe.Path())
+	assert.NilError(t, err, "failed to read file")
+
+	assert.Check(t, is.Equal(string(data), "bar"))
+
+}
+
+func TestJoinCloseInvalidates(t *testing.T) {
+	tempDir := t.TempDir()
+	dir, err := filepath.EvalSymlinks(tempDir)
+	assert.NilError(t, err)
+
+	foo := filepath.Join(dir, "foo")
+	err = os.WriteFile(foo, []byte("bar"), 0o744)
+	assert.NilError(t, err, "failed to create test file")
+
+	safe, err := Join(context.Background(), dir, "foo")
+	assert.NilError(t, err)
+
+	assert.Check(t, safe.IsValid())
+
+	assert.NilError(t, safe.Close(context.Background()))
+
+	assert.Check(t, !safe.IsValid())
+}

+ 93 - 0
internal/safepath/join_windows.go

@@ -0,0 +1,93 @@
+package safepath
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/containerd/log"
+	"github.com/docker/docker/internal/cleanups"
+	"github.com/docker/docker/internal/compatcontext"
+	"github.com/pkg/errors"
+	"golang.org/x/sys/windows"
+)
+
+// Join locks all individual components of the path which is the concatenation
+// of provided path and its subpath, checks that it doesn't escape the base path
+// and returns the concatenated path.
+//
+// The path is safe (the path target won't change) until the returned SafePath
+// is Closed.
+// Caller is responsible for calling the Close function which unlocks the path.
+func Join(ctx context.Context, path, subpath string) (*SafePath, error) {
+	base, subpart, err := evaluatePath(path, subpath)
+	if err != nil {
+		return nil, err
+	}
+	parts := strings.Split(subpart, string(os.PathSeparator))
+
+	cleanups := cleanups.Composite{}
+	defer func() {
+		if cErr := cleanups.Call(compatcontext.WithoutCancel(ctx)); cErr != nil {
+			log.G(ctx).WithError(cErr).Warn("failed to close handles after error")
+		}
+	}()
+
+	fullPath := base
+	for _, part := range parts {
+		fullPath = filepath.Join(fullPath, part)
+
+		handle, err := lockFile(fullPath)
+		if err != nil {
+			if errors.Is(err, windows.ERROR_FILE_NOT_FOUND) {
+				return nil, &ErrNotAccessible{Path: fullPath, Cause: err}
+			}
+			return nil, errors.Wrapf(err, "failed to lock file %s", fullPath)
+		}
+		cleanups.Add(func(context.Context) error {
+			if err := windows.CloseHandle(handle); err != nil {
+				return &os.PathError{Op: "CloseHandle", Path: fullPath, Err: err}
+			}
+			return err
+		})
+
+		realPath, err := filepath.EvalSymlinks(fullPath)
+		if err != nil {
+			return nil, errors.Wrapf(err, "failed to eval symlinks of %s", fullPath)
+		}
+
+		if realPath != fullPath && !isLocalTo(realPath, base) {
+			return nil, &ErrEscapesBase{Base: base, Subpath: subpart}
+		}
+
+		var info windows.ByHandleFileInformation
+		if err := windows.GetFileInformationByHandle(handle, &info); err != nil {
+			return nil, errors.WithStack(&os.PathError{Op: "GetFileInformationByHandle", Path: fullPath, Err: err})
+		}
+
+		if (info.FileAttributes & windows.FILE_ATTRIBUTE_REPARSE_POINT) != 0 {
+			return nil, &ErrNotAccessible{Path: fullPath, Cause: err}
+		}
+	}
+
+	return &SafePath{
+		path:          fullPath,
+		sourceBase:    base,
+		sourceSubpath: subpart,
+		cleanup:       cleanups.Release(),
+	}, nil
+}
+
+func lockFile(path string) (windows.Handle, error) {
+	p, err := windows.UTF16PtrFromString(path)
+	if err != nil {
+		return windows.InvalidHandle, &os.PathError{Op: "UTF16PtrFromString", Path: path, Err: err}
+	}
+	const flags = windows.FILE_FLAG_BACKUP_SEMANTICS | windows.FILE_FLAG_OPEN_REPARSE_POINT
+	handle, err := windows.CreateFile(p, windows.GENERIC_READ, windows.FILE_SHARE_READ, nil, windows.OPEN_EXISTING, flags, 0)
+	if err != nil {
+		return handle, &os.PathError{Op: "CreateFile", Path: path, Err: err}
+	}
+	return handle, nil
+}

+ 112 - 0
internal/safepath/k8s_safeopen_linux.go

@@ -0,0 +1,112 @@
+package safepath
+
+/*
+Copyright 2014 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+import (
+	"context"
+	"fmt"
+	"path/filepath"
+	"strings"
+
+	"github.com/containerd/log"
+	"github.com/docker/docker/internal/unix_noeintr"
+	"golang.org/x/sys/unix"
+)
+
+// kubernetesSafeOpen open path formed by concatenation of the base directory
+// and its subpath and return its fd.
+// Symlinks are disallowed (pathname must already resolve symlinks) and the path
+// path must be within the base directory.
+// This is minimally modified code from https://github.com/kubernetes/kubernetes/blob/55fb1805a1217b91b36fa8fe8f2bf3a28af2454d/pkg/volume/util/subpath/subpath_linux.go#L530
+func kubernetesSafeOpen(base, subpath string) (int, error) {
+	// syscall.Openat flags used to traverse directories not following symlinks
+	const nofollowFlags = unix.O_RDONLY | unix.O_NOFOLLOW
+	// flags for getting file descriptor without following the symlink
+	const openFDFlags = unix.O_NOFOLLOW | unix.O_PATH
+
+	pathname := filepath.Join(base, subpath)
+	segments := strings.Split(subpath, string(filepath.Separator))
+
+	// Assumption: base is the only directory that we have under control.
+	// Base dir is not allowed to be a symlink.
+	parentFD, err := unix_noeintr.Open(base, nofollowFlags|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return -1, &ErrNotAccessible{Path: base, Cause: err}
+	}
+	defer func() {
+		if parentFD != -1 {
+			if err = unix_noeintr.Close(parentFD); err != nil {
+				log.G(context.TODO()).Errorf("Closing FD %v failed for safeopen(%v): %v", parentFD, pathname, err)
+			}
+		}
+	}()
+
+	childFD := -1
+	defer func() {
+		if childFD != -1 {
+			if err = unix_noeintr.Close(childFD); err != nil {
+				log.G(context.TODO()).Errorf("Closing FD %v failed for safeopen(%v): %v", childFD, pathname, err)
+			}
+		}
+	}()
+
+	currentPath := base
+
+	// Follow the segments one by one using openat() to make
+	// sure the user cannot change already existing directories into symlinks.
+	for _, seg := range segments {
+		var deviceStat unix.Stat_t
+
+		currentPath = filepath.Join(currentPath, seg)
+		if !isLocalTo(currentPath, base) {
+			return -1, &ErrEscapesBase{Base: currentPath, Subpath: seg}
+		}
+
+		// Trigger auto mount if it's an auto-mounted directory, ignore error if not a directory.
+		// Notice the trailing slash is mandatory, see "automount" in openat(2) and open_by_handle_at(2).
+		unix_noeintr.Fstatat(parentFD, seg+"/", &deviceStat, unix.AT_SYMLINK_NOFOLLOW)
+
+		log.G(context.TODO()).Debugf("Opening path %s", currentPath)
+		childFD, err = unix_noeintr.Openat(parentFD, seg, openFDFlags|unix.O_CLOEXEC, 0)
+		if err != nil {
+			return -1, &ErrNotAccessible{Path: currentPath, Cause: err}
+		}
+
+		err := unix_noeintr.Fstat(childFD, &deviceStat)
+		if err != nil {
+			return -1, fmt.Errorf("error running fstat on %s with %v", currentPath, err)
+		}
+		fileFmt := deviceStat.Mode & unix.S_IFMT
+		if fileFmt == unix.S_IFLNK {
+			return -1, fmt.Errorf("unexpected symlink found %s", currentPath)
+		}
+
+		// Close parentFD
+		if err = unix_noeintr.Close(parentFD); err != nil {
+			return -1, fmt.Errorf("closing fd for %q failed: %v", filepath.Dir(currentPath), err)
+		}
+		// Set child to new parent
+		parentFD = childFD
+		childFD = -1
+	}
+
+	// We made it to the end, return this fd, don't close it
+	finalFD := parentFD
+	parentFD = -1
+
+	return finalFD, nil
+}

+ 63 - 0
internal/safepath/safepath.go

@@ -0,0 +1,63 @@
+package safepath
+
+import (
+	"context"
+	"fmt"
+	"sync"
+
+	"github.com/containerd/log"
+)
+
+type SafePath struct {
+	path    string
+	cleanup func(ctx context.Context) error
+	mutex   sync.Mutex
+
+	// Immutable fields
+	sourceBase, sourceSubpath string
+}
+
+// Close releases the resources used by the path.
+func (s *SafePath) Close(ctx context.Context) error {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+
+	if s.path == "" {
+		base, sub := s.SourcePath()
+		log.G(ctx).WithFields(log.Fields{
+			"path":          s.Path(),
+			"sourceBase":    base,
+			"sourceSubpath": sub,
+		}).Warn("an attempt to close an already closed SafePath")
+		return nil
+	}
+
+	s.path = ""
+	if s.cleanup != nil {
+		return s.cleanup(ctx)
+	}
+	return nil
+}
+
+// IsValid return true when path can still be used and wasn't cleaned up by Close.
+func (s *SafePath) IsValid() bool {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	return s.path != ""
+}
+
+// Path returns a safe, temporary path that can be used to access the original path.
+func (s *SafePath) Path() string {
+	s.mutex.Lock()
+	defer s.mutex.Unlock()
+	if s.path == "" {
+		panic(fmt.Sprintf("use-after-close attempted for safepath with source [%s, %s]", s.sourceBase, s.sourceSubpath))
+	}
+	return s.path
+}
+
+// SourcePath returns the source path the safepath points to.
+func (s *SafePath) SourcePath() (string, string) {
+	// No mutex lock because these are immutable.
+	return s.sourceBase, s.sourceSubpath
+}

+ 85 - 0
internal/unix_noeintr/fs_unix.go

@@ -0,0 +1,85 @@
+//go:build !windows
+
+// Wrappers for unix syscalls that retry on EINTR
+// TODO: Consider moving (for example to moby/sys) and making the wrappers
+// auto-generated.
+package unix_noeintr
+
+import (
+	"errors"
+
+	"golang.org/x/sys/unix"
+)
+
+func Retry(f func() error) {
+	for {
+		err := f()
+		if !errors.Is(err, unix.EINTR) {
+			return
+		}
+	}
+}
+
+func Mount(source string, target string, fstype string, flags uintptr, data string) (err error) {
+	Retry(func() error {
+		err = unix.Mount(source, target, fstype, flags, data)
+		return err
+	})
+	return
+}
+
+func Unmount(target string, flags int) (err error) {
+	Retry(func() error {
+		err = unix.Unmount(target, flags)
+		return err
+	})
+	return
+}
+
+func Open(path string, mode int, perm uint32) (fd int, err error) {
+	Retry(func() error {
+		fd, err = unix.Open(path, mode, perm)
+		return err
+	})
+	return
+}
+
+func Close(fd int) (err error) {
+	Retry(func() error {
+		err = unix.Close(fd)
+		return err
+	})
+	return
+}
+
+func Openat(dirfd int, path string, mode int, perms uint32) (fd int, err error) {
+	Retry(func() error {
+		fd, err = unix.Openat(dirfd, path, mode, perms)
+		return err
+	})
+	return
+}
+
+func Openat2(dirfd int, path string, how *unix.OpenHow) (fd int, err error) {
+	Retry(func() error {
+		fd, err = unix.Openat2(dirfd, path, how)
+		return err
+	})
+	return
+}
+
+func Fstat(fd int, stat *unix.Stat_t) (err error) {
+	Retry(func() error {
+		err = unix.Fstat(fd, stat)
+		return err
+	})
+	return
+}
+
+func Fstatat(fd int, path string, stat *unix.Stat_t, flags int) (err error) {
+	Retry(func() error {
+		err = unix.Fstatat(fd, path, stat, flags)
+		return err
+	})
+	return
+}

+ 11 - 1
volume/mounts/linux_parser.go

@@ -94,8 +94,18 @@ func (p *linuxParser) validateMountConfigImpl(mnt *mount.Mount, validateBindSour
 		if mnt.BindOptions != nil {
 			return &errMountConfig{mnt, errExtraField("BindOptions")}
 		}
+		anonymousVolume := len(mnt.Source) == 0
 
-		if len(mnt.Source) == 0 && mnt.ReadOnly {
+		if mnt.VolumeOptions != nil && mnt.VolumeOptions.Subpath != "" {
+			if anonymousVolume {
+				return &errMountConfig{mnt, errAnonymousVolumeWithSubpath}
+			}
+
+			if !filepath.IsLocal(mnt.VolumeOptions.Subpath) {
+				return &errMountConfig{mnt, errInvalidSubpath}
+			}
+		}
+		if mnt.ReadOnly && anonymousVolume {
 			return &errMountConfig{mnt, fmt.Errorf("must not set ReadOnly mode when using anonymous volumes")}
 		}
 	case mount.TypeTmpfs:

+ 76 - 20
volume/mounts/mounts.go

@@ -9,6 +9,7 @@ import (
 
 	"github.com/containerd/log"
 	mounttypes "github.com/docker/docker/api/types/mount"
+	"github.com/docker/docker/internal/safepath"
 	"github.com/docker/docker/pkg/idtools"
 	"github.com/docker/docker/pkg/stringid"
 	"github.com/docker/docker/volume"
@@ -74,14 +75,34 @@ type MountPoint struct {
 	// Specifically needed for containers which are running and calls to `docker cp`
 	// because both these actions require mounting the volumes.
 	active int
+
+	// SafePaths created by Setup that should be cleaned up before unmounting
+	// the volume.
+	safePaths []*safepath.SafePath
 }
 
-// Cleanup frees resources used by the mountpoint
-func (m *MountPoint) Cleanup() error {
+// Cleanup frees resources used by the mountpoint and cleans up all the paths
+// returned by Setup that hasn't been cleaned up by the caller.
+func (m *MountPoint) Cleanup(ctx context.Context) error {
 	if m.Volume == nil || m.ID == "" {
 		return nil
 	}
 
+	for _, p := range m.safePaths {
+		if !p.IsValid() {
+			continue
+		}
+
+		err := p.Close(ctx)
+		base, sub := p.SourcePath()
+		log.G(ctx).WithFields(log.Fields{
+			"error":         err,
+			"path":          p.Path(),
+			"sourceBase":    base,
+			"sourceSubpath": sub,
+		}).Warn("cleaning up SafePath that hasn't been cleaned up by the caller")
+	}
+
 	if err := m.Volume.Unmount(m.ID); err != nil {
 		return errors.Wrapf(err, "error unmounting volume %s", m.Volume.Name())
 	}
@@ -97,30 +118,42 @@ func (m *MountPoint) Cleanup() error {
 // configured, or creating the source directory if supplied.
 // The, optional, checkFun parameter allows doing additional checking
 // before creating the source directory on the host.
-func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun func(m *MountPoint) error) (path string, err error) {
+//
+// The returned path can be a temporary path, caller is responsible to
+// call the returned cleanup function as soon as the path is not needed.
+// Cleanup doesn't unmount the underlying volumes (if any), it only
+// frees up the resources that were needed to guarantee that the path
+// still points to the same target (to avoid TOCTOU attack).
+//
+// Cleanup function doesn't need to be called when error is returned.
+func (m *MountPoint) Setup(ctx context.Context, mountLabel string, rootIDs idtools.Identity, checkFun func(m *MountPoint) error) (path string, cleanup func(context.Context) error, retErr error) {
 	if m.SkipMountpointCreation {
-		return m.Source, nil
+		return m.Source, noCleanup, nil
 	}
 
 	defer func() {
-		if err != nil || !label.RelabelNeeded(m.Mode) {
+		if retErr != nil || !label.RelabelNeeded(m.Mode) {
 			return
 		}
 
-		var sourcePath string
-		sourcePath, err = filepath.EvalSymlinks(m.Source)
+		sourcePath, err := filepath.EvalSymlinks(path)
 		if err != nil {
 			path = ""
-			err = errors.Wrapf(err, "error evaluating symlinks from mount source %q", m.Source)
+			retErr = errors.Wrapf(err, "error evaluating symlinks from mount source %q", m.Source)
+			if cleanupErr := cleanup(ctx); cleanupErr != nil {
+				log.G(ctx).WithError(cleanupErr).Warn("failed to cleanup after error")
+			}
+			cleanup = noCleanup
 			return
 		}
 		err = label.Relabel(sourcePath, mountLabel, label.IsShared(m.Mode))
-		if errors.Is(err, syscall.ENOTSUP) {
-			err = nil
-		}
-		if err != nil {
+		if err != nil && !errors.Is(err, syscall.ENOTSUP) {
 			path = ""
-			err = errors.Wrapf(err, "error setting label on mount source '%s'", sourcePath)
+			retErr = errors.Wrapf(err, "error setting label on mount source '%s'", sourcePath)
+			if cleanupErr := cleanup(ctx); cleanupErr != nil {
+				log.G(ctx).WithError(cleanupErr).Warn("failed to cleanup after error")
+			}
+			cleanup = noCleanup
 		}
 	}()
 
@@ -129,18 +162,36 @@ func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun
 		if id == "" {
 			id = stringid.GenerateRandomID()
 		}
-		path, err := m.Volume.Mount(id)
+		volumePath, err := m.Volume.Mount(id)
 		if err != nil {
-			return "", errors.Wrapf(err, "error while mounting volume '%s'", m.Source)
+			return "", noCleanup, errors.Wrapf(err, "error while mounting volume '%s'", m.Source)
 		}
 
 		m.ID = id
+		clean := noCleanup
+		if m.Spec.VolumeOptions != nil && m.Spec.VolumeOptions.Subpath != "" {
+			subpath := m.Spec.VolumeOptions.Subpath
+
+			safePath, err := safepath.Join(ctx, volumePath, subpath)
+			if err != nil {
+				if err := m.Volume.Unmount(id); err != nil {
+					log.G(ctx).WithError(err).Error("failed to unmount after safepath.Join failed")
+				}
+				return "", noCleanup, err
+			}
+			m.safePaths = append(m.safePaths, safePath)
+			log.G(ctx).Debugf("mounting (%s|%s) via %s", volumePath, subpath, safePath.Path())
+
+			clean = safePath.Close
+			volumePath = safePath.Path()
+		}
+
 		m.active++
-		return path, nil
+		return volumePath, clean, nil
 	}
 
 	if len(m.Source) == 0 {
-		return "", fmt.Errorf("Unable to setup mount point, neither source nor volume defined")
+		return "", noCleanup, fmt.Errorf("Unable to setup mount point, neither source nor volume defined")
 	}
 
 	if m.Type == mounttypes.TypeBind {
@@ -149,7 +200,7 @@ func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun
 		// the process of shutting down.
 		if checkFun != nil {
 			if err := checkFun(m); err != nil {
-				return "", err
+				return "", noCleanup, err
 			}
 		}
 
@@ -158,12 +209,12 @@ func (m *MountPoint) Setup(mountLabel string, rootIDs idtools.Identity, checkFun
 		if err := idtools.MkdirAllAndChownNew(m.Source, 0o755, rootIDs); err != nil {
 			if perr, ok := err.(*os.PathError); ok {
 				if perr.Err != syscall.ENOTDIR {
-					return "", errors.Wrapf(err, "error while creating mount source path '%s'", m.Source)
+					return "", noCleanup, errors.Wrapf(err, "error while creating mount source path '%s'", m.Source)
 				}
 			}
 		}
 	}
-	return m.Source, nil
+	return m.Source, noCleanup, nil
 }
 
 func (m *MountPoint) LiveRestore(ctx context.Context) error {
@@ -207,3 +258,8 @@ func errInvalidMode(mode string) error {
 func errInvalidSpec(spec string) error {
 	return errors.Errorf("invalid volume specification: '%s'", spec)
 }
+
+// noCleanup is a no-op cleanup function.
+func noCleanup(_ context.Context) error {
+	return nil
+}

+ 8 - 0
volume/mounts/parser.go

@@ -11,6 +11,14 @@ import (
 // It's used by both LCOW and Linux parsers.
 var ErrVolumeTargetIsRoot = errors.New("invalid specification: destination can't be '/'")
 
+// errAnonymousVolumeWithSubpath is returned when Subpath is specified for
+// anonymous volume.
+var errAnonymousVolumeWithSubpath = errors.New("must not set Subpath when using anonymous volumes")
+
+// errInvalidSubpath is returned when the provided Subpath is not lexically an
+// relative path within volume.
+var errInvalidSubpath = errors.New("subpath must be a relative path within the volume")
+
 // read-write modes
 var rwModes = map[string]bool{
 	"rw": true,

+ 3 - 0
volume/mounts/validate_test.go

@@ -28,6 +28,9 @@ func TestValidateMount(t *testing.T) {
 		{
 			input: mount.Mount{Type: mount.TypeVolume, Target: testDestinationPath},
 		},
+		{
+			input: mount.Mount{Type: mount.TypeVolume, Target: testDestinationPath, Source: "hello", VolumeOptions: &mount.VolumeOptions{Subpath: "world"}},
+		},
 		{
 			input:    mount.Mount{Type: mount.TypeBind},
 			expected: errMissingField("Target"),

+ 14 - 1
volume/mounts/windows_parser.go

@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"path/filepath"
 	"regexp"
 	"runtime"
 	"strings"
@@ -258,7 +259,19 @@ func (p *windowsParser) validateMountConfigReg(mnt *mount.Mount, additionalValid
 			return &errMountConfig{mnt, errExtraField("BindOptions")}
 		}
 
-		if len(mnt.Source) == 0 && mnt.ReadOnly {
+		anonymousVolume := len(mnt.Source) == 0
+		if mnt.VolumeOptions != nil && mnt.VolumeOptions.Subpath != "" {
+			if anonymousVolume {
+				return errAnonymousVolumeWithSubpath
+			}
+
+			// Check if path is relative but without any back traversals
+			if !filepath.IsLocal(mnt.VolumeOptions.Subpath) {
+				return &errMountConfig{mnt, errInvalidSubpath}
+			}
+		}
+
+		if anonymousVolume && mnt.ReadOnly {
 			return &errMountConfig{mnt, fmt.Errorf("must not set ReadOnly mode when using anonymous volumes")}
 		}