فهرست منبع

Merge pull request #44210 from corhere/chrootarchive-without-reexec

Fix 'docker cp' mount table explosion, take four
Brian Goff 2 سال پیش
والد
کامیت
6eab4f55fa

+ 1 - 1
container/archive.go → container/archive_windows.go

@@ -18,7 +18,7 @@ func (container *Container) ResolvePath(path string) (resolvedPath, absPath stri
 	if container.BaseFS == "" {
 	if container.BaseFS == "" {
 		return "", "", errors.New("ResolvePath: BaseFS of container " + container.ID + " is unexpectedly empty")
 		return "", "", errors.New("ResolvePath: BaseFS of container " + container.ID + " is unexpectedly empty")
 	}
 	}
-	// Check if a drive letter supplied, it must be the system drive. No-op except on Windows
+	// Check if a drive letter supplied, it must be the system drive.
 	path, err = system.CheckSystemDriveAndRemoveDriveLetter(path)
 	path, err = system.CheckSystemDriveAndRemoveDriveLetter(path)
 	if err != nil {
 	if err != nil {
 		return "", "", err
 		return "", "", err

+ 0 - 324
daemon/archive.go

@@ -3,17 +3,9 @@ package daemon // import "github.com/docker/docker/daemon"
 import (
 import (
 	"io"
 	"io"
 	"os"
 	"os"
-	"path/filepath"
-	"strings"
 
 
 	"github.com/docker/docker/api/types"
 	"github.com/docker/docker/api/types"
-	"github.com/docker/docker/container"
 	"github.com/docker/docker/errdefs"
 	"github.com/docker/docker/errdefs"
-	"github.com/docker/docker/pkg/archive"
-	"github.com/docker/docker/pkg/chrootarchive"
-	"github.com/docker/docker/pkg/ioutils"
-	"github.com/docker/docker/pkg/system"
-	"github.com/pkg/errors"
 )
 )
 
 
 // ContainerCopy performs a deprecated operation of archiving the resource at
 // ContainerCopy performs a deprecated operation of archiving the resource at
@@ -24,11 +16,6 @@ func (daemon *Daemon) ContainerCopy(name string, res string) (io.ReadCloser, err
 		return nil, err
 		return nil, err
 	}
 	}
 
 
-	// Make sure an online file-system operation is permitted.
-	if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil {
-		return nil, errdefs.System(err)
-	}
-
 	data, err := daemon.containerCopy(ctr, res)
 	data, err := daemon.containerCopy(ctr, res)
 	if err == nil {
 	if err == nil {
 		return data, nil
 		return data, nil
@@ -48,11 +35,6 @@ func (daemon *Daemon) ContainerStatPath(name string, path string) (stat *types.C
 		return nil, err
 		return nil, err
 	}
 	}
 
 
-	// Make sure an online file-system operation is permitted.
-	if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil {
-		return nil, errdefs.System(err)
-	}
-
 	stat, err = daemon.containerStatPath(ctr, path)
 	stat, err = daemon.containerStatPath(ctr, path)
 	if err == nil {
 	if err == nil {
 		return stat, nil
 		return stat, nil
@@ -73,11 +55,6 @@ func (daemon *Daemon) ContainerArchivePath(name string, path string) (content io
 		return nil, nil, err
 		return nil, nil, err
 	}
 	}
 
 
-	// Make sure an online file-system operation is permitted.
-	if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil {
-		return nil, nil, errdefs.System(err)
-	}
-
 	content, stat, err = daemon.containerArchivePath(ctr, path)
 	content, stat, err = daemon.containerArchivePath(ctr, path)
 	if err == nil {
 	if err == nil {
 		return content, stat, nil
 		return content, stat, nil
@@ -101,11 +78,6 @@ func (daemon *Daemon) ContainerExtractToDir(name, path string, copyUIDGID, noOve
 		return err
 		return err
 	}
 	}
 
 
-	// Make sure an online file-system operation is permitted.
-	if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil {
-		return errdefs.System(err)
-	}
-
 	err = daemon.containerExtractToDir(ctr, path, copyUIDGID, noOverwriteDirNonDir, content)
 	err = daemon.containerExtractToDir(ctr, path, copyUIDGID, noOverwriteDirNonDir, content)
 	if err == nil {
 	if err == nil {
 		return nil
 		return nil
@@ -116,299 +88,3 @@ func (daemon *Daemon) ContainerExtractToDir(name, path string, copyUIDGID, noOve
 	}
 	}
 	return errdefs.System(err)
 	return errdefs.System(err)
 }
 }
-
-// containerStatPath stats the filesystem resource at the specified path in this
-// container. Returns stat info about the resource.
-func (daemon *Daemon) containerStatPath(container *container.Container, path string) (stat *types.ContainerPathStat, err error) {
-	container.Lock()
-	defer container.Unlock()
-
-	if err = daemon.Mount(container); err != nil {
-		return nil, err
-	}
-	defer daemon.Unmount(container)
-
-	err = daemon.mountVolumes(container)
-	defer container.DetachAndUnmount(daemon.LogVolumeEvent)
-	if err != nil {
-		return nil, err
-	}
-
-	// Normalize path before sending to rootfs
-	path = filepath.FromSlash(path)
-
-	resolvedPath, absPath, err := container.ResolvePath(path)
-	if err != nil {
-		return nil, err
-	}
-
-	return container.StatPath(resolvedPath, absPath)
-}
-
-// containerArchivePath creates an archive of the filesystem resource at the specified
-// path in this container. Returns a tar archive of the resource and stat info
-// about the resource.
-func (daemon *Daemon) containerArchivePath(container *container.Container, path string) (content io.ReadCloser, stat *types.ContainerPathStat, err error) {
-	container.Lock()
-
-	defer func() {
-		if err != nil {
-			// Wait to unlock the container until the archive is fully read
-			// (see the ReadCloseWrapper func below) or if there is an error
-			// before that occurs.
-			container.Unlock()
-		}
-	}()
-
-	if err = daemon.Mount(container); err != nil {
-		return nil, nil, err
-	}
-
-	defer func() {
-		if err != nil {
-			// unmount any volumes
-			container.DetachAndUnmount(daemon.LogVolumeEvent)
-			// unmount the container's rootfs
-			daemon.Unmount(container)
-		}
-	}()
-
-	if err = daemon.mountVolumes(container); err != nil {
-		return nil, nil, err
-	}
-
-	// Normalize path before sending to rootfs
-	path = filepath.FromSlash(path)
-
-	resolvedPath, absPath, err := container.ResolvePath(path)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	stat, err = container.StatPath(resolvedPath, absPath)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	// We need to rebase the archive entries if the last element of the
-	// resolved path was a symlink that was evaluated and is now different
-	// than the requested path. For example, if the given path was "/foo/bar/",
-	// but it resolved to "/var/lib/docker/containers/{id}/foo/baz/", we want
-	// to ensure that the archive entries start with "bar" and not "baz". This
-	// also catches the case when the root directory of the container is
-	// requested: we want the archive entries to start with "/" and not the
-	// container ID.
-
-	// Get the source and the base paths of the container resolved path in order
-	// to get the proper tar options for the rebase tar.
-	resolvedPath = filepath.Clean(resolvedPath)
-	if filepath.Base(resolvedPath) == "." {
-		resolvedPath += string(filepath.Separator) + "."
-	}
-
-	sourceDir := resolvedPath
-	sourceBase := "."
-
-	if stat.Mode&os.ModeDir == 0 { // not dir
-		sourceDir, sourceBase = filepath.Split(resolvedPath)
-	}
-	opts := archive.TarResourceRebaseOpts(sourceBase, filepath.Base(absPath))
-
-	data, err := chrootarchive.Tar(sourceDir, opts, container.BaseFS)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	content = ioutils.NewReadCloserWrapper(data, func() error {
-		err := data.Close()
-		container.DetachAndUnmount(daemon.LogVolumeEvent)
-		daemon.Unmount(container)
-		container.Unlock()
-		return err
-	})
-
-	daemon.LogContainerEvent(container, "archive-path")
-
-	return content, stat, nil
-}
-
-// containerExtractToDir extracts the given tar archive to the specified location in the
-// filesystem of this container. The given path must be of a directory in the
-// container. If it is not, the error will be an errdefs.InvalidParameter. If
-// noOverwriteDirNonDir is true then it will be an error if unpacking the
-// given content would cause an existing directory to be replaced with a non-
-// directory and vice versa.
-func (daemon *Daemon) containerExtractToDir(container *container.Container, path string, copyUIDGID, noOverwriteDirNonDir bool, content io.Reader) (err error) {
-	container.Lock()
-	defer container.Unlock()
-
-	if err = daemon.Mount(container); err != nil {
-		return err
-	}
-	defer daemon.Unmount(container)
-
-	err = daemon.mountVolumes(container)
-	defer container.DetachAndUnmount(daemon.LogVolumeEvent)
-	if err != nil {
-		return err
-	}
-
-	// Normalize path before sending to rootfs'
-	path = filepath.FromSlash(path)
-
-	// Check if a drive letter supplied, it must be the system drive. No-op except on Windows
-	path, err = system.CheckSystemDriveAndRemoveDriveLetter(path)
-	if err != nil {
-		return err
-	}
-
-	// The destination path needs to be resolved to a host path, with all
-	// symbolic links followed in the scope of the container's rootfs. Note
-	// that we do not use `container.ResolvePath(path)` here because we need
-	// to also evaluate the last path element if it is a symlink. This is so
-	// that you can extract an archive to a symlink that points to a directory.
-
-	// Consider the given path as an absolute path in the container.
-	absPath := archive.PreserveTrailingDotOrSeparator(filepath.Join(string(filepath.Separator), path), path)
-
-	// This will evaluate the last path element if it is a symlink.
-	resolvedPath, err := container.GetResourcePath(absPath)
-	if err != nil {
-		return err
-	}
-
-	stat, err := os.Lstat(resolvedPath)
-	if err != nil {
-		return err
-	}
-
-	if !stat.IsDir() {
-		return errdefs.InvalidParameter(errors.New("extraction point is not a directory"))
-	}
-
-	// Need to check if the path is in a volume. If it is, it cannot be in a
-	// read-only volume. If it is not in a volume, the container cannot be
-	// configured with a read-only rootfs.
-
-	// Use the resolved path relative to the container rootfs as the new
-	// absPath. This way we fully follow any symlinks in a volume that may
-	// lead back outside the volume.
-	//
-	// The Windows implementation of filepath.Rel in golang 1.4 does not
-	// support volume style file path semantics. On Windows when using the
-	// filter driver, we are guaranteed that the path will always be
-	// a volume file path.
-	var baseRel string
-	if strings.HasPrefix(resolvedPath, `\\?\Volume{`) {
-		if strings.HasPrefix(resolvedPath, container.BaseFS) {
-			baseRel = resolvedPath[len(container.BaseFS):]
-			if baseRel[:1] == `\` {
-				baseRel = baseRel[1:]
-			}
-		}
-	} else {
-		baseRel, err = filepath.Rel(container.BaseFS, resolvedPath)
-	}
-	if err != nil {
-		return err
-	}
-	// Make it an absolute path.
-	absPath = filepath.Join(string(filepath.Separator), baseRel)
-
-	toVolume, err := checkIfPathIsInAVolume(container, absPath)
-	if err != nil {
-		return err
-	}
-
-	if !toVolume && container.HostConfig.ReadonlyRootfs {
-		return errdefs.InvalidParameter(errors.New("container rootfs is marked read-only"))
-	}
-
-	options := daemon.defaultTarCopyOptions(noOverwriteDirNonDir)
-
-	if copyUIDGID {
-		var err error
-		// tarCopyOptions will appropriately pull in the right uid/gid for the
-		// user/group and will set the options.
-		options, err = daemon.tarCopyOptions(container, noOverwriteDirNonDir)
-		if err != nil {
-			return err
-		}
-	}
-
-	if err := chrootarchive.UntarWithRoot(content, resolvedPath, options, container.BaseFS); err != nil {
-		return err
-	}
-
-	daemon.LogContainerEvent(container, "extract-to-dir")
-
-	return nil
-}
-
-func (daemon *Daemon) containerCopy(container *container.Container, resource string) (rc io.ReadCloser, err error) {
-	if resource[0] == '/' || resource[0] == '\\' {
-		resource = resource[1:]
-	}
-	container.Lock()
-
-	defer func() {
-		if err != nil {
-			// Wait to unlock the container until the archive is fully read
-			// (see the ReadCloseWrapper func below) or if there is an error
-			// before that occurs.
-			container.Unlock()
-		}
-	}()
-
-	if err := daemon.Mount(container); err != nil {
-		return nil, err
-	}
-
-	defer func() {
-		if err != nil {
-			// unmount any volumes
-			container.DetachAndUnmount(daemon.LogVolumeEvent)
-			// unmount the container's rootfs
-			daemon.Unmount(container)
-		}
-	}()
-
-	if err := daemon.mountVolumes(container); err != nil {
-		return nil, err
-	}
-
-	// Normalize path before sending to rootfs
-	resource = filepath.FromSlash(resource)
-
-	basePath, err := container.GetResourcePath(resource)
-	if err != nil {
-		return nil, err
-	}
-	stat, err := os.Stat(basePath)
-	if err != nil {
-		return nil, err
-	}
-	var filter []string
-	if !stat.IsDir() {
-		d, f := filepath.Split(basePath)
-		basePath = d
-		filter = []string{f}
-	}
-	archv, err := chrootarchive.Tar(basePath, &archive.TarOptions{
-		Compression:  archive.Uncompressed,
-		IncludeFiles: filter,
-	}, container.BaseFS)
-	if err != nil {
-		return nil, err
-	}
-
-	reader := ioutils.NewReadCloserWrapper(archv, func() error {
-		err := archv.Close()
-		container.DetachAndUnmount(daemon.LogVolumeEvent)
-		daemon.Unmount(container)
-		container.Unlock()
-		return err
-	})
-	daemon.LogContainerEvent(container, "copy")
-	return reader, nil
-}

+ 200 - 6
daemon/archive_unix.go

@@ -4,12 +4,212 @@
 package daemon // import "github.com/docker/docker/daemon"
 package daemon // import "github.com/docker/docker/daemon"
 
 
 import (
 import (
+	"context"
+	"io"
+	"os"
+	"path/filepath"
+
+	"github.com/docker/docker/api/types"
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/errdefs"
 	"github.com/docker/docker/errdefs"
+	"github.com/docker/docker/pkg/archive"
+	"github.com/docker/docker/pkg/ioutils"
 	volumemounts "github.com/docker/docker/volume/mounts"
 	volumemounts "github.com/docker/docker/volume/mounts"
 	"github.com/pkg/errors"
 	"github.com/pkg/errors"
 )
 )
 
 
+// containerStatPath stats the filesystem resource at the specified path in this
+// container. Returns stat info about the resource.
+func (daemon *Daemon) containerStatPath(container *container.Container, path string) (stat *types.ContainerPathStat, err error) {
+	container.Lock()
+	defer container.Unlock()
+
+	cfs, err := daemon.openContainerFS(container)
+	if err != nil {
+		return nil, err
+	}
+	defer cfs.Close()
+
+	return cfs.Stat(context.TODO(), path)
+}
+
+// containerArchivePath creates an archive of the filesystem resource at the specified
+// path in this container. Returns a tar archive of the resource and stat info
+// about the resource.
+func (daemon *Daemon) containerArchivePath(container *container.Container, path string) (content io.ReadCloser, stat *types.ContainerPathStat, err error) {
+	container.Lock()
+
+	defer func() {
+		if err != nil {
+			// Wait to unlock the container until the archive is fully read
+			// (see the ReadCloseWrapper func below) or if there is an error
+			// before that occurs.
+			container.Unlock()
+		}
+	}()
+
+	cfs, err := daemon.openContainerFS(container)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	defer func() {
+		if err != nil {
+			cfs.Close()
+		}
+	}()
+
+	absPath := archive.PreserveTrailingDotOrSeparator(filepath.Join("/", path), path)
+
+	stat, err = cfs.Stat(context.TODO(), absPath)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	sourceDir, sourceBase := absPath, "."
+	if stat.Mode&os.ModeDir == 0 { // not dir
+		sourceDir, sourceBase = filepath.Split(absPath)
+	}
+	opts := archive.TarResourceRebaseOpts(sourceBase, filepath.Base(absPath))
+
+	tb, err := archive.NewTarballer(sourceDir, opts)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	cfs.GoInFS(context.TODO(), tb.Do)
+	data := tb.Reader()
+	content = ioutils.NewReadCloserWrapper(data, func() error {
+		err := data.Close()
+		_ = cfs.Close()
+		container.Unlock()
+		return err
+	})
+
+	daemon.LogContainerEvent(container, "archive-path")
+
+	return content, stat, nil
+}
+
+// containerExtractToDir extracts the given tar archive to the specified location in the
+// filesystem of this container. The given path must be of a directory in the
+// container. If it is not, the error will be an errdefs.InvalidParameter. If
+// noOverwriteDirNonDir is true then it will be an error if unpacking the
+// given content would cause an existing directory to be replaced with a non-
+// directory and vice versa.
+func (daemon *Daemon) containerExtractToDir(container *container.Container, path string, copyUIDGID, noOverwriteDirNonDir bool, content io.Reader) (err error) {
+	container.Lock()
+	defer container.Unlock()
+
+	cfs, err := daemon.openContainerFS(container)
+	if err != nil {
+		return err
+	}
+	defer cfs.Close()
+
+	err = cfs.RunInFS(context.TODO(), func() error {
+		// The destination path needs to be resolved with all symbolic links
+		// followed. Note that we need to also evaluate the last path element if
+		// it is a symlink. This is so that you can extract an archive to a
+		// symlink that points to a directory.
+		absPath, err := filepath.EvalSymlinks(filepath.Join("/", path))
+		if err != nil {
+			return err
+		}
+		absPath = archive.PreserveTrailingDotOrSeparator(absPath, path)
+
+		stat, err := os.Lstat(absPath)
+		if err != nil {
+			return err
+		}
+		if !stat.IsDir() {
+			return errdefs.InvalidParameter(errors.New("extraction point is not a directory"))
+		}
+
+		// Need to check if the path is in a volume. If it is, it cannot be in a
+		// read-only volume. If it is not in a volume, the container cannot be
+		// configured with a read-only rootfs.
+		toVolume, err := checkIfPathIsInAVolume(container, absPath)
+		if err != nil {
+			return err
+		}
+
+		if !toVolume && container.HostConfig.ReadonlyRootfs {
+			return errdefs.InvalidParameter(errors.New("container rootfs is marked read-only"))
+		}
+
+		options := daemon.defaultTarCopyOptions(noOverwriteDirNonDir)
+
+		if copyUIDGID {
+			var err error
+			// tarCopyOptions will appropriately pull in the right uid/gid for the
+			// user/group and will set the options.
+			options, err = daemon.tarCopyOptions(container, noOverwriteDirNonDir)
+			if err != nil {
+				return err
+			}
+		}
+
+		return archive.Untar(content, absPath, options)
+	})
+	if err != nil {
+		return err
+	}
+
+	daemon.LogContainerEvent(container, "extract-to-dir")
+
+	return nil
+}
+
+func (daemon *Daemon) containerCopy(container *container.Container, resource string) (rc io.ReadCloser, err error) {
+	container.Lock()
+
+	defer func() {
+		if err != nil {
+			// Wait to unlock the container until the archive is fully read
+			// (see the ReadCloseWrapper func below) or if there is an error
+			// before that occurs.
+			container.Unlock()
+		}
+	}()
+
+	cfs, err := daemon.openContainerFS(container)
+	if err != nil {
+		return nil, err
+	}
+	defer func() {
+		if err != nil {
+			cfs.Close()
+		}
+	}()
+
+	err = cfs.RunInFS(context.TODO(), func() error {
+		_, err := os.Stat(resource)
+		return err
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	tb, err := archive.NewTarballer(resource, &archive.TarOptions{
+		Compression: archive.Uncompressed,
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	cfs.GoInFS(context.TODO(), tb.Do)
+	archv := tb.Reader()
+	reader := ioutils.NewReadCloserWrapper(archv, func() error {
+		err := archv.Close()
+		_ = cfs.Close()
+		container.Unlock()
+		return err
+	})
+	daemon.LogContainerEvent(container, "copy")
+	return reader, nil
+}
+
 // checkIfPathIsInAVolume checks if the path is in a volume. If it is, it
 // checkIfPathIsInAVolume checks if the path is in a volume. If it is, it
 // cannot be in a read-only volume. If it  is not in a volume, the container
 // cannot be in a read-only volume. If it  is not in a volume, the container
 // cannot be configured with a read-only rootfs.
 // cannot be configured with a read-only rootfs.
@@ -26,9 +226,3 @@ func checkIfPathIsInAVolume(container *container.Container, absPath string) (boo
 	}
 	}
 	return toVolume, nil
 	return toVolume, nil
 }
 }
-
-// isOnlineFSOperationPermitted returns an error if an online filesystem operation
-// is not permitted.
-func (daemon *Daemon) isOnlineFSOperationPermitted(container *container.Container) error {
-	return nil
-}

+ 328 - 2
daemon/archive_windows.go

@@ -2,11 +2,337 @@ package daemon // import "github.com/docker/docker/daemon"
 
 
 import (
 import (
 	"errors"
 	"errors"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
 
 
+	"github.com/docker/docker/api/types"
 	containertypes "github.com/docker/docker/api/types/container"
 	containertypes "github.com/docker/docker/api/types/container"
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/container"
+	"github.com/docker/docker/errdefs"
+	"github.com/docker/docker/pkg/archive"
+	"github.com/docker/docker/pkg/chrootarchive"
+	"github.com/docker/docker/pkg/ioutils"
+	"github.com/docker/docker/pkg/system"
 )
 )
 
 
+// containerStatPath stats the filesystem resource at the specified path in this
+// container. Returns stat info about the resource.
+func (daemon *Daemon) containerStatPath(container *container.Container, path string) (stat *types.ContainerPathStat, err error) {
+	container.Lock()
+	defer container.Unlock()
+
+	// Make sure an online file-system operation is permitted.
+	if err := daemon.isOnlineFSOperationPermitted(container); err != nil {
+		return nil, err
+	}
+
+	if err = daemon.Mount(container); err != nil {
+		return nil, err
+	}
+	defer daemon.Unmount(container)
+
+	err = daemon.mountVolumes(container)
+	defer container.DetachAndUnmount(daemon.LogVolumeEvent)
+	if err != nil {
+		return nil, err
+	}
+
+	// Normalize path before sending to rootfs
+	path = filepath.FromSlash(path)
+
+	resolvedPath, absPath, err := container.ResolvePath(path)
+	if err != nil {
+		return nil, err
+	}
+
+	return container.StatPath(resolvedPath, absPath)
+}
+
+// containerArchivePath creates an archive of the filesystem resource at the specified
+// path in this container. Returns a tar archive of the resource and stat info
+// about the resource.
+func (daemon *Daemon) containerArchivePath(container *container.Container, path string) (content io.ReadCloser, stat *types.ContainerPathStat, err error) {
+	container.Lock()
+
+	defer func() {
+		if err != nil {
+			// Wait to unlock the container until the archive is fully read
+			// (see the ReadCloseWrapper func below) or if there is an error
+			// before that occurs.
+			container.Unlock()
+		}
+	}()
+
+	// Make sure an online file-system operation is permitted.
+	if err := daemon.isOnlineFSOperationPermitted(container); err != nil {
+		return nil, nil, err
+	}
+
+	if err = daemon.Mount(container); err != nil {
+		return nil, nil, err
+	}
+
+	defer func() {
+		if err != nil {
+			// unmount any volumes
+			container.DetachAndUnmount(daemon.LogVolumeEvent)
+			// unmount the container's rootfs
+			daemon.Unmount(container)
+		}
+	}()
+
+	if err = daemon.mountVolumes(container); err != nil {
+		return nil, nil, err
+	}
+
+	// Normalize path before sending to rootfs
+	path = filepath.FromSlash(path)
+
+	resolvedPath, absPath, err := container.ResolvePath(path)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	stat, err = container.StatPath(resolvedPath, absPath)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// We need to rebase the archive entries if the last element of the
+	// resolved path was a symlink that was evaluated and is now different
+	// than the requested path. For example, if the given path was "/foo/bar/",
+	// but it resolved to "/var/lib/docker/containers/{id}/foo/baz/", we want
+	// to ensure that the archive entries start with "bar" and not "baz". This
+	// also catches the case when the root directory of the container is
+	// requested: we want the archive entries to start with "/" and not the
+	// container ID.
+
+	// Get the source and the base paths of the container resolved path in order
+	// to get the proper tar options for the rebase tar.
+	resolvedPath = filepath.Clean(resolvedPath)
+	if filepath.Base(resolvedPath) == "." {
+		resolvedPath += string(filepath.Separator) + "."
+	}
+
+	sourceDir := resolvedPath
+	sourceBase := "."
+
+	if stat.Mode&os.ModeDir == 0 { // not dir
+		sourceDir, sourceBase = filepath.Split(resolvedPath)
+	}
+	opts := archive.TarResourceRebaseOpts(sourceBase, filepath.Base(absPath))
+
+	data, err := chrootarchive.Tar(sourceDir, opts, container.BaseFS)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	content = ioutils.NewReadCloserWrapper(data, func() error {
+		err := data.Close()
+		container.DetachAndUnmount(daemon.LogVolumeEvent)
+		daemon.Unmount(container)
+		container.Unlock()
+		return err
+	})
+
+	daemon.LogContainerEvent(container, "archive-path")
+
+	return content, stat, nil
+}
+
+// containerExtractToDir extracts the given tar archive to the specified location in the
+// filesystem of this container. The given path must be of a directory in the
+// container. If it is not, the error will be an errdefs.InvalidParameter. If
+// noOverwriteDirNonDir is true then it will be an error if unpacking the
+// given content would cause an existing directory to be replaced with a non-
+// directory and vice versa.
+func (daemon *Daemon) containerExtractToDir(container *container.Container, path string, copyUIDGID, noOverwriteDirNonDir bool, content io.Reader) (err error) {
+	container.Lock()
+	defer container.Unlock()
+
+	// Make sure an online file-system operation is permitted.
+	if err := daemon.isOnlineFSOperationPermitted(container); err != nil {
+		return err
+	}
+
+	if err = daemon.Mount(container); err != nil {
+		return err
+	}
+	defer daemon.Unmount(container)
+
+	err = daemon.mountVolumes(container)
+	defer container.DetachAndUnmount(daemon.LogVolumeEvent)
+	if err != nil {
+		return err
+	}
+
+	// Normalize path before sending to rootfs'
+	path = filepath.FromSlash(path)
+
+	// Check if a drive letter supplied, it must be the system drive. No-op except on Windows
+	path, err = system.CheckSystemDriveAndRemoveDriveLetter(path)
+	if err != nil {
+		return err
+	}
+
+	// The destination path needs to be resolved to a host path, with all
+	// symbolic links followed in the scope of the container's rootfs. Note
+	// that we do not use `container.ResolvePath(path)` here because we need
+	// to also evaluate the last path element if it is a symlink. This is so
+	// that you can extract an archive to a symlink that points to a directory.
+
+	// Consider the given path as an absolute path in the container.
+	absPath := archive.PreserveTrailingDotOrSeparator(filepath.Join(string(filepath.Separator), path), path)
+
+	// This will evaluate the last path element if it is a symlink.
+	resolvedPath, err := container.GetResourcePath(absPath)
+	if err != nil {
+		return err
+	}
+
+	stat, err := os.Lstat(resolvedPath)
+	if err != nil {
+		return err
+	}
+
+	if !stat.IsDir() {
+		return errdefs.InvalidParameter(errors.New("extraction point is not a directory"))
+	}
+
+	// Need to check if the path is in a volume. If it is, it cannot be in a
+	// read-only volume. If it is not in a volume, the container cannot be
+	// configured with a read-only rootfs.
+
+	// Use the resolved path relative to the container rootfs as the new
+	// absPath. This way we fully follow any symlinks in a volume that may
+	// lead back outside the volume.
+	//
+	// The Windows implementation of filepath.Rel in golang 1.4 does not
+	// support volume style file path semantics. On Windows when using the
+	// filter driver, we are guaranteed that the path will always be
+	// a volume file path.
+	var baseRel string
+	if strings.HasPrefix(resolvedPath, `\\?\Volume{`) {
+		if strings.HasPrefix(resolvedPath, container.BaseFS) {
+			baseRel = resolvedPath[len(container.BaseFS):]
+			if baseRel[:1] == `\` {
+				baseRel = baseRel[1:]
+			}
+		}
+	} else {
+		baseRel, err = filepath.Rel(container.BaseFS, resolvedPath)
+	}
+	if err != nil {
+		return err
+	}
+	// Make it an absolute path.
+	absPath = filepath.Join(string(filepath.Separator), baseRel)
+
+	toVolume, err := checkIfPathIsInAVolume(container, absPath)
+	if err != nil {
+		return err
+	}
+
+	if !toVolume && container.HostConfig.ReadonlyRootfs {
+		return errdefs.InvalidParameter(errors.New("container rootfs is marked read-only"))
+	}
+
+	options := daemon.defaultTarCopyOptions(noOverwriteDirNonDir)
+
+	if copyUIDGID {
+		var err error
+		// tarCopyOptions will appropriately pull in the right uid/gid for the
+		// user/group and will set the options.
+		options, err = daemon.tarCopyOptions(container, noOverwriteDirNonDir)
+		if err != nil {
+			return err
+		}
+	}
+
+	if err := chrootarchive.UntarWithRoot(content, resolvedPath, options, container.BaseFS); err != nil {
+		return err
+	}
+
+	daemon.LogContainerEvent(container, "extract-to-dir")
+
+	return nil
+}
+
+func (daemon *Daemon) containerCopy(container *container.Container, resource string) (rc io.ReadCloser, err error) {
+	if resource[0] == '/' || resource[0] == '\\' {
+		resource = resource[1:]
+	}
+	container.Lock()
+
+	defer func() {
+		if err != nil {
+			// Wait to unlock the container until the archive is fully read
+			// (see the ReadCloseWrapper func below) or if there is an error
+			// before that occurs.
+			container.Unlock()
+		}
+	}()
+
+	// Make sure an online file-system operation is permitted.
+	if err := daemon.isOnlineFSOperationPermitted(container); err != nil {
+		return nil, err
+	}
+
+	if err := daemon.Mount(container); err != nil {
+		return nil, err
+	}
+
+	defer func() {
+		if err != nil {
+			// unmount any volumes
+			container.DetachAndUnmount(daemon.LogVolumeEvent)
+			// unmount the container's rootfs
+			daemon.Unmount(container)
+		}
+	}()
+
+	if err := daemon.mountVolumes(container); err != nil {
+		return nil, err
+	}
+
+	// Normalize path before sending to rootfs
+	resource = filepath.FromSlash(resource)
+
+	basePath, err := container.GetResourcePath(resource)
+	if err != nil {
+		return nil, err
+	}
+	stat, err := os.Stat(basePath)
+	if err != nil {
+		return nil, err
+	}
+	var filter []string
+	if !stat.IsDir() {
+		d, f := filepath.Split(basePath)
+		basePath = d
+		filter = []string{f}
+	}
+	archv, err := chrootarchive.Tar(basePath, &archive.TarOptions{
+		Compression:  archive.Uncompressed,
+		IncludeFiles: filter,
+	}, container.BaseFS)
+	if err != nil {
+		return nil, err
+	}
+
+	reader := ioutils.NewReadCloserWrapper(archv, func() error {
+		err := archv.Close()
+		container.DetachAndUnmount(daemon.LogVolumeEvent)
+		daemon.Unmount(container)
+		container.Unlock()
+		return err
+	})
+	daemon.LogContainerEvent(container, "copy")
+	return reader, nil
+}
+
 // checkIfPathIsInAVolume checks if the path is in a volume. If it is, it
 // checkIfPathIsInAVolume checks if the path is in a volume. If it is, it
 // cannot be in a read-only volume. If it  is not in a volume, the container
 // cannot be in a read-only volume. If it  is not in a volume, the container
 // cannot be configured with a read-only rootfs.
 // cannot be configured with a read-only rootfs.
@@ -21,9 +347,9 @@ func checkIfPathIsInAVolume(container *container.Container, absPath string) (boo
 // is not permitted (such as stat or for copying). Running Hyper-V containers
 // is not permitted (such as stat or for copying). Running Hyper-V containers
 // cannot have their file-system interrogated from the host as the filter is
 // cannot have their file-system interrogated from the host as the filter is
 // loaded inside the utility VM, not the host.
 // loaded inside the utility VM, not the host.
-// IMPORTANT: The container lock must NOT be held when calling this function.
+// IMPORTANT: The container lock MUST be held when calling this function.
 func (daemon *Daemon) isOnlineFSOperationPermitted(container *container.Container) error {
 func (daemon *Daemon) isOnlineFSOperationPermitted(container *container.Container) error {
-	if !container.IsRunning() {
+	if !container.Running {
 		return nil
 		return nil
 	}
 	}
 
 

+ 221 - 0
daemon/containerfs_linux.go

@@ -0,0 +1,221 @@
+package daemon // import "github.com/docker/docker/daemon"
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+
+	"github.com/hashicorp/go-multierror"
+	"github.com/moby/sys/mount"
+	"github.com/moby/sys/symlink"
+	"golang.org/x/sys/unix"
+
+	"github.com/docker/docker/api/types"
+	"github.com/docker/docker/container"
+	"github.com/docker/docker/internal/mounttree"
+	"github.com/docker/docker/internal/unshare"
+	"github.com/docker/docker/pkg/fileutils"
+)
+
+type future struct {
+	fn  func() error
+	res chan<- error
+}
+
+// containerFSView allows functions to be run in the context of a container's
+// filesystem. Inside these functions, the root directory is the container root
+// for all native OS filesystem APIs, including, but not limited to, the [os]
+// and [golang.org/x/sys/unix] packages. The view of the container's filesystem
+// is live and read-write. Each view has its own private set of tmpfs mounts.
+// Any files written under a tmpfs mount are not visible to processes inside the
+// container nor any other view of the container's filesystem, and vice versa.
+//
+// Each view has its own current working directory which is initialized to the
+// root of the container filesystem and can be changed with [os.Chdir]. Changes
+// to the current directory persist across successive [*containerFSView.RunInFS]
+// and [*containerFSView.GoInFS] calls.
+//
+// Multiple views of the same container filesystem can coexist at the same time.
+// Only one function can be running in a particular filesystem view at any given
+// time. Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will
+// block while another function is running. If more than one call is blocked
+// concurrently, the order they are unblocked is undefined.
+type containerFSView struct {
+	d    *Daemon
+	ctr  *container.Container
+	todo chan future
+	done chan error
+}
+
+// openContainerFS opens a new view of the container's filesystem.
+func (daemon *Daemon) openContainerFS(container *container.Container) (_ *containerFSView, err error) {
+	if err := daemon.Mount(container); err != nil {
+		return nil, err
+	}
+	defer func() {
+		if err != nil {
+			_ = daemon.Unmount(container)
+		}
+	}()
+
+	mounts, err := daemon.setupMounts(container)
+	if err != nil {
+		return nil, err
+	}
+	defer func() {
+		if err != nil {
+			_ = container.UnmountVolumes(daemon.LogVolumeEvent)
+		}
+	}()
+
+	// Setup in initial mount namespace complete. We're ready to unshare the
+	// mount namespace and bind the volume mounts into that private view of
+	// the container FS.
+	todo := make(chan future)
+	done := make(chan error)
+	err = unshare.Go(unix.CLONE_NEWNS,
+		func() error {
+			if err := mount.MakeRSlave("/"); err != nil {
+				return err
+			}
+			for _, m := range mounts {
+				dest, err := container.GetResourcePath(m.Destination)
+				if err != nil {
+					return err
+				}
+
+				var stat os.FileInfo
+				stat, err = os.Stat(m.Source)
+				if err != nil {
+					return err
+				}
+				if err := fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil {
+					return err
+				}
+
+				bindMode := "rbind"
+				if m.NonRecursive {
+					bindMode = "bind"
+				}
+				writeMode := "ro"
+				if m.Writable {
+					writeMode = "rw"
+				}
+
+				// openContainerFS() is called for temporary mounts
+				// outside the container. Soon these will be unmounted
+				// with lazy unmount option and given we have mounted
+				// them rbind, all the submounts will propagate if these
+				// are shared. If daemon is running in host namespace
+				// and has / as shared then these unmounts will
+				// propagate and unmount original mount as well. So make
+				// all these mounts rprivate.  Do not use propagation
+				// property of volume as that should apply only when
+				// mounting happens inside the container.
+				opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",")
+				if err := mount.Mount(m.Source, dest, "", opts); err != nil {
+					return err
+				}
+			}
+
+			return mounttree.SwitchRoot(container.BaseFS)
+		},
+		func() {
+			defer close(done)
+
+			for it := range todo {
+				err := it.fn()
+				if it.res != nil {
+					it.res <- err
+				}
+			}
+
+			// The thread will terminate when this goroutine returns, taking the
+			// mount namespace and all the volume bind-mounts with it.
+		},
+	)
+	if err != nil {
+		return nil, err
+	}
+	vw := &containerFSView{
+		d:    daemon,
+		ctr:  container,
+		todo: todo,
+		done: done,
+	}
+	runtime.SetFinalizer(vw, (*containerFSView).Close)
+	return vw, nil
+}
+
+// RunInFS synchronously runs fn in the context of the container filesytem and
+// passes through its return value.
+//
+// The container filesystem is only visible to functions called in the same
+// goroutine as fn. Goroutines started from fn will see the host's filesystem.
+func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error {
+	res := make(chan error)
+	select {
+	case vw.todo <- future{fn: fn, res: res}:
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+	return <-res
+}
+
+// GoInFS starts fn in the container FS. It blocks until fn is started but does
+// not wait until fn returns. An error is returned if ctx is canceled before fn
+// has been started.
+//
+// The container filesystem is only visible to functions called in the same
+// goroutine as fn. Goroutines started from fn will see the host's filesystem.
+func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error {
+	select {
+	case vw.todo <- future{fn: func() error { fn(); return nil }}:
+		return nil
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+}
+
+// Close waits until any in-flight operations complete and frees all
+// resources associated with vw.
+func (vw *containerFSView) Close() error {
+	runtime.SetFinalizer(vw, nil)
+	close(vw.todo)
+	err := multierror.Append(nil, <-vw.done)
+	err = multierror.Append(err, vw.ctr.UnmountVolumes(vw.d.LogVolumeEvent))
+	err = multierror.Append(err, vw.d.Unmount(vw.ctr))
+	return err.ErrorOrNil()
+}
+
+// Stat returns the metadata for path, relative to the current working directory
+// of vw inside the container filesystem view.
+func (vw *containerFSView) Stat(ctx context.Context, path string) (*types.ContainerPathStat, error) {
+	var stat *types.ContainerPathStat
+	err := vw.RunInFS(ctx, func() error {
+		lstat, err := os.Lstat(path)
+		if err != nil {
+			return err
+		}
+		var target string
+		if lstat.Mode()&os.ModeSymlink != 0 {
+			// Fully evaluate symlinks along path to the ultimate
+			// target, or as much as possible with broken links.
+			target, err = symlink.FollowSymlinkInScope(path, "/")
+			if err != nil {
+				return err
+			}
+		}
+		stat = &types.ContainerPathStat{
+			Name:       filepath.Base(path),
+			Size:       lstat.Size(),
+			Mode:       lstat.Mode(),
+			Mtime:      lstat.ModTime(),
+			LinkTarget: target,
+		}
+		return nil
+	})
+	return stat, err
+}

+ 0 - 50
daemon/volumes_unix.go

@@ -12,9 +12,7 @@ import (
 
 
 	mounttypes "github.com/docker/docker/api/types/mount"
 	mounttypes "github.com/docker/docker/api/types/mount"
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/container"
-	"github.com/docker/docker/pkg/fileutils"
 	volumemounts "github.com/docker/docker/volume/mounts"
 	volumemounts "github.com/docker/docker/volume/mounts"
-	"github.com/moby/sys/mount"
 )
 )
 
 
 // setupMounts iterates through each of the mount points for a container and
 // setupMounts iterates through each of the mount points for a container and
@@ -112,51 +110,3 @@ func setBindModeIfNull(bind *volumemounts.MountPoint) {
 		bind.Mode = "z"
 		bind.Mode = "z"
 	}
 	}
 }
 }
-
-func (daemon *Daemon) mountVolumes(container *container.Container) error {
-	mounts, err := daemon.setupMounts(container)
-	if err != nil {
-		return err
-	}
-
-	for _, m := range mounts {
-		dest, err := container.GetResourcePath(m.Destination)
-		if err != nil {
-			return err
-		}
-
-		var stat os.FileInfo
-		stat, err = os.Stat(m.Source)
-		if err != nil {
-			return err
-		}
-		if err = fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil {
-			return err
-		}
-
-		bindMode := "rbind"
-		if m.NonRecursive {
-			bindMode = "bind"
-		}
-		writeMode := "ro"
-		if m.Writable {
-			writeMode = "rw"
-		}
-
-		// mountVolumes() seems to be called for temporary mounts
-		// outside the container. Soon these will be unmounted with
-		// lazy unmount option and given we have mounted the rbind,
-		// all the submounts will propagate if these are shared. If
-		// daemon is running in host namespace and has / as shared
-		// then these unmounts will propagate and unmount original
-		// mount as well. So make all these mounts rprivate.
-		// Do not use propagation property of volume as that should
-		// apply only when mounting happens inside the container.
-		opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",")
-		if err := mount.Mount(m.Source, dest, "", opts); err != nil {
-			return err
-		}
-	}
-
-	return nil
-}

+ 4 - 0
hack/dind

@@ -37,6 +37,10 @@ if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
 		> /sys/fs/cgroup/cgroup.subtree_control
 		> /sys/fs/cgroup/cgroup.subtree_control
 fi
 fi
 
 
+# Change mount propagation to shared to make the environment more similar to a
+# modern Linux system, e.g. with SystemD as PID 1.
+mount --make-rshared /
+
 if [ $# -gt 0 ]; then
 if [ $# -gt 0 ]; then
 	exec "$@"
 	exec "$@"
 fi
 fi

+ 5 - 0
hack/dind-systemd

@@ -13,6 +13,11 @@ if [ ! -t 0 ]; then
 	exit 1
 	exit 1
 fi
 fi
 
 
+# Change mount propagation to shared, which SystemD PID 1 would normally do
+# itself when started by the kernel. SystemD skips that when it detects it is
+# running in a container.
+mount --make-rshared /
+
 env > /etc/docker-entrypoint-env
 env > /etc/docker-entrypoint-env
 
 
 cat > /etc/systemd/system/docker-entrypoint.target << EOF
 cat > /etc/systemd/system/docker-entrypoint.target << EOF

+ 7 - 0
integration/container/copy_test.go

@@ -158,16 +158,23 @@ func TestCopyFromContainer(t *testing.T) {
 		expect map[string]string
 		expect map[string]string
 	}{
 	}{
 		{"/", map[string]string{"/": "", "/foo": "hello", "/bar/quux/baz": "world", "/bar/filesymlink": "", "/bar/dirsymlink": "", "/bar/notarget": ""}},
 		{"/", map[string]string{"/": "", "/foo": "hello", "/bar/quux/baz": "world", "/bar/filesymlink": "", "/bar/dirsymlink": "", "/bar/notarget": ""}},
+		{".", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}},
+		{"/.", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}},
+		{"./", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}},
+		{"/./", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}},
 		{"/bar/root", map[string]string{"root": ""}},
 		{"/bar/root", map[string]string{"root": ""}},
 		{"/bar/root/", map[string]string{"root/": "", "root/foo": "hello", "root/bar/quux/baz": "world", "root/bar/filesymlink": "", "root/bar/dirsymlink": "", "root/bar/notarget": ""}},
 		{"/bar/root/", map[string]string{"root/": "", "root/foo": "hello", "root/bar/quux/baz": "world", "root/bar/filesymlink": "", "root/bar/dirsymlink": "", "root/bar/notarget": ""}},
+		{"/bar/root/.", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}},
 
 
 		{"bar/quux", map[string]string{"quux/": "", "quux/baz": "world"}},
 		{"bar/quux", map[string]string{"quux/": "", "quux/baz": "world"}},
 		{"bar/quux/", map[string]string{"quux/": "", "quux/baz": "world"}},
 		{"bar/quux/", map[string]string{"quux/": "", "quux/baz": "world"}},
+		{"bar/quux/.", map[string]string{"./": "", "./baz": "world"}},
 		{"bar/quux/baz", map[string]string{"baz": "world"}},
 		{"bar/quux/baz", map[string]string{"baz": "world"}},
 
 
 		{"bar/filesymlink", map[string]string{"filesymlink": ""}},
 		{"bar/filesymlink", map[string]string{"filesymlink": ""}},
 		{"bar/dirsymlink", map[string]string{"dirsymlink": ""}},
 		{"bar/dirsymlink", map[string]string{"dirsymlink": ""}},
 		{"bar/dirsymlink/", map[string]string{"dirsymlink/": "", "dirsymlink/baz": "world"}},
 		{"bar/dirsymlink/", map[string]string{"dirsymlink/": "", "dirsymlink/baz": "world"}},
+		{"bar/dirsymlink/.", map[string]string{"./": "", "./baz": "world"}},
 		{"bar/notarget", map[string]string{"notarget": ""}},
 		{"bar/notarget", map[string]string{"notarget": ""}},
 	} {
 	} {
 		t.Run(x.src, func(t *testing.T) {
 		t.Run(x.src, func(t *testing.T) {

+ 35 - 0
integration/container/mounts_linux_test.go

@@ -393,3 +393,38 @@ func TestContainerVolumesMountedAsSlave(t *testing.T) {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 }
 }
+
+// Regression test for #38995 and #43390.
+func TestContainerCopyLeaksMounts(t *testing.T) {
+	defer setupTest(t)()
+
+	bindMount := mounttypes.Mount{
+		Type:   mounttypes.TypeBind,
+		Source: "/var",
+		Target: "/hostvar",
+		BindOptions: &mounttypes.BindOptions{
+			Propagation: mounttypes.PropagationRSlave,
+		},
+	}
+
+	ctx := context.Background()
+	client := testEnv.APIClient()
+	cid := container.Run(ctx, t, client, container.WithMount(bindMount), container.WithCmd("sleep", "120s"))
+
+	getMounts := func() string {
+		t.Helper()
+		res, err := container.Exec(ctx, client, cid, []string{"cat", "/proc/self/mountinfo"})
+		assert.NilError(t, err)
+		assert.Equal(t, res.ExitCode, 0)
+		return res.Stdout()
+	}
+
+	mountsBefore := getMounts()
+
+	_, _, err := client.CopyFromContainer(ctx, cid, "/etc/passwd")
+	assert.NilError(t, err)
+
+	mountsAfter := getMounts()
+
+	assert.Equal(t, mountsBefore, mountsAfter)
+}

+ 94 - 0
internal/mounttree/switchroot_linux.go

@@ -0,0 +1,94 @@
+package mounttree // import "github.com/docker/docker/internal/mounttree"
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/moby/sys/mount"
+	"github.com/moby/sys/mountinfo"
+	"golang.org/x/sys/unix"
+)
+
+// SwitchRoot changes path to be the root of the mount tree and changes the
+// current working directory to the new root.
+//
+// This function bind-mounts onto path; it is the caller's responsibility to set
+// the desired propagation mode of path's parent mount beforehand to prevent
+// unwanted propagation into different mount namespaces.
+func SwitchRoot(path string) error {
+	if mounted, _ := mountinfo.Mounted(path); !mounted {
+		if err := mount.Mount(path, path, "bind", "rbind,rw"); err != nil {
+			return realChroot(path)
+		}
+	}
+
+	// setup oldRoot for pivot_root
+	pivotDir, err := os.MkdirTemp(path, ".pivot_root")
+	if err != nil {
+		return fmt.Errorf("Error setting up pivot dir: %v", err)
+	}
+
+	var mounted bool
+	defer func() {
+		if mounted {
+			// make sure pivotDir is not mounted before we try to remove it
+			if errCleanup := unix.Unmount(pivotDir, unix.MNT_DETACH); errCleanup != nil {
+				if err == nil {
+					err = errCleanup
+				}
+				return
+			}
+		}
+
+		errCleanup := os.Remove(pivotDir)
+		// pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful
+		// because we already cleaned it up on failed pivot_root
+		if errCleanup != nil && !os.IsNotExist(errCleanup) {
+			errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup)
+			if err == nil {
+				err = errCleanup
+			}
+		}
+	}()
+
+	if err := unix.PivotRoot(path, pivotDir); err != nil {
+		// If pivot fails, fall back to the normal chroot after cleaning up temp dir
+		if err := os.Remove(pivotDir); err != nil {
+			return fmt.Errorf("Error cleaning up after failed pivot: %v", err)
+		}
+		return realChroot(path)
+	}
+	mounted = true
+
+	// This is the new path for where the old root (prior to the pivot) has been moved to
+	// This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction
+	pivotDir = filepath.Join("/", filepath.Base(pivotDir))
+
+	if err := unix.Chdir("/"); err != nil {
+		return fmt.Errorf("Error changing to new root: %v", err)
+	}
+
+	// Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host
+	if err := unix.Mount("", pivotDir, "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil {
+		return fmt.Errorf("Error making old root private after pivot: %v", err)
+	}
+
+	// Now unmount the old root so it's no longer visible from the new root
+	if err := unix.Unmount(pivotDir, unix.MNT_DETACH); err != nil {
+		return fmt.Errorf("Error while unmounting old root after pivot: %v", err)
+	}
+	mounted = false
+
+	return nil
+}
+
+func realChroot(path string) error {
+	if err := unix.Chroot(path); err != nil {
+		return fmt.Errorf("Error after fallback to chroot: %v", err)
+	}
+	if err := unix.Chdir("/"); err != nil {
+		return fmt.Errorf("Error changing to new root after chroot: %v", err)
+	}
+	return nil
+}

+ 176 - 0
internal/unshare/unshare_linux.go

@@ -0,0 +1,176 @@
+//go:build go1.10
+// +build go1.10
+
+package unshare // import "github.com/docker/docker/internal/unshare"
+
+import (
+	"fmt"
+	"os"
+	"runtime"
+
+	"golang.org/x/sys/unix"
+)
+
+func init() {
+	// The startup thread of a process is special in a few different ways.
+	// Most pertinent to the discussion at hand, any per-thread kernel state
+	// reflected in the /proc/[pid]/ directory for a process is taken from
+	// the state of the startup thread. Same goes for /proc/self/; it shows
+	// the state of the current process' startup thread, no matter which
+	// thread the files are being opened from. For most programs this is a
+	// distinction without a difference as the kernel state, such as the
+	// mount namespace and current working directory, is shared among (and
+	// kept synchronized across) all threads of a process. But things start
+	// to break down once threads start unsharing and modifying parts of
+	// their kernel state.
+	//
+	// The Go runtime schedules goroutines to execute on the startup thread,
+	// same as any other. How this could be problematic is best illustrated
+	// with a concrete example. Consider what happens if a call to
+	// Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
+	// onto the startup thread. The thread's mount namespace will be
+	// unshared and modified. The contents of the /proc/[pid]/mountinfo file
+	// will then describe the mount tree of the unshared namespace, not the
+	// namespace of any other thread. It will remain this way until the
+	// process exits. (The startup thread is special in another way: exiting
+	// it puts the process into a "non-waitable zombie" state. To avoid this
+	// fate, the Go runtime parks the thread instead of exiting if a
+	// goroutine returns while locked to the startup thread. More
+	// information can be found in the Go runtime sources:
+	// `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
+	// package reads from /proc/self/mountinfo, so will read the mount tree
+	// for the wrong namespace if the startup thread has had its mount
+	// namespace unshared! The /proc/thread-self/ directory, introduced in
+	// Linux 3.17, is one potential solution to this problem, but every
+	// package which opens files in /proc/self/ would need to be updated,
+	// and fallbacks to /proc/self/task/[tid]/ would be required to support
+	// older kernels. Overlooking any reference to /proc/self/ would
+	// manifest as stochastically-reproducible bugs, so this is far from an
+	// ideal solution.
+	//
+	// Reading from /proc/self/ would not be a problem if we could prevent
+	// the per-thread state of the startup thread from being modified
+	// nondeterministically in the first place. We can accomplish this
+	// simply by locking the main() function to the startup thread! Doing so
+	// excludes any other goroutine from being scheduled on the thread.
+	runtime.LockOSThread()
+}
+
+// reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
+// reversed using setns(2). The values are the basenames of the corresponding
+// /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
+// state.
+var reversibleSetnsFlags = map[int]string{
+	unix.CLONE_NEWCGROUP: "cgroup",
+	unix.CLONE_NEWNET:    "net",
+	unix.CLONE_NEWUTS:    "uts",
+	unix.CLONE_NEWPID:    "pid",
+	unix.CLONE_NEWTIME:   "time",
+
+	// The following CLONE_NEW* flags are not included because they imply
+	// another, irreversible flag when used with unshare(2).
+	//  - unix.CLONE_NEWIPC:  implies CLONE_SYSVMEM
+	//  - unix.CLONE_NEWNS:   implies CLONE_FS
+	//  - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
+}
+
+// Go calls the given functions in a new goroutine, locked to an OS thread,
+// which has had the parts of its execution state disassociated from the rest of
+// the current process using [unshare(2)]. It blocks until the new goroutine has
+// started and setupfn has returned. fn is only called if setupfn returns nil. A
+// nil setupfn or fn is equivalent to passing a no-op function.
+//
+// The disassociated execution state and any changes made to it are only visible
+// to the goroutine which the functions are called in. Any other goroutines,
+// including ones started from the function, will see the same execution state
+// as the rest of the process.
+//
+// The acceptable flags are documented in the [unshare(2)] Linux man-page.
+// The corresponding CLONE_* constants are defined in package [unix].
+//
+// # Warning
+//
+// This function may terminate the thread which the new goroutine executed on
+// after fn returns, which could cause subprocesses started with the
+// [syscall.SysProcAttr] Pdeathsig field set to be signaled before process
+// termination. Any subprocess started before this function is called may be
+// affected, in addition to any subprocesses started inside setupfn or fn.
+// There are more details at https://go.dev/issue/27505.
+//
+// [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
+func Go(flags int, setupfn func() error, fn func()) error {
+	started := make(chan error)
+
+	maskedFlags := flags
+	for f := range reversibleSetnsFlags {
+		maskedFlags &^= f
+	}
+	isReversible := maskedFlags == 0
+
+	go func() {
+		// Prepare to manipulate per-thread kernel state.
+		runtime.LockOSThread()
+
+		// Not all changes to the execution state can be reverted.
+		// If an irreversible change to the execution state is made, our
+		// only recourse is to have the tampered thread terminated by
+		// returning from this function while the goroutine remains
+		// wired to the thread. The Go runtime will terminate the thread
+		// and replace it with a fresh one as needed.
+
+		if isReversible {
+			defer func() {
+				if isReversible {
+					// All execution state has been restored without error.
+					// The thread is once again fungible.
+					runtime.UnlockOSThread()
+				}
+			}()
+			tid := unix.Gettid()
+			for f, ns := range reversibleSetnsFlags {
+				if flags&f != f {
+					continue
+				}
+				// The /proc/thread-self directory was added in Linux 3.17.
+				// We are not using it to maximize compatibility.
+				pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns)
+				fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0)
+				if err != nil {
+					started <- &os.PathError{Op: "open", Path: pth, Err: err}
+					return
+				}
+				defer func() {
+					if isReversible {
+						if err := unix.Setns(fd, 0); err != nil {
+							isReversible = false
+						}
+					}
+					_ = unix.Close(fd)
+				}()
+			}
+		}
+
+		// Threads are implemented under Linux as processes which share
+		// a virtual memory space. Therefore in a multithreaded process
+		// unshare(2) disassociates parts of the calling thread's
+		// context from the thread it was clone(2)'d from.
+		if err := unix.Unshare(flags); err != nil {
+			started <- os.NewSyscallError("unshare", err)
+			return
+		}
+
+		if setupfn != nil {
+			if err := setupfn(); err != nil {
+				started <- err
+				return
+			}
+		}
+		close(started)
+
+		if fn != nil {
+			fn()
+		}
+	}()
+
+	return <-started
+}

+ 185 - 148
pkg/archive/archive.go

@@ -821,10 +821,29 @@ func Tar(path string, compression Compression) (io.ReadCloser, error) {
 // TarWithOptions creates an archive from the directory at `path`, only including files whose relative
 // TarWithOptions creates an archive from the directory at `path`, only including files whose relative
 // paths are included in `options.IncludeFiles` (if non-nil) or not in `options.ExcludePatterns`.
 // paths are included in `options.IncludeFiles` (if non-nil) or not in `options.ExcludePatterns`.
 func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error) {
 func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error) {
-	// Fix the source path to work with long path names. This is a no-op
-	// on platforms other than Windows.
-	srcPath = fixVolumePathPrefix(srcPath)
+	tb, err := NewTarballer(srcPath, options)
+	if err != nil {
+		return nil, err
+	}
+	go tb.Do()
+	return tb.Reader(), nil
+}
+
+// Tarballer is a lower-level interface to TarWithOptions which gives the caller
+// control over which goroutine the archiving operation executes on.
+type Tarballer struct {
+	srcPath           string
+	options           *TarOptions
+	pm                *patternmatcher.PatternMatcher
+	pipeReader        *io.PipeReader
+	pipeWriter        *io.PipeWriter
+	compressWriter    io.WriteCloser
+	whiteoutConverter tarWhiteoutConverter
+}
 
 
+// NewTarballer constructs a new tarballer. The arguments are the same as for
+// TarWithOptions.
+func NewTarballer(srcPath string, options *TarOptions) (*Tarballer, error) {
 	pm, err := patternmatcher.New(options.ExcludePatterns)
 	pm, err := patternmatcher.New(options.ExcludePatterns)
 	if err != nil {
 	if err != nil {
 		return nil, err
 		return nil, err
@@ -842,183 +861,201 @@ func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error)
 		return nil, err
 		return nil, err
 	}
 	}
 
 
-	go func() {
-		ta := newTarAppender(
-			options.IDMap,
-			compressWriter,
-			options.ChownOpts,
-		)
-		ta.WhiteoutConverter = whiteoutConverter
-
-		defer func() {
-			// Make sure to check the error on Close.
-			if err := ta.TarWriter.Close(); err != nil {
-				logrus.Errorf("Can't close tar writer: %s", err)
-			}
-			if err := compressWriter.Close(); err != nil {
-				logrus.Errorf("Can't close compress writer: %s", err)
-			}
-			if err := pipeWriter.Close(); err != nil {
-				logrus.Errorf("Can't close pipe writer: %s", err)
-			}
-		}()
+	return &Tarballer{
+		// Fix the source path to work with long path names. This is a no-op
+		// on platforms other than Windows.
+		srcPath:           fixVolumePathPrefix(srcPath),
+		options:           options,
+		pm:                pm,
+		pipeReader:        pipeReader,
+		pipeWriter:        pipeWriter,
+		compressWriter:    compressWriter,
+		whiteoutConverter: whiteoutConverter,
+	}, nil
+}
 
 
-		// this buffer is needed for the duration of this piped stream
-		defer pools.BufioWriter32KPool.Put(ta.Buffer)
+// Reader returns the reader for the created archive.
+func (t *Tarballer) Reader() io.ReadCloser {
+	return t.pipeReader
+}
 
 
-		// In general we log errors here but ignore them because
-		// during e.g. a diff operation the container can continue
-		// mutating the filesystem and we can see transient errors
-		// from this
+// Do performs the archiving operation in the background. The resulting archive
+// can be read from t.Reader(). Do should only be called once on each Tarballer
+// instance.
+func (t *Tarballer) Do() {
+	ta := newTarAppender(
+		t.options.IDMap,
+		t.compressWriter,
+		t.options.ChownOpts,
+	)
+	ta.WhiteoutConverter = t.whiteoutConverter
 
 
-		stat, err := os.Lstat(srcPath)
-		if err != nil {
-			return
+	defer func() {
+		// Make sure to check the error on Close.
+		if err := ta.TarWriter.Close(); err != nil {
+			logrus.Errorf("Can't close tar writer: %s", err)
 		}
 		}
-
-		if !stat.IsDir() {
-			// We can't later join a non-dir with any includes because the
-			// 'walk' will error if "file/." is stat-ed and "file" is not a
-			// directory. So, we must split the source path and use the
-			// basename as the include.
-			if len(options.IncludeFiles) > 0 {
-				logrus.Warn("Tar: Can't archive a file with includes")
-			}
-
-			dir, base := SplitPathDirEntry(srcPath)
-			srcPath = dir
-			options.IncludeFiles = []string{base}
+		if err := t.compressWriter.Close(); err != nil {
+			logrus.Errorf("Can't close compress writer: %s", err)
 		}
 		}
-
-		if len(options.IncludeFiles) == 0 {
-			options.IncludeFiles = []string{"."}
+		if err := t.pipeWriter.Close(); err != nil {
+			logrus.Errorf("Can't close pipe writer: %s", err)
 		}
 		}
+	}()
 
 
-		seen := make(map[string]bool)
-
-		for _, include := range options.IncludeFiles {
-			rebaseName := options.RebaseNames[include]
+	// this buffer is needed for the duration of this piped stream
+	defer pools.BufioWriter32KPool.Put(ta.Buffer)
 
 
-			var (
-				parentMatchInfo []patternmatcher.MatchInfo
-				parentDirs      []string
-			)
+	// In general we log errors here but ignore them because
+	// during e.g. a diff operation the container can continue
+	// mutating the filesystem and we can see transient errors
+	// from this
 
 
-			walkRoot := getWalkRoot(srcPath, include)
-			filepath.WalkDir(walkRoot, func(filePath string, f os.DirEntry, err error) error {
-				if err != nil {
-					logrus.Errorf("Tar: Can't stat file %s to tar: %s", srcPath, err)
-					return nil
-				}
+	stat, err := os.Lstat(t.srcPath)
+	if err != nil {
+		return
+	}
 
 
-				relFilePath, err := filepath.Rel(srcPath, filePath)
-				if err != nil || (!options.IncludeSourceDir && relFilePath == "." && f.IsDir()) {
-					// Error getting relative path OR we are looking
-					// at the source directory path. Skip in both situations.
-					return nil
-				}
+	if !stat.IsDir() {
+		// We can't later join a non-dir with any includes because the
+		// 'walk' will error if "file/." is stat-ed and "file" is not a
+		// directory. So, we must split the source path and use the
+		// basename as the include.
+		if len(t.options.IncludeFiles) > 0 {
+			logrus.Warn("Tar: Can't archive a file with includes")
+		}
 
 
-				if options.IncludeSourceDir && include == "." && relFilePath != "." {
-					relFilePath = strings.Join([]string{".", relFilePath}, string(filepath.Separator))
-				}
+		dir, base := SplitPathDirEntry(t.srcPath)
+		t.srcPath = dir
+		t.options.IncludeFiles = []string{base}
+	}
 
 
-				skip := false
-
-				// If "include" is an exact match for the current file
-				// then even if there's an "excludePatterns" pattern that
-				// matches it, don't skip it. IOW, assume an explicit 'include'
-				// is asking for that file no matter what - which is true
-				// for some files, like .dockerignore and Dockerfile (sometimes)
-				if include != relFilePath {
-					for len(parentDirs) != 0 {
-						lastParentDir := parentDirs[len(parentDirs)-1]
-						if strings.HasPrefix(relFilePath, lastParentDir+string(os.PathSeparator)) {
-							break
-						}
-						parentDirs = parentDirs[:len(parentDirs)-1]
-						parentMatchInfo = parentMatchInfo[:len(parentMatchInfo)-1]
-					}
+	if len(t.options.IncludeFiles) == 0 {
+		t.options.IncludeFiles = []string{"."}
+	}
 
 
-					var matchInfo patternmatcher.MatchInfo
-					if len(parentMatchInfo) != 0 {
-						skip, matchInfo, err = pm.MatchesUsingParentResults(relFilePath, parentMatchInfo[len(parentMatchInfo)-1])
-					} else {
-						skip, matchInfo, err = pm.MatchesUsingParentResults(relFilePath, patternmatcher.MatchInfo{})
-					}
-					if err != nil {
-						logrus.Errorf("Error matching %s: %v", relFilePath, err)
-						return err
-					}
+	seen := make(map[string]bool)
 
 
-					if f.IsDir() {
-						parentDirs = append(parentDirs, relFilePath)
-						parentMatchInfo = append(parentMatchInfo, matchInfo)
-					}
-				}
+	for _, include := range t.options.IncludeFiles {
+		rebaseName := t.options.RebaseNames[include]
 
 
-				if skip {
-					// If we want to skip this file and its a directory
-					// then we should first check to see if there's an
-					// excludes pattern (e.g. !dir/file) that starts with this
-					// dir. If so then we can't skip this dir.
+		var (
+			parentMatchInfo []patternmatcher.MatchInfo
+			parentDirs      []string
+		)
 
 
-					// Its not a dir then so we can just return/skip.
-					if !f.IsDir() {
-						return nil
-					}
+		walkRoot := getWalkRoot(t.srcPath, include)
+		filepath.WalkDir(walkRoot, func(filePath string, f os.DirEntry, err error) error {
+			if err != nil {
+				logrus.Errorf("Tar: Can't stat file %s to tar: %s", t.srcPath, err)
+				return nil
+			}
 
 
-					// No exceptions (!...) in patterns so just skip dir
-					if !pm.Exclusions() {
-						return filepath.SkipDir
-					}
+			relFilePath, err := filepath.Rel(t.srcPath, filePath)
+			if err != nil || (!t.options.IncludeSourceDir && relFilePath == "." && f.IsDir()) {
+				// Error getting relative path OR we are looking
+				// at the source directory path. Skip in both situations.
+				return nil
+			}
 
 
-					dirSlash := relFilePath + string(filepath.Separator)
+			if t.options.IncludeSourceDir && include == "." && relFilePath != "." {
+				relFilePath = strings.Join([]string{".", relFilePath}, string(filepath.Separator))
+			}
 
 
-					for _, pat := range pm.Patterns() {
-						if !pat.Exclusion() {
-							continue
-						}
-						if strings.HasPrefix(pat.String()+string(filepath.Separator), dirSlash) {
-							// found a match - so can't skip this dir
-							return nil
-						}
+			skip := false
+
+			// If "include" is an exact match for the current file
+			// then even if there's an "excludePatterns" pattern that
+			// matches it, don't skip it. IOW, assume an explicit 'include'
+			// is asking for that file no matter what - which is true
+			// for some files, like .dockerignore and Dockerfile (sometimes)
+			if include != relFilePath {
+				for len(parentDirs) != 0 {
+					lastParentDir := parentDirs[len(parentDirs)-1]
+					if strings.HasPrefix(relFilePath, lastParentDir+string(os.PathSeparator)) {
+						break
 					}
 					}
+					parentDirs = parentDirs[:len(parentDirs)-1]
+					parentMatchInfo = parentMatchInfo[:len(parentMatchInfo)-1]
+				}
 
 
-					// No matching exclusion dir so just skip dir
-					return filepath.SkipDir
+				var matchInfo patternmatcher.MatchInfo
+				if len(parentMatchInfo) != 0 {
+					skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, parentMatchInfo[len(parentMatchInfo)-1])
+				} else {
+					skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, patternmatcher.MatchInfo{})
+				}
+				if err != nil {
+					logrus.Errorf("Error matching %s: %v", relFilePath, err)
+					return err
+				}
+
+				if f.IsDir() {
+					parentDirs = append(parentDirs, relFilePath)
+					parentMatchInfo = append(parentMatchInfo, matchInfo)
 				}
 				}
+			}
+
+			if skip {
+				// If we want to skip this file and it's a directory
+				// then we should first check to see if there's an
+				// excludes pattern (e.g. !dir/file) that starts with this
+				// dir. If so then we can't skip this dir.
 
 
-				if seen[relFilePath] {
+				// It's not a dir then so we can just return/skip.
+				if !f.IsDir() {
 					return nil
 					return nil
 				}
 				}
-				seen[relFilePath] = true
-
-				// Rename the base resource.
-				if rebaseName != "" {
-					var replacement string
-					if rebaseName != string(filepath.Separator) {
-						// Special case the root directory to replace with an
-						// empty string instead so that we don't end up with
-						// double slashes in the paths.
-						replacement = rebaseName
-					}
 
 
-					relFilePath = strings.Replace(relFilePath, include, replacement, 1)
+				// No exceptions (!...) in patterns so just skip dir
+				if !t.pm.Exclusions() {
+					return filepath.SkipDir
 				}
 				}
 
 
-				if err := ta.addTarFile(filePath, relFilePath); err != nil {
-					logrus.Errorf("Can't add file %s to tar: %s", filePath, err)
-					// if pipe is broken, stop writing tar stream to it
-					if err == io.ErrClosedPipe {
-						return err
+				dirSlash := relFilePath + string(filepath.Separator)
+
+				for _, pat := range t.pm.Patterns() {
+					if !pat.Exclusion() {
+						continue
+					}
+					if strings.HasPrefix(pat.String()+string(filepath.Separator), dirSlash) {
+						// found a match - so can't skip this dir
+						return nil
 					}
 					}
 				}
 				}
+
+				// No matching exclusion dir so just skip dir
+				return filepath.SkipDir
+			}
+
+			if seen[relFilePath] {
 				return nil
 				return nil
-			})
-		}
-	}()
+			}
+			seen[relFilePath] = true
+
+			// Rename the base resource.
+			if rebaseName != "" {
+				var replacement string
+				if rebaseName != string(filepath.Separator) {
+					// Special case the root directory to replace with an
+					// empty string instead so that we don't end up with
+					// double slashes in the paths.
+					replacement = rebaseName
+				}
 
 
-	return pipeReader, nil
+				relFilePath = strings.Replace(relFilePath, include, replacement, 1)
+			}
+
+			if err := ta.addTarFile(filePath, relFilePath); err != nil {
+				logrus.Errorf("Can't add file %s to tar: %s", filePath, err)
+				// if pipe is broken, stop writing tar stream to it
+				if err == io.ErrClosedPipe {
+					return err
+				}
+			}
+			return nil
+		})
+	}
 }
 }
 
 
 // Unpack unpacks the decompressedArchive to dest with options.
 // Unpack unpacks the decompressedArchive to dest with options.

+ 1 - 1
pkg/archive/diff.go

@@ -87,7 +87,7 @@ func UnpackLayer(dest string, layer io.Reader, options *TarOptions) (size int64,
 				basename := filepath.Base(hdr.Name)
 				basename := filepath.Base(hdr.Name)
 				aufsHardlinks[basename] = hdr
 				aufsHardlinks[basename] = hdr
 				if aufsTempdir == "" {
 				if aufsTempdir == "" {
-					if aufsTempdir, err = os.MkdirTemp("", "dockerplnk"); err != nil {
+					if aufsTempdir, err = os.MkdirTemp(dest, "dockerplnk"); err != nil {
 						return 0, err
 						return 0, err
 					}
 					}
 					defer os.RemoveAll(aufsTempdir)
 					defer os.RemoveAll(aufsTempdir)

+ 37 - 189
pkg/chrootarchive/archive_unix.go

@@ -4,223 +4,71 @@
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
 
 
 import (
 import (
-	"bytes"
-	"encoding/json"
-	"flag"
-	"fmt"
 	"io"
 	"io"
-	"os"
 	"path/filepath"
 	"path/filepath"
-	"runtime"
 	"strings"
 	"strings"
 
 
 	"github.com/docker/docker/pkg/archive"
 	"github.com/docker/docker/pkg/archive"
-	"github.com/docker/docker/pkg/reexec"
 	"github.com/pkg/errors"
 	"github.com/pkg/errors"
 )
 )
 
 
-// untar is the entry-point for docker-untar on re-exec. This is not used on
-// Windows as it does not support chroot, hence no point sandboxing through
-// chroot and rexec.
-func untar() {
-	runtime.LockOSThread()
-	flag.Parse()
-
-	var options archive.TarOptions
-
-	// read the options from the pipe "ExtraFiles"
-	if err := json.NewDecoder(os.NewFile(3, "options")).Decode(&options); err != nil {
-		fatal(err)
-	}
-
-	dst := flag.Arg(0)
-	var root string
-	if len(flag.Args()) > 1 {
-		root = flag.Arg(1)
-	}
-
-	if root == "" {
-		root = dst
-	}
-
-	if err := chroot(root); err != nil {
-		fatal(err)
-	}
-
-	if err := archive.Unpack(os.Stdin, dst, &options); err != nil {
-		fatal(err)
-	}
-	// fully consume stdin in case it is zero padded
-	if _, err := flush(os.Stdin); err != nil {
-		fatal(err)
-	}
-
-	os.Exit(0)
-}
-
 func invokeUnpack(decompressedArchive io.Reader, dest string, options *archive.TarOptions, root string) error {
 func invokeUnpack(decompressedArchive io.Reader, dest string, options *archive.TarOptions, root string) error {
-	if root == "" {
-		return errors.New("must specify a root to chroot to")
-	}
-
-	// We can't pass a potentially large exclude list directly via cmd line
-	// because we easily overrun the kernel's max argument/environment size
-	// when the full image list is passed (e.g. when this is used by
-	// `docker load`). We will marshall the options via a pipe to the
-	// child
-	r, w, err := os.Pipe()
+	relDest, err := resolvePathInChroot(root, dest)
 	if err != nil {
 	if err != nil {
-		return fmt.Errorf("Untar pipe failure: %v", err)
-	}
-
-	if root != "" {
-		relDest, err := filepath.Rel(root, dest)
-		if err != nil {
-			return err
-		}
-		if relDest == "." {
-			relDest = "/"
-		}
-		if relDest[0] != '/' {
-			relDest = "/" + relDest
-		}
-		dest = relDest
+		return err
 	}
 	}
 
 
-	cmd := reexec.Command("docker-untar", dest, root)
-	cmd.Stdin = decompressedArchive
-
-	cmd.ExtraFiles = append(cmd.ExtraFiles, r)
-	output := bytes.NewBuffer(nil)
-	cmd.Stdout = output
-	cmd.Stderr = output
-
-	// reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, which
-	// causes the started process to be signaled when the creating OS thread
-	// dies. Ensure that the reexec is not prematurely signaled. See
-	// https://go.dev/issue/27505 for more information.
-	runtime.LockOSThread()
-	defer runtime.UnlockOSThread()
-	if err := cmd.Start(); err != nil {
-		w.Close()
-		return fmt.Errorf("Untar error on re-exec cmd: %v", err)
-	}
-
-	// write the options to the pipe for the untar exec to read
-	if err := json.NewEncoder(w).Encode(options); err != nil {
-		w.Close()
-		return fmt.Errorf("Untar json encode to pipe failed: %v", err)
-	}
-	w.Close()
-
-	if err := cmd.Wait(); err != nil {
-		// when `xz -d -c -q | docker-untar ...` failed on docker-untar side,
-		// we need to exhaust `xz`'s output, otherwise the `xz` side will be
-		// pending on write pipe forever
-		io.Copy(io.Discard, decompressedArchive)
-
-		return fmt.Errorf("Error processing tar file(%v): %s", err, output)
-	}
-	return nil
-}
-
-func tar() {
-	runtime.LockOSThread()
-	flag.Parse()
-
-	src := flag.Arg(0)
-	var root string
-	if len(flag.Args()) > 1 {
-		root = flag.Arg(1)
-	}
-
-	if root == "" {
-		root = src
-	}
-
-	if err := realChroot(root); err != nil {
-		fatal(err)
-	}
-
-	var options archive.TarOptions
-	if err := json.NewDecoder(os.Stdin).Decode(&options); err != nil {
-		fatal(err)
-	}
-
-	rdr, err := archive.TarWithOptions(src, &options)
+	done := make(chan error)
+	err = goInChroot(root, func() { done <- archive.Unpack(decompressedArchive, relDest, options) })
 	if err != nil {
 	if err != nil {
-		fatal(err)
-	}
-	defer rdr.Close()
-
-	if _, err := io.Copy(os.Stdout, rdr); err != nil {
-		fatal(err)
+		return err
 	}
 	}
-
-	os.Exit(0)
+	return <-done
 }
 }
 
 
 func invokePack(srcPath string, options *archive.TarOptions, root string) (io.ReadCloser, error) {
 func invokePack(srcPath string, options *archive.TarOptions, root string) (io.ReadCloser, error) {
-	if root == "" {
-		return nil, errors.New("root path must not be empty")
-	}
-
-	relSrc, err := filepath.Rel(root, srcPath)
+	relSrc, err := resolvePathInChroot(root, srcPath)
 	if err != nil {
 	if err != nil {
 		return nil, err
 		return nil, err
 	}
 	}
-	if relSrc == "." {
-		relSrc = "/"
-	}
-	if relSrc[0] != '/' {
-		relSrc = "/" + relSrc
-	}
 
 
-	// make sure we didn't trim a trailing slash with the call to `Rel`
+	// make sure we didn't trim a trailing slash with the call to `resolvePathInChroot`
 	if strings.HasSuffix(srcPath, "/") && !strings.HasSuffix(relSrc, "/") {
 	if strings.HasSuffix(srcPath, "/") && !strings.HasSuffix(relSrc, "/") {
 		relSrc += "/"
 		relSrc += "/"
 	}
 	}
 
 
-	cmd := reexec.Command("docker-tar", relSrc, root)
-
-	errBuff := bytes.NewBuffer(nil)
-	cmd.Stderr = errBuff
-
-	tarR, tarW := io.Pipe()
-	cmd.Stdout = tarW
-
-	stdin, err := cmd.StdinPipe()
+	tb, err := archive.NewTarballer(relSrc, options)
 	if err != nil {
 	if err != nil {
-		return nil, errors.Wrap(err, "error getting options pipe for tar process")
+		return nil, errors.Wrap(err, "error processing tar file")
 	}
 	}
-
-	started := make(chan error)
-	go func() {
-		// reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux,
-		// which causes the started process to be signaled when the
-		// creating OS thread dies. Ensure that the subprocess is not
-		// prematurely signaled. See https://go.dev/issue/27505 for more
-		// information.
-		runtime.LockOSThread()
-		defer runtime.UnlockOSThread()
-		if err := cmd.Start(); err != nil {
-			started <- err
-			return
-		}
-		close(started)
-		err := cmd.Wait()
-		err = errors.Wrapf(err, "error processing tar file: %s", errBuff)
-		tarW.CloseWithError(err)
-	}()
-	if err := <-started; err != nil {
-		return nil, errors.Wrap(err, "tar error on re-exec cmd")
+	err = goInChroot(root, tb.Do)
+	if err != nil {
+		return nil, errors.Wrap(err, "could not chroot")
 	}
 	}
+	return tb.Reader(), nil
+}
 
 
-	if err := json.NewEncoder(stdin).Encode(options); err != nil {
-		stdin.Close()
-		return nil, errors.Wrap(err, "tar json encode to pipe failed")
+// resolvePathInChroot returns the equivalent to path inside a chroot rooted at root.
+// The returned path always begins with '/'.
+//
+//   - resolvePathInChroot("/a/b", "/a/b/c/d") -> "/c/d"
+//   - resolvePathInChroot("/a/b", "/a/b")     -> "/"
+//
+// The implementation is buggy, and some bugs may be load-bearing.
+// Here be dragons.
+func resolvePathInChroot(root, path string) (string, error) {
+	if root == "" {
+		return "", errors.New("root path must not be empty")
 	}
 	}
-	stdin.Close()
-
-	return tarR, nil
+	rel, err := filepath.Rel(root, path)
+	if err != nil {
+		return "", err
+	}
+	if rel == "." {
+		rel = "/"
+	}
+	if rel[0] != '/' {
+		rel = "/" + rel
+	}
+	return rel, nil
 }
 }

+ 0 - 5
pkg/chrootarchive/archive_windows.go

@@ -7,11 +7,6 @@ import (
 	"github.com/docker/docker/pkg/longpath"
 	"github.com/docker/docker/pkg/longpath"
 )
 )
 
 
-// chroot is not supported by Windows
-func chroot(path string) error {
-	return nil
-}
-
 func invokeUnpack(decompressedArchive io.ReadCloser,
 func invokeUnpack(decompressedArchive io.ReadCloser,
 	dest string,
 	dest string,
 	options *archive.TarOptions, root string) error {
 	options *archive.TarOptions, root string) error {

+ 24 - 103
pkg/chrootarchive/chroot_linux.go

@@ -1,113 +1,34 @@
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
 
 
 import (
 import (
-	"fmt"
-	"os"
-	"path/filepath"
-
-	"github.com/containerd/containerd/pkg/userns"
+	"github.com/docker/docker/internal/mounttree"
+	"github.com/docker/docker/internal/unshare"
 	"github.com/moby/sys/mount"
 	"github.com/moby/sys/mount"
-	"github.com/moby/sys/mountinfo"
 	"golang.org/x/sys/unix"
 	"golang.org/x/sys/unix"
 )
 )
 
 
-// chroot on linux uses pivot_root instead of chroot
-// pivot_root takes a new root and an old root.
-// Old root must be a sub-dir of new root, it is where the current rootfs will reside after the call to pivot_root.
-// New root is where the new rootfs is set to.
-// Old root is removed after the call to pivot_root so it is no longer available under the new root.
-// This is similar to how libcontainer sets up a container's rootfs
-func chroot(path string) (err error) {
-	// if the engine is running in a user namespace we need to use actual chroot
-	if userns.RunningInUserNS() {
-		return realChroot(path)
-	}
-	if err := unix.Unshare(unix.CLONE_NEWNS); err != nil {
-		return fmt.Errorf("Error creating mount namespace before pivot: %v", err)
-	}
-
-	// Make everything in new ns slave.
-	// Don't use `private` here as this could race where the mountns gets a
-	//   reference to a mount and an unmount from the host does not propagate,
-	//   which could potentially cause transient errors for other operations,
-	//   even though this should be relatively small window here `slave` should
-	//   not cause any problems.
-	if err := mount.MakeRSlave("/"); err != nil {
-		return err
-	}
-
-	if mounted, _ := mountinfo.Mounted(path); !mounted {
-		if err := mount.Mount(path, path, "bind", "rbind,rw"); err != nil {
-			return realChroot(path)
-		}
-	}
-
-	// setup oldRoot for pivot_root
-	pivotDir, err := os.MkdirTemp(path, ".pivot_root")
-	if err != nil {
-		return fmt.Errorf("Error setting up pivot dir: %v", err)
-	}
-
-	var mounted bool
-	defer func() {
-		if mounted {
-			// make sure pivotDir is not mounted before we try to remove it
-			if errCleanup := unix.Unmount(pivotDir, unix.MNT_DETACH); errCleanup != nil {
-				if err == nil {
-					err = errCleanup
-				}
-				return
-			}
-		}
-
-		errCleanup := os.Remove(pivotDir)
-		// pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful
-		// because we already cleaned it up on failed pivot_root
-		if errCleanup != nil && !os.IsNotExist(errCleanup) {
-			errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup)
-			if err == nil {
-				err = errCleanup
+// goInChroot starts fn in a goroutine where the root directory, current working
+// directory and umask are unshared from other goroutines and the root directory
+// has been changed to path. These changes are only visible to the goroutine in
+// which fn is executed. Any other goroutines, including ones started from fn,
+// will see the same root directory and file system attributes as the rest of
+// the process.
+func goInChroot(path string, fn func()) error {
+	return unshare.Go(
+		unix.CLONE_FS|unix.CLONE_NEWNS,
+		func() error {
+			// Make everything in new ns slave.
+			// Don't use `private` here as this could race where the mountns gets a
+			//   reference to a mount and an unmount from the host does not propagate,
+			//   which could potentially cause transient errors for other operations,
+			//   even though this should be relatively small window here `slave` should
+			//   not cause any problems.
+			if err := mount.MakeRSlave("/"); err != nil {
+				return err
 			}
 			}
-		}
-	}()
-
-	if err := unix.PivotRoot(path, pivotDir); err != nil {
-		// If pivot fails, fall back to the normal chroot after cleaning up temp dir
-		if err := os.Remove(pivotDir); err != nil {
-			return fmt.Errorf("Error cleaning up after failed pivot: %v", err)
-		}
-		return realChroot(path)
-	}
-	mounted = true
-
-	// This is the new path for where the old root (prior to the pivot) has been moved to
-	// This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction
-	pivotDir = filepath.Join("/", filepath.Base(pivotDir))
-
-	if err := unix.Chdir("/"); err != nil {
-		return fmt.Errorf("Error changing to new root: %v", err)
-	}
-
-	// Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host
-	if err := unix.Mount("", pivotDir, "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil {
-		return fmt.Errorf("Error making old root private after pivot: %v", err)
-	}
-
-	// Now unmount the old root so it's no longer visible from the new root
-	if err := unix.Unmount(pivotDir, unix.MNT_DETACH); err != nil {
-		return fmt.Errorf("Error while unmounting old root after pivot: %v", err)
-	}
-	mounted = false
-
-	return nil
-}
 
 
-func realChroot(path string) error {
-	if err := unix.Chroot(path); err != nil {
-		return fmt.Errorf("Error after fallback to chroot: %v", err)
-	}
-	if err := unix.Chdir("/"); err != nil {
-		return fmt.Errorf("Error changing to new root after chroot: %v", err)
-	}
-	return nil
+			return mounttree.SwitchRoot(path)
+		},
+		fn,
+	)
 }
 }

+ 0 - 17
pkg/chrootarchive/chroot_unix.go

@@ -1,17 +0,0 @@
-//go:build !windows && !linux
-// +build !windows,!linux
-
-package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
-
-import "golang.org/x/sys/unix"
-
-func chroot(path string) error {
-	if err := unix.Chroot(path); err != nil {
-		return err
-	}
-	return unix.Chdir("/")
-}
-
-func realChroot(path string) error {
-	return chroot(path)
-}

+ 17 - 93
pkg/chrootarchive/diff_unix.go

@@ -4,78 +4,14 @@
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
 
 
 import (
 import (
-	"bytes"
-	"encoding/json"
-	"flag"
-	"fmt"
 	"io"
 	"io"
-	"os"
 	"path/filepath"
 	"path/filepath"
-	"runtime"
 
 
 	"github.com/containerd/containerd/pkg/userns"
 	"github.com/containerd/containerd/pkg/userns"
 	"github.com/docker/docker/pkg/archive"
 	"github.com/docker/docker/pkg/archive"
-	"github.com/docker/docker/pkg/reexec"
 	"golang.org/x/sys/unix"
 	"golang.org/x/sys/unix"
 )
 )
 
 
-type applyLayerResponse struct {
-	LayerSize int64 `json:"layerSize"`
-}
-
-// applyLayer is the entry-point for docker-applylayer on re-exec. This is not
-// used on Windows as it does not support chroot, hence no point sandboxing
-// through chroot and rexec.
-func applyLayer() {
-
-	var (
-		tmpDir  string
-		err     error
-		options *archive.TarOptions
-	)
-	runtime.LockOSThread()
-	flag.Parse()
-
-	inUserns := userns.RunningInUserNS()
-	if err := chroot(flag.Arg(0)); err != nil {
-		fatal(err)
-	}
-
-	// We need to be able to set any perms
-	oldmask := unix.Umask(0)
-	defer unix.Umask(oldmask)
-
-	if err := json.Unmarshal([]byte(os.Getenv("OPT")), &options); err != nil {
-		fatal(err)
-	}
-
-	if inUserns {
-		options.InUserNS = true
-	}
-
-	if tmpDir, err = os.MkdirTemp("/", "temp-docker-extract"); err != nil {
-		fatal(err)
-	}
-
-	os.Setenv("TMPDIR", tmpDir)
-	size, err := archive.UnpackLayer("/", os.Stdin, options)
-	os.RemoveAll(tmpDir)
-	if err != nil {
-		fatal(err)
-	}
-
-	encoder := json.NewEncoder(os.Stdout)
-	if err := encoder.Encode(applyLayerResponse{size}); err != nil {
-		fatal(fmt.Errorf("unable to encode layerSize JSON: %s", err))
-	}
-
-	if _, err := flush(os.Stdin); err != nil {
-		fatal(err)
-	}
-
-	os.Exit(0)
-}
-
 // applyLayerHandler parses a diff in the standard layer format from `layer`, and
 // applyLayerHandler parses a diff in the standard layer format from `layer`, and
 // applies it to the directory `dest`. Returns the size in bytes of the
 // applies it to the directory `dest`. Returns the size in bytes of the
 // contents of the layer.
 // contents of the layer.
@@ -92,42 +28,30 @@ func applyLayerHandler(dest string, layer io.Reader, options *archive.TarOptions
 	}
 	}
 	if options == nil {
 	if options == nil {
 		options = &archive.TarOptions{}
 		options = &archive.TarOptions{}
-		if userns.RunningInUserNS() {
-			options.InUserNS = true
-		}
+	}
+	if userns.RunningInUserNS() {
+		options.InUserNS = true
 	}
 	}
 	if options.ExcludePatterns == nil {
 	if options.ExcludePatterns == nil {
 		options.ExcludePatterns = []string{}
 		options.ExcludePatterns = []string{}
 	}
 	}
 
 
-	data, err := json.Marshal(options)
-	if err != nil {
-		return 0, fmt.Errorf("ApplyLayer json encode: %v", err)
+	type result struct {
+		layerSize int64
+		err       error
 	}
 	}
 
 
-	cmd := reexec.Command("docker-applyLayer", dest)
-	cmd.Stdin = layer
-	cmd.Env = append(cmd.Env, fmt.Sprintf("OPT=%s", data))
-
-	outBuf, errBuf := new(bytes.Buffer), new(bytes.Buffer)
-	cmd.Stdout, cmd.Stderr = outBuf, errBuf
+	done := make(chan result)
+	err = goInChroot(dest, func() {
+		// We need to be able to set any perms
+		_ = unix.Umask(0)
 
 
-	// reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, which
-	// causes the started process to be signaled when the creating OS thread
-	// dies. Ensure that the reexec is not prematurely signaled. See
-	// https://go.dev/issue/27505 for more information.
-	runtime.LockOSThread()
-	defer runtime.UnlockOSThread()
-	if err = cmd.Run(); err != nil {
-		return 0, fmt.Errorf("ApplyLayer %s stdout: %s stderr: %s", err, outBuf, errBuf)
-	}
-
-	// Stdout should be a valid JSON struct representing an applyLayerResponse.
-	response := applyLayerResponse{}
-	decoder := json.NewDecoder(outBuf)
-	if err = decoder.Decode(&response); err != nil {
-		return 0, fmt.Errorf("unable to decode ApplyLayer JSON response: %s", err)
+		size, err := archive.UnpackLayer("/", layer, options)
+		done <- result{layerSize: size, err: err}
+	})
+	if err != nil {
+		return 0, err
 	}
 	}
-
-	return response.LayerSize, nil
+	res := <-done
+	return res.layerSize, res.err
 }
 }

+ 0 - 7
pkg/chrootarchive/diff_windows.go

@@ -3,7 +3,6 @@ package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
 import (
 import (
 	"fmt"
 	"fmt"
 	"io"
 	"io"
-	"os"
 	"path/filepath"
 	"path/filepath"
 
 
 	"github.com/docker/docker/pkg/archive"
 	"github.com/docker/docker/pkg/archive"
@@ -29,13 +28,7 @@ func applyLayerHandler(dest string, layer io.Reader, options *archive.TarOptions
 		layer = decompressed
 		layer = decompressed
 	}
 	}
 
 
-	tmpDir, err := os.MkdirTemp(os.Getenv("temp"), "temp-docker-extract")
-	if err != nil {
-		return 0, fmt.Errorf("ApplyLayer failed to create temp-docker-extract under %s. %s", dest, err)
-	}
-
 	s, err := archive.UnpackLayer(dest, layer, nil)
 	s, err := archive.UnpackLayer(dest, layer, nil)
-	os.RemoveAll(tmpDir)
 	if err != nil {
 	if err != nil {
 		return 0, fmt.Errorf("ApplyLayer %s failed UnpackLayer to %s: %s", layer, dest, err)
 		return 0, fmt.Errorf("ApplyLayer %s failed UnpackLayer to %s: %s", layer, dest, err)
 	}
 	}

+ 0 - 29
pkg/chrootarchive/init_unix.go

@@ -1,29 +0,0 @@
-//go:build !windows
-// +build !windows
-
-package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
-
-import (
-	"fmt"
-	"io"
-	"os"
-
-	"github.com/docker/docker/pkg/reexec"
-)
-
-func init() {
-	reexec.Register("docker-applyLayer", applyLayer)
-	reexec.Register("docker-untar", untar)
-	reexec.Register("docker-tar", tar)
-}
-
-func fatal(err error) {
-	fmt.Fprint(os.Stderr, err)
-	os.Exit(1)
-}
-
-// flush consumes all the bytes from the reader discarding
-// any errors
-func flush(r io.Reader) (bytes int64, err error) {
-	return io.Copy(io.Discard, r)
-}