diff --git a/container/archive.go b/container/archive_windows.go similarity index 99% rename from container/archive.go rename to container/archive_windows.go index c96ea1b868..6631fc69d6 100644 --- a/container/archive.go +++ b/container/archive_windows.go @@ -18,7 +18,7 @@ func (container *Container) ResolvePath(path string) (resolvedPath, absPath stri if container.BaseFS == "" { return "", "", errors.New("ResolvePath: BaseFS of container " + container.ID + " is unexpectedly empty") } - // Check if a drive letter supplied, it must be the system drive. No-op except on Windows + // Check if a drive letter supplied, it must be the system drive. path, err = system.CheckSystemDriveAndRemoveDriveLetter(path) if err != nil { return "", "", err diff --git a/daemon/archive.go b/daemon/archive.go index 5a1ac57952..4be02b25c6 100644 --- a/daemon/archive.go +++ b/daemon/archive.go @@ -3,17 +3,9 @@ package daemon // import "github.com/docker/docker/daemon" import ( "io" "os" - "path/filepath" - "strings" "github.com/docker/docker/api/types" - "github.com/docker/docker/container" "github.com/docker/docker/errdefs" - "github.com/docker/docker/pkg/archive" - "github.com/docker/docker/pkg/chrootarchive" - "github.com/docker/docker/pkg/ioutils" - "github.com/docker/docker/pkg/system" - "github.com/pkg/errors" ) // ContainerCopy performs a deprecated operation of archiving the resource at @@ -24,11 +16,6 @@ func (daemon *Daemon) ContainerCopy(name string, res string) (io.ReadCloser, err return nil, err } - // Make sure an online file-system operation is permitted. - if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil { - return nil, errdefs.System(err) - } - data, err := daemon.containerCopy(ctr, res) if err == nil { return data, nil @@ -48,11 +35,6 @@ func (daemon *Daemon) ContainerStatPath(name string, path string) (stat *types.C return nil, err } - // Make sure an online file-system operation is permitted. 
- if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil { - return nil, errdefs.System(err) - } - stat, err = daemon.containerStatPath(ctr, path) if err == nil { return stat, nil @@ -73,11 +55,6 @@ func (daemon *Daemon) ContainerArchivePath(name string, path string) (content io return nil, nil, err } - // Make sure an online file-system operation is permitted. - if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil { - return nil, nil, errdefs.System(err) - } - content, stat, err = daemon.containerArchivePath(ctr, path) if err == nil { return content, stat, nil @@ -101,11 +78,6 @@ func (daemon *Daemon) ContainerExtractToDir(name, path string, copyUIDGID, noOve return err } - // Make sure an online file-system operation is permitted. - if err := daemon.isOnlineFSOperationPermitted(ctr); err != nil { - return errdefs.System(err) - } - err = daemon.containerExtractToDir(ctr, path, copyUIDGID, noOverwriteDirNonDir, content) if err == nil { return nil @@ -116,299 +88,3 @@ func (daemon *Daemon) ContainerExtractToDir(name, path string, copyUIDGID, noOve } return errdefs.System(err) } - -// containerStatPath stats the filesystem resource at the specified path in this -// container. Returns stat info about the resource. 
-func (daemon *Daemon) containerStatPath(container *container.Container, path string) (stat *types.ContainerPathStat, err error) { - container.Lock() - defer container.Unlock() - - if err = daemon.Mount(container); err != nil { - return nil, err - } - defer daemon.Unmount(container) - - err = daemon.mountVolumes(container) - defer container.DetachAndUnmount(daemon.LogVolumeEvent) - if err != nil { - return nil, err - } - - // Normalize path before sending to rootfs - path = filepath.FromSlash(path) - - resolvedPath, absPath, err := container.ResolvePath(path) - if err != nil { - return nil, err - } - - return container.StatPath(resolvedPath, absPath) -} - -// containerArchivePath creates an archive of the filesystem resource at the specified -// path in this container. Returns a tar archive of the resource and stat info -// about the resource. -func (daemon *Daemon) containerArchivePath(container *container.Container, path string) (content io.ReadCloser, stat *types.ContainerPathStat, err error) { - container.Lock() - - defer func() { - if err != nil { - // Wait to unlock the container until the archive is fully read - // (see the ReadCloseWrapper func below) or if there is an error - // before that occurs. 
- container.Unlock() - } - }() - - if err = daemon.Mount(container); err != nil { - return nil, nil, err - } - - defer func() { - if err != nil { - // unmount any volumes - container.DetachAndUnmount(daemon.LogVolumeEvent) - // unmount the container's rootfs - daemon.Unmount(container) - } - }() - - if err = daemon.mountVolumes(container); err != nil { - return nil, nil, err - } - - // Normalize path before sending to rootfs - path = filepath.FromSlash(path) - - resolvedPath, absPath, err := container.ResolvePath(path) - if err != nil { - return nil, nil, err - } - - stat, err = container.StatPath(resolvedPath, absPath) - if err != nil { - return nil, nil, err - } - - // We need to rebase the archive entries if the last element of the - // resolved path was a symlink that was evaluated and is now different - // than the requested path. For example, if the given path was "/foo/bar/", - // but it resolved to "/var/lib/docker/containers/{id}/foo/baz/", we want - // to ensure that the archive entries start with "bar" and not "baz". This - // also catches the case when the root directory of the container is - // requested: we want the archive entries to start with "/" and not the - // container ID. - - // Get the source and the base paths of the container resolved path in order - // to get the proper tar options for the rebase tar. - resolvedPath = filepath.Clean(resolvedPath) - if filepath.Base(resolvedPath) == "." { - resolvedPath += string(filepath.Separator) + "." - } - - sourceDir := resolvedPath - sourceBase := "." 
- - if stat.Mode&os.ModeDir == 0 { // not dir - sourceDir, sourceBase = filepath.Split(resolvedPath) - } - opts := archive.TarResourceRebaseOpts(sourceBase, filepath.Base(absPath)) - - data, err := chrootarchive.Tar(sourceDir, opts, container.BaseFS) - if err != nil { - return nil, nil, err - } - - content = ioutils.NewReadCloserWrapper(data, func() error { - err := data.Close() - container.DetachAndUnmount(daemon.LogVolumeEvent) - daemon.Unmount(container) - container.Unlock() - return err - }) - - daemon.LogContainerEvent(container, "archive-path") - - return content, stat, nil -} - -// containerExtractToDir extracts the given tar archive to the specified location in the -// filesystem of this container. The given path must be of a directory in the -// container. If it is not, the error will be an errdefs.InvalidParameter. If -// noOverwriteDirNonDir is true then it will be an error if unpacking the -// given content would cause an existing directory to be replaced with a non- -// directory and vice versa. -func (daemon *Daemon) containerExtractToDir(container *container.Container, path string, copyUIDGID, noOverwriteDirNonDir bool, content io.Reader) (err error) { - container.Lock() - defer container.Unlock() - - if err = daemon.Mount(container); err != nil { - return err - } - defer daemon.Unmount(container) - - err = daemon.mountVolumes(container) - defer container.DetachAndUnmount(daemon.LogVolumeEvent) - if err != nil { - return err - } - - // Normalize path before sending to rootfs' - path = filepath.FromSlash(path) - - // Check if a drive letter supplied, it must be the system drive. No-op except on Windows - path, err = system.CheckSystemDriveAndRemoveDriveLetter(path) - if err != nil { - return err - } - - // The destination path needs to be resolved to a host path, with all - // symbolic links followed in the scope of the container's rootfs. 
Note - // that we do not use `container.ResolvePath(path)` here because we need - // to also evaluate the last path element if it is a symlink. This is so - // that you can extract an archive to a symlink that points to a directory. - - // Consider the given path as an absolute path in the container. - absPath := archive.PreserveTrailingDotOrSeparator(filepath.Join(string(filepath.Separator), path), path) - - // This will evaluate the last path element if it is a symlink. - resolvedPath, err := container.GetResourcePath(absPath) - if err != nil { - return err - } - - stat, err := os.Lstat(resolvedPath) - if err != nil { - return err - } - - if !stat.IsDir() { - return errdefs.InvalidParameter(errors.New("extraction point is not a directory")) - } - - // Need to check if the path is in a volume. If it is, it cannot be in a - // read-only volume. If it is not in a volume, the container cannot be - // configured with a read-only rootfs. - - // Use the resolved path relative to the container rootfs as the new - // absPath. This way we fully follow any symlinks in a volume that may - // lead back outside the volume. - // - // The Windows implementation of filepath.Rel in golang 1.4 does not - // support volume style file path semantics. On Windows when using the - // filter driver, we are guaranteed that the path will always be - // a volume file path. - var baseRel string - if strings.HasPrefix(resolvedPath, `\\?\Volume{`) { - if strings.HasPrefix(resolvedPath, container.BaseFS) { - baseRel = resolvedPath[len(container.BaseFS):] - if baseRel[:1] == `\` { - baseRel = baseRel[1:] - } - } - } else { - baseRel, err = filepath.Rel(container.BaseFS, resolvedPath) - } - if err != nil { - return err - } - // Make it an absolute path. 
- absPath = filepath.Join(string(filepath.Separator), baseRel) - - toVolume, err := checkIfPathIsInAVolume(container, absPath) - if err != nil { - return err - } - - if !toVolume && container.HostConfig.ReadonlyRootfs { - return errdefs.InvalidParameter(errors.New("container rootfs is marked read-only")) - } - - options := daemon.defaultTarCopyOptions(noOverwriteDirNonDir) - - if copyUIDGID { - var err error - // tarCopyOptions will appropriately pull in the right uid/gid for the - // user/group and will set the options. - options, err = daemon.tarCopyOptions(container, noOverwriteDirNonDir) - if err != nil { - return err - } - } - - if err := chrootarchive.UntarWithRoot(content, resolvedPath, options, container.BaseFS); err != nil { - return err - } - - daemon.LogContainerEvent(container, "extract-to-dir") - - return nil -} - -func (daemon *Daemon) containerCopy(container *container.Container, resource string) (rc io.ReadCloser, err error) { - if resource[0] == '/' || resource[0] == '\\' { - resource = resource[1:] - } - container.Lock() - - defer func() { - if err != nil { - // Wait to unlock the container until the archive is fully read - // (see the ReadCloseWrapper func below) or if there is an error - // before that occurs. 
- container.Unlock() - } - }() - - if err := daemon.Mount(container); err != nil { - return nil, err - } - - defer func() { - if err != nil { - // unmount any volumes - container.DetachAndUnmount(daemon.LogVolumeEvent) - // unmount the container's rootfs - daemon.Unmount(container) - } - }() - - if err := daemon.mountVolumes(container); err != nil { - return nil, err - } - - // Normalize path before sending to rootfs - resource = filepath.FromSlash(resource) - - basePath, err := container.GetResourcePath(resource) - if err != nil { - return nil, err - } - stat, err := os.Stat(basePath) - if err != nil { - return nil, err - } - var filter []string - if !stat.IsDir() { - d, f := filepath.Split(basePath) - basePath = d - filter = []string{f} - } - archv, err := chrootarchive.Tar(basePath, &archive.TarOptions{ - Compression: archive.Uncompressed, - IncludeFiles: filter, - }, container.BaseFS) - if err != nil { - return nil, err - } - - reader := ioutils.NewReadCloserWrapper(archv, func() error { - err := archv.Close() - container.DetachAndUnmount(daemon.LogVolumeEvent) - daemon.Unmount(container) - container.Unlock() - return err - }) - daemon.LogContainerEvent(container, "copy") - return reader, nil -} diff --git a/daemon/archive_unix.go b/daemon/archive_unix.go index 7a5e29905a..3098dbeadf 100644 --- a/daemon/archive_unix.go +++ b/daemon/archive_unix.go @@ -4,12 +4,212 @@ package daemon // import "github.com/docker/docker/daemon" import ( + "context" + "io" + "os" + "path/filepath" + + "github.com/docker/docker/api/types" "github.com/docker/docker/container" "github.com/docker/docker/errdefs" + "github.com/docker/docker/pkg/archive" + "github.com/docker/docker/pkg/ioutils" volumemounts "github.com/docker/docker/volume/mounts" "github.com/pkg/errors" ) +// containerStatPath stats the filesystem resource at the specified path in this +// container. Returns stat info about the resource. 
+func (daemon *Daemon) containerStatPath(container *container.Container, path string) (stat *types.ContainerPathStat, err error) { + container.Lock() + defer container.Unlock() + + cfs, err := daemon.openContainerFS(container) + if err != nil { + return nil, err + } + defer cfs.Close() + + return cfs.Stat(context.TODO(), path) +} + +// containerArchivePath creates an archive of the filesystem resource at the specified +// path in this container. Returns a tar archive of the resource and stat info +// about the resource. +func (daemon *Daemon) containerArchivePath(container *container.Container, path string) (content io.ReadCloser, stat *types.ContainerPathStat, err error) { + container.Lock() + + defer func() { + if err != nil { + // Wait to unlock the container until the archive is fully read + // (see the ReadCloseWrapper func below) or if there is an error + // before that occurs. + container.Unlock() + } + }() + + cfs, err := daemon.openContainerFS(container) + if err != nil { + return nil, nil, err + } + + defer func() { + if err != nil { + cfs.Close() + } + }() + + absPath := archive.PreserveTrailingDotOrSeparator(filepath.Join("/", path), path) + + stat, err = cfs.Stat(context.TODO(), absPath) + if err != nil { + return nil, nil, err + } + + sourceDir, sourceBase := absPath, "." + if stat.Mode&os.ModeDir == 0 { // not dir + sourceDir, sourceBase = filepath.Split(absPath) + } + opts := archive.TarResourceRebaseOpts(sourceBase, filepath.Base(absPath)) + + tb, err := archive.NewTarballer(sourceDir, opts) + if err != nil { + return nil, nil, err + } + + cfs.GoInFS(context.TODO(), tb.Do) + data := tb.Reader() + content = ioutils.NewReadCloserWrapper(data, func() error { + err := data.Close() + _ = cfs.Close() + container.Unlock() + return err + }) + + daemon.LogContainerEvent(container, "archive-path") + + return content, stat, nil +} + +// containerExtractToDir extracts the given tar archive to the specified location in the +// filesystem of this container. 
The given path must be of a directory in the +// container. If it is not, the error will be an errdefs.InvalidParameter. If +// noOverwriteDirNonDir is true then it will be an error if unpacking the +// given content would cause an existing directory to be replaced with a non- +// directory and vice versa. +func (daemon *Daemon) containerExtractToDir(container *container.Container, path string, copyUIDGID, noOverwriteDirNonDir bool, content io.Reader) (err error) { + container.Lock() + defer container.Unlock() + + cfs, err := daemon.openContainerFS(container) + if err != nil { + return err + } + defer cfs.Close() + + err = cfs.RunInFS(context.TODO(), func() error { + // The destination path needs to be resolved with all symbolic links + // followed. Note that we need to also evaluate the last path element if + // it is a symlink. This is so that you can extract an archive to a + // symlink that points to a directory. + absPath, err := filepath.EvalSymlinks(filepath.Join("/", path)) + if err != nil { + return err + } + absPath = archive.PreserveTrailingDotOrSeparator(absPath, path) + + stat, err := os.Lstat(absPath) + if err != nil { + return err + } + if !stat.IsDir() { + return errdefs.InvalidParameter(errors.New("extraction point is not a directory")) + } + + // Need to check if the path is in a volume. If it is, it cannot be in a + // read-only volume. If it is not in a volume, the container cannot be + // configured with a read-only rootfs. + toVolume, err := checkIfPathIsInAVolume(container, absPath) + if err != nil { + return err + } + + if !toVolume && container.HostConfig.ReadonlyRootfs { + return errdefs.InvalidParameter(errors.New("container rootfs is marked read-only")) + } + + options := daemon.defaultTarCopyOptions(noOverwriteDirNonDir) + + if copyUIDGID { + var err error + // tarCopyOptions will appropriately pull in the right uid/gid for the + // user/group and will set the options. 
+ options, err = daemon.tarCopyOptions(container, noOverwriteDirNonDir) + if err != nil { + return err + } + } + + return archive.Untar(content, absPath, options) + }) + if err != nil { + return err + } + + daemon.LogContainerEvent(container, "extract-to-dir") + + return nil +} + +func (daemon *Daemon) containerCopy(container *container.Container, resource string) (rc io.ReadCloser, err error) { + container.Lock() + + defer func() { + if err != nil { + // Wait to unlock the container until the archive is fully read + // (see the ReadCloseWrapper func below) or if there is an error + // before that occurs. + container.Unlock() + } + }() + + cfs, err := daemon.openContainerFS(container) + if err != nil { + return nil, err + } + defer func() { + if err != nil { + cfs.Close() + } + }() + + err = cfs.RunInFS(context.TODO(), func() error { + _, err := os.Stat(resource) + return err + }) + if err != nil { + return nil, err + } + + tb, err := archive.NewTarballer(resource, &archive.TarOptions{ + Compression: archive.Uncompressed, + }) + if err != nil { + return nil, err + } + + cfs.GoInFS(context.TODO(), tb.Do) + archv := tb.Reader() + reader := ioutils.NewReadCloserWrapper(archv, func() error { + err := archv.Close() + _ = cfs.Close() + container.Unlock() + return err + }) + daemon.LogContainerEvent(container, "copy") + return reader, nil +} + // checkIfPathIsInAVolume checks if the path is in a volume. If it is, it // cannot be in a read-only volume. If it is not in a volume, the container // cannot be configured with a read-only rootfs. @@ -26,9 +226,3 @@ func checkIfPathIsInAVolume(container *container.Container, absPath string) (boo } return toVolume, nil } - -// isOnlineFSOperationPermitted returns an error if an online filesystem operation -// is not permitted. 
-func (daemon *Daemon) isOnlineFSOperationPermitted(container *container.Container) error { - return nil -} diff --git a/daemon/archive_windows.go b/daemon/archive_windows.go index 8cec39c5e4..ba6e17c98f 100644 --- a/daemon/archive_windows.go +++ b/daemon/archive_windows.go @@ -2,11 +2,337 @@ package daemon // import "github.com/docker/docker/daemon" import ( "errors" + "io" + "os" + "path/filepath" + "strings" + "github.com/docker/docker/api/types" containertypes "github.com/docker/docker/api/types/container" "github.com/docker/docker/container" + "github.com/docker/docker/errdefs" + "github.com/docker/docker/pkg/archive" + "github.com/docker/docker/pkg/chrootarchive" + "github.com/docker/docker/pkg/ioutils" + "github.com/docker/docker/pkg/system" ) +// containerStatPath stats the filesystem resource at the specified path in this +// container. Returns stat info about the resource. +func (daemon *Daemon) containerStatPath(container *container.Container, path string) (stat *types.ContainerPathStat, err error) { + container.Lock() + defer container.Unlock() + + // Make sure an online file-system operation is permitted. + if err := daemon.isOnlineFSOperationPermitted(container); err != nil { + return nil, err + } + + if err = daemon.Mount(container); err != nil { + return nil, err + } + defer daemon.Unmount(container) + + err = daemon.mountVolumes(container) + defer container.DetachAndUnmount(daemon.LogVolumeEvent) + if err != nil { + return nil, err + } + + // Normalize path before sending to rootfs + path = filepath.FromSlash(path) + + resolvedPath, absPath, err := container.ResolvePath(path) + if err != nil { + return nil, err + } + + return container.StatPath(resolvedPath, absPath) +} + +// containerArchivePath creates an archive of the filesystem resource at the specified +// path in this container. Returns a tar archive of the resource and stat info +// about the resource. 
+func (daemon *Daemon) containerArchivePath(container *container.Container, path string) (content io.ReadCloser, stat *types.ContainerPathStat, err error) { + container.Lock() + + defer func() { + if err != nil { + // Wait to unlock the container until the archive is fully read + // (see the ReadCloseWrapper func below) or if there is an error + // before that occurs. + container.Unlock() + } + }() + + // Make sure an online file-system operation is permitted. + if err := daemon.isOnlineFSOperationPermitted(container); err != nil { + return nil, nil, err + } + + if err = daemon.Mount(container); err != nil { + return nil, nil, err + } + + defer func() { + if err != nil { + // unmount any volumes + container.DetachAndUnmount(daemon.LogVolumeEvent) + // unmount the container's rootfs + daemon.Unmount(container) + } + }() + + if err = daemon.mountVolumes(container); err != nil { + return nil, nil, err + } + + // Normalize path before sending to rootfs + path = filepath.FromSlash(path) + + resolvedPath, absPath, err := container.ResolvePath(path) + if err != nil { + return nil, nil, err + } + + stat, err = container.StatPath(resolvedPath, absPath) + if err != nil { + return nil, nil, err + } + + // We need to rebase the archive entries if the last element of the + // resolved path was a symlink that was evaluated and is now different + // than the requested path. For example, if the given path was "/foo/bar/", + // but it resolved to "/var/lib/docker/containers/{id}/foo/baz/", we want + // to ensure that the archive entries start with "bar" and not "baz". This + // also catches the case when the root directory of the container is + // requested: we want the archive entries to start with "/" and not the + // container ID. + + // Get the source and the base paths of the container resolved path in order + // to get the proper tar options for the rebase tar. + resolvedPath = filepath.Clean(resolvedPath) + if filepath.Base(resolvedPath) == "." 
{ + resolvedPath += string(filepath.Separator) + "." + } + + sourceDir := resolvedPath + sourceBase := "." + + if stat.Mode&os.ModeDir == 0 { // not dir + sourceDir, sourceBase = filepath.Split(resolvedPath) + } + opts := archive.TarResourceRebaseOpts(sourceBase, filepath.Base(absPath)) + + data, err := chrootarchive.Tar(sourceDir, opts, container.BaseFS) + if err != nil { + return nil, nil, err + } + + content = ioutils.NewReadCloserWrapper(data, func() error { + err := data.Close() + container.DetachAndUnmount(daemon.LogVolumeEvent) + daemon.Unmount(container) + container.Unlock() + return err + }) + + daemon.LogContainerEvent(container, "archive-path") + + return content, stat, nil +} + +// containerExtractToDir extracts the given tar archive to the specified location in the +// filesystem of this container. The given path must be of a directory in the +// container. If it is not, the error will be an errdefs.InvalidParameter. If +// noOverwriteDirNonDir is true then it will be an error if unpacking the +// given content would cause an existing directory to be replaced with a non- +// directory and vice versa. +func (daemon *Daemon) containerExtractToDir(container *container.Container, path string, copyUIDGID, noOverwriteDirNonDir bool, content io.Reader) (err error) { + container.Lock() + defer container.Unlock() + + // Make sure an online file-system operation is permitted. + if err := daemon.isOnlineFSOperationPermitted(container); err != nil { + return err + } + + if err = daemon.Mount(container); err != nil { + return err + } + defer daemon.Unmount(container) + + err = daemon.mountVolumes(container) + defer container.DetachAndUnmount(daemon.LogVolumeEvent) + if err != nil { + return err + } + + // Normalize path before sending to rootfs' + path = filepath.FromSlash(path) + + // Check if a drive letter supplied, it must be the system drive. 
No-op except on Windows + path, err = system.CheckSystemDriveAndRemoveDriveLetter(path) + if err != nil { + return err + } + + // The destination path needs to be resolved to a host path, with all + // symbolic links followed in the scope of the container's rootfs. Note + // that we do not use `container.ResolvePath(path)` here because we need + // to also evaluate the last path element if it is a symlink. This is so + // that you can extract an archive to a symlink that points to a directory. + + // Consider the given path as an absolute path in the container. + absPath := archive.PreserveTrailingDotOrSeparator(filepath.Join(string(filepath.Separator), path), path) + + // This will evaluate the last path element if it is a symlink. + resolvedPath, err := container.GetResourcePath(absPath) + if err != nil { + return err + } + + stat, err := os.Lstat(resolvedPath) + if err != nil { + return err + } + + if !stat.IsDir() { + return errdefs.InvalidParameter(errors.New("extraction point is not a directory")) + } + + // Need to check if the path is in a volume. If it is, it cannot be in a + // read-only volume. If it is not in a volume, the container cannot be + // configured with a read-only rootfs. + + // Use the resolved path relative to the container rootfs as the new + // absPath. This way we fully follow any symlinks in a volume that may + // lead back outside the volume. + // + // The Windows implementation of filepath.Rel in golang 1.4 does not + // support volume style file path semantics. On Windows when using the + // filter driver, we are guaranteed that the path will always be + // a volume file path. 
+ var baseRel string + if strings.HasPrefix(resolvedPath, `\\?\Volume{`) { + if strings.HasPrefix(resolvedPath, container.BaseFS) { + baseRel = resolvedPath[len(container.BaseFS):] + if baseRel[:1] == `\` { + baseRel = baseRel[1:] + } + } + } else { + baseRel, err = filepath.Rel(container.BaseFS, resolvedPath) + } + if err != nil { + return err + } + // Make it an absolute path. + absPath = filepath.Join(string(filepath.Separator), baseRel) + + toVolume, err := checkIfPathIsInAVolume(container, absPath) + if err != nil { + return err + } + + if !toVolume && container.HostConfig.ReadonlyRootfs { + return errdefs.InvalidParameter(errors.New("container rootfs is marked read-only")) + } + + options := daemon.defaultTarCopyOptions(noOverwriteDirNonDir) + + if copyUIDGID { + var err error + // tarCopyOptions will appropriately pull in the right uid/gid for the + // user/group and will set the options. + options, err = daemon.tarCopyOptions(container, noOverwriteDirNonDir) + if err != nil { + return err + } + } + + if err := chrootarchive.UntarWithRoot(content, resolvedPath, options, container.BaseFS); err != nil { + return err + } + + daemon.LogContainerEvent(container, "extract-to-dir") + + return nil +} + +func (daemon *Daemon) containerCopy(container *container.Container, resource string) (rc io.ReadCloser, err error) { + if resource[0] == '/' || resource[0] == '\\' { + resource = resource[1:] + } + container.Lock() + + defer func() { + if err != nil { + // Wait to unlock the container until the archive is fully read + // (see the ReadCloseWrapper func below) or if there is an error + // before that occurs. + container.Unlock() + } + }() + + // Make sure an online file-system operation is permitted. 
+ if err := daemon.isOnlineFSOperationPermitted(container); err != nil { + return nil, err + } + + if err := daemon.Mount(container); err != nil { + return nil, err + } + + defer func() { + if err != nil { + // unmount any volumes + container.DetachAndUnmount(daemon.LogVolumeEvent) + // unmount the container's rootfs + daemon.Unmount(container) + } + }() + + if err := daemon.mountVolumes(container); err != nil { + return nil, err + } + + // Normalize path before sending to rootfs + resource = filepath.FromSlash(resource) + + basePath, err := container.GetResourcePath(resource) + if err != nil { + return nil, err + } + stat, err := os.Stat(basePath) + if err != nil { + return nil, err + } + var filter []string + if !stat.IsDir() { + d, f := filepath.Split(basePath) + basePath = d + filter = []string{f} + } + archv, err := chrootarchive.Tar(basePath, &archive.TarOptions{ + Compression: archive.Uncompressed, + IncludeFiles: filter, + }, container.BaseFS) + if err != nil { + return nil, err + } + + reader := ioutils.NewReadCloserWrapper(archv, func() error { + err := archv.Close() + container.DetachAndUnmount(daemon.LogVolumeEvent) + daemon.Unmount(container) + container.Unlock() + return err + }) + daemon.LogContainerEvent(container, "copy") + return reader, nil +} + // checkIfPathIsInAVolume checks if the path is in a volume. If it is, it // cannot be in a read-only volume. If it is not in a volume, the container // cannot be configured with a read-only rootfs. @@ -21,9 +347,9 @@ func checkIfPathIsInAVolume(container *container.Container, absPath string) (boo // is not permitted (such as stat or for copying). Running Hyper-V containers // cannot have their file-system interrogated from the host as the filter is // loaded inside the utility VM, not the host. -// IMPORTANT: The container lock must NOT be held when calling this function. +// IMPORTANT: The container lock MUST be held when calling this function. 
func (daemon *Daemon) isOnlineFSOperationPermitted(container *container.Container) error { - if !container.IsRunning() { + if !container.Running { return nil } diff --git a/daemon/containerfs_linux.go b/daemon/containerfs_linux.go new file mode 100644 index 0000000000..b7420b9244 --- /dev/null +++ b/daemon/containerfs_linux.go @@ -0,0 +1,221 @@ +package daemon // import "github.com/docker/docker/daemon" + +import ( + "context" + "os" + "path/filepath" + "runtime" + "strings" + + "github.com/hashicorp/go-multierror" + "github.com/moby/sys/mount" + "github.com/moby/sys/symlink" + "golang.org/x/sys/unix" + + "github.com/docker/docker/api/types" + "github.com/docker/docker/container" + "github.com/docker/docker/internal/mounttree" + "github.com/docker/docker/internal/unshare" + "github.com/docker/docker/pkg/fileutils" +) + +type future struct { + fn func() error + res chan<- error +} + +// containerFSView allows functions to be run in the context of a container's +// filesystem. Inside these functions, the root directory is the container root +// for all native OS filesystem APIs, including, but not limited to, the [os] +// and [golang.org/x/sys/unix] packages. The view of the container's filesystem +// is live and read-write. Each view has its own private set of tmpfs mounts. +// Any files written under a tmpfs mount are not visible to processes inside the +// container nor any other view of the container's filesystem, and vice versa. +// +// Each view has its own current working directory which is initialized to the +// root of the container filesystem and can be changed with [os.Chdir]. Changes +// to the current directory persist across successive [*containerFSView.RunInFS] +// and [*containerFSView.GoInFS] calls. +// +// Multiple views of the same container filesystem can coexist at the same time. +// Only one function can be running in a particular filesystem view at any given +// time. 
Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will +// block while another function is running. If more than one call is blocked +// concurrently, the order they are unblocked is undefined. +type containerFSView struct { + d *Daemon + ctr *container.Container + todo chan future + done chan error +} + +// openContainerFS opens a new view of the container's filesystem. +func (daemon *Daemon) openContainerFS(container *container.Container) (_ *containerFSView, err error) { + if err := daemon.Mount(container); err != nil { + return nil, err + } + defer func() { + if err != nil { + _ = daemon.Unmount(container) + } + }() + + mounts, err := daemon.setupMounts(container) + if err != nil { + return nil, err + } + defer func() { + if err != nil { + _ = container.UnmountVolumes(daemon.LogVolumeEvent) + } + }() + + // Setup in initial mount namespace complete. We're ready to unshare the + // mount namespace and bind the volume mounts into that private view of + // the container FS. + todo := make(chan future) + done := make(chan error) + err = unshare.Go(unix.CLONE_NEWNS, + func() error { + if err := mount.MakeRSlave("/"); err != nil { + return err + } + for _, m := range mounts { + dest, err := container.GetResourcePath(m.Destination) + if err != nil { + return err + } + + var stat os.FileInfo + stat, err = os.Stat(m.Source) + if err != nil { + return err + } + if err := fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil { + return err + } + + bindMode := "rbind" + if m.NonRecursive { + bindMode = "bind" + } + writeMode := "ro" + if m.Writable { + writeMode = "rw" + } + + // openContainerFS() is called for temporary mounts + // outside the container. Soon these will be unmounted + // with lazy unmount option and given we have mounted + // them rbind, all the submounts will propagate if these + // are shared. If daemon is running in host namespace + // and has / as shared then these unmounts will + // propagate and unmount original mount as well. 
So make + all these mounts rprivate. Do not use propagation + property of volume as that should apply only when + mounting happens inside the container. + opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",") + if err := mount.Mount(m.Source, dest, "", opts); err != nil { + return err + } + } + + return mounttree.SwitchRoot(container.BaseFS) + }, + func() { + defer close(done) + + for it := range todo { + err := it.fn() + if it.res != nil { + it.res <- err + } + } + + // The thread will terminate when this goroutine returns, taking the + // mount namespace and all the volume bind-mounts with it. + }, + ) + if err != nil { + return nil, err + } + vw := &containerFSView{ + d: daemon, + ctr: container, + todo: todo, + done: done, + } + runtime.SetFinalizer(vw, (*containerFSView).Close) + return vw, nil +} + +// RunInFS synchronously runs fn in the context of the container filesystem and +// passes through its return value. +// +// The container filesystem is only visible to functions called in the same +// goroutine as fn. Goroutines started from fn will see the host's filesystem. +func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error { + res := make(chan error) + select { + case vw.todo <- future{fn: fn, res: res}: + case <-ctx.Done(): + return ctx.Err() + } + return <-res +} + +// GoInFS starts fn in the container FS. It blocks until fn is started but does +// not wait until fn returns. An error is returned if ctx is canceled before fn +// has been started. +// +// The container filesystem is only visible to functions called in the same +// goroutine as fn. Goroutines started from fn will see the host's filesystem. 
+func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error { + select { + case vw.todo <- future{fn: func() error { fn(); return nil }}: + return nil + case <-ctx.Done(): + return ctx.Err() + } +} + +// Close waits until any in-flight operations complete and frees all +// resources associated with vw. +func (vw *containerFSView) Close() error { + runtime.SetFinalizer(vw, nil) + close(vw.todo) + err := multierror.Append(nil, <-vw.done) + err = multierror.Append(err, vw.ctr.UnmountVolumes(vw.d.LogVolumeEvent)) + err = multierror.Append(err, vw.d.Unmount(vw.ctr)) + return err.ErrorOrNil() +} + +// Stat returns the metadata for path, relative to the current working directory +// of vw inside the container filesystem view. +func (vw *containerFSView) Stat(ctx context.Context, path string) (*types.ContainerPathStat, error) { + var stat *types.ContainerPathStat + err := vw.RunInFS(ctx, func() error { + lstat, err := os.Lstat(path) + if err != nil { + return err + } + var target string + if lstat.Mode()&os.ModeSymlink != 0 { + // Fully evaluate symlinks along path to the ultimate + // target, or as much as possible with broken links. 
+ target, err = symlink.FollowSymlinkInScope(path, "/") + if err != nil { + return err + } + } + stat = &types.ContainerPathStat{ + Name: filepath.Base(path), + Size: lstat.Size(), + Mode: lstat.Mode(), + Mtime: lstat.ModTime(), + LinkTarget: target, + } + return nil + }) + return stat, err +} diff --git a/daemon/volumes_unix.go b/daemon/volumes_unix.go index 59a95c239a..8e63203243 100644 --- a/daemon/volumes_unix.go +++ b/daemon/volumes_unix.go @@ -12,9 +12,7 @@ import ( mounttypes "github.com/docker/docker/api/types/mount" "github.com/docker/docker/container" - "github.com/docker/docker/pkg/fileutils" volumemounts "github.com/docker/docker/volume/mounts" - "github.com/moby/sys/mount" ) // setupMounts iterates through each of the mount points for a container and @@ -112,51 +110,3 @@ func setBindModeIfNull(bind *volumemounts.MountPoint) { bind.Mode = "z" } } - -func (daemon *Daemon) mountVolumes(container *container.Container) error { - mounts, err := daemon.setupMounts(container) - if err != nil { - return err - } - - for _, m := range mounts { - dest, err := container.GetResourcePath(m.Destination) - if err != nil { - return err - } - - var stat os.FileInfo - stat, err = os.Stat(m.Source) - if err != nil { - return err - } - if err = fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil { - return err - } - - bindMode := "rbind" - if m.NonRecursive { - bindMode = "bind" - } - writeMode := "ro" - if m.Writable { - writeMode = "rw" - } - - // mountVolumes() seems to be called for temporary mounts - // outside the container. Soon these will be unmounted with - // lazy unmount option and given we have mounted the rbind, - // all the submounts will propagate if these are shared. If - // daemon is running in host namespace and has / as shared - // then these unmounts will propagate and unmount original - // mount as well. So make all these mounts rprivate. 
- // Do not use propagation property of volume as that should - // apply only when mounting happens inside the container. - opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",") - if err := mount.Mount(m.Source, dest, "", opts); err != nil { - return err - } - } - - return nil -} diff --git a/hack/dind b/hack/dind index 087270a7a8..04bf6aaf92 100755 --- a/hack/dind +++ b/hack/dind @@ -37,6 +37,10 @@ if [ -f /sys/fs/cgroup/cgroup.controllers ]; then > /sys/fs/cgroup/cgroup.subtree_control fi +# Change mount propagation to shared to make the environment more similar to a +# modern Linux system, e.g. with SystemD as PID 1. +mount --make-rshared / + if [ $# -gt 0 ]; then exec "$@" fi diff --git a/hack/dind-systemd b/hack/dind-systemd index 27e07badd3..5ab0d25fc1 100755 --- a/hack/dind-systemd +++ b/hack/dind-systemd @@ -13,6 +13,11 @@ if [ ! -t 0 ]; then exit 1 fi +# Change mount propagation to shared, which SystemD PID 1 would normally do +# itself when started by the kernel. SystemD skips that when it detects it is +# running in a container. 
+mount --make-rshared / + env > /etc/docker-entrypoint-env cat > /etc/systemd/system/docker-entrypoint.target << EOF diff --git a/integration/container/copy_test.go b/integration/container/copy_test.go index b50c1757ed..5214d15be9 100644 --- a/integration/container/copy_test.go +++ b/integration/container/copy_test.go @@ -158,16 +158,23 @@ func TestCopyFromContainer(t *testing.T) { expect map[string]string }{ {"/", map[string]string{"/": "", "/foo": "hello", "/bar/quux/baz": "world", "/bar/filesymlink": "", "/bar/dirsymlink": "", "/bar/notarget": ""}}, + {".", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}}, + {"/.", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}}, + {"./", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}}, + {"/./", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}}, {"/bar/root", map[string]string{"root": ""}}, {"/bar/root/", map[string]string{"root/": "", "root/foo": "hello", "root/bar/quux/baz": "world", "root/bar/filesymlink": "", "root/bar/dirsymlink": "", "root/bar/notarget": ""}}, + {"/bar/root/.", map[string]string{"./": "", "./foo": "hello", "./bar/quux/baz": "world", "./bar/filesymlink": "", "./bar/dirsymlink": "", "./bar/notarget": ""}}, {"bar/quux", map[string]string{"quux/": "", "quux/baz": "world"}}, {"bar/quux/", map[string]string{"quux/": "", "quux/baz": "world"}}, + {"bar/quux/.", map[string]string{"./": "", "./baz": "world"}}, {"bar/quux/baz", map[string]string{"baz": "world"}}, {"bar/filesymlink", map[string]string{"filesymlink": ""}}, {"bar/dirsymlink", map[string]string{"dirsymlink": ""}}, {"bar/dirsymlink/", map[string]string{"dirsymlink/": "", 
"dirsymlink/baz": "world"}}, + {"bar/dirsymlink/.", map[string]string{"./": "", "./baz": "world"}}, {"bar/notarget", map[string]string{"notarget": ""}}, } { t.Run(x.src, func(t *testing.T) { diff --git a/integration/container/mounts_linux_test.go b/integration/container/mounts_linux_test.go index 07bae777ea..405e16c1db 100644 --- a/integration/container/mounts_linux_test.go +++ b/integration/container/mounts_linux_test.go @@ -393,3 +393,38 @@ func TestContainerVolumesMountedAsSlave(t *testing.T) { t.Fatal(err) } } + +// Regression test for #38995 and #43390. +func TestContainerCopyLeaksMounts(t *testing.T) { + defer setupTest(t)() + + bindMount := mounttypes.Mount{ + Type: mounttypes.TypeBind, + Source: "/var", + Target: "/hostvar", + BindOptions: &mounttypes.BindOptions{ + Propagation: mounttypes.PropagationRSlave, + }, + } + + ctx := context.Background() + client := testEnv.APIClient() + cid := container.Run(ctx, t, client, container.WithMount(bindMount), container.WithCmd("sleep", "120s")) + + getMounts := func() string { + t.Helper() + res, err := container.Exec(ctx, client, cid, []string{"cat", "/proc/self/mountinfo"}) + assert.NilError(t, err) + assert.Equal(t, res.ExitCode, 0) + return res.Stdout() + } + + mountsBefore := getMounts() + + _, _, err := client.CopyFromContainer(ctx, cid, "/etc/passwd") + assert.NilError(t, err) + + mountsAfter := getMounts() + + assert.Equal(t, mountsBefore, mountsAfter) +} diff --git a/internal/mounttree/switchroot_linux.go b/internal/mounttree/switchroot_linux.go new file mode 100644 index 0000000000..8797a04b45 --- /dev/null +++ b/internal/mounttree/switchroot_linux.go @@ -0,0 +1,94 @@ +package mounttree // import "github.com/docker/docker/internal/mounttree" + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/moby/sys/mount" + "github.com/moby/sys/mountinfo" + "golang.org/x/sys/unix" +) + +// SwitchRoot changes path to be the root of the mount tree and changes the +// current working directory to the new root. 
+// +// This function bind-mounts onto path; it is the caller's responsibility to set +// the desired propagation mode of path's parent mount beforehand to prevent +// unwanted propagation into different mount namespaces. +func SwitchRoot(path string) error { + if mounted, _ := mountinfo.Mounted(path); !mounted { + if err := mount.Mount(path, path, "bind", "rbind,rw"); err != nil { + return realChroot(path) + } + } + + // setup oldRoot for pivot_root + pivotDir, err := os.MkdirTemp(path, ".pivot_root") + if err != nil { + return fmt.Errorf("Error setting up pivot dir: %v", err) + } + + var mounted bool + defer func() { + if mounted { + // make sure pivotDir is not mounted before we try to remove it + if errCleanup := unix.Unmount(pivotDir, unix.MNT_DETACH); errCleanup != nil { + if err == nil { + err = errCleanup + } + return + } + } + + errCleanup := os.Remove(pivotDir) + // pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful + // because we already cleaned it up on failed pivot_root + if errCleanup != nil && !os.IsNotExist(errCleanup) { + errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup) + if err == nil { + err = errCleanup + } + } + }() + + if err := unix.PivotRoot(path, pivotDir); err != nil { + // If pivot fails, fall back to the normal chroot after cleaning up temp dir + if err := os.Remove(pivotDir); err != nil { + return fmt.Errorf("Error cleaning up after failed pivot: %v", err) + } + return realChroot(path) + } + mounted = true + + // This is the new path for where the old root (prior to the pivot) has been moved to + // This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction + pivotDir = filepath.Join("/", filepath.Base(pivotDir)) + + if err := unix.Chdir("/"); err != nil { + return fmt.Errorf("Error changing to new root: %v", err) + } + + // Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host + if err 
:= unix.Mount("", pivotDir, "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil { + return fmt.Errorf("Error making old root private after pivot: %v", err) + } + + // Now unmount the old root so it's no longer visible from the new root + if err := unix.Unmount(pivotDir, unix.MNT_DETACH); err != nil { + return fmt.Errorf("Error while unmounting old root after pivot: %v", err) + } + mounted = false + + return nil +} + +func realChroot(path string) error { + if err := unix.Chroot(path); err != nil { + return fmt.Errorf("Error after fallback to chroot: %v", err) + } + if err := unix.Chdir("/"); err != nil { + return fmt.Errorf("Error changing to new root after chroot: %v", err) + } + return nil +} diff --git a/internal/unshare/unshare_linux.go b/internal/unshare/unshare_linux.go new file mode 100644 index 0000000000..14bbec4871 --- /dev/null +++ b/internal/unshare/unshare_linux.go @@ -0,0 +1,176 @@ +//go:build go1.10 +// +build go1.10 + +package unshare // import "github.com/docker/docker/internal/unshare" + +import ( + "fmt" + "os" + "runtime" + + "golang.org/x/sys/unix" +) + +func init() { + // The startup thread of a process is special in a few different ways. + // Most pertinent to the discussion at hand, any per-thread kernel state + // reflected in the /proc/[pid]/ directory for a process is taken from + // the state of the startup thread. Same goes for /proc/self/; it shows + // the state of the current process' startup thread, no matter which + // thread the files are being opened from. For most programs this is a + // distinction without a difference as the kernel state, such as the + // mount namespace and current working directory, is shared among (and + // kept synchronized across) all threads of a process. But things start + // to break down once threads start unsharing and modifying parts of + // their kernel state. + // + // The Go runtime schedules goroutines to execute on the startup thread, + // same as any other. 
How this could be problematic is best illustrated + // with a concrete example. Consider what happens if a call to + // Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled + // onto the startup thread. The thread's mount namespace will be + // unshared and modified. The contents of the /proc/[pid]/mountinfo file + // will then describe the mount tree of the unshared namespace, not the + // namespace of any other thread. It will remain this way until the + // process exits. (The startup thread is special in another way: exiting + // it puts the process into a "non-waitable zombie" state. To avoid this + // fate, the Go runtime parks the thread instead of exiting if a + // goroutine returns while locked to the startup thread. More + // information can be found in the Go runtime sources: + // `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo + // package reads from /proc/self/mountinfo, so will read the mount tree + // for the wrong namespace if the startup thread has had its mount + // namespace unshared! The /proc/thread-self/ directory, introduced in + // Linux 3.17, is one potential solution to this problem, but every + // package which opens files in /proc/self/ would need to be updated, + // and fallbacks to /proc/self/task/[tid]/ would be required to support + // older kernels. Overlooking any reference to /proc/self/ would + // manifest as stochastically-reproducible bugs, so this is far from an + // ideal solution. + // + // Reading from /proc/self/ would not be a problem if we could prevent + // the per-thread state of the startup thread from being modified + // nondeterministically in the first place. We can accomplish this + // simply by locking the main() function to the startup thread! Doing so + // excludes any other goroutine from being scheduled on the thread. + runtime.LockOSThread() +} + +// reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully +// reversed using setns(2). 
The values are the basenames of the corresponding +// /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the +// state. +var reversibleSetnsFlags = map[int]string{ + unix.CLONE_NEWCGROUP: "cgroup", + unix.CLONE_NEWNET: "net", + unix.CLONE_NEWUTS: "uts", + unix.CLONE_NEWPID: "pid", + unix.CLONE_NEWTIME: "time", + + // The following CLONE_NEW* flags are not included because they imply + // another, irreversible flag when used with unshare(2). + // - unix.CLONE_NEWIPC: implies CLONE_SYSVMEM + // - unix.CLONE_NEWNS: implies CLONE_FS + // - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9 +} + +// Go calls the given functions in a new goroutine, locked to an OS thread, +// which has had the parts of its execution state disassociated from the rest of +// the current process using [unshare(2)]. It blocks until the new goroutine has +// started and setupfn has returned. fn is only called if setupfn returns nil. A +// nil setupfn or fn is equivalent to passing a no-op function. +// +// The disassociated execution state and any changes made to it are only visible +// to the goroutine which the functions are called in. Any other goroutines, +// including ones started from the function, will see the same execution state +// as the rest of the process. +// +// The acceptable flags are documented in the [unshare(2)] Linux man-page. +// The corresponding CLONE_* constants are defined in package [unix]. +// +// # Warning +// +// This function may terminate the thread which the new goroutine executed on +// after fn returns, which could cause subprocesses started with the +// [syscall.SysProcAttr] Pdeathsig field set to be signaled before process +// termination. Any subprocess started before this function is called may be +// affected, in addition to any subprocesses started inside setupfn or fn. +// There are more details at https://go.dev/issue/27505. 
+// +// [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html +func Go(flags int, setupfn func() error, fn func()) error { + started := make(chan error) + + maskedFlags := flags + for f := range reversibleSetnsFlags { + maskedFlags &^= f + } + isReversible := maskedFlags == 0 + + go func() { + // Prepare to manipulate per-thread kernel state. + runtime.LockOSThread() + + // Not all changes to the execution state can be reverted. + // If an irreversible change to the execution state is made, our + // only recourse is to have the tampered thread terminated by + // returning from this function while the goroutine remains + // wired to the thread. The Go runtime will terminate the thread + // and replace it with a fresh one as needed. + + if isReversible { + defer func() { + if isReversible { + // All execution state has been restored without error. + // The thread is once again fungible. + runtime.UnlockOSThread() + } + }() + tid := unix.Gettid() + for f, ns := range reversibleSetnsFlags { + if flags&f != f { + continue + } + // The /proc/thread-self directory was added in Linux 3.17. + // We are not using it to maximize compatibility. + pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns) + fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0) + if err != nil { + started <- &os.PathError{Op: "open", Path: pth, Err: err} + return + } + defer func() { + if isReversible { + if err := unix.Setns(fd, 0); err != nil { + isReversible = false + } + } + _ = unix.Close(fd) + }() + } + } + + // Threads are implemented under Linux as processes which share + // a virtual memory space. Therefore in a multithreaded process + // unshare(2) disassociates parts of the calling thread's + // context from the thread it was clone(2)'d from. 
+ if err := unix.Unshare(flags); err != nil { + started <- os.NewSyscallError("unshare", err) + return + } + + if setupfn != nil { + if err := setupfn(); err != nil { + started <- err + return + } + } + close(started) + + if fn != nil { + fn() + } + }() + + return <-started +} diff --git a/pkg/archive/archive.go b/pkg/archive/archive.go index 370d9612fc..e5509209ba 100644 --- a/pkg/archive/archive.go +++ b/pkg/archive/archive.go @@ -821,10 +821,29 @@ func Tar(path string, compression Compression) (io.ReadCloser, error) { // TarWithOptions creates an archive from the directory at `path`, only including files whose relative // paths are included in `options.IncludeFiles` (if non-nil) or not in `options.ExcludePatterns`. func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error) { - // Fix the source path to work with long path names. This is a no-op - // on platforms other than Windows. - srcPath = fixVolumePathPrefix(srcPath) + tb, err := NewTarballer(srcPath, options) + if err != nil { + return nil, err + } + go tb.Do() + return tb.Reader(), nil +} +// Tarballer is a lower-level interface to TarWithOptions which gives the caller +// control over which goroutine the archiving operation executes on. +type Tarballer struct { + srcPath string + options *TarOptions + pm *patternmatcher.PatternMatcher + pipeReader *io.PipeReader + pipeWriter *io.PipeWriter + compressWriter io.WriteCloser + whiteoutConverter tarWhiteoutConverter +} + +// NewTarballer constructs a new tarballer. The arguments are the same as for +// TarWithOptions. 
+func NewTarballer(srcPath string, options *TarOptions) (*Tarballer, error) { pm, err := patternmatcher.New(options.ExcludePatterns) if err != nil { return nil, err @@ -842,183 +861,201 @@ func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error) return nil, err } - go func() { - ta := newTarAppender( - options.IDMap, - compressWriter, - options.ChownOpts, - ) - ta.WhiteoutConverter = whiteoutConverter + return &Tarballer{ + // Fix the source path to work with long path names. This is a no-op + // on platforms other than Windows. + srcPath: fixVolumePathPrefix(srcPath), + options: options, + pm: pm, + pipeReader: pipeReader, + pipeWriter: pipeWriter, + compressWriter: compressWriter, + whiteoutConverter: whiteoutConverter, + }, nil +} - defer func() { - // Make sure to check the error on Close. - if err := ta.TarWriter.Close(); err != nil { - logrus.Errorf("Can't close tar writer: %s", err) - } - if err := compressWriter.Close(); err != nil { - logrus.Errorf("Can't close compress writer: %s", err) - } - if err := pipeWriter.Close(); err != nil { - logrus.Errorf("Can't close pipe writer: %s", err) - } - }() +// Reader returns the reader for the created archive. +func (t *Tarballer) Reader() io.ReadCloser { + return t.pipeReader +} - // this buffer is needed for the duration of this piped stream - defer pools.BufioWriter32KPool.Put(ta.Buffer) +// Do performs the archiving operation in the background. The resulting archive +// can be read from t.Reader(). Do should only be called once on each Tarballer +// instance. +func (t *Tarballer) Do() { + ta := newTarAppender( + t.options.IDMap, + t.compressWriter, + t.options.ChownOpts, + ) + ta.WhiteoutConverter = t.whiteoutConverter - // In general we log errors here but ignore them because - // during e.g. 
a diff operation the container can continue - // mutating the filesystem and we can see transient errors - // from this - - stat, err := os.Lstat(srcPath) - if err != nil { - return + defer func() { + // Make sure to check the error on Close. + if err := ta.TarWriter.Close(); err != nil { + logrus.Errorf("Can't close tar writer: %s", err) } - - if !stat.IsDir() { - // We can't later join a non-dir with any includes because the - // 'walk' will error if "file/." is stat-ed and "file" is not a - // directory. So, we must split the source path and use the - // basename as the include. - if len(options.IncludeFiles) > 0 { - logrus.Warn("Tar: Can't archive a file with includes") - } - - dir, base := SplitPathDirEntry(srcPath) - srcPath = dir - options.IncludeFiles = []string{base} + if err := t.compressWriter.Close(); err != nil { + logrus.Errorf("Can't close compress writer: %s", err) } - - if len(options.IncludeFiles) == 0 { - options.IncludeFiles = []string{"."} - } - - seen := make(map[string]bool) - - for _, include := range options.IncludeFiles { - rebaseName := options.RebaseNames[include] - - var ( - parentMatchInfo []patternmatcher.MatchInfo - parentDirs []string - ) - - walkRoot := getWalkRoot(srcPath, include) - filepath.WalkDir(walkRoot, func(filePath string, f os.DirEntry, err error) error { - if err != nil { - logrus.Errorf("Tar: Can't stat file %s to tar: %s", srcPath, err) - return nil - } - - relFilePath, err := filepath.Rel(srcPath, filePath) - if err != nil || (!options.IncludeSourceDir && relFilePath == "." && f.IsDir()) { - // Error getting relative path OR we are looking - // at the source directory path. Skip in both situations. - return nil - } - - if options.IncludeSourceDir && include == "." && relFilePath != "." 
{ - relFilePath = strings.Join([]string{".", relFilePath}, string(filepath.Separator)) - } - - skip := false - - // If "include" is an exact match for the current file - // then even if there's an "excludePatterns" pattern that - // matches it, don't skip it. IOW, assume an explicit 'include' - // is asking for that file no matter what - which is true - // for some files, like .dockerignore and Dockerfile (sometimes) - if include != relFilePath { - for len(parentDirs) != 0 { - lastParentDir := parentDirs[len(parentDirs)-1] - if strings.HasPrefix(relFilePath, lastParentDir+string(os.PathSeparator)) { - break - } - parentDirs = parentDirs[:len(parentDirs)-1] - parentMatchInfo = parentMatchInfo[:len(parentMatchInfo)-1] - } - - var matchInfo patternmatcher.MatchInfo - if len(parentMatchInfo) != 0 { - skip, matchInfo, err = pm.MatchesUsingParentResults(relFilePath, parentMatchInfo[len(parentMatchInfo)-1]) - } else { - skip, matchInfo, err = pm.MatchesUsingParentResults(relFilePath, patternmatcher.MatchInfo{}) - } - if err != nil { - logrus.Errorf("Error matching %s: %v", relFilePath, err) - return err - } - - if f.IsDir() { - parentDirs = append(parentDirs, relFilePath) - parentMatchInfo = append(parentMatchInfo, matchInfo) - } - } - - if skip { - // If we want to skip this file and its a directory - // then we should first check to see if there's an - // excludes pattern (e.g. !dir/file) that starts with this - // dir. If so then we can't skip this dir. - - // Its not a dir then so we can just return/skip. - if !f.IsDir() { - return nil - } - - // No exceptions (!...) 
in patterns so just skip dir - if !pm.Exclusions() { - return filepath.SkipDir - } - - dirSlash := relFilePath + string(filepath.Separator) - - for _, pat := range pm.Patterns() { - if !pat.Exclusion() { - continue - } - if strings.HasPrefix(pat.String()+string(filepath.Separator), dirSlash) { - // found a match - so can't skip this dir - return nil - } - } - - // No matching exclusion dir so just skip dir - return filepath.SkipDir - } - - if seen[relFilePath] { - return nil - } - seen[relFilePath] = true - - // Rename the base resource. - if rebaseName != "" { - var replacement string - if rebaseName != string(filepath.Separator) { - // Special case the root directory to replace with an - // empty string instead so that we don't end up with - // double slashes in the paths. - replacement = rebaseName - } - - relFilePath = strings.Replace(relFilePath, include, replacement, 1) - } - - if err := ta.addTarFile(filePath, relFilePath); err != nil { - logrus.Errorf("Can't add file %s to tar: %s", filePath, err) - // if pipe is broken, stop writing tar stream to it - if err == io.ErrClosedPipe { - return err - } - } - return nil - }) + if err := t.pipeWriter.Close(); err != nil { + logrus.Errorf("Can't close pipe writer: %s", err) } }() - return pipeReader, nil + // this buffer is needed for the duration of this piped stream + defer pools.BufioWriter32KPool.Put(ta.Buffer) + + // In general we log errors here but ignore them because + // during e.g. a diff operation the container can continue + // mutating the filesystem and we can see transient errors + // from this + + stat, err := os.Lstat(t.srcPath) + if err != nil { + return + } + + if !stat.IsDir() { + // We can't later join a non-dir with any includes because the + // 'walk' will error if "file/." is stat-ed and "file" is not a + // directory. So, we must split the source path and use the + // basename as the include. 
+ if len(t.options.IncludeFiles) > 0 { + logrus.Warn("Tar: Can't archive a file with includes") + } + + dir, base := SplitPathDirEntry(t.srcPath) + t.srcPath = dir + t.options.IncludeFiles = []string{base} + } + + if len(t.options.IncludeFiles) == 0 { + t.options.IncludeFiles = []string{"."} + } + + seen := make(map[string]bool) + + for _, include := range t.options.IncludeFiles { + rebaseName := t.options.RebaseNames[include] + + var ( + parentMatchInfo []patternmatcher.MatchInfo + parentDirs []string + ) + + walkRoot := getWalkRoot(t.srcPath, include) + filepath.WalkDir(walkRoot, func(filePath string, f os.DirEntry, err error) error { + if err != nil { + logrus.Errorf("Tar: Can't stat file %s to tar: %s", t.srcPath, err) + return nil + } + + relFilePath, err := filepath.Rel(t.srcPath, filePath) + if err != nil || (!t.options.IncludeSourceDir && relFilePath == "." && f.IsDir()) { + // Error getting relative path OR we are looking + // at the source directory path. Skip in both situations. + return nil + } + + if t.options.IncludeSourceDir && include == "." && relFilePath != "." { + relFilePath = strings.Join([]string{".", relFilePath}, string(filepath.Separator)) + } + + skip := false + + // If "include" is an exact match for the current file + // then even if there's an "excludePatterns" pattern that + // matches it, don't skip it. 
IOW, assume an explicit 'include' + // is asking for that file no matter what - which is true + // for some files, like .dockerignore and Dockerfile (sometimes) + if include != relFilePath { + for len(parentDirs) != 0 { + lastParentDir := parentDirs[len(parentDirs)-1] + if strings.HasPrefix(relFilePath, lastParentDir+string(os.PathSeparator)) { + break + } + parentDirs = parentDirs[:len(parentDirs)-1] + parentMatchInfo = parentMatchInfo[:len(parentMatchInfo)-1] + } + + var matchInfo patternmatcher.MatchInfo + if len(parentMatchInfo) != 0 { + skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, parentMatchInfo[len(parentMatchInfo)-1]) + } else { + skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, patternmatcher.MatchInfo{}) + } + if err != nil { + logrus.Errorf("Error matching %s: %v", relFilePath, err) + return err + } + + if f.IsDir() { + parentDirs = append(parentDirs, relFilePath) + parentMatchInfo = append(parentMatchInfo, matchInfo) + } + } + + if skip { + // If we want to skip this file and its a directory + // then we should first check to see if there's an + // excludes pattern (e.g. !dir/file) that starts with this + // dir. If so then we can't skip this dir. + + // Its not a dir then so we can just return/skip. + if !f.IsDir() { + return nil + } + + // No exceptions (!...) in patterns so just skip dir + if !t.pm.Exclusions() { + return filepath.SkipDir + } + + dirSlash := relFilePath + string(filepath.Separator) + + for _, pat := range t.pm.Patterns() { + if !pat.Exclusion() { + continue + } + if strings.HasPrefix(pat.String()+string(filepath.Separator), dirSlash) { + // found a match - so can't skip this dir + return nil + } + } + + // No matching exclusion dir so just skip dir + return filepath.SkipDir + } + + if seen[relFilePath] { + return nil + } + seen[relFilePath] = true + + // Rename the base resource. 
+ if rebaseName != "" { + var replacement string + if rebaseName != string(filepath.Separator) { + // Special case the root directory to replace with an + // empty string instead so that we don't end up with + // double slashes in the paths. + replacement = rebaseName + } + + relFilePath = strings.Replace(relFilePath, include, replacement, 1) + } + + if err := ta.addTarFile(filePath, relFilePath); err != nil { + logrus.Errorf("Can't add file %s to tar: %s", filePath, err) + // if pipe is broken, stop writing tar stream to it + if err == io.ErrClosedPipe { + return err + } + } + return nil + }) + } } // Unpack unpacks the decompressedArchive to dest with options. diff --git a/pkg/archive/diff.go b/pkg/archive/diff.go index 4fce40b4c6..c8c7be7479 100644 --- a/pkg/archive/diff.go +++ b/pkg/archive/diff.go @@ -87,7 +87,7 @@ func UnpackLayer(dest string, layer io.Reader, options *TarOptions) (size int64, basename := filepath.Base(hdr.Name) aufsHardlinks[basename] = hdr if aufsTempdir == "" { - if aufsTempdir, err = os.MkdirTemp("", "dockerplnk"); err != nil { + if aufsTempdir, err = os.MkdirTemp(dest, "dockerplnk"); err != nil { return 0, err } defer os.RemoveAll(aufsTempdir) diff --git a/pkg/chrootarchive/archive_unix.go b/pkg/chrootarchive/archive_unix.go index 13bb82a2e4..f6cb0be964 100644 --- a/pkg/chrootarchive/archive_unix.go +++ b/pkg/chrootarchive/archive_unix.go @@ -4,223 +4,71 @@ package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive" import ( - "bytes" - "encoding/json" - "flag" - "fmt" "io" - "os" "path/filepath" - "runtime" "strings" "github.com/docker/docker/pkg/archive" - "github.com/docker/docker/pkg/reexec" "github.com/pkg/errors" ) -// untar is the entry-point for docker-untar on re-exec. This is not used on -// Windows as it does not support chroot, hence no point sandboxing through -// chroot and rexec. 
-func untar() { - runtime.LockOSThread() - flag.Parse() - - var options archive.TarOptions - - // read the options from the pipe "ExtraFiles" - if err := json.NewDecoder(os.NewFile(3, "options")).Decode(&options); err != nil { - fatal(err) - } - - dst := flag.Arg(0) - var root string - if len(flag.Args()) > 1 { - root = flag.Arg(1) - } - - if root == "" { - root = dst - } - - if err := chroot(root); err != nil { - fatal(err) - } - - if err := archive.Unpack(os.Stdin, dst, &options); err != nil { - fatal(err) - } - // fully consume stdin in case it is zero padded - if _, err := flush(os.Stdin); err != nil { - fatal(err) - } - - os.Exit(0) -} - func invokeUnpack(decompressedArchive io.Reader, dest string, options *archive.TarOptions, root string) error { - if root == "" { - return errors.New("must specify a root to chroot to") - } - - // We can't pass a potentially large exclude list directly via cmd line - // because we easily overrun the kernel's max argument/environment size - // when the full image list is passed (e.g. when this is used by - // `docker load`). We will marshall the options via a pipe to the - // child - r, w, err := os.Pipe() + relDest, err := resolvePathInChroot(root, dest) if err != nil { - return fmt.Errorf("Untar pipe failure: %v", err) + return err } - if root != "" { - relDest, err := filepath.Rel(root, dest) - if err != nil { - return err - } - if relDest == "." { - relDest = "/" - } - if relDest[0] != '/' { - relDest = "/" + relDest - } - dest = relDest - } - - cmd := reexec.Command("docker-untar", dest, root) - cmd.Stdin = decompressedArchive - - cmd.ExtraFiles = append(cmd.ExtraFiles, r) - output := bytes.NewBuffer(nil) - cmd.Stdout = output - cmd.Stderr = output - - // reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, which - // causes the started process to be signaled when the creating OS thread - // dies. Ensure that the reexec is not prematurely signaled. See - // https://go.dev/issue/27505 for more information. 
- runtime.LockOSThread() - defer runtime.UnlockOSThread() - if err := cmd.Start(); err != nil { - w.Close() - return fmt.Errorf("Untar error on re-exec cmd: %v", err) - } - - // write the options to the pipe for the untar exec to read - if err := json.NewEncoder(w).Encode(options); err != nil { - w.Close() - return fmt.Errorf("Untar json encode to pipe failed: %v", err) - } - w.Close() - - if err := cmd.Wait(); err != nil { - // when `xz -d -c -q | docker-untar ...` failed on docker-untar side, - // we need to exhaust `xz`'s output, otherwise the `xz` side will be - // pending on write pipe forever - io.Copy(io.Discard, decompressedArchive) - - return fmt.Errorf("Error processing tar file(%v): %s", err, output) - } - return nil -} - -func tar() { - runtime.LockOSThread() - flag.Parse() - - src := flag.Arg(0) - var root string - if len(flag.Args()) > 1 { - root = flag.Arg(1) - } - - if root == "" { - root = src - } - - if err := realChroot(root); err != nil { - fatal(err) - } - - var options archive.TarOptions - if err := json.NewDecoder(os.Stdin).Decode(&options); err != nil { - fatal(err) - } - - rdr, err := archive.TarWithOptions(src, &options) + done := make(chan error) + err = goInChroot(root, func() { done <- archive.Unpack(decompressedArchive, relDest, options) }) if err != nil { - fatal(err) + return err } - defer rdr.Close() - - if _, err := io.Copy(os.Stdout, rdr); err != nil { - fatal(err) - } - - os.Exit(0) + return <-done } func invokePack(srcPath string, options *archive.TarOptions, root string) (io.ReadCloser, error) { - if root == "" { - return nil, errors.New("root path must not be empty") - } - - relSrc, err := filepath.Rel(root, srcPath) + relSrc, err := resolvePathInChroot(root, srcPath) if err != nil { return nil, err } - if relSrc == "." 
{ - relSrc = "/" - } - if relSrc[0] != '/' { - relSrc = "/" + relSrc - } - // make sure we didn't trim a trailing slash with the call to `Rel` + // make sure we didn't trim a trailing slash with the call to `resolvePathInChroot` if strings.HasSuffix(srcPath, "/") && !strings.HasSuffix(relSrc, "/") { relSrc += "/" } - cmd := reexec.Command("docker-tar", relSrc, root) - - errBuff := bytes.NewBuffer(nil) - cmd.Stderr = errBuff - - tarR, tarW := io.Pipe() - cmd.Stdout = tarW - - stdin, err := cmd.StdinPipe() + tb, err := archive.NewTarballer(relSrc, options) if err != nil { - return nil, errors.Wrap(err, "error getting options pipe for tar process") + return nil, errors.Wrap(err, "error processing tar file") } - - started := make(chan error) - go func() { - // reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, - // which causes the started process to be signaled when the - // creating OS thread dies. Ensure that the subprocess is not - // prematurely signaled. See https://go.dev/issue/27505 for more - // information. - runtime.LockOSThread() - defer runtime.UnlockOSThread() - if err := cmd.Start(); err != nil { - started <- err - return - } - close(started) - err := cmd.Wait() - err = errors.Wrapf(err, "error processing tar file: %s", errBuff) - tarW.CloseWithError(err) - }() - if err := <-started; err != nil { - return nil, errors.Wrap(err, "tar error on re-exec cmd") + err = goInChroot(root, tb.Do) + if err != nil { + return nil, errors.Wrap(err, "could not chroot") } - - if err := json.NewEncoder(stdin).Encode(options); err != nil { - stdin.Close() - return nil, errors.Wrap(err, "tar json encode to pipe failed") - } - stdin.Close() - - return tarR, nil + return tb.Reader(), nil +} + +// resolvePathInChroot returns the equivalent to path inside a chroot rooted at root. +// The returned path always begins with '/'. 
+// +// - resolvePathInChroot("/a/b", "/a/b/c/d") -> "/c/d" +// - resolvePathInChroot("/a/b", "/a/b") -> "/" +// +// The implementation is buggy, and some bugs may be load-bearing. +// Here be dragons. +func resolvePathInChroot(root, path string) (string, error) { + if root == "" { + return "", errors.New("root path must not be empty") + } + rel, err := filepath.Rel(root, path) + if err != nil { + return "", err + } + if rel == "." { + rel = "/" + } + if rel[0] != '/' { + rel = "/" + rel + } + return rel, nil } diff --git a/pkg/chrootarchive/archive_windows.go b/pkg/chrootarchive/archive_windows.go index de87113e95..7095740a50 100644 --- a/pkg/chrootarchive/archive_windows.go +++ b/pkg/chrootarchive/archive_windows.go @@ -7,11 +7,6 @@ import ( "github.com/docker/docker/pkg/longpath" ) -// chroot is not supported by Windows -func chroot(path string) error { - return nil -} - func invokeUnpack(decompressedArchive io.ReadCloser, dest string, options *archive.TarOptions, root string) error { diff --git a/pkg/chrootarchive/chroot_linux.go b/pkg/chrootarchive/chroot_linux.go index 85c291cdb2..6356a6378e 100644 --- a/pkg/chrootarchive/chroot_linux.go +++ b/pkg/chrootarchive/chroot_linux.go @@ -1,113 +1,34 @@ package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive" import ( - "fmt" - "os" - "path/filepath" - - "github.com/containerd/containerd/pkg/userns" + "github.com/docker/docker/internal/mounttree" + "github.com/docker/docker/internal/unshare" "github.com/moby/sys/mount" - "github.com/moby/sys/mountinfo" "golang.org/x/sys/unix" ) -// chroot on linux uses pivot_root instead of chroot -// pivot_root takes a new root and an old root. -// Old root must be a sub-dir of new root, it is where the current rootfs will reside after the call to pivot_root. -// New root is where the new rootfs is set to. -// Old root is removed after the call to pivot_root so it is no longer available under the new root. 
-// This is similar to how libcontainer sets up a container's rootfs -func chroot(path string) (err error) { - // if the engine is running in a user namespace we need to use actual chroot - if userns.RunningInUserNS() { - return realChroot(path) - } - if err := unix.Unshare(unix.CLONE_NEWNS); err != nil { - return fmt.Errorf("Error creating mount namespace before pivot: %v", err) - } - - // Make everything in new ns slave. - // Don't use `private` here as this could race where the mountns gets a - // reference to a mount and an unmount from the host does not propagate, - // which could potentially cause transient errors for other operations, - // even though this should be relatively small window here `slave` should - // not cause any problems. - if err := mount.MakeRSlave("/"); err != nil { - return err - } - - if mounted, _ := mountinfo.Mounted(path); !mounted { - if err := mount.Mount(path, path, "bind", "rbind,rw"); err != nil { - return realChroot(path) - } - } - - // setup oldRoot for pivot_root - pivotDir, err := os.MkdirTemp(path, ".pivot_root") - if err != nil { - return fmt.Errorf("Error setting up pivot dir: %v", err) - } - - var mounted bool - defer func() { - if mounted { - // make sure pivotDir is not mounted before we try to remove it - if errCleanup := unix.Unmount(pivotDir, unix.MNT_DETACH); errCleanup != nil { - if err == nil { - err = errCleanup - } - return +// goInChroot starts fn in a goroutine where the root directory, current working +// directory and umask are unshared from other goroutines and the root directory +// has been changed to path. These changes are only visible to the goroutine in +// which fn is executed. Any other goroutines, including ones started from fn, +// will see the same root directory and file system attributes as the rest of +// the process. +func goInChroot(path string, fn func()) error { + return unshare.Go( + unix.CLONE_FS|unix.CLONE_NEWNS, + func() error { + // Make everything in new ns slave. 
+ // Don't use `private` here as this could race where the mountns gets a + // reference to a mount and an unmount from the host does not propagate, + // which could potentially cause transient errors for other operations, + // even though this should be relatively small window here `slave` should + // not cause any problems. + if err := mount.MakeRSlave("/"); err != nil { + return err } - } - errCleanup := os.Remove(pivotDir) - // pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful - // because we already cleaned it up on failed pivot_root - if errCleanup != nil && !os.IsNotExist(errCleanup) { - errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup) - if err == nil { - err = errCleanup - } - } - }() - - if err := unix.PivotRoot(path, pivotDir); err != nil { - // If pivot fails, fall back to the normal chroot after cleaning up temp dir - if err := os.Remove(pivotDir); err != nil { - return fmt.Errorf("Error cleaning up after failed pivot: %v", err) - } - return realChroot(path) - } - mounted = true - - // This is the new path for where the old root (prior to the pivot) has been moved to - // This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction - pivotDir = filepath.Join("/", filepath.Base(pivotDir)) - - if err := unix.Chdir("/"); err != nil { - return fmt.Errorf("Error changing to new root: %v", err) - } - - // Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host - if err := unix.Mount("", pivotDir, "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil { - return fmt.Errorf("Error making old root private after pivot: %v", err) - } - - // Now unmount the old root so it's no longer visible from the new root - if err := unix.Unmount(pivotDir, unix.MNT_DETACH); err != nil { - return fmt.Errorf("Error while unmounting old root after pivot: %v", err) - } - mounted = false - - return nil -} - -func realChroot(path string) error 
{ - if err := unix.Chroot(path); err != nil { - return fmt.Errorf("Error after fallback to chroot: %v", err) - } - if err := unix.Chdir("/"); err != nil { - return fmt.Errorf("Error changing to new root after chroot: %v", err) - } - return nil + return mounttree.SwitchRoot(path) + }, + fn, + ) } diff --git a/pkg/chrootarchive/chroot_unix.go b/pkg/chrootarchive/chroot_unix.go deleted file mode 100644 index c35aa91669..0000000000 --- a/pkg/chrootarchive/chroot_unix.go +++ /dev/null @@ -1,17 +0,0 @@ -//go:build !windows && !linux -// +build !windows,!linux - -package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive" - -import "golang.org/x/sys/unix" - -func chroot(path string) error { - if err := unix.Chroot(path); err != nil { - return err - } - return unix.Chdir("/") -} - -func realChroot(path string) error { - return chroot(path) -} diff --git a/pkg/chrootarchive/diff_unix.go b/pkg/chrootarchive/diff_unix.go index c667cc5819..873390c57f 100644 --- a/pkg/chrootarchive/diff_unix.go +++ b/pkg/chrootarchive/diff_unix.go @@ -4,78 +4,14 @@ package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive" import ( - "bytes" - "encoding/json" - "flag" - "fmt" "io" - "os" "path/filepath" - "runtime" "github.com/containerd/containerd/pkg/userns" "github.com/docker/docker/pkg/archive" - "github.com/docker/docker/pkg/reexec" "golang.org/x/sys/unix" ) -type applyLayerResponse struct { - LayerSize int64 `json:"layerSize"` -} - -// applyLayer is the entry-point for docker-applylayer on re-exec. This is not -// used on Windows as it does not support chroot, hence no point sandboxing -// through chroot and rexec. 
-func applyLayer() { - - var ( - tmpDir string - err error - options *archive.TarOptions - ) - runtime.LockOSThread() - flag.Parse() - - inUserns := userns.RunningInUserNS() - if err := chroot(flag.Arg(0)); err != nil { - fatal(err) - } - - // We need to be able to set any perms - oldmask := unix.Umask(0) - defer unix.Umask(oldmask) - - if err := json.Unmarshal([]byte(os.Getenv("OPT")), &options); err != nil { - fatal(err) - } - - if inUserns { - options.InUserNS = true - } - - if tmpDir, err = os.MkdirTemp("/", "temp-docker-extract"); err != nil { - fatal(err) - } - - os.Setenv("TMPDIR", tmpDir) - size, err := archive.UnpackLayer("/", os.Stdin, options) - os.RemoveAll(tmpDir) - if err != nil { - fatal(err) - } - - encoder := json.NewEncoder(os.Stdout) - if err := encoder.Encode(applyLayerResponse{size}); err != nil { - fatal(fmt.Errorf("unable to encode layerSize JSON: %s", err)) - } - - if _, err := flush(os.Stdin); err != nil { - fatal(err) - } - - os.Exit(0) -} - // applyLayerHandler parses a diff in the standard layer format from `layer`, and // applies it to the directory `dest`. Returns the size in bytes of the // contents of the layer. 
@@ -92,42 +28,30 @@ func applyLayerHandler(dest string, layer io.Reader, options *archive.TarOptions } if options == nil { options = &archive.TarOptions{} - if userns.RunningInUserNS() { - options.InUserNS = true - } + } + if userns.RunningInUserNS() { + options.InUserNS = true } if options.ExcludePatterns == nil { options.ExcludePatterns = []string{} } - data, err := json.Marshal(options) + type result struct { + layerSize int64 + err error + } + + done := make(chan result) + err = goInChroot(dest, func() { + // We need to be able to set any perms + _ = unix.Umask(0) + + size, err := archive.UnpackLayer("/", layer, options) + done <- result{layerSize: size, err: err} + }) if err != nil { - return 0, fmt.Errorf("ApplyLayer json encode: %v", err) + return 0, err } - - cmd := reexec.Command("docker-applyLayer", dest) - cmd.Stdin = layer - cmd.Env = append(cmd.Env, fmt.Sprintf("OPT=%s", data)) - - outBuf, errBuf := new(bytes.Buffer), new(bytes.Buffer) - cmd.Stdout, cmd.Stderr = outBuf, errBuf - - // reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, which - // causes the started process to be signaled when the creating OS thread - // dies. Ensure that the reexec is not prematurely signaled. See - // https://go.dev/issue/27505 for more information. - runtime.LockOSThread() - defer runtime.UnlockOSThread() - if err = cmd.Run(); err != nil { - return 0, fmt.Errorf("ApplyLayer %s stdout: %s stderr: %s", err, outBuf, errBuf) - } - - // Stdout should be a valid JSON struct representing an applyLayerResponse. 
- response := applyLayerResponse{} - decoder := json.NewDecoder(outBuf) - if err = decoder.Decode(&response); err != nil { - return 0, fmt.Errorf("unable to decode ApplyLayer JSON response: %s", err) - } - - return response.LayerSize, nil + res := <-done + return res.layerSize, res.err } diff --git a/pkg/chrootarchive/diff_windows.go b/pkg/chrootarchive/diff_windows.go index f423419d3c..fd29072e82 100644 --- a/pkg/chrootarchive/diff_windows.go +++ b/pkg/chrootarchive/diff_windows.go @@ -3,7 +3,6 @@ package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive" import ( "fmt" "io" - "os" "path/filepath" "github.com/docker/docker/pkg/archive" @@ -29,13 +28,7 @@ func applyLayerHandler(dest string, layer io.Reader, options *archive.TarOptions layer = decompressed } - tmpDir, err := os.MkdirTemp(os.Getenv("temp"), "temp-docker-extract") - if err != nil { - return 0, fmt.Errorf("ApplyLayer failed to create temp-docker-extract under %s. %s", dest, err) - } - s, err := archive.UnpackLayer(dest, layer, nil) - os.RemoveAll(tmpDir) if err != nil { return 0, fmt.Errorf("ApplyLayer %s failed UnpackLayer to %s: %s", layer, dest, err) } diff --git a/pkg/chrootarchive/init_unix.go b/pkg/chrootarchive/init_unix.go deleted file mode 100644 index 0746c1cb97..0000000000 --- a/pkg/chrootarchive/init_unix.go +++ /dev/null @@ -1,29 +0,0 @@ -//go:build !windows -// +build !windows - -package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive" - -import ( - "fmt" - "io" - "os" - - "github.com/docker/docker/pkg/reexec" -) - -func init() { - reexec.Register("docker-applyLayer", applyLayer) - reexec.Register("docker-untar", untar) - reexec.Register("docker-tar", tar) -} - -func fatal(err error) { - fmt.Fprint(os.Stderr, err) - os.Exit(1) -} - -// flush consumes all the bytes from the reader discarding -// any errors -func flush(r io.Reader) (bytes int64, err error) { - return io.Copy(io.Discard, r) -}