Преглед на файлове

Add reusable chroot and unshare utilities

Refactor pkg/chrootarchive in terms of those utilities.

Signed-off-by: Cory Snider <csnider@mirantis.com>
Cory Snider преди 2 години
родител
ревизия
60ee6f739f

+ 94 - 0
internal/mounttree/switchroot_linux.go

@@ -0,0 +1,94 @@
+package mounttree // import "github.com/docker/docker/internal/mounttree"
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/moby/sys/mount"
+	"github.com/moby/sys/mountinfo"
+	"golang.org/x/sys/unix"
+)
+
+// SwitchRoot changes path to be the root of the mount tree and changes the
+// current working directory to the new root.
+//
+// It first tries pivot_root(2): path is bind-mounted onto itself unless
+// mountinfo already reports it as mounted, the old root is pivoted into a
+// temporary directory created under path, and the old root is then detached so
+// that it is not visible from the new root. If the bind mount or the pivot
+// fails, it falls back to chroot(2) followed by a chdir to "/".
+//
+// This function bind-mounts onto path; it is the caller's responsibility to set
+// the desired propagation mode of path's parent mount beforehand to prevent
+// unwanted propagation into different mount namespaces.
+//
+// The result is named so that the deferred cleanup can report its own failure
+// when the rest of the function succeeded.
+func SwitchRoot(path string) (err error) {
+	if mounted, _ := mountinfo.Mounted(path); !mounted {
+		if err := mount.Mount(path, path, "bind", "rbind,rw"); err != nil {
+			return realChroot(path)
+		}
+	}
+
+	// setup oldRoot for pivot_root
+	// Note: err here is the named result, not a new variable; only pivotDir
+	// is declared by this statement.
+	pivotDir, err := os.MkdirTemp(path, ".pivot_root")
+	if err != nil {
+		return fmt.Errorf("Error setting up pivot dir: %v", err)
+	}
+
+	// mounted tracks whether the old root is currently mounted on pivotDir.
+	var mounted bool
+	defer func() {
+		if mounted {
+			// make sure pivotDir is not mounted before we try to remove it
+			if errCleanup := unix.Unmount(pivotDir, unix.MNT_DETACH); errCleanup != nil {
+				// Never mask the primary error with a cleanup error.
+				if err == nil {
+					err = errCleanup
+				}
+				return
+			}
+		}
+
+		errCleanup := os.Remove(pivotDir)
+		// pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful
+		// because we already cleaned it up on failed pivot_root
+		if errCleanup != nil && !os.IsNotExist(errCleanup) {
+			errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup)
+			if err == nil {
+				err = errCleanup
+			}
+		}
+	}()
+
+	if err := unix.PivotRoot(path, pivotDir); err != nil {
+		// If pivot fails, fall back to the normal chroot after cleaning up temp dir
+		if err := os.Remove(pivotDir); err != nil {
+			return fmt.Errorf("Error cleaning up after failed pivot: %v", err)
+		}
+		return realChroot(path)
+	}
+	mounted = true
+
+	// This is the new path for where the old root (prior to the pivot) has been moved to
+	// This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction
+	pivotDir = filepath.Join("/", filepath.Base(pivotDir))
+
+	if err := unix.Chdir("/"); err != nil {
+		return fmt.Errorf("Error changing to new root: %v", err)
+	}
+
+	// Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host
+	if err := unix.Mount("", pivotDir, "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil {
+		return fmt.Errorf("Error making old root private after pivot: %v", err)
+	}
+
+	// Now unmount the old root so it's no longer visible from the new root
+	if err := unix.Unmount(pivotDir, unix.MNT_DETACH); err != nil {
+		return fmt.Errorf("Error while unmounting old root after pivot: %v", err)
+	}
+	// The old root is gone; the deferred cleanup only has to remove the directory.
+	mounted = false
+
+	return nil
+}
+
+// realChroot is the plain chroot(2) fallback: it makes path the root directory
+// of the calling thread and then moves the working directory to the new root.
+func realChroot(path string) error {
+	chrootErr := unix.Chroot(path)
+	if chrootErr != nil {
+		return fmt.Errorf("Error after fallback to chroot: %v", chrootErr)
+	}
+
+	chdirErr := unix.Chdir("/")
+	if chdirErr != nil {
+		return fmt.Errorf("Error changing to new root after chroot: %v", chdirErr)
+	}
+
+	return nil
+}

+ 176 - 0
internal/unshare/unshare_linux.go

@@ -0,0 +1,176 @@
+//go:build go1.10
+// +build go1.10
+
+package unshare // import "github.com/docker/docker/internal/unshare"
+
+import (
+	"fmt"
+	"os"
+	"runtime"
+
+	"golang.org/x/sys/unix"
+)
+
+func init() { // Pins main() to the startup thread; the rationale follows.
+	// The startup thread of a process is special in a few different ways.
+	// Most pertinent to the discussion at hand, any per-thread kernel state
+	// reflected in the /proc/[pid]/ directory for a process is taken from
+	// the state of the startup thread. Same goes for /proc/self/; it shows
+	// the state of the current process' startup thread, no matter which
+	// thread the files are being opened from. For most programs this is a
+	// distinction without a difference as the kernel state, such as the
+	// mount namespace and current working directory, is shared among (and
+	// kept synchronized across) all threads of a process. But things start
+	// to break down once threads start unsharing and modifying parts of
+	// their kernel state.
+	//
+	// The Go runtime schedules goroutines to execute on the startup thread,
+	// same as any other. How this could be problematic is best illustrated
+	// with a concrete example. Consider what happens if a call to
+	// Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
+	// onto the startup thread. The thread's mount namespace will be
+	// unshared and modified. The contents of the /proc/[pid]/mountinfo file
+	// will then describe the mount tree of the unshared namespace, not the
+	// namespace of any other thread. It will remain this way until the
+	// process exits. (The startup thread is special in another way: exiting
+	// it puts the process into a "non-waitable zombie" state. To avoid this
+	// fate, the Go runtime parks the thread instead of exiting if a
+	// goroutine returns while locked to the startup thread. More
+	// information can be found in the Go runtime sources:
+	// `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
+	// package reads from /proc/self/mountinfo, so will read the mount tree
+	// for the wrong namespace if the startup thread has had its mount
+	// namespace unshared! The /proc/thread-self/ directory, introduced in
+	// Linux 3.17, is one potential solution to this problem, but every
+	// package which opens files in /proc/self/ would need to be updated,
+	// and fallbacks to /proc/self/task/[tid]/ would be required to support
+	// older kernels. Overlooking any reference to /proc/self/ would
+	// manifest as stochastically-reproducible bugs, so this is far from an
+	// ideal solution.
+	//
+	// Reading from /proc/self/ would not be a problem if we could prevent
+	// the per-thread state of the startup thread from being modified
+	// nondeterministically in the first place. We can accomplish this
+	// simply by locking the main() function to the startup thread! Doing so
+	// excludes any other goroutine from being scheduled on the thread.
+	runtime.LockOSThread()
+}
+
+// reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
+// reversed using setns(2). The values are the basenames of the corresponding
+// /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
+// state.
+var reversibleSetnsFlags = map[int]string{ // Go treats a flag set as reversible only if every set bit is a key here.
+	unix.CLONE_NEWCGROUP: "cgroup",
+	unix.CLONE_NEWNET:    "net",
+	unix.CLONE_NEWUTS:    "uts",
+	unix.CLONE_NEWPID:    "pid",
+	unix.CLONE_NEWTIME:   "time",
+
+	// The following CLONE_NEW* flags are not included because they imply
+	// another, irreversible flag when used with unshare(2).
+	//  - unix.CLONE_NEWIPC:  implies CLONE_SYSVMEM
+	//  - unix.CLONE_NEWNS:   implies CLONE_FS
+	//  - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
+}
+
+// Go calls the given functions in a new goroutine, locked to an OS thread,
+// which has had the parts of its execution state disassociated from the rest of
+// the current process using [unshare(2)]. It blocks until the new goroutine has
+// started and setupfn has returned. fn is only called if setupfn returns nil. A
+// nil setupfn or fn is equivalent to passing a no-op function.
+//
+// The disassociated execution state and any changes made to it are only visible
+// to the goroutine which the functions are called in. Any other goroutines,
+// including ones started from the function, will see the same execution state
+// as the rest of the process.
+//
+// The acceptable flags are documented in the [unshare(2)] Linux man-page.
+// The corresponding CLONE_* constants are defined in package [unix].
+//
+// # Warning
+//
+// This function may terminate the thread which the new goroutine executed on
+// after fn returns, which could cause subprocesses started with the
+// [syscall.SysProcAttr] Pdeathsig field set to be signaled before process
+// termination. Any subprocess started before this function is called may be
+// affected, in addition to any subprocesses started inside setupfn or fn.
+// There are more details at https://go.dev/issue/27505.
+//
+// [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
+func Go(flags int, setupfn func() error, fn func()) error {
+	started := make(chan error) // carries a setup error to the caller; closed (receive yields nil) on success
+
+	maskedFlags := flags
+	for f := range reversibleSetnsFlags {
+		maskedFlags &^= f
+	}
+	isReversible := maskedFlags == 0 // true only if every requested flag is a key of reversibleSetnsFlags
+
+	go func() {
+		// Prepare to manipulate per-thread kernel state.
+		runtime.LockOSThread()
+
+		// Not all changes to the execution state can be reverted.
+		// If an irreversible change to the execution state is made, our
+		// only recourse is to have the tampered thread terminated by
+		// returning from this function while the goroutine remains
+		// wired to the thread. The Go runtime will terminate the thread
+		// and replace it with a fresh one as needed.
+
+		if isReversible {
+			defer func() { // registered first, so it runs last: after every setns restore deferred below
+				if isReversible {
+					// All execution state has been restored without error.
+					// The thread is once again fungible.
+					runtime.UnlockOSThread()
+				}
+			}()
+			tid := unix.Gettid()
+			for f, ns := range reversibleSetnsFlags {
+				if flags&f != f {
+					continue
+				}
+				// The /proc/thread-self directory was added in Linux 3.17.
+				// We are not using it to maximize compatibility.
+				pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns)
+				fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0)
+				if err != nil {
+					started <- &os.PathError{Op: "open", Path: pth, Err: err}
+					return
+				}
+				defer func() { // runs when the goroutine function returns, not at the end of this loop iteration
+					if isReversible {
+						if err := unix.Setns(fd, 0); err != nil {
+							isReversible = false // restore failed: skip remaining restores and keep the thread locked
+						}
+					}
+					_ = unix.Close(fd)
+				}()
+			}
+		}
+
+		// Threads are implemented under Linux as processes which share
+		// a virtual memory space. Therefore in a multithreaded process
+		// unshare(2) disassociates parts of the calling thread's
+		// context from the thread it was clone(2)'d from.
+		if err := unix.Unshare(flags); err != nil {
+			started <- os.NewSyscallError("unshare", err)
+			return
+		}
+
+		if setupfn != nil {
+			if err := setupfn(); err != nil {
+				started <- err
+				return
+			}
+		}
+		close(started) // unblocks the caller with a nil error; fn then runs without the caller waiting on it
+
+		if fn != nil {
+			fn()
+		}
+	}()
+
+	return <-started // nil once started is closed, otherwise the error sent during setup
+}

+ 2 - 2
pkg/chrootarchive/archive_unix.go

@@ -19,7 +19,7 @@ func invokeUnpack(decompressedArchive io.Reader, dest string, options *archive.T
 	}
 	}
 
 
 	done := make(chan error)
 	done := make(chan error)
-	err = Go(root, func() { done <- archive.Unpack(decompressedArchive, relDest, options) })
+	err = goInChroot(root, func() { done <- archive.Unpack(decompressedArchive, relDest, options) })
 	if err != nil {
 	if err != nil {
 		return err
 		return err
 	}
 	}
@@ -41,7 +41,7 @@ func invokePack(srcPath string, options *archive.TarOptions, root string) (io.Re
 	if err != nil {
 	if err != nil {
 		return nil, errors.Wrap(err, "error processing tar file")
 		return nil, errors.Wrap(err, "error processing tar file")
 	}
 	}
-	err = Go(root, tb.Do)
+	err = goInChroot(root, tb.Do)
 	if err != nil {
 	if err != nil {
 		return nil, errors.Wrap(err, "could not chroot")
 		return nil, errors.Wrap(err, "could not chroot")
 	}
 	}

+ 0 - 5
pkg/chrootarchive/archive_windows.go

@@ -7,11 +7,6 @@ import (
 	"github.com/docker/docker/pkg/longpath"
 	"github.com/docker/docker/pkg/longpath"
 )
 )
 
 
-// chroot is not supported by Windows
-func chroot(path string) error {
-	return nil
-}
-
 func invokeUnpack(decompressedArchive io.ReadCloser,
 func invokeUnpack(decompressedArchive io.ReadCloser,
 	dest string,
 	dest string,
 	options *archive.TarOptions, root string) error {
 	options *archive.TarOptions, root string) error {

+ 24 - 98
pkg/chrootarchive/chroot_linux.go

@@ -1,108 +1,34 @@
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
 
 
 import (
 import (
-	"fmt"
-	"os"
-	"path/filepath"
-
+	"github.com/docker/docker/internal/mounttree"
+	"github.com/docker/docker/internal/unshare"
 	"github.com/moby/sys/mount"
 	"github.com/moby/sys/mount"
-	"github.com/moby/sys/mountinfo"
 	"golang.org/x/sys/unix"
 	"golang.org/x/sys/unix"
 )
 )
 
 
-// chroot on linux uses pivot_root instead of chroot
-// pivot_root takes a new root and an old root.
-// Old root must be a sub-dir of new root, it is where the current rootfs will reside after the call to pivot_root.
-// New root is where the new rootfs is set to.
-// Old root is removed after the call to pivot_root so it is no longer available under the new root.
-// This is similar to how libcontainer sets up a container's rootfs
-func chroot(path string) (err error) {
-	if err := unix.Unshare(unix.CLONE_NEWNS); err != nil {
-		return fmt.Errorf("Error creating mount namespace before pivot: %v", err)
-	}
-
-	// Make everything in new ns slave.
-	// Don't use `private` here as this could race where the mountns gets a
-	//   reference to a mount and an unmount from the host does not propagate,
-	//   which could potentially cause transient errors for other operations,
-	//   even though this should be relatively small window here `slave` should
-	//   not cause any problems.
-	if err := mount.MakeRSlave("/"); err != nil {
-		return err
-	}
-
-	if mounted, _ := mountinfo.Mounted(path); !mounted {
-		if err := mount.Mount(path, path, "bind", "rbind,rw"); err != nil {
-			return realChroot(path)
-		}
-	}
-
-	// setup oldRoot for pivot_root
-	pivotDir, err := os.MkdirTemp(path, ".pivot_root")
-	if err != nil {
-		return fmt.Errorf("Error setting up pivot dir: %v", err)
-	}
-
-	var mounted bool
-	defer func() {
-		if mounted {
-			// make sure pivotDir is not mounted before we try to remove it
-			if errCleanup := unix.Unmount(pivotDir, unix.MNT_DETACH); errCleanup != nil {
-				if err == nil {
-					err = errCleanup
-				}
-				return
-			}
-		}
-
-		errCleanup := os.Remove(pivotDir)
-		// pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful
-		// because we already cleaned it up on failed pivot_root
-		if errCleanup != nil && !os.IsNotExist(errCleanup) {
-			errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup)
-			if err == nil {
-				err = errCleanup
+// goInChroot starts fn in a goroutine where the root directory, current working
+// directory and umask are unshared from other goroutines and the root directory
+// has been changed to path. These changes are only visible to the goroutine in
+// which fn is executed. Any other goroutines, including ones started from fn,
+// will see the same root directory and file system attributes as the rest of
+// the process.
+func goInChroot(path string, fn func()) error {
+	return unshare.Go(
+		unix.CLONE_FS|unix.CLONE_NEWNS,
+		func() error {
+			// Make everything in new ns slave.
+			// Don't use `private` here as this could race where the mountns gets a
+			//   reference to a mount and an unmount from the host does not propagate,
+			//   which could potentially cause transient errors for other operations,
+			//   even though this should be relatively small window here `slave` should
+			//   not cause any problems.
+			if err := mount.MakeRSlave("/"); err != nil {
+				return err
 			}
 			}
-		}
-	}()
-
-	if err := unix.PivotRoot(path, pivotDir); err != nil {
-		// If pivot fails, fall back to the normal chroot after cleaning up temp dir
-		if err := os.Remove(pivotDir); err != nil {
-			return fmt.Errorf("Error cleaning up after failed pivot: %v", err)
-		}
-		return realChroot(path)
-	}
-	mounted = true
-
-	// This is the new path for where the old root (prior to the pivot) has been moved to
-	// This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction
-	pivotDir = filepath.Join("/", filepath.Base(pivotDir))
-
-	if err := unix.Chdir("/"); err != nil {
-		return fmt.Errorf("Error changing to new root: %v", err)
-	}
-
-	// Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host
-	if err := unix.Mount("", pivotDir, "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil {
-		return fmt.Errorf("Error making old root private after pivot: %v", err)
-	}
-
-	// Now unmount the old root so it's no longer visible from the new root
-	if err := unix.Unmount(pivotDir, unix.MNT_DETACH); err != nil {
-		return fmt.Errorf("Error while unmounting old root after pivot: %v", err)
-	}
-	mounted = false
-
-	return nil
-}
 
 
-func realChroot(path string) error {
-	if err := unix.Chroot(path); err != nil {
-		return fmt.Errorf("Error after fallback to chroot: %v", err)
-	}
-	if err := unix.Chdir("/"); err != nil {
-		return fmt.Errorf("Error changing to new root after chroot: %v", err)
-	}
-	return nil
+			return mounttree.SwitchRoot(path)
+		},
+		fn,
+	)
 }
 }

+ 0 - 17
pkg/chrootarchive/chroot_unix.go

@@ -1,17 +0,0 @@
-//go:build !windows && !linux
-// +build !windows,!linux
-
-package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
-
-import "golang.org/x/sys/unix"
-
-func chroot(path string) error {
-	if err := unix.Chroot(path); err != nil {
-		return err
-	}
-	return unix.Chdir("/")
-}
-
-func realChroot(path string) error {
-	return chroot(path)
-}

+ 1 - 1
pkg/chrootarchive/diff_unix.go

@@ -42,7 +42,7 @@ func applyLayerHandler(dest string, layer io.Reader, options *archive.TarOptions
 	}
 	}
 
 
 	done := make(chan result)
 	done := make(chan result)
-	err = Go(dest, func() {
+	err = goInChroot(dest, func() {
 		// We need to be able to set any perms
 		// We need to be able to set any perms
 		_ = unix.Umask(0)
 		_ = unix.Umask(0)
 
 

+ 0 - 92
pkg/chrootarchive/go_linux.go

@@ -1,92 +0,0 @@
-//go:build go1.10
-// +build go1.10
-
-package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
-
-import (
-	"runtime"
-
-	"golang.org/x/sys/unix"
-)
-
-func init() {
-	// The startup thread of a process is special in a few different ways.
-	// Most pertinent to the discussion at hand, any per-thread kernel state
-	// reflected in the /proc/[pid]/ directory for a process is taken from
-	// the state of the startup thread. Same goes for /proc/self/; it shows
-	// the state of the current process' startup thread, no matter which
-	// thread the files are being opened from. For most programs this is a
-	// distinction without a difference as the kernel state, such as the
-	// mount namespace and current working directory, is shared among (and
-	// kept synchronized across) all threads of a process. But things start
-	// to break down once threads start unsharing and modifying parts of
-	// their kernel state.
-	//
-	// The Go runtime schedules goroutines to execute on the startup thread,
-	// same as any other. How this could be problematic is best illustrated
-	// with a concrete example. Consider what happens if a goroutine spawned
-	// from Go() gets scheduled onto the startup thread. The thread's mount
-	// namespace will be unshared and modified. The contents of the
-	// /proc/[pid]/mountinfo file will then describe the mount tree of the
-	// unshared namespace, not the namespace of any other thread. It will
-	// remain this way until the process exits. (The startup thread is
-	// special in another way: exiting it puts the process into a
-	// "non-waitable zombie" state. To avoid this fate, the Go runtime parks
-	// the thread instead of exiting if a goroutine returns while locked to
-	// the startup thread. More information can be found in the Go runtime
-	// sources: `go doc -u -src runtime.mexit`.)
-	// The github.com/moby/sys/mountinfo package reads from
-	// /proc/self/mountinfo, so will read the mount tree for the wrong
-	// namespace if the startup thread has had its mount namespace unshared!
-	// The /proc/thread-self/ magic symlink, introduced in Linux 3.17, is
-	// one potential solution to this problem, but every package which opens
-	// files in /proc/self/ would need to be updated, and fallbacks to
-	// /proc/self/task/{{syscall.Gettid()}}/ would be required to support
-	// older kernels. Overlooking any reference to /proc/self/ would
-	// manifest as stochastically-reproducible bugs, so this is far from an
-	// ideal solution.
-	//
-	// Reading from /proc/self/ would not be a problem if we can prevent the
-	// per-thread state of the startup thread from being modified
-	// nondeterministically in the first place. We can accomplish this
-	// simply by locking the main() function to the startup thread! Doing so
-	// excludes any other goroutine from being scheduled on the thread.
-	runtime.LockOSThread()
-}
-
-// Go starts fn in a goroutine where the root directory, current working
-// directory and umask are unshared from other goroutines and the root directory
-// has been changed to path. These changes are only visible to the goroutine in
-// which fn is executed. Any other goroutines, including ones started from fn,
-// will see the same root directory and file system attributes as the rest of
-// the process.
-func Go(path string, fn func()) error {
-	started := make(chan error)
-	go func() {
-		// Prepare to manipulate per-thread kernel state. Wire the
-		// goroutine to the OS thread so execution of other goroutines
-		// will not be scheduled on it. It is very important not to
-		// unwire the goroutine from the thread so that the thread exits
-		// with this goroutine and is not returned to the goroutine
-		// thread pool.
-		runtime.LockOSThread()
-
-		// Under Linux, threads are implemented as processes which share
-		// a virtual memory space. Therefore in a multithreaded process
-		// unshare(2) disassociates parts of the calling thread's
-		// context from the thread it was clone(2)'d from.
-		if err := unix.Unshare(unix.CLONE_FS); err != nil {
-			started <- err
-			return
-		}
-
-		if err := chroot(path); err != nil {
-			started <- err
-			return
-		}
-
-		close(started)
-		fn()
-	}()
-	return <-started
-}