Add reusable chroot and unshare utilities

Refactor pkg/chrootarchive in terms of those utilities.

Signed-off-by: Cory Snider <csnider@mirantis.com>
This commit is contained in:
Cory Snider 2022-10-07 18:16:49 -04:00
parent 317d3d10b8
commit 60ee6f739f
8 changed files with 297 additions and 215 deletions

View file

@ -0,0 +1,94 @@
package mounttree // import "github.com/docker/docker/internal/mounttree"
import (
"fmt"
"os"
"path/filepath"
"github.com/moby/sys/mount"
"github.com/moby/sys/mountinfo"
"golang.org/x/sys/unix"
)
// SwitchRoot changes path to be the root of the mount tree and changes the
// current working directory to the new root.
//
// This function bind-mounts onto path; it is the caller's responsibility to set
// the desired propagation mode of path's parent mount beforehand to prevent
// unwanted propagation into different mount namespaces.
//
// pivot_root is attempted first, using a temporary directory created inside
// path to receive the old root, which is then detached so that it is no longer
// visible from the new root. If the bind mount or the pivot_root call fails,
// the function falls back to a plain chroot followed by a chdir to "/".
//
// The result is named so that the deferred cleanup can report a failure to
// unmount or remove the temporary pivot directory when the function would
// otherwise have returned nil. An error from the main code path always takes
// precedence over a cleanup error.
func SwitchRoot(path string) (err error) {
	// The error from Mounted is deliberately discarded: when the mount state
	// cannot be determined, mounted is false and a bind mount is attempted.
	if mounted, _ := mountinfo.Mounted(path); !mounted {
		if err := mount.Mount(path, path, "bind", "rbind,rw"); err != nil {
			return realChroot(path)
		}
	}

	// setup oldRoot for pivot_root
	pivotDir, err := os.MkdirTemp(path, ".pivot_root")
	if err != nil {
		return fmt.Errorf("Error setting up pivot dir: %w", err)
	}

	// mounted tracks whether the old root is currently mounted on pivotDir,
	// i.e. whether the deferred cleanup has to unmount before removing it.
	var mounted bool
	defer func() {
		if mounted {
			// make sure pivotDir is not mounted before we try to remove it
			if errCleanup := unix.Unmount(pivotDir, unix.MNT_DETACH); errCleanup != nil {
				if err == nil {
					err = errCleanup
				}
				return
			}
		}

		errCleanup := os.Remove(pivotDir)
		// pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful
		// because we already cleaned it up on failed pivot_root
		if errCleanup != nil && !os.IsNotExist(errCleanup) {
			errCleanup = fmt.Errorf("Error cleaning up after pivot: %w", errCleanup)
			if err == nil {
				err = errCleanup
			}
		}
	}()

	if err := unix.PivotRoot(path, pivotDir); err != nil {
		// If pivot fails, fall back to the normal chroot after cleaning up temp dir
		if err := os.Remove(pivotDir); err != nil {
			return fmt.Errorf("Error cleaning up after failed pivot: %w", err)
		}
		return realChroot(path)
	}
	mounted = true

	// This is the new path for where the old root (prior to the pivot) has been moved to
	// This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction
	pivotDir = filepath.Join("/", filepath.Base(pivotDir))

	if err := unix.Chdir("/"); err != nil {
		return fmt.Errorf("Error changing to new root: %w", err)
	}

	// Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host
	if err := unix.Mount("", pivotDir, "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil {
		return fmt.Errorf("Error making old root private after pivot: %w", err)
	}

	// Now unmount the old root so it's no longer visible from the new root
	if err := unix.Unmount(pivotDir, unix.MNT_DETACH); err != nil {
		return fmt.Errorf("Error while unmounting old root after pivot: %w", err)
	}
	// The old root is gone; the deferred cleanup only has to remove the
	// now-empty pivot directory.
	mounted = false

	return nil
}
// realChroot is the fallback used when pivot_root is not possible: it makes
// path the root directory with chroot and then moves the working directory to
// the new "/". Either failure is returned with a message saying which step
// went wrong.
func realChroot(path string) error {
	chrootErr := unix.Chroot(path)
	if chrootErr != nil {
		return fmt.Errorf("Error after fallback to chroot: %v", chrootErr)
	}

	chdirErr := unix.Chdir("/")
	if chdirErr != nil {
		return fmt.Errorf("Error changing to new root after chroot: %v", chdirErr)
	}

	return nil
}

View file

@ -0,0 +1,176 @@
//go:build go1.10
// +build go1.10
package unshare // import "github.com/docker/docker/internal/unshare"
import (
"fmt"
"os"
"runtime"
"golang.org/x/sys/unix"
)
// init locks the goroutine that runs package initialization (and, as the
// comment below explains, main()) to the process's startup thread, so that no
// goroutine started by Go can ever be scheduled onto that thread.
func init() {
// The startup thread of a process is special in a few different ways.
// Most pertinent to the discussion at hand, any per-thread kernel state
// reflected in the /proc/[pid]/ directory for a process is taken from
// the state of the startup thread. Same goes for /proc/self/; it shows
// the state of the current process' startup thread, no matter which
// thread the files are being opened from. For most programs this is a
// distinction without a difference as the kernel state, such as the
// mount namespace and current working directory, is shared among (and
// kept synchronized across) all threads of a process. But things start
// to break down once threads start unsharing and modifying parts of
// their kernel state.
//
// The Go runtime schedules goroutines to execute on the startup thread,
// same as any other. How this could be problematic is best illustrated
// with a concrete example. Consider what happens if a call to
// Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
// onto the startup thread. The thread's mount namespace will be
// unshared and modified. The contents of the /proc/[pid]/mountinfo file
// will then describe the mount tree of the unshared namespace, not the
// namespace of any other thread. It will remain this way until the
// process exits. (The startup thread is special in another way: exiting
// it puts the process into a "non-waitable zombie" state. To avoid this
// fate, the Go runtime parks the thread instead of exiting if a
// goroutine returns while locked to the startup thread. More
// information can be found in the Go runtime sources:
// `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
// package reads from /proc/self/mountinfo, so will read the mount tree
// for the wrong namespace if the startup thread has had its mount
// namespace unshared! The /proc/thread-self/ directory, introduced in
// Linux 3.17, is one potential solution to this problem, but every
// package which opens files in /proc/self/ would need to be updated,
// and fallbacks to /proc/self/task/[tid]/ would be required to support
// older kernels. Overlooking any reference to /proc/self/ would
// manifest as stochastically-reproducible bugs, so this is far from an
// ideal solution.
//
// Reading from /proc/self/ would not be a problem if we could prevent
// the per-thread state of the startup thread from being modified
// nondeterministically in the first place. We can accomplish this
// simply by locking the main() function to the startup thread! Doing so
// excludes any other goroutine from being scheduled on the thread.
runtime.LockOSThread()
}
// reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
// reversed using setns(2). The values are the basenames of the corresponding
// /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
// state.
//
// Go treats a flags argument as reversible only when every bit set in it is a
// key of this map.
var reversibleSetnsFlags = map[int]string{
unix.CLONE_NEWCGROUP: "cgroup",
unix.CLONE_NEWNET: "net",
unix.CLONE_NEWUTS: "uts",
unix.CLONE_NEWPID: "pid",
unix.CLONE_NEWTIME: "time",
// The following CLONE_NEW* flags are not included because they imply
// another, irreversible flag when used with unshare(2).
// - unix.CLONE_NEWIPC: implies CLONE_SYSVSEM
// - unix.CLONE_NEWNS: implies CLONE_FS
// - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
}
// Go runs setupfn and then fn on a newly started goroutine. The goroutine is
// locked to its OS thread, and parts of that thread's execution state are
// disassociated from the rest of the process with [unshare(2)] before setupfn
// is called. Go does not return until the goroutine is running and setupfn has
// completed. It returns the error which prevented the thread from being
// prepared, if any, and setupfn's result otherwise. fn is invoked only when
// setupfn returned nil. Passing nil for setupfn or fn behaves the same as
// passing a function that does nothing.
//
// Only the goroutine that executes setupfn and fn observes the disassociated
// state and whatever modifications are made to it. Every other goroutine, even
// one launched from within setupfn or fn, keeps seeing the execution state
// shared by the rest of the process.
//
// See the [unshare(2)] Linux man-page for the flags that may be passed; the
// matching CLONE_* constants live in package [unix].
//
// # Warning
//
// Once fn returns, the thread that hosted the goroutine may be terminated.
// Subprocesses whose [syscall.SysProcAttr] Pdeathsig field is set can
// therefore receive their signal before the process itself exits. This applies
// to subprocesses started before Go was called as well as to those started from
// setupfn or fn. See https://go.dev/issue/27505 for details.
//
// [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
func Go(flags int, setupfn func() error, fn func()) error {
	// Clear every bit that setns(2) is able to undo. Whatever survives is a
	// change which cannot be rolled back.
	leftover := flags
	for bit := range reversibleSetnsFlags {
		leftover &^= bit
	}
	restorable := leftover == 0

	ready := make(chan error)
	go func() {
		// Per-thread kernel state is about to be modified, so pin the
		// goroutine to its thread.
		runtime.LockOSThread()

		// A thread whose state cannot be put back must never be reused.
		// Returning while still locked makes the Go runtime terminate the
		// thread and start a fresh one when needed. Only when every
		// requested change is reversible is the restore machinery armed.
		if restorable {
			// Registered first, so it runs last: by then every namespace
			// restore below has had its chance to clear restorable.
			defer func() {
				if !restorable {
					return
				}
				// Everything was restored successfully; the thread can go
				// back into the pool.
				runtime.UnlockOSThread()
			}()

			threadID := unix.Gettid()
			for bit, nsName := range reversibleSetnsFlags {
				if flags&bit == bit {
					// /proc/thread-self only exists since Linux 3.17, so the
					// task-specific path is used for wider compatibility.
					nsPath := fmt.Sprintf("/proc/self/task/%d/ns/%s", threadID, nsName)
					nsFD, openErr := unix.Open(nsPath, unix.O_RDONLY|unix.O_CLOEXEC, 0)
					if openErr != nil {
						ready <- &os.PathError{Op: "open", Path: nsPath, Err: openErr}
						return
					}
					// Rejoin the saved namespace on the way out, unless an
					// earlier restore has already failed. The descriptor is
					// closed regardless.
					defer func() {
						if restorable && unix.Setns(nsFD, 0) != nil {
							restorable = false
						}
						_ = unix.Close(nsFD)
					}()
				}
			}
		}

		// Linux threads are processes sharing a virtual memory space, so in
		// a multithreaded process unshare(2) separates parts of the calling
		// thread's context from the thread it was clone(2)'d from.
		unshareErr := unix.Unshare(flags)
		if unshareErr != nil {
			ready <- os.NewSyscallError("unshare", unshareErr)
			return
		}

		if setupfn != nil {
			setupErr := setupfn()
			if setupErr != nil {
				ready <- setupErr
				return
			}
		}

		// Closing the channel makes the caller's receive yield nil.
		close(ready)

		if fn == nil {
			return
		}
		fn()
	}()

	return <-ready
}

View file

@ -19,7 +19,7 @@ func invokeUnpack(decompressedArchive io.Reader, dest string, options *archive.T
}
done := make(chan error)
err = Go(root, func() { done <- archive.Unpack(decompressedArchive, relDest, options) })
err = goInChroot(root, func() { done <- archive.Unpack(decompressedArchive, relDest, options) })
if err != nil {
return err
}
@ -41,7 +41,7 @@ func invokePack(srcPath string, options *archive.TarOptions, root string) (io.Re
if err != nil {
return nil, errors.Wrap(err, "error processing tar file")
}
err = Go(root, tb.Do)
err = goInChroot(root, tb.Do)
if err != nil {
return nil, errors.Wrap(err, "could not chroot")
}

View file

@ -7,11 +7,6 @@ import (
"github.com/docker/docker/pkg/longpath"
)
// chroot is not supported by Windows, so this implementation ignores path,
// does nothing and always returns nil.
func chroot(path string) error {
return nil
}
func invokeUnpack(decompressedArchive io.ReadCloser,
dest string,
options *archive.TarOptions, root string) error {

View file

@ -1,108 +1,34 @@
package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
import (
"fmt"
"os"
"path/filepath"
"github.com/docker/docker/internal/mounttree"
"github.com/docker/docker/internal/unshare"
"github.com/moby/sys/mount"
"github.com/moby/sys/mountinfo"
"golang.org/x/sys/unix"
)
// chroot on linux uses pivot_root instead of chroot
// pivot_root takes a new root and an old root.
// Old root must be a sub-dir of new root, it is where the current rootfs will reside after the call to pivot_root.
// New root is where the new rootfs is set to.
// Old root is removed after the call to pivot_root so it is no longer available under the new root.
// This is similar to how libcontainer sets up a container's rootfs
func chroot(path string) (err error) {
if err := unix.Unshare(unix.CLONE_NEWNS); err != nil {
return fmt.Errorf("Error creating mount namespace before pivot: %v", err)
}
// Make everything in new ns slave.
// Don't use `private` here as this could race where the mountns gets a
// reference to a mount and an unmount from the host does not propagate,
// which could potentially cause transient errors for other operations,
// even though this should be relatively small window here `slave` should
// not cause any problems.
if err := mount.MakeRSlave("/"); err != nil {
return err
}
if mounted, _ := mountinfo.Mounted(path); !mounted {
if err := mount.Mount(path, path, "bind", "rbind,rw"); err != nil {
return realChroot(path)
}
}
// setup oldRoot for pivot_root
pivotDir, err := os.MkdirTemp(path, ".pivot_root")
if err != nil {
return fmt.Errorf("Error setting up pivot dir: %v", err)
}
var mounted bool
defer func() {
if mounted {
// make sure pivotDir is not mounted before we try to remove it
if errCleanup := unix.Unmount(pivotDir, unix.MNT_DETACH); errCleanup != nil {
if err == nil {
err = errCleanup
}
return
// goInChroot starts fn in a goroutine where the root directory, current working
// directory and umask are unshared from other goroutines and the root directory
// has been changed to path. These changes are only visible to the goroutine in
// which fn is executed. Any other goroutines, including ones started from fn,
// will see the same root directory and file system attributes as the rest of
// the process.
func goInChroot(path string, fn func()) error {
return unshare.Go(
unix.CLONE_FS|unix.CLONE_NEWNS,
func() error {
// Make everything in new ns slave.
// Don't use `private` here as this could race where the mountns gets a
// reference to a mount and an unmount from the host does not propagate,
// which could potentially cause transient errors for other operations,
// even though this should be relatively small window here `slave` should
// not cause any problems.
if err := mount.MakeRSlave("/"); err != nil {
return err
}
}
errCleanup := os.Remove(pivotDir)
// pivotDir doesn't exist if pivot_root failed and chroot+chdir was successful
// because we already cleaned it up on failed pivot_root
if errCleanup != nil && !os.IsNotExist(errCleanup) {
errCleanup = fmt.Errorf("Error cleaning up after pivot: %v", errCleanup)
if err == nil {
err = errCleanup
}
}
}()
if err := unix.PivotRoot(path, pivotDir); err != nil {
// If pivot fails, fall back to the normal chroot after cleaning up temp dir
if err := os.Remove(pivotDir); err != nil {
return fmt.Errorf("Error cleaning up after failed pivot: %v", err)
}
return realChroot(path)
}
mounted = true
// This is the new path for where the old root (prior to the pivot) has been moved to
// This dir contains the rootfs of the caller, which we need to remove so it is not visible during extraction
pivotDir = filepath.Join("/", filepath.Base(pivotDir))
if err := unix.Chdir("/"); err != nil {
return fmt.Errorf("Error changing to new root: %v", err)
}
// Make the pivotDir (where the old root lives) private so it can be unmounted without propagating to the host
if err := unix.Mount("", pivotDir, "", unix.MS_PRIVATE|unix.MS_REC, ""); err != nil {
return fmt.Errorf("Error making old root private after pivot: %v", err)
}
// Now unmount the old root so it's no longer visible from the new root
if err := unix.Unmount(pivotDir, unix.MNT_DETACH); err != nil {
return fmt.Errorf("Error while unmounting old root after pivot: %v", err)
}
mounted = false
return nil
}
func realChroot(path string) error {
if err := unix.Chroot(path); err != nil {
return fmt.Errorf("Error after fallback to chroot: %v", err)
}
if err := unix.Chdir("/"); err != nil {
return fmt.Errorf("Error changing to new root after chroot: %v", err)
}
return nil
return mounttree.SwitchRoot(path)
},
fn,
)
}

View file

@ -1,17 +0,0 @@
//go:build !windows && !linux
// +build !windows,!linux
package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
import "golang.org/x/sys/unix"
// chroot makes path the root directory and, if that succeeded, changes the
// working directory to the new "/". The first error encountered is returned
// unmodified.
func chroot(path string) error {
	err := unix.Chroot(path)
	if err == nil {
		err = unix.Chdir("/")
	}
	return err
}
// realChroot simply delegates to chroot and returns its result.
func realChroot(path string) error {
return chroot(path)
}

View file

@ -42,7 +42,7 @@ func applyLayerHandler(dest string, layer io.Reader, options *archive.TarOptions
}
done := make(chan result)
err = Go(dest, func() {
err = goInChroot(dest, func() {
// We need to be able to set any perms
_ = unix.Umask(0)

View file

@ -1,92 +0,0 @@
//go:build go1.10
// +build go1.10
package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
import (
"runtime"
"golang.org/x/sys/unix"
)
// init locks the goroutine that runs package initialization (and, as the
// comment below explains, main()) to the process's startup thread, so that no
// goroutine spawned from Go() can ever be scheduled onto that thread.
func init() {
// The startup thread of a process is special in a few different ways.
// Most pertinent to the discussion at hand, any per-thread kernel state
// reflected in the /proc/[pid]/ directory for a process is taken from
// the state of the startup thread. Same goes for /proc/self/; it shows
// the state of the current process' startup thread, no matter which
// thread the files are being opened from. For most programs this is a
// distinction without a difference as the kernel state, such as the
// mount namespace and current working directory, is shared among (and
// kept synchronized across) all threads of a process. But things start
// to break down once threads start unsharing and modifying parts of
// their kernel state.
//
// The Go runtime schedules goroutines to execute on the startup thread,
// same as any other. How this could be problematic is best illustrated
// with a concrete example. Consider what happens if a goroutine spawned
// from Go() gets scheduled onto the startup thread. The thread's mount
// namespace will be unshared and modified. The contents of the
// /proc/[pid]/mountinfo file will then describe the mount tree of the
// unshared namespace, not the namespace of any other thread. It will
// remain this way until the process exits. (The startup thread is
// special in another way: exiting it puts the process into a
// "non-waitable zombie" state. To avoid this fate, the Go runtime parks
// the thread instead of exiting if a goroutine returns while locked to
// the startup thread. More information can be found in the Go runtime
// sources: `go doc -u -src runtime.mexit`.)
// The github.com/moby/sys/mountinfo package reads from
// /proc/self/mountinfo, so will read the mount tree for the wrong
// namespace if the startup thread has had its mount namespace unshared!
// The /proc/thread-self/ magic symlink, introduced in Linux 3.17, is
// one potential solution to this problem, but every package which opens
// files in /proc/self/ would need to be updated, and fallbacks to
// /proc/self/task/{{syscall.Gettid()}}/ would be required to support
// older kernels. Overlooking any reference to /proc/self/ would
// manifest as stochastically-reproducible bugs, so this is far from an
// ideal solution.
//
// Reading from /proc/self/ would not be a problem if we could prevent
// the per-thread state of the startup thread from being modified
// nondeterministically in the first place. We can accomplish this
// simply by locking the main() function to the startup thread! Doing so
// excludes any other goroutine from being scheduled on the thread.
runtime.LockOSThread()
}
// Go launches fn on a new goroutine whose root directory, current working
// directory and umask have been unshared from the other goroutines, and whose
// root directory has been switched to path. Only the goroutine running fn
// observes these changes. All other goroutines, including any that fn itself
// starts, keep the root directory and file system attributes of the rest of
// the process.
//
// Go blocks until the goroutine has finished unsharing and changing root. It
// returns the error from whichever of those steps failed, in which case fn is
// not called, or nil once fn is about to run.
func Go(path string, fn func()) error {
	ready := make(chan error)

	go func() {
		// Per-thread kernel state is about to be changed, so wire the
		// goroutine to its OS thread to keep other goroutines off it. The
		// goroutine must never be unwired: the thread has to exit together
		// with the goroutine rather than be handed back to the thread pool.
		runtime.LockOSThread()

		// Linux implements threads as processes sharing a virtual memory
		// space, so in a multithreaded process unshare(2) separates parts
		// of the calling thread's context from the thread it was
		// clone(2)'d from.
		err := unix.Unshare(unix.CLONE_FS)
		if err == nil {
			err = chroot(path)
		}
		if err != nil {
			ready <- err
			return
		}

		// Closing the channel makes the caller's receive yield nil.
		close(ready)
		fn()
	}()

	return <-ready
}