123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175 |
- //go:build go1.10
- package unshare // import "github.com/docker/docker/internal/unshare"
- import (
- "fmt"
- "os"
- "runtime"
- "golang.org/x/sys/unix"
- )
- func init() {
- // The startup thread of a process is special in a few different ways.
- // Most pertinent to the discussion at hand, any per-thread kernel state
- // reflected in the /proc/[pid]/ directory for a process is taken from
- // the state of the startup thread. Same goes for /proc/self/; it shows
- // the state of the current process' startup thread, no matter which
- // thread the files are being opened from. For most programs this is a
- // distinction without a difference as the kernel state, such as the
- // mount namespace and current working directory, is shared among (and
- // kept synchronized across) all threads of a process. But things start
- // to break down once threads start unsharing and modifying parts of
- // their kernel state.
- //
- // The Go runtime schedules goroutines to execute on the startup thread,
- // same as any other. How this could be problematic is best illustrated
- // with a concrete example. Consider what happens if a call to
- // Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
- // onto the startup thread. The thread's mount namespace will be
- // unshared and modified. The contents of the /proc/[pid]/mountinfo file
- // will then describe the mount tree of the unshared namespace, not the
- // namespace of any other thread. It will remain this way until the
- // process exits. (The startup thread is special in another way: exiting
- // it puts the process into a "non-waitable zombie" state. To avoid this
- // fate, the Go runtime parks the thread instead of exiting if a
- // goroutine returns while locked to the startup thread. More
- // information can be found in the Go runtime sources:
- // `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
- // package reads from /proc/self/mountinfo, so will read the mount tree
- // for the wrong namespace if the startup thread has had its mount
- // namespace unshared! The /proc/thread-self/ directory, introduced in
- // Linux 3.17, is one potential solution to this problem, but every
- // package which opens files in /proc/self/ would need to be updated,
- // and fallbacks to /proc/self/task/[tid]/ would be required to support
- // older kernels. Overlooking any reference to /proc/self/ would
- // manifest as stochastically-reproducible bugs, so this is far from an
- // ideal solution.
- //
- // Reading from /proc/self/ would not be a problem if we could prevent
- // the per-thread state of the startup thread from being modified
- // nondeterministically in the first place. We can accomplish this
- // simply by locking the main() function to the startup thread! Doing so
- // excludes any other goroutine from being scheduled on the thread.
- runtime.LockOSThread()
- }
- // reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
- // reversed using setns(2). The values are the basenames of the corresponding
- // /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
- // state.
- var reversibleSetnsFlags = map[int]string{
- unix.CLONE_NEWCGROUP: "cgroup",
- unix.CLONE_NEWNET: "net",
- unix.CLONE_NEWUTS: "uts",
- unix.CLONE_NEWPID: "pid",
- unix.CLONE_NEWTIME: "time",
- // The following CLONE_NEW* flags are not included because they imply
- // another, irreversible flag when used with unshare(2).
- // - unix.CLONE_NEWIPC: implies CLONE_SYSVMEM
- // - unix.CLONE_NEWNS: implies CLONE_FS
- // - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
- }
- // Go calls the given functions in a new goroutine, locked to an OS thread,
- // which has had the parts of its execution state disassociated from the rest of
- // the current process using [unshare(2)]. It blocks until the new goroutine has
- // started and setupfn has returned. fn is only called if setupfn returns nil. A
- // nil setupfn or fn is equivalent to passing a no-op function.
- //
- // The disassociated execution state and any changes made to it are only visible
- // to the goroutine which the functions are called in. Any other goroutines,
- // including ones started from the function, will see the same execution state
- // as the rest of the process.
- //
- // The acceptable flags are documented in the [unshare(2)] Linux man-page.
- // The corresponding CLONE_* constants are defined in package [unix].
- //
- // # Warning
- //
- // This function may terminate the thread which the new goroutine executed on
- // after fn returns, which could cause subprocesses started with the
- // [syscall.SysProcAttr] Pdeathsig field set to be signaled before process
- // termination. Any subprocess started before this function is called may be
- // affected, in addition to any subprocesses started inside setupfn or fn.
- // There are more details at https://go.dev/issue/27505.
- //
- // [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
- func Go(flags int, setupfn func() error, fn func()) error {
- started := make(chan error)
- maskedFlags := flags
- for f := range reversibleSetnsFlags {
- maskedFlags &^= f
- }
- isReversible := maskedFlags == 0
- go func() {
- // Prepare to manipulate per-thread kernel state.
- runtime.LockOSThread()
- // Not all changes to the execution state can be reverted.
- // If an irreversible change to the execution state is made, our
- // only recourse is to have the tampered thread terminated by
- // returning from this function while the goroutine remains
- // wired to the thread. The Go runtime will terminate the thread
- // and replace it with a fresh one as needed.
- if isReversible {
- defer func() {
- if isReversible {
- // All execution state has been restored without error.
- // The thread is once again fungible.
- runtime.UnlockOSThread()
- }
- }()
- tid := unix.Gettid()
- for f, ns := range reversibleSetnsFlags {
- if flags&f != f {
- continue
- }
- // The /proc/thread-self directory was added in Linux 3.17.
- // We are not using it to maximize compatibility.
- pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns)
- fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0)
- if err != nil {
- started <- &os.PathError{Op: "open", Path: pth, Err: err}
- return
- }
- defer func() {
- if isReversible {
- if err := unix.Setns(fd, 0); err != nil {
- isReversible = false
- }
- }
- _ = unix.Close(fd)
- }()
- }
- }
- // Threads are implemented under Linux as processes which share
- // a virtual memory space. Therefore in a multithreaded process
- // unshare(2) disassociates parts of the calling thread's
- // context from the thread it was clone(2)'d from.
- if err := unix.Unshare(flags); err != nil {
- started <- os.NewSyscallError("unshare", err)
- return
- }
- if setupfn != nil {
- if err := setupfn(); err != nil {
- started <- err
- return
- }
- }
- close(started)
- if fn != nil {
- fn()
- }
- }()
- return <-started
- }
|