unshare_linux.go 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. //go:build go1.10
  2. package unshare // import "github.com/docker/docker/internal/unshare"
  3. import (
  4. "fmt"
  5. "os"
  6. "runtime"
  7. "golang.org/x/sys/unix"
  8. )
  9. func init() {
  10. // The startup thread of a process is special in a few different ways.
  11. // Most pertinent to the discussion at hand, any per-thread kernel state
  12. // reflected in the /proc/[pid]/ directory for a process is taken from
  13. // the state of the startup thread. Same goes for /proc/self/; it shows
  14. // the state of the current process' startup thread, no matter which
  15. // thread the files are being opened from. For most programs this is a
  16. // distinction without a difference as the kernel state, such as the
  17. // mount namespace and current working directory, is shared among (and
  18. // kept synchronized across) all threads of a process. But things start
  19. // to break down once threads start unsharing and modifying parts of
  20. // their kernel state.
  21. //
  22. // The Go runtime schedules goroutines to execute on the startup thread,
  23. // same as any other. How this could be problematic is best illustrated
  24. // with a concrete example. Consider what happens if a call to
  25. // Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
  26. // onto the startup thread. The thread's mount namespace will be
  27. // unshared and modified. The contents of the /proc/[pid]/mountinfo file
  28. // will then describe the mount tree of the unshared namespace, not the
  29. // namespace of any other thread. It will remain this way until the
  30. // process exits. (The startup thread is special in another way: exiting
  31. // it puts the process into a "non-waitable zombie" state. To avoid this
  32. // fate, the Go runtime parks the thread instead of exiting if a
  33. // goroutine returns while locked to the startup thread. More
  34. // information can be found in the Go runtime sources:
  35. // `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
  36. // package reads from /proc/self/mountinfo, so will read the mount tree
  37. // for the wrong namespace if the startup thread has had its mount
  38. // namespace unshared! The /proc/thread-self/ directory, introduced in
  39. // Linux 3.17, is one potential solution to this problem, but every
  40. // package which opens files in /proc/self/ would need to be updated,
  41. // and fallbacks to /proc/self/task/[tid]/ would be required to support
  42. // older kernels. Overlooking any reference to /proc/self/ would
  43. // manifest as stochastically-reproducible bugs, so this is far from an
  44. // ideal solution.
  45. //
  46. // Reading from /proc/self/ would not be a problem if we could prevent
  47. // the per-thread state of the startup thread from being modified
  48. // nondeterministically in the first place. We can accomplish this
  49. // simply by locking the main() function to the startup thread! Doing so
  50. // excludes any other goroutine from being scheduled on the thread.
  51. runtime.LockOSThread()
  52. }
  53. // reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
  54. // reversed using setns(2). The values are the basenames of the corresponding
  55. // /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
  56. // state.
  57. var reversibleSetnsFlags = map[int]string{
  58. unix.CLONE_NEWCGROUP: "cgroup",
  59. unix.CLONE_NEWNET: "net",
  60. unix.CLONE_NEWUTS: "uts",
  61. unix.CLONE_NEWPID: "pid",
  62. unix.CLONE_NEWTIME: "time",
  63. // The following CLONE_NEW* flags are not included because they imply
  64. // another, irreversible flag when used with unshare(2).
  65. // - unix.CLONE_NEWIPC: implies CLONE_SYSVMEM
  66. // - unix.CLONE_NEWNS: implies CLONE_FS
  67. // - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
  68. }
  69. // Go calls the given functions in a new goroutine, locked to an OS thread,
  70. // which has had the parts of its execution state disassociated from the rest of
  71. // the current process using [unshare(2)]. It blocks until the new goroutine has
  72. // started and setupfn has returned. fn is only called if setupfn returns nil. A
  73. // nil setupfn or fn is equivalent to passing a no-op function.
  74. //
  75. // The disassociated execution state and any changes made to it are only visible
  76. // to the goroutine which the functions are called in. Any other goroutines,
  77. // including ones started from the function, will see the same execution state
  78. // as the rest of the process.
  79. //
  80. // The acceptable flags are documented in the [unshare(2)] Linux man-page.
  81. // The corresponding CLONE_* constants are defined in package [unix].
  82. //
  83. // # Warning
  84. //
  85. // This function may terminate the thread which the new goroutine executed on
  86. // after fn returns, which could cause subprocesses started with the
  87. // [syscall.SysProcAttr] Pdeathsig field set to be signaled before process
  88. // termination. Any subprocess started before this function is called may be
  89. // affected, in addition to any subprocesses started inside setupfn or fn.
  90. // There are more details at https://go.dev/issue/27505.
  91. //
  92. // [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
  93. func Go(flags int, setupfn func() error, fn func()) error {
  94. started := make(chan error)
  95. maskedFlags := flags
  96. for f := range reversibleSetnsFlags {
  97. maskedFlags &^= f
  98. }
  99. isReversible := maskedFlags == 0
  100. go func() {
  101. // Prepare to manipulate per-thread kernel state.
  102. runtime.LockOSThread()
  103. // Not all changes to the execution state can be reverted.
  104. // If an irreversible change to the execution state is made, our
  105. // only recourse is to have the tampered thread terminated by
  106. // returning from this function while the goroutine remains
  107. // wired to the thread. The Go runtime will terminate the thread
  108. // and replace it with a fresh one as needed.
  109. if isReversible {
  110. defer func() {
  111. if isReversible {
  112. // All execution state has been restored without error.
  113. // The thread is once again fungible.
  114. runtime.UnlockOSThread()
  115. }
  116. }()
  117. tid := unix.Gettid()
  118. for f, ns := range reversibleSetnsFlags {
  119. if flags&f != f {
  120. continue
  121. }
  122. // The /proc/thread-self directory was added in Linux 3.17.
  123. // We are not using it to maximize compatibility.
  124. pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns)
  125. fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0)
  126. if err != nil {
  127. started <- &os.PathError{Op: "open", Path: pth, Err: err}
  128. return
  129. }
  130. defer func() {
  131. if isReversible {
  132. if err := unix.Setns(fd, 0); err != nil {
  133. isReversible = false
  134. }
  135. }
  136. _ = unix.Close(fd)
  137. }()
  138. }
  139. }
  140. // Threads are implemented under Linux as processes which share
  141. // a virtual memory space. Therefore in a multithreaded process
  142. // unshare(2) disassociates parts of the calling thread's
  143. // context from the thread it was clone(2)'d from.
  144. if err := unix.Unshare(flags); err != nil {
  145. started <- os.NewSyscallError("unshare", err)
  146. return
  147. }
  148. if setupfn != nil {
  149. if err := setupfn(); err != nil {
  150. started <- err
  151. return
  152. }
  153. }
  154. close(started)
  155. if fn != nil {
  156. fn()
  157. }
  158. }()
  159. return <-started
  160. }