containerfs_linux.go 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. package daemon // import "github.com/docker/docker/daemon"
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "os"
  7. "path/filepath"
  8. "runtime"
  9. "strings"
  10. "github.com/containerd/log"
  11. "github.com/hashicorp/go-multierror"
  12. "github.com/moby/sys/mount"
  13. "github.com/moby/sys/symlink"
  14. "golang.org/x/sys/unix"
  15. "github.com/docker/docker/api/types"
  16. "github.com/docker/docker/container"
  17. "github.com/docker/docker/internal/mounttree"
  18. "github.com/docker/docker/internal/unshare"
  19. "github.com/docker/docker/pkg/fileutils"
  20. )
  21. type future struct {
  22. fn func() error
  23. res chan<- error
  24. }
  25. // containerFSView allows functions to be run in the context of a container's
  26. // filesystem. Inside these functions, the root directory is the container root
  27. // for all native OS filesystem APIs, including, but not limited to, the [os]
  28. // and [golang.org/x/sys/unix] packages. The view of the container's filesystem
  29. // is live and read-write. Each view has its own private set of tmpfs mounts.
  30. // Any files written under a tmpfs mount are not visible to processes inside the
  31. // container nor any other view of the container's filesystem, and vice versa.
  32. //
  33. // Each view has its own current working directory which is initialized to the
  34. // root of the container filesystem and can be changed with [os.Chdir]. Changes
  35. // to the current directory persist across successive [*containerFSView.RunInFS]
  36. // and [*containerFSView.GoInFS] calls.
  37. //
  38. // Multiple views of the same container filesystem can coexist at the same time.
  39. // Only one function can be running in a particular filesystem view at any given
  40. // time. Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will
  41. // block while another function is running. If more than one call is blocked
  42. // concurrently, the order they are unblocked is undefined.
  43. type containerFSView struct {
  44. d *Daemon
  45. ctr *container.Container
  46. todo chan future
  47. done chan error
  48. }
  49. // openContainerFS opens a new view of the container's filesystem.
  50. func (daemon *Daemon) openContainerFS(container *container.Container) (_ *containerFSView, err error) {
  51. if err := daemon.Mount(container); err != nil {
  52. return nil, err
  53. }
  54. defer func() {
  55. if err != nil {
  56. _ = daemon.Unmount(container)
  57. }
  58. }()
  59. mounts, err := daemon.setupMounts(container)
  60. if err != nil {
  61. return nil, err
  62. }
  63. defer func() {
  64. if err != nil {
  65. _ = container.UnmountVolumes(daemon.LogVolumeEvent)
  66. }
  67. }()
  68. // Setup in initial mount namespace complete. We're ready to unshare the
  69. // mount namespace and bind the volume mounts into that private view of
  70. // the container FS.
  71. todo := make(chan future)
  72. done := make(chan error)
  73. err = unshare.Go(unix.CLONE_NEWNS,
  74. func() error {
  75. if err := mount.MakeRSlave("/"); err != nil {
  76. return err
  77. }
  78. for _, m := range mounts {
  79. dest, err := container.GetResourcePath(m.Destination)
  80. if err != nil {
  81. return err
  82. }
  83. var stat os.FileInfo
  84. stat, err = os.Stat(m.Source)
  85. if err != nil {
  86. return err
  87. }
  88. if err := fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil {
  89. return err
  90. }
  91. bindMode := "rbind"
  92. if m.NonRecursive {
  93. bindMode = "bind"
  94. }
  95. writeMode := "ro"
  96. if m.Writable {
  97. writeMode = "rw"
  98. if m.ReadOnlyNonRecursive {
  99. return errors.New("options conflict: Writable && ReadOnlyNonRecursive")
  100. }
  101. if m.ReadOnlyForceRecursive {
  102. return errors.New("options conflict: Writable && ReadOnlyForceRecursive")
  103. }
  104. }
  105. if m.ReadOnlyNonRecursive && m.ReadOnlyForceRecursive {
  106. return errors.New("options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
  107. }
  108. // openContainerFS() is called for temporary mounts
  109. // outside the container. Soon these will be unmounted
  110. // with lazy unmount option and given we have mounted
  111. // them rbind, all the submounts will propagate if these
  112. // are shared. If daemon is running in host namespace
  113. // and has / as shared then these unmounts will
  114. // propagate and unmount original mount as well. So make
  115. // all these mounts rprivate. Do not use propagation
  116. // property of volume as that should apply only when
  117. // mounting happens inside the container.
  118. opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",")
  119. if err := mount.Mount(m.Source, dest, "", opts); err != nil {
  120. return err
  121. }
  122. if !m.Writable && !m.ReadOnlyNonRecursive {
  123. if err := makeMountRRO(dest); err != nil {
  124. if m.ReadOnlyForceRecursive {
  125. return err
  126. } else {
  127. log.G(context.TODO()).WithError(err).Debugf("Failed to make %q recursively read-only", dest)
  128. }
  129. }
  130. }
  131. }
  132. return mounttree.SwitchRoot(container.BaseFS)
  133. },
  134. func() {
  135. defer close(done)
  136. for it := range todo {
  137. err := it.fn()
  138. if it.res != nil {
  139. it.res <- err
  140. }
  141. }
  142. // The thread will terminate when this goroutine returns, taking the
  143. // mount namespace and all the volume bind-mounts with it.
  144. },
  145. )
  146. if err != nil {
  147. return nil, err
  148. }
  149. vw := &containerFSView{
  150. d: daemon,
  151. ctr: container,
  152. todo: todo,
  153. done: done,
  154. }
  155. runtime.SetFinalizer(vw, (*containerFSView).Close)
  156. return vw, nil
  157. }
  158. // RunInFS synchronously runs fn in the context of the container filesytem and
  159. // passes through its return value.
  160. //
  161. // The container filesystem is only visible to functions called in the same
  162. // goroutine as fn. Goroutines started from fn will see the host's filesystem.
  163. func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error {
  164. res := make(chan error)
  165. select {
  166. case vw.todo <- future{fn: fn, res: res}:
  167. case <-ctx.Done():
  168. return ctx.Err()
  169. }
  170. return <-res
  171. }
  172. // GoInFS starts fn in the container FS. It blocks until fn is started but does
  173. // not wait until fn returns. An error is returned if ctx is canceled before fn
  174. // has been started.
  175. //
  176. // The container filesystem is only visible to functions called in the same
  177. // goroutine as fn. Goroutines started from fn will see the host's filesystem.
  178. func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error {
  179. select {
  180. case vw.todo <- future{fn: func() error { fn(); return nil }}:
  181. return nil
  182. case <-ctx.Done():
  183. return ctx.Err()
  184. }
  185. }
  186. // Close waits until any in-flight operations complete and frees all
  187. // resources associated with vw.
  188. func (vw *containerFSView) Close() error {
  189. runtime.SetFinalizer(vw, nil)
  190. close(vw.todo)
  191. err := multierror.Append(nil, <-vw.done)
  192. err = multierror.Append(err, vw.ctr.UnmountVolumes(vw.d.LogVolumeEvent))
  193. err = multierror.Append(err, vw.d.Unmount(vw.ctr))
  194. return err.ErrorOrNil()
  195. }
  196. // Stat returns the metadata for path, relative to the current working directory
  197. // of vw inside the container filesystem view.
  198. func (vw *containerFSView) Stat(ctx context.Context, path string) (*types.ContainerPathStat, error) {
  199. var stat *types.ContainerPathStat
  200. err := vw.RunInFS(ctx, func() error {
  201. lstat, err := os.Lstat(path)
  202. if err != nil {
  203. return err
  204. }
  205. var target string
  206. if lstat.Mode()&os.ModeSymlink != 0 {
  207. // Fully evaluate symlinks along path to the ultimate
  208. // target, or as much as possible with broken links.
  209. target, err = symlink.FollowSymlinkInScope(path, "/")
  210. if err != nil {
  211. return err
  212. }
  213. }
  214. stat = &types.ContainerPathStat{
  215. Name: filepath.Base(path),
  216. Size: lstat.Size(),
  217. Mode: lstat.Mode(),
  218. Mtime: lstat.ModTime(),
  219. LinkTarget: target,
  220. }
  221. return nil
  222. })
  223. return stat, err
  224. }
  225. // makeMountRRO makes the mount recursively read-only.
  226. func makeMountRRO(dest string) error {
  227. attr := &unix.MountAttr{
  228. Attr_set: unix.MOUNT_ATTR_RDONLY,
  229. }
  230. var err error
  231. for {
  232. err = unix.MountSetattr(-1, dest, unix.AT_RECURSIVE, attr)
  233. if !errors.Is(err, unix.EINTR) {
  234. break
  235. }
  236. }
  237. if err != nil {
  238. err = fmt.Errorf("failed to apply MOUNT_ATTR_RDONLY with AT_RECURSIVE to %q: %w", dest, err)
  239. }
  240. return err
  241. }