package daemon // import "github.com/docker/docker/daemon" import ( "context" "errors" "fmt" "os" "path/filepath" "runtime" "strings" "github.com/containerd/log" "github.com/hashicorp/go-multierror" "github.com/moby/sys/mount" "github.com/moby/sys/symlink" "golang.org/x/sys/unix" "github.com/docker/docker/api/types" "github.com/docker/docker/container" "github.com/docker/docker/internal/mounttree" "github.com/docker/docker/internal/unshare" "github.com/docker/docker/pkg/fileutils" ) type future struct { fn func() error res chan<- error } // containerFSView allows functions to be run in the context of a container's // filesystem. Inside these functions, the root directory is the container root // for all native OS filesystem APIs, including, but not limited to, the [os] // and [golang.org/x/sys/unix] packages. The view of the container's filesystem // is live and read-write. Each view has its own private set of tmpfs mounts. // Any files written under a tmpfs mount are not visible to processes inside the // container nor any other view of the container's filesystem, and vice versa. // // Each view has its own current working directory which is initialized to the // root of the container filesystem and can be changed with [os.Chdir]. Changes // to the current directory persist across successive [*containerFSView.RunInFS] // and [*containerFSView.GoInFS] calls. // // Multiple views of the same container filesystem can coexist at the same time. // Only one function can be running in a particular filesystem view at any given // time. Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will // block while another function is running. If more than one call is blocked // concurrently, the order they are unblocked is undefined. type containerFSView struct { d *Daemon ctr *container.Container todo chan future done chan error } // openContainerFS opens a new view of the container's filesystem. func (daemon *Daemon) openContainerFS(container *container.Container) (_ *containerFSView, err error) { if err := daemon.Mount(container); err != nil { return nil, err } defer func() { if err != nil { _ = daemon.Unmount(container) } }() mounts, err := daemon.setupMounts(container) if err != nil { return nil, err } defer func() { if err != nil { _ = container.UnmountVolumes(daemon.LogVolumeEvent) } }() // Setup in initial mount namespace complete. We're ready to unshare the // mount namespace and bind the volume mounts into that private view of // the container FS. todo := make(chan future) done := make(chan error) err = unshare.Go(unix.CLONE_NEWNS, func() error { if err := mount.MakeRSlave("/"); err != nil { return err } for _, m := range mounts { dest, err := container.GetResourcePath(m.Destination) if err != nil { return err } var stat os.FileInfo stat, err = os.Stat(m.Source) if err != nil { return err } if err := fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil { return err } bindMode := "rbind" if m.NonRecursive { bindMode = "bind" } writeMode := "ro" if m.Writable { writeMode = "rw" if m.ReadOnlyNonRecursive { return errors.New("options conflict: Writable && ReadOnlyNonRecursive") } if m.ReadOnlyForceRecursive { return errors.New("options conflict: Writable && ReadOnlyForceRecursive") } } if m.ReadOnlyNonRecursive && m.ReadOnlyForceRecursive { return errors.New("options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive") } // openContainerFS() is called for temporary mounts // outside the container. Soon these will be unmounted // with lazy unmount option and given we have mounted // them rbind, all the submounts will propagate if these // are shared. If daemon is running in host namespace // and has / as shared then these unmounts will // propagate and unmount original mount as well. So make // all these mounts rprivate. Do not use propagation // property of volume as that should apply only when // mounting happens inside the container. opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",") if err := mount.Mount(m.Source, dest, "", opts); err != nil { return err } if !m.Writable && !m.ReadOnlyNonRecursive { if err := makeMountRRO(dest); err != nil { if m.ReadOnlyForceRecursive { return err } else { log.G(context.TODO()).WithError(err).Debugf("Failed to make %q recursively read-only", dest) } } } } return mounttree.SwitchRoot(container.BaseFS) }, func() { defer close(done) for it := range todo { err := it.fn() if it.res != nil { it.res <- err } } // The thread will terminate when this goroutine returns, taking the // mount namespace and all the volume bind-mounts with it. }, ) if err != nil { return nil, err } vw := &containerFSView{ d: daemon, ctr: container, todo: todo, done: done, } runtime.SetFinalizer(vw, (*containerFSView).Close) return vw, nil } // RunInFS synchronously runs fn in the context of the container filesytem and // passes through its return value. // // The container filesystem is only visible to functions called in the same // goroutine as fn. Goroutines started from fn will see the host's filesystem. func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error { res := make(chan error) select { case vw.todo <- future{fn: fn, res: res}: case <-ctx.Done(): return ctx.Err() } return <-res } // GoInFS starts fn in the container FS. It blocks until fn is started but does // not wait until fn returns. An error is returned if ctx is canceled before fn // has been started. // // The container filesystem is only visible to functions called in the same // goroutine as fn. Goroutines started from fn will see the host's filesystem. func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error { select { case vw.todo <- future{fn: func() error { fn(); return nil }}: return nil case <-ctx.Done(): return ctx.Err() } } // Close waits until any in-flight operations complete and frees all // resources associated with vw. func (vw *containerFSView) Close() error { runtime.SetFinalizer(vw, nil) close(vw.todo) err := multierror.Append(nil, <-vw.done) err = multierror.Append(err, vw.ctr.UnmountVolumes(vw.d.LogVolumeEvent)) err = multierror.Append(err, vw.d.Unmount(vw.ctr)) return err.ErrorOrNil() } // Stat returns the metadata for path, relative to the current working directory // of vw inside the container filesystem view. func (vw *containerFSView) Stat(ctx context.Context, path string) (*types.ContainerPathStat, error) { var stat *types.ContainerPathStat err := vw.RunInFS(ctx, func() error { lstat, err := os.Lstat(path) if err != nil { return err } var target string if lstat.Mode()&os.ModeSymlink != 0 { // Fully evaluate symlinks along path to the ultimate // target, or as much as possible with broken links. target, err = symlink.FollowSymlinkInScope(path, "/") if err != nil { return err } } stat = &types.ContainerPathStat{ Name: filepath.Base(path), Size: lstat.Size(), Mode: lstat.Mode(), Mtime: lstat.ModTime(), LinkTarget: target, } return nil }) return stat, err } // makeMountRRO makes the mount recursively read-only. func makeMountRRO(dest string) error { attr := &unix.MountAttr{ Attr_set: unix.MOUNT_ATTR_RDONLY, } var err error for { err = unix.MountSetattr(-1, dest, unix.AT_RECURSIVE, attr) if !errors.Is(err, unix.EINTR) { break } } if err != nil { err = fmt.Errorf("failed to apply MOUNT_ATTR_RDONLY with AT_RECURSIVE to %q: %w", dest, err) } return err }