From 5de229644fbda206a2ad1860ed1a73bb0a56143b Mon Sep 17 00:00:00 2001 From: Cory Snider Date: Tue, 27 Sep 2022 18:39:26 -0400 Subject: [PATCH] pkg/chrootarchive: stop reexec'ing before chroot Unshare the thread's file system attributes and, if applicable, mount namespace so that the chroot operation does not affect the rest of the process. Signed-off-by: Cory Snider --- pkg/archive/archive.go | 381 ++++++++++++++++-------------- pkg/chrootarchive/archive_unix.go | 230 +++--------------- pkg/chrootarchive/diff_unix.go | 107 ++------- pkg/chrootarchive/go_linux.go | 92 ++++++++ pkg/chrootarchive/init_unix.go | 29 --- 5 files changed, 359 insertions(+), 480 deletions(-) create mode 100644 pkg/chrootarchive/go_linux.go delete mode 100644 pkg/chrootarchive/init_unix.go diff --git a/pkg/archive/archive.go b/pkg/archive/archive.go index ce4e4bd5d4..3cde5ce16d 100644 --- a/pkg/archive/archive.go +++ b/pkg/archive/archive.go @@ -821,10 +821,29 @@ func Tar(path string, compression Compression) (io.ReadCloser, error) { // TarWithOptions creates an archive from the directory at `path`, only including files whose relative // paths are included in `options.IncludeFiles` (if non-nil) or not in `options.ExcludePatterns`. func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error) { - // Fix the source path to work with long path names. This is a no-op - // on platforms other than Windows. - srcPath = fixVolumePathPrefix(srcPath) + tb, err := NewTarballer(srcPath, options) + if err != nil { + return nil, err + } + go tb.Do() + return tb.Reader(), nil +} +// Tarballer is a lower-level interface to TarWithOptions which gives the caller +// control over which goroutine the archiving operation executes on. 
+type Tarballer struct { + srcPath string + options *TarOptions + pm *patternmatcher.PatternMatcher + pipeReader *io.PipeReader + pipeWriter *io.PipeWriter + compressWriter io.WriteCloser + whiteoutConverter tarWhiteoutConverter +} + +// NewTarballer constructs a new tarballer. The arguments are the same as for +// TarWithOptions. +func NewTarballer(srcPath string, options *TarOptions) (*Tarballer, error) { pm, err := patternmatcher.New(options.ExcludePatterns) if err != nil { return nil, err @@ -842,183 +861,201 @@ func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error) return nil, err } - go func() { - ta := newTarAppender( - options.IDMap, - compressWriter, - options.ChownOpts, - ) - ta.WhiteoutConverter = whiteoutConverter + return &Tarballer{ + // Fix the source path to work with long path names. This is a no-op + // on platforms other than Windows. + srcPath: fixVolumePathPrefix(srcPath), + options: options, + pm: pm, + pipeReader: pipeReader, + pipeWriter: pipeWriter, + compressWriter: compressWriter, + whiteoutConverter: whiteoutConverter, + }, nil +} - defer func() { - // Make sure to check the error on Close. - if err := ta.TarWriter.Close(); err != nil { - logrus.Errorf("Can't close tar writer: %s", err) - } - if err := compressWriter.Close(); err != nil { - logrus.Errorf("Can't close compress writer: %s", err) - } - if err := pipeWriter.Close(); err != nil { - logrus.Errorf("Can't close pipe writer: %s", err) - } - }() +// Reader returns the reader for the created archive. +func (t *Tarballer) Reader() io.ReadCloser { + return t.pipeReader +} - // this buffer is needed for the duration of this piped stream - defer pools.BufioWriter32KPool.Put(ta.Buffer) +// Do performs the archiving operation, blocking the calling goroutine until it +// completes. The resulting archive can be read from t.Reader(). Do should only +// be called once on each Tarballer instance. 
+func (t *Tarballer) Do() { + ta := newTarAppender( + t.options.IDMap, + t.compressWriter, + t.options.ChownOpts, + ) + ta.WhiteoutConverter = t.whiteoutConverter - // In general we log errors here but ignore them because - // during e.g. a diff operation the container can continue - // mutating the filesystem and we can see transient errors - // from this - - stat, err := os.Lstat(srcPath) - if err != nil { - return + defer func() { + // Make sure to check the error on Close. + if err := ta.TarWriter.Close(); err != nil { + logrus.Errorf("Can't close tar writer: %s", err) } - - if !stat.IsDir() { - // We can't later join a non-dir with any includes because the - // 'walk' will error if "file/." is stat-ed and "file" is not a - // directory. So, we must split the source path and use the - // basename as the include. - if len(options.IncludeFiles) > 0 { - logrus.Warn("Tar: Can't archive a file with includes") - } - - dir, base := SplitPathDirEntry(srcPath) - srcPath = dir - options.IncludeFiles = []string{base} + if err := t.compressWriter.Close(); err != nil { + logrus.Errorf("Can't close compress writer: %s", err) } - - if len(options.IncludeFiles) == 0 { - options.IncludeFiles = []string{"."} - } - - seen := make(map[string]bool) - - for _, include := range options.IncludeFiles { - rebaseName := options.RebaseNames[include] - - var ( - parentMatchInfo []patternmatcher.MatchInfo - parentDirs []string - ) - - walkRoot := getWalkRoot(srcPath, include) - filepath.WalkDir(walkRoot, func(filePath string, f os.DirEntry, err error) error { - if err != nil { - logrus.Errorf("Tar: Can't stat file %s to tar: %s", srcPath, err) - return nil - } - - relFilePath, err := filepath.Rel(srcPath, filePath) - if err != nil || (!options.IncludeSourceDir && relFilePath == "." && f.IsDir()) { - // Error getting relative path OR we are looking - // at the source directory path. Skip in both situations. - return nil - } - - if options.IncludeSourceDir && include == "." 
&& relFilePath != "." { - relFilePath = strings.Join([]string{".", relFilePath}, string(filepath.Separator)) - } - - skip := false - - // If "include" is an exact match for the current file - // then even if there's an "excludePatterns" pattern that - // matches it, don't skip it. IOW, assume an explicit 'include' - // is asking for that file no matter what - which is true - // for some files, like .dockerignore and Dockerfile (sometimes) - if include != relFilePath { - for len(parentDirs) != 0 { - lastParentDir := parentDirs[len(parentDirs)-1] - if strings.HasPrefix(relFilePath, lastParentDir+string(os.PathSeparator)) { - break - } - parentDirs = parentDirs[:len(parentDirs)-1] - parentMatchInfo = parentMatchInfo[:len(parentMatchInfo)-1] - } - - var matchInfo patternmatcher.MatchInfo - if len(parentMatchInfo) != 0 { - skip, matchInfo, err = pm.MatchesUsingParentResults(relFilePath, parentMatchInfo[len(parentMatchInfo)-1]) - } else { - skip, matchInfo, err = pm.MatchesUsingParentResults(relFilePath, patternmatcher.MatchInfo{}) - } - if err != nil { - logrus.Errorf("Error matching %s: %v", relFilePath, err) - return err - } - - if f.IsDir() { - parentDirs = append(parentDirs, relFilePath) - parentMatchInfo = append(parentMatchInfo, matchInfo) - } - } - - if skip { - // If we want to skip this file and its a directory - // then we should first check to see if there's an - // excludes pattern (e.g. !dir/file) that starts with this - // dir. If so then we can't skip this dir. - - // Its not a dir then so we can just return/skip. - if !f.IsDir() { - return nil - } - - // No exceptions (!...) 
in patterns so just skip dir - if !pm.Exclusions() { - return filepath.SkipDir - } - - dirSlash := relFilePath + string(filepath.Separator) - - for _, pat := range pm.Patterns() { - if !pat.Exclusion() { - continue - } - if strings.HasPrefix(pat.String()+string(filepath.Separator), dirSlash) { - // found a match - so can't skip this dir - return nil - } - } - - // No matching exclusion dir so just skip dir - return filepath.SkipDir - } - - if seen[relFilePath] { - return nil - } - seen[relFilePath] = true - - // Rename the base resource. - if rebaseName != "" { - var replacement string - if rebaseName != string(filepath.Separator) { - // Special case the root directory to replace with an - // empty string instead so that we don't end up with - // double slashes in the paths. - replacement = rebaseName - } - - relFilePath = strings.Replace(relFilePath, include, replacement, 1) - } - - if err := ta.addTarFile(filePath, relFilePath); err != nil { - logrus.Errorf("Can't add file %s to tar: %s", filePath, err) - // if pipe is broken, stop writing tar stream to it - if err == io.ErrClosedPipe { - return err - } - } - return nil - }) + if err := t.pipeWriter.Close(); err != nil { + logrus.Errorf("Can't close pipe writer: %s", err) } }() - return pipeReader, nil + // this buffer is needed for the duration of this piped stream + defer pools.BufioWriter32KPool.Put(ta.Buffer) + + // In general we log errors here but ignore them because + // during e.g. a diff operation the container can continue + // mutating the filesystem and we can see transient errors + // from this + + stat, err := os.Lstat(t.srcPath) + if err != nil { + return + } + + if !stat.IsDir() { + // We can't later join a non-dir with any includes because the + // 'walk' will error if "file/." is stat-ed and "file" is not a + // directory. So, we must split the source path and use the + // basename as the include. 
+ if len(t.options.IncludeFiles) > 0 { + logrus.Warn("Tar: Can't archive a file with includes") + } + + dir, base := SplitPathDirEntry(t.srcPath) + t.srcPath = dir + t.options.IncludeFiles = []string{base} + } + + if len(t.options.IncludeFiles) == 0 { + t.options.IncludeFiles = []string{"."} + } + + seen := make(map[string]bool) + + for _, include := range t.options.IncludeFiles { + rebaseName := t.options.RebaseNames[include] + + var ( + parentMatchInfo []patternmatcher.MatchInfo + parentDirs []string + ) + + walkRoot := getWalkRoot(t.srcPath, include) + filepath.WalkDir(walkRoot, func(filePath string, f os.DirEntry, err error) error { + if err != nil { + logrus.Errorf("Tar: Can't stat file %s to tar: %s", t.srcPath, err) + return nil + } + + relFilePath, err := filepath.Rel(t.srcPath, filePath) + if err != nil || (!t.options.IncludeSourceDir && relFilePath == "." && f.IsDir()) { + // Error getting relative path OR we are looking + // at the source directory path. Skip in both situations. + return nil + } + + if t.options.IncludeSourceDir && include == "." && relFilePath != "." { + relFilePath = strings.Join([]string{".", relFilePath}, string(filepath.Separator)) + } + + skip := false + + // If "include" is an exact match for the current file + // then even if there's an "excludePatterns" pattern that + // matches it, don't skip it. 
IOW, assume an explicit 'include' + // is asking for that file no matter what - which is true + // for some files, like .dockerignore and Dockerfile (sometimes) + if include != relFilePath { + for len(parentDirs) != 0 { + lastParentDir := parentDirs[len(parentDirs)-1] + if strings.HasPrefix(relFilePath, lastParentDir+string(os.PathSeparator)) { + break + } + parentDirs = parentDirs[:len(parentDirs)-1] + parentMatchInfo = parentMatchInfo[:len(parentMatchInfo)-1] + } + + var matchInfo patternmatcher.MatchInfo + if len(parentMatchInfo) != 0 { + skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, parentMatchInfo[len(parentMatchInfo)-1]) + } else { + skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, patternmatcher.MatchInfo{}) + } + if err != nil { + logrus.Errorf("Error matching %s: %v", relFilePath, err) + return err + } + + if f.IsDir() { + parentDirs = append(parentDirs, relFilePath) + parentMatchInfo = append(parentMatchInfo, matchInfo) + } + } + + if skip { + // If we want to skip this file and its a directory + // then we should first check to see if there's an + // excludes pattern (e.g. !dir/file) that starts with this + // dir. If so then we can't skip this dir. + + // Its not a dir then so we can just return/skip. + if !f.IsDir() { + return nil + } + + // No exceptions (!...) in patterns so just skip dir + if !t.pm.Exclusions() { + return filepath.SkipDir + } + + dirSlash := relFilePath + string(filepath.Separator) + + for _, pat := range t.pm.Patterns() { + if !pat.Exclusion() { + continue + } + if strings.HasPrefix(pat.String()+string(filepath.Separator), dirSlash) { + // found a match - so can't skip this dir + return nil + } + } + + // No matching exclusion dir so just skip dir + return filepath.SkipDir + } + + if seen[relFilePath] { + return nil + } + seen[relFilePath] = true + + // Rename the base resource. 
+ if rebaseName != "" { + var replacement string + if rebaseName != string(filepath.Separator) { + // Special case the root directory to replace with an + // empty string instead so that we don't end up with + // double slashes in the paths. + replacement = rebaseName + } + + relFilePath = strings.Replace(relFilePath, include, replacement, 1) + } + + if err := ta.addTarFile(filePath, relFilePath); err != nil { + logrus.Errorf("Can't add file %s to tar: %s", filePath, err) + // if pipe is broken, stop writing tar stream to it + if err == io.ErrClosedPipe { + return err + } + } + return nil + }) + } } // Unpack unpacks the decompressedArchive to dest with options. diff --git a/pkg/chrootarchive/archive_unix.go b/pkg/chrootarchive/archive_unix.go index 13bb82a2e4..62e7499acd 100644 --- a/pkg/chrootarchive/archive_unix.go +++ b/pkg/chrootarchive/archive_unix.go @@ -4,223 +4,71 @@ package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive" import ( - "bytes" - "encoding/json" - "flag" - "fmt" "io" - "os" "path/filepath" - "runtime" "strings" "github.com/docker/docker/pkg/archive" - "github.com/docker/docker/pkg/reexec" "github.com/pkg/errors" ) -// untar is the entry-point for docker-untar on re-exec. This is not used on -// Windows as it does not support chroot, hence no point sandboxing through -// chroot and rexec. 
-func untar() { - runtime.LockOSThread() - flag.Parse() - - var options archive.TarOptions - - // read the options from the pipe "ExtraFiles" - if err := json.NewDecoder(os.NewFile(3, "options")).Decode(&options); err != nil { - fatal(err) - } - - dst := flag.Arg(0) - var root string - if len(flag.Args()) > 1 { - root = flag.Arg(1) - } - - if root == "" { - root = dst - } - - if err := chroot(root); err != nil { - fatal(err) - } - - if err := archive.Unpack(os.Stdin, dst, &options); err != nil { - fatal(err) - } - // fully consume stdin in case it is zero padded - if _, err := flush(os.Stdin); err != nil { - fatal(err) - } - - os.Exit(0) -} - func invokeUnpack(decompressedArchive io.Reader, dest string, options *archive.TarOptions, root string) error { - if root == "" { - return errors.New("must specify a root to chroot to") - } - - // We can't pass a potentially large exclude list directly via cmd line - // because we easily overrun the kernel's max argument/environment size - // when the full image list is passed (e.g. when this is used by - // `docker load`). We will marshall the options via a pipe to the - // child - r, w, err := os.Pipe() + relDest, err := resolvePathInChroot(root, dest) if err != nil { - return fmt.Errorf("Untar pipe failure: %v", err) + return err } - if root != "" { - relDest, err := filepath.Rel(root, dest) - if err != nil { - return err - } - if relDest == "." { - relDest = "/" - } - if relDest[0] != '/' { - relDest = "/" + relDest - } - dest = relDest - } - - cmd := reexec.Command("docker-untar", dest, root) - cmd.Stdin = decompressedArchive - - cmd.ExtraFiles = append(cmd.ExtraFiles, r) - output := bytes.NewBuffer(nil) - cmd.Stdout = output - cmd.Stderr = output - - // reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, which - // causes the started process to be signaled when the creating OS thread - // dies. Ensure that the reexec is not prematurely signaled. See - // https://go.dev/issue/27505 for more information. 
- runtime.LockOSThread() - defer runtime.UnlockOSThread() - if err := cmd.Start(); err != nil { - w.Close() - return fmt.Errorf("Untar error on re-exec cmd: %v", err) - } - - // write the options to the pipe for the untar exec to read - if err := json.NewEncoder(w).Encode(options); err != nil { - w.Close() - return fmt.Errorf("Untar json encode to pipe failed: %v", err) - } - w.Close() - - if err := cmd.Wait(); err != nil { - // when `xz -d -c -q | docker-untar ...` failed on docker-untar side, - // we need to exhaust `xz`'s output, otherwise the `xz` side will be - // pending on write pipe forever - io.Copy(io.Discard, decompressedArchive) - - return fmt.Errorf("Error processing tar file(%v): %s", err, output) - } - return nil -} - -func tar() { - runtime.LockOSThread() - flag.Parse() - - src := flag.Arg(0) - var root string - if len(flag.Args()) > 1 { - root = flag.Arg(1) - } - - if root == "" { - root = src - } - - if err := realChroot(root); err != nil { - fatal(err) - } - - var options archive.TarOptions - if err := json.NewDecoder(os.Stdin).Decode(&options); err != nil { - fatal(err) - } - - rdr, err := archive.TarWithOptions(src, &options) + done := make(chan error) + err = Go(root, func() { done <- archive.Unpack(decompressedArchive, relDest, options) }) if err != nil { - fatal(err) + return err } - defer rdr.Close() - - if _, err := io.Copy(os.Stdout, rdr); err != nil { - fatal(err) - } - - os.Exit(0) + return <-done } func invokePack(srcPath string, options *archive.TarOptions, root string) (io.ReadCloser, error) { - if root == "" { - return nil, errors.New("root path must not be empty") - } - - relSrc, err := filepath.Rel(root, srcPath) + relSrc, err := resolvePathInChroot(root, srcPath) if err != nil { return nil, err } - if relSrc == "." 
{ - relSrc = "/" - } - if relSrc[0] != '/' { - relSrc = "/" + relSrc - } - // make sure we didn't trim a trailing slash with the call to `Rel` + // make sure we didn't trim a trailing slash with the call to `resolvePathInChroot` if strings.HasSuffix(srcPath, "/") && !strings.HasSuffix(relSrc, "/") { relSrc += "/" } - cmd := reexec.Command("docker-tar", relSrc, root) - - errBuff := bytes.NewBuffer(nil) - cmd.Stderr = errBuff - - tarR, tarW := io.Pipe() - cmd.Stdout = tarW - - stdin, err := cmd.StdinPipe() + tb, err := archive.NewTarballer(relSrc, options) if err != nil { - return nil, errors.Wrap(err, "error getting options pipe for tar process") + return nil, errors.Wrap(err, "error processing tar file") } - - started := make(chan error) - go func() { - // reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, - // which causes the started process to be signaled when the - // creating OS thread dies. Ensure that the subprocess is not - // prematurely signaled. See https://go.dev/issue/27505 for more - // information. - runtime.LockOSThread() - defer runtime.UnlockOSThread() - if err := cmd.Start(); err != nil { - started <- err - return - } - close(started) - err := cmd.Wait() - err = errors.Wrapf(err, "error processing tar file: %s", errBuff) - tarW.CloseWithError(err) - }() - if err := <-started; err != nil { - return nil, errors.Wrap(err, "tar error on re-exec cmd") + err = Go(root, tb.Do) + if err != nil { + return nil, errors.Wrap(err, "could not chroot") } - - if err := json.NewEncoder(stdin).Encode(options); err != nil { - stdin.Close() - return nil, errors.Wrap(err, "tar json encode to pipe failed") - } - stdin.Close() - - return tarR, nil + return tb.Reader(), nil +} + +// resolvePathInChroot returns the equivalent to path inside a chroot rooted at root. +// The returned path always begins with '/'. 
+// +// - resolvePathInChroot("/a/b", "/a/b/c/d") -> "/c/d" +// - resolvePathInChroot("/a/b", "/a/b") -> "/" +// +// The implementation is buggy, and some bugs may be load-bearing. +// Here be dragons. +func resolvePathInChroot(root, path string) (string, error) { + if root == "" { + return "", errors.New("root path must not be empty") + } + rel, err := filepath.Rel(root, path) + if err != nil { + return "", err + } + if rel == "." { + rel = "/" + } + if rel[0] != '/' { + rel = "/" + rel + } + return rel, nil } diff --git a/pkg/chrootarchive/diff_unix.go b/pkg/chrootarchive/diff_unix.go index c35acbe91f..9729c6d052 100644 --- a/pkg/chrootarchive/diff_unix.go +++ b/pkg/chrootarchive/diff_unix.go @@ -4,71 +4,14 @@ package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive" import ( - "bytes" - "encoding/json" - "flag" - "fmt" "io" - "os" "path/filepath" - "runtime" "github.com/containerd/containerd/pkg/userns" "github.com/docker/docker/pkg/archive" - "github.com/docker/docker/pkg/reexec" "golang.org/x/sys/unix" ) -type applyLayerResponse struct { - LayerSize int64 `json:"layerSize"` -} - -// applyLayer is the entry-point for docker-applylayer on re-exec. This is not -// used on Windows as it does not support chroot, hence no point sandboxing -// through chroot and rexec. 
-func applyLayer() { - - var ( - err error - options *archive.TarOptions - ) - runtime.LockOSThread() - flag.Parse() - - inUserns := userns.RunningInUserNS() - if err := chroot(flag.Arg(0)); err != nil { - fatal(err) - } - - // We need to be able to set any perms - oldmask := unix.Umask(0) - defer unix.Umask(oldmask) - - if err := json.Unmarshal([]byte(os.Getenv("OPT")), &options); err != nil { - fatal(err) - } - - if inUserns { - options.InUserNS = true - } - - size, err := archive.UnpackLayer("/", os.Stdin, options) - if err != nil { - fatal(err) - } - - encoder := json.NewEncoder(os.Stdout) - if err := encoder.Encode(applyLayerResponse{size}); err != nil { - fatal(fmt.Errorf("unable to encode layerSize JSON: %s", err)) - } - - if _, err := flush(os.Stdin); err != nil { - fatal(err) - } - - os.Exit(0) -} - // applyLayerHandler parses a diff in the standard layer format from `layer`, and // applies it to the directory `dest`. Returns the size in bytes of the // contents of the layer. @@ -85,42 +28,30 @@ func applyLayerHandler(dest string, layer io.Reader, options *archive.TarOptions } if options == nil { options = &archive.TarOptions{} - if userns.RunningInUserNS() { - options.InUserNS = true - } + } + if userns.RunningInUserNS() { + options.InUserNS = true } if options.ExcludePatterns == nil { options.ExcludePatterns = []string{} } - data, err := json.Marshal(options) + type result struct { + layerSize int64 + err error + } + + done := make(chan result) + err = Go(dest, func() { + // We need to be able to set any perms + _ = unix.Umask(0) + + size, err := archive.UnpackLayer("/", layer, options) + done <- result{layerSize: size, err: err} + }) if err != nil { - return 0, fmt.Errorf("ApplyLayer json encode: %v", err) + return 0, err } - - cmd := reexec.Command("docker-applyLayer", dest) - cmd.Stdin = layer - cmd.Env = append(cmd.Env, fmt.Sprintf("OPT=%s", data)) - - outBuf, errBuf := new(bytes.Buffer), new(bytes.Buffer) - cmd.Stdout, cmd.Stderr = outBuf, errBuf - 
- // reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, which - // causes the started process to be signaled when the creating OS thread - // dies. Ensure that the reexec is not prematurely signaled. See - // https://go.dev/issue/27505 for more information. - runtime.LockOSThread() - defer runtime.UnlockOSThread() - if err = cmd.Run(); err != nil { - return 0, fmt.Errorf("ApplyLayer %s stdout: %s stderr: %s", err, outBuf, errBuf) - } - - // Stdout should be a valid JSON struct representing an applyLayerResponse. - response := applyLayerResponse{} - decoder := json.NewDecoder(outBuf) - if err = decoder.Decode(&response); err != nil { - return 0, fmt.Errorf("unable to decode ApplyLayer JSON response: %s", err) - } - - return response.LayerSize, nil + res := <-done + return res.layerSize, res.err } diff --git a/pkg/chrootarchive/go_linux.go b/pkg/chrootarchive/go_linux.go new file mode 100644 index 0000000000..37cbc14d3d --- /dev/null +++ b/pkg/chrootarchive/go_linux.go @@ -0,0 +1,92 @@ +//go:build go1.10 +// +build go1.10 + +package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive" + +import ( + "runtime" + + "golang.org/x/sys/unix" +) + +func init() { + // The startup thread of a process is special in a few different ways. + // Most pertinent to the discussion at hand, any per-thread kernel state + // reflected in the /proc/[pid]/ directory for a process is taken from + // the state of the startup thread. Same goes for /proc/self/; it shows + // the state of the current process' startup thread, no matter which + // thread the files are being opened from. For most programs this is a + // distinction without a difference as the kernel state, such as the + // mount namespace and current working directory, is shared among (and + // kept synchronized across) all threads of a process. But things start + // to break down once threads start unsharing and modifying parts of + // their kernel state. 
+ // + // The Go runtime schedules goroutines to execute on the startup thread, + // same as any other. How this could be problematic is best illustrated + // with a concrete example. Consider what happens if a goroutine spawned + // from Go() gets scheduled onto the startup thread. The thread's mount + // namespace will be unshared and modified. The contents of the + // /proc/[pid]/mountinfo file will then describe the mount tree of the + // unshared namespace, not the namespace of any other thread. It will + // remain this way until the process exits. (The startup thread is + // special in another way: exiting it puts the process into a + // "non-waitable zombie" state. To avoid this fate, the Go runtime parks + // the thread instead of exiting if a goroutine returns while locked to + // the startup thread. More information can be found in the Go runtime + // sources: `go doc -u -src runtime.mexit`.) + // The github.com/moby/sys/mountinfo package reads from + // /proc/self/mountinfo, so will read the mount tree for the wrong + // namespace if the startup thread has had its mount namespace unshared! + // The /proc/thread-self/ magic symlink, introduced in Linux 3.17, is + // one potential solution to this problem, but every package which opens + // files in /proc/self/ would need to be updated, and fallbacks to + // /proc/self/task/{{syscall.Gettid()}}/ would be required to support + // older kernels. Overlooking any reference to /proc/self/ would + // manifest as stochastically-reproducible bugs, so this is far from an + // ideal solution. + // + // Reading from /proc/self/ would not be a problem if we can prevent the + // per-thread state of the startup thread from being modified + // nondeterministically in the first place. We can accomplish this + // simply by locking the main() function to the startup thread! Doing so + // excludes any other goroutine from being scheduled on the thread. 
+ runtime.LockOSThread() +} + +// Go starts fn in a goroutine where the root directory, current working +// directory and umask are unshared from other goroutines and the root directory +// has been changed to path. These changes are only visible to the goroutine in +// which fn is executed. Any other goroutines, including ones started from fn, +// will see the same root directory and file system attributes as the rest of +// the process. +func Go(path string, fn func()) error { + started := make(chan error) + go func() { + // Prepare to manipulate per-thread kernel state. Wire the + // goroutine to the OS thread so execution of other goroutines + // will not be scheduled on it. It is very important not to + // unwire the goroutine from the thread so that the thread exits + // with this goroutine and is not returned to the goroutine + // thread pool. + runtime.LockOSThread() + + // Under Linux, threads are implemented as processes which share + // a virtual memory space. Therefore in a multithreaded process + // unshare(2) disassociates parts of the calling thread's + // context from the thread it was clone(2)'d from. 
+ if err := unix.Unshare(unix.CLONE_FS); err != nil { + started <- err + return + } + + if err := chroot(path); err != nil { + started <- err + return + } + + close(started) + fn() + }() + return <-started +} diff --git a/pkg/chrootarchive/init_unix.go b/pkg/chrootarchive/init_unix.go deleted file mode 100644 index 0746c1cb97..0000000000 --- a/pkg/chrootarchive/init_unix.go +++ /dev/null @@ -1,29 +0,0 @@ -//go:build !windows -// +build !windows - -package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive" - -import ( - "fmt" - "io" - "os" - - "github.com/docker/docker/pkg/reexec" -) - -func init() { - reexec.Register("docker-applyLayer", applyLayer) - reexec.Register("docker-untar", untar) - reexec.Register("docker-tar", tar) -} - -func fatal(err error) { - fmt.Fprint(os.Stderr, err) - os.Exit(1) -} - -// flush consumes all the bytes from the reader discarding -// any errors -func flush(r io.Reader) (bytes int64, err error) { - return io.Copy(io.Discard, r) -}