pkg/chrootarchive: stop reexec'ing before chroot

Unshare the thread's file system attributes and, if applicable, mount
namespace so that the chroot operation does not affect the rest of the
process.

Signed-off-by: Cory Snider <csnider@mirantis.com>
Cory Snider 2 years ago
parent
commit
5de229644f
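
In miniature, the technique the commit message describes looks like the sketch below: lock a goroutine to its OS thread, unshare the thread's file system attributes (CLONE_FS), then chroot, so only that one thread sees the new root. This is an illustrative, hypothetical program (chrootedReadFile is not part of the patch; the real helper is Go() in pkg/chrootarchive/go_linux.go further down), and it needs CAP_SYS_CHROOT to run.

package main

import (
	"fmt"
	"os"
	"runtime"

	"golang.org/x/sys/unix"
)

// chrootedReadFile reads path relative to a new root on a dedicated
// OS thread. Unsharing CLONE_FS first means the chroot(2) below
// affects only this thread, not the rest of the process.
func chrootedReadFile(root, path string) ([]byte, error) {
	type result struct {
		data []byte
		err  error
	}
	done := make(chan result)
	go func() {
		// Deliberately never unlocked: the thread's root is altered
		// for good, so it must exit with this goroutine rather than
		// be returned to the scheduler's thread pool.
		runtime.LockOSThread()
		if err := unix.Unshare(unix.CLONE_FS); err != nil {
			done <- result{err: err}
			return
		}
		if err := unix.Chroot(root); err != nil {
			done <- result{err: err}
			return
		}
		data, err := os.ReadFile(path) // absolute path resolves inside the new root
		done <- result{data, err}
	}()
	res := <-done
	return res.data, res.err
}

func main() {
	data, err := chrootedReadFile("/var/lib/docker/tmp", "/foo.txt")
	fmt.Println(string(data), err)
}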

+ 185 - 148
pkg/archive/archive.go

@@ -821,10 +821,29 @@ func Tar(path string, compression Compression) (io.ReadCloser, error) {
 // TarWithOptions creates an archive from the directory at `path`, only including files whose relative
 // paths are included in `options.IncludeFiles` (if non-nil) or not in `options.ExcludePatterns`.
 func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error) {
-	// Fix the source path to work with long path names. This is a no-op
-	// on platforms other than Windows.
-	srcPath = fixVolumePathPrefix(srcPath)
+	tb, err := NewTarballer(srcPath, options)
+	if err != nil {
+		return nil, err
+	}
+	go tb.Do()
+	return tb.Reader(), nil
+}
+
+// Tarballer is a lower-level interface to TarWithOptions which gives the caller
+// control over which goroutine the archiving operation executes on.
+type Tarballer struct {
+	srcPath           string
+	options           *TarOptions
+	pm                *patternmatcher.PatternMatcher
+	pipeReader        *io.PipeReader
+	pipeWriter        *io.PipeWriter
+	compressWriter    io.WriteCloser
+	whiteoutConverter tarWhiteoutConverter
+}
 
+// NewTarballer constructs a new tarballer. The arguments are the same as for
+// TarWithOptions.
+func NewTarballer(srcPath string, options *TarOptions) (*Tarballer, error) {
 	pm, err := patternmatcher.New(options.ExcludePatterns)
 	if err != nil {
 		return nil, err
@@ -842,183 +861,201 @@ func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error)
 		return nil, err
 	}
 
-	go func() {
-		ta := newTarAppender(
-			options.IDMap,
-			compressWriter,
-			options.ChownOpts,
-		)
-		ta.WhiteoutConverter = whiteoutConverter
-
-		defer func() {
-			// Make sure to check the error on Close.
-			if err := ta.TarWriter.Close(); err != nil {
-				logrus.Errorf("Can't close tar writer: %s", err)
-			}
-			if err := compressWriter.Close(); err != nil {
-				logrus.Errorf("Can't close compress writer: %s", err)
-			}
-			if err := pipeWriter.Close(); err != nil {
-				logrus.Errorf("Can't close pipe writer: %s", err)
-			}
-		}()
+	return &Tarballer{
+		// Fix the source path to work with long path names. This is a no-op
+		// on platforms other than Windows.
+		srcPath:           fixVolumePathPrefix(srcPath),
+		options:           options,
+		pm:                pm,
+		pipeReader:        pipeReader,
+		pipeWriter:        pipeWriter,
+		compressWriter:    compressWriter,
+		whiteoutConverter: whiteoutConverter,
+	}, nil
+}
 
-		// this buffer is needed for the duration of this piped stream
-		defer pools.BufioWriter32KPool.Put(ta.Buffer)
+// Reader returns the reader for the created archive.
+func (t *Tarballer) Reader() io.ReadCloser {
+	return t.pipeReader
+}
 
-		// In general we log errors here but ignore them because
-		// during e.g. a diff operation the container can continue
-		// mutating the filesystem and we can see transient errors
-		// from this
+// Do performs the archiving operation in the background. The resulting archive
+// can be read from t.Reader(). Do should only be called once on each Tarballer
+// instance.
+func (t *Tarballer) Do() {
+	ta := newTarAppender(
+		t.options.IDMap,
+		t.compressWriter,
+		t.options.ChownOpts,
+	)
+	ta.WhiteoutConverter = t.whiteoutConverter
 
-		stat, err := os.Lstat(srcPath)
-		if err != nil {
-			return
+	defer func() {
+		// Make sure to check the error on Close.
+		if err := ta.TarWriter.Close(); err != nil {
+			logrus.Errorf("Can't close tar writer: %s", err)
 		}
-
-		if !stat.IsDir() {
-			// We can't later join a non-dir with any includes because the
-			// 'walk' will error if "file/." is stat-ed and "file" is not a
-			// directory. So, we must split the source path and use the
-			// basename as the include.
-			if len(options.IncludeFiles) > 0 {
-				logrus.Warn("Tar: Can't archive a file with includes")
-			}
-
-			dir, base := SplitPathDirEntry(srcPath)
-			srcPath = dir
-			options.IncludeFiles = []string{base}
+		if err := t.compressWriter.Close(); err != nil {
+			logrus.Errorf("Can't close compress writer: %s", err)
 		}
-
-		if len(options.IncludeFiles) == 0 {
-			options.IncludeFiles = []string{"."}
+		if err := t.pipeWriter.Close(); err != nil {
+			logrus.Errorf("Can't close pipe writer: %s", err)
 		}
+	}()
 
-		seen := make(map[string]bool)
-
-		for _, include := range options.IncludeFiles {
-			rebaseName := options.RebaseNames[include]
+	// this buffer is needed for the duration of this piped stream
+	defer pools.BufioWriter32KPool.Put(ta.Buffer)
 
-			var (
-				parentMatchInfo []patternmatcher.MatchInfo
-				parentDirs      []string
-			)
+	// In general we log errors here but ignore them because
+	// during e.g. a diff operation the container can continue
+	// mutating the filesystem and we can see transient errors
+	// from this
 
-			walkRoot := getWalkRoot(srcPath, include)
-			filepath.WalkDir(walkRoot, func(filePath string, f os.DirEntry, err error) error {
-				if err != nil {
-					logrus.Errorf("Tar: Can't stat file %s to tar: %s", srcPath, err)
-					return nil
-				}
+	stat, err := os.Lstat(t.srcPath)
+	if err != nil {
+		return
+	}
 
-				relFilePath, err := filepath.Rel(srcPath, filePath)
-				if err != nil || (!options.IncludeSourceDir && relFilePath == "." && f.IsDir()) {
-					// Error getting relative path OR we are looking
-					// at the source directory path. Skip in both situations.
-					return nil
-				}
+	if !stat.IsDir() {
+		// We can't later join a non-dir with any includes because the
+		// 'walk' will error if "file/." is stat-ed and "file" is not a
+		// directory. So, we must split the source path and use the
+		// basename as the include.
+		if len(t.options.IncludeFiles) > 0 {
+			logrus.Warn("Tar: Can't archive a file with includes")
+		}
 
-				if options.IncludeSourceDir && include == "." && relFilePath != "." {
-					relFilePath = strings.Join([]string{".", relFilePath}, string(filepath.Separator))
-				}
+		dir, base := SplitPathDirEntry(t.srcPath)
+		t.srcPath = dir
+		t.options.IncludeFiles = []string{base}
+	}
 
-				skip := false
-
-				// If "include" is an exact match for the current file
-				// then even if there's an "excludePatterns" pattern that
-				// matches it, don't skip it. IOW, assume an explicit 'include'
-				// is asking for that file no matter what - which is true
-				// for some files, like .dockerignore and Dockerfile (sometimes)
-				if include != relFilePath {
-					for len(parentDirs) != 0 {
-						lastParentDir := parentDirs[len(parentDirs)-1]
-						if strings.HasPrefix(relFilePath, lastParentDir+string(os.PathSeparator)) {
-							break
-						}
-						parentDirs = parentDirs[:len(parentDirs)-1]
-						parentMatchInfo = parentMatchInfo[:len(parentMatchInfo)-1]
-					}
+	if len(t.options.IncludeFiles) == 0 {
+		t.options.IncludeFiles = []string{"."}
+	}
 
-					var matchInfo patternmatcher.MatchInfo
-					if len(parentMatchInfo) != 0 {
-						skip, matchInfo, err = pm.MatchesUsingParentResults(relFilePath, parentMatchInfo[len(parentMatchInfo)-1])
-					} else {
-						skip, matchInfo, err = pm.MatchesUsingParentResults(relFilePath, patternmatcher.MatchInfo{})
-					}
-					if err != nil {
-						logrus.Errorf("Error matching %s: %v", relFilePath, err)
-						return err
-					}
+	seen := make(map[string]bool)
 
-					if f.IsDir() {
-						parentDirs = append(parentDirs, relFilePath)
-						parentMatchInfo = append(parentMatchInfo, matchInfo)
-					}
-				}
+	for _, include := range t.options.IncludeFiles {
+		rebaseName := t.options.RebaseNames[include]
 
-				if skip {
-					// If we want to skip this file and its a directory
-					// then we should first check to see if there's an
-					// excludes pattern (e.g. !dir/file) that starts with this
-					// dir. If so then we can't skip this dir.
+		var (
+			parentMatchInfo []patternmatcher.MatchInfo
+			parentDirs      []string
+		)
 
-					// Its not a dir then so we can just return/skip.
-					if !f.IsDir() {
-						return nil
-					}
+		walkRoot := getWalkRoot(t.srcPath, include)
+		filepath.WalkDir(walkRoot, func(filePath string, f os.DirEntry, err error) error {
+			if err != nil {
+				logrus.Errorf("Tar: Can't stat file %s to tar: %s", t.srcPath, err)
+				return nil
+			}
 
-					// No exceptions (!...) in patterns so just skip dir
-					if !pm.Exclusions() {
-						return filepath.SkipDir
-					}
+			relFilePath, err := filepath.Rel(t.srcPath, filePath)
+			if err != nil || (!t.options.IncludeSourceDir && relFilePath == "." && f.IsDir()) {
+				// Error getting relative path OR we are looking
+				// at the source directory path. Skip in both situations.
+				return nil
+			}
 
-					dirSlash := relFilePath + string(filepath.Separator)
+			if t.options.IncludeSourceDir && include == "." && relFilePath != "." {
+				relFilePath = strings.Join([]string{".", relFilePath}, string(filepath.Separator))
+			}
 
-					for _, pat := range pm.Patterns() {
-						if !pat.Exclusion() {
-							continue
-						}
-						if strings.HasPrefix(pat.String()+string(filepath.Separator), dirSlash) {
-							// found a match - so can't skip this dir
-							return nil
-						}
+			skip := false
+
+			// If "include" is an exact match for the current file
+			// then even if there's an "excludePatterns" pattern that
+			// matches it, don't skip it. IOW, assume an explicit 'include'
+			// is asking for that file no matter what - which is true
+			// for some files, like .dockerignore and Dockerfile (sometimes)
+			if include != relFilePath {
+				for len(parentDirs) != 0 {
+					lastParentDir := parentDirs[len(parentDirs)-1]
+					if strings.HasPrefix(relFilePath, lastParentDir+string(os.PathSeparator)) {
+						break
 					}
+					parentDirs = parentDirs[:len(parentDirs)-1]
+					parentMatchInfo = parentMatchInfo[:len(parentMatchInfo)-1]
+				}
 
-					// No matching exclusion dir so just skip dir
-					return filepath.SkipDir
+				var matchInfo patternmatcher.MatchInfo
+				if len(parentMatchInfo) != 0 {
+					skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, parentMatchInfo[len(parentMatchInfo)-1])
+				} else {
+					skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, patternmatcher.MatchInfo{})
+				}
+				if err != nil {
+					logrus.Errorf("Error matching %s: %v", relFilePath, err)
+					return err
+				}
+
+				if f.IsDir() {
+					parentDirs = append(parentDirs, relFilePath)
+					parentMatchInfo = append(parentMatchInfo, matchInfo)
 				}
+			}
+
+			if skip {
+				// If we want to skip this file and its a directory
+				// then we should first check to see if there's an
+				// excludes pattern (e.g. !dir/file) that starts with this
+				// dir. If so then we can't skip this dir.
 
-				if seen[relFilePath] {
+				// Its not a dir then so we can just return/skip.
+				if !f.IsDir() {
 					return nil
 				}
-				seen[relFilePath] = true
-
-				// Rename the base resource.
-				if rebaseName != "" {
-					var replacement string
-					if rebaseName != string(filepath.Separator) {
-						// Special case the root directory to replace with an
-						// empty string instead so that we don't end up with
-						// double slashes in the paths.
-						replacement = rebaseName
-					}
 
-					relFilePath = strings.Replace(relFilePath, include, replacement, 1)
+				// No exceptions (!...) in patterns so just skip dir
+				if !t.pm.Exclusions() {
+					return filepath.SkipDir
 				}
 
-				if err := ta.addTarFile(filePath, relFilePath); err != nil {
-					logrus.Errorf("Can't add file %s to tar: %s", filePath, err)
-					// if pipe is broken, stop writing tar stream to it
-					if err == io.ErrClosedPipe {
-						return err
+				dirSlash := relFilePath + string(filepath.Separator)
+
+				for _, pat := range t.pm.Patterns() {
+					if !pat.Exclusion() {
+						continue
+					}
+					if strings.HasPrefix(pat.String()+string(filepath.Separator), dirSlash) {
+						// found a match - so can't skip this dir
+						return nil
 					}
 				}
+
+				// No matching exclusion dir so just skip dir
+				return filepath.SkipDir
+			}
+
+			if seen[relFilePath] {
 				return nil
-			})
-		}
-	}()
+			}
+			seen[relFilePath] = true
+
+			// Rename the base resource.
+			if rebaseName != "" {
+				var replacement string
+				if rebaseName != string(filepath.Separator) {
+					// Special case the root directory to replace with an
+					// empty string instead so that we don't end up with
+					// double slashes in the paths.
+					replacement = rebaseName
+				}
 
-	return pipeReader, nil
+				relFilePath = strings.Replace(relFilePath, include, replacement, 1)
+			}
+
+			if err := ta.addTarFile(filePath, relFilePath); err != nil {
+				logrus.Errorf("Can't add file %s to tar: %s", filePath, err)
+				// if pipe is broken, stop writing tar stream to it
+				if err == io.ErrClosedPipe {
+					return err
+				}
+			}
+			return nil
+		})
+	}
 }
 
 // Unpack unpacks the decompressedArchive to dest with options.

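A quick illustration of the new two-step API before moving on: TarWithOptions is now just NewTarballer + go Do() + Reader(), and a caller that wants the archiving work on a particular goroutine (as chrootarchive does below) can drive the steps itself. A minimal sketch; the wrapper function here is hypothetical, written only to show the call sequence.

package main

import (
	"io"
	"os"

	"github.com/docker/docker/pkg/archive"
)

// streamTarball drives the Tarballer by hand: the caller, not the
// library, picks the goroutine that Do() runs on.
func streamTarball(srcPath string, w io.Writer) error {
	tb, err := archive.NewTarballer(srcPath, &archive.TarOptions{})
	if err != nil {
		return err
	}
	go tb.Do() // archive in the background...
	rdr := tb.Reader()
	defer rdr.Close()
	_, err = io.Copy(w, rdr) // ...while this goroutine drains the pipe
	return err
}

func main() {
	if err := streamTarball(".", os.Stdout); err != nil {
		os.Exit(1)
	}
}
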
+ 37 - 189
pkg/chrootarchive/archive_unix.go

@@ -4,223 +4,71 @@
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
 
 import (
-	"bytes"
-	"encoding/json"
-	"flag"
-	"fmt"
 	"io"
 	"io"
-	"os"
 	"path/filepath"
 	"path/filepath"
-	"runtime"
 	"strings"
 	"strings"
 
 
 	"github.com/docker/docker/pkg/archive"
 	"github.com/docker/docker/pkg/archive"
-	"github.com/docker/docker/pkg/reexec"
 	"github.com/pkg/errors"
 	"github.com/pkg/errors"
 )
 )
 
 
-// untar is the entry-point for docker-untar on re-exec. This is not used on
-// Windows as it does not support chroot, hence no point sandboxing through
-// chroot and rexec.
-func untar() {
-	runtime.LockOSThread()
-	flag.Parse()
-
-	var options archive.TarOptions
-
-	// read the options from the pipe "ExtraFiles"
-	if err := json.NewDecoder(os.NewFile(3, "options")).Decode(&options); err != nil {
-		fatal(err)
-	}
-
-	dst := flag.Arg(0)
-	var root string
-	if len(flag.Args()) > 1 {
-		root = flag.Arg(1)
-	}
-
-	if root == "" {
-		root = dst
-	}
-
-	if err := chroot(root); err != nil {
-		fatal(err)
-	}
-
-	if err := archive.Unpack(os.Stdin, dst, &options); err != nil {
-		fatal(err)
-	}
-	// fully consume stdin in case it is zero padded
-	if _, err := flush(os.Stdin); err != nil {
-		fatal(err)
-	}
-
-	os.Exit(0)
-}
-
 func invokeUnpack(decompressedArchive io.Reader, dest string, options *archive.TarOptions, root string) error {
-	if root == "" {
-		return errors.New("must specify a root to chroot to")
-	}
-
-	// We can't pass a potentially large exclude list directly via cmd line
-	// because we easily overrun the kernel's max argument/environment size
-	// when the full image list is passed (e.g. when this is used by
-	// `docker load`). We will marshall the options via a pipe to the
-	// child
-	r, w, err := os.Pipe()
+	relDest, err := resolvePathInChroot(root, dest)
 	if err != nil {
-		return fmt.Errorf("Untar pipe failure: %v", err)
-	}
-
-	if root != "" {
-		relDest, err := filepath.Rel(root, dest)
-		if err != nil {
-			return err
-		}
-		if relDest == "." {
-			relDest = "/"
-		}
-		if relDest[0] != '/' {
-			relDest = "/" + relDest
-		}
-		dest = relDest
+		return err
 	}
 
-	cmd := reexec.Command("docker-untar", dest, root)
-	cmd.Stdin = decompressedArchive
-
-	cmd.ExtraFiles = append(cmd.ExtraFiles, r)
-	output := bytes.NewBuffer(nil)
-	cmd.Stdout = output
-	cmd.Stderr = output
-
-	// reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, which
-	// causes the started process to be signaled when the creating OS thread
-	// dies. Ensure that the reexec is not prematurely signaled. See
-	// https://go.dev/issue/27505 for more information.
-	runtime.LockOSThread()
-	defer runtime.UnlockOSThread()
-	if err := cmd.Start(); err != nil {
-		w.Close()
-		return fmt.Errorf("Untar error on re-exec cmd: %v", err)
-	}
-
-	// write the options to the pipe for the untar exec to read
-	if err := json.NewEncoder(w).Encode(options); err != nil {
-		w.Close()
-		return fmt.Errorf("Untar json encode to pipe failed: %v", err)
-	}
-	w.Close()
-
-	if err := cmd.Wait(); err != nil {
-		// when `xz -d -c -q | docker-untar ...` failed on docker-untar side,
-		// we need to exhaust `xz`'s output, otherwise the `xz` side will be
-		// pending on write pipe forever
-		io.Copy(io.Discard, decompressedArchive)
-
-		return fmt.Errorf("Error processing tar file(%v): %s", err, output)
-	}
-	return nil
-}
-
-func tar() {
-	runtime.LockOSThread()
-	flag.Parse()
-
-	src := flag.Arg(0)
-	var root string
-	if len(flag.Args()) > 1 {
-		root = flag.Arg(1)
-	}
-
-	if root == "" {
-		root = src
-	}
-
-	if err := realChroot(root); err != nil {
-		fatal(err)
-	}
-
-	var options archive.TarOptions
-	if err := json.NewDecoder(os.Stdin).Decode(&options); err != nil {
-		fatal(err)
-	}
-
-	rdr, err := archive.TarWithOptions(src, &options)
+	done := make(chan error)
+	err = Go(root, func() { done <- archive.Unpack(decompressedArchive, relDest, options) })
 	if err != nil {
-		fatal(err)
-	}
-	defer rdr.Close()
-
-	if _, err := io.Copy(os.Stdout, rdr); err != nil {
-		fatal(err)
+		return err
 	}
-
-	os.Exit(0)
+	return <-done
 }
 
 func invokePack(srcPath string, options *archive.TarOptions, root string) (io.ReadCloser, error) {
-	if root == "" {
-		return nil, errors.New("root path must not be empty")
-	}
-
-	relSrc, err := filepath.Rel(root, srcPath)
+	relSrc, err := resolvePathInChroot(root, srcPath)
 	if err != nil {
 		return nil, err
 	}
-	if relSrc == "." {
-		relSrc = "/"
-	}
-	if relSrc[0] != '/' {
-		relSrc = "/" + relSrc
-	}
 
-	// make sure we didn't trim a trailing slash with the call to `Rel`
+	// make sure we didn't trim a trailing slash with the call to `resolvePathInChroot`
 	if strings.HasSuffix(srcPath, "/") && !strings.HasSuffix(relSrc, "/") {
 		relSrc += "/"
 	}
 
-	cmd := reexec.Command("docker-tar", relSrc, root)
-
-	errBuff := bytes.NewBuffer(nil)
-	cmd.Stderr = errBuff
-
-	tarR, tarW := io.Pipe()
-	cmd.Stdout = tarW
-
-	stdin, err := cmd.StdinPipe()
+	tb, err := archive.NewTarballer(relSrc, options)
 	if err != nil {
-		return nil, errors.Wrap(err, "error getting options pipe for tar process")
+		return nil, errors.Wrap(err, "error processing tar file")
 	}
-
-	started := make(chan error)
-	go func() {
-		// reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux,
-		// which causes the started process to be signaled when the
-		// creating OS thread dies. Ensure that the subprocess is not
-		// prematurely signaled. See https://go.dev/issue/27505 for more
-		// information.
-		runtime.LockOSThread()
-		defer runtime.UnlockOSThread()
-		if err := cmd.Start(); err != nil {
-			started <- err
-			return
-		}
-		close(started)
-		err := cmd.Wait()
-		err = errors.Wrapf(err, "error processing tar file: %s", errBuff)
-		tarW.CloseWithError(err)
-	}()
-	if err := <-started; err != nil {
-		return nil, errors.Wrap(err, "tar error on re-exec cmd")
+	err = Go(root, tb.Do)
+	if err != nil {
+		return nil, errors.Wrap(err, "could not chroot")
 	}
+	return tb.Reader(), nil
+}
 
-	if err := json.NewEncoder(stdin).Encode(options); err != nil {
-		stdin.Close()
-		return nil, errors.Wrap(err, "tar json encode to pipe failed")
+// resolvePathInChroot returns the equivalent to path inside a chroot rooted at root.
+// The returned path always begins with '/'.
+//
+//   - resolvePathInChroot("/a/b", "/a/b/c/d") -> "/c/d"
+//   - resolvePathInChroot("/a/b", "/a/b")     -> "/"
+//
+// The implementation is buggy, and some bugs may be load-bearing.
+// Here be dragons.
+func resolvePathInChroot(root, path string) (string, error) {
+	if root == "" {
+		return "", errors.New("root path must not be empty")
 	}
-	stdin.Close()
-
-	return tarR, nil
+	rel, err := filepath.Rel(root, path)
+	if err != nil {
+		return "", err
+	}
+	if rel == "." {
+		rel = "/"
+	}
+	if rel[0] != '/' {
+		rel = "/" + rel
+	}
+	return rel, nil
 }

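The doc-comment examples above translate directly into a table test. A hypothetical sketch (this test is not part of the patch) pinning down the helper's two documented cases:

package chrootarchive

import "testing"

// Mirrors the examples in resolvePathInChroot's doc comment.
func TestResolvePathInChroot(t *testing.T) {
	for _, tc := range []struct{ root, path, want string }{
		{"/a/b", "/a/b/c/d", "/c/d"}, // path below root is re-rooted
		{"/a/b", "/a/b", "/"},        // root itself maps to "/"
	} {
		got, err := resolvePathInChroot(tc.root, tc.path)
		if err != nil || got != tc.want {
			t.Errorf("resolvePathInChroot(%q, %q) = %q, %v; want %q",
				tc.root, tc.path, got, err, tc.want)
		}
	}
}
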
+ 17 - 86
pkg/chrootarchive/diff_unix.go

@@ -4,71 +4,14 @@
 package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
 
 import (
-	"bytes"
-	"encoding/json"
-	"flag"
-	"fmt"
 	"io"
 	"io"
-	"os"
 	"path/filepath"
 	"path/filepath"
-	"runtime"
 
 	"github.com/containerd/containerd/pkg/userns"
 	"github.com/docker/docker/pkg/archive"
-	"github.com/docker/docker/pkg/reexec"
 	"golang.org/x/sys/unix"
 	"golang.org/x/sys/unix"
 )
 )
 
 
-type applyLayerResponse struct {
-	LayerSize int64 `json:"layerSize"`
-}
-
-// applyLayer is the entry-point for docker-applylayer on re-exec. This is not
-// used on Windows as it does not support chroot, hence no point sandboxing
-// through chroot and rexec.
-func applyLayer() {
-
-	var (
-		err     error
-		options *archive.TarOptions
-	)
-	runtime.LockOSThread()
-	flag.Parse()
-
-	inUserns := userns.RunningInUserNS()
-	if err := chroot(flag.Arg(0)); err != nil {
-		fatal(err)
-	}
-
-	// We need to be able to set any perms
-	oldmask := unix.Umask(0)
-	defer unix.Umask(oldmask)
-
-	if err := json.Unmarshal([]byte(os.Getenv("OPT")), &options); err != nil {
-		fatal(err)
-	}
-
-	if inUserns {
-		options.InUserNS = true
-	}
-
-	size, err := archive.UnpackLayer("/", os.Stdin, options)
-	if err != nil {
-		fatal(err)
-	}
-
-	encoder := json.NewEncoder(os.Stdout)
-	if err := encoder.Encode(applyLayerResponse{size}); err != nil {
-		fatal(fmt.Errorf("unable to encode layerSize JSON: %s", err))
-	}
-
-	if _, err := flush(os.Stdin); err != nil {
-		fatal(err)
-	}
-
-	os.Exit(0)
-}
-
 // applyLayerHandler parses a diff in the standard layer format from `layer`, and
 // applies it to the directory `dest`. Returns the size in bytes of the
 // contents of the layer.
@@ -85,42 +28,30 @@ func applyLayerHandler(dest string, layer io.Reader, options *archive.TarOptions
 	}
 	if options == nil {
 		options = &archive.TarOptions{}
-		if userns.RunningInUserNS() {
-			options.InUserNS = true
-		}
+	}
+	if userns.RunningInUserNS() {
+		options.InUserNS = true
 	}
 	if options.ExcludePatterns == nil {
 		options.ExcludePatterns = []string{}
 	}
 
-	data, err := json.Marshal(options)
-	if err != nil {
-		return 0, fmt.Errorf("ApplyLayer json encode: %v", err)
+	type result struct {
+		layerSize int64
+		err       error
 	}
 
-	cmd := reexec.Command("docker-applyLayer", dest)
-	cmd.Stdin = layer
-	cmd.Env = append(cmd.Env, fmt.Sprintf("OPT=%s", data))
+	done := make(chan result)
+	err = Go(dest, func() {
+		// We need to be able to set any perms
+		_ = unix.Umask(0)
 
-	outBuf, errBuf := new(bytes.Buffer), new(bytes.Buffer)
-	cmd.Stdout, cmd.Stderr = outBuf, errBuf
-
-	// reexec.Command() sets cmd.SysProcAttr.Pdeathsig on Linux, which
-	// causes the started process to be signaled when the creating OS thread
-	// dies. Ensure that the reexec is not prematurely signaled. See
-	// https://go.dev/issue/27505 for more information.
-	runtime.LockOSThread()
-	defer runtime.UnlockOSThread()
-	if err = cmd.Run(); err != nil {
-		return 0, fmt.Errorf("ApplyLayer %s stdout: %s stderr: %s", err, outBuf, errBuf)
-	}
-
-	// Stdout should be a valid JSON struct representing an applyLayerResponse.
-	response := applyLayerResponse{}
-	decoder := json.NewDecoder(outBuf)
-	if err = decoder.Decode(&response); err != nil {
-		return 0, fmt.Errorf("unable to decode ApplyLayer JSON response: %s", err)
+		size, err := archive.UnpackLayer("/", layer, options)
+		done <- result{layerSize: size, err: err}
+	})
+	if err != nil {
+		return 0, err
 	}
-
-	return response.LayerSize, nil
+	res := <-done
+	return res.layerSize, res.err
 }

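The rewritten applyLayerHandler shows the idiom for getting a value back out of the chrooted goroutine: Go() only takes a func(), so the (size, error) pair travels over a channel, and the setup error path stays distinct because the callback never runs when Go() itself fails. A hedged, self-contained sketch of the same pattern, with goStub standing in for the real Go() helper and every name here illustrative:

package main

import "fmt"

// goStub mimics the contract of chrootarchive.Go: it either fails
// during setup (fn never runs) or starts fn on its own goroutine.
func goStub(root string, fn func()) error {
	started := make(chan error)
	go func() {
		// The real helper locks the OS thread, unshares CLONE_FS and
		// chroots to root here before signalling success.
		close(started)
		fn()
	}()
	return <-started
}

type result struct {
	layerSize int64
	err       error
}

func applyLayerSketch(dest string) (int64, error) {
	done := make(chan result)
	err := goStub(dest, func() {
		// Real code would call archive.UnpackLayer("/", layer, options).
		done <- result{layerSize: 42}
	})
	if err != nil {
		return 0, err // setup failed; nothing was (or will be) sent on done
	}
	res := <-done // safe: fn runs on its own goroutine
	return res.layerSize, res.err
}

func main() {
	size, err := applyLayerSketch("/var/lib/example")
	fmt.Println(size, err)
}
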
+ 92 - 0
pkg/chrootarchive/go_linux.go

@@ -0,0 +1,92 @@
+//go:build go1.10
+// +build go1.10
+
+package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
+
+import (
+	"runtime"
+
+	"golang.org/x/sys/unix"
+)
+
+func init() {
+	// The startup thread of a process is special in a few different ways.
+	// Most pertinent to the discussion at hand, any per-thread kernel state
+	// reflected in the /proc/[pid]/ directory for a process is taken from
+	// the state of the startup thread. Same goes for /proc/self/; it shows
+	// the state of the current process' startup thread, no matter which
+	// thread the files are being opened from. For most programs this is a
+	// distinction without a difference as the kernel state, such as the
+	// mount namespace and current working directory, is shared among (and
+	// kept synchronized across) all threads of a process. But things start
+	// to break down once threads start unsharing and modifying parts of
+	// their kernel state.
+	//
+	// The Go runtime schedules goroutines to execute on the startup thread,
+	// same as any other. How this could be problematic is best illustrated
+	// with a concrete example. Consider what happens if a goroutine spawned
+	// from Go() gets scheduled onto the startup thread. The thread's mount
+	// namespace will be unshared and modified. The contents of the
+	// /proc/[pid]/mountinfo file will then describe the mount tree of the
+	// unshared namespace, not the namespace of any other thread. It will
+	// remain this way until the process exits. (The startup thread is
+	// special in another way: exiting it puts the process into a
+	// "non-waitable zombie" state. To avoid this fate, the Go runtime parks
+	// the thread instead of exiting if a goroutine returns while locked to
+	// the startup thread. More information can be found in the Go runtime
+	// sources: `go doc -u -src runtime.mexit`.)
+	// The github.com/moby/sys/mountinfo package reads from
+	// /proc/self/mountinfo, so will read the mount tree for the wrong
+	// namespace if the startup thread has had its mount namespace unshared!
+	// The /proc/thread-self/ magic symlink, introduced in Linux 3.17, is
+	// one potential solution to this problem, but every package which opens
+	// files in /proc/self/ would need to be updated, and fallbacks to
+	// /proc/self/task/{{syscall.Gettid()}}/ would be required to support
+	// older kernels. Overlooking any reference to /proc/self/ would
+	// manifest as stochastically-reproducible bugs, so this is far from an
+	// ideal solution.
+	//
+	// Reading from /proc/self/ would not be a problem if we can prevent the
+	// per-thread state of the startup thread from being modified
+	// nondeterministically in the first place. We can accomplish this
+	// simply by locking the main() function to the startup thread! Doing so
+	// excludes any other goroutine from being scheduled on the thread.
+	runtime.LockOSThread()
+}
+
+// Go starts fn in a goroutine where the root directory, current working
+// directory and umask are unshared from other goroutines and the root directory
+// has been changed to path. These changes are only visible to the goroutine in
+// which fn is executed. Any other goroutines, including ones started from fn,
+// will see the same root directory and file system attributes as the rest of
+// the process.
+func Go(path string, fn func()) error {
+	started := make(chan error)
+	go func() {
+		// Prepare to manipulate per-thread kernel state. Wire the
+		// goroutine to the OS thread so execution of other goroutines
+		// will not be scheduled on it. It is very important not to
+		// unwire the goroutine from the thread so that the thread exits
+		// with this goroutine and is not returned to the goroutine
+		// thread pool.
+		runtime.LockOSThread()
+
+		// Under Linux, threads are implemented as processes which share
+		// a virtual memory space. Therefore in a multithreaded process
+		// unshare(2) disassociates parts of the calling thread's
+		// context from the thread it was clone(2)'d from.
+		if err := unix.Unshare(unix.CLONE_FS); err != nil {
+			started <- err
+			return
+		}
+
+		if err := chroot(path); err != nil {
+			started <- err
+			return
+		}
+
+		close(started)
+		fn()
+	}()
+	return <-started
+}

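A usage sketch for the new helper, under its stated contract: the error returned by Go covers only setup (unshare/chroot), and fn's own results must travel back over a channel. listEtc is a hypothetical caller, not part of the patch, and needs the same privileges as any chroot.

package chrootarchive

import (
	"fmt"
	"os"
)

// listEtc prints the entries of <root>/etc without the rest of the
// process ever seeing the changed root directory.
func listEtc(root string) error {
	done := make(chan error)
	err := Go(root, func() {
		// Inside fn, "/" is root; every other goroutine still sees
		// the host's real root.
		entries, err := os.ReadDir("/etc")
		for _, e := range entries {
			fmt.Println(e.Name())
		}
		done <- err
	})
	if err != nil {
		return err // unshare or chroot failed; fn never ran
	}
	return <-done
}
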
+ 0 - 29
pkg/chrootarchive/init_unix.go

@@ -1,29 +0,0 @@
-//go:build !windows
-// +build !windows
-
-package chrootarchive // import "github.com/docker/docker/pkg/chrootarchive"
-
-import (
-	"fmt"
-	"io"
-	"os"
-
-	"github.com/docker/docker/pkg/reexec"
-)
-
-func init() {
-	reexec.Register("docker-applyLayer", applyLayer)
-	reexec.Register("docker-untar", untar)
-	reexec.Register("docker-tar", tar)
-}
-
-func fatal(err error) {
-	fmt.Fprint(os.Stderr, err)
-	os.Exit(1)
-}
-
-// flush consumes all the bytes from the reader discarding
-// any errors
-func flush(r io.Reader) (bytes int64, err error) {
-	return io.Copy(io.Discard, r)
-}