瀏覽代碼

Merge pull request #37999 from Microsoft/jjh/tar2vhd

LCOW: ApplyDiff() use tar2ext4, not SVM
Sebastiaan van Stijn 6 年之前
父節點
當前提交
1527a67212

+ 72 - 19
daemon/graphdriver/lcow/lcow.go

@@ -71,20 +71,33 @@ import (
 	"time"
 
 	"github.com/Microsoft/hcsshim"
+	"github.com/Microsoft/hcsshim/ext4/tar2ext4"
 	"github.com/Microsoft/opengcs/client"
 	"github.com/docker/docker/daemon/graphdriver"
 	"github.com/docker/docker/pkg/archive"
 	"github.com/docker/docker/pkg/containerfs"
 	"github.com/docker/docker/pkg/idtools"
 	"github.com/docker/docker/pkg/ioutils"
+	"github.com/docker/docker/pkg/reexec"
 	"github.com/docker/docker/pkg/system"
 	"github.com/sirupsen/logrus"
 )
 
+// noreexec controls reexec functionality. Off by default, on for debugging purposes.
+var noreexec = false
+
 // init registers this driver to the register. It gets initialised by the
 // function passed in the second parameter, implemented in this file.
 func init() {
 	graphdriver.Register("lcow", InitDriver)
+	// DOCKER_LCOW_NOREEXEC allows for inline processing which makes
+	// debugging issues in the re-exec codepath significantly easier.
+	if os.Getenv("DOCKER_LCOW_NOREEXEC") != "" {
+		logrus.Warnf("LCOW Graphdriver is set to not re-exec. This is intended for debugging purposes only.")
+		noreexec = true
+	} else {
+		reexec.Register("docker-lcow-tar2ext4", tar2ext4Reexec)
+	}
 }
 
 const (
@@ -846,32 +859,72 @@ func (d *Driver) Diff(id, parent string) (io.ReadCloser, error) {
 func (d *Driver) ApplyDiff(id, parent string, diff io.Reader) (int64, error) {
 	logrus.Debugf("lcowdriver: applydiff: id %s", id)
 
-	svm, err := d.startServiceVMIfNotRunning(id, nil, fmt.Sprintf("applydiff %s", id))
+	// Log failures here as it's undiagnosable sometimes, due to a possible panic.
+	// See https://github.com/moby/moby/issues/37955 for more information.
+
+	dest := filepath.Join(d.dataRoot, id, layerFilename)
+	if !noreexec {
+		cmd := reexec.Command([]string{"docker-lcow-tar2ext4", dest}...)
+		stdout := bytes.NewBuffer(nil)
+		stderr := bytes.NewBuffer(nil)
+		cmd.Stdin = diff
+		cmd.Stdout = stdout
+		cmd.Stderr = stderr
+
+		if err := cmd.Start(); err != nil {
+			logrus.Warnf("lcowdriver: applydiff: id %s failed to start re-exec: %s", id, err)
+			return 0, err
+		}
+
+		if err := cmd.Wait(); err != nil {
+			logrus.Warnf("lcowdriver: applydiff: id %s failed %s", id, err)
+			return 0, fmt.Errorf("re-exec error: %v: stderr: %s", err, stderr)
+		}
+		return strconv.ParseInt(stdout.String(), 10, 64)
+	}
+	// The inline case
+	size, err := tar2ext4Actual(dest, diff)
 	if err != nil {
-		return 0, err
+		logrus.Warnf("lcowdriver: applydiff: id %s failed %s", id, err)
 	}
-	defer d.terminateServiceVM(id, fmt.Sprintf("applydiff %s", id), false)
+	return size, err
+}
 
-	logrus.Debugf("lcowdriver: applydiff: waiting for svm to finish booting")
-	err = svm.getStartError()
+// tar2ext4Reexec is the re-exec entry point for writing a layer from a tar file
+func tar2ext4Reexec() {
+	size, err := tar2ext4Actual(os.Args[1], os.Stdin)
 	if err != nil {
-		return 0, fmt.Errorf("lcowdriver: applydiff: svm failed to boot: %s", err)
+		fmt.Fprint(os.Stderr, err)
+		os.Exit(1)
 	}
+	fmt.Fprint(os.Stdout, size)
+}
 
-	// TODO @jhowardmsft - the retries are temporary to overcome platform reliability issues.
-	// Obviously this will be removed as platform bugs are fixed.
-	retries := 0
-	for {
-		retries++
-		size, err := svm.config.TarToVhd(filepath.Join(d.dataRoot, id, layerFilename), diff)
-		if err != nil {
-			if retries <= 10 {
-				continue
-			}
-			return 0, err
-		}
-		return size, err
+// tar2ext4Actual is the implementation of tar2ext to write a layer from a tar file.
+// It can be called through re-exec (default), or inline for debugging.
+func tar2ext4Actual(dest string, diff io.Reader) (int64, error) {
+	// maxDiskSize is not relating to the sandbox size - this is the
+	// maximum possible size a layer VHD generated can be from an EXT4
+	// layout perspective.
+	const maxDiskSize = 128 * 1024 * 1024 * 1024 // 128GB
+	out, err := os.Create(dest)
+	if err != nil {
+		return 0, err
+	}
+	defer out.Close()
+	if err := tar2ext4.Convert(
+		diff,
+		out,
+		tar2ext4.AppendVhdFooter,
+		tar2ext4.ConvertWhiteout,
+		tar2ext4.MaximumDiskSize(maxDiskSize)); err != nil {
+		return 0, err
+	}
+	fi, err := os.Stat(dest)
+	if err != nil {
+		return 0, err
 	}
+	return fi.Size(), nil
 }
 
 // Changes produces a list of changes between the specified layer

+ 1 - 1
vendor.conf

@@ -1,6 +1,6 @@
 # the following lines are in sorted order, FYI
 github.com/Azure/go-ansiterm d6e3b3328b783f23731bc4d058875b0371ff8109
-github.com/Microsoft/hcsshim v0.7.6
+github.com/Microsoft/hcsshim v0.7.9
 github.com/Microsoft/go-winio v0.4.11
 github.com/docker/libtrust 9cbd2a1374f46905c68a4eb3694a130610adc62a
 github.com/go-check/check 4ed411733c5785b40214c70bce814c3a3a689609 https://github.com/cpuguy83/check.git

+ 1263 - 0
vendor/github.com/Microsoft/hcsshim/ext4/internal/compactext4/compact.go

@@ -0,0 +1,1263 @@
+package compactext4
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"path"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/Microsoft/hcsshim/ext4/internal/format"
+)
+
+// Writer writes a compact ext4 file system.
+//
+// Create one with NewWriter, add entries with Create and Link, and supply
+// regular file contents through Write.
+type Writer struct {
+	// f is the underlying image stream passed to NewWriter.
+	f                    io.ReadWriteSeeker
+	// bw buffers writes to f; seekBlock flushes it before seeking f.
+	bw                   *bufio.Writer
+	// inodes holds the allocated inodes, indexed by inode number minus one.
+	// Skipped inode numbers are represented by nil entries.
+	inodes               []*inode
+	// curName and curInode identify the inode whose data is currently being
+	// written; curInode is nil when no inode is in progress.
+	curName              string
+	curInode             *inode
+	// pos is the current byte offset in the image.
+	pos                  int64
+	// dataWritten counts the bytes written for curInode so far; dataMax is
+	// the number of bytes that inode is expected to receive.
+	dataWritten, dataMax int64
+	// err records a write or seek failure; once set, write and zero return it
+	// without doing any work.
+	err                  error
+	// initialized is set by init.
+	initialized          bool
+	// supportInlineData is enabled by the InlineData option.
+	supportInlineData    bool
+	// maxDiskSize is the size limit in bytes enforced by write and zero.
+	maxDiskSize          int64
+	// gdBlocks is the number of group descriptor blocks, computed in init
+	// from maxDiskSize.
+	gdBlocks             uint32
+}
+
+// Mode flags for Linux files.
+const (
+	S_IXOTH  = format.S_IXOTH
+	S_IWOTH  = format.S_IWOTH
+	S_IROTH  = format.S_IROTH
+	S_IXGRP  = format.S_IXGRP
+	S_IWGRP  = format.S_IWGRP
+	S_IRGRP  = format.S_IRGRP
+	S_IXUSR  = format.S_IXUSR
+	S_IWUSR  = format.S_IWUSR
+	S_IRUSR  = format.S_IRUSR
+	S_ISVTX  = format.S_ISVTX
+	S_ISGID  = format.S_ISGID
+	S_ISUID  = format.S_ISUID
+	S_IFIFO  = format.S_IFIFO
+	S_IFCHR  = format.S_IFCHR
+	S_IFDIR  = format.S_IFDIR
+	S_IFBLK  = format.S_IFBLK
+	S_IFREG  = format.S_IFREG
+	S_IFLNK  = format.S_IFLNK
+	S_IFSOCK = format.S_IFSOCK
+
+	TypeMask = format.TypeMask
+)
+
+// inode is the in-memory description of a single file system object. It is
+// filled in by makeInode and serialized by writeInodeTable.
+type inode struct {
+	// Size is the logical size in bytes.
+	Size                        int64
+	// Timestamps in the packed form produced by timeToFsTime.
+	Atime, Ctime, Mtime, Crtime uint64
+	// Number is the 1-based inode number.
+	Number                      format.InodeNumber
+	// Mode holds both the file type bits and the permission bits.
+	Mode                        uint16
+	Uid, Gid                    uint32
+	LinkCount                   uint32
+	// XattrBlock is the block holding out-of-inode xattrs, or 0 if none.
+	XattrBlock                  uint32
+	// BlockCount is the number of blocks charged to this inode (data, extent
+	// and xattr blocks).
+	BlockCount                  uint32
+	Devmajor, Devminor          uint32
+	Flags                       format.InodeFlag
+	// Data is copied into the inode's block area: inline file data, a short
+	// symlink target, or the serialized extent tree root.
+	Data                        []byte
+	// XattrInline is the serialized in-inode xattr region, if any.
+	XattrInline                 []byte
+	// Children is non-nil only for directories.
+	Children                    directory
+}
+
+// FileType returns the file type bits of the inode's mode with the
+// permission bits masked off.
+func (node *inode) FileType() uint16 {
+	typ := node.Mode & format.TypeMask
+	return typ
+}
+
+// IsDir reports whether the inode is a directory.
+func (node *inode) IsDir() bool {
+	if node.FileType() != S_IFDIR {
+		return false
+	}
+	return true
+}
+
+// A File represents a file to be added to an ext4 file system.
+//
+// It is the caller-facing description passed to Create and returned by Stat.
+type File struct {
+	// Linkname is the symlink target; it is only used for symlinks.
+	Linkname                    string
+	// Size is the number of content bytes that will be written for a regular
+	// file.
+	Size                        int64
+	// Mode holds the file type and permission bits. A Mode with no type bits
+	// is treated as a regular file.
+	Mode                        uint16
+	Uid, Gid                    uint32
+	Atime, Ctime, Mtime, Crtime time.Time
+	// Devmajor and Devminor are written into the inode for block and
+	// character devices.
+	Devmajor, Devminor          uint32
+	// Xattrs maps full extended attribute names to their values.
+	Xattrs                      map[string][]byte
+}
+
+const (
+	inodeFirst        = 11
+	inodeLostAndFound = inodeFirst
+
+	blockSize               = 4096
+	blocksPerGroup          = blockSize * 8
+	inodeSize               = 256
+	maxInodesPerGroup       = blockSize * 8 // Limited by the inode bitmap
+	inodesPerGroupIncrement = blockSize / inodeSize
+
+	defaultMaxDiskSize = 16 * 1024 * 1024 * 1024        // 16GB
+	maxMaxDiskSize     = 16 * 1024 * 1024 * 1024 * 1024 // 16TB
+
+	groupDescriptorSize      = 32 // Use the small group descriptor
+	groupsPerDescriptorBlock = blockSize / groupDescriptorSize
+
+	maxFileSize             = 128 * 1024 * 1024 * 1024 // 128GB file size maximum for now
+	smallSymlinkSize        = 59                       // max symlink size that goes directly in the inode
+	maxBlocksPerExtent      = 0x8000                   // maximum number of blocks in an extent
+	inodeDataSize           = 60
+	inodeUsedSize           = 152 // fields through CrtimeExtra
+	inodeExtraSize          = inodeSize - inodeUsedSize
+	xattrInodeOverhead      = 4 + 4                       // magic number + empty next entry value
+	xattrBlockOverhead      = 32 + 4                      // header + empty next entry value
+	inlineDataXattrOverhead = xattrInodeOverhead + 16 + 4 // entry + "data"
+	inlineDataSize          = inodeDataSize + inodeExtraSize - inlineDataXattrOverhead
+)
+
+// exceededMaxSizeError is returned when a write would grow the image beyond
+// the configured maximum disk size. Size is that maximum, in bytes.
+type exceededMaxSizeError struct {
+	Size int64
+}
+
+// Error implements the error interface.
+func (err exceededMaxSizeError) Error() string {
+	limit := err.Size
+	msg := fmt.Sprintf("disk exceeded maximum size of %d bytes", limit)
+	return msg
+}
+
+var directoryEntrySize = binary.Size(format.DirectoryEntry{})
+var extraIsize = uint16(inodeUsedSize - 128)
+
+type directory map[string]*inode
+
+// splitFirst splits p at its first '/' and returns the text before and after
+// it. When p contains no '/', it returns p and the empty string.
+func splitFirst(p string) (string, string) {
+	idx := strings.IndexByte(p, '/')
+	if idx < 0 {
+		return p, ""
+	}
+	head, tail := p[:idx], p[idx+1:]
+	return head, tail
+}
+
+func (w *Writer) findPath(root *inode, p string) *inode {
+	inode := root
+	for inode != nil && len(p) != 0 {
+		name, rest := splitFirst(p)
+		p = rest
+		inode = inode.Children[name]
+	}
+	return inode
+}
+
+// timeToFsTime converts t into the packed timestamp form used by this writer:
+// seconds in the low 34 bits and nanoseconds in the bits above them.
+//
+// The zero time maps to 0. Times outside the representable range are clamped
+// to the nearest bound, with the nanosecond part dropped.
+func timeToFsTime(t time.Time) uint64 {
+	if t.IsZero() {
+		return 0
+	}
+	s := t.Unix()
+	if s < -0x80000000 {
+		return 0x80000000
+	}
+	if s > 0x37fffffff {
+		return 0x37fffffff
+	}
+	secs := uint64(s)
+	if s < 0 {
+		// Pre-1970 times are stored as a signed 32-bit value. Truncate to 32
+		// bits so that sign extension does not spill into the upper seconds
+		// bits and the nanosecond field. fsTimeToTime decodes this range back
+		// to a negative value.
+		secs = uint64(uint32(s))
+	}
+	return secs | uint64(t.Nanosecond())<<34
+}
+
+func fsTimeToTime(t uint64) time.Time {
+	if t == 0 {
+		return time.Time{}
+	}
+	s := int64(t & 0x3ffffffff)
+	if s > 0x7fffffff && s < 0x100000000 {
+		s = int64(int32(uint32(s)))
+	}
+	return time.Unix(s, int64(t>>34))
+}
+
+// getInode returns the inode with the given 1-based number, or nil when the
+// number is 0 or beyond the inodes allocated so far.
+func (w *Writer) getInode(i format.InodeNumber) *inode {
+	if i != 0 && int(i) <= len(w.inodes) {
+		return w.inodes[i-1]
+	}
+	return nil
+}
+
+var xattrPrefixes = []struct {
+	Index  uint8
+	Prefix string
+}{
+	{2, "system.posix_acl_access"},
+	{3, "system.posix_acl_default"},
+	{8, "system.richacl"},
+	{7, "system."},
+	{1, "user."},
+	{4, "trusted."},
+	{6, "security."},
+}
+
+// compressXattrName splits an extended attribute name into a prefix index and
+// the remaining suffix, using the first matching entry of xattrPrefixes. Names
+// with no known prefix are returned unchanged with index 0.
+func compressXattrName(name string) (uint8, string) {
+	for i := range xattrPrefixes {
+		entry := &xattrPrefixes[i]
+		if !strings.HasPrefix(name, entry.Prefix) {
+			continue
+		}
+		return entry.Index, name[len(entry.Prefix):]
+	}
+	return 0, name
+}
+
+// decompressXattrName rebuilds a full extended attribute name from a prefix
+// index and suffix. An index that is not in xattrPrefixes leaves the name
+// unchanged.
+func decompressXattrName(index uint8, name string) string {
+	for i := range xattrPrefixes {
+		if entry := &xattrPrefixes[i]; entry.Index == index {
+			return entry.Prefix + name
+		}
+	}
+	return name
+}
+
+// hashXattrEntry computes the 32-bit hash stored with an extended attribute
+// entry. The name is mixed in one byte at a time, then the value is mixed in
+// as little-endian 32-bit words, with a trailing partial word zero-padded.
+func hashXattrEntry(name string, value []byte) uint32 {
+	var h uint32
+	for _, c := range []byte(name) {
+		h = (h << 5) ^ (h >> 27) ^ uint32(c)
+	}
+
+	words := len(value) / 4
+	for k := 0; k < words; k++ {
+		word := binary.LittleEndian.Uint32(value[4*k : 4*k+4])
+		h = (h << 16) ^ (h >> 16) ^ word
+	}
+
+	if tail := value[4*words:]; len(tail) != 0 {
+		var padded [4]byte
+		copy(padded[:], tail)
+		h = (h << 16) ^ (h >> 16) ^ binary.LittleEndian.Uint32(padded[:])
+	}
+	return h
+}
+
+// xattr is a single extended attribute in compressed-name form: Index selects
+// the name prefix and Name holds the remaining suffix.
+type xattr struct {
+	Name  string
+	Index uint8
+	Value []byte
+}
+
+// EntryLen returns the size of the serialized entry: a 16-byte header plus
+// the name rounded up to a multiple of 4 bytes.
+func (x *xattr) EntryLen() int {
+	const headerSize = 16
+	paddedName := (len(x.Name) + 3) / 4 * 4
+	return paddedName + headerSize
+}
+
+// ValueLen returns the size of the value rounded up to a multiple of 4 bytes.
+func (x *xattr) ValueLen() int {
+	padded := (len(x.Value) + 3) / 4 * 4
+	return padded
+}
+
+// xattrState accumulates the extended attributes of one inode and tracks how
+// much room remains in the two places they can live: the spare space inside
+// the inode and a single separate xattr block.
+type xattrState struct {
+	inode, block         []xattr
+	inodeLeft, blockLeft int
+}
+
+// init resets the remaining-space counters to the capacity of an empty inode
+// xattr region and an empty xattr block.
+func (s *xattrState) init() {
+	s.blockLeft = blockSize - xattrBlockOverhead
+	s.inodeLeft = inodeExtraSize - xattrInodeOverhead
+}
+
+// addXattr records the attribute, preferring the in-inode region and falling
+// back to the xattr block. It reports false when the attribute fits in
+// neither.
+func (s *xattrState) addXattr(name string, value []byte) bool {
+	idx, suffix := compressXattrName(name)
+	entry := xattr{
+		Name:  suffix,
+		Index: idx,
+		Value: value,
+	}
+	need := entry.EntryLen() + entry.ValueLen()
+	switch {
+	case need <= s.inodeLeft:
+		s.inode = append(s.inode, entry)
+		s.inodeLeft -= need
+	case need <= s.blockLeft:
+		s.block = append(s.block, entry)
+		s.blockLeft -= need
+	default:
+		return false
+	}
+	return true
+}
+
+// putXattrs serializes xattrs into b. Entry headers and names are laid out
+// from the start of b, while values are packed from the end of b toward the
+// front, in the order the attributes are given.
+//
+// offsetDelta is added to every recorded value offset, so a caller can pass a
+// sub-slice of a larger region (writeXattrs passes b[32:] with a delta of
+// 32). No size checks are done here; the caller must pass a b large enough
+// for all entries and values.
+func putXattrs(xattrs []xattr, b []byte, offsetDelta uint16) {
+	// offset is where the most recently placed value starts.
+	offset := uint16(len(b)) + offsetDelta
+	// eb is the space left for entries (consumed from the front); db is the
+	// space left for values (consumed from the back).
+	eb := b
+	db := b
+	for _, xattr := range xattrs {
+		vl := xattr.ValueLen()
+		offset -= uint16(vl)
+		eb[0] = uint8(len(xattr.Name))
+		eb[1] = xattr.Index
+		binary.LittleEndian.PutUint16(eb[2:], offset)
+		binary.LittleEndian.PutUint32(eb[8:], uint32(len(xattr.Value)))
+		binary.LittleEndian.PutUint32(eb[12:], hashXattrEntry(xattr.Name, xattr.Value))
+		copy(eb[16:], xattr.Name)
+		eb = eb[xattr.EntryLen():]
+		copy(db[len(db)-vl:], xattr.Value)
+		db = db[:len(db)-vl]
+	}
+}
+
+// getXattrs parses the entries serialized by putXattrs out of b and stores
+// each value in xattrs under its full (decompressed) name.
+//
+// offsetDelta must match the value passed to putXattrs; it is subtracted from
+// each stored offset to obtain an index into b. Parsing stops at the first
+// entry whose name length is zero. The stored values alias b rather than
+// being copied.
+func getXattrs(b []byte, xattrs map[string][]byte, offsetDelta uint16) {
+	eb := b
+	for len(eb) != 0 {
+		nameLen := eb[0]
+		if nameLen == 0 {
+			// A zero name length terminates the entry list.
+			break
+		}
+		index := eb[1]
+		offset := binary.LittleEndian.Uint16(eb[2:]) - offsetDelta
+		valueLen := binary.LittleEndian.Uint32(eb[8:])
+		attr := xattr{
+			Index: index,
+			Name:  string(eb[16 : 16+nameLen]),
+			Value: b[offset : uint32(offset)+valueLen],
+		}
+		xattrs[decompressXattrName(index, attr.Name)] = attr.Value
+		eb = eb[attr.EntryLen():]
+	}
+}
+
+// writeXattrs serializes the attributes collected in state for inode.
+//
+// In-inode attributes are rendered into inode.XattrInline. Block attributes
+// are written to a dedicated block: a new one at the current position if the
+// inode has none yet, otherwise the inode's existing block is rewritten in
+// place and the write position is restored afterwards.
+func (w *Writer) writeXattrs(inode *inode, state *xattrState) error {
+	// Write the inline attributes.
+	if len(state.inode) != 0 {
+		inode.XattrInline = make([]byte, inodeExtraSize)
+		binary.LittleEndian.PutUint32(inode.XattrInline[0:], format.XAttrHeaderMagic) // Magic
+		putXattrs(state.inode, inode.XattrInline[4:], 0)
+	}
+
+	// Write the block attributes. If there was previously an xattr block, then
+	// rewrite it even if it is now empty.
+	if len(state.block) != 0 || inode.XattrBlock != 0 {
+		// Order entries by prefix index, then name length, then name. Each
+		// key is only consulted when the previous ones compare equal, so the
+		// comparison is a consistent ordering as sort.Slice requires.
+		sort.Slice(state.block, func(i, j int) bool {
+			x, y := &state.block[i], &state.block[j]
+			if x.Index != y.Index {
+				return x.Index < y.Index
+			}
+			if len(x.Name) != len(y.Name) {
+				return len(x.Name) < len(y.Name)
+			}
+			return x.Name < y.Name
+		})
+
+		var b [blockSize]byte
+		binary.LittleEndian.PutUint32(b[0:], format.XAttrHeaderMagic) // Magic
+		binary.LittleEndian.PutUint32(b[4:], 1)                       // ReferenceCount
+		binary.LittleEndian.PutUint32(b[8:], 1)                       // Blocks
+		putXattrs(state.block, b[32:], 32)
+
+		orig := w.block()
+		if inode.XattrBlock == 0 {
+			inode.XattrBlock = orig
+			inode.BlockCount++
+		} else {
+			// Reuse the original block.
+			w.seekBlock(inode.XattrBlock)
+			defer w.seekBlock(orig)
+		}
+
+		if _, err := w.write(b[:]); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// write appends b to the image at the current position and advances pos by
+// the number of bytes accepted. It fails immediately if an earlier error is
+// pending, and fails without writing if b would extend the image past
+// maxDiskSize. Any failure is remembered in w.err.
+func (w *Writer) write(b []byte) (int, error) {
+	if w.err != nil {
+		return 0, w.err
+	}
+	if end := w.pos + int64(len(b)); end > w.maxDiskSize {
+		w.err = exceededMaxSizeError{w.maxDiskSize}
+		return 0, w.err
+	}
+	written, err := w.bw.Write(b)
+	w.pos += int64(written)
+	w.err = err
+	return written, err
+}
+
+// zero appends n zero bytes to the image, with the same error and size-limit
+// behavior as write. It returns the number of bytes actually written.
+func (w *Writer) zero(n int64) (int64, error) {
+	if w.err != nil {
+		return 0, w.err
+	}
+	if end := w.pos + n; end > w.maxDiskSize {
+		w.err = exceededMaxSizeError{w.maxDiskSize}
+		return 0, w.err
+	}
+	copied, err := io.CopyN(w.bw, zero, n)
+	w.pos += copied
+	w.err = err
+	return copied, err
+}
+
+// makeInode fills in an inode from the description in f.
+//
+// When node is nil a new inode is allocated with the next free inode number
+// and appended to w.inodes. When node is non-nil it is updated in place,
+// which is refused if the inode already has extent-mapped data. Extended
+// attributes are serialized immediately, and the target of a symlink too long
+// to fit in the inode is written out as file data before returning. Regular
+// file contents are not written here.
+func (w *Writer) makeInode(f *File, node *inode) (*inode, error) {
+	mode := f.Mode
+	if mode&format.TypeMask == 0 {
+		// No type bits means a regular file.
+		mode |= format.S_IFREG
+	}
+	typ := mode & format.TypeMask
+	ino := format.InodeNumber(len(w.inodes) + 1)
+	if node == nil {
+		node = &inode{
+			Number: ino,
+		}
+		if typ == S_IFDIR {
+			node.Children = make(directory)
+			node.LinkCount = 1 // A directory is linked to itself.
+		}
+	} else if node.Flags&format.InodeFlagExtents != 0 {
+		// Since we cannot deallocate or reuse blocks, don't allow updates that
+		// would invalidate data that has already been written.
+		return nil, errors.New("cannot overwrite file with non-inline data")
+	}
+	node.Mode = mode
+	node.Uid = f.Uid
+	node.Gid = f.Gid
+	node.Flags = format.InodeFlagHugeFile
+	node.Atime = timeToFsTime(f.Atime)
+	node.Ctime = timeToFsTime(f.Ctime)
+	node.Mtime = timeToFsTime(f.Mtime)
+	node.Crtime = timeToFsTime(f.Crtime)
+	node.Devmajor = f.Devmajor
+	node.Devminor = f.Devminor
+	node.Data = nil
+	node.XattrInline = nil
+
+	var xstate xattrState
+	xstate.init()
+
+	var size int64
+	switch typ {
+	case format.S_IFREG:
+		size = f.Size
+		if f.Size > maxFileSize {
+			return nil, fmt.Errorf("file too big: %d > %d", f.Size, maxFileSize)
+		}
+		if f.Size <= inlineDataSize && w.supportInlineData {
+			node.Data = make([]byte, f.Size)
+			// extra is the number of bytes that overflow the inode's block
+			// area and must live in the "system.data" xattr.
+			extra := 0
+			if f.Size > inodeDataSize {
+				extra = int(f.Size - inodeDataSize)
+			}
+			// Add a dummy entry for now.
+			if !xstate.addXattr("system.data", node.Data[:extra]) {
+				panic("not enough room for inline data")
+			}
+			node.Flags |= format.InodeFlagInlineData
+		}
+	case format.S_IFLNK:
+		node.Mode |= 0777 // Symlinks should appear as ugw rwx
+		size = int64(len(f.Linkname))
+		if size <= smallSymlinkSize {
+			// Special case: small symlinks go directly in Block without setting
+			// an inline data flag.
+			node.Data = make([]byte, len(f.Linkname))
+			copy(node.Data, f.Linkname)
+		}
+	case format.S_IFDIR, format.S_IFIFO, format.S_IFSOCK, format.S_IFCHR, format.S_IFBLK:
+		// These types carry no data here; size stays 0.
+	default:
+		return nil, fmt.Errorf("invalid mode %o", mode)
+	}
+
+	// Accumulate the extended attributes.
+	if len(f.Xattrs) != 0 {
+		// Sort the xattrs to avoid non-determinism in map iteration.
+		var xattrs []string
+		for name := range f.Xattrs {
+			xattrs = append(xattrs, name)
+		}
+		sort.Strings(xattrs)
+		for _, name := range xattrs {
+			if !xstate.addXattr(name, f.Xattrs[name]) {
+				return nil, fmt.Errorf("could not fit xattr %s", name)
+			}
+		}
+	}
+
+	if err := w.writeXattrs(node, &xstate); err != nil {
+		return nil, err
+	}
+
+	node.Size = size
+	if typ == format.S_IFLNK && size > smallSymlinkSize {
+		// Write the link name as data.
+		w.startInode("", node, size)
+		if _, err := w.Write([]byte(f.Linkname)); err != nil {
+			return nil, err
+		}
+		if err := w.finishInode(); err != nil {
+			return nil, err
+		}
+	}
+
+	// Only a newly allocated inode needs to be added to the table; a reused
+	// one is already present at its number.
+	if int(node.Number-1) >= len(w.inodes) {
+		w.inodes = append(w.inodes, node)
+	}
+	return node, nil
+}
+
+// root returns the inode of the file system's root directory.
+func (w *Writer) root() *inode {
+	rootNode := w.getInode(format.InodeRoot)
+	return rootNode
+}
+
+// lookup resolves name relative to the root directory.
+//
+// It returns the parent directory inode, the child inode (nil if the child
+// does not exist and mustExist is false), and the child's name within the
+// parent. For the root itself it returns the root as both directory and
+// child, with an empty child name. An error is returned when the final path
+// element is empty or longer than 255 bytes, when the parent path does not
+// resolve to a directory, or when mustExist is set and the child is missing.
+func (w *Writer) lookup(name string, mustExist bool) (*inode, *inode, string, error) {
+	root := w.root()
+	// Anchor the name at "/" before cleaning, then strip the leading slash
+	// again.
+	cleanname := path.Clean("/" + name)[1:]
+	if len(cleanname) == 0 {
+		return root, root, "", nil
+	}
+	dirname, childname := path.Split(cleanname)
+	if len(childname) == 0 || len(childname) > 0xff {
+		return nil, nil, "", fmt.Errorf("%s: invalid name", name)
+	}
+	dir := w.findPath(root, dirname)
+	if dir == nil || !dir.IsDir() {
+		return nil, nil, "", fmt.Errorf("%s: path not found", name)
+	}
+	child := dir.Children[childname]
+	if child == nil && mustExist {
+		return nil, nil, "", fmt.Errorf("%s: file not found", name)
+	}
+	return dir, child, childname, nil
+}
+
+// Create adds a file to the file system.
+//
+// Any inode still in progress is finished first. If name already exists, a
+// directory may only be replaced by a directory and a non-directory only by a
+// non-directory; the existing inode is updated in place unless it is a
+// non-directory with other hard links, in which case a fresh inode takes over
+// the name. For regular files, Create leaves the new inode in progress so
+// that the following Write calls supply its f.Size bytes of content.
+func (w *Writer) Create(name string, f *File) error {
+	if err := w.finishInode(); err != nil {
+		return err
+	}
+	dir, existing, childname, err := w.lookup(name, false)
+	if err != nil {
+		return err
+	}
+	// reuse is the existing inode to update in place, if any.
+	var reuse *inode
+	if existing != nil {
+		if existing.IsDir() {
+			if f.Mode&TypeMask != S_IFDIR {
+				return fmt.Errorf("%s: cannot replace a directory with a file", name)
+			}
+			reuse = existing
+		} else if f.Mode&TypeMask == S_IFDIR {
+			return fmt.Errorf("%s: cannot replace a file with a directory", name)
+		} else if existing.LinkCount < 2 {
+			reuse = existing
+		}
+	} else {
+		if f.Mode&TypeMask == S_IFDIR && dir.LinkCount >= format.MaxLinks {
+			return fmt.Errorf("%s: exceeded parent directory maximum link count", name)
+		}
+	}
+	child, err := w.makeInode(f, reuse)
+	if err != nil {
+		return fmt.Errorf("%s: %s", name, err)
+	}
+	if existing != child {
+		// A new inode now owns the name: drop the old link, add the new one,
+		// and count a new subdirectory against the parent.
+		if existing != nil {
+			existing.LinkCount--
+		}
+		dir.Children[childname] = child
+		child.LinkCount++
+		if child.IsDir() {
+			dir.LinkCount++
+		}
+	}
+	if child.Mode&format.TypeMask == format.S_IFREG {
+		w.startInode(name, child, f.Size)
+	}
+	return nil
+}
+
+// Link adds a hard link to the file system.
+//
+// newname becomes another name for the existing file oldname. Any inode still
+// in progress is finished first. The call fails if newname currently names a
+// directory or a file with no other links (replacing it would orphan it), if
+// oldname does not exist or is a directory or symlink, or if the target would
+// exceed the maximum link count.
+func (w *Writer) Link(oldname, newname string) error {
+	if err := w.finishInode(); err != nil {
+		return err
+	}
+	newdir, existing, newchildname, err := w.lookup(newname, false)
+	if err != nil {
+		return err
+	}
+	if existing != nil && (existing.IsDir() || existing.LinkCount < 2) {
+		return fmt.Errorf("%s: cannot orphan existing file or directory", newname)
+	}
+
+	_, oldfile, _, err := w.lookup(oldname, true)
+	if err != nil {
+		return err
+	}
+	switch oldfile.Mode & format.TypeMask {
+	case format.S_IFDIR, format.S_IFLNK:
+		return fmt.Errorf("%s: link target cannot be a directory or symlink: %s", newname, oldname)
+	}
+
+	// When newname already refers to oldfile the net link count is unchanged,
+	// so the limit only applies otherwise.
+	if existing != oldfile && oldfile.LinkCount >= format.MaxLinks {
+		return fmt.Errorf("%s: link target would exceed maximum link count: %s", newname, oldname)
+	}
+
+	if existing != nil {
+		existing.LinkCount--
+	}
+	oldfile.LinkCount++
+	newdir.Children[newchildname] = oldfile
+	return nil
+}
+
+// Stat returns information about a file that has been written.
+//
+// Any inode still in progress is finished first. Extended attributes are
+// recovered from the inode's xattr block (read back from the underlying
+// stream) and from its inline xattr region; the internal "system.data"
+// attribute used for inline file data is not reported. The target of a
+// symlink is only available when it was small enough to be stored in the
+// inode; otherwise an error is returned.
+func (w *Writer) Stat(name string) (*File, error) {
+	if err := w.finishInode(); err != nil {
+		return nil, err
+	}
+	_, node, _, err := w.lookup(name, true)
+	if err != nil {
+		return nil, err
+	}
+	f := &File{
+		Size:     node.Size,
+		Mode:     node.Mode,
+		Uid:      node.Uid,
+		Gid:      node.Gid,
+		Atime:    fsTimeToTime(node.Atime),
+		Ctime:    fsTimeToTime(node.Ctime),
+		Mtime:    fsTimeToTime(node.Mtime),
+		Crtime:   fsTimeToTime(node.Crtime),
+		Devmajor: node.Devmajor,
+		Devminor: node.Devminor,
+	}
+	f.Xattrs = make(map[string][]byte)
+	if node.XattrBlock != 0 || len(node.XattrInline) != 0 {
+		if node.XattrBlock != 0 {
+			orig := w.block()
+			w.seekBlock(node.XattrBlock)
+			if w.err != nil {
+				return nil, w.err
+			}
+			var b [blockSize]byte
+			// A single Read may legally return fewer bytes than requested;
+			// ReadFull keeps reading until the whole block is in memory.
+			_, err := io.ReadFull(w.f, b[:])
+			// Always restore the write position, even if the read failed.
+			w.seekBlock(orig)
+			if err != nil {
+				return nil, err
+			}
+			if w.err != nil {
+				return nil, w.err
+			}
+			getXattrs(b[32:], f.Xattrs, 32)
+		}
+		if len(node.XattrInline) != 0 {
+			getXattrs(node.XattrInline[4:], f.Xattrs, 0)
+			delete(f.Xattrs, "system.data")
+		}
+	}
+	if node.FileType() == S_IFLNK {
+		if node.Size > smallSymlinkSize {
+			return nil, fmt.Errorf("%s: cannot retrieve link information", name)
+		}
+		f.Linkname = string(node.Data)
+	}
+	return f, nil
+}
+
+// Write supplies content for the inode currently in progress (started by
+// Create for regular files). Writing more bytes in total than the size
+// declared for that inode is an error. Data for inline files is buffered in
+// the inode; all other data goes straight to the image.
+func (w *Writer) Write(b []byte) (int, error) {
+	if len(b) == 0 {
+		return 0, nil
+	}
+	total := w.dataWritten + int64(len(b))
+	if total > w.dataMax {
+		return 0, fmt.Errorf("%s: wrote too much: %d > %d", w.curName, total, w.dataMax)
+	}
+
+	if inline := w.curInode.Flags&format.InodeFlagInlineData != 0; inline {
+		copy(w.curInode.Data[w.dataWritten:], b)
+		w.dataWritten = total
+		return len(b), nil
+	}
+
+	written, err := w.write(b)
+	w.dataWritten += int64(written)
+	return written, err
+}
+
+// startInode marks inode as the one currently receiving data and records
+// that exactly size bytes are expected for it. It panics if another inode is
+// still in progress.
+func (w *Writer) startInode(name string, inode *inode, size int64) {
+	if w.curInode != nil {
+		panic("inode already in progress")
+	}
+	w.curName, w.curInode = name, inode
+	w.dataWritten, w.dataMax = 0, size
+}
+
+// block returns the index of the block containing the current position.
+func (w *Writer) block() uint32 {
+	idx := w.pos / blockSize
+	return uint32(idx)
+}
+
+// seekBlock moves the write position to the start of the given block. The
+// buffered writer is flushed before the underlying stream is repositioned.
+// Failures are recorded in w.err; if an error is already pending, only the
+// logical position is updated.
+func (w *Writer) seekBlock(block uint32) {
+	w.pos = int64(block) * blockSize
+	if w.err != nil {
+		return
+	}
+	if w.err = w.bw.Flush(); w.err == nil {
+		_, w.err = w.f.Seek(w.pos, io.SeekStart)
+	}
+}
+
+// nextBlock pads the image with zeros up to the next block boundary. It does
+// nothing when the position is already aligned.
+func (w *Writer) nextBlock() {
+	if rem := w.pos % blockSize; rem != 0 {
+		// Simplify callers; w.err is updated on failure.
+		w.zero(blockSize - rem)
+	}
+}
+
+// fillExtents initializes hdr as a leaf extent header and fills extents with
+// consecutive runs of at most maxBlocksPerExtent blocks each. The first run
+// starts at logical block offset; inodeSize is the file length in blocks and
+// bounds the length of the final run. Each run's physical start is
+// startBlock plus its logical block number.
+func fillExtents(hdr *format.ExtentHeader, extents []format.ExtentLeafNode, startBlock, offset, inodeSize uint32) {
+	*hdr = format.ExtentHeader{
+		Magic:   format.ExtentHeaderMagic,
+		Entries: uint16(len(extents)),
+		Max:     uint16(cap(extents)),
+		Depth:   0,
+	}
+	logical := offset
+	for i := 0; i < len(extents); i++ {
+		run := inodeSize - logical
+		if run > maxBlocksPerExtent {
+			run = maxBlocksPerExtent
+		}
+		extents[i] = format.ExtentLeafNode{
+			Block:    logical,
+			Length:   uint16(run),
+			StartLow: startBlock + logical,
+		}
+		logical += maxBlocksPerExtent
+	}
+}
+
+// writeExtents finishes the data of inode, which must have been written
+// contiguously and must end at the current position. It pads the data to a
+// block boundary and builds the extent tree describing it.
+//
+// Up to four extents are stored directly in the inode. Larger files get a
+// two-level tree: leaf blocks are written right after the data and the inode
+// holds up to four index entries pointing at them. Files needing more than
+// that cause a panic. The serialized tree root is stored in inode.Data.
+func (w *Writer) writeExtents(inode *inode) error {
+	start := w.pos - w.dataWritten
+	if start%blockSize != 0 {
+		panic("unaligned")
+	}
+	w.nextBlock()
+
+	startBlock := uint32(start / blockSize)
+	blocks := w.block() - startBlock
+	usedBlocks := blocks
+
+	const extentNodeSize = 12
+	const extentsPerBlock = blockSize/extentNodeSize - 1
+
+	extents := (blocks + maxBlocksPerExtent - 1) / maxBlocksPerExtent
+	var b bytes.Buffer
+	if extents == 0 {
+		// Nothing to do.
+	} else if extents <= 4 {
+		var root struct {
+			hdr     format.ExtentHeader
+			extents [4]format.ExtentLeafNode
+		}
+		fillExtents(&root.hdr, root.extents[:extents], startBlock, 0, blocks)
+		binary.Write(&b, binary.LittleEndian, root)
+	} else if extents <= 4*extentsPerBlock {
+		// Number of leaf blocks, rounded up. This is at most 4 here, so it
+		// never indexes past the root's four index slots, and an exact
+		// multiple does not produce an empty trailing leaf.
+		extentBlocks := (extents + extentsPerBlock - 1) / extentsPerBlock
+		usedBlocks += extentBlocks
+		var b2 bytes.Buffer
+
+		var root struct {
+			hdr   format.ExtentHeader
+			nodes [4]format.ExtentIndexNode
+		}
+		root.hdr = format.ExtentHeader{
+			Magic:   format.ExtentHeaderMagic,
+			Entries: uint16(extentBlocks),
+			Max:     4,
+			Depth:   1,
+		}
+		for i := uint32(0); i < extentBlocks; i++ {
+			root.nodes[i] = format.ExtentIndexNode{
+				Block:   i * extentsPerBlock * maxBlocksPerExtent,
+				LeafLow: w.block(),
+			}
+			// Extents still to be described once the previous leaves have
+			// each taken extentsPerBlock of them.
+			extentsInBlock := extents - i*extentsPerBlock
+			if extentsInBlock > extentsPerBlock {
+				extentsInBlock = extentsPerBlock
+			}
+
+			var node struct {
+				hdr     format.ExtentHeader
+				extents [extentsPerBlock]format.ExtentLeafNode
+				_       [blockSize - (extentsPerBlock+1)*extentNodeSize]byte
+			}
+
+			// offset is the first logical block covered by this leaf.
+			// fillExtents adds the logical block number to the physical
+			// start itself, so pass the file's start block unshifted.
+			offset := i * extentsPerBlock * maxBlocksPerExtent
+			fillExtents(&node.hdr, node.extents[:extentsInBlock], startBlock, offset, blocks)
+			binary.Write(&b2, binary.LittleEndian, node)
+			if _, err := w.write(b2.Next(blockSize)); err != nil {
+				return err
+			}
+		}
+		binary.Write(&b, binary.LittleEndian, root)
+	} else {
+		panic("file too big")
+	}
+
+	inode.Data = b.Bytes()
+	inode.Flags |= format.InodeFlagExtents
+	inode.BlockCount += usedBlocks
+	return w.err
+}
+
+// finishInode completes the inode currently in progress, if any, after
+// lazily initializing the writer on first use. It fails if the number of
+// bytes written differs from the size declared for the inode. Non-inline
+// data gets its extent tree written; the in-progress state is then cleared.
+func (w *Writer) finishInode() error {
+	if !w.initialized {
+		err := w.init()
+		if err != nil {
+			return err
+		}
+	}
+	cur := w.curInode
+	if cur == nil {
+		return nil
+	}
+	if written, want := w.dataWritten, w.dataMax; written != want {
+		return fmt.Errorf("did not write the right amount: %d != %d", written, want)
+	}
+
+	inline := cur.Flags&format.InodeFlagInlineData != 0
+	if !inline && w.dataMax != 0 {
+		if err := w.writeExtents(cur); err != nil {
+			return err
+		}
+	}
+
+	w.curInode = nil
+	w.dataWritten, w.dataMax = 0, 0
+	return w.err
+}
+
+// modeToFileType maps the type bits of a Linux file mode to the file type
+// code stored in directory entries. Unrecognized type bits map to
+// format.FileTypeUnknown.
+func modeToFileType(mode uint16) format.FileType {
+	typ := mode & format.TypeMask
+	switch typ {
+	case format.S_IFREG:
+		return format.FileTypeRegular
+	case format.S_IFDIR:
+		return format.FileTypeDirectory
+	case format.S_IFLNK:
+		return format.FileTypeSymbolicLink
+	case format.S_IFCHR:
+		return format.FileTypeCharacter
+	case format.S_IFBLK:
+		return format.FileTypeBlock
+	case format.S_IFIFO:
+		return format.FileTypeFIFO
+	case format.S_IFSOCK:
+		return format.FileTypeSocket
+	}
+	return format.FileTypeUnknown
+}
+
+// constReader is an endless io.Reader that yields the same byte forever.
+type constReader byte
+
+// zero is a reader producing an unlimited stream of zero bytes.
+var zero = constReader(0)
+
+// Read fills b entirely with the reader's byte. It never fails.
+func (r constReader) Read(b []byte) (int, error) {
+	fill := byte(r)
+	for i := 0; i < len(b); i++ {
+		b[i] = fill
+	}
+	return len(b), nil
+}
+
+// writeDirectory writes the directory entry blocks for dir: "." and ".."
+// (pointing at parent) followed by every child. Entries never straddle a
+// block boundary; each block is closed with an empty entry covering the
+// unused tail. On return dir is left as the inode in progress, with its size
+// set to the number of bytes written, so the next finishInode call records
+// its extents.
+func (w *Writer) writeDirectory(dir, parent *inode) error {
+	if err := w.finishInode(); err != nil {
+		return err
+	}
+
+	// The size of the directory is not known yet.
+	w.startInode("", dir, 0x7fffffffffffffff)
+	// left is the number of bytes still free in the current block.
+	left := blockSize
+	finishBlock := func() error {
+		if left > 0 {
+			e := format.DirectoryEntry{
+				RecordLength: uint16(left),
+			}
+			err := binary.Write(w, binary.LittleEndian, e)
+			if err != nil {
+				return err
+			}
+			left -= directoryEntrySize
+			if left < 4 {
+				panic("not enough space for trailing entry")
+			}
+			_, err = io.CopyN(w, zero, int64(left))
+			if err != nil {
+				return err
+			}
+		}
+		left = blockSize
+		return nil
+	}
+
+	writeEntry := func(ino format.InodeNumber, name string) error {
+		rlb := directoryEntrySize + len(name)
+		rl := (rlb + 3) & ^3
+		// Keep room for the trailing entry that closes the block.
+		if left < rl+12 {
+			if err := finishBlock(); err != nil {
+				return err
+			}
+		}
+		e := format.DirectoryEntry{
+			Inode:        ino,
+			RecordLength: uint16(rl),
+			NameLength:   uint8(len(name)),
+			FileType:     modeToFileType(w.getInode(ino).Mode),
+		}
+		err := binary.Write(w, binary.LittleEndian, e)
+		if err != nil {
+			return err
+		}
+		_, err = w.Write([]byte(name))
+		if err != nil {
+			return err
+		}
+		var zero [4]byte
+		_, err = w.Write(zero[:rl-rlb])
+		if err != nil {
+			return err
+		}
+		left -= rl
+		return nil
+	}
+	if err := writeEntry(dir.Number, "."); err != nil {
+		return err
+	}
+	if err := writeEntry(parent.Number, ".."); err != nil {
+		return err
+	}
+
+	// Follow e2fsck's convention and sort the children by inode number. Hard
+	// links within one directory share an inode number, so break ties by name
+	// to keep the output independent of map iteration order.
+	var children []string
+	for name := range dir.Children {
+		children = append(children, name)
+	}
+	sort.Slice(children, func(i, j int) bool {
+		left, right := dir.Children[children[i]].Number, dir.Children[children[j]].Number
+		if left != right {
+			return left < right
+		}
+		return children[i] < children[j]
+	})
+
+	for _, name := range children {
+		child := dir.Children[name]
+		if err := writeEntry(child.Number, name); err != nil {
+			return err
+		}
+	}
+	if err := finishBlock(); err != nil {
+		return err
+	}
+	w.curInode.Size = w.dataWritten
+	w.dataMax = w.dataWritten
+	return nil
+}
+
+func (w *Writer) writeDirectoryRecursive(dir, parent *inode) error {
+	if err := w.writeDirectory(dir, parent); err != nil {
+		return err
+	}
+	for _, child := range dir.Children {
+		if child.IsDir() {
+			if err := w.writeDirectoryRecursive(child, dir); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// writeInodeTable serializes every inode in w.inodes as a fixed-size
+// inodeSize record at the current position, then zero-fills the remainder of
+// the table up to tableSize bytes. Unallocated (nil) inode slots are written
+// as all zeros.
+func (w *Writer) writeInodeTable(tableSize uint32) error {
+	var b bytes.Buffer
+	for _, inode := range w.inodes {
+		if inode != nil {
+			binode := format.Inode{
+				Mode:          inode.Mode,
+				Uid:           uint16(inode.Uid & 0xffff),
+				Gid:           uint16(inode.Gid & 0xffff),
+				SizeLow:       uint32(inode.Size & 0xffffffff),
+				SizeHigh:      uint32(inode.Size >> 32),
+				LinksCount:    uint16(inode.LinkCount),
+				BlocksLow:     inode.BlockCount,
+				Flags:         inode.Flags,
+				XattrBlockLow: inode.XattrBlock,
+				UidHigh:       uint16(inode.Uid >> 16),
+				GidHigh:       uint16(inode.Gid >> 16),
+				ExtraIsize:    uint16(inodeUsedSize - 128),
+				Atime:         uint32(inode.Atime),
+				AtimeExtra:    uint32(inode.Atime >> 32),
+				Ctime:         uint32(inode.Ctime),
+				CtimeExtra:    uint32(inode.Ctime >> 32),
+				Mtime:         uint32(inode.Mtime),
+				MtimeExtra:    uint32(inode.Mtime >> 32),
+				Crtime:        uint32(inode.Crtime),
+				CrtimeExtra:   uint32(inode.Crtime >> 32),
+			}
+			switch inode.Mode & format.TypeMask {
+			case format.S_IFDIR, format.S_IFREG, format.S_IFLNK:
+				// Data holds the extent root, inline file data or a short
+				// symlink target. Whatever does not fit in the block area is
+				// inline data overflow and goes into the "system.data" xattr.
+				n := copy(binode.Block[:], inode.Data)
+				if n < len(inode.Data) {
+					// Rewrite the first xattr with the data.
+					xattr := [1]xattr{{
+						Name:  "data",
+						Index: 7, // "system."
+						Value: inode.Data[n:],
+					}}
+					putXattrs(xattr[:], inode.XattrInline[4:], 0)
+				}
+			case format.S_IFBLK, format.S_IFCHR:
+				// Pack the device number: low 8 bits of the minor, then the
+				// major, then the remaining minor bits above that.
+				dev := inode.Devminor&0xff | inode.Devmajor<<8 | (inode.Devminor&0xffffff00)<<12
+				binary.LittleEndian.PutUint32(binode.Block[4:], dev)
+			}
+
+			// Keep only the first inodeUsedSize bytes of the encoded struct,
+			// then append the inline xattr region and pad with zeros up to
+			// the full inode size.
+			binary.Write(&b, binary.LittleEndian, binode)
+			b.Truncate(inodeUsedSize)
+			n, _ := b.Write(inode.XattrInline)
+			io.CopyN(&b, zero, int64(inodeExtraSize-n))
+		} else {
+			io.CopyN(&b, zero, inodeSize)
+		}
+		if _, err := w.write(b.Next(inodeSize)); err != nil {
+			return err
+		}
+	}
+	rest := tableSize - uint32(len(w.inodes)*inodeSize)
+	if _, err := w.zero(int64(rest)); err != nil {
+		return err
+	}
+	return nil
+}
+
+// NewWriter constructs a Writer that emits an ext4 file system image to f.
+// The Writer starts out with the default maximum disk size; every supplied
+// Option is then applied in order and may override that configuration.
+func NewWriter(f io.ReadWriteSeeker, opts ...Option) *Writer {
+	w := new(Writer)
+	w.f = f
+	// Writes are funnelled through a 512KiB buffer placed in front of f.
+	w.bw = bufio.NewWriterSize(f, 65536*8)
+	w.maxDiskSize = defaultMaxDiskSize
+	for i := range opts {
+		opts[i](w)
+	}
+	return w
+}
+
+// An Option provides extra options to NewWriter. NewWriter calls each Option
+// with the Writer under construction, after the defaults have been set.
+type Option func(*Writer)
+
+// InlineData is an Option that instructs the Writer to write small files into
+// the inode structures directly. This creates smaller images but currently is
+// not compatible with DAX. When set, Close also advertises the inline-data
+// incompat feature in the superblock.
+func InlineData(w *Writer) {
+	w.supportInlineData = true
+}
+
+// MaximumDiskSize returns an Option that makes the writer reserve enough
+// metadata space for a disk of the given size in bytes. If the option is not
+// provided, then 16GB is the default.
+//
+// A negative size, or one above the supported maximum, is clamped to that
+// maximum; zero selects the default; any other value is rounded up to a whole
+// number of blocks.
+func MaximumDiskSize(size int64) Option {
+	return func(w *Writer) {
+		switch {
+		case size < 0 || size > maxMaxDiskSize:
+			w.maxDiskSize = maxMaxDiskSize
+		case size == 0:
+			w.maxDiskSize = defaultMaxDiskSize
+		default:
+			// Round up to the next multiple of the block size.
+			w.maxDiskSize = (size + blockSize - 1) &^ (blockSize - 1)
+		}
+	}
+}
+
+// init prepares the Writer for its first file. It reserves the low inode
+// numbers, creates the root directory inode, computes how many group
+// descriptor blocks must be set aside for the configured maximum disk size,
+// positions the output just past the superblock and descriptor table, and
+// finally creates the lost+found directory.
+//
+// It returns the first error encountered, including any error recorded on the
+// Writer while seeking.
+func (w *Writer) init() error {
+	// Skip the defective block inode.
+	w.inodes = make([]*inode, 1, 32)
+	// Create the root directory. The error is propagated rather than dropped
+	// so that a failure cannot lead to a nil dereference just below.
+	root, err := w.makeInode(&File{
+		Mode: format.S_IFDIR | 0755,
+	}, nil)
+	if err != nil {
+		return err
+	}
+	root.LinkCount++ // The root is linked to itself.
+	// Skip until the first non-reserved inode.
+	w.inodes = append(w.inodes, make([]*inode, inodeFirst-len(w.inodes)-1)...)
+	// Ceiling divisions: blocks needed for the maximum disk size, the groups
+	// needed for those blocks, and the descriptor blocks needed for those
+	// groups.
+	maxBlocks := (w.maxDiskSize-1)/blockSize + 1
+	maxGroups := (maxBlocks-1)/blocksPerGroup + 1
+	w.gdBlocks = uint32((maxGroups-1)/groupsPerDescriptorBlock + 1)
+
+	// Skip past the superblock and block descriptor table.
+	w.seekBlock(1 + w.gdBlocks)
+	w.initialized = true
+
+	// The lost+found directory is required to exist for e2fsck to pass.
+	if err := w.Create("lost+found", &File{Mode: format.S_IFDIR | 0700}); err != nil {
+		return err
+	}
+	return w.err
+}
+
+// groupCount reports how many block groups are needed to hold the given
+// number of data blocks and inodes when every group carries inodesPerGroup
+// inodes.
+func groupCount(blocks uint32, inodes uint32, inodesPerGroup uint32) uint32 {
+	// Each group gives up its share of the inode table plus two blocks for
+	// the bitmaps; the remainder is available for data.
+	inodeTableBlocks := inodesPerGroup * inodeSize / blockSize
+	usableBlocks := blocksPerGroup - inodeTableBlocks - 2
+
+	// Raise the block count when it alone would not produce enough groups to
+	// house all of the inodes.
+	floor := (inodes-1)/inodesPerGroup*usableBlocks + 1
+	if floor > blocks {
+		blocks = floor
+	}
+
+	// Ceiling division of blocks by the usable blocks per group.
+	return (blocks + usableBlocks - 1) / usableBlocks
+}
+
+// bestGroupCount tries every permitted inodes-per-group value, stepping by
+// inodesPerGroupIncrement up to maxInodesPerGroup, and returns the smallest
+// group count found together with the inodes-per-group value that produced
+// it. On ties the smaller inodes-per-group value wins.
+func bestGroupCount(blocks uint32, inodes uint32) (groups uint32, inodesPerGroup uint32) {
+	groups = 0xffffffff
+	ipg := uint32(inodesPerGroupIncrement)
+	for ipg <= maxInodesPerGroup {
+		if candidate := groupCount(blocks, inodes, ipg); candidate < groups {
+			groups, inodesPerGroup = candidate, ipg
+		}
+		ipg += inodesPerGroupIncrement
+	}
+	return groups, inodesPerGroup
+}
+
+// Close finishes the file system image. It completes the inode currently
+// being written, writes every directory starting from the root, and then
+// emits, in order: the inode table, the per-group block and inode bitmaps,
+// zero padding up to the final disk size, the group descriptors (at block 1)
+// and the superblock (in block 0).
+//
+// It returns exceededMaxSizeError when the image needs more group descriptor
+// blocks than were reserved for the configured maximum disk size, or the
+// first write/seek error encountered.
+func (w *Writer) Close() error {
+	if err := w.finishInode(); err != nil {
+		return err
+	}
+	root := w.root()
+	if err := w.writeDirectoryRecursive(root, root); err != nil {
+		return err
+	}
+	// Finish the last inode (probably a directory).
+	if err := w.finishInode(); err != nil {
+		return err
+	}
+
+	// Write the inode table
+	inodeTableOffset := w.block()
+	groups, inodesPerGroup := bestGroupCount(inodeTableOffset, uint32(len(w.inodes)))
+	err := w.writeInodeTable(groups * inodesPerGroup * inodeSize)
+	if err != nil {
+		return err
+	}
+
+	// Write the bitmaps.
+	bitmapOffset := w.block()
+	bitmapSize := groups * 2
+	validDataSize := bitmapOffset + bitmapSize
+	diskSize := validDataSize
+	// The disk must extend at least one block into the last group.
+	minSize := (groups-1)*blocksPerGroup + 1
+	if diskSize < minSize {
+		diskSize = minSize
+	}
+
+	// Number of descriptor blocks actually needed for these groups. This uses
+	// the same groups-per-block divisor that sized w.gdBlocks in init and the
+	// gds slice below; dividing by the descriptor byte size instead would
+	// overstate the count and report the disk as full far too early.
+	usedGdBlocks := (groups-1)/groupsPerDescriptorBlock + 1
+	if usedGdBlocks > w.gdBlocks {
+		return exceededMaxSizeError{w.maxDiskSize}
+	}
+
+	gds := make([]format.GroupDescriptor, w.gdBlocks*groupsPerDescriptorBlock)
+	inodeTableSizePerGroup := inodesPerGroup * inodeSize / blockSize
+	var totalUsedBlocks, totalUsedInodes uint32
+	for g := uint32(0); g < groups; g++ {
+		// First block of b is the block bitmap, second is the inode bitmap.
+		var b [blockSize * 2]byte
+		var dirCount, usedInodeCount, usedBlockCount uint16
+
+		// Block bitmap
+		if (g+1)*blocksPerGroup <= validDataSize {
+			// This group is fully allocated.
+			for j := range b[:blockSize] {
+				b[j] = 0xff
+			}
+			usedBlockCount = blocksPerGroup
+		} else if g*blocksPerGroup < validDataSize {
+			// This group is partially allocated: mark its leading blocks.
+			for j := uint32(0); j < validDataSize-g*blocksPerGroup; j++ {
+				b[j/8] |= 1 << (j % 8)
+				usedBlockCount++
+			}
+		}
+		if g == 0 {
+			// Unused group descriptor blocks should be cleared.
+			for j := 1 + usedGdBlocks; j < 1+w.gdBlocks; j++ {
+				b[j/8] &^= 1 << (j % 8)
+				usedBlockCount--
+			}
+		}
+		if g == groups-1 && diskSize%blocksPerGroup != 0 {
+			// Blocks that aren't present in the disk should be marked as
+			// allocated.
+			for j := diskSize % blocksPerGroup; j < blocksPerGroup; j++ {
+				b[j/8] |= 1 << (j % 8)
+				usedBlockCount++
+			}
+		}
+		// Inode bitmap
+		for j := uint32(0); j < inodesPerGroup; j++ {
+			ino := format.InodeNumber(1 + g*inodesPerGroup + j)
+			inode := w.getInode(ino)
+			// Reserved inode numbers count as used even when absent.
+			if ino < inodeFirst || inode != nil {
+				b[blockSize+j/8] |= 1 << (j % 8)
+				usedInodeCount++
+			}
+			if inode != nil && inode.Mode&format.TypeMask == format.S_IFDIR {
+				dirCount++
+			}
+		}
+		_, err := w.write(b[:])
+		if err != nil {
+			return err
+		}
+		gds[g] = format.GroupDescriptor{
+			BlockBitmapLow:     bitmapOffset + 2*g,
+			InodeBitmapLow:     bitmapOffset + 2*g + 1,
+			InodeTableLow:      inodeTableOffset + g*inodeTableSizePerGroup,
+			UsedDirsCountLow:   dirCount,
+			FreeInodesCountLow: uint16(inodesPerGroup) - usedInodeCount,
+			FreeBlocksCountLow: blocksPerGroup - usedBlockCount,
+		}
+
+		totalUsedBlocks += uint32(usedBlockCount)
+		totalUsedInodes += uint32(usedInodeCount)
+	}
+
+	// Zero up to the disk size.
+	_, err = w.zero(int64(diskSize-bitmapOffset-bitmapSize) * blockSize)
+	if err != nil {
+		return err
+	}
+
+	// Write the block descriptors
+	w.seekBlock(1)
+	if w.err != nil {
+		return w.err
+	}
+	err = binary.Write(w.bw, binary.LittleEndian, gds)
+	if err != nil {
+		return err
+	}
+
+	// Write the super block
+	var blk [blockSize]byte
+	// The buffer starts with the first 1024 bytes of blk as its contents, so
+	// the superblock is appended at offset 1024 within the block.
+	b := bytes.NewBuffer(blk[:1024])
+	sb := &format.SuperBlock{
+		InodesCount:        inodesPerGroup * groups,
+		BlocksCountLow:     diskSize,
+		FreeBlocksCountLow: blocksPerGroup*groups - totalUsedBlocks,
+		FreeInodesCount:    inodesPerGroup*groups - totalUsedInodes,
+		FirstDataBlock:     0,
+		LogBlockSize:       2, // 2^(10 + 2)
+		LogClusterSize:     2,
+		BlocksPerGroup:     blocksPerGroup,
+		ClustersPerGroup:   blocksPerGroup,
+		InodesPerGroup:     inodesPerGroup,
+		Magic:              format.SuperBlockMagic,
+		State:              1, // cleanly unmounted
+		Errors:             1, // continue on error?
+		CreatorOS:          0, // Linux
+		RevisionLevel:      1, // dynamic inode sizes
+		FirstInode:         inodeFirst,
+		LpfInode:           inodeLostAndFound,
+		InodeSize:          inodeSize,
+		FeatureCompat:      format.CompatSparseSuper2 | format.CompatExtAttr,
+		FeatureIncompat:    format.IncompatFiletype | format.IncompatExtents | format.IncompatFlexBg,
+		FeatureRoCompat:    format.RoCompatLargeFile | format.RoCompatHugeFile | format.RoCompatExtraIsize | format.RoCompatReadonly,
+		MinExtraIsize:      extraIsize,
+		WantExtraIsize:     extraIsize,
+		LogGroupsPerFlex:   31,
+	}
+	if w.supportInlineData {
+		sb.FeatureIncompat |= format.IncompatInlineData
+	}
+	binary.Write(b, binary.LittleEndian, sb)
+	w.seekBlock(0)
+	if _, err := w.write(blk[:]); err != nil {
+		return err
+	}
+	// Leave the output positioned at the end of the disk.
+	w.seekBlock(diskSize)
+	return w.err
+}

+ 411 - 0
vendor/github.com/Microsoft/hcsshim/ext4/internal/format/format.go

@@ -0,0 +1,411 @@
+package format
+
+// SuperBlock describes the fields of the ext4 superblock in their on-disk
+// order. The compactext4 writer serializes it with encoding/binary in
+// little-endian form, so field order and widths must not be changed.
+type SuperBlock struct {
+	InodesCount          uint32
+	BlocksCountLow       uint32
+	RootBlocksCountLow   uint32
+	FreeBlocksCountLow   uint32
+	FreeInodesCount      uint32
+	FirstDataBlock       uint32
+	LogBlockSize         uint32
+	LogClusterSize       uint32
+	BlocksPerGroup       uint32
+	ClustersPerGroup     uint32
+	InodesPerGroup       uint32
+	Mtime                uint32
+	Wtime                uint32
+	MountCount           uint16
+	MaxMountCount        uint16
+	Magic                uint16
+	State                uint16
+	Errors               uint16
+	MinorRevisionLevel   uint16
+	LastCheck            uint32
+	CheckInterval        uint32
+	CreatorOS            uint32
+	RevisionLevel        uint32
+	DefaultReservedUid   uint16
+	DefaultReservedGid   uint16
+	FirstInode           uint32
+	InodeSize            uint16
+	BlockGroupNr         uint16
+	FeatureCompat        CompatFeature
+	FeatureIncompat      IncompatFeature
+	FeatureRoCompat      RoCompatFeature
+	UUID                 [16]uint8
+	VolumeName           [16]byte
+	LastMounted          [64]byte
+	AlgorithmUsageBitmap uint32
+	PreallocBlocks       uint8
+	PreallocDirBlocks    uint8
+	ReservedGdtBlocks    uint16
+	JournalUUID          [16]uint8
+	JournalInum          uint32
+	JournalDev           uint32
+	LastOrphan           uint32
+	HashSeed             [4]uint32
+	DefHashVersion       uint8
+	JournalBackupType    uint8
+	DescSize             uint16
+	DefaultMountOpts     uint32
+	FirstMetaBg          uint32
+	MkfsTime             uint32
+	JournalBlocks        [17]uint32
+	BlocksCountHigh      uint32
+	RBlocksCountHigh     uint32
+	FreeBlocksCountHigh  uint32
+	MinExtraIsize        uint16
+	WantExtraIsize       uint16
+	Flags                uint32
+	RaidStride           uint16
+	MmpInterval          uint16
+	MmpBlock             uint64
+	RaidStripeWidth      uint32
+	LogGroupsPerFlex     uint8
+	ChecksumType         uint8
+	ReservedPad          uint16
+	KbytesWritten        uint64
+	SnapshotInum         uint32
+	SnapshotID           uint32
+	SnapshotRBlocksCount uint64
+	SnapshotList         uint32
+	ErrorCount           uint32
+	FirstErrorTime       uint32
+	FirstErrorInode      uint32
+	FirstErrorBlock      uint64
+	FirstErrorFunc       [32]uint8
+	FirstErrorLine       uint32
+	LastErrorTime        uint32
+	LastErrorInode       uint32
+	LastErrorLine        uint32
+	LastErrorBlock       uint64
+	LastErrorFunc        [32]uint8
+	MountOpts            [64]uint8
+	UserQuotaInum        uint32
+	GroupQuotaInum       uint32
+	OverheadBlocks       uint32
+	BackupBgs            [2]uint32
+	EncryptAlgos         [4]uint8
+	EncryptPwSalt        [16]uint8
+	LpfInode             uint32
+	ProjectQuotaInum     uint32
+	ChecksumSeed         uint32
+	WtimeHigh            uint8
+	MtimeHigh            uint8
+	MkfsTimeHigh         uint8
+	LastcheckHigh        uint8
+	FirstErrorTimeHigh   uint8
+	LastErrorTimeHigh    uint8
+	Pad                  [2]uint8
+	Reserved             [96]uint32
+	Checksum             uint32
+}
+
+// SuperBlockMagic is the value stored in SuperBlock.Magic.
+const SuperBlockMagic uint16 = 0xef53
+
+// CompatFeature, IncompatFeature and RoCompatFeature are the bit-flag types of
+// the SuperBlock FeatureCompat, FeatureIncompat and FeatureRoCompat fields
+// respectively. Values of one type are combined with bitwise OR.
+type CompatFeature uint32
+type IncompatFeature uint32
+type RoCompatFeature uint32
+
+// Individual feature bits, grouped by the superblock field they belong to.
+const (
+	CompatDirPrealloc   CompatFeature = 0x1
+	CompatImagicInodes  CompatFeature = 0x2
+	CompatHasJournal    CompatFeature = 0x4
+	CompatExtAttr       CompatFeature = 0x8
+	CompatResizeInode   CompatFeature = 0x10
+	CompatDirIndex      CompatFeature = 0x20
+	CompatLazyBg        CompatFeature = 0x40
+	CompatExcludeInode  CompatFeature = 0x80
+	CompatExcludeBitmap CompatFeature = 0x100
+	CompatSparseSuper2  CompatFeature = 0x200
+
+	IncompatCompression IncompatFeature = 0x1
+	IncompatFiletype    IncompatFeature = 0x2
+	IncompatRecover     IncompatFeature = 0x4
+	IncompatJournalDev  IncompatFeature = 0x8
+	IncompatMetaBg      IncompatFeature = 0x10
+	IncompatExtents     IncompatFeature = 0x40
+	Incompat_64Bit      IncompatFeature = 0x80
+	IncompatMmp         IncompatFeature = 0x100
+	IncompatFlexBg      IncompatFeature = 0x200
+	IncompatEaInode     IncompatFeature = 0x400
+	IncompatDirdata     IncompatFeature = 0x1000
+	IncompatCsumSeed    IncompatFeature = 0x2000
+	IncompatLargedir    IncompatFeature = 0x4000
+	IncompatInlineData  IncompatFeature = 0x8000
+	IncompatEncrypt     IncompatFeature = 0x10000
+
+	RoCompatSparseSuper  RoCompatFeature = 0x1
+	RoCompatLargeFile    RoCompatFeature = 0x2
+	RoCompatBtreeDir     RoCompatFeature = 0x4
+	RoCompatHugeFile     RoCompatFeature = 0x8
+	RoCompatGdtCsum      RoCompatFeature = 0x10
+	RoCompatDirNlink     RoCompatFeature = 0x20
+	RoCompatExtraIsize   RoCompatFeature = 0x40
+	RoCompatHasSnapshot  RoCompatFeature = 0x80
+	RoCompatQuota        RoCompatFeature = 0x100
+	RoCompatBigalloc     RoCompatFeature = 0x200
+	RoCompatMetadataCsum RoCompatFeature = 0x400
+	RoCompatReplica      RoCompatFeature = 0x800
+	RoCompatReadonly     RoCompatFeature = 0x1000
+	RoCompatProject      RoCompatFeature = 0x2000
+)
+
+// BlockGroupFlag is the bit-flag type of the GroupDescriptor Flags field.
+type BlockGroupFlag uint16
+
+// Flag bits that may be set in GroupDescriptor.Flags.
+const (
+	BlockGroupInodeUninit BlockGroupFlag = 0x1
+	BlockGroupBlockUninit BlockGroupFlag = 0x2
+	BlockGroupInodeZeroed BlockGroupFlag = 0x4
+)
+
+// GroupDescriptor is the on-disk descriptor of one block group, in field
+// order; its fields total 32 bytes. It locates the group's block bitmap,
+// inode bitmap and inode table and carries the group's free and used
+// counters.
+type GroupDescriptor struct {
+	BlockBitmapLow     uint32
+	InodeBitmapLow     uint32
+	InodeTableLow      uint32
+	FreeBlocksCountLow uint16
+	FreeInodesCountLow uint16
+	UsedDirsCountLow   uint16
+	Flags              BlockGroupFlag
+	ExcludeBitmapLow   uint32
+	BlockBitmapCsumLow uint16
+	InodeBitmapCsumLow uint16
+	ItableUnusedLow    uint16
+	Checksum           uint16
+}
+
+// GroupDescriptor64 extends GroupDescriptor with the high halves of its
+// fields, adding a further 32 bytes for a total of 64.
+type GroupDescriptor64 struct {
+	GroupDescriptor
+	BlockBitmapHigh     uint32
+	InodeBitmapHigh     uint32
+	InodeTableHigh      uint32
+	FreeBlocksCountHigh uint16
+	FreeInodesCountHigh uint16
+	UsedDirsCountHigh   uint16
+	ItableUnusedHigh    uint16
+	ExcludeBitmapHigh   uint32
+	BlockBitmapCsumHigh uint16
+	InodeBitmapCsumHigh uint16
+	Reserved            uint32
+}
+
+// Bits of the Inode Mode field: the low twelve bits are permission and
+// set-id/sticky bits, and the S_IF* values are file types stored in the top
+// four bits.
+const (
+	S_IXOTH  = 0x1
+	S_IWOTH  = 0x2
+	S_IROTH  = 0x4
+	S_IXGRP  = 0x8
+	S_IWGRP  = 0x10
+	S_IRGRP  = 0x20
+	S_IXUSR  = 0x40
+	S_IWUSR  = 0x80
+	S_IRUSR  = 0x100
+	S_ISVTX  = 0x200
+	S_ISGID  = 0x400
+	S_ISUID  = 0x800
+	S_IFIFO  = 0x1000
+	S_IFCHR  = 0x2000
+	S_IFDIR  = 0x4000
+	S_IFBLK  = 0x6000
+	S_IFREG  = 0x8000
+	S_IFLNK  = 0xA000
+	S_IFSOCK = 0xC000
+
+	// TypeMask selects the file-type bits of a mode; compare the masked
+	// value against one of the S_IF* constants.
+	TypeMask uint16 = 0xF000
+)
+
+// InodeNumber identifies an inode. The writer numbers inodes starting at 1.
+type InodeNumber uint32
+
+const (
+	// InodeRoot is the inode number of the root directory.
+	InodeRoot = 2
+)
+
+// Inode is the on-disk inode structure, in field order. The fields up to and
+// including Reserved occupy the first 128 bytes; ExtraIsize and the fields
+// after it form the extended part of the inode.
+type Inode struct {
+	Mode                 uint16
+	Uid                  uint16
+	SizeLow              uint32
+	Atime                uint32
+	Ctime                uint32
+	Mtime                uint32
+	Dtime                uint32
+	Gid                  uint16
+	LinksCount           uint16
+	BlocksLow            uint32
+	Flags                InodeFlag
+	Version              uint32
+	Block                [60]byte
+	Generation           uint32
+	XattrBlockLow        uint32
+	SizeHigh             uint32
+	ObsoleteFragmentAddr uint32
+	BlocksHigh           uint16
+	XattrBlockHigh       uint16
+	UidHigh              uint16
+	GidHigh              uint16
+	ChecksumLow          uint16
+	Reserved             uint16
+	ExtraIsize           uint16
+	ChecksumHigh         uint16
+	CtimeExtra           uint32
+	MtimeExtra           uint32
+	AtimeExtra           uint32
+	Crtime               uint32
+	CrtimeExtra          uint32
+	VersionHigh          uint32
+	Projid               uint32
+}
+
+// InodeFlag is the bit-flag type of the Inode Flags field.
+type InodeFlag uint32
+
+// Flag bits that may be set in Inode.Flags.
+const (
+	InodeFlagSecRm              InodeFlag = 0x1
+	InodeFlagUnRm               InodeFlag = 0x2
+	InodeFlagCompressed         InodeFlag = 0x4
+	InodeFlagSync               InodeFlag = 0x8
+	InodeFlagImmutable          InodeFlag = 0x10
+	InodeFlagAppend             InodeFlag = 0x20
+	InodeFlagNoDump             InodeFlag = 0x40
+	InodeFlagNoAtime            InodeFlag = 0x80
+	InodeFlagDirtyCompressed    InodeFlag = 0x100
+	InodeFlagCompressedClusters InodeFlag = 0x200
+	InodeFlagNoCompress         InodeFlag = 0x400
+	InodeFlagEncrypted          InodeFlag = 0x800
+	InodeFlagHashedIndex        InodeFlag = 0x1000
+	InodeFlagMagic              InodeFlag = 0x2000
+	InodeFlagJournalData        InodeFlag = 0x4000
+	InodeFlagNoTail             InodeFlag = 0x8000
+	InodeFlagDirSync            InodeFlag = 0x10000
+	InodeFlagTopDir             InodeFlag = 0x20000
+	InodeFlagHugeFile           InodeFlag = 0x40000
+	InodeFlagExtents            InodeFlag = 0x80000
+	InodeFlagEaInode            InodeFlag = 0x200000
+	InodeFlagEOFBlocks          InodeFlag = 0x400000
+	InodeFlagSnapfile           InodeFlag = 0x01000000
+	InodeFlagSnapfileDeleted    InodeFlag = 0x04000000
+	InodeFlagSnapfileShrunk     InodeFlag = 0x08000000
+	InodeFlagInlineData         InodeFlag = 0x10000000
+	InodeFlagProjectIDInherit   InodeFlag = 0x20000000
+	InodeFlagReserved           InodeFlag = 0x80000000
+)
+
+const (
+	// MaxLinks is presumably the largest link count an inode may carry
+	// (Inode.LinksCount is 16 bits wide) -- confirm against its users.
+	MaxLinks = 65000
+)
+
+// ExtentHeader is the 12-byte header that starts a block of extent entries.
+type ExtentHeader struct {
+	Magic      uint16
+	Entries    uint16
+	Max        uint16
+	Depth      uint16
+	Generation uint32
+}
+
+// ExtentHeaderMagic is the value stored in ExtentHeader.Magic.
+const ExtentHeaderMagic uint16 = 0xf30a
+
+// ExtentIndexNode is a 12-byte interior entry of an extent tree; LeafLow and
+// LeafHigh together hold the block number of the next-level node.
+type ExtentIndexNode struct {
+	Block    uint32
+	LeafLow  uint32
+	LeafHigh uint16
+	Unused   uint16
+}
+
+// ExtentLeafNode is a 12-byte leaf entry of an extent tree; StartLow and
+// StartHigh together hold the starting block of a run of Length blocks.
+type ExtentLeafNode struct {
+	Block     uint32
+	Length    uint16
+	StartHigh uint16
+	StartLow  uint32
+}
+
+// ExtentTail holds the checksum that trails a block of extent entries.
+type ExtentTail struct {
+	Checksum uint32
+}
+
+// DirectoryEntry is the fixed 8-byte header of a directory entry. The name
+// bytes follow the header on disk and are therefore not part of the struct.
+type DirectoryEntry struct {
+	Inode        InodeNumber
+	RecordLength uint16
+	NameLength   uint8
+	FileType     FileType
+	//Name         []byte
+}
+
+// FileType is the file type code stored in DirectoryEntry.FileType.
+type FileType uint8
+
+// File type codes for directory entries.
+const (
+	FileTypeUnknown      FileType = 0x0
+	FileTypeRegular      FileType = 0x1
+	FileTypeDirectory    FileType = 0x2
+	FileTypeCharacter    FileType = 0x3
+	FileTypeBlock        FileType = 0x4
+	FileTypeFIFO         FileType = 0x5
+	FileTypeSocket       FileType = 0x6
+	FileTypeSymbolicLink FileType = 0x7
+)
+
+// DirectoryEntryTail is a 12-byte record carrying a checksum; judging by its
+// name and field layout it takes the place of a final directory entry in a
+// directory block.
+type DirectoryEntryTail struct {
+	ReservedZero1 uint32
+	RecordLength  uint16
+	ReservedZero2 uint8
+	FileType      uint8
+	Checksum      uint32
+}
+
+// DirectoryTreeRoot is the fixed part of the root block of a hashed directory
+// tree: the "." and ".." entries followed by the tree header. The
+// DirectoryTreeEntry records follow it on disk.
+type DirectoryTreeRoot struct {
+	Dot            DirectoryEntry
+	DotName        [4]byte
+	DotDot         DirectoryEntry
+	DotDotName     [4]byte
+	ReservedZero   uint32
+	HashVersion    uint8
+	InfoLength     uint8
+	IndirectLevels uint8
+	UnusedFlags    uint8
+	Limit          uint16
+	Count          uint16
+	Block          uint32
+	//Entries        []DirectoryTreeEntry
+}
+
+// DirectoryTreeNode is the fixed part of an interior block of a hashed
+// directory tree. The DirectoryTreeEntry records follow it on disk.
+type DirectoryTreeNode struct {
+	FakeInode        uint32
+	FakeRecordLength uint16
+	NameLength       uint8
+	FileType         uint8
+	Limit            uint16
+	Count            uint16
+	Block            uint32
+	//Entries          []DirectoryTreeEntry
+}
+
+// DirectoryTreeEntry pairs a hash value with a block number.
+type DirectoryTreeEntry struct {
+	Hash  uint32
+	Block uint32
+}
+
+// DirectoryTreeTail holds the checksum that trails a directory tree block.
+type DirectoryTreeTail struct {
+	Reserved uint32
+	Checksum uint32
+}
+
+// XAttrInodeBodyHeader is the 4-byte magic header for extended attributes
+// stored in the inode body.
+type XAttrInodeBodyHeader struct {
+	Magic uint32
+}
+
+// XAttrHeader is the 32-byte header for extended attributes stored outside
+// the inode, judging by the Blocks and ReferenceCount fields.
+type XAttrHeader struct {
+	Magic          uint32
+	ReferenceCount uint32
+	Blocks         uint32
+	Hash           uint32
+	Checksum       uint32
+	Reserved       [3]uint32
+}
+
+// XAttrHeaderMagic is the value stored in the Magic field of the extended
+// attribute headers.
+const XAttrHeaderMagic uint32 = 0xea020000
+
+// XAttrEntry is the fixed 16-byte header of one extended attribute. The name
+// bytes follow the header on disk and are therefore not part of the struct.
+type XAttrEntry struct {
+	NameLength  uint8
+	NameIndex   uint8
+	ValueOffset uint16
+	ValueInum   uint32
+	ValueSize   uint32
+	Hash        uint32
+	//Name        []byte
+}

+ 174 - 0
vendor/github.com/Microsoft/hcsshim/ext4/tar2ext4/tar2ext4.go

@@ -0,0 +1,174 @@
+package tar2ext4
+
+import (
+	"archive/tar"
+	"bufio"
+	"encoding/binary"
+	"io"
+	"path"
+	"strings"
+
+	"github.com/Microsoft/hcsshim/ext4/internal/compactext4"
+)
+
+// params collects the settings chosen through the Option values passed to
+// Convert.
+type params struct {
+	convertWhiteout bool                 // rewrite ".wh." whiteouts as overlay-style ones
+	appendVhdFooter bool                 // append a fixed VHD footer after the image
+	ext4opts        []compactext4.Option // options forwarded to compactext4.NewWriter
+}
+
+// Option is the type for optional parameters to Convert. Convert applies each
+// Option, in order, before any conversion work starts.
+type Option func(*params)
+
+// ConvertWhiteout is an Option that instructs the converter to convert
+// OCI-style whiteouts (entries whose base name begins with ".wh.") to
+// overlay-style whiteouts.
+func ConvertWhiteout(p *params) {
+	p.convertWhiteout = true
+}
+
+// AppendVhdFooter is an Option that instructs the converter to add a fixed
+// VHD footer to the end of the file once the file system has been written.
+func AppendVhdFooter(p *params) {
+	p.appendVhdFooter = true
+}
+
+// InlineData is an Option that instructs the converter to write small files
+// into the inode structures directly. This creates smaller images but
+// currently is not compatible with DAX. It works by forwarding
+// compactext4.InlineData to the ext4 writer.
+func InlineData(p *params) {
+	p.ext4opts = append(p.ext4opts, compactext4.InlineData)
+}
+
+// MaximumDiskSize returns an Option that instructs the writer to limit the
+// disk size to the specified value, in bytes. This also reserves enough
+// metadata space for the specified disk size. If not provided, then 16GB is
+// the default. The value is forwarded to compactext4.MaximumDiskSize.
+func MaximumDiskSize(size int64) Option {
+	return func(p *params) {
+		p.ext4opts = append(p.ext4opts, compactext4.MaximumDiskSize(size))
+	}
+}
+
+const (
+	// whiteoutPrefix marks a tar entry whose base name denotes a whiteout;
+	// the remainder of the name is the path being whited out.
+	whiteoutPrefix = ".wh."
+	// opaqueWhiteout is the base name that marks its containing directory as
+	// opaque.
+	opaqueWhiteout = ".wh..wh..opq"
+)
+
+// Convert writes a compact ext4 file system image that contains the files in the
+// input tar stream.
+//
+// r supplies the tar stream and w receives the image. Each tar entry is turned
+// into a file system entry in stream order: hard links become links to the
+// already written target, and every other entry becomes a new file whose
+// contents are copied from the tar stream. With ConvertWhiteout, ".wh."
+// entries are rewritten as overlay-style whiteouts instead of being stored.
+// With AppendVhdFooter, a fixed VHD footer is written after the image.
+//
+// The first error from the tar reader, the ext4 writer or w is returned.
+func Convert(r io.Reader, w io.ReadWriteSeeker, options ...Option) error {
+	var p params
+	for _, opt := range options {
+		opt(&p)
+	}
+	t := tar.NewReader(bufio.NewReader(r))
+	fs := compactext4.NewWriter(w, p.ext4opts...)
+	for {
+		hdr, err := t.Next()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return err
+		}
+
+		if p.convertWhiteout {
+			dir, name := path.Split(hdr.Name)
+			if strings.HasPrefix(name, whiteoutPrefix) {
+				if name == opaqueWhiteout {
+					// Update the directory with the appropriate xattr.
+					f, err := fs.Stat(dir)
+					if err != nil {
+						return err
+					}
+					f.Xattrs["trusted.overlay.opaque"] = []byte("y")
+					err = fs.Create(dir, f)
+					if err != nil {
+						return err
+					}
+				} else {
+					// Create an overlay-style whiteout.
+					f := &compactext4.File{
+						Mode:     compactext4.S_IFCHR,
+						Devmajor: 0,
+						Devminor: 0,
+					}
+					// The whiteout takes the name of the entry it hides,
+					// i.e. the tar name with the ".wh." prefix removed.
+					err = fs.Create(path.Join(dir, name[len(whiteoutPrefix):]), f)
+					if err != nil {
+						return err
+					}
+				}
+
+				// Whiteout markers are never stored as files themselves.
+				continue
+			}
+		}
+
+		if hdr.Typeflag == tar.TypeLink {
+			// Hard link: reference the existing target instead of creating
+			// a new file.
+			err = fs.Link(hdr.Linkname, hdr.Name)
+			if err != nil {
+				return err
+			}
+		} else {
+			f := &compactext4.File{
+				Mode:     uint16(hdr.Mode),
+				Atime:    hdr.AccessTime,
+				Mtime:    hdr.ModTime,
+				Ctime:    hdr.ChangeTime,
+				Crtime:   hdr.ModTime,
+				Size:     hdr.Size,
+				Uid:      uint32(hdr.Uid),
+				Gid:      uint32(hdr.Gid),
+				Linkname: hdr.Linkname,
+				Devmajor: uint32(hdr.Devmajor),
+				Devminor: uint32(hdr.Devminor),
+				Xattrs:   make(map[string][]byte),
+			}
+			// Extended attributes travel in PAX records under this prefix.
+			for key, value := range hdr.PAXRecords {
+				const xattrPrefix = "SCHILY.xattr."
+				if strings.HasPrefix(key, xattrPrefix) {
+					f.Xattrs[key[len(xattrPrefix):]] = []byte(value)
+				}
+			}
+
+			// Map the tar type flag to mode type bits. Type flags not listed
+			// here leave typ at zero.
+			var typ uint16
+			switch hdr.Typeflag {
+			case tar.TypeReg, tar.TypeRegA:
+				typ = compactext4.S_IFREG
+			case tar.TypeSymlink:
+				typ = compactext4.S_IFLNK
+			case tar.TypeChar:
+				typ = compactext4.S_IFCHR
+			case tar.TypeBlock:
+				typ = compactext4.S_IFBLK
+			case tar.TypeDir:
+				typ = compactext4.S_IFDIR
+			case tar.TypeFifo:
+				typ = compactext4.S_IFIFO
+			}
+			// Replace whatever type bits the tar mode carried with typ.
+			f.Mode &= ^compactext4.TypeMask
+			f.Mode |= typ
+			err = fs.Create(hdr.Name, f)
+			if err != nil {
+				return err
+			}
+			// Stream the entry's contents into the file just created.
+			_, err = io.Copy(fs, t)
+			if err != nil {
+				return err
+			}
+		}
+	}
+	err := fs.Close()
+	if err != nil {
+		return err
+	}
+	if p.appendVhdFooter {
+		// The footer records the image size, taken as the offset of the end
+		// of w, and is written at that position.
+		size, err := w.Seek(0, io.SeekEnd)
+		if err != nil {
+			return err
+		}
+		err = binary.Write(w, binary.BigEndian, makeFixedVHDFooter(size))
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}

+ 76 - 0
vendor/github.com/Microsoft/hcsshim/ext4/tar2ext4/vhdfooter.go

@@ -0,0 +1,76 @@
+package tar2ext4
+
+import (
+	"bytes"
+	"crypto/rand"
+	"encoding/binary"
+)
+
+// Constants for the VHD footer. makeFixedVHDFooter stores each of them in the
+// footer field noted beside it.
+const (
+	cookieMagic            = "conectix" // Cookie
+	featureMask            = 0x2        // Features
+	fileFormatVersionMagic = 0x00010000 // FileFormatVersion
+	fixedDataOffset        = -1         // DataOffset; serialized as all one bits
+	creatorVersionMagic    = 0x000a0000 // CreatorVersion
+	diskTypeFixed          = 2          // DiskType
+)
+
+// vhdFooter is the VHD footer in field order; its fields total 512 bytes.
+// Convert serializes it big-endian, so field order and widths must not be
+// changed.
+type vhdFooter struct {
+	Cookie             [8]byte
+	Features           uint32
+	FileFormatVersion  uint32
+	DataOffset         int64
+	TimeStamp          uint32
+	CreatorApplication [4]byte
+	CreatorVersion     uint32
+	CreatorHostOS      [4]byte
+	OriginalSize       int64
+	CurrentSize        int64
+	DiskGeometry       uint32
+	DiskType           uint32
+	Checksum           uint32
+	UniqueID           [16]uint8
+	SavedState         uint8
+	Reserved           [427]uint8
+}
+
+func makeFixedVHDFooter(size int64) *vhdFooter {
+	footer := &vhdFooter{
+		Features:          featureMask,
+		FileFormatVersion: fileFormatVersionMagic,
+		DataOffset:        fixedDataOffset,
+		CreatorVersion:    creatorVersionMagic,
+		OriginalSize:      size,
+		CurrentSize:       size,
+		DiskType:          diskTypeFixed,
+		UniqueID:          generateUUID(),
+	}
+	copy(footer.Cookie[:], cookieMagic)
+	footer.Checksum = calculateCheckSum(footer)
+	return footer
+}
+
+// calculateCheckSum returns the footer checksum: the one's complement of the
+// sum of every byte of the big-endian serialized footer, computed with the
+// Checksum field treated as zero. The footer's own Checksum field is left
+// unchanged on return.
+func calculateCheckSum(footer *vhdFooter) uint32 {
+	saved := footer.Checksum
+	footer.Checksum = 0
+
+	var encoded bytes.Buffer
+	binary.Write(&encoded, binary.BigEndian, footer)
+	footer.Checksum = saved
+
+	var sum uint32
+	for _, octet := range encoded.Bytes() {
+		sum += uint32(octet)
+	}
+	return ^sum
+}
+
+func generateUUID() [16]byte {
+	res := [16]byte{}
+	if _, err := rand.Read(res[:]); err != nil {
+		panic(err)
+	}
+	return res
+}

+ 0 - 6
vendor/github.com/Microsoft/hcsshim/version.go

@@ -1,6 +0,0 @@
-package hcsshim
-
-// IsTP4 returns whether the currently running Windows build is at least TP4.
-func IsTP4() bool {
-	return false
-}