Просмотр исходного кода

Merge pull request #22126 from dmcgowan/overlay-native-diff

Overlay multiple lower directory support
Michael Crosby 9 лет назад
Родитель
Commit
8a2f9a249c

+ 264 - 0
daemon/graphdriver/graphtest/graphbench_unix.go

@@ -0,0 +1,264 @@
+// +build linux freebsd
+
+package graphtest
+
+import (
+	"bytes"
+	"io"
+	"io/ioutil"
+	"path/filepath"
+	"testing"
+
+	"github.com/docker/docker/pkg/stringid"
+)
+
+// DriverBenchExists benchmarks calls to the driver's Exists method.
+//
+// It obtains the driver through GetDriver (released again by the deferred
+// PutDriver), creates one parentless layer with a random ID and then times
+// b.N Exists lookups for that layer. The benchmark fails as soon as the
+// layer is reported missing.
+func DriverBenchExists(b *testing.B, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	// Keep driver setup and layer creation out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		if !driver.Exists(base) {
+			b.Fatal("Newly created image doesn't exist")
+		}
+	}
+}
+
+// DriverBenchGetEmpty benchmarks calls to Get on an empty layer.
+//
+// Only the Get call is timed: the timer is stopped while the result is
+// checked and the layer is released with Put, and restarted just before the
+// next iteration.
+func DriverBenchGetEmpty(b *testing.B, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	// Keep driver setup and layer creation out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_, err := driver.Get(base, "")
+		// Error handling and the matching Put are not part of the measurement.
+		b.StopTimer()
+		if err != nil {
+			b.Fatalf("Error getting mount: %s", err)
+		}
+		if err := driver.Put(base); err != nil {
+			b.Fatalf("Error putting mount: %s", err)
+		}
+		b.StartTimer()
+	}
+}
+
+// DriverBenchDiffBase benchmarks calls to Diff on a root (parentless) layer.
+//
+// The layer is populated once with addFiles (seed 3). Each timed iteration
+// requests the diff archive, drains it into ioutil.Discard and closes it.
+func DriverBenchDiffBase(b *testing.B, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addFiles(driver, base, 3); err != nil {
+		b.Fatal(err)
+	}
+
+	// Keep layer creation and population out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		arch, err := driver.Diff(base, "")
+		if err != nil {
+			b.Fatal(err)
+		}
+		// Read the whole archive so the cost of producing it is measured.
+		_, err = io.Copy(ioutil.Discard, arch)
+		if err != nil {
+			b.Fatalf("Error copying archive: %s", err)
+		}
+		arch.Close()
+	}
+}
+
+// DriverBenchDiffN benchmarks calls to Diff on the upper of two layers.
+//
+// bottom is the number of files written to the base layer (seed 3) and top
+// the number written to the layer created on top of it (seed 6), both via
+// addManyFiles. Each timed iteration requests the diff of the upper layer,
+// drains it into ioutil.Discard and closes it.
+func DriverBenchDiffN(b *testing.B, bottom, top int, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+	base := stringid.GenerateRandomID()
+	upper := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addManyFiles(driver, base, bottom, 3); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := driver.Create(upper, base, "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addManyFiles(driver, upper, top, 6); err != nil {
+		b.Fatal(err)
+	}
+	// Keep creation and population of both layers out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		arch, err := driver.Diff(upper, "")
+		if err != nil {
+			b.Fatal(err)
+		}
+		// Read the whole archive so the cost of producing it is measured.
+		_, err = io.Copy(ioutil.Discard, arch)
+		if err != nil {
+			b.Fatalf("Error copying archive: %s", err)
+		}
+		arch.Close()
+	}
+}
+
+// DriverBenchDiffApplyN benchmarks calls to Diff and ApplyDiff together.
+//
+// A base layer and an upper layer are each filled with fileCount files
+// (seeds 3 and 6). Every iteration creates a fresh layer on top of base and
+// checks that it shows the base content; that part is untimed. The timed
+// part takes the diff of the upper layer and applies it to the fresh layer.
+// Afterwards, again untimed, the fresh layer is checked for the upper
+// layer's content.
+func DriverBenchDiffApplyN(b *testing.B, fileCount int, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+	base := stringid.GenerateRandomID()
+	upper := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addManyFiles(driver, base, fileCount, 3); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := driver.Create(upper, base, "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addManyFiles(driver, upper, fileCount, 6); err != nil {
+		b.Fatal(err)
+	}
+	diffSize, err := driver.DiffSize(upper, "")
+	if err != nil {
+		b.Fatal(err)
+	}
+	// Discard setup time and start with the timer stopped; it only runs
+	// around the Diff/ApplyDiff pair below.
+	b.ResetTimer()
+	b.StopTimer()
+	for i := 0; i < b.N; i++ {
+		// Untimed: a fresh target layer for this iteration.
+		diff := stringid.GenerateRandomID()
+		if err := driver.Create(diff, base, "", nil); err != nil {
+			b.Fatal(err)
+		}
+
+		if err := checkManyFiles(driver, diff, fileCount, 3); err != nil {
+			b.Fatal(err)
+		}
+
+		b.StartTimer()
+
+		arch, err := driver.Diff(upper, "")
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		applyDiffSize, err := driver.ApplyDiff(diff, "", arch)
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		b.StopTimer()
+		arch.Close()
+
+		// The size comparison is intentionally not enforced yet, so this
+		// branch has no effect.
+		if applyDiffSize != diffSize {
+			// TODO: enforce this
+			//b.Fatalf("Apply diff size different, got %d, expected %d", applyDiffSize, diffSize)
+		}
+		if err := checkManyFiles(driver, diff, fileCount, 6); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// DriverBenchDeepLayerDiff benchmarks calls to Diff on top of a given number of layers.
+//
+// A base layer is populated with addFiles (seed 50) and layerCount layers are
+// stacked on top of it with addManyLayers. Each timed iteration requests the
+// diff of the topmost layer, drains it into ioutil.Discard and closes it.
+func DriverBenchDeepLayerDiff(b *testing.B, layerCount int, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addFiles(driver, base, 50); err != nil {
+		b.Fatal(err)
+	}
+
+	topLayer, err := addManyLayers(driver, base, layerCount)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	// Keep construction of the layer stack out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		arch, err := driver.Diff(topLayer, "")
+		if err != nil {
+			b.Fatal(err)
+		}
+		// Read the whole archive so the cost of producing it is measured.
+		_, err = io.Copy(ioutil.Discard, arch)
+		if err != nil {
+			b.Fatalf("Error copying archive: %s", err)
+		}
+		arch.Close()
+	}
+}
+
+// DriverBenchDeepLayerRead benchmarks reading a file that lives in the base
+// layer through a stack of layerCount layers.
+//
+// The base layer receives "testfile.txt", layerCount layers are stacked on
+// top with addManyLayers, and the topmost layer is obtained once with Get
+// (released by the deferred Put). Each timed iteration reads the file from
+// that root; the content comparison runs with the timer stopped.
+func DriverBenchDeepLayerRead(b *testing.B, layerCount int, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	content := []byte("test content")
+	if err := addFile(driver, base, "testfile.txt", content); err != nil {
+		b.Fatal(err)
+	}
+
+	topLayer, err := addManyLayers(driver, base, layerCount)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	root, err := driver.Get(topLayer, "")
+	if err != nil {
+		b.Fatal(err)
+	}
+	defer driver.Put(topLayer)
+
+	// Keep construction of the layer stack and the Get out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+
+		// Read content
+		c, err := ioutil.ReadFile(filepath.Join(root, "testfile.txt"))
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		// Verification is not part of the measurement.
+		b.StopTimer()
+		if !bytes.Equal(c, content) {
+			b.Fatalf("Wrong content in file %v, expected %v", c, content)
+		}
+		b.StartTimer()
+	}
+}

+ 134 - 153
daemon/graphdriver/graphtest/graphtest_unix.go

@@ -3,7 +3,7 @@
 package graphtest
 
 import (
-	"fmt"
+	"bytes"
 	"io/ioutil"
 	"math/rand"
 	"os"
@@ -14,6 +14,7 @@ import (
 	"unsafe"
 
 	"github.com/docker/docker/daemon/graphdriver"
+	"github.com/docker/docker/pkg/stringid"
 	"github.com/docker/go-units"
 )
 
@@ -30,47 +31,7 @@ type Driver struct {
 	refCount int
 }
 
-// InitLoopbacks ensures that the loopback devices are properly created within
-// the system running the device mapper tests.
-func InitLoopbacks() error {
-	statT, err := getBaseLoopStats()
-	if err != nil {
-		return err
-	}
-	// create at least 8 loopback files, ya, that is a good number
-	for i := 0; i < 8; i++ {
-		loopPath := fmt.Sprintf("/dev/loop%d", i)
-		// only create new loopback files if they don't exist
-		if _, err := os.Stat(loopPath); err != nil {
-			if mkerr := syscall.Mknod(loopPath,
-				uint32(statT.Mode|syscall.S_IFBLK), int((7<<8)|(i&0xff)|((i&0xfff00)<<12))); mkerr != nil {
-				return mkerr
-			}
-			os.Chown(loopPath, int(statT.Uid), int(statT.Gid))
-		}
-	}
-	return nil
-}
-
-// getBaseLoopStats inspects /dev/loop0 to collect uid,gid, and mode for the
-// loop0 device on the system.  If it does not exist we assume 0,0,0660 for the
-// stat data
-func getBaseLoopStats() (*syscall.Stat_t, error) {
-	loop0, err := os.Stat("/dev/loop0")
-	if err != nil {
-		if os.IsNotExist(err) {
-			return &syscall.Stat_t{
-				Uid:  0,
-				Gid:  0,
-				Mode: 0660,
-			}, nil
-		}
-		return nil, err
-	}
-	return loop0.Sys().(*syscall.Stat_t), nil
-}
-
-func newDriver(t *testing.T, name string) *Driver {
+func newDriver(t testing.TB, name string, options []string) *Driver {
 	root, err := ioutil.TempDir("", "docker-graphtest-")
 	if err != nil {
 		t.Fatal(err)
@@ -80,7 +41,7 @@ func newDriver(t *testing.T, name string) *Driver {
 		t.Fatal(err)
 	}
 
-	d, err := graphdriver.GetDriver(name, root, nil, nil, nil)
+	d, err := graphdriver.GetDriver(name, root, options, nil, nil)
 	if err != nil {
 		t.Logf("graphdriver: %v\n", err)
 		if err == graphdriver.ErrNotSupported || err == graphdriver.ErrPrerequisites || err == graphdriver.ErrIncompatibleFS {
@@ -91,7 +52,7 @@ func newDriver(t *testing.T, name string) *Driver {
 	return &Driver{d, root, 1}
 }
 
-func cleanup(t *testing.T, d *Driver) {
+func cleanup(t testing.TB, d *Driver) {
 	if err := drv.Cleanup(); err != nil {
 		t.Fatal(err)
 	}
@@ -99,9 +60,9 @@ func cleanup(t *testing.T, d *Driver) {
 }
 
 // GetDriver creates a new driver with the given name, or returns the existing driver and increments its reference count.
-func GetDriver(t *testing.T, name string) graphdriver.Driver {
+func GetDriver(t testing.TB, name string, options ...string) graphdriver.Driver {
 	if drv == nil {
-		drv = newDriver(t, name)
+		drv = newDriver(t, name, options)
 	} else {
 		drv.refCount++
 	}
@@ -109,7 +70,7 @@ func GetDriver(t *testing.T, name string) graphdriver.Driver {
 }
 
 // PutDriver removes the driver if it is no longer used and updates the reference count.
-func PutDriver(t *testing.T) {
+func PutDriver(t testing.TB) {
 	if drv == nil {
 		t.Skip("No driver to put!")
 	}
@@ -120,190 +81,210 @@ func PutDriver(t *testing.T) {
 	}
 }
 
-func verifyFile(t *testing.T, path string, mode os.FileMode, uid, gid uint32) {
-	fi, err := os.Stat(path)
-	if err != nil {
+// DriverTestCreateEmpty creates a new image and verifies that it is empty and has the right metadata
+func DriverTestCreateEmpty(t testing.TB, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
+	defer PutDriver(t)
+
+	if err := driver.Create("empty", "", "", nil); err != nil {
 		t.Fatal(err)
 	}
 
-	if fi.Mode()&os.ModeType != mode&os.ModeType {
-		t.Fatalf("Expected %s type 0x%x, got 0x%x", path, mode&os.ModeType, fi.Mode()&os.ModeType)
-	}
+	defer func() {
+		if err := driver.Remove("empty"); err != nil {
+			t.Fatal(err)
+		}
+	}()
 
-	if fi.Mode()&os.ModePerm != mode&os.ModePerm {
-		t.Fatalf("Expected %s mode %o, got %o", path, mode&os.ModePerm, fi.Mode()&os.ModePerm)
+	if !driver.Exists("empty") {
+		t.Fatal("Newly created image doesn't exist")
 	}
 
-	if fi.Mode()&os.ModeSticky != mode&os.ModeSticky {
-		t.Fatalf("Expected %s sticky 0x%x, got 0x%x", path, mode&os.ModeSticky, fi.Mode()&os.ModeSticky)
+	dir, err := driver.Get("empty", "")
+	if err != nil {
+		t.Fatal(err)
 	}
 
-	if fi.Mode()&os.ModeSetuid != mode&os.ModeSetuid {
-		t.Fatalf("Expected %s setuid 0x%x, got 0x%x", path, mode&os.ModeSetuid, fi.Mode()&os.ModeSetuid)
-	}
+	verifyFile(t, dir, 0755|os.ModeDir, 0, 0)
 
-	if fi.Mode()&os.ModeSetgid != mode&os.ModeSetgid {
-		t.Fatalf("Expected %s setgid 0x%x, got 0x%x", path, mode&os.ModeSetgid, fi.Mode()&os.ModeSetgid)
+	// Verify that the directory is empty
+	fis, err := readDir(dir)
+	if err != nil {
+		t.Fatal(err)
 	}
 
-	if stat, ok := fi.Sys().(*syscall.Stat_t); ok {
-		if stat.Uid != uid {
-			t.Fatalf("%s no owned by uid %d", path, uid)
-		}
-		if stat.Gid != gid {
-			t.Fatalf("%s not owned by gid %d", path, gid)
-		}
+	if len(fis) != 0 {
+		t.Fatal("New directory not empty")
 	}
 
+	driver.Put("empty")
 }
 
-// readDir reads a directory just like ioutil.ReadDir()
-// then hides specific files (currently "lost+found")
-// so the tests don't "see" it
-func readDir(dir string) ([]os.FileInfo, error) {
-	a, err := ioutil.ReadDir(dir)
-	if err != nil {
-		return nil, err
-	}
+// DriverTestCreateBase creates a base layer and verifies its contents.
+func DriverTestCreateBase(t testing.TB, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
+	defer PutDriver(t)
 
-	b := a[:0]
-	for _, x := range a {
-		if x.Name() != "lost+found" { // ext4 always have this dir
-			b = append(b, x)
+	createBase(t, driver, "Base")
+	defer func() {
+		if err := driver.Remove("Base"); err != nil {
+			t.Fatal(err)
 		}
-	}
-
-	return b, nil
+	}()
+	verifyBase(t, driver, "Base")
 }
 
-// DriverTestCreateEmpty creates a new image and verifies it is empty and the right metadata
-func DriverTestCreateEmpty(t *testing.T, drivername string) {
-	driver := GetDriver(t, drivername)
+// DriverTestCreateSnap creates a base layer and a snapshot layer on top of it, then verifies the snapshot.
+func DriverTestCreateSnap(t testing.TB, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
 	defer PutDriver(t)
 
-	if err := driver.Create("empty", "", "", nil); err != nil {
+	createBase(t, driver, "Base")
+
+	defer func() {
+		if err := driver.Remove("Base"); err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	if err := driver.Create("Snap", "Base", "", nil); err != nil {
 		t.Fatal(err)
 	}
 
 	defer func() {
-		if err := driver.Remove("empty"); err != nil {
+		if err := driver.Remove("Snap"); err != nil {
 			t.Fatal(err)
 		}
 	}()
 
-	if !driver.Exists("empty") {
-		t.Fatal("Newly created image doesn't exist")
-	}
+	verifyBase(t, driver, "Snap")
+}
 
-	dir, err := driver.Get("empty", "")
-	if err != nil {
+// DriverTestDeepLayerRead reads a file from a lower layer under a given number of layers
+func DriverTestDeepLayerRead(t testing.TB, layerCount int, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
+	defer PutDriver(t)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
 		t.Fatal(err)
 	}
 
-	verifyFile(t, dir, 0755|os.ModeDir, 0, 0)
+	content := []byte("test content")
+	if err := addFile(driver, base, "testfile.txt", content); err != nil {
+		t.Fatal(err)
+	}
 
-	// Verify that the directory is empty
-	fis, err := readDir(dir)
+	topLayer, err := addManyLayers(driver, base, layerCount)
 	if err != nil {
 		t.Fatal(err)
 	}
 
-	if len(fis) != 0 {
-		t.Fatal("New directory not empty")
+	err = checkManyLayers(driver, topLayer, layerCount)
+	if err != nil {
+		t.Fatal(err)
 	}
 
-	driver.Put("empty")
+	if err := checkFile(driver, topLayer, "testfile.txt", content); err != nil {
+		t.Fatal(err)
+	}
 }
 
-func createBase(t *testing.T, driver graphdriver.Driver, name string) {
-	// We need to be able to set any perms
-	oldmask := syscall.Umask(0)
-	defer syscall.Umask(oldmask)
+// DriverTestDiffApply tests diffing and applying produces the same layer
+func DriverTestDiffApply(t testing.TB, fileCount int, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
+	defer PutDriver(t)
+	base := stringid.GenerateRandomID()
+	upper := stringid.GenerateRandomID()
 
-	if err := driver.CreateReadWrite(name, "", "", nil); err != nil {
+	if err := driver.Create(base, "", "", nil); err != nil {
 		t.Fatal(err)
 	}
 
-	dir, err := driver.Get(name, "")
-	if err != nil {
+	if err := addManyFiles(driver, base, fileCount, 3); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := driver.Create(upper, base, "", nil); err != nil {
 		t.Fatal(err)
 	}
-	defer driver.Put(name)
 
-	subdir := path.Join(dir, "a subdir")
-	if err := os.Mkdir(subdir, 0705|os.ModeSticky); err != nil {
+	if err := addManyFiles(driver, upper, fileCount, 6); err != nil {
 		t.Fatal(err)
 	}
-	if err := os.Chown(subdir, 1, 2); err != nil {
+	diffSize, err := driver.DiffSize(upper, "")
+	if err != nil {
 		t.Fatal(err)
 	}
 
-	file := path.Join(dir, "a file")
-	if err := ioutil.WriteFile(file, []byte("Some data"), 0222|os.ModeSetuid); err != nil {
+	diff := stringid.GenerateRandomID()
+	if err := driver.Create(diff, base, "", nil); err != nil {
 		t.Fatal(err)
 	}
-}
 
-func verifyBase(t *testing.T, driver graphdriver.Driver, name string) {
-	dir, err := driver.Get(name, "")
-	if err != nil {
+	if err := checkManyFiles(driver, diff, fileCount, 3); err != nil {
 		t.Fatal(err)
 	}
-	defer driver.Put(name)
 
-	subdir := path.Join(dir, "a subdir")
-	verifyFile(t, subdir, 0705|os.ModeDir|os.ModeSticky, 1, 2)
+	arch, err := driver.Diff(upper, base)
+	if err != nil {
+		t.Fatal(err)
+	}
 
-	file := path.Join(dir, "a file")
-	verifyFile(t, file, 0222|os.ModeSetuid, 0, 0)
+	buf := bytes.NewBuffer(nil)
+	if _, err := buf.ReadFrom(arch); err != nil {
+		t.Fatal(err)
+	}
+	if err := arch.Close(); err != nil {
+		t.Fatal(err)
+	}
 
-	fis, err := readDir(dir)
+	applyDiffSize, err := driver.ApplyDiff(diff, base, bytes.NewReader(buf.Bytes()))
 	if err != nil {
 		t.Fatal(err)
 	}
 
-	if len(fis) != 2 {
-		t.Fatal("Unexpected files in base image")
+	if applyDiffSize != diffSize {
+		t.Fatalf("Apply diff size different, got %d, expected %d", applyDiffSize, diffSize)
+	}
+	if err := checkManyFiles(driver, diff, fileCount, 6); err != nil {
+		t.Fatal(err)
 	}
 }
 
-// DriverTestCreateBase create a base driver and verify.
-func DriverTestCreateBase(t *testing.T, drivername string) {
-	driver := GetDriver(t, drivername)
+// DriverTestChanges tests computed changes on a layer matches changes made
+func DriverTestChanges(t testing.TB, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
 	defer PutDriver(t)
+	base := stringid.GenerateRandomID()
+	upper := stringid.GenerateRandomID()
 
-	createBase(t, driver, "Base")
-	defer func() {
-		if err := driver.Remove("Base"); err != nil {
-			t.Fatal(err)
-		}
-	}()
-	verifyBase(t, driver, "Base")
-}
+	if err := driver.Create(base, "", "", nil); err != nil {
+		t.Fatal(err)
+	}
 
-// DriverTestCreateSnap Create a driver and snap and verify.
-func DriverTestCreateSnap(t *testing.T, drivername string) {
-	driver := GetDriver(t, drivername)
-	defer PutDriver(t)
+	if err := addManyFiles(driver, base, 20, 3); err != nil {
+		t.Fatal(err)
+	}
 
-	createBase(t, driver, "Base")
+	if err := driver.Create(upper, base, "", nil); err != nil {
+		t.Fatal(err)
+	}
 
-	defer func() {
-		if err := driver.Remove("Base"); err != nil {
-			t.Fatal(err)
-		}
-	}()
+	expectedChanges, err := changeManyFiles(driver, upper, 20, 6)
+	if err != nil {
+		t.Fatal(err)
+	}
 
-	if err := driver.Create("Snap", "Base", "", nil); err != nil {
+	changes, err := driver.Changes(upper, base)
+	if err != nil {
 		t.Fatal(err)
 	}
-	defer func() {
-		if err := driver.Remove("Snap"); err != nil {
-			t.Fatal(err)
-		}
-	}()
 
-	verifyBase(t, driver, "Snap")
+	if err = checkChanges(expectedChanges, changes); err != nil {
+		t.Fatal(err)
+	}
 }
 
 func writeRandomFile(path string, size uint64) error {

+ 301 - 0
daemon/graphdriver/graphtest/testutil.go

@@ -0,0 +1,301 @@
+package graphtest
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"path"
+	"sort"
+
+	"github.com/docker/docker/daemon/graphdriver"
+	"github.com/docker/docker/pkg/archive"
+	"github.com/docker/docker/pkg/stringid"
+)
+
+// randomContent returns size bytes of pseudo-random data that is fully
+// determined by seed. Every value drawn from the source contributes its seven
+// low-order bytes, least significant byte first.
+func randomContent(size int, seed int64) []byte {
+	src := rand.NewSource(seed)
+	buf := make([]byte, size)
+
+	for off := 0; off < size; off += 7 {
+		// The final chunk may be shorter than seven bytes.
+		end := off + 7
+		if end > size {
+			end = size
+		}
+		word := src.Int63()
+		for k := off; k < end; k++ {
+			buf[k] = byte(word)
+			word >>= 8
+		}
+	}
+
+	return buf
+}
+
+// addFiles populates the given layer with a small fixed tree: "file-a"
+// (64 bytes), "dir-b/file-b" (128 bytes) and "file-c" (128*128 bytes). The
+// contents come from randomContent with seed, seed+1 and seed+2 respectively.
+// The layer is obtained with Get and released with Put on return.
+func addFiles(drv graphdriver.Driver, layer string, seed int64) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	fileA := path.Join(root, "file-a")
+	if err = ioutil.WriteFile(fileA, randomContent(64, seed), 0755); err != nil {
+		return err
+	}
+
+	dirB := path.Join(root, "dir-b")
+	if err = os.MkdirAll(dirB, 0755); err != nil {
+		return err
+	}
+	fileB := path.Join(dirB, "file-b")
+	if err = ioutil.WriteFile(fileB, randomContent(128, seed+1), 0755); err != nil {
+		return err
+	}
+
+	fileC := path.Join(root, "file-c")
+	return ioutil.WriteFile(fileC, randomContent(128*128, seed+2), 0755)
+}
+
+// checkFile verifies that filename inside the given layer holds exactly
+// content. The layer is obtained with Get and released with Put on return.
+// It returns the Get or read error, or a descriptive error when the bytes
+// differ.
+func checkFile(drv graphdriver.Driver, layer, filename string, content []byte) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	fileContent, err := ioutil.ReadFile(path.Join(root, filename))
+	if err != nil {
+		return err
+	}
+
+	if !bytes.Equal(fileContent, content) {
+		return fmt.Errorf("mismatched file content %v, expecting %v", fileContent, content)
+	}
+
+	return nil
+}
+
+// addFile writes content to filename (mode 0755) inside the given layer. The
+// layer is obtained with Get and released with Put on return.
+func addFile(drv graphdriver.Driver, layer, filename string, content []byte) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	target := path.Join(root, filename)
+	return ioutil.WriteFile(target, content, 0755)
+}
+
+// addManyFiles writes count 64-byte files into the given layer, grouped into
+// directories of at most 100 files. File n is stored as
+// "directory-<first index of its group>/file-<n>" and filled with
+// randomContent(64, seed+n).
+func addManyFiles(drv graphdriver.Driver, layer string, count int, seed int64) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	for start := 0; start < count; start += 100 {
+		dir := path.Join(root, fmt.Sprintf("directory-%d", start))
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			return err
+		}
+
+		// The last group may hold fewer than 100 files.
+		end := start + 100
+		if end > count {
+			end = count
+		}
+		for n := start; n < end; n++ {
+			file := path.Join(dir, fmt.Sprintf("file-%d", n))
+			if err := ioutil.WriteFile(file, randomContent(64, seed+int64(n)), 0755); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+// changeManyFiles mutates files in the given layer using the directory layout
+// of addManyFiles and returns the list of changes it made.
+//
+// Each group directory is recorded as modified. Within a group, files cycle
+// through three operations by position: rewrite "file-<n>" (modify), create
+// "file-<seed>-<n>" (add), and remove "file-<n>" (delete). New content comes
+// from randomContent(64, seed+n).
+func changeManyFiles(drv graphdriver.Driver, layer string, count int, seed int64) ([]archive.Change, error) {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return nil, err
+	}
+	defer drv.Put(layer)
+
+	changes := []archive.Change{}
+	for start := 0; start < count; start += 100 {
+		archiveRoot := fmt.Sprintf("/directory-%d", start)
+		if err := os.MkdirAll(path.Join(root, archiveRoot), 0755); err != nil {
+			return nil, err
+		}
+
+		// Every group holds at least one file, so its directory is always
+		// reported as modified.
+		changes = append(changes, archive.Change{
+			Path: archiveRoot,
+			Kind: archive.ChangeModify,
+		})
+
+		for j := 0; start+j < count && j < 100; j++ {
+			n := start + j
+			var (
+				change archive.Change
+				opErr  error
+			)
+			switch j % 3 {
+			case 0: // Update file
+				change.Path = path.Join(archiveRoot, fmt.Sprintf("file-%d", n))
+				change.Kind = archive.ChangeModify
+				opErr = ioutil.WriteFile(path.Join(root, change.Path), randomContent(64, seed+int64(n)), 0755)
+			case 1: // Add file
+				change.Path = path.Join(archiveRoot, fmt.Sprintf("file-%d-%d", seed, n))
+				change.Kind = archive.ChangeAdd
+				opErr = ioutil.WriteFile(path.Join(root, change.Path), randomContent(64, seed+int64(n)), 0755)
+			case 2: // Remove file
+				change.Path = path.Join(archiveRoot, fmt.Sprintf("file-%d", n))
+				change.Kind = archive.ChangeDelete
+				opErr = os.Remove(path.Join(root, change.Path))
+			}
+			if opErr != nil {
+				return nil, opErr
+			}
+			changes = append(changes, change)
+		}
+	}
+
+	return changes, nil
+}
+
+// checkManyFiles verifies that the given layer contains the count files that
+// addManyFiles writes for the same seed: file n must exist under its group
+// directory and hold randomContent(64, seed+n). It returns the first read
+// error or content mismatch.
+func checkManyFiles(drv graphdriver.Driver, layer string, count int, seed int64) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	for i := 0; i < count; i += 100 {
+		dir := path.Join(root, fmt.Sprintf("directory-%d", i))
+		for j := 0; i+j < count && j < 100; j++ {
+			file := path.Join(dir, fmt.Sprintf("file-%d", i+j))
+			fileContent, err := ioutil.ReadFile(file)
+			if err != nil {
+				return err
+			}
+
+			content := randomContent(64, seed+int64(i+j))
+
+			if !bytes.Equal(fileContent, content) {
+				return fmt.Errorf("mismatched file content %v, expecting %v", fileContent, content)
+			}
+		}
+	}
+
+	return nil
+}
+
+// changeList implements sort.Interface for a slice of archive.Change,
+// ordering by Path and breaking ties by Kind.
+type changeList []archive.Change
+
+// Len reports the number of changes.
+func (c changeList) Len() int { return len(c) }
+
+// Swap exchanges the changes at positions i and j.
+func (c changeList) Swap(i, j int) { c[i], c[j] = c[j], c[i] }
+
+// Less reports whether the change at i sorts before the change at j.
+func (c changeList) Less(i, j int) bool {
+	if c[i].Path != c[j].Path {
+		return c[i].Path < c[j].Path
+	}
+	return c[i].Kind < c[j].Kind
+}
+
+// checkChanges reports an error unless expected and actual contain the same
+// changes, ignoring order. When the lengths match, both slices are sorted in
+// place before being compared element by element.
+func checkChanges(expected, actual []archive.Change) error {
+	if want, got := len(expected), len(actual); want != got {
+		return fmt.Errorf("unexpected number of changes, expected %d, got %d", want, got)
+	}
+
+	sort.Sort(changeList(expected))
+	sort.Sort(changeList(actual))
+
+	for idx, want := range expected {
+		if got := actual[idx]; want != got {
+			return fmt.Errorf("unexpected change, expecting %v, got %v", want, got)
+		}
+	}
+
+	return nil
+}
+
+// addLayerFiles writes marker files into the given layer: "top-id" holding
+// the layer's own ID, and a "layer-<i>" directory holding "layer-id" (the
+// layer's ID) and "parent-id" (the parent's ID).
+func addLayerFiles(drv graphdriver.Driver, layer, parent string, i int) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	id := []byte(layer)
+	if err = ioutil.WriteFile(path.Join(root, "top-id"), id, 0755); err != nil {
+		return err
+	}
+
+	layerDir := path.Join(root, fmt.Sprintf("layer-%d", i))
+	if err = os.MkdirAll(layerDir, 0755); err != nil {
+		return err
+	}
+	if err = ioutil.WriteFile(path.Join(layerDir, "layer-id"), id, 0755); err != nil {
+		return err
+	}
+
+	return ioutil.WriteFile(path.Join(layerDir, "parent-id"), []byte(parent), 0755)
+}
+
+// addManyLayers stacks count new layers on top of baseLayer. Each new layer
+// gets a random ID, is created with the previous layer as its parent and is
+// tagged by addLayerFiles with its 1-based depth. It returns the ID of the
+// topmost layer, or baseLayer itself when count is not positive.
+func addManyLayers(drv graphdriver.Driver, baseLayer string, count int) (string, error) {
+	current := baseLayer
+	for depth := 1; depth <= count; depth++ {
+		child := stringid.GenerateRandomID()
+
+		err := drv.Create(child, current, "", nil)
+		if err == nil {
+			err = addLayerFiles(drv, child, current, depth)
+		}
+		if err != nil {
+			return "", err
+		}
+
+		current = child
+	}
+	return current, nil
+}
+
+// checkManyLayers verifies the marker files written by addManyLayers as seen
+// from the given (topmost) layer.
+//
+// "top-id" must hold the layer's own ID. Then, walking from "layer-<count>"
+// down to "layer-1", each "layer-id" must equal the ID expected at that
+// depth, and the matching "parent-id" supplies the ID expected one level
+// below.
+func checkManyLayers(drv graphdriver.Driver, layer string, count int) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	layerIDBytes, err := ioutil.ReadFile(path.Join(root, "top-id"))
+	if err != nil {
+		return err
+	}
+
+	if !bytes.Equal(layerIDBytes, []byte(layer)) {
+		return fmt.Errorf("mismatched file content %v, expecting %v", layerIDBytes, []byte(layer))
+	}
+
+	for i := count; i > 0; i-- {
+		layerDir := path.Join(root, fmt.Sprintf("layer-%d", i))
+
+		thisLayerIDBytes, err := ioutil.ReadFile(path.Join(layerDir, "layer-id"))
+		if err != nil {
+			return err
+		}
+		if !bytes.Equal(thisLayerIDBytes, layerIDBytes) {
+			return fmt.Errorf("mismatched file content %v, expecting %v", thisLayerIDBytes, layerIDBytes)
+		}
+		// The recorded parent becomes the ID expected for the next layer down.
+		layerIDBytes, err = ioutil.ReadFile(path.Join(layerDir, "parent-id"))
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// readDir lists a directory like ioutil.ReadDir but leaves out entries the
+// tests should not see. Currently that is only "lost+found", which ext4
+// file systems always contain.
+func readDir(dir string) ([]os.FileInfo, error) {
+	entries, err := ioutil.ReadDir(dir)
+	if err != nil {
+		return nil, err
+	}
+
+	// Filter in place, reusing the backing array of entries.
+	visible := entries[:0]
+	for _, entry := range entries {
+		if entry.Name() == "lost+found" {
+			continue
+		}
+		visible = append(visible, entry)
+	}
+
+	return visible, nil
+}

+ 143 - 0
daemon/graphdriver/graphtest/testutil_unix.go

@@ -0,0 +1,143 @@
+// +build linux freebsd
+
+package graphtest
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path"
+	"syscall"
+	"testing"
+
+	"github.com/docker/docker/daemon/graphdriver"
+)
+
+// InitLoopbacks makes sure the loopback device nodes /dev/loop0 through
+// /dev/loop7 exist on the system running the device mapper tests.
+//
+// Nodes that cannot be stat'ed are created with Mknod as block devices, using
+// the mode reported by getBaseLoopStats, and are then chowned to the reported
+// uid/gid. The result of that Chown is not checked.
+func InitLoopbacks() error {
+	statT, err := getBaseLoopStats()
+	if err != nil {
+		return err
+	}
+	mode := uint32(statT.Mode | syscall.S_IFBLK)
+
+	// create at least 8 loopback files, ya, that is a good number
+	for i := 0; i < 8; i++ {
+		loopPath := fmt.Sprintf("/dev/loop%d", i)
+		if _, statErr := os.Stat(loopPath); statErr == nil {
+			// only create new loopback files if they don't exist
+			continue
+		}
+
+		// NOTE(review): looks like a device number with major 7 and minor i — confirm.
+		dev := int((7 << 8) | (i & 0xff) | ((i & 0xfff00) << 12))
+		if mkerr := syscall.Mknod(loopPath, mode, dev); mkerr != nil {
+			return mkerr
+		}
+		os.Chown(loopPath, int(statT.Uid), int(statT.Gid))
+	}
+	return nil
+}
+
+// getBaseLoopStats returns the stat data of /dev/loop0 so that new loop
+// devices can copy its uid, gid and mode. When /dev/loop0 does not exist it
+// falls back to uid 0, gid 0 and mode 0660; any other stat error is returned.
+func getBaseLoopStats() (*syscall.Stat_t, error) {
+	loop0, err := os.Stat("/dev/loop0")
+	if err == nil {
+		return loop0.Sys().(*syscall.Stat_t), nil
+	}
+	if !os.IsNotExist(err) {
+		return nil, err
+	}
+
+	fallback := &syscall.Stat_t{
+		Uid:  0,
+		Gid:  0,
+		Mode: 0660,
+	}
+	return fallback, nil
+}
+
+// verifyFile fails the test unless path exists with the expected file type,
+// permission bits, sticky/setuid/setgid bits and owner.
+//
+// mode supplies the expected type and permission bits; uid and gid are the
+// expected owner. The ownership check only runs when the file info exposes a
+// *syscall.Stat_t.
+func verifyFile(t testing.TB, path string, mode os.FileMode, uid, gid uint32) {
+	fi, err := os.Stat(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if fi.Mode()&os.ModeType != mode&os.ModeType {
+		t.Fatalf("Expected %s type 0x%x, got 0x%x", path, mode&os.ModeType, fi.Mode()&os.ModeType)
+	}
+
+	if fi.Mode()&os.ModePerm != mode&os.ModePerm {
+		t.Fatalf("Expected %s mode %o, got %o", path, mode&os.ModePerm, fi.Mode()&os.ModePerm)
+	}
+
+	if fi.Mode()&os.ModeSticky != mode&os.ModeSticky {
+		t.Fatalf("Expected %s sticky 0x%x, got 0x%x", path, mode&os.ModeSticky, fi.Mode()&os.ModeSticky)
+	}
+
+	if fi.Mode()&os.ModeSetuid != mode&os.ModeSetuid {
+		t.Fatalf("Expected %s setuid 0x%x, got 0x%x", path, mode&os.ModeSetuid, fi.Mode()&os.ModeSetuid)
+	}
+
+	if fi.Mode()&os.ModeSetgid != mode&os.ModeSetgid {
+		t.Fatalf("Expected %s setgid 0x%x, got 0x%x", path, mode&os.ModeSetgid, fi.Mode()&os.ModeSetgid)
+	}
+
+	if stat, ok := fi.Sys().(*syscall.Stat_t); ok {
+		if stat.Uid != uid {
+			t.Fatalf("%s not owned by uid %d", path, uid)
+		}
+		if stat.Gid != gid {
+			t.Fatalf("%s not owned by gid %d", path, gid)
+		}
+	}
+}
+
+// createBase creates a read-write layer called name and fills it with the
+// fixture that verifyBase checks: a directory "a subdir" (mode 0705 plus
+// sticky, chowned to uid 1 / gid 2) and a file "a file" (mode 0222 plus
+// setuid) containing "Some data". Any failure aborts the test.
+func createBase(t testing.TB, driver graphdriver.Driver, name string) {
+	// We need to be able to set any perms
+	// The previous umask is restored when the function returns.
+	oldmask := syscall.Umask(0)
+	defer syscall.Umask(oldmask)
+
+	if err := driver.CreateReadWrite(name, "", "", nil); err != nil {
+		t.Fatal(err)
+	}
+
+	dir, err := driver.Get(name, "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer driver.Put(name)
+
+	subdir := path.Join(dir, "a subdir")
+	if err := os.Mkdir(subdir, 0705|os.ModeSticky); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.Chown(subdir, 1, 2); err != nil {
+		t.Fatal(err)
+	}
+
+	file := path.Join(dir, "a file")
+	if err := ioutil.WriteFile(file, []byte("Some data"), 0222|os.ModeSetuid); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// verifyBase checks that the layer called name holds exactly the fixture
+// written by createBase: "a subdir" (directory, mode 0705 plus sticky,
+// uid 1 / gid 2) and "a file" (mode 0222 plus setuid, uid 0 / gid 0), and
+// nothing else visible through readDir.
+func verifyBase(t testing.TB, driver graphdriver.Driver, name string) {
+	dir, err := driver.Get(name, "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer driver.Put(name)
+
+	verifyFile(t, path.Join(dir, "a subdir"), 0705|os.ModeDir|os.ModeSticky, 1, 2)
+	verifyFile(t, path.Join(dir, "a file"), 0222|os.ModeSetuid, 0, 0)
+
+	entries, err := readDir(dir)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(entries) != 2 {
+		t.Fatal("Unexpected files in base image")
+	}
+}

+ 1 - 2
daemon/graphdriver/overlay/overlay.go

@@ -15,7 +15,6 @@ import (
 
 	"github.com/docker/docker/daemon/graphdriver"
 	"github.com/docker/docker/pkg/archive"
-	"github.com/docker/docker/pkg/chrootarchive"
 	"github.com/docker/docker/pkg/idtools"
 
 	"github.com/docker/docker/pkg/mount"
@@ -426,7 +425,7 @@ func (d *Driver) ApplyDiff(id string, parent string, diff archive.Reader) (size
 	}
 
 	options := &archive.TarOptions{UIDMaps: d.uidMaps, GIDMaps: d.gidMaps}
-	if size, err = chrootarchive.ApplyUncompressedLayer(tmpRootDir, diff, options); err != nil {
+	if size, err = graphdriver.ApplyUncompressedLayer(tmpRootDir, diff, options); err != nil {
 		return 0, err
 	}
 

+ 58 - 0
daemon/graphdriver/overlay/overlay_test.go

@@ -5,9 +5,17 @@ package overlay
 import (
 	"testing"
 
+	"github.com/docker/docker/daemon/graphdriver"
 	"github.com/docker/docker/daemon/graphdriver/graphtest"
+	"github.com/docker/docker/pkg/archive"
 )
 
+// init replaces graphdriver.ApplyUncompressedLayer with the plain archive
+// implementation for the tests in this package.
+func init() {
+	// Do not use chroot, to speed up run time and to allow archive
+	// errors or hangs to be debugged directly from the test process.
+	graphdriver.ApplyUncompressedLayer = archive.ApplyUncompressedLayer
+}
+
 // This avoids creating a new driver for each test if all tests are run
 // Make sure to put new tests between TestOverlaySetup and TestOverlayTeardown
 func TestOverlaySetup(t *testing.T) {
@@ -26,6 +34,56 @@ func TestOverlayCreateSnap(t *testing.T) {
 	graphtest.DriverTestCreateSnap(t, "overlay")
 }
 
+// TestOverlay50LayerRead runs the shared deep-layer read test with a depth
+// argument of 50 against the "overlay" driver.
+func TestOverlay50LayerRead(t *testing.T) {
+	graphtest.DriverTestDeepLayerRead(t, 50, "overlay")
+}
+
+// TestOverlayDiffApply10Files runs the shared diff/apply test with a file
+// count argument of 10 against the "overlay" driver.
+func TestOverlayDiffApply10Files(t *testing.T) {
+	graphtest.DriverTestDiffApply(t, 10, "overlay")
+}
+
+// TestOverlayChanges runs the shared Changes test against the "overlay" driver.
+func TestOverlayChanges(t *testing.T) {
+	graphtest.DriverTestChanges(t, "overlay")
+}
+
+// TestOverlayTeardown releases the driver shared by the tests above by
+// calling graphtest.PutDriver; it must stay the last of the shared-driver tests.
 func TestOverlayTeardown(t *testing.T) {
 	graphtest.PutDriver(t)
 }
+
+// Benchmarks should always setup new driver
+
+// BenchmarkExists runs the shared Exists benchmark against the "overlay" driver.
+func BenchmarkExists(b *testing.B) {
+	graphtest.DriverBenchExists(b, "overlay")
+}
+
+// BenchmarkGetEmpty runs the shared Get-on-empty-layer benchmark against the
+// "overlay" driver.
+func BenchmarkGetEmpty(b *testing.B) {
+	graphtest.DriverBenchGetEmpty(b, "overlay")
+}
+
+// BenchmarkDiffBase runs the shared root-layer Diff benchmark against the
+// "overlay" driver.
+func BenchmarkDiffBase(b *testing.B) {
+	graphtest.DriverBenchDiffBase(b, "overlay")
+}
+
+// BenchmarkDiffSmallUpper runs DriverBenchDiffN with size arguments 10 and 10
+// against the "overlay" driver.
+// NOTE(review): the two numbers are presumably the bottom- and upper-layer
+// file counts; confirm against graphtest.DriverBenchDiffN.
+func BenchmarkDiffSmallUpper(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10, 10, "overlay")
+}
+
+// BenchmarkDiff10KFileUpper runs DriverBenchDiffN with size arguments 10 and
+// 10000 against the "overlay" driver.
+// NOTE(review): the two numbers are presumably the bottom- and upper-layer
+// file counts; confirm against graphtest.DriverBenchDiffN.
+func BenchmarkDiff10KFileUpper(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10, 10000, "overlay")
+}
+
+// BenchmarkDiff10KFilesBottom runs DriverBenchDiffN with size arguments 10000
+// and 10 against the "overlay" driver.
+// NOTE(review): the two numbers are presumably the bottom- and upper-layer
+// file counts; confirm against graphtest.DriverBenchDiffN.
+func BenchmarkDiff10KFilesBottom(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10000, 10, "overlay")
+}
+
+// BenchmarkDiffApply100 runs the shared diff/apply benchmark with a count
+// argument of 100 against the "overlay" driver.
+func BenchmarkDiffApply100(b *testing.B) {
+	graphtest.DriverBenchDiffApplyN(b, 100, "overlay")
+}
+
+// BenchmarkDiff20Layers runs the shared deep-layer Diff benchmark with a
+// depth argument of 20 against the "overlay" driver.
+func BenchmarkDiff20Layers(b *testing.B) {
+	graphtest.DriverBenchDeepLayerDiff(b, 20, "overlay")
+}
+
+// BenchmarkRead20Layers runs the shared deep-layer read benchmark with a
+// depth argument of 20 against the "overlay" driver.
+func BenchmarkRead20Layers(b *testing.B) {
+	graphtest.DriverBenchDeepLayerRead(b, 20, "overlay")
+}

+ 91 - 0
daemon/graphdriver/overlay2/mount.go

@@ -0,0 +1,91 @@
+// +build linux
+
+package overlay2
+
+import (
+	"bytes"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"os"
+	"runtime"
+	"syscall"
+
+	"github.com/docker/docker/pkg/reexec"
+)
+
+// init registers mountFromMain as the handler for the "docker-mountfrom"
+// re-exec command.
+func init() {
+	reexec.Register("docker-mountfrom", mountFromMain)
+}
+
+// fatal prints err to stderr and terminates the process with exit status 1.
+func fatal(err error) {
+	fmt.Fprint(os.Stderr, err)
+	os.Exit(1)
+}
+
+// mountOptions carries the arguments of the mount call from mountFrom to the
+// re-executed child, which reads it JSON-encoded from stdin.
+type mountOptions struct {
+	Device string // source argument of the mount call
+	Target string // mount point; used as given after the child changes directory
+	Type   string // filesystem type
+	Label  string // passed as the data argument of the mount call
+	Flag   uint32 // mount flags
+}
+
+// mountFrom performs a mount from within dir by re-executing the current
+// binary as "docker-mountfrom". The child changes into dir before calling
+// mount, so device and target may be given relative to dir.
+//
+// The mount arguments are sent to the child as JSON over a pipe connected to
+// its stdin. The child's stdout and stderr are captured and included in the
+// returned error when the child exits unsuccessfully.
+func mountFrom(dir, device, target, mType, label string) error {
+
+	r, w, err := os.Pipe()
+	if err != nil {
+		return fmt.Errorf("mountfrom pipe failure: %v", err)
+	}
+
+	options := &mountOptions{
+		Device: device,
+		Target: target,
+		Type:   mType,
+		Flag:   0,
+		Label:  label,
+	}
+
+	cmd := reexec.Command("docker-mountfrom", dir)
+	cmd.Stdin = r
+
+	output := bytes.NewBuffer(nil)
+	cmd.Stdout = output
+	cmd.Stderr = output
+
+	if err := cmd.Start(); err != nil {
+		// No child was created, so nobody else owns the pipe.
+		r.Close()
+		w.Close()
+		return fmt.Errorf("mountfrom error on re-exec cmd: %v", err)
+	}
+	// The child has its own copy of the read end; release ours so the
+	// descriptor is not leaked in the daemon.
+	r.Close()
+
+	//write the options to the pipe for the mount exec to read
+	encErr := json.NewEncoder(w).Encode(options)
+	// Always close the write end: the child only sees EOF on stdin once
+	// every writer is gone.
+	w.Close()
+	if encErr != nil {
+		// Reap the child so it is not left behind as a zombie; its exit
+		// status is irrelevant because the encode failure is reported.
+		cmd.Wait()
+		return fmt.Errorf("mountfrom json encode to pipe failed: %v", encErr)
+	}
+
+	if err := cmd.Wait(); err != nil {
+		return fmt.Errorf("mountfrom re-exec error: %v: output: %s", err, output)
+	}
+	return nil
+}
+
+// mountFromMain is the entry-point for docker-mountfrom on re-exec.
+//
+// It reads a JSON-encoded mountOptions from stdin, changes into the directory
+// given as the first command-line argument and performs the mount. On any
+// failure the error is written to stderr and the process exits with status 1;
+// on success it exits with status 0.
+func mountFromMain() {
+	runtime.LockOSThread()
+	flag.Parse()
+
+	// Decode into a value rather than a nil pointer: a JSON "null" on stdin
+	// would otherwise leave the pointer nil and crash on the field accesses
+	// below instead of failing cleanly in the mount call.
+	var options mountOptions
+
+	if err := json.NewDecoder(os.Stdin).Decode(&options); err != nil {
+		fatal(err)
+	}
+
+	if err := os.Chdir(flag.Arg(0)); err != nil {
+		fatal(err)
+	}
+
+	if err := syscall.Mount(options.Device, options.Target, options.Type, uintptr(options.Flag), options.Label); err != nil {
+		fatal(err)
+	}
+
+	os.Exit(0)
+}

+ 476 - 0
daemon/graphdriver/overlay2/overlay.go

@@ -0,0 +1,476 @@
+// +build linux
+
+package overlay2
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path"
+	"strings"
+	"syscall"
+
+	"github.com/Sirupsen/logrus"
+
+	"github.com/docker/docker/daemon/graphdriver"
+	"github.com/docker/docker/pkg/archive"
+	"github.com/docker/docker/pkg/chrootarchive"
+	"github.com/docker/docker/pkg/directory"
+	"github.com/docker/docker/pkg/idtools"
+	"github.com/docker/docker/pkg/mount"
+	"github.com/docker/docker/pkg/parsers/kernel"
+
+	"github.com/opencontainers/runc/libcontainer/label"
+)
+
+var (
+	// untar defines the untar method
+	untar = chrootarchive.UntarUncompressed
+)
+
+// This backend uses the overlay union filesystem for containers
+// with diff directories for each layer.
+
+// This version of the overlay driver requires at least kernel
+// 4.0.0 in order to support mounting multiple diff directories.
+
+// Each container/image has at least a "diff" directory and "link" file.
+// There is also a "lower" file when there are diff layers
+// below, as well as "merged" and "work" directories. The "diff" directory
+// has the upper layer of the overlay and is used to capture any
+// changes to the layer. The "lower" file contains all the lower layer
+// mounts separated by ":" and ordered from uppermost to lowermost
+// layers. The overlay itself is mounted in the "merged" directory,
+// and the "work" dir is needed for overlay to work.
+
+// The "link" file for each layer contains a unique string for the layer.
+// Under the "l" directory at the root there will be a symbolic link
+// with that unique string pointing to the "diff" directory for the layer.
+// The symbolic links are used to reference lower layers in the "lower"
+// file and on mount. The links are used to shorten the total length
+// of a layer reference without requiring changes to the layer identifier
+// or root directory. Mounts are always done relative to root and
+// referencing the symbolic links in order to ensure the number of
+// lower directories can fit in a single page for making the mount
+// syscall. A hard upper limit of 128 lower layers is enforced to ensure
+// that mounts do not fail due to length.
+
+const (
+	driverName = "overlay2"
+	linkDir    = "l"
+	lowerFile  = "lower"
+	maxDepth   = 128
+
+	// idLength represents the number of random characters
+	// which can be used to create the unique link identifier
+	// for every layer. If this value is too long then the
+	// page size limit for the mount command may be exceeded.
+	// The idLength should be selected such that following equation
+	// is true (512 is a buffer for label metadata).
+	// ((idLength + len(linkDir) + 1) * maxDepth) <= (pageSize - 512)
+	idLength = 26
+)
+
+// Driver contains information about the home directory and the list of active mounts that are created using this driver.
+type Driver struct {
+	home    string                  // root directory holding one directory per layer plus the link directory
+	uidMaps []idtools.IDMap         // UID mappings used for directory ownership and archive (un)packing
+	gidMaps []idtools.IDMap         // GID mappings used for directory ownership and archive (un)packing
+	ctr     *graphdriver.RefCounter // reference counts keyed by each layer's "merged" mount path
+}
+
+var backingFs = "<unknown>"
+
+// init registers this driver's Init function under driverName.
+func init() {
+	graphdriver.Register(driverName, Init)
+}
+
+// Init returns a native diff driver for the overlay filesystem.
+// If the overlay filesystem is not supported on the host, or the kernel is
+// older than 4.0.0, graphdriver.ErrNotSupported is returned as error.
+// If the overlay filesystem is not supported over the existing filesystem of
+// home, graphdriver.ErrIncompatibleFS is returned.
+func Init(home string, options []string, uidMaps, gidMaps []idtools.IDMap) (graphdriver.Driver, error) {
+
+	if err := supportsOverlay(); err != nil {
+		return nil, graphdriver.ErrNotSupported
+	}
+
+	// require kernel 4.0.0 to ensure multiple lower dirs are supported
+	v, err := kernel.GetKernelVersion()
+	if err != nil {
+		return nil, err
+	}
+	if kernel.CompareKernelVersion(*v, kernel.VersionInfo{Kernel: 4, Major: 0, Minor: 0}) < 0 {
+		return nil, graphdriver.ErrNotSupported
+	}
+
+	fsMagic, err := graphdriver.GetFSMagic(home)
+	if err != nil {
+		return nil, err
+	}
+	// Remember the backing filesystem name for Status, when it is known.
+	if fsName, ok := graphdriver.FsNames[fsMagic]; ok {
+		backingFs = fsName
+	}
+
+	// check if they are running over btrfs, aufs, zfs or overlay
+	switch fsMagic {
+	case graphdriver.FsMagicBtrfs:
+		logrus.Error("'overlay' is not supported over btrfs.")
+		return nil, graphdriver.ErrIncompatibleFS
+	case graphdriver.FsMagicAufs:
+		logrus.Error("'overlay' is not supported over aufs.")
+		return nil, graphdriver.ErrIncompatibleFS
+	case graphdriver.FsMagicZfs:
+		logrus.Error("'overlay' is not supported over zfs.")
+		return nil, graphdriver.ErrIncompatibleFS
+	case graphdriver.FsMagicOverlay:
+		logrus.Error("'overlay' is not supported over overlay.")
+		return nil, graphdriver.ErrIncompatibleFS
+	}
+
+	rootUID, rootGID, err := idtools.GetRootUIDGID(uidMaps, gidMaps)
+	if err != nil {
+		return nil, err
+	}
+	// Create the driver home dir, including the link directory inside it
+	if err := idtools.MkdirAllAs(path.Join(home, linkDir), 0700, rootUID, rootGID); err != nil && !os.IsExist(err) {
+		return nil, err
+	}
+
+	if err := mount.MakePrivate(home); err != nil {
+		return nil, err
+	}
+
+	d := &Driver{
+		home:    home,
+		uidMaps: uidMaps,
+		gidMaps: gidMaps,
+		ctr:     graphdriver.NewRefCounter(graphdriver.NewFsChecker(graphdriver.FsMagicOverlay)),
+	}
+
+	return d, nil
+}
+
+// supportsOverlay reports whether the host lists overlay as a supported
+// filesystem. It returns nil when /proc/filesystems contains the line
+// "nodev\toverlay", graphdriver.ErrNotSupported when it does not, and the
+// underlying error when /proc/filesystems cannot be opened or read.
+func supportsOverlay() error {
+	// We can try to modprobe overlay first before looking at
+	// proc/filesystems for when overlay is supported. The result is
+	// deliberately ignored: the file check below is what decides.
+	exec.Command("modprobe", "overlay").Run()
+
+	f, err := os.Open("/proc/filesystems")
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		if s.Text() == "nodev\toverlay" {
+			return nil
+		}
+	}
+	// A read failure ends the scan early; report it instead of claiming
+	// that overlay is missing.
+	if err := s.Err(); err != nil {
+		return err
+	}
+	logrus.Error("'overlay' not found as a supported filesystem on this host. Please ensure kernel is new enough and has overlay support loaded.")
+	return graphdriver.ErrNotSupported
+}
+
+// String returns the name of this driver, driverName.
+func (d *Driver) String() string {
+	return driverName
+}
+
+// Status returns current driver information as a list of key/value pairs.
+// The only entry is "Backing Filesystem", holding the filesystem name that
+// Init detected for the home directory ("<unknown>" if it was not recognised).
+func (d *Driver) Status() [][2]string {
+	return [][2]string{
+		{"Backing Filesystem", backingFs},
+	}
+}
+
+// GetMetadata describes where the layer id keeps its data. The result always
+// has the keys "WorkDir", "MergedDir" and "UpperDir"; "LowerDir" is added, as
+// a ":"-separated list, only when the layer has lower directories. An error
+// is returned when the layer directory cannot be stat'ed or its lower
+// directories cannot be resolved.
+func (d *Driver) GetMetadata(id string) (map[string]string, error) {
+	layerDir := d.dir(id)
+	if _, statErr := os.Stat(layerDir); statErr != nil {
+		return nil, statErr
+	}
+
+	lowers, err := d.getLowerDirs(id)
+	if err != nil {
+		return nil, err
+	}
+
+	info := make(map[string]string, 4)
+	info["WorkDir"] = path.Join(layerDir, "work")
+	info["MergedDir"] = path.Join(layerDir, "merged")
+	info["UpperDir"] = path.Join(layerDir, "diff")
+	if len(lowers) != 0 {
+		info["LowerDir"] = strings.Join(lowers, ":")
+	}
+	return info, nil
+}
+
+// Cleanup removes any state created by overlay which should be cleaned up
+// when the daemon is being shut down. For now, this only unmounts the driver
+// home directory.
+func (d *Driver) Cleanup() error {
+	return mount.Unmount(d.home)
+}
+
+// CreateReadWrite creates a layer that is writable for use as a container
+// file system. It simply delegates to Create with the same arguments.
+func (d *Driver) CreateReadWrite(id, parent, mountLabel string, storageOpt map[string]string) error {
+	return d.Create(id, parent, mountLabel, storageOpt)
+}
+
+// Create is used to create the upper, lower, and merge directories required for overlay fs for a given id.
+// The parent filesystem is used to configure these directories for the overlay.
+//
+// Every layer gets a "diff" directory, a symlink to it in the link directory
+// and a "link" file recording the symlink's name. When parent is non-empty,
+// "work" and "merged" directories are created as well, and the chain of lower
+// layers is written to the "lower" file. storageOpt must be empty. On failure
+// everything created for the layer, including the symlink, is removed again.
+func (d *Driver) Create(id, parent, mountLabel string, storageOpt map[string]string) (retErr error) {
+
+	if len(storageOpt) != 0 {
+		return fmt.Errorf("--storage-opt is not supported for overlay")
+	}
+
+	dir := d.dir(id)
+
+	rootUID, rootGID, err := idtools.GetRootUIDGID(d.uidMaps, d.gidMaps)
+	if err != nil {
+		return err
+	}
+	if err := idtools.MkdirAllAs(path.Dir(dir), 0700, rootUID, rootGID); err != nil {
+		return err
+	}
+	if err := idtools.MkdirAs(dir, 0700, rootUID, rootGID); err != nil {
+		return err
+	}
+
+	defer func() {
+		// Clean up on failure
+		if retErr != nil {
+			os.RemoveAll(dir)
+		}
+	}()
+
+	if err := idtools.MkdirAs(path.Join(dir, "diff"), 0755, rootUID, rootGID); err != nil {
+		return err
+	}
+
+	lid := generateID(idLength)
+	linkPath := path.Join(d.home, linkDir, lid)
+	// The link target is relative so that it stays valid regardless of
+	// where the home directory lives.
+	if err := os.Symlink(path.Join("..", id, "diff"), linkPath); err != nil {
+		return err
+	}
+	defer func() {
+		// The symlink lives outside dir, so removing dir on failure does
+		// not reach it; remove it explicitly to avoid a dangling link.
+		if retErr != nil {
+			os.Remove(linkPath)
+		}
+	}()
+
+	// Write link id to link file
+	if err := ioutil.WriteFile(path.Join(dir, "link"), []byte(lid), 0644); err != nil {
+		return err
+	}
+
+	// if no parent directory, done
+	if parent == "" {
+		return nil
+	}
+
+	if err := idtools.MkdirAs(path.Join(dir, "work"), 0700, rootUID, rootGID); err != nil {
+		return err
+	}
+	if err := idtools.MkdirAs(path.Join(dir, "merged"), 0700, rootUID, rootGID); err != nil {
+		return err
+	}
+
+	lower, err := d.getLower(parent)
+	if err != nil {
+		return err
+	}
+	if lower != "" {
+		if err := ioutil.WriteFile(path.Join(dir, lowerFile), []byte(lower), 0666); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// getLower builds the content of the "lower" file for a child of parent: the
+// parent's own link (relative to the home directory) followed by the parent's
+// lower layers, joined with ":" and ordered from uppermost to lowermost.
+// It fails when the parent does not exist, when its "link" file cannot be
+// read, when its "lower" file exists but cannot be read, or when the chain
+// would contain more than maxDepth entries.
+func (d *Driver) getLower(parent string) (string, error) {
+	parentDir := d.dir(parent)
+
+	// Ensure parent exists
+	if _, err := os.Lstat(parentDir); err != nil {
+		return "", err
+	}
+
+	// Read Parent link file
+	parentLink, err := ioutil.ReadFile(path.Join(parentDir, "link"))
+	if err != nil {
+		return "", err
+	}
+	lowers := []string{path.Join(linkDir, string(parentLink))}
+
+	parentLower, err := ioutil.ReadFile(path.Join(parentDir, lowerFile))
+	switch {
+	case err == nil:
+		lowers = append(lowers, strings.Split(string(parentLower), ":")...)
+	case !os.IsNotExist(err):
+		// A missing file just means the parent is a base layer. Any other
+		// failure must not be mistaken for that, or the new layer would
+		// silently lose its ancestors.
+		return "", err
+	}
+	if len(lowers) > maxDepth {
+		return "", errors.New("max depth exceeded")
+	}
+	return strings.Join(lowers, ":"), nil
+}
+
+// dir returns the path of the directory for layer id inside the driver home.
+func (d *Driver) dir(id string) string {
+	return path.Join(d.home, id)
+}
+
+// getLowerDirs returns the "diff" directories of all layers below id, in the
+// order recorded in the layer's "lower" file. Each entry of that file names a
+// symlink relative to the home directory; the symlink is read and its target
+// resolved against the link directory. A layer without a "lower" file yields
+// a nil slice and no error.
+func (d *Driver) getLowerDirs(id string) ([]string, error) {
+	var lowersArray []string
+	lowers, err := ioutil.ReadFile(path.Join(d.dir(id), lowerFile))
+	if err == nil {
+		for _, s := range strings.Split(string(lowers), ":") {
+			lp, err := os.Readlink(path.Join(d.home, s))
+			if err != nil {
+				return nil, err
+			}
+			// Link targets are relative to the directory containing the
+			// links, so resolve them against linkDir rather than a
+			// hard-coded name.
+			lowersArray = append(lowersArray, path.Clean(path.Join(d.home, linkDir, lp)))
+		}
+	} else if !os.IsNotExist(err) {
+		return nil, err
+	}
+	return lowersArray, nil
+}
+
+// Remove cleans the directories that are created for this id, together with
+// the layer's symlink in the link directory. Removing a layer that does not
+// exist is not an error.
+func (d *Driver) Remove(id string) error {
+	dir := d.dir(id)
+
+	// The symlink is stored outside the layer directory, so it has to be
+	// removed separately or it would be left dangling. Its name is recorded
+	// in the "link" file; if that cannot be read there is nothing to remove.
+	if lid, err := ioutil.ReadFile(path.Join(dir, "link")); err == nil {
+		if err := os.RemoveAll(path.Join(d.home, linkDir, string(lid))); err != nil {
+			logrus.Debugf("Failed to remove link: %v", err)
+		}
+	}
+
+	if err := os.RemoveAll(dir); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	return nil
+}
+
+// Get creates and mounts the required file system for the given id and returns the mount path.
+//
+// A layer without a "lower" file is not mounted at all: its "diff" directory
+// is returned directly. Otherwise the overlay is mounted on the layer's
+// "merged" directory and that path is returned. Mounts are reference counted,
+// so only the first Get for a layer performs the mount. If anything fails
+// after the count was incremented, the count is rolled back and the mount
+// point is unmounted once no reference remains.
+func (d *Driver) Get(id string, mountLabel string) (s string, err error) {
+	dir := d.dir(id)
+	if _, err := os.Stat(dir); err != nil {
+		return "", err
+	}
+
+	diffDir := path.Join(dir, "diff")
+	lowers, err := ioutil.ReadFile(path.Join(dir, lowerFile))
+	if err != nil {
+		// If no lower, just return diff directory
+		if os.IsNotExist(err) {
+			return diffDir, nil
+		}
+		return "", err
+	}
+
+	mergedDir := path.Join(dir, "merged")
+	// Already mounted by an earlier caller: just hand out the path.
+	if count := d.ctr.Increment(mergedDir); count > 1 {
+		return mergedDir, nil
+	}
+	// Reads the named result err, so it runs for every error return below.
+	defer func() {
+		if err != nil {
+			if c := d.ctr.Decrement(mergedDir); c <= 0 {
+				syscall.Unmount(mergedDir, 0)
+			}
+		}
+	}()
+
+	workDir := path.Join(dir, "work")
+	// All paths in the options are relative to the home directory, which is
+	// the directory mountFrom performs the mount from.
+	opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", string(lowers), path.Join(id, "diff"), path.Join(id, "work"))
+	mountLabel = label.FormatMountLabel(opts, mountLabel)
+	if len(mountLabel) > syscall.Getpagesize() {
+		return "", fmt.Errorf("cannot mount layer, mount label too large %d", len(mountLabel))
+	}
+
+	if err := mountFrom(d.home, "overlay", path.Join(id, "merged"), "overlay", mountLabel); err != nil {
+		return "", fmt.Errorf("error creating overlay mount to %s: %v", mergedDir, err)
+	}
+
+	// chown "workdir/work" to the remapped root UID/GID. Overlay fs inside a
+	// user namespace requires this to move a directory from lower to upper.
+	rootUID, rootGID, err := idtools.GetRootUIDGID(d.uidMaps, d.gidMaps)
+	if err != nil {
+		return "", err
+	}
+
+	if err := os.Chown(path.Join(workDir, "work"), rootUID, rootGID); err != nil {
+		return "", err
+	}
+
+	return mergedDir, nil
+}
+
+// Put unmounts the mount path created for the given id.
+// The unmount only happens once the reference count of the layer's "merged"
+// directory is no longer positive. An unmount failure is logged at debug
+// level and not returned; Put always returns nil.
+func (d *Driver) Put(id string) error {
+	mountpoint := path.Join(d.dir(id), "merged")
+	if count := d.ctr.Decrement(mountpoint); count > 0 {
+		return nil
+	}
+	if err := syscall.Unmount(mountpoint, 0); err != nil {
+		logrus.Debugf("Failed to unmount %s overlay: %v", id, err)
+	}
+	return nil
+}
+
+// Exists reports whether a directory for the given id exists in the driver
+// home. It does not check whether the layer is mounted.
+func (d *Driver) Exists(id string) bool {
+	_, err := os.Stat(d.dir(id))
+	return err == nil
+}
+
+// ApplyDiff unpacks the uncompressed tar stream diff into the "diff"
+// directory of layer id, converting whiteouts to the overlay format, and
+// returns the resulting size of the layer as reported by DiffSize.
+// parent is only forwarded to DiffSize.
+func (d *Driver) ApplyDiff(id string, parent string, diff archive.Reader) (size int64, err error) {
+	target := d.getDiffPath(id)
+
+	logrus.Debugf("Applying tar in %s", target)
+
+	// Overlay doesn't need the parent id to apply the diff
+	untarOpts := &archive.TarOptions{
+		UIDMaps:        d.uidMaps,
+		GIDMaps:        d.gidMaps,
+		WhiteoutFormat: archive.OverlayWhiteoutFormat,
+	}
+	if untarErr := untar(diff, target, untarOpts); untarErr != nil {
+		return 0, untarErr
+	}
+
+	return d.DiffSize(id, parent)
+}
+
+// getDiffPath returns the path of the "diff" directory of layer id.
+func (d *Driver) getDiffPath(id string) string {
+	return path.Join(d.dir(id), "diff")
+}
+
+// DiffSize calculates the changes between the specified id
+// and its parent and returns the size in bytes of the changes
+// relative to its base filesystem directory.
+// The size is taken from the layer's own "diff" directory; parent is not used.
+func (d *Driver) DiffSize(id, parent string) (size int64, err error) {
+	return directory.Size(d.getDiffPath(id))
+}
+
+// Diff returns an uncompressed tar archive of the "diff" directory of layer
+// id, i.e. the changes of the layer relative to its parent, with overlay
+// whiteouts converted on the way out. parent may be "" and is not used.
+func (d *Driver) Diff(id, parent string) (archive.Archive, error) {
+	source := d.getDiffPath(id)
+	logrus.Debugf("Tar with options on %s", source)
+
+	tarOpts := &archive.TarOptions{
+		Compression:    archive.Uncompressed,
+		UIDMaps:        d.uidMaps,
+		GIDMaps:        d.gidMaps,
+		WhiteoutFormat: archive.OverlayWhiteoutFormat,
+	}
+	return archive.TarWithOptions(source, tarOpts)
+}
+
+// Changes lists the changes between layer id and the layers below it.
+// Overlay has no snapshots, so the layer's "diff" directory is compared
+// against all of its lower directories. If the layer has no lower
+// directories, every change is an ADD. parent is not used.
+func (d *Driver) Changes(id, parent string) ([]archive.Change, error) {
+	lowerDirs, err := d.getLowerDirs(id)
+	if err != nil {
+		return nil, err
+	}
+
+	return archive.OverlayChanges(lowerDirs, d.getDiffPath(id))
+}

+ 106 - 0
daemon/graphdriver/overlay2/overlay_test.go

@@ -0,0 +1,106 @@
+// +build linux
+
+package overlay2
+
+import (
+	"os"
+	"syscall"
+	"testing"
+
+	"github.com/docker/docker/daemon/graphdriver"
+	"github.com/docker/docker/daemon/graphdriver/graphtest"
+	"github.com/docker/docker/pkg/archive"
+	"github.com/docker/docker/pkg/reexec"
+)
+
+func init() {
+	// Do not use chroot, to speed up run time and allow archive
+	// errors or hangs to be debugged directly from the test process.
+	untar = archive.UntarUncompressed
+	graphdriver.ApplyUncompressedLayer = archive.ApplyUncompressedLayer
+
+	reexec.Init()
+}
+
+// cdMountFrom performs the mount from within dir in the current process,
+// without re-executing: it changes into dir, mounts, and changes back to the
+// previous working directory. It returns an error if the working directory
+// cannot be determined, if dir cannot be entered, or if the mount fails.
+func cdMountFrom(dir, device, target, mType, label string) error {
+	wd, err := os.Getwd()
+	if err != nil {
+		return err
+	}
+	// Mounting from the wrong directory would resolve the relative paths
+	// against the wrong root, so a failed chdir must abort.
+	if err := os.Chdir(dir); err != nil {
+		return err
+	}
+	defer os.Chdir(wd)
+
+	return syscall.Mount(device, target, mType, 0, label)
+}
+
+// TestOverlaySetup obtains the driver once so that it can be shared by the
+// tests that follow.
+// This avoids creating a new driver for each test if all tests are run
+// Make sure to put new tests between TestOverlaySetup and TestOverlayTeardown
+func TestOverlaySetup(t *testing.T) {
+	graphtest.GetDriver(t, driverName)
+}
+
+// TestOverlayCreateEmpty runs the shared create-empty test against this driver.
+func TestOverlayCreateEmpty(t *testing.T) {
+	graphtest.DriverTestCreateEmpty(t, driverName)
+}
+
+// TestOverlayCreateBase runs the shared create-base test against this driver.
+func TestOverlayCreateBase(t *testing.T) {
+	graphtest.DriverTestCreateBase(t, driverName)
+}
+
+// TestOverlayCreateSnap runs the shared create-snapshot test against this driver.
+func TestOverlayCreateSnap(t *testing.T) {
+	graphtest.DriverTestCreateSnap(t, driverName)
+}
+
+// TestOverlay128LayerRead runs the shared deep-layer read test with a depth
+// argument of 128, the same value as the driver's maxDepth.
+func TestOverlay128LayerRead(t *testing.T) {
+	graphtest.DriverTestDeepLayerRead(t, 128, driverName)
+}
+
+// TestOverlayDiffApply10Files runs the shared diff/apply test with a file
+// count argument of 10 against this driver.
+func TestOverlayDiffApply10Files(t *testing.T) {
+	graphtest.DriverTestDiffApply(t, 10, driverName)
+}
+
+// TestOverlayChanges runs the shared Changes test against this driver.
+func TestOverlayChanges(t *testing.T) {
+	graphtest.DriverTestChanges(t, driverName)
+}
+
+// TestOverlayTeardown releases the driver shared by the tests above by
+// calling graphtest.PutDriver; it must stay the last of the shared-driver tests.
+func TestOverlayTeardown(t *testing.T) {
+	graphtest.PutDriver(t)
+}
+
+// Benchmarks should always setup new driver
+
+// BenchmarkExists runs the shared Exists benchmark against this driver.
+func BenchmarkExists(b *testing.B) {
+	graphtest.DriverBenchExists(b, driverName)
+}
+
+// BenchmarkGetEmpty runs the shared Get-on-empty-layer benchmark against this
+// driver.
+func BenchmarkGetEmpty(b *testing.B) {
+	graphtest.DriverBenchGetEmpty(b, driverName)
+}
+
+// BenchmarkDiffBase runs the shared root-layer Diff benchmark against this
+// driver.
+func BenchmarkDiffBase(b *testing.B) {
+	graphtest.DriverBenchDiffBase(b, driverName)
+}
+
+// BenchmarkDiffSmallUpper runs DriverBenchDiffN with size arguments 10 and 10
+// against this driver.
+// NOTE(review): the two numbers are presumably the bottom- and upper-layer
+// file counts; confirm against graphtest.DriverBenchDiffN.
+func BenchmarkDiffSmallUpper(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10, 10, driverName)
+}
+
+// BenchmarkDiff10KFileUpper runs DriverBenchDiffN with size arguments 10 and
+// 10000 against this driver.
+// NOTE(review): the two numbers are presumably the bottom- and upper-layer
+// file counts; confirm against graphtest.DriverBenchDiffN.
+func BenchmarkDiff10KFileUpper(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10, 10000, driverName)
+}
+
+// BenchmarkDiff10KFilesBottom runs DriverBenchDiffN with size arguments 10000
+// and 10 against this driver.
+// NOTE(review): the two numbers are presumably the bottom- and upper-layer
+// file counts; confirm against graphtest.DriverBenchDiffN.
+func BenchmarkDiff10KFilesBottom(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10000, 10, driverName)
+}
+
+// BenchmarkDiffApply100 runs the shared diff/apply benchmark with a count
+// argument of 100 against this driver.
+func BenchmarkDiffApply100(b *testing.B) {
+	graphtest.DriverBenchDiffApplyN(b, 100, driverName)
+}
+
+// BenchmarkDiff20Layers runs the shared deep-layer Diff benchmark with a
+// depth argument of 20 against this driver.
+func BenchmarkDiff20Layers(b *testing.B) {
+	graphtest.DriverBenchDeepLayerDiff(b, 20, driverName)
+}
+
+// BenchmarkRead20Layers runs the shared deep-layer read benchmark with a
+// depth argument of 20 against this driver.
+func BenchmarkRead20Layers(b *testing.B) {
+	graphtest.DriverBenchDeepLayerRead(b, 20, driverName)
+}

+ 3 - 0
daemon/graphdriver/overlay2/overlay_unsupported.go

@@ -0,0 +1,3 @@
+// +build !linux
+
+package overlay2

+ 80 - 0
daemon/graphdriver/overlay2/randomid.go

@@ -0,0 +1,80 @@
+// +build linux
+
+package overlay2
+
+import (
+	"crypto/rand"
+	"encoding/base32"
+	"fmt"
+	"io"
+	"os"
+	"syscall"
+	"time"
+
+	"github.com/Sirupsen/logrus"
+)
+
+// generateID creates a new random string identifier with the given length.
+//
+// It reads just enough random bytes to produce l base32 characters and
+// returns the first l characters of their standard base32 encoding. A read
+// failure that retryOnError considers transient is retried up to maxretries
+// times, sleeping retries*backoff before each attempt; any other failure, or
+// running out of retries, panics.
+func generateID(l int) string {
+	const (
+		// ensures we backoff for less than 450ms total. Use the following to
+		// select new value, in units of 10ms:
+		// 	n*(n+1)/2 = d -> n^2 + n - 2d -> n = (sqrt(8d + 1) - 1)/2
+		maxretries = 9
+		backoff    = time.Millisecond * 10
+	)
+
+	var (
+		totalBackoff time.Duration
+		count        int
+		retries      int
+		// each base32 character encodes 5 bits; round up to whole bytes
+		size = (l*5 + 7) / 8
+		u    = make([]byte, size)
+	)
+	// TODO: Include time component, counter component, random component
+
+	for {
+		// This should never block but the read may fail. Because of this,
+		// we just try to read the random number generator until we get
+		// something. This is a very rare condition but may happen.
+		b := time.Duration(retries) * backoff
+		time.Sleep(b)
+		totalBackoff += b
+
+		n, err := io.ReadFull(rand.Reader, u[count:])
+		if err != nil {
+			if retryOnError(err) && retries < maxretries {
+				// keep the bytes already read and only fill the rest
+				count += n
+				retries++
+				logrus.Errorf("error generating random id, retrying: %v", err)
+				continue
+			}
+
+			// Any other errors represent a system problem. What did someone
+			// do to /dev/urandom?
+			panic(fmt.Errorf("error reading random number generator, retried for %v: %v", totalBackoff.String(), err))
+		}
+
+		break
+	}
+
+	s := base32.StdEncoding.EncodeToString(u)
+
+	return s[:l]
+}
+
+// retryOnError reports whether retrying after err may succeed. Any number of
+// *os.PathError wrappers is peeled off first; the remaining error is
+// retryable only when it is syscall.EPERM, which represents an entropy pool
+// exhaustion, a condition under which we backoff and retry.
+func retryOnError(err error) bool {
+	for {
+		pathErr, wrapped := err.(*os.PathError)
+		if !wrapped {
+			break
+		}
+		err = pathErr.Err // unpack the target error
+	}
+
+	errno, isErrno := err.(syscall.Errno)
+	return isErrno && errno == syscall.EPERM
+}

+ 1 - 0
daemon/graphdriver/register/register_overlay.go

@@ -5,4 +5,5 @@ package register
 import (
 	// register the overlay graphdriver
 	_ "github.com/docker/docker/daemon/graphdriver/overlay"
+	_ "github.com/docker/docker/daemon/graphdriver/overlay2"
 )

+ 8 - 3
docs/reference/commandline/dockerd.md

@@ -204,7 +204,7 @@ TCP and a Unix socket
 ### Daemon storage-driver option
 
 The Docker daemon has support for several different image layer storage
-drivers: `aufs`, `devicemapper`, `btrfs`, `zfs` and `overlay`.
+drivers: `aufs`, `devicemapper`, `btrfs`, `zfs`, `overlay` and `overlay2`.
 
 The `aufs` driver is the oldest, but is based on a Linux kernel patch-set that
 is unlikely to be merged into the main kernel. These are also known to cause
@@ -242,9 +242,14 @@ Linux kernel as of [3.18.0](https://lkml.org/lkml/2014/10/26/137). Call
 > inode consumption (especially as the number of images grows), as well as
 > being incompatible with the use of RPMs.
 
+The `overlay2` driver uses the same fast union filesystem but takes advantage of
+[additional features](https://lkml.org/lkml/2015/2/11/106) added in Linux
+kernel 4.0 to avoid excessive inode consumption. Call `dockerd -s overlay2`
+to use it.
+
 > **Note:**
-> It is currently unsupported on `btrfs` or any Copy on Write filesystem
-> and should only be used over `ext4` partitions.
+> Both `overlay` and `overlay2` are currently unsupported on `btrfs` or any
+> Copy on Write filesystem and should only be used over `ext4` partitions.
 
 ### Storage driver options
 

+ 35 - 16
docs/userguide/storagedriver/selectadriver.md

@@ -34,14 +34,14 @@ and all containers created by that daemon instance use the same storage driver.
  The table below shows the supported storage driver technologies and their
 driver names:
 
-|Technology    |Storage driver name  |
-|--------------|---------------------|
-|OverlayFS     |`overlay`            |
-|AUFS          |`aufs`               |
-|Btrfs         |`btrfs`              |
-|Device Mapper |`devicemapper`       |
-|VFS           |`vfs`                |
-|ZFS           |`zfs`                |
+|Technology    |Storage driver name    |
+|--------------|-----------------------|
+|OverlayFS     |`overlay` or `overlay2`|
+|AUFS          |`aufs`                 |
+|Btrfs         |`btrfs`                |
+|Device Mapper |`devicemapper`         |
+|VFS           |`vfs`                  |
+|ZFS           |`zfs`                  |
 
 To find out which storage driver is set on the daemon, you use the
 `docker info` command:
@@ -71,14 +71,15 @@ For example, the `btrfs` storage driver on a Btrfs backing filesystem. The
 following table lists each storage driver and whether it must match the host's
 backing file system:
 
-|Storage driver |Commonly used on |Disabled on                              |
-|---------------|-----------------|-----------------------------------------|
-|`overlay`      |`ext4` `xfs`     |`btrfs` `aufs` `overlay` `zfs` `eCryptfs`|
-|`aufs`         |`ext4` `xfs`     |`btrfs` `aufs` `eCryptfs`                |
-|`btrfs`        |`btrfs` _only_   |   N/A                                   |
-|`devicemapper` |`direct-lvm`     |   N/A                                   |
-|`vfs`          |debugging only   |   N/A                                   |
-|`zfs`          |`zfs` _only_     |   N/A                                   |
+|Storage driver |Commonly used on |Disabled on                                         |
+|---------------|-----------------|----------------------------------------------------|
+|`overlay`      |`ext4` `xfs`     |`btrfs` `aufs` `overlay` `overlay2` `zfs` `eCryptfs`|
+|`overlay2`     |`ext4` `xfs`     |`btrfs` `aufs` `overlay` `overlay2` `zfs` `eCryptfs`|
+|`aufs`         |`ext4` `xfs`     |`btrfs` `aufs` `eCryptfs`                           |
+|`btrfs`        |`btrfs` _only_   |   N/A                                              |
+|`devicemapper` |`direct-lvm`     |   N/A                                              |
+|`vfs`          |debugging only   |   N/A                                              |
+|`zfs`          |`zfs` _only_     |   N/A                                              |
 
 
 > **Note**
@@ -198,6 +199,24 @@ the guidance offered by the table below along with the points mentioned above.
 
 ![](images/driver-pros-cons.png)
 
+### Overlay vs Overlay2
+
+OverlayFS has 2 storage drivers which both make use of the same OverlayFS
+technology but with different implementations and incompatible on-disk
+storage. Since the storage is incompatible, switching between the two
+will require re-creating all image content. The `overlay` driver is the
+original implementation and the only option in Docker 1.11 and before.
+The `overlay` driver has known limitations with inode exhaustion and
+commit performance. The `overlay2` driver addresses these limitations, but
+is only compatible with Linux kernel 4.0 and later. For users on a pre-4.0
+kernel or with an existing `overlay` graph, it is recommended to stay
+on `overlay`. Users with at least a 4.0 kernel and no existing or required
+`overlay` graph data may use `overlay2`.
+
+> **Note**
+> `overlay2` graph data will not interfere with `overlay` graph data. However
+> when switching to `overlay2`, the user is responsible for removing
+> `overlay` graph data to avoid storage duplication.
 
 ## Related information
 

+ 1 - 1
man/dockerd.8.md

@@ -226,7 +226,7 @@ output otherwise.
   Force the Docker runtime to use a specific storage driver.
 
 **--selinux-enabled**=*true*|*false*
-  Enable selinux support. Default is false. SELinux does not presently support the overlay storage driver.
+  Enable selinux support. Default is false. SELinux does not presently support either of the overlay storage drivers.
 
 **--storage-opt**=[]
   Set storage driver options. See STORAGE DRIVER OPTIONS.

+ 49 - 5
pkg/archive/archive.go

@@ -33,6 +33,8 @@ type (
 	Reader io.Reader
 	// Compression is the state represents if compressed or not.
 	Compression int
+	// WhiteoutFormat is the format of whiteouts unpacked
+	WhiteoutFormat int
 	// TarChownOptions wraps the chown options UID and GID.
 	TarChownOptions struct {
 		UID, GID int
@@ -47,6 +49,10 @@ type (
 		GIDMaps          []idtools.IDMap
 		ChownOpts        *TarChownOptions
 		IncludeSourceDir bool
+		// WhiteoutFormat is the expected on disk format for whiteout files.
+		// This format will be converted to the standard format on pack
+		// and from the standard format on unpack.
+		WhiteoutFormat WhiteoutFormat
 		// When unpacking, specifies whether overwriting a directory with a
 		// non-directory is allowed and vice versa.
 		NoOverwriteDirNonDir bool
@@ -93,6 +99,14 @@ const (
 	Xz
 )
 
+const (
+	// AUFSWhiteoutFormat is the default format for whiteouts
+	AUFSWhiteoutFormat WhiteoutFormat = iota
+	// OverlayWhiteoutFormat formats whiteout according to the overlay
+	// standard.
+	OverlayWhiteoutFormat
+)
+
 // IsArchive checks for the magic bytes of a tar or any supported compression
 // algorithm.
 func IsArchive(header []byte) bool {
@@ -228,6 +242,11 @@ func (compression *Compression) Extension() string {
 	return ""
 }
 
+// tarWhiteoutConverter converts whiteout entries between an on-disk format
+// and the format used inside tar archives.
+type tarWhiteoutConverter interface {
+	// ConvertWrite is called while packing, with the header about to be
+	// written, the path of the file on disk and its file info. It may
+	// modify the header in place.
+	ConvertWrite(*tar.Header, string, os.FileInfo) error
+	// ConvertRead is called while unpacking, with the header read from the
+	// archive and the destination path. The returned bool tells the caller
+	// whether the entry should still be written to disk; when it is false
+	// the entry is skipped.
+	ConvertRead(*tar.Header, string) (bool, error)
+}
+
 type tarAppender struct {
 	TarWriter *tar.Writer
 	Buffer    *bufio.Writer
@@ -236,6 +255,12 @@ type tarAppender struct {
 	SeenFiles map[uint64]string
 	UIDMaps   []idtools.IDMap
 	GIDMaps   []idtools.IDMap
+
+	// For packing and unpacking whiteout files in the
+	// non standard format. The whiteout files defined
+	// by the AUFS standard are used as the tar whiteout
+	// standard.
+	WhiteoutConverter tarWhiteoutConverter
 }
 
 // canonicalTarName provides a platform-independent and consistent posix-style
@@ -253,6 +278,7 @@ func canonicalTarName(name string, isDir bool) (string, error) {
 	return name, nil
 }
 
+// addTarFile adds to the tar archive a file from `path` as `name`
 func (ta *tarAppender) addTarFile(path, name string) error {
 	fi, err := os.Lstat(path)
 	if err != nil {
@@ -323,6 +349,12 @@ func (ta *tarAppender) addTarFile(path, name string) error {
 		hdr.Gid = xGID
 	}
 
+	if ta.WhiteoutConverter != nil {
+		if err := ta.WhiteoutConverter.ConvertWrite(hdr, path, fi); err != nil {
+			return err
+		}
+	}
+
 	if err := ta.TarWriter.WriteHeader(hdr); err != nil {
 		return err
 	}
@@ -508,11 +540,12 @@ func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error)
 
 	go func() {
 		ta := &tarAppender{
-			TarWriter: tar.NewWriter(compressWriter),
-			Buffer:    pools.BufioWriter32KPool.Get(nil),
-			SeenFiles: make(map[uint64]string),
-			UIDMaps:   options.UIDMaps,
-			GIDMaps:   options.GIDMaps,
+			TarWriter:         tar.NewWriter(compressWriter),
+			Buffer:            pools.BufioWriter32KPool.Get(nil),
+			SeenFiles:         make(map[uint64]string),
+			UIDMaps:           options.UIDMaps,
+			GIDMaps:           options.GIDMaps,
+			WhiteoutConverter: getWhiteoutConverter(options.WhiteoutFormat),
 		}
 
 		defer func() {
@@ -674,6 +707,7 @@ func Unpack(decompressedArchive io.Reader, dest string, options *TarOptions) err
 	if err != nil {
 		return err
 	}
+	whiteoutConverter := getWhiteoutConverter(options.WhiteoutFormat)
 
 	// Iterate through the files in the archive.
 loop:
@@ -773,6 +807,16 @@ loop:
 			hdr.Gid = xGID
 		}
 
+		if whiteoutConverter != nil {
+			writeFile, err := whiteoutConverter.ConvertRead(hdr, path)
+			if err != nil {
+				return err
+			}
+			if !writeFile {
+				continue
+			}
+		}
+
 		if err := createTarFile(path, dest, hdr, trBuf, !options.NoLchown, options.ChownOpts); err != nil {
 			return err
 		}

+ 89 - 0
pkg/archive/archive_linux.go

@@ -0,0 +1,89 @@
+package archive
+
+import (
+	"archive/tar"
+	"os"
+	"path/filepath"
+	"strings"
+	"syscall"
+
+	"github.com/docker/docker/pkg/system"
+)
+
+// getWhiteoutConverter returns the converter to use for the given whiteout
+// format, or nil when the format needs no conversion.
+func getWhiteoutConverter(format WhiteoutFormat) tarWhiteoutConverter {
+	if format != OverlayWhiteoutFormat {
+		return nil
+	}
+	return overlayWhiteoutConverter{}
+}
+
+// overlayWhiteoutConverter converts between overlay deletion markers
+// (0/0 character devices and the "trusted.overlay.opaque" xattr) and
+// AUFS-style whiteout files. It carries no state.
+type overlayWhiteoutConverter struct{}
+
+// ConvertWrite rewrites hdr in place so that overlay deletion markers are
+// archived in the AUFS whiteout format.
+//
+// hdr is the tar header about to be written for the file at path, and fi is
+// that file's info. A character device whose header has device number 0/0 is
+// treated as an overlay whiteout: it becomes an empty regular file whose base
+// name is prefixed with WhiteoutPrefix. A directory whose
+// "trusted.overlay.opaque" xattr is "y" has its header replaced by one for a
+// WhiteoutOpaqueDir file inside that directory. Any other header is left
+// untouched. The only error returned is a failure to read the xattr.
+func (overlayWhiteoutConverter) ConvertWrite(hdr *tar.Header, path string, fi os.FileInfo) error {
+	// convert whiteouts to AUFS format
+	if fi.Mode()&os.ModeCharDevice != 0 && hdr.Devmajor == 0 && hdr.Devminor == 0 {
+		// Rename the entry and make it a normal file. The prefix must go on
+		// the last path element only, so that a whiteout for "a/b" is named
+		// "a/" + WhiteoutPrefix + "b" rather than WhiteoutPrefix + "a/b";
+		// ConvertRead looks for the prefix on the base name.
+		dir, base := filepath.Split(hdr.Name)
+		hdr.Name = filepath.Join(dir, WhiteoutPrefix+base)
+		hdr.Mode = 0600
+		hdr.Typeflag = tar.TypeReg
+		// The whiteout marker has no body.
+		hdr.Size = 0
+	}
+
+	if fi.Mode()&os.ModeDir != 0 {
+		// convert opaque dirs to AUFS format by writing an empty file with the prefix
+		opaque, err := system.Lgetxattr(path, "trusted.overlay.opaque")
+		if err != nil {
+			return err
+		}
+		if len(opaque) == 1 && opaque[0] == 'y' {
+			// create a header for the whiteout file
+			// it should inherit some properties from the parent, but be a regular file
+			//
+			// NOTE(review): this overwrites the directory's own header, so
+			// only the opaque marker is left in hdr for this path; confirm
+			// that dropping the directory entry is intended.
+			*hdr = tar.Header{
+				Typeflag:   tar.TypeReg,
+				Mode:       hdr.Mode & int64(os.ModePerm),
+				Name:       filepath.Join(hdr.Name, WhiteoutOpaqueDir),
+				Size:       0,
+				Uid:        hdr.Uid,
+				Uname:      hdr.Uname,
+				Gid:        hdr.Gid,
+				Gname:      hdr.Gname,
+				AccessTime: hdr.AccessTime,
+				ChangeTime: hdr.ChangeTime,
+			}
+		}
+	}
+
+	return nil
+}
+
+// ConvertRead translates an AUFS-style whiteout entry destined for path into
+// its overlay equivalent on disk. The returned bool tells the caller whether
+// it should still create the entry itself: it is false whenever the entry
+// was a whiteout marker (or handling one failed), and true for every other
+// entry.
+func (overlayWhiteoutConverter) ConvertRead(hdr *tar.Header, path string) (bool, error) {
+	parent, name := filepath.Dir(path), filepath.Base(path)
+
+	switch {
+	case name == WhiteoutOpaqueDir:
+		// The AUFS opaque marker becomes an xattr on the containing
+		// directory; the marker file itself is never written.
+		err := syscall.Setxattr(parent, "trusted.overlay.opaque", []byte{'y'}, 0)
+		return false, err
+
+	case strings.HasPrefix(name, WhiteoutPrefix):
+		// A deleted file is represented in overlay by a character device
+		// created under the original (unprefixed) name.
+		target := filepath.Join(parent, name[len(WhiteoutPrefix):])
+		if err := syscall.Mknod(target, syscall.S_IFCHR, 0); err != nil {
+			return false, err
+		}
+		// The marker file itself is never written.
+		err := os.Chown(target, hdr.Uid, hdr.Gid)
+		return false, err
+	}
+
+	return true, nil
+}

+ 7 - 0
pkg/archive/archive_other.go

@@ -0,0 +1,7 @@
+// +build !linux
+
+package archive
+
+// getWhiteoutConverter always returns nil in this non-Linux build, whatever
+// the requested format.
+func getWhiteoutConverter(format WhiteoutFormat) tarWhiteoutConverter {
+	return nil
+}

+ 38 - 8
pkg/archive/changes.go

@@ -81,6 +81,33 @@ func sameFsTimeSpec(a, b syscall.Timespec) bool {
 // Changes walks the path rw and determines changes for the files in the path,
 // with respect to the parent layers
 func Changes(layers []string, rw string) ([]Change, error) {
+	// Deletions are detected via aufsDeletedFile (whiteout-prefixed names)
+	// and paths matched by aufsMetadataSkip are left out.
+	return changes(layers, rw, aufsDeletedFile, aufsMetadataSkip)
+}
+
+// aufsMetadataSkip reports whether path matches the AUFS metadata pattern
+// (path separator + WhiteoutMetaPrefix + "*"). If the pattern match itself
+// fails, skip is true and the error is returned alongside it.
+func aufsMetadataSkip(path string) (skip bool, err error) {
+	pattern := string(os.PathSeparator) + WhiteoutMetaPrefix + "*"
+	if skip, err = filepath.Match(pattern, path); err != nil {
+		return true, err
+	}
+	return skip, nil
+}
+
+// aufsDeletedFile returns the path of the file that the whiteout at path
+// marks as removed, or "" when path is not a whiteout. root and fi are not
+// consulted, and the returned error is always nil.
+func aufsDeletedFile(root, path string, fi os.FileInfo) (string, error) {
+	name := filepath.Base(path)
+	if !strings.HasPrefix(name, WhiteoutPrefix) {
+		return "", nil
+	}
+
+	// A whiteout means the file with the unprefixed name was removed.
+	removed := strings.TrimPrefix(name, WhiteoutPrefix)
+	return filepath.Join(filepath.Dir(path), removed), nil
+}
+
+// skipChange reports whether the walked path should be left out of the
+// change list; when it returns true its error ends the walk step.
+type skipChange func(string) (bool, error)
+// deleteChange is given the walk root, the walked path and its file info,
+// and returns the path that was deleted, or "" if the entry is not a
+// deletion marker.
+type deleteChange func(string, string, os.FileInfo) (string, error)
+
+func changes(layers []string, rw string, dc deleteChange, sc skipChange) ([]Change, error) {
 	var (
 		changes     []Change
 		changedDirs = make(map[string]struct{})
@@ -105,21 +132,24 @@ func Changes(layers []string, rw string) ([]Change, error) {
 			return nil
 		}
 
-		// Skip AUFS metadata
-		if matched, err := filepath.Match(string(os.PathSeparator)+WhiteoutMetaPrefix+"*", path); err != nil || matched {
-			return err
+		if sc != nil {
+			if skip, err := sc(path); skip {
+				return err
+			}
 		}
 
 		change := Change{
 			Path: path,
 		}
 
+		deletedFile, err := dc(rw, path, f)
+		if err != nil {
+			return err
+		}
+
 		// Find out what kind of modification happened
-		file := filepath.Base(path)
-		// If there is a whiteout, then the file was removed
-		if strings.HasPrefix(file, WhiteoutPrefix) {
-			originalFile := file[len(WhiteoutPrefix):]
-			change.Path = filepath.Join(filepath.Dir(path), originalFile)
+		if deletedFile != "" {
+			change.Path = deletedFile
 			change.Kind = ChangeDelete
 		} else {
 			// Otherwise, the file was added

+ 27 - 0
pkg/archive/changes_linux.go

@@ -283,3 +283,30 @@ func clen(n []byte) int {
 	}
 	return len(n)
 }
+
+// OverlayChanges walks the path rw and determines changes for the files in the path,
+// with respect to the parent layers. Deletions are detected with
+// overlayDeletedFile and no skip function is supplied.
+func OverlayChanges(layers []string, rw string) ([]Change, error) {
+	return changes(layers, rw, overlayDeletedFile, nil)
+}
+
+// overlayDeletedFile returns path when the entry described by fi is an
+// overlay deletion marker, and "" otherwise. Two markers are recognised: a
+// character device with device number 0/0, and a directory whose
+// "trusted.overlay.opaque" xattr is "y". root is joined with path to locate
+// the directory when reading the xattr; a failure to read it is returned.
+func overlayDeletedFile(root, path string, fi os.FileInfo) (string, error) {
+	mode := fi.Mode()
+
+	if mode&os.ModeCharDevice != 0 {
+		rdev := uint64(fi.Sys().(*syscall.Stat_t).Rdev)
+		if major(rdev) == 0 && minor(rdev) == 0 {
+			return path, nil
+		}
+	}
+
+	if mode&os.ModeDir != 0 {
+		value, err := system.Lgetxattr(filepath.Join(root, path), "trusted.overlay.opaque")
+		if err != nil {
+			return "", err
+		}
+		if string(value) == "y" {
+			return path, nil
+		}
+	}
+
+	return "", nil
+}