Browse Source

Merge pull request #22126 from dmcgowan/overlay-native-diff

Overlay multiple lower directory support
Michael Crosby 9 years ago
parent
commit
8a2f9a249c

+ 264 - 0
daemon/graphdriver/graphtest/graphbench_unix.go

@@ -0,0 +1,264 @@
+// +build linux freebsd
+
+package graphtest
+
+import (
+	"bytes"
+	"io"
+	"io/ioutil"
+	"path/filepath"
+	"testing"
+
+	"github.com/docker/docker/pkg/stringid"
+)
+
+// DriverBenchExists benchmarks calls to the driver's Exists method on a
+// single, freshly created layer with a random ID. The driver is obtained
+// through GetDriver and released through PutDriver when the benchmark returns.
+func DriverBenchExists(b *testing.B, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	// Keep driver setup and layer creation out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		if !driver.Exists(base) {
+			b.Fatal("Newly created image doesn't exist")
+		}
+	}
+}
+
+// DriverBenchGetEmpty benchmarks calls to the driver's Get method on an empty
+// layer. Only the Get call itself is timed: the timer is stopped while the
+// result is checked and the layer is released again with Put.
+func DriverBenchGetEmpty(b *testing.B, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	// Keep driver setup and layer creation out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_, err := driver.Get(base, "")
+		// Stop timing before the error check and the matching Put.
+		b.StopTimer()
+		if err != nil {
+			b.Fatalf("Error getting mount: %s", err)
+		}
+		if err := driver.Put(base); err != nil {
+			b.Fatalf("Error putting mount: %s", err)
+		}
+		b.StartTimer()
+	}
+}
+
+// DriverBenchDiffBase benchmarks calls to Diff on a root layer (one created
+// without a parent) that has been populated by addFiles. Each iteration
+// requests the diff against an empty parent ID and drains the returned
+// archive into ioutil.Discard before closing it.
+func DriverBenchDiffBase(b *testing.B, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addFiles(driver, base, 3); err != nil {
+		b.Fatal(err)
+	}
+
+	// Keep layer creation and file population out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		arch, err := driver.Diff(base, "")
+		if err != nil {
+			b.Fatal(err)
+		}
+		// Read the whole archive so the cost of producing it is measured.
+		_, err = io.Copy(ioutil.Discard, arch)
+		if err != nil {
+			b.Fatalf("Error copying archive: %s", err)
+		}
+		arch.Close()
+	}
+}
+
+// DriverBenchDiffN benchmarks calls to diff on two layers with
+// a provided number of files on the lower and upper layers.
+// bottom files are written to the base layer (seed 3) and top files to the
+// layer created on top of it (seed 6). Each iteration diffs the upper layer
+// with an empty parent ID and drains the resulting archive.
+func DriverBenchDiffN(b *testing.B, bottom, top int, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+	base := stringid.GenerateRandomID()
+	upper := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addManyFiles(driver, base, bottom, 3); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := driver.Create(upper, base, "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addManyFiles(driver, upper, top, 6); err != nil {
+		b.Fatal(err)
+	}
+	// Keep layer creation and file population out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		arch, err := driver.Diff(upper, "")
+		if err != nil {
+			b.Fatal(err)
+		}
+		// Read the whole archive so the cost of producing it is measured.
+		_, err = io.Copy(ioutil.Discard, arch)
+		if err != nil {
+			b.Fatalf("Error copying archive: %s", err)
+		}
+		arch.Close()
+	}
+}
+
+// DriverBenchDiffApplyN benchmarks calls to diff and apply together.
+// A base layer and an upper layer are each populated with fileCount files
+// (seeds 3 and 6). Every iteration creates a fresh layer on top of base,
+// then times taking the diff of the upper layer and applying it to the fresh
+// layer. Creating the fresh layer and verifying file contents before and
+// after the apply happen with the timer stopped. The per-iteration layers
+// are not removed by this function.
+func DriverBenchDiffApplyN(b *testing.B, fileCount int, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+	base := stringid.GenerateRandomID()
+	upper := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addManyFiles(driver, base, fileCount, 3); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := driver.Create(upper, base, "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addManyFiles(driver, upper, fileCount, 6); err != nil {
+		b.Fatal(err)
+	}
+	diffSize, err := driver.DiffSize(upper, "")
+	if err != nil {
+		b.Fatal(err)
+	}
+	// Reset the timer and leave it stopped; it is started explicitly around
+	// the Diff/ApplyDiff pair inside the loop.
+	b.ResetTimer()
+	b.StopTimer()
+	for i := 0; i < b.N; i++ {
+		diff := stringid.GenerateRandomID()
+		if err := driver.Create(diff, base, "", nil); err != nil {
+			b.Fatal(err)
+		}
+
+		// Before the apply, the new layer must show the base layer's content.
+		if err := checkManyFiles(driver, diff, fileCount, 3); err != nil {
+			b.Fatal(err)
+		}
+
+		b.StartTimer()
+
+		arch, err := driver.Diff(upper, "")
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		applyDiffSize, err := driver.ApplyDiff(diff, "", arch)
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		b.StopTimer()
+		arch.Close()
+
+		// The size comparison is intentionally a no-op for now; see the TODO.
+		if applyDiffSize != diffSize {
+			// TODO: enforce this
+			//b.Fatalf("Apply diff size different, got %d, expected %d", applyDiffSize, diffSize)
+		}
+		// After the apply, the layer must show the upper layer's content.
+		if err := checkManyFiles(driver, diff, fileCount, 6); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// DriverBenchDeepLayerDiff benchmarks calls to diff on top of a given number of layers.
+// A base layer is populated by addFiles (seed 50), layerCount layers are
+// stacked on it with addManyLayers, and each iteration diffs the topmost
+// layer with an empty parent ID and drains the resulting archive.
+func DriverBenchDeepLayerDiff(b *testing.B, layerCount int, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	if err := addFiles(driver, base, 50); err != nil {
+		b.Fatal(err)
+	}
+
+	topLayer, err := addManyLayers(driver, base, layerCount)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	// Keep construction of the layer stack out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		arch, err := driver.Diff(topLayer, "")
+		if err != nil {
+			b.Fatal(err)
+		}
+		// Read the whole archive so the cost of producing it is measured.
+		_, err = io.Copy(ioutil.Discard, arch)
+		if err != nil {
+			b.Fatalf("Error copying archive: %s", err)
+		}
+		arch.Close()
+	}
+}
+
+// DriverBenchDeepLayerRead benchmarks calls to read a file under a given number of layers.
+// The file is written to the base layer, layerCount layers are stacked on top
+// with addManyLayers, and the topmost layer is mounted once with Get. Each
+// iteration times reading the file through that mount; the content check is
+// done with the timer stopped.
+func DriverBenchDeepLayerRead(b *testing.B, layerCount int, drivername string, driveroptions ...string) {
+	driver := GetDriver(b, drivername, driveroptions...)
+	defer PutDriver(b)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
+		b.Fatal(err)
+	}
+
+	content := []byte("test content")
+	if err := addFile(driver, base, "testfile.txt", content); err != nil {
+		b.Fatal(err)
+	}
+
+	topLayer, err := addManyLayers(driver, base, layerCount)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	// Mount the top layer once; every iteration reads through this mount.
+	root, err := driver.Get(topLayer, "")
+	if err != nil {
+		b.Fatal(err)
+	}
+	defer driver.Put(topLayer)
+
+	// Keep layer construction and mounting out of the measurement.
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		// Read content
+		c, err := ioutil.ReadFile(filepath.Join(root, "testfile.txt"))
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		// Verify outside the timed region.
+		b.StopTimer()
+		if !bytes.Equal(c, content) {
+			b.Fatalf("Wrong content in file %v, expected %v", c, content)
+		}
+		b.StartTimer()
+	}
+}

+ 134 - 153
daemon/graphdriver/graphtest/graphtest_unix.go

@@ -3,7 +3,7 @@
 package graphtest
 package graphtest
 
 
 import (
 import (
-	"fmt"
+	"bytes"
 	"io/ioutil"
 	"io/ioutil"
 	"math/rand"
 	"math/rand"
 	"os"
 	"os"
@@ -14,6 +14,7 @@ import (
 	"unsafe"
 	"unsafe"
 
 
 	"github.com/docker/docker/daemon/graphdriver"
 	"github.com/docker/docker/daemon/graphdriver"
+	"github.com/docker/docker/pkg/stringid"
 	"github.com/docker/go-units"
 	"github.com/docker/go-units"
 )
 )
 
 
@@ -30,47 +31,7 @@ type Driver struct {
 	refCount int
 	refCount int
 }
 }
 
 
-// InitLoopbacks ensures that the loopback devices are properly created within
-// the system running the device mapper tests.
-func InitLoopbacks() error {
-	statT, err := getBaseLoopStats()
-	if err != nil {
-		return err
-	}
-	// create at least 8 loopback files, ya, that is a good number
-	for i := 0; i < 8; i++ {
-		loopPath := fmt.Sprintf("/dev/loop%d", i)
-		// only create new loopback files if they don't exist
-		if _, err := os.Stat(loopPath); err != nil {
-			if mkerr := syscall.Mknod(loopPath,
-				uint32(statT.Mode|syscall.S_IFBLK), int((7<<8)|(i&0xff)|((i&0xfff00)<<12))); mkerr != nil {
-				return mkerr
-			}
-			os.Chown(loopPath, int(statT.Uid), int(statT.Gid))
-		}
-	}
-	return nil
-}
-
-// getBaseLoopStats inspects /dev/loop0 to collect uid,gid, and mode for the
-// loop0 device on the system.  If it does not exist we assume 0,0,0660 for the
-// stat data
-func getBaseLoopStats() (*syscall.Stat_t, error) {
-	loop0, err := os.Stat("/dev/loop0")
-	if err != nil {
-		if os.IsNotExist(err) {
-			return &syscall.Stat_t{
-				Uid:  0,
-				Gid:  0,
-				Mode: 0660,
-			}, nil
-		}
-		return nil, err
-	}
-	return loop0.Sys().(*syscall.Stat_t), nil
-}
-
-func newDriver(t *testing.T, name string) *Driver {
+func newDriver(t testing.TB, name string, options []string) *Driver {
 	root, err := ioutil.TempDir("", "docker-graphtest-")
 	root, err := ioutil.TempDir("", "docker-graphtest-")
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
@@ -80,7 +41,7 @@ func newDriver(t *testing.T, name string) *Driver {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 
 
-	d, err := graphdriver.GetDriver(name, root, nil, nil, nil)
+	d, err := graphdriver.GetDriver(name, root, options, nil, nil)
 	if err != nil {
 	if err != nil {
 		t.Logf("graphdriver: %v\n", err)
 		t.Logf("graphdriver: %v\n", err)
 		if err == graphdriver.ErrNotSupported || err == graphdriver.ErrPrerequisites || err == graphdriver.ErrIncompatibleFS {
 		if err == graphdriver.ErrNotSupported || err == graphdriver.ErrPrerequisites || err == graphdriver.ErrIncompatibleFS {
@@ -91,7 +52,7 @@ func newDriver(t *testing.T, name string) *Driver {
 	return &Driver{d, root, 1}
 	return &Driver{d, root, 1}
 }
 }
 
 
-func cleanup(t *testing.T, d *Driver) {
+func cleanup(t testing.TB, d *Driver) {
 	if err := drv.Cleanup(); err != nil {
 	if err := drv.Cleanup(); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
@@ -99,9 +60,9 @@ func cleanup(t *testing.T, d *Driver) {
 }
 }
 
 
 // GetDriver create a new driver with given name or return an existing driver with the name updating the reference count.
 // GetDriver create a new driver with given name or return an existing driver with the name updating the reference count.
-func GetDriver(t *testing.T, name string) graphdriver.Driver {
+func GetDriver(t testing.TB, name string, options ...string) graphdriver.Driver {
 	if drv == nil {
 	if drv == nil {
-		drv = newDriver(t, name)
+		drv = newDriver(t, name, options)
 	} else {
 	} else {
 		drv.refCount++
 		drv.refCount++
 	}
 	}
@@ -109,7 +70,7 @@ func GetDriver(t *testing.T, name string) graphdriver.Driver {
 }
 }
 
 
 // PutDriver removes the driver if it is no longer used and updates the reference count.
 // PutDriver removes the driver if it is no longer used and updates the reference count.
-func PutDriver(t *testing.T) {
+func PutDriver(t testing.TB) {
 	if drv == nil {
 	if drv == nil {
 		t.Skip("No driver to put!")
 		t.Skip("No driver to put!")
 	}
 	}
@@ -120,190 +81,210 @@ func PutDriver(t *testing.T) {
 	}
 	}
 }
 }
 
 
-func verifyFile(t *testing.T, path string, mode os.FileMode, uid, gid uint32) {
-	fi, err := os.Stat(path)
-	if err != nil {
+// DriverTestCreateEmpty creates a new image and verifies that it is empty and has the right metadata
+func DriverTestCreateEmpty(t testing.TB, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
+	defer PutDriver(t)
+
+	if err := driver.Create("empty", "", "", nil); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 
 
-	if fi.Mode()&os.ModeType != mode&os.ModeType {
-		t.Fatalf("Expected %s type 0x%x, got 0x%x", path, mode&os.ModeType, fi.Mode()&os.ModeType)
-	}
+	defer func() {
+		if err := driver.Remove("empty"); err != nil {
+			t.Fatal(err)
+		}
+	}()
 
 
-	if fi.Mode()&os.ModePerm != mode&os.ModePerm {
-		t.Fatalf("Expected %s mode %o, got %o", path, mode&os.ModePerm, fi.Mode()&os.ModePerm)
+	if !driver.Exists("empty") {
+		t.Fatal("Newly created image doesn't exist")
 	}
 	}
 
 
-	if fi.Mode()&os.ModeSticky != mode&os.ModeSticky {
-		t.Fatalf("Expected %s sticky 0x%x, got 0x%x", path, mode&os.ModeSticky, fi.Mode()&os.ModeSticky)
+	dir, err := driver.Get("empty", "")
+	if err != nil {
+		t.Fatal(err)
 	}
 	}
 
 
-	if fi.Mode()&os.ModeSetuid != mode&os.ModeSetuid {
-		t.Fatalf("Expected %s setuid 0x%x, got 0x%x", path, mode&os.ModeSetuid, fi.Mode()&os.ModeSetuid)
-	}
+	verifyFile(t, dir, 0755|os.ModeDir, 0, 0)
 
 
-	if fi.Mode()&os.ModeSetgid != mode&os.ModeSetgid {
-		t.Fatalf("Expected %s setgid 0x%x, got 0x%x", path, mode&os.ModeSetgid, fi.Mode()&os.ModeSetgid)
+	// Verify that the directory is empty
+	fis, err := readDir(dir)
+	if err != nil {
+		t.Fatal(err)
 	}
 	}
 
 
-	if stat, ok := fi.Sys().(*syscall.Stat_t); ok {
-		if stat.Uid != uid {
-			t.Fatalf("%s no owned by uid %d", path, uid)
-		}
-		if stat.Gid != gid {
-			t.Fatalf("%s not owned by gid %d", path, gid)
-		}
+	if len(fis) != 0 {
+		t.Fatal("New directory not empty")
 	}
 	}
 
 
+	driver.Put("empty")
 }
 }
 
 
-// readDir reads a directory just like ioutil.ReadDir()
-// then hides specific files (currently "lost+found")
-// so the tests don't "see" it
-func readDir(dir string) ([]os.FileInfo, error) {
-	a, err := ioutil.ReadDir(dir)
-	if err != nil {
-		return nil, err
-	}
+// DriverTestCreateBase creates a base layer and verifies its contents.
+func DriverTestCreateBase(t testing.TB, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
+	defer PutDriver(t)
 
 
-	b := a[:0]
-	for _, x := range a {
-		if x.Name() != "lost+found" { // ext4 always have this dir
-			b = append(b, x)
+	createBase(t, driver, "Base")
+	defer func() {
+		if err := driver.Remove("Base"); err != nil {
+			t.Fatal(err)
 		}
 		}
-	}
-
-	return b, nil
+	}()
+	verifyBase(t, driver, "Base")
 }
 }
 
 
-// DriverTestCreateEmpty creates a new image and verifies it is empty and the right metadata
-func DriverTestCreateEmpty(t *testing.T, drivername string) {
-	driver := GetDriver(t, drivername)
+// DriverTestCreateSnap creates a base layer and a snapshot on top of it, then verifies the snapshot.
+func DriverTestCreateSnap(t testing.TB, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
 	defer PutDriver(t)
 	defer PutDriver(t)
 
 
-	if err := driver.Create("empty", "", "", nil); err != nil {
+	createBase(t, driver, "Base")
+
+	defer func() {
+		if err := driver.Remove("Base"); err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	if err := driver.Create("Snap", "Base", "", nil); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 
 
 	defer func() {
 	defer func() {
-		if err := driver.Remove("empty"); err != nil {
+		if err := driver.Remove("Snap"); err != nil {
 			t.Fatal(err)
 			t.Fatal(err)
 		}
 		}
 	}()
 	}()
 
 
-	if !driver.Exists("empty") {
-		t.Fatal("Newly created image doesn't exist")
-	}
+	verifyBase(t, driver, "Snap")
+}
 
 
-	dir, err := driver.Get("empty", "")
-	if err != nil {
+// DriverTestDeepLayerRead reads a file from a lower layer under a given number of layers
+func DriverTestDeepLayerRead(t testing.TB, layerCount int, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
+	defer PutDriver(t)
+
+	base := stringid.GenerateRandomID()
+
+	if err := driver.Create(base, "", "", nil); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 
 
-	verifyFile(t, dir, 0755|os.ModeDir, 0, 0)
+	content := []byte("test content")
+	if err := addFile(driver, base, "testfile.txt", content); err != nil {
+		t.Fatal(err)
+	}
 
 
-	// Verify that the directory is empty
-	fis, err := readDir(dir)
+	topLayer, err := addManyLayers(driver, base, layerCount)
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 
 
-	if len(fis) != 0 {
-		t.Fatal("New directory not empty")
+	err = checkManyLayers(driver, topLayer, layerCount)
+	if err != nil {
+		t.Fatal(err)
 	}
 	}
 
 
-	driver.Put("empty")
+	if err := checkFile(driver, topLayer, "testfile.txt", content); err != nil {
+		t.Fatal(err)
+	}
 }
 }
 
 
-func createBase(t *testing.T, driver graphdriver.Driver, name string) {
-	// We need to be able to set any perms
-	oldmask := syscall.Umask(0)
-	defer syscall.Umask(oldmask)
+// DriverTestDiffApply tests diffing and applying produces the same layer
+func DriverTestDiffApply(t testing.TB, fileCount int, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
+	defer PutDriver(t)
+	base := stringid.GenerateRandomID()
+	upper := stringid.GenerateRandomID()
 
 
-	if err := driver.CreateReadWrite(name, "", "", nil); err != nil {
+	if err := driver.Create(base, "", "", nil); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 
 
-	dir, err := driver.Get(name, "")
-	if err != nil {
+	if err := addManyFiles(driver, base, fileCount, 3); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := driver.Create(upper, base, "", nil); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-	defer driver.Put(name)
 
 
-	subdir := path.Join(dir, "a subdir")
-	if err := os.Mkdir(subdir, 0705|os.ModeSticky); err != nil {
+	if err := addManyFiles(driver, upper, fileCount, 6); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-	if err := os.Chown(subdir, 1, 2); err != nil {
+	diffSize, err := driver.DiffSize(upper, "")
+	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 
 
-	file := path.Join(dir, "a file")
-	if err := ioutil.WriteFile(file, []byte("Some data"), 0222|os.ModeSetuid); err != nil {
+	diff := stringid.GenerateRandomID()
+	if err := driver.Create(diff, base, "", nil); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-}
 
 
-func verifyBase(t *testing.T, driver graphdriver.Driver, name string) {
-	dir, err := driver.Get(name, "")
-	if err != nil {
+	if err := checkManyFiles(driver, diff, fileCount, 3); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-	defer driver.Put(name)
 
 
-	subdir := path.Join(dir, "a subdir")
-	verifyFile(t, subdir, 0705|os.ModeDir|os.ModeSticky, 1, 2)
+	arch, err := driver.Diff(upper, base)
+	if err != nil {
+		t.Fatal(err)
+	}
 
 
-	file := path.Join(dir, "a file")
-	verifyFile(t, file, 0222|os.ModeSetuid, 0, 0)
+	buf := bytes.NewBuffer(nil)
+	if _, err := buf.ReadFrom(arch); err != nil {
+		t.Fatal(err)
+	}
+	if err := arch.Close(); err != nil {
+		t.Fatal(err)
+	}
 
 
-	fis, err := readDir(dir)
+	applyDiffSize, err := driver.ApplyDiff(diff, base, bytes.NewReader(buf.Bytes()))
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 
 
-	if len(fis) != 2 {
-		t.Fatal("Unexpected files in base image")
+	if applyDiffSize != diffSize {
+		t.Fatalf("Apply diff size different, got %d, expected %d", applyDiffSize, diffSize)
+	}
+	if err := checkManyFiles(driver, diff, fileCount, 6); err != nil {
+		t.Fatal(err)
 	}
 	}
 }
 }
 
 
-// DriverTestCreateBase create a base driver and verify.
-func DriverTestCreateBase(t *testing.T, drivername string) {
-	driver := GetDriver(t, drivername)
+// DriverTestChanges tests that the changes computed for a layer match the changes made
+func DriverTestChanges(t testing.TB, drivername string, driverOptions ...string) {
+	driver := GetDriver(t, drivername, driverOptions...)
 	defer PutDriver(t)
 	defer PutDriver(t)
+	base := stringid.GenerateRandomID()
+	upper := stringid.GenerateRandomID()
 
 
-	createBase(t, driver, "Base")
-	defer func() {
-		if err := driver.Remove("Base"); err != nil {
-			t.Fatal(err)
-		}
-	}()
-	verifyBase(t, driver, "Base")
-}
+	if err := driver.Create(base, "", "", nil); err != nil {
+		t.Fatal(err)
+	}
 
 
-// DriverTestCreateSnap Create a driver and snap and verify.
-func DriverTestCreateSnap(t *testing.T, drivername string) {
-	driver := GetDriver(t, drivername)
-	defer PutDriver(t)
+	if err := addManyFiles(driver, base, 20, 3); err != nil {
+		t.Fatal(err)
+	}
 
 
-	createBase(t, driver, "Base")
+	if err := driver.Create(upper, base, "", nil); err != nil {
+		t.Fatal(err)
+	}
 
 
-	defer func() {
-		if err := driver.Remove("Base"); err != nil {
-			t.Fatal(err)
-		}
-	}()
+	expectedChanges, err := changeManyFiles(driver, upper, 20, 6)
+	if err != nil {
+		t.Fatal(err)
+	}
 
 
-	if err := driver.Create("Snap", "Base", "", nil); err != nil {
+	changes, err := driver.Changes(upper, base)
+	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-	defer func() {
-		if err := driver.Remove("Snap"); err != nil {
-			t.Fatal(err)
-		}
-	}()
 
 
-	verifyBase(t, driver, "Snap")
+	if err = checkChanges(expectedChanges, changes); err != nil {
+		t.Fatal(err)
+	}
 }
 }
 
 
 func writeRandomFile(path string, size uint64) error {
 func writeRandomFile(path string, size uint64) error {

+ 301 - 0
daemon/graphdriver/graphtest/testutil.go

@@ -0,0 +1,301 @@
+package graphtest
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"path"
+	"sort"
+
+	"github.com/docker/docker/daemon/graphdriver"
+	"github.com/docker/docker/pkg/archive"
+	"github.com/docker/docker/pkg/stringid"
+)
+
+// randomContent returns size bytes of pseudo-random data generated from a
+// rand source seeded with seed, so the same (size, seed) pair always yields
+// the same bytes.
+func randomContent(size int, seed int64) []byte {
+	s := rand.NewSource(seed)
+	content := make([]byte, size)
+
+	// Each Int63 value supplies up to seven bytes, lowest byte first; the
+	// last group is truncated when size is not a multiple of seven.
+	for i := 0; i < len(content); i += 7 {
+		val := s.Int63()
+		for j := 0; i+j < len(content) && j < 7; j++ {
+			content[i+j] = byte(val)
+			val >>= 8
+		}
+	}
+
+	return content
+}
+
+// addFiles mounts layer and writes a small fixed set of files into it:
+// "file-a" (64 bytes), "dir-b/file-b" (128 bytes) and "file-c" (128*128
+// bytes). Contents come from randomContent using seed, seed+1 and seed+2
+// respectively. The layer is released with Put before returning.
+func addFiles(drv graphdriver.Driver, layer string, seed int64) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	if err := ioutil.WriteFile(path.Join(root, "file-a"), randomContent(64, seed), 0755); err != nil {
+		return err
+	}
+	if err := os.MkdirAll(path.Join(root, "dir-b"), 0755); err != nil {
+		return err
+	}
+	if err := ioutil.WriteFile(path.Join(root, "dir-b", "file-b"), randomContent(128, seed+1), 0755); err != nil {
+		return err
+	}
+
+	return ioutil.WriteFile(path.Join(root, "file-c"), randomContent(128*128, seed+2), 0755)
+}
+
+// checkFile mounts layer, reads filename relative to the layer root and
+// returns an error if the file cannot be read or its bytes differ from
+// content. The layer is released with Put before returning.
+func checkFile(drv graphdriver.Driver, layer, filename string, content []byte) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	fileContent, err := ioutil.ReadFile(path.Join(root, filename))
+	if err != nil {
+		return err
+	}
+
+	if !bytes.Equal(fileContent, content) {
+		return fmt.Errorf("mismatched file content %v, expecting %v", fileContent, content)
+	}
+
+	return nil
+}
+
+// addFile mounts layer and writes content to filename (relative to the layer
+// root) with mode 0755. The layer is released with Put before returning.
+func addFile(drv graphdriver.Driver, layer, filename string, content []byte) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	return ioutil.WriteFile(path.Join(root, filename), content, 0755)
+}
+
+// addManyFiles mounts layer and writes count files of 64 bytes each into it.
+// Files are grouped 100 per directory: file number n is stored as
+// "directory-<d>/file-<n>", where d is n rounded down to a multiple of 100,
+// and its content is randomContent(64, seed+n). The layer is released with
+// Put before returning.
+func addManyFiles(drv graphdriver.Driver, layer string, count int, seed int64) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	// i walks the directory groups, j the files inside one group.
+	for i := 0; i < count; i += 100 {
+		dir := path.Join(root, fmt.Sprintf("directory-%d", i))
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			return err
+		}
+		for j := 0; i+j < count && j < 100; j++ {
+			file := path.Join(dir, fmt.Sprintf("file-%d", i+j))
+			if err := ioutil.WriteFile(file, randomContent(64, seed+int64(i+j)), 0755); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+// changeManyFiles mounts layer and modifies the file tree laid out by
+// addManyFiles, returning the list of changes it made. For file number n
+// inside its group of 100, the position j within the group selects the action:
+// j%3 == 0 rewrites "file-<n>" with new content (ChangeModify), j%3 == 1
+// creates a new file "file-<seed>-<n>" (ChangeAdd), and j%3 == 2 removes
+// "file-<n>" (ChangeDelete). Each touched directory is also recorded once as
+// ChangeModify. Removal returns an error if the file is not present, so the
+// layer is presumably expected to already expose files written by
+// addManyFiles with the same count — confirm against callers.
+func changeManyFiles(drv graphdriver.Driver, layer string, count int, seed int64) ([]archive.Change, error) {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return nil, err
+	}
+	defer drv.Put(layer)
+
+	changes := []archive.Change{}
+	for i := 0; i < count; i += 100 {
+		// Change paths are rooted at "/" relative to the layer, not the mount.
+		archiveRoot := fmt.Sprintf("/directory-%d", i)
+		if err := os.MkdirAll(path.Join(root, archiveRoot), 0755); err != nil {
+			return nil, err
+		}
+		for j := 0; i+j < count && j < 100; j++ {
+			// Record the containing directory once, before its first file.
+			if j == 0 {
+				changes = append(changes, archive.Change{
+					Path: archiveRoot,
+					Kind: archive.ChangeModify,
+				})
+			}
+			var change archive.Change
+			switch j % 3 {
+			// Update file
+			case 0:
+				change.Path = path.Join(archiveRoot, fmt.Sprintf("file-%d", i+j))
+				change.Kind = archive.ChangeModify
+				if err := ioutil.WriteFile(path.Join(root, change.Path), randomContent(64, seed+int64(i+j)), 0755); err != nil {
+					return nil, err
+				}
+			// Add file
+			case 1:
+				change.Path = path.Join(archiveRoot, fmt.Sprintf("file-%d-%d", seed, i+j))
+				change.Kind = archive.ChangeAdd
+				if err := ioutil.WriteFile(path.Join(root, change.Path), randomContent(64, seed+int64(i+j)), 0755); err != nil {
+					return nil, err
+				}
+			// Remove file
+			case 2:
+				change.Path = path.Join(archiveRoot, fmt.Sprintf("file-%d", i+j))
+				change.Kind = archive.ChangeDelete
+				if err := os.Remove(path.Join(root, change.Path)); err != nil {
+					return nil, err
+				}
+			}
+			changes = append(changes, change)
+		}
+	}
+
+	return changes, nil
+}
+
+// checkManyFiles mounts layer and verifies that it contains the count files
+// laid out by addManyFiles ("directory-<d>/file-<n>", 100 files per
+// directory) and that file n holds exactly randomContent(64, seed+n). It
+// returns the first read error or content mismatch encountered. The layer is
+// released with Put before returning.
+func checkManyFiles(drv graphdriver.Driver, layer string, count int, seed int64) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	// i walks the directory groups, j the files inside one group.
+	for i := 0; i < count; i += 100 {
+		dir := path.Join(root, fmt.Sprintf("directory-%d", i))
+		for j := 0; i+j < count && j < 100; j++ {
+			file := path.Join(dir, fmt.Sprintf("file-%d", i+j))
+			fileContent, err := ioutil.ReadFile(file)
+			if err != nil {
+				return err
+			}
+
+			// Regenerate the expected bytes from the same seed used to write them.
+			content := randomContent(64, seed+int64(i+j))
+
+			if !bytes.Equal(fileContent, content) {
+				return fmt.Errorf("mismatched file content %v, expecting %v", fileContent, content)
+			}
+		}
+	}
+
+	return nil
+}
+
+// changeList implements sort.Interface over a slice of archive.Change,
+// ordering entries by Path and, for equal paths, by Kind.
+type changeList []archive.Change
+
+// Less reports whether element i sorts before element j: by Path first,
+// then by Kind when the paths are equal.
+func (c changeList) Less(i, j int) bool {
+	if c[i].Path == c[j].Path {
+		return c[i].Kind < c[j].Kind
+	}
+	return c[i].Path < c[j].Path
+}
+
+// Len returns the number of changes in the list.
+func (c changeList) Len() int      { return len(c) }
+
+// Swap exchanges the elements at indexes i and j.
+func (c changeList) Swap(i, j int) { c[j], c[i] = c[i], c[j] }
+
+// checkChanges compares two change sets irrespective of their order. It
+// returns an error if the lengths differ or if, after sorting both by path
+// and kind, any pair of entries at the same index is not equal.
+// Note that both slices are sorted in place, so the caller's slices are
+// reordered.
+func checkChanges(expected, actual []archive.Change) error {
+	if len(expected) != len(actual) {
+		return fmt.Errorf("unexpected number of changes, expected %d, got %d", len(expected), len(actual))
+	}
+	sort.Sort(changeList(expected))
+	sort.Sort(changeList(actual))
+
+	for i := range expected {
+		if expected[i] != actual[i] {
+			return fmt.Errorf("unexpected change, expecting %v, got %v", expected[i], actual[i])
+		}
+	}
+
+	return nil
+}
+
+// addLayerFiles mounts layer and writes the marker files that
+// checkManyLayers later reads: "top-id" at the layer root containing the
+// layer's own ID, and a "layer-<i>" directory holding "layer-id" (this
+// layer's ID) and "parent-id" (the parent's ID). The layer is released with
+// Put before returning.
+func addLayerFiles(drv graphdriver.Driver, layer, parent string, i int) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	if err := ioutil.WriteFile(path.Join(root, "top-id"), []byte(layer), 0755); err != nil {
+		return err
+	}
+	layerDir := path.Join(root, fmt.Sprintf("layer-%d", i))
+	if err := os.MkdirAll(layerDir, 0755); err != nil {
+		return err
+	}
+	if err := ioutil.WriteFile(path.Join(layerDir, "layer-id"), []byte(layer), 0755); err != nil {
+		return err
+	}
+	if err := ioutil.WriteFile(path.Join(layerDir, "parent-id"), []byte(parent), 0755); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// addManyLayers stacks count new layers on top of baseLayer, each created
+// with a random ID and the previous layer as its parent, and writes the
+// marker files for each via addLayerFiles (numbered 1 through count). It
+// returns the ID of the topmost layer; when count is less than 1 no layer is
+// created and baseLayer itself is returned.
+func addManyLayers(drv graphdriver.Driver, baseLayer string, count int) (string, error) {
+	lastLayer := baseLayer
+	for i := 1; i <= count; i++ {
+		nextLayer := stringid.GenerateRandomID()
+		if err := drv.Create(nextLayer, lastLayer, "", nil); err != nil {
+			return "", err
+		}
+		if err := addLayerFiles(drv, nextLayer, lastLayer, i); err != nil {
+			return "", err
+		}
+
+		lastLayer = nextLayer
+
+	}
+	return lastLayer, nil
+}
+
+// checkManyLayers mounts layer and verifies the marker files written by
+// addLayerFiles for a stack of count layers. "top-id" must contain layer's
+// own ID; then, walking from "layer-<count>" down to "layer-1", each
+// directory's "layer-id" must equal the ID expected at that depth, and its
+// "parent-id" becomes the ID expected for the next directory down. It
+// returns the first read error or mismatch encountered. The layer is
+// released with Put before returning.
+func checkManyLayers(drv graphdriver.Driver, layer string, count int) error {
+	root, err := drv.Get(layer, "")
+	if err != nil {
+		return err
+	}
+	defer drv.Put(layer)
+
+	layerIDBytes, err := ioutil.ReadFile(path.Join(root, "top-id"))
+	if err != nil {
+		return err
+	}
+
+	if !bytes.Equal(layerIDBytes, []byte(layer)) {
+		return fmt.Errorf("mismatched file content %v, expecting %v", layerIDBytes, []byte(layer))
+	}
+
+	// layerIDBytes always holds the ID expected for the layer being checked.
+	for i := count; i > 0; i-- {
+		layerDir := path.Join(root, fmt.Sprintf("layer-%d", i))
+
+		thisLayerIDBytes, err := ioutil.ReadFile(path.Join(layerDir, "layer-id"))
+		if err != nil {
+			return err
+		}
+		if !bytes.Equal(thisLayerIDBytes, layerIDBytes) {
+			return fmt.Errorf("mismatched file content %v, expecting %v", thisLayerIDBytes, layerIDBytes)
+		}
+		// Follow the recorded parent link to the next layer down.
+		layerIDBytes, err = ioutil.ReadFile(path.Join(layerDir, "parent-id"))
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// readDir reads a directory just like ioutil.ReadDir(),
+// then hides specific entries (currently only "lost+found")
+// so the tests don't "see" them.
+func readDir(dir string) ([]os.FileInfo, error) {
+	a, err := ioutil.ReadDir(dir)
+	if err != nil {
+		return nil, err
+	}
+
+	// Filter in place, reusing the backing array of a.
+	b := a[:0]
+	for _, x := range a {
+		if x.Name() != "lost+found" { // ext4 always has this dir
+			b = append(b, x)
+		}
+	}
+
+	return b, nil
+}

+ 143 - 0
daemon/graphdriver/graphtest/testutil_unix.go

@@ -0,0 +1,143 @@
+// +build linux freebsd
+
+package graphtest
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path"
+	"syscall"
+	"testing"
+
+	"github.com/docker/docker/daemon/graphdriver"
+)
+
+// InitLoopbacks ensures that the loopback devices are properly created within
+// the system running the device mapper tests.
+// It makes sure /dev/loop0 through /dev/loop7 exist, creating any missing
+// node as a block device with the mode and ownership reported by
+// getBaseLoopStats. It returns the first error from getBaseLoopStats or
+// Mknod.
+func InitLoopbacks() error {
+	statT, err := getBaseLoopStats()
+	if err != nil {
+		return err
+	}
+	// create at least 8 loopback files, ya, that is a good number
+	for i := 0; i < 8; i++ {
+		loopPath := fmt.Sprintf("/dev/loop%d", i)
+		// only create new loopback files if they don't exist
+		// NOTE(review): any Stat error, not only "not exist", leads to Mknod.
+		if _, err := os.Stat(loopPath); err != nil {
+			// The device number places 7 above the low 8 bits of i; this
+			// looks like the Linux dev_t encoding of major 7, minor i —
+			// confirm for other platforms.
+			if mkerr := syscall.Mknod(loopPath,
+				uint32(statT.Mode|syscall.S_IFBLK), int((7<<8)|(i&0xff)|((i&0xfff00)<<12))); mkerr != nil {
+				return mkerr
+			}
+			// The Chown result is not checked; ownership is best-effort.
+			os.Chown(loopPath, int(statT.Uid), int(statT.Gid))
+		}
+	}
+	return nil
+}
+
+// getBaseLoopStats inspects /dev/loop0 to collect the uid, gid, and mode of
+// the loop0 device on the system. If it does not exist we assume 0,0,0660 for
+// the stat data. Any other Stat error is returned to the caller.
+func getBaseLoopStats() (*syscall.Stat_t, error) {
+	loop0, err := os.Stat("/dev/loop0")
+	if err != nil {
+		if os.IsNotExist(err) {
+			return &syscall.Stat_t{
+				Uid:  0,
+				Gid:  0,
+				Mode: 0660,
+			}, nil
+		}
+		return nil, err
+	}
+	// NOTE(review): unchecked type assertion; this panics if Sys() ever
+	// returns something other than *syscall.Stat_t.
+	return loop0.Sys().(*syscall.Stat_t), nil
+}
+
+// verifyFile stats path and fails the test via t.Fatal/t.Fatalf unless the
+// file's type bits, permission bits, sticky, setuid and setgid bits all match
+// mode. When the platform stat data is available as *syscall.Stat_t, the
+// owner uid and gid must match as well; otherwise ownership is not checked.
+func verifyFile(t testing.TB, path string, mode os.FileMode, uid, gid uint32) {
+	fi, err := os.Stat(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if fi.Mode()&os.ModeType != mode&os.ModeType {
+		t.Fatalf("Expected %s type 0x%x, got 0x%x", path, mode&os.ModeType, fi.Mode()&os.ModeType)
+	}
+
+	if fi.Mode()&os.ModePerm != mode&os.ModePerm {
+		t.Fatalf("Expected %s mode %o, got %o", path, mode&os.ModePerm, fi.Mode()&os.ModePerm)
+	}
+
+	if fi.Mode()&os.ModeSticky != mode&os.ModeSticky {
+		t.Fatalf("Expected %s sticky 0x%x, got 0x%x", path, mode&os.ModeSticky, fi.Mode()&os.ModeSticky)
+	}
+
+	if fi.Mode()&os.ModeSetuid != mode&os.ModeSetuid {
+		t.Fatalf("Expected %s setuid 0x%x, got 0x%x", path, mode&os.ModeSetuid, fi.Mode()&os.ModeSetuid)
+	}
+
+	if fi.Mode()&os.ModeSetgid != mode&os.ModeSetgid {
+		t.Fatalf("Expected %s setgid 0x%x, got 0x%x", path, mode&os.ModeSetgid, fi.Mode()&os.ModeSetgid)
+	}
+
+	// Ownership is only available through the platform-specific stat data.
+	if stat, ok := fi.Sys().(*syscall.Stat_t); ok {
+		if stat.Uid != uid {
+			t.Fatalf("%s not owned by uid %d", path, uid)
+		}
+		if stat.Gid != gid {
+			t.Fatalf("%s not owned by gid %d", path, gid)
+		}
+	}
+}
+
+// createBase creates a read-write layer called name with no parent and
+// populates it with the fixture that verifyBase checks: a directory
+// "a subdir" (mode 0705 plus sticky bit, chowned to uid 1 / gid 2) and a file
+// "a file" containing "Some data" (mode 0222 plus setuid). Any failure aborts
+// the test through t.Fatal.
+func createBase(t testing.TB, driver graphdriver.Driver, name string) {
+	// We need to be able to set any perms
+	// The previous umask is restored when the function returns.
+	oldmask := syscall.Umask(0)
+	defer syscall.Umask(oldmask)
+
+	if err := driver.CreateReadWrite(name, "", "", nil); err != nil {
+		t.Fatal(err)
+	}
+
+	dir, err := driver.Get(name, "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer driver.Put(name)
+
+	subdir := path.Join(dir, "a subdir")
+	if err := os.Mkdir(subdir, 0705|os.ModeSticky); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.Chown(subdir, 1, 2); err != nil {
+		t.Fatal(err)
+	}
+
+	file := path.Join(dir, "a file")
+	if err := ioutil.WriteFile(file, []byte("Some data"), 0222|os.ModeSetuid); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// verifyBase mounts the layer called name and checks that it exposes exactly
+// the fixture written by createBase: "a subdir" as a sticky directory with
+// mode 0705 owned by uid 1 / gid 2, "a file" with mode 0222 plus setuid owned
+// by uid 0 / gid 0, and no other entries (readDir hides "lost+found"). Any
+// mismatch aborts the test.
+func verifyBase(t testing.TB, driver graphdriver.Driver, name string) {
+	dir, err := driver.Get(name, "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer driver.Put(name)
+
+	subdir := path.Join(dir, "a subdir")
+	verifyFile(t, subdir, 0705|os.ModeDir|os.ModeSticky, 1, 2)
+
+	file := path.Join(dir, "a file")
+	verifyFile(t, file, 0222|os.ModeSetuid, 0, 0)
+
+	fis, err := readDir(dir)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Only the subdirectory and the file may be present.
+	if len(fis) != 2 {
+		t.Fatal("Unexpected files in base image")
+	}
+
+}

+ 1 - 2
daemon/graphdriver/overlay/overlay.go

@@ -15,7 +15,6 @@ import (
 
 
 	"github.com/docker/docker/daemon/graphdriver"
 	"github.com/docker/docker/daemon/graphdriver"
 	"github.com/docker/docker/pkg/archive"
 	"github.com/docker/docker/pkg/archive"
-	"github.com/docker/docker/pkg/chrootarchive"
 	"github.com/docker/docker/pkg/idtools"
 	"github.com/docker/docker/pkg/idtools"
 
 
 	"github.com/docker/docker/pkg/mount"
 	"github.com/docker/docker/pkg/mount"
@@ -426,7 +425,7 @@ func (d *Driver) ApplyDiff(id string, parent string, diff archive.Reader) (size
 	}
 	}
 
 
 	options := &archive.TarOptions{UIDMaps: d.uidMaps, GIDMaps: d.gidMaps}
 	options := &archive.TarOptions{UIDMaps: d.uidMaps, GIDMaps: d.gidMaps}
-	if size, err = chrootarchive.ApplyUncompressedLayer(tmpRootDir, diff, options); err != nil {
+	if size, err = graphdriver.ApplyUncompressedLayer(tmpRootDir, diff, options); err != nil {
 		return 0, err
 		return 0, err
 	}
 	}
 
 

+ 58 - 0
daemon/graphdriver/overlay/overlay_test.go

@@ -5,9 +5,17 @@ package overlay
 import (
 import (
 	"testing"
 	"testing"
 
 
+	"github.com/docker/docker/daemon/graphdriver"
 	"github.com/docker/docker/daemon/graphdriver/graphtest"
 	"github.com/docker/docker/daemon/graphdriver/graphtest"
+	"github.com/docker/docker/pkg/archive"
 )
 )
 
 
+func init() {
+	// Do not use chroot, to speed up run time and allow archive
+	// errors or hangs to be debugged directly from the test process.
+	graphdriver.ApplyUncompressedLayer = archive.ApplyUncompressedLayer
+}
+
 // This avoids creating a new driver for each test if all tests are run
 // This avoids creating a new driver for each test if all tests are run
 // Make sure to put new tests between TestOverlaySetup and TestOverlayTeardown
 // Make sure to put new tests between TestOverlaySetup and TestOverlayTeardown
 func TestOverlaySetup(t *testing.T) {
 func TestOverlaySetup(t *testing.T) {
@@ -26,6 +34,56 @@ func TestOverlayCreateSnap(t *testing.T) {
 	graphtest.DriverTestCreateSnap(t, "overlay")
 	graphtest.DriverTestCreateSnap(t, "overlay")
 }
 }
 
 
+// TestOverlay50LayerRead runs graphtest.DriverTestDeepLayerRead against the
+// "overlay" driver with 50 as its count argument (per the test name, the
+// number of layers).
+func TestOverlay50LayerRead(t *testing.T) {
+	graphtest.DriverTestDeepLayerRead(t, 50, "overlay")
+}
+
+// TestOverlayDiffApply10Files runs graphtest.DriverTestDiffApply against the
+// "overlay" driver with 10 as its count argument (per the test name, the
+// number of files).
+func TestOverlayDiffApply10Files(t *testing.T) {
+	graphtest.DriverTestDiffApply(t, 10, "overlay")
+}
+
+// TestOverlayChanges runs graphtest.DriverTestChanges against the "overlay"
+// driver.
+func TestOverlayChanges(t *testing.T) {
+	graphtest.DriverTestChanges(t, "overlay")
+}
+
 func TestOverlayTeardown(t *testing.T) {
 func TestOverlayTeardown(t *testing.T) {
 	graphtest.PutDriver(t)
 	graphtest.PutDriver(t)
 }
 }
+
+// Benchmarks should always setup new driver
+
+// BenchmarkExists runs graphtest.DriverBenchExists on the "overlay" driver.
+func BenchmarkExists(b *testing.B) {
+	graphtest.DriverBenchExists(b, "overlay")
+}
+
+// BenchmarkGetEmpty runs graphtest.DriverBenchGetEmpty on the "overlay" driver.
+func BenchmarkGetEmpty(b *testing.B) {
+	graphtest.DriverBenchGetEmpty(b, "overlay")
+}
+
+// BenchmarkDiffBase runs graphtest.DriverBenchDiffBase on the "overlay" driver.
+func BenchmarkDiffBase(b *testing.B) {
+	graphtest.DriverBenchDiffBase(b, "overlay")
+}
+
+// BenchmarkDiffSmallUpper runs graphtest.DriverBenchDiffN on the "overlay"
+// driver with arguments (10, 10).
+func BenchmarkDiffSmallUpper(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10, 10, "overlay")
+}
+
+// BenchmarkDiff10KFileUpper runs graphtest.DriverBenchDiffN on the "overlay"
+// driver with arguments (10, 10000).
+func BenchmarkDiff10KFileUpper(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10, 10000, "overlay")
+}
+
+// BenchmarkDiff10KFilesBottom runs graphtest.DriverBenchDiffN on the "overlay"
+// driver with arguments (10000, 10).
+func BenchmarkDiff10KFilesBottom(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10000, 10, "overlay")
+}
+
+// BenchmarkDiffApply100 runs graphtest.DriverBenchDiffApplyN on the "overlay"
+// driver with 100 as its count argument.
+func BenchmarkDiffApply100(b *testing.B) {
+	graphtest.DriverBenchDiffApplyN(b, 100, "overlay")
+}
+
+// BenchmarkDiff20Layers runs graphtest.DriverBenchDeepLayerDiff on the
+// "overlay" driver with 20 as its count argument.
+func BenchmarkDiff20Layers(b *testing.B) {
+	graphtest.DriverBenchDeepLayerDiff(b, 20, "overlay")
+}
+
+// BenchmarkRead20Layers runs graphtest.DriverBenchDeepLayerRead on the
+// "overlay" driver with 20 as its count argument.
+func BenchmarkRead20Layers(b *testing.B) {
+	graphtest.DriverBenchDeepLayerRead(b, 20, "overlay")
+}

+ 91 - 0
daemon/graphdriver/overlay2/mount.go

@@ -0,0 +1,91 @@
+// +build linux
+
+package overlay2
+
+import (
+	"bytes"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"os"
+	"runtime"
+	"syscall"
+
+	"github.com/docker/docker/pkg/reexec"
+)
+
+// init registers mountFromMain under the re-exec name "docker-mountfrom",
+// the same name mountFrom passes to reexec.Command.
+func init() {
+	reexec.Register("docker-mountfrom", mountFromMain)
+}
+
+// fatal writes err to standard error and terminates the process with exit
+// status 1.
+func fatal(err error) {
+	os.Stderr.WriteString(fmt.Sprint(err))
+	os.Exit(1)
+}
+
+// mountOptions carries the arguments of a single mount request. mountFrom
+// JSON-encodes it onto the child's stdin and mountFromMain decodes it and
+// passes the fields to syscall.Mount.
+type mountOptions struct {
+	// Device is passed as the source argument of syscall.Mount.
+	Device string
+	// Target is the mount point; mountFromMain resolves it after changing
+	// into the directory given on its command line.
+	Target string
+	// Type is the filesystem type.
+	Type   string
+	// Label is passed as the data argument of syscall.Mount.
+	Label  string
+	// Flag holds the mount flags; mountFrom always sets it to 0.
+	Flag   uint32
+}
+
+// mountFrom performs a mount from inside a re-exec'd child process whose
+// working directory is dir, so that target (and any paths embedded in label)
+// may be given relative to dir.
+//
+// The mount parameters are JSON-encoded and streamed to the child's stdin
+// through a pipe. The child's combined stdout/stderr is captured and included
+// in the returned error when the child exits unsuccessfully.
+//
+// Both pipe ends are closed on every path and the child is always waited for
+// once it has been started, so no file descriptors or zombie processes are
+// left behind when encoding or starting fails.
+func mountFrom(dir, device, target, mType, label string) error {
+
+	r, w, err := os.Pipe()
+	if err != nil {
+		return fmt.Errorf("mountfrom pipe failure: %v", err)
+	}
+
+	options := &mountOptions{
+		Device: device,
+		Target: target,
+		Type:   mType,
+		Flag:   0,
+		Label:  label,
+	}
+
+	cmd := reexec.Command("docker-mountfrom", dir)
+	cmd.Stdin = r
+
+	output := bytes.NewBuffer(nil)
+	cmd.Stdout = output
+	cmd.Stderr = output
+
+	if err := cmd.Start(); err != nil {
+		r.Close()
+		w.Close()
+		return fmt.Errorf("mountfrom error on re-exec cmd: %v", err)
+	}
+	// The child holds its own copy of the read end; closing ours means a
+	// write to a dead child fails instead of blocking.
+	r.Close()
+
+	// Write the options to the pipe for the mount exec to read, then close
+	// the write end so the child sees EOF.
+	encodeErr := json.NewEncoder(w).Encode(options)
+	w.Close()
+
+	// Always reap the child, even if encoding failed.
+	waitErr := cmd.Wait()
+
+	if encodeErr != nil {
+		return fmt.Errorf("mountfrom json encode to pipe failed: %v", encodeErr)
+	}
+	if waitErr != nil {
+		return fmt.Errorf("mountfrom re-exec error: %v: output: %s", waitErr, output)
+	}
+	return nil
+}
+
+// mountFromMain is the entry-point for docker-mountfrom on re-exec. It reads
+// a JSON-encoded mountOptions from stdin, changes into the directory given as
+// the first command-line argument and performs the mount from there. It exits
+// with status 0 on success; on any error it prints the error to stderr and
+// exits with status 1 (see fatal).
+func mountFromMain() {
+	runtime.LockOSThread()
+	flag.Parse()
+
+	var options *mountOptions
+
+	if err := json.NewDecoder(os.Stdin).Decode(&options); err != nil {
+		fatal(err)
+	}
+
+	// Change directory before mounting so that relative paths in the
+	// options are resolved against the directory passed by the parent.
+	if err := os.Chdir(flag.Arg(0)); err != nil {
+		fatal(err)
+	}
+
+	if err := syscall.Mount(options.Device, options.Target, options.Type, uintptr(options.Flag), options.Label); err != nil {
+		fatal(err)
+	}
+
+	os.Exit(0)
+}

+ 476 - 0
daemon/graphdriver/overlay2/overlay.go

@@ -0,0 +1,476 @@
+// +build linux
+
+package overlay2
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path"
+	"strings"
+	"syscall"
+
+	"github.com/Sirupsen/logrus"
+
+	"github.com/docker/docker/daemon/graphdriver"
+	"github.com/docker/docker/pkg/archive"
+	"github.com/docker/docker/pkg/chrootarchive"
+	"github.com/docker/docker/pkg/directory"
+	"github.com/docker/docker/pkg/idtools"
+	"github.com/docker/docker/pkg/mount"
+	"github.com/docker/docker/pkg/parsers/kernel"
+
+	"github.com/opencontainers/runc/libcontainer/label"
+)
+
+var (
+	// untar defines the untar method
+	untar = chrootarchive.UntarUncompressed
+)
+
+// This backend uses the overlay union filesystem for containers
+// with diff directories for each layer.
+
+// This version of the overlay driver requires at least kernel
+// 4.0.0 in order to support mounting multiple diff directories.
+
+// Each container/image has at least a "diff" directory and "link" file.
+// There is also a "lower" file when there are diff layers
+// below, as well as "merged" and "work" directories. The "diff" directory
+// has the upper layer of the overlay and is used to capture any
+// changes to the layer. The "lower" file contains all the lower layer
+// mounts separated by ":" and ordered from uppermost to lowermost
+// layers. The overlay itself is mounted in the "merged" directory,
+// and the "work" dir is needed for overlay to work.
+
+// The "link" file for each layer contains a unique string for the layer.
+// Under the "l" directory at the root there will be a symbolic link
+// with that unique string pointing to the "diff" directory for the layer.
+// The symbolic links are used to reference lower layers in the "lower"
+// file and on mount. The links are used to shorten the total length
+// of a layer reference without requiring changes to the layer identifier
+// or root directory. Mounts are always done relative to root and
+// referencing the symbolic links in order to ensure the number of
+// lower directories can fit in a single page for making the mount
+// syscall. A hard upper limit of 128 lower layers is enforced to ensure
+// that mounts do not fail due to length.
+
+const (
+	driverName = "overlay2"
+	linkDir    = "l"
+	lowerFile  = "lower"
+	maxDepth   = 128
+
+	// idLength represents the number of random characters
+	// which can be used to create the unique link identifier
+	// for every layer. If this value is too long then the
+	// page size limit for the mount command may be exceeded.
+	// The idLength should be selected such that following equation
+	// is true (512 is a buffer for label metadata).
+	// ((idLength + len(linkDir) + 1) * maxDepth) <= (pageSize - 512)
+	idLength = 26
+)
+
+// Driver contains information about the home directory and the reference
+// counts of active mounts that are created using this driver.
+type Driver struct {
+	// home is the driver root; layer directories and the "l" link
+	// directory live directly beneath it.
+	home    string
+	// uidMaps and gidMaps are the ID mappings handed to Init; they are used
+	// to derive the root UID/GID and are passed to the archive helpers.
+	uidMaps []idtools.IDMap
+	gidMaps []idtools.IDMap
+	// ctr counts Get/Put references per "merged" mount point.
+	ctr     *graphdriver.RefCounter
+}
+
+var backingFs = "<unknown>"
+
+// init registers Init with the graphdriver package under driverName.
+func init() {
+	graphdriver.Register(driverName, Init)
+}
+
+// Init returns a native diff driver for the overlay filesystem rooted at home.
+// If the overlay filesystem is not supported on the host, or the kernel is
+// older than 4.0.0, graphdriver.ErrNotSupported is returned as error.
+// If the overlay filesystem is not supported over the existing backing
+// filesystem then graphdriver.ErrIncompatibleFS is returned.
+// The options argument is not used by this function.
+func Init(home string, options []string, uidMaps, gidMaps []idtools.IDMap) (graphdriver.Driver, error) {
+
+	if err := supportsOverlay(); err != nil {
+		return nil, graphdriver.ErrNotSupported
+	}
+
+	// require kernel 4.0.0 to ensure multiple lower dirs are supported
+	v, err := kernel.GetKernelVersion()
+	if err != nil {
+		return nil, err
+	}
+	if kernel.CompareKernelVersion(*v, kernel.VersionInfo{Kernel: 4, Major: 0, Minor: 0}) < 0 {
+		return nil, graphdriver.ErrNotSupported
+	}
+
+	fsMagic, err := graphdriver.GetFSMagic(home)
+	if err != nil {
+		return nil, err
+	}
+	// Remember the backing filesystem name for Status; it stays "<unknown>"
+	// when the magic number has no entry in graphdriver.FsNames.
+	if fsName, ok := graphdriver.FsNames[fsMagic]; ok {
+		backingFs = fsName
+	}
+
+	// check if they are running over btrfs, aufs, zfs or overlay
+	switch fsMagic {
+	case graphdriver.FsMagicBtrfs:
+		logrus.Error("'overlay' is not supported over btrfs.")
+		return nil, graphdriver.ErrIncompatibleFS
+	case graphdriver.FsMagicAufs:
+		logrus.Error("'overlay' is not supported over aufs.")
+		return nil, graphdriver.ErrIncompatibleFS
+	case graphdriver.FsMagicZfs:
+		logrus.Error("'overlay' is not supported over zfs.")
+		return nil, graphdriver.ErrIncompatibleFS
+	case graphdriver.FsMagicOverlay:
+		logrus.Error("'overlay' is not supported over overlay.")
+		return nil, graphdriver.ErrIncompatibleFS
+	}
+
+	rootUID, rootGID, err := idtools.GetRootUIDGID(uidMaps, gidMaps)
+	if err != nil {
+		return nil, err
+	}
+	// Create the driver home dir, including the "l" directory that holds
+	// the per-layer symbolic links.
+	if err := idtools.MkdirAllAs(path.Join(home, linkDir), 0700, rootUID, rootGID); err != nil && !os.IsExist(err) {
+		return nil, err
+	}
+
+	if err := mount.MakePrivate(home); err != nil {
+		return nil, err
+	}
+
+	d := &Driver{
+		home:    home,
+		uidMaps: uidMaps,
+		gidMaps: gidMaps,
+		ctr:     graphdriver.NewRefCounter(graphdriver.NewFsChecker(graphdriver.FsMagicOverlay)),
+	}
+
+	return d, nil
+}
+
+// supportsOverlay reports whether the running kernel lists the overlay
+// filesystem in /proc/filesystems. It returns nil when it does,
+// graphdriver.ErrNotSupported when it does not, and the open error when
+// /proc/filesystems cannot be read.
+func supportsOverlay() error {
+	// Best effort: try to load the module first so that it shows up in
+	// /proc/filesystems. The result is deliberately ignored.
+	exec.Command("modprobe", "overlay").Run()
+
+	procFS, err := os.Open("/proc/filesystems")
+	if err != nil {
+		return err
+	}
+	defer procFS.Close()
+
+	for scanner := bufio.NewScanner(procFS); scanner.Scan(); {
+		if scanner.Text() == "nodev\toverlay" {
+			return nil
+		}
+	}
+	logrus.Error("'overlay' not found as a supported filesystem on this host. Please ensure kernel is new enough and has overlay support loaded.")
+	return graphdriver.ErrNotSupported
+}
+
+// String returns the name of the driver, "overlay2".
+func (d *Driver) String() string {
+	return driverName
+}
+
+// Status reports driver information as a list of key/value pairs. The only
+// entry is "Backing Filesystem", the filesystem detected under the driver
+// home by Init.
+func (d *Driver) Status() [][2]string {
+	status := [][2]string{
+		{"Backing Filesystem", backingFs},
+	}
+	return status
+}
+
+// GetMetadata describes where layer id keeps its data. The result always has
+// the "WorkDir", "MergedDir" and "UpperDir" keys; "LowerDir" (a ":"-joined
+// list) is added only when the layer has lower layers. An error is returned
+// when the layer directory cannot be stat'ed or its lower layers cannot be
+// resolved.
+func (d *Driver) GetMetadata(id string) (map[string]string, error) {
+	layerDir := d.dir(id)
+	if _, err := os.Stat(layerDir); err != nil {
+		return nil, err
+	}
+
+	lowerDirs, err := d.getLowerDirs(id)
+	if err != nil {
+		return nil, err
+	}
+
+	metadata := map[string]string{
+		"WorkDir":   path.Join(layerDir, "work"),
+		"MergedDir": path.Join(layerDir, "merged"),
+		"UpperDir":  path.Join(layerDir, "diff"),
+	}
+	if len(lowerDirs) > 0 {
+		metadata["LowerDir"] = strings.Join(lowerDirs, ":")
+	}
+
+	return metadata, nil
+}
+
+// Cleanup releases any state created by overlay which should be cleaned up
+// when the daemon is being shut down. For now, this only unmounts the driver
+// home directory, which Init turned into a private mount.
+func (d *Driver) Cleanup() error {
+	return mount.Unmount(d.home)
+}
+
+// CreateReadWrite creates a layer that is writable for use as a container
+// file system. For this driver it is identical to Create.
+func (d *Driver) CreateReadWrite(id, parent, mountLabel string, storageOpt map[string]string) error {
+	return d.Create(id, parent, mountLabel, storageOpt)
+}
+
+// Create is used to create the upper, lower, and merge directories required for overlay fs for a given id.
+// The parent filesystem is used to configure these directories for the overlay.
+func (d *Driver) Create(id, parent, mountLabel string, storageOpt map[string]string) (retErr error) {
+
+	if len(storageOpt) != 0 {
+		return fmt.Errorf("--storage-opt is not supported for overlay")
+	}
+
+	dir := d.dir(id)
+
+	rootUID, rootGID, err := idtools.GetRootUIDGID(d.uidMaps, d.gidMaps)
+	if err != nil {
+		return err
+	}
+	if err := idtools.MkdirAllAs(path.Dir(dir), 0700, rootUID, rootGID); err != nil {
+		return err
+	}
+	if err := idtools.MkdirAs(dir, 0700, rootUID, rootGID); err != nil {
+		return err
+	}
+
+	defer func() {
+		// Clean up on failure
+		if retErr != nil {
+			os.RemoveAll(dir)
+		}
+	}()
+
+	if err := idtools.MkdirAs(path.Join(dir, "diff"), 0755, rootUID, rootGID); err != nil {
+		return err
+	}
+
+	lid := generateID(idLength)
+	if err := os.Symlink(path.Join("..", id, "diff"), path.Join(d.home, linkDir, lid)); err != nil {
+		return err
+	}
+
+	// Write link id to link file
+	if err := ioutil.WriteFile(path.Join(dir, "link"), []byte(lid), 0644); err != nil {
+		return err
+	}
+
+	// if no parent directory, done
+	if parent == "" {
+		return nil
+	}
+
+	if err := idtools.MkdirAs(path.Join(dir, "work"), 0700, rootUID, rootGID); err != nil {
+		return err
+	}
+	if err := idtools.MkdirAs(path.Join(dir, "merged"), 0700, rootUID, rootGID); err != nil {
+		return err
+	}
+
+	lower, err := d.getLower(parent)
+	if err != nil {
+		return err
+	}
+	if lower != "" {
+		if err := ioutil.WriteFile(path.Join(dir, lowerFile), []byte(lower), 0666); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// getLower builds the content of the "lower" file for a layer whose parent is
+// parent: the parent's own link (as "l/<link id>") followed by the parent's
+// lower layers, joined with ":" and ordered from uppermost to lowermost.
+//
+// It returns an error when the parent directory or its "link" file cannot be
+// read, when the parent's "lower" file exists but cannot be read, or when the
+// resulting chain would exceed maxDepth entries.
+func (d *Driver) getLower(parent string) (string, error) {
+	parentDir := d.dir(parent)
+
+	// Ensure parent exists
+	if _, err := os.Lstat(parentDir); err != nil {
+		return "", err
+	}
+
+	// Read parent link file
+	parentLink, err := ioutil.ReadFile(path.Join(parentDir, "link"))
+	if err != nil {
+		return "", err
+	}
+	lowers := []string{path.Join(linkDir, string(parentLink))}
+
+	parentLower, err := ioutil.ReadFile(path.Join(parentDir, lowerFile))
+	switch {
+	case err == nil:
+		lowers = append(lowers, strings.Split(string(parentLower), ":")...)
+	case !os.IsNotExist(err):
+		// A missing "lower" file just means the parent is a base layer;
+		// any other failure would silently truncate the chain.
+		return "", err
+	}
+	if len(lowers) > maxDepth {
+		return "", errors.New("max depth exceeded")
+	}
+	return strings.Join(lowers, ":"), nil
+}
+
+// dir returns the directory of layer id, which sits directly under the
+// driver home.
+func (d *Driver) dir(id string) string {
+	return path.Join(d.home, id)
+}
+
+// getLowerDirs resolves the entries of the "lower" file of layer id into the
+// "diff" directories they link to, in the order they are listed. Each entry
+// is a symbolic link relative to the driver home; its target is interpreted
+// relative to the link directory. A layer without a "lower" file yields a nil
+// slice and no error.
+func (d *Driver) getLowerDirs(id string) ([]string, error) {
+	lowers, err := ioutil.ReadFile(path.Join(d.dir(id), lowerFile))
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, nil
+		}
+		return nil, err
+	}
+
+	var lowersArray []string
+	for _, s := range strings.Split(string(lowers), ":") {
+		lp, err := os.Readlink(path.Join(d.home, s))
+		if err != nil {
+			return nil, err
+		}
+		// Link targets are relative to the directory holding the links,
+		// so resolve them against linkDir. path.Join cleans the result.
+		lowersArray = append(lowersArray, path.Join(d.home, linkDir, lp))
+	}
+	return lowersArray, nil
+}
+
+// Remove cleans the directories that are created for this id, together with
+// the layer's symbolic link under the "l" directory. Failing to remove the
+// link is only logged; a layer directory that is already gone is not an error.
+func (d *Driver) Remove(id string) error {
+	dir := d.dir(id)
+
+	// The link id must be read before the layer directory is deleted.
+	if lid, err := ioutil.ReadFile(path.Join(dir, "link")); err == nil {
+		if len(lid) == 0 {
+			// An empty id would resolve to the link directory itself.
+			logrus.Debugf("Empty link file for %s, not removing link", id)
+		} else if err := os.Remove(path.Join(d.home, linkDir, string(lid))); err != nil && !os.IsNotExist(err) {
+			logrus.Debugf("Failed to remove link for %s: %v", id, err)
+		}
+	}
+
+	if err := os.RemoveAll(dir); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	return nil
+}
+
+// Get creates and mounts the required file system for the given id and returns the mount path.
+//
+// A layer without a "lower" file is not mounted at all: its "diff" directory
+// is returned directly. Otherwise the overlay is mounted on the layer's
+// "merged" directory, which is returned. Mounts are reference counted: only
+// the first Get for a layer performs the mount.
+func (d *Driver) Get(id string, mountLabel string) (s string, err error) {
+	dir := d.dir(id)
+	if _, err := os.Stat(dir); err != nil {
+		return "", err
+	}
+
+	diffDir := path.Join(dir, "diff")
+	lowers, err := ioutil.ReadFile(path.Join(dir, lowerFile))
+	if err != nil {
+		// If no lower, just return diff directory
+		if os.IsNotExist(err) {
+			return diffDir, nil
+		}
+		return "", err
+	}
+
+	mergedDir := path.Join(dir, "merged")
+	if count := d.ctr.Increment(mergedDir); count > 1 {
+		return mergedDir, nil
+	}
+	// Roll back the reference taken above (and unmount if it was the last
+	// one) when any of the following steps fails.
+	defer func() {
+		if err != nil {
+			if c := d.ctr.Decrement(mergedDir); c <= 0 {
+				syscall.Unmount(mergedDir, 0)
+			}
+		}
+	}()
+
+	workDir := path.Join(dir, "work")
+	// All paths in the mount options are relative to the driver home (the
+	// lower entries are stored that way) to keep the option string short;
+	// mountFrom performs the mount from d.home.
+	opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", string(lowers), path.Join(id, "diff"), path.Join(id, "work"))
+	mountLabel = label.FormatMountLabel(opts, mountLabel)
+	if len(mountLabel) > syscall.Getpagesize() {
+		return "", fmt.Errorf("cannot mount layer, mount label too large %d", len(mountLabel))
+	}
+
+	if err := mountFrom(d.home, "overlay", path.Join(id, "merged"), "overlay", mountLabel); err != nil {
+		return "", fmt.Errorf("error creating overlay mount to %s: %v", mergedDir, err)
+	}
+
+	// chown "workdir/work" to the remapped root UID/GID. Overlay fs inside a
+	// user namespace requires this to move a directory from lower to upper.
+	rootUID, rootGID, err := idtools.GetRootUIDGID(d.uidMaps, d.gidMaps)
+	if err != nil {
+		return "", err
+	}
+
+	if err := os.Chown(path.Join(workDir, "work"), rootUID, rootGID); err != nil {
+		return "", err
+	}
+
+	return mergedDir, nil
+}
+
+// Put unmounts the mount path created for the given id once its reference
+// count no longer exceeds zero. It always returns nil: a failed unmount is
+// only logged at debug level.
+func (d *Driver) Put(id string) error {
+	mountpoint := path.Join(d.dir(id), "merged")
+	if count := d.ctr.Decrement(mountpoint); count > 0 {
+		return nil
+	}
+	if err := syscall.Unmount(mountpoint, 0); err != nil {
+		logrus.Debugf("Failed to unmount %s overlay: %v", id, err)
+	}
+	return nil
+}
+
+// Exists reports whether the directory of layer id can be stat'ed, i.e.
+// whether the layer has been created and not removed.
+func (d *Driver) Exists(id string) bool {
+	if _, err := os.Stat(d.dir(id)); err != nil {
+		return false
+	}
+	return true
+}
+
+// ApplyDiff unpacks the uncompressed tar stream diff into the "diff"
+// directory of layer id, converting whiteouts to the overlay on-disk format,
+// and returns the resulting size of the layer as computed by DiffSize.
+func (d *Driver) ApplyDiff(id string, parent string, diff archive.Reader) (size int64, err error) {
+	target := d.getDiffPath(id)
+	untarOptions := &archive.TarOptions{
+		UIDMaps:        d.uidMaps,
+		GIDMaps:        d.gidMaps,
+		WhiteoutFormat: archive.OverlayWhiteoutFormat,
+	}
+
+	logrus.Debugf("Applying tar in %s", target)
+	// Overlay doesn't need the parent id to apply the diff
+	if err := untar(diff, target, untarOptions); err != nil {
+		return 0, err
+	}
+
+	return d.DiffSize(id, parent)
+}
+
+// getDiffPath returns the "diff" directory of layer id.
+func (d *Driver) getDiffPath(id string) string {
+	return path.Join(d.dir(id), "diff")
+}
+
+// DiffSize calculates the changes between the specified id
+// and its parent and returns the size in bytes of the changes
+// relative to its base filesystem directory. Because each layer keeps only
+// its own changes in its "diff" directory, this is the size of that
+// directory; parent is not used.
+func (d *Driver) DiffSize(id, parent string) (size int64, err error) {
+	return directory.Size(d.getDiffPath(id))
+}
+
+// Diff returns an uncompressed tar archive of the "diff" directory of layer
+// id, with overlay whiteouts converted on pack. parent, which may be "", is
+// not used.
+func (d *Driver) Diff(id, parent string) (archive.Archive, error) {
+	source := d.getDiffPath(id)
+	tarOptions := &archive.TarOptions{
+		Compression:    archive.Uncompressed,
+		UIDMaps:        d.uidMaps,
+		GIDMaps:        d.gidMaps,
+		WhiteoutFormat: archive.OverlayWhiteoutFormat,
+	}
+
+	logrus.Debugf("Tar with options on %s", source)
+	return archive.TarWithOptions(source, tarOptions)
+}
+
+// Changes lists the changes held in the "diff" directory of layer id,
+// computed against the "diff" directories of all of its lower layers.
+// parent is not used: overlay has no snapshots, so the full chain of lower
+// layers is always consulted.
+func (d *Driver) Changes(id, parent string) ([]archive.Change, error) {
+	lowerDirs, err := d.getLowerDirs(id)
+	if err != nil {
+		return nil, err
+	}
+
+	return archive.OverlayChanges(lowerDirs, d.getDiffPath(id))
+}

+ 106 - 0
daemon/graphdriver/overlay2/overlay_test.go

@@ -0,0 +1,106 @@
+// +build linux
+
+package overlay2
+
+import (
+	"os"
+	"syscall"
+	"testing"
+
+	"github.com/docker/docker/daemon/graphdriver"
+	"github.com/docker/docker/daemon/graphdriver/graphtest"
+	"github.com/docker/docker/pkg/archive"
+	"github.com/docker/docker/pkg/reexec"
+)
+
+func init() {
+	// Do not use chroot, to speed up run time and allow archive
+	// errors or hangs to be debugged directly from the test process.
+	untar = archive.UntarUncompressed
+	graphdriver.ApplyUncompressedLayer = archive.ApplyUncompressedLayer
+
+	reexec.Init()
+}
+
+// cdMountFrom is an in-process alternative to mountFrom: it changes the
+// working directory to dir, performs the mount with flags 0 and label as the
+// mount data, and then changes back to the previous working directory.
+// It returns an error when the working directory cannot be determined or
+// changed, or when the mount itself fails.
+func cdMountFrom(dir, device, target, mType, label string) error {
+	wd, err := os.Getwd()
+	if err != nil {
+		return err
+	}
+	// Mounting relative paths from the wrong directory would silently
+	// target the wrong location, so a failed chdir must abort.
+	if err := os.Chdir(dir); err != nil {
+		return err
+	}
+	defer os.Chdir(wd)
+
+	return syscall.Mount(device, target, mType, 0, label)
+}
+
+// This avoids creating a new driver for each test if all tests are run
+// Make sure to put new tests between TestOverlaySetup and TestOverlayTeardown
+func TestOverlaySetup(t *testing.T) {
+	graphtest.GetDriver(t, driverName)
+}
+
+// TestOverlayCreateEmpty runs graphtest.DriverTestCreateEmpty on overlay2.
+func TestOverlayCreateEmpty(t *testing.T) {
+	graphtest.DriverTestCreateEmpty(t, driverName)
+}
+
+// TestOverlayCreateBase runs graphtest.DriverTestCreateBase on overlay2.
+func TestOverlayCreateBase(t *testing.T) {
+	graphtest.DriverTestCreateBase(t, driverName)
+}
+
+// TestOverlayCreateSnap runs graphtest.DriverTestCreateSnap on overlay2.
+func TestOverlayCreateSnap(t *testing.T) {
+	graphtest.DriverTestCreateSnap(t, driverName)
+}
+
+// TestOverlay128LayerRead runs graphtest.DriverTestDeepLayerRead on overlay2
+// with 128 as its count argument, which equals the driver's maxDepth limit.
+func TestOverlay128LayerRead(t *testing.T) {
+	graphtest.DriverTestDeepLayerRead(t, 128, driverName)
+}
+
+// TestOverlayDiffApply10Files runs graphtest.DriverTestDiffApply on overlay2
+// with 10 as its count argument.
+func TestOverlayDiffApply10Files(t *testing.T) {
+	graphtest.DriverTestDiffApply(t, 10, driverName)
+}
+
+// TestOverlayChanges runs graphtest.DriverTestChanges on overlay2.
+func TestOverlayChanges(t *testing.T) {
+	graphtest.DriverTestChanges(t, driverName)
+}
+
+// TestOverlayTeardown releases the driver obtained by TestOverlaySetup.
+func TestOverlayTeardown(t *testing.T) {
+	graphtest.PutDriver(t)
+}
+
+// Benchmarks should always setup new driver
+
+// BenchmarkExists runs graphtest.DriverBenchExists on overlay2.
+func BenchmarkExists(b *testing.B) {
+	graphtest.DriverBenchExists(b, driverName)
+}
+
+// BenchmarkGetEmpty runs graphtest.DriverBenchGetEmpty on overlay2.
+func BenchmarkGetEmpty(b *testing.B) {
+	graphtest.DriverBenchGetEmpty(b, driverName)
+}
+
+// BenchmarkDiffBase runs graphtest.DriverBenchDiffBase on overlay2.
+func BenchmarkDiffBase(b *testing.B) {
+	graphtest.DriverBenchDiffBase(b, driverName)
+}
+
+// BenchmarkDiffSmallUpper runs graphtest.DriverBenchDiffN on overlay2 with
+// arguments (10, 10).
+func BenchmarkDiffSmallUpper(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10, 10, driverName)
+}
+
+// BenchmarkDiff10KFileUpper runs graphtest.DriverBenchDiffN on overlay2 with
+// arguments (10, 10000).
+func BenchmarkDiff10KFileUpper(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10, 10000, driverName)
+}
+
+// BenchmarkDiff10KFilesBottom runs graphtest.DriverBenchDiffN on overlay2 with
+// arguments (10000, 10).
+func BenchmarkDiff10KFilesBottom(b *testing.B) {
+	graphtest.DriverBenchDiffN(b, 10000, 10, driverName)
+}
+
+// BenchmarkDiffApply100 runs graphtest.DriverBenchDiffApplyN on overlay2 with
+// 100 as its count argument.
+func BenchmarkDiffApply100(b *testing.B) {
+	graphtest.DriverBenchDiffApplyN(b, 100, driverName)
+}
+
+// BenchmarkDiff20Layers runs graphtest.DriverBenchDeepLayerDiff on overlay2
+// with 20 as its count argument.
+func BenchmarkDiff20Layers(b *testing.B) {
+	graphtest.DriverBenchDeepLayerDiff(b, 20, driverName)
+}
+
+// BenchmarkRead20Layers runs graphtest.DriverBenchDeepLayerRead on overlay2
+// with 20 as its count argument.
+func BenchmarkRead20Layers(b *testing.B) {
+	graphtest.DriverBenchDeepLayerRead(b, 20, driverName)
+}

+ 3 - 0
daemon/graphdriver/overlay2/overlay_unsupported.go

@@ -0,0 +1,3 @@
+// +build !linux
+
+package overlay2

+ 80 - 0
daemon/graphdriver/overlay2/randomid.go

@@ -0,0 +1,80 @@
+// +build linux
+
+package overlay2
+
+import (
+	"crypto/rand"
+	"encoding/base32"
+	"fmt"
+	"io"
+	"os"
+	"syscall"
+	"time"
+
+	"github.com/Sirupsen/logrus"
+)
+
+// generateID creates a new random string identifier with the given length.
+// The identifier is the first l characters of the standard base32 encoding
+// of bytes read from crypto/rand. Reads that fail with a retryable error
+// (see retryOnError) are retried up to maxretries times with a linearly
+// growing sleep; any other failure, or running out of retries, panics.
+func generateID(l int) string {
+	const (
+		// ensures we backoff for less than 450ms total. Use the following to
+		// select new value, in units of 10ms:
+		// 	n*(n+1)/2 = d -> n^2 + n - 2d -> n = (sqrt(8d + 1) - 1)/2
+		maxretries = 9
+		backoff    = time.Millisecond * 10
+	)
+
+	var (
+		totalBackoff time.Duration
+		count        int
+		retries      int
+		// base32 encodes 5 bits per character, so l characters need
+		// ceil(l*5/8) random bytes.
+		size         = (l*5 + 7) / 8
+		u            = make([]byte, size)
+	)
+	// TODO: Include time component, counter component, random component
+
+	for {
+		// This should never block but the read may fail. Because of this,
+		// we just try to read the random number generator until we get
+		// something. This is a very rare condition but may happen.
+		// The first iteration sleeps for zero time since retries is 0.
+		b := time.Duration(retries) * backoff
+		time.Sleep(b)
+		totalBackoff += b
+
+		// Only the part of the buffer not filled by earlier attempts is read.
+		n, err := io.ReadFull(rand.Reader, u[count:])
+		if err != nil {
+			if retryOnError(err) && retries < maxretries {
+				count += n
+				retries++
+				logrus.Errorf("error generating version 4 uuid, retrying: %v", err)
+				continue
+			}
+
+			// Any other errors represent a system problem. What did someone
+			// do to /dev/urandom?
+			panic(fmt.Errorf("error reading random number generator, retried for %v: %v", totalBackoff.String(), err))
+		}
+
+		break
+	}
+
+	s := base32.StdEncoding.EncodeToString(u)
+
+	// Drop any surplus characters and trailing padding.
+	return s[:l]
+}
+
+// retryOnError tries to detect whether or not retrying would be fruitful.
+func retryOnError(err error) bool {
+	switch err := err.(type) {
+	case *os.PathError:
+		return retryOnError(err.Err) // unpack the target error
+	case syscall.Errno:
+		if err == syscall.EPERM {
+			// EPERM represents an entropy pool exhaustion, a condition under
+			// which we backoff and retry.
+			return true
+		}
+	}
+
+	return false
+}

+ 1 - 0
daemon/graphdriver/register/register_overlay.go

@@ -5,4 +5,5 @@ package register
 import (
 import (
 	// register the overlay graphdriver
 	// register the overlay graphdriver
 	_ "github.com/docker/docker/daemon/graphdriver/overlay"
 	_ "github.com/docker/docker/daemon/graphdriver/overlay"
+	_ "github.com/docker/docker/daemon/graphdriver/overlay2"
 )
 )

+ 8 - 3
docs/reference/commandline/dockerd.md

@@ -204,7 +204,7 @@ TCP and a Unix socket
 ### Daemon storage-driver option
 ### Daemon storage-driver option
 
 
 The Docker daemon has support for several different image layer storage
 The Docker daemon has support for several different image layer storage
-drivers: `aufs`, `devicemapper`, `btrfs`, `zfs` and `overlay`.
+drivers: `aufs`, `devicemapper`, `btrfs`, `zfs`, `overlay` and `overlay2`.
 
 
 The `aufs` driver is the oldest, but is based on a Linux kernel patch-set that
 The `aufs` driver is the oldest, but is based on a Linux kernel patch-set that
 is unlikely to be merged into the main kernel. These are also known to cause
 is unlikely to be merged into the main kernel. These are also known to cause
@@ -242,9 +242,14 @@ Linux kernel as of [3.18.0](https://lkml.org/lkml/2014/10/26/137). Call
 > inode consumption (especially as the number of images grows), as well as
 > inode consumption (especially as the number of images grows), as well as
 > being incompatible with the use of RPMs.
 > being incompatible with the use of RPMs.
 
 
+The `overlay2` driver uses the same fast union filesystem but takes advantage of
+[additional features](https://lkml.org/lkml/2015/2/11/106) added in Linux
+kernel 4.0 to avoid excessive inode consumption. Call `dockerd -s overlay2`
+to use it.
+
 > **Note:**
 > **Note:**
-> It is currently unsupported on `btrfs` or any Copy on Write filesystem
-> and should only be used over `ext4` partitions.
+> Both `overlay` and `overlay2` are currently unsupported on `btrfs` or any
+> Copy on Write filesystem and should only be used over `ext4` partitions.
 
 
 ### Storage driver options
 ### Storage driver options
 
 

+ 35 - 16
docs/userguide/storagedriver/selectadriver.md

@@ -34,14 +34,14 @@ and all containers created by that daemon instance use the same storage driver.
  The table below shows the supported storage driver technologies and their
  The table below shows the supported storage driver technologies and their
 driver names:
 driver names:
 
 
-|Technology    |Storage driver name  |
-|--------------|---------------------|
-|OverlayFS     |`overlay`            |
-|AUFS          |`aufs`               |
-|Btrfs         |`btrfs`              |
-|Device Mapper |`devicemapper`       |
-|VFS           |`vfs`                |
-|ZFS           |`zfs`                |
+|Technology    |Storage driver name    |
+|--------------|-----------------------|
+|OverlayFS     |`overlay` or `overlay2`|
+|AUFS          |`aufs`                 |
+|Btrfs         |`btrfs`                |
+|Device Mapper |`devicemapper`         |
+|VFS           |`vfs`                  |
+|ZFS           |`zfs`                  |
 
 
 To find out which storage driver is set on the daemon, you use the
 To find out which storage driver is set on the daemon, you use the
 `docker info` command:
 `docker info` command:
@@ -71,14 +71,15 @@ For example, the `btrfs` storage driver on a Btrfs backing filesystem. The
 following table lists each storage driver and whether it must match the host's
 following table lists each storage driver and whether it must match the host's
 backing file system:
 backing file system:
 
 
-|Storage driver |Commonly used on |Disabled on                              |
-|---------------|-----------------|-----------------------------------------|
-|`overlay`      |`ext4` `xfs`     |`btrfs` `aufs` `overlay` `zfs` `eCryptfs`|
-|`aufs`         |`ext4` `xfs`     |`btrfs` `aufs` `eCryptfs`                |
-|`btrfs`        |`btrfs` _only_   |   N/A                                   |
-|`devicemapper` |`direct-lvm`     |   N/A                                   |
-|`vfs`          |debugging only   |   N/A                                   |
-|`zfs`          |`zfs` _only_     |   N/A                                   |
+|Storage driver |Commonly used on |Disabled on                                         |
+|---------------|-----------------|----------------------------------------------------|
+|`overlay`      |`ext4` `xfs`     |`btrfs` `aufs` `overlay` `overlay2` `zfs` `eCryptfs`|
+|`overlay2`     |`ext4` `xfs`     |`btrfs` `aufs` `overlay` `overlay2` `zfs` `eCryptfs`|
+|`aufs`         |`ext4` `xfs`     |`btrfs` `aufs` `eCryptfs`                           |
+|`btrfs`        |`btrfs` _only_   |   N/A                                              |
+|`devicemapper` |`direct-lvm`     |   N/A                                              |
+|`vfs`          |debugging only   |   N/A                                              |
+|`zfs`          |`zfs` _only_     |   N/A                                              |
 
 
 
 
 > **Note**
 > **Note**
@@ -198,6 +199,24 @@ the guidance offered by the table below along with the points mentioned above.
 
 
 ![](images/driver-pros-cons.png)
 ![](images/driver-pros-cons.png)
 
 
+### Overlay vs Overlay2
+
+OverlayFS has 2 storage drivers which both make use of the same OverlayFS
+technology but with different implementations and incompatible on-disk
+storage. Since the storage is incompatible, switching between the two
+will require re-creating all image content. The `overlay` driver is the
+original implementation and the only option in Docker 1.11 and before.
+The `overlay` driver has known limitations with inode exhaustion and
+commit performance. The `overlay2` driver addresses these limitations, but
+is only compatible with Linux kernel 4.0 and later. For users on a pre-4.0
+kernel or with an existing `overlay` graph, it is recommended to stay
+on `overlay`. For users with at least a 4.0 kernel and no existing or required
+`overlay` graph data, `overlay2` may be used.
+
+> **Note**
+> `overlay2` graph data will not interfere with `overlay` graph data. However
+> when switching to `overlay2`, the user is responsible for removing
+> `overlay` graph data to avoid storage duplication.
 
 
 ## Related information
 ## Related information
 
 

+ 1 - 1
man/dockerd.8.md

@@ -226,7 +226,7 @@ output otherwise.
   Force the Docker runtime to use a specific storage driver.
   Force the Docker runtime to use a specific storage driver.
 
 
 **--selinux-enabled**=*true*|*false*
 **--selinux-enabled**=*true*|*false*
-  Enable selinux support. Default is false. SELinux does not presently support the overlay storage driver.
+  Enable selinux support. Default is false. SELinux does not presently support either of the overlay storage drivers.
 
 
 **--storage-opt**=[]
 **--storage-opt**=[]
   Set storage driver options. See STORAGE DRIVER OPTIONS.
   Set storage driver options. See STORAGE DRIVER OPTIONS.

+ 49 - 5
pkg/archive/archive.go

@@ -33,6 +33,8 @@ type (
 	Reader io.Reader
 	Reader io.Reader
 	// Compression is the state represents if compressed or not.
 	// Compression is the state represents if compressed or not.
 	Compression int
 	Compression int
+	// WhiteoutFormat is the format of whiteouts unpacked
+	WhiteoutFormat int
 	// TarChownOptions wraps the chown options UID and GID.
 	// TarChownOptions wraps the chown options UID and GID.
 	TarChownOptions struct {
 	TarChownOptions struct {
 		UID, GID int
 		UID, GID int
@@ -47,6 +49,10 @@ type (
 		GIDMaps          []idtools.IDMap
 		GIDMaps          []idtools.IDMap
 		ChownOpts        *TarChownOptions
 		ChownOpts        *TarChownOptions
 		IncludeSourceDir bool
 		IncludeSourceDir bool
+		// WhiteoutFormat is the expected on disk format for whiteout files.
+		// This format will be converted to the standard format on pack
+		// and from the standard format on unpack.
+		WhiteoutFormat WhiteoutFormat
 		// When unpacking, specifies whether overwriting a directory with a
 		// When unpacking, specifies whether overwriting a directory with a
 		// non-directory is allowed and vice versa.
 		// non-directory is allowed and vice versa.
 		NoOverwriteDirNonDir bool
 		NoOverwriteDirNonDir bool
@@ -93,6 +99,14 @@ const (
 	Xz
 	Xz
 )
 )
 
 
+const (
+	// AUFSWhiteoutFormat is the default format for whiteouts
+	AUFSWhiteoutFormat WhiteoutFormat = iota
+	// OverlayWhiteoutFormat formats whiteout according to the overlay
+	// standard.
+	OverlayWhiteoutFormat
+)
+
 // IsArchive checks for the magic bytes of a tar or any supported compression
 // IsArchive checks for the magic bytes of a tar or any supported compression
 // algorithm.
 // algorithm.
 func IsArchive(header []byte) bool {
 func IsArchive(header []byte) bool {
@@ -228,6 +242,11 @@ func (compression *Compression) Extension() string {
 	return ""
 	return ""
 }
 }
 
 
+// tarWhiteoutConverter translates whiteout entries between an on-disk
+// representation and the representation stored in tar archives.
+type tarWhiteoutConverter interface {
+	// ConvertWrite is invoked while packing, just before hdr is written to
+	// the archive, with the on-disk path and file info of the entry. It may
+	// modify hdr in place.
+	ConvertWrite(hdr *tar.Header, path string, fi os.FileInfo) error
+	// ConvertRead is invoked while unpacking, before the entry is created
+	// at path. When writeFile is false the caller does not create the entry.
+	ConvertRead(hdr *tar.Header, path string) (writeFile bool, err error)
+}
+
 type tarAppender struct {
 type tarAppender struct {
 	TarWriter *tar.Writer
 	TarWriter *tar.Writer
 	Buffer    *bufio.Writer
 	Buffer    *bufio.Writer
@@ -236,6 +255,12 @@ type tarAppender struct {
 	SeenFiles map[uint64]string
 	SeenFiles map[uint64]string
 	UIDMaps   []idtools.IDMap
 	UIDMaps   []idtools.IDMap
 	GIDMaps   []idtools.IDMap
 	GIDMaps   []idtools.IDMap
+
+	// For packing and unpacking whiteout files in the
+	// non standard format. The whiteout files defined
+	// by the AUFS standard are used as the tar whiteout
+	// standard.
+	WhiteoutConverter tarWhiteoutConverter
 }
 }
 
 
 // canonicalTarName provides a platform-independent and consistent posix-style
 // canonicalTarName provides a platform-independent and consistent posix-style
@@ -253,6 +278,7 @@ func canonicalTarName(name string, isDir bool) (string, error) {
 	return name, nil
 	return name, nil
 }
 }
 
 
+// addTarFile adds to the tar archive a file from `path` as `name`
 func (ta *tarAppender) addTarFile(path, name string) error {
 func (ta *tarAppender) addTarFile(path, name string) error {
 	fi, err := os.Lstat(path)
 	fi, err := os.Lstat(path)
 	if err != nil {
 	if err != nil {
@@ -323,6 +349,12 @@ func (ta *tarAppender) addTarFile(path, name string) error {
 		hdr.Gid = xGID
 		hdr.Gid = xGID
 	}
 	}
 
 
+	if ta.WhiteoutConverter != nil {
+		if err := ta.WhiteoutConverter.ConvertWrite(hdr, path, fi); err != nil {
+			return err
+		}
+	}
+
 	if err := ta.TarWriter.WriteHeader(hdr); err != nil {
 	if err := ta.TarWriter.WriteHeader(hdr); err != nil {
 		return err
 		return err
 	}
 	}
@@ -508,11 +540,12 @@ func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error)
 
 
 	go func() {
 	go func() {
 		ta := &tarAppender{
 		ta := &tarAppender{
-			TarWriter: tar.NewWriter(compressWriter),
-			Buffer:    pools.BufioWriter32KPool.Get(nil),
-			SeenFiles: make(map[uint64]string),
-			UIDMaps:   options.UIDMaps,
-			GIDMaps:   options.GIDMaps,
+			TarWriter:         tar.NewWriter(compressWriter),
+			Buffer:            pools.BufioWriter32KPool.Get(nil),
+			SeenFiles:         make(map[uint64]string),
+			UIDMaps:           options.UIDMaps,
+			GIDMaps:           options.GIDMaps,
+			WhiteoutConverter: getWhiteoutConverter(options.WhiteoutFormat),
 		}
 		}
 
 
 		defer func() {
 		defer func() {
@@ -674,6 +707,7 @@ func Unpack(decompressedArchive io.Reader, dest string, options *TarOptions) err
 	if err != nil {
 	if err != nil {
 		return err
 		return err
 	}
 	}
+	whiteoutConverter := getWhiteoutConverter(options.WhiteoutFormat)
 
 
 	// Iterate through the files in the archive.
 	// Iterate through the files in the archive.
 loop:
 loop:
@@ -773,6 +807,16 @@ loop:
 			hdr.Gid = xGID
 			hdr.Gid = xGID
 		}
 		}
 
 
+		if whiteoutConverter != nil {
+			writeFile, err := whiteoutConverter.ConvertRead(hdr, path)
+			if err != nil {
+				return err
+			}
+			if !writeFile {
+				continue
+			}
+		}
+
 		if err := createTarFile(path, dest, hdr, trBuf, !options.NoLchown, options.ChownOpts); err != nil {
 		if err := createTarFile(path, dest, hdr, trBuf, !options.NoLchown, options.ChownOpts); err != nil {
 			return err
 			return err
 		}
 		}

+ 89 - 0
pkg/archive/archive_linux.go

@@ -0,0 +1,89 @@
+package archive
+
+import (
+	"archive/tar"
+	"os"
+	"path/filepath"
+	"strings"
+	"syscall"
+
+	"github.com/docker/docker/pkg/system"
+)
+
+// getWhiteoutConverter selects the converter for the requested on-disk
+// whiteout format. Only OverlayWhiteoutFormat has a converter; every other
+// format yields nil.
+func getWhiteoutConverter(format WhiteoutFormat) tarWhiteoutConverter {
+	switch format {
+	case OverlayWhiteoutFormat:
+		return overlayWhiteoutConverter{}
+	default:
+		return nil
+	}
+}
+
+type overlayWhiteoutConverter struct{}
+
+// ConvertWrite rewrites hdr in place so that overlay-style deletion markers
+// found on disk are emitted in the AUFS whiteout format.
+//
+// hdr is the header about to be written, path is the on-disk location of the
+// entry and fi is its file info. A character device whose header carries
+// device number 0/0 becomes an empty regular file whose base name is prefixed
+// with WhiteoutPrefix. A directory whose "trusted.overlay.opaque" xattr is
+// "y" is replaced by a WhiteoutOpaqueDir marker file located inside that
+// directory. An error reading the xattr is returned unchanged.
+func (overlayWhiteoutConverter) ConvertWrite(hdr *tar.Header, path string, fi os.FileInfo) error {
+	// convert whiteouts to AUFS format
+	if fi.Mode()&os.ModeCharDevice != 0 && hdr.Devmajor == 0 && hdr.Devminor == 0 {
+		// The prefix must be applied to the final path element only;
+		// prepending it to the whole name would mark the first path
+		// component of a nested entry instead of the deleted file.
+		dir, base := filepath.Split(hdr.Name)
+		hdr.Name = filepath.Join(dir, WhiteoutPrefix+base)
+		hdr.Mode = 0600
+		hdr.Typeflag = tar.TypeReg
+		hdr.Size = 0
+	}
+
+	if fi.Mode()&os.ModeDir != 0 {
+		// convert opaque dirs to AUFS format by writing an empty file with the prefix
+		opaque, err := system.Lgetxattr(path, "trusted.overlay.opaque")
+		if err != nil {
+			return err
+		}
+		if len(opaque) == 1 && opaque[0] == 'y' {
+			// create a header for the whiteout file
+			// it should inherit some properties from the parent, but be a regular file
+			//
+			// NOTE(review): this overwrites the directory's own header, so the
+			// directory entry itself is no longer emitted — confirm intended.
+			*hdr = tar.Header{
+				Typeflag:   tar.TypeReg,
+				Mode:       hdr.Mode & int64(os.ModePerm),
+				Name:       filepath.Join(hdr.Name, WhiteoutOpaqueDir),
+				Size:       0,
+				Uid:        hdr.Uid,
+				Uname:      hdr.Uname,
+				Gid:        hdr.Gid,
+				Gname:      hdr.Gname,
+				AccessTime: hdr.AccessTime,
+				ChangeTime: hdr.ChangeTime,
+			}
+		}
+	}
+
+	return nil
+}
+
+// ConvertRead turns an AUFS-format whiteout entry into its overlay on-disk
+// form at unpack time. It returns false when the entry has been handled here
+// and must not be written by the caller, and true for any ordinary entry.
+func (overlayWhiteoutConverter) ConvertRead(hdr *tar.Header, path string) (bool, error) {
+	parent, name := filepath.Dir(path), filepath.Base(path)
+
+	switch {
+	case name == WhiteoutOpaqueDir:
+		// The AUFS opaque marker file becomes an xattr on the containing
+		// directory; the marker file itself is not written.
+		if err := syscall.Setxattr(parent, "trusted.overlay.opaque", []byte{'y'}, 0); err != nil {
+			return false, err
+		}
+		return false, nil
+
+	case strings.HasPrefix(name, WhiteoutPrefix):
+		// A deleted file is represented by a character device created under
+		// the original (unprefixed) name, owned as recorded in the header.
+		target := filepath.Join(parent, strings.TrimPrefix(name, WhiteoutPrefix))
+		if err := syscall.Mknod(target, syscall.S_IFCHR, 0); err != nil {
+			return false, err
+		}
+		if err := os.Chown(target, hdr.Uid, hdr.Gid); err != nil {
+			return false, err
+		}
+		return false, nil
+	}
+
+	return true, nil
+}

+ 7 - 0
pkg/archive/archive_other.go

@@ -0,0 +1,7 @@
+// +build !linux
+
+package archive
+
+// getWhiteoutConverter always returns nil in this build (see the "!linux"
+// build constraint), whichever format is requested.
+func getWhiteoutConverter(WhiteoutFormat) tarWhiteoutConverter {
+	return nil
+}

+ 38 - 8
pkg/archive/changes.go

@@ -81,6 +81,33 @@ func sameFsTimeSpec(a, b syscall.Timespec) bool {
 // Changes walks the path rw and determines changes for the files in the path,
 // Changes walks the path rw and determines changes for the files in the path,
 // with respect to the parent layers
 // with respect to the parent layers
 func Changes(layers []string, rw string) ([]Change, error) {
 func Changes(layers []string, rw string) ([]Change, error) {
+	return changes(layers, rw, aufsDeletedFile, aufsMetadataSkip)
+}
+
+// aufsMetadataSkip reports whether path matches the AUFS metadata pattern: a
+// path separator followed by WhiteoutMetaPrefix and any suffix. When the
+// match itself fails, skip is true and the error is returned alongside it.
+func aufsMetadataSkip(path string) (skip bool, err error) {
+	pattern := string(os.PathSeparator) + WhiteoutMetaPrefix + "*"
+	matched, matchErr := filepath.Match(pattern, path)
+	if matchErr != nil {
+		return true, matchErr
+	}
+	return matched, nil
+}
+
+// aufsDeletedFile returns the path of the file removed by an AUFS whiteout.
+// If the base name of path starts with WhiteoutPrefix, the result is path
+// with that prefix stripped from its base name; otherwise it is "". The root
+// and fi arguments are not used.
+func aufsDeletedFile(root, path string, fi os.FileInfo) (string, error) {
+	name := filepath.Base(path)
+	if !strings.HasPrefix(name, WhiteoutPrefix) {
+		return "", nil
+	}
+
+	// A whiteout is present, so the file it names was removed.
+	removed := strings.TrimPrefix(name, WhiteoutPrefix)
+	return filepath.Join(filepath.Dir(path), removed), nil
+}
+
+type (
+	// skipChange reports whether the walked path should be left out of the
+	// change list.
+	skipChange func(string) (bool, error)
+	// deleteChange receives the walk root, the walked path and its file
+	// info, and returns the path of the deleted file the entry stands for,
+	// or "" when the entry is not a deletion marker.
+	deleteChange func(string, string, os.FileInfo) (string, error)
+)
+
+func changes(layers []string, rw string, dc deleteChange, sc skipChange) ([]Change, error) {
 	var (
 	var (
 		changes     []Change
 		changes     []Change
 		changedDirs = make(map[string]struct{})
 		changedDirs = make(map[string]struct{})
@@ -105,21 +132,24 @@ func Changes(layers []string, rw string) ([]Change, error) {
 			return nil
 			return nil
 		}
 		}
 
 
-		// Skip AUFS metadata
-		if matched, err := filepath.Match(string(os.PathSeparator)+WhiteoutMetaPrefix+"*", path); err != nil || matched {
-			return err
+		if sc != nil {
+			if skip, err := sc(path); skip {
+				return err
+			}
 		}
 		}
 
 
 		change := Change{
 		change := Change{
 			Path: path,
 			Path: path,
 		}
 		}
 
 
+		deletedFile, err := dc(rw, path, f)
+		if err != nil {
+			return err
+		}
+
 		// Find out what kind of modification happened
 		// Find out what kind of modification happened
-		file := filepath.Base(path)
-		// If there is a whiteout, then the file was removed
-		if strings.HasPrefix(file, WhiteoutPrefix) {
-			originalFile := file[len(WhiteoutPrefix):]
-			change.Path = filepath.Join(filepath.Dir(path), originalFile)
+		if deletedFile != "" {
+			change.Path = deletedFile
 			change.Kind = ChangeDelete
 			change.Kind = ChangeDelete
 		} else {
 		} else {
 			// Otherwise, the file was added
 			// Otherwise, the file was added

+ 27 - 0
pkg/archive/changes_linux.go

@@ -283,3 +283,30 @@ func clen(n []byte) int {
 	}
 	}
 	return len(n)
 	return len(n)
 }
 }
+
+// OverlayChanges walks the path rw and determines changes for the files in the path,
+// with respect to the parent layers
+func OverlayChanges(layers []string, rw string) ([]Change, error) {
+	return changes(layers, rw, overlayDeletedFile, nil)
+}
+
+// overlayDeletedFile reports whether the entry at path (relative to root)
+// is an overlay deletion marker. It returns path when fi is a character
+// device with device number 0/0, or a directory whose
+// "trusted.overlay.opaque" xattr is "y"; otherwise it returns "". An error
+// reading the xattr is returned unchanged.
+func overlayDeletedFile(root, path string, fi os.FileInfo) (string, error) {
+	if fi.Mode()&os.ModeCharDevice != 0 {
+		// Use the comma-ok form: a FileInfo whose Sys() is not a
+		// *syscall.Stat_t must not panic, it is simply not a whiteout.
+		if s, ok := fi.Sys().(*syscall.Stat_t); ok && major(uint64(s.Rdev)) == 0 && minor(uint64(s.Rdev)) == 0 {
+			return path, nil
+		}
+	}
+	if fi.Mode()&os.ModeDir != 0 {
+		opaque, err := system.Lgetxattr(filepath.Join(root, path), "trusted.overlay.opaque")
+		if err != nil {
+			return "", err
+		}
+		if len(opaque) == 1 && opaque[0] == 'y' {
+			return path, nil
+		}
+	}
+
+	return "", nil
+}