Browse Source

Merged master into device-mapper branch

Solomon Hykes 11 years ago
parent
commit
1804fcba93
28 changed files with 2552 additions and 212 deletions
  1. 38 26
      Dockerfile
  2. 1 0
      api.go
  3. 7 7
      api_params.go
  4. 15 5
      api_test.go
  5. 74 7
      archive.go
  6. 3 3
      archive_test.go
  7. 253 42
      changes.go
  8. 59 9
      container.go
  9. 14 0
      deviceset.go
  10. 909 0
      devmapper/deviceset_devmapper.go
  11. 390 0
      devmapper/devmapper.go
  12. 62 0
      devmapper/docker-device-tool/device_tool.go
  13. 16 0
      docker-init/docker-init.go
  14. 3 2
      docker/docker.go
  15. 5 2
      graph_test.go
  16. 11 14
      hack/PACKAGERS.md
  17. 2 2
      hack/make.sh
  18. 1 1
      hack/make/binary
  19. 3 1
      hack/make/test
  20. 383 34
      image.go
  21. 0 29
      mount.go
  22. 66 4
      runtime.go
  23. 75 9
      runtime_test.go
  24. 9 13
      server.go
  25. 58 0
      utils.go
  26. 33 0
      utils/utils.go
  27. 61 1
      utils_test.go
  28. 1 1
      z_final_test.go

+ 38 - 26
Dockerfile

@@ -24,45 +24,57 @@
 #
 
 docker-version 0.6.1
-from	ubuntu:12.04
-maintainer	Solomon Hykes <solomon@dotcloud.com>
+from    ubuntu:12.10
+maintainer      Solomon Hykes <solomon@dotcloud.com>
 
 # Build dependencies
-run	echo 'deb http://archive.ubuntu.com/ubuntu precise main universe' > /etc/apt/sources.list
-run	apt-get update
-run	apt-get install -y -q curl
-run	apt-get install -y -q git
-run	apt-get install -y -q mercurial
-run	apt-get install -y -q build-essential
+run     apt-get update
+run     apt-get install -y -q curl
+run     apt-get install -y -q git
+run     apt-get install -y -q mercurial
+run     apt-get install -y -q build-essential
 
 # Install Go from source (for eventual cross-compiling)
-env	CGO_ENABLED 0
-run	curl -s https://go.googlecode.com/files/go1.1.2.src.tar.gz | tar -v -C / -xz && mv /go /goroot
-run	cd /goroot/src && ./make.bash
-env GOROOT	/goroot
-env	PATH	$PATH:/goroot/bin
-env	GOPATH	/go:/go/src/github.com/dotcloud/docker/vendor
+run     curl -s https://go.googlecode.com/files/go1.2rc1.src.tar.gz | tar -v -C / -xz && mv /go /goroot
+run     cd /goroot/src && ./make.bash
+env     GOROOT  /goroot
+env     PATH    $PATH:/goroot/bin
+env     GOPATH  /go:/go/src/github.com/dotcloud/docker/vendor
+
+# Create Go cache with tag netgo (for static compilation of Go while preserving CGO support)
+run     go install -ldflags '-w -linkmode external -extldflags "-static -Wl,--unresolved-symbols=ignore-in-shared-libs"' -tags netgo -a std
+
+# Get lvm2 source for compiling statically
+run     git clone git://git.fedorahosted.org/git/lvm2.git /lvm2
+run     cd /lvm2 && git checkout v2_02_102
+
+# can't use git clone -b because it's not supported by git versions before 1.7.10
+run	cd /lvm2 && ./configure --enable-static_link && make && make install_device-mapper
+# see https://git.fedorahosted.org/cgit/lvm2.git/refs/tags for release tags
 
 # Ubuntu stuff
-run	apt-get install -y -q ruby1.9.3 rubygems libffi-dev
-run	gem install --no-rdoc --no-ri fpm
-run	apt-get install -y -q reprepro dpkg-sig
+run     apt-get install -y -q ruby1.9.3 rubygems libffi-dev
+run     gem install --no-rdoc --no-ri fpm
+run     apt-get install -y -q reprepro dpkg-sig
 
 # Install s3cmd 1.0.1 (earlier versions don't support env variables in the config)
-run	apt-get install -y -q python-pip
-run	pip install s3cmd
-run	pip install python-magic
-run	/bin/echo -e '[default]\naccess_key=$AWS_ACCESS_KEY\nsecret_key=$AWS_SECRET_KEY\n' > /.s3cfg
+run     apt-get install -y -q python-pip
+run     pip install s3cmd
+run     pip install python-magic
+run     /bin/echo -e '[default]\naccess_key=$AWS_ACCESS_KEY\nsecret_key=$AWS_SECRET_KEY\n' > /.s3cfg
 
 # Runtime dependencies
-run	apt-get install -y -q iptables
-run	apt-get install -y -q lxc
+run     apt-get install -y -q iptables
+run     dpkg-divert --local --rename --add /sbin/initctl && \
+        ln -s /bin/true /sbin/initctl && \
+        apt-get install -y -q lxc
 
-volume	/var/lib/docker
-workdir	/go/src/github.com/dotcloud/docker
+volume  /var/lib/docker
+workdir /go/src/github.com/dotcloud/docker
 
 # Wrap all commands in the "docker-in-docker" script to allow nested containers
 entrypoint ["hack/dind"]
 
 # Upload docker source
-add	.       /go/src/github.com/dotcloud/docker
+add     .		/go/src/github.com/dotcloud/docker
+

+ 1 - 0
api.go

@@ -623,6 +623,7 @@ func postContainersStart(srv *Server, version float64, w http.ResponseWriter, r
 	}
 	name := vars["name"]
 	if err := srv.ContainerStart(name, hostConfig); err != nil {
+		utils.Debugf("error ContainerStart: %s", err)
 		return err
 	}
 	w.WriteHeader(http.StatusNoContent)

+ 7 - 7
api_params.go

@@ -56,13 +56,13 @@ type APIContainers struct {
 
 func (self *APIContainers) ToLegacy() APIContainersOld {
 	return APIContainersOld{
-		ID: self.ID,
-		Image: self.Image,
-		Command: self.Command,
-		Created: self.Created,
-		Status: self.Status,
-		Ports: displayablePorts(self.Ports),
-		SizeRw: self.SizeRw,
+		ID:         self.ID,
+		Image:      self.Image,
+		Command:    self.Command,
+		Created:    self.Created,
+		Status:     self.Status,
+		Ports:      displayablePorts(self.Ports),
+		SizeRw:     self.SizeRw,
 		SizeRootFs: self.SizeRootFs,
 	}
 }

+ 15 - 5
api_test.go

@@ -336,9 +336,11 @@ func TestGetContainersJSON(t *testing.T) {
 	}
 
 	r := httptest.NewRecorder()
-	if err := getContainersJSON(srv, APIVERSION, r, req, nil); err != nil {
-		t.Fatal(err)
-	}
+	setTimeout(t, "getContainerJSON timed out", 5*time.Second, func() {
+		if err := getContainersJSON(srv, APIVERSION, r, req, nil); err != nil {
+			t.Fatal(err)
+		}
+	})
 	containers := []APIContainers{}
 	if err := json.Unmarshal(r.Body.Bytes(), &containers); err != nil {
 		t.Fatal(err)
@@ -374,7 +376,7 @@ func TestGetContainersExport(t *testing.T) {
 	}
 
 	r := httptest.NewRecorder()
-	if err = getContainersExport(srv, APIVERSION, r, nil, map[string]string{"name": container.ID}); err != nil {
+	if err := getContainersExport(srv, APIVERSION, r, nil, map[string]string{"name": container.ID}); err != nil {
 		t.Fatal(err)
 	}
 
@@ -646,13 +648,21 @@ func TestPostContainersCreate(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if _, err := os.Stat(path.Join(container.rwPath(), "test")); err != nil {
+	if err := container.EnsureMounted(); err != nil {
+		t.Fatalf("Unable to mount container: %s", err)
+	}
+
+	if _, err := os.Stat(path.Join(container.RootfsPath(), "test")); err != nil {
 		if os.IsNotExist(err) {
 			utils.Debugf("Err: %s", err)
 			t.Fatalf("The test file has not been created")
 		}
 		t.Fatal(err)
 	}
+
+	if err := container.Unmount(); err != nil {
+		t.Fatalf("Unable to unmount container: %s", err)
+	}
 }
 
 func TestPostContainersKill(t *testing.T) {

+ 74 - 7
archive.go

@@ -80,20 +80,73 @@ func (compression *Compression) Extension() string {
 // Tar creates an archive from the directory at `path`, and returns it as a
 // stream of bytes.
 func Tar(path string, compression Compression) (io.Reader, error) {
-	return TarFilter(path, compression, nil)
+	return TarFilter(path, compression, nil, true, nil)
+}
+
+func escapeName(name string) string {
+	escaped := make([]byte, 0)
+	for i, c := range []byte(name) {
+		if i == 0 && c == '/' {
+			continue
+		}
+		// all printable chars except "-" which is 0x2d
+		if (0x20 <= c && c <= 0x7E) && c != 0x2d {
+			escaped = append(escaped, c)
+		} else {
+			escaped = append(escaped, fmt.Sprintf("\\%03o", c)...)
+		}
+	}
+	return string(escaped)
 }
 
 // Tar creates an archive from the directory at `path`, only including files whose relative
 // paths are included in `filter`. If `filter` is nil, then all files are included.
-func TarFilter(path string, compression Compression, filter []string) (io.Reader, error) {
-	args := []string{"tar", "--numeric-owner", "-f", "-", "-C", path}
+func TarFilter(path string, compression Compression, filter []string, recursive bool, createFiles []string) (io.Reader, error) {
+	args := []string{"tar", "--numeric-owner", "-f", "-", "-C", path, "-T", "-"}
 	if filter == nil {
 		filter = []string{"."}
 	}
+	args = append(args, "-c"+compression.Flag())
+
+	if !recursive {
+		args = append(args, "--no-recursion")
+	}
+
+	files := ""
 	for _, f := range filter {
-		args = append(args, "-c"+compression.Flag(), f)
+		files = files + escapeName(f) + "\n"
+	}
+
+	tmpDir := ""
+
+	if createFiles != nil {
+		tmpDir, err := ioutil.TempDir("", "docker-tar")
+		if err != nil {
+			return nil, err
+		}
+
+		files = files + "-C" + tmpDir + "\n"
+		for _, f := range createFiles {
+			path := filepath.Join(tmpDir, f)
+			err := os.MkdirAll(filepath.Dir(path), 0600)
+			if err != nil {
+				return nil, err
+			}
+
+			if file, err := os.OpenFile(path, os.O_CREATE, 0600); err != nil {
+				return nil, err
+			} else {
+				file.Close()
+			}
+			files = files + escapeName(f) + "\n"
+		}
 	}
-	return CmdStream(exec.Command(args[0], args[1:]...))
+
+	return CmdStream(exec.Command(args[0], args[1:]...), &files, func() {
+		if tmpDir != "" {
+			_ = os.RemoveAll(tmpDir)
+		}
+	})
 }
 
 // Untar reads a stream of bytes from `archive`, parses it as a tar archive,
@@ -140,7 +193,7 @@ func Untar(archive io.Reader, path string) error {
 // TarUntar aborts and returns the error.
 func TarUntar(src string, filter []string, dst string) error {
 	utils.Debugf("TarUntar(%s %s %s)", src, filter, dst)
-	archive, err := TarFilter(src, Uncompressed, filter)
+	archive, err := TarFilter(src, Uncompressed, filter, true, nil)
 	if err != nil {
 		return err
 	}
@@ -227,7 +280,18 @@ func CopyFileWithTar(src, dst string) error {
 // CmdStream executes a command, and returns its stdout as a stream.
 // If the command fails to run or doesn't complete successfully, an error
 // will be returned, including anything written on stderr.
-func CmdStream(cmd *exec.Cmd) (io.Reader, error) {
+func CmdStream(cmd *exec.Cmd, input *string, atEnd func()) (io.Reader, error) {
+	if input != nil {
+		stdin, err := cmd.StdinPipe()
+		if err != nil {
+			return nil, err
+		}
+		// Write stdin if any
+		go func() {
+			_, _ = stdin.Write([]byte(*input))
+			stdin.Close()
+		}()
+	}
 	stdout, err := cmd.StdoutPipe()
 	if err != nil {
 		return nil, err
@@ -258,6 +322,9 @@ func CmdStream(cmd *exec.Cmd) (io.Reader, error) {
 		} else {
 			pipeW.Close()
 		}
+		if atEnd != nil {
+			atEnd()
+		}
 	}()
 	// Run the command and return the pipe
 	if err := cmd.Start(); err != nil {

+ 3 - 3
archive_test.go

@@ -14,7 +14,7 @@ import (
 
 func TestCmdStreamLargeStderr(t *testing.T) {
 	cmd := exec.Command("/bin/sh", "-c", "dd if=/dev/zero bs=1k count=1000 of=/dev/stderr; echo hello")
-	out, err := CmdStream(cmd)
+	out, err := CmdStream(cmd, nil, nil)
 	if err != nil {
 		t.Fatalf("Failed to start command: %s", err)
 	}
@@ -35,7 +35,7 @@ func TestCmdStreamLargeStderr(t *testing.T) {
 
 func TestCmdStreamBad(t *testing.T) {
 	badCmd := exec.Command("/bin/sh", "-c", "echo hello; echo >&2 error couldn\\'t reverse the phase pulser; exit 1")
-	out, err := CmdStream(badCmd)
+	out, err := CmdStream(badCmd, nil, nil)
 	if err != nil {
 		t.Fatalf("Failed to start command: %s", err)
 	}
@@ -50,7 +50,7 @@ func TestCmdStreamBad(t *testing.T) {
 
 func TestCmdStreamGood(t *testing.T) {
 	cmd := exec.Command("/bin/sh", "-c", "echo hello; exit 0")
-	out, err := CmdStream(cmd)
+	out, err := CmdStream(cmd, nil, nil)
 	if err != nil {
 		t.Fatal(err)
 	}

+ 253 - 42
changes.go

@@ -5,6 +5,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"syscall"
 )
 
 type ChangeType int
@@ -33,74 +34,284 @@ func (change *Change) String() string {
 	return fmt.Sprintf("%s %s", kind, change.Path)
 }
 
-func Changes(layers []string, rw string) ([]Change, error) {
-	var changes []Change
-	err := filepath.Walk(rw, func(path string, f os.FileInfo, err error) error {
-		if err != nil {
-			return err
+type FileInfo struct {
+	parent   *FileInfo
+	name     string
+	stat     syscall.Stat_t
+	children map[string]*FileInfo
+}
+
+func (root *FileInfo) LookUp(path string) *FileInfo {
+	parent := root
+	if path == "/" {
+		return root
+	}
+
+	pathElements := strings.Split(path, "/")
+	for _, elem := range pathElements {
+		if elem != "" {
+			child := parent.children[elem]
+			if child == nil {
+				return nil
+			}
+			parent = child
 		}
+	}
+	return parent
+}
 
-		// Rebase path
-		path, err = filepath.Rel(rw, path)
+func (info *FileInfo) path() string {
+	if info.parent == nil {
+		return "/"
+	}
+	return filepath.Join(info.parent.path(), info.name)
+}
+
+func (info *FileInfo) unlink() {
+	if info.parent != nil {
+		delete(info.parent.children, info.name)
+	}
+}
+
+func (info *FileInfo) Remove(path string) bool {
+	child := info.LookUp(path)
+	if child != nil {
+		child.unlink()
+		return true
+	}
+	return false
+}
+
+func (info *FileInfo) isDir() bool {
+	return info.parent == nil || info.stat.Mode&syscall.S_IFDIR == syscall.S_IFDIR
+}
+
+func (info *FileInfo) addChanges(oldInfo *FileInfo, changes *[]Change) {
+	if oldInfo == nil {
+		// add
+		change := Change{
+			Path: info.path(),
+			Kind: ChangeAdd,
+		}
+		*changes = append(*changes, change)
+	}
+
+	// We make a copy so we can modify it to detect additions
+	// also, we only recurse on the old dir if the new info is a directory
+	// otherwise any previous delete/change is considered recursive
+	oldChildren := make(map[string]*FileInfo)
+	if oldInfo != nil && info.isDir() {
+		for k, v := range oldInfo.children {
+			oldChildren[k] = v
+		}
+	}
+
+	for name, newChild := range info.children {
+		oldChild, _ := oldChildren[name]
+		if oldChild != nil {
+			// change?
+			oldStat := &oldChild.stat
+			newStat := &newChild.stat
+			// Note: We can't compare inode or ctime or blocksize here, because these change
+			// when copying a file into a container. However, that is not generally a problem
+			// because any content change will change mtime, and any status change should
+			// be visible when actually comparing the stat fields. The only time this
+			// breaks down is if some code intentionally hides a change by setting
+			// back mtime
+			oldMtime := syscall.NsecToTimeval(oldStat.Mtim.Nano())
+			newMtime := syscall.NsecToTimeval(oldStat.Mtim.Nano())
+			if oldStat.Mode != newStat.Mode ||
+				oldStat.Uid != newStat.Uid ||
+				oldStat.Gid != newStat.Gid ||
+				oldStat.Rdev != newStat.Rdev ||
+				// Don't look at size for dirs, its not a good measure of change
+				(oldStat.Size != newStat.Size && oldStat.Mode&syscall.S_IFDIR != syscall.S_IFDIR) ||
+				oldMtime.Sec != newMtime.Sec ||
+				oldMtime.Usec != newMtime.Usec {
+				change := Change{
+					Path: newChild.path(),
+					Kind: ChangeModify,
+				}
+				*changes = append(*changes, change)
+			}
+
+			// Remove from copy so we can detect deletions
+			delete(oldChildren, name)
+		}
+
+		newChild.addChanges(oldChild, changes)
+	}
+	for _, oldChild := range oldChildren {
+		// delete
+		change := Change{
+			Path: oldChild.path(),
+			Kind: ChangeDelete,
+		}
+		*changes = append(*changes, change)
+	}
+
+}
+
+func (info *FileInfo) Changes(oldInfo *FileInfo) []Change {
+	var changes []Change
+
+	info.addChanges(oldInfo, &changes)
+
+	return changes
+}
+
+func newRootFileInfo() *FileInfo {
+	root := &FileInfo{
+		name:     "/",
+		children: make(map[string]*FileInfo),
+	}
+	return root
+}
+
+func applyLayer(root *FileInfo, layer string) error {
+	err := filepath.Walk(layer, func(layerPath string, f os.FileInfo, err error) error {
 		if err != nil {
 			return err
 		}
-		path = filepath.Join("/", path)
 
 		// Skip root
-		if path == "/" {
+		if layerPath == layer {
 			return nil
 		}
 
-		// Skip AUFS metadata
-		if matched, err := filepath.Match("/.wh..wh.*", path); err != nil || matched {
+		// rebase path
+		relPath, err := filepath.Rel(layer, layerPath)
+		if err != nil {
 			return err
 		}
+		relPath = filepath.Join("/", relPath)
 
-		change := Change{
-			Path: path,
+		// Skip AUFS metadata
+		if matched, err := filepath.Match("/.wh..wh.*", relPath); err != nil || matched {
+			if err != nil || !f.IsDir() {
+				return err
+			}
+			return filepath.SkipDir
+		}
+
+		var layerStat syscall.Stat_t
+		err = syscall.Lstat(layerPath, &layerStat)
+		if err != nil {
+			return err
 		}
 
-		// Find out what kind of modification happened
-		file := filepath.Base(path)
+		file := filepath.Base(relPath)
 		// If there is a whiteout, then the file was removed
 		if strings.HasPrefix(file, ".wh.") {
 			originalFile := file[len(".wh."):]
-			change.Path = filepath.Join(filepath.Dir(path), originalFile)
-			change.Kind = ChangeDelete
+			deletePath := filepath.Join(filepath.Dir(relPath), originalFile)
+
+			root.Remove(deletePath)
 		} else {
-			// Otherwise, the file was added
-			change.Kind = ChangeAdd
-
-			// ...Unless it already existed in a top layer, in which case, it's a modification
-			for _, layer := range layers {
-				stat, err := os.Stat(filepath.Join(layer, path))
-				if err != nil && !os.IsNotExist(err) {
-					return err
+			// Added or changed file
+			existing := root.LookUp(relPath)
+			if existing != nil {
+				// Changed file
+				existing.stat = layerStat
+				if !existing.isDir() {
+					// Changed from dir to non-dir, delete all previous files
+					existing.children = make(map[string]*FileInfo)
+				}
+			} else {
+				// Added file
+				parent := root.LookUp(filepath.Dir(relPath))
+				if parent == nil {
+					return fmt.Errorf("collectFileInfo: Unexpectedly no parent for %s", relPath)
 				}
-				if err == nil {
-					// The file existed in the top layer, so that's a modification
-
-					// However, if it's a directory, maybe it wasn't actually modified.
-					// If you modify /foo/bar/baz, then /foo will be part of the changed files only because it's the parent of bar
-					if stat.IsDir() && f.IsDir() {
-						if f.Size() == stat.Size() && f.Mode() == stat.Mode() && f.ModTime() == stat.ModTime() {
-							// Both directories are the same, don't record the change
-							return nil
-						}
-					}
-					change.Kind = ChangeModify
-					break
+
+				info := &FileInfo{
+					name:     filepath.Base(relPath),
+					children: make(map[string]*FileInfo),
+					parent:   parent,
+					stat:     layerStat,
 				}
+
+				parent.children[info.name] = info
 			}
 		}
+		return nil
+	})
+	return err
+}
+
+func collectFileInfo(sourceDir string) (*FileInfo, error) {
+	root := newRootFileInfo()
+
+	err := filepath.Walk(sourceDir, func(path string, f os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+
+		// Rebase path
+		relPath, err := filepath.Rel(sourceDir, path)
+		if err != nil {
+			return err
+		}
+		relPath = filepath.Join("/", relPath)
+
+		if relPath == "/" {
+			return nil
+		}
+
+		parent := root.LookUp(filepath.Dir(relPath))
+		if parent == nil {
+			return fmt.Errorf("collectFileInfo: Unexpectedly no parent for %s", relPath)
+		}
+
+		info := &FileInfo{
+			name:     filepath.Base(relPath),
+			children: make(map[string]*FileInfo),
+			parent:   parent,
+		}
+
+		if err := syscall.Lstat(path, &info.stat); err != nil {
+			return err
+		}
+
+		parent.children[info.name] = info
 
-		// Record change
-		changes = append(changes, change)
 		return nil
 	})
-	if err != nil && !os.IsNotExist(err) {
+	if err != nil {
 		return nil, err
 	}
-	return changes, nil
+	return root, nil
+}
+
+func ChangesLayers(newDir string, layers []string) ([]Change, error) {
+	newRoot, err := collectFileInfo(newDir)
+	if err != nil {
+		return nil, err
+	}
+	oldRoot := newRootFileInfo()
+	for i := len(layers) - 1; i >= 0; i-- {
+		layer := layers[i]
+		if err = applyLayer(oldRoot, layer); err != nil {
+			return nil, err
+		}
+	}
+
+	return newRoot.Changes(oldRoot), nil
+}
+
+func ChangesDirs(newDir, oldDir string) ([]Change, error) {
+	oldRoot, err := collectFileInfo(oldDir)
+	if err != nil {
+		return nil, err
+	}
+	newRoot, err := collectFileInfo(newDir)
+	if err != nil {
+		return nil, err
+	}
+
+	// Ignore changes in .docker-id
+	_ = newRoot.Remove("/.docker-id")
+	_ = oldRoot.Remove("/.docker-id")
+
+	return newRoot.Changes(oldRoot), nil
 }

+ 59 - 9
container.go

@@ -297,12 +297,17 @@ func (settings *NetworkSettings) PortMappingAPI() []APIPort {
 
 // Inject the io.Reader at the given path. Note: do not close the reader
 func (container *Container) Inject(file io.Reader, pth string) error {
+	if err := container.EnsureMounted(); err != nil {
+		return err
+	}
+
 	// Make sure the directory exists
-	if err := os.MkdirAll(path.Join(container.rwPath(), path.Dir(pth)), 0755); err != nil {
+	if err := os.MkdirAll(path.Join(container.RootfsPath(), path.Dir(pth)), 0755); err != nil {
 		return err
 	}
+
 	// FIXME: Handle permissions/already existing dest
-	dest, err := os.Create(path.Join(container.rwPath(), pth))
+	dest, err := os.Create(path.Join(container.RootfsPath(), pth))
 	if err != nil {
 		return err
 	}
@@ -743,6 +748,7 @@ func (container *Container) Start(hostConfig *HostConfig) error {
 	}
 
 	params := []string{
+		"lxc-start",
 		"-n", container.ID,
 		"-f", container.lxcConfigPath(),
 		"--",
@@ -791,7 +797,21 @@ func (container *Container) Start(hostConfig *HostConfig) error {
 	params = append(params, "--", container.Path)
 	params = append(params, container.Args...)
 
-	container.cmd = exec.Command("lxc-start", params...)
+	if RootIsShared() {
+		// lxc-start really needs / to be private, or all kinds of stuff break
+		// What we really want is to clone into a new namespace and then
+		// mount / MS_REC|MS_PRIVATE, but since we can't really clone or fork
+		// without exec in go we have to do this horrible shell hack...
+		shellString :=
+			"mount --make-rprivate /; exec " +
+				utils.ShellQuoteArguments(params)
+
+		params = []string{
+			"unshare", "-m", "--", "/bin/sh", "-c", shellString,
+		}
+	}
+
+	container.cmd = exec.Command(params[0], params[1:]...)
 
 	// Setup logging of stdout and stderr to disk
 	if err := container.runtime.LogToDisk(container.stdout, container.logPath("json"), "stdout"); err != nil {
@@ -1111,7 +1131,15 @@ func (container *Container) Resize(h, w int) error {
 }
 
 func (container *Container) ExportRw() (Archive, error) {
-	return Tar(container.rwPath(), Uncompressed)
+	if err := container.EnsureMounted(); err != nil {
+		return nil, err
+	}
+
+	image, err := container.GetImage()
+	if err != nil {
+		return nil, err
+	}
+	return image.ExportChanges(container.runtime, container.RootfsPath(), container.rwPath(), container.ID)
 }
 
 func (container *Container) RwChecksum() (string, error) {
@@ -1153,20 +1181,33 @@ func (container *Container) EnsureMounted() error {
 	return container.Mount()
 }
 
+func (container *Container) EnsureUnmounted() error {
+	if mounted, err := container.Mounted(); err != nil {
+		return err
+	} else if !mounted {
+		return nil
+	}
+	return container.Unmount()
+}
+
 func (container *Container) Mount() error {
 	image, err := container.GetImage()
 	if err != nil {
 		return err
 	}
-	return image.Mount(container.RootfsPath(), container.rwPath())
+	return image.Mount(container.runtime, container.RootfsPath(), container.rwPath(), container.ID)
 }
 
 func (container *Container) Changes() ([]Change, error) {
+	if err := container.EnsureMounted(); err != nil {
+		return nil, err
+	}
+
 	image, err := container.GetImage()
 	if err != nil {
 		return nil, err
 	}
-	return image.Changes(container.rwPath())
+	return image.Changes(container.runtime, container.RootfsPath(), container.rwPath(), container.ID)
 }
 
 func (container *Container) GetImage() (*Image, error) {
@@ -1177,11 +1218,20 @@ func (container *Container) GetImage() (*Image, error) {
 }
 
 func (container *Container) Mounted() (bool, error) {
-	return Mounted(container.RootfsPath())
+	image, err := container.GetImage()
+	if err != nil {
+		return false, err
+	}
+	return image.Mounted(container.runtime, container.RootfsPath(), container.rwPath())
 }
 
 func (container *Container) Unmount() error {
-	return Unmount(container.RootfsPath())
+	image, err := container.GetImage()
+	if err != nil {
+		return err
+	}
+	err = image.Unmount(container.runtime, container.RootfsPath(), container.ID)
+	return err
 }
 
 // ShortID returns a shorthand version of the container's id for convenience.
@@ -1269,5 +1319,5 @@ func (container *Container) Copy(resource string) (Archive, error) {
 		filter = []string{path.Base(basePath)}
 		basePath = path.Dir(basePath)
 	}
-	return TarFilter(basePath, Uncompressed, filter)
+	return TarFilter(basePath, Uncompressed, filter, true, nil)
 }

+ 14 - 0
deviceset.go

@@ -0,0 +1,14 @@
+package docker
+
+type DeviceSet interface {
+	AddDevice(hash, baseHash string) error
+	SetInitialized(hash string) error
+	DeactivateDevice(hash string) error
+	RemoveDevice(hash string) error
+	MountDevice(hash, path string) error
+	UnmountDevice(hash, path string) error
+	HasDevice(hash string) bool
+	HasInitializedDevice(hash string) bool
+	HasActivatedDevice(hash string) bool
+	Shutdown() error
+}

+ 909 - 0
devmapper/deviceset_devmapper.go

@@ -0,0 +1,909 @@
+package devmapper
+
+import (
+	"encoding/json"
+	"fmt"
+	"github.com/dotcloud/docker/utils"
+	"io"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"syscall"
+)
+
+const (
+	defaultDataLoopbackSize     int64  = 100 * 1024 * 1024 * 1024
+	defaultMetaDataLoopbackSize int64  = 2 * 1024 * 1024 * 1024
+	defaultBaseFsSize           uint64 = 10 * 1024 * 1024 * 1024
+)
+
+type DevInfo struct {
+	Hash          string       `json:"-"`
+	DeviceId      int          `json:"device_id"`
+	Size          uint64       `json:"size"`
+	TransactionId uint64       `json:"transaction_id"`
+	Initialized   bool         `json:"initialized"`
+	devices       *DeviceSetDM `json:"-"`
+}
+
+type MetaData struct {
+	Devices map[string]*DevInfo `json:devices`
+}
+
+type DeviceSetDM struct {
+	MetaData
+	initialized      bool
+	root             string
+	devicePrefix     string
+	TransactionId    uint64
+	NewTransactionId uint64
+	nextFreeDevice   int
+	activeMounts     map[string]int
+}
+
+func getDevName(name string) string {
+	return "/dev/mapper/" + name
+}
+
+func (info *DevInfo) Name() string {
+	hash := info.Hash
+	if hash == "" {
+		hash = "base"
+	}
+	return fmt.Sprintf("%s-%s", info.devices.devicePrefix, hash)
+}
+
+func (info *DevInfo) DevName() string {
+	return getDevName(info.Name())
+}
+
+func (devices *DeviceSetDM) loopbackDir() string {
+	return path.Join(devices.root, "loopback")
+}
+
+func (devices *DeviceSetDM) jsonFile() string {
+	return path.Join(devices.loopbackDir(), "json")
+}
+
+func (devices *DeviceSetDM) getPoolName() string {
+	return devices.devicePrefix + "-pool"
+}
+
+func (devices *DeviceSetDM) getPoolDevName() string {
+	return getDevName(devices.getPoolName())
+}
+
+func (devices *DeviceSetDM) createTask(t TaskType, name string) (*Task, error) {
+	task := TaskCreate(t)
+	if task == nil {
+		return nil, fmt.Errorf("Can't create task of type %d", int(t))
+	}
+	if err := task.SetName(name); err != nil {
+		return nil, fmt.Errorf("Can't set task name %s", name)
+	}
+	return task, nil
+}
+
+func (devices *DeviceSetDM) getInfo(name string) (*Info, error) {
+	task, err := devices.createTask(DeviceInfo, name)
+	if task == nil {
+		return nil, err
+	}
+	if err := task.Run(); err != nil {
+		return nil, err
+	}
+	return task.GetInfo()
+}
+
+func (devices *DeviceSetDM) getStatus(name string) (uint64, uint64, string, string, error) {
+	task, err := devices.createTask(DeviceStatus, name)
+	if task == nil {
+		utils.Debugf("getStatus: Error createTask: %s", err)
+		return 0, 0, "", "", err
+	}
+	if err := task.Run(); err != nil {
+		utils.Debugf("getStatus: Error Run: %s", err)
+		return 0, 0, "", "", err
+	}
+
+	devinfo, err := task.GetInfo()
+	if err != nil {
+		utils.Debugf("getStatus: Error GetInfo: %s", err)
+		return 0, 0, "", "", err
+	}
+	if devinfo.Exists == 0 {
+		utils.Debugf("getStatus: Non existing device %s", name)
+		return 0, 0, "", "", fmt.Errorf("Non existing device %s", name)
+	}
+
+	_, start, length, target_type, params := task.GetNextTarget(0)
+	return start, length, target_type, params, nil
+}
+
+func (devices *DeviceSetDM) setTransactionId(oldId uint64, newId uint64) error {
+	task, err := devices.createTask(DeviceTargetMsg, devices.getPoolDevName())
+	if task == nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if err := task.SetSector(0); err != nil {
+		return fmt.Errorf("Can't set sector")
+	}
+
+	if err := task.SetMessage(fmt.Sprintf("set_transaction_id %d %d", oldId, newId)); err != nil {
+		return fmt.Errorf("Can't set message")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running setTransactionId")
+	}
+	return nil
+}
+
+func (devices *DeviceSetDM) hasImage(name string) bool {
+	dirname := devices.loopbackDir()
+	filename := path.Join(dirname, name)
+
+	_, err := os.Stat(filename)
+	return err == nil
+}
+
+func (devices *DeviceSetDM) ensureImage(name string, size int64) (string, error) {
+	dirname := devices.loopbackDir()
+	filename := path.Join(dirname, name)
+
+	if err := os.MkdirAll(dirname, 0700); err != nil && !os.IsExist(err) {
+		return "", err
+	}
+
+	if _, err := os.Stat(filename); err != nil {
+		if !os.IsNotExist(err) {
+			return "", err
+		}
+		utils.Debugf("Creating loopback file %s for device-manage use", filename)
+		file, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE, 0600)
+		if err != nil {
+			return "", err
+		}
+
+		if err = file.Truncate(size); err != nil {
+			return "", err
+		}
+	}
+	return filename, nil
+}
+
+func (devices *DeviceSetDM) createPool(dataFile *os.File, metadataFile *os.File) error {
+	utils.Debugf("Activating device-mapper pool %s", devices.getPoolName())
+	task, err := devices.createTask(DeviceCreate, devices.getPoolName())
+	if task == nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	size, err := GetBlockDeviceSize(dataFile)
+	if err != nil {
+		return fmt.Errorf("Can't get data size")
+	}
+
+	params := metadataFile.Name() + " " + dataFile.Name() + " 512 8192"
+	if err := task.AddTarget(0, size/512, "thin-pool", params); err != nil {
+		return fmt.Errorf("Can't add target")
+	}
+
+	var cookie uint32 = 0
+	if err := task.SetCookie(&cookie, 0); err != nil {
+		return fmt.Errorf("Can't set cookie")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running DeviceCreate")
+	}
+
+	UdevWait(cookie)
+
+	return nil
+}
+
+func (devices *DeviceSetDM) suspendDevice(info *DevInfo) error {
+	task, err := devices.createTask(DeviceSuspend, info.Name())
+	if task == nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running DeviceSuspend")
+	}
+	return nil
+}
+
+func (devices *DeviceSetDM) resumeDevice(info *DevInfo) error {
+	task, err := devices.createTask(DeviceResume, info.Name())
+	if task == nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	var cookie uint32 = 0
+	if err := task.SetCookie(&cookie, 0); err != nil {
+		return fmt.Errorf("Can't set cookie")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running DeviceSuspend")
+	}
+
+	UdevWait(cookie)
+
+	return nil
+}
+
+func (devices *DeviceSetDM) createDevice(deviceId int) error {
+	task, err := devices.createTask(DeviceTargetMsg, devices.getPoolDevName())
+	if task == nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if err := task.SetSector(0); err != nil {
+		return fmt.Errorf("Can't set sector")
+	}
+
+	if err := task.SetMessage(fmt.Sprintf("create_thin %d", deviceId)); err != nil {
+		return fmt.Errorf("Can't set message")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running createDevice")
+	}
+	return nil
+}
+
+func (devices *DeviceSetDM) createSnapDevice(deviceId int, baseInfo *DevInfo) error {
+	devinfo, _ := devices.getInfo(baseInfo.Name())
+	doSuspend := devinfo != nil && devinfo.Exists != 0
+
+	if doSuspend {
+		if err := devices.suspendDevice(baseInfo); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+
+	task, err := devices.createTask(DeviceTargetMsg, devices.getPoolDevName())
+	if task == nil {
+		devices.resumeDevice(baseInfo)
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if err := task.SetSector(0); err != nil {
+		devices.resumeDevice(baseInfo)
+		return fmt.Errorf("Can't set sector")
+	}
+
+	if err := task.SetMessage(fmt.Sprintf("create_snap %d %d", deviceId, baseInfo.DeviceId)); err != nil {
+		devices.resumeDevice(baseInfo)
+		return fmt.Errorf("Can't set message")
+	}
+
+	if err := task.Run(); err != nil {
+		devices.resumeDevice(baseInfo)
+		return fmt.Errorf("Error running DeviceCreate")
+	}
+
+	if doSuspend {
+		if err := devices.resumeDevice(baseInfo); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (devices *DeviceSetDM) deleteDevice(deviceId int) error {
+	task, err := devices.createTask(DeviceTargetMsg, devices.getPoolDevName())
+	if task == nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if err := task.SetSector(0); err != nil {
+		return fmt.Errorf("Can't set sector")
+	}
+
+	if err := task.SetMessage(fmt.Sprintf("delete %d", deviceId)); err != nil {
+		return fmt.Errorf("Can't set message")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running deleteDevice")
+	}
+	return nil
+}
+
+func (devices *DeviceSetDM) removeDevice(name string) error {
+	task, err := devices.createTask(DeviceRemove, name)
+	if task == nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	if err = task.Run(); err != nil {
+		return fmt.Errorf("Error running removeDevice")
+	}
+	return nil
+}
+
+func (devices *DeviceSetDM) activateDevice(info *DevInfo) error {
+	task, err := devices.createTask(DeviceCreate, info.Name())
+	if task == nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	params := fmt.Sprintf("%s %d", devices.getPoolDevName(), info.DeviceId)
+	if err := task.AddTarget(0, info.Size/512, "thin", params); err != nil {
+		return fmt.Errorf("Can't add target")
+	}
+
+	var cookie uint32 = 0
+	if err := task.SetCookie(&cookie, 0); err != nil {
+		return fmt.Errorf("Can't set cookie")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running DeviceCreate")
+	}
+
+	UdevWait(cookie)
+
+	return nil
+}
+
+func (devices *DeviceSetDM) allocateDeviceId() int {
+	// TODO: Add smarter reuse of deleted devices
+	id := devices.nextFreeDevice
+	devices.nextFreeDevice = devices.nextFreeDevice + 1
+	return id
+}
+
+func (devices *DeviceSetDM) allocateTransactionId() uint64 {
+	devices.NewTransactionId = devices.NewTransactionId + 1
+	return devices.NewTransactionId
+}
+
+func (devices *DeviceSetDM) saveMetadata() error {
+	jsonData, err := json.Marshal(devices.MetaData)
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	tmpFile, err := ioutil.TempFile(filepath.Dir(devices.jsonFile()), ".json")
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	n, err := tmpFile.Write(jsonData)
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	if n < len(jsonData) {
+		return io.ErrShortWrite
+	}
+	if err := tmpFile.Sync(); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	if err := tmpFile.Close(); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	if err := os.Rename(tmpFile.Name(), devices.jsonFile()); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if devices.NewTransactionId != devices.TransactionId {
+		if err = devices.setTransactionId(devices.TransactionId, devices.NewTransactionId); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+		devices.TransactionId = devices.NewTransactionId
+	}
+	return nil
+}
+
+func (devices *DeviceSetDM) registerDevice(id int, hash string, size uint64) (*DevInfo, error) {
+	info := &DevInfo{
+		Hash:          hash,
+		DeviceId:      id,
+		Size:          size,
+		TransactionId: devices.allocateTransactionId(),
+		Initialized:   false,
+		devices:       devices,
+	}
+
+	devices.Devices[hash] = info
+	if err := devices.saveMetadata(); err != nil {
+		// Try to remove unused device
+		delete(devices.Devices, hash)
+		return nil, err
+	}
+
+	return info, nil
+}
+
+func (devices *DeviceSetDM) activateDeviceIfNeeded(hash string) error {
+	utils.Debugf("activateDeviceIfNeeded()")
+	info := devices.Devices[hash]
+	if info == nil {
+		return fmt.Errorf("Unknown device %s", hash)
+	}
+
+	if devinfo, _ := devices.getInfo(info.Name()); devinfo != nil && devinfo.Exists != 0 {
+		return nil
+	}
+
+	return devices.activateDevice(info)
+}
+
+func (devices *DeviceSetDM) createFilesystem(info *DevInfo) error {
+	devname := info.DevName()
+
+	err := exec.Command("mkfs.ext4", "-E", "discard,lazy_itable_init=0,lazy_journal_init=0", devname).Run()
+	if err != nil {
+		err = exec.Command("mkfs.ext4", "-E", "discard,lazy_itable_init=0", devname).Run()
+	}
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	return nil
+}
+
+func (devices *DeviceSetDM) loadMetaData() error {
+	_, _, _, params, err := devices.getStatus(devices.getPoolName())
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if _, err := fmt.Sscanf(params, "%d", &devices.TransactionId); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	devices.NewTransactionId = devices.TransactionId
+
+	jsonData, err := ioutil.ReadFile(devices.jsonFile())
+	if err != nil && !os.IsNotExist(err) {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	devices.MetaData.Devices = make(map[string]*DevInfo)
+	if jsonData != nil {
+		if err := json.Unmarshal(jsonData, &devices.MetaData); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+
+	for hash, d := range devices.Devices {
+		d.Hash = hash
+		d.devices = devices
+
+		if d.DeviceId >= devices.nextFreeDevice {
+			devices.nextFreeDevice = d.DeviceId + 1
+		}
+
+		// If the transaction id is larger than the actual one we lost the device due to some crash
+		if d.TransactionId > devices.TransactionId {
+			utils.Debugf("Removing lost device %s with id %d", hash, d.TransactionId)
+			delete(devices.Devices, hash)
+		}
+	}
+	return nil
+}
+
+func (devices *DeviceSetDM) setupBaseImage() error {
+	oldInfo := devices.Devices[""]
+	if oldInfo != nil && oldInfo.Initialized {
+		return nil
+	}
+
+	if oldInfo != nil && !oldInfo.Initialized {
+		utils.Debugf("Removing uninitialized base image")
+		if err := devices.RemoveDevice(""); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+
+	utils.Debugf("Initializing base device-manager snapshot")
+
+	id := devices.allocateDeviceId()
+
+	// Create initial device
+	if err := devices.createDevice(id); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	info, err := devices.registerDevice(id, "", defaultBaseFsSize)
+	if err != nil {
+		_ = devices.deleteDevice(id)
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	utils.Debugf("Creating filesystem on base device-manager snapshot")
+
+	if err = devices.activateDeviceIfNeeded(""); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if err := devices.createFilesystem(info); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	info.Initialized = true
+	if err = devices.saveMetadata(); err != nil {
+		info.Initialized = false
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	return nil
+}
+
+func setCloseOnExec(name string) {
+	fileInfos, _ := ioutil.ReadDir("/proc/self/fd")
+	if fileInfos != nil {
+		for _, i := range fileInfos {
+			link, _ := os.Readlink(filepath.Join("/proc/self/fd", i.Name()))
+			if link ==  name {
+				fd, err := strconv.Atoi(i.Name())
+				if err == nil {
+					syscall.CloseOnExec(fd)
+				}
+			}
+		}
+	}
+}
+
+func (devices *DeviceSetDM) initDevmapper() error {
+	info, err := devices.getInfo(devices.getPoolName())
+	if info == nil {
+		utils.Debugf("Error device getInfo: %s", err)
+		return err
+	}
+	utils.Debugf("initDevmapper(). Pool exists: %v", info.Exists)
+
+	// It seems libdevmapper opens this without O_CLOEXEC, and go exec will not close files
+	// that are not Close-on-exec, and lxc-start will die if it inherits any unexpected files,
+	// so we add this badhack to make sure it closes itself
+	setCloseOnExec("/dev/mapper/control")
+
+	if info.Exists != 0 {
+		/* Pool exists, assume everything is up */
+		if err := devices.loadMetaData(); err != nil {
+			utils.Debugf("Error device loadMetaData: %s\n", err)
+			return err
+		}
+		if err := devices.setupBaseImage(); err != nil {
+			utils.Debugf("Error device setupBaseImage: %s\n", err)
+			return err
+		}
+		return nil
+	}
+
+	/* If we create the loopback mounts we also need to initialize the base fs */
+	createdLoopback := !devices.hasImage("data") || !devices.hasImage("metadata")
+
+	data, err := devices.ensureImage("data", defaultDataLoopbackSize)
+	if err != nil {
+		utils.Debugf("Error device ensureImage (data): %s\n", err)
+		return err
+	}
+
+	metadata, err := devices.ensureImage("metadata", defaultMetaDataLoopbackSize)
+	if err != nil {
+		utils.Debugf("Error device ensureImage (metadata): %s\n", err)
+		return err
+	}
+
+	dataFile, err := AttachLoopDevice(data)
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	defer dataFile.Close()
+
+	metadataFile, err := AttachLoopDevice(metadata)
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	defer metadataFile.Close()
+
+	if err := devices.createPool(dataFile, metadataFile); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if !createdLoopback {
+		if err = devices.loadMetaData(); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+
+	if err := devices.setupBaseImage(); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	return nil
+}
+
+func (devices *DeviceSetDM) AddDevice(hash, baseHash string) error {
+	if err := devices.ensureInit(); err != nil {
+		utils.Debugf("Error init: %s\n", err)
+		return err
+	}
+
+	if devices.Devices[hash] != nil {
+		return fmt.Errorf("hash %s already exists", hash)
+	}
+
+	baseInfo := devices.Devices[baseHash]
+	if baseInfo == nil {
+		utils.Debugf("Base Hash not found")
+		return fmt.Errorf("Unknown base hash %s", baseHash)
+	}
+
+	deviceId := devices.allocateDeviceId()
+
+	if err := devices.createSnapDevice(deviceId, baseInfo); err != nil {
+		utils.Debugf("Error creating snap device: %s\n", err)
+		return err
+	}
+
+	if _, err := devices.registerDevice(deviceId, hash, baseInfo.Size); err != nil {
+		devices.deleteDevice(deviceId)
+		utils.Debugf("Error registering device: %s\n", err)
+		return err
+	}
+	return nil
+}
+
+func (devices *DeviceSetDM) RemoveDevice(hash string) error {
+	if err := devices.ensureInit(); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	info := devices.Devices[hash]
+	if info == nil {
+		return fmt.Errorf("hash %s doesn't exists", hash)
+	}
+
+	devinfo, _ := devices.getInfo(info.Name())
+	if devinfo != nil && devinfo.Exists != 0 {
+		if err := devices.removeDevice(info.Name()); err != nil {
+			utils.Debugf("Error removing device: %s\n", err)
+			return err
+		}
+	}
+
+	if info.Initialized {
+		info.Initialized = false
+		if err := devices.saveMetadata(); err != nil {
+			utils.Debugf("Error saving meta data: %s\n", err)
+			return err
+		}
+	}
+
+	if err := devices.deleteDevice(info.DeviceId); err != nil {
+		utils.Debugf("Error deleting device: %s\n", err)
+		return err
+	}
+
+	devices.allocateTransactionId()
+	delete(devices.Devices, info.Hash)
+
+	if err := devices.saveMetadata(); err != nil {
+		devices.Devices[info.Hash] = info
+		utils.Debugf("Error saving meta data: %s\n", err)
+		return err
+	}
+
+	return nil
+}
+
+func (devices *DeviceSetDM) DeactivateDevice(hash string) error {
+	if err := devices.ensureInit(); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	info := devices.Devices[hash]
+	if info == nil {
+		return fmt.Errorf("hash %s doesn't exists", hash)
+	}
+
+	devinfo, err := devices.getInfo(info.Name())
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	if devinfo.Exists != 0 {
+		if err := devices.removeDevice(info.Name()); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (devices *DeviceSetDM) Shutdown() error {
+	if !devices.initialized {
+		return nil
+	}
+
+	for path, count := range devices.activeMounts {
+		for i := count; i > 0; i-- {
+			if err := syscall.Unmount(path, 0); err != nil {
+				utils.Debugf("Shutdown unmounting %s, error: %s\n", path, err)
+			}
+		}
+		delete(devices.activeMounts, path)
+	}
+
+	for _, d := range devices.Devices {
+		if err := devices.DeactivateDevice(d.Hash); err != nil {
+			utils.Debugf("Shutdown deactivate %s , error: %s\n", d.Hash, err)
+		}
+	}
+
+	pool := devices.getPoolDevName()
+	if devinfo, err := devices.getInfo(pool); err == nil && devinfo.Exists != 0 {
+		if err := devices.removeDevice(pool); err != nil {
+			utils.Debugf("Shutdown deactivate %s , error: %s\n", pool, err)
+		}
+	}
+
+	return nil
+}
+
+func (devices *DeviceSetDM) MountDevice(hash, path string) error {
+	if err := devices.ensureInit(); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if err := devices.activateDeviceIfNeeded(hash); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	info := devices.Devices[hash]
+
+	err := syscall.Mount(info.DevName(), path, "ext4", syscall.MS_MGC_VAL, "discard")
+	if err != nil && err == syscall.EINVAL {
+		err = syscall.Mount(info.DevName(), path, "ext4", syscall.MS_MGC_VAL, "")
+	}
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	count := devices.activeMounts[path]
+	devices.activeMounts[path] = count + 1
+
+	return nil
+}
+
+func (devices *DeviceSetDM) UnmountDevice(hash, path string) error {
+	if err := syscall.Unmount(path, 0); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if count := devices.activeMounts[path]; count > 1 {
+		devices.activeMounts[path] = count - 1
+	} else {
+		delete(devices.activeMounts, path)
+	}
+
+	return nil
+}
+
+func (devices *DeviceSetDM) HasDevice(hash string) bool {
+	if err := devices.ensureInit(); err != nil {
+		return false
+	}
+	return devices.Devices[hash] != nil
+}
+
+func (devices *DeviceSetDM) HasInitializedDevice(hash string) bool {
+	if err := devices.ensureInit(); err != nil {
+		return false
+	}
+
+	info := devices.Devices[hash]
+	return info != nil && info.Initialized
+}
+
+func (devices *DeviceSetDM) HasActivatedDevice(hash string) bool {
+	if err := devices.ensureInit(); err != nil {
+		return false
+	}
+
+	info := devices.Devices[hash]
+	if info == nil {
+		return false
+	}
+	devinfo, _ := devices.getInfo(info.Name())
+	return devinfo != nil && devinfo.Exists != 0
+}
+
+func (devices *DeviceSetDM) SetInitialized(hash string) error {
+	if err := devices.ensureInit(); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	info := devices.Devices[hash]
+	if info == nil {
+		return fmt.Errorf("Unknown device %s", hash)
+	}
+
+	info.Initialized = true
+	if err := devices.saveMetadata(); err != nil {
+		info.Initialized = false
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	return nil
+}
+
+func (devices *DeviceSetDM) ensureInit() error {
+	if !devices.initialized {
+		devices.initialized = true
+		if err := devices.initDevmapper(); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+	return nil
+}
+
+func NewDeviceSetDM(root string) *DeviceSetDM {
+	SetDevDir("/dev")
+
+	base := filepath.Base(root)
+	if !strings.HasPrefix(base, "docker") {
+		base = "docker-" + base
+	}
+
+	return &DeviceSetDM{
+		initialized:  false,
+		root:         root,
+		devicePrefix: base,
+		MetaData:     MetaData{Devices: make(map[string]*DevInfo)},
+		activeMounts: make(map[string]int),
+	}
+}

+ 390 - 0
devmapper/devmapper.go

@@ -0,0 +1,390 @@
+package devmapper
+
+/*
+#cgo LDFLAGS: -L. -ldevmapper
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libdevmapper.h>
+#include <linux/loop.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <errno.h>
+
+char*			attach_loop_device(const char *filename, int *loop_fd_out)
+{
+  struct loop_info64	loopinfo = {0};
+  struct stat		st;
+  char			buf[64];
+  int			i, loop_fd, fd, start_index;
+  char*			loopname;
+
+  *loop_fd_out = -1;
+
+  start_index = 0;
+  fd = open("/dev/loop-control", O_RDONLY);
+  if (fd >= 0) {
+    start_index = ioctl(fd, LOOP_CTL_GET_FREE);
+    close(fd);
+
+    if (start_index < 0)
+      start_index = 0;
+  }
+
+  fd = open(filename, O_RDWR);
+  if (fd < 0) {
+    perror("open");
+    return NULL;
+  }
+
+  loop_fd = -1;
+  for (i = start_index ; loop_fd < 0 ; i++ ) {
+    if (sprintf(buf, "/dev/loop%d", i) < 0) {
+      close(fd);
+	perror("sprintf");
+      return NULL;
+    }
+
+    if (stat(buf, &st) || !S_ISBLK(st.st_mode)) {
+      close(fd);
+      return NULL;
+    }
+
+    loop_fd = open(buf, O_RDWR);
+    if (loop_fd < 0 && errno == ENOENT) {
+      close(fd);
+      perror("open");
+      fprintf (stderr, "no available loopback device!");
+      return NULL;
+    } else if (loop_fd < 0)
+      continue;
+
+    if (ioctl (loop_fd, LOOP_SET_FD, (void *)(size_t)fd) < 0) {
+      perror("ioctl");
+      close(loop_fd);
+      loop_fd = -1;
+      if (errno != EBUSY) {
+        close (fd);
+        fprintf (stderr, "cannot set up loopback device %s", buf);
+        return NULL;
+      }
+      continue;
+    }
+
+    close (fd);
+
+    strncpy((char*)loopinfo.lo_file_name, buf, LO_NAME_SIZE);
+    loopinfo.lo_offset = 0;
+    loopinfo.lo_flags = LO_FLAGS_AUTOCLEAR;
+
+    if (ioctl(loop_fd, LOOP_SET_STATUS64, &loopinfo) < 0) {
+      perror("ioctl1");
+      if (ioctl(loop_fd, LOOP_CLR_FD, 0) < 0) {
+        perror("ioctl2");
+      }
+      close(loop_fd);
+      fprintf (stderr, "cannot set up loopback device info");
+      return NULL;
+    }
+
+    loopname = strdup(buf);
+    if (loopname == NULL) {
+      close(loop_fd);
+      return NULL;
+    }
+
+    *loop_fd_out = loop_fd;
+    return loopname;
+  }
+  return NULL;
+}
+
+static int64_t
+get_block_size(int fd)
+{
+  uint64_t size;
+  if (ioctl(fd, BLKGETSIZE64, &size) == -1)
+    return -1;
+  return (int64_t)size;
+}
+
+*/
+import "C"
+
+import (
+	"errors"
+	"fmt"
+	"github.com/dotcloud/docker/utils"
+	"os"
+	"runtime"
+	"syscall"
+	"unsafe"
+)
+
+const (
+	DeviceCreate TaskType = iota
+	DeviceReload
+	DeviceRemove
+	DeviceRemoveAll
+	DeviceSuspend
+	DeviceResume
+	DeviceInfo
+	DeviceDeps
+	DeviceRename
+	DeviceVersion
+	DeviceStatus
+	DeviceTable
+	DeviceWaitevent
+	DeviceList
+	DeviceClear
+	DeviceMknodes
+	DeviceListVersions
+	DeviceTargetMsg
+	DeviceSetGeometry
+)
+
+var (
+	ErrTaskRun              = errors.New("dm_task_run failed")
+	ErrTaskSetName          = errors.New("dm_task_set_name failed")
+	ErrTaskSetMessage       = errors.New("dm_task_set_message failed")
+	ErrTaskSetAddNode       = errors.New("dm_task_set_add_node failed")
+	ErrTaskSetRO            = errors.New("dm_task_set_ro failed")
+	ErrTaskAddTarget        = errors.New("dm_task_add_target failed")
+	ErrGetDriverVersion     = errors.New("dm_task_get_driver_version failed")
+	ErrAttachLoopbackDevice = errors.New("loopback mounting failed")
+	ErrGetBlockSize         = errors.New("Can't get block size")
+	ErrUdevWait             = errors.New("wait on udev cookie failed")
+	ErrSetDevDir            = errors.New("dm_set_dev_dir failed")
+	ErrGetLibraryVersion    = errors.New("dm_get_library_version failed")
+	ErrCreateRemoveTask     = errors.New("Can't create task of type DeviceRemove")
+	ErrRunRemoveDevice      = errors.New("running removeDevice failed")
+)
+
+type (
+	Task struct {
+		unmanaged *C.struct_dm_task
+	}
+	Info struct {
+		Exists        int
+		Suspended     int
+		LiveTable     int
+		InactiveTable int
+		OpenCount     int32
+		EventNr       uint32
+		Major         uint32
+		Minor         uint32
+		ReadOnly      int
+		TargetCount   int32
+	}
+	TaskType int
+)
+
+func (t *Task) destroy() {
+	if t != nil {
+		C.dm_task_destroy(t.unmanaged)
+		runtime.SetFinalizer(t, nil)
+	}
+}
+
+func TaskCreate(tasktype TaskType) *Task {
+	c_task := C.dm_task_create(C.int(tasktype))
+	if c_task == nil {
+		return nil
+	}
+	task := &Task{unmanaged: c_task}
+	runtime.SetFinalizer(task, (*Task).destroy)
+	return task
+}
+
+func (t *Task) Run() error {
+	if res := C.dm_task_run(t.unmanaged); res != 1 {
+		return ErrTaskRun
+	}
+	return nil
+}
+
+func (t *Task) SetName(name string) error {
+	c_name := C.CString(name)
+	defer free(c_name)
+
+	if res := C.dm_task_set_name(t.unmanaged, c_name); res != 1 {
+		if os.Getenv("DEBUG") != "" {
+			C.perror(C.CString(fmt.Sprintf("[debug] Error dm_task_set_name(%s, %#v)", name, t.unmanaged)))
+		}
+		return ErrTaskSetName
+	}
+	return nil
+}
+
+func (t *Task) SetMessage(message string) error {
+	c_message := C.CString(message)
+	defer free(c_message)
+
+	if res := C.dm_task_set_message(t.unmanaged, c_message); res != 1 {
+		return ErrTaskSetMessage
+	}
+	return nil
+}
+
+func (t *Task) SetSector(sector uint64) error {
+	if res := C.dm_task_set_sector(t.unmanaged, C.uint64_t(sector)); res != 1 {
+		return ErrTaskSetAddNode
+	}
+	return nil
+}
+
+func (t *Task) SetCookie(cookie *uint32, flags uint16) error {
+	c_cookie := C.uint32_t(*cookie)
+	if res := C.dm_task_set_cookie(t.unmanaged, &c_cookie, C.uint16_t(flags)); res != 1 {
+		return ErrTaskSetAddNode
+	}
+	*cookie = uint32(c_cookie)
+	return nil
+}
+
+func (t *Task) SetRo() error {
+	if res := C.dm_task_set_ro(t.unmanaged); res != 1 {
+		return ErrTaskSetRO
+	}
+	return nil
+}
+
+func (t *Task) AddTarget(start uint64, size uint64, ttype string, params string) error {
+	c_ttype := C.CString(ttype)
+	defer free(c_ttype)
+
+	c_params := C.CString(params)
+	defer free(c_params)
+
+	if res := C.dm_task_add_target(t.unmanaged, C.uint64_t(start), C.uint64_t(size), c_ttype, c_params); res != 1 {
+		return ErrTaskAddTarget
+	}
+	return nil
+}
+
+func (t *Task) GetDriverVersion() (string, error) {
+	buffer := C.CString(string(make([]byte, 128)))
+	defer free(buffer)
+
+	if res := C.dm_task_get_driver_version(t.unmanaged, buffer, 128); res != 1 {
+		return "", ErrGetDriverVersion
+	}
+	return C.GoString(buffer), nil
+}
+
+func (t *Task) GetInfo() (*Info, error) {
+	c_info := C.struct_dm_info{}
+	if res := C.dm_task_get_info(t.unmanaged, &c_info); res != 1 {
+		return nil, ErrGetDriverVersion
+	}
+	return &Info{
+		Exists:        int(c_info.exists),
+		Suspended:     int(c_info.suspended),
+		LiveTable:     int(c_info.live_table),
+		InactiveTable: int(c_info.inactive_table),
+		OpenCount:     int32(c_info.open_count),
+		EventNr:       uint32(c_info.event_nr),
+		Major:         uint32(c_info.major),
+		Minor:         uint32(c_info.minor),
+		ReadOnly:      int(c_info.read_only),
+		TargetCount:   int32(c_info.target_count),
+	}, nil
+}
+
+func (t *Task) GetNextTarget(next uintptr) (uintptr, uint64, uint64, string, string) {
+	var (
+		c_start, c_length       C.uint64_t
+		c_target_type, c_params *C.char
+	)
+
+	nextp := C.dm_get_next_target(t.unmanaged, unsafe.Pointer(next), &c_start, &c_length, &c_target_type, &c_params)
+	return uintptr(nextp), uint64(c_start), uint64(c_length), C.GoString(c_target_type), C.GoString(c_params)
+}
+
+func AttachLoopDevice(filename string) (*os.File, error) {
+	c_filename := C.CString(filename)
+	defer free(c_filename)
+
+	var fd C.int
+	res := C.attach_loop_device(c_filename, &fd)
+	if res == nil {
+		return nil, ErrAttachLoopbackDevice
+	}
+	defer free(res)
+
+	return os.NewFile(uintptr(fd), C.GoString(res)), nil
+}
+
+func getBlockSize(fd uintptr) int {
+	var size uint64
+
+	if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, C.BLKGETSIZE64, uintptr(unsafe.Pointer(&size))); err != 0 {
+		utils.Debugf("Error ioctl: %s", err)
+		return -1
+	}
+	return int(size)
+}
+
+func GetBlockDeviceSize(file *os.File) (uint64, error) {
+	if size := C.get_block_size(C.int(file.Fd())); size == -1 {
+		return 0, ErrGetBlockSize
+	} else {
+		return uint64(size), nil
+	}
+}
+
+func UdevWait(cookie uint32) error {
+	if res := C.dm_udev_wait(C.uint32_t(cookie)); res != 1 {
+		utils.Debugf("Failed to wait on udev cookie %d", cookie)
+		return ErrUdevWait
+	}
+	return nil
+}
+
+func LogInitVerbose(level int) {
+	C.dm_log_init_verbose(C.int(level))
+}
+
+func SetDevDir(dir string) error {
+	c_dir := C.CString(dir)
+	defer free(c_dir)
+
+	if res := C.dm_set_dev_dir(c_dir); res != 1 {
+		utils.Debugf("Error dm_set_dev_dir")
+		return ErrSetDevDir
+	}
+	return nil
+}
+
+func GetLibraryVersion() (string, error) {
+	buffer := C.CString(string(make([]byte, 128)))
+	defer free(buffer)
+
+	if res := C.dm_get_library_version(buffer, 128); res != 1 {
+		return "", ErrGetLibraryVersion
+	}
+	return C.GoString(buffer), nil
+}
+
+// Useful helper for cleanup
+func RemoveDevice(name string) error {
+	task := TaskCreate(DeviceRemove)
+	if task == nil {
+		return ErrCreateRemoveTask
+	}
+	if err := task.SetName(name); err != nil {
+		utils.Debugf("Can't set task name %s", name)
+		return err
+	}
+	if err := task.Run(); err != nil {
+		return ErrRunRemoveDevice
+	}
+	return nil
+}
+
+func free(p *C.char) {
+	C.free(unsafe.Pointer(p))
+}

+ 62 - 0
devmapper/docker-device-tool/device_tool.go

@@ -0,0 +1,62 @@
+package main
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/devmapper"
+	"os"
+)
+
+func usage() {
+	fmt.Printf("Usage: %s [snap new-id base-id] | [remove id] | [mount id mountpoint]\n", os.Args[0])
+	os.Exit(1)
+}
+
+func main() {
+	devices := devmapper.NewDeviceSetDM("/var/lib/docker")
+
+	if len(os.Args) < 2 {
+		usage()
+	}
+
+	cmd := os.Args[1]
+	if cmd == "snap" {
+		if len(os.Args) < 4 {
+			usage()
+		}
+
+		err := devices.AddDevice(os.Args[2], os.Args[3])
+		if err != nil {
+			fmt.Println("Can't create snap device: ", err)
+			os.Exit(1)
+		}
+	} else if cmd == "remove" {
+		if len(os.Args) < 3 {
+			usage()
+		}
+
+		err := devices.RemoveDevice(os.Args[2])
+		if err != nil {
+			fmt.Println("Can't remove device: ", err)
+			os.Exit(1)
+		}
+	} else if cmd == "mount" {
+		if len(os.Args) < 4 {
+			usage()
+		}
+
+		err := devices.MountDevice(os.Args[2], os.Args[3])
+		if err != nil {
+			fmt.Println("Can't create snap device: ", err)
+			os.Exit(1)
+		}
+	} else {
+		fmt.Printf("Unknown command %s\n", cmd)
+		if len(os.Args) < 4 {
+			usage()
+		}
+
+		os.Exit(1)
+	}
+
+	return
+}

+ 16 - 0
docker-init/docker-init.go

@@ -0,0 +1,16 @@
+package main
+
+import (
+	"github.com/dotcloud/docker"
+)
+
+var (
+	GITCOMMIT string
+	VERSION   string
+)
+
+func main() {
+	// Running in init mode
+	docker.SysInit()
+	return
+}

+ 3 - 2
docker/docker.go

@@ -4,6 +4,7 @@ import (
 	"flag"
 	"fmt"
 	"github.com/dotcloud/docker"
+	"github.com/dotcloud/docker/devmapper"
 	"github.com/dotcloud/docker/utils"
 	"io/ioutil"
 	"log"
@@ -122,7 +123,7 @@ func daemon(pidfile string, flGraphPath string, protoAddrs []string, autoRestart
 	defer removePidFile(pidfile)
 
 	c := make(chan os.Signal, 1)
-	signal.Notify(c, os.Interrupt, os.Kill, os.Signal(syscall.SIGTERM))
+	signal.Notify(c, os.Interrupt, os.Kill, syscall.SIGTERM)
 	go func() {
 		sig := <-c
 		log.Printf("Received signal '%v', exiting\n", sig)
@@ -133,7 +134,7 @@ func daemon(pidfile string, flGraphPath string, protoAddrs []string, autoRestart
 	if flDns != "" {
 		dns = []string{flDns}
 	}
-	server, err := docker.NewServer(flGraphPath, autoRestart, enableCors, dns)
+	server, err := docker.NewServer(flGraphPath, devmapper.NewDeviceSetDM(flGraphPath), autoRestart, enableCors, dns)
 	if err != nil {
 		return err
 	}

+ 5 - 2
graph_test.go

@@ -121,6 +121,9 @@ func TestRegister(t *testing.T) {
 }
 
 func TestMount(t *testing.T) {
+	runtime := mkRuntime(t)
+	defer nuke(runtime)
+
 	graph := tempGraph(t)
 	defer os.RemoveAll(graph.Root)
 	archive, err := fakeTar()
@@ -144,12 +147,12 @@ func TestMount(t *testing.T) {
 	if err := os.MkdirAll(rw, 0700); err != nil {
 		t.Fatal(err)
 	}
-	if err := image.Mount(rootfs, rw); err != nil {
+	if err := image.Mount(runtime, rootfs, rw, "testing"); err != nil {
 		t.Fatal(err)
 	}
 	// FIXME: test for mount contents
 	defer func() {
-		if err := Unmount(rootfs); err != nil {
+		if err := image.Unmount(runtime, rootfs, "testing"); err != nil {
 			t.Error(err)
 		}
 	}()

+ 11 - 14
hack/PACKAGERS.md

@@ -32,14 +32,14 @@ the process.
 
 ## System build dependencies
 
-To build docker, you will need the following system dependencies
+To build docker, you will need the following system dependencies:
 
 * An amd64 machine
 * A recent version of git and mercurial
-* Go version 1.1.2
+* Go version 1.2rc1
+* A copy of libdevmapper.a (statically compiled), and associated headers
 * A clean checkout of the source must be added to a valid Go [workspace](http://golang.org/doc/code.html#Workspaces)
-under the path *src/github.com/dotcloud/docker*. See 
-
+under the path *src/github.com/dotcloud/docker*.
 
 ## Go dependencies
 
@@ -55,15 +55,13 @@ NOTE: if you''re not able to package the exact version (to the exact commit) of
 please get in touch so we can remediate! Who knows what discrepancies can be caused by even the
 slightest deviation. We promise to do our best to make everybody happy.
 
+## Disabling CGO for the net package
 
-## Disabling CGO
-
-Make sure to disable CGO on your system, and then recompile the standard library on the build
-machine:
+Make sure to disable CGO on your system for the net package using `-tags netgo`,
+and then recompile the standard library on the build machine:
 
 ```bash
-export CGO_ENABLED=0
-cd /tmp && echo 'package main' > t.go && go test -a -i -v
+go install -ldflags '-w -linkmode external -extldflags "-static -Wl,--unresolved-symbols=ignore-in-shared-libs"' -tags netgo -a std
 ```
 
 ## Building Docker
@@ -71,7 +69,7 @@ cd /tmp && echo 'package main' > t.go && go test -a -i -v
 To build the docker binary, run the following command with the source checkout as the
 working directory:
 
-```
+```bash
 ./hack/make.sh binary
 ```
 
@@ -80,9 +78,9 @@ This will create a static binary under *./bundles/$VERSION/binary/docker-$VERSIO
 
 You are encouraged to use ./hack/make.sh without modification. If you must absolutely write
 your own script (are you really, really sure you need to? make.sh is really not that complicated),
-then please take care the respect the following:
+then please take care to respect the following:
 
-* In *./hack/make.sh*: $LDFLAGS, $VERSION and $GITCOMMIT
+* In *./hack/make.sh*: $LDFLAGS, $BUILDFLAGS, $VERSION and $GITCOMMIT
 * In *./hack/make/binary*: the exact build command to run
 
 You may be tempted to tweak these settings. In particular, being a rigorous maintainer, you may want
@@ -106,7 +104,6 @@ dependencies to be installed (see below).
 
 The test suite will also download a small test container, so you will need internet connectivity.
 
-
 ## Runtime dependencies
 
 To run properly, docker needs the following software to be installed at runtime:

+ 2 - 2
hack/make.sh

@@ -44,8 +44,8 @@ if [ -n "$(git status --porcelain)" ]; then
 fi
 
 # Use these flags when compiling the tests and final binary
-LDFLAGS="-X main.GITCOMMIT $GITCOMMIT -X main.VERSION $VERSION -d -w"
-
+LDFLAGS='-X main.GITCOMMIT "'$GITCOMMIT'" -X main.VERSION "'$VERSION'" -w -linkmode external -extldflags "-static -Wl,--unresolved-symbols=ignore-in-shared-libs"'
+BUILDFLAGS='-tags netgo'
 
 bundle() {
 	bundlescript=$1

+ 1 - 1
hack/make/binary

@@ -2,6 +2,6 @@
 
 DEST=$1
 
-if go build -o $DEST/docker-$VERSION -ldflags "$LDFLAGS" ./docker; then
+if go build -o $DEST/docker-$VERSION -ldflags "$LDFLAGS" $BUILDFLAGS ./docker; then
 	echo "Created binary: $DEST/docker-$VERSION"
 fi

+ 3 - 1
hack/make/test

@@ -1,3 +1,5 @@
+#!/bin/sh
+
 DEST=$1
 
 set -e
@@ -9,7 +11,7 @@ bundle_test() {
 		for test_dir in $(find_test_dirs); do (
 			set -x
 			cd $test_dir
-			go test -v -ldflags "$LDFLAGS"
+			DEBUG=1 go test -v -ldflags "$LDFLAGS" $BUILDFLAGS
 		)  done
 	} 2>&1 | tee $DEST/test.log
 }

+ 383 - 34
image.go

@@ -8,13 +8,12 @@ import (
 	"github.com/dotcloud/docker/utils"
 	"io"
 	"io/ioutil"
-	"log"
 	"os"
-	"os/exec"
 	"path"
 	"path/filepath"
 	"strconv"
 	"strings"
+	"syscall"
 	"time"
 )
 
@@ -136,69 +135,419 @@ func jsonPath(root string) string {
 	return path.Join(root, "json")
 }
 
-func MountAUFS(ro []string, rw string, target string) error {
-	// FIXME: Now mount the layers
-	rwBranch := fmt.Sprintf("%v=rw", rw)
-	roBranches := ""
-	for _, layer := range ro {
-		roBranches += fmt.Sprintf("%v=ro+wh:", layer)
+func mountPath(root string) string {
+	return path.Join(root, "mount")
+}
+
+// TarLayer returns a tar archive of the image's filesystem layer.
+func (image *Image) TarLayer(compression Compression) (Archive, error) {
+	layerPath, err := image.layer()
+	if err != nil {
+		return nil, err
 	}
-	branches := fmt.Sprintf("br:%v:%v", rwBranch, roBranches)
+	return Tar(layerPath, compression)
+}
+
+type TimeUpdate struct {
+	path string
+	time []syscall.Timeval
+	mode uint32
+}
 
-	branches += ",xino=/dev/shm/aufs.xino"
+func (image *Image) applyLayer(layer, target string) error {
+	var updateTimes []TimeUpdate
+	oldmask := syscall.Umask(0)
+	defer syscall.Umask(oldmask)
+	err := filepath.Walk(layer, func(srcPath string, f os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+
+		// Skip root
+		if srcPath == layer {
+			return nil
+		}
 
-	//if error, try to load aufs kernel module
-	if err := mount("none", target, "aufs", 0, branches); err != nil {
-		log.Printf("Kernel does not support AUFS, trying to load the AUFS module with modprobe...")
-		if err := exec.Command("modprobe", "aufs").Run(); err != nil {
-			return fmt.Errorf("Unable to load the AUFS module")
+		var srcStat syscall.Stat_t
+		err = syscall.Lstat(srcPath, &srcStat)
+		if err != nil {
+			return err
+		}
+
+		relPath, err := filepath.Rel(layer, srcPath)
+		if err != nil {
+			return err
 		}
-		log.Printf("...module loaded.")
-		if err := mount("none", target, "aufs", 0, branches); err != nil {
-			return fmt.Errorf("Unable to mount using aufs")
+
+		targetPath := filepath.Join(target, relPath)
+
+		// Skip AUFS metadata
+		if matched, err := filepath.Match(".wh..wh.*", relPath); err != nil || matched {
+			if err != nil || !f.IsDir() {
+				return err
+			}
+			return filepath.SkipDir
+		}
+
+		// Find out what kind of modification happened
+		file := filepath.Base(srcPath)
+
+		// If there is a whiteout, then the file was removed
+		if strings.HasPrefix(file, ".wh.") {
+			originalFile := file[len(".wh."):]
+			deletePath := filepath.Join(filepath.Dir(targetPath), originalFile)
+
+			err = os.RemoveAll(deletePath)
+			if err != nil {
+				return err
+			}
+		} else {
+			var targetStat = &syscall.Stat_t{}
+			err := syscall.Lstat(targetPath, targetStat)
+			if err != nil {
+				if !os.IsNotExist(err) {
+					return err
+				}
+				targetStat = nil
+			}
+
+			if targetStat != nil && !(targetStat.Mode&syscall.S_IFDIR == syscall.S_IFDIR && srcStat.Mode&syscall.S_IFDIR == syscall.S_IFDIR) {
+				// Unless both src and dest are directories we remove the target and recreate it
+				// This is a bit wasteful in the case of only a mode change, but that is unlikely
+				// to matter much
+				err = os.RemoveAll(targetPath)
+				if err != nil {
+					return err
+				}
+				targetStat = nil
+			}
+
+			if f.IsDir() {
+				// Source is a directory
+				if targetStat == nil {
+					err = syscall.Mkdir(targetPath, srcStat.Mode&07777)
+					if err != nil {
+						return err
+					}
+				}
+			} else if srcStat.Mode&syscall.S_IFLNK == syscall.S_IFLNK {
+				// Source is symlink
+				link, err := os.Readlink(srcPath)
+				if err != nil {
+					return err
+				}
+
+				err = os.Symlink(link, targetPath)
+				if err != nil {
+					return err
+				}
+			} else if srcStat.Mode&syscall.S_IFBLK == syscall.S_IFBLK ||
+				srcStat.Mode&syscall.S_IFCHR == syscall.S_IFCHR ||
+				srcStat.Mode&syscall.S_IFIFO == syscall.S_IFIFO ||
+				srcStat.Mode&syscall.S_IFSOCK == syscall.S_IFSOCK {
+				// Source is special file
+				err = syscall.Mknod(targetPath, srcStat.Mode, int(srcStat.Rdev))
+				if err != nil {
+					return err
+				}
+			} else if srcStat.Mode&syscall.S_IFREG == syscall.S_IFREG {
+				// Source is regular file
+				fd, err := syscall.Open(targetPath, syscall.O_CREAT|syscall.O_WRONLY, srcStat.Mode&07777)
+				if err != nil {
+					return err
+				}
+				dstFile := os.NewFile(uintptr(fd), targetPath)
+				srcFile, err := os.Open(srcPath)
+				if err != nil {
+					_ = dstFile.Close()
+					return err
+				}
+				err = CopyFile(dstFile, srcFile)
+				_ = dstFile.Close()
+				_ = srcFile.Close()
+				if err != nil {
+					return err
+				}
+			} else {
+				return fmt.Errorf("Unknown type for file %s", srcPath)
+			}
+
+			err = syscall.Lchown(targetPath, int(srcStat.Uid), int(srcStat.Gid))
+			if err != nil {
+				return err
+			}
+
+			if srcStat.Mode&syscall.S_IFLNK != syscall.S_IFLNK {
+				err = syscall.Chmod(targetPath, srcStat.Mode&07777)
+				if err != nil {
+					return err
+				}
+			}
+
+			ts := []syscall.Timeval{
+				syscall.NsecToTimeval(srcStat.Atim.Nano()),
+				syscall.NsecToTimeval(srcStat.Mtim.Nano()),
+			}
+
+			u := TimeUpdate{
+				path: targetPath,
+				time: ts,
+				mode: srcStat.Mode,
+			}
+
+			// Delay time updates until all other changes done, or it is
+			// overwritten for directories (by child changes)
+			updateTimes = append(updateTimes, u)
+		}
+		return nil
+	})
+	if err != nil {
+		return err
+	}
+
+	// We do this in reverse order so that children are updated before parents
+	for i := len(updateTimes) - 1; i >= 0; i-- {
+		update := updateTimes[i]
+
+		O_PATH := 010000000 // Not in syscall yet
+		var err error
+		if update.mode&syscall.S_IFLNK == syscall.S_IFLNK {
+			// Update time on the symlink via O_PATH + futimes(), if supported by the kernel
+
+			fd, err := syscall.Open(update.path, syscall.O_RDWR|O_PATH|syscall.O_NOFOLLOW, 0600)
+			if err == syscall.EISDIR || err == syscall.ELOOP {
+				// O_PATH not supported by kernel, nothing to do, ignore
+			} else if err != nil {
+				return err
+			} else {
+				syscall.Futimes(fd, update.time)
+				syscall.Close(fd)
+			}
+		} else {
+			err = syscall.Utimes(update.path, update.time)
+			if err != nil {
+				return err
+			}
 		}
 	}
+
 	return nil
 }
 
-// TarLayer returns a tar archive of the image's filesystem layer.
-func (image *Image) TarLayer(compression Compression) (Archive, error) {
-	layerPath, err := image.layer()
+func (image *Image) ensureImageDevice(devices DeviceSet) error {
+	if devices.HasInitializedDevice(image.ID) {
+		return nil
+	}
+
+	if image.Parent != "" && !devices.HasInitializedDevice(image.Parent) {
+		parentImg, err := image.GetParent()
+		if err != nil {
+			return fmt.Errorf("Error while getting parent image: %v", err)
+		}
+		err = parentImg.ensureImageDevice(devices)
+		if err != nil {
+			return err
+		}
+	}
+
+	root, err := image.root()
 	if err != nil {
-		return nil, err
+		return err
 	}
-	return Tar(layerPath, compression)
-}
 
-func (image *Image) Mount(root, rw string) error {
-	if mounted, err := Mounted(root); err != nil {
+	mountDir := mountPath(root)
+	if err := os.Mkdir(mountDir, 0600); err != nil && !os.IsExist(err) {
 		return err
-	} else if mounted {
-		return fmt.Errorf("%s is already mounted", root)
 	}
-	layers, err := image.layers()
+
+	mounted, err := Mounted(mountDir)
+	if err == nil && mounted {
+		utils.Debugf("Image %s is unexpectedly mounted, unmounting...", image.ID)
+		err = syscall.Unmount(mountDir, 0)
+		if err != nil {
+			return err
+		}
+	}
+
+	if devices.HasDevice(image.ID) {
+		utils.Debugf("Found non-initialized demove-mapper device for image %s, removing", image.ID)
+		err = devices.RemoveDevice(image.ID)
+		if err != nil {
+			return err
+		}
+	}
+
+	utils.Debugf("Creating device-mapper device for image id %s", image.ID)
+	if err := devices.AddDevice(image.ID, image.Parent); err != nil {
+		utils.Debugf("Error add device: %s", err)
+		return err
+	}
+
+	if err := devices.MountDevice(image.ID, mountDir); err != nil {
+		utils.Debugf("Error mounting device: %s", err)
+		devices.RemoveDevice(image.ID)
+		return err
+	}
+
+	if err := ioutil.WriteFile(path.Join(mountDir, ".docker-id"), []byte(image.ID), 0600); err != nil {
+		utils.Debugf("Error writing file: %s", err)
+		devices.UnmountDevice(image.ID, mountDir)
+		devices.RemoveDevice(image.ID)
+		return err
+	}
+
+	if err = image.applyLayer(layerPath(root), mountDir); err != nil {
+		utils.Debugf("Error applying layer: %s", err)
+		devices.UnmountDevice(image.ID, mountDir)
+		devices.RemoveDevice(image.ID)
+		return err
+	}
+
+	// The docker init layer is conceptually above all other layers, so we apply
+	// it for every image. This is safe because the layer directory is the
+	// definition of the image, and the device-mapper device is just a cache
+	// of it instantiated. Diffs/commit compare the container device with the
+	// image device, which will then *not* pick up the init layer changes as
+	// part of the container changes
+	dockerinitLayer, err := image.getDockerInitLayer()
 	if err != nil {
+		devices.UnmountDevice(image.ID, mountDir)
+		devices.RemoveDevice(image.ID)
+		return err
+	}
+
+	if err := image.applyLayer(dockerinitLayer, mountDir); err != nil {
+		devices.UnmountDevice(image.ID, mountDir)
+		devices.RemoveDevice(image.ID)
 		return err
 	}
+
+	if err := devices.UnmountDevice(image.ID, mountDir); err != nil {
+		devices.RemoveDevice(image.ID)
+		return err
+	}
+
+	devices.SetInitialized(image.ID)
+
+	// No need to the device-mapper device to hang around once we've written
+	// the image, it can be enabled on-demand when needed
+	devices.DeactivateDevice(image.ID)
+
+	return nil
+}
+
+func (image *Image) Mounted(runtime *Runtime, root, rw string) (bool, error) {
+	return Mounted(root)
+}
+
+func (image *Image) Mount(runtime *Runtime, root, rw string, id string) error {
+	if mounted, _ := image.Mounted(runtime, root, rw); mounted {
+		return fmt.Errorf("%s is already mounted", root)
+	}
+
 	// Create the target directories if they don't exist
 	if err := os.Mkdir(root, 0755); err != nil && !os.IsExist(err) {
 		return err
 	}
-	if err := os.Mkdir(rw, 0755); err != nil && !os.IsExist(err) {
+
+	devices, err := runtime.GetDeviceSet()
+	if err != nil {
 		return err
 	}
-	if err := MountAUFS(layers, rw, root); err != nil {
+
+	if err := image.ensureImageDevice(devices); err != nil {
+		return err
+	}
+
+	createdDevice := false
+	if !devices.HasDevice(id) {
+		utils.Debugf("Creating device %s for container based on image %s", id, image.ID)
+		err = devices.AddDevice(id, image.ID)
+		if err != nil {
+			return err
+		}
+		createdDevice = true
+	}
+
+	utils.Debugf("Mounting container %s at %s for container", id, root)
+	if err := devices.MountDevice(id, root); err != nil {
 		return err
 	}
+
+	if createdDevice {
+		err = ioutil.WriteFile(path.Join(root, ".docker-id"), []byte(id), 0600)
+		if err != nil {
+			_ = devices.RemoveDevice(image.ID)
+			return err
+		}
+	}
 	return nil
 }
 
-func (image *Image) Changes(rw string) ([]Change, error) {
-	layers, err := image.layers()
+func (image *Image) Unmount(runtime *Runtime, root string, id string) error {
+	// Try to deactivate the device as generally there is no use for it anymore
+	devices, err := runtime.GetDeviceSet()
+	if err != nil {
+		return err
+	}
+
+	if err = devices.UnmountDevice(id, root); err != nil {
+		return err
+	}
+
+	return devices.DeactivateDevice(id)
+}
+
+func (image *Image) Changes(runtime *Runtime, root, rw, id string) ([]Change, error) {
+	devices, err := runtime.GetDeviceSet()
 	if err != nil {
 		return nil, err
 	}
-	return Changes(layers, rw)
+
+	if err := os.Mkdir(rw, 0755); err != nil && !os.IsExist(err) {
+		return nil, err
+	}
+
+	wasActivated := devices.HasActivatedDevice(image.ID)
+
+	// We re-use rw for the temporary mount of the base image as its
+	// not used by device-mapper otherwise
+	err = devices.MountDevice(image.ID, rw)
+	if err != nil {
+		return nil, err
+	}
+
+	changes, err := ChangesDirs(root, rw)
+	devices.UnmountDevice(image.ID, rw)
+	if !wasActivated {
+		devices.DeactivateDevice(image.ID)
+	}
+	if err != nil {
+		return nil, err
+	}
+	return changes, nil
+}
+
+func (image *Image) ExportChanges(runtime *Runtime, root, rw, id string) (Archive, error) {
+	changes, err := image.Changes(runtime, root, rw, id)
+	if err != nil {
+		return nil, err
+	}
+
+	files := make([]string, 0)
+	deletions := make([]string, 0)
+	for _, change := range changes {
+		if change.Kind == ChangeModify || change.Kind == ChangeAdd {
+			files = append(files, change.Path)
+		}
+		if change.Kind == ChangeDelete {
+			base := filepath.Base(change.Path)
+			dir := filepath.Dir(change.Path)
+			deletions = append(deletions, filepath.Join(dir, ".wh."+base))
+		}
+	}
+
+	return TarFilter(root, Uncompressed, files, false, deletions)
 }
 
 func (image *Image) ShortID() string {

+ 0 - 29
mount.go

@@ -1,40 +1,11 @@
 package docker
 
 import (
-	"fmt"
-	"github.com/dotcloud/docker/utils"
 	"os"
-	"os/exec"
 	"path/filepath"
 	"syscall"
-	"time"
 )
 
-func Unmount(target string) error {
-	if err := exec.Command("auplink", target, "flush").Run(); err != nil {
-		utils.Errorf("[warning]: couldn't run auplink before unmount: %s", err)
-	}
-	if err := syscall.Unmount(target, 0); err != nil {
-		return err
-	}
-	// Even though we just unmounted the filesystem, AUFS will prevent deleting the mntpoint
-	// for some time. We'll just keep retrying until it succeeds.
-	for retries := 0; retries < 1000; retries++ {
-		err := os.Remove(target)
-		if err == nil {
-			// rm mntpoint succeeded
-			return nil
-		}
-		if os.IsNotExist(err) {
-			// mntpoint doesn't exist anymore. Success.
-			return nil
-		}
-		// fmt.Printf("(%v) Remove %v returned: %v\n", retries, target, err)
-		time.Sleep(10 * time.Millisecond)
-	}
-	return fmt.Errorf("Umount: Failed to umount %v", target)
-}
-
 func Mounted(mountpoint string) (bool, error) {
 	mntpoint, err := os.Stat(mountpoint)
 	if err != nil {

+ 66 - 4
runtime.go

@@ -10,6 +10,7 @@ import (
 	"os"
 	"os/exec"
 	"path"
+	"path/filepath"
 	"sort"
 	"strings"
 	"time"
@@ -37,12 +38,28 @@ type Runtime struct {
 	volumes        *Graph
 	srv            *Server
 	Dns            []string
+	deviceSet      DeviceSet
 }
 
 var sysInitPath string
 
 func init() {
-	sysInitPath = utils.SelfPath()
+	env := os.Getenv("_DOCKER_INIT_PATH")
+	if env != "" {
+		sysInitPath = env
+	} else {
+		selfPath := utils.SelfPath()
+
+		// If we have a separate docker-init, use that, otherwise use the
+		// main docker binary
+		dir := filepath.Dir(selfPath)
+		dockerInitPath := filepath.Join(dir, "docker-init")
+		if _, err := os.Stat(dockerInitPath); err != nil {
+			sysInitPath = selfPath
+		} else {
+			sysInitPath = dockerInitPath
+		}
+	}
 }
 
 // List returns an array of all containers registered in the runtime.
@@ -64,6 +81,32 @@ func (runtime *Runtime) getContainerElement(id string) *list.Element {
 	return nil
 }
 
+func hasFilesystemSupport(fstype string) bool {
+	content, err := ioutil.ReadFile("/proc/filesystems")
+	if err != nil {
+		log.Printf("WARNING: Unable to read /proc/filesystems, assuming fs %s is not supported.", fstype)
+		return false
+	}
+	lines := strings.Split(string(content), "\n")
+	for _, line := range lines {
+		if strings.HasPrefix(line, "nodev") {
+			line = line[5:]
+		}
+		line = strings.TrimSpace(line)
+		if line == fstype {
+			return true
+		}
+	}
+	return false
+}
+
+func (runtime *Runtime) GetDeviceSet() (DeviceSet, error) {
+	if runtime.deviceSet == nil {
+		return nil, fmt.Errorf("No device set available")
+	}
+	return runtime.deviceSet, nil
+}
+
 // Get looks for a container by the specified ID or name, and returns it.
 // If the container is not found, or if an error occurs, nil is returned.
 func (runtime *Runtime) Get(name string) *Container {
@@ -216,6 +259,24 @@ func (runtime *Runtime) Destroy(container *Container) error {
 	if err := os.RemoveAll(container.root); err != nil {
 		return fmt.Errorf("Unable to remove filesystem for %v: %v", container.ID, err)
 	}
+	if runtime.deviceSet.HasDevice(container.ID) {
+		if err := runtime.deviceSet.RemoveDevice(container.ID); err != nil {
+			return fmt.Errorf("Unable to remove device for %v: %v", container.ID, err)
+		}
+	}
+	return nil
+}
+
+func (runtime *Runtime) DeleteImage(id string) error {
+	err := runtime.graph.Delete(id)
+	if err != nil {
+		return err
+	}
+	if runtime.deviceSet.HasDevice(id) {
+		if err := runtime.deviceSet.RemoveDevice(id); err != nil {
+			return fmt.Errorf("Unable to remove device for %v: %v", id, err)
+		}
+	}
 	return nil
 }
 
@@ -428,8 +489,8 @@ func (runtime *Runtime) Commit(container *Container, repository, tag, comment, a
 }
 
 // FIXME: harmonize with NewGraph()
-func NewRuntime(flGraphPath string, autoRestart bool, dns []string) (*Runtime, error) {
-	runtime, err := NewRuntimeFromDirectory(flGraphPath, autoRestart)
+func NewRuntime(flGraphPath string, deviceSet DeviceSet, autoRestart bool, dns []string) (*Runtime, error) {
+	runtime, err := NewRuntimeFromDirectory(flGraphPath, deviceSet, autoRestart)
 	if err != nil {
 		return nil, err
 	}
@@ -447,7 +508,7 @@ func NewRuntime(flGraphPath string, autoRestart bool, dns []string) (*Runtime, e
 	return runtime, nil
 }
 
-func NewRuntimeFromDirectory(root string, autoRestart bool) (*Runtime, error) {
+func NewRuntimeFromDirectory(root string, deviceSet DeviceSet, autoRestart bool) (*Runtime, error) {
 	runtimeRepo := path.Join(root, "containers")
 
 	if err := os.MkdirAll(runtimeRepo, 0700); err != nil && !os.IsExist(err) {
@@ -484,6 +545,7 @@ func NewRuntimeFromDirectory(root string, autoRestart bool) (*Runtime, error) {
 		capabilities:   &Capabilities{},
 		autoRestart:    autoRestart,
 		volumes:        volumes,
+		deviceSet:      deviceSet,
 	}
 
 	if err := runtime.restore(); err != nil {

+ 75 - 9
runtime_test.go

@@ -3,11 +3,14 @@ package docker
 import (
 	"bytes"
 	"fmt"
+	"github.com/dotcloud/docker/devmapper"
 	"github.com/dotcloud/docker/utils"
 	"io"
+	"io/ioutil"
 	"log"
 	"net"
 	"os"
+	"path/filepath"
 	"runtime"
 	"strconv"
 	"strings"
@@ -18,12 +21,13 @@ import (
 )
 
 const (
-	unitTestImageName     = "docker-test-image"
-	unitTestImageID       = "83599e29c455eb719f77d799bc7c51521b9551972f5a850d7ad265bc1b5292f6" // 1.0
-	unitTestNetworkBridge = "testdockbr0"
-	unitTestStoreBase     = "/var/lib/docker/unit-tests"
-	testDaemonAddr        = "127.0.0.1:4270"
-	testDaemonProto       = "tcp"
+	unitTestImageName        = "docker-test-image"
+	unitTestImageID          = "83599e29c455eb719f77d799bc7c51521b9551972f5a850d7ad265bc1b5292f6" // 1.0
+	unitTestNetworkBridge    = "testdockbr0"
+	unitTestStoreBase        = "/var/lib/docker/unit-tests"
+	unitTestStoreDevicesBase = "/var/lib/docker/unit-tests-devices"
+	testDaemonAddr           = "127.0.0.1:4270"
+	testDaemonProto          = "tcp"
 )
 
 var (
@@ -43,6 +47,10 @@ func nuke(runtime *Runtime) error {
 	}
 	wg.Wait()
 	runtime.networkManager.Close()
+
+	for _, container := range runtime.List() {
+		container.EnsureUnmounted()
+	}
 	return os.RemoveAll(runtime.root)
 }
 
@@ -57,12 +65,18 @@ func cleanup(runtime *Runtime) error {
 	}
 	for _, image := range images {
 		if image.ID != unitTestImageID {
-			runtime.graph.Delete(image.ID)
+			runtime.DeleteImage(image.ID)
 		}
 	}
 	return nil
 }
 
+func cleanupLast(runtime *Runtime) error {
+	cleanup(runtime)
+	runtime.deviceSet.Shutdown()
+	return nil
+}
+
 func layerArchive(tarfile string) (io.Reader, error) {
 	// FIXME: need to close f somewhere
 	f, err := os.Open(tarfile)
@@ -72,6 +86,45 @@ func layerArchive(tarfile string) (io.Reader, error) {
 	return f, nil
 }
 
+// Remove any leftover device mapper devices from earlier runs of the unit tests
+func removeDev(name string) {
+	path := filepath.Join("/dev/mapper", name)
+	fd, err := syscall.Open(path, syscall.O_RDONLY, 07777)
+	if err != nil {
+		if err == syscall.ENXIO {
+			// No device for this node, just remove it
+			os.Remove(path)
+			return
+		}
+	} else {
+		syscall.Close(fd)
+	}
+	if err := devmapper.RemoveDevice(name); err != nil {
+		panic(fmt.Errorf("Unable to remove existing device %s: %s", name, err))
+	}
+}
+
+func cleanupDevMapper() {
+	infos, _ := ioutil.ReadDir("/dev/mapper")
+	if infos != nil {
+		hasPool := false
+		for _, info := range infos {
+			name := info.Name()
+			if strings.HasPrefix(name, "docker-unit-tests-devices-") {
+				if name == "docker-unit-tests-devices-pool" {
+					hasPool = true
+				} else {
+					removeDev(name)
+				}
+			}
+			// We need to remove the pool last as the other devices block it
+			if hasPool {
+				removeDev("docker-unit-tests-devices-pool")
+			}
+		}
+	}
+}
+
 func init() {
 	os.Setenv("TEST", "1")
 
@@ -87,8 +140,21 @@ func init() {
 
 	NetworkBridgeIface = unitTestNetworkBridge
 
+	cleanupDevMapper()
+
+	// Always start from a clean set of loopback mounts
+	err := os.RemoveAll(unitTestStoreDevicesBase)
+	if err != nil {
+		panic(err)
+	}
+
+	deviceset := devmapper.NewDeviceSetDM(unitTestStoreDevicesBase)
+	// Create a device, which triggers the initiation of the base FS
+	// This avoids other tests doing this and timing out
+	deviceset.AddDevice("init","")
+
 	// Make it our Store root
-	if runtime, err := NewRuntimeFromDirectory(unitTestStoreBase, false); err != nil {
+	if runtime, err := NewRuntimeFromDirectory(unitTestStoreBase, deviceset, false); err != nil {
 		log.Fatalf("Unable to create a runtime for tests:", err)
 	} else {
 		globalRuntime = runtime
@@ -458,7 +524,7 @@ func TestRestore(t *testing.T) {
 
 	// Here are are simulating a docker restart - that is, reloading all containers
 	// from scratch
-	runtime2, err := NewRuntimeFromDirectory(runtime1.root, false)
+	runtime2, err := NewRuntimeFromDirectory(runtime1.root, runtime1.deviceSet, false)
 	if err != nil {
 		t.Fatal(err)
 	}

+ 9 - 13
server.go

@@ -1028,7 +1028,7 @@ func (srv *Server) deleteImageAndChildren(id string, imgs *[]APIRmi) error {
 		if err := srv.runtime.repositories.DeleteAll(id); err != nil {
 			return err
 		}
-		err := srv.runtime.graph.Delete(id)
+		err := srv.runtime.DeleteImage(id)
 		if err != nil {
 			return err
 		}
@@ -1102,7 +1102,7 @@ func (srv *Server) ImageDelete(name string, autoPrune bool) ([]APIRmi, error) {
 		return nil, fmt.Errorf("No such image: %s", name)
 	}
 	if !autoPrune {
-		if err := srv.runtime.graph.Delete(img.ID); err != nil {
+		if err := srv.runtime.DeleteImage(img.ID); err != nil {
 			return nil, fmt.Errorf("Error deleting image %s: %s", name, err)
 		}
 		return nil, nil
@@ -1302,15 +1302,15 @@ func (srv *Server) ContainerCopy(name string, resource string, out io.Writer) er
 
 }
 
-func NewServer(flGraphPath string, autoRestart, enableCors bool, dns ListOpts) (*Server, error) {
+func NewServer(flGraphPath string, deviceSet DeviceSet, autoRestart, enableCors bool, dns ListOpts) (*Server, error) {
 	if runtime.GOARCH != "amd64" {
 		log.Fatalf("The docker runtime currently only supports amd64 (not %s). This will change in the future. Aborting.", runtime.GOARCH)
 	}
-	runtime, err := NewRuntime(flGraphPath, autoRestart, dns)
+	runtime, err := NewRuntime(flGraphPath, deviceSet, autoRestart, dns)
 	if err != nil {
 		return nil, err
 	}
-	srv := &Server{
+	runtime.srv = &Server{
 		runtime:     runtime,
 		enableCors:  enableCors,
 		pullingPool: make(map[string]struct{}),
@@ -1319,18 +1319,14 @@ func NewServer(flGraphPath string, autoRestart, enableCors bool, dns ListOpts) (
 		listeners:   make(map[string]chan utils.JSONMessage),
 		reqFactory:  nil,
 	}
-	runtime.srv = srv
-	return srv, nil
+	return runtime.srv, nil
 }
 
 func (srv *Server) HTTPRequestFactory(metaHeaders map[string][]string) *utils.HTTPRequestFactory {
 	if srv.reqFactory == nil {
-		ud := utils.NewHTTPUserAgentDecorator(srv.versionInfos()...)
-		md := &utils.HTTPMetaHeadersDecorator{
-			Headers: metaHeaders,
-		}
-		factory := utils.NewHTTPRequestFactory(ud, md)
-		srv.reqFactory = factory
+		srv.reqFactory = utils.NewHTTPRequestFactory(
+			utils.NewHTTPUserAgentDecorator(srv.versionInfos()...),
+			&utils.HTTPMetaHeadersDecorator{Headers: metaHeaders})
 	}
 	return srv.reqFactory
 }

+ 58 - 0
utils.go

@@ -1,8 +1,33 @@
 package docker
 
+/*
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <errno.h>
+
+// See linux.git/fs/btrfs/ioctl.h
+#define BTRFS_IOCTL_MAGIC 0x94
+#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
+
+int
+btrfs_reflink(int fd_out, int fd_in)
+{
+  int res;
+  res = ioctl(fd_out, BTRFS_IOC_CLONE, fd_in);
+  if (res < 0)
+    return errno;
+  return 0;
+}
+
+*/
+import "C"
 import (
 	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
 	"strings"
+	"syscall"
 )
 
 // Compare two Config struct. Do not compare the "Image" nor "Hostname" fields
@@ -167,3 +192,36 @@ func parseLxcOpt(opt string) (string, string, error) {
 	}
 	return strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1]), nil
 }
+
+func RootIsShared() bool {
+	if data, err := ioutil.ReadFile("/proc/self/mountinfo"); err == nil {
+		for _, line := range strings.Split(string(data), "\n") {
+			cols := strings.Split(line, " ")
+			if len(cols) >= 6 && cols[4] == "/" {
+				return strings.HasPrefix(cols[6], "shared")
+			}
+		}
+	}
+
+	// No idea, probably safe to assume so
+	return true
+}
+
+func BtrfsReflink(fd_out, fd_in uintptr) error {
+	res := C.btrfs_reflink(C.int(fd_out), C.int(fd_in))
+	if res != 0 {
+		return syscall.Errno(res)
+	}
+	return nil
+}
+
+func CopyFile(dstFile, srcFile *os.File) error {
+	err := BtrfsReflink(dstFile.Fd(), srcFile.Fd())
+	if err == nil {
+		return nil
+	}
+
+	// Fall back to normal copy
+	_, err = io.Copy(dstFile, srcFile)
+	return err
+}

+ 33 - 0
utils/utils.go

@@ -1028,3 +1028,36 @@ type StatusError struct {
 func (e *StatusError) Error() string {
 	return fmt.Sprintf("Status: %d", e.Status)
 }
+
+func quote(word string, buf *bytes.Buffer) {
+	// Bail out early for "simple" strings
+	if word != "" && !strings.ContainsAny(word, "\\'\"`${[|&;<>()~*?! \t\n") {
+		buf.WriteString(word)
+		return
+	}
+
+	buf.WriteString("'")
+
+	for i := 0; i < len(word); i++ {
+		b := word[i]
+		if b == '\'' {
+			// Replace literal ' with a close ', a \', and a open '
+			buf.WriteString("'\\''")
+		} else {
+			buf.WriteByte(b)
+		}
+	}
+
+	buf.WriteString("'")
+}
+
+func ShellQuoteArguments(args []string) string {
+	var buf bytes.Buffer
+	for i, arg := range args {
+		if i != 0 {
+			buf.WriteByte(' ')
+		}
+		quote(arg, &buf)
+	}
+	return buf.String()
+}

+ 61 - 1
utils_test.go

@@ -6,6 +6,7 @@ import (
 	"io/ioutil"
 	"os"
 	"path"
+	"path/filepath"
 	"strings"
 	"testing"
 )
@@ -42,7 +43,7 @@ func newTestRuntime() (*Runtime, error) {
 		return nil, err
 	}
 
-	runtime, err := NewRuntimeFromDirectory(root, false)
+	runtime, err := NewRuntimeFromDirectory(root, NewDeviceSetWrapper(globalRuntime.deviceSet, filepath.Base(root)), false)
 	if err != nil {
 		return nil, err
 	}
@@ -318,3 +319,62 @@ func TestParseLxcConfOpt(t *testing.T) {
 		}
 	}
 }
+
+type DeviceSetWrapper struct {
+	wrapped DeviceSet
+	prefix  string
+}
+
+func (wrapper *DeviceSetWrapper) wrap(hash string) string {
+	if hash != "" {
+		hash = wrapper.prefix + "-" + hash
+	}
+	return hash
+}
+
+func (wrapper *DeviceSetWrapper) AddDevice(hash, baseHash string) error {
+	return wrapper.wrapped.AddDevice(wrapper.wrap(hash), wrapper.wrap(baseHash))
+}
+
+func (wrapper *DeviceSetWrapper) SetInitialized(hash string) error {
+	return wrapper.wrapped.SetInitialized(wrapper.wrap(hash))
+}
+
+func (wrapper *DeviceSetWrapper) DeactivateDevice(hash string) error {
+	return wrapper.wrapped.DeactivateDevice(wrapper.wrap(hash))
+}
+
+func (wrapper *DeviceSetWrapper) Shutdown() error {
+	return nil
+}
+
+func (wrapper *DeviceSetWrapper) RemoveDevice(hash string) error {
+	return wrapper.wrapped.RemoveDevice(wrapper.wrap(hash))
+}
+
+func (wrapper *DeviceSetWrapper) MountDevice(hash, path string) error {
+	return wrapper.wrapped.MountDevice(wrapper.wrap(hash), path)
+}
+
+func (wrapper *DeviceSetWrapper) UnmountDevice(hash, path string) error {
+	return wrapper.wrapped.UnmountDevice(wrapper.wrap(hash), path)
+}
+
+func (wrapper *DeviceSetWrapper) HasDevice(hash string) bool {
+	return wrapper.wrapped.HasDevice(wrapper.wrap(hash))
+}
+
+func (wrapper *DeviceSetWrapper) HasInitializedDevice(hash string) bool {
+	return wrapper.wrapped.HasInitializedDevice(wrapper.wrap(hash))
+}
+
+func (wrapper *DeviceSetWrapper) HasActivatedDevice(hash string) bool {
+	return wrapper.wrapped.HasActivatedDevice(wrapper.wrap(hash))
+}
+
+func NewDeviceSetWrapper(wrapped DeviceSet, prefix string) DeviceSet {
+	return &DeviceSetWrapper{
+		wrapped: wrapped,
+		prefix:  prefix,
+	}
+}

+ 1 - 1
z_final_test.go

@@ -11,7 +11,7 @@ func displayFdGoroutines(t *testing.T) {
 }
 
 func TestFinal(t *testing.T) {
-	cleanup(globalRuntime)
+	cleanupLast(globalRuntime)
 	t.Logf("Start Fds: %d, Start Goroutines: %d", startFds, startGoroutines)
 	displayFdGoroutines(t)
 }