Przeglądaj źródła

Partial merge of devmapper/ in order to integrate it as a backend
plugin.

The merge is inert, in other words the devmapper code is not called
and the primary aufs backend is untouched.

Solomon Hykes 11 lat temu
rodzic
commit
36c7a7ae94

+ 84 - 7
archive/archive.go

@@ -80,20 +80,74 @@ func (compression *Compression) Extension() string {
 // Tar creates an archive from the directory at `path`, and returns it as a
 // stream of bytes.
 func Tar(path string, compression Compression) (io.Reader, error) {
-	return TarFilter(path, compression, nil)
+	return TarFilter(path, compression, nil, true, nil)
+}
+
+func escapeName(name string) string {
+	escaped := make([]byte, 0)
+	for i, c := range []byte(name) {
+		if i == 0 && c == '/' {
+			continue
+		}
+		// all printable chars except "-" which is 0x2d
+		if (0x20 <= c && c <= 0x7E) && c != 0x2d {
+			escaped = append(escaped, c)
+		} else {
+			escaped = append(escaped, fmt.Sprintf("\\%03o", c)...)
+		}
+	}
+	return string(escaped)
 }
 
 // Tar creates an archive from the directory at `path`, only including files whose relative
 // paths are included in `filter`. If `filter` is nil, then all files are included.
-func TarFilter(path string, compression Compression, filter []string) (io.Reader, error) {
-	args := []string{"tar", "--numeric-owner", "-f", "-", "-C", path}
+func TarFilter(path string, compression Compression, filter []string, recursive bool, createFiles []string) (io.Reader, error) {
+	args := []string{"tar", "--numeric-owner", "-f", "-", "-C", path, "-T", "-"}
 	if filter == nil {
 		filter = []string{"."}
 	}
+	args = append(args, "-c"+compression.Flag())
+
+	if !recursive {
+		args = append(args, "--no-recursion")
+	}
+
+	files := ""
 	for _, f := range filter {
-		args = append(args, "-c"+compression.Flag(), f)
+		files = files + escapeName(f) + "\n"
 	}
-	return CmdStream(exec.Command(args[0], args[1:]...))
+
+	tmpDir := ""
+
+	if createFiles != nil {
+		var err error // Can't use := here or we override the outer tmpDir
+		tmpDir, err = ioutil.TempDir("", "docker-tar")
+		if err != nil {
+			return nil, err
+		}
+
+		files = files + "-C" + tmpDir + "\n"
+		for _, f := range createFiles {
+			path := filepath.Join(tmpDir, f)
+			err := os.MkdirAll(filepath.Dir(path), 0600)
+			if err != nil {
+				return nil, err
+			}
+
+			if file, err := os.OpenFile(path, os.O_CREATE, 0600); err != nil {
+				return nil, err
+			} else {
+				file.Close()
+			}
+			files = files + escapeName(f) + "\n"
+		}
+	}
+
+	return CmdStream(exec.Command(args[0], args[1:]...), &files, func() {
+		if tmpDir != "" {
+			_ = os.RemoveAll(tmpDir)
+		}
+	})
 }
 
 // Untar reads a stream of bytes from `archive`, parses it as a tar archive,
@@ -140,7 +194,7 @@ func Untar(archive io.Reader, path string) error {
 // TarUntar aborts and returns the error.
 func TarUntar(src string, filter []string, dst string) error {
 	utils.Debugf("TarUntar(%s %s %s)", src, filter, dst)
-	archive, err := TarFilter(src, Uncompressed, filter)
+	archive, err := TarFilter(src, Uncompressed, filter, true, nil)
 	if err != nil {
 		return err
 	}
@@ -227,13 +281,33 @@ func CopyFileWithTar(src, dst string) error {
 // CmdStream executes a command, and returns its stdout as a stream.
 // If the command fails to run or doesn't complete successfully, an error
 // will be returned, including anything written on stderr.
-func CmdStream(cmd *exec.Cmd) (io.Reader, error) {
+func CmdStream(cmd *exec.Cmd, input *string, atEnd func()) (io.Reader, error) {
+	if input != nil {
+		stdin, err := cmd.StdinPipe()
+		if err != nil {
+			if atEnd != nil {
+				atEnd()
+			}
+			return nil, err
+		}
+		// Write stdin if any
+		go func() {
+			_, _ = stdin.Write([]byte(*input))
+			stdin.Close()
+		}()
+	}
 	stdout, err := cmd.StdoutPipe()
 	if err != nil {
+		if atEnd != nil {
+			atEnd()
+		}
 		return nil, err
 	}
 	stderr, err := cmd.StderrPipe()
 	if err != nil {
+		if atEnd != nil {
+			atEnd()
+		}
 		return nil, err
 	}
 	pipeR, pipeW := io.Pipe()
@@ -258,6 +332,9 @@ func CmdStream(cmd *exec.Cmd) (io.Reader, error) {
 		} else {
 			pipeW.Close()
 		}
+		if atEnd != nil {
+			atEnd()
+		}
 	}()
 	// Run the command and return the pipe
 	if err := cmd.Start(); err != nil {

+ 3 - 3
archive/archive_test.go

@@ -14,7 +14,7 @@ import (
 
 func TestCmdStreamLargeStderr(t *testing.T) {
 	cmd := exec.Command("/bin/sh", "-c", "dd if=/dev/zero bs=1k count=1000 of=/dev/stderr; echo hello")
-	out, err := CmdStream(cmd)
+	out, err := CmdStream(cmd, nil, nil)
 	if err != nil {
 		t.Fatalf("Failed to start command: %s", err)
 	}
@@ -35,7 +35,7 @@ func TestCmdStreamLargeStderr(t *testing.T) {
 
 func TestCmdStreamBad(t *testing.T) {
 	badCmd := exec.Command("/bin/sh", "-c", "echo hello; echo >&2 error couldn\\'t reverse the phase pulser; exit 1")
-	out, err := CmdStream(badCmd)
+	out, err := CmdStream(badCmd, nil, nil)
 	if err != nil {
 		t.Fatalf("Failed to start command: %s", err)
 	}
@@ -50,7 +50,7 @@ func TestCmdStreamBad(t *testing.T) {
 
 func TestCmdStreamGood(t *testing.T) {
 	cmd := exec.Command("/bin/sh", "-c", "echo hello; exit 0")
-	out, err := CmdStream(cmd)
+	out, err := CmdStream(cmd, nil, nil)
 	if err != nil {
 		t.Fatal(err)
 	}

+ 1 - 1
container.go

@@ -1507,7 +1507,7 @@ func (container *Container) Copy(resource string) (archive.Archive, error) {
 		filter = []string{path.Base(basePath)}
 		basePath = path.Dir(basePath)
 	}
-	return archive.TarFilter(basePath, archive.Uncompressed, filter)
+	return archive.TarFilter(basePath, archive.Uncompressed, filter, true, nil)
 }
 
 // Returns true if the container exposes a certain port

+ 850 - 0
devmapper/deviceset.go

@@ -0,0 +1,850 @@
+package devmapper
+
+import (
+	"encoding/json"
+	"fmt"
+	"github.com/dotcloud/docker/utils"
+	"io"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path"
+	"path/filepath"
+	"strconv"
+	"sync"
+	"syscall"
+	"time"
+)
+
+var (
+	DefaultDataLoopbackSize     int64  = 100 * 1024 * 1024 * 1024
+	DefaultMetaDataLoopbackSize int64  = 2 * 1024 * 1024 * 1024
+	DefaultBaseFsSize           uint64 = 10 * 1024 * 1024 * 1024
+)
+
+type DevInfo struct {
+	Hash          string       `json:"-"`
+	DeviceId      int          `json:"device_id"`
+	Size          uint64       `json:"size"`
+	TransactionId uint64       `json:"transaction_id"`
+	Initialized   bool         `json:"initialized"`
+	devices       *DeviceSet `json:"-"`
+}
+
+type MetaData struct {
+	Devices map[string]*DevInfo `json:devices`
+}
+
+type DeviceSet struct {
+	MetaData
+	sync.Mutex
+	initialized      bool
+	root             string
+	devicePrefix     string
+	TransactionId    uint64
+	NewTransactionId uint64
+	nextFreeDevice   int
+	activeMounts     map[string]int
+}
+
+type DiskUsage struct {
+	Used uint64
+	Total uint64
+}
+
+type Status struct {
+	PoolName string
+	DataLoopback string
+	MetadataLoopback string
+	Data DiskUsage
+	Metadata DiskUsage
+}
+
+func getDevName(name string) string {
+	return "/dev/mapper/" + name
+}
+
+func (info *DevInfo) Name() string {
+	hash := info.Hash
+	if hash == "" {
+		hash = "base"
+	}
+	return fmt.Sprintf("%s-%s", info.devices.devicePrefix, hash)
+}
+
+func (info *DevInfo) DevName() string {
+	return getDevName(info.Name())
+}
+
+func (devices *DeviceSet) loopbackDir() string {
+	return path.Join(devices.root, "devicemapper")
+}
+
+func (devices *DeviceSet) jsonFile() string {
+	return path.Join(devices.loopbackDir(), "json")
+}
+
+func (devices *DeviceSet) getPoolName() string {
+	return devices.devicePrefix + "-pool"
+}
+
+func (devices *DeviceSet) getPoolDevName() string {
+	return getDevName(devices.getPoolName())
+}
+
+func (devices *DeviceSet) hasImage(name string) bool {
+	dirname := devices.loopbackDir()
+	filename := path.Join(dirname, name)
+
+	_, err := os.Stat(filename)
+	return err == nil
+}
+
+// ensureImage creates a sparse file of <size> bytes at the path
+// <root>/devicemapper/<name>.
+// If the file already exists, it does nothing.
+// Either way it returns the full path.
+func (devices *DeviceSet) ensureImage(name string, size int64) (string, error) {
+	dirname := devices.loopbackDir()
+	filename := path.Join(dirname, name)
+
+	if err := os.MkdirAll(dirname, 0700); err != nil && !os.IsExist(err) {
+		return "", err
+	}
+
+	if _, err := os.Stat(filename); err != nil {
+		if !os.IsNotExist(err) {
+			return "", err
+		}
+		utils.Debugf("Creating loopback file %s for device-manage use", filename)
+		file, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE, 0600)
+		if err != nil {
+			return "", err
+		}
+
+		if err = file.Truncate(size); err != nil {
+			return "", err
+		}
+	}
+	return filename, nil
+}
+
+func (devices *DeviceSet) allocateDeviceId() int {
+	// TODO: Add smarter reuse of deleted devices
+	id := devices.nextFreeDevice
+	devices.nextFreeDevice = devices.nextFreeDevice + 1
+	return id
+}
+
+func (devices *DeviceSet) allocateTransactionId() uint64 {
+	devices.NewTransactionId = devices.NewTransactionId + 1
+	return devices.NewTransactionId
+}
+
+func (devices *DeviceSet) saveMetadata() error {
+	jsonData, err := json.Marshal(devices.MetaData)
+	if err != nil {
+		return fmt.Errorf("Error encoding metaadata to json: %s", err)
+	}
+	tmpFile, err := ioutil.TempFile(filepath.Dir(devices.jsonFile()), ".json")
+	if err != nil {
+		return fmt.Errorf("Error creating metadata file: %s", err)
+	}
+
+	n, err := tmpFile.Write(jsonData)
+	if err != nil {
+		return fmt.Errorf("Error writing metadata to %s: %s", tmpFile.Name(), err)
+	}
+	if n < len(jsonData) {
+		return io.ErrShortWrite
+	}
+	if err := tmpFile.Sync(); err != nil {
+		return fmt.Errorf("Error syncing metadata file %s: %s", tmpFile.Name(), err)
+	}
+	if err := tmpFile.Close(); err != nil {
+		return fmt.Errorf("Error closing metadata file %s: %s", tmpFile.Name(), err)
+	}
+	if err := os.Rename(tmpFile.Name(), devices.jsonFile()); err != nil {
+		return fmt.Errorf("Error committing metadata file", err)
+	}
+
+	if devices.NewTransactionId != devices.TransactionId {
+		if err = setTransactionId(devices.getPoolDevName(), devices.TransactionId, devices.NewTransactionId); err != nil {
+			return fmt.Errorf("Error setting devmapper transition ID: %s", err)
+		}
+		devices.TransactionId = devices.NewTransactionId
+	}
+	return nil
+}
+
+func (devices *DeviceSet) registerDevice(id int, hash string, size uint64) (*DevInfo, error) {
+	utils.Debugf("registerDevice(%v, %v)", id, hash)
+	info := &DevInfo{
+		Hash:          hash,
+		DeviceId:      id,
+		Size:          size,
+		TransactionId: devices.allocateTransactionId(),
+		Initialized:   false,
+		devices:       devices,
+	}
+
+	devices.Devices[hash] = info
+	if err := devices.saveMetadata(); err != nil {
+		// Try to remove unused device
+		delete(devices.Devices, hash)
+		return nil, err
+	}
+
+	return info, nil
+}
+
+func (devices *DeviceSet) activateDeviceIfNeeded(hash string) error {
+	utils.Debugf("activateDeviceIfNeeded(%v)", hash)
+	info := devices.Devices[hash]
+	if info == nil {
+		return fmt.Errorf("Unknown device %s", hash)
+	}
+
+	if devinfo, _ := getInfo(info.Name()); devinfo != nil && devinfo.Exists != 0 {
+		return nil
+	}
+
+	return activateDevice(devices.getPoolDevName(), info.Name(), info.DeviceId, info.Size)
+}
+
+func (devices *DeviceSet) createFilesystem(info *DevInfo) error {
+	devname := info.DevName()
+
+	err := exec.Command("mkfs.ext4", "-E", "discard,lazy_itable_init=0,lazy_journal_init=0", devname).Run()
+	if err != nil {
+		err = exec.Command("mkfs.ext4", "-E", "discard,lazy_itable_init=0", devname).Run()
+	}
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	return nil
+}
+
+func (devices *DeviceSet) loadMetaData() error {
+	utils.Debugf("loadMetadata()")
+	defer utils.Debugf("loadMetadata END")
+	_, _, _, params, err := getStatus(devices.getPoolName())
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if _, err := fmt.Sscanf(params, "%d", &devices.TransactionId); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	devices.NewTransactionId = devices.TransactionId
+
+	jsonData, err := ioutil.ReadFile(devices.jsonFile())
+	if err != nil && !os.IsNotExist(err) {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	devices.MetaData.Devices = make(map[string]*DevInfo)
+	if jsonData != nil {
+		if err := json.Unmarshal(jsonData, &devices.MetaData); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+
+	for hash, d := range devices.Devices {
+		d.Hash = hash
+		d.devices = devices
+
+		if d.DeviceId >= devices.nextFreeDevice {
+			devices.nextFreeDevice = d.DeviceId + 1
+		}
+
+		// If the transaction id is larger than the actual one we lost the device due to some crash
+		if d.TransactionId > devices.TransactionId {
+			utils.Debugf("Removing lost device %s with id %d", hash, d.TransactionId)
+			delete(devices.Devices, hash)
+		}
+	}
+	return nil
+}
+
+func (devices *DeviceSet) setupBaseImage() error {
+	oldInfo := devices.Devices[""]
+	if oldInfo != nil && oldInfo.Initialized {
+		return nil
+	}
+
+	if oldInfo != nil && !oldInfo.Initialized {
+		utils.Debugf("Removing uninitialized base image")
+		if err := devices.removeDevice(""); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+
+	utils.Debugf("Initializing base device-manager snapshot")
+
+	id := devices.allocateDeviceId()
+
+	// Create initial device
+	if err := createDevice(devices.getPoolDevName(), id); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	utils.Debugf("Registering base device (id %v) with FS size %v", id, DefaultBaseFsSize)
+	info, err := devices.registerDevice(id, "", DefaultBaseFsSize)
+	if err != nil {
+		_ = deleteDevice(devices.getPoolDevName(), id)
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	utils.Debugf("Creating filesystem on base device-manager snapshot")
+
+	if err = devices.activateDeviceIfNeeded(""); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	if err := devices.createFilesystem(info); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	info.Initialized = true
+	if err = devices.saveMetadata(); err != nil {
+		info.Initialized = false
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	return nil
+}
+
+func setCloseOnExec(name string) {
+	fileInfos, _ := ioutil.ReadDir("/proc/self/fd")
+	if fileInfos != nil {
+		for _, i := range fileInfos {
+			link, _ := os.Readlink(filepath.Join("/proc/self/fd", i.Name()))
+			if link == name {
+				fd, err := strconv.Atoi(i.Name())
+				if err == nil {
+					syscall.CloseOnExec(fd)
+				}
+			}
+		}
+	}
+}
+
+func (devices *DeviceSet) log(level int, file string, line int, dmError int, message string) {
+	if level >= 7 {
+		return // Ignore _LOG_DEBUG
+	}
+
+	utils.Debugf("libdevmapper(%d): %s:%d (%d) %s", level, file, line, dmError, message)
+}
+
+func major (device uint64) uint64 {
+	return  (device >> 8) & 0xfff
+}
+
+func minor (device uint64) uint64 {
+	return (device & 0xff) | ((device >> 12) & 0xfff00)
+}
+
+func (devices *DeviceSet) initDevmapper() error {
+	logInit(devices)
+
+	// Make sure the sparse images exist in <root>/devicemapper/data and
+	// <root>/devicemapper/metadata
+
+	createdLoopback := !devices.hasImage("data") || !devices.hasImage("metadata")
+	data, err := devices.ensureImage("data", DefaultDataLoopbackSize)
+	if err != nil {
+		utils.Debugf("Error device ensureImage (data): %s\n", err)
+		return err
+	}
+	metadata, err := devices.ensureImage("metadata", DefaultMetaDataLoopbackSize)
+	if err != nil {
+		utils.Debugf("Error device ensureImage (metadata): %s\n", err)
+		return err
+	}
+
+	// Set the device prefix from the device id and inode of the docker root dir
+
+	st, err := os.Stat(devices.root)
+	if err != nil {
+		return fmt.Errorf("Error looking up dir %s: %s", devices.root, err)
+	}
+	sysSt := st.Sys().(*syscall.Stat_t)
+	// "reg-" stands for "regular file".
+	// In the future we might use "dev-" for "device file", etc.
+	// docker-maj,min[-inode] stands for:
+	//	- Managed by docker
+	//	- The target of this device is at major <maj> and minor <min>
+	//	- If <inode> is defined, use that file inside the device as a loopback image. Otherwise use the device itself.
+	devices.devicePrefix = fmt.Sprintf("docker-%d:%d-%d", major(sysSt.Dev), minor(sysSt.Dev), sysSt.Ino)
+	utils.Debugf("Generated prefix: %s", devices.devicePrefix)
+
+	// Check for the existence of the device <prefix>-pool
+	utils.Debugf("Checking for existence of the pool '%s'", devices.getPoolName())
+	info, err := getInfo(devices.getPoolName())
+	if info == nil {
+		utils.Debugf("Error device getInfo: %s", err)
+		return err
+	}
+
+	// It seems libdevmapper opens this without O_CLOEXEC, and go exec will not close files
+	// that are not Close-on-exec, and lxc-start will die if it inherits any unexpected files,
+	// so we add this badhack to make sure it closes itself
+	setCloseOnExec("/dev/mapper/control")
+
+	// If the pool doesn't exist, create it
+	if info.Exists == 0 {
+		utils.Debugf("Pool doesn't exist. Creating it.")
+
+		dataFile, err := AttachLoopDevice(data)
+		if err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+		defer dataFile.Close()
+
+		metadataFile, err := AttachLoopDevice(metadata)
+		if err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+		defer metadataFile.Close()
+
+		if err := createPool(devices.getPoolName(), dataFile, metadataFile); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+
+	// If we didn't just create the data or metadata image, we need to
+	// load the metadata from the existing file.
+	if !createdLoopback {
+		if err = devices.loadMetaData(); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+
+	// Setup the base image
+	if err := devices.setupBaseImage(); err != nil {
+		utils.Debugf("Error device setupBaseImage: %s\n", err)
+		return err
+	}
+
+	return nil
+}
+
+func (devices *DeviceSet) AddDevice(hash, baseHash string) error {
+	devices.Lock()
+	defer devices.Unlock()
+
+	if err := devices.ensureInit(); err != nil {
+		utils.Debugf("Error init: %s\n", err)
+		return err
+	}
+
+	if devices.Devices[hash] != nil {
+		return fmt.Errorf("hash %s already exists", hash)
+	}
+
+	baseInfo := devices.Devices[baseHash]
+	if baseInfo == nil {
+		return fmt.Errorf("Error adding device for '%s': can't find device for parent '%s'", hash, baseHash)
+	}
+
+	deviceId := devices.allocateDeviceId()
+
+	if err := devices.createSnapDevice(devices.getPoolDevName(), deviceId, baseInfo.Name(), baseInfo.DeviceId); err != nil {
+		utils.Debugf("Error creating snap device: %s\n", err)
+		return err
+	}
+
+	if _, err := devices.registerDevice(deviceId, hash, baseInfo.Size); err != nil {
+		deleteDevice(devices.getPoolDevName(), deviceId)
+		utils.Debugf("Error registering device: %s\n", err)
+		return err
+	}
+	return nil
+}
+
+func (devices *DeviceSet) removeDevice(hash string) error {
+	info := devices.Devices[hash]
+	if info == nil {
+		return fmt.Errorf("hash %s doesn't exists", hash)
+	}
+
+	devinfo, _ := getInfo(info.Name())
+	if devinfo != nil && devinfo.Exists != 0 {
+		if err := removeDevice(info.Name()); err != nil {
+			utils.Debugf("Error removing device: %s\n", err)
+			return err
+		}
+	}
+
+	if info.Initialized {
+		info.Initialized = false
+		if err := devices.saveMetadata(); err != nil {
+			utils.Debugf("Error saving meta data: %s\n", err)
+			return err
+		}
+	}
+
+	if err := deleteDevice(devices.getPoolDevName(), info.DeviceId); err != nil {
+		utils.Debugf("Error deleting device: %s\n", err)
+		return err
+	}
+
+	devices.allocateTransactionId()
+	delete(devices.Devices, info.Hash)
+
+	if err := devices.saveMetadata(); err != nil {
+		devices.Devices[info.Hash] = info
+		utils.Debugf("Error saving meta data: %s\n", err)
+		return err
+	}
+
+	return nil
+}
+
+func (devices *DeviceSet) RemoveDevice(hash string) error {
+	devices.Lock()
+	defer devices.Unlock()
+
+	if err := devices.ensureInit(); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	return devices.removeDevice(hash)
+}
+
+func (devices *DeviceSet) deactivateDevice(hash string) error {
+	utils.Debugf("[devmapper] deactivateDevice(%s)", hash)
+	defer utils.Debugf("[devmapper] deactivateDevice END")
+	var devname string
+	// FIXME: shouldn't we just register the pool into devices?
+	devname, err := devices.byHash(hash)
+	if err != nil {
+		return err
+	}
+	devinfo, err := getInfo(devname)
+	if err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	if devinfo.Exists != 0 {
+		if err := removeDevice(devname); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+		if err := devices.waitRemove(hash); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// waitRemove blocks until either:
+// a) the device registered at <device_set_prefix>-<hash> is removed,
+// or b) the 1 second timeout expires.
+func (devices *DeviceSet) waitRemove(hash string) error {
+	utils.Debugf("[deviceset %s] waitRemove(%s)", devices.devicePrefix, hash)
+	defer utils.Debugf("[deviceset %s] waitRemove END", devices.devicePrefix, hash)
+	devname, err := devices.byHash(hash)
+	if err != nil {
+		return err
+	}
+	i := 0
+	for ; i < 1000; i += 1 {
+		devinfo, err := getInfo(devname)
+		if err != nil {
+			// If there is an error we assume the device doesn't exist.
+			// The error might actually be something else, but we can't differentiate.
+			return nil
+		}
+		utils.Debugf("Waiting for removal of %s: exists=%d", devname, devinfo.Exists)
+		if devinfo.Exists == 0 {
+			break
+		}
+		time.Sleep(1 * time.Millisecond)
+	}
+	if i == 1000 {
+		return fmt.Errorf("Timeout while waiting for device %s to be removed", devname)
+	}
+	return nil
+}
+
+// waitClose blocks until either:
+// a) the device registered at <device_set_prefix>-<hash> is closed,
+// or b) the 1 second timeout expires.
+func (devices *DeviceSet) waitClose(hash string) error {
+	devname, err := devices.byHash(hash)
+	if err != nil {
+		return err
+	}
+	i := 0
+	for ; i < 1000; i += 1 {
+		devinfo, err := getInfo(devname)
+		if err != nil {
+			return err
+		}
+		utils.Debugf("Waiting for unmount of %s: opencount=%d", devname, devinfo.OpenCount)
+		if devinfo.OpenCount == 0 {
+			break
+		}
+		time.Sleep(1 * time.Millisecond)
+	}
+	if i == 1000 {
+		return fmt.Errorf("Timeout while waiting for device %s to close", devname)
+	}
+	return nil
+}
+
+// byHash is a hack to allow looking up the deviceset's pool by the hash "pool".
+// FIXME: it seems probably cleaner to register the pool in devices.Devices,
+// but I am afraid of arcane implications deep in the devicemapper code,
+// so this will do.
+func (devices *DeviceSet) byHash(hash string) (devname string, err error) {
+	if hash == "pool" {
+		return devices.getPoolDevName(), nil
+	}
+	info := devices.Devices[hash]
+	if info == nil {
+		return "", fmt.Errorf("hash %s doesn't exists", hash)
+	}
+	return info.Name(), nil
+}
+
+func (devices *DeviceSet) Shutdown() error {
+	utils.Debugf("[deviceset %s] shutdown()", devices.devicePrefix)
+	defer utils.Debugf("[deviceset %s] shutdown END", devices.devicePrefix)
+	devices.Lock()
+	utils.Debugf("[devmapper] Shutting down DeviceSet: %s", devices.root)
+	defer devices.Unlock()
+
+	if !devices.initialized {
+		return nil
+	}
+
+	for path, count := range devices.activeMounts {
+		for i := count; i > 0; i-- {
+			if err := syscall.Unmount(path, 0); err != nil {
+				utils.Debugf("Shutdown unmounting %s, error: %s\n", path, err)
+			}
+		}
+		delete(devices.activeMounts, path)
+	}
+
+	for _, d := range devices.Devices {
+		if err := devices.waitClose(d.Hash); err != nil {
+			utils.Errorf("Warning: error waiting for device %s to unmount: %s\n", d.Hash, err)
+		}
+		if err := devices.deactivateDevice(d.Hash); err != nil {
+			utils.Debugf("Shutdown deactivate %s , error: %s\n", d.Hash, err)
+		}
+	}
+
+	pool := devices.getPoolDevName()
+	if devinfo, err := getInfo(pool); err == nil && devinfo.Exists != 0 {
+		if err := devices.deactivateDevice("pool"); err != nil {
+			utils.Debugf("Shutdown deactivate %s , error: %s\n", pool, err)
+		}
+	}
+
+	return nil
+}
+
+func (devices *DeviceSet) MountDevice(hash, path string, readOnly bool) error {
+	devices.Lock()
+	defer devices.Unlock()
+
+	if err := devices.ensureInit(); err != nil {
+		return fmt.Errorf("Error initializing devmapper: %s", err)
+	}
+
+	if err := devices.activateDeviceIfNeeded(hash); err != nil {
+		return fmt.Errorf("Error activating devmapper device for '%s': %s", hash, err)
+	}
+
+	info := devices.Devices[hash]
+
+	var flags uintptr = syscall.MS_MGC_VAL
+
+	if readOnly {
+		flags = flags | syscall.MS_RDONLY
+	}
+
+	err := syscall.Mount(info.DevName(), path, "ext4", flags, "discard")
+	if err != nil && err == syscall.EINVAL {
+		err = syscall.Mount(info.DevName(), path, "ext4", flags, "")
+	}
+	if err != nil {
+		return fmt.Errorf("Error mounting '%s' on '%s': %s", info.DevName(), path, err)
+	}
+
+	count := devices.activeMounts[path]
+	devices.activeMounts[path] = count + 1
+
+	return nil
+}
+
+func (devices *DeviceSet) UnmountDevice(hash, path string, deactivate bool) error {
+	utils.Debugf("[devmapper] UnmountDevice(hash=%s path=%s)", hash, path)
+	defer utils.Debugf("[devmapper] UnmountDevice END")
+	devices.Lock()
+	defer devices.Unlock()
+
+	utils.Debugf("[devmapper] Unmount(%s)", path)
+	if err := syscall.Unmount(path, 0); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+	utils.Debugf("[devmapper] Unmount done")
+	// Wait for the unmount to be effective,
+	// by watching the value of Info.OpenCount for the device
+	if err := devices.waitClose(hash); err != nil {
+		return err
+	}
+
+	if count := devices.activeMounts[path]; count > 1 {
+		devices.activeMounts[path] = count - 1
+	} else {
+		delete(devices.activeMounts, path)
+	}
+
+	if deactivate {
+		devices.deactivateDevice(hash)
+	}
+
+	return nil
+}
+
+func (devices *DeviceSet) HasDevice(hash string) bool {
+	devices.Lock()
+	defer devices.Unlock()
+
+	if err := devices.ensureInit(); err != nil {
+		return false
+	}
+	return devices.Devices[hash] != nil
+}
+
+func (devices *DeviceSet) HasInitializedDevice(hash string) bool {
+	devices.Lock()
+	defer devices.Unlock()
+
+	if err := devices.ensureInit(); err != nil {
+		return false
+	}
+
+	info := devices.Devices[hash]
+	return info != nil && info.Initialized
+}
+
+func (devices *DeviceSet) HasActivatedDevice(hash string) bool {
+	devices.Lock()
+	defer devices.Unlock()
+
+	if err := devices.ensureInit(); err != nil {
+		return false
+	}
+
+	info := devices.Devices[hash]
+	if info == nil {
+		return false
+	}
+	devinfo, _ := getInfo(info.Name())
+	return devinfo != nil && devinfo.Exists != 0
+}
+
+func (devices *DeviceSet) SetInitialized(hash string) error {
+	devices.Lock()
+	defer devices.Unlock()
+
+	if err := devices.ensureInit(); err != nil {
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	info := devices.Devices[hash]
+	if info == nil {
+		return fmt.Errorf("Unknown device %s", hash)
+	}
+
+	info.Initialized = true
+	if err := devices.saveMetadata(); err != nil {
+		info.Initialized = false
+		utils.Debugf("\n--->Err: %s\n", err)
+		return err
+	}
+
+	return nil
+}
+
+func (devices *DeviceSet) Status() *Status {
+	devices.Lock()
+	defer devices.Unlock()
+
+	status := &Status {}
+
+	if err := devices.ensureInit(); err != nil {
+		return status
+	}
+
+	status.PoolName = devices.getPoolName()
+	status.DataLoopback = path.Join( devices.loopbackDir(), "data")
+	status.MetadataLoopback = path.Join( devices.loopbackDir(), "metadata")
+
+	_, totalSizeInSectors, _, params, err := getStatus(devices.getPoolName())
+	if err == nil {
+		var transactionId, dataUsed, dataTotal, metadataUsed, metadataTotal uint64
+		if _, err := fmt.Sscanf(params, "%d %d/%d %d/%d", &transactionId, &metadataUsed, &metadataTotal, &dataUsed, &dataTotal); err == nil {
+			// Convert from blocks to bytes
+			blockSizeInSectors := totalSizeInSectors / dataTotal;
+
+			status.Data.Used = dataUsed * blockSizeInSectors * 512
+			status.Data.Total = dataTotal * blockSizeInSectors * 512
+
+			// metadata blocks are always 4k
+			status.Metadata.Used = metadataUsed * 4096
+			status.Metadata.Total = metadataTotal * 4096
+		}
+	}
+
+	return status
+}
+
+func (devices *DeviceSet) ensureInit() error {
+	if !devices.initialized {
+		devices.initialized = true
+		if err := devices.initDevmapper(); err != nil {
+			utils.Debugf("\n--->Err: %s\n", err)
+			return err
+		}
+	}
+	return nil
+}
+
+func NewDeviceSet(root string) *DeviceSet {
+	SetDevDir("/dev")
+
+	return &DeviceSet{
+		initialized:  false,
+		root:         root,
+		MetaData:     MetaData{Devices: make(map[string]*DevInfo)},
+		activeMounts: make(map[string]int),
+	}
+}

+ 709 - 0
devmapper/devmapper.go

@@ -0,0 +1,709 @@
+package devmapper
+
+/*
+#cgo LDFLAGS: -L. -ldevmapper
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libdevmapper.h>
+#include <linux/loop.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <errno.h>
+
+#ifndef LOOP_CTL_GET_FREE
+#define LOOP_CTL_GET_FREE       0x4C82
+#endif
+
+// FIXME: this could easily be rewritten in go
+char*			attach_loop_device(const char *filename, int *loop_fd_out)
+{
+  struct loop_info64	loopinfo = {0};
+  struct stat		st;
+  char			buf[64];
+  int			i, loop_fd, fd, start_index;
+  char*			loopname;
+
+
+  *loop_fd_out = -1;
+
+  start_index = 0;
+  fd = open("/dev/loop-control", O_RDONLY);
+  if (fd >= 0) {
+    start_index = ioctl(fd, LOOP_CTL_GET_FREE);
+    close(fd);
+
+    if (start_index < 0)
+      start_index = 0;
+  }
+
+  fd = open(filename, O_RDWR);
+  if (fd < 0) {
+    perror("open");
+    return NULL;
+  }
+
+  loop_fd = -1;
+  for (i = start_index ; loop_fd < 0 ; i++ ) {
+    if (sprintf(buf, "/dev/loop%d", i) < 0) {
+	close(fd);
+	return NULL;
+    }
+
+    if (stat(buf, &st)) {
+      if (!S_ISBLK(st.st_mode)) {
+	 fprintf(stderr, "[error] Loopback device %s is not a block device.\n", buf);
+      } else if (errno == ENOENT) {
+	fprintf(stderr, "[error] There are no more loopback device available.\n");
+      } else {
+	fprintf(stderr, "[error] Unkown error trying to stat the loopback device %s (errno: %d).\n", buf, errno);
+      }
+      close(fd);
+      return NULL;
+    }
+
+    loop_fd = open(buf, O_RDWR);
+    if (loop_fd < 0 && errno == ENOENT) {
+      fprintf(stderr, "[error] The loopback device %s does not exists.\n", buf);
+      close(fd);
+      return NULL;
+    } else if (loop_fd < 0) {
+	fprintf(stderr, "[error] Unkown error openning the loopback device %s. (errno: %d)\n", buf, errno);
+	continue;
+    }
+
+    if (ioctl(loop_fd, LOOP_SET_FD, (void *)(size_t)fd) < 0) {
+      int errsv = errno;
+      close(loop_fd);
+      loop_fd = -1;
+      if (errsv != EBUSY) {
+        close(fd);
+        fprintf(stderr, "cannot set up loopback device %s: %s", buf, strerror(errsv));
+        return NULL;
+      }
+      continue;
+    }
+
+    close(fd);
+
+    strncpy((char*)loopinfo.lo_file_name, buf, LO_NAME_SIZE);
+    loopinfo.lo_offset = 0;
+    loopinfo.lo_flags = LO_FLAGS_AUTOCLEAR;
+
+    if (ioctl(loop_fd, LOOP_SET_STATUS64, &loopinfo) < 0) {
+      perror("ioctl LOOP_SET_STATUS64");
+      if (ioctl(loop_fd, LOOP_CLR_FD, 0) < 0) {
+        perror("ioctl LOOP_CLR_FD");
+      }
+      close(loop_fd);
+      fprintf (stderr, "cannot set up loopback device info");
+      return (NULL);
+    }
+
+    loopname = strdup(buf);
+    if (loopname == NULL) {
+      close(loop_fd);
+      return (NULL);
+    }
+
+    *loop_fd_out = loop_fd;
+    return (loopname);
+  }
+
+  return (NULL);
+}
+
+static int64_t	get_block_size(int fd)
+{
+  uint64_t	size;
+
+  if (ioctl(fd, BLKGETSIZE64, &size) == -1)
+    return -1;
+  return ((int64_t)size);
+}
+
+extern void DevmapperLogCallback(int level, char *file, int line, int dm_errno_or_class, char *str);
+
+static void
+log_cb(int level, const char *file, int line,
+       int dm_errno_or_class, const char *f, ...)
+{
+  char buffer[256];
+  va_list ap;
+
+  va_start(ap, f);
+  vsnprintf(buffer, 256, f, ap);
+  va_end(ap);
+
+  DevmapperLogCallback(level, (char *)file, line, dm_errno_or_class, buffer);
+}
+
+static void
+log_with_errno_init ()
+{
+  dm_log_with_errno_init(log_cb);
+}
+
+*/
+import "C"
+
+import (
+	"errors"
+	"fmt"
+	"github.com/dotcloud/docker/utils"
+	"os"
+	"runtime"
+	"syscall"
+	"unsafe"
+)
+
+type DevmapperLogger interface {
+	log(level int, file string, line int, dmError int, message string)
+}
+
+const (
+	DeviceCreate TaskType = iota
+	DeviceReload
+	DeviceRemove
+	DeviceRemoveAll
+	DeviceSuspend
+	DeviceResume
+	DeviceInfo
+	DeviceDeps
+	DeviceRename
+	DeviceVersion
+	DeviceStatus
+	DeviceTable
+	DeviceWaitevent
+	DeviceList
+	DeviceClear
+	DeviceMknodes
+	DeviceListVersions
+	DeviceTargetMsg
+	DeviceSetGeometry
+)
+
+const (
+	AddNodeOnResume AddNodeType = iota
+	AddNodeOnCreate
+)
+
+var (
+	ErrTaskRun              = errors.New("dm_task_run failed")
+	ErrTaskSetName          = errors.New("dm_task_set_name failed")
+	ErrTaskSetMessage       = errors.New("dm_task_set_message failed")
+	ErrTaskSetAddNode       = errors.New("dm_task_set_add_node failed")
+	ErrTaskSetRO            = errors.New("dm_task_set_ro failed")
+	ErrTaskAddTarget        = errors.New("dm_task_add_target failed")
+	ErrGetDriverVersion     = errors.New("dm_task_get_driver_version failed")
+	ErrAttachLoopbackDevice = errors.New("loopback mounting failed")
+	ErrGetBlockSize         = errors.New("Can't get block size")
+	ErrUdevWait             = errors.New("wait on udev cookie failed")
+	ErrSetDevDir            = errors.New("dm_set_dev_dir failed")
+	ErrGetLibraryVersion    = errors.New("dm_get_library_version failed")
+	ErrCreateRemoveTask     = errors.New("Can't create task of type DeviceRemove")
+	ErrRunRemoveDevice      = errors.New("running removeDevice failed")
+)
+
+type (
+	Task struct {
+		unmanaged *C.struct_dm_task
+	}
+	Info struct {
+		Exists        int
+		Suspended     int
+		LiveTable     int
+		InactiveTable int
+		OpenCount     int32
+		EventNr       uint32
+		Major         uint32
+		Minor         uint32
+		ReadOnly      int
+		TargetCount   int32
+	}
+	TaskType    int
+	AddNodeType int
+)
+
+func (t *Task) destroy() {
+	if t != nil {
+		C.dm_task_destroy(t.unmanaged)
+		runtime.SetFinalizer(t, nil)
+	}
+}
+
+func TaskCreate(tasktype TaskType) *Task {
+	c_task := C.dm_task_create(C.int(tasktype))
+	if c_task == nil {
+		return nil
+	}
+	task := &Task{unmanaged: c_task}
+	runtime.SetFinalizer(task, (*Task).destroy)
+	return task
+}
+
+func (t *Task) Run() error {
+	if res := C.dm_task_run(t.unmanaged); res != 1 {
+		return ErrTaskRun
+	}
+	return nil
+}
+
+func (t *Task) SetName(name string) error {
+	c_name := C.CString(name)
+	defer free(c_name)
+
+	if res := C.dm_task_set_name(t.unmanaged, c_name); res != 1 {
+		return ErrTaskSetName
+	}
+	return nil
+}
+
+func (t *Task) SetMessage(message string) error {
+	c_message := C.CString(message)
+	defer free(c_message)
+
+	if res := C.dm_task_set_message(t.unmanaged, c_message); res != 1 {
+		return ErrTaskSetMessage
+	}
+	return nil
+}
+
+func (t *Task) SetSector(sector uint64) error {
+	if res := C.dm_task_set_sector(t.unmanaged, C.uint64_t(sector)); res != 1 {
+		return ErrTaskSetAddNode
+	}
+	return nil
+}
+
+func (t *Task) SetCookie(cookie *uint32, flags uint16) error {
+	c_cookie := C.uint32_t(*cookie)
+	if res := C.dm_task_set_cookie(t.unmanaged, &c_cookie, C.uint16_t(flags)); res != 1 {
+		return ErrTaskSetAddNode
+	}
+	*cookie = uint32(c_cookie)
+	return nil
+}
+
+func (t *Task) SetAddNode(add_node AddNodeType) error {
+	if res := C.dm_task_set_add_node(t.unmanaged, C.dm_add_node_t(add_node)); res != 1 {
+		return ErrTaskSetAddNode
+	}
+	return nil
+}
+
+func (t *Task) SetRo() error {
+	if res := C.dm_task_set_ro(t.unmanaged); res != 1 {
+		return ErrTaskSetRO
+	}
+	return nil
+}
+
+func (t *Task) AddTarget(start uint64, size uint64, ttype string, params string) error {
+	c_ttype := C.CString(ttype)
+	defer free(c_ttype)
+
+	c_params := C.CString(params)
+	defer free(c_params)
+
+	if res := C.dm_task_add_target(t.unmanaged, C.uint64_t(start), C.uint64_t(size), c_ttype, c_params); res != 1 {
+		return ErrTaskAddTarget
+	}
+	return nil
+}
+
+func (t *Task) GetDriverVersion() (string, error) {
+	buffer := C.CString(string(make([]byte, 128)))
+	defer free(buffer)
+
+	if res := C.dm_task_get_driver_version(t.unmanaged, buffer, 128); res != 1 {
+		return "", ErrGetDriverVersion
+	}
+	return C.GoString(buffer), nil
+}
+
+func (t *Task) GetInfo() (*Info, error) {
+	c_info := C.struct_dm_info{}
+	if res := C.dm_task_get_info(t.unmanaged, &c_info); res != 1 {
+		return nil, ErrGetDriverVersion
+	}
+	return &Info{
+		Exists:        int(c_info.exists),
+		Suspended:     int(c_info.suspended),
+		LiveTable:     int(c_info.live_table),
+		InactiveTable: int(c_info.inactive_table),
+		OpenCount:     int32(c_info.open_count),
+		EventNr:       uint32(c_info.event_nr),
+		Major:         uint32(c_info.major),
+		Minor:         uint32(c_info.minor),
+		ReadOnly:      int(c_info.read_only),
+		TargetCount:   int32(c_info.target_count),
+	}, nil
+}
+
+func (t *Task) GetNextTarget(next uintptr) (uintptr, uint64, uint64, string, string) {
+	var (
+		c_start, c_length       C.uint64_t
+		c_target_type, c_params *C.char
+	)
+
+	nextp := C.dm_get_next_target(t.unmanaged, unsafe.Pointer(next), &c_start, &c_length, &c_target_type, &c_params)
+	return uintptr(nextp), uint64(c_start), uint64(c_length), C.GoString(c_target_type), C.GoString(c_params)
+}
+
+func AttachLoopDevice(filename string) (*os.File, error) {
+	c_filename := C.CString(filename)
+	defer free(c_filename)
+
+	var fd C.int
+	res := C.attach_loop_device(c_filename, &fd)
+	if res == nil {
+		if os.Getenv("DEBUG") != "" {
+			C.perror(C.CString(fmt.Sprintf("[debug] Error attach_loop_device(%s, %d)", filename, int(fd))))
+		}
+		return nil, ErrAttachLoopbackDevice
+	}
+	defer free(res)
+
+	return os.NewFile(uintptr(fd), C.GoString(res)), nil
+}
+
+func getBlockSize(fd uintptr) int {
+	var size uint64
+
+	if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, C.BLKGETSIZE64, uintptr(unsafe.Pointer(&size))); err != 0 {
+		utils.Debugf("Error ioctl: %s", err)
+		return -1
+	}
+	return int(size)
+}
+
+func GetBlockDeviceSize(file *os.File) (uint64, error) {
+	if size := C.get_block_size(C.int(file.Fd())); size == -1 {
+		return 0, ErrGetBlockSize
+	} else {
+		return uint64(size), nil
+	}
+}
+
+func UdevWait(cookie uint32) error {
+	if res := C.dm_udev_wait(C.uint32_t(cookie)); res != 1 {
+		utils.Debugf("Failed to wait on udev cookie %d", cookie)
+		return ErrUdevWait
+	}
+	return nil
+}
+
+func LogInitVerbose(level int) {
+	C.dm_log_init_verbose(C.int(level))
+}
+
+var dmLogger DevmapperLogger = nil
+
+func logInit(logger DevmapperLogger) {
+	dmLogger = logger
+	C.log_with_errno_init()
+}
+
+func SetDevDir(dir string) error {
+	c_dir := C.CString(dir)
+	defer free(c_dir)
+
+	if res := C.dm_set_dev_dir(c_dir); res != 1 {
+		utils.Debugf("Error dm_set_dev_dir")
+		return ErrSetDevDir
+	}
+	return nil
+}
+
+func GetLibraryVersion() (string, error) {
+	buffer := C.CString(string(make([]byte, 128)))
+	defer free(buffer)
+
+	if res := C.dm_get_library_version(buffer, 128); res != 1 {
+		return "", ErrGetLibraryVersion
+	}
+	return C.GoString(buffer), nil
+}
+
+// Useful helper for cleanup
+func RemoveDevice(name string) error {
+	task := TaskCreate(DeviceRemove)
+	if task == nil {
+		return ErrCreateRemoveTask
+	}
+	if err := task.SetName(name); err != nil {
+		utils.Debugf("Can't set task name %s", name)
+		return err
+	}
+	if err := task.Run(); err != nil {
+		return ErrRunRemoveDevice
+	}
+	return nil
+}
+
+func free(p *C.char) {
+	C.free(unsafe.Pointer(p))
+}
+
+// This is the programmatic example of "dmsetup create"
+func createPool(poolName string, dataFile *os.File, metadataFile *os.File) error {
+	task, err := createTask(DeviceCreate, poolName)
+	if task == nil {
+		return err
+	}
+
+	size, err := GetBlockDeviceSize(dataFile)
+	if err != nil {
+		return fmt.Errorf("Can't get data size")
+	}
+
+	params := metadataFile.Name() + " " + dataFile.Name() + " 128 32768"
+	if err := task.AddTarget(0, size/512, "thin-pool", params); err != nil {
+		return fmt.Errorf("Can't add target")
+	}
+
+	var cookie uint32 = 0
+	if err := task.SetCookie(&cookie, 0); err != nil {
+		return fmt.Errorf("Can't set cookie")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running DeviceCreate")
+	}
+
+	UdevWait(cookie)
+
+	return nil
+}
+
+func createTask(t TaskType, name string) (*Task, error) {
+	task := TaskCreate(t)
+	if task == nil {
+		return nil, fmt.Errorf("Can't create task of type %d", int(t))
+	}
+	if err := task.SetName(name); err != nil {
+		return nil, fmt.Errorf("Can't set task name %s", name)
+	}
+	return task, nil
+}
+
+func getInfo(name string) (*Info, error) {
+	task, err := createTask(DeviceInfo, name)
+	if task == nil {
+		return nil, err
+	}
+	if err := task.Run(); err != nil {
+		return nil, err
+	}
+	return task.GetInfo()
+}
+
+func getStatus(name string) (uint64, uint64, string, string, error) {
+	task, err := createTask(DeviceStatus, name)
+	if task == nil {
+		utils.Debugf("getStatus: Error createTask: %s", err)
+		return 0, 0, "", "", err
+	}
+	if err := task.Run(); err != nil {
+		utils.Debugf("getStatus: Error Run: %s", err)
+		return 0, 0, "", "", err
+	}
+
+	devinfo, err := task.GetInfo()
+	if err != nil {
+		utils.Debugf("getStatus: Error GetInfo: %s", err)
+		return 0, 0, "", "", err
+	}
+	if devinfo.Exists == 0 {
+		utils.Debugf("getStatus: Non existing device %s", name)
+		return 0, 0, "", "", fmt.Errorf("Non existing device %s", name)
+	}
+
+	_, start, length, target_type, params := task.GetNextTarget(0)
+	return start, length, target_type, params, nil
+}
+
+func setTransactionId(poolName string, oldId uint64, newId uint64) error {
+	task, err := createTask(DeviceTargetMsg, poolName)
+	if task == nil {
+		return err
+	}
+
+	if err := task.SetSector(0); err != nil {
+		return fmt.Errorf("Can't set sector")
+	}
+
+	if err := task.SetMessage(fmt.Sprintf("set_transaction_id %d %d", oldId, newId)); err != nil {
+		return fmt.Errorf("Can't set message")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running setTransactionId")
+	}
+	return nil
+}
+
+func suspendDevice(name string) error {
+	task, err := createTask(DeviceSuspend, name)
+	if task == nil {
+		return err
+	}
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running DeviceSuspend")
+	}
+	return nil
+}
+
+func resumeDevice(name string) error {
+	task, err := createTask(DeviceResume, name)
+	if task == nil {
+		return err
+	}
+
+	var cookie uint32 = 0
+	if err := task.SetCookie(&cookie, 0); err != nil {
+		return fmt.Errorf("Can't set cookie")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running DeviceSuspend")
+	}
+
+	UdevWait(cookie)
+
+	return nil
+}
+
+func createDevice(poolName string, deviceId int) error {
+	utils.Debugf("[devmapper] createDevice(poolName=%v, deviceId=%v)", poolName, deviceId)
+	task, err := createTask(DeviceTargetMsg, poolName)
+	if task == nil {
+		return err
+	}
+
+	if err := task.SetSector(0); err != nil {
+		return fmt.Errorf("Can't set sector")
+	}
+
+	if err := task.SetMessage(fmt.Sprintf("create_thin %d", deviceId)); err != nil {
+		return fmt.Errorf("Can't set message")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running createDevice")
+	}
+	return nil
+}
+
+func deleteDevice(poolName string, deviceId int) error {
+	task, err := createTask(DeviceTargetMsg, poolName)
+	if task == nil {
+		return err
+	}
+
+	if err := task.SetSector(0); err != nil {
+		return fmt.Errorf("Can't set sector")
+	}
+
+	if err := task.SetMessage(fmt.Sprintf("delete %d", deviceId)); err != nil {
+		return fmt.Errorf("Can't set message")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running deleteDevice")
+	}
+	return nil
+}
+
+func removeDevice(name string) error {
+	utils.Debugf("[devmapper] removeDevice START")
+	defer utils.Debugf("[devmapper] removeDevice END")
+	task, err := createTask(DeviceRemove, name)
+	if task == nil {
+		return err
+	}
+	if err = task.Run(); err != nil {
+		return fmt.Errorf("Error running removeDevice")
+	}
+	return nil
+}
+
+func activateDevice(poolName string, name string, deviceId int, size uint64) error {
+	task, err := createTask(DeviceCreate, name)
+	if task == nil {
+		return err
+	}
+
+	params := fmt.Sprintf("%s %d", poolName, deviceId)
+	if err := task.AddTarget(0, size/512, "thin", params); err != nil {
+		return fmt.Errorf("Can't add target")
+	}
+	if err := task.SetAddNode(AddNodeOnCreate); err != nil {
+		return fmt.Errorf("Can't add node")
+	}
+
+	var cookie uint32 = 0
+	if err := task.SetCookie(&cookie, 0); err != nil {
+		return fmt.Errorf("Can't set cookie")
+	}
+
+	if err := task.Run(); err != nil {
+		return fmt.Errorf("Error running DeviceCreate")
+	}
+
+	UdevWait(cookie)
+
+	return nil
+}
+
+func (devices *DeviceSet) createSnapDevice(poolName string, deviceId int, baseName string, baseDeviceId int) error {
+	devinfo, _ := getInfo(baseName)
+	doSuspend := devinfo != nil && devinfo.Exists != 0
+
+	if doSuspend {
+		if err := suspendDevice(baseName); err != nil {
+			return err
+		}
+	}
+
+	task, err := createTask(DeviceTargetMsg, poolName)
+	if task == nil {
+		if doSuspend {
+			resumeDevice(baseName)
+		}
+		return err
+	}
+
+	if err := task.SetSector(0); err != nil {
+		if doSuspend {
+			resumeDevice(baseName)
+		}
+		return fmt.Errorf("Can't set sector")
+	}
+
+	if err := task.SetMessage(fmt.Sprintf("create_snap %d %d", deviceId, baseDeviceId)); err != nil {
+		if doSuspend {
+			resumeDevice(baseName)
+		}
+		return fmt.Errorf("Can't set message")
+	}
+
+	if err := task.Run(); err != nil {
+		if doSuspend {
+			resumeDevice(baseName)
+		}
+		return fmt.Errorf("Error running DeviceCreate")
+	}
+
+	if doSuspend {
+		if err := resumeDevice(baseName); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}

+ 13 - 0
devmapper/devmapper_log.go

@@ -0,0 +1,13 @@
+package devmapper
+
+import "C"
+
+// Due to the way cgo works this has to be in a separate file, as devmapper.go has
+// definitions in the cgo block, which is incompatible with using "//export"
+
+//export DevmapperLogCallback
+func DevmapperLogCallback(level C.int, file *C.char, line C.int, dm_errno_or_class C.int, message *C.char) {
+	if dmLogger != nil {
+		dmLogger.log(int(level), C.GoString(file), int(line), int(dm_errno_or_class), C.GoString(message))
+	}
+}

+ 62 - 0
devmapper/docker-device-tool/device_tool.go

@@ -0,0 +1,62 @@
+package main
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/devmapper"
+	"os"
+)
+
+func usage() {
+	fmt.Printf("Usage: %s [snap new-id base-id] | [remove id] | [mount id mountpoint]\n", os.Args[0])
+	os.Exit(1)
+}
+
+func main() {
+	devices := devmapper.NewDeviceSet("/var/lib/docker")
+
+	if len(os.Args) < 2 {
+		usage()
+	}
+
+	cmd := os.Args[1]
+	if cmd == "snap" {
+		if len(os.Args) < 4 {
+			usage()
+		}
+
+		err := devices.AddDevice(os.Args[2], os.Args[3])
+		if err != nil {
+			fmt.Println("Can't create snap device: ", err)
+			os.Exit(1)
+		}
+	} else if cmd == "remove" {
+		if len(os.Args) < 3 {
+			usage()
+		}
+
+		err := devices.RemoveDevice(os.Args[2])
+		if err != nil {
+			fmt.Println("Can't remove device: ", err)
+			os.Exit(1)
+		}
+	} else if cmd == "mount" {
+		if len(os.Args) < 4 {
+			usage()
+		}
+
+		err := devices.MountDevice(os.Args[2], os.Args[3])
+		if err != nil {
+			fmt.Println("Can't create snap device: ", err)
+			os.Exit(1)
+		}
+	} else {
+		fmt.Printf("Unknown command %s\n", cmd)
+		if len(os.Args) < 4 {
+			usage()
+		}
+
+		os.Exit(1)
+	}
+
+	return
+}

+ 58 - 0
utils.go

@@ -1,11 +1,36 @@
 package docker
 
+/*
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <errno.h>
+
+// See linux.git/fs/btrfs/ioctl.h
+#define BTRFS_IOCTL_MAGIC 0x94
+#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
+
+int
+btrfs_reflink(int fd_out, int fd_in)
+{
+  int res;
+  res = ioctl(fd_out, BTRFS_IOC_CLONE, fd_in);
+  if (res < 0)
+    return errno;
+  return 0;
+}
+
+*/
+import "C"
 import (
 	"fmt"
 	"github.com/dotcloud/docker/namesgenerator"
 	"github.com/dotcloud/docker/utils"
 	"strconv"
+	"io"
+	"io/ioutil"
+	"os"
 	"strings"
+	"syscall"
 )
 
 // Compare two Config struct. Do not compare the "Image" nor "Hostname" fields
@@ -289,6 +314,28 @@ func migratePortMappings(config *Config, hostConfig *HostConfig) error {
 	return nil
 }
 
+func RootIsShared() bool {
+	if data, err := ioutil.ReadFile("/proc/self/mountinfo"); err == nil {
+		for _, line := range strings.Split(string(data), "\n") {
+			cols := strings.Split(line, " ")
+			if len(cols) >= 6 && cols[4] == "/" {
+				return strings.HasPrefix(cols[6], "shared")
+			}
+		}
+	}
+
+	// No idea, probably safe to assume so
+	return true
+}
+
+func BtrfsReflink(fd_out, fd_in uintptr) error {
+	res := C.btrfs_reflink(C.int(fd_out), C.int(fd_in))
+	if res != 0 {
+		return syscall.Errno(res)
+	}
+	return nil
+}
+
 // Links come in the format of
 // name:alias
 func parseLink(rawLink string) (map[string]string, error) {
@@ -307,3 +354,14 @@ func (c *checker) Exists(name string) bool {
 func generateRandomName(runtime *Runtime) (string, error) {
 	return namesgenerator.GenerateRandomName(&checker{runtime})
 }
+
+func CopyFile(dstFile, srcFile *os.File) error {
+	err := BtrfsReflink(dstFile.Fd(), srcFile.Fd())
+	if err == nil {
+		return nil
+	}
+
+	// Fall back to normal copy
+	_, err = io.Copy(dstFile, srcFile)
+	return err
+}

+ 35 - 0
utils/utils.go

@@ -1114,6 +1114,41 @@ func (e *StatusError) Error() string {
 	return fmt.Sprintf("Status: %d", e.Status)
 }
 
+func quote(word string, buf *bytes.Buffer) {
+	// Bail out early for "simple" strings
+	if word != "" && !strings.ContainsAny(word, "\\'\"`${[|&;<>()~*?! \t\n") {
+		buf.WriteString(word)
+		return
+	}
+
+	buf.WriteString("'")
+
+	for i := 0; i < len(word); i++ {
+		b := word[i]
+		if b == '\'' {
+			// Replace literal ' with a close ', a \', and a open '
+			buf.WriteString("'\\''")
+		} else {
+			buf.WriteByte(b)
+		}
+	}
+
+	buf.WriteString("'")
+}
+
+// Take a list of strings and escape them so they will be handled right
+// when passed as arguments to an program via a shell
+func ShellQuoteArguments(args []string) string {
+	var buf bytes.Buffer
+	for i, arg := range args {
+		if i != 0 {
+			buf.WriteByte(' ')
+		}
+		quote(arg, &buf)
+	}
+	return buf.String()
+}
+
 func IsClosedError(err error) bool {
 	/* This comparison is ugly, but unfortunately, net.go doesn't export errClosing.
 	 * See:

+ 2 - 0
utils_test.go

@@ -62,7 +62,9 @@ func newTestRuntime(prefix string) (runtime *Runtime, err error) {
 	if err := os.Remove(root); err != nil {
 		return nil, err
 	}
+	utils.Debugf("Copying %s to %s", unitTestStoreBase, root)
 	if err := utils.CopyDirectory(unitTestStoreBase, root); err != nil {
+		utils.Debugf("ERROR: Copying %s to %s returned %s", unitTestStoreBase, root, err)
 		return nil, err
 	}