Explorar o código

Merge pull request #5411 from crosbymichael/lockdown

Update default restrictions for exec drivers
unclejack hai 11 anos
pai
achega
44140f7909

+ 21 - 13
daemon/execdriver/lxc/driver.go

@@ -59,9 +59,10 @@ func init() {
 }
 
 type driver struct {
-	root       string // root path for the driver to use
-	apparmor   bool
-	sharedRoot bool
+	root            string // root path for the driver to use
+	apparmor        bool
+	sharedRoot      bool
+	restrictionPath string
 }
 
 func NewDriver(root string, apparmor bool) (*driver, error) {
@@ -69,10 +70,15 @@ func NewDriver(root string, apparmor bool) (*driver, error) {
 	if err := linkLxcStart(root); err != nil {
 		return nil, err
 	}
+	restrictionPath := filepath.Join(root, "empty")
+	if err := os.MkdirAll(restrictionPath, 0700); err != nil {
+		return nil, err
+	}
 	return &driver{
-		apparmor:   apparmor,
-		root:       root,
-		sharedRoot: rootIsShared(),
+		apparmor:        apparmor,
+		root:            root,
+		sharedRoot:      rootIsShared(),
+		restrictionPath: restrictionPath,
 	}, nil
 }
 
@@ -403,14 +409,16 @@ func (d *driver) generateLXCConfig(c *execdriver.Command) (string, error) {
 
 	if err := LxcTemplateCompiled.Execute(fo, struct {
 		*execdriver.Command
-		AppArmor     bool
-		ProcessLabel string
-		MountLabel   string
+		AppArmor          bool
+		ProcessLabel      string
+		MountLabel        string
+		RestrictionSource string
 	}{
-		Command:      c,
-		AppArmor:     d.apparmor,
-		ProcessLabel: process,
-		MountLabel:   mount,
+		Command:           c,
+		AppArmor:          d.apparmor,
+		ProcessLabel:      process,
+		MountLabel:        mount,
+		RestrictionSource: d.restrictionPath,
 	}); err != nil {
 		return "", err
 	}

+ 10 - 1
daemon/execdriver/lxc/lxc_template.go

@@ -88,7 +88,9 @@ lxc.mount.entry = proc {{escapeFstabSpaces $ROOTFS}}/proc proc nosuid,nodev,noex
 
 # WARNING: sysfs is a known attack vector and should probably be disabled
 # if your userspace allows it. eg. see http://bit.ly/T9CkqJ
+{{if .Privileged}}
 lxc.mount.entry = sysfs {{escapeFstabSpaces $ROOTFS}}/sys sysfs nosuid,nodev,noexec 0 0
+{{end}}
 
 {{if .Tty}}
 lxc.mount.entry = {{.Console}} {{escapeFstabSpaces $ROOTFS}}/dev/console none bind,rw 0 0
@@ -109,8 +111,15 @@ lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabS
 {{if .AppArmor}}
 lxc.aa_profile = unconfined
 {{else}}
-#lxc.aa_profile = unconfined
+# not unconfined
 {{end}}
+{{else}}
+# restrict access to proc
+lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/sys none bind,ro 0 0
+lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/irq none bind,ro 0 0
+lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/acpi none bind,ro 0 0
+lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/sysrq-trigger none bind,ro 0 0
+lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/kcore none bind,ro 0 0
 {{end}}
 
 # limits

+ 15 - 1
daemon/execdriver/native/create.go

@@ -25,6 +25,7 @@ func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Container
 	container.Cgroups.Name = c.ID
 	// check to see if we are running in ramdisk to disable pivot root
 	container.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != ""
+	container.Context["restriction_path"] = d.restrictionPath
 
 	if err := d.createNetwork(container, c); err != nil {
 		return nil, err
@@ -33,6 +34,8 @@ func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Container
 		if err := d.setPrivileged(container); err != nil {
 			return nil, err
 		}
+	} else {
+		container.Mounts = append(container.Mounts, libcontainer.Mount{Type: "devtmpfs"})
 	}
 	if err := d.setupCgroups(container, c); err != nil {
 		return nil, err
@@ -81,6 +84,11 @@ func (d *driver) setPrivileged(container *libcontainer.Container) error {
 		c.Enabled = true
 	}
 	container.Cgroups.DeviceAccess = true
+
+	// add sysfs as a mount for privileged containers
+	container.Mounts = append(container.Mounts, libcontainer.Mount{Type: "sysfs"})
+	delete(container.Context, "restriction_path")
+
 	if apparmor.IsEnabled() {
 		container.Context["apparmor_profile"] = "unconfined"
 	}
@@ -99,7 +107,13 @@ func (d *driver) setupCgroups(container *libcontainer.Container, c *execdriver.C
 
 func (d *driver) setupMounts(container *libcontainer.Container, c *execdriver.Command) error {
 	for _, m := range c.Mounts {
-		container.Mounts = append(container.Mounts, libcontainer.Mount{m.Source, m.Destination, m.Writable, m.Private})
+		container.Mounts = append(container.Mounts, libcontainer.Mount{
+			Type:        "bind",
+			Source:      m.Source,
+			Destination: m.Destination,
+			Writable:    m.Writable,
+			Private:     m.Private,
+		})
 	}
 	return nil
 }

+ 8 - 1
daemon/execdriver/native/driver.go

@@ -23,7 +23,7 @@ import (
 
 const (
 	DriverName                = "native"
-	Version                   = "0.1"
+	Version                   = "0.2"
 	BackupApparmorProfilePath = "apparmor/docker.back" // relative to docker root
 )
 
@@ -62,6 +62,7 @@ type driver struct {
 	root             string
 	initPath         string
 	activeContainers map[string]*exec.Cmd
+	restrictionPath  string
 }
 
 func NewDriver(root, initPath string) (*driver, error) {
@@ -72,8 +73,14 @@ func NewDriver(root, initPath string) (*driver, error) {
 	if err := apparmor.InstallDefaultProfile(filepath.Join(root, "../..", BackupApparmorProfilePath)); err != nil {
 		return nil, err
 	}
+	restrictionPath := filepath.Join(root, "empty")
+	if err := os.MkdirAll(restrictionPath, 0700); err != nil {
+		return nil, err
+	}
+
 	return &driver{
 		root:             root,
+		restrictionPath:  restrictionPath,
 		initPath:         initPath,
 		activeContainers: make(map[string]*exec.Cmd),
 	}, nil

+ 22 - 0
integration-cli/docker_cli_run_test.go

@@ -665,3 +665,25 @@ func TestUnPrivilegedCannotMount(t *testing.T) {
 
 	logDone("run - test un-privileged cannot mount")
 }
+
+func TestSysNotAvaliableInNonPrivilegedContainers(t *testing.T) {
+	cmd := exec.Command(dockerBinary, "run", "busybox", "ls", "/sys/kernel")
+	if code, err := runCommand(cmd); err == nil || code == 0 {
+		t.Fatal("sys should not be available in a non privileged container")
+	}
+
+	deleteAllContainers()
+
+	logDone("run - sys not avaliable in non privileged container")
+}
+
+func TestSysAvaliableInPrivilegedContainers(t *testing.T) {
+	cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "ls", "/sys/kernel")
+	if code, err := runCommand(cmd); err != nil || code != 0 {
+		t.Fatalf("sys should be available in privileged container")
+	}
+
+	deleteAllContainers()
+
+	logDone("run - sys avaliable in privileged container")
+}

+ 127 - 54
pkg/libcontainer/README.md

@@ -16,76 +16,149 @@ process are specified in this file.  The configuration is used for each process
 Sample `container.json` file:
 ```json
 {
-   "hostname" : "koye",
-   "networks" : [
+   "mounts" : [
       {
-         "gateway" : "172.17.42.1",
-         "context" : {
-            "bridge" : "docker0",
-            "prefix" : "veth"
-         },
-         "address" : "172.17.0.2/16",
-         "type" : "veth",
-         "mtu" : 1500
+         "type" : "devtmpfs"
       }
    ],
-   "cgroups" : {
-      "parent" : "docker",
-      "name" : "11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620"
-   },
    "tty" : true,
    "environment" : [
       "HOME=/",
-      "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
-      "HOSTNAME=11bb30683fb0",
-      "TERM=xterm"
-   ],
-   "capabilities_mask" : [
-      "SETPCAP",
-      "SYS_MODULE",
-      "SYS_RAWIO",
-      "SYS_PACCT",
-      "SYS_ADMIN",
-      "SYS_NICE",
-      "SYS_RESOURCE",
-      "SYS_TIME",
-      "SYS_TTY_CONFIG",
-      "MKNOD",
-      "AUDIT_WRITE",
-      "AUDIT_CONTROL",
-      "MAC_OVERRIDE",
-      "MAC_ADMIN",
-      "NET_ADMIN"
+      "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
+      "container=docker",
+      "TERM=xterm-256color"
    ],
-   "context" : {
-      "apparmor_profile" : "docker-default"
+   "hostname" : "koye",
+   "cgroups" : {
+      "parent" : "docker",
+      "name" : "docker-koye"
    },
-   "mounts" : [
+   "capabilities_mask" : [
+      {
+         "value" : 8,
+         "key" : "SETPCAP",
+         "enabled" : false
+      },
+      {
+         "enabled" : false,
+         "value" : 16,
+         "key" : "SYS_MODULE"
+      },
+      {
+         "value" : 17,
+         "key" : "SYS_RAWIO",
+         "enabled" : false
+      },
+      {
+         "key" : "SYS_PACCT",
+         "value" : 20,
+         "enabled" : false
+      },
+      {
+         "value" : 21,
+         "key" : "SYS_ADMIN",
+         "enabled" : false
+      },
+      {
+         "value" : 23,
+         "key" : "SYS_NICE",
+         "enabled" : false
+      },
+      {
+         "value" : 24,
+         "key" : "SYS_RESOURCE",
+         "enabled" : false
+      },
       {
-         "source" : "/var/lib/docker/containers/11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620/resolv.conf",
-         "writable" : false,
-         "destination" : "/etc/resolv.conf",
-         "private" : true
+         "key" : "SYS_TIME",
+         "value" : 25,
+         "enabled" : false
       },
       {
-         "source" : "/var/lib/docker/containers/11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620/hostname",
-         "writable" : false,
-         "destination" : "/etc/hostname",
-         "private" : true
+         "enabled" : false,
+         "value" : 26,
+         "key" : "SYS_TTY_CONFIG"
       },
       {
-         "source" : "/var/lib/docker/containers/11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620/hosts",
-         "writable" : false,
-         "destination" : "/etc/hosts",
-         "private" : true
+         "key" : "AUDIT_WRITE",
+         "value" : 29,
+         "enabled" : false
+      },
+      {
+         "value" : 30,
+         "key" : "AUDIT_CONTROL",
+         "enabled" : false
+      },
+      {
+         "enabled" : false,
+         "key" : "MAC_OVERRIDE",
+         "value" : 32
+      },
+      {
+         "enabled" : false,
+         "key" : "MAC_ADMIN",
+         "value" : 33
+      },
+      {
+         "key" : "NET_ADMIN",
+         "value" : 12,
+         "enabled" : false
+      },
+      {
+         "value" : 27,
+         "key" : "MKNOD",
+         "enabled" : true
+      }
+   ],
+   "networks" : [
+      {
+         "mtu" : 1500,
+         "address" : "127.0.0.1/0",
+         "type" : "loopback",
+         "gateway" : "localhost"
+      },
+      {
+         "mtu" : 1500,
+         "address" : "172.17.42.2/16",
+         "type" : "veth",
+         "context" : {
+            "bridge" : "docker0",
+            "prefix" : "veth"
+         },
+         "gateway" : "172.17.42.1"
       }
    ],
    "namespaces" : [
-      "NEWNS",
-      "NEWUTS",
-      "NEWIPC",
-      "NEWPID",
-      "NEWNET"
+      {
+         "key" : "NEWNS",
+         "value" : 131072,
+         "enabled" : true,
+         "file" : "mnt"
+      },
+      {
+         "key" : "NEWUTS",
+         "value" : 67108864,
+         "enabled" : true,
+         "file" : "uts"
+      },
+      {
+         "enabled" : true,
+         "file" : "ipc",
+         "key" : "NEWIPC",
+         "value" : 134217728
+      },
+      {
+         "file" : "pid",
+         "enabled" : true,
+         "value" : 536870912,
+         "key" : "NEWPID"
+      },
+      {
+         "enabled" : true,
+         "file" : "net",
+         "key" : "NEWNET",
+         "value" : 1073741824
+      }
    ]
 }
 ```

+ 60 - 0
pkg/libcontainer/console/console.go

@@ -0,0 +1,60 @@
+// +build linux
+
+package console
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/pkg/label"
+	"github.com/dotcloud/docker/pkg/system"
+	"os"
+	"path/filepath"
+	"syscall"
+)
+
+// Setup initializes the proper /dev/console inside the rootfs path.
+//
+// consolePath is the host-side console device; it is restricted to mode 0600,
+// chowned to 0:0, labeled with mountLabel, and finally bind-mounted over
+// <rootfs>/dev/console. Any pre-existing <rootfs>/dev/console is removed first
+// and a device node with the same type bits and device number as consolePath
+// is created in its place to serve as the bind-mount target.
+//
+// The process umask is cleared for the duration of the call and restored on
+// return so that the requested node mode is applied unmodified.
+func Setup(rootfs, consolePath, mountLabel string) error {
+	oldMask := system.Umask(0000)
+	defer system.Umask(oldMask)
+
+	stat, err := os.Stat(consolePath)
+	if err != nil {
+		return fmt.Errorf("stat console %s %s", consolePath, err)
+	}
+	var (
+		st   = stat.Sys().(*syscall.Stat_t)
+		dest = filepath.Join(rootfs, "dev/console")
+	)
+	if err := os.Remove(dest); err != nil && !os.IsNotExist(err) {
+		return fmt.Errorf("remove %s %s", dest, err)
+	}
+	if err := os.Chmod(consolePath, 0600); err != nil {
+		return err
+	}
+	if err := os.Chown(consolePath, 0, 0); err != nil {
+		return err
+	}
+	// Keep every mode bit above the low 12 permission bits and force 0600.
+	if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil {
+		return fmt.Errorf("mknod %s %s", dest, err)
+	}
+	// The label is applied to consolePath, so that is the path to report on failure.
+	if err := label.SetFileLabel(consolePath, mountLabel); err != nil {
+		return fmt.Errorf("set file label %s %s", consolePath, err)
+	}
+	if err := system.Mount(consolePath, dest, "bind", syscall.MS_BIND, ""); err != nil {
+		return fmt.Errorf("bind %s to %s %s", consolePath, dest, err)
+	}
+	return nil
+}
+
+func OpenAndDup(consolePath string) error {
+	slave, err := system.OpenTerminal(consolePath, syscall.O_RDWR)
+	if err != nil {
+		return fmt.Errorf("open terminal %s", err)
+	}
+	if err := system.Dup2(slave.Fd(), 0); err != nil {
+		return err
+	}
+	if err := system.Dup2(slave.Fd(), 1); err != nil {
+		return err
+	}
+	return system.Dup2(slave.Fd(), 2)
+}

+ 1 - 10
pkg/libcontainer/container.go

@@ -23,7 +23,7 @@ type Container struct {
 	Networks         []*Network      `json:"networks,omitempty"`          // nil for host's network stack
 	Cgroups          *cgroups.Cgroup `json:"cgroups,omitempty"`           // cgroups
 	Context          Context         `json:"context,omitempty"`           // generic context for specific options (apparmor, selinux)
-	Mounts           []Mount         `json:"mounts,omitempty"`
+	Mounts           Mounts          `json:"mounts,omitempty"`
 }
 
 // Network defines configuration for a container's networking stack
@@ -37,12 +37,3 @@ type Network struct {
 	Gateway string  `json:"gateway,omitempty"`
 	Mtu     int     `json:"mtu,omitempty"`
 }
-
-// Bind mounts from the host system to the container
-//
-type Mount struct {
-	Source      string `json:"source"`      // Source path, in the host namespace
-	Destination string `json:"destination"` // Destination path, in the container
-	Writable    bool   `json:"writable"`
-	Private     bool   `json:"private"`
-}

+ 144 - 48
pkg/libcontainer/container.json

@@ -1,50 +1,146 @@
 {
-    "hostname": "koye",
-    "tty": true,
-    "environment": [
-        "HOME=/",
-        "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
-        "container=docker",
-        "TERM=xterm-256color"
-    ],
-    "namespaces": [
-        "NEWIPC",
-        "NEWNS",
-        "NEWPID",
-        "NEWUTS",
-        "NEWNET"
-    ],
-    "capabilities_mask": [
-        "SETPCAP",
-        "SYS_MODULE",
-        "SYS_RAWIO",
-        "SYS_PACCT",
-        "SYS_ADMIN",
-        "SYS_NICE",
-        "SYS_RESOURCE",
-        "SYS_TIME",
-        "SYS_TTY_CONFIG",
-        "MKNOD",
-        "AUDIT_WRITE",
-        "AUDIT_CONTROL",
-        "MAC_OVERRIDE",
-        "MAC_ADMIN",
-        "NET_ADMIN"
-    ],
-    "networks": [{
-            "type": "veth",
-            "context": {
-                "bridge": "docker0",
-                "prefix": "dock"
-            },
-            "address": "172.17.0.100/16",
-            "gateway": "172.17.42.1",
-            "mtu": 1500
-        }
-    ],
-    "cgroups": {
-        "name": "docker-koye",
-        "parent": "docker",
-        "memory": 5248000
-    }
+   "mounts" : [
+      {
+         "type" : "devtmpfs"
+      }
+   ],
+   "tty" : true,
+   "environment" : [
+      "HOME=/",
+      "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
+      "container=docker",
+      "TERM=xterm-256color"
+   ],
+   "hostname" : "koye",
+   "cgroups" : {
+      "parent" : "docker",
+      "name" : "docker-koye"
+   },
+   "capabilities_mask" : [
+      {
+         "value" : 8,
+         "key" : "SETPCAP",
+         "enabled" : false
+      },
+      {
+         "enabled" : false,
+         "value" : 16,
+         "key" : "SYS_MODULE"
+      },
+      {
+         "value" : 17,
+         "key" : "SYS_RAWIO",
+         "enabled" : false
+      },
+      {
+         "key" : "SYS_PACCT",
+         "value" : 20,
+         "enabled" : false
+      },
+      {
+         "value" : 21,
+         "key" : "SYS_ADMIN",
+         "enabled" : false
+      },
+      {
+         "value" : 23,
+         "key" : "SYS_NICE",
+         "enabled" : false
+      },
+      {
+         "value" : 24,
+         "key" : "SYS_RESOURCE",
+         "enabled" : false
+      },
+      {
+         "key" : "SYS_TIME",
+         "value" : 25,
+         "enabled" : false
+      },
+      {
+         "enabled" : false,
+         "value" : 26,
+         "key" : "SYS_TTY_CONFIG"
+      },
+      {
+         "key" : "AUDIT_WRITE",
+         "value" : 29,
+         "enabled" : false
+      },
+      {
+         "value" : 30,
+         "key" : "AUDIT_CONTROL",
+         "enabled" : false
+      },
+      {
+         "enabled" : false,
+         "key" : "MAC_OVERRIDE",
+         "value" : 32
+      },
+      {
+         "enabled" : false,
+         "key" : "MAC_ADMIN",
+         "value" : 33
+      },
+      {
+         "key" : "NET_ADMIN",
+         "value" : 12,
+         "enabled" : false
+      },
+      {
+         "value" : 27,
+         "key" : "MKNOD",
+         "enabled" : true
+      }
+   ],
+   "networks" : [
+      {
+         "mtu" : 1500,
+         "address" : "127.0.0.1/0",
+         "type" : "loopback",
+         "gateway" : "localhost"
+      },
+      {
+         "mtu" : 1500,
+         "address" : "172.17.42.2/16",
+         "type" : "veth",
+         "context" : {
+            "bridge" : "docker0",
+            "prefix" : "veth"
+         },
+         "gateway" : "172.17.42.1"
+      }
+   ],
+   "namespaces" : [
+      {
+         "key" : "NEWNS",
+         "value" : 131072,
+         "enabled" : true,
+         "file" : "mnt"
+      },
+      {
+         "key" : "NEWUTS",
+         "value" : 67108864,
+         "enabled" : true,
+         "file" : "uts"
+      },
+      {
+         "enabled" : true,
+         "file" : "ipc",
+         "key" : "NEWIPC",
+         "value" : 134217728
+      },
+      {
+         "file" : "pid",
+         "enabled" : true,
+         "value" : 536870912,
+         "key" : "NEWPID"
+      },
+      {
+         "enabled" : true,
+         "file" : "net",
+         "key" : "NEWNET",
+         "value" : 1073741824
+      }
+   ]
 }

+ 143 - 0
pkg/libcontainer/mount/init.go

@@ -0,0 +1,143 @@
+// +build linux
+
+package mount
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/pkg/label"
+	"github.com/dotcloud/docker/pkg/libcontainer"
+	"github.com/dotcloud/docker/pkg/libcontainer/mount/nodes"
+	"github.com/dotcloud/docker/pkg/libcontainer/security/restrict"
+	"github.com/dotcloud/docker/pkg/system"
+	"os"
+	"path/filepath"
+	"syscall"
+)
+
+// default mount point flags
+const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
+
+type mount struct {
+	source string
+	path   string
+	device string
+	flags  int
+	data   string
+}
+
+// InitializeMountNamespace sets up the devices, mount points, and filesystems for use inside a
+// new mount namespace.
+//
+// The steps run in this order: "/" is remounted recursively as private (or as
+// slave when container.NoPivotRoot is set), rootfs is bind-mounted onto itself,
+// the system mounts and the container's bind mounts are created, the default
+// device nodes are copied, restrict.Restrict is applied when the container
+// context carries a non-empty "restriction_path", /dev/ptmx and the console are
+// set up, and the root is switched with MsMoveRoot or PivotRoot. When
+// container.ReadonlyFs is set the new root is remounted read-only. The umask is
+// set to 0022 before returning.
+func InitializeMountNamespace(rootfs, console string, container *libcontainer.Container) error {
+	var (
+		err         error
+		flag        = syscall.MS_PRIVATE
+		propagation = "private" // reported in the error message below
+	)
+	if container.NoPivotRoot {
+		flag = syscall.MS_SLAVE
+		propagation = "slave"
+	}
+	if err := system.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil {
+		return fmt.Errorf("mounting / as %s %s", propagation, err)
+	}
+	if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
+		return fmt.Errorf("mounting %s as bind %s", rootfs, err)
+	}
+	if err := mountSystem(rootfs, container); err != nil {
+		return fmt.Errorf("mount system %s", err)
+	}
+	if err := setupBindmounts(rootfs, container.Mounts); err != nil {
+		return fmt.Errorf("bind mounts %s", err)
+	}
+	if err := nodes.CopyN(rootfs, nodes.DefaultNodes); err != nil {
+		return fmt.Errorf("copy dev nodes %s", err)
+	}
+	// Restrictions are only applied when a restriction path is present in the context.
+	if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" {
+		if err := restrict.Restrict(rootfs, restrictionPath); err != nil {
+			return fmt.Errorf("restrict %s", err)
+		}
+	}
+	if err := SetupPtmx(rootfs, console, container.Context["mount_label"]); err != nil {
+		return err
+	}
+	if err := system.Chdir(rootfs); err != nil {
+		return fmt.Errorf("chdir into %s %s", rootfs, err)
+	}
+
+	if container.NoPivotRoot {
+		err = MsMoveRoot(rootfs)
+	} else {
+		err = PivotRoot(rootfs)
+	}
+	if err != nil {
+		return err
+	}
+
+	if container.ReadonlyFs {
+		if err := SetReadonly(); err != nil {
+			return fmt.Errorf("set readonly %s", err)
+		}
+	}
+
+	system.Umask(0022)
+
+	return nil
+}
+
+// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts
+// inside the mount namespace
+func mountSystem(rootfs string, container *libcontainer.Container) error {
+	for _, m := range newSystemMounts(rootfs, container.Context["mount_label"], container.Mounts) {
+		if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) {
+			return fmt.Errorf("mkdirall %s %s", m.path, err)
+		}
+		if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil {
+			return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err)
+		}
+	}
+	return nil
+}
+
+func setupBindmounts(rootfs string, bindMounts libcontainer.Mounts) error {
+	for _, m := range bindMounts.OfType("bind") {
+		var (
+			flags = syscall.MS_BIND | syscall.MS_REC
+			dest  = filepath.Join(rootfs, m.Destination)
+		)
+		if !m.Writable {
+			flags = flags | syscall.MS_RDONLY
+		}
+		if err := system.Mount(m.Source, dest, "bind", uintptr(flags), ""); err != nil {
+			return fmt.Errorf("mounting %s into %s %s", m.Source, dest, err)
+		}
+		if !m.Writable {
+			if err := system.Mount(m.Source, dest, "bind", uintptr(flags|syscall.MS_REMOUNT), ""); err != nil {
+				return fmt.Errorf("remounting %s into %s %s", m.Source, dest, err)
+			}
+		}
+		if m.Private {
+			if err := system.Mount("", dest, "none", uintptr(syscall.MS_PRIVATE), ""); err != nil {
+				return fmt.Errorf("mounting %s private %s", dest, err)
+			}
+		}
+	}
+	return nil
+}
+
+// TODO: this is crappy right now and should be cleaned up with a better way of handling system and
+// standard bind mounts allowing them to be more dynamic
+func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mount {
+	systemMounts := []mount{
+		{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
+	}
+
+	if len(mounts.OfType("devtmpfs")) == 1 {
+		systemMounts = append(systemMounts, mount{source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"})
+	}
+	systemMounts = append(systemMounts,
+		mount{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)},
+		mount{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)})
+
+	if len(mounts.OfType("sysfs")) == 1 {
+		systemMounts = append(systemMounts, mount{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags})
+	}
+	return systemMounts
+}

+ 19 - 0
pkg/libcontainer/mount/msmoveroot.go

@@ -0,0 +1,19 @@
+// +build linux
+
+package mount
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/pkg/system"
+	"syscall"
+)
+
+func MsMoveRoot(rootfs string) error {
+	if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil {
+		return fmt.Errorf("mount move %s into / %s", rootfs, err)
+	}
+	if err := system.Chroot("."); err != nil {
+		return fmt.Errorf("chroot . %s", err)
+	}
+	return system.Chdir("/")
+}

+ 49 - 0
pkg/libcontainer/mount/nodes/nodes.go

@@ -0,0 +1,49 @@
+// +build linux
+
+package nodes
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/pkg/system"
+	"os"
+	"path/filepath"
+	"syscall"
+)
+
+// Default list of device nodes to copy
+var DefaultNodes = []string{
+	"null",
+	"zero",
+	"full",
+	"random",
+	"urandom",
+	"tty",
+}
+
+// CopyN copies the device node from the host into the rootfs
+func CopyN(rootfs string, nodesToCopy []string) error {
+	oldMask := system.Umask(0000)
+	defer system.Umask(oldMask)
+
+	for _, node := range nodesToCopy {
+		if err := Copy(rootfs, node); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func Copy(rootfs, node string) error {
+	stat, err := os.Stat(filepath.Join("/dev", node))
+	if err != nil {
+		return err
+	}
+	var (
+		dest = filepath.Join(rootfs, "dev", node)
+		st   = stat.Sys().(*syscall.Stat_t)
+	)
+	if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) {
+		return fmt.Errorf("copy %s %s", node, err)
+	}
+	return nil
+}

+ 31 - 0
pkg/libcontainer/mount/pivotroot.go

@@ -0,0 +1,31 @@
+// +build linux
+
+package mount
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/pkg/system"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"syscall"
+)
+
+// PivotRoot switches the root filesystem to rootfs.
+//
+// A temporary ".pivot_root*" directory is created inside rootfs and passed to
+// system.Pivotroot together with rootfs. After changing directory to "/", the
+// temporary directory (now addressed relative to the new root) is unmounted
+// with MNT_DETACH and removed.
+func PivotRoot(rootfs string) error {
+	pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root")
+	if err != nil {
+		// pivotDir is empty when TempDir fails, so report the parent directory
+		// and the underlying error instead.
+		return fmt.Errorf("can't create pivot_root dir in %s %s", rootfs, err)
+	}
+	if err := system.Pivotroot(rootfs, pivotDir); err != nil {
+		return fmt.Errorf("pivot_root %s", err)
+	}
+	if err := system.Chdir("/"); err != nil {
+		return fmt.Errorf("chdir / %s", err)
+	}
+	// path to pivot dir now changed, update
+	pivotDir = filepath.Join("/", filepath.Base(pivotDir))
+	if err := system.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
+		return fmt.Errorf("unmount pivot_root dir %s", err)
+	}
+	return os.Remove(pivotDir)
+}

+ 26 - 0
pkg/libcontainer/mount/ptmx.go

@@ -0,0 +1,26 @@
+// +build linux
+
+package mount
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/pkg/libcontainer/console"
+	"os"
+	"path/filepath"
+)
+
+func SetupPtmx(rootfs, consolePath, mountLabel string) error {
+	ptmx := filepath.Join(rootfs, "dev/ptmx")
+	if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	if err := os.Symlink("pts/ptmx", ptmx); err != nil {
+		return fmt.Errorf("symlink dev ptmx %s", err)
+	}
+	if consolePath != "" {
+		if err := console.Setup(rootfs, consolePath, mountLabel); err != nil {
+			return err
+		}
+	}
+	return nil
+}

+ 12 - 0
pkg/libcontainer/mount/readonly.go

@@ -0,0 +1,12 @@
+// +build linux
+
+package mount
+
+import (
+	"github.com/dotcloud/docker/pkg/system"
+	"syscall"
+)
+
+func SetReadonly() error {
+	return system.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "")
+}

+ 31 - 0
pkg/libcontainer/mount/remount.go

@@ -0,0 +1,31 @@
+// +build linux
+
+package mount
+
+import (
+	"github.com/dotcloud/docker/pkg/system"
+	"syscall"
+)
+
+func RemountProc() error {
+	if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil {
+		return err
+	}
+	if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil {
+		return err
+	}
+	return nil
+}
+
+func RemountSys() error {
+	if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil {
+		if err != syscall.EINVAL {
+			return err
+		}
+	} else {
+		if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil {
+			return err
+		}
+	}
+	return nil
+}

+ 3 - 2
pkg/libcontainer/nsinit/execin.go

@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"github.com/dotcloud/docker/pkg/label"
 	"github.com/dotcloud/docker/pkg/libcontainer"
+	"github.com/dotcloud/docker/pkg/libcontainer/mount"
 	"github.com/dotcloud/docker/pkg/system"
 	"os"
 	"path/filepath"
@@ -63,10 +64,10 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s
 			if err := system.Unshare(syscall.CLONE_NEWNS); err != nil {
 				return -1, err
 			}
-			if err := remountProc(); err != nil {
+			if err := mount.RemountProc(); err != nil {
 				return -1, fmt.Errorf("remount proc %s", err)
 			}
-			if err := remountSys(); err != nil {
+			if err := mount.RemountSys(); err != nil {
 				return -1, fmt.Errorf("remount sys %s", err)
 			}
 			goto dropAndExec

+ 10 - 27
pkg/libcontainer/nsinit/init.go

@@ -11,8 +11,10 @@ import (
 	"github.com/dotcloud/docker/pkg/apparmor"
 	"github.com/dotcloud/docker/pkg/label"
 	"github.com/dotcloud/docker/pkg/libcontainer"
-	"github.com/dotcloud/docker/pkg/libcontainer/capabilities"
+	"github.com/dotcloud/docker/pkg/libcontainer/console"
+	"github.com/dotcloud/docker/pkg/libcontainer/mount"
 	"github.com/dotcloud/docker/pkg/libcontainer/network"
+	"github.com/dotcloud/docker/pkg/libcontainer/security/capabilities"
 	"github.com/dotcloud/docker/pkg/libcontainer/utils"
 	"github.com/dotcloud/docker/pkg/system"
 	"github.com/dotcloud/docker/pkg/user"
@@ -20,7 +22,7 @@ import (
 
 // Init is the init process that first runs inside a new namespace to setup mounts, users, networking,
 // and other options required for the new container.
-func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error {
+func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consolePath string, syncPipe *SyncPipe, args []string) error {
 	rootfs, err := utils.ResolveRootfs(uncleanRootfs)
 	if err != nil {
 		return err
@@ -36,20 +38,16 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol
 	ns.logger.Println("received context from parent")
 	syncPipe.Close()
 
-	if console != "" {
-		ns.logger.Printf("setting up %s as console\n", console)
-		slave, err := system.OpenTerminal(console, syscall.O_RDWR)
-		if err != nil {
-			return fmt.Errorf("open terminal %s", err)
-		}
-		if err := dupSlave(slave); err != nil {
-			return fmt.Errorf("dup2 slave %s", err)
+	if consolePath != "" {
+		ns.logger.Printf("setting up %s as console\n", consolePath)
+		if err := console.OpenAndDup(consolePath); err != nil {
+			return err
 		}
 	}
 	if _, err := system.Setsid(); err != nil {
 		return fmt.Errorf("setsid %s", err)
 	}
-	if console != "" {
+	if consolePath != "" {
 		if err := system.Setctty(); err != nil {
 			return fmt.Errorf("setctty %s", err)
 		}
@@ -60,7 +58,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol
 
 	label.Init()
 	ns.logger.Println("setup mount namespace")
-	if err := setupNewMountNamespace(rootfs, container.Mounts, console, container.ReadonlyFs, container.NoPivotRoot, container.Context["mount_label"]); err != nil {
+	if err := mount.InitializeMountNamespace(rootfs, consolePath, container); err != nil {
 		return fmt.Errorf("setup mount namespace %s", err)
 	}
 	if err := system.Sethostname(container.Hostname); err != nil {
@@ -114,21 +112,6 @@ func setupUser(container *libcontainer.Container) error {
 	return nil
 }
 
-// dupSlave dup2 the pty slave's fd into stdout and stdin and ensures that
-// the slave's fd is 0, or stdin
-func dupSlave(slave *os.File) error {
-	if err := system.Dup2(slave.Fd(), 0); err != nil {
-		return err
-	}
-	if err := system.Dup2(slave.Fd(), 1); err != nil {
-		return err
-	}
-	if err := system.Dup2(slave.Fd(), 2); err != nil {
-		return err
-	}
-	return nil
-}
-
 // setupVethNetwork uses the Network config if it is not nil to initialize
 // the new veth interface inside the container for use by changing the name to eth0
 // setting the MTU and IP address along with the default gateway

+ 0 - 265
pkg/libcontainer/nsinit/mount.go

@@ -1,265 +0,0 @@
-// +build linux
-
-package nsinit
-
-import (
-	"fmt"
-	"github.com/dotcloud/docker/pkg/label"
-	"github.com/dotcloud/docker/pkg/libcontainer"
-	"github.com/dotcloud/docker/pkg/system"
-	"io/ioutil"
-	"os"
-	"path/filepath"
-	"syscall"
-)
-
-// default mount point flags
-const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
-
-// setupNewMountNamespace is used to initialize a new mount namespace for an new
-// container in the rootfs that is specified.
-//
-// There is no need to unmount the new mounts because as soon as the mount namespace
-// is no longer in use, the mounts will be removed automatically
-func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, console string, readonly, noPivotRoot bool, mountLabel string) error {
-	flag := syscall.MS_PRIVATE
-	if noPivotRoot {
-		flag = syscall.MS_SLAVE
-	}
-	if err := system.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil {
-		return fmt.Errorf("mounting / as slave %s", err)
-	}
-	if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
-		return fmt.Errorf("mouting %s as bind %s", rootfs, err)
-	}
-	if err := mountSystem(rootfs, mountLabel); err != nil {
-		return fmt.Errorf("mount system %s", err)
-	}
-
-	for _, m := range bindMounts {
-		var (
-			flags = syscall.MS_BIND | syscall.MS_REC
-			dest  = filepath.Join(rootfs, m.Destination)
-		)
-		if !m.Writable {
-			flags = flags | syscall.MS_RDONLY
-		}
-		if err := system.Mount(m.Source, dest, "bind", uintptr(flags), ""); err != nil {
-			return fmt.Errorf("mounting %s into %s %s", m.Source, dest, err)
-		}
-		if !m.Writable {
-			if err := system.Mount(m.Source, dest, "bind", uintptr(flags|syscall.MS_REMOUNT), ""); err != nil {
-				return fmt.Errorf("remounting %s into %s %s", m.Source, dest, err)
-			}
-		}
-		if m.Private {
-			if err := system.Mount("", dest, "none", uintptr(syscall.MS_PRIVATE), ""); err != nil {
-				return fmt.Errorf("mounting %s private %s", dest, err)
-			}
-		}
-	}
-
-	if err := copyDevNodes(rootfs); err != nil {
-		return fmt.Errorf("copy dev nodes %s", err)
-	}
-	if err := setupPtmx(rootfs, console, mountLabel); err != nil {
-		return err
-	}
-	if err := system.Chdir(rootfs); err != nil {
-		return fmt.Errorf("chdir into %s %s", rootfs, err)
-	}
-
-	if noPivotRoot {
-		if err := rootMsMove(rootfs); err != nil {
-			return err
-		}
-	} else {
-		if err := rootPivot(rootfs); err != nil {
-			return err
-		}
-	}
-
-	if readonly {
-		if err := system.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil {
-			return fmt.Errorf("mounting %s as readonly %s", rootfs, err)
-		}
-	}
-
-	system.Umask(0022)
-
-	return nil
-}
-
-// use a pivot root to setup the rootfs
-func rootPivot(rootfs string) error {
-	pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root")
-	if err != nil {
-		return fmt.Errorf("can't create pivot_root dir %s", pivotDir, err)
-	}
-	if err := system.Pivotroot(rootfs, pivotDir); err != nil {
-		return fmt.Errorf("pivot_root %s", err)
-	}
-	if err := system.Chdir("/"); err != nil {
-		return fmt.Errorf("chdir / %s", err)
-	}
-	// path to pivot dir now changed, update
-	pivotDir = filepath.Join("/", filepath.Base(pivotDir))
-	if err := system.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
-		return fmt.Errorf("unmount pivot_root dir %s", err)
-	}
-	if err := os.Remove(pivotDir); err != nil {
-		return fmt.Errorf("remove pivot_root dir %s", err)
-	}
-	return nil
-}
-
-// use MS_MOVE and chroot to setup the rootfs
-func rootMsMove(rootfs string) error {
-	if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil {
-		return fmt.Errorf("mount move %s into / %s", rootfs, err)
-	}
-	if err := system.Chroot("."); err != nil {
-		return fmt.Errorf("chroot . %s", err)
-	}
-	if err := system.Chdir("/"); err != nil {
-		return fmt.Errorf("chdir / %s", err)
-	}
-	return nil
-}
-
-// copyDevNodes mknods the hosts devices so the new container has access to them
-func copyDevNodes(rootfs string) error {
-	oldMask := system.Umask(0000)
-	defer system.Umask(oldMask)
-
-	for _, node := range []string{
-		"null",
-		"zero",
-		"full",
-		"random",
-		"urandom",
-		"tty",
-	} {
-		if err := copyDevNode(rootfs, node); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-func copyDevNode(rootfs, node string) error {
-	stat, err := os.Stat(filepath.Join("/dev", node))
-	if err != nil {
-		return err
-	}
-	var (
-		dest = filepath.Join(rootfs, "dev", node)
-		st   = stat.Sys().(*syscall.Stat_t)
-	)
-	if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) {
-		return fmt.Errorf("copy %s %s", node, err)
-	}
-	return nil
-}
-
-// setupConsole ensures that the container has a proper /dev/console setup
-func setupConsole(rootfs, console string, mountLabel string) error {
-	oldMask := system.Umask(0000)
-	defer system.Umask(oldMask)
-
-	stat, err := os.Stat(console)
-	if err != nil {
-		return fmt.Errorf("stat console %s %s", console, err)
-	}
-	var (
-		st   = stat.Sys().(*syscall.Stat_t)
-		dest = filepath.Join(rootfs, "dev/console")
-	)
-	if err := os.Remove(dest); err != nil && !os.IsNotExist(err) {
-		return fmt.Errorf("remove %s %s", dest, err)
-	}
-	if err := os.Chmod(console, 0600); err != nil {
-		return err
-	}
-	if err := os.Chown(console, 0, 0); err != nil {
-		return err
-	}
-	if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil {
-		return fmt.Errorf("mknod %s %s", dest, err)
-	}
-	if err := label.SetFileLabel(console, mountLabel); err != nil {
-		return fmt.Errorf("SetFileLabel Failed %s %s", dest, err)
-	}
-	if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil {
-		return fmt.Errorf("bind %s to %s %s", console, dest, err)
-	}
-	return nil
-}
-
-// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts
-// inside the mount namespace
-func mountSystem(rootfs string, mountLabel string) error {
-	for _, m := range []struct {
-		source string
-		path   string
-		device string
-		flags  int
-		data   string
-	}{
-		{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
-		{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags},
-		{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)},
-		{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)},
-	} {
-		if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) {
-			return fmt.Errorf("mkdirall %s %s", m.path, err)
-		}
-		if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil {
-			return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err)
-		}
-	}
-	return nil
-}
-
-// setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and
-// finishes setting up /dev/console
-func setupPtmx(rootfs, console string, mountLabel string) error {
-	ptmx := filepath.Join(rootfs, "dev/ptmx")
-	if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
-		return err
-	}
-	if err := os.Symlink("pts/ptmx", ptmx); err != nil {
-		return fmt.Errorf("symlink dev ptmx %s", err)
-	}
-	if console != "" {
-		if err := setupConsole(rootfs, console, mountLabel); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-// remountProc is used to detach and remount the proc filesystem
-// commonly needed with running a new process inside an existing container
-func remountProc() error {
-	if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil {
-		return err
-	}
-	if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil {
-		return err
-	}
-	return nil
-}
-
-func remountSys() error {
-	if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil {
-		if err != syscall.EINVAL {
-			return err
-		}
-	} else {
-		if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil {
-			return err
-		}
-	}
-	return nil
-}

+ 0 - 0
pkg/libcontainer/capabilities/capabilities.go → pkg/libcontainer/security/capabilities/capabilities.go


+ 51 - 0
pkg/libcontainer/security/restrict/restrict.go

@@ -0,0 +1,51 @@
+package restrict
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"syscall"
+
+	"github.com/dotcloud/docker/pkg/system"
+)
+
+const flags = syscall.MS_BIND | syscall.MS_REC | syscall.MS_RDONLY // mount flags used for every restriction bind mount
+
+var restrictions = map[string]string{ // destination path (relative to the rootfs) -> source mounted over it
+	// dirs: an empty source means the caller-supplied empty dir is mounted over them
+	"/proc/sys":  "",
+	"/proc/irq":  "",
+	"/proc/acpi": "",
+
+	// files: covered with the /dev/null found inside the container rootfs
+	"/proc/sysrq-trigger": "/dev/null",
+	"/proc/kcore":         "/dev/null",
+}
+
+// Restrict locks down access to selected areas of proc by bind mounting
+// read-only sources over them, relying on the assumption that the user
+// does not have mount caps to revert the changes made here
+func Restrict(rootfs, empty string) error {
+	for dest, source := range restrictions {
+		dest = filepath.Join(rootfs, dest)
+
+		// we don't have a "/dev/null" for dirs so have the requester pass a dir
+		// for us to bind mount
+		switch source {
+		case "":
+			source = empty
+		default:
+			source = filepath.Join(rootfs, source)
+		}
+		if err := system.Mount(source, dest, "bind", flags, ""); err != nil {
+			if os.IsNotExist(err) { // a missing path is not fatal; move on to the next entry
+				continue
+			}
+			return fmt.Errorf("unable to mount %s over %s %s", source, dest, err)
+		}
+		if err := system.Mount("", dest, "bind", flags|syscall.MS_REMOUNT, ""); err != nil { // NOTE(review): presumably needed because read-only is not applied by the initial bind mount; confirm against mount(2)
+			return fmt.Errorf("unable to mount %s over %s %s", source, dest, err)
+		}
+	}
+	return nil
+}

+ 20 - 0
pkg/libcontainer/types.go

@@ -11,6 +11,26 @@ var (
 	ErrUnsupported      = errors.New("Unsupported method")
 )
 
+type Mounts []Mount // Mounts is a list of Mount entries that can be filtered by type
+
+func (s Mounts) OfType(t string) Mounts { // OfType returns the entries of s whose Type equals t, in their original order
+	out := Mounts{} // non-nil, so no match yields an empty list rather than nil
+	for _, m := range s {
+		if m.Type == t {
+			out = append(out, m)
+		}
+	}
+	return out
+}
+
+type Mount struct {
+	Type        string `json:"type,omitempty"`        // Kind of mount; the value Mounts.OfType matches on
+	Source      string `json:"source,omitempty"`      // Source path, in the host namespace
+	Destination string `json:"destination,omitempty"` // Destination path, in the container
+	Writable    bool   `json:"writable,omitempty"`    // presumably false means the mount is made read-only; confirm in the mount setup code
+	Private     bool   `json:"private,omitempty"`     // presumably requests private mount propagation; confirm in the mount setup code
+}
+
 // namespaceList is used to convert the libcontainer types
 // into the names of the files located in /proc/<pid>/ns/* for
 // each namespace