пре 11 година · 1c4202a614
--- a/daemon/execdriver/lxc/driver.go
+++ b/daemon/execdriver/lxc/driver.go
@@ -5,6 +5,7 @@ import (
 
				 	"github.com/dotcloud/docker/daemon/execdriver"
			
 
				 	"github.com/dotcloud/docker/pkg/cgroups"
			
 
				 	"github.com/dotcloud/docker/pkg/label"
			
 
				+	"github.com/dotcloud/docker/pkg/libcontainer/security/restrict"
			
 
				 	"github.com/dotcloud/docker/pkg/system"
			
 
				 	"github.com/dotcloud/docker/utils"
			
 
				 	"io/ioutil"
			
@@ -35,6 +36,10 @@ func init() {
 
				 			return err
			
 
				 		}
			
 
				 
			
 
				+		if err := restrict.Restrict("/", "/empty"); err != nil {
			
 
				+			return err
			
 
				+		}
			
 
				+
			
 
				 		if err := setupCapabilities(args); err != nil {
			
 
				 			return err
			
 
				 		}
			
--- a/daemon/execdriver/lxc/lxc_template.go
+++ b/daemon/execdriver/lxc/lxc_template.go
@@ -82,15 +82,12 @@ lxc.pivotdir = lxc_putold
 
				 
			
 
				 # NOTICE: These mounts must be applied within the namespace
			
 
				 
			
 
				-#  WARNING: procfs is a known attack vector and should probably be disabled
			
 
				-#           if your userspace allows it. eg. see http://blog.zx2c4.com/749
			
 
				+# WARNING: mounting procfs and/or sysfs read-write is a known attack vector.
			
 
				+# See e.g. http://blog.zx2c4.com/749 and http://bit.ly/T9CkqJ
			
 
				+# We mount them read-write here, but later, dockerinit will call the Restrict() function to remount them read-only.
			
 
				+# We cannot mount them directly read-only, because that would prevent loading AppArmor profiles.
			
 
				 lxc.mount.entry = proc {{escapeFstabSpaces $ROOTFS}}/proc proc nosuid,nodev,noexec 0 0
			
 
				-
			
 
				-# WARNING: sysfs is a known attack vector and should probably be disabled
			
 
				-# if your userspace allows it. eg. see http://bit.ly/T9CkqJ
			
 
				-{{if .Privileged}}
			
 
				 lxc.mount.entry = sysfs {{escapeFstabSpaces $ROOTFS}}/sys sysfs nosuid,nodev,noexec 0 0
			
 
				-{{end}}
			
 
				 
			
 
				 {{if .Tty}}
			
 
				 lxc.mount.entry = {{.Console}} {{escapeFstabSpaces $ROOTFS}}/dev/console none bind,rw 0 0
			
@@ -111,14 +108,14 @@ lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabS
 
				 {{if .AppArmor}}
			
 
				 lxc.aa_profile = unconfined
			
 
				 {{else}}
			
 
				-# not unconfined
			
 
				+# Let AppArmor normal confinement take place (i.e., not unconfined)
			
 
				 {{end}}
			
 
				 {{else}}
			
 
				-# restrict access to proc
			
 
				-lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/sys none bind,ro 0 0
			
 
				-lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/irq none bind,ro 0 0
			
 
				-lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/acpi none bind,ro 0 0
			
 
				-lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/sysrq-trigger none bind,ro 0 0
			
 
				+# Restrict access to some stuff in /proc. Note that dockerinit later remounts
			
 
				+# /proc read-only, so we don't need to bother about things that are just dangerous
			
 
				+# to write to (like sysrq-trigger). Also, recent kernels won't let a container
			
 
				+# peek into /proc/kcore, but let's cater for people who might run Docker on
			
 
				+# older kernels. Just in case.
			
 
				 lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/kcore none bind,ro 0 0
			
 
				 {{end}}
			
 
				 
			
--- a/daemon/execdriver/native/create.go
+++ b/daemon/execdriver/native/create.go
@@ -84,8 +84,6 @@ func (d *driver) setPrivileged(container *libcontainer.Container) error {
 
				 	}
			
 
				 	container.Cgroups.DeviceAccess = true
			
 
				 
			
 
				-	// add sysfs as a mount for privileged containers
			
 
				-	container.Mounts = append(container.Mounts, libcontainer.Mount{Type: "sysfs"})
			
 
				 	delete(container.Context, "restriction_path")
			
 
				 
			
 
				 	if apparmor.IsEnabled() {
			
--- a/integration-cli/docker_cli_run_test.go
+++ b/integration-cli/docker_cli_run_test.go
@@ -725,24 +725,46 @@ func TestUnPrivilegedCannotMount(t *testing.T) {
 
				 	logDone("run - test un-privileged cannot mount")
			
 
				 }
			
 
				 
			
 
				-func TestSysNotAvaliableInNonPrivilegedContainers(t *testing.T) {
			
 
				-	cmd := exec.Command(dockerBinary, "run", "busybox", "ls", "/sys/kernel")
			
 
				+func TestSysNotWritableInNonPrivilegedContainers(t *testing.T) {
			
 
				+	cmd := exec.Command(dockerBinary, "run", "busybox", "touch", "/sys/kernel/profiling")
			
 
				 	if code, err := runCommand(cmd); err == nil || code == 0 {
			
 
				-		t.Fatal("sys should not be available in a non privileged container")
			
 
				+		t.Fatal("sys should not be writable in a non privileged container")
			
 
				 	}
			
 
				 
			
 
				 	deleteAllContainers()
			
 
				 
			
 
				-	logDone("run - sys not avaliable in non privileged container")
			
 
				+	logDone("run - sys not writable in non privileged container")
			
 
				 }
			
 
				 
			
 
				-func TestSysAvaliableInPrivilegedContainers(t *testing.T) {
			
 
				-	cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "ls", "/sys/kernel")
			
 
				+func TestSysWritableInPrivilegedContainers(t *testing.T) {
			
 
				+	cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "touch", "/sys/kernel/profiling")
			
 
				 	if code, err := runCommand(cmd); err != nil || code != 0 {
			
 
				-		t.Fatalf("sys should be available in privileged container")
			
 
				+		t.Fatalf("sys should be writable in privileged container")
			
 
				 	}
			
 
				 
			
 
				 	deleteAllContainers()
			
 
				 
			
 
				-	logDone("run - sys avaliable in privileged container")
			
 
				+	logDone("run - sys writable in privileged container")
			
 
				+}
			
 
				+
			
 
				+func TestProcNotWritableInNonPrivilegedContainers(t *testing.T) {
			
 
				+	cmd := exec.Command(dockerBinary, "run", "busybox", "touch", "/proc/sysrq-trigger")
			
 
				+	if code, err := runCommand(cmd); err == nil || code == 0 {
			
 
				+		t.Fatal("proc should not be writable in a non privileged container")
			
 
				+	}
			
 
				+
			
 
				+	deleteAllContainers()
			
 
				+
			
 
				+	logDone("run - proc not writable in non privileged container")
			
 
				+}
			
 
				+
			
 
				+func TestProcWritableInPrivilegedContainers(t *testing.T) {
			
 
				+	cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "touch", "/proc/sysrq-trigger")
			
 
				+	if code, err := runCommand(cmd); err != nil || code != 0 {
			
 
				+		t.Fatalf("proc should be writable in privileged container")
			
 
				+	}
			
 
				+
			
 
				+	deleteAllContainers()
			
 
				+
			
 
				+	logDone("run - proc writable in privileged container")
			
 
				 }
			
--- a/pkg/libcontainer/mount/init.go
+++ b/pkg/libcontainer/mount/init.go
@@ -11,7 +11,6 @@ import (
 
				 	"github.com/dotcloud/docker/pkg/label"
			
 
				 	"github.com/dotcloud/docker/pkg/libcontainer"
			
 
				 	"github.com/dotcloud/docker/pkg/libcontainer/mount/nodes"
			
 
				-	"github.com/dotcloud/docker/pkg/libcontainer/security/restrict"
			
 
				 	"github.com/dotcloud/docker/pkg/system"
			
 
				 )
			
 
				 
			
@@ -51,11 +50,6 @@ func InitializeMountNamespace(rootfs, console string, container *libcontainer.Co
 
				 	if err := nodes.CopyN(rootfs, nodes.DefaultNodes); err != nil {
			
 
				 		return fmt.Errorf("copy dev nodes %s", err)
			
 
				 	}
			
 
				-	if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" {
			
 
				-		if err := restrict.Restrict(rootfs, restrictionPath); err != nil {
			
 
				-			return fmt.Errorf("restrict %s", err)
			
 
				-		}
			
 
				-	}
			
 
				 	if err := SetupPtmx(rootfs, console, container.Context["mount_label"]); err != nil {
			
 
				 		return err
			
 
				 	}
			
@@ -124,10 +118,11 @@ func setupBindmounts(rootfs string, bindMounts libcontainer.Mounts) error {
 
				 }
			
 
				 
			
 
				 // TODO: this is crappy right now and should be cleaned up with a better way of handling system and
			
 
				-// standard bind mounts allowing them to be more dymanic
			
 
				+// standard bind mounts allowing them to be more dynamic
			
 
				 func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mount {
			
 
				 	systemMounts := []mount{
			
 
				 		{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
			
 
				+		{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags},
			
 
				 	}
			
 
				 
			
 
				 	if len(mounts.OfType("devtmpfs")) == 1 {
			
@@ -138,8 +133,5 @@ func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mo
 
				 		mount{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)},
			
 
				 	)
			
 
				 
			
 
				-	if len(mounts.OfType("sysfs")) == 1 {
			
 
				-		systemMounts = append(systemMounts, mount{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags})
			
 
				-	}
			
 
				 	return systemMounts
			
 
				 }
			
--- a/pkg/libcontainer/nsinit/init.go
+++ b/pkg/libcontainer/nsinit/init.go
@@ -16,6 +16,7 @@ import (
 
				 	"github.com/dotcloud/docker/pkg/libcontainer/mount"
			
 
				 	"github.com/dotcloud/docker/pkg/libcontainer/network"
			
 
				 	"github.com/dotcloud/docker/pkg/libcontainer/security/capabilities"
			
 
				+	"github.com/dotcloud/docker/pkg/libcontainer/security/restrict"
			
 
				 	"github.com/dotcloud/docker/pkg/libcontainer/utils"
			
 
				 	"github.com/dotcloud/docker/pkg/system"
			
 
				 	"github.com/dotcloud/docker/pkg/user"
			
@@ -68,18 +69,25 @@ func Init(container *libcontainer.Container, uncleanRootfs, consolePath string,
 
				 	if err := system.Sethostname(container.Hostname); err != nil {
			
 
				 		return fmt.Errorf("sethostname %s", err)
			
 
				 	}
			
 
				-	if err := FinalizeNamespace(container); err != nil {
			
 
				-		return fmt.Errorf("finalize namespace %s", err)
			
 
				-	}
			
 
				 
			
 
				 	runtime.LockOSThread()
			
 
				 
			
 
				+	if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" {
			
 
				+		if err := restrict.Restrict("/", restrictionPath); err != nil {
			
 
				+			return err
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil {
			
 
				 		return err
			
 
				 	}
			
 
				 	if err := label.SetProcessLabel(container.Context["process_label"]); err != nil {
			
 
				 		return fmt.Errorf("set process label %s", err)
			
 
				 	}
			
 
				+
			
 
				+	if err := FinalizeNamespace(container); err != nil {
			
 
				+		return fmt.Errorf("finalize namespace %s", err)
			
 
				+	}
			
 
				 	return system.Execv(args[0], args[0:], container.Env)
			
 
				 }
			
 
				 
			
--- a/pkg/libcontainer/security/restrict/restrict.go
+++ b/pkg/libcontainer/security/restrict/restrict.go
@@ -9,43 +9,67 @@ import (
 
				 	"github.com/dotcloud/docker/pkg/system"
			
 
				 )
			
 
				 
			
 
				-const flags = syscall.MS_BIND | syscall.MS_REC | syscall.MS_RDONLY
			
 
				-
			
 
				-var restrictions = map[string]string{
			
 
				-	// dirs
			
 
				-	"/proc/sys":  "",
			
 
				-	"/proc/irq":  "",
			
 
				-	"/proc/acpi": "",
			
 
				+// A restriction is a container path (file, directory, whatever) that has to be masked and/or remounted read-only.
			
 
				+// maskPath is a "safe" path to be mounted over maskedPath. It can take two special values:
			
 
				+// - if it is "", then nothing is mounted;
			
 
				+// - if it is "EMPTY", then an empty directory is mounted instead.
			
 
				+// If remountRO is true then the maskedPath is remounted read-only (regardless of whether a maskPath was used).
			
 
				+type restriction struct {
			
 
				+	maskedPath string
			
 
				+	maskPath   string
			
 
				+	remountRO  bool
			
 
				+}
			
 
				 
			
 
				-	// files
			
 
				-	"/proc/sysrq-trigger": "/dev/null",
			
 
				-	"/proc/kcore":         "/dev/null",
			
 
				+var restrictions = []restriction{
			
 
				+	{"/proc", "", true},
			
 
				+	{"/sys", "", true},
			
 
				+	{"/proc/kcore", "/dev/null", false},
			
 
				 }
			
 
				 
			
 
				-// Restrict locks down access to many areas of proc
			
 
				-// by using the asumption that the user does not have mount caps to
			
 
				-// revert the changes made here
			
 
				+// Restrict has to be called while the container still has CAP_SYS_ADMIN (to be able to perform mounts).
			
 
				+// However, afterwards, CAP_SYS_ADMIN should be dropped (otherwise the user will be able to revert those changes).
			
 
				+// "empty" should be the path to an empty directory.
			
 
				 func Restrict(rootfs, empty string) error {
			
 
				-	for dest, source := range restrictions {
			
 
				-		dest = filepath.Join(rootfs, dest)
			
 
				-
			
 
				-		// we don't have a "/dev/null" for dirs so have the requester pass a dir
			
 
				-		// for us to bind mount
			
 
				-		switch source {
			
 
				-		case "":
			
 
				-			source = empty
			
 
				-		default:
			
 
				-			source = filepath.Join(rootfs, source)
			
 
				-		}
			
 
				-		if err := system.Mount(source, dest, "bind", flags, ""); err != nil {
			
 
				-			if os.IsNotExist(err) {
			
 
				-				continue
			
 
				+	for _, restriction := range restrictions {
			
 
				+		dest := filepath.Join(rootfs, restriction.maskedPath)
			
 
				+		if restriction.maskPath != "" {
			
 
				+			var source string
			
 
				+			if restriction.maskPath == "EMPTY" {
			
 
				+				source = empty
			
 
				+			} else {
			
 
				+				source = filepath.Join(rootfs, restriction.maskPath)
			
 
				+			}
			
 
				+			if err := system.Mount(source, dest, "", syscall.MS_BIND, ""); err != nil {
			
 
				+				return fmt.Errorf("unable to bind-mount %s over %s: %s", source, dest, err)
			
 
				 			}
			
 
				-			return fmt.Errorf("unable to mount %s over %s %s", source, dest, err)
			
 
				 		}
			
 
				-		if err := system.Mount("", dest, "bind", flags|syscall.MS_REMOUNT, ""); err != nil {
			
 
				-			return fmt.Errorf("unable to mount %s over %s %s", source, dest, err)
			
 
				+		if restriction.remountRO {
			
 
				+			if err := system.Mount("", dest, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil {
			
 
				+				return fmt.Errorf("unable to remount %s readonly: %s", dest, err)
			
 
				+			}
			
 
				 		}
			
 
				 	}
			
 
				+
			
 
				+	// This weird trick will allow us to mount /proc read-only, while being able to use AppArmor.
			
 
				+	// This is because apparently, loading an AppArmor profile requires write access to /proc/1/attr.
			
 
				+	// So we do another mount of procfs, ensure it's write-able, and bind-mount a subset of it.
			
 
				+	tmpProcPath := filepath.Join(rootfs, ".proc")
			
 
				+	if err := os.Mkdir(tmpProcPath, 0700); err != nil {
			
 
				+		return fmt.Errorf("unable to create temporary proc mountpoint %s: %s", tmpProcPath, err)
			
 
				+	}
			
 
				+	if err := system.Mount("proc", tmpProcPath, "proc", 0, ""); err != nil {
			
 
				+		return fmt.Errorf("unable to mount proc on temporary proc mountpoint: %s", err)
			
 
				+	}
			
 
				+	if err := system.Mount("proc", tmpProcPath, "", syscall.MS_REMOUNT, ""); err != nil {
			
 
				+		return fmt.Errorf("unable to remount proc read-write: %s", err)
			
 
				+	}
			
 
				+	rwAttrPath := filepath.Join(rootfs, ".proc", "1", "attr")
			
 
				+	roAttrPath := filepath.Join(rootfs, "proc", "1", "attr")
			
 
				+	if err := system.Mount(rwAttrPath, roAttrPath, "", syscall.MS_BIND, ""); err != nil {
			
 
				+		return fmt.Errorf("unable to bind-mount %s on %s: %s", rwAttrPath, roAttrPath, err)
			
 
				+	}
			
 
				+	if err := system.Unmount(tmpProcPath, 0); err != nil {
			
 
				+		return fmt.Errorf("unable to unmount temporary proc filesystem: %s", err)
			
 
				+	}
			
 
				 	return nil
			
 
				 }