Ver código fonte

Merge pull request #5529 from crosbymichael/restrict-proc

Mount /proc and /sys read-only, except in privileged containers
Guillaume J. Charmes 11 anos atrás
pai
commit
1c5a3123cc

+ 25 - 32
daemon/execdriver/lxc/driver.go

@@ -2,11 +2,6 @@ package lxc
 
 import (
 	"fmt"
-	"github.com/dotcloud/docker/daemon/execdriver"
-	"github.com/dotcloud/docker/pkg/cgroups"
-	"github.com/dotcloud/docker/pkg/label"
-	"github.com/dotcloud/docker/pkg/system"
-	"github.com/dotcloud/docker/utils"
 	"io/ioutil"
 	"log"
 	"os"
@@ -17,6 +12,13 @@ import (
 	"strings"
 	"syscall"
 	"time"
+
+	"github.com/dotcloud/docker/daemon/execdriver"
+	"github.com/dotcloud/docker/pkg/cgroups"
+	"github.com/dotcloud/docker/pkg/label"
+	"github.com/dotcloud/docker/pkg/libcontainer/security/restrict"
+	"github.com/dotcloud/docker/pkg/system"
+	"github.com/dotcloud/docker/utils"
 )
 
 const DriverName = "lxc"
@@ -26,27 +28,26 @@ func init() {
 		if err := setupEnv(args); err != nil {
 			return err
 		}
-
 		if err := setupHostname(args); err != nil {
 			return err
 		}
-
 		if err := setupNetworking(args); err != nil {
 			return err
 		}
-
+		if !args.Privileged {
+			if err := restrict.Restrict(); err != nil {
+				return err
+			}
+		}
 		if err := setupCapabilities(args); err != nil {
 			return err
 		}
-
 		if err := setupWorkingDirectory(args); err != nil {
 			return err
 		}
-
 		if err := system.CloseFdsFrom(3); err != nil {
 			return err
 		}
-
 		if err := changeUser(args); err != nil {
 			return err
 		}
@@ -64,10 +65,9 @@ func init() {
 }
 
 type driver struct {
-	root            string // root path for the driver to use
-	apparmor        bool
-	sharedRoot      bool
-	restrictionPath string
+	root       string // root path for the driver to use
+	apparmor   bool
+	sharedRoot bool
 }
 
 func NewDriver(root string, apparmor bool) (*driver, error) {
@@ -75,15 +75,10 @@ func NewDriver(root string, apparmor bool) (*driver, error) {
 	if err := linkLxcStart(root); err != nil {
 		return nil, err
 	}
-	restrictionPath := filepath.Join(root, "empty")
-	if err := os.MkdirAll(restrictionPath, 0700); err != nil {
-		return nil, err
-	}
 	return &driver{
-		apparmor:        apparmor,
-		root:            root,
-		sharedRoot:      rootIsShared(),
-		restrictionPath: restrictionPath,
+		apparmor:   apparmor,
+		root:       root,
+		sharedRoot: rootIsShared(),
 	}, nil
 }
 
@@ -414,16 +409,14 @@ func (d *driver) generateLXCConfig(c *execdriver.Command) (string, error) {
 
 	if err := LxcTemplateCompiled.Execute(fo, struct {
 		*execdriver.Command
-		AppArmor          bool
-		ProcessLabel      string
-		MountLabel        string
-		RestrictionSource string
+		AppArmor     bool
+		ProcessLabel string
+		MountLabel   string
 	}{
-		Command:           c,
-		AppArmor:          d.apparmor,
-		ProcessLabel:      process,
-		MountLabel:        mount,
-		RestrictionSource: d.restrictionPath,
+		Command:      c,
+		AppArmor:     d.apparmor,
+		ProcessLabel: process,
+		MountLabel:   mount,
 	}); err != nil {
 		return "", err
 	}

+ 8 - 17
daemon/execdriver/lxc/lxc_template.go

@@ -1,10 +1,11 @@
 package lxc
 
 import (
-	"github.com/dotcloud/docker/daemon/execdriver"
-	"github.com/dotcloud/docker/pkg/label"
 	"strings"
 	"text/template"
+
+	"github.com/dotcloud/docker/daemon/execdriver"
+	"github.com/dotcloud/docker/pkg/label"
 )
 
 const LxcTemplate = `
@@ -82,15 +83,12 @@ lxc.pivotdir = lxc_putold
 
 # NOTICE: These mounts must be applied within the namespace
 
-#  WARNING: procfs is a known attack vector and should probably be disabled
-#           if your userspace allows it. eg. see http://blog.zx2c4.com/749
+# WARNING: mounting procfs and/or sysfs read-write is a known attack vector.
+# See e.g. http://blog.zx2c4.com/749 and http://bit.ly/T9CkqJ
+# We mount them read-write here, but later, dockerinit will call the Restrict() function to remount them read-only.
+# We cannot mount them directly read-only, because that would prevent loading AppArmor profiles.
 lxc.mount.entry = proc {{escapeFstabSpaces $ROOTFS}}/proc proc nosuid,nodev,noexec 0 0
-
-# WARNING: sysfs is a known attack vector and should probably be disabled
-# if your userspace allows it. eg. see http://bit.ly/T9CkqJ
-{{if .Privileged}}
 lxc.mount.entry = sysfs {{escapeFstabSpaces $ROOTFS}}/sys sysfs nosuid,nodev,noexec 0 0
-{{end}}
 
 {{if .Tty}}
 lxc.mount.entry = {{.Console}} {{escapeFstabSpaces $ROOTFS}}/dev/console none bind,rw 0 0
@@ -111,15 +109,8 @@ lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabS
 {{if .AppArmor}}
 lxc.aa_profile = unconfined
 {{else}}
-# not unconfined
+# Let AppArmor normal confinement take place (i.e., not unconfined)
 {{end}}
-{{else}}
-# restrict access to proc
-lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/sys none bind,ro 0 0
-lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/irq none bind,ro 0 0
-lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/acpi none bind,ro 0 0
-lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/sysrq-trigger none bind,ro 0 0
-lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/kcore none bind,ro 0 0
 {{end}}
 
 # limits

+ 2 - 4
daemon/execdriver/native/create.go

@@ -24,7 +24,7 @@ func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Container
 	container.Cgroups.Name = c.ID
 	// check to see if we are running in ramdisk to disable pivot root
 	container.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != ""
-	container.Context["restriction_path"] = d.restrictionPath
+	container.Context["restrictions"] = "true"
 
 	if err := d.createNetwork(container, c); err != nil {
 		return nil, err
@@ -84,9 +84,7 @@ func (d *driver) setPrivileged(container *libcontainer.Container) error {
 	}
 	container.Cgroups.DeviceAccess = true
 
-	// add sysfs as a mount for privileged containers
-	container.Mounts = append(container.Mounts, libcontainer.Mount{Type: "sysfs"})
-	delete(container.Context, "restriction_path")
+	delete(container.Context, "restrictions")
 
 	if apparmor.IsEnabled() {
 		container.Context["apparmor_profile"] = "unconfined"

+ 0 - 7
daemon/execdriver/native/driver.go

@@ -57,7 +57,6 @@ type driver struct {
 	root             string
 	initPath         string
 	activeContainers map[string]*exec.Cmd
-	restrictionPath  string
 }
 
 func NewDriver(root, initPath string) (*driver, error) {
@@ -68,14 +67,8 @@ func NewDriver(root, initPath string) (*driver, error) {
 	if err := apparmor.InstallDefaultProfile(filepath.Join(root, "../..", BackupApparmorProfilePath)); err != nil {
 		return nil, err
 	}
-	restrictionPath := filepath.Join(root, "empty")
-	if err := os.MkdirAll(restrictionPath, 0700); err != nil {
-		return nil, err
-	}
-
 	return &driver{
 		root:             root,
-		restrictionPath:  restrictionPath,
 		initPath:         initPath,
 		activeContainers: make(map[string]*exec.Cmd),
 	}, nil

+ 30 - 8
integration-cli/docker_cli_run_test.go

@@ -725,24 +725,46 @@ func TestUnPrivilegedCannotMount(t *testing.T) {
 	logDone("run - test un-privileged cannot mount")
 }
 
-func TestSysNotAvaliableInNonPrivilegedContainers(t *testing.T) {
-	cmd := exec.Command(dockerBinary, "run", "busybox", "ls", "/sys/kernel")
+func TestSysNotWritableInNonPrivilegedContainers(t *testing.T) {
+	cmd := exec.Command(dockerBinary, "run", "busybox", "touch", "/sys/kernel/profiling")
 	if code, err := runCommand(cmd); err == nil || code == 0 {
-		t.Fatal("sys should not be available in a non privileged container")
+		t.Fatal("sys should not be writable in a non privileged container")
 	}
 
 	deleteAllContainers()
 
-	logDone("run - sys not avaliable in non privileged container")
+	logDone("run - sys not writable in non privileged container")
 }
 
-func TestSysAvaliableInPrivilegedContainers(t *testing.T) {
-	cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "ls", "/sys/kernel")
+func TestSysWritableInPrivilegedContainers(t *testing.T) {
+	cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "touch", "/sys/kernel/profiling")
 	if code, err := runCommand(cmd); err != nil || code != 0 {
-		t.Fatalf("sys should be available in privileged container")
+		t.Fatalf("sys should be writable in privileged container")
 	}
 
 	deleteAllContainers()
 
-	logDone("run - sys avaliable in privileged container")
+	logDone("run - sys writable in privileged container")
+}
+
+func TestProcNotWritableInNonPrivilegedContainers(t *testing.T) {
+	cmd := exec.Command(dockerBinary, "run", "busybox", "touch", "/proc/sysrq-trigger")
+	if code, err := runCommand(cmd); err == nil || code == 0 {
+		t.Fatal("proc should not be writable in a non privileged container")
+	}
+
+	deleteAllContainers()
+
+	logDone("run - proc not writable in non privileged container")
+}
+
+func TestProcWritableInPrivilegedContainers(t *testing.T) {
+	cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "touch", "/proc/sysrq-trigger")
+	if code, err := runCommand(cmd); err != nil || code != 0 {
+		t.Fatalf("proc should be writable in privileged container")
+	}
+
+	deleteAllContainers()
+
+	logDone("run - proc writable in privileged container")
 }

+ 1 - 1
pkg/apparmor/apparmor.go

@@ -20,7 +20,7 @@ func IsEnabled() bool {
 	return false
 }
 
-func ApplyProfile(pid int, name string) error {
+func ApplyProfile(name string) error {
 	if name == "" {
 		return nil
 	}

+ 1 - 3
pkg/apparmor/apparmor_disabled.go

@@ -2,12 +2,10 @@
 
 package apparmor
 
-import ()
-
 func IsEnabled() bool {
 	return false
 }
 
-func ApplyProfile(pid int, name string) error {
+func ApplyProfile(name string) error {
 	return nil
 }

+ 3 - 2
pkg/libcontainer/console/console.go

@@ -4,11 +4,12 @@ package console
 
 import (
 	"fmt"
-	"github.com/dotcloud/docker/pkg/label"
-	"github.com/dotcloud/docker/pkg/system"
 	"os"
 	"path/filepath"
 	"syscall"
+
+	"github.com/dotcloud/docker/pkg/label"
+	"github.com/dotcloud/docker/pkg/system"
 )
 
 // Setup initializes the proper /dev/console inside the rootfs path

+ 4 - 15
pkg/libcontainer/mount/init.go

@@ -11,7 +11,6 @@ import (
 	"github.com/dotcloud/docker/pkg/label"
 	"github.com/dotcloud/docker/pkg/libcontainer"
 	"github.com/dotcloud/docker/pkg/libcontainer/mount/nodes"
-	"github.com/dotcloud/docker/pkg/libcontainer/security/restrict"
 	"github.com/dotcloud/docker/pkg/system"
 )
 
@@ -51,11 +50,6 @@ func InitializeMountNamespace(rootfs, console string, container *libcontainer.Co
 	if err := nodes.CopyN(rootfs, nodes.DefaultNodes); err != nil {
 		return fmt.Errorf("copy dev nodes %s", err)
 	}
-	if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" {
-		if err := restrict.Restrict(rootfs, restrictionPath); err != nil {
-			return fmt.Errorf("restrict %s", err)
-		}
-	}
 	if err := SetupPtmx(rootfs, console, container.Context["mount_label"]); err != nil {
 		return err
 	}
@@ -124,22 +118,17 @@ func setupBindmounts(rootfs string, bindMounts libcontainer.Mounts) error {
 }
 
 // TODO: this is crappy right now and should be cleaned up with a better way of handling system and
-// standard bind mounts allowing them to be more dymanic
+// standard bind mounts allowing them to be more dynamic
 func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mount {
 	systemMounts := []mount{
 		{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
+		{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags},
+		{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)},
+		{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)},
 	}
 
 	if len(mounts.OfType("devtmpfs")) == 1 {
 		systemMounts = append(systemMounts, mount{source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: label.FormatMountLabel("mode=755", mountLabel)})
 	}
-	systemMounts = append(systemMounts,
-		mount{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)},
-		mount{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)},
-	)
-
-	if len(mounts.OfType("sysfs")) == 1 {
-		systemMounts = append(systemMounts, mount{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags})
-	}
 	return systemMounts
 }

+ 11 - 5
pkg/libcontainer/nsinit/init.go

@@ -16,6 +16,7 @@ import (
 	"github.com/dotcloud/docker/pkg/libcontainer/mount"
 	"github.com/dotcloud/docker/pkg/libcontainer/network"
 	"github.com/dotcloud/docker/pkg/libcontainer/security/capabilities"
+	"github.com/dotcloud/docker/pkg/libcontainer/security/restrict"
 	"github.com/dotcloud/docker/pkg/libcontainer/utils"
 	"github.com/dotcloud/docker/pkg/system"
 	"github.com/dotcloud/docker/pkg/user"
@@ -68,18 +69,23 @@ func Init(container *libcontainer.Container, uncleanRootfs, consolePath string,
 	if err := system.Sethostname(container.Hostname); err != nil {
 		return fmt.Errorf("sethostname %s", err)
 	}
-	if err := FinalizeNamespace(container); err != nil {
-		return fmt.Errorf("finalize namespace %s", err)
-	}
 
 	runtime.LockOSThread()
 
-	if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil {
-		return err
+	if err := apparmor.ApplyProfile(container.Context["apparmor_profile"]); err != nil {
+		return fmt.Errorf("set apparmor profile %s: %s", container.Context["apparmor_profile"], err)
 	}
 	if err := label.SetProcessLabel(container.Context["process_label"]); err != nil {
 		return fmt.Errorf("set process label %s", err)
 	}
+	if container.Context["restrictions"] != "" {
+		if err := restrict.Restrict(); err != nil {
+			return err
+		}
+	}
+	if err := FinalizeNamespace(container); err != nil {
+		return fmt.Errorf("finalize namespace %s", err)
+	}
 	return system.Execv(args[0], args[0:], container.Env)
 }
 

+ 12 - 38
pkg/libcontainer/security/restrict/restrict.go

@@ -1,51 +1,25 @@
+// +build linux
+
 package restrict
 
 import (
 	"fmt"
-	"os"
-	"path/filepath"
 	"syscall"
 
 	"github.com/dotcloud/docker/pkg/system"
 )
 
-const flags = syscall.MS_BIND | syscall.MS_REC | syscall.MS_RDONLY
-
-var restrictions = map[string]string{
-	// dirs
-	"/proc/sys":  "",
-	"/proc/irq":  "",
-	"/proc/acpi": "",
-
-	// files
-	"/proc/sysrq-trigger": "/dev/null",
-	"/proc/kcore":         "/dev/null",
-}
-
-// Restrict locks down access to many areas of proc
-// by using the asumption that the user does not have mount caps to
-// revert the changes made here
-func Restrict(rootfs, empty string) error {
-	for dest, source := range restrictions {
-		dest = filepath.Join(rootfs, dest)
-
-		// we don't have a "/dev/null" for dirs so have the requester pass a dir
-		// for us to bind mount
-		switch source {
-		case "":
-			source = empty
-		default:
-			source = filepath.Join(rootfs, source)
-		}
-		if err := system.Mount(source, dest, "bind", flags, ""); err != nil {
-			if os.IsNotExist(err) {
-				continue
-			}
-			return fmt.Errorf("unable to mount %s over %s %s", source, dest, err)
-		}
-		if err := system.Mount("", dest, "bind", flags|syscall.MS_REMOUNT, ""); err != nil {
-			return fmt.Errorf("unable to mount %s over %s %s", source, dest, err)
+// This has to be called while the container still has CAP_SYS_ADMIN (to be able to perform mounts).
+// However, afterwards, CAP_SYS_ADMIN should be dropped (otherwise the user will be able to revert those changes).
+func Restrict() error {
+	// remount proc and sys as readonly
+	for _, dest := range []string{"proc", "sys"} {
+		if err := system.Mount("", dest, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil {
+			return fmt.Errorf("unable to remount %s readonly: %s", dest, err)
 		}
 	}
+	if err := system.Mount("/dev/null", "/proc/kcore", "", syscall.MS_BIND, ""); err != nil {
+		return fmt.Errorf("unable to bind-mount /dev/null over /proc/kcore")
+	}
 	return nil
 }

+ 9 - 0
pkg/libcontainer/security/restrict/unsupported.go

@@ -0,0 +1,9 @@
+// +build !linux
+
+package restrict
+
+import "fmt"
+
+func Restrict() error {
+	return fmt.Errorf("not supported")
+}