浏览代码

Start containers in their own cgroup namespaces

This is enabled for all containers that are not run with --privileged,
if the kernel supports it.

Fixes #38332

Signed-off-by: Rob Gulewich <rgulewich@netflix.com>
Rob Gulewich 6 年之前
父节点
当前提交
256eb04d69

+ 22 - 20
daemon/daemon.go

@@ -81,26 +81,27 @@ var (
 
 // Daemon holds information about the Docker daemon.
 type Daemon struct {
-	ID                string
-	repository        string
-	containers        container.Store
-	containersReplica container.ViewDB
-	execCommands      *exec.Store
-	imageService      *images.ImageService
-	idIndex           *truncindex.TruncIndex
-	configStore       *config.Config
-	statsCollector    *stats.Collector
-	defaultLogConfig  containertypes.LogConfig
-	RegistryService   registry.Service
-	EventsService     *events.Events
-	netController     libnetwork.NetworkController
-	volumes           *volumesservice.VolumesService
-	discoveryWatcher  discovery.Reloader
-	root              string
-	seccompEnabled    bool
-	apparmorEnabled   bool
-	shutdown          bool
-	idMapping         *idtools.IdentityMapping
+	ID                      string
+	repository              string
+	containers              container.Store
+	containersReplica       container.ViewDB
+	execCommands            *exec.Store
+	imageService            *images.ImageService
+	idIndex                 *truncindex.TruncIndex
+	configStore             *config.Config
+	statsCollector          *stats.Collector
+	defaultLogConfig        containertypes.LogConfig
+	RegistryService         registry.Service
+	EventsService           *events.Events
+	netController           libnetwork.NetworkController
+	volumes                 *volumesservice.VolumesService
+	discoveryWatcher        discovery.Reloader
+	root                    string
+	seccompEnabled          bool
+	apparmorEnabled         bool
+	cgroupNamespacesEnabled bool
+	shutdown                bool
+	idMapping               *idtools.IdentityMapping
 	// TODO: move graphDrivers field to an InfoService
 	graphDrivers map[string]string // By operating system
 
@@ -1020,6 +1021,7 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S
 	d.idMapping = idMapping
 	d.seccompEnabled = sysInfo.Seccomp
 	d.apparmorEnabled = sysInfo.AppArmor
+	d.cgroupNamespacesEnabled = sysInfo.CgroupNamespaces
 
 	d.linkIndex = newLinkIndex()
 

+ 6 - 1
daemon/oci_linux.go

@@ -307,8 +307,13 @@ func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
 			s.Hostname = ""
 		}
 
-		return nil
+	// cgroup
+	if daemon.cgroupNamespacesEnabled && !c.HostConfig.Privileged {
+		nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
+		setNamespace(s, nsCgroup)
 	}
+
+	return nil
 }
 
 func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {

+ 31 - 9
integration-cli/docker_cli_build_test.go

@@ -3984,18 +3984,40 @@ func (s *DockerSuite) TestBuildContainerWithCgroupParent(c *check.C) {
 	if !found {
 		c.Fatalf("unable to find self memory cgroup path. CgroupsPath: %v", selfCgroupPaths)
 	}
-	result := buildImage("buildcgroupparent",
-		cli.WithFlags("--cgroup-parent", cgroupParent),
-		build.WithDockerfile(`
+
+	doneCh := make(chan string)
+
+	// If cgroup namespaces are enabled, then processes running inside the container won't
+	// be able to see the parent cgroup. Check that they have the correct parents from
+	// the host, which has the non-namespaced view of the hierarchy.
+
+	go func() {
+		result := buildImage("buildcgroupparent",
+			cli.WithFlags("--cgroup-parent", cgroupParent),
+			build.WithDockerfile(`
 FROM busybox
-RUN cat /proc/self/cgroup
-`))
-	result.Assert(c, icmd.Success)
-	m, err := regexp.MatchString(fmt.Sprintf("memory:.*/%s/.*", cgroupParent), result.Combined())
-	assert.NilError(c, err)
+RUN sleep 10
+			`))
+		result.Assert(c, icmd.Success)
+		doneCh <- "done"
+	}()
+
+	// Wait until the build is well into the sleep
+	time.Sleep(3 * time.Second)
+	out, _, err := dockerCmdWithError("ps", "-q", "-l")
+	c.Assert(err, check.IsNil)
+	cID := strings.TrimSpace(out)
+
+	pid := inspectField(c, cID, "State.Pid")
+	paths := ReadCgroupPathsForPid(c, pid)
+	m, err := regexp.MatchString(fmt.Sprintf("memory:.*/%s/.*", cgroupParent), paths)
+	c.Assert(err, check.IsNil)
 	if !m {
-		c.Fatalf("There is no expected memory cgroup with parent /%s/: %s", cgroupParent, result.Combined())
+		c.Fatalf("There is no expected memory cgroup with parent /%s/: %s", cgroupParent, paths)
 	}
+
+	// Wait for the build to complete, otherwise it will exit with an error
+	<-doneCh
 }
 
 // FIXME(vdemeester) could be a unit test

+ 16 - 5
integration-cli/docker_cli_daemon_test.go

@@ -1787,7 +1787,8 @@ func (s *DockerDaemonSuite) TestDaemonRestartContainerLinksRestart(c *check.C) {
 }
 
 func (s *DockerDaemonSuite) TestDaemonCgroupParent(c *check.C) {
-	testRequires(c, DaemonIsLinux)
+	// Test requires local filesystem access on a Linux host
+	testRequires(c, DaemonIsLinux, testEnv.IsLocalDaemon)
 
 	cgroupParent := "test"
 	name := "cgroup-test"
@@ -1795,10 +1796,20 @@ func (s *DockerDaemonSuite) TestDaemonCgroupParent(c *check.C) {
 	s.d.StartWithBusybox(c, "--cgroup-parent", cgroupParent)
 	defer s.d.Restart(c)
 
-	out, err := s.d.Cmd("run", "--name", name, "busybox", "cat", "/proc/self/cgroup")
-	assert.NilError(c, err)
-	cgroupPaths := ParseCgroupPaths(string(out))
-	c.Assert(len(cgroupPaths), checker.Not(checker.Equals), 0, check.Commentf("unexpected output - %q", string(out)))
+	out, err := s.d.Cmd("run", "--name", name, "-d", "busybox", "top")
+	c.Assert(err, checker.IsNil)
+
+	// If cgroup namespaces are enabled, then processes running inside the container won't
+	// be able to see the parent cgroup. Check that they have the correct parents from
+	// the host, which has the non-namespaced view of the hierarchy.
+
+	pid, err := s.d.Cmd("inspect", "-f", "{{.State.Pid}}", name)
+	c.Assert(err, checker.IsNil)
+	pid = strings.TrimSpace(string(pid))
+	paths := ReadCgroupPathsForPid(c, pid)
+	cgroupPaths := ParseCgroupPaths(paths)
+	c.Assert(len(cgroupPaths), checker.Not(checker.Equals), 0, check.Commentf("unexpected output - %q", paths))
+
 	out, err = s.d.Cmd("inspect", "-f", "{{.Id}}", name)
 	assert.NilError(c, err)
 	id := strings.TrimSpace(string(out))

+ 25 - 8
integration-cli/docker_cli_run_test.go

@@ -3241,8 +3241,8 @@ func (s *DockerSuite) TestRunWithUlimits(c *check.C) {
 }
 
 func (s *DockerSuite) TestRunContainerWithCgroupParent(c *check.C) {
-	// Not applicable on Windows as uses Unix specific functionality
-	testRequires(c, DaemonIsLinux)
+	// Test requires local filesystem access on a Linux host
+	testRequires(c, DaemonIsLinux, testEnv.IsLocalDaemon)
 
 	// cgroup-parent relative path
 	testRunContainerWithCgroupParent(c, "test", "cgroup-test")
@@ -3252,14 +3252,23 @@ func (s *DockerSuite) TestRunContainerWithCgroupParent(c *check.C) {
 }
 
 func testRunContainerWithCgroupParent(c *check.C, cgroupParent, name string) {
-	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "busybox", "cat", "/proc/self/cgroup")
+	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "-d", "busybox", "top")
 	if err != nil {
 		c.Fatalf("unexpected failure when running container with --cgroup-parent option - %s\n%v", string(out), err)
 	}
-	cgroupPaths := ParseCgroupPaths(string(out))
+	cID := strings.TrimSpace(out)
+
+	// If cgroup namespaces are enabled, then processes running inside the container won't
+	// be able to see the parent cgroup. Check that they have the correct parents from
+	// the host, which has the non-namespaced view of the hierarchy.
+
+	pid := inspectField(c, cID, "State.Pid")
+	paths := ReadCgroupPathsForPid(c, pid)
+	cgroupPaths := ParseCgroupPaths(paths)
 	if len(cgroupPaths) == 0 {
-		c.Fatalf("unexpected output - %q", string(out))
+		c.Fatalf("unexpected output - %q", string(paths))
 	}
+
 	id := getIDByName(c, name)
 	expectedCgroup := path.Join(cgroupParent, id)
 	found := false
@@ -3285,21 +3294,29 @@ func (s *DockerSuite) TestRunInvalidCgroupParent(c *check.C) {
 }
 
 func testRunInvalidCgroupParent(c *check.C, cgroupParent, cleanCgroupParent, name string) {
-	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "busybox", "cat", "/proc/self/cgroup")
+	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "-d", "busybox", "top")
 	if err != nil {
 		// XXX: This may include a daemon crash.
 		c.Fatalf("unexpected failure when running container with --cgroup-parent option - %s\n%v", string(out), err)
 	}
+	cID := strings.TrimSpace(out)
 
 	// We expect "/SHOULD_NOT_EXIST" to not exist. If not, we have a security issue.
 	if _, err := os.Stat("/SHOULD_NOT_EXIST"); err == nil || !os.IsNotExist(err) {
 		c.Fatalf("SECURITY: --cgroup-parent with ../../ relative paths cause files to be created in the host (this is bad) !!")
 	}
 
-	cgroupPaths := ParseCgroupPaths(string(out))
+	// If cgroup namespaces are enabled, then processes running inside the container won't
+	// be able to see the parent cgroup. Check that they have the correct parents from
+	// the host, which has the non-namespaced view of the hierarchy.
+
+	pid := inspectField(c, cID, "State.Pid")
+	paths := ReadCgroupPathsForPid(c, pid)
+	cgroupPaths := ParseCgroupPaths(paths)
 	if len(cgroupPaths) == 0 {
-		c.Fatalf("unexpected output - %q", string(out))
+		c.Fatalf("unexpected output - %q", string(paths))
 	}
+
 	id := getIDByName(c, name)
 	expectedCgroup := path.Join(cleanCgroupParent, id)
 	found := false

+ 12 - 0
integration-cli/utils_test.go

@@ -2,6 +2,7 @@ package main
 
 import (
 	"fmt"
+	"io/ioutil"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -38,6 +39,17 @@ func transformCmd(execCmd *exec.Cmd) icmd.Cmd {
 	}
 }
 
+// ReadCgroupPathsForPid returns the contents of '/proc/<pid>/cgroup' for the given pid, failing the test if the file cannot be read.
+func ReadCgroupPathsForPid(c *check.C, pid string) string {
+	cgroupFile := fmt.Sprintf("/proc/%s/cgroup", pid)
+	out, err := ioutil.ReadFile(cgroupFile)
+	if err != nil {
+		c.Fatalf("unexpected failure when reading cgroup file %s\n%v", cgroupFile, err)
+	}
+
+	return string(out)
+}
+
 // ParseCgroupPaths parses 'procCgroupData', which is output of '/proc/<pid>/cgroup', and returns
 // a map which cgroup name as key and path as value.
 func ParseCgroupPaths(procCgroupData string) map[string]string {

+ 33 - 0
integration/container/run_linux_test.go

@@ -2,6 +2,10 @@ package container // import "github.com/docker/docker/integration/container"
 
 import (
 	"context"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
 	"strconv"
 	"strings"
 	"testing"
@@ -93,3 +97,32 @@ func TestNISDomainname(t *testing.T) {
 	assert.Equal(t, 0, res.ExitCode)
 	assert.Check(t, is.Equal(domainname, strings.TrimSpace(res.Stdout())))
 }
+
+func TestCgroupNamespaces(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
+	skip.If(t, testEnv.IsRemoteDaemon())
+
+	if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
+		t.Skip("cgroup namespaces are unsupported")
+	}
+
+	defer setupTest(t)()
+	client := testEnv.APIClient()
+	ctx := context.Background()
+
+	cID := container.Run(t, ctx, client)
+	poll.WaitOn(t, container.IsInState(ctx, client, cID, "running"), poll.WithDelay(100*time.Millisecond))
+
+	path := filepath.Join(os.Getenv("DEST"), "docker.pid")
+	b, err := ioutil.ReadFile(path)
+	assert.NilError(t, err)
+	link, err := os.Readlink(fmt.Sprintf("/proc/%s/ns/cgroup", string(b)))
+	assert.NilError(t, err)
+
+	// Check that the container's cgroup namespace doesn't match the docker daemon's
+	res, err := container.Exec(ctx, client, cID, []string{"readlink", "/proc/1/ns/cgroup"})
+	assert.NilError(t, err)
+	assert.Assert(t, is.Len(res.Stderr(), 0))
+	assert.Equal(t, 0, res.ExitCode)
+	assert.Assert(t, link != strings.TrimSpace(res.Stdout()))
+}

+ 3 - 0
pkg/sysinfo/sysinfo.go

@@ -16,6 +16,9 @@ type SysInfo struct {
 	cgroupCpusetInfo
 	cgroupPids
 
+	// Whether the kernel supports cgroup namespaces or not
+	CgroupNamespaces bool
+
 	// Whether IPv4 forwarding is supported or not, if this was disabled, networking will not work
 	IPv4ForwardingDisabled bool
 

+ 10 - 0
pkg/sysinfo/sysinfo_linux.go

@@ -53,6 +53,7 @@ func New(quiet bool) *SysInfo {
 		applyNetworkingInfo,
 		applyAppArmorInfo,
 		applySeccompInfo,
+		applyCgroupNsInfo,
 	}...)
 
 	for _, o := range ops {
@@ -250,6 +251,15 @@ func applyAppArmorInfo(info *SysInfo, _ map[string]string) []string {
 	return warnings
 }
 
+// applyCgroupNsInfo detects cgroup namespace support by checking for /proc/self/ns/cgroup and records the result in info.CgroupNamespaces.
+func applyCgroupNsInfo(info *SysInfo, _ map[string]string) []string {
+	var warnings []string
+	if _, err := os.Stat("/proc/self/ns/cgroup"); !os.IsNotExist(err) {
+		info.CgroupNamespaces = true
+	}
+	return warnings
+}
+
 // applySeccompInfo checks if Seccomp is supported, via CONFIG_SECCOMP.
 func applySeccompInfo(info *SysInfo, _ map[string]string) []string {
 	var warnings []string

+ 20 - 0
pkg/sysinfo/sysinfo_linux_test.go

@@ -96,6 +96,26 @@ func TestNewAppArmorDisabled(t *testing.T) {
 	assert.Assert(t, !sysInfo.AppArmor)
 }
 
+func TestNewCgroupNamespacesEnabled(t *testing.T) {
+	// If cgroup namespaces are supported in the kernel, then sysInfo.CgroupNamespaces should be TRUE
+	if _, err := os.Stat("/proc/self/ns/cgroup"); err != nil {
+		t.Skip("cgroup namespaces must be enabled")
+	}
+
+	sysInfo := New(true)
+	assert.Assert(t, sysInfo.CgroupNamespaces)
+}
+
+func TestNewCgroupNamespacesDisabled(t *testing.T) {
+	// If cgroup namespaces are *not* supported in the kernel, then sysInfo.CgroupNamespaces should be FALSE
+	if _, err := os.Stat("/proc/self/ns/cgroup"); !os.IsNotExist(err) {
+		t.Skip("cgroup namespaces must be disabled")
+	}
+
+	sysInfo := New(true)
+	assert.Assert(t, !sysInfo.CgroupNamespaces)
+}
+
 func TestNumCPU(t *testing.T) {
 	cpuNumbers := NumCPU()
 	if cpuNumbers <= 0 {