Browse Source

Make cgroup namespaces configurable

This adds both a daemon-wide flag and a container creation property:
- Set the `CgroupnsMode: "host|private"` HostConfig property at
  container creation time to control what cgroup namespace the container
  is created in
- Set the `--default-cgroupns-mode=host|private` daemon flag to control
  what cgroup namespace containers are created in by default
- Set the default if the daemon flag is unset to "host", for backward
  compatibility
- Default to CgroupnsMode: "host" for client versions < 1.40

Signed-off-by: Rob Gulewich <rgulewich@netflix.com>
Rob Gulewich 6 years ago
parent
commit
072400fc4b

+ 5 - 0
api/server/router/container/container_routes.go

@@ -489,6 +489,11 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo
 		if hostConfig.IpcMode.IsEmpty() {
 			hostConfig.IpcMode = container.IpcMode("shareable")
 		}
+
+		// Older clients expect the default to be "host"
+		if hostConfig.CgroupnsMode.IsEmpty() {
+			hostConfig.CgroupnsMode = container.CgroupnsMode("host")
+		}
 	}
 
 	if hostConfig != nil && hostConfig.PidsLimit != nil && *hostConfig.PidsLimit <= 0 {

+ 13 - 0
api/swagger.yaml

@@ -707,6 +707,19 @@ definitions:
             description: "A list of kernel capabilities to drop from the container. Conflicts with option 'Capabilities'"
             items:
               type: "string"
+          CgroupnsMode:
+            type: "string"
+            enum:
+              - "private"
+              - "host"
+            description: |
+                    cgroup namespace mode for the container. Possible values are:
+
+                    - `"private"`: the container runs in its own private cgroup namespace
+                    - `"host"`: use the host system's cgroup namespace
+
+                    If not specified, the daemon default is used, which can either be `"private"`
+                    or `"host"`, depending on daemon version, kernel support and configuration.
           Dns:
             type: "array"
             description: "A list of DNS servers for the container to use."

+ 27 - 3
api/types/container/host_config.go

@@ -10,6 +10,29 @@ import (
 	"github.com/docker/go-units"
 )
 
+// CgroupnsMode names the cgroup namespace mode of a container: "private", "host" or unset.
+type CgroupnsMode string
+
+// IsPrivate reports whether the mode is "private", i.e. the container gets its own cgroup namespace.
+func (c CgroupnsMode) IsPrivate() bool {
+	return string(c) == "private"
+}
+
+// IsHost reports whether the mode is "host", i.e. the container shares the host's cgroup namespace.
+func (c CgroupnsMode) IsHost() bool {
+	return string(c) == "host"
+}
+
+// IsEmpty reports whether no cgroup namespace mode has been set.
+func (c CgroupnsMode) IsEmpty() bool {
+	return len(c) == 0
+}
+
+// Valid reports whether the mode is unset or one of the two recognised values.
+func (c CgroupnsMode) Valid() bool {
+	return c == "" || c == "private" || c == "host"
+}
+
 // Isolation represents the isolation technology of a container. The supported
 // values are platform specific
 type Isolation string
@@ -382,9 +405,10 @@ type HostConfig struct {
 	CapAdd          strslice.StrSlice // List of kernel capabilities to add to the container
 	CapDrop         strslice.StrSlice // List of kernel capabilities to remove from the container
 	Capabilities    []string          `json:"Capabilities"` // List of kernel capabilities to be available for container (this overrides the default set)
-	DNS             []string          `json:"Dns"`          // List of DNS server to lookup
-	DNSOptions      []string          `json:"DnsOptions"`   // List of DNSOption to look for
-	DNSSearch       []string          `json:"DnsSearch"`    // List of DNSSearch to look for
+	CgroupnsMode    CgroupnsMode      // Cgroup namespace mode to use for the container
+	DNS             []string          `json:"Dns"`        // List of DNS server to lookup
+	DNSOptions      []string          `json:"DnsOptions"` // List of DNSOption to look for
+	DNSSearch       []string          `json:"DnsSearch"`  // List of DNSSearch to look for
 	ExtraHosts      []string          // List of extra hosts
 	GroupAdd        []string          // List of additional groups that the container process will run as
 	IpcMode         IpcMode           // IPC namespace to use for the container

+ 1 - 0
cmd/dockerd/config_unix.go

@@ -64,5 +64,6 @@ func installConfigFlags(conf *config.Config, flags *pflag.FlagSet) error {
 	// rootless needs to be explicitly specified for running "rootful" dockerd in rootless dockerd (#38702)
 	// Note that defaultUserlandProxyPath and honorXDG are configured according to the value of rootless.RunningWithRootlessKit, not the value of --rootless.
 	flags.BoolVar(&conf.Rootless, "rootless", rootless.RunningWithRootlessKit(), "Enable rootless mode; typically used with RootlessKit (experimental)")
+	flags.StringVar(&conf.CgroupNamespaceMode, "default-cgroupns-mode", config.DefaultCgroupNamespaceMode, `Default mode for containers cgroup namespace ("host" | "private")`)
 	return nil
 }

+ 17 - 1
daemon/config/config_unix.go

@@ -11,6 +11,8 @@ import (
 )
 
 const (
+	// DefaultCgroupNamespaceMode is the default for a container's CgroupnsMode, if not set otherwise
+	DefaultCgroupNamespaceMode = "host" // TODO: change to private
 	// DefaultIpcMode is default for container's IpcMode, if not set otherwise
 	DefaultIpcMode = "private"
 )
@@ -37,6 +39,7 @@ type Config struct {
 	ShmSize              opts.MemBytes            `json:"default-shm-size,omitempty"`
 	NoNewPrivileges      bool                     `json:"no-new-privileges,omitempty"`
 	IpcMode              string                   `json:"default-ipc-mode,omitempty"`
+	CgroupNamespaceMode  string                   `json:"default-cgroupns-mode,omitempty"`
 	// ResolvConf is the path to the configuration of the host resolver
 	ResolvConf string `json:"resolv-conf,omitempty"`
 	Rootless   bool   `json:"rootless,omitempty"`
@@ -84,9 +87,22 @@ func verifyDefaultIpcMode(mode string) error {
 	return nil
 }
 
+// verifyDefaultCgroupNsMode returns an error when mode is not a valid CgroupnsMode value.
+func verifyDefaultCgroupNsMode(mode string) error {
+	if cm := containertypes.CgroupnsMode(mode); !cm.Valid() {
+		return fmt.Errorf("Default cgroup namespace mode (%v) is invalid. Use \"host\" or \"private\".", cm) // nolint: golint
+	}
+
+	return nil
+}
+
 // ValidatePlatformConfig checks if any platform-specific configuration settings are invalid.
 func (conf *Config) ValidatePlatformConfig() error {
-	return verifyDefaultIpcMode(conf.IpcMode)
+	if err := verifyDefaultIpcMode(conf.IpcMode); err != nil {
+		return err
+	}
+
+	return verifyDefaultCgroupNsMode(conf.CgroupNamespaceMode)
 }
 
 // IsRootless returns conf.Rootless

+ 20 - 22
daemon/daemon.go

@@ -81,27 +81,26 @@ var (
 
 // Daemon holds information about the Docker daemon.
 type Daemon struct {
-	ID                      string
-	repository              string
-	containers              container.Store
-	containersReplica       container.ViewDB
-	execCommands            *exec.Store
-	imageService            *images.ImageService
-	idIndex                 *truncindex.TruncIndex
-	configStore             *config.Config
-	statsCollector          *stats.Collector
-	defaultLogConfig        containertypes.LogConfig
-	RegistryService         registry.Service
-	EventsService           *events.Events
-	netController           libnetwork.NetworkController
-	volumes                 *volumesservice.VolumesService
-	discoveryWatcher        discovery.Reloader
-	root                    string
-	seccompEnabled          bool
-	apparmorEnabled         bool
-	cgroupNamespacesEnabled bool
-	shutdown                bool
-	idMapping               *idtools.IdentityMapping
+	ID                string
+	repository        string
+	containers        container.Store
+	containersReplica container.ViewDB
+	execCommands      *exec.Store
+	imageService      *images.ImageService
+	idIndex           *truncindex.TruncIndex
+	configStore       *config.Config
+	statsCollector    *stats.Collector
+	defaultLogConfig  containertypes.LogConfig
+	RegistryService   registry.Service
+	EventsService     *events.Events
+	netController     libnetwork.NetworkController
+	volumes           *volumesservice.VolumesService
+	discoveryWatcher  discovery.Reloader
+	root              string
+	seccompEnabled    bool
+	apparmorEnabled   bool
+	shutdown          bool
+	idMapping         *idtools.IdentityMapping
 	// TODO: move graphDrivers field to an InfoService
 	graphDrivers map[string]string // By operating system
 
@@ -1021,7 +1020,6 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S
 	d.idMapping = idMapping
 	d.seccompEnabled = sysInfo.Seccomp
 	d.apparmorEnabled = sysInfo.AppArmor
-	d.cgroupNamespacesEnabled = sysInfo.CgroupNamespaces
 
 	d.linkIndex = newLinkIndex()
 

+ 22 - 0
daemon/daemon_unix.go

@@ -356,6 +356,15 @@ func (daemon *Daemon) adaptContainerSettings(hostConfig *containertypes.HostConf
 		hostConfig.IpcMode = containertypes.IpcMode(m)
 	}
 
+	// Set default cgroup namespace mode, if unset for container
+	if hostConfig.CgroupnsMode.IsEmpty() {
+		m := config.DefaultCgroupNamespaceMode
+		if daemon.configStore != nil {
+			m = daemon.configStore.CgroupNamespaceMode
+		}
+		hostConfig.CgroupnsMode = containertypes.CgroupnsMode(m)
+	}
+
 	adaptSharedNamespaceContainer(daemon, hostConfig)
 
 	var err error
@@ -675,6 +684,19 @@ func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes.
 		}
 	}
 
+	if !hostConfig.CgroupnsMode.Valid() {
+		return warnings, fmt.Errorf("invalid cgroup namespace mode: %v", hostConfig.CgroupnsMode)
+	}
+	if hostConfig.CgroupnsMode.IsPrivate() {
+		if !sysInfo.CgroupNamespaces {
+			warnings = append(warnings, "Your kernel does not support cgroup namespaces.  Cgroup namespace setting discarded.")
+		}
+
+		if hostConfig.Privileged {
+			return warnings, fmt.Errorf("privileged mode is incompatible with private cgroup namespaces.  You must run the container in the host cgroup namespace when running privileged mode")
+		}
+	}
+
 	return warnings, nil
 }
 

+ 4 - 0
daemon/info.go

@@ -178,6 +178,10 @@ func (daemon *Daemon) fillSecurityOptions(v *types.Info, sysInfo *sysinfo.SysInf
 	if daemon.Rootless() {
 		securityOptions = append(securityOptions, "name=rootless")
 	}
+	if daemon.cgroupNamespacesEnabled(sysInfo) {
+		securityOptions = append(securityOptions, "name=cgroupns")
+	}
+
 	v.SecurityOptions = securityOptions
 }
 

+ 5 - 0
daemon/info_unix.go

@@ -10,6 +10,7 @@ import (
 	"strings"
 
 	"github.com/docker/docker/api/types"
+	containertypes "github.com/docker/docker/api/types/container"
 	"github.com/docker/docker/dockerversion"
 	"github.com/docker/docker/pkg/sysinfo"
 	"github.com/pkg/errors"
@@ -247,6 +248,10 @@ func parseRuncVersion(v string) (version string, commit string, err error) {
 	return version, commit, err
 }
 
+func (daemon *Daemon) cgroupNamespacesEnabled(sysInfo *sysinfo.SysInfo) bool {
+	return sysInfo.CgroupNamespaces && containertypes.CgroupnsMode(daemon.configStore.CgroupNamespaceMode).IsPrivate()
+}
+
 // Rootless returns true if daemon is running in rootless mode
 func (daemon *Daemon) Rootless() bool {
 	return daemon.configStore.Rootless

+ 4 - 0
daemon/info_windows.go

@@ -14,6 +14,10 @@ func (daemon *Daemon) fillPlatformVersion(v *types.Version) {}
 func fillDriverWarnings(v *types.Info) {
 }
 
+func (daemon *Daemon) cgroupNamespacesEnabled(sysInfo *sysinfo.SysInfo) bool {
+	return false
+}
+
 // Rootless returns true if daemon is running in rootless mode
 func (daemon *Daemon) Rootless() bool {
 	return false

+ 14 - 6
daemon/oci_linux.go

@@ -307,13 +307,21 @@ func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
 			s.Hostname = ""
 		}
 
-	// cgroup
-	if daemon.cgroupNamespacesEnabled && !c.HostConfig.Privileged {
-		nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
-		setNamespace(s, nsCgroup)
-	}
+		// cgroup
+		if !c.HostConfig.CgroupnsMode.IsEmpty() {
+			cgroupNsMode := c.HostConfig.CgroupnsMode
+			if !cgroupNsMode.Valid() {
+				return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode)
+			}
 
-	return nil
+			if cgroupNsMode.IsPrivate() && !c.HostConfig.Privileged {
+				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
+				setNamespace(s, nsCgroup)
+			}
+		}
+
+		return nil
+	}
 }
 
 func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {

+ 5 - 0
daemon/reload_unix.go

@@ -34,6 +34,10 @@ func (daemon *Daemon) reloadPlatform(conf *config.Config, attributes map[string]
 		daemon.configStore.ShmSize = conf.ShmSize
 	}
 
+	if conf.CgroupNamespaceMode != "" {
+		daemon.configStore.CgroupNamespaceMode = conf.CgroupNamespaceMode
+	}
+
 	if conf.IpcMode != "" {
 		daemon.configStore.IpcMode = conf.IpcMode
 	}
@@ -51,6 +55,7 @@ func (daemon *Daemon) reloadPlatform(conf *config.Config, attributes map[string]
 	attributes["default-runtime"] = daemon.configStore.DefaultRuntime
 	attributes["default-shm-size"] = fmt.Sprintf("%d", daemon.configStore.ShmSize)
 	attributes["default-ipc-mode"] = daemon.configStore.IpcMode
+	attributes["default-cgroupns-mode"] = daemon.configStore.CgroupNamespaceMode
 
 	return nil
 }

+ 5 - 0
docs/api/version-history.md

@@ -68,6 +68,11 @@ keywords: "API, Docker, rcli, REST, documentation"
 * `POST /containers/{id}/update` now accepts a `PidsLimit` field to tune a container's
   PID limit. Set `0` or `-1` for unlimited. Leave `null` to not change the current value.
 * `POST /build` now accepts `outputs` key for configuring build outputs when using BuildKit mode.
+* `POST /containers/create` on Linux now accepts the `HostConfig.CgroupnsMode` property.
+  Set the property to `host` to create the container in the daemon's cgroup namespace, or
+  `private` to create the container in its own private cgroup namespace.  The per-daemon
+  default is `host`, and can be changed by using the `CgroupNamespaceMode` daemon configuration
+  parameter (the `--default-cgroupns-mode` daemon flag).
 
 ## V1.39 API changes
 

+ 9 - 31
integration-cli/docker_cli_build_test.go

@@ -3984,40 +3984,18 @@ func (s *DockerSuite) TestBuildContainerWithCgroupParent(c *check.C) {
 	if !found {
 		c.Fatalf("unable to find self memory cgroup path. CgroupsPath: %v", selfCgroupPaths)
 	}
-
-	doneCh := make(chan string)
-
-	// If cgroup namespaces are enabled, then processes running inside the container won't
-	// be able to see the parent namespace. Check that they have the correct parents from
-	// the host, which has the non-namespaced view of the hierarchy.
-
-	go func() {
-		result := buildImage("buildcgroupparent",
-			cli.WithFlags("--cgroup-parent", cgroupParent),
-			build.WithDockerfile(`
+	result := buildImage("buildcgroupparent",
+		cli.WithFlags("--cgroup-parent", cgroupParent),
+		build.WithDockerfile(`
 FROM busybox
-RUN sleep 10
-			`))
-		result.Assert(c, icmd.Success)
-		doneCh <- "done"
-	}()
-
-	// Wait until the build is well into the sleep
-	time.Sleep(3 * time.Second)
-	out, _, err := dockerCmdWithError("ps", "-q", "-l")
-	c.Assert(err, check.IsNil)
-	cID := strings.TrimSpace(out)
-
-	pid := inspectField(c, cID, "State.Pid")
-	paths := ReadCgroupPathsForPid(c, pid)
-	m, err := regexp.MatchString(fmt.Sprintf("memory:.*/%s/.*", cgroupParent), paths)
-	c.Assert(err, check.IsNil)
+RUN cat /proc/self/cgroup
+`))
+	result.Assert(c, icmd.Success)
+	m, err := regexp.MatchString(fmt.Sprintf("memory:.*/%s/.*", cgroupParent), result.Combined())
+	assert.NilError(c, err)
 	if !m {
-		c.Fatalf("There is no expected memory cgroup with parent /%s/: %s", cgroupParent, paths)
+		c.Fatalf("There is no expected memory cgroup with parent /%s/: %s", cgroupParent, result.Combined())
 	}
-
-	// Wait for the build to complete, otherwise it will exit with an error
-	<-doneCh
 }
 
 // FIXME(vdemeester) could be a unit test

+ 5 - 16
integration-cli/docker_cli_daemon_test.go

@@ -1787,8 +1787,7 @@ func (s *DockerDaemonSuite) TestDaemonRestartContainerLinksRestart(c *check.C) {
 }
 
 func (s *DockerDaemonSuite) TestDaemonCgroupParent(c *check.C) {
-	// Test requires local filesystem access on a Linux host
-	testRequires(c, DaemonIsLinux, testEnv.IsLocalDaemon)
+	testRequires(c, DaemonIsLinux)
 
 	cgroupParent := "test"
 	name := "cgroup-test"
@@ -1796,20 +1795,10 @@ func (s *DockerDaemonSuite) TestDaemonCgroupParent(c *check.C) {
 	s.d.StartWithBusybox(c, "--cgroup-parent", cgroupParent)
 	defer s.d.Restart(c)
 
-	out, err := s.d.Cmd("run", "--name", name, "-d", "busybox", "top")
-	c.Assert(err, checker.IsNil)
-
-	// If cgroup namespaces are enabled, then processes running inside the container won't
-	// be able to see the parent namespace. Check that they have the correct parents from
-	// the host, which has the non-namespaced view of the hierarchy.
-
-	pid, err := s.d.Cmd("inspect", "-f", "{{.State.Pid}}", name)
-	c.Assert(err, checker.IsNil)
-	pid = strings.TrimSpace(string(pid))
-	paths := ReadCgroupPathsForPid(c, pid)
-	cgroupPaths := ParseCgroupPaths(paths)
-	c.Assert(len(cgroupPaths), checker.Not(checker.Equals), 0, check.Commentf("unexpected output - %q", paths))
-
+	out, err := s.d.Cmd("run", "--name", name, "busybox", "cat", "/proc/self/cgroup")
+	assert.NilError(c, err)
+	cgroupPaths := ParseCgroupPaths(string(out))
+	c.Assert(len(cgroupPaths), checker.Not(checker.Equals), 0, check.Commentf("unexpected output - %q", string(out)))
 	out, err = s.d.Cmd("inspect", "-f", "{{.Id}}", name)
 	assert.NilError(c, err)
 	id := strings.TrimSpace(string(out))

+ 8 - 25
integration-cli/docker_cli_run_test.go

@@ -3241,8 +3241,8 @@ func (s *DockerSuite) TestRunWithUlimits(c *check.C) {
 }
 
 func (s *DockerSuite) TestRunContainerWithCgroupParent(c *check.C) {
-	// Test requires local filesystem access on a Linux host
-	testRequires(c, DaemonIsLinux, testEnv.IsLocalDaemon)
+	// Not applicable on Windows as uses Unix specific functionality
+	testRequires(c, DaemonIsLinux)
 
 	// cgroup-parent relative path
 	testRunContainerWithCgroupParent(c, "test", "cgroup-test")
@@ -3252,23 +3252,14 @@ func (s *DockerSuite) TestRunContainerWithCgroupParent(c *check.C) {
 }
 
 func testRunContainerWithCgroupParent(c *check.C, cgroupParent, name string) {
-	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "-d", "busybox", "top")
+	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "busybox", "cat", "/proc/self/cgroup")
 	if err != nil {
 		c.Fatalf("unexpected failure when running container with --cgroup-parent option - %s\n%v", string(out), err)
 	}
-	cID := strings.TrimSpace(out)
-
-	// If cgroup namespaces are enabled, then processes running inside the container won't
-	// be able to see the parent namespace. Check that they have the correct parents from
-	// the host, which has the non-namespaced view of the hierarchy.
-
-	pid := inspectField(c, cID, "State.Pid")
-	paths := ReadCgroupPathsForPid(c, pid)
-	cgroupPaths := ParseCgroupPaths(paths)
+	cgroupPaths := ParseCgroupPaths(string(out))
 	if len(cgroupPaths) == 0 {
-		c.Fatalf("unexpected output - %q", string(paths))
+		c.Fatalf("unexpected output - %q", string(out))
 	}
-
 	id := getIDByName(c, name)
 	expectedCgroup := path.Join(cgroupParent, id)
 	found := false
@@ -3294,29 +3285,21 @@ func (s *DockerSuite) TestRunInvalidCgroupParent(c *check.C) {
 }
 
 func testRunInvalidCgroupParent(c *check.C, cgroupParent, cleanCgroupParent, name string) {
-	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "-d", "busybox", "top")
+	out, _, err := dockerCmdWithError("run", "--cgroup-parent", cgroupParent, "--name", name, "busybox", "cat", "/proc/self/cgroup")
 	if err != nil {
 		// XXX: This may include a daemon crash.
 		c.Fatalf("unexpected failure when running container with --cgroup-parent option - %s\n%v", string(out), err)
 	}
-	cID := strings.TrimSpace(out)
 
 	// We expect "/SHOULD_NOT_EXIST" to not exist. If not, we have a security issue.
 	if _, err := os.Stat("/SHOULD_NOT_EXIST"); err == nil || !os.IsNotExist(err) {
 		c.Fatalf("SECURITY: --cgroup-parent with ../../ relative paths cause files to be created in the host (this is bad) !!")
 	}
 
-	// If cgroup namespaces are enabled, then processes running inside the container won't
-	// be able to see the parent namespace. Check that they have the correct parents from
-	// the host, which has the non-namespaced view of the hierarchy.
-
-	pid := inspectField(c, cID, "State.Pid")
-	paths := ReadCgroupPathsForPid(c, pid)
-	cgroupPaths := ParseCgroupPaths(paths)
+	cgroupPaths := ParseCgroupPaths(string(out))
 	if len(cgroupPaths) == 0 {
-		c.Fatalf("unexpected output - %q", string(paths))
+		c.Fatalf("unexpected output - %q", string(out))
 	}
-
 	id := getIDByName(c, name)
 	expectedCgroup := path.Join(cleanCgroupParent, id)
 	found := false

+ 0 - 12
integration-cli/utils_test.go

@@ -2,7 +2,6 @@ package main
 
 import (
 	"fmt"
-	"io/ioutil"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -39,17 +38,6 @@ func transformCmd(execCmd *exec.Cmd) icmd.Cmd {
 	}
 }
 
-// ReadCgroupPathsForPid reads the cgroup path file for a pid in '/proc/<pid>/cgroup'
-func ReadCgroupPathsForPid(c *check.C, pid string) string {
-	cgroupFile := fmt.Sprintf("/proc/%s/cgroup", pid)
-	out, err := ioutil.ReadFile(cgroupFile)
-	if err != nil {
-		c.Fatalf("unexpected failure when reading cgroup file %s\n%v", cgroupFile, err)
-	}
-
-	return string(out)
-}
-
 // ParseCgroupPaths parses 'procCgroupData', which is output of '/proc/<pid>/cgroup', and returns
 // a map which cgroup name as key and path as value.
 func ParseCgroupPaths(procCgroupData string) map[string]string {

+ 92 - 0
integration/build/build_cgroupns_linux_test.go

@@ -0,0 +1,92 @@
+package build // import "github.com/docker/docker/integration/build"
+
+import (
+	"context"
+	"encoding/json"
+	"io"
+	"strings"
+	"testing"
+
+	"github.com/docker/docker/api/types"
+	"github.com/docker/docker/integration/internal/requirement"
+	"github.com/docker/docker/internal/test/daemon"
+	"github.com/docker/docker/internal/test/fakecontext"
+	"github.com/docker/docker/pkg/jsonmessage"
+	"gotest.tools/assert"
+	"gotest.tools/skip"
+)
+
+// getCgroupFromBuildOutput scans the JSON build output for the first stream line starting with
+// "cgroup:" (the `readlink /proc/<pid>/ns/cgroup` result); io.ErrUnexpectedEOF means none was found.
+func getCgroupFromBuildOutput(buildOutput io.Reader) (string, error) {
+	const prefix = "cgroup:"
+	dec := json.NewDecoder(buildOutput)
+	for {
+		m := jsonmessage.JSONMessage{}
+		err := dec.Decode(&m)
+		if err == io.EOF {
+			return "", io.ErrUnexpectedEOF // stream ended without a cgroup line: fail instead of returning ""
+		}
+		if err != nil {
+			return "", err
+		}
+		if strings.HasPrefix(m.Stream, prefix) {
+			return strings.TrimSpace(m.Stream), nil
+		}
+	}
+}
+
+// Runs a docker build against a daemon with the given cgroup namespace default value.
+// Returns the container cgroup and daemon cgroup.
+func testBuildWithCgroupNs(t *testing.T, daemonNsMode string) (string, string) {
+	d := daemon.New(t, daemon.WithDefaultCgroupNamespaceMode(daemonNsMode))
+	d.StartWithBusybox(t)
+	defer d.Stop(t)
+
+	dockerfile := `
+		FROM busybox
+		RUN readlink /proc/self/ns/cgroup
+	`
+	ctx := context.Background()
+	source := fakecontext.New(t, "", fakecontext.WithDockerfile(dockerfile))
+	defer source.Close()
+
+	client := d.NewClientT(t)
+	resp, err := client.ImageBuild(ctx,
+		source.AsTarReader(t),
+		types.ImageBuildOptions{
+			Remove:      true,
+			ForceRemove: true,
+			Tags:        []string{"buildcgroupns"},
+		})
+	assert.NilError(t, err)
+	defer resp.Body.Close()
+
+	containerCgroup, err := getCgroupFromBuildOutput(resp.Body)
+	assert.NilError(t, err)
+	daemonCgroup := d.CgroupNamespace(t)
+
+	return containerCgroup, daemonCgroup
+}
+
+func TestCgroupNamespacesBuild(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
+	skip.If(t, testEnv.IsRemoteDaemon())
+	skip.If(t, !requirement.CgroupNamespacesEnabled())
+
+	// When the daemon defaults to private cgroup namespaces, containers launched
+	// should be in their own private cgroup namespace by default
+	containerCgroup, daemonCgroup := testBuildWithCgroupNs(t, "private")
+	assert.Assert(t, daemonCgroup != containerCgroup)
+}
+
+func TestCgroupNamespacesBuildDaemonHostMode(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
+	skip.If(t, testEnv.IsRemoteDaemon())
+	skip.If(t, !requirement.CgroupNamespacesEnabled())
+
+	// When the daemon defaults to host cgroup namespaces, containers
+	// launched should not be inside their own cgroup namespaces
+	containerCgroup, daemonCgroup := testBuildWithCgroupNs(t, "host")
+	assert.Assert(t, daemonCgroup == containerCgroup)
+}

+ 152 - 0
integration/container/run_cgroupns_linux_test.go

@@ -0,0 +1,152 @@
+package container // import "github.com/docker/docker/integration/container"
+
+import (
+	"context"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/docker/docker/client"
+	"github.com/docker/docker/integration/internal/container"
+	"github.com/docker/docker/integration/internal/requirement"
+	"github.com/docker/docker/internal/test/daemon"
+	"gotest.tools/assert"
+	is "gotest.tools/assert/cmp"
+	"gotest.tools/poll"
+	"gotest.tools/skip"
+)
+
+// Gets the value of the cgroup namespace for pid 1 of a container
+func containerCgroupNamespace(ctx context.Context, t *testing.T, client *client.Client, cID string) string {
+	res, err := container.Exec(ctx, client, cID, []string{"readlink", "/proc/1/ns/cgroup"})
+	assert.NilError(t, err)
+	assert.Assert(t, is.Len(res.Stderr(), 0))
+	assert.Equal(t, 0, res.ExitCode)
+	return strings.TrimSpace(res.Stdout())
+}
+
+// Bring up a daemon with the specified default cgroup namespace mode, and then create a container with the container options
+func testRunWithCgroupNs(t *testing.T, daemonNsMode string, containerOpts ...func(*container.TestContainerConfig)) (string, string) {
+	d := daemon.New(t, daemon.WithDefaultCgroupNamespaceMode(daemonNsMode))
+	client := d.NewClientT(t)
+	ctx := context.Background()
+
+	d.StartWithBusybox(t)
+	defer d.Stop(t)
+
+	cID := container.Run(t, ctx, client, containerOpts...)
+	poll.WaitOn(t, container.IsInState(ctx, client, cID, "running"), poll.WithDelay(100*time.Millisecond))
+
+	daemonCgroup := d.CgroupNamespace(t)
+	containerCgroup := containerCgroupNamespace(ctx, t, client, cID)
+	return containerCgroup, daemonCgroup
+}
+
+// Bring up a daemon with the specified default cgroup namespace mode. Create a container with the container options,
+// expecting an error with the specified string
+func testCreateFailureWithCgroupNs(t *testing.T, daemonNsMode string, errStr string, containerOpts ...func(*container.TestContainerConfig)) {
+	d := daemon.New(t, daemon.WithDefaultCgroupNamespaceMode(daemonNsMode))
+	client := d.NewClientT(t)
+	ctx := context.Background()
+
+	d.StartWithBusybox(t)
+	defer d.Stop(t)
+	container.CreateExpectingErr(t, ctx, client, errStr, containerOpts...)
+}
+
+func TestCgroupNamespacesRun(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
+	skip.If(t, testEnv.IsRemoteDaemon())
+	skip.If(t, !requirement.CgroupNamespacesEnabled())
+
+	// When the daemon defaults to private cgroup namespaces, containers launched
+	// should be in their own private cgroup namespace by default
+	containerCgroup, daemonCgroup := testRunWithCgroupNs(t, "private")
+	assert.Assert(t, daemonCgroup != containerCgroup)
+}
+
+// TestCgroupNamespacesRunPrivileged checks that, when the daemon defaults to private cgroup
+// namespaces, a privileged container is still launched in the daemon's cgroup namespace.
+func TestCgroupNamespacesRunPrivileged(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
+	skip.If(t, testEnv.IsRemoteDaemon())
+	skip.If(t, !requirement.CgroupNamespacesEnabled()) // skip when cgroup namespaces are unavailable, like the sibling tests
+
+	containerCgroup, daemonCgroup := testRunWithCgroupNs(t, "private", container.WithPrivileged(true))
+	assert.Assert(t, daemonCgroup == containerCgroup)
+}
+
+func TestCgroupNamespacesRunDaemonHostMode(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
+	skip.If(t, testEnv.IsRemoteDaemon())
+	skip.If(t, !requirement.CgroupNamespacesEnabled())
+
+	// When the daemon defaults to host cgroup namespaces, containers
+	// launched should not be inside their own cgroup namespaces
+	containerCgroup, daemonCgroup := testRunWithCgroupNs(t, "host")
+	assert.Assert(t, daemonCgroup == containerCgroup)
+}
+
+func TestCgroupNamespacesRunHostMode(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
+	skip.If(t, testEnv.IsRemoteDaemon())
+	skip.If(t, !requirement.CgroupNamespacesEnabled())
+
+	// When the daemon defaults to private cgroup namespaces, containers launched
+	// with a cgroup ns mode of "host" should not be inside their own cgroup namespaces
+	containerCgroup, daemonCgroup := testRunWithCgroupNs(t, "private", container.WithCgroupnsMode("host"))
+	assert.Assert(t, daemonCgroup == containerCgroup)
+}
+
+func TestCgroupNamespacesRunPrivateMode(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
+	skip.If(t, testEnv.IsRemoteDaemon())
+	skip.If(t, !requirement.CgroupNamespacesEnabled())
+
+	// When the daemon defaults to private cgroup namespaces, containers launched
+	// with a cgroup ns mode of "private" should be inside their own cgroup namespaces
+	containerCgroup, daemonCgroup := testRunWithCgroupNs(t, "private", container.WithCgroupnsMode("private"))
+	assert.Assert(t, daemonCgroup != containerCgroup)
+}
+
+func TestCgroupNamespacesRunPrivilegedAndPrivate(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
+	skip.If(t, testEnv.IsRemoteDaemon())
+	skip.If(t, !requirement.CgroupNamespacesEnabled())
+
+	// Running with both privileged and cgroupns=private is not allowed
+	errStr := "privileged mode is incompatible with private cgroup namespaces.  You must run the container in the host cgroup namespace when running privileged mode"
+	testCreateFailureWithCgroupNs(t, "private", errStr, container.WithPrivileged(true), container.WithCgroupnsMode("private"))
+}
+
+func TestCgroupNamespacesRunInvalidMode(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
+	skip.If(t, testEnv.IsRemoteDaemon())
+	skip.If(t, !requirement.CgroupNamespacesEnabled())
+
+	// An invalid cgroup namespace mode should return an error on container creation
+	errStr := "invalid cgroup namespace mode: invalid"
+	testCreateFailureWithCgroupNs(t, "private", errStr, container.WithCgroupnsMode("invalid"))
+}
+
+// Clients before 1.40 expect containers to be created in the host cgroup namespace,
+// regardless of the default setting of the daemon
+func TestCgroupNamespacesRunOlderClient(t *testing.T) {
+	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
+	skip.If(t, testEnv.IsRemoteDaemon())
+	skip.If(t, !requirement.CgroupNamespacesEnabled())
+
+	d := daemon.New(t, daemon.WithDefaultCgroupNamespaceMode("private"))
+	client := d.NewClientT(t, client.WithVersion("1.39"))
+
+	ctx := context.Background()
+	d.StartWithBusybox(t)
+	defer d.Stop(t)
+
+	cID := container.Run(t, ctx, client)
+	poll.WaitOn(t, container.IsInState(ctx, client, cID, "running"), poll.WithDelay(100*time.Millisecond))
+
+	daemonCgroup := d.CgroupNamespace(t)
+	containerCgroup := containerCgroupNamespace(ctx, t, client, cID)
+	assert.Assert(t, daemonCgroup == containerCgroup)
+}

+ 0 - 33
integration/container/run_linux_test.go

@@ -2,10 +2,6 @@ package container // import "github.com/docker/docker/integration/container"
 
 import (
 	"context"
-	"fmt"
-	"io/ioutil"
-	"os"
-	"path/filepath"
 	"strconv"
 	"strings"
 	"testing"
@@ -97,32 +93,3 @@ func TestNISDomainname(t *testing.T) {
 	assert.Equal(t, 0, res.ExitCode)
 	assert.Check(t, is.Equal(domainname, strings.TrimSpace(res.Stdout())))
 }
-
-func TestCgroupNamespaces(t *testing.T) {
-	skip.If(t, testEnv.DaemonInfo.OSType != "linux")
-	skip.If(t, testEnv.IsRemoteDaemon())
-
-	if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
-		t.Skip("cgroup namespaces are unsupported")
-	}
-
-	defer setupTest(t)()
-	client := testEnv.APIClient()
-	ctx := context.Background()
-
-	cID := container.Run(t, ctx, client)
-	poll.WaitOn(t, container.IsInState(ctx, client, cID, "running"), poll.WithDelay(100*time.Millisecond))
-
-	path := filepath.Join(os.Getenv("DEST"), "docker.pid")
-	b, err := ioutil.ReadFile(path)
-	assert.NilError(t, err)
-	link, err := os.Readlink(fmt.Sprintf("/proc/%s/ns/cgroup", string(b)))
-	assert.NilError(t, err)
-
-	// Check that the container's cgroup doesn't match the docker daemon's
-	res, err := container.Exec(ctx, client, cID, []string{"readlink", "/proc/1/ns/cgroup"})
-	assert.NilError(t, err)
-	assert.Assert(t, is.Len(res.Stderr(), 0))
-	assert.Equal(t, 0, res.ExitCode)
-	assert.Assert(t, link != strings.TrimSpace(res.Stdout()))
-}

+ 14 - 3
integration/internal/container/container.go

@@ -20,9 +20,9 @@ type TestContainerConfig struct {
 	NetworkingConfig *network.NetworkingConfig
 }
 
-// Create creates a container with the specified options
+// create creates a container with the specified options
 // nolint: golint
-func Create(t *testing.T, ctx context.Context, client client.APIClient, ops ...func(*TestContainerConfig)) string { // nolint: golint
+func create(t *testing.T, ctx context.Context, client client.APIClient, ops ...func(*TestContainerConfig)) (container.ContainerCreateCreatedBody, error) { // nolint: golint
 	t.Helper()
 	config := &TestContainerConfig{
 		Config: &container.Config{
@@ -37,12 +37,23 @@ func Create(t *testing.T, ctx context.Context, client client.APIClient, ops ...f
 		op(config)
 	}
 
-	c, err := client.ContainerCreate(ctx, config.Config, config.HostConfig, config.NetworkingConfig, config.Name)
+	return client.ContainerCreate(ctx, config.Config, config.HostConfig, config.NetworkingConfig, config.Name)
+}
+
+// Create creates a container with the specified options, fails the test if creation returns an error, and returns the new container's ID
+func Create(t *testing.T, ctx context.Context, client client.APIClient, ops ...func(*TestContainerConfig)) string { // nolint: golint
+	c, err := create(t, ctx, client, ops...)
 	assert.NilError(t, err)
 
 	return c.ID
 }
 
+// CreateExpectingErr attempts to create a container and asserts that the resulting error contains errMsg
+func CreateExpectingErr(t *testing.T, ctx context.Context, client client.APIClient, errMsg string, ops ...func(*TestContainerConfig)) { // nolint: golint
+	_, createErr := create(t, ctx, client, ops...)
+	assert.ErrorContains(t, createErr, errMsg)
+}
+
 // Run creates and start a container with the specified options
 // nolint: golint
 func Run(t *testing.T, ctx context.Context, client client.APIClient, ops ...func(*TestContainerConfig)) string { // nolint: golint

+ 20 - 0
integration/internal/container/ops.go

@@ -160,3 +160,23 @@ func WithUser(user string) func(c *TestContainerConfig) {
 		c.Config.User = user
 	}
 }
+
+// WithPrivileged returns an option that sets the container's privileged flag, allocating HostConfig if it is nil
+func WithPrivileged(privileged bool) func(*TestContainerConfig) {
+	return func(cfg *TestContainerConfig) {
+		if hostCfg := cfg.HostConfig; hostCfg == nil {
+			cfg.HostConfig = new(containertypes.HostConfig)
+		}
+		cfg.HostConfig.Privileged = privileged
+	}
+}
+
+// WithCgroupnsMode returns an option that sets the container's cgroup namespace mode, allocating HostConfig if it is nil
+func WithCgroupnsMode(mode string) func(*TestContainerConfig) {
+	return func(cfg *TestContainerConfig) {
+		if hostCfg := cfg.HostConfig; hostCfg == nil {
+			cfg.HostConfig = new(containertypes.HostConfig)
+		}
+		cfg.HostConfig.CgroupnsMode = containertypes.CgroupnsMode(mode)
+	}
+}

+ 10 - 0
integration/internal/requirement/requirement_linux.go

@@ -1,12 +1,22 @@
 package requirement // import "github.com/docker/docker/integration/internal/requirement"
 
 import (
+	"os"
 	"strings"
 
 	"github.com/docker/docker/pkg/parsers/kernel"
 	"gotest.tools/icmd"
 )
 
+// CgroupNamespacesEnabled reports whether this host exposes cgroup namespaces.
+// Only a missing /proc/self/ns/cgroup counts as "disabled"; any other Stat
+// outcome (success or a different error) is reported as enabled.
+func CgroupNamespacesEnabled() bool {
+	_, statErr := os.Stat("/proc/self/ns/cgroup")
+
+	return !os.IsNotExist(statErr)
+}
+
 func overlayFSSupported() bool {
 	result := icmd.RunCommand("/bin/sh", "-c", "cat /proc/filesystems")
 	if result.Error != nil {

+ 22 - 13
internal/test/daemon/daemon.go

@@ -60,16 +60,17 @@ type Daemon struct {
 	UseDefaultHost    bool
 	UseDefaultTLSHost bool
 
-	id            string
-	logFile       *os.File
-	cmd           *exec.Cmd
-	storageDriver string
-	userlandProxy bool
-	execRoot      string
-	experimental  bool
-	init          bool
-	dockerdBinary string
-	log           logT
+	id                         string
+	logFile                    *os.File
+	cmd                        *exec.Cmd
+	storageDriver              string
+	userlandProxy              bool
+	defaultCgroupNamespaceMode string
+	execRoot                   string
+	experimental               bool
+	init                       bool
+	dockerdBinary              string
+	log                        logT
 
 	// swarm related field
 	swarmListenAddr string
@@ -169,13 +170,18 @@ func (d *Daemon) ReadLogFile() ([]byte, error) {
 }
 
 // NewClientT creates new client based on daemon's socket path
-func (d *Daemon) NewClientT(t assert.TestingT) *client.Client {
+func (d *Daemon) NewClientT(t assert.TestingT, extraOpts ...client.Opt) *client.Client {
 	if ht, ok := t.(test.HelperT); ok {
 		ht.Helper()
 	}
-	c, err := client.NewClientWithOpts(
+	// Base options come first; caller-supplied extraOpts are appended after them.
+	clientOpts := []client.Opt{
 		client.FromEnv,
-		client.WithHost(d.Sock()))
+		client.WithHost(d.Sock()),
+	}
+	clientOpts = append(clientOpts, extraOpts...)
+
+	c, err := client.NewClientWithOpts(clientOpts...)
 	assert.NilError(t, err, "cannot create daemon client")
 	return c
 }
@@ -225,6 +231,9 @@ func (d *Daemon) StartWithLogFile(out *os.File, providedArgs ...string) error {
 		"--pidfile", fmt.Sprintf("%s/docker.pid", d.Folder),
 		fmt.Sprintf("--userland-proxy=%t", d.userlandProxy),
 	)
+	if d.defaultCgroupNamespaceMode != "" {
+		args = append(args, []string{"--default-cgroupns-mode", d.defaultCgroupNamespaceMode}...)
+	}
 	if d.experimental {
 		args = append(args, "--experimental")
 	}

+ 11 - 0
internal/test/daemon/daemon_unix.go

@@ -3,11 +3,14 @@
 package daemon // import "github.com/docker/docker/internal/test/daemon"
 
 import (
+	"fmt"
 	"os"
 	"path/filepath"
+	"strings"
 
 	"github.com/docker/docker/internal/test"
 	"golang.org/x/sys/unix"
+	"gotest.tools/assert"
 )
 
 func cleanupNetworkNamespace(t testingT, execRoot string) {
@@ -29,6 +32,14 @@ func cleanupNetworkNamespace(t testingT, execRoot string) {
 	})
 }
 
+// CgroupNamespace reads the daemon's /proc/<pid>/ns/cgroup symlink and returns its whitespace-trimmed target
+func (d *Daemon) CgroupNamespace(t assert.TestingT) string {
+	nsPath := fmt.Sprintf("/proc/%d/ns/cgroup", d.Pid())
+	target, err := os.Readlink(nsPath)
+	assert.NilError(t, err)
+	return strings.TrimSpace(target)
+}
+
 // SignalDaemonDump sends a signal to the daemon to write a dump file
 func SignalDaemonDump(pid int) {
 	unix.Kill(pid, unix.SIGQUIT)

+ 7 - 0
internal/test/daemon/daemon_windows.go

@@ -5,6 +5,7 @@ import (
 	"strconv"
 
 	"golang.org/x/sys/windows"
+	"gotest.tools/assert"
 )
 
 // SignalDaemonDump sends a signal to the daemon to write a dump file
@@ -23,3 +24,9 @@ func signalDaemonReload(pid int) error {
 
 func cleanupNetworkNamespace(t testingT, execRoot string) {
 }
+
+// CgroupNamespace always fails the test with an explanatory message: cgroup namespaces are not supported on Windows
+func (d *Daemon) CgroupNamespace(t assert.TestingT) string {
+	assert.Assert(t, false, "cgroup namespaces are not supported on Windows")
+	return "cgroup namespaces are not supported on Windows"
+}

+ 7 - 0
internal/test/daemon/ops.go

@@ -2,6 +2,13 @@ package daemon
 
 import "github.com/docker/docker/internal/test/environment"
 
+// WithDefaultCgroupNamespaceMode returns a daemon option that records mode as the daemon's default cgroup namespace mode
+func WithDefaultCgroupNamespaceMode(mode string) func(*Daemon) {
+	return func(target *Daemon) {
+		target.defaultCgroupNamespaceMode = mode
+	}
+}
+
 // WithExperimental sets the daemon in experimental mode
 func WithExperimental(d *Daemon) {
 	d.experimental = true

+ 26 - 0
runconfig/hostconfig_test.go

@@ -14,6 +14,32 @@ import (
 	is "gotest.tools/assert/cmp"
 )
 
+// TestCgroupnsModeTest checks the CgroupnsMode predicates; each row lists the expected {IsPrivate, IsHost, IsEmpty, Valid}.
+func TestCgroupnsModeTest(t *testing.T) {
+	cgroupNsModes := map[container.CgroupnsMode][]bool{
+		"":                {false, false, true, true},
+		"something:weird": {false, false, false, false},
+		"host":            {false, true, false, true},
+		"host:name":       {false, false, false, false},
+		"private":         {true, false, false, true},
+		"private:name":    {false, false, false, false},
+	}
+	for cgroupNsMode, state := range cgroupNsModes {
+		if cgroupNsMode.IsPrivate() != state[0] {
+			t.Fatalf("CgroupnsMode.IsPrivate for %q should have been %v but was %v", cgroupNsMode, state[0], cgroupNsMode.IsPrivate())
+		}
+		if cgroupNsMode.IsHost() != state[1] {
+			t.Fatalf("CgroupnsMode.IsHost for %q should have been %v but was %v", cgroupNsMode, state[1], cgroupNsMode.IsHost())
+		}
+		if cgroupNsMode.IsEmpty() != state[2] {
+			t.Fatalf("CgroupnsMode.IsEmpty for %q should have been %v but was %v", cgroupNsMode, state[2], cgroupNsMode.IsEmpty())
+		}
+		if cgroupNsMode.Valid() != state[3] {
+			t.Fatalf("CgroupnsMode.Valid for %q should have been %v but was %v", cgroupNsMode, state[3], cgroupNsMode.Valid())
+		}
+	}
+}
+
 // TODO Windows: This will need addressing for a Windows daemon.
 func TestNetworkModeTest(t *testing.T) {
 	networkModes := map[container.NetworkMode][]bool{