
Merge pull request #40174 from AkihiroSuda/cgroup2

support cgroup2
Sebastiaan van Stijn, 5 years ago
Commit e6c1820ef5

+ 6 - 1
cmd/dockerd/config_unix.go

@@ -9,6 +9,7 @@ import (
 	"github.com/docker/docker/opts"
 	"github.com/docker/docker/rootless"
 	units "github.com/docker/go-units"
+	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/pkg/errors"
 	"github.com/spf13/pflag"
 )
@@ -64,6 +65,10 @@ func installConfigFlags(conf *config.Config, flags *pflag.FlagSet) error {
 	// rootless needs to be explicitly specified for running "rootful" dockerd in rootless dockerd (#38702)
 	// Note that defaultUserlandProxyPath and honorXDG are configured according to the value of rootless.RunningWithRootlessKit, not the value of --rootless.
 	flags.BoolVar(&conf.Rootless, "rootless", rootless.RunningWithRootlessKit(), "Enable rootless mode; typically used with RootlessKit (experimental)")
-	flags.StringVar(&conf.CgroupNamespaceMode, "default-cgroupns-mode", config.DefaultCgroupNamespaceMode, `Default mode for containers cgroup namespace ("host" | "private")`)
+	defaultCgroupNamespaceMode := "host"
+	if cgroups.IsCgroup2UnifiedMode() {
+		defaultCgroupNamespaceMode = "private"
+	}
+	flags.StringVar(&conf.CgroupNamespaceMode, "default-cgroupns-mode", defaultCgroupNamespaceMode, `Default mode for containers cgroup namespace ("host" | "private")`)
 	return nil
 }
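
For context, a minimal sketch of what a cgroup v2 detection helper such as cgroups.IsCgroup2UnifiedMode() boils down to: checking whether /sys/fs/cgroup is mounted as a pure cgroup2 filesystem. This is an illustration only, not the runc implementation imported above.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// isCgroup2UnifiedMode reports whether the host runs a unified (cgroup v2)
// hierarchy by checking the filesystem magic of /sys/fs/cgroup.
func isCgroup2UnifiedMode() (bool, error) {
	var st unix.Statfs_t
	if err := unix.Statfs("/sys/fs/cgroup", &st); err != nil {
		return false, err
	}
	return st.Type == unix.CGROUP2_SUPER_MAGIC, nil
}

func main() {
	unified, err := isCgroup2UnifiedMode()
	if err != nil {
		fmt.Println("cannot detect cgroup mode:", err)
		return
	}
	if unified {
		fmt.Println(`default --default-cgroupns-mode: "private" (cgroup v2)`)
	} else {
		fmt.Println(`default --default-cgroupns-mode: "host" (cgroup v1)`)
	}
}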

+ 0 - 2
daemon/config/config_unix.go

@@ -11,8 +11,6 @@ import (
 )
 
 const (
-	// DefaultCgroupNamespaceMode is the default for a container's CgroupnsMode, if not set otherwise
-	DefaultCgroupNamespaceMode = "host" // TODO: change to private
 	// DefaultIpcMode is default for container's IpcMode, if not set otherwise
 	DefaultIpcMode = "private"
 )

+ 3 - 2
daemon/daemon.go

@@ -794,6 +794,7 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S
 		PluginStore: pluginStore,
 		startupDone: make(chan struct{}),
 	}
+
 	// Ensure the daemon is properly shutdown if there is a failure during
 	// initialization
 	defer func() {
@@ -914,7 +915,7 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S
 			}
 		}
 
-		return pluginexec.New(ctx, getPluginExecRoot(config.Root), pluginCli, config.ContainerdPluginNamespace, m)
+		return pluginexec.New(ctx, getPluginExecRoot(config.Root), pluginCli, config.ContainerdPluginNamespace, m, d.useShimV2())
 	}
 
 	// Plugin system initialization should happen before restore. Do not change order.
@@ -1063,7 +1064,7 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S
 
 	go d.execCommandGC()
 
-	d.containerd, err = libcontainerd.NewClient(ctx, d.containerdCli, filepath.Join(config.ExecRoot, "containerd"), config.ContainerdNamespace, d)
+	d.containerd, err = libcontainerd.NewClient(ctx, d.containerdCli, filepath.Join(config.ExecRoot, "containerd"), config.ContainerdNamespace, d, d.useShimV2())
 	if err != nil {
 		return nil, err
 	}

+ 17 - 4
daemon/daemon_unix.go

@@ -364,10 +364,15 @@ func (daemon *Daemon) adaptContainerSettings(hostConfig *containertypes.HostConf
 
 	// Set default cgroup namespace mode, if unset for container
 	if hostConfig.CgroupnsMode.IsEmpty() {
-		if hostConfig.Privileged {
+		// for cgroup v2: unshare cgroupns even for privileged containers
+		// https://github.com/containers/libpod/pull/4374#issuecomment-549776387
+		if hostConfig.Privileged && !cgroups.IsCgroup2UnifiedMode() {
 			hostConfig.CgroupnsMode = containertypes.CgroupnsMode("host")
 		} else {
-			m := config.DefaultCgroupNamespaceMode
+			m := "host"
+			if cgroups.IsCgroup2UnifiedMode() {
+				m = "private"
+			}
 			if daemon.configStore != nil {
 				m = daemon.configStore.CgroupNamespaceMode
 			}
@@ -708,8 +713,8 @@ func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes.
 			warnings = append(warnings, "Your kernel does not support cgroup namespaces.  Cgroup namespace setting discarded.")
 		}
 
-		if hostConfig.Privileged {
-			return warnings, fmt.Errorf("privileged mode is incompatible with private cgroup namespaces.  You must run the container in the host cgroup namespace when running privileged mode")
+		if hostConfig.Privileged && !cgroups.IsCgroup2UnifiedMode() {
+			return warnings, fmt.Errorf("privileged mode is incompatible with private cgroup namespaces on cgroup v1 host.  You must run the container in the host cgroup namespace when running privileged mode")
 		}
 	}
 
@@ -1594,6 +1599,10 @@ func (daemon *Daemon) initCgroupsPath(path string) error {
 		return nil
 	}
 
+	if cgroups.IsCgroup2UnifiedMode() {
+		return fmt.Errorf("daemon-scoped cpu-rt-period and cpu-rt-runtime are not implemented for cgroup v2")
+	}
+
 	// Recursively create cgroup to ensure that the system and all parent cgroups have values set
 	// for the period and runtime as this limits what the children can be set to.
 	daemon.initCgroupsPath(filepath.Dir(path))
@@ -1639,3 +1648,7 @@ func (daemon *Daemon) setupSeccompProfile() error {
 	}
 	return nil
 }
+
+func (daemon *Daemon) useShimV2() bool {
+	return cgroups.IsCgroup2UnifiedMode()
+}

+ 4 - 0
daemon/daemon_windows.go

@@ -653,3 +653,7 @@ func (daemon *Daemon) initRuntimes(_ map[string]types.Runtime) error {
 
 func setupResolvConf(config *config.Config) {
 }
+
+func (daemon *Daemon) useShimV2() bool {
+	return true
+}

+ 3 - 1
daemon/oci_linux.go

@@ -316,7 +316,9 @@ func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
 				return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode)
 			}
 
-			if cgroupNsMode.IsPrivate() && !c.HostConfig.Privileged {
+			// for cgroup v2: unshare cgroupns even for privileged containers
+			// https://github.com/containers/libpod/pull/4374#issuecomment-549776387
+			if cgroupNsMode.IsPrivate() && (cgroups.IsCgroup2UnifiedMode() || !c.HostConfig.Privileged) {
 				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
 				setNamespace(s, nsCgroup)
 			}
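
As a standalone illustration of what that branch does to the OCI spec, the hedged sketch below appends a cgroup namespace entry to a runtime spec; setNamespace in the hunk above additionally replaces an existing entry of the same type, which this sketch omits.

package main

import (
	"encoding/json"
	"fmt"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

func main() {
	s := &specs.Spec{Linux: &specs.Linux{}}

	// Giving the container a private cgroup namespace means adding a
	// namespace of type "cgroup" to the spec, as the hunk above does via
	// setNamespace(s, specs.LinuxNamespace{Type: "cgroup"}).
	s.Linux.Namespaces = append(s.Linux.Namespaces, specs.LinuxNamespace{
		Type: specs.CgroupNamespace,
	})

	// Prints the namespace list, including the {"type": "cgroup"} entry.
	out, _ := json.MarshalIndent(s.Linux.Namespaces, "", "  ")
	fmt.Println(string(out))
}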

+ 15 - 0
daemon/start_unix.go

@@ -8,6 +8,7 @@ import (
 	"path/filepath"
 
 	"github.com/containerd/containerd/runtime/linux/runctypes"
+	v2runcoptions "github.com/containerd/containerd/runtime/v2/runc/options"
 	"github.com/docker/docker/container"
 	"github.com/docker/docker/errdefs"
 	"github.com/pkg/errors"
@@ -43,6 +44,20 @@ func (daemon *Daemon) getLibcontainerdCreateOptions(container *container.Contain
 	if err != nil {
 		return nil, err
 	}
+	if daemon.useShimV2() {
+		opts := &v2runcoptions.Options{
+			BinaryName: path,
+			Root: filepath.Join(daemon.configStore.ExecRoot,
+				fmt.Sprintf("runtime-%s", container.HostConfig.Runtime)),
+		}
+
+		if UsingSystemd(daemon.configStore) {
+			opts.SystemdCgroup = true
+		}
+
+		return opts, nil
+
+	}
 	opts := &runctypes.RuncOptions{
 		Runtime: path,
 		RuntimeRoot: filepath.Join(daemon.configStore.ExecRoot,

+ 1 - 1
integration/container/run_cgroupns_linux_test.go

@@ -115,7 +115,7 @@ func TestCgroupNamespacesRunPrivilegedAndPrivate(t *testing.T) {
 	skip.If(t, !requirement.CgroupNamespacesEnabled())
 
 	// Running with both privileged and cgroupns=private is not allowed
-	errStr := "privileged mode is incompatible with private cgroup namespaces.  You must run the container in the host cgroup namespace when running privileged mode"
+	errStr := "privileged mode is incompatible with private cgroup namespaces on cgroup v1 host.  You must run the container in the host cgroup namespace when running privileged mode"
 	testCreateFailureWithCgroupNs(t, "private", errStr, container.WithPrivileged(true), container.WithCgroupnsMode("private"))
 }
 

+ 2 - 2
libcontainerd/libcontainerd_linux.go

@@ -9,6 +9,6 @@ import (
 )
 
 // NewClient creates a new libcontainerd client from a containerd client
-func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend) (libcontainerdtypes.Client, error) {
-	return remote.NewClient(ctx, cli, stateDir, ns, b)
+func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend, useShimV2 bool) (libcontainerdtypes.Client, error) {
+	return remote.NewClient(ctx, cli, stateDir, ns, b, useShimV2)
 }

+ 3 - 2
libcontainerd/libcontainerd_windows.go

@@ -11,9 +11,10 @@ import (
 )
 
 // NewClient creates a new libcontainerd client from a containerd client
-func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend) (libcontainerdtypes.Client, error) {
+func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend, useShimV2 bool) (libcontainerdtypes.Client, error) {
 	if !system.ContainerdRuntimeSupported() {
+		// useShimV2 is ignored for windows
 		return local.NewClient(ctx, cli, stateDir, ns, b)
 	}
-	return remote.NewClient(ctx, cli, stateDir, ns, b)
+	return remote.NewClient(ctx, cli, stateDir, ns, b, useShimV2)
 }

+ 52 - 16
libcontainerd/remote/client.go

@@ -23,6 +23,7 @@ import (
 	"github.com/containerd/containerd/events"
 	"github.com/containerd/containerd/images"
 	"github.com/containerd/containerd/runtime/linux/runctypes"
+	v2runcoptions "github.com/containerd/containerd/runtime/v2/runc/options"
 	"github.com/containerd/typeurl"
 	"github.com/docker/docker/errdefs"
 	"github.com/docker/docker/libcontainerd/queue"
@@ -45,21 +46,27 @@ type client struct {
 	logger   *logrus.Entry
 	ns       string
 
-	backend libcontainerdtypes.Backend
-	eventQ  queue.Queue
-	oomMu   sync.Mutex
-	oom     map[string]bool
+	backend         libcontainerdtypes.Backend
+	eventQ          queue.Queue
+	oomMu           sync.Mutex
+	oom             map[string]bool
+	useShimV2       bool
+	v2runcoptionsMu sync.Mutex
+	// v2runcoptions is used for copying options specified on Create() to Start()
+	v2runcoptions map[string]v2runcoptions.Options
 }
 
 // NewClient creates a new libcontainerd client from a containerd client
-func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend) (libcontainerdtypes.Client, error) {
+func NewClient(ctx context.Context, cli *containerd.Client, stateDir, ns string, b libcontainerdtypes.Backend, useShimV2 bool) (libcontainerdtypes.Client, error) {
 	c := &client{
-		client:   cli,
-		stateDir: stateDir,
-		logger:   logrus.WithField("module", "libcontainerd").WithField("namespace", ns),
-		ns:       ns,
-		backend:  b,
-		oom:      make(map[string]bool),
+		client:        cli,
+		stateDir:      stateDir,
+		logger:        logrus.WithField("module", "libcontainerd").WithField("namespace", ns),
+		ns:            ns,
+		backend:       b,
+		oom:           make(map[string]bool),
+		useShimV2:     useShimV2,
+		v2runcoptions: make(map[string]v2runcoptions.Options),
 	}
 
 	go c.processEventStream(ctx, ns)
@@ -126,9 +133,13 @@ func (c *client) Create(ctx context.Context, id string, ociSpec *specs.Spec, run
 	bdir := c.bundleDir(id)
 	c.logger.WithField("bundle", bdir).WithField("root", ociSpec.Root.Path).Debug("bundle dir created")
 
+	rt := runtimeName
+	if c.useShimV2 {
+		rt = shimV2RuntimeName
+	}
 	newOpts := []containerd.NewContainerOpts{
 		containerd.WithSpec(ociSpec),
-		containerd.WithRuntime(runtimeName, runtimeOptions),
+		containerd.WithRuntime(rt, runtimeOptions),
 		WithBundle(bdir, ociSpec),
 	}
 	opts = append(opts, newOpts...)
@@ -140,6 +151,13 @@ func (c *client) Create(ctx context.Context, id string, ociSpec *specs.Spec, run
 		}
 		return wrapError(err)
 	}
+	if c.useShimV2 {
+		if x, ok := runtimeOptions.(*v2runcoptions.Options); ok {
+			c.v2runcoptionsMu.Lock()
+			c.v2runcoptions[id] = *x
+			c.v2runcoptionsMu.Unlock()
+		}
+	}
 	return nil
 }
 
@@ -200,11 +218,26 @@ func (c *client) Start(ctx context.Context, id, checkpointDir string, withStdin
 
 	if runtime.GOOS != "windows" {
 		taskOpts = append(taskOpts, func(_ context.Context, _ *containerd.Client, info *containerd.TaskInfo) error {
-			info.Options = &runctypes.CreateOptions{
-				IoUid:       uint32(uid),
-				IoGid:       uint32(gid),
-				NoPivotRoot: os.Getenv("DOCKER_RAMDISK") != "",
+			if c.useShimV2 {
+				// For v2, we need to inherit options specified on Create
+				c.v2runcoptionsMu.Lock()
+				opts, ok := c.v2runcoptions[id]
+				c.v2runcoptionsMu.Unlock()
+				if !ok {
+					opts = v2runcoptions.Options{}
+				}
+				opts.IoUid = uint32(uid)
+				opts.IoGid = uint32(gid)
+				opts.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != ""
+				info.Options = &opts
+			} else {
+				info.Options = &runctypes.CreateOptions{
+					IoUid:       uint32(uid),
+					IoGid:       uint32(gid),
+					NoPivotRoot: os.Getenv("DOCKER_RAMDISK") != "",
+				}
 			}
+
 			return nil
 		})
 	} else {
@@ -466,6 +499,9 @@ func (c *client) Delete(ctx context.Context, containerID string) error {
 	c.oomMu.Lock()
 	delete(c.oom, containerID)
 	c.oomMu.Unlock()
+	c.v2runcoptionsMu.Lock()
+	delete(c.v2runcoptions, containerID)
+	c.v2runcoptionsMu.Unlock()
 	if os.Getenv("LIBCONTAINERD_NOCLEAN") != "1" {
 		if err := os.RemoveAll(bundle); err != nil {
 			c.logger.WithError(err).WithFields(logrus.Fields{

+ 4 - 1
libcontainerd/remote/client_linux.go

@@ -16,7 +16,10 @@ import (
 	"github.com/sirupsen/logrus"
 )
 
-const runtimeName = "io.containerd.runtime.v1.linux"
+const (
+	runtimeName       = "io.containerd.runtime.v1.linux"
+	shimV2RuntimeName = "io.containerd.runc.v2"
+)
 
 func summaryFromInterface(i interface{}) (*libcontainerdtypes.Summary, error) {
 	return &libcontainerdtypes.Summary{}, nil
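
To illustrate how these runtime names are consumed, here is a hedged, standalone sketch of a containerd client creating a container with the runc shim v2 runtime ("io.containerd.runc.v2"). The socket path, namespace, image reference, and IDs are assumptions for the example (the image is expected to be pulled already); this is not the Moby code path itself.

package main

import (
	"context"
	"log"

	"github.com/containerd/containerd"
	"github.com/containerd/containerd/namespaces"
	"github.com/containerd/containerd/oci"
	v2runcoptions "github.com/containerd/containerd/runtime/v2/runc/options"
)

func main() {
	client, err := containerd.New("/run/containerd/containerd.sock")
	if err != nil {
		log.Fatal(err)
	}
	defer client.Close()

	ctx := namespaces.WithNamespace(context.Background(), "example")

	image, err := client.GetImage(ctx, "docker.io/library/busybox:latest")
	if err != nil {
		log.Fatal(err) // assumes the image was pulled beforehand
	}

	// Runtime-specific options for the v2 shim, analogous to the
	// v2runcoptions.Options built in getLibcontainerdCreateOptions above.
	opts := &v2runcoptions.Options{
		BinaryName: "runc",
	}

	container, err := client.NewContainer(ctx, "example-ctr",
		containerd.WithNewSnapshot("example-snap", image),
		containerd.WithNewSpec(oci.WithImageConfig(image)),
		containerd.WithRuntime("io.containerd.runc.v2", opts),
	)
	if err != nil {
		log.Fatal(err)
	}
	defer container.Delete(ctx, containerd.WithSnapshotCleanup)

	log.Println("created container with shim v2 runtime:", container.ID())
}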

+ 4 - 1
libcontainerd/remote/client_windows.go

@@ -16,7 +16,10 @@ import (
 	"github.com/sirupsen/logrus"
 )
 
-const runtimeName = "io.containerd.runhcs.v1"
+const (
+	runtimeName       = "io.containerd.runhcs.v1"
+	shimV2RuntimeName = runtimeName
+)
 
 func summaryFromInterface(i interface{}) (*libcontainerdtypes.Summary, error) {
 	switch pd := i.(type) {

+ 45 - 0
pkg/sysinfo/sysinfo_linux.go

@@ -60,6 +60,9 @@ func New(quiet bool) *SysInfo {
 		w := o(sysInfo, cgMounts)
 		warnings = append(warnings, w...)
 	}
+	if cgroups.IsCgroup2UnifiedMode() {
+		warnings = append(warnings, "Your system is running cgroup v2 (unsupported)")
+	}
 	if !quiet {
 		for _, w := range warnings {
 			logrus.Warn(w)
@@ -70,6 +73,15 @@ func New(quiet bool) *SysInfo {
 
 // applyMemoryCgroupInfo reads the memory information from the memory cgroup mount point.
 func applyMemoryCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
+	if cgroups.IsCgroup2UnifiedMode() {
+		// TODO: check cgroup2 info correctly
+		info.MemoryLimit = true
+		info.SwapLimit = true
+		info.MemoryReservation = true
+		info.OomKillDisable = true
+		info.MemorySwappiness = true
+		return nil
+	}
 	var warnings []string
 	mountPoint, ok := cgMounts["memory"]
 	if !ok {
@@ -108,6 +120,15 @@ func applyMemoryCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
 
 // applyCPUCgroupInfo reads the cpu information from the cpu cgroup mount point.
 func applyCPUCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
+	if cgroups.IsCgroup2UnifiedMode() {
+		// TODO: check cgroup2 info correctly
+		info.CPUShares = true
+		info.CPUCfsPeriod = true
+		info.CPUCfsQuota = true
+		info.CPURealtimePeriod = true
+		info.CPURealtimeRuntime = true
+		return nil
+	}
 	var warnings []string
 	mountPoint, ok := cgMounts["cpu"]
 	if !ok {
@@ -145,6 +166,15 @@ func applyCPUCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
 
 // applyBlkioCgroupInfo reads the blkio information from the blkio cgroup mount point.
 func applyBlkioCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
+	if cgroups.IsCgroup2UnifiedMode() {
+		// TODO: check cgroup2 info correctly
+		info.BlkioWeight = true
+		info.BlkioReadBpsDevice = true
+		info.BlkioWriteBpsDevice = true
+		info.BlkioReadIOpsDevice = true
+		info.BlkioWriteIOpsDevice = true
+		return nil
+	}
 	var warnings []string
 	mountPoint, ok := cgMounts["blkio"]
 	if !ok {
@@ -186,6 +216,11 @@ func applyBlkioCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
 
 // applyCPUSetCgroupInfo reads the cpuset information from the cpuset cgroup mount point.
 func applyCPUSetCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
+	if cgroups.IsCgroup2UnifiedMode() {
+		// TODO: check cgroup2 info correctly
+		info.Cpuset = true
+		return nil
+	}
 	var warnings []string
 	mountPoint, ok := cgMounts["cpuset"]
 	if !ok {
@@ -213,6 +248,11 @@ func applyCPUSetCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
 
 // applyPIDSCgroupInfo reads the pids information from the pids cgroup mount point.
 func applyPIDSCgroupInfo(info *SysInfo, _ map[string]string) []string {
+	if cgroups.IsCgroup2UnifiedMode() {
+		// TODO: check cgroup2 info correctly
+		info.PidsLimit = true
+		return nil
+	}
 	var warnings []string
 	_, err := cgroups.FindCgroupMountpoint("", "pids")
 	if err != nil {
@@ -225,6 +265,11 @@ func applyPIDSCgroupInfo(info *SysInfo, _ map[string]string) []string {
 
 // applyDevicesCgroupInfo reads the devices information from the devices cgroup mount point.
 func applyDevicesCgroupInfo(info *SysInfo, cgMounts map[string]string) []string {
+	if cgroups.IsCgroup2UnifiedMode() {
+		// TODO: check cgroup2 info correctly
+		info.CgroupDevicesEnabled = true
+		return nil
+	}
 	var warnings []string
 	_, ok := cgMounts["devices"]
 	info.CgroupDevicesEnabled = ok
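
The TODO branches above hard-code every capability to true on cgroup v2. As one hedged way those TODOs could eventually be resolved, the sketch below probes which controllers the unified hierarchy actually exposes by reading the root cgroup.controllers file; the helper name and approach are illustrative, not part of this change.

package main

import (
	"fmt"
	"os"
	"strings"
)

// v2Controllers returns the set of controllers enabled at the root of the
// unified hierarchy, as listed in /sys/fs/cgroup/cgroup.controllers
// (e.g. "cpuset cpu io memory hugetlb pids").
func v2Controllers() (map[string]bool, error) {
	data, err := os.ReadFile("/sys/fs/cgroup/cgroup.controllers")
	if err != nil {
		return nil, err
	}
	set := make(map[string]bool)
	for _, c := range strings.Fields(string(data)) {
		set[c] = true
	}
	return set, nil
}

func main() {
	ctrls, err := v2Controllers()
	if err != nil {
		fmt.Println("not a cgroup v2 host?", err)
		return
	}
	// A capability such as MemoryLimit or PidsLimit only makes sense when
	// the corresponding controller is actually available.
	fmt.Println("memory controller:", ctrls["memory"])
	fmt.Println("pids controller:", ctrls["pids"])
}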

+ 2 - 2
plugin/executor/containerd/containerd.go

@@ -26,13 +26,13 @@ type ExitHandler interface {
 }
 
 // New creates a new containerd plugin executor
-func New(ctx context.Context, rootDir string, cli *containerd.Client, ns string, exitHandler ExitHandler) (*Executor, error) {
+func New(ctx context.Context, rootDir string, cli *containerd.Client, ns string, exitHandler ExitHandler, useShimV2 bool) (*Executor, error) {
 	e := &Executor{
 		rootDir:     rootDir,
 		exitHandler: exitHandler,
 	}
 
-	client, err := libcontainerd.NewClient(ctx, cli, rootDir, ns, e)
+	client, err := libcontainerd.NewClient(ctx, cli, rootDir, ns, e, useShimV2)
 	if err != nil {
 		return nil, errors.Wrap(err, "error creating containerd exec client")
 	}