Browse Source

Merge pull request #45858 from thaJeztah/update_cgroups

vendor: github.com/containerd/cgroups/v3 v3.0.2
Sebastiaan van Stijn 2 years ago
parent
commit
46c7319f02

+ 1 - 1
vendor.mod

@@ -25,7 +25,7 @@ require (
 	github.com/bsphere/le_go v0.0.0-20200109081728-fc06dab2caa8
 	github.com/cloudflare/cfssl v0.0.0-20180323000720-5d63dbd981b5
 	github.com/container-orchestrated-devices/container-device-interface v0.5.5-0.20230516140309-1e6752771dc5
-	github.com/containerd/cgroups/v3 v3.0.1
+	github.com/containerd/cgroups/v3 v3.0.2
 	github.com/containerd/containerd v1.6.21
 	github.com/containerd/continuity v0.3.0
 	github.com/containerd/fifo v1.1.0

+ 2 - 2
vendor.sum

@@ -349,8 +349,8 @@ github.com/containerd/cgroups v0.0.0-20210114181951-8a68de567b68/go.mod h1:ZJeTF
 github.com/containerd/cgroups v1.0.1/go.mod h1:0SJrPIenamHDcZhEcJMNBB85rHcUsw4f25ZfBiPYRkU=
 github.com/containerd/cgroups v1.0.4 h1:jN/mbWBEaz+T1pi5OFtnkQ+8qnmEbAr1Oo1FRm5B0dA=
 github.com/containerd/cgroups v1.0.4/go.mod h1:nLNQtsF7Sl2HxNebu77i1R0oDlhiTG+kO4JTrUzo6IA=
-github.com/containerd/cgroups/v3 v3.0.1 h1:4hfGvu8rfGIwVIDd+nLzn/B9ZXx4BcCjzt5ToenJRaE=
-github.com/containerd/cgroups/v3 v3.0.1/go.mod h1:/vtwk1VXrtoa5AaZLkypuOJgA/6DyPMZHJPGQNtlHnw=
+github.com/containerd/cgroups/v3 v3.0.2 h1:f5WFqIVSgo5IZmtTT3qVBo6TzI1ON6sycSBKkymb9L0=
+github.com/containerd/cgroups/v3 v3.0.2/go.mod h1:JUgITrzdFqp42uI2ryGA+ge0ap/nxzYgkGmIcetmErE=
 github.com/containerd/console v0.0.0-20180822173158-c12b1e7919c1/go.mod h1:Tj/on1eG8kiEhd0+fhSDzsPAFESxzBBvdyEgyryXffw=
 github.com/containerd/console v0.0.0-20181022165439-0650fd9eeb50/go.mod h1:Tj/on1eG8kiEhd0+fhSDzsPAFESxzBBvdyEgyryXffw=
 github.com/containerd/console v0.0.0-20191206165004-02ecf6a7291e/go.mod h1:8Pf4gM6VEbTNRIT26AyyU7hxdQU3MvAvxVI0sc00XBE=

+ 21 - 0
vendor/github.com/containerd/cgroups/v3/README.md

@@ -201,6 +201,27 @@ if err != nil {
 }
 ```
 
+
+### Get and set cgroup type
+```go
+m, err := cgroup2.LoadSystemd("/", "my-cgroup-abc.slice")
+if err != nil {
+    return err
+}
+
+// https://www.kernel.org/doc/html/v5.0/admin-guide/cgroup-v2.html#threads
+cgType, err := m.GetType()
+if err != nil {
+    return err
+}
+fmt.Println(cgType)
+
+err = m.SetType(cgroup2.Threaded)
+if err != nil {
+    return err
+}
+```
+
 ### Attention
 
 Static paths should not include the `/sys/fs/cgroup/` prefix; they should start with your own cgroup's name

+ 0 - 1
vendor/github.com/containerd/cgroups/v3/cgroup1/blkio.go

@@ -331,7 +331,6 @@ type deviceKey struct {
 // keyed by major and minor number. Since devices may be mapped multiple times,
 // we err on taking the first occurrence.
 func getDevices(r io.Reader) (map[deviceKey]string, error) {
-
 	var (
 		s       = bufio.NewScanner(r)
 		devices = make(map[deviceKey]string)

+ 7 - 3
vendor/github.com/containerd/cgroups/v3/cgroup1/cgroup.go

@@ -41,7 +41,7 @@ func New(path Path, resources *specs.LinuxResources, opts ...InitOpts) (Cgroup,
 			return nil, err
 		}
 	}
-	subsystems, err := config.hiearchy()
+	subsystems, err := config.hierarchy()
 	if err != nil {
 		return nil, err
 	}
@@ -79,7 +79,7 @@ func Load(path Path, opts ...InitOpts) (Cgroup, error) {
 		}
 	}
 	var activeSubsystems []Subsystem
-	subsystems, err := config.hiearchy()
+	subsystems, err := config.hierarchy()
 	if err != nil {
 		return nil, err
 	}
@@ -158,7 +158,7 @@ func (c *cgroup) subsystemsFilter(subsystems ...Name) []Subsystem {
 		return c.subsystems
 	}
 
-	var filteredSubsystems = []Subsystem{}
+	filteredSubsystems := []Subsystem{}
 	for _, s := range c.subsystems {
 		for _, f := range subsystems {
 			if s.Name() == f {
@@ -259,6 +259,10 @@ func (c *cgroup) Delete() error {
 		// kernel prevents cgroups with running process from being removed, check the tree is empty
 		procs, err := c.processes(s.Name(), true, cgroupProcs)
 		if err != nil {
+			// if the control group does not exist within a subsystem, then proceed to the next subsystem
+			if errors.Is(err, os.ErrNotExist) {
+				continue
+			}
 			return err
 		}
 		if len(procs) > 0 {

+ 1 - 1
vendor/github.com/containerd/cgroups/v3/cgroup1/control.go

@@ -28,7 +28,7 @@ type procType = string
 const (
 	cgroupProcs    procType = "cgroup.procs"
 	cgroupTasks    procType = "tasks"
-	defaultDirPerm          = 0755
+	defaultDirPerm          = 0o755
 )
 
 // defaultFilePerm is a var so that the test framework can change the filemode

+ 1 - 1
vendor/github.com/containerd/cgroups/v3/cgroup1/memory.go

@@ -472,7 +472,7 @@ func (m *memoryController) memoryEvent(path string, event MemoryEvent) (uintptr,
 	defer evtFile.Close()
 	data := fmt.Sprintf("%d %d %s", efd, evtFile.Fd(), event.Arg())
 	evctlPath := filepath.Join(root, "cgroup.event_control")
-	if err := os.WriteFile(evctlPath, []byte(data), 0700); err != nil {
+	if err := os.WriteFile(evctlPath, []byte(data), 0o700); err != nil {
 		unix.Close(efd)
 		return 0, err
 	}

+ 3 - 3
vendor/github.com/containerd/cgroups/v3/cgroup1/opts.go

@@ -36,13 +36,13 @@ type InitOpts func(*InitConfig) error
 type InitConfig struct {
 	// InitCheck can be used to check initialization errors from the subsystem
 	InitCheck InitCheck
-	hiearchy  Hierarchy
+	hierarchy Hierarchy
 }
 
 func newInitConfig() *InitConfig {
 	return &InitConfig{
 		InitCheck: RequireDevices,
-		hiearchy:  Default,
+		hierarchy: Default,
 	}
 }
 
@@ -66,7 +66,7 @@ func RequireDevices(s Subsystem, _ Path, _ error) error {
 // The default list is coming from /proc/self/mountinfo.
 func WithHiearchy(h Hierarchy) InitOpts {
 	return func(c *InitConfig) error {
-		c.hiearchy = h
+		c.hierarchy = h
 		return nil
 	}
 }

+ 1 - 8
vendor/github.com/containerd/cgroups/v3/cgroup1/pids.go

@@ -20,7 +20,6 @@ import (
 	"os"
 	"path/filepath"
 	"strconv"
-	"strings"
 
 	v1 "github.com/containerd/cgroups/v3/cgroup1/stats"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -67,16 +66,10 @@ func (p *pidsController) Stat(path string, stats *v1.Metrics) error {
 	if err != nil {
 		return err
 	}
-	var max uint64
-	maxData, err := os.ReadFile(filepath.Join(p.Path(path), "pids.max"))
+	max, err := readUint(filepath.Join(p.Path(path), "pids.max"))
 	if err != nil {
 		return err
 	}
-	if maxS := strings.TrimSpace(string(maxData)); maxS != "max" {
-		if max, err = parseUint(maxS, 10, 64); err != nil {
-			return err
-		}
-	}
 	stats.Pids = &v1.PidsStat{
 		Current: current,
 		Limit:   max,

+ 0 - 1
vendor/github.com/containerd/cgroups/v3/cgroup1/rdma.go

@@ -124,7 +124,6 @@ func toRdmaEntry(strEntries []string) []*v1.RdmaEntry {
 }
 
 func (p *rdmaController) Stat(path string, stats *v1.Metrics) error {
-
 	currentData, err := os.ReadFile(filepath.Join(p.Path(path), "rdma.current"))
 	if err != nil {
 		return err

+ 2 - 3
vendor/github.com/containerd/cgroups/v3/cgroup1/systemd.go

@@ -29,7 +29,7 @@ import (
 
 const (
 	SystemdDbus  Name = "systemd"
-	defaultSlice      = "system.slice"
+	defaultSlice Name = "system.slice"
 )
 
 var (
@@ -56,7 +56,7 @@ func Systemd() ([]Subsystem, error) {
 
 func Slice(slice, name string) Path {
 	if slice == "" {
-		slice = defaultSlice
+		slice = string(defaultSlice)
 	}
 	return func(subsystem Name) (string, error) {
 		return filepath.Join(slice, name), nil
@@ -70,7 +70,6 @@ func NewSystemd(root string) (*SystemdController, error) {
 }
 
 type SystemdController struct {
-	mu   sync.Mutex
 	root string
 }
 

+ 17 - 2
vendor/github.com/containerd/cgroups/v3/cgroup1/utils.go

@@ -18,6 +18,7 @@ package cgroup1
 
 import (
 	"bufio"
+	"bytes"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -131,11 +132,25 @@ func hugePageSizes() ([]string, error) {
 }
 
 func readUint(path string) (uint64, error) {
-	v, err := os.ReadFile(path)
+	f, err := os.Open(path)
 	if err != nil {
 		return 0, err
 	}
-	return parseUint(strings.TrimSpace(string(v)), 10, 64)
+	defer f.Close()
+
+	// We should only need 20 bytes for the max uint64, but for a nice power of 2
+	// let's use 32.
+	b := make([]byte, 32)
+	n, err := f.Read(b)
+	if err != nil {
+		return 0, err
+	}
+	s := string(bytes.TrimSpace(b[:n]))
+	if s == "max" {
+		// Return 0 for the max value to maintain backward compatibility.
+		return 0, nil
+	}
+	return parseUint(s, 10, 64)
 }
 
 func parseUint(s string, base, bitSize int) (uint64, error) {

+ 1 - 1
vendor/github.com/containerd/cgroups/v3/cgroup1/v1.go

@@ -45,7 +45,7 @@ func Default() ([]Subsystem, error) {
 }
 
 // v1MountPoint returns the mount point where the cgroup
-// mountpoints are mounted in a single hiearchy
+// mountpoints are mounted in a single hierarchy
 func v1MountPoint() (string, error) {
 	f, err := os.Open("/proc/self/mountinfo")
 	if err != nil {

+ 2 - 2
vendor/github.com/containerd/cgroups/v3/cgroup2/devicefilter.go

@@ -167,7 +167,7 @@ func (p *program) appendDevice(dev specs.LinuxDeviceCgroup) error {
 	}
 	p.insts = append(p.insts, acceptBlock(dev.Allow)...)
 	// set blockSym to the first instruction we added in this iteration
-	p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
+	p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].WithSymbol(blockSym)
 	p.blockID++
 	return nil
 }
@@ -180,7 +180,7 @@ func (p *program) finalize() (asm.Instructions, error) {
 	blockSym := fmt.Sprintf("block-%d", p.blockID)
 	p.insts = append(p.insts,
 		// R0 <- 0
-		asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),
+		asm.Mov.Imm32(asm.R0, 0).WithSymbol(blockSym),
 		asm.Return(),
 	)
 	p.blockID = -1

+ 104 - 173
vendor/github.com/containerd/cgroups/v3/cgroup2/manager.go

@@ -21,13 +21,11 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"io"
 	"math"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
-	"syscall"
 	"time"
 
 	"github.com/containerd/cgroups/v3/cgroup2/stats"
@@ -43,13 +41,12 @@ const (
 	subtreeControl     = "cgroup.subtree_control"
 	controllersFile    = "cgroup.controllers"
 	killFile           = "cgroup.kill"
+	typeFile           = "cgroup.type"
 	defaultCgroup2Path = "/sys/fs/cgroup"
 	defaultSlice       = "system.slice"
 )
 
-var (
-	canDelegate bool
-)
+var canDelegate bool
 
 type Event struct {
 	Low     uint64
@@ -99,7 +96,9 @@ func (r *Resources) Values() (o []Value) {
 func (r *Resources) EnabledControllers() (c []string) {
 	if r.CPU != nil {
 		c = append(c, "cpu")
-		c = append(c, "cpuset")
+		if r.CPU.Cpus != "" || r.CPU.Mems != "" {
+			c = append(c, "cpuset")
+		}
 	}
 	if r.Memory != nil {
 		c = append(c, "memory")
@@ -238,6 +237,35 @@ func setResources(path string, resources *Resources) error {
 	return nil
 }
 
+// CgroupType represents the types a cgroup can be.
+type CgroupType string
+
+const (
+	Domain   CgroupType = "domain"
+	Threaded CgroupType = "threaded"
+)
+
+func (c *Manager) GetType() (CgroupType, error) {
+	val, err := os.ReadFile(filepath.Join(c.path, typeFile))
+	if err != nil {
+		return "", err
+	}
+	trimmed := strings.TrimSpace(string(val))
+	return CgroupType(trimmed), nil
+}
+
+func (c *Manager) SetType(cgType CgroupType) error {
+	// NOTE: We could abort if cgType != Threaded here as currently
+	// it's not possible to revert back to domain, but not sure
+	// it's worth being that opinionated, especially if that may
+	// ever change.
+	v := Value{
+		filename: typeFile,
+		value:    string(cgType),
+	}
+	return writeValues(c.path, []Value{v})
+}
+
 func (c *Manager) RootControllers() ([]string, error) {
 	b, err := os.ReadFile(filepath.Join(c.unifiedMountpoint, controllersFile))
 	if err != nil {
@@ -492,17 +520,15 @@ func (c *Manager) MoveTo(destination *Manager) error {
 	return nil
 }
 
-var singleValueFiles = []string{
-	"pids.current",
-	"pids.max",
-}
-
 func (c *Manager) Stat() (*stats.Metrics, error) {
 	controllers, err := c.Controllers()
 	if err != nil {
 		return nil, err
 	}
-	out := make(map[string]interface{})
+	// Sizing this avoids an allocation to increase the map at runtime;
+	// currently the default bucket size is 8 and we put 40+ elements
+	// in it so we'd always end up allocating.
+	out := make(map[string]uint64, 50)
 	for _, controller := range controllers {
 		switch controller {
 		case "cpu", "memory":
@@ -514,66 +540,58 @@ func (c *Manager) Stat() (*stats.Metrics, error) {
 			}
 		}
 	}
-	for _, name := range singleValueFiles {
-		if err := readSingleFile(c.path, name, out); err != nil {
-			if os.IsNotExist(err) {
-				continue
-			}
-			return nil, err
-		}
-	}
-	memoryEvents := make(map[string]interface{})
+	memoryEvents := make(map[string]uint64)
 	if err := readKVStatsFile(c.path, "memory.events", memoryEvents); err != nil {
 		if !os.IsNotExist(err) {
 			return nil, err
 		}
 	}
-	var metrics stats.Metrics
 
+	var metrics stats.Metrics
 	metrics.Pids = &stats.PidsStat{
-		Current: getPidValue("pids.current", out),
-		Limit:   getPidValue("pids.max", out),
+		Current: getStatFileContentUint64(filepath.Join(c.path, "pids.current")),
+		Limit:   getStatFileContentUint64(filepath.Join(c.path, "pids.max")),
 	}
 	metrics.CPU = &stats.CPUStat{
-		UsageUsec:     getUint64Value("usage_usec", out),
-		UserUsec:      getUint64Value("user_usec", out),
-		SystemUsec:    getUint64Value("system_usec", out),
-		NrPeriods:     getUint64Value("nr_periods", out),
-		NrThrottled:   getUint64Value("nr_throttled", out),
-		ThrottledUsec: getUint64Value("throttled_usec", out),
+		UsageUsec:     out["usage_usec"],
+		UserUsec:      out["user_usec"],
+		SystemUsec:    out["system_usec"],
+		NrPeriods:     out["nr_periods"],
+		NrThrottled:   out["nr_throttled"],
+		ThrottledUsec: out["throttled_usec"],
 	}
 	metrics.Memory = &stats.MemoryStat{
-		Anon:                  getUint64Value("anon", out),
-		File:                  getUint64Value("file", out),
-		KernelStack:           getUint64Value("kernel_stack", out),
-		Slab:                  getUint64Value("slab", out),
-		Sock:                  getUint64Value("sock", out),
-		Shmem:                 getUint64Value("shmem", out),
-		FileMapped:            getUint64Value("file_mapped", out),
-		FileDirty:             getUint64Value("file_dirty", out),
-		FileWriteback:         getUint64Value("file_writeback", out),
-		AnonThp:               getUint64Value("anon_thp", out),
-		InactiveAnon:          getUint64Value("inactive_anon", out),
-		ActiveAnon:            getUint64Value("active_anon", out),
-		InactiveFile:          getUint64Value("inactive_file", out),
-		ActiveFile:            getUint64Value("active_file", out),
-		Unevictable:           getUint64Value("unevictable", out),
-		SlabReclaimable:       getUint64Value("slab_reclaimable", out),
-		SlabUnreclaimable:     getUint64Value("slab_unreclaimable", out),
-		Pgfault:               getUint64Value("pgfault", out),
-		Pgmajfault:            getUint64Value("pgmajfault", out),
-		WorkingsetRefault:     getUint64Value("workingset_refault", out),
-		WorkingsetActivate:    getUint64Value("workingset_activate", out),
-		WorkingsetNodereclaim: getUint64Value("workingset_nodereclaim", out),
-		Pgrefill:              getUint64Value("pgrefill", out),
-		Pgscan:                getUint64Value("pgscan", out),
-		Pgsteal:               getUint64Value("pgsteal", out),
-		Pgactivate:            getUint64Value("pgactivate", out),
-		Pgdeactivate:          getUint64Value("pgdeactivate", out),
-		Pglazyfree:            getUint64Value("pglazyfree", out),
-		Pglazyfreed:           getUint64Value("pglazyfreed", out),
-		ThpFaultAlloc:         getUint64Value("thp_fault_alloc", out),
-		ThpCollapseAlloc:      getUint64Value("thp_collapse_alloc", out),
+		Anon:                  out["anon"],
+		File:                  out["file"],
+		KernelStack:           out["kernel_stack"],
+		Slab:                  out["slab"],
+		Sock:                  out["sock"],
+		Shmem:                 out["shmem"],
+		FileMapped:            out["file_mapped"],
+		FileDirty:             out["file_dirty"],
+		FileWriteback:         out["file_writeback"],
+		AnonThp:               out["anon_thp"],
+		InactiveAnon:          out["inactive_anon"],
+		ActiveAnon:            out["active_anon"],
+		InactiveFile:          out["inactive_file"],
+		ActiveFile:            out["active_file"],
+		Unevictable:           out["unevictable"],
+		SlabReclaimable:       out["slab_reclaimable"],
+		SlabUnreclaimable:     out["slab_unreclaimable"],
+		Pgfault:               out["pgfault"],
+		Pgmajfault:            out["pgmajfault"],
+		WorkingsetRefault:     out["workingset_refault"],
+		WorkingsetActivate:    out["workingset_activate"],
+		WorkingsetNodereclaim: out["workingset_nodereclaim"],
+		Pgrefill:              out["pgrefill"],
+		Pgscan:                out["pgscan"],
+		Pgsteal:               out["pgsteal"],
+		Pgactivate:            out["pgactivate"],
+		Pgdeactivate:          out["pgdeactivate"],
+		Pglazyfree:            out["pglazyfree"],
+		Pglazyfreed:           out["pglazyfreed"],
+		ThpFaultAlloc:         out["thp_fault_alloc"],
+		ThpCollapseAlloc:      out["thp_collapse_alloc"],
 		Usage:                 getStatFileContentUint64(filepath.Join(c.path, "memory.current")),
 		UsageLimit:            getStatFileContentUint64(filepath.Join(c.path, "memory.max")),
 		SwapUsage:             getStatFileContentUint64(filepath.Join(c.path, "memory.swap.current")),
@@ -581,11 +599,11 @@ func (c *Manager) Stat() (*stats.Metrics, error) {
 	}
 	if len(memoryEvents) > 0 {
 		metrics.MemoryEvents = &stats.MemoryEvents{
-			Low:     getUint64Value("low", memoryEvents),
-			High:    getUint64Value("high", memoryEvents),
-			Max:     getUint64Value("max", memoryEvents),
-			Oom:     getUint64Value("oom", memoryEvents),
-			OomKill: getUint64Value("oom_kill", memoryEvents),
+			Low:     memoryEvents["low"],
+			High:    memoryEvents["high"],
+			Max:     memoryEvents["max"],
+			Oom:     memoryEvents["oom"],
+			OomKill: memoryEvents["oom_kill"],
 		}
 	}
 	metrics.Io = &stats.IOStat{Usage: readIoStats(c.path)}
@@ -598,56 +616,7 @@ func (c *Manager) Stat() (*stats.Metrics, error) {
 	return &metrics, nil
 }
 
-func getUint64Value(key string, out map[string]interface{}) uint64 {
-	v, ok := out[key]
-	if !ok {
-		return 0
-	}
-	switch t := v.(type) {
-	case uint64:
-		return t
-	}
-	return 0
-}
-
-func getPidValue(key string, out map[string]interface{}) uint64 {
-	v, ok := out[key]
-	if !ok {
-		return 0
-	}
-	switch t := v.(type) {
-	case uint64:
-		return t
-	case string:
-		if t == "max" {
-			return math.MaxUint64
-		}
-	}
-	return 0
-}
-
-func readSingleFile(path string, file string, out map[string]interface{}) error {
-	f, err := os.Open(filepath.Join(path, file))
-	if err != nil {
-		return err
-	}
-	defer f.Close()
-	data, err := io.ReadAll(f)
-	if err != nil {
-		return err
-	}
-	s := strings.TrimSpace(string(data))
-	v, err := parseUint(s, 10, 64)
-	if err != nil {
-		// if we cannot parse as a uint, parse as a string
-		out[file] = s
-		return nil
-	}
-	out[file] = v
-	return nil
-}
-
-func readKVStatsFile(path string, file string, out map[string]interface{}) error {
+func readKVStatsFile(path string, file string, out map[string]uint64) error {
 	f, err := os.Open(filepath.Join(path, file))
 	if err != nil {
 		return err
@@ -692,16 +661,12 @@ func (c *Manager) freeze(path string, state State) error {
 
 func (c *Manager) isCgroupEmpty() bool {
 	// In case of any error we return true so that we exit and don't leak resources
-	out := make(map[string]interface{})
+	out := make(map[string]uint64)
 	if err := readKVStatsFile(c.path, "cgroup.events", out); err != nil {
 		return true
 	}
 	if v, ok := out["populated"]; ok {
-		populated, ok := v.(uint64)
-		if !ok {
-			return true
-		}
-		return populated == 0
+		return v == 0
 	}
 	return true
 }
@@ -709,19 +674,19 @@ func (c *Manager) isCgroupEmpty() bool {
 // MemoryEventFD returns inotify file descriptor and 'memory.events' inotify watch descriptor
 func (c *Manager) MemoryEventFD() (int, uint32, error) {
 	fpath := filepath.Join(c.path, "memory.events")
-	fd, err := syscall.InotifyInit()
+	fd, err := unix.InotifyInit()
 	if err != nil {
 		return 0, 0, errors.New("failed to create inotify fd")
 	}
-	wd, err := syscall.InotifyAddWatch(fd, fpath, unix.IN_MODIFY)
+	wd, err := unix.InotifyAddWatch(fd, fpath, unix.IN_MODIFY)
 	if err != nil {
-		syscall.Close(fd)
+		unix.Close(fd)
 		return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", fpath, err)
 	}
 	// monitor to detect process exit/cgroup deletion
 	evpath := filepath.Join(c.path, "cgroup.events")
-	if _, err = syscall.InotifyAddWatch(fd, evpath, unix.IN_MODIFY); err != nil {
-		syscall.Close(fd)
+	if _, err = unix.InotifyAddWatch(fd, evpath, unix.IN_MODIFY); err != nil {
+		unix.Close(fd)
 		return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", evpath, err)
 	}
 
@@ -736,41 +701,6 @@ func (c *Manager) EventChan() (<-chan Event, <-chan error) {
 	return ec, errCh
 }
 
-func parseMemoryEvents(out map[string]interface{}) (Event, error) {
-	e := Event{}
-	if v, ok := out["high"]; ok {
-		e.High, ok = v.(uint64)
-		if !ok {
-			return Event{}, fmt.Errorf("cannot convert high to uint64: %+v", v)
-		}
-	}
-	if v, ok := out["low"]; ok {
-		e.Low, ok = v.(uint64)
-		if !ok {
-			return Event{}, fmt.Errorf("cannot convert low to uint64: %+v", v)
-		}
-	}
-	if v, ok := out["max"]; ok {
-		e.Max, ok = v.(uint64)
-		if !ok {
-			return Event{}, fmt.Errorf("cannot convert max to uint64: %+v", v)
-		}
-	}
-	if v, ok := out["oom"]; ok {
-		e.OOM, ok = v.(uint64)
-		if !ok {
-			return Event{}, fmt.Errorf("cannot convert oom to uint64: %+v", v)
-		}
-	}
-	if v, ok := out["oom_kill"]; ok {
-		e.OOMKill, ok = v.(uint64)
-		if !ok {
-			return Event{}, fmt.Errorf("cannot convert oom_kill to uint64: %+v", v)
-		}
-	}
-	return e, nil
-}
-
 func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
 	defer close(errCh)
 
@@ -779,17 +709,17 @@ func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
 		errCh <- err
 		return
 	}
-	defer syscall.Close(fd)
+	defer unix.Close(fd)
 
 	for {
-		buffer := make([]byte, syscall.SizeofInotifyEvent*10)
-		bytesRead, err := syscall.Read(fd, buffer)
+		buffer := make([]byte, unix.SizeofInotifyEvent*10)
+		bytesRead, err := unix.Read(fd, buffer)
 		if err != nil {
 			errCh <- err
 			return
 		}
-		if bytesRead >= syscall.SizeofInotifyEvent {
-			out := make(map[string]interface{})
+		if bytesRead >= unix.SizeofInotifyEvent {
+			out := make(map[string]uint64)
 			if err := readKVStatsFile(c.path, "memory.events", out); err != nil {
 				// When cgroup is deleted read may return -ENODEV instead of -ENOENT from open.
 				if _, statErr := os.Lstat(filepath.Join(c.path, "memory.events")); !os.IsNotExist(statErr) {
@@ -797,12 +727,13 @@ func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
 				}
 				return
 			}
-			e, err := parseMemoryEvents(out)
-			if err != nil {
-				errCh <- err
-				return
+			ec <- Event{
+				Low:     out["low"],
+				High:    out["high"],
+				Max:     out["max"],
+				OOM:     out["oom"],
+				OOMKill: out["oom_kill"],
 			}
-			ec <- e
 			if c.isCgroupEmpty() {
 				return
 			}
@@ -818,7 +749,7 @@ func setDevices(path string, devices []specs.LinuxDeviceCgroup) error {
 	if err != nil {
 		return err
 	}
-	dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY|unix.O_CLOEXEC, 0600)
+	dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY|unix.O_CLOEXEC, 0o600)
 	if err != nil {
 		return fmt.Errorf("cannot get dir FD for %s", path)
 	}

+ 103 - 61
vendor/github.com/containerd/cgroups/v3/cgroup2/utils.go

@@ -18,6 +18,7 @@ package cgroup2
 
 import (
 	"bufio"
+	"errors"
 	"fmt"
 	"io"
 	"math"
@@ -25,6 +26,7 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"sync"
 	"time"
 	"unsafe"
 
@@ -39,7 +41,7 @@ import (
 const (
 	cgroupProcs    = "cgroup.procs"
 	cgroupThreads  = "cgroup.threads"
-	defaultDirPerm = 0755
+	defaultDirPerm = 0o755
 )
 
 // defaultFilePerm is a var so that the test framework can change the filemode
@@ -92,19 +94,13 @@ func parseCgroupProcsFile(path string) ([]uint64, error) {
 	return out, nil
 }
 
-func parseKV(raw string) (string, interface{}, error) {
+func parseKV(raw string) (string, uint64, error) {
 	parts := strings.Fields(raw)
-	switch len(parts) {
-	case 2:
-		v, err := parseUint(parts[1], 10, 64)
-		if err != nil {
-			// if we cannot parse as a uint, parse as a string
-			return parts[0], parts[1], nil
-		}
-		return parts[0], v, nil
-	default:
+	if len(parts) != 2 {
 		return "", 0, ErrInvalidFormat
 	}
+	v, err := parseUint(parts[1], 10, 64)
+	return parts[0], v, err
 }
 
 func parseUint(s string, base, bitSize int) (uint64, error) {
@@ -136,9 +132,7 @@ func parseCgroupFile(path string) (string, error) {
 }
 
 func parseCgroupFromReader(r io.Reader) (string, error) {
-	var (
-		s = bufio.NewScanner(r)
-	)
+	s := bufio.NewScanner(r)
 	for s.Scan() {
 		var (
 			text  = s.Text()
@@ -244,18 +238,28 @@ func ToResources(spec *specs.LinuxResources) *Resources {
 
 // Gets uint64 parsed content of single value cgroup stat file
 func getStatFileContentUint64(filePath string) uint64 {
-	contents, err := os.ReadFile(filePath)
+	f, err := os.Open(filePath)
 	if err != nil {
 		return 0
 	}
-	trimmed := strings.TrimSpace(string(contents))
+	defer f.Close()
+
+	// We expect an unsigned 64 bit integer, or a "max" string
+	// in some cases.
+	buf := make([]byte, 32)
+	n, err := f.Read(buf)
+	if err != nil {
+		return 0
+	}
+
+	trimmed := strings.TrimSpace(string(buf[:n]))
 	if trimmed == "max" {
 		return math.MaxUint64
 	}
 
 	res, err := parseUint(trimmed, 10, 64)
 	if err != nil {
-		logrus.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), filePath)
+		logrus.Errorf("unable to parse %q as a uint from Cgroup file %q", trimmed, filePath)
 		return res
 	}
 
@@ -385,56 +389,94 @@ func systemdUnitFromPath(path string) string {
 }
 
 func readHugeTlbStats(path string) []*stats.HugeTlbStat {
-	var usage = []*stats.HugeTlbStat{}
-	var keyUsage = make(map[string]*stats.HugeTlbStat)
-	f, err := os.Open(path)
-	if err != nil {
-		return usage
-	}
-	files, err := f.Readdir(-1)
-	f.Close()
-	if err != nil {
-		return usage
+	hpSizes := hugePageSizes()
+	usage := make([]*stats.HugeTlbStat, len(hpSizes))
+	for idx, pagesize := range hpSizes {
+		usage[idx] = &stats.HugeTlbStat{
+			Max:      getStatFileContentUint64(filepath.Join(path, "hugetlb."+pagesize+".max")),
+			Current:  getStatFileContentUint64(filepath.Join(path, "hugetlb."+pagesize+".current")),
+			Pagesize: pagesize,
+		}
 	}
+	return usage
+}
 
-	for _, file := range files {
-		if strings.Contains(file.Name(), "hugetlb") &&
-			(strings.HasSuffix(file.Name(), "max") || strings.HasSuffix(file.Name(), "current")) {
-			var hugeTlb *stats.HugeTlbStat
-			var ok bool
-			fileName := strings.Split(file.Name(), ".")
-			pageSize := fileName[1]
-			if hugeTlb, ok = keyUsage[pageSize]; !ok {
-				hugeTlb = &stats.HugeTlbStat{}
-			}
-			hugeTlb.Pagesize = pageSize
-			out, err := os.ReadFile(filepath.Join(path, file.Name()))
-			if err != nil {
-				continue
-			}
-			var value uint64
-			stringVal := strings.TrimSpace(string(out))
-			if stringVal == "max" {
-				value = math.MaxUint64
-			} else {
-				value, err = strconv.ParseUint(stringVal, 10, 64)
-			}
-			if err != nil {
-				continue
+var (
+	hPageSizes  []string
+	initHPSOnce sync.Once
+)
+
+// The following idea and implementation is taken pretty much line for line from
+// runc. Because the hugetlb files are well known, and the only variable thrown in
+// the mix is what huge page sizes you have on your host, this lends itself well
+// to doing the work to find the files present once, and then re-using this. This
+// saves an os.Readdirnames(0) call to search for hugetlb files on every `manager.Stat`
+// call.
+// https://github.com/opencontainers/runc/blob/3a2c0c2565644d8a7e0f1dd594a060b21fa96cf1/libcontainer/cgroups/utils.go#L301
+func hugePageSizes() []string {
+	initHPSOnce.Do(func() {
+		dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
+		if err != nil {
+			return
+		}
+		files, err := dir.Readdirnames(0)
+		dir.Close()
+		if err != nil {
+			return
+		}
+
+		hPageSizes, err = getHugePageSizeFromFilenames(files)
+		if err != nil {
+			logrus.Warnf("hugePageSizes: %s", err)
+		}
+	})
+
+	return hPageSizes
+}
+
+func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
+	pageSizes := make([]string, 0, len(fileNames))
+	var warn error
+
+	for _, file := range fileNames {
+		// example: hugepages-1048576kB
+		val := strings.TrimPrefix(file, "hugepages-")
+		if len(val) == len(file) {
+			// Unexpected file name: no prefix found, ignore it.
+			continue
+		}
+		// In all known versions of Linux up to 6.3 the suffix is always
+		// "kB". If we find something else, produce an error but keep going.
+		eLen := len(val) - 2
+		val = strings.TrimSuffix(val, "kB")
+		if len(val) != eLen {
+			// Highly unlikely.
+			if warn == nil {
+				warn = errors.New(file + `: invalid suffix (expected "kB")`)
 			}
-			switch fileName[2] {
-			case "max":
-				hugeTlb.Max = value
-			case "current":
-				hugeTlb.Current = value
+			continue
+		}
+		size, err := strconv.Atoi(val)
+		if err != nil {
+			// Highly unlikely.
+			if warn == nil {
+				warn = fmt.Errorf("%s: %w", file, err)
 			}
-			keyUsage[pageSize] = hugeTlb
+			continue
 		}
+		// Modeled after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
+		// but in our case the size is in KB already.
+		if size >= (1 << 20) {
+			val = strconv.Itoa(size>>20) + "GB"
+		} else if size >= (1 << 10) {
+			val = strconv.Itoa(size>>10) + "MB"
+		} else {
+			val += "KB"
+		}
+		pageSizes = append(pageSizes, val)
 	}
-	for _, entry := range keyUsage {
-		usage = append(usage, entry)
-	}
-	return usage
+
+	return pageSizes, warn
 }
 
 func getSubreaper() (int, error) {

+ 2 - 2
vendor/modules.txt

@@ -214,8 +214,8 @@ github.com/container-storage-interface/spec/lib/go/csi
 # github.com/containerd/cgroups v1.0.4
 ## explicit; go 1.17
 github.com/containerd/cgroups/stats/v1
-# github.com/containerd/cgroups/v3 v3.0.1
-## explicit; go 1.17
+# github.com/containerd/cgroups/v3 v3.0.2
+## explicit; go 1.18
 github.com/containerd/cgroups/v3
 github.com/containerd/cgroups/v3/cgroup1
 github.com/containerd/cgroups/v3/cgroup1/stats