123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467 |
- package cgroups
- import (
- "bufio"
- "errors"
- "fmt"
- "io"
- "os"
- "path/filepath"
- "strconv"
- "strings"
- "sync"
- "time"
- "github.com/opencontainers/runc/libcontainer/userns"
- "github.com/sirupsen/logrus"
- "golang.org/x/sys/unix"
- )
const (
	// CgroupProcesses is the name of the file inside a cgroup directory
	// that this package reads PIDs from and writes PIDs to.
	CgroupProcesses = "cgroup.procs"

	// unifiedMountpoint is the path probed to detect the cgroup v2
	// unified hierarchy.
	unifiedMountpoint = "/sys/fs/cgroup"

	// hybridMountpoint is the path probed to detect a cgroup v2 mount in
	// hybrid mode.
	hybridMountpoint = "/sys/fs/cgroup/unified"
)
// Cached answer for the unified-mode probe, filled in at most once.
var (
	isUnifiedOnce sync.Once
	isUnified     bool
)

// Cached answer for the hybrid-mode probe, filled in at most once.
var (
	isHybridOnce sync.Once
	isHybrid     bool
)
- // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
- func IsCgroup2UnifiedMode() bool {
- isUnifiedOnce.Do(func() {
- var st unix.Statfs_t
- err := unix.Statfs(unifiedMountpoint, &st)
- if err != nil {
- if os.IsNotExist(err) && userns.RunningInUserNS() {
- // ignore the "not found" error if running in userns
- logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
- isUnified = false
- return
- }
- panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
- }
- isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
- })
- return isUnified
- }
- // IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode.
- func IsCgroup2HybridMode() bool {
- isHybridOnce.Do(func() {
- var st unix.Statfs_t
- err := unix.Statfs(hybridMountpoint, &st)
- if err != nil {
- if os.IsNotExist(err) {
- // ignore the "not found" error
- isHybrid = false
- return
- }
- panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
- }
- isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC
- })
- return isHybrid
- }
// Mount describes a single cgroup mount as returned by GetCgroupMounts:
// the directory it is mounted on, its root, and the subsystems
// (controllers) attached to it.
type Mount struct {
	Mountpoint, Root string
	Subsystems       []string
}
- // GetCgroupMounts returns the mounts for the cgroup subsystems.
- // all indicates whether to return just the first instance or all the mounts.
- // This function should not be used from cgroupv2 code, as in this case
- // all the controllers are available under the constant unifiedMountpoint.
- func GetCgroupMounts(all bool) ([]Mount, error) {
- if IsCgroup2UnifiedMode() {
- // TODO: remove cgroupv2 case once all external users are converted
- availableControllers, err := GetAllSubsystems()
- if err != nil {
- return nil, err
- }
- m := Mount{
- Mountpoint: unifiedMountpoint,
- Root: unifiedMountpoint,
- Subsystems: availableControllers,
- }
- return []Mount{m}, nil
- }
- return getCgroupMountsV1(all)
- }
- // GetAllSubsystems returns all the cgroup subsystems supported by the kernel
- func GetAllSubsystems() ([]string, error) {
- // /proc/cgroups is meaningless for v2
- // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
- if IsCgroup2UnifiedMode() {
- // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
- // - devices: implemented in kernel 4.15
- // - freezer: implemented in kernel 5.2
- // We assume these are always available, as it is hard to detect availability.
- pseudo := []string{"devices", "freezer"}
- data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers")
- if err != nil {
- return nil, err
- }
- subsystems := append(pseudo, strings.Fields(data)...)
- return subsystems, nil
- }
- f, err := os.Open("/proc/cgroups")
- if err != nil {
- return nil, err
- }
- defer f.Close()
- subsystems := []string{}
- s := bufio.NewScanner(f)
- for s.Scan() {
- text := s.Text()
- if text[0] != '#' {
- parts := strings.Fields(text)
- if len(parts) >= 4 && parts[3] != "0" {
- subsystems = append(subsystems, parts[0])
- }
- }
- }
- if err := s.Err(); err != nil {
- return nil, err
- }
- return subsystems, nil
- }
- func readProcsFile(dir string) ([]int, error) {
- f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY)
- if err != nil {
- return nil, err
- }
- defer f.Close()
- var (
- s = bufio.NewScanner(f)
- out = []int{}
- )
- for s.Scan() {
- if t := s.Text(); t != "" {
- pid, err := strconv.Atoi(t)
- if err != nil {
- return nil, err
- }
- out = append(out, pid)
- }
- }
- return out, s.Err()
- }
- // ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
- // or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
- // "cpu": "/user.slice/user-1000.slice"
- // "pids": "/user.slice/user-1000.slice"
- // etc.
- //
- // Note that for cgroup v2 unified hierarchy, there are no per-controller
- // cgroup paths, so the resulting map will have a single element where the key
- // is empty string ("") and the value is the cgroup path the <pid> is in.
- func ParseCgroupFile(path string) (map[string]string, error) {
- f, err := os.Open(path)
- if err != nil {
- return nil, err
- }
- defer f.Close()
- return parseCgroupFromReader(f)
- }
// parseCgroupFromReader does the actual parsing for ParseCgroupFile; it
// takes an io.Reader so that it can be tested without touching /proc.
//
// Each input line must look like (see cgroups(7), /proc/[pid]/cgroup):
//
//	hierarchy-ID:subsystem-list:cgroup-path
//
// where subsystem-list is comma-separated. Every listed subsystem is mapped
// to cgroup-path; a line with fewer than two colons is an error.
func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
	result := make(map[string]string)

	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := scanner.Text()

		// Split into at most three fields so that colons inside the
		// cgroup path are preserved.
		fields := strings.SplitN(line, ":", 3)
		if len(fields) < 3 {
			return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", line)
		}

		controllers, cgroupPath := fields[1], fields[2]
		for _, name := range strings.Split(controllers, ",") {
			result[name] = cgroupPath
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return result, nil
}
// PathExists reports whether os.Stat succeeds on path. Any Stat error, not
// only "does not exist", yields false.
func PathExists(path string) bool {
	_, err := os.Stat(path)
	return err == nil
}
- func EnterPid(cgroupPaths map[string]string, pid int) error {
- for _, path := range cgroupPaths {
- if PathExists(path) {
- if err := WriteCgroupProc(path, pid); err != nil {
- return err
- }
- }
- }
- return nil
- }
- func rmdir(path string) error {
- err := unix.Rmdir(path)
- if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
- return nil
- }
- return &os.PathError{Op: "rmdir", Path: path, Err: err}
- }
- // RemovePath aims to remove cgroup path. It does so recursively,
- // by removing any subdirectories (sub-cgroups) first.
- func RemovePath(path string) error {
- // try the fast path first
- if err := rmdir(path); err == nil {
- return nil
- }
- infos, err := os.ReadDir(path)
- if err != nil {
- if os.IsNotExist(err) {
- err = nil
- }
- return err
- }
- for _, info := range infos {
- if info.IsDir() {
- // We should remove subcgroups dir first
- if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
- break
- }
- }
- }
- if err == nil {
- err = rmdir(path)
- }
- return err
- }
- // RemovePaths iterates over the provided paths removing them.
- // We trying to remove all paths five times with increasing delay between tries.
- // If after all there are not removed cgroups - appropriate error will be
- // returned.
- func RemovePaths(paths map[string]string) (err error) {
- const retries = 5
- delay := 10 * time.Millisecond
- for i := 0; i < retries; i++ {
- if i != 0 {
- time.Sleep(delay)
- delay *= 2
- }
- for s, p := range paths {
- if err := RemovePath(p); err != nil {
- // do not log intermediate iterations
- switch i {
- case 0:
- logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
- case retries - 1:
- logrus.WithError(err).Error("Failed to remove cgroup")
- }
- }
- _, err := os.Stat(p)
- // We need this strange way of checking cgroups existence because
- // RemoveAll almost always returns error, even on already removed
- // cgroups
- if os.IsNotExist(err) {
- delete(paths, s)
- }
- }
- if len(paths) == 0 {
- //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
- paths = make(map[string]string)
- return nil
- }
- }
- return fmt.Errorf("Failed to remove paths: %v", paths)
- }
// hugePageSizes caches the result returned by HugePageSizes.
var hugePageSizes []string

// initHPSOnce ensures hugePageSizes is computed at most once.
var initHPSOnce sync.Once
- func HugePageSizes() []string {
- initHPSOnce.Do(func() {
- dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
- if err != nil {
- return
- }
- files, err := dir.Readdirnames(0)
- dir.Close()
- if err != nil {
- return
- }
- hugePageSizes, err = getHugePageSizeFromFilenames(files)
- if err != nil {
- logrus.Warn("HugePageSizes: ", err)
- }
- })
- return hugePageSizes
- }
// getHugePageSizeFromFilenames converts directory entry names of the form
// "hugepages-<N>kB" (example: hugepages-1048576kB) into size strings such as
// "2MB" or "1GB".
//
// Names without the "hugepages-" prefix are silently ignored. Names with an
// unexpected suffix or a non-numeric size are skipped as well, but the first
// such problem is returned as the error alongside the sizes that parsed.
func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
	const (
		prefix = "hugepages-"
		suffix = "kB"
	)

	var firstErr error
	// remember keeps only the first problem encountered.
	remember := func(e error) {
		if firstErr == nil {
			firstErr = e
		}
	}

	sizes := make([]string, 0, len(fileNames))
	for _, name := range fileNames {
		if !strings.HasPrefix(name, prefix) {
			// Unexpected file name: no prefix found, ignore it.
			continue
		}
		rest := name[len(prefix):]

		// The suffix is always "kB" (as of Linux 5.13). If we find
		// something else, produce an error but keep going.
		if !strings.HasSuffix(rest, suffix) {
			// Highly unlikely.
			remember(errors.New(name + `: invalid suffix (expected "kB")`))
			continue
		}
		digits := rest[:len(rest)-len(suffix)]

		kb, err := strconv.Atoi(digits)
		if err != nil {
			// Highly unlikely.
			remember(fmt.Errorf("%s: %w", name, err))
			continue
		}

		// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
		// but in our case the size is in KB already.
		var human string
		switch {
		case kb >= 1<<20:
			human = strconv.Itoa(kb>>20) + "GB"
		case kb >= 1<<10:
			human = strconv.Itoa(kb>>10) + "MB"
		default:
			human = digits + "KB"
		}
		sizes = append(sizes, human)
	}
	return sizes, firstErr
}
- // GetPids returns all pids, that were added to cgroup at path.
- func GetPids(dir string) ([]int, error) {
- return readProcsFile(dir)
- }
- // WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
- func WriteCgroupProc(dir string, pid int) error {
- // Normally dir should not be empty, one case is that cgroup subsystem
- // is not mounted, we will get empty dir, and we want it fail here.
- if dir == "" {
- return fmt.Errorf("no such directory for %s", CgroupProcesses)
- }
- // Dont attach any pid to the cgroup if -1 is specified as a pid
- if pid == -1 {
- return nil
- }
- file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY)
- if err != nil {
- return fmt.Errorf("failed to write %v: %w", pid, err)
- }
- defer file.Close()
- for i := 0; i < 5; i++ {
- _, err = file.WriteString(strconv.Itoa(pid))
- if err == nil {
- return nil
- }
- // EINVAL might mean that the task being added to cgroup.procs is in state
- // TASK_NEW. We should attempt to do so again.
- if errors.Is(err, unix.EINVAL) {
- time.Sleep(30 * time.Millisecond)
- continue
- }
- return fmt.Errorf("failed to write %v: %w", pid, err)
- }
- return err
- }
// ConvertCPUSharesToCgroupV2Value converts a cgroup v1 cpu.shares value into
// a cgroup v2 cpu.weight value.
//
// Since the OCI spec is designed for cgroup v1, in some cases there is need
// to convert from the cgroup v1 configuration to cgroup v2. The mapping is
// linear from [2-262144] to [1-10000]:
//
//	y = 1 + ((x - 2) * 9999) / 262142
//
// 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)".
//
// A value of 0 is returned unchanged. Any other value outside [2-262144] is
// clamped to that range before conversion. Without the clamp, an input of 1
// wraps around in the unsigned subtraction and very large inputs overflow the
// multiplication, both producing a meaningless weight.
func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
	const (
		minShares = 2
		maxShares = 262144 // 1 << 18
	)

	if cpuShares == 0 {
		return 0
	}
	if cpuShares < minShares {
		cpuShares = minShares
	} else if cpuShares > maxShares {
		cpuShares = maxShares
	}
	return 1 + ((cpuShares-minShares)*9999)/(maxShares-minShares)
}
// ConvertMemorySwapToCgroupV2Value converts the OCI MemorySwap value for use
// by cgroup v2 drivers. Resources.MemorySwap is defined as memory+swap
// combined, whereas cgroup v2 tracks swap as a separate value, so the result
// is normally memorySwap - memory.
//
// Special values: -1 means "max" and 0 means "unset"; both are passed
// through, except that an unset swap with unlimited memory yields -1.
// An error is returned when a swap limit is requested without a valid memory
// limit, or when memorySwap is smaller than memory.
func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
	// The cases are order-sensitive: earlier ones take precedence.
	switch {
	case memory == -1 && memorySwap == 0:
		// for compatibility with cgroup1 controller, set swap to unlimited in
		// case the memory is set to unlimited, and swap is not explicitly set,
		// treating the request as "set both memory and swap to unlimited".
		return -1, nil
	case memorySwap == -1, memorySwap == 0:
		// -1 is "max", 0 is "unset", so treat as is
		return memorySwap, nil
	case memory == 0, memory == -1:
		return 0, errors.New("unable to set swap limit without memory limit")
	case memory < 0:
		return 0, fmt.Errorf("invalid memory value: %d", memory)
	case memorySwap < memory:
		return 0, errors.New("memory+swap limit should be >= memory limit")
	}
	return memorySwap - memory, nil
}
// ConvertBlkIOToIOWeightValue converts a cgroup v1 BlkIOWeight into a
// cgroup v2 IOWeight.
//
// Since the OCI spec is designed for cgroup v1, in some cases there is need
// to convert from the cgroup v1 configuration to cgroup v2. The mapping is
// linear from [10-1000] to [1-10000]:
//
//	y = 1 + (x - 10) * 9999 / 990
//
// A weight of 0 is returned unchanged. Weights 1-9 are not handled
// specially: the unsigned subtraction wraps around for them.
func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
	weight := uint64(blkIoWeight)
	if weight == 0 {
		return 0
	}
	scaled := (weight - 10) * 9999 / 990
	return scaled + 1
}
|