utils.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. package cgroups
  2. import (
  3. "bufio"
  4. "errors"
  5. "fmt"
  6. "io"
  7. "os"
  8. "path/filepath"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "time"
  13. "github.com/opencontainers/runc/libcontainer/userns"
  14. "github.com/sirupsen/logrus"
  15. "golang.org/x/sys/unix"
  16. )
  // CgroupProcesses is the name of the file, inside a cgroup directory, that
  // this package reads PIDs from and writes PIDs to.
  const CgroupProcesses = "cgroup.procs"

  // Mount points probed with statfs to tell the cgroup v2 unified and hybrid
  // setups apart.
  const (
  	unifiedMountpoint = "/sys/fs/cgroup"
  	hybridMountpoint  = "/sys/fs/cgroup/unified"
  )
  // Cached result of the unified-mode probe; isUnifiedOnce makes sure the
  // probe runs a single time.
  var (
  	isUnified     bool
  	isUnifiedOnce sync.Once
  )

  // Cached result of the hybrid-mode probe; isHybridOnce makes sure the probe
  // runs a single time.
  var (
  	isHybrid     bool
  	isHybridOnce sync.Once
  )
  28. // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
  29. func IsCgroup2UnifiedMode() bool {
  30. isUnifiedOnce.Do(func() {
  31. var st unix.Statfs_t
  32. err := unix.Statfs(unifiedMountpoint, &st)
  33. if err != nil {
  34. if os.IsNotExist(err) && userns.RunningInUserNS() {
  35. // ignore the "not found" error if running in userns
  36. logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
  37. isUnified = false
  38. return
  39. }
  40. panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
  41. }
  42. isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
  43. })
  44. return isUnified
  45. }
  46. // IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode.
  47. func IsCgroup2HybridMode() bool {
  48. isHybridOnce.Do(func() {
  49. var st unix.Statfs_t
  50. err := unix.Statfs(hybridMountpoint, &st)
  51. if err != nil {
  52. isHybrid = false
  53. if !os.IsNotExist(err) {
  54. // Report unexpected errors.
  55. logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint)
  56. }
  57. return
  58. }
  59. isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC
  60. })
  61. return isHybrid
  62. }
  // Mount describes one cgroup mount, as returned by GetCgroupMounts.
  type Mount struct {
  	// Mountpoint is the path of the mount; for cgroup v2 GetCgroupMounts
  	// sets it to the unified mount point.
  	Mountpoint string
  	// Root is set to the same path as Mountpoint in the cgroup v2 case.
  	// NOTE(review): for cgroup v1 this is presumably the root of the mount
  	// within its filesystem; confirm against getCgroupMountsV1.
  	Root string
  	// Subsystems lists the controllers available through this mount.
  	Subsystems []string
  }
  68. // GetCgroupMounts returns the mounts for the cgroup subsystems.
  69. // all indicates whether to return just the first instance or all the mounts.
  70. // This function should not be used from cgroupv2 code, as in this case
  71. // all the controllers are available under the constant unifiedMountpoint.
  72. func GetCgroupMounts(all bool) ([]Mount, error) {
  73. if IsCgroup2UnifiedMode() {
  74. // TODO: remove cgroupv2 case once all external users are converted
  75. availableControllers, err := GetAllSubsystems()
  76. if err != nil {
  77. return nil, err
  78. }
  79. m := Mount{
  80. Mountpoint: unifiedMountpoint,
  81. Root: unifiedMountpoint,
  82. Subsystems: availableControllers,
  83. }
  84. return []Mount{m}, nil
  85. }
  86. return getCgroupMountsV1(all)
  87. }
  88. // GetAllSubsystems returns all the cgroup subsystems supported by the kernel
  89. func GetAllSubsystems() ([]string, error) {
  90. // /proc/cgroups is meaningless for v2
  91. // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
  92. if IsCgroup2UnifiedMode() {
  93. // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
  94. // - devices: implemented in kernel 4.15
  95. // - freezer: implemented in kernel 5.2
  96. // We assume these are always available, as it is hard to detect availability.
  97. pseudo := []string{"devices", "freezer"}
  98. data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers")
  99. if err != nil {
  100. return nil, err
  101. }
  102. subsystems := append(pseudo, strings.Fields(data)...)
  103. return subsystems, nil
  104. }
  105. f, err := os.Open("/proc/cgroups")
  106. if err != nil {
  107. return nil, err
  108. }
  109. defer f.Close()
  110. subsystems := []string{}
  111. s := bufio.NewScanner(f)
  112. for s.Scan() {
  113. text := s.Text()
  114. if text[0] != '#' {
  115. parts := strings.Fields(text)
  116. if len(parts) >= 4 && parts[3] != "0" {
  117. subsystems = append(subsystems, parts[0])
  118. }
  119. }
  120. }
  121. if err := s.Err(); err != nil {
  122. return nil, err
  123. }
  124. return subsystems, nil
  125. }
  126. func readProcsFile(dir string) ([]int, error) {
  127. f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY)
  128. if err != nil {
  129. return nil, err
  130. }
  131. defer f.Close()
  132. var (
  133. s = bufio.NewScanner(f)
  134. out = []int{}
  135. )
  136. for s.Scan() {
  137. if t := s.Text(); t != "" {
  138. pid, err := strconv.Atoi(t)
  139. if err != nil {
  140. return nil, err
  141. }
  142. out = append(out, pid)
  143. }
  144. }
  145. return out, s.Err()
  146. }
  147. // ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
  148. // or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
  149. //
  150. // "cpu": "/user.slice/user-1000.slice"
  151. // "pids": "/user.slice/user-1000.slice"
  152. //
  153. // etc.
  154. //
  155. // Note that for cgroup v2 unified hierarchy, there are no per-controller
  156. // cgroup paths, so the resulting map will have a single element where the key
  157. // is empty string ("") and the value is the cgroup path the <pid> is in.
  158. func ParseCgroupFile(path string) (map[string]string, error) {
  159. f, err := os.Open(path)
  160. if err != nil {
  161. return nil, err
  162. }
  163. defer f.Close()
  164. return parseCgroupFromReader(f)
  165. }
  // parseCgroupFromReader does the actual parsing for ParseCgroupFile; it
  // takes an io.Reader so that it can be tested without a real file.
  //
  // Per cgroups(7), each line of /proc/[pid]/cgroup has three colon-separated
  // fields:
  //
  //	hierarchy-ID:subsystem-list:cgroup-path
  //
  // Every comma-separated name in subsystem-list is mapped to cgroup-path. A
  // line with fewer than two colons is rejected with an error.
  func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
  	result := make(map[string]string)

  	sc := bufio.NewScanner(r)
  	for sc.Scan() {
  		line := sc.Text()

  		// Limit the split to 3 so that colons inside the path are kept.
  		fields := strings.SplitN(line, ":", 3)
  		if len(fields) < 3 {
  			return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", line)
  		}

  		cgroupPath := fields[2]
  		for _, name := range strings.Split(fields[1], ",") {
  			result[name] = cgroupPath
  		}
  	}
  	if err := sc.Err(); err != nil {
  		return nil, err
  	}
  	return result, nil
  }
  // PathExists reports whether os.Stat succeeds for path. Any stat error, not
  // only "does not exist", makes it return false.
  func PathExists(path string) bool {
  	_, err := os.Stat(path)
  	return err == nil
  }
  197. func EnterPid(cgroupPaths map[string]string, pid int) error {
  198. for _, path := range cgroupPaths {
  199. if PathExists(path) {
  200. if err := WriteCgroupProc(path, pid); err != nil {
  201. return err
  202. }
  203. }
  204. }
  205. return nil
  206. }
  207. func rmdir(path string) error {
  208. err := unix.Rmdir(path)
  209. if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
  210. return nil
  211. }
  212. return &os.PathError{Op: "rmdir", Path: path, Err: err}
  213. }
  214. // RemovePath aims to remove cgroup path. It does so recursively,
  215. // by removing any subdirectories (sub-cgroups) first.
  216. func RemovePath(path string) error {
  217. // try the fast path first
  218. if err := rmdir(path); err == nil {
  219. return nil
  220. }
  221. infos, err := os.ReadDir(path)
  222. if err != nil {
  223. if os.IsNotExist(err) {
  224. err = nil
  225. }
  226. return err
  227. }
  228. for _, info := range infos {
  229. if info.IsDir() {
  230. // We should remove subcgroups dir first
  231. if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
  232. break
  233. }
  234. }
  235. }
  236. if err == nil {
  237. err = rmdir(path)
  238. }
  239. return err
  240. }
  241. // RemovePaths iterates over the provided paths removing them.
  242. // We trying to remove all paths five times with increasing delay between tries.
  243. // If after all there are not removed cgroups - appropriate error will be
  244. // returned.
  245. func RemovePaths(paths map[string]string) (err error) {
  246. const retries = 5
  247. delay := 10 * time.Millisecond
  248. for i := 0; i < retries; i++ {
  249. if i != 0 {
  250. time.Sleep(delay)
  251. delay *= 2
  252. }
  253. for s, p := range paths {
  254. if err := RemovePath(p); err != nil {
  255. // do not log intermediate iterations
  256. switch i {
  257. case 0:
  258. logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
  259. case retries - 1:
  260. logrus.WithError(err).Error("Failed to remove cgroup")
  261. }
  262. }
  263. _, err := os.Stat(p)
  264. // We need this strange way of checking cgroups existence because
  265. // RemoveAll almost always returns error, even on already removed
  266. // cgroups
  267. if os.IsNotExist(err) {
  268. delete(paths, s)
  269. }
  270. }
  271. if len(paths) == 0 {
  272. //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
  273. paths = make(map[string]string)
  274. return nil
  275. }
  276. }
  277. return fmt.Errorf("Failed to remove paths: %v", paths)
  278. }
  279. var (
  280. hugePageSizes []string
  281. initHPSOnce sync.Once
  282. )
  283. func HugePageSizes() []string {
  284. initHPSOnce.Do(func() {
  285. dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
  286. if err != nil {
  287. return
  288. }
  289. files, err := dir.Readdirnames(0)
  290. dir.Close()
  291. if err != nil {
  292. return
  293. }
  294. hugePageSizes, err = getHugePageSizeFromFilenames(files)
  295. if err != nil {
  296. logrus.Warn("HugePageSizes: ", err)
  297. }
  298. })
  299. return hugePageSizes
  300. }
  // getHugePageSizeFromFilenames converts directory entry names of the form
  // "hugepages-<N>kB" (e.g. "hugepages-1048576kB") into human-readable sizes
  // such as "2MB" or "1GB", keeping the input order.
  //
  // Names without the "hugepages-" prefix are ignored. Names with the prefix
  // but an unexpected suffix or a non-numeric size are skipped; the first such
  // problem is returned as the error, alongside the sizes parsed successfully.
  func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
  	const (
  		prefix = "hugepages-"
  		suffix = "kB"
  	)

  	var firstErr error
  	remember := func(err error) {
  		if firstErr == nil {
  			firstErr = err
  		}
  	}

  	sizes := make([]string, 0, len(fileNames))
  	for _, name := range fileNames {
  		if !strings.HasPrefix(name, prefix) {
  			// Unexpected file name: no prefix found, ignore it.
  			continue
  		}
  		rest := name[len(prefix):]

  		// The suffix is always "kB" (as of Linux 5.13). Anything else is
  		// recorded as an error, but processing continues.
  		if !strings.HasSuffix(rest, suffix) {
  			remember(errors.New(name + `: invalid suffix (expected "kB")`))
  			continue
  		}
  		digits := rest[:len(rest)-len(suffix)]

  		kb, err := strconv.Atoi(digits)
  		if err != nil {
  			remember(fmt.Errorf("%s: %w", name, err))
  			continue
  		}

  		// Modeled after the kernel's mm/hugetlb_cgroup.c
  		// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
  		// except that the size here is already in KB.
  		switch {
  		case kb >= 1<<20:
  			sizes = append(sizes, strconv.Itoa(kb>>20)+"GB")
  		case kb >= 1<<10:
  			sizes = append(sizes, strconv.Itoa(kb>>10)+"MB")
  		default:
  			sizes = append(sizes, digits+"KB")
  		}
  	}
  	return sizes, firstErr
  }
  343. // GetPids returns all pids, that were added to cgroup at path.
  344. func GetPids(dir string) ([]int, error) {
  345. return readProcsFile(dir)
  346. }
  347. // WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
  348. func WriteCgroupProc(dir string, pid int) error {
  349. // Normally dir should not be empty, one case is that cgroup subsystem
  350. // is not mounted, we will get empty dir, and we want it fail here.
  351. if dir == "" {
  352. return fmt.Errorf("no such directory for %s", CgroupProcesses)
  353. }
  354. // Dont attach any pid to the cgroup if -1 is specified as a pid
  355. if pid == -1 {
  356. return nil
  357. }
  358. file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY)
  359. if err != nil {
  360. return fmt.Errorf("failed to write %v: %w", pid, err)
  361. }
  362. defer file.Close()
  363. for i := 0; i < 5; i++ {
  364. _, err = file.WriteString(strconv.Itoa(pid))
  365. if err == nil {
  366. return nil
  367. }
  368. // EINVAL might mean that the task being added to cgroup.procs is in state
  369. // TASK_NEW. We should attempt to do so again.
  370. if errors.Is(err, unix.EINVAL) {
  371. time.Sleep(30 * time.Millisecond)
  372. continue
  373. }
  374. return fmt.Errorf("failed to write %v: %w", pid, err)
  375. }
  376. return err
  377. }
  // ConvertCPUSharesToCgroupV2Value converts a cgroup v1 cpu.shares value into
  // a cgroup v2 cpu.weight value.
  //
  // Since the OCI spec is designed for cgroup v1, in some cases there is need
  // to convert from the cgroup v1 configuration to cgroup v2. The mapping is
  // linear from [2-262144] to [1-10000]:
  //
  //	y = 1 + ((x - 2) * 9999) / 262142
  //
  // 262144 comes from the Linux kernel definition
  // "#define MAX_SHARES (1UL << 18)".
  //
  // A value of 0 means "unset" and is returned as 0. Non-zero inputs outside
  // [2-262144] are clamped to that range first; without the clamp an input of
  // 1 would underflow the unsigned subtraction, and inputs above 262144 would
  // produce a weight above 10000.
  func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
  	const (
  		minShares = 2
  		maxShares = 1 << 18 // 262144
  		minWeight = 1
  		maxWeight = 10000
  	)

  	switch {
  	case cpuShares == 0:
  		return 0
  	case cpuShares <= minShares:
  		return minWeight
  	case cpuShares >= maxShares:
  		return maxWeight
  	}
  	return minWeight + ((cpuShares-minShares)*(maxWeight-minWeight))/(maxShares-minShares)
  }
  // ConvertMemorySwapToCgroupV2Value converts the MemorySwap value from the
  // OCI spec for use by cgroup v2 drivers. The conversion is needed because
  // Resources.MemorySwap is memory+swap combined, while cgroup v2 treats swap
  // as a separate value.
  //
  // In both arguments -1 means "max" (unlimited) and 0 means "unset". The
  // result is the swap-only limit, or -1 / 0 passed through unchanged. An
  // error is returned when a swap limit is requested without a memory limit,
  // when memory is negative, or when memorySwap is smaller than memory.
  func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
  	switch {
  	case memory == -1 && memorySwap == 0:
  		// For compatibility with the cgroup v1 controller: unlimited memory
  		// with swap not explicitly set means "set both to unlimited".
  		return -1, nil
  	case memorySwap == -1 || memorySwap == 0:
  		// -1 is "max", 0 is "unset", so treat as is.
  		return memorySwap, nil

  	// Sanity checks.
  	case memory == 0 || memory == -1:
  		return 0, errors.New("unable to set swap limit without memory limit")
  	case memory < 0:
  		return 0, fmt.Errorf("invalid memory value: %d", memory)
  	case memorySwap < memory:
  		return 0, errors.New("memory+swap limit should be >= memory limit")
  	}
  	return memorySwap - memory, nil
  }
  // ConvertBlkIOToIOWeightValue converts a cgroup v1 blkio weight into a
  // cgroup v2 io weight.
  //
  // Since the OCI spec is designed for cgroup v1, in some cases there is need
  // to convert from the cgroup v1 configuration to cgroup v2. The mapping is
  // linear from [10-1000] to [1-10000]:
  //
  //	y = 1 + (x - 10) * 9999 / 990
  //
  // A value of 0 means "unset" and is returned as 0. Non-zero inputs outside
  // [10-1000] are clamped to that range first; without the clamp inputs 1..9
  // would underflow the unsigned subtraction, and inputs above 1000 would
  // produce a weight above 10000.
  func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
  	const (
  		minBlkIO  = 10
  		maxBlkIO  = 1000
  		minWeight = 1
  		maxWeight = 10000
  	)

  	switch {
  	case blkIoWeight == 0:
  		return 0
  	case blkIoWeight <= minBlkIO:
  		return minWeight
  	case blkIoWeight >= maxBlkIO:
  		return maxWeight
  	}
  	return minWeight + (uint64(blkIoWeight)-minBlkIO)*(maxWeight-minWeight)/(maxBlkIO-minBlkIO)
  }