utils.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. // +build linux
  2. package cgroups
  3. import (
  4. "bufio"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "os"
  10. "path/filepath"
  11. "strconv"
  12. "strings"
  13. "sync"
  14. "time"
  15. "github.com/opencontainers/runc/libcontainer/userns"
  16. "github.com/sirupsen/logrus"
  17. "golang.org/x/sys/unix"
  18. )
  // CgroupProcesses is the name of the file inside a cgroup directory that
  // lists (and accepts) process IDs belonging to that cgroup.
  const CgroupProcesses = "cgroup.procs"

  // unifiedMountpoint is the filesystem path that is statfs'ed to detect the
  // cgroup v2 unified hierarchy and used as its mountpoint.
  const unifiedMountpoint = "/sys/fs/cgroup"
  // isUnifiedOnce guards the one-time detection performed by
  // IsCgroup2UnifiedMode.
  var isUnifiedOnce sync.Once

  // isUnified caches the detection result; it is written only inside
  // isUnifiedOnce.Do.
  var isUnified bool
  27. // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
  28. func IsCgroup2UnifiedMode() bool {
  29. isUnifiedOnce.Do(func() {
  30. var st unix.Statfs_t
  31. err := unix.Statfs(unifiedMountpoint, &st)
  32. if err != nil {
  33. if os.IsNotExist(err) && userns.RunningInUserNS() {
  34. // ignore the "not found" error if running in userns
  35. logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
  36. isUnified = false
  37. return
  38. }
  39. panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
  40. }
  41. isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
  42. })
  43. return isUnified
  44. }
  // Mount describes a single cgroup mount as returned by GetCgroupMounts.
  type Mount struct {
  	// Mountpoint is the path at which the cgroup filesystem is mounted.
  	Mountpoint string
  	// Root is the root of the mount; presumably the mount's root within the
  	// cgroup hierarchy for v1 mounts (verify against getCgroupMountsV1). For
  	// the unified hierarchy GetCgroupMounts sets it to the mountpoint itself.
  	Root string
  	// Subsystems lists the controllers available through this mount.
  	Subsystems []string
  }
  50. // GetCgroupMounts returns the mounts for the cgroup subsystems.
  51. // all indicates whether to return just the first instance or all the mounts.
  52. // This function should not be used from cgroupv2 code, as in this case
  53. // all the controllers are available under the constant unifiedMountpoint.
  54. func GetCgroupMounts(all bool) ([]Mount, error) {
  55. if IsCgroup2UnifiedMode() {
  56. // TODO: remove cgroupv2 case once all external users are converted
  57. availableControllers, err := GetAllSubsystems()
  58. if err != nil {
  59. return nil, err
  60. }
  61. m := Mount{
  62. Mountpoint: unifiedMountpoint,
  63. Root: unifiedMountpoint,
  64. Subsystems: availableControllers,
  65. }
  66. return []Mount{m}, nil
  67. }
  68. return getCgroupMountsV1(all)
  69. }
  70. // GetAllSubsystems returns all the cgroup subsystems supported by the kernel
  71. func GetAllSubsystems() ([]string, error) {
  72. // /proc/cgroups is meaningless for v2
  73. // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
  74. if IsCgroup2UnifiedMode() {
  75. // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
  76. // - devices: implemented in kernel 4.15
  77. // - freezer: implemented in kernel 5.2
  78. // We assume these are always available, as it is hard to detect availability.
  79. pseudo := []string{"devices", "freezer"}
  80. data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers")
  81. if err != nil {
  82. return nil, err
  83. }
  84. subsystems := append(pseudo, strings.Fields(data)...)
  85. return subsystems, nil
  86. }
  87. f, err := os.Open("/proc/cgroups")
  88. if err != nil {
  89. return nil, err
  90. }
  91. defer f.Close()
  92. subsystems := []string{}
  93. s := bufio.NewScanner(f)
  94. for s.Scan() {
  95. text := s.Text()
  96. if text[0] != '#' {
  97. parts := strings.Fields(text)
  98. if len(parts) >= 4 && parts[3] != "0" {
  99. subsystems = append(subsystems, parts[0])
  100. }
  101. }
  102. }
  103. if err := s.Err(); err != nil {
  104. return nil, err
  105. }
  106. return subsystems, nil
  107. }
  // readProcsFile reads the given file line by line and returns every
  // non-empty line parsed as a decimal integer (a PID).
  //
  // It returns a nil slice and an error if the file cannot be opened or a
  // line is not a valid integer. A scanner error is returned together with
  // the PIDs collected so far.
  func readProcsFile(file string) ([]int, error) {
  	fh, err := os.Open(file)
  	if err != nil {
  		return nil, err
  	}
  	defer fh.Close()

  	pids := []int{}
  	scanner := bufio.NewScanner(fh)
  	for scanner.Scan() {
  		line := scanner.Text()
  		if line == "" {
  			continue
  		}
  		pid, convErr := strconv.Atoi(line)
  		if convErr != nil {
  			return nil, convErr
  		}
  		pids = append(pids, pid)
  	}
  	return pids, scanner.Err()
  }
  129. // ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
  130. // or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
  131. // "cpu": "/user.slice/user-1000.slice"
  132. // "pids": "/user.slice/user-1000.slice"
  133. // etc.
  134. //
  135. // Note that for cgroup v2 unified hierarchy, there are no per-controller
  136. // cgroup paths, so the resulting map will have a single element where the key
  137. // is empty string ("") and the value is the cgroup path the <pid> is in.
  138. func ParseCgroupFile(path string) (map[string]string, error) {
  139. f, err := os.Open(path)
  140. if err != nil {
  141. return nil, err
  142. }
  143. defer f.Close()
  144. return parseCgroupFromReader(f)
  145. }
  // parseCgroupFromReader does the parsing for ParseCgroupFile; it takes an
  // io.Reader to make testing easier.
  //
  // Each input line must have the form described in cgroups(7) for
  // /proc/[pid]/cgroup:
  //
  //	hierarchy-ID:subsystem-list:cgroup-path
  //
  // Every comma-separated name in subsystem-list is mapped to cgroup-path.
  // A line with fewer than two colons yields an error.
  func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
  	result := make(map[string]string)

  	scanner := bufio.NewScanner(r)
  	for scanner.Scan() {
  		line := scanner.Text()
  		// Split into at most three fields so that colons inside the
  		// cgroup path are preserved.
  		fields := strings.SplitN(line, ":", 3)
  		if len(fields) < 3 {
  			return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", line)
  		}

  		cgroupPath := fields[2]
  		for _, controller := range strings.Split(fields[1], ",") {
  			result[controller] = cgroupPath
  		}
  	}
  	if scanErr := scanner.Err(); scanErr != nil {
  		return nil, scanErr
  	}
  	return result, nil
  }
  // PathExists reports whether os.Stat succeeds on path. Any stat error,
  // not only "does not exist", is reported as false.
  func PathExists(path string) bool {
  	_, err := os.Stat(path)
  	return err == nil
  }
  177. func EnterPid(cgroupPaths map[string]string, pid int) error {
  178. for _, path := range cgroupPaths {
  179. if PathExists(path) {
  180. if err := WriteCgroupProc(path, pid); err != nil {
  181. return err
  182. }
  183. }
  184. }
  185. return nil
  186. }
  187. func rmdir(path string) error {
  188. err := unix.Rmdir(path)
  189. if err == nil || err == unix.ENOENT {
  190. return nil
  191. }
  192. return &os.PathError{Op: "rmdir", Path: path, Err: err}
  193. }
  194. // RemovePath aims to remove cgroup path. It does so recursively,
  195. // by removing any subdirectories (sub-cgroups) first.
  196. func RemovePath(path string) error {
  197. // try the fast path first
  198. if err := rmdir(path); err == nil {
  199. return nil
  200. }
  201. infos, err := ioutil.ReadDir(path)
  202. if err != nil {
  203. if os.IsNotExist(err) {
  204. err = nil
  205. }
  206. return err
  207. }
  208. for _, info := range infos {
  209. if info.IsDir() {
  210. // We should remove subcgroups dir first
  211. if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
  212. break
  213. }
  214. }
  215. }
  216. if err == nil {
  217. err = rmdir(path)
  218. }
  219. return err
  220. }
  221. // RemovePaths iterates over the provided paths removing them.
  222. // We trying to remove all paths five times with increasing delay between tries.
  223. // If after all there are not removed cgroups - appropriate error will be
  224. // returned.
  225. func RemovePaths(paths map[string]string) (err error) {
  226. const retries = 5
  227. delay := 10 * time.Millisecond
  228. for i := 0; i < retries; i++ {
  229. if i != 0 {
  230. time.Sleep(delay)
  231. delay *= 2
  232. }
  233. for s, p := range paths {
  234. if err := RemovePath(p); err != nil {
  235. // do not log intermediate iterations
  236. switch i {
  237. case 0:
  238. logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
  239. case retries - 1:
  240. logrus.WithError(err).Error("Failed to remove cgroup")
  241. }
  242. }
  243. _, err := os.Stat(p)
  244. // We need this strange way of checking cgroups existence because
  245. // RemoveAll almost always returns error, even on already removed
  246. // cgroups
  247. if os.IsNotExist(err) {
  248. delete(paths, s)
  249. }
  250. }
  251. if len(paths) == 0 {
  252. //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
  253. paths = make(map[string]string)
  254. return nil
  255. }
  256. }
  257. return fmt.Errorf("Failed to remove paths: %v", paths)
  258. }
  259. func GetHugePageSize() ([]string, error) {
  260. dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
  261. if err != nil {
  262. return nil, err
  263. }
  264. files, err := dir.Readdirnames(0)
  265. dir.Close()
  266. if err != nil {
  267. return nil, err
  268. }
  269. return getHugePageSizeFromFilenames(files)
  270. }
  271. func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
  272. pageSizes := make([]string, 0, len(fileNames))
  273. for _, file := range fileNames {
  274. // example: hugepages-1048576kB
  275. val := strings.TrimPrefix(file, "hugepages-")
  276. if len(val) == len(file) {
  277. // unexpected file name: no prefix found
  278. continue
  279. }
  280. // The suffix is always "kB" (as of Linux 5.9)
  281. eLen := len(val) - 2
  282. val = strings.TrimSuffix(val, "kB")
  283. if len(val) != eLen {
  284. logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file)
  285. continue
  286. }
  287. size, err := strconv.Atoi(val)
  288. if err != nil {
  289. return nil, err
  290. }
  291. // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
  292. // but in our case the size is in KB already.
  293. if size >= (1 << 20) {
  294. val = strconv.Itoa(size>>20) + "GB"
  295. } else if size >= (1 << 10) {
  296. val = strconv.Itoa(size>>10) + "MB"
  297. } else {
  298. val += "KB"
  299. }
  300. pageSizes = append(pageSizes, val)
  301. }
  302. return pageSizes, nil
  303. }
  304. // GetPids returns all pids, that were added to cgroup at path.
  305. func GetPids(dir string) ([]int, error) {
  306. return readProcsFile(filepath.Join(dir, CgroupProcesses))
  307. }
  308. // GetAllPids returns all pids, that were added to cgroup at path and to all its
  309. // subcgroups.
  310. func GetAllPids(path string) ([]int, error) {
  311. var pids []int
  312. // collect pids from all sub-cgroups
  313. err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
  314. if iErr != nil {
  315. return iErr
  316. }
  317. if info.IsDir() || info.Name() != CgroupProcesses {
  318. return nil
  319. }
  320. cPids, err := readProcsFile(p)
  321. if err != nil {
  322. return err
  323. }
  324. pids = append(pids, cPids...)
  325. return nil
  326. })
  327. return pids, err
  328. }
  329. // WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
  330. func WriteCgroupProc(dir string, pid int) error {
  331. // Normally dir should not be empty, one case is that cgroup subsystem
  332. // is not mounted, we will get empty dir, and we want it fail here.
  333. if dir == "" {
  334. return fmt.Errorf("no such directory for %s", CgroupProcesses)
  335. }
  336. // Dont attach any pid to the cgroup if -1 is specified as a pid
  337. if pid == -1 {
  338. return nil
  339. }
  340. file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY)
  341. if err != nil {
  342. return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
  343. }
  344. defer file.Close()
  345. for i := 0; i < 5; i++ {
  346. _, err = file.WriteString(strconv.Itoa(pid))
  347. if err == nil {
  348. return nil
  349. }
  350. // EINVAL might mean that the task being added to cgroup.procs is in state
  351. // TASK_NEW. We should attempt to do so again.
  352. if errors.Is(err, unix.EINVAL) {
  353. time.Sleep(30 * time.Millisecond)
  354. continue
  355. }
  356. return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
  357. }
  358. return err
  359. }
  // ConvertCPUSharesToCgroupV2Value converts a cgroup v1 cpu.shares value to
  // a cgroup v2 cpu weight. Since the OCI spec is designed for cgroup v1, in
  // some cases there is need to convert from the cgroup v1 configuration to
  // cgroup v2.
  //
  // The formula is y = (1 + ((x - 2) * 9999) / 262142), mapping [2-262144] to
  // [1-10000]. 262144 comes from the Linux kernel definition
  // "#define MAX_SHARES (1UL << 18)".
  //
  // A value of 0 means "unset" and is returned as 0. Non-zero values outside
  // [2-262144] are clamped to that range before conversion; without this, an
  // input of 1 underflows the unsigned subtraction and very large inputs
  // overflow the multiplication.
  func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
  	const (
  		minShares = 2
  		maxShares = 262144
  	)

  	switch {
  	case cpuShares == 0:
  		return 0
  	case cpuShares < minShares:
  		cpuShares = minShares
  	case cpuShares > maxShares:
  		cpuShares = maxShares
  	}
  	return 1 + ((cpuShares-minShares)*9999)/(maxShares-minShares)
  }
  // ConvertMemorySwapToCgroupV2Value converts the MemorySwap value from the
  // OCI spec for use by cgroup v2 drivers. A conversion is needed since
  // Resources.MemorySwap is defined as memory+swap combined, while in cgroup
  // v2 swap is a separate value.
  //
  // Special values: -1 means "max" and 0 means "unset". An error is returned
  // when a swap limit is requested without a (valid) memory limit, or when
  // memorySwap is smaller than memory.
  func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
  	switch {
  	case memory == -1 && memorySwap == 0:
  		// For compatibility with the cgroup1 controller: unlimited memory
  		// with swap not explicitly set is treated as a request to set both
  		// memory and swap to unlimited.
  		return -1, nil
  	case memorySwap == -1, memorySwap == 0:
  		// -1 is "max", 0 is "unset", so treat as is
  		return memorySwap, nil
  	// sanity checks
  	case memory == 0, memory == -1:
  		return 0, errors.New("unable to set swap limit without memory limit")
  	case memory < 0:
  		return 0, fmt.Errorf("invalid memory value: %d", memory)
  	case memorySwap < memory:
  		return 0, errors.New("memory+swap limit should be >= memory limit")
  	}
  	return memorySwap - memory, nil
  }
  // ConvertBlkIOToIOWeightValue converts a cgroup v1 blkio weight to a cgroup
  // v2 io weight. Since the OCI spec is designed for cgroup v1, in some cases
  // there is need to convert from the cgroup v1 configuration to cgroup v2.
  //
  // The formula is y = (1 + (x - 10) * 9999 / 990), mapping [10-1000]
  // linearly to [1-10000].
  //
  // A value of 0 means "unset" and is returned as 0. Non-zero values outside
  // [10-1000] are clamped to that range before conversion; without this,
  // inputs 1..9 underflow the unsigned subtraction and inputs above 1000
  // produce a result above 10000.
  func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
  	const (
  		minWeight = 10
  		maxWeight = 1000
  	)

  	if blkIoWeight == 0 {
  		return 0
  	}

  	w := uint64(blkIoWeight)
  	if w < minWeight {
  		w = minWeight
  	} else if w > maxWeight {
  		w = maxWeight
  	}
  	return 1 + (w-minWeight)*9999/(maxWeight-minWeight)
  }