utils.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588
  1. // +build linux
  2. package cgroups
  3. import (
  4. "bufio"
  5. "fmt"
  6. "io"
  7. "io/ioutil"
  8. "os"
  9. "path/filepath"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "syscall"
  14. "time"
  15. units "github.com/docker/go-units"
  16. "golang.org/x/sys/unix"
  17. )
  // Names and locations shared by the cgroup helpers in this file.
  const (
  	// CgroupNamePrefix is the prefix carried by named hierarchies. It is
  	// stripped from mount options when mounts are listed and prepended when
  	// a controller is looked up in a parsed cgroup file.
  	CgroupNamePrefix = "name="

  	// CgroupProcesses is the name of the per-cgroup file that is read to
  	// list member PIDs and written to attach a PID.
  	CgroupProcesses = "cgroup.procs"

  	// unifiedMountpoint is the path probed with statfs to detect cgroup v2,
  	// and the mount point reported when running in unified mode.
  	unifiedMountpoint = "/sys/fs/cgroup"
  )
  // Cached result of the cgroup v2 probe performed by IsCgroup2UnifiedMode.
  var (
  	// isUnifiedOnce ensures the statfs probe runs a single time.
  	isUnifiedOnce sync.Once

  	// isUnified is set by that probe; it stays false until the probe ran.
  	isUnified bool
  )
  // HugePageSizeUnitList holds the unit suffixes used when formatting huge
  // page sizes, matching the way the Linux kernel names the HugePage control
  // files. See https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
  //
  // TODO: the kernel only uses KB, MB and GB, so TB and PB should be removed;
  // this depends on
  // https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
  var HugePageSizeUnitList = []string{
  	"B",
  	"KB",
  	"MB",
  	"GB",
  	"TB",
  	"PB",
  }
  33. // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
  34. func IsCgroup2UnifiedMode() bool {
  35. isUnifiedOnce.Do(func() {
  36. var st syscall.Statfs_t
  37. if err := syscall.Statfs(unifiedMountpoint, &st); err != nil {
  38. panic("cannot statfs cgroup root")
  39. }
  40. isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
  41. })
  42. return isUnified
  43. }
  44. // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
  45. func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
  46. if IsCgroup2UnifiedMode() {
  47. return unifiedMountpoint, nil
  48. }
  49. mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem)
  50. return mnt, err
  51. }
  52. func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) {
  53. // We are not using mount.GetMounts() because it's super-inefficient,
  54. // parsing it directly sped up x10 times because of not using Sscanf.
  55. // It was one of two major performance drawbacks in container start.
  56. if !isSubsystemAvailable(subsystem) {
  57. return "", "", NewNotFoundError(subsystem)
  58. }
  59. f, err := os.Open("/proc/self/mountinfo")
  60. if err != nil {
  61. return "", "", err
  62. }
  63. defer f.Close()
  64. if IsCgroup2UnifiedMode() {
  65. subsystem = ""
  66. }
  67. return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
  68. }
  69. func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
  70. scanner := bufio.NewScanner(reader)
  71. for scanner.Scan() {
  72. txt := scanner.Text()
  73. fields := strings.Fields(txt)
  74. if len(fields) < 9 {
  75. continue
  76. }
  77. if strings.HasPrefix(fields[4], cgroupPath) {
  78. for _, opt := range strings.Split(fields[len(fields)-1], ",") {
  79. if (subsystem == "" && fields[9] == "cgroup2") || opt == subsystem {
  80. return fields[4], fields[3], nil
  81. }
  82. }
  83. }
  84. }
  85. if err := scanner.Err(); err != nil {
  86. return "", "", err
  87. }
  88. return "", "", NewNotFoundError(subsystem)
  89. }
  90. func isSubsystemAvailable(subsystem string) bool {
  91. if IsCgroup2UnifiedMode() {
  92. controllers, err := GetAllSubsystems()
  93. if err != nil {
  94. return false
  95. }
  96. for _, c := range controllers {
  97. if c == subsystem {
  98. return true
  99. }
  100. }
  101. return false
  102. }
  103. cgroups, err := ParseCgroupFile("/proc/self/cgroup")
  104. if err != nil {
  105. return false
  106. }
  107. _, avail := cgroups[subsystem]
  108. return avail
  109. }
  // GetClosestMountpointAncestor walks the newline-separated mountinfo text
  // and returns the mount point (5th whitespace-separated field) that is a
  // string prefix of dir and extends the best candidate found so far. Lines
  // with fewer than five fields are ignored. An empty string is returned when
  // no mount point is a prefix of dir.
  func GetClosestMountpointAncestor(dir, mountinfo string) string {
  	closest := ""
  	entries := strings.Split(mountinfo, "\n")
  	for i := range entries {
  		parts := strings.Fields(entries[i])
  		if len(parts) >= 5 {
  			candidate := parts[4]
  			// The candidate must contain dir and be at least as specific as
  			// the current best (i.e. have it as a prefix).
  			if !strings.HasPrefix(dir, candidate) {
  				continue
  			}
  			if !strings.HasPrefix(candidate, closest) {
  				continue
  			}
  			closest = candidate
  		}
  	}
  	return closest
  }
  124. func FindCgroupMountpointDir() (string, error) {
  125. f, err := os.Open("/proc/self/mountinfo")
  126. if err != nil {
  127. return "", err
  128. }
  129. defer f.Close()
  130. scanner := bufio.NewScanner(f)
  131. for scanner.Scan() {
  132. text := scanner.Text()
  133. fields := strings.Split(text, " ")
  134. // Safe as mountinfo encodes mountpoints with spaces as \040.
  135. index := strings.Index(text, " - ")
  136. postSeparatorFields := strings.Fields(text[index+3:])
  137. numPostFields := len(postSeparatorFields)
  138. // This is an error as we can't detect if the mount is for "cgroup"
  139. if numPostFields == 0 {
  140. return "", fmt.Errorf("Found no fields post '-' in %q", text)
  141. }
  142. if postSeparatorFields[0] == "cgroup" || postSeparatorFields[0] == "cgroup2" {
  143. // Check that the mount is properly formatted.
  144. if numPostFields < 3 {
  145. return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
  146. }
  147. return filepath.Dir(fields[4]), nil
  148. }
  149. }
  150. if err := scanner.Err(); err != nil {
  151. return "", err
  152. }
  153. return "", NewNotFoundError("cgroup")
  154. }
  // Mount describes one cgroup mount as reported by the helpers in this file.
  type Mount struct {
  	// Mountpoint is the path the hierarchy is mounted at (5th mountinfo
  	// field), or unifiedMountpoint in cgroup v2 unified mode.
  	Mountpoint string

  	// Root is the root of the mount within the filesystem (4th mountinfo
  	// field), or unifiedMountpoint in cgroup v2 unified mode.
  	Root string

  	// Subsystems lists the subsystems attached to this mount, with any
  	// CgroupNamePrefix removed.
  	Subsystems []string
  }
  160. func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
  161. if len(m.Subsystems) == 0 {
  162. return "", fmt.Errorf("no subsystem for mount")
  163. }
  164. return getControllerPath(m.Subsystems[0], cgroups)
  165. }
  166. func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
  167. res := make([]Mount, 0, len(ss))
  168. scanner := bufio.NewScanner(mi)
  169. numFound := 0
  170. for scanner.Scan() && numFound < len(ss) {
  171. txt := scanner.Text()
  172. sepIdx := strings.Index(txt, " - ")
  173. if sepIdx == -1 {
  174. return nil, fmt.Errorf("invalid mountinfo format")
  175. }
  176. if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
  177. continue
  178. }
  179. fields := strings.Split(txt, " ")
  180. m := Mount{
  181. Mountpoint: fields[4],
  182. Root: fields[3],
  183. }
  184. for _, opt := range strings.Split(fields[len(fields)-1], ",") {
  185. seen, known := ss[opt]
  186. if !known || (!all && seen) {
  187. continue
  188. }
  189. ss[opt] = true
  190. if strings.HasPrefix(opt, CgroupNamePrefix) {
  191. opt = opt[len(CgroupNamePrefix):]
  192. }
  193. m.Subsystems = append(m.Subsystems, opt)
  194. numFound++
  195. }
  196. if len(m.Subsystems) > 0 || all {
  197. res = append(res, m)
  198. }
  199. }
  200. if err := scanner.Err(); err != nil {
  201. return nil, err
  202. }
  203. return res, nil
  204. }
  205. // GetCgroupMounts returns the mounts for the cgroup subsystems.
  206. // all indicates whether to return just the first instance or all the mounts.
  207. func GetCgroupMounts(all bool) ([]Mount, error) {
  208. if IsCgroup2UnifiedMode() {
  209. availableControllers, err := GetAllSubsystems()
  210. if err != nil {
  211. return nil, err
  212. }
  213. m := Mount{
  214. Mountpoint: unifiedMountpoint,
  215. Root: unifiedMountpoint,
  216. Subsystems: availableControllers,
  217. }
  218. return []Mount{m}, nil
  219. }
  220. f, err := os.Open("/proc/self/mountinfo")
  221. if err != nil {
  222. return nil, err
  223. }
  224. defer f.Close()
  225. allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
  226. if err != nil {
  227. return nil, err
  228. }
  229. allMap := make(map[string]bool)
  230. for s := range allSubsystems {
  231. allMap[s] = false
  232. }
  233. return getCgroupMountsHelper(allMap, f, all)
  234. }
  235. // GetAllSubsystems returns all the cgroup subsystems supported by the kernel
  236. func GetAllSubsystems() ([]string, error) {
  237. // /proc/cgroups is meaningless for v2
  238. // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
  239. if IsCgroup2UnifiedMode() {
  240. // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
  241. // - devices: implemented in kernel 4.15
  242. // - freezer: implemented in kernel 5.2
  243. // We assume these are always available, as it is hard to detect availability.
  244. pseudo := []string{"devices", "freezer"}
  245. data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
  246. if err != nil {
  247. return nil, err
  248. }
  249. subsystems := append(pseudo, strings.Fields(string(data))...)
  250. return subsystems, nil
  251. }
  252. f, err := os.Open("/proc/cgroups")
  253. if err != nil {
  254. return nil, err
  255. }
  256. defer f.Close()
  257. subsystems := []string{}
  258. s := bufio.NewScanner(f)
  259. for s.Scan() {
  260. text := s.Text()
  261. if text[0] != '#' {
  262. parts := strings.Fields(text)
  263. if len(parts) >= 4 && parts[3] != "0" {
  264. subsystems = append(subsystems, parts[0])
  265. }
  266. }
  267. }
  268. if err := s.Err(); err != nil {
  269. return nil, err
  270. }
  271. return subsystems, nil
  272. }
  273. // GetOwnCgroup returns the relative path to the cgroup docker is running in.
  274. func GetOwnCgroup(subsystem string) (string, error) {
  275. cgroups, err := ParseCgroupFile("/proc/self/cgroup")
  276. if err != nil {
  277. return "", err
  278. }
  279. return getControllerPath(subsystem, cgroups)
  280. }
  281. func GetOwnCgroupPath(subsystem string) (string, error) {
  282. cgroup, err := GetOwnCgroup(subsystem)
  283. if err != nil {
  284. return "", err
  285. }
  286. return getCgroupPathHelper(subsystem, cgroup)
  287. }
  288. func GetInitCgroup(subsystem string) (string, error) {
  289. cgroups, err := ParseCgroupFile("/proc/1/cgroup")
  290. if err != nil {
  291. return "", err
  292. }
  293. return getControllerPath(subsystem, cgroups)
  294. }
  295. func GetInitCgroupPath(subsystem string) (string, error) {
  296. cgroup, err := GetInitCgroup(subsystem)
  297. if err != nil {
  298. return "", err
  299. }
  300. return getCgroupPathHelper(subsystem, cgroup)
  301. }
  302. func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
  303. mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
  304. if err != nil {
  305. return "", err
  306. }
  307. // This is needed for nested containers, because in /proc/self/cgroup we
  308. // see paths from host, which don't exist in container.
  309. relCgroup, err := filepath.Rel(root, cgroup)
  310. if err != nil {
  311. return "", err
  312. }
  313. return filepath.Join(mnt, relCgroup), nil
  314. }
  315. func readProcsFile(dir string) ([]int, error) {
  316. f, err := os.Open(filepath.Join(dir, CgroupProcesses))
  317. if err != nil {
  318. return nil, err
  319. }
  320. defer f.Close()
  321. var (
  322. s = bufio.NewScanner(f)
  323. out = []int{}
  324. )
  325. for s.Scan() {
  326. if t := s.Text(); t != "" {
  327. pid, err := strconv.Atoi(t)
  328. if err != nil {
  329. return nil, err
  330. }
  331. out = append(out, pid)
  332. }
  333. }
  334. return out, nil
  335. }
  336. // ParseCgroupFile parses the given cgroup file, typically from
  337. // /proc/<pid>/cgroup, into a map of subgroups to cgroup names.
  338. func ParseCgroupFile(path string) (map[string]string, error) {
  339. f, err := os.Open(path)
  340. if err != nil {
  341. return nil, err
  342. }
  343. defer f.Close()
  344. return parseCgroupFromReader(f)
  345. }
  // parseCgroupFromReader does the parsing for ParseCgroupFile; it takes a
  // reader so that it can be tested without touching the filesystem.
  //
  // Every line must have the form described in cgroups(7) for
  // /proc/[pid]/cgroup:
  //
  //	hierarchy-ID:subsystem-list:cgroup-path
  //
  // Each name in the comma-separated subsystem list becomes a key mapping to
  // the cgroup path. A line with fewer than two colons is an error.
  func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
  	result := map[string]string{}
  	scanner := bufio.NewScanner(r)

  	for scanner.Scan() {
  		line := scanner.Text()

  		// At most three pieces: everything after the second colon belongs
  		// to the cgroup path.
  		pieces := strings.SplitN(line, ":", 3)
  		if len(pieces) != 3 {
  			return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", line)
  		}

  		cgroupPath := pieces[2]
  		names := strings.Split(pieces[1], ",")
  		for i := range names {
  			result[names[i]] = cgroupPath
  		}
  	}

  	if scanErr := scanner.Err(); scanErr != nil {
  		return nil, scanErr
  	}
  	return result, nil
  }
  371. func getControllerPath(subsystem string, cgroups map[string]string) (string, error) {
  372. if IsCgroup2UnifiedMode() {
  373. return "/", nil
  374. }
  375. if p, ok := cgroups[subsystem]; ok {
  376. return p, nil
  377. }
  378. if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
  379. return p, nil
  380. }
  381. return "", NewNotFoundError(subsystem)
  382. }
  // PathExists reports whether os.Stat succeeds on path. Any stat error,
  // not only "does not exist", yields false.
  func PathExists(path string) bool {
  	_, statErr := os.Stat(path)
  	return statErr == nil
  }
  389. func EnterPid(cgroupPaths map[string]string, pid int) error {
  390. for _, path := range cgroupPaths {
  391. if PathExists(path) {
  392. if err := WriteCgroupProc(path, pid); err != nil {
  393. return err
  394. }
  395. }
  396. }
  397. return nil
  398. }
  // RemovePaths tries to remove every path in paths, deleting each entry
  // from the map once its path no longer exists. Up to five passes are made,
  // with a delay before each retry that starts at 10ms and doubles every time.
  // If entries remain after the last pass, an error listing them is returned.
  func RemovePaths(paths map[string]string) (err error) {
  	const maxAttempts = 5
  	wait := 10 * time.Millisecond

  	for attempt := 0; attempt < maxAttempts; attempt++ {
  		if attempt > 0 {
  			time.Sleep(wait)
  			wait *= 2
  		}

  		for name, dir := range paths {
  			// The result of RemoveAll is ignored on purpose: it almost always
  			// returns an error, even for cgroups that are already removed,
  			// so existence is checked with Stat instead.
  			// TODO: here probably should be logging
  			os.RemoveAll(dir)
  			if _, statErr := os.Stat(dir); os.IsNotExist(statErr) {
  				delete(paths, name)
  			}
  		}

  		if len(paths) == 0 {
  			return nil
  		}
  	}

  	return fmt.Errorf("Failed to remove paths: %v", paths)
  }
  427. func GetHugePageSize() ([]string, error) {
  428. files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
  429. if err != nil {
  430. return []string{}, err
  431. }
  432. var fileNames []string
  433. for _, st := range files {
  434. fileNames = append(fileNames, st.Name())
  435. }
  436. return getHugePageSizeFromFilenames(fileNames)
  437. }
  438. func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
  439. var pageSizes []string
  440. for _, fileName := range fileNames {
  441. nameArray := strings.Split(fileName, "-")
  442. pageSize, err := units.RAMInBytes(nameArray[1])
  443. if err != nil {
  444. return []string{}, err
  445. }
  446. sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
  447. pageSizes = append(pageSizes, sizeString)
  448. }
  449. return pageSizes, nil
  450. }
  451. // GetPids returns all pids, that were added to cgroup at path.
  452. func GetPids(path string) ([]int, error) {
  453. return readProcsFile(path)
  454. }
  455. // GetAllPids returns all pids, that were added to cgroup at path and to all its
  456. // subcgroups.
  457. func GetAllPids(path string) ([]int, error) {
  458. var pids []int
  459. // collect pids from all sub-cgroups
  460. err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
  461. dir, file := filepath.Split(p)
  462. if file != CgroupProcesses {
  463. return nil
  464. }
  465. if iErr != nil {
  466. return iErr
  467. }
  468. cPids, err := readProcsFile(dir)
  469. if err != nil {
  470. return err
  471. }
  472. pids = append(pids, cPids...)
  473. return nil
  474. })
  475. return pids, err
  476. }
  477. // WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
  478. func WriteCgroupProc(dir string, pid int) error {
  479. // Normally dir should not be empty, one case is that cgroup subsystem
  480. // is not mounted, we will get empty dir, and we want it fail here.
  481. if dir == "" {
  482. return fmt.Errorf("no such directory for %s", CgroupProcesses)
  483. }
  484. // Dont attach any pid to the cgroup if -1 is specified as a pid
  485. if pid == -1 {
  486. return nil
  487. }
  488. cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
  489. if err != nil {
  490. return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
  491. }
  492. defer cgroupProcessesFile.Close()
  493. for i := 0; i < 5; i++ {
  494. _, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
  495. if err == nil {
  496. return nil
  497. }
  498. // EINVAL might mean that the task being added to cgroup.procs is in state
  499. // TASK_NEW. We should attempt to do so again.
  500. if isEINVAL(err) {
  501. time.Sleep(30 * time.Millisecond)
  502. continue
  503. }
  504. return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
  505. }
  506. return err
  507. }
  508. func isEINVAL(err error) bool {
  509. switch err := err.(type) {
  510. case *os.PathError:
  511. return err.Err == unix.EINVAL
  512. default:
  513. return false
  514. }
  515. }