utils.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. // +build linux
  2. package cgroups
  3. import (
  4. "bufio"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "os"
  10. "path/filepath"
  11. "strconv"
  12. "strings"
  13. "sync"
  14. "time"
  15. units "github.com/docker/go-units"
  16. "golang.org/x/sys/unix"
  17. )
const (
	// CgroupProcesses is the name of the file inside a cgroup directory that
	// holds the process IDs belonging to that cgroup. It is read by
	// readProcsFile/GetPids and written by WriteCgroupProc.
	CgroupProcesses = "cgroup.procs"
	// unifiedMountpoint is the path that is statfs'ed to detect cgroup v2
	// unified mode and that is reported as the single mount in that mode.
	unifiedMountpoint = "/sys/fs/cgroup"
)
var (
	// isUnifiedOnce ensures the cgroup v2 detection in IsCgroup2UnifiedMode
	// runs only once.
	isUnifiedOnce sync.Once
	// isUnified caches the result of that detection; it is assigned inside
	// isUnifiedOnce.Do and read by IsCgroup2UnifiedMode.
	isUnified bool
)
// HugePageSizeUnitList is a list of the units used by the linux kernel when
// naming the HugePage control files. It is passed to units.CustomSize as the
// unit table when formatting huge page sizes.
// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
// TODO: Since the kernel only uses KB, MB and GB, TB and PB should be removed;
// this depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
  32. // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
  33. func IsCgroup2UnifiedMode() bool {
  34. isUnifiedOnce.Do(func() {
  35. var st unix.Statfs_t
  36. if err := unix.Statfs(unifiedMountpoint, &st); err != nil {
  37. panic("cannot statfs cgroup root")
  38. }
  39. isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
  40. })
  41. return isUnified
  42. }
// Mount describes a single cgroup mount and the subsystems (controllers)
// reachable through it.
type Mount struct {
	// Mountpoint is the path at which the cgroup filesystem is mounted.
	Mountpoint string
	// Root is set to unifiedMountpoint in the cgroup v2 case.
	// NOTE(review): for v1 mounts this is presumably the mount's root within
	// the cgroup filesystem — confirm against getCgroupMountsV1.
	Root string
	// Subsystems lists the controllers available on this mount.
	Subsystems []string
}
  48. // GetCgroupMounts returns the mounts for the cgroup subsystems.
  49. // all indicates whether to return just the first instance or all the mounts.
  50. // This function should not be used from cgroupv2 code, as in this case
  51. // all the controllers are available under the constant unifiedMountpoint.
  52. func GetCgroupMounts(all bool) ([]Mount, error) {
  53. if IsCgroup2UnifiedMode() {
  54. // TODO: remove cgroupv2 case once all external users are converted
  55. availableControllers, err := GetAllSubsystems()
  56. if err != nil {
  57. return nil, err
  58. }
  59. m := Mount{
  60. Mountpoint: unifiedMountpoint,
  61. Root: unifiedMountpoint,
  62. Subsystems: availableControllers,
  63. }
  64. return []Mount{m}, nil
  65. }
  66. return getCgroupMountsV1(all)
  67. }
  68. // GetAllSubsystems returns all the cgroup subsystems supported by the kernel
  69. func GetAllSubsystems() ([]string, error) {
  70. // /proc/cgroups is meaningless for v2
  71. // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
  72. if IsCgroup2UnifiedMode() {
  73. // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
  74. // - devices: implemented in kernel 4.15
  75. // - freezer: implemented in kernel 5.2
  76. // We assume these are always available, as it is hard to detect availability.
  77. pseudo := []string{"devices", "freezer"}
  78. data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
  79. if err != nil {
  80. return nil, err
  81. }
  82. subsystems := append(pseudo, strings.Fields(string(data))...)
  83. return subsystems, nil
  84. }
  85. f, err := os.Open("/proc/cgroups")
  86. if err != nil {
  87. return nil, err
  88. }
  89. defer f.Close()
  90. subsystems := []string{}
  91. s := bufio.NewScanner(f)
  92. for s.Scan() {
  93. text := s.Text()
  94. if text[0] != '#' {
  95. parts := strings.Fields(text)
  96. if len(parts) >= 4 && parts[3] != "0" {
  97. subsystems = append(subsystems, parts[0])
  98. }
  99. }
  100. }
  101. if err := s.Err(); err != nil {
  102. return nil, err
  103. }
  104. return subsystems, nil
  105. }
  106. func readProcsFile(file string) ([]int, error) {
  107. f, err := os.Open(file)
  108. if err != nil {
  109. return nil, err
  110. }
  111. defer f.Close()
  112. var (
  113. s = bufio.NewScanner(f)
  114. out = []int{}
  115. )
  116. for s.Scan() {
  117. if t := s.Text(); t != "" {
  118. pid, err := strconv.Atoi(t)
  119. if err != nil {
  120. return nil, err
  121. }
  122. out = append(out, pid)
  123. }
  124. }
  125. return out, s.Err()
  126. }
  127. // ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
  128. // or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
  129. // "cpu": "/user.slice/user-1000.slice"
  130. // "pids": "/user.slice/user-1000.slice"
  131. // etc.
  132. //
  133. // Note that for cgroup v2 unified hierarchy, there are no per-controller
  134. // cgroup paths, so the resulting map will have a single element where the key
  135. // is empty string ("") and the value is the cgroup path the <pid> is in.
  136. func ParseCgroupFile(path string) (map[string]string, error) {
  137. f, err := os.Open(path)
  138. if err != nil {
  139. return nil, err
  140. }
  141. defer f.Close()
  142. return parseCgroupFromReader(f)
  143. }
  144. // helper function for ParseCgroupFile to make testing easier
  145. func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
  146. s := bufio.NewScanner(r)
  147. cgroups := make(map[string]string)
  148. for s.Scan() {
  149. text := s.Text()
  150. // from cgroups(7):
  151. // /proc/[pid]/cgroup
  152. // ...
  153. // For each cgroup hierarchy ... there is one entry
  154. // containing three colon-separated fields of the form:
  155. // hierarchy-ID:subsystem-list:cgroup-path
  156. parts := strings.SplitN(text, ":", 3)
  157. if len(parts) < 3 {
  158. return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
  159. }
  160. for _, subs := range strings.Split(parts[1], ",") {
  161. cgroups[subs] = parts[2]
  162. }
  163. }
  164. if err := s.Err(); err != nil {
  165. return nil, err
  166. }
  167. return cgroups, nil
  168. }
  169. func PathExists(path string) bool {
  170. if _, err := os.Stat(path); err != nil {
  171. return false
  172. }
  173. return true
  174. }
  175. func EnterPid(cgroupPaths map[string]string, pid int) error {
  176. for _, path := range cgroupPaths {
  177. if PathExists(path) {
  178. if err := WriteCgroupProc(path, pid); err != nil {
  179. return err
  180. }
  181. }
  182. }
  183. return nil
  184. }
  185. // RemovePaths iterates over the provided paths removing them.
  186. // We trying to remove all paths five times with increasing delay between tries.
  187. // If after all there are not removed cgroups - appropriate error will be
  188. // returned.
  189. func RemovePaths(paths map[string]string) (err error) {
  190. delay := 10 * time.Millisecond
  191. for i := 0; i < 5; i++ {
  192. if i != 0 {
  193. time.Sleep(delay)
  194. delay *= 2
  195. }
  196. for s, p := range paths {
  197. os.RemoveAll(p)
  198. // TODO: here probably should be logging
  199. _, err := os.Stat(p)
  200. // We need this strange way of checking cgroups existence because
  201. // RemoveAll almost always returns error, even on already removed
  202. // cgroups
  203. if os.IsNotExist(err) {
  204. delete(paths, s)
  205. }
  206. }
  207. if len(paths) == 0 {
  208. return nil
  209. }
  210. }
  211. return fmt.Errorf("Failed to remove paths: %v", paths)
  212. }
  213. func GetHugePageSize() ([]string, error) {
  214. files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
  215. if err != nil {
  216. return []string{}, err
  217. }
  218. var fileNames []string
  219. for _, st := range files {
  220. fileNames = append(fileNames, st.Name())
  221. }
  222. return getHugePageSizeFromFilenames(fileNames)
  223. }
  224. func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
  225. var pageSizes []string
  226. for _, fileName := range fileNames {
  227. nameArray := strings.Split(fileName, "-")
  228. pageSize, err := units.RAMInBytes(nameArray[1])
  229. if err != nil {
  230. return []string{}, err
  231. }
  232. sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
  233. pageSizes = append(pageSizes, sizeString)
  234. }
  235. return pageSizes, nil
  236. }
  237. // GetPids returns all pids, that were added to cgroup at path.
  238. func GetPids(dir string) ([]int, error) {
  239. return readProcsFile(filepath.Join(dir, CgroupProcesses))
  240. }
  241. // GetAllPids returns all pids, that were added to cgroup at path and to all its
  242. // subcgroups.
  243. func GetAllPids(path string) ([]int, error) {
  244. var pids []int
  245. // collect pids from all sub-cgroups
  246. err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
  247. if iErr != nil {
  248. return iErr
  249. }
  250. if info.IsDir() || info.Name() != CgroupProcesses {
  251. return nil
  252. }
  253. cPids, err := readProcsFile(p)
  254. if err != nil {
  255. return err
  256. }
  257. pids = append(pids, cPids...)
  258. return nil
  259. })
  260. return pids, err
  261. }
  262. // WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
  263. func WriteCgroupProc(dir string, pid int) error {
  264. // Normally dir should not be empty, one case is that cgroup subsystem
  265. // is not mounted, we will get empty dir, and we want it fail here.
  266. if dir == "" {
  267. return fmt.Errorf("no such directory for %s", CgroupProcesses)
  268. }
  269. // Dont attach any pid to the cgroup if -1 is specified as a pid
  270. if pid == -1 {
  271. return nil
  272. }
  273. cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
  274. if err != nil {
  275. return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
  276. }
  277. defer cgroupProcessesFile.Close()
  278. for i := 0; i < 5; i++ {
  279. _, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
  280. if err == nil {
  281. return nil
  282. }
  283. // EINVAL might mean that the task being added to cgroup.procs is in state
  284. // TASK_NEW. We should attempt to do so again.
  285. if errors.Is(err, unix.EINVAL) {
  286. time.Sleep(30 * time.Millisecond)
  287. continue
  288. }
  289. return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
  290. }
  291. return err
  292. }
  293. // Since the OCI spec is designed for cgroup v1, in some cases
  294. // there is need to convert from the cgroup v1 configuration to cgroup v2
  295. // the formula for BlkIOWeight is y = (1 + (x - 10) * 9999 / 990)
  296. // convert linearly from [10-1000] to [1-10000]
  297. func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 {
  298. if blkIoWeight == 0 {
  299. return 0
  300. }
  301. return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
  302. }
  303. // Since the OCI spec is designed for cgroup v1, in some cases
  304. // there is need to convert from the cgroup v1 configuration to cgroup v2
  305. // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
  306. // convert from [2-262144] to [1-10000]
  307. // 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)"
  308. func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
  309. if cpuShares == 0 {
  310. return 0
  311. }
  312. return (1 + ((cpuShares-2)*9999)/262142)
  313. }
  314. // ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec
  315. // for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap
  316. // is defined as memory+swap combined, while in cgroup v2 swap is a separate value.
  317. func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
  318. // for compatibility with cgroup1 controller, set swap to unlimited in
  319. // case the memory is set to unlimited, and swap is not explicitly set,
  320. // treating the request as "set both memory and swap to unlimited".
  321. if memory == -1 && memorySwap == 0 {
  322. return -1, nil
  323. }
  324. if memorySwap == -1 || memorySwap == 0 {
  325. // -1 is "max", 0 is "unset", so treat as is
  326. return memorySwap, nil
  327. }
  328. // sanity checks
  329. if memory == 0 || memory == -1 {
  330. return 0, errors.New("unable to set swap limit without memory limit")
  331. }
  332. if memory < 0 {
  333. return 0, fmt.Errorf("invalid memory value: %d", memory)
  334. }
  335. if memorySwap < memory {
  336. return 0, errors.New("memory+swap limit should be >= memory limit")
  337. }
  338. return memorySwap - memory, nil
  339. }