utils.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561
  1. /*
  2. Copyright The containerd Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package cgroup2
  14. import (
  15. "bufio"
  16. "errors"
  17. "fmt"
  18. "io"
  19. "math"
  20. "os"
  21. "path/filepath"
  22. "strconv"
  23. "strings"
  24. "sync"
  25. "time"
  26. "unsafe"
  27. "github.com/containerd/cgroups/v3/cgroup2/stats"
  28. "github.com/godbus/dbus/v5"
  29. "github.com/opencontainers/runtime-spec/specs-go"
  30. "github.com/sirupsen/logrus"
  31. "golang.org/x/sys/unix"
  32. )
const (
	// cgroupProcs is the name of the per-cgroup file "cgroup.procs".
	cgroupProcs = "cgroup.procs"
	// cgroupThreads is the name of the per-cgroup file "cgroup.threads".
	cgroupThreads = "cgroup.threads"
	// defaultDirPerm is the permission mode rwxr-xr-x; presumably applied to
	// directories created by this package (not used in this part of the file).
	defaultDirPerm = 0o755
)
// defaultFilePerm is a var so that the test framework can change the file mode
// of all files created while the tests are running. The difference between the
// tests and real-world use is that files like "cgroup.procs" already exist when
// writing to a real cgroup filesystem, but do not exist beforehand when running
// in the tests. It is set to a non-zero value in the test code.
var defaultFilePerm = os.FileMode(0)
  44. // remove will remove a cgroup path handling EAGAIN and EBUSY errors and
  45. // retrying the remove after a exp timeout
  46. func remove(path string) error {
  47. var err error
  48. delay := 10 * time.Millisecond
  49. for i := 0; i < 5; i++ {
  50. if i != 0 {
  51. time.Sleep(delay)
  52. delay *= 2
  53. }
  54. if err = os.RemoveAll(path); err == nil {
  55. return nil
  56. }
  57. }
  58. return fmt.Errorf("cgroups: unable to remove path %q: %w", path, err)
  59. }
  60. // parseCgroupTasksFile parses /sys/fs/cgroup/$GROUPPATH/cgroup.procs or
  61. // /sys/fs/cgroup/$GROUPPATH/cgroup.threads
  62. func parseCgroupTasksFile(path string) ([]uint64, error) {
  63. f, err := os.Open(path)
  64. if err != nil {
  65. return nil, err
  66. }
  67. defer f.Close()
  68. var (
  69. out []uint64
  70. s = bufio.NewScanner(f)
  71. )
  72. for s.Scan() {
  73. if t := s.Text(); t != "" {
  74. pid, err := strconv.ParseUint(t, 10, 0)
  75. if err != nil {
  76. return nil, err
  77. }
  78. out = append(out, pid)
  79. }
  80. }
  81. if err := s.Err(); err != nil {
  82. return nil, err
  83. }
  84. return out, nil
  85. }
  86. func parseKV(raw string) (string, uint64, error) {
  87. parts := strings.Fields(raw)
  88. if len(parts) != 2 {
  89. return "", 0, ErrInvalidFormat
  90. }
  91. v, err := parseUint(parts[1], 10, 64)
  92. return parts[0], v, err
  93. }
  94. func parseUint(s string, base, bitSize int) (uint64, error) {
  95. v, err := strconv.ParseUint(s, base, bitSize)
  96. if err != nil {
  97. intValue, intErr := strconv.ParseInt(s, base, bitSize)
  98. // 1. Handle negative values greater than MinInt64 (and)
  99. // 2. Handle negative values lesser than MinInt64
  100. if intErr == nil && intValue < 0 {
  101. return 0, nil
  102. } else if intErr != nil &&
  103. intErr.(*strconv.NumError).Err == strconv.ErrRange &&
  104. intValue < 0 {
  105. return 0, nil
  106. }
  107. return 0, err
  108. }
  109. return v, nil
  110. }
  111. // parseCgroupFile parses /proc/PID/cgroup file and return string
  112. func parseCgroupFile(path string) (string, error) {
  113. f, err := os.Open(path)
  114. if err != nil {
  115. return "", err
  116. }
  117. defer f.Close()
  118. return parseCgroupFromReader(f)
  119. }
  120. func parseCgroupFromReader(r io.Reader) (string, error) {
  121. s := bufio.NewScanner(r)
  122. for s.Scan() {
  123. var (
  124. text = s.Text()
  125. parts = strings.SplitN(text, ":", 3)
  126. )
  127. if len(parts) < 3 {
  128. return "", fmt.Errorf("invalid cgroup entry: %q", text)
  129. }
  130. // text is like "0::/user.slice/user-1001.slice/session-1.scope"
  131. if parts[0] == "0" && parts[1] == "" {
  132. return parts[2], nil
  133. }
  134. }
  135. if err := s.Err(); err != nil {
  136. return "", err
  137. }
  138. return "", fmt.Errorf("cgroup path not found")
  139. }
  140. // ToResources converts the oci LinuxResources struct into a
  141. // v2 Resources type for use with this package.
  142. //
  143. // converting cgroups configuration from v1 to v2
  144. // ref: https://github.com/containers/crun/blob/master/crun.1.md#cgroup-v2
  145. func ToResources(spec *specs.LinuxResources) *Resources {
  146. var resources Resources
  147. if cpu := spec.CPU; cpu != nil {
  148. resources.CPU = &CPU{
  149. Cpus: cpu.Cpus,
  150. Mems: cpu.Mems,
  151. }
  152. if shares := cpu.Shares; shares != nil {
  153. convertedWeight := 1 + ((*shares-2)*9999)/262142
  154. resources.CPU.Weight = &convertedWeight
  155. }
  156. if period := cpu.Period; period != nil {
  157. resources.CPU.Max = NewCPUMax(cpu.Quota, period)
  158. }
  159. }
  160. if mem := spec.Memory; mem != nil {
  161. resources.Memory = &Memory{}
  162. if swap := mem.Swap; swap != nil {
  163. resources.Memory.Swap = swap
  164. if l := mem.Limit; l != nil {
  165. reduce := *swap - *l
  166. resources.Memory.Swap = &reduce
  167. }
  168. }
  169. if l := mem.Limit; l != nil {
  170. resources.Memory.Max = l
  171. }
  172. if l := mem.Reservation; l != nil {
  173. resources.Memory.Low = l
  174. }
  175. }
  176. if hugetlbs := spec.HugepageLimits; hugetlbs != nil {
  177. hugeTlbUsage := HugeTlb{}
  178. for _, hugetlb := range hugetlbs {
  179. hugeTlbUsage = append(hugeTlbUsage, HugeTlbEntry{
  180. HugePageSize: hugetlb.Pagesize,
  181. Limit: hugetlb.Limit,
  182. })
  183. }
  184. resources.HugeTlb = &hugeTlbUsage
  185. }
  186. if pids := spec.Pids; pids != nil {
  187. resources.Pids = &Pids{
  188. Max: pids.Limit,
  189. }
  190. }
  191. if i := spec.BlockIO; i != nil {
  192. resources.IO = &IO{}
  193. if i.Weight != nil {
  194. resources.IO.BFQ.Weight = 1 + (*i.Weight-10)*9999/990
  195. }
  196. for t, devices := range map[IOType][]specs.LinuxThrottleDevice{
  197. ReadBPS: i.ThrottleReadBpsDevice,
  198. WriteBPS: i.ThrottleWriteBpsDevice,
  199. ReadIOPS: i.ThrottleReadIOPSDevice,
  200. WriteIOPS: i.ThrottleWriteIOPSDevice,
  201. } {
  202. for _, d := range devices {
  203. resources.IO.Max = append(resources.IO.Max, Entry{
  204. Type: t,
  205. Major: d.Major,
  206. Minor: d.Minor,
  207. Rate: d.Rate,
  208. })
  209. }
  210. }
  211. }
  212. if i := spec.Rdma; i != nil {
  213. resources.RDMA = &RDMA{}
  214. for device, value := range spec.Rdma {
  215. if device != "" && (value.HcaHandles != nil && value.HcaObjects != nil) {
  216. resources.RDMA.Limit = append(resources.RDMA.Limit, RDMAEntry{
  217. Device: device,
  218. HcaHandles: *value.HcaHandles,
  219. HcaObjects: *value.HcaObjects,
  220. })
  221. }
  222. }
  223. }
  224. return &resources
  225. }
  226. // Gets uint64 parsed content of single value cgroup stat file
  227. func getStatFileContentUint64(filePath string) uint64 {
  228. f, err := os.Open(filePath)
  229. if err != nil {
  230. return 0
  231. }
  232. defer f.Close()
  233. // We expect an unsigned 64 bit integer, or a "max" string
  234. // in some cases.
  235. buf := make([]byte, 32)
  236. n, err := f.Read(buf)
  237. if err != nil {
  238. return 0
  239. }
  240. trimmed := strings.TrimSpace(string(buf[:n]))
  241. if trimmed == "max" {
  242. return math.MaxUint64
  243. }
  244. res, err := parseUint(trimmed, 10, 64)
  245. if err != nil {
  246. logrus.Errorf("unable to parse %q as a uint from Cgroup file %q", trimmed, filePath)
  247. return res
  248. }
  249. return res
  250. }
  251. func readIoStats(path string) []*stats.IOEntry {
  252. // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
  253. var usage []*stats.IOEntry
  254. fpath := filepath.Join(path, "io.stat")
  255. currentData, err := os.ReadFile(fpath)
  256. if err != nil {
  257. return usage
  258. }
  259. entries := strings.Split(string(currentData), "\n")
  260. for _, entry := range entries {
  261. parts := strings.Split(entry, " ")
  262. if len(parts) < 2 {
  263. continue
  264. }
  265. majmin := strings.Split(parts[0], ":")
  266. if len(majmin) != 2 {
  267. continue
  268. }
  269. major, err := strconv.ParseUint(majmin[0], 10, 0)
  270. if err != nil {
  271. return usage
  272. }
  273. minor, err := strconv.ParseUint(majmin[1], 10, 0)
  274. if err != nil {
  275. return usage
  276. }
  277. parts = parts[1:]
  278. ioEntry := stats.IOEntry{
  279. Major: major,
  280. Minor: minor,
  281. }
  282. for _, s := range parts {
  283. keyPairValue := strings.Split(s, "=")
  284. if len(keyPairValue) != 2 {
  285. continue
  286. }
  287. v, err := strconv.ParseUint(keyPairValue[1], 10, 0)
  288. if err != nil {
  289. continue
  290. }
  291. switch keyPairValue[0] {
  292. case "rbytes":
  293. ioEntry.Rbytes = v
  294. case "wbytes":
  295. ioEntry.Wbytes = v
  296. case "rios":
  297. ioEntry.Rios = v
  298. case "wios":
  299. ioEntry.Wios = v
  300. }
  301. }
  302. usage = append(usage, &ioEntry)
  303. }
  304. return usage
  305. }
  306. func rdmaStats(filepath string) []*stats.RdmaEntry {
  307. currentData, err := os.ReadFile(filepath)
  308. if err != nil {
  309. return []*stats.RdmaEntry{}
  310. }
  311. return toRdmaEntry(strings.Split(string(currentData), "\n"))
  312. }
  313. func parseRdmaKV(raw string, entry *stats.RdmaEntry) {
  314. var value uint64
  315. var err error
  316. parts := strings.Split(raw, "=")
  317. switch len(parts) {
  318. case 2:
  319. if parts[1] == "max" {
  320. value = math.MaxUint32
  321. } else {
  322. value, err = parseUint(parts[1], 10, 32)
  323. if err != nil {
  324. return
  325. }
  326. }
  327. if parts[0] == "hca_handle" {
  328. entry.HcaHandles = uint32(value)
  329. } else if parts[0] == "hca_object" {
  330. entry.HcaObjects = uint32(value)
  331. }
  332. }
  333. }
  334. func toRdmaEntry(strEntries []string) []*stats.RdmaEntry {
  335. var rdmaEntries []*stats.RdmaEntry
  336. for i := range strEntries {
  337. parts := strings.Fields(strEntries[i])
  338. switch len(parts) {
  339. case 3:
  340. entry := new(stats.RdmaEntry)
  341. entry.Device = parts[0]
  342. parseRdmaKV(parts[1], entry)
  343. parseRdmaKV(parts[2], entry)
  344. rdmaEntries = append(rdmaEntries, entry)
  345. default:
  346. continue
  347. }
  348. }
  349. return rdmaEntries
  350. }
  351. // isUnitExists returns true if the error is that a systemd unit already exists.
  352. func isUnitExists(err error) bool {
  353. if err != nil {
  354. if dbusError, ok := err.(dbus.Error); ok {
  355. return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
  356. }
  357. }
  358. return false
  359. }
  360. func systemdUnitFromPath(path string) string {
  361. _, unit := filepath.Split(path)
  362. return unit
  363. }
  364. func readHugeTlbStats(path string) []*stats.HugeTlbStat {
  365. hpSizes := hugePageSizes()
  366. usage := make([]*stats.HugeTlbStat, len(hpSizes))
  367. for idx, pagesize := range hpSizes {
  368. usage[idx] = &stats.HugeTlbStat{
  369. Max: getStatFileContentUint64(filepath.Join(path, "hugetlb."+pagesize+".max")),
  370. Current: getStatFileContentUint64(filepath.Join(path, "hugetlb."+pagesize+".current")),
  371. Pagesize: pagesize,
  372. }
  373. }
  374. return usage
  375. }
var (
	// hPageSizes caches the huge page sizes computed by hugePageSizes.
	hPageSizes []string
	// initHPSOnce ensures hPageSizes is computed only once.
	initHPSOnce sync.Once
)
  380. // The following idea and implementation is taken pretty much line for line from
  381. // runc. Because the hugetlb files are well known, and the only variable thrown in
  382. // the mix is what huge page sizes you have on your host, this lends itself well
  383. // to doing the work to find the files present once, and then re-using this. This
  384. // saves a os.Readdirnames(0) call to search for hugeltb files on every `manager.Stat`
  385. // call.
  386. // https://github.com/opencontainers/runc/blob/3a2c0c2565644d8a7e0f1dd594a060b21fa96cf1/libcontainer/cgroups/utils.go#L301
  387. func hugePageSizes() []string {
  388. initHPSOnce.Do(func() {
  389. dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
  390. if err != nil {
  391. return
  392. }
  393. files, err := dir.Readdirnames(0)
  394. dir.Close()
  395. if err != nil {
  396. return
  397. }
  398. hPageSizes, err = getHugePageSizeFromFilenames(files)
  399. if err != nil {
  400. logrus.Warnf("hugePageSizes: %s", err)
  401. }
  402. })
  403. return hPageSizes
  404. }
  405. func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
  406. pageSizes := make([]string, 0, len(fileNames))
  407. var warn error
  408. for _, file := range fileNames {
  409. // example: hugepages-1048576kB
  410. val := strings.TrimPrefix(file, "hugepages-")
  411. if len(val) == len(file) {
  412. // Unexpected file name: no prefix found, ignore it.
  413. continue
  414. }
  415. // In all known versions of Linux up to 6.3 the suffix is always
  416. // "kB". If we find something else, produce an error but keep going.
  417. eLen := len(val) - 2
  418. val = strings.TrimSuffix(val, "kB")
  419. if len(val) != eLen {
  420. // Highly unlikely.
  421. if warn == nil {
  422. warn = errors.New(file + `: invalid suffix (expected "kB")`)
  423. }
  424. continue
  425. }
  426. size, err := strconv.Atoi(val)
  427. if err != nil {
  428. // Highly unlikely.
  429. if warn == nil {
  430. warn = fmt.Errorf("%s: %w", file, err)
  431. }
  432. continue
  433. }
  434. // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
  435. // but in our case the size is in KB already.
  436. if size >= (1 << 20) {
  437. val = strconv.Itoa(size>>20) + "GB"
  438. } else if size >= (1 << 10) {
  439. val = strconv.Itoa(size>>10) + "MB"
  440. } else {
  441. val += "KB"
  442. }
  443. pageSizes = append(pageSizes, val)
  444. }
  445. return pageSizes, warn
  446. }
  447. func getStatPSIFromFile(path string) *stats.PSIStats {
  448. f, err := os.Open(path)
  449. if err != nil {
  450. return nil
  451. }
  452. defer f.Close()
  453. psistats := &stats.PSIStats{}
  454. sc := bufio.NewScanner(f)
  455. for sc.Scan() {
  456. parts := strings.Fields(sc.Text())
  457. var pv *stats.PSIData
  458. switch parts[0] {
  459. case "some":
  460. psistats.Some = &stats.PSIData{}
  461. pv = psistats.Some
  462. case "full":
  463. psistats.Full = &stats.PSIData{}
  464. pv = psistats.Full
  465. }
  466. if pv != nil {
  467. err = parsePSIData(parts[1:], pv)
  468. if err != nil {
  469. logrus.Errorf("failed to read file %s: %v", path, err)
  470. return nil
  471. }
  472. }
  473. }
  474. if err := sc.Err(); err != nil {
  475. logrus.Errorf("unable to parse PSI data: %v", err)
  476. return nil
  477. }
  478. return psistats
  479. }
  480. func parsePSIData(psi []string, data *stats.PSIData) error {
  481. for _, f := range psi {
  482. kv := strings.SplitN(f, "=", 2)
  483. if len(kv) != 2 {
  484. return fmt.Errorf("invalid PSI data: %q", f)
  485. }
  486. var pv *float64
  487. switch kv[0] {
  488. case "avg10":
  489. pv = &data.Avg10
  490. case "avg60":
  491. pv = &data.Avg60
  492. case "avg300":
  493. pv = &data.Avg300
  494. case "total":
  495. v, err := strconv.ParseUint(kv[1], 10, 64)
  496. if err != nil {
  497. return fmt.Errorf("invalid %s PSI value: %w", kv[0], err)
  498. }
  499. data.Total = v
  500. }
  501. if pv != nil {
  502. v, err := strconv.ParseFloat(kv[1], 64)
  503. if err != nil {
  504. return fmt.Errorf("invalid %s PSI value: %w", kv[0], err)
  505. }
  506. *pv = v
  507. }
  508. }
  509. return nil
  510. }
  511. func getSubreaper() (int, error) {
  512. var i uintptr
  513. if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
  514. return -1, err
  515. }
  516. return int(i), nil
  517. }