utils.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488
  1. /*
  2. Copyright The containerd Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package cgroup2
  14. import (
  15. "bufio"
  16. "errors"
  17. "fmt"
  18. "io"
  19. "math"
  20. "os"
  21. "path/filepath"
  22. "strconv"
  23. "strings"
  24. "sync"
  25. "time"
  26. "unsafe"
  27. "github.com/containerd/cgroups/v3/cgroup2/stats"
  28. "github.com/godbus/dbus/v5"
  29. "github.com/opencontainers/runtime-spec/specs-go"
  30. "github.com/sirupsen/logrus"
  31. "golang.org/x/sys/unix"
  32. )
const (
	// cgroupProcs is the name of the cgroup file that lists process IDs
	// (see parseCgroupProcsFile).
	cgroupProcs = "cgroup.procs"
	// cgroupThreads is the name of the cgroup file for threads.
	cgroupThreads = "cgroup.threads"
	// defaultDirPerm is the permission mode rwxr-xr-x; presumably applied to
	// directories this package creates (the use sites are not in this part
	// of the file — verify).
	defaultDirPerm = 0o755
)
// defaultFilePerm is a var so that the test framework can change the file mode
// of all files created while the tests are running. The difference between the
// tests and real-world use is that files like "cgroup.procs" already exist when
// writing to a real cgroup filesystem, but do not exist beforehand when running
// in the tests. It is set to a non-zero value in the test code.
var defaultFilePerm = os.FileMode(0)
  44. // remove will remove a cgroup path handling EAGAIN and EBUSY errors and
  45. // retrying the remove after a exp timeout
  46. func remove(path string) error {
  47. var err error
  48. delay := 10 * time.Millisecond
  49. for i := 0; i < 5; i++ {
  50. if i != 0 {
  51. time.Sleep(delay)
  52. delay *= 2
  53. }
  54. if err = os.RemoveAll(path); err == nil {
  55. return nil
  56. }
  57. }
  58. return fmt.Errorf("cgroups: unable to remove path %q: %w", path, err)
  59. }
  60. // parseCgroupProcsFile parses /sys/fs/cgroup/$GROUPPATH/cgroup.procs
  61. func parseCgroupProcsFile(path string) ([]uint64, error) {
  62. f, err := os.Open(path)
  63. if err != nil {
  64. return nil, err
  65. }
  66. defer f.Close()
  67. var (
  68. out []uint64
  69. s = bufio.NewScanner(f)
  70. )
  71. for s.Scan() {
  72. if t := s.Text(); t != "" {
  73. pid, err := strconv.ParseUint(t, 10, 0)
  74. if err != nil {
  75. return nil, err
  76. }
  77. out = append(out, pid)
  78. }
  79. }
  80. if err := s.Err(); err != nil {
  81. return nil, err
  82. }
  83. return out, nil
  84. }
  85. func parseKV(raw string) (string, uint64, error) {
  86. parts := strings.Fields(raw)
  87. if len(parts) != 2 {
  88. return "", 0, ErrInvalidFormat
  89. }
  90. v, err := parseUint(parts[1], 10, 64)
  91. return parts[0], v, err
  92. }
  93. func parseUint(s string, base, bitSize int) (uint64, error) {
  94. v, err := strconv.ParseUint(s, base, bitSize)
  95. if err != nil {
  96. intValue, intErr := strconv.ParseInt(s, base, bitSize)
  97. // 1. Handle negative values greater than MinInt64 (and)
  98. // 2. Handle negative values lesser than MinInt64
  99. if intErr == nil && intValue < 0 {
  100. return 0, nil
  101. } else if intErr != nil &&
  102. intErr.(*strconv.NumError).Err == strconv.ErrRange &&
  103. intValue < 0 {
  104. return 0, nil
  105. }
  106. return 0, err
  107. }
  108. return v, nil
  109. }
  110. // parseCgroupFile parses /proc/PID/cgroup file and return string
  111. func parseCgroupFile(path string) (string, error) {
  112. f, err := os.Open(path)
  113. if err != nil {
  114. return "", err
  115. }
  116. defer f.Close()
  117. return parseCgroupFromReader(f)
  118. }
  119. func parseCgroupFromReader(r io.Reader) (string, error) {
  120. s := bufio.NewScanner(r)
  121. for s.Scan() {
  122. var (
  123. text = s.Text()
  124. parts = strings.SplitN(text, ":", 3)
  125. )
  126. if len(parts) < 3 {
  127. return "", fmt.Errorf("invalid cgroup entry: %q", text)
  128. }
  129. // text is like "0::/user.slice/user-1001.slice/session-1.scope"
  130. if parts[0] == "0" && parts[1] == "" {
  131. return parts[2], nil
  132. }
  133. }
  134. if err := s.Err(); err != nil {
  135. return "", err
  136. }
  137. return "", fmt.Errorf("cgroup path not found")
  138. }
  139. // ToResources converts the oci LinuxResources struct into a
  140. // v2 Resources type for use with this package.
  141. //
  142. // converting cgroups configuration from v1 to v2
  143. // ref: https://github.com/containers/crun/blob/master/crun.1.md#cgroup-v2
  144. func ToResources(spec *specs.LinuxResources) *Resources {
  145. var resources Resources
  146. if cpu := spec.CPU; cpu != nil {
  147. resources.CPU = &CPU{
  148. Cpus: cpu.Cpus,
  149. Mems: cpu.Mems,
  150. }
  151. if shares := cpu.Shares; shares != nil {
  152. convertedWeight := 1 + ((*shares-2)*9999)/262142
  153. resources.CPU.Weight = &convertedWeight
  154. }
  155. if period := cpu.Period; period != nil {
  156. resources.CPU.Max = NewCPUMax(cpu.Quota, period)
  157. }
  158. }
  159. if mem := spec.Memory; mem != nil {
  160. resources.Memory = &Memory{}
  161. if swap := mem.Swap; swap != nil {
  162. resources.Memory.Swap = swap
  163. }
  164. if l := mem.Limit; l != nil {
  165. resources.Memory.Max = l
  166. }
  167. if l := mem.Reservation; l != nil {
  168. resources.Memory.Low = l
  169. }
  170. }
  171. if hugetlbs := spec.HugepageLimits; hugetlbs != nil {
  172. hugeTlbUsage := HugeTlb{}
  173. for _, hugetlb := range hugetlbs {
  174. hugeTlbUsage = append(hugeTlbUsage, HugeTlbEntry{
  175. HugePageSize: hugetlb.Pagesize,
  176. Limit: hugetlb.Limit,
  177. })
  178. }
  179. resources.HugeTlb = &hugeTlbUsage
  180. }
  181. if pids := spec.Pids; pids != nil {
  182. resources.Pids = &Pids{
  183. Max: pids.Limit,
  184. }
  185. }
  186. if i := spec.BlockIO; i != nil {
  187. resources.IO = &IO{}
  188. if i.Weight != nil {
  189. resources.IO.BFQ.Weight = 1 + (*i.Weight-10)*9999/990
  190. }
  191. for t, devices := range map[IOType][]specs.LinuxThrottleDevice{
  192. ReadBPS: i.ThrottleReadBpsDevice,
  193. WriteBPS: i.ThrottleWriteBpsDevice,
  194. ReadIOPS: i.ThrottleReadIOPSDevice,
  195. WriteIOPS: i.ThrottleWriteIOPSDevice,
  196. } {
  197. for _, d := range devices {
  198. resources.IO.Max = append(resources.IO.Max, Entry{
  199. Type: t,
  200. Major: d.Major,
  201. Minor: d.Minor,
  202. Rate: d.Rate,
  203. })
  204. }
  205. }
  206. }
  207. if i := spec.Rdma; i != nil {
  208. resources.RDMA = &RDMA{}
  209. for device, value := range spec.Rdma {
  210. if device != "" && (value.HcaHandles != nil && value.HcaObjects != nil) {
  211. resources.RDMA.Limit = append(resources.RDMA.Limit, RDMAEntry{
  212. Device: device,
  213. HcaHandles: *value.HcaHandles,
  214. HcaObjects: *value.HcaObjects,
  215. })
  216. }
  217. }
  218. }
  219. return &resources
  220. }
  221. // Gets uint64 parsed content of single value cgroup stat file
  222. func getStatFileContentUint64(filePath string) uint64 {
  223. f, err := os.Open(filePath)
  224. if err != nil {
  225. return 0
  226. }
  227. defer f.Close()
  228. // We expect an unsigned 64 bit integer, or a "max" string
  229. // in some cases.
  230. buf := make([]byte, 32)
  231. n, err := f.Read(buf)
  232. if err != nil {
  233. return 0
  234. }
  235. trimmed := strings.TrimSpace(string(buf[:n]))
  236. if trimmed == "max" {
  237. return math.MaxUint64
  238. }
  239. res, err := parseUint(trimmed, 10, 64)
  240. if err != nil {
  241. logrus.Errorf("unable to parse %q as a uint from Cgroup file %q", trimmed, filePath)
  242. return res
  243. }
  244. return res
  245. }
  246. func readIoStats(path string) []*stats.IOEntry {
  247. // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
  248. var usage []*stats.IOEntry
  249. fpath := filepath.Join(path, "io.stat")
  250. currentData, err := os.ReadFile(fpath)
  251. if err != nil {
  252. return usage
  253. }
  254. entries := strings.Split(string(currentData), "\n")
  255. for _, entry := range entries {
  256. parts := strings.Split(entry, " ")
  257. if len(parts) < 2 {
  258. continue
  259. }
  260. majmin := strings.Split(parts[0], ":")
  261. if len(majmin) != 2 {
  262. continue
  263. }
  264. major, err := strconv.ParseUint(majmin[0], 10, 0)
  265. if err != nil {
  266. return usage
  267. }
  268. minor, err := strconv.ParseUint(majmin[1], 10, 0)
  269. if err != nil {
  270. return usage
  271. }
  272. parts = parts[1:]
  273. ioEntry := stats.IOEntry{
  274. Major: major,
  275. Minor: minor,
  276. }
  277. for _, s := range parts {
  278. keyPairValue := strings.Split(s, "=")
  279. if len(keyPairValue) != 2 {
  280. continue
  281. }
  282. v, err := strconv.ParseUint(keyPairValue[1], 10, 0)
  283. if err != nil {
  284. continue
  285. }
  286. switch keyPairValue[0] {
  287. case "rbytes":
  288. ioEntry.Rbytes = v
  289. case "wbytes":
  290. ioEntry.Wbytes = v
  291. case "rios":
  292. ioEntry.Rios = v
  293. case "wios":
  294. ioEntry.Wios = v
  295. }
  296. }
  297. usage = append(usage, &ioEntry)
  298. }
  299. return usage
  300. }
  301. func rdmaStats(filepath string) []*stats.RdmaEntry {
  302. currentData, err := os.ReadFile(filepath)
  303. if err != nil {
  304. return []*stats.RdmaEntry{}
  305. }
  306. return toRdmaEntry(strings.Split(string(currentData), "\n"))
  307. }
  308. func parseRdmaKV(raw string, entry *stats.RdmaEntry) {
  309. var value uint64
  310. var err error
  311. parts := strings.Split(raw, "=")
  312. switch len(parts) {
  313. case 2:
  314. if parts[1] == "max" {
  315. value = math.MaxUint32
  316. } else {
  317. value, err = parseUint(parts[1], 10, 32)
  318. if err != nil {
  319. return
  320. }
  321. }
  322. if parts[0] == "hca_handle" {
  323. entry.HcaHandles = uint32(value)
  324. } else if parts[0] == "hca_object" {
  325. entry.HcaObjects = uint32(value)
  326. }
  327. }
  328. }
  329. func toRdmaEntry(strEntries []string) []*stats.RdmaEntry {
  330. var rdmaEntries []*stats.RdmaEntry
  331. for i := range strEntries {
  332. parts := strings.Fields(strEntries[i])
  333. switch len(parts) {
  334. case 3:
  335. entry := new(stats.RdmaEntry)
  336. entry.Device = parts[0]
  337. parseRdmaKV(parts[1], entry)
  338. parseRdmaKV(parts[2], entry)
  339. rdmaEntries = append(rdmaEntries, entry)
  340. default:
  341. continue
  342. }
  343. }
  344. return rdmaEntries
  345. }
  346. // isUnitExists returns true if the error is that a systemd unit already exists.
  347. func isUnitExists(err error) bool {
  348. if err != nil {
  349. if dbusError, ok := err.(dbus.Error); ok {
  350. return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
  351. }
  352. }
  353. return false
  354. }
  355. func systemdUnitFromPath(path string) string {
  356. _, unit := filepath.Split(path)
  357. return unit
  358. }
  359. func readHugeTlbStats(path string) []*stats.HugeTlbStat {
  360. hpSizes := hugePageSizes()
  361. usage := make([]*stats.HugeTlbStat, len(hpSizes))
  362. for idx, pagesize := range hpSizes {
  363. usage[idx] = &stats.HugeTlbStat{
  364. Max: getStatFileContentUint64(filepath.Join(path, "hugetlb."+pagesize+".max")),
  365. Current: getStatFileContentUint64(filepath.Join(path, "hugetlb."+pagesize+".current")),
  366. Pagesize: pagesize,
  367. }
  368. }
  369. return usage
  370. }
var (
	// hPageSizes caches the huge page sizes computed by hugePageSizes.
	hPageSizes []string
	// initHPSOnce ensures hPageSizes is populated only once.
	initHPSOnce sync.Once
)
  375. // The following idea and implementation is taken pretty much line for line from
  376. // runc. Because the hugetlb files are well known, and the only variable thrown in
  377. // the mix is what huge page sizes you have on your host, this lends itself well
  378. // to doing the work to find the files present once, and then re-using this. This
  379. // saves a os.Readdirnames(0) call to search for hugeltb files on every `manager.Stat`
  380. // call.
  381. // https://github.com/opencontainers/runc/blob/3a2c0c2565644d8a7e0f1dd594a060b21fa96cf1/libcontainer/cgroups/utils.go#L301
  382. func hugePageSizes() []string {
  383. initHPSOnce.Do(func() {
  384. dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
  385. if err != nil {
  386. return
  387. }
  388. files, err := dir.Readdirnames(0)
  389. dir.Close()
  390. if err != nil {
  391. return
  392. }
  393. hPageSizes, err = getHugePageSizeFromFilenames(files)
  394. if err != nil {
  395. logrus.Warnf("hugePageSizes: %s", err)
  396. }
  397. })
  398. return hPageSizes
  399. }
  400. func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
  401. pageSizes := make([]string, 0, len(fileNames))
  402. var warn error
  403. for _, file := range fileNames {
  404. // example: hugepages-1048576kB
  405. val := strings.TrimPrefix(file, "hugepages-")
  406. if len(val) == len(file) {
  407. // Unexpected file name: no prefix found, ignore it.
  408. continue
  409. }
  410. // In all known versions of Linux up to 6.3 the suffix is always
  411. // "kB". If we find something else, produce an error but keep going.
  412. eLen := len(val) - 2
  413. val = strings.TrimSuffix(val, "kB")
  414. if len(val) != eLen {
  415. // Highly unlikely.
  416. if warn == nil {
  417. warn = errors.New(file + `: invalid suffix (expected "kB")`)
  418. }
  419. continue
  420. }
  421. size, err := strconv.Atoi(val)
  422. if err != nil {
  423. // Highly unlikely.
  424. if warn == nil {
  425. warn = fmt.Errorf("%s: %w", file, err)
  426. }
  427. continue
  428. }
  429. // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
  430. // but in our case the size is in KB already.
  431. if size >= (1 << 20) {
  432. val = strconv.Itoa(size>>20) + "GB"
  433. } else if size >= (1 << 10) {
  434. val = strconv.Itoa(size>>10) + "MB"
  435. } else {
  436. val += "KB"
  437. }
  438. pageSizes = append(pageSizes, val)
  439. }
  440. return pageSizes, warn
  441. }
  442. func getSubreaper() (int, error) {
  443. var i uintptr
  444. if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
  445. return -1, err
  446. }
  447. return int(i), nil
  448. }