manager.go 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921
  1. /*
  2. Copyright The containerd Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package cgroup2
  14. import (
  15. "bufio"
  16. "context"
  17. "errors"
  18. "fmt"
  19. "math"
  20. "os"
  21. "path/filepath"
  22. "strconv"
  23. "strings"
  24. "time"
  25. "github.com/containerd/cgroups/v3/cgroup2/stats"
  26. systemdDbus "github.com/coreos/go-systemd/v22/dbus"
  27. "github.com/godbus/dbus/v5"
  28. "github.com/opencontainers/runtime-spec/specs-go"
  29. "github.com/sirupsen/logrus"
  30. "golang.org/x/sys/unix"
  31. )
  32. const (
  33. subtreeControl = "cgroup.subtree_control"
  34. controllersFile = "cgroup.controllers"
  35. killFile = "cgroup.kill"
  36. typeFile = "cgroup.type"
  37. defaultCgroup2Path = "/sys/fs/cgroup"
  38. defaultSlice = "system.slice"
  39. )
  40. var canDelegate bool
  41. type Event struct {
  42. Low uint64
  43. High uint64
  44. Max uint64
  45. OOM uint64
  46. OOMKill uint64
  47. }
  48. // Resources for a cgroups v2 unified hierarchy
  49. type Resources struct {
  50. CPU *CPU
  51. Memory *Memory
  52. Pids *Pids
  53. IO *IO
  54. RDMA *RDMA
  55. HugeTlb *HugeTlb
  56. // When len(Devices) is zero, devices are not controlled
  57. Devices []specs.LinuxDeviceCgroup
  58. }
  59. // Values returns the raw filenames and values that
  60. // can be written to the unified hierarchy
  61. func (r *Resources) Values() (o []Value) {
  62. if r.CPU != nil {
  63. o = append(o, r.CPU.Values()...)
  64. }
  65. if r.Memory != nil {
  66. o = append(o, r.Memory.Values()...)
  67. }
  68. if r.Pids != nil {
  69. o = append(o, r.Pids.Values()...)
  70. }
  71. if r.IO != nil {
  72. o = append(o, r.IO.Values()...)
  73. }
  74. if r.RDMA != nil {
  75. o = append(o, r.RDMA.Values()...)
  76. }
  77. if r.HugeTlb != nil {
  78. o = append(o, r.HugeTlb.Values()...)
  79. }
  80. return o
  81. }
  82. // EnabledControllers returns the list of all not nil resource controllers
  83. func (r *Resources) EnabledControllers() (c []string) {
  84. if r.CPU != nil {
  85. c = append(c, "cpu")
  86. if r.CPU.Cpus != "" || r.CPU.Mems != "" {
  87. c = append(c, "cpuset")
  88. }
  89. }
  90. if r.Memory != nil {
  91. c = append(c, "memory")
  92. }
  93. if r.Pids != nil {
  94. c = append(c, "pids")
  95. }
  96. if r.IO != nil {
  97. c = append(c, "io")
  98. }
  99. if r.RDMA != nil {
  100. c = append(c, "rdma")
  101. }
  102. if r.HugeTlb != nil {
  103. c = append(c, "hugetlb")
  104. }
  105. return
  106. }
  107. // Value of a cgroup setting
  108. type Value struct {
  109. filename string
  110. value interface{}
  111. }
  112. // write the value to the full, absolute path, of a unified hierarchy
  113. func (c *Value) write(path string, perm os.FileMode) error {
  114. var data []byte
  115. switch t := c.value.(type) {
  116. case uint64:
  117. data = []byte(strconv.FormatUint(t, 10))
  118. case uint16:
  119. data = []byte(strconv.FormatUint(uint64(t), 10))
  120. case int64:
  121. data = []byte(strconv.FormatInt(t, 10))
  122. case []byte:
  123. data = t
  124. case string:
  125. data = []byte(t)
  126. case CPUMax:
  127. data = []byte(t)
  128. default:
  129. return ErrInvalidFormat
  130. }
  131. return os.WriteFile(
  132. filepath.Join(path, c.filename),
  133. data,
  134. perm,
  135. )
  136. }
  137. func writeValues(path string, values []Value) error {
  138. for _, o := range values {
  139. if err := o.write(path, defaultFilePerm); err != nil {
  140. return err
  141. }
  142. }
  143. return nil
  144. }
  145. func NewManager(mountpoint string, group string, resources *Resources) (*Manager, error) {
  146. if resources == nil {
  147. return nil, errors.New("resources reference is nil")
  148. }
  149. if err := VerifyGroupPath(group); err != nil {
  150. return nil, err
  151. }
  152. path := filepath.Join(mountpoint, group)
  153. if err := os.MkdirAll(path, defaultDirPerm); err != nil {
  154. return nil, err
  155. }
  156. m := Manager{
  157. unifiedMountpoint: mountpoint,
  158. path: path,
  159. }
  160. if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil {
  161. // clean up cgroup dir on failure
  162. os.Remove(path)
  163. return nil, err
  164. }
  165. if err := setResources(path, resources); err != nil {
  166. os.Remove(path)
  167. return nil, err
  168. }
  169. return &m, nil
  170. }
  171. type InitConfig struct {
  172. mountpoint string
  173. }
  174. type InitOpts func(c *InitConfig) error
  175. // WithMountpoint sets the unified mountpoint. The default path is /sys/fs/cgroup.
  176. func WithMountpoint(path string) InitOpts {
  177. return func(c *InitConfig) error {
  178. c.mountpoint = path
  179. return nil
  180. }
  181. }
  182. // Load a cgroup.
  183. func Load(group string, opts ...InitOpts) (*Manager, error) {
  184. c := InitConfig{mountpoint: defaultCgroup2Path}
  185. for _, opt := range opts {
  186. if err := opt(&c); err != nil {
  187. return nil, err
  188. }
  189. }
  190. if err := VerifyGroupPath(group); err != nil {
  191. return nil, err
  192. }
  193. path := filepath.Join(c.mountpoint, group)
  194. return &Manager{
  195. unifiedMountpoint: c.mountpoint,
  196. path: path,
  197. }, nil
  198. }
  199. type Manager struct {
  200. unifiedMountpoint string
  201. path string
  202. }
  203. func setResources(path string, resources *Resources) error {
  204. if resources != nil {
  205. if err := writeValues(path, resources.Values()); err != nil {
  206. return err
  207. }
  208. if err := setDevices(path, resources.Devices); err != nil {
  209. return err
  210. }
  211. }
  212. return nil
  213. }
  214. // CgroupType represents the types a cgroup can be.
  215. type CgroupType string
  216. const (
  217. Domain CgroupType = "domain"
  218. Threaded CgroupType = "threaded"
  219. )
  220. func (c *Manager) GetType() (CgroupType, error) {
  221. val, err := os.ReadFile(filepath.Join(c.path, typeFile))
  222. if err != nil {
  223. return "", err
  224. }
  225. trimmed := strings.TrimSpace(string(val))
  226. return CgroupType(trimmed), nil
  227. }
  228. func (c *Manager) SetType(cgType CgroupType) error {
  229. // NOTE: We could abort if cgType != Threaded here as currently
  230. // it's not possible to revert back to domain, but not sure
  231. // it's worth being that opinionated, especially if that may
  232. // ever change.
  233. v := Value{
  234. filename: typeFile,
  235. value: string(cgType),
  236. }
  237. return writeValues(c.path, []Value{v})
  238. }
  239. func (c *Manager) RootControllers() ([]string, error) {
  240. b, err := os.ReadFile(filepath.Join(c.unifiedMountpoint, controllersFile))
  241. if err != nil {
  242. return nil, err
  243. }
  244. return strings.Fields(string(b)), nil
  245. }
  246. func (c *Manager) Controllers() ([]string, error) {
  247. b, err := os.ReadFile(filepath.Join(c.path, controllersFile))
  248. if err != nil {
  249. return nil, err
  250. }
  251. return strings.Fields(string(b)), nil
  252. }
  253. func (c *Manager) Update(resources *Resources) error {
  254. return setResources(c.path, resources)
  255. }
  256. type ControllerToggle int
  257. const (
  258. Enable ControllerToggle = iota + 1
  259. Disable
  260. )
  261. func toggleFunc(controllers []string, prefix string) []string {
  262. out := make([]string, len(controllers))
  263. for i, c := range controllers {
  264. out[i] = prefix + c
  265. }
  266. return out
  267. }
  268. func (c *Manager) ToggleControllers(controllers []string, t ControllerToggle) error {
  269. // when c.path is like /foo/bar/baz, the following files need to be written:
  270. // * /sys/fs/cgroup/cgroup.subtree_control
  271. // * /sys/fs/cgroup/foo/cgroup.subtree_control
  272. // * /sys/fs/cgroup/foo/bar/cgroup.subtree_control
  273. // Note that /sys/fs/cgroup/foo/bar/baz/cgroup.subtree_control does not need to be written.
  274. split := strings.Split(c.path, "/")
  275. var lastErr error
  276. for i := range split {
  277. f := strings.Join(split[:i], "/")
  278. if !strings.HasPrefix(f, c.unifiedMountpoint) || f == c.path {
  279. continue
  280. }
  281. filePath := filepath.Join(f, subtreeControl)
  282. if err := c.writeSubtreeControl(filePath, controllers, t); err != nil {
  283. // When running as rootless, the user may face EPERM on parent groups, but it is neglible when the
  284. // controller is already written.
  285. // So we only return the last error.
  286. lastErr = fmt.Errorf("failed to write subtree controllers %+v to %q: %w", controllers, filePath, err)
  287. } else {
  288. lastErr = nil
  289. }
  290. }
  291. return lastErr
  292. }
  293. func (c *Manager) writeSubtreeControl(filePath string, controllers []string, t ControllerToggle) error {
  294. f, err := os.OpenFile(filePath, os.O_WRONLY, 0)
  295. if err != nil {
  296. return err
  297. }
  298. defer f.Close()
  299. switch t {
  300. case Enable:
  301. controllers = toggleFunc(controllers, "+")
  302. case Disable:
  303. controllers = toggleFunc(controllers, "-")
  304. }
  305. _, err = f.WriteString(strings.Join(controllers, " "))
  306. return err
  307. }
  308. func (c *Manager) NewChild(name string, resources *Resources) (*Manager, error) {
  309. if strings.HasPrefix(name, "/") {
  310. return nil, errors.New("name must be relative")
  311. }
  312. path := filepath.Join(c.path, name)
  313. if err := os.MkdirAll(path, defaultDirPerm); err != nil {
  314. return nil, err
  315. }
  316. m := Manager{
  317. unifiedMountpoint: c.unifiedMountpoint,
  318. path: path,
  319. }
  320. if resources != nil {
  321. if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil {
  322. // clean up cgroup dir on failure
  323. os.Remove(path)
  324. return nil, err
  325. }
  326. }
  327. if err := setResources(path, resources); err != nil {
  328. // clean up cgroup dir on failure
  329. os.Remove(path)
  330. return nil, err
  331. }
  332. return &m, nil
  333. }
  334. func (c *Manager) AddProc(pid uint64) error {
  335. v := Value{
  336. filename: cgroupProcs,
  337. value: pid,
  338. }
  339. return writeValues(c.path, []Value{v})
  340. }
  341. func (c *Manager) AddThread(tid uint64) error {
  342. v := Value{
  343. filename: cgroupThreads,
  344. value: tid,
  345. }
  346. return writeValues(c.path, []Value{v})
  347. }
  348. // Kill will try to forcibly exit all of the processes in the cgroup. This is
  349. // equivalent to sending a SIGKILL to every process. On kernels 5.14 and greater
  350. // this will use the cgroup.kill file, on anything that doesn't have the cgroup.kill
  351. // file, a manual process of freezing -> sending a SIGKILL to every process -> thawing
  352. // will be used.
  353. func (c *Manager) Kill() error {
  354. v := Value{
  355. filename: killFile,
  356. value: "1",
  357. }
  358. err := writeValues(c.path, []Value{v})
  359. if err == nil {
  360. return nil
  361. }
  362. logrus.Warnf("falling back to slower kill implementation: %s", err)
  363. // Fallback to slow method.
  364. return c.fallbackKill()
  365. }
  366. // fallbackKill is a slower fallback to the more modern (kernels 5.14+)
  367. // approach of writing to the cgroup.kill file. This is heavily pulled
  368. // from runc's same approach (in signalAllProcesses), with the only differences
  369. // being this is just tailored to the API exposed in this library, and we don't
  370. // need to care about signals other than SIGKILL.
  371. //
  372. // https://github.com/opencontainers/runc/blob/8da0a0b5675764feaaaaad466f6567a9983fcd08/libcontainer/init_linux.go#L523-L529
  373. func (c *Manager) fallbackKill() error {
  374. if err := c.Freeze(); err != nil {
  375. logrus.Warn(err)
  376. }
  377. pids, err := c.Procs(true)
  378. if err != nil {
  379. if err := c.Thaw(); err != nil {
  380. logrus.Warn(err)
  381. }
  382. return err
  383. }
  384. var procs []*os.Process
  385. for _, pid := range pids {
  386. p, err := os.FindProcess(int(pid))
  387. if err != nil {
  388. logrus.Warn(err)
  389. continue
  390. }
  391. procs = append(procs, p)
  392. if err := p.Signal(unix.SIGKILL); err != nil {
  393. logrus.Warn(err)
  394. }
  395. }
  396. if err := c.Thaw(); err != nil {
  397. logrus.Warn(err)
  398. }
  399. subreaper, err := getSubreaper()
  400. if err != nil {
  401. // The error here means that PR_GET_CHILD_SUBREAPER is not
  402. // supported because this code might run on a kernel older
  403. // than 3.4. We don't want to throw an error in that case,
  404. // and we simplify things, considering there is no subreaper
  405. // set.
  406. subreaper = 0
  407. }
  408. for _, p := range procs {
  409. // In case a subreaper has been setup, this code must not
  410. // wait for the process. Otherwise, we cannot be sure the
  411. // current process will be reaped by the subreaper, while
  412. // the subreaper might be waiting for this process in order
  413. // to retrieve its exit code.
  414. if subreaper == 0 {
  415. if _, err := p.Wait(); err != nil {
  416. if !errors.Is(err, unix.ECHILD) {
  417. logrus.Warnf("wait on pid %d failed: %s", p.Pid, err)
  418. }
  419. }
  420. }
  421. }
  422. return nil
  423. }
  424. func (c *Manager) Delete() error {
  425. // kernel prevents cgroups with running process from being removed, check the tree is empty
  426. processes, err := c.Procs(true)
  427. if err != nil {
  428. return err
  429. }
  430. if len(processes) > 0 {
  431. return fmt.Errorf("cgroups: unable to remove path %q: still contains running processes", c.path)
  432. }
  433. return remove(c.path)
  434. }
  435. func (c *Manager) Procs(recursive bool) ([]uint64, error) {
  436. var processes []uint64
  437. err := filepath.Walk(c.path, func(p string, info os.FileInfo, err error) error {
  438. if err != nil {
  439. return err
  440. }
  441. if !recursive && info.IsDir() {
  442. if p == c.path {
  443. return nil
  444. }
  445. return filepath.SkipDir
  446. }
  447. _, name := filepath.Split(p)
  448. if name != cgroupProcs {
  449. return nil
  450. }
  451. procs, err := parseCgroupProcsFile(p)
  452. if err != nil {
  453. return err
  454. }
  455. processes = append(processes, procs...)
  456. return nil
  457. })
  458. return processes, err
  459. }
  460. func (c *Manager) MoveTo(destination *Manager) error {
  461. processes, err := c.Procs(true)
  462. if err != nil {
  463. return err
  464. }
  465. for _, p := range processes {
  466. if err := destination.AddProc(p); err != nil {
  467. if strings.Contains(err.Error(), "no such process") {
  468. continue
  469. }
  470. return err
  471. }
  472. }
  473. return nil
  474. }
  475. func (c *Manager) Stat() (*stats.Metrics, error) {
  476. controllers, err := c.Controllers()
  477. if err != nil {
  478. return nil, err
  479. }
  480. // Sizing this avoids an allocation to increase the map at runtime;
  481. // currently the default bucket size is 8 and we put 40+ elements
  482. // in it so we'd always end up allocating.
  483. out := make(map[string]uint64, 50)
  484. for _, controller := range controllers {
  485. switch controller {
  486. case "cpu", "memory":
  487. if err := readKVStatsFile(c.path, controller+".stat", out); err != nil {
  488. if os.IsNotExist(err) {
  489. continue
  490. }
  491. return nil, err
  492. }
  493. }
  494. }
  495. memoryEvents := make(map[string]uint64)
  496. if err := readKVStatsFile(c.path, "memory.events", memoryEvents); err != nil {
  497. if !os.IsNotExist(err) {
  498. return nil, err
  499. }
  500. }
  501. var metrics stats.Metrics
  502. metrics.Pids = &stats.PidsStat{
  503. Current: getStatFileContentUint64(filepath.Join(c.path, "pids.current")),
  504. Limit: getStatFileContentUint64(filepath.Join(c.path, "pids.max")),
  505. }
  506. metrics.CPU = &stats.CPUStat{
  507. UsageUsec: out["usage_usec"],
  508. UserUsec: out["user_usec"],
  509. SystemUsec: out["system_usec"],
  510. NrPeriods: out["nr_periods"],
  511. NrThrottled: out["nr_throttled"],
  512. ThrottledUsec: out["throttled_usec"],
  513. }
  514. metrics.Memory = &stats.MemoryStat{
  515. Anon: out["anon"],
  516. File: out["file"],
  517. KernelStack: out["kernel_stack"],
  518. Slab: out["slab"],
  519. Sock: out["sock"],
  520. Shmem: out["shmem"],
  521. FileMapped: out["file_mapped"],
  522. FileDirty: out["file_dirty"],
  523. FileWriteback: out["file_writeback"],
  524. AnonThp: out["anon_thp"],
  525. InactiveAnon: out["inactive_anon"],
  526. ActiveAnon: out["active_anon"],
  527. InactiveFile: out["inactive_file"],
  528. ActiveFile: out["active_file"],
  529. Unevictable: out["unevictable"],
  530. SlabReclaimable: out["slab_reclaimable"],
  531. SlabUnreclaimable: out["slab_unreclaimable"],
  532. Pgfault: out["pgfault"],
  533. Pgmajfault: out["pgmajfault"],
  534. WorkingsetRefault: out["workingset_refault"],
  535. WorkingsetActivate: out["workingset_activate"],
  536. WorkingsetNodereclaim: out["workingset_nodereclaim"],
  537. Pgrefill: out["pgrefill"],
  538. Pgscan: out["pgscan"],
  539. Pgsteal: out["pgsteal"],
  540. Pgactivate: out["pgactivate"],
  541. Pgdeactivate: out["pgdeactivate"],
  542. Pglazyfree: out["pglazyfree"],
  543. Pglazyfreed: out["pglazyfreed"],
  544. ThpFaultAlloc: out["thp_fault_alloc"],
  545. ThpCollapseAlloc: out["thp_collapse_alloc"],
  546. Usage: getStatFileContentUint64(filepath.Join(c.path, "memory.current")),
  547. UsageLimit: getStatFileContentUint64(filepath.Join(c.path, "memory.max")),
  548. SwapUsage: getStatFileContentUint64(filepath.Join(c.path, "memory.swap.current")),
  549. SwapLimit: getStatFileContentUint64(filepath.Join(c.path, "memory.swap.max")),
  550. }
  551. if len(memoryEvents) > 0 {
  552. metrics.MemoryEvents = &stats.MemoryEvents{
  553. Low: memoryEvents["low"],
  554. High: memoryEvents["high"],
  555. Max: memoryEvents["max"],
  556. Oom: memoryEvents["oom"],
  557. OomKill: memoryEvents["oom_kill"],
  558. }
  559. }
  560. metrics.Io = &stats.IOStat{Usage: readIoStats(c.path)}
  561. metrics.Rdma = &stats.RdmaStat{
  562. Current: rdmaStats(filepath.Join(c.path, "rdma.current")),
  563. Limit: rdmaStats(filepath.Join(c.path, "rdma.max")),
  564. }
  565. metrics.Hugetlb = readHugeTlbStats(c.path)
  566. return &metrics, nil
  567. }
  568. func readKVStatsFile(path string, file string, out map[string]uint64) error {
  569. f, err := os.Open(filepath.Join(path, file))
  570. if err != nil {
  571. return err
  572. }
  573. defer f.Close()
  574. s := bufio.NewScanner(f)
  575. for s.Scan() {
  576. name, value, err := parseKV(s.Text())
  577. if err != nil {
  578. return fmt.Errorf("error while parsing %s (line=%q): %w", filepath.Join(path, file), s.Text(), err)
  579. }
  580. out[name] = value
  581. }
  582. return s.Err()
  583. }
  584. func (c *Manager) Freeze() error {
  585. return c.freeze(c.path, Frozen)
  586. }
  587. func (c *Manager) Thaw() error {
  588. return c.freeze(c.path, Thawed)
  589. }
  590. func (c *Manager) freeze(path string, state State) error {
  591. values := state.Values()
  592. for {
  593. if err := writeValues(path, values); err != nil {
  594. return err
  595. }
  596. current, err := fetchState(path)
  597. if err != nil {
  598. return err
  599. }
  600. if current == state {
  601. return nil
  602. }
  603. time.Sleep(1 * time.Millisecond)
  604. }
  605. }
  606. func (c *Manager) isCgroupEmpty() bool {
  607. // In case of any error we return true so that we exit and don't leak resources
  608. out := make(map[string]uint64)
  609. if err := readKVStatsFile(c.path, "cgroup.events", out); err != nil {
  610. return true
  611. }
  612. if v, ok := out["populated"]; ok {
  613. return v == 0
  614. }
  615. return true
  616. }
  617. // MemoryEventFD returns inotify file descriptor and 'memory.events' inotify watch descriptor
  618. func (c *Manager) MemoryEventFD() (int, uint32, error) {
  619. fpath := filepath.Join(c.path, "memory.events")
  620. fd, err := unix.InotifyInit()
  621. if err != nil {
  622. return 0, 0, errors.New("failed to create inotify fd")
  623. }
  624. wd, err := unix.InotifyAddWatch(fd, fpath, unix.IN_MODIFY)
  625. if err != nil {
  626. unix.Close(fd)
  627. return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", fpath, err)
  628. }
  629. // monitor to detect process exit/cgroup deletion
  630. evpath := filepath.Join(c.path, "cgroup.events")
  631. if _, err = unix.InotifyAddWatch(fd, evpath, unix.IN_MODIFY); err != nil {
  632. unix.Close(fd)
  633. return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", evpath, err)
  634. }
  635. return fd, uint32(wd), nil
  636. }
  637. func (c *Manager) EventChan() (<-chan Event, <-chan error) {
  638. ec := make(chan Event)
  639. errCh := make(chan error, 1)
  640. go c.waitForEvents(ec, errCh)
  641. return ec, errCh
  642. }
  643. func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
  644. defer close(errCh)
  645. fd, _, err := c.MemoryEventFD()
  646. if err != nil {
  647. errCh <- err
  648. return
  649. }
  650. defer unix.Close(fd)
  651. for {
  652. buffer := make([]byte, unix.SizeofInotifyEvent*10)
  653. bytesRead, err := unix.Read(fd, buffer)
  654. if err != nil {
  655. errCh <- err
  656. return
  657. }
  658. if bytesRead >= unix.SizeofInotifyEvent {
  659. out := make(map[string]uint64)
  660. if err := readKVStatsFile(c.path, "memory.events", out); err != nil {
  661. // When cgroup is deleted read may return -ENODEV instead of -ENOENT from open.
  662. if _, statErr := os.Lstat(filepath.Join(c.path, "memory.events")); !os.IsNotExist(statErr) {
  663. errCh <- err
  664. }
  665. return
  666. }
  667. ec <- Event{
  668. Low: out["low"],
  669. High: out["high"],
  670. Max: out["max"],
  671. OOM: out["oom"],
  672. OOMKill: out["oom_kill"],
  673. }
  674. if c.isCgroupEmpty() {
  675. return
  676. }
  677. }
  678. }
  679. }
  680. func setDevices(path string, devices []specs.LinuxDeviceCgroup) error {
  681. if len(devices) == 0 {
  682. return nil
  683. }
  684. insts, license, err := DeviceFilter(devices)
  685. if err != nil {
  686. return err
  687. }
  688. dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY|unix.O_CLOEXEC, 0o600)
  689. if err != nil {
  690. return fmt.Errorf("cannot get dir FD for %s", path)
  691. }
  692. defer unix.Close(dirFD)
  693. if _, err := LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
  694. if !canSkipEBPFError(devices) {
  695. return err
  696. }
  697. }
  698. return nil
  699. }
  700. // getSystemdFullPath returns the full systemd path when creating a systemd slice group.
  701. // the reason this is necessary is because the "-" character has a special meaning in
  702. // systemd slice. For example, when creating a slice called "my-group-112233.slice",
  703. // systemd will create a hierarchy like this:
  704. //
  705. // /sys/fs/cgroup/my.slice/my-group.slice/my-group-112233.slice
  706. func getSystemdFullPath(slice, group string) string {
  707. return filepath.Join(defaultCgroup2Path, dashesToPath(slice), dashesToPath(group))
  708. }
  709. // dashesToPath converts a slice name with dashes to it's corresponding systemd filesystem path.
  710. func dashesToPath(in string) string {
  711. path := ""
  712. if strings.HasSuffix(in, ".slice") && strings.Contains(in, "-") {
  713. parts := strings.Split(in, "-")
  714. for i := range parts {
  715. s := strings.Join(parts[0:i+1], "-")
  716. if !strings.HasSuffix(s, ".slice") {
  717. s += ".slice"
  718. }
  719. path = filepath.Join(path, s)
  720. }
  721. } else {
  722. path = filepath.Join(path, in)
  723. }
  724. return path
  725. }
  726. func NewSystemd(slice, group string, pid int, resources *Resources) (*Manager, error) {
  727. if slice == "" {
  728. slice = defaultSlice
  729. }
  730. ctx := context.TODO()
  731. path := getSystemdFullPath(slice, group)
  732. conn, err := systemdDbus.NewWithContext(ctx)
  733. if err != nil {
  734. return &Manager{}, err
  735. }
  736. defer conn.Close()
  737. properties := []systemdDbus.Property{
  738. systemdDbus.PropDescription("cgroup " + group),
  739. newSystemdProperty("DefaultDependencies", false),
  740. newSystemdProperty("MemoryAccounting", true),
  741. newSystemdProperty("CPUAccounting", true),
  742. newSystemdProperty("IOAccounting", true),
  743. }
  744. // if we create a slice, the parent is defined via a Wants=
  745. if strings.HasSuffix(group, ".slice") {
  746. properties = append(properties, systemdDbus.PropWants(defaultSlice))
  747. } else {
  748. // otherwise, we use Slice=
  749. properties = append(properties, systemdDbus.PropSlice(defaultSlice))
  750. }
  751. // only add pid if its valid, -1 is used w/ general slice creation.
  752. if pid != -1 {
  753. properties = append(properties, newSystemdProperty("PIDs", []uint32{uint32(pid)}))
  754. }
  755. if resources.Memory != nil && resources.Memory.Min != nil && *resources.Memory.Min != 0 {
  756. properties = append(properties,
  757. newSystemdProperty("MemoryMin", uint64(*resources.Memory.Min)))
  758. }
  759. if resources.Memory != nil && resources.Memory.Max != nil && *resources.Memory.Max != 0 {
  760. properties = append(properties,
  761. newSystemdProperty("MemoryMax", uint64(*resources.Memory.Max)))
  762. }
  763. if resources.CPU != nil && resources.CPU.Weight != nil && *resources.CPU.Weight != 0 {
  764. properties = append(properties,
  765. newSystemdProperty("CPUWeight", *resources.CPU.Weight))
  766. }
  767. if resources.CPU != nil && resources.CPU.Max != "" {
  768. quota, period := resources.CPU.Max.extractQuotaAndPeriod()
  769. // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
  770. // corresponds to USEC_INFINITY in systemd
  771. // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
  772. // always setting a property value ensures we can apply a quota and remove it later
  773. cpuQuotaPerSecUSec := uint64(math.MaxUint64)
  774. if quota > 0 {
  775. // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
  776. // (integer percentage of CPU) internally. This means that if a fractional percent of
  777. // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
  778. // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
  779. cpuQuotaPerSecUSec = uint64(quota*1000000) / period
  780. if cpuQuotaPerSecUSec%10000 != 0 {
  781. cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
  782. }
  783. }
  784. properties = append(properties,
  785. newSystemdProperty("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
  786. }
  787. // If we can delegate, we add the property back in
  788. if canDelegate {
  789. properties = append(properties, newSystemdProperty("Delegate", true))
  790. }
  791. if resources.Pids != nil && resources.Pids.Max > 0 {
  792. properties = append(properties,
  793. newSystemdProperty("TasksAccounting", true),
  794. newSystemdProperty("TasksMax", uint64(resources.Pids.Max)))
  795. }
  796. statusChan := make(chan string, 1)
  797. if _, err := conn.StartTransientUnitContext(ctx, group, "replace", properties, statusChan); err == nil {
  798. select {
  799. case <-statusChan:
  800. case <-time.After(time.Second):
  801. logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", group)
  802. }
  803. } else if !isUnitExists(err) {
  804. return &Manager{}, err
  805. }
  806. return &Manager{
  807. path: path,
  808. }, nil
  809. }
  810. func LoadSystemd(slice, group string) (*Manager, error) {
  811. if slice == "" {
  812. slice = defaultSlice
  813. }
  814. path := getSystemdFullPath(slice, group)
  815. return &Manager{
  816. path: path,
  817. }, nil
  818. }
  819. func (c *Manager) DeleteSystemd() error {
  820. ctx := context.TODO()
  821. conn, err := systemdDbus.NewWithContext(ctx)
  822. if err != nil {
  823. return err
  824. }
  825. defer conn.Close()
  826. group := systemdUnitFromPath(c.path)
  827. ch := make(chan string)
  828. _, err = conn.StopUnitContext(ctx, group, "replace", ch)
  829. if err != nil {
  830. return err
  831. }
  832. <-ch
  833. return nil
  834. }
  835. func newSystemdProperty(name string, units interface{}) systemdDbus.Property {
  836. return systemdDbus.Property{
  837. Name: name,
  838. Value: dbus.MakeVariant(units),
  839. }
  840. }