cgroup.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575
  1. /*
  2. Copyright The containerd Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package cgroup1
  14. import (
  15. "errors"
  16. "fmt"
  17. "io/fs"
  18. "os"
  19. "path/filepath"
  20. "strconv"
  21. "strings"
  22. "sync"
  23. "syscall"
  24. "time"
  25. v1 "github.com/containerd/cgroups/v3/cgroup1/stats"
  26. "github.com/opencontainers/runtime-spec/specs-go"
  27. )
  28. // New returns a new control via the cgroup cgroups interface
  29. func New(path Path, resources *specs.LinuxResources, opts ...InitOpts) (Cgroup, error) {
  30. config := newInitConfig()
  31. for _, o := range opts {
  32. if err := o(config); err != nil {
  33. return nil, err
  34. }
  35. }
  36. subsystems, err := config.hierarchy()
  37. if err != nil {
  38. return nil, err
  39. }
  40. var active []Subsystem
  41. for _, s := range subsystems {
  42. // check if subsystem exists
  43. if err := initializeSubsystem(s, path, resources); err != nil {
  44. if err == ErrControllerNotActive {
  45. if config.InitCheck != nil {
  46. if skerr := config.InitCheck(s, path, err); skerr != nil {
  47. if skerr != ErrIgnoreSubsystem {
  48. return nil, skerr
  49. }
  50. }
  51. }
  52. continue
  53. }
  54. return nil, err
  55. }
  56. active = append(active, s)
  57. }
  58. return &cgroup{
  59. path: path,
  60. subsystems: active,
  61. }, nil
  62. }
  63. // Load will load an existing cgroup and allow it to be controlled
  64. // All static path should not include `/sys/fs/cgroup/` prefix, it should start with your own cgroups name
  65. func Load(path Path, opts ...InitOpts) (Cgroup, error) {
  66. config := newInitConfig()
  67. for _, o := range opts {
  68. if err := o(config); err != nil {
  69. return nil, err
  70. }
  71. }
  72. var activeSubsystems []Subsystem
  73. subsystems, err := config.hierarchy()
  74. if err != nil {
  75. return nil, err
  76. }
  77. // check that the subsystems still exist, and keep only those that actually exist
  78. for _, s := range pathers(subsystems) {
  79. p, err := path(s.Name())
  80. if err != nil {
  81. if errors.Is(err, os.ErrNotExist) {
  82. return nil, ErrCgroupDeleted
  83. }
  84. if err == ErrControllerNotActive {
  85. if config.InitCheck != nil {
  86. if skerr := config.InitCheck(s, path, err); skerr != nil {
  87. if skerr != ErrIgnoreSubsystem {
  88. return nil, skerr
  89. }
  90. }
  91. }
  92. continue
  93. }
  94. return nil, err
  95. }
  96. if _, err := os.Lstat(s.Path(p)); err != nil {
  97. if os.IsNotExist(err) {
  98. continue
  99. }
  100. return nil, err
  101. }
  102. activeSubsystems = append(activeSubsystems, s)
  103. }
  104. // if we do not have any active systems then the cgroup is deleted
  105. if len(activeSubsystems) == 0 {
  106. return nil, ErrCgroupDeleted
  107. }
  108. return &cgroup{
  109. path: path,
  110. subsystems: activeSubsystems,
  111. }, nil
  112. }
  113. type cgroup struct {
  114. path Path
  115. subsystems []Subsystem
  116. mu sync.Mutex
  117. err error
  118. }
  119. // New returns a new sub cgroup
  120. func (c *cgroup) New(name string, resources *specs.LinuxResources) (Cgroup, error) {
  121. c.mu.Lock()
  122. defer c.mu.Unlock()
  123. if c.err != nil {
  124. return nil, c.err
  125. }
  126. path := subPath(c.path, name)
  127. for _, s := range c.subsystems {
  128. if err := initializeSubsystem(s, path, resources); err != nil {
  129. return nil, err
  130. }
  131. }
  132. return &cgroup{
  133. path: path,
  134. subsystems: c.subsystems,
  135. }, nil
  136. }
  137. // Subsystems returns all the subsystems that are currently being
  138. // consumed by the group
  139. func (c *cgroup) Subsystems() []Subsystem {
  140. return c.subsystems
  141. }
  142. func (c *cgroup) subsystemsFilter(subsystems ...Name) []Subsystem {
  143. if len(subsystems) == 0 {
  144. return c.subsystems
  145. }
  146. filteredSubsystems := []Subsystem{}
  147. for _, s := range c.subsystems {
  148. for _, f := range subsystems {
  149. if s.Name() == f {
  150. filteredSubsystems = append(filteredSubsystems, s)
  151. break
  152. }
  153. }
  154. }
  155. return filteredSubsystems
  156. }
  157. // Add moves the provided process into the new cgroup.
  158. // Without additional arguments, the process is added to all the cgroup subsystems.
  159. // When giving Add a list of subsystem names, the process is only added to those
  160. // subsystems, provided that they are active in the targeted cgroup.
  161. func (c *cgroup) Add(process Process, subsystems ...Name) error {
  162. return c.add(process, cgroupProcs, subsystems...)
  163. }
  164. // AddProc moves the provided process id into the new cgroup.
  165. // Without additional arguments, the process with the given id is added to all
  166. // the cgroup subsystems. When giving AddProc a list of subsystem names, the process
  167. // id is only added to those subsystems, provided that they are active in the targeted
  168. // cgroup.
  169. func (c *cgroup) AddProc(pid uint64, subsystems ...Name) error {
  170. return c.add(Process{Pid: int(pid)}, cgroupProcs, subsystems...)
  171. }
  172. // AddTask moves the provided tasks (threads) into the new cgroup.
  173. // Without additional arguments, the task is added to all the cgroup subsystems.
  174. // When giving AddTask a list of subsystem names, the task is only added to those
  175. // subsystems, provided that they are active in the targeted cgroup.
  176. func (c *cgroup) AddTask(process Process, subsystems ...Name) error {
  177. return c.add(process, cgroupTasks, subsystems...)
  178. }
  179. // writeCgroupsProcs writes to the file, but retries on EINVAL.
  180. func writeCgroupProcs(path string, content []byte, perm fs.FileMode) error {
  181. f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, perm)
  182. if err != nil {
  183. return err
  184. }
  185. defer f.Close()
  186. for i := 0; i < 5; i++ {
  187. _, err = f.Write(content)
  188. if err == nil {
  189. return nil
  190. }
  191. // If the process's associated task's state is TASK_NEW, the kernel
  192. // returns EINVAL. The function will retry on the error like runc.
  193. // https://github.com/torvalds/linux/blob/v6.0/kernel/sched/core.c#L10308-L10337
  194. // https://github.com/opencontainers/runc/pull/1950
  195. if !errors.Is(err, syscall.EINVAL) {
  196. return err
  197. }
  198. time.Sleep(30 * time.Millisecond)
  199. }
  200. return err
  201. }
  202. func (c *cgroup) add(process Process, pType procType, subsystems ...Name) error {
  203. if process.Pid <= 0 {
  204. return ErrInvalidPid
  205. }
  206. c.mu.Lock()
  207. defer c.mu.Unlock()
  208. if c.err != nil {
  209. return c.err
  210. }
  211. for _, s := range pathers(c.subsystemsFilter(subsystems...)) {
  212. p, err := c.path(s.Name())
  213. if err != nil {
  214. return err
  215. }
  216. err = writeCgroupProcs(
  217. filepath.Join(s.Path(p), pType),
  218. []byte(strconv.Itoa(process.Pid)),
  219. defaultFilePerm,
  220. )
  221. if err != nil {
  222. return err
  223. }
  224. }
  225. return nil
  226. }
  227. // Delete will remove the control group from each of the subsystems registered
  228. func (c *cgroup) Delete() error {
  229. c.mu.Lock()
  230. defer c.mu.Unlock()
  231. if c.err != nil {
  232. return c.err
  233. }
  234. var errs []string
  235. for _, s := range c.subsystems {
  236. // kernel prevents cgroups with running process from being removed, check the tree is empty
  237. procs, err := c.processes(s.Name(), true, cgroupProcs)
  238. if err != nil {
  239. // if the control group does not exist within a subsystem, then proceed to the next subsystem
  240. if errors.Is(err, os.ErrNotExist) {
  241. continue
  242. }
  243. return err
  244. }
  245. if len(procs) > 0 {
  246. errs = append(errs, fmt.Sprintf("%s (contains running processes)", string(s.Name())))
  247. continue
  248. }
  249. if d, ok := s.(deleter); ok {
  250. sp, err := c.path(s.Name())
  251. if err != nil {
  252. return err
  253. }
  254. if err := d.Delete(sp); err != nil {
  255. errs = append(errs, string(s.Name()))
  256. }
  257. continue
  258. }
  259. if p, ok := s.(pather); ok {
  260. sp, err := c.path(s.Name())
  261. if err != nil {
  262. return err
  263. }
  264. path := p.Path(sp)
  265. if err := remove(path); err != nil {
  266. errs = append(errs, path)
  267. }
  268. continue
  269. }
  270. }
  271. if len(errs) > 0 {
  272. return fmt.Errorf("cgroups: unable to remove paths %s", strings.Join(errs, ", "))
  273. }
  274. c.err = ErrCgroupDeleted
  275. return nil
  276. }
  277. // Stat returns the current metrics for the cgroup
  278. func (c *cgroup) Stat(handlers ...ErrorHandler) (*v1.Metrics, error) {
  279. c.mu.Lock()
  280. defer c.mu.Unlock()
  281. if c.err != nil {
  282. return nil, c.err
  283. }
  284. if len(handlers) == 0 {
  285. handlers = append(handlers, errPassthrough)
  286. }
  287. var (
  288. stats = &v1.Metrics{
  289. CPU: &v1.CPUStat{
  290. Throttling: &v1.Throttle{},
  291. Usage: &v1.CPUUsage{},
  292. },
  293. }
  294. wg = &sync.WaitGroup{}
  295. errs = make(chan error, len(c.subsystems))
  296. )
  297. for _, s := range c.subsystems {
  298. if ss, ok := s.(stater); ok {
  299. sp, err := c.path(s.Name())
  300. if err != nil {
  301. return nil, err
  302. }
  303. wg.Add(1)
  304. go func() {
  305. defer wg.Done()
  306. if err := ss.Stat(sp, stats); err != nil {
  307. for _, eh := range handlers {
  308. if herr := eh(err); herr != nil {
  309. errs <- herr
  310. }
  311. }
  312. }
  313. }()
  314. }
  315. }
  316. wg.Wait()
  317. close(errs)
  318. for err := range errs {
  319. return nil, err
  320. }
  321. return stats, nil
  322. }
  323. // Update updates the cgroup with the new resource values provided
  324. //
  325. // Be prepared to handle EBUSY when trying to update a cgroup with
  326. // live processes and other operations like Stats being performed at the
  327. // same time
  328. func (c *cgroup) Update(resources *specs.LinuxResources) error {
  329. c.mu.Lock()
  330. defer c.mu.Unlock()
  331. if c.err != nil {
  332. return c.err
  333. }
  334. for _, s := range c.subsystems {
  335. if u, ok := s.(updater); ok {
  336. sp, err := c.path(s.Name())
  337. if err != nil {
  338. return err
  339. }
  340. if err := u.Update(sp, resources); err != nil {
  341. return err
  342. }
  343. }
  344. }
  345. return nil
  346. }
  347. // Processes returns the processes running inside the cgroup along
  348. // with the subsystem used, pid, and path
  349. func (c *cgroup) Processes(subsystem Name, recursive bool) ([]Process, error) {
  350. c.mu.Lock()
  351. defer c.mu.Unlock()
  352. if c.err != nil {
  353. return nil, c.err
  354. }
  355. return c.processes(subsystem, recursive, cgroupProcs)
  356. }
  357. // Tasks returns the tasks running inside the cgroup along
  358. // with the subsystem used, pid, and path
  359. func (c *cgroup) Tasks(subsystem Name, recursive bool) ([]Task, error) {
  360. c.mu.Lock()
  361. defer c.mu.Unlock()
  362. if c.err != nil {
  363. return nil, c.err
  364. }
  365. return c.processes(subsystem, recursive, cgroupTasks)
  366. }
  367. func (c *cgroup) processes(subsystem Name, recursive bool, pType procType) ([]Process, error) {
  368. s := c.getSubsystem(subsystem)
  369. sp, err := c.path(subsystem)
  370. if err != nil {
  371. return nil, err
  372. }
  373. if s == nil {
  374. return nil, fmt.Errorf("cgroups: %s doesn't exist in %s subsystem", sp, subsystem)
  375. }
  376. path := s.(pather).Path(sp)
  377. var processes []Process
  378. err = filepath.Walk(path, func(p string, info os.FileInfo, err error) error {
  379. if err != nil {
  380. return err
  381. }
  382. if !recursive && info.IsDir() {
  383. if p == path {
  384. return nil
  385. }
  386. return filepath.SkipDir
  387. }
  388. dir, name := filepath.Split(p)
  389. if name != pType {
  390. return nil
  391. }
  392. procs, err := readPids(dir, subsystem, pType)
  393. if err != nil {
  394. return err
  395. }
  396. processes = append(processes, procs...)
  397. return nil
  398. })
  399. return processes, err
  400. }
  401. // Freeze freezes the entire cgroup and all the processes inside it
  402. func (c *cgroup) Freeze() error {
  403. c.mu.Lock()
  404. defer c.mu.Unlock()
  405. if c.err != nil {
  406. return c.err
  407. }
  408. s := c.getSubsystem(Freezer)
  409. if s == nil {
  410. return ErrFreezerNotSupported
  411. }
  412. sp, err := c.path(Freezer)
  413. if err != nil {
  414. return err
  415. }
  416. return s.(*freezerController).Freeze(sp)
  417. }
  418. // Thaw thaws out the cgroup and all the processes inside it
  419. func (c *cgroup) Thaw() error {
  420. c.mu.Lock()
  421. defer c.mu.Unlock()
  422. if c.err != nil {
  423. return c.err
  424. }
  425. s := c.getSubsystem(Freezer)
  426. if s == nil {
  427. return ErrFreezerNotSupported
  428. }
  429. sp, err := c.path(Freezer)
  430. if err != nil {
  431. return err
  432. }
  433. return s.(*freezerController).Thaw(sp)
  434. }
  435. // OOMEventFD returns the memory cgroup's out of memory event fd that triggers
  436. // when processes inside the cgroup receive an oom event. Returns
  437. // ErrMemoryNotSupported if memory cgroups is not supported.
  438. func (c *cgroup) OOMEventFD() (uintptr, error) {
  439. c.mu.Lock()
  440. defer c.mu.Unlock()
  441. if c.err != nil {
  442. return 0, c.err
  443. }
  444. s := c.getSubsystem(Memory)
  445. if s == nil {
  446. return 0, ErrMemoryNotSupported
  447. }
  448. sp, err := c.path(Memory)
  449. if err != nil {
  450. return 0, err
  451. }
  452. return s.(*memoryController).memoryEvent(sp, OOMEvent())
  453. }
  454. // RegisterMemoryEvent allows the ability to register for all v1 memory cgroups
  455. // notifications.
  456. func (c *cgroup) RegisterMemoryEvent(event MemoryEvent) (uintptr, error) {
  457. c.mu.Lock()
  458. defer c.mu.Unlock()
  459. if c.err != nil {
  460. return 0, c.err
  461. }
  462. s := c.getSubsystem(Memory)
  463. if s == nil {
  464. return 0, ErrMemoryNotSupported
  465. }
  466. sp, err := c.path(Memory)
  467. if err != nil {
  468. return 0, err
  469. }
  470. return s.(*memoryController).memoryEvent(sp, event)
  471. }
  472. // State returns the state of the cgroup and its processes
  473. func (c *cgroup) State() State {
  474. c.mu.Lock()
  475. defer c.mu.Unlock()
  476. c.checkExists()
  477. if c.err != nil && c.err == ErrCgroupDeleted {
  478. return Deleted
  479. }
  480. s := c.getSubsystem(Freezer)
  481. if s == nil {
  482. return Thawed
  483. }
  484. sp, err := c.path(Freezer)
  485. if err != nil {
  486. return Unknown
  487. }
  488. state, err := s.(*freezerController).state(sp)
  489. if err != nil {
  490. return Unknown
  491. }
  492. return state
  493. }
  494. // MoveTo does a recursive move subsystem by subsystem of all the processes
  495. // inside the group
  496. func (c *cgroup) MoveTo(destination Cgroup) error {
  497. c.mu.Lock()
  498. defer c.mu.Unlock()
  499. if c.err != nil {
  500. return c.err
  501. }
  502. for _, s := range c.subsystems {
  503. processes, err := c.processes(s.Name(), true, cgroupProcs)
  504. if err != nil {
  505. return err
  506. }
  507. for _, p := range processes {
  508. if err := destination.Add(p); err != nil {
  509. if strings.Contains(err.Error(), "no such process") {
  510. continue
  511. }
  512. return err
  513. }
  514. }
  515. }
  516. return nil
  517. }
  518. func (c *cgroup) getSubsystem(n Name) Subsystem {
  519. for _, s := range c.subsystems {
  520. if s.Name() == n {
  521. return s
  522. }
  523. }
  524. return nil
  525. }
  526. func (c *cgroup) checkExists() {
  527. for _, s := range pathers(c.subsystems) {
  528. p, err := c.path(s.Name())
  529. if err != nil {
  530. return
  531. }
  532. if _, err := os.Lstat(s.Path(p)); err != nil {
  533. if os.IsNotExist(err) {
  534. c.err = ErrCgroupDeleted
  535. return
  536. }
  537. }
  538. }
  539. }