apply_systemd.go 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. // +build linux
  2. package systemd
  3. import (
  4. "bytes"
  5. "fmt"
  6. "io/ioutil"
  7. "os"
  8. "path/filepath"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "time"
  13. systemd "github.com/coreos/go-systemd/dbus"
  14. "github.com/docker/libcontainer/cgroups"
  15. "github.com/docker/libcontainer/cgroups/fs"
  16. "github.com/godbus/dbus"
  17. )
  18. type systemdCgroup struct {
  19. cgroup *cgroups.Cgroup
  20. }
  21. type subsystem interface {
  22. GetStats(string, *cgroups.Stats) error
  23. }
  24. var (
  25. connLock sync.Mutex
  26. theConn *systemd.Conn
  27. hasStartTransientUnit bool
  28. )
  29. func newProp(name string, units interface{}) systemd.Property {
  30. return systemd.Property{
  31. Name: name,
  32. Value: dbus.MakeVariant(units),
  33. }
  34. }
  35. func UseSystemd() bool {
  36. s, err := os.Stat("/run/systemd/system")
  37. if err != nil || !s.IsDir() {
  38. return false
  39. }
  40. connLock.Lock()
  41. defer connLock.Unlock()
  42. if theConn == nil {
  43. var err error
  44. theConn, err = systemd.New()
  45. if err != nil {
  46. return false
  47. }
  48. // Assume we have StartTransientUnit
  49. hasStartTransientUnit = true
  50. // But if we get UnknownMethod error we don't
  51. if _, err := theConn.StartTransientUnit("test.scope", "invalid"); err != nil {
  52. if dbusError, ok := err.(dbus.Error); ok {
  53. if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" {
  54. hasStartTransientUnit = false
  55. }
  56. }
  57. }
  58. }
  59. return hasStartTransientUnit
  60. }
  // getIfaceForUnit maps a unit name to an interface name based on its
  // suffix: ".scope" gives "Scope", ".service" gives "Service", and anything
  // else gives "Unit".
  func getIfaceForUnit(unitName string) string {
  	switch {
  	case strings.HasSuffix(unitName, ".scope"):
  		return "Scope"
  	case strings.HasSuffix(unitName, ".service"):
  		return "Service"
  	default:
  		return "Unit"
  	}
  }
  70. func Apply(c *cgroups.Cgroup, pid int) (map[string]string, error) {
  71. var (
  72. unitName = getUnitName(c)
  73. slice = "system.slice"
  74. properties []systemd.Property
  75. res = &systemdCgroup{}
  76. )
  77. res.cgroup = c
  78. if c.Slice != "" {
  79. slice = c.Slice
  80. }
  81. properties = append(properties,
  82. systemd.PropSlice(slice),
  83. systemd.PropDescription("docker container "+c.Name),
  84. newProp("PIDs", []uint32{uint32(pid)}),
  85. )
  86. // Always enable accounting, this gets us the same behaviour as the fs implementation,
  87. // plus the kernel has some problems with joining the memory cgroup at a later time.
  88. properties = append(properties,
  89. newProp("MemoryAccounting", true),
  90. newProp("CPUAccounting", true),
  91. newProp("BlockIOAccounting", true))
  92. if c.Memory != 0 {
  93. properties = append(properties,
  94. newProp("MemoryLimit", uint64(c.Memory)))
  95. }
  96. // TODO: MemoryReservation and MemorySwap not available in systemd
  97. if c.CpuShares != 0 {
  98. properties = append(properties,
  99. newProp("CPUShares", uint64(c.CpuShares)))
  100. }
  101. if _, err := theConn.StartTransientUnit(unitName, "replace", properties...); err != nil {
  102. return nil, err
  103. }
  104. if !c.AllowAllDevices {
  105. if err := joinDevices(c, pid); err != nil {
  106. return nil, err
  107. }
  108. }
  109. // -1 disables memorySwap
  110. if c.MemorySwap >= 0 && (c.Memory != 0 || c.MemorySwap > 0) {
  111. if err := joinMemory(c, pid); err != nil {
  112. return nil, err
  113. }
  114. }
  115. // we need to manually join the freezer and cpuset cgroup in systemd
  116. // because it does not currently support it via the dbus api.
  117. if err := joinFreezer(c, pid); err != nil {
  118. return nil, err
  119. }
  120. if err := joinCpuset(c, pid); err != nil {
  121. return nil, err
  122. }
  123. paths := make(map[string]string)
  124. for _, sysname := range []string{
  125. "devices",
  126. "memory",
  127. "cpu",
  128. "cpuset",
  129. "cpuacct",
  130. "blkio",
  131. "perf_event",
  132. "freezer",
  133. } {
  134. subsystemPath, err := getSubsystemPath(res.cgroup, sysname)
  135. if err != nil {
  136. // Don't fail if a cgroup hierarchy was not found, just skip this subsystem
  137. if cgroups.IsNotFound(err) {
  138. continue
  139. }
  140. return nil, err
  141. }
  142. paths[sysname] = subsystemPath
  143. }
  144. return paths, nil
  145. }
  // writeFile writes data to the file named file inside dir. The file is
  // created with mode 0700 if it does not exist and truncated otherwise.
  func writeFile(dir, file, data string) error {
  	target := filepath.Join(dir, file)
  	payload := []byte(data)
  	return ioutil.WriteFile(target, payload, 0700)
  }
  149. func joinFreezer(c *cgroups.Cgroup, pid int) error {
  150. path, err := getSubsystemPath(c, "freezer")
  151. if err != nil {
  152. return err
  153. }
  154. if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
  155. return err
  156. }
  157. return ioutil.WriteFile(filepath.Join(path, "cgroup.procs"), []byte(strconv.Itoa(pid)), 0700)
  158. }
  159. func getSubsystemPath(c *cgroups.Cgroup, subsystem string) (string, error) {
  160. mountpoint, err := cgroups.FindCgroupMountpoint(subsystem)
  161. if err != nil {
  162. return "", err
  163. }
  164. initPath, err := cgroups.GetInitCgroupDir(subsystem)
  165. if err != nil {
  166. return "", err
  167. }
  168. slice := "system.slice"
  169. if c.Slice != "" {
  170. slice = c.Slice
  171. }
  172. return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
  173. }
  174. func Freeze(c *cgroups.Cgroup, state cgroups.FreezerState) error {
  175. path, err := getSubsystemPath(c, "freezer")
  176. if err != nil {
  177. return err
  178. }
  179. if err := ioutil.WriteFile(filepath.Join(path, "freezer.state"), []byte(state), 0); err != nil {
  180. return err
  181. }
  182. for {
  183. state_, err := ioutil.ReadFile(filepath.Join(path, "freezer.state"))
  184. if err != nil {
  185. return err
  186. }
  187. if string(state) == string(bytes.TrimSpace(state_)) {
  188. break
  189. }
  190. time.Sleep(1 * time.Millisecond)
  191. }
  192. return nil
  193. }
  194. func GetPids(c *cgroups.Cgroup) ([]int, error) {
  195. path, err := getSubsystemPath(c, "cpu")
  196. if err != nil {
  197. return nil, err
  198. }
  199. return cgroups.ReadProcsFile(path)
  200. }
  201. func getUnitName(c *cgroups.Cgroup) string {
  202. return fmt.Sprintf("%s-%s.scope", c.Parent, c.Name)
  203. }
  204. // Atm we can't use the systemd device support because of two missing things:
  205. // * Support for wildcards to allow mknod on any device
  206. // * Support for wildcards to allow /dev/pts support
  207. //
  208. // The second is available in more recent systemd as "char-pts", but not in e.g. v208 which is
  209. // in wide use. When both these are availalable we will be able to switch, but need to keep the old
  210. // implementation for backwards compat.
  211. //
  212. // Note: we can't use systemd to set up the initial limits, and then change the cgroup
  213. // because systemd will re-write the device settings if it needs to re-apply the cgroup context.
  214. // This happens at least for v208 when any sibling unit is started.
  215. func joinDevices(c *cgroups.Cgroup, pid int) error {
  216. path, err := getSubsystemPath(c, "devices")
  217. if err != nil {
  218. return err
  219. }
  220. if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
  221. return err
  222. }
  223. if err := ioutil.WriteFile(filepath.Join(path, "cgroup.procs"), []byte(strconv.Itoa(pid)), 0700); err != nil {
  224. return err
  225. }
  226. if err := writeFile(path, "devices.deny", "a"); err != nil {
  227. return err
  228. }
  229. for _, dev := range c.AllowedDevices {
  230. if err := writeFile(path, "devices.allow", dev.GetCgroupAllowString()); err != nil {
  231. return err
  232. }
  233. }
  234. return nil
  235. }
  236. // Symmetrical public function to update device based cgroups. Also available
  237. // in the fs implementation.
  238. func ApplyDevices(c *cgroups.Cgroup, pid int) error {
  239. return joinDevices(c, pid)
  240. }
  241. func joinMemory(c *cgroups.Cgroup, pid int) error {
  242. memorySwap := c.MemorySwap
  243. if memorySwap == 0 {
  244. // By default, MemorySwap is set to twice the size of RAM.
  245. memorySwap = c.Memory * 2
  246. }
  247. path, err := getSubsystemPath(c, "memory")
  248. if err != nil {
  249. return err
  250. }
  251. return ioutil.WriteFile(filepath.Join(path, "memory.memsw.limit_in_bytes"), []byte(strconv.FormatInt(memorySwap, 10)), 0700)
  252. }
  253. // systemd does not atm set up the cpuset controller, so we must manually
  254. // join it. Additionally that is a very finicky controller where each
  255. // level must have a full setup as the default for a new directory is "no cpus"
  256. func joinCpuset(c *cgroups.Cgroup, pid int) error {
  257. path, err := getSubsystemPath(c, "cpuset")
  258. if err != nil {
  259. return err
  260. }
  261. s := &fs.CpusetGroup{}
  262. return s.SetDir(path, c.CpusetCpus, c.CpusetMems, pid)
  263. }