apply_systemd.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. // +build linux
  2. package systemd
  3. import (
  4. "fmt"
  5. "io/ioutil"
  6. "os"
  7. "path/filepath"
  8. "strconv"
  9. "strings"
  10. "sync"
  11. systemd1 "github.com/coreos/go-systemd/dbus"
  12. "github.com/dotcloud/docker/pkg/libcontainer/cgroups"
  13. "github.com/dotcloud/docker/pkg/systemd"
  14. "github.com/godbus/dbus"
  15. )
  16. type systemdCgroup struct {
  17. cleanupDirs []string
  18. }
  19. type DeviceAllow struct {
  20. Node string
  21. Permissions string
  22. }
  23. var (
  24. connLock sync.Mutex
  25. theConn *systemd1.Conn
  26. hasStartTransientUnit bool
  27. )
  28. func UseSystemd() bool {
  29. if !systemd.SdBooted() {
  30. return false
  31. }
  32. connLock.Lock()
  33. defer connLock.Unlock()
  34. if theConn == nil {
  35. var err error
  36. theConn, err = systemd1.New()
  37. if err != nil {
  38. return false
  39. }
  40. // Assume we have StartTransientUnit
  41. hasStartTransientUnit = true
  42. // But if we get UnknownMethod error we don't
  43. if _, err := theConn.StartTransientUnit("test.scope", "invalid"); err != nil {
  44. if dbusError, ok := err.(dbus.Error); ok {
  45. if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" {
  46. hasStartTransientUnit = false
  47. }
  48. }
  49. }
  50. }
  51. return hasStartTransientUnit
  52. }
  53. func getIfaceForUnit(unitName string) string {
  54. if strings.HasSuffix(unitName, ".scope") {
  55. return "Scope"
  56. }
  57. if strings.HasSuffix(unitName, ".service") {
  58. return "Service"
  59. }
  60. return "Unit"
  61. }
  62. type cgroupArg struct {
  63. File string
  64. Value string
  65. }
  66. func Apply(c *cgroups.Cgroup, pid int) (cgroups.ActiveCgroup, error) {
  67. var (
  68. unitName = getUnitName(c)
  69. slice = "system.slice"
  70. properties []systemd1.Property
  71. cpuArgs []cgroupArg
  72. cpusetArgs []cgroupArg
  73. memoryArgs []cgroupArg
  74. res systemdCgroup
  75. )
  76. // First set up things not supported by systemd
  77. // -1 disables memorySwap
  78. if c.MemorySwap >= 0 && (c.Memory != 0 || c.MemorySwap > 0) {
  79. memorySwap := c.MemorySwap
  80. if memorySwap == 0 {
  81. // By default, MemorySwap is set to twice the size of RAM.
  82. memorySwap = c.Memory * 2
  83. }
  84. memoryArgs = append(memoryArgs, cgroupArg{"memory.memsw.limit_in_bytes", strconv.FormatInt(memorySwap, 10)})
  85. }
  86. if c.CpusetCpus != "" {
  87. cpusetArgs = append(cpusetArgs, cgroupArg{"cpuset.cpus", c.CpusetCpus})
  88. }
  89. if c.Slice != "" {
  90. slice = c.Slice
  91. }
  92. properties = append(properties,
  93. systemd1.Property{"Slice", dbus.MakeVariant(slice)},
  94. systemd1.Property{"Description", dbus.MakeVariant("docker container " + c.Name)},
  95. systemd1.Property{"PIDs", dbus.MakeVariant([]uint32{uint32(pid)})},
  96. )
  97. if !c.DeviceAccess {
  98. properties = append(properties,
  99. systemd1.Property{"DevicePolicy", dbus.MakeVariant("strict")},
  100. systemd1.Property{"DeviceAllow", dbus.MakeVariant([]DeviceAllow{
  101. {"/dev/null", "rwm"},
  102. {"/dev/zero", "rwm"},
  103. {"/dev/full", "rwm"},
  104. {"/dev/random", "rwm"},
  105. {"/dev/urandom", "rwm"},
  106. {"/dev/tty", "rwm"},
  107. {"/dev/console", "rwm"},
  108. {"/dev/tty0", "rwm"},
  109. {"/dev/tty1", "rwm"},
  110. {"/dev/pts/ptmx", "rwm"},
  111. // There is no way to add /dev/pts/* here atm, so we hack this manually below
  112. // /dev/pts/* (how to add this?)
  113. // Same with tuntap, which doesn't exist as a node most of the time
  114. })})
  115. }
  116. // Always enable accounting, this gets us the same behaviour as the fs implementation,
  117. // plus the kernel has some problems with joining the memory cgroup at a later time.
  118. properties = append(properties,
  119. systemd1.Property{"MemoryAccounting", dbus.MakeVariant(true)},
  120. systemd1.Property{"CPUAccounting", dbus.MakeVariant(true)},
  121. systemd1.Property{"BlockIOAccounting", dbus.MakeVariant(true)})
  122. if c.Memory != 0 {
  123. properties = append(properties,
  124. systemd1.Property{"MemoryLimit", dbus.MakeVariant(uint64(c.Memory))})
  125. }
  126. // TODO: MemoryReservation and MemorySwap not available in systemd
  127. if c.CpuShares != 0 {
  128. properties = append(properties,
  129. systemd1.Property{"CPUShares", dbus.MakeVariant(uint64(c.CpuShares))})
  130. }
  131. if _, err := theConn.StartTransientUnit(unitName, "replace", properties...); err != nil {
  132. return nil, err
  133. }
  134. // To work around the lack of /dev/pts/* support above we need to manually add these
  135. // so, ask systemd for the cgroup used
  136. props, err := theConn.GetUnitTypeProperties(unitName, getIfaceForUnit(unitName))
  137. if err != nil {
  138. return nil, err
  139. }
  140. cgroup := props["ControlGroup"].(string)
  141. if !c.DeviceAccess {
  142. mountpoint, err := cgroups.FindCgroupMountpoint("devices")
  143. if err != nil {
  144. return nil, err
  145. }
  146. path := filepath.Join(mountpoint, cgroup)
  147. allow := []string{
  148. // allow mknod for any device
  149. "c *:* m",
  150. "b *:* m",
  151. // /dev/pts/ - pts namespaces are "coming soon"
  152. "c 136:* rwm",
  153. // tuntap
  154. "c 10:200 rwm",
  155. }
  156. for _, val := range allow {
  157. if err := ioutil.WriteFile(filepath.Join(path, "devices.allow"), []byte(val), 0700); err != nil {
  158. return nil, err
  159. }
  160. }
  161. }
  162. if len(cpuArgs) != 0 {
  163. mountpoint, err := cgroups.FindCgroupMountpoint("cpu")
  164. if err != nil {
  165. return nil, err
  166. }
  167. path := filepath.Join(mountpoint, cgroup)
  168. for _, arg := range cpuArgs {
  169. if err := ioutil.WriteFile(filepath.Join(path, arg.File), []byte(arg.Value), 0700); err != nil {
  170. return nil, err
  171. }
  172. }
  173. }
  174. if len(memoryArgs) != 0 {
  175. mountpoint, err := cgroups.FindCgroupMountpoint("memory")
  176. if err != nil {
  177. return nil, err
  178. }
  179. path := filepath.Join(mountpoint, cgroup)
  180. for _, arg := range memoryArgs {
  181. if err := ioutil.WriteFile(filepath.Join(path, arg.File), []byte(arg.Value), 0700); err != nil {
  182. return nil, err
  183. }
  184. }
  185. }
  186. if len(cpusetArgs) != 0 {
  187. // systemd does not atm set up the cpuset controller, so we must manually
  188. // join it. Additionally that is a very finicky controller where each
  189. // level must have a full setup as the default for a new directory is "no cpus",
  190. // so we avoid using any hierarchies here, creating a toplevel directory.
  191. mountpoint, err := cgroups.FindCgroupMountpoint("cpuset")
  192. if err != nil {
  193. return nil, err
  194. }
  195. initPath, err := cgroups.GetInitCgroupDir("cpuset")
  196. if err != nil {
  197. return nil, err
  198. }
  199. rootPath := filepath.Join(mountpoint, initPath)
  200. path := filepath.Join(mountpoint, initPath, c.Parent+"-"+c.Name)
  201. res.cleanupDirs = append(res.cleanupDirs, path)
  202. if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) {
  203. return nil, err
  204. }
  205. foundCpus := false
  206. foundMems := false
  207. for _, arg := range cpusetArgs {
  208. if arg.File == "cpuset.cpus" {
  209. foundCpus = true
  210. }
  211. if arg.File == "cpuset.mems" {
  212. foundMems = true
  213. }
  214. if err := ioutil.WriteFile(filepath.Join(path, arg.File), []byte(arg.Value), 0700); err != nil {
  215. return nil, err
  216. }
  217. }
  218. // These are required, if not specified inherit from parent
  219. if !foundCpus {
  220. s, err := ioutil.ReadFile(filepath.Join(rootPath, "cpuset.cpus"))
  221. if err != nil {
  222. return nil, err
  223. }
  224. if err := ioutil.WriteFile(filepath.Join(path, "cpuset.cpus"), s, 0700); err != nil {
  225. return nil, err
  226. }
  227. }
  228. // These are required, if not specified inherit from parent
  229. if !foundMems {
  230. s, err := ioutil.ReadFile(filepath.Join(rootPath, "cpuset.mems"))
  231. if err != nil {
  232. return nil, err
  233. }
  234. if err := ioutil.WriteFile(filepath.Join(path, "cpuset.mems"), s, 0700); err != nil {
  235. return nil, err
  236. }
  237. }
  238. if err := ioutil.WriteFile(filepath.Join(path, "cgroup.procs"), []byte(strconv.Itoa(pid)), 0700); err != nil {
  239. return nil, err
  240. }
  241. }
  242. return &res, nil
  243. }
  244. func (c *systemdCgroup) Cleanup() error {
  245. // systemd cleans up, we don't need to do much
  246. for _, path := range c.cleanupDirs {
  247. os.RemoveAll(path)
  248. }
  249. return nil
  250. }
  251. func GetPids(c *cgroups.Cgroup) ([]int, error) {
  252. unitName := getUnitName(c)
  253. mountpoint, err := cgroups.FindCgroupMountpoint("cpu")
  254. if err != nil {
  255. return nil, err
  256. }
  257. props, err := theConn.GetUnitTypeProperties(unitName, getIfaceForUnit(unitName))
  258. if err != nil {
  259. return nil, err
  260. }
  261. cgroup := props["ControlGroup"].(string)
  262. return cgroups.ReadProcsFile(filepath.Join(mountpoint, cgroup))
  263. }
  264. func getUnitName(c *cgroups.Cgroup) string {
  265. return fmt.Sprintf("%s-%s.scope", c.Parent, c.Name)
  266. }