create.go 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314
  1. // +build linux,cgo
  2. package native
  3. import (
  4. "fmt"
  5. "strings"
  6. "syscall"
  7. "github.com/docker/docker/daemon/execdriver"
  8. "github.com/opencontainers/runc/libcontainer/apparmor"
  9. "github.com/opencontainers/runc/libcontainer/configs"
  10. "github.com/opencontainers/runc/libcontainer/devices"
  11. )
  12. // createContainer populates and configures the container type with the
  13. // data provided by the execdriver.Command
  14. func (d *Driver) createContainer(c *execdriver.Command, hooks execdriver.Hooks) (*configs.Config, error) {
  15. container := execdriver.InitContainer(c)
  16. if err := d.createIpc(container, c); err != nil {
  17. return nil, err
  18. }
  19. if err := d.createPid(container, c); err != nil {
  20. return nil, err
  21. }
  22. if err := d.createUTS(container, c); err != nil {
  23. return nil, err
  24. }
  25. if err := d.setupRemappedRoot(container, c); err != nil {
  26. return nil, err
  27. }
  28. if err := d.createNetwork(container, c, hooks); err != nil {
  29. return nil, err
  30. }
  31. if c.ProcessConfig.Privileged {
  32. if !container.Readonlyfs {
  33. // clear readonly for /sys
  34. for i := range container.Mounts {
  35. if container.Mounts[i].Destination == "/sys" {
  36. container.Mounts[i].Flags &= ^syscall.MS_RDONLY
  37. }
  38. }
  39. container.ReadonlyPaths = nil
  40. }
  41. // clear readonly for cgroup
  42. for i := range container.Mounts {
  43. if container.Mounts[i].Device == "cgroup" {
  44. container.Mounts[i].Flags &= ^syscall.MS_RDONLY
  45. }
  46. }
  47. container.MaskPaths = nil
  48. if err := d.setPrivileged(container); err != nil {
  49. return nil, err
  50. }
  51. } else {
  52. if err := d.setCapabilities(container, c); err != nil {
  53. return nil, err
  54. }
  55. }
  56. // add CAP_ prefix to all caps for new libcontainer update to match
  57. // the spec format.
  58. for i, s := range container.Capabilities {
  59. if !strings.HasPrefix(s, "CAP_") {
  60. container.Capabilities[i] = fmt.Sprintf("CAP_%s", s)
  61. }
  62. }
  63. container.AdditionalGroups = c.GroupAdd
  64. if c.AppArmorProfile != "" {
  65. container.AppArmorProfile = c.AppArmorProfile
  66. }
  67. if err := execdriver.SetupCgroups(container, c); err != nil {
  68. return nil, err
  69. }
  70. container.OomScoreAdj = c.OomScoreAdj
  71. if container.Readonlyfs {
  72. for i := range container.Mounts {
  73. switch container.Mounts[i].Destination {
  74. case "/proc", "/dev", "/dev/pts":
  75. continue
  76. }
  77. container.Mounts[i].Flags |= syscall.MS_RDONLY
  78. }
  79. /* These paths must be remounted as r/o */
  80. container.ReadonlyPaths = append(container.ReadonlyPaths, "/dev")
  81. }
  82. if err := d.setupMounts(container, c); err != nil {
  83. return nil, err
  84. }
  85. d.setupLabels(container, c)
  86. d.setupRlimits(container, c)
  87. return container, nil
  88. }
  89. func (d *Driver) createNetwork(container *configs.Config, c *execdriver.Command, hooks execdriver.Hooks) error {
  90. if c.Network == nil {
  91. return nil
  92. }
  93. if c.Network.ContainerID != "" {
  94. d.Lock()
  95. active := d.activeContainers[c.Network.ContainerID]
  96. d.Unlock()
  97. if active == nil {
  98. return fmt.Errorf("%s is not a valid running container to join", c.Network.ContainerID)
  99. }
  100. state, err := active.State()
  101. if err != nil {
  102. return err
  103. }
  104. container.Namespaces.Add(configs.NEWNET, state.NamespacePaths[configs.NEWNET])
  105. return nil
  106. }
  107. if c.Network.NamespacePath != "" {
  108. container.Namespaces.Add(configs.NEWNET, c.Network.NamespacePath)
  109. return nil
  110. }
  111. // only set up prestart hook if the namespace path is not set (this should be
  112. // all cases *except* for --net=host shared networking)
  113. container.Hooks = &configs.Hooks{
  114. Prestart: []configs.Hook{
  115. configs.NewFunctionHook(func(s configs.HookState) error {
  116. if len(hooks.PreStart) > 0 {
  117. for _, fnHook := range hooks.PreStart {
  118. // A closed channel for OOM is returned here as it will be
  119. // non-blocking and return the correct result when read.
  120. chOOM := make(chan struct{})
  121. close(chOOM)
  122. if err := fnHook(&c.ProcessConfig, s.Pid, chOOM); err != nil {
  123. return err
  124. }
  125. }
  126. }
  127. return nil
  128. }),
  129. },
  130. }
  131. return nil
  132. }
  133. func (d *Driver) createIpc(container *configs.Config, c *execdriver.Command) error {
  134. if c.Ipc.HostIpc {
  135. container.Namespaces.Remove(configs.NEWIPC)
  136. return nil
  137. }
  138. if c.Ipc.ContainerID != "" {
  139. d.Lock()
  140. active := d.activeContainers[c.Ipc.ContainerID]
  141. d.Unlock()
  142. if active == nil {
  143. return fmt.Errorf("%s is not a valid running container to join", c.Ipc.ContainerID)
  144. }
  145. state, err := active.State()
  146. if err != nil {
  147. return err
  148. }
  149. container.Namespaces.Add(configs.NEWIPC, state.NamespacePaths[configs.NEWIPC])
  150. }
  151. return nil
  152. }
  153. func (d *Driver) createPid(container *configs.Config, c *execdriver.Command) error {
  154. if c.Pid.HostPid {
  155. container.Namespaces.Remove(configs.NEWPID)
  156. return nil
  157. }
  158. return nil
  159. }
  160. func (d *Driver) createUTS(container *configs.Config, c *execdriver.Command) error {
  161. if c.UTS.HostUTS {
  162. container.Namespaces.Remove(configs.NEWUTS)
  163. container.Hostname = ""
  164. return nil
  165. }
  166. return nil
  167. }
  168. func (d *Driver) setupRemappedRoot(container *configs.Config, c *execdriver.Command) error {
  169. if c.RemappedRoot.UID == 0 {
  170. container.Namespaces.Remove(configs.NEWUSER)
  171. return nil
  172. }
  173. // convert the Docker daemon id map to the libcontainer variant of the same struct
  174. // this keeps us from having to import libcontainer code across Docker client + daemon packages
  175. cuidMaps := []configs.IDMap{}
  176. cgidMaps := []configs.IDMap{}
  177. for _, idMap := range c.UIDMapping {
  178. cuidMaps = append(cuidMaps, configs.IDMap(idMap))
  179. }
  180. for _, idMap := range c.GIDMapping {
  181. cgidMaps = append(cgidMaps, configs.IDMap(idMap))
  182. }
  183. container.UidMappings = cuidMaps
  184. container.GidMappings = cgidMaps
  185. for _, node := range container.Devices {
  186. node.Uid = uint32(c.RemappedRoot.UID)
  187. node.Gid = uint32(c.RemappedRoot.GID)
  188. }
  189. // TODO: until a kernel/mount solution exists for handling remount in a user namespace,
  190. // we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
  191. for i := range container.Mounts {
  192. if container.Mounts[i].Device == "cgroup" {
  193. container.Mounts[i].Flags &= ^syscall.MS_RDONLY
  194. }
  195. }
  196. return nil
  197. }
  198. func (d *Driver) setPrivileged(container *configs.Config) (err error) {
  199. container.Capabilities = execdriver.GetAllCapabilities()
  200. container.Cgroups.AllowAllDevices = true
  201. hostDevices, err := devices.HostDevices()
  202. if err != nil {
  203. return err
  204. }
  205. container.Devices = hostDevices
  206. if apparmor.IsEnabled() {
  207. container.AppArmorProfile = "unconfined"
  208. }
  209. return nil
  210. }
  211. func (d *Driver) setCapabilities(container *configs.Config, c *execdriver.Command) (err error) {
  212. container.Capabilities, err = execdriver.TweakCapabilities(container.Capabilities, c.CapAdd, c.CapDrop)
  213. return err
  214. }
  215. func (d *Driver) setupRlimits(container *configs.Config, c *execdriver.Command) {
  216. if c.Resources == nil {
  217. return
  218. }
  219. for _, rlimit := range c.Resources.Rlimits {
  220. container.Rlimits = append(container.Rlimits, configs.Rlimit{
  221. Type: rlimit.Type,
  222. Hard: rlimit.Hard,
  223. Soft: rlimit.Soft,
  224. })
  225. }
  226. }
  227. func (d *Driver) setupMounts(container *configs.Config, c *execdriver.Command) error {
  228. userMounts := make(map[string]struct{})
  229. for _, m := range c.Mounts {
  230. userMounts[m.Destination] = struct{}{}
  231. }
  232. // Filter out mounts that are overriden by user supplied mounts
  233. var defaultMounts []*configs.Mount
  234. _, mountDev := userMounts["/dev"]
  235. for _, m := range container.Mounts {
  236. if _, ok := userMounts[m.Destination]; !ok {
  237. if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
  238. container.Devices = nil
  239. continue
  240. }
  241. defaultMounts = append(defaultMounts, m)
  242. }
  243. }
  244. container.Mounts = defaultMounts
  245. for _, m := range c.Mounts {
  246. flags := syscall.MS_BIND | syscall.MS_REC
  247. if !m.Writable {
  248. flags |= syscall.MS_RDONLY
  249. }
  250. if m.Slave {
  251. flags |= syscall.MS_SLAVE
  252. }
  253. container.Mounts = append(container.Mounts, &configs.Mount{
  254. Source: m.Source,
  255. Destination: m.Destination,
  256. Device: "bind",
  257. Flags: flags,
  258. })
  259. }
  260. return nil
  261. }
  262. func (d *Driver) setupLabels(container *configs.Config, c *execdriver.Command) {
  263. container.ProcessLabel = c.ProcessLabel
  264. container.MountLabel = c.MountLabel
  265. }