create.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. // +build linux,cgo
  2. package native
  3. import (
  4. "fmt"
  5. "path/filepath"
  6. "strings"
  7. "syscall"
  8. "github.com/docker/docker/daemon/execdriver"
  9. "github.com/docker/docker/pkg/mount"
  10. "github.com/docker/docker/profiles/seccomp"
  11. "github.com/docker/docker/volume"
  12. "github.com/opencontainers/runc/libcontainer/apparmor"
  13. "github.com/opencontainers/runc/libcontainer/configs"
  14. "github.com/opencontainers/runc/libcontainer/devices"
  15. )
  16. // createContainer populates and configures the container type with the
  17. // data provided by the execdriver.Command
  18. func (d *Driver) createContainer(c *execdriver.Command, hooks execdriver.Hooks) (container *configs.Config, err error) {
  19. container = execdriver.InitContainer(c)
  20. if err := d.createIpc(container, c); err != nil {
  21. return nil, err
  22. }
  23. if err := d.createPid(container, c); err != nil {
  24. return nil, err
  25. }
  26. if err := d.createUTS(container, c); err != nil {
  27. return nil, err
  28. }
  29. if err := d.setupRemappedRoot(container, c); err != nil {
  30. return nil, err
  31. }
  32. if err := d.createNetwork(container, c, hooks); err != nil {
  33. return nil, err
  34. }
  35. if c.ProcessConfig.Privileged {
  36. if !container.Readonlyfs {
  37. // clear readonly for /sys
  38. for i := range container.Mounts {
  39. if container.Mounts[i].Destination == "/sys" {
  40. container.Mounts[i].Flags &= ^syscall.MS_RDONLY
  41. }
  42. }
  43. container.ReadonlyPaths = nil
  44. }
  45. // clear readonly for cgroup
  46. for i := range container.Mounts {
  47. if container.Mounts[i].Device == "cgroup" {
  48. container.Mounts[i].Flags &= ^syscall.MS_RDONLY
  49. }
  50. }
  51. container.MaskPaths = nil
  52. if err := d.setPrivileged(container); err != nil {
  53. return nil, err
  54. }
  55. } else {
  56. if err := d.setCapabilities(container, c); err != nil {
  57. return nil, err
  58. }
  59. if c.SeccompProfile == "" {
  60. container.Seccomp, err = seccomp.GetDefaultProfile()
  61. if err != nil {
  62. return nil, err
  63. }
  64. }
  65. }
  66. // add CAP_ prefix to all caps for new libcontainer update to match
  67. // the spec format.
  68. for i, s := range container.Capabilities {
  69. if !strings.HasPrefix(s, "CAP_") {
  70. container.Capabilities[i] = fmt.Sprintf("CAP_%s", s)
  71. }
  72. }
  73. container.AdditionalGroups = c.GroupAdd
  74. if c.AppArmorProfile != "" {
  75. container.AppArmorProfile = c.AppArmorProfile
  76. }
  77. if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" {
  78. container.Seccomp, err = seccomp.LoadProfile(c.SeccompProfile)
  79. if err != nil {
  80. return nil, err
  81. }
  82. }
  83. if err := execdriver.SetupCgroups(container, c); err != nil {
  84. return nil, err
  85. }
  86. container.OomScoreAdj = c.OomScoreAdj
  87. if container.Readonlyfs {
  88. for i := range container.Mounts {
  89. switch container.Mounts[i].Destination {
  90. case "/proc", "/dev", "/dev/pts", "/dev/mqueue":
  91. continue
  92. }
  93. container.Mounts[i].Flags |= syscall.MS_RDONLY
  94. }
  95. /* These paths must be remounted as r/o */
  96. container.ReadonlyPaths = append(container.ReadonlyPaths, "/dev")
  97. }
  98. if err := d.setupMounts(container, c); err != nil {
  99. return nil, err
  100. }
  101. d.setupLabels(container, c)
  102. d.setupRlimits(container, c)
  103. container.NoNewPrivileges = c.NoNewPrivileges
  104. return container, nil
  105. }
  106. func (d *Driver) createNetwork(container *configs.Config, c *execdriver.Command, hooks execdriver.Hooks) error {
  107. if c.Network == nil {
  108. return nil
  109. }
  110. if c.Network.ContainerID != "" {
  111. d.Lock()
  112. active := d.activeContainers[c.Network.ContainerID]
  113. d.Unlock()
  114. if active == nil {
  115. return fmt.Errorf("%s is not a valid running container to join", c.Network.ContainerID)
  116. }
  117. state, err := active.State()
  118. if err != nil {
  119. return err
  120. }
  121. container.Namespaces.Add(configs.NEWNET, state.NamespacePaths[configs.NEWNET])
  122. return nil
  123. }
  124. if c.Network.NamespacePath != "" {
  125. container.Namespaces.Add(configs.NEWNET, c.Network.NamespacePath)
  126. return nil
  127. }
  128. // only set up prestart hook if the namespace path is not set (this should be
  129. // all cases *except* for --net=host shared networking)
  130. container.Hooks = &configs.Hooks{
  131. Prestart: []configs.Hook{
  132. configs.NewFunctionHook(func(s configs.HookState) error {
  133. if len(hooks.PreStart) > 0 {
  134. for _, fnHook := range hooks.PreStart {
  135. // A closed channel for OOM is returned here as it will be
  136. // non-blocking and return the correct result when read.
  137. chOOM := make(chan struct{})
  138. close(chOOM)
  139. if err := fnHook(&c.ProcessConfig, s.Pid, chOOM); err != nil {
  140. return err
  141. }
  142. }
  143. }
  144. return nil
  145. }),
  146. },
  147. }
  148. return nil
  149. }
  150. func (d *Driver) createIpc(container *configs.Config, c *execdriver.Command) error {
  151. if c.Ipc.HostIpc {
  152. container.Namespaces.Remove(configs.NEWIPC)
  153. return nil
  154. }
  155. if c.Ipc.ContainerID != "" {
  156. d.Lock()
  157. active := d.activeContainers[c.Ipc.ContainerID]
  158. d.Unlock()
  159. if active == nil {
  160. return fmt.Errorf("%s is not a valid running container to join", c.Ipc.ContainerID)
  161. }
  162. state, err := active.State()
  163. if err != nil {
  164. return err
  165. }
  166. container.Namespaces.Add(configs.NEWIPC, state.NamespacePaths[configs.NEWIPC])
  167. }
  168. return nil
  169. }
  170. func (d *Driver) createPid(container *configs.Config, c *execdriver.Command) error {
  171. if c.Pid.HostPid {
  172. container.Namespaces.Remove(configs.NEWPID)
  173. return nil
  174. }
  175. return nil
  176. }
  177. func (d *Driver) createUTS(container *configs.Config, c *execdriver.Command) error {
  178. if c.UTS.HostUTS {
  179. container.Namespaces.Remove(configs.NEWUTS)
  180. container.Hostname = ""
  181. return nil
  182. }
  183. return nil
  184. }
  185. func (d *Driver) setupRemappedRoot(container *configs.Config, c *execdriver.Command) error {
  186. if c.RemappedRoot.UID == 0 {
  187. container.Namespaces.Remove(configs.NEWUSER)
  188. return nil
  189. }
  190. // convert the Docker daemon id map to the libcontainer variant of the same struct
  191. // this keeps us from having to import libcontainer code across Docker client + daemon packages
  192. cuidMaps := []configs.IDMap{}
  193. cgidMaps := []configs.IDMap{}
  194. for _, idMap := range c.UIDMapping {
  195. cuidMaps = append(cuidMaps, configs.IDMap(idMap))
  196. }
  197. for _, idMap := range c.GIDMapping {
  198. cgidMaps = append(cgidMaps, configs.IDMap(idMap))
  199. }
  200. container.UidMappings = cuidMaps
  201. container.GidMappings = cgidMaps
  202. for _, node := range container.Devices {
  203. node.Uid = uint32(c.RemappedRoot.UID)
  204. node.Gid = uint32(c.RemappedRoot.GID)
  205. }
  206. // TODO: until a kernel/mount solution exists for handling remount in a user namespace,
  207. // we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
  208. for i := range container.Mounts {
  209. if container.Mounts[i].Device == "cgroup" {
  210. container.Mounts[i].Flags &= ^syscall.MS_RDONLY
  211. }
  212. }
  213. return nil
  214. }
  215. func (d *Driver) setPrivileged(container *configs.Config) (err error) {
  216. container.Capabilities = execdriver.GetAllCapabilities()
  217. container.Cgroups.Resources.AllowAllDevices = true
  218. hostDevices, err := devices.HostDevices()
  219. if err != nil {
  220. return err
  221. }
  222. container.Devices = hostDevices
  223. if apparmor.IsEnabled() {
  224. container.AppArmorProfile = "unconfined"
  225. }
  226. return nil
  227. }
  228. func (d *Driver) setCapabilities(container *configs.Config, c *execdriver.Command) (err error) {
  229. container.Capabilities, err = execdriver.TweakCapabilities(container.Capabilities, c.CapAdd, c.CapDrop)
  230. return err
  231. }
  232. func (d *Driver) setupRlimits(container *configs.Config, c *execdriver.Command) {
  233. if c.Resources == nil {
  234. return
  235. }
  236. for _, rlimit := range c.Resources.Rlimits {
  237. container.Rlimits = append(container.Rlimits, configs.Rlimit{
  238. Type: rlimit.Type,
  239. Hard: rlimit.Hard,
  240. Soft: rlimit.Soft,
  241. })
  242. }
  243. }
  244. // If rootfs mount propagation is RPRIVATE, that means all the volumes are
  245. // going to be private anyway. There is no need to apply per volume
  246. // propagation on top. This is just an optimization so that cost of per volume
  247. // propagation is paid only if user decides to make some volume non-private
  248. // which will force rootfs mount propagation to be non RPRIVATE.
  249. func checkResetVolumePropagation(container *configs.Config) {
  250. if container.RootPropagation != mount.RPRIVATE {
  251. return
  252. }
  253. for _, m := range container.Mounts {
  254. m.PropagationFlags = nil
  255. }
  256. }
  257. func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
  258. for _, m := range mountinfo {
  259. if m.Mountpoint == dir {
  260. return m
  261. }
  262. }
  263. return nil
  264. }
  265. // Get the source mount point of directory passed in as argument. Also return
  266. // optional fields.
  267. func getSourceMount(source string) (string, string, error) {
  268. // Ensure any symlinks are resolved.
  269. sourcePath, err := filepath.EvalSymlinks(source)
  270. if err != nil {
  271. return "", "", err
  272. }
  273. mountinfos, err := mount.GetMounts()
  274. if err != nil {
  275. return "", "", err
  276. }
  277. mountinfo := getMountInfo(mountinfos, sourcePath)
  278. if mountinfo != nil {
  279. return sourcePath, mountinfo.Optional, nil
  280. }
  281. path := sourcePath
  282. for {
  283. path = filepath.Dir(path)
  284. mountinfo = getMountInfo(mountinfos, path)
  285. if mountinfo != nil {
  286. return path, mountinfo.Optional, nil
  287. }
  288. if path == "/" {
  289. break
  290. }
  291. }
  292. // If we are here, we did not find parent mount. Something is wrong.
  293. return "", "", fmt.Errorf("Could not find source mount of %s", source)
  294. }
  295. // Ensure mount point on which path is mounted, is shared.
  296. func ensureShared(path string) error {
  297. sharedMount := false
  298. sourceMount, optionalOpts, err := getSourceMount(path)
  299. if err != nil {
  300. return err
  301. }
  302. // Make sure source mount point is shared.
  303. optsSplit := strings.Split(optionalOpts, " ")
  304. for _, opt := range optsSplit {
  305. if strings.HasPrefix(opt, "shared:") {
  306. sharedMount = true
  307. break
  308. }
  309. }
  310. if !sharedMount {
  311. return fmt.Errorf("Path %s is mounted on %s but it is not a shared mount.", path, sourceMount)
  312. }
  313. return nil
  314. }
  315. // Ensure mount point on which path is mounted, is either shared or slave.
  316. func ensureSharedOrSlave(path string) error {
  317. sharedMount := false
  318. slaveMount := false
  319. sourceMount, optionalOpts, err := getSourceMount(path)
  320. if err != nil {
  321. return err
  322. }
  323. // Make sure source mount point is shared.
  324. optsSplit := strings.Split(optionalOpts, " ")
  325. for _, opt := range optsSplit {
  326. if strings.HasPrefix(opt, "shared:") {
  327. sharedMount = true
  328. break
  329. } else if strings.HasPrefix(opt, "master:") {
  330. slaveMount = true
  331. break
  332. }
  333. }
  334. if !sharedMount && !slaveMount {
  335. return fmt.Errorf("Path %s is mounted on %s but it is not a shared or slave mount.", path, sourceMount)
  336. }
  337. return nil
  338. }
  339. func (d *Driver) setupMounts(container *configs.Config, c *execdriver.Command) error {
  340. userMounts := make(map[string]struct{})
  341. for _, m := range c.Mounts {
  342. userMounts[m.Destination] = struct{}{}
  343. }
  344. // Filter out mounts that are overridden by user supplied mounts
  345. var defaultMounts []*configs.Mount
  346. _, mountDev := userMounts["/dev"]
  347. for _, m := range container.Mounts {
  348. if _, ok := userMounts[m.Destination]; !ok {
  349. if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
  350. container.Devices = nil
  351. continue
  352. }
  353. defaultMounts = append(defaultMounts, m)
  354. }
  355. }
  356. container.Mounts = defaultMounts
  357. mountPropagationMap := map[string]int{
  358. "private": mount.PRIVATE,
  359. "rprivate": mount.RPRIVATE,
  360. "shared": mount.SHARED,
  361. "rshared": mount.RSHARED,
  362. "slave": mount.SLAVE,
  363. "rslave": mount.RSLAVE,
  364. }
  365. for _, m := range c.Mounts {
  366. for _, cm := range container.Mounts {
  367. if cm.Destination == m.Destination {
  368. return fmt.Errorf("Duplicate mount point '%s'", m.Destination)
  369. }
  370. }
  371. if m.Source == "tmpfs" {
  372. var (
  373. data = "size=65536k"
  374. flags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
  375. err error
  376. )
  377. if m.Data != "" {
  378. flags, data, err = mount.ParseTmpfsOptions(m.Data)
  379. if err != nil {
  380. return err
  381. }
  382. }
  383. container.Mounts = append(container.Mounts, &configs.Mount{
  384. Source: m.Source,
  385. Destination: m.Destination,
  386. Data: data,
  387. Device: "tmpfs",
  388. Flags: flags,
  389. PropagationFlags: []int{mountPropagationMap[volume.DefaultPropagationMode]},
  390. })
  391. continue
  392. }
  393. flags := syscall.MS_BIND | syscall.MS_REC
  394. var pFlag int
  395. if !m.Writable {
  396. flags |= syscall.MS_RDONLY
  397. }
  398. // Determine property of RootPropagation based on volume
  399. // properties. If a volume is shared, then keep root propagation
  400. // shared. This should work for slave and private volumes too.
  401. //
  402. // For slave volumes, it can be either [r]shared/[r]slave.
  403. //
  404. // For private volumes any root propagation value should work.
  405. pFlag = mountPropagationMap[m.Propagation]
  406. if pFlag == mount.SHARED || pFlag == mount.RSHARED {
  407. if err := ensureShared(m.Source); err != nil {
  408. return err
  409. }
  410. rootpg := container.RootPropagation
  411. if rootpg != mount.SHARED && rootpg != mount.RSHARED {
  412. execdriver.SetRootPropagation(container, mount.SHARED)
  413. }
  414. } else if pFlag == mount.SLAVE || pFlag == mount.RSLAVE {
  415. if err := ensureSharedOrSlave(m.Source); err != nil {
  416. return err
  417. }
  418. rootpg := container.RootPropagation
  419. if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
  420. execdriver.SetRootPropagation(container, mount.RSLAVE)
  421. }
  422. }
  423. mount := &configs.Mount{
  424. Source: m.Source,
  425. Destination: m.Destination,
  426. Device: "bind",
  427. Flags: flags,
  428. }
  429. if pFlag != 0 {
  430. mount.PropagationFlags = []int{pFlag}
  431. }
  432. container.Mounts = append(container.Mounts, mount)
  433. }
  434. checkResetVolumePropagation(container)
  435. return nil
  436. }
  437. func (d *Driver) setupLabels(container *configs.Config, c *execdriver.Command) {
  438. container.ProcessLabel = c.ProcessLabel
  439. container.MountLabel = c.MountLabel
  440. }