oci_linux.go 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925
  1. package daemon
  2. import (
  3. "fmt"
  4. "io"
  5. "os"
  6. "os/exec"
  7. "path/filepath"
  8. "regexp"
  9. "sort"
  10. "strconv"
  11. "strings"
  12. containertypes "github.com/docker/docker/api/types/container"
  13. "github.com/docker/docker/container"
  14. "github.com/docker/docker/daemon/caps"
  15. daemonconfig "github.com/docker/docker/daemon/config"
  16. "github.com/docker/docker/oci"
  17. "github.com/docker/docker/pkg/idtools"
  18. "github.com/docker/docker/pkg/mount"
  19. "github.com/docker/docker/volume"
  20. "github.com/opencontainers/runc/libcontainer/apparmor"
  21. "github.com/opencontainers/runc/libcontainer/cgroups"
  22. "github.com/opencontainers/runc/libcontainer/devices"
  23. "github.com/opencontainers/runc/libcontainer/user"
  24. specs "github.com/opencontainers/runtime-spec/specs-go"
  25. "github.com/sirupsen/logrus"
  26. "golang.org/x/sys/unix"
  27. )
  28. // nolint: gosimple
  29. var (
  30. deviceCgroupRuleRegex = regexp.MustCompile("^([acb]) ([0-9]+|\\*):([0-9]+|\\*) ([rwm]{1,3})$")
  31. )
  32. func setResources(s *specs.Spec, r containertypes.Resources) error {
  33. weightDevices, err := getBlkioWeightDevices(r)
  34. if err != nil {
  35. return err
  36. }
  37. readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
  38. if err != nil {
  39. return err
  40. }
  41. writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
  42. if err != nil {
  43. return err
  44. }
  45. readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
  46. if err != nil {
  47. return err
  48. }
  49. writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
  50. if err != nil {
  51. return err
  52. }
  53. memoryRes := getMemoryResources(r)
  54. cpuRes, err := getCPUResources(r)
  55. if err != nil {
  56. return err
  57. }
  58. blkioWeight := r.BlkioWeight
  59. specResources := &specs.LinuxResources{
  60. Memory: memoryRes,
  61. CPU: cpuRes,
  62. BlockIO: &specs.LinuxBlockIO{
  63. Weight: &blkioWeight,
  64. WeightDevice: weightDevices,
  65. ThrottleReadBpsDevice: readBpsDevice,
  66. ThrottleWriteBpsDevice: writeBpsDevice,
  67. ThrottleReadIOPSDevice: readIOpsDevice,
  68. ThrottleWriteIOPSDevice: writeIOpsDevice,
  69. },
  70. Pids: &specs.LinuxPids{
  71. Limit: r.PidsLimit,
  72. },
  73. }
  74. if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
  75. specResources.Devices = s.Linux.Resources.Devices
  76. }
  77. s.Linux.Resources = specResources
  78. return nil
  79. }
  80. func setDevices(s *specs.Spec, c *container.Container) error {
  81. // Build lists of devices allowed and created within the container.
  82. var devs []specs.LinuxDevice
  83. devPermissions := s.Linux.Resources.Devices
  84. if c.HostConfig.Privileged {
  85. hostDevices, err := devices.HostDevices()
  86. if err != nil {
  87. return err
  88. }
  89. for _, d := range hostDevices {
  90. devs = append(devs, oci.Device(d))
  91. }
  92. devPermissions = []specs.LinuxDeviceCgroup{
  93. {
  94. Allow: true,
  95. Access: "rwm",
  96. },
  97. }
  98. } else {
  99. for _, deviceMapping := range c.HostConfig.Devices {
  100. d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
  101. if err != nil {
  102. return err
  103. }
  104. devs = append(devs, d...)
  105. devPermissions = append(devPermissions, dPermissions...)
  106. }
  107. for _, deviceCgroupRule := range c.HostConfig.DeviceCgroupRules {
  108. ss := deviceCgroupRuleRegex.FindAllStringSubmatch(deviceCgroupRule, -1)
  109. if len(ss[0]) != 5 {
  110. return fmt.Errorf("invalid device cgroup rule format: '%s'", deviceCgroupRule)
  111. }
  112. matches := ss[0]
  113. dPermissions := specs.LinuxDeviceCgroup{
  114. Allow: true,
  115. Type: matches[1],
  116. Access: matches[4],
  117. }
  118. if matches[2] == "*" {
  119. major := int64(-1)
  120. dPermissions.Major = &major
  121. } else {
  122. major, err := strconv.ParseInt(matches[2], 10, 64)
  123. if err != nil {
  124. return fmt.Errorf("invalid major value in device cgroup rule format: '%s'", deviceCgroupRule)
  125. }
  126. dPermissions.Major = &major
  127. }
  128. if matches[3] == "*" {
  129. minor := int64(-1)
  130. dPermissions.Minor = &minor
  131. } else {
  132. minor, err := strconv.ParseInt(matches[3], 10, 64)
  133. if err != nil {
  134. return fmt.Errorf("invalid minor value in device cgroup rule format: '%s'", deviceCgroupRule)
  135. }
  136. dPermissions.Minor = &minor
  137. }
  138. devPermissions = append(devPermissions, dPermissions)
  139. }
  140. }
  141. s.Linux.Devices = append(s.Linux.Devices, devs...)
  142. s.Linux.Resources.Devices = devPermissions
  143. return nil
  144. }
  145. func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error {
  146. var rlimits []specs.POSIXRlimit
  147. // We want to leave the original HostConfig alone so make a copy here
  148. hostConfig := *c.HostConfig
  149. // Merge with the daemon defaults
  150. daemon.mergeUlimits(&hostConfig)
  151. for _, ul := range hostConfig.Ulimits {
  152. rlimits = append(rlimits, specs.POSIXRlimit{
  153. Type: "RLIMIT_" + strings.ToUpper(ul.Name),
  154. Soft: uint64(ul.Soft),
  155. Hard: uint64(ul.Hard),
  156. })
  157. }
  158. s.Process.Rlimits = rlimits
  159. return nil
  160. }
  161. func setUser(s *specs.Spec, c *container.Container) error {
  162. uid, gid, additionalGids, err := getUser(c, c.Config.User)
  163. if err != nil {
  164. return err
  165. }
  166. s.Process.User.UID = uid
  167. s.Process.User.GID = gid
  168. s.Process.User.AdditionalGids = additionalGids
  169. return nil
  170. }
  171. func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
  172. fp, err := c.GetResourcePath(p)
  173. if err != nil {
  174. return nil, err
  175. }
  176. return os.Open(fp)
  177. }
  178. func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
  179. passwdPath, err := user.GetPasswdPath()
  180. if err != nil {
  181. return 0, 0, nil, err
  182. }
  183. groupPath, err := user.GetGroupPath()
  184. if err != nil {
  185. return 0, 0, nil, err
  186. }
  187. passwdFile, err := readUserFile(c, passwdPath)
  188. if err == nil {
  189. defer passwdFile.Close()
  190. }
  191. groupFile, err := readUserFile(c, groupPath)
  192. if err == nil {
  193. defer groupFile.Close()
  194. }
  195. execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
  196. if err != nil {
  197. return 0, 0, nil, err
  198. }
  199. // todo: fix this double read by a change to libcontainer/user pkg
  200. groupFile, err = readUserFile(c, groupPath)
  201. if err == nil {
  202. defer groupFile.Close()
  203. }
  204. var addGroups []int
  205. if len(c.HostConfig.GroupAdd) > 0 {
  206. addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
  207. if err != nil {
  208. return 0, 0, nil, err
  209. }
  210. }
  211. uid := uint32(execUser.Uid)
  212. gid := uint32(execUser.Gid)
  213. sgids := append(execUser.Sgids, addGroups...)
  214. var additionalGids []uint32
  215. for _, g := range sgids {
  216. additionalGids = append(additionalGids, uint32(g))
  217. }
  218. return uid, gid, additionalGids, nil
  219. }
  220. func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
  221. for i, n := range s.Linux.Namespaces {
  222. if n.Type == ns.Type {
  223. s.Linux.Namespaces[i] = ns
  224. return
  225. }
  226. }
  227. s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
  228. }
  229. func setCapabilities(s *specs.Spec, c *container.Container) error {
  230. var caplist []string
  231. var err error
  232. if c.HostConfig.Privileged {
  233. caplist = caps.GetAllCapabilities()
  234. } else {
  235. caplist, err = caps.TweakCapabilities(s.Process.Capabilities.Effective, c.HostConfig.CapAdd, c.HostConfig.CapDrop)
  236. if err != nil {
  237. return err
  238. }
  239. }
  240. s.Process.Capabilities.Effective = caplist
  241. s.Process.Capabilities.Bounding = caplist
  242. s.Process.Capabilities.Permitted = caplist
  243. s.Process.Capabilities.Inheritable = caplist
  244. return nil
  245. }
  246. func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error {
  247. userNS := false
  248. // user
  249. if c.HostConfig.UsernsMode.IsPrivate() {
  250. uidMap := daemon.idMappings.UIDs()
  251. if uidMap != nil {
  252. userNS = true
  253. ns := specs.LinuxNamespace{Type: "user"}
  254. setNamespace(s, ns)
  255. s.Linux.UIDMappings = specMapping(uidMap)
  256. s.Linux.GIDMappings = specMapping(daemon.idMappings.GIDs())
  257. }
  258. }
  259. // network
  260. if !c.Config.NetworkDisabled {
  261. ns := specs.LinuxNamespace{Type: "network"}
  262. parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
  263. if parts[0] == "container" {
  264. nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
  265. if err != nil {
  266. return err
  267. }
  268. ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
  269. if userNS {
  270. // to share a net namespace, they must also share a user namespace
  271. nsUser := specs.LinuxNamespace{Type: "user"}
  272. nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
  273. setNamespace(s, nsUser)
  274. }
  275. } else if c.HostConfig.NetworkMode.IsHost() {
  276. ns.Path = c.NetworkSettings.SandboxKey
  277. }
  278. setNamespace(s, ns)
  279. }
  280. // ipc
  281. ipcMode := c.HostConfig.IpcMode
  282. switch {
  283. case ipcMode.IsContainer():
  284. ns := specs.LinuxNamespace{Type: "ipc"}
  285. ic, err := daemon.getIpcContainer(ipcMode.Container())
  286. if err != nil {
  287. return err
  288. }
  289. ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
  290. setNamespace(s, ns)
  291. if userNS {
  292. // to share an IPC namespace, they must also share a user namespace
  293. nsUser := specs.LinuxNamespace{Type: "user"}
  294. nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
  295. setNamespace(s, nsUser)
  296. }
  297. case ipcMode.IsHost():
  298. oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
  299. case ipcMode.IsEmpty():
  300. // A container was created by an older version of the daemon.
  301. // The default behavior used to be what is now called "shareable".
  302. fallthrough
  303. case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
  304. ns := specs.LinuxNamespace{Type: "ipc"}
  305. setNamespace(s, ns)
  306. default:
  307. return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
  308. }
  309. // pid
  310. if c.HostConfig.PidMode.IsContainer() {
  311. ns := specs.LinuxNamespace{Type: "pid"}
  312. pc, err := daemon.getPidContainer(c)
  313. if err != nil {
  314. return err
  315. }
  316. ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
  317. setNamespace(s, ns)
  318. if userNS {
  319. // to share a PID namespace, they must also share a user namespace
  320. nsUser := specs.LinuxNamespace{Type: "user"}
  321. nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
  322. setNamespace(s, nsUser)
  323. }
  324. } else if c.HostConfig.PidMode.IsHost() {
  325. oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
  326. } else {
  327. ns := specs.LinuxNamespace{Type: "pid"}
  328. setNamespace(s, ns)
  329. }
  330. // uts
  331. if c.HostConfig.UTSMode.IsHost() {
  332. oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
  333. s.Hostname = ""
  334. }
  335. return nil
  336. }
  337. func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
  338. var ids []specs.LinuxIDMapping
  339. for _, item := range s {
  340. ids = append(ids, specs.LinuxIDMapping{
  341. HostID: uint32(item.HostID),
  342. ContainerID: uint32(item.ContainerID),
  343. Size: uint32(item.Size),
  344. })
  345. }
  346. return ids
  347. }
  348. func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
  349. for _, m := range mountinfo {
  350. if m.Mountpoint == dir {
  351. return m
  352. }
  353. }
  354. return nil
  355. }
  356. // Get the source mount point of directory passed in as argument. Also return
  357. // optional fields.
  358. func getSourceMount(source string) (string, string, error) {
  359. // Ensure any symlinks are resolved.
  360. sourcePath, err := filepath.EvalSymlinks(source)
  361. if err != nil {
  362. return "", "", err
  363. }
  364. mountinfos, err := mount.GetMounts()
  365. if err != nil {
  366. return "", "", err
  367. }
  368. mountinfo := getMountInfo(mountinfos, sourcePath)
  369. if mountinfo != nil {
  370. return sourcePath, mountinfo.Optional, nil
  371. }
  372. path := sourcePath
  373. for {
  374. path = filepath.Dir(path)
  375. mountinfo = getMountInfo(mountinfos, path)
  376. if mountinfo != nil {
  377. return path, mountinfo.Optional, nil
  378. }
  379. if path == "/" {
  380. break
  381. }
  382. }
  383. // If we are here, we did not find parent mount. Something is wrong.
  384. return "", "", fmt.Errorf("Could not find source mount of %s", source)
  385. }
  386. // Ensure mount point on which path is mounted, is shared.
  387. func ensureShared(path string) error {
  388. sharedMount := false
  389. sourceMount, optionalOpts, err := getSourceMount(path)
  390. if err != nil {
  391. return err
  392. }
  393. // Make sure source mount point is shared.
  394. optsSplit := strings.Split(optionalOpts, " ")
  395. for _, opt := range optsSplit {
  396. if strings.HasPrefix(opt, "shared:") {
  397. sharedMount = true
  398. break
  399. }
  400. }
  401. if !sharedMount {
  402. return fmt.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
  403. }
  404. return nil
  405. }
  406. // Ensure mount point on which path is mounted, is either shared or slave.
  407. func ensureSharedOrSlave(path string) error {
  408. sharedMount := false
  409. slaveMount := false
  410. sourceMount, optionalOpts, err := getSourceMount(path)
  411. if err != nil {
  412. return err
  413. }
  414. // Make sure source mount point is shared.
  415. optsSplit := strings.Split(optionalOpts, " ")
  416. for _, opt := range optsSplit {
  417. if strings.HasPrefix(opt, "shared:") {
  418. sharedMount = true
  419. break
  420. } else if strings.HasPrefix(opt, "master:") {
  421. slaveMount = true
  422. break
  423. }
  424. }
  425. if !sharedMount && !slaveMount {
  426. return fmt.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
  427. }
  428. return nil
  429. }
  430. // Get the set of mount flags that are set on the mount that contains the given
  431. // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
  432. // bind-mounting "with options" will not fail with user namespaces, due to
  433. // kernel restrictions that require user namespace mounts to preserve
  434. // CL_UNPRIVILEGED locked flags.
  435. func getUnprivilegedMountFlags(path string) ([]string, error) {
  436. var statfs unix.Statfs_t
  437. if err := unix.Statfs(path, &statfs); err != nil {
  438. return nil, err
  439. }
  440. // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
  441. unprivilegedFlags := map[uint64]string{
  442. unix.MS_RDONLY: "ro",
  443. unix.MS_NODEV: "nodev",
  444. unix.MS_NOEXEC: "noexec",
  445. unix.MS_NOSUID: "nosuid",
  446. unix.MS_NOATIME: "noatime",
  447. unix.MS_RELATIME: "relatime",
  448. unix.MS_NODIRATIME: "nodiratime",
  449. }
  450. var flags []string
  451. for mask, flag := range unprivilegedFlags {
  452. if uint64(statfs.Flags)&mask == mask {
  453. flags = append(flags, flag)
  454. }
  455. }
  456. return flags, nil
  457. }
  458. var (
  459. mountPropagationMap = map[string]int{
  460. "private": mount.PRIVATE,
  461. "rprivate": mount.RPRIVATE,
  462. "shared": mount.SHARED,
  463. "rshared": mount.RSHARED,
  464. "slave": mount.SLAVE,
  465. "rslave": mount.RSLAVE,
  466. }
  467. mountPropagationReverseMap = map[int]string{
  468. mount.PRIVATE: "private",
  469. mount.RPRIVATE: "rprivate",
  470. mount.SHARED: "shared",
  471. mount.RSHARED: "rshared",
  472. mount.SLAVE: "slave",
  473. mount.RSLAVE: "rslave",
  474. }
  475. )
  476. // inSlice tests whether a string is contained in a slice of strings or not.
  477. // Comparison is case sensitive
  478. func inSlice(slice []string, s string) bool {
  479. for _, ss := range slice {
  480. if s == ss {
  481. return true
  482. }
  483. }
  484. return false
  485. }
  486. func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
  487. userMounts := make(map[string]struct{})
  488. for _, m := range mounts {
  489. userMounts[m.Destination] = struct{}{}
  490. }
  491. // Copy all mounts from spec to defaultMounts, except for
  492. // - mounts overriden by a user supplied mount;
  493. // - all mounts under /dev if a user supplied /dev is present;
  494. // - /dev/shm, in case IpcMode is none.
  495. // While at it, also
  496. // - set size for /dev/shm from shmsize.
  497. var defaultMounts []specs.Mount
  498. _, mountDev := userMounts["/dev"]
  499. for _, m := range s.Mounts {
  500. if _, ok := userMounts[m.Destination]; ok {
  501. // filter out mount overridden by a user supplied mount
  502. continue
  503. }
  504. if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
  505. // filter out everything under /dev if /dev is user-mounted
  506. continue
  507. }
  508. if m.Destination == "/dev/shm" {
  509. if c.HostConfig.IpcMode.IsNone() {
  510. // filter out /dev/shm for "none" IpcMode
  511. continue
  512. }
  513. // set size for /dev/shm mount from spec
  514. sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
  515. m.Options = append(m.Options, sizeOpt)
  516. }
  517. defaultMounts = append(defaultMounts, m)
  518. }
  519. s.Mounts = defaultMounts
  520. for _, m := range mounts {
  521. for _, cm := range s.Mounts {
  522. if cm.Destination == m.Destination {
  523. return duplicateMountPointError(m.Destination)
  524. }
  525. }
  526. if m.Source == "tmpfs" {
  527. data := m.Data
  528. parser := volume.NewParser("linux")
  529. options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
  530. if data != "" {
  531. options = append(options, strings.Split(data, ",")...)
  532. }
  533. merged, err := mount.MergeTmpfsOptions(options)
  534. if err != nil {
  535. return err
  536. }
  537. s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
  538. continue
  539. }
  540. mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
  541. // Determine property of RootPropagation based on volume
  542. // properties. If a volume is shared, then keep root propagation
  543. // shared. This should work for slave and private volumes too.
  544. //
  545. // For slave volumes, it can be either [r]shared/[r]slave.
  546. //
  547. // For private volumes any root propagation value should work.
  548. pFlag := mountPropagationMap[m.Propagation]
  549. if pFlag == mount.SHARED || pFlag == mount.RSHARED {
  550. if err := ensureShared(m.Source); err != nil {
  551. return err
  552. }
  553. rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  554. if rootpg != mount.SHARED && rootpg != mount.RSHARED {
  555. s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
  556. }
  557. } else if pFlag == mount.SLAVE || pFlag == mount.RSLAVE {
  558. if err := ensureSharedOrSlave(m.Source); err != nil {
  559. return err
  560. }
  561. rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  562. if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
  563. s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
  564. }
  565. }
  566. opts := []string{"rbind"}
  567. if !m.Writable {
  568. opts = append(opts, "ro")
  569. }
  570. if pFlag != 0 {
  571. opts = append(opts, mountPropagationReverseMap[pFlag])
  572. }
  573. // If we are using user namespaces, then we must make sure that we
  574. // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
  575. // "mount" when we bind-mount. The reason for this is that at the point
  576. // when runc sets up the root filesystem, it is already inside a user
  577. // namespace, and thus cannot change any flags that are locked.
  578. if daemon.configStore.RemappedRoot != "" {
  579. unprivOpts, err := getUnprivilegedMountFlags(m.Source)
  580. if err != nil {
  581. return err
  582. }
  583. opts = append(opts, unprivOpts...)
  584. }
  585. mt.Options = opts
  586. s.Mounts = append(s.Mounts, mt)
  587. }
  588. if s.Root.Readonly {
  589. for i, m := range s.Mounts {
  590. switch m.Destination {
  591. case "/proc", "/dev/pts", "/dev/mqueue", "/dev":
  592. continue
  593. }
  594. if _, ok := userMounts[m.Destination]; !ok {
  595. if !inSlice(m.Options, "ro") {
  596. s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
  597. }
  598. }
  599. }
  600. }
  601. if c.HostConfig.Privileged {
  602. if !s.Root.Readonly {
  603. // clear readonly for /sys
  604. for i := range s.Mounts {
  605. if s.Mounts[i].Destination == "/sys" {
  606. clearReadOnly(&s.Mounts[i])
  607. }
  608. }
  609. }
  610. s.Linux.ReadonlyPaths = nil
  611. s.Linux.MaskedPaths = nil
  612. }
  613. // TODO: until a kernel/mount solution exists for handling remount in a user namespace,
  614. // we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
  615. if uidMap := daemon.idMappings.UIDs(); uidMap != nil || c.HostConfig.Privileged {
  616. for i, m := range s.Mounts {
  617. if m.Type == "cgroup" {
  618. clearReadOnly(&s.Mounts[i])
  619. }
  620. }
  621. }
  622. return nil
  623. }
  624. func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error {
  625. linkedEnv, err := daemon.setupLinkedContainers(c)
  626. if err != nil {
  627. return err
  628. }
  629. s.Root = &specs.Root{
  630. Path: c.BaseFS.Path(),
  631. Readonly: c.HostConfig.ReadonlyRootfs,
  632. }
  633. if err := c.SetupWorkingDirectory(daemon.idMappings.RootPair()); err != nil {
  634. return err
  635. }
  636. cwd := c.Config.WorkingDir
  637. if len(cwd) == 0 {
  638. cwd = "/"
  639. }
  640. s.Process.Args = append([]string{c.Path}, c.Args...)
  641. // only add the custom init if it is specified and the container is running in its
  642. // own private pid namespace. It does not make sense to add if it is running in the
  643. // host namespace or another container's pid namespace where we already have an init
  644. if c.HostConfig.PidMode.IsPrivate() {
  645. if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
  646. (c.HostConfig.Init == nil && daemon.configStore.Init) {
  647. s.Process.Args = append([]string{"/dev/init", "--", c.Path}, c.Args...)
  648. var path string
  649. if daemon.configStore.InitPath == "" {
  650. path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
  651. if err != nil {
  652. return err
  653. }
  654. }
  655. if daemon.configStore.InitPath != "" {
  656. path = daemon.configStore.InitPath
  657. }
  658. s.Mounts = append(s.Mounts, specs.Mount{
  659. Destination: "/dev/init",
  660. Type: "bind",
  661. Source: path,
  662. Options: []string{"bind", "ro"},
  663. })
  664. }
  665. }
  666. s.Process.Cwd = cwd
  667. s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
  668. s.Process.Terminal = c.Config.Tty
  669. s.Hostname = c.FullHostname()
  670. return nil
  671. }
  672. func (daemon *Daemon) createSpec(c *container.Container) (*specs.Spec, error) {
  673. s := oci.DefaultSpec()
  674. if err := daemon.populateCommonSpec(&s, c); err != nil {
  675. return nil, err
  676. }
  677. var cgroupsPath string
  678. scopePrefix := "docker"
  679. parent := "/docker"
  680. useSystemd := UsingSystemd(daemon.configStore)
  681. if useSystemd {
  682. parent = "system.slice"
  683. }
  684. if c.HostConfig.CgroupParent != "" {
  685. parent = c.HostConfig.CgroupParent
  686. } else if daemon.configStore.CgroupParent != "" {
  687. parent = daemon.configStore.CgroupParent
  688. }
  689. if useSystemd {
  690. cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
  691. logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
  692. } else {
  693. cgroupsPath = filepath.Join(parent, c.ID)
  694. }
  695. s.Linux.CgroupsPath = cgroupsPath
  696. if err := setResources(&s, c.HostConfig.Resources); err != nil {
  697. return nil, fmt.Errorf("linux runtime spec resources: %v", err)
  698. }
  699. s.Linux.Sysctl = c.HostConfig.Sysctls
  700. p := s.Linux.CgroupsPath
  701. if useSystemd {
  702. initPath, err := cgroups.GetInitCgroup("cpu")
  703. if err != nil {
  704. return nil, err
  705. }
  706. _, err = cgroups.GetOwnCgroup("cpu")
  707. if err != nil {
  708. return nil, err
  709. }
  710. p = filepath.Join(initPath, s.Linux.CgroupsPath)
  711. }
  712. // Clean path to guard against things like ../../../BAD
  713. parentPath := filepath.Dir(p)
  714. if !filepath.IsAbs(parentPath) {
  715. parentPath = filepath.Clean("/" + parentPath)
  716. }
  717. if err := daemon.initCgroupsPath(parentPath); err != nil {
  718. return nil, fmt.Errorf("linux init cgroups path: %v", err)
  719. }
  720. if err := setDevices(&s, c); err != nil {
  721. return nil, fmt.Errorf("linux runtime spec devices: %v", err)
  722. }
  723. if err := daemon.setRlimits(&s, c); err != nil {
  724. return nil, fmt.Errorf("linux runtime spec rlimits: %v", err)
  725. }
  726. if err := setUser(&s, c); err != nil {
  727. return nil, fmt.Errorf("linux spec user: %v", err)
  728. }
  729. if err := setNamespaces(daemon, &s, c); err != nil {
  730. return nil, fmt.Errorf("linux spec namespaces: %v", err)
  731. }
  732. if err := setCapabilities(&s, c); err != nil {
  733. return nil, fmt.Errorf("linux spec capabilities: %v", err)
  734. }
  735. if err := setSeccomp(daemon, &s, c); err != nil {
  736. return nil, fmt.Errorf("linux seccomp: %v", err)
  737. }
  738. if err := daemon.setupIpcDirs(c); err != nil {
  739. return nil, err
  740. }
  741. if err := daemon.setupSecretDir(c); err != nil {
  742. return nil, err
  743. }
  744. if err := daemon.setupConfigDir(c); err != nil {
  745. return nil, err
  746. }
  747. ms, err := daemon.setupMounts(c)
  748. if err != nil {
  749. return nil, err
  750. }
  751. if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
  752. ms = append(ms, c.IpcMounts()...)
  753. }
  754. tmpfsMounts, err := c.TmpfsMounts()
  755. if err != nil {
  756. return nil, err
  757. }
  758. ms = append(ms, tmpfsMounts...)
  759. if m := c.SecretMounts(); m != nil {
  760. ms = append(ms, m...)
  761. }
  762. ms = append(ms, c.ConfigMounts()...)
  763. sort.Sort(mounts(ms))
  764. if err := setMounts(daemon, &s, c, ms); err != nil {
  765. return nil, fmt.Errorf("linux mounts: %v", err)
  766. }
  767. for _, ns := range s.Linux.Namespaces {
  768. if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
  769. target, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"))
  770. if err != nil {
  771. return nil, err
  772. }
  773. s.Hooks = &specs.Hooks{
  774. Prestart: []specs.Hook{{
  775. Path: target, // FIXME: cross-platform
  776. Args: []string{"libnetwork-setkey", c.ID, daemon.netController.ID()},
  777. }},
  778. }
  779. }
  780. }
  781. if apparmor.IsEnabled() {
  782. var appArmorProfile string
  783. if c.AppArmorProfile != "" {
  784. appArmorProfile = c.AppArmorProfile
  785. } else if c.HostConfig.Privileged {
  786. appArmorProfile = "unconfined"
  787. } else {
  788. appArmorProfile = "docker-default"
  789. }
  790. if appArmorProfile == "docker-default" {
  791. // Unattended upgrades and other fun services can unload AppArmor
  792. // profiles inadvertently. Since we cannot store our profile in
  793. // /etc/apparmor.d, nor can we practically add other ways of
  794. // telling the system to keep our profile loaded, in order to make
  795. // sure that we keep the default profile enabled we dynamically
  796. // reload it if necessary.
  797. if err := ensureDefaultAppArmorProfile(); err != nil {
  798. return nil, err
  799. }
  800. }
  801. s.Process.ApparmorProfile = appArmorProfile
  802. }
  803. s.Process.SelinuxLabel = c.GetProcessLabel()
  804. s.Process.NoNewPrivileges = c.NoNewPrivileges
  805. s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj
  806. s.Linux.MountLabel = c.MountLabel
  807. return &s, nil
  808. }
  809. func clearReadOnly(m *specs.Mount) {
  810. var opt []string
  811. for _, o := range m.Options {
  812. if o != "ro" {
  813. opt = append(opt, o)
  814. }
  815. }
  816. m.Options = opt
  817. }
  818. // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  819. func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  820. ulimits := c.Ulimits
  821. // Merge ulimits with daemon defaults
  822. ulIdx := make(map[string]struct{})
  823. for _, ul := range ulimits {
  824. ulIdx[ul.Name] = struct{}{}
  825. }
  826. for name, ul := range daemon.configStore.Ulimits {
  827. if _, exists := ulIdx[name]; !exists {
  828. ulimits = append(ulimits, ul)
  829. }
  830. }
  831. c.Ulimits = ulimits
  832. }