oci_linux.go 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933
  1. package daemon // import "github.com/docker/docker/daemon"
  2. import (
  3. "fmt"
  4. "io"
  5. "os"
  6. "os/exec"
  7. "path/filepath"
  8. "regexp"
  9. "sort"
  10. "strconv"
  11. "strings"
  12. containertypes "github.com/docker/docker/api/types/container"
  13. "github.com/docker/docker/container"
  14. "github.com/docker/docker/daemon/caps"
  15. daemonconfig "github.com/docker/docker/daemon/config"
  16. "github.com/docker/docker/oci"
  17. "github.com/docker/docker/pkg/idtools"
  18. "github.com/docker/docker/pkg/mount"
  19. volumemounts "github.com/docker/docker/volume/mounts"
  20. "github.com/opencontainers/runc/libcontainer/apparmor"
  21. "github.com/opencontainers/runc/libcontainer/cgroups"
  22. "github.com/opencontainers/runc/libcontainer/devices"
  23. "github.com/opencontainers/runc/libcontainer/user"
  24. "github.com/opencontainers/runtime-spec/specs-go"
  25. "github.com/pkg/errors"
  26. "github.com/sirupsen/logrus"
  27. "golang.org/x/sys/unix"
  28. )
// nolint: gosimple
var (
	// deviceCgroupRuleRegex matches a device cgroup rule of the form
	// "<type> <major>:<minor> <access>": type is one of a, c or b, major and
	// minor are each a decimal number or "*", and access is one to three
	// characters taken from r, w and m. The four parts are captured as
	// submatches 1 through 4.
	deviceCgroupRuleRegex = regexp.MustCompile("^([acb]) ([0-9]+|\\*):([0-9]+|\\*) ([rwm]{1,3})$")
)
  33. func setResources(s *specs.Spec, r containertypes.Resources) error {
  34. weightDevices, err := getBlkioWeightDevices(r)
  35. if err != nil {
  36. return err
  37. }
  38. readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
  39. if err != nil {
  40. return err
  41. }
  42. writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
  43. if err != nil {
  44. return err
  45. }
  46. readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
  47. if err != nil {
  48. return err
  49. }
  50. writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
  51. if err != nil {
  52. return err
  53. }
  54. memoryRes := getMemoryResources(r)
  55. cpuRes, err := getCPUResources(r)
  56. if err != nil {
  57. return err
  58. }
  59. blkioWeight := r.BlkioWeight
  60. specResources := &specs.LinuxResources{
  61. Memory: memoryRes,
  62. CPU: cpuRes,
  63. BlockIO: &specs.LinuxBlockIO{
  64. Weight: &blkioWeight,
  65. WeightDevice: weightDevices,
  66. ThrottleReadBpsDevice: readBpsDevice,
  67. ThrottleWriteBpsDevice: writeBpsDevice,
  68. ThrottleReadIOPSDevice: readIOpsDevice,
  69. ThrottleWriteIOPSDevice: writeIOpsDevice,
  70. },
  71. Pids: &specs.LinuxPids{
  72. Limit: r.PidsLimit,
  73. },
  74. }
  75. if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
  76. specResources.Devices = s.Linux.Resources.Devices
  77. }
  78. s.Linux.Resources = specResources
  79. return nil
  80. }
  81. func setDevices(s *specs.Spec, c *container.Container) error {
  82. // Build lists of devices allowed and created within the container.
  83. var devs []specs.LinuxDevice
  84. devPermissions := s.Linux.Resources.Devices
  85. if c.HostConfig.Privileged {
  86. hostDevices, err := devices.HostDevices()
  87. if err != nil {
  88. return err
  89. }
  90. for _, d := range hostDevices {
  91. devs = append(devs, oci.Device(d))
  92. }
  93. devPermissions = []specs.LinuxDeviceCgroup{
  94. {
  95. Allow: true,
  96. Access: "rwm",
  97. },
  98. }
  99. } else {
  100. for _, deviceMapping := range c.HostConfig.Devices {
  101. d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
  102. if err != nil {
  103. return err
  104. }
  105. devs = append(devs, d...)
  106. devPermissions = append(devPermissions, dPermissions...)
  107. }
  108. for _, deviceCgroupRule := range c.HostConfig.DeviceCgroupRules {
  109. ss := deviceCgroupRuleRegex.FindAllStringSubmatch(deviceCgroupRule, -1)
  110. if len(ss[0]) != 5 {
  111. return fmt.Errorf("invalid device cgroup rule format: '%s'", deviceCgroupRule)
  112. }
  113. matches := ss[0]
  114. dPermissions := specs.LinuxDeviceCgroup{
  115. Allow: true,
  116. Type: matches[1],
  117. Access: matches[4],
  118. }
  119. if matches[2] == "*" {
  120. major := int64(-1)
  121. dPermissions.Major = &major
  122. } else {
  123. major, err := strconv.ParseInt(matches[2], 10, 64)
  124. if err != nil {
  125. return fmt.Errorf("invalid major value in device cgroup rule format: '%s'", deviceCgroupRule)
  126. }
  127. dPermissions.Major = &major
  128. }
  129. if matches[3] == "*" {
  130. minor := int64(-1)
  131. dPermissions.Minor = &minor
  132. } else {
  133. minor, err := strconv.ParseInt(matches[3], 10, 64)
  134. if err != nil {
  135. return fmt.Errorf("invalid minor value in device cgroup rule format: '%s'", deviceCgroupRule)
  136. }
  137. dPermissions.Minor = &minor
  138. }
  139. devPermissions = append(devPermissions, dPermissions)
  140. }
  141. }
  142. s.Linux.Devices = append(s.Linux.Devices, devs...)
  143. s.Linux.Resources.Devices = devPermissions
  144. return nil
  145. }
  146. func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error {
  147. var rlimits []specs.POSIXRlimit
  148. // We want to leave the original HostConfig alone so make a copy here
  149. hostConfig := *c.HostConfig
  150. // Merge with the daemon defaults
  151. daemon.mergeUlimits(&hostConfig)
  152. for _, ul := range hostConfig.Ulimits {
  153. rlimits = append(rlimits, specs.POSIXRlimit{
  154. Type: "RLIMIT_" + strings.ToUpper(ul.Name),
  155. Soft: uint64(ul.Soft),
  156. Hard: uint64(ul.Hard),
  157. })
  158. }
  159. s.Process.Rlimits = rlimits
  160. return nil
  161. }
  162. func setUser(s *specs.Spec, c *container.Container) error {
  163. uid, gid, additionalGids, err := getUser(c, c.Config.User)
  164. if err != nil {
  165. return err
  166. }
  167. s.Process.User.UID = uid
  168. s.Process.User.GID = gid
  169. s.Process.User.AdditionalGids = additionalGids
  170. return nil
  171. }
  172. func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
  173. fp, err := c.GetResourcePath(p)
  174. if err != nil {
  175. return nil, err
  176. }
  177. return os.Open(fp)
  178. }
  179. func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
  180. passwdPath, err := user.GetPasswdPath()
  181. if err != nil {
  182. return 0, 0, nil, err
  183. }
  184. groupPath, err := user.GetGroupPath()
  185. if err != nil {
  186. return 0, 0, nil, err
  187. }
  188. passwdFile, err := readUserFile(c, passwdPath)
  189. if err == nil {
  190. defer passwdFile.Close()
  191. }
  192. groupFile, err := readUserFile(c, groupPath)
  193. if err == nil {
  194. defer groupFile.Close()
  195. }
  196. execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
  197. if err != nil {
  198. return 0, 0, nil, err
  199. }
  200. // todo: fix this double read by a change to libcontainer/user pkg
  201. groupFile, err = readUserFile(c, groupPath)
  202. if err == nil {
  203. defer groupFile.Close()
  204. }
  205. var addGroups []int
  206. if len(c.HostConfig.GroupAdd) > 0 {
  207. addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
  208. if err != nil {
  209. return 0, 0, nil, err
  210. }
  211. }
  212. uid := uint32(execUser.Uid)
  213. gid := uint32(execUser.Gid)
  214. sgids := append(execUser.Sgids, addGroups...)
  215. var additionalGids []uint32
  216. for _, g := range sgids {
  217. additionalGids = append(additionalGids, uint32(g))
  218. }
  219. return uid, gid, additionalGids, nil
  220. }
  221. func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
  222. for i, n := range s.Linux.Namespaces {
  223. if n.Type == ns.Type {
  224. s.Linux.Namespaces[i] = ns
  225. return
  226. }
  227. }
  228. s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
  229. }
  230. func setCapabilities(s *specs.Spec, c *container.Container) error {
  231. var caplist []string
  232. var err error
  233. if c.HostConfig.Privileged {
  234. caplist = caps.GetAllCapabilities()
  235. } else {
  236. caplist, err = caps.TweakCapabilities(s.Process.Capabilities.Bounding, c.HostConfig.CapAdd, c.HostConfig.CapDrop)
  237. if err != nil {
  238. return err
  239. }
  240. }
  241. s.Process.Capabilities.Effective = caplist
  242. s.Process.Capabilities.Bounding = caplist
  243. s.Process.Capabilities.Permitted = caplist
  244. s.Process.Capabilities.Inheritable = caplist
  245. // setUser has already been executed here
  246. // if non root drop capabilities in the way execve does
  247. if s.Process.User.UID != 0 {
  248. s.Process.Capabilities.Effective = []string{}
  249. s.Process.Capabilities.Permitted = []string{}
  250. }
  251. return nil
  252. }
// setNamespaces configures the user, network, ipc, pid and uts namespaces of
// the spec from the container's HostConfig, in that order.
//
// An error is returned when a container whose namespace is to be joined
// cannot be looked up, or when the IPC mode matches none of the handled
// cases.
func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error {
	// userNS records whether a user namespace with ID mappings was set up;
	// the sections below use it to decide whether the user namespace of a
	// joined container has to be joined as well.
	userNS := false
	// user
	if c.HostConfig.UsernsMode.IsPrivate() {
		uidMap := daemon.idMappings.UIDs()
		if uidMap != nil {
			userNS = true
			ns := specs.LinuxNamespace{Type: "user"}
			setNamespace(s, ns)
			s.Linux.UIDMappings = specMapping(uidMap)
			s.Linux.GIDMappings = specMapping(daemon.idMappings.GIDs())
		}
	}
	// network
	if !c.Config.NetworkDisabled {
		ns := specs.LinuxNamespace{Type: "network"}
		// A network mode of the form "container:<name or id>" joins the
		// network namespace of that container.
		parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
		if parts[0] == "container" {
			nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
			if userNS {
				// to share a net namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.NetworkMode.IsHost() {
			ns.Path = c.NetworkSettings.SandboxKey
		}
		// In every other mode the path stays empty.
		setNamespace(s, ns)
	}

	// ipc
	ipcMode := c.HostConfig.IpcMode
	switch {
	case ipcMode.IsContainer():
		ns := specs.LinuxNamespace{Type: "ipc"}
		ic, err := daemon.getIpcContainer(ipcMode.Container())
		if err != nil {
			return err
		}
		ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
		setNamespace(s, ns)
		if userNS {
			// to share an IPC namespace, they must also share a user namespace
			nsUser := specs.LinuxNamespace{Type: "user"}
			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
			setNamespace(s, nsUser)
		}
	case ipcMode.IsHost():
		oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
	case ipcMode.IsEmpty():
		// A container was created by an older version of the daemon.
		// The default behavior used to be what is now called "shareable".
		fallthrough
	case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
		ns := specs.LinuxNamespace{Type: "ipc"}
		setNamespace(s, ns)
	default:
		return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
	}

	// pid
	if c.HostConfig.PidMode.IsContainer() {
		ns := specs.LinuxNamespace{Type: "pid"}
		pc, err := daemon.getPidContainer(c)
		if err != nil {
			return err
		}
		ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
		setNamespace(s, ns)
		if userNS {
			// to share a PID namespace, they must also share a user namespace
			nsUser := specs.LinuxNamespace{Type: "user"}
			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
			setNamespace(s, nsUser)
		}
	} else if c.HostConfig.PidMode.IsHost() {
		oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
	} else {
		ns := specs.LinuxNamespace{Type: "pid"}
		setNamespace(s, ns)
	}
	// uts
	if c.HostConfig.UTSMode.IsHost() {
		// With the host's UTS namespace the spec must not carry a hostname
		// of its own, so it is cleared.
		oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
		s.Hostname = ""
	}

	return nil
}
  344. func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
  345. var ids []specs.LinuxIDMapping
  346. for _, item := range s {
  347. ids = append(ids, specs.LinuxIDMapping{
  348. HostID: uint32(item.HostID),
  349. ContainerID: uint32(item.ContainerID),
  350. Size: uint32(item.Size),
  351. })
  352. }
  353. return ids
  354. }
  355. // Get the source mount point of directory passed in as argument. Also return
  356. // optional fields.
  357. func getSourceMount(source string) (string, string, error) {
  358. // Ensure any symlinks are resolved.
  359. sourcePath, err := filepath.EvalSymlinks(source)
  360. if err != nil {
  361. return "", "", err
  362. }
  363. mi, err := mount.GetMounts(mount.ParentsFilter(sourcePath))
  364. if err != nil {
  365. return "", "", err
  366. }
  367. if len(mi) < 1 {
  368. return "", "", fmt.Errorf("Can't find mount point of %s", source)
  369. }
  370. // find the longest mount point
  371. var idx, maxlen int
  372. for i := range mi {
  373. if len(mi[i].Mountpoint) > maxlen {
  374. maxlen = len(mi[i].Mountpoint)
  375. idx = i
  376. }
  377. }
  378. return mi[idx].Mountpoint, mi[idx].Optional, nil
  379. }
// Prefixes looked for in the optional fields of a mount entry (see
// ensureShared and ensureSharedOrSlave) to tell how the mount propagates.
const (
	// sharedPropagationOption is the prefix that marks a shared mount.
	sharedPropagationOption = "shared:"
	// slavePropagationOption is the prefix that marks a slave mount.
	slavePropagationOption = "master:"
)
  384. // hasMountinfoOption checks if any of the passed any of the given option values
  385. // are set in the passed in option string.
  386. func hasMountinfoOption(opts string, vals ...string) bool {
  387. for _, opt := range strings.Split(opts, " ") {
  388. for _, val := range vals {
  389. if strings.HasPrefix(opt, val) {
  390. return true
  391. }
  392. }
  393. }
  394. return false
  395. }
  396. // Ensure mount point on which path is mounted, is shared.
  397. func ensureShared(path string) error {
  398. sourceMount, optionalOpts, err := getSourceMount(path)
  399. if err != nil {
  400. return err
  401. }
  402. // Make sure source mount point is shared.
  403. if !hasMountinfoOption(optionalOpts, sharedPropagationOption) {
  404. return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
  405. }
  406. return nil
  407. }
  408. // Ensure mount point on which path is mounted, is either shared or slave.
  409. func ensureSharedOrSlave(path string) error {
  410. sourceMount, optionalOpts, err := getSourceMount(path)
  411. if err != nil {
  412. return err
  413. }
  414. if !hasMountinfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
  415. return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
  416. }
  417. return nil
  418. }
  419. // Get the set of mount flags that are set on the mount that contains the given
  420. // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
  421. // bind-mounting "with options" will not fail with user namespaces, due to
  422. // kernel restrictions that require user namespace mounts to preserve
  423. // CL_UNPRIVILEGED locked flags.
  424. func getUnprivilegedMountFlags(path string) ([]string, error) {
  425. var statfs unix.Statfs_t
  426. if err := unix.Statfs(path, &statfs); err != nil {
  427. return nil, err
  428. }
  429. // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
  430. unprivilegedFlags := map[uint64]string{
  431. unix.MS_RDONLY: "ro",
  432. unix.MS_NODEV: "nodev",
  433. unix.MS_NOEXEC: "noexec",
  434. unix.MS_NOSUID: "nosuid",
  435. unix.MS_NOATIME: "noatime",
  436. unix.MS_RELATIME: "relatime",
  437. unix.MS_NODIRATIME: "nodiratime",
  438. }
  439. var flags []string
  440. for mask, flag := range unprivilegedFlags {
  441. if uint64(statfs.Flags)&mask == mask {
  442. flags = append(flags, flag)
  443. }
  444. }
  445. return flags, nil
  446. }
var (
	// mountPropagationMap maps a propagation mode name to the corresponding
	// flag of the mount package. Looking up a name that is not listed yields
	// the zero value.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap: it
	// maps a propagation flag of the mount package back to its name.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
  465. // inSlice tests whether a string is contained in a slice of strings or not.
  466. // Comparison is case sensitive
  467. func inSlice(slice []string, s string) bool {
  468. for _, ss := range slice {
  469. if s == ss {
  470. return true
  471. }
  472. }
  473. return false
  474. }
// setMounts merges the given mounts into the spec's mount list.
//
// It first drops the spec's default mounts that are overridden by one of the
// given mounts, then appends the given mounts (tmpfs sources as tmpfs mounts,
// everything else as recursive bind mounts), adjusting the rootfs propagation
// where a shared or slave mount requires it. Finally it applies the
// read-only rootfs and privileged-mode adjustments.
//
// An error is returned when two mounts end up with the same destination, when
// tmpfs options cannot be merged, when the source of a shared/slave mount does
// not have the required propagation, or when the mount flags of a bind source
// cannot be read.
func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
	// Set of destinations covered by the given mounts.
	userMounts := make(map[string]struct{})
	for _, m := range mounts {
		userMounts[m.Destination] = struct{}{}
	}

	// Copy all mounts from spec to defaultMounts, except for
	//  - mounts overridden by a user supplied mount;
	//  - all mounts under /dev if a user supplied /dev is present;
	//  - /dev/shm, in case IpcMode is none.
	// While at it, also
	//  - set size for /dev/shm from shmsize.
	//
	// defaultMounts reuses the backing array of s.Mounts: the list is
	// filtered in place.
	defaultMounts := s.Mounts[:0]
	_, mountDev := userMounts["/dev"]
	for _, m := range s.Mounts {
		if _, ok := userMounts[m.Destination]; ok {
			// filter out mount overridden by a user supplied mount
			continue
		}
		if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
			// filter out everything under /dev if /dev is user-mounted
			continue
		}

		if m.Destination == "/dev/shm" {
			if c.HostConfig.IpcMode.IsNone() {
				// filter out /dev/shm for "none" IpcMode
				continue
			}
			// set size for /dev/shm mount from spec
			sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
			m.Options = append(m.Options, sizeOpt)
		}

		defaultMounts = append(defaultMounts, m)
	}

	s.Mounts = defaultMounts
	for _, m := range mounts {
		// Every destination may be mounted only once; this also catches
		// duplicates within the given mounts, because each accepted mount is
		// appended to s.Mounts before the next one is checked.
		for _, cm := range s.Mounts {
			if cm.Destination == m.Destination {
				return duplicateMountPointError(m.Destination)
			}
		}

		if m.Source == "tmpfs" {
			data := m.Data
			parser := volumemounts.NewParser("linux")
			// Defaults come first and the options from m.Data after them;
			// MergeTmpfsOptions produces the final list.
			options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
			if data != "" {
				options = append(options, strings.Split(data, ",")...)
			}

			merged, err := mount.MergeTmpfsOptions(options)
			if err != nil {
				return err
			}

			s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
			continue
		}

		mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

		// Determine property of RootPropagation based on volume
		// properties. If a volume is shared, then keep root propagation
		// shared. This should work for slave and private volumes too.
		//
		// For slave volumes, it can be either [r]shared/[r]slave.
		//
		// For private volumes any root propagation value should work.
		pFlag := mountPropagationMap[m.Propagation]
		switch pFlag {
		case mount.SHARED, mount.RSHARED:
			if err := ensureShared(m.Source); err != nil {
				return err
			}
			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
			if rootpg != mount.SHARED && rootpg != mount.RSHARED {
				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
			}
		case mount.SLAVE, mount.RSLAVE:
			var fallback bool
			if err := ensureSharedOrSlave(m.Source); err != nil {
				// For backwards compatibility purposes, treat mounts from the daemon root
				// as special since we automatically add rslave propagation to these mounts
				// when the user did not set anything, so we should fallback to the old
				// behavior which is to use private propagation which is normally the
				// default.
				if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
					return err
				}

				cm, ok := c.MountPoints[m.Destination]
				if !ok {
					return err
				}
				if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
					// This means the user explicitly set a propagation, do not fallback in that case.
					return err
				}
				fallback = true
				logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
			}
			if !fallback {
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
				}
			}
		}

		opts := []string{"rbind"}
		if !m.Writable {
			opts = append(opts, "ro")
		}
		// pFlag is zero when m.Propagation is not a key of
		// mountPropagationMap; no propagation option is added then.
		if pFlag != 0 {
			opts = append(opts, mountPropagationReverseMap[pFlag])
		}

		// If we are using user namespaces, then we must make sure that we
		// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
		// "mount" when we bind-mount. The reason for this is that at the point
		// when runc sets up the root filesystem, it is already inside a user
		// namespace, and thus cannot change any flags that are locked.
		if daemon.configStore.RemappedRoot != "" {
			unprivOpts, err := getUnprivilegedMountFlags(m.Source)
			if err != nil {
				return err
			}
			opts = append(opts, unprivOpts...)
		}

		mt.Options = opts
		s.Mounts = append(s.Mounts, mt)
	}

	if s.Root.Readonly {
		// With a read-only rootfs, every mount that was not supplied by the
		// caller is made read-only as well, except for the destinations
		// listed below.
		for i, m := range s.Mounts {
			switch m.Destination {
			case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
				continue
			}
			if _, ok := userMounts[m.Destination]; !ok {
				if !inSlice(m.Options, "ro") {
					s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
				}
			}
		}
	}

	if c.HostConfig.Privileged {
		// clear readonly for /sys
		for i := range s.Mounts {
			if s.Mounts[i].Destination == "/sys" {
				clearReadOnly(&s.Mounts[i])
			}
		}
		// Privileged containers get no read-only or masked paths.
		s.Linux.ReadonlyPaths = nil
		s.Linux.MaskedPaths = nil
	}

	// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
	// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
	if uidMap := daemon.idMappings.UIDs(); uidMap != nil || c.HostConfig.Privileged {
		for i, m := range s.Mounts {
			if m.Type == "cgroup" {
				clearReadOnly(&s.Mounts[i])
			}
		}
	}

	return nil
}
  632. func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error {
  633. if c.BaseFS == nil {
  634. return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
  635. }
  636. linkedEnv, err := daemon.setupLinkedContainers(c)
  637. if err != nil {
  638. return err
  639. }
  640. s.Root = &specs.Root{
  641. Path: c.BaseFS.Path(),
  642. Readonly: c.HostConfig.ReadonlyRootfs,
  643. }
  644. if err := c.SetupWorkingDirectory(daemon.idMappings.RootPair()); err != nil {
  645. return err
  646. }
  647. cwd := c.Config.WorkingDir
  648. if len(cwd) == 0 {
  649. cwd = "/"
  650. }
  651. s.Process.Args = append([]string{c.Path}, c.Args...)
  652. // only add the custom init if it is specified and the container is running in its
  653. // own private pid namespace. It does not make sense to add if it is running in the
  654. // host namespace or another container's pid namespace where we already have an init
  655. if c.HostConfig.PidMode.IsPrivate() {
  656. if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
  657. (c.HostConfig.Init == nil && daemon.configStore.Init) {
  658. s.Process.Args = append([]string{"/dev/init", "--", c.Path}, c.Args...)
  659. var path string
  660. if daemon.configStore.InitPath == "" {
  661. path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
  662. if err != nil {
  663. return err
  664. }
  665. }
  666. if daemon.configStore.InitPath != "" {
  667. path = daemon.configStore.InitPath
  668. }
  669. s.Mounts = append(s.Mounts, specs.Mount{
  670. Destination: "/dev/init",
  671. Type: "bind",
  672. Source: path,
  673. Options: []string{"bind", "ro"},
  674. })
  675. }
  676. }
  677. s.Process.Cwd = cwd
  678. s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
  679. s.Process.Terminal = c.Config.Tty
  680. s.Hostname = c.FullHostname()
  681. return nil
  682. }
// createSpec builds the runtime spec for container c, starting from
// oci.DefaultSpec and applying, in order: the common settings, the cgroups
// path, resources, sysctls, devices, rlimits, user, namespaces, capabilities,
// seccomp, mounts, the network prestart hook, the AppArmor profile and the
// remaining process/label settings.
//
// On failure the returned spec is nil. If the failure happens after the
// secret directory setup was attempted, daemon.cleanupSecretDir(c) is run
// before returning.
func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
	s := oci.DefaultSpec()
	if err := daemon.populateCommonSpec(&s, c); err != nil {
		return nil, err
	}

	// Work out the cgroup parent: the container's setting wins over the
	// daemon's, which wins over the built-in default ("system.slice" when
	// UsingSystemd reports true, "/docker" otherwise).
	var cgroupsPath string
	scopePrefix := "docker"
	parent := "/docker"
	useSystemd := UsingSystemd(daemon.configStore)
	if useSystemd {
		parent = "system.slice"
	}

	if c.HostConfig.CgroupParent != "" {
		parent = c.HostConfig.CgroupParent
	} else if daemon.configStore.CgroupParent != "" {
		parent = daemon.configStore.CgroupParent
	}

	if useSystemd {
		// With systemd the path has the form "<parent>:<prefix>:<id>".
		cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
		logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
	} else {
		cgroupsPath = filepath.Join(parent, c.ID)
	}
	s.Linux.CgroupsPath = cgroupsPath

	if err := setResources(&s, c.HostConfig.Resources); err != nil {
		return nil, fmt.Errorf("linux runtime spec resources: %v", err)
	}
	s.Linux.Sysctl = c.HostConfig.Sysctls

	p := s.Linux.CgroupsPath
	if useSystemd {
		initPath, err := cgroups.GetInitCgroup("cpu")
		if err != nil {
			return nil, err
		}
		// Only the error of GetOwnCgroup is used; its result is discarded.
		_, err = cgroups.GetOwnCgroup("cpu")
		if err != nil {
			return nil, err
		}
		p = filepath.Join(initPath, s.Linux.CgroupsPath)
	}

	// Clean path to guard against things like ../../../BAD
	parentPath := filepath.Dir(p)
	if !filepath.IsAbs(parentPath) {
		parentPath = filepath.Clean("/" + parentPath)
	}

	if err := daemon.initCgroupsPath(parentPath); err != nil {
		return nil, fmt.Errorf("linux init cgroups path: %v", err)
	}
	if err := setDevices(&s, c); err != nil {
		return nil, fmt.Errorf("linux runtime spec devices: %v", err)
	}
	if err := daemon.setRlimits(&s, c); err != nil {
		return nil, fmt.Errorf("linux runtime spec rlimits: %v", err)
	}
	// setUser runs before setCapabilities, which reads the UID set here.
	if err := setUser(&s, c); err != nil {
		return nil, fmt.Errorf("linux spec user: %v", err)
	}
	if err := setNamespaces(daemon, &s, c); err != nil {
		return nil, fmt.Errorf("linux spec namespaces: %v", err)
	}
	if err := setCapabilities(&s, c); err != nil {
		return nil, fmt.Errorf("linux spec capabilities: %v", err)
	}
	if err := setSeccomp(daemon, &s, c); err != nil {
		return nil, fmt.Errorf("linux seccomp: %v", err)
	}

	if err := daemon.setupContainerMountsRoot(c); err != nil {
		return nil, err
	}

	if err := daemon.setupIpcDirs(c); err != nil {
		return nil, err
	}

	// Registered before setupSecretDir so that a failure there, or in any
	// later step, cleans the secret directory up. It inspects the named
	// return value err, which every "return nil, <error>" below sets.
	defer func() {
		if err != nil {
			daemon.cleanupSecretDir(c)
		}
	}()

	if err := daemon.setupSecretDir(c); err != nil {
		return nil, err
	}

	ms, err := daemon.setupMounts(c)
	if err != nil {
		return nil, err
	}

	// IPC mounts are added for every IPC mode except private and empty.
	if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
		ms = append(ms, c.IpcMounts()...)
	}

	tmpfsMounts, err := c.TmpfsMounts()
	if err != nil {
		return nil, err
	}
	ms = append(ms, tmpfsMounts...)

	secretMounts, err := c.SecretMounts()
	if err != nil {
		return nil, err
	}
	ms = append(ms, secretMounts...)

	sort.Sort(mounts(ms))
	if err := setMounts(daemon, &s, c, ms); err != nil {
		return nil, fmt.Errorf("linux mounts: %v", err)
	}

	// A network namespace without a path gets a prestart hook that re-executes
	// the current binary (/proc/<pid>/exe) with the "libnetwork-setkey"
	// argument, the container ID and the network controller ID.
	for _, ns := range s.Linux.Namespaces {
		if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
			target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
			s.Hooks = &specs.Hooks{
				Prestart: []specs.Hook{{
					Path: target,
					Args: []string{"libnetwork-setkey", c.ID, daemon.netController.ID()},
				}},
			}
		}
	}

	if apparmor.IsEnabled() {
		// An explicit profile on the container wins; otherwise privileged
		// containers run unconfined and all others use "docker-default".
		var appArmorProfile string
		if c.AppArmorProfile != "" {
			appArmorProfile = c.AppArmorProfile
		} else if c.HostConfig.Privileged {
			appArmorProfile = "unconfined"
		} else {
			appArmorProfile = "docker-default"
		}

		if appArmorProfile == "docker-default" {
			// Unattended upgrades and other fun services can unload AppArmor
			// profiles inadvertently. Since we cannot store our profile in
			// /etc/apparmor.d, nor can we practically add other ways of
			// telling the system to keep our profile loaded, in order to make
			// sure that we keep the default profile enabled we dynamically
			// reload it if necessary.
			if err := ensureDefaultAppArmorProfile(); err != nil {
				return nil, err
			}
		}

		s.Process.ApparmorProfile = appArmorProfile
	}
	s.Process.SelinuxLabel = c.GetProcessLabel()
	s.Process.NoNewPrivileges = c.NoNewPrivileges
	s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj
	s.Linux.MountLabel = c.MountLabel

	return &s, nil
}
  823. func clearReadOnly(m *specs.Mount) {
  824. var opt []string
  825. for _, o := range m.Options {
  826. if o != "ro" {
  827. opt = append(opt, o)
  828. }
  829. }
  830. m.Options = opt
  831. }
  832. // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  833. func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  834. ulimits := c.Ulimits
  835. // Merge ulimits with daemon defaults
  836. ulIdx := make(map[string]struct{})
  837. for _, ul := range ulimits {
  838. ulIdx[ul.Name] = struct{}{}
  839. }
  840. for name, ul := range daemon.configStore.Ulimits {
  841. if _, exists := ulIdx[name]; !exists {
  842. ulimits = append(ulimits, ul)
  843. }
  844. }
  845. c.Ulimits = ulimits
  846. }