oci_linux.go 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904
  1. package daemon // import "github.com/docker/docker/daemon"
  2. import (
  3. "fmt"
  4. "io"
  5. "os"
  6. "os/exec"
  7. "path/filepath"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. containertypes "github.com/docker/docker/api/types/container"
  12. "github.com/docker/docker/container"
  13. daemonconfig "github.com/docker/docker/daemon/config"
  14. "github.com/docker/docker/oci"
  15. "github.com/docker/docker/oci/caps"
  16. "github.com/docker/docker/pkg/idtools"
  17. "github.com/docker/docker/pkg/mount"
  18. "github.com/docker/docker/rootless/specconv"
  19. volumemounts "github.com/docker/docker/volume/mounts"
  20. "github.com/opencontainers/runc/libcontainer/apparmor"
  21. "github.com/opencontainers/runc/libcontainer/cgroups"
  22. "github.com/opencontainers/runc/libcontainer/devices"
  23. rsystem "github.com/opencontainers/runc/libcontainer/system"
  24. "github.com/opencontainers/runc/libcontainer/user"
  25. "github.com/opencontainers/runtime-spec/specs-go"
  26. "github.com/pkg/errors"
  27. "github.com/sirupsen/logrus"
  28. "golang.org/x/sys/unix"
  29. )
  // inContainerInitPath is the in-container path used for the init binary: it
  // is the destination of the read-only bind mount and the first process
  // argument that populateCommonSpec sets up when a custom init is enabled.
  const (
  	inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary
  )
  33. func setResources(s *specs.Spec, r containertypes.Resources) error {
  34. weightDevices, err := getBlkioWeightDevices(r)
  35. if err != nil {
  36. return err
  37. }
  38. readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
  39. if err != nil {
  40. return err
  41. }
  42. writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
  43. if err != nil {
  44. return err
  45. }
  46. readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
  47. if err != nil {
  48. return err
  49. }
  50. writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
  51. if err != nil {
  52. return err
  53. }
  54. memoryRes := getMemoryResources(r)
  55. cpuRes, err := getCPUResources(r)
  56. if err != nil {
  57. return err
  58. }
  59. blkioWeight := r.BlkioWeight
  60. specResources := &specs.LinuxResources{
  61. Memory: memoryRes,
  62. CPU: cpuRes,
  63. BlockIO: &specs.LinuxBlockIO{
  64. Weight: &blkioWeight,
  65. WeightDevice: weightDevices,
  66. ThrottleReadBpsDevice: readBpsDevice,
  67. ThrottleWriteBpsDevice: writeBpsDevice,
  68. ThrottleReadIOPSDevice: readIOpsDevice,
  69. ThrottleWriteIOPSDevice: writeIOpsDevice,
  70. },
  71. Pids: &specs.LinuxPids{
  72. Limit: r.PidsLimit,
  73. },
  74. }
  75. if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
  76. specResources.Devices = s.Linux.Resources.Devices
  77. }
  78. s.Linux.Resources = specResources
  79. return nil
  80. }
  81. func setDevices(s *specs.Spec, c *container.Container) error {
  82. // Build lists of devices allowed and created within the container.
  83. var devs []specs.LinuxDevice
  84. devPermissions := s.Linux.Resources.Devices
  85. if c.HostConfig.Privileged && !rsystem.RunningInUserNS() {
  86. hostDevices, err := devices.HostDevices()
  87. if err != nil {
  88. return err
  89. }
  90. for _, d := range hostDevices {
  91. devs = append(devs, oci.Device(d))
  92. }
  93. devPermissions = []specs.LinuxDeviceCgroup{
  94. {
  95. Allow: true,
  96. Access: "rwm",
  97. },
  98. }
  99. } else {
  100. for _, deviceMapping := range c.HostConfig.Devices {
  101. d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
  102. if err != nil {
  103. return err
  104. }
  105. devs = append(devs, d...)
  106. devPermissions = append(devPermissions, dPermissions...)
  107. }
  108. var err error
  109. devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
  110. if err != nil {
  111. return err
  112. }
  113. }
  114. s.Linux.Devices = append(s.Linux.Devices, devs...)
  115. s.Linux.Resources.Devices = devPermissions
  116. return nil
  117. }
  118. func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error {
  119. var rlimits []specs.POSIXRlimit
  120. // We want to leave the original HostConfig alone so make a copy here
  121. hostConfig := *c.HostConfig
  122. // Merge with the daemon defaults
  123. daemon.mergeUlimits(&hostConfig)
  124. for _, ul := range hostConfig.Ulimits {
  125. rlimits = append(rlimits, specs.POSIXRlimit{
  126. Type: "RLIMIT_" + strings.ToUpper(ul.Name),
  127. Soft: uint64(ul.Soft),
  128. Hard: uint64(ul.Hard),
  129. })
  130. }
  131. s.Process.Rlimits = rlimits
  132. return nil
  133. }
  134. func setUser(s *specs.Spec, c *container.Container) error {
  135. uid, gid, additionalGids, err := getUser(c, c.Config.User)
  136. if err != nil {
  137. return err
  138. }
  139. s.Process.User.UID = uid
  140. s.Process.User.GID = gid
  141. s.Process.User.AdditionalGids = additionalGids
  142. return nil
  143. }
  144. func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
  145. fp, err := c.GetResourcePath(p)
  146. if err != nil {
  147. return nil, err
  148. }
  149. return os.Open(fp)
  150. }
  151. func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
  152. passwdPath, err := user.GetPasswdPath()
  153. if err != nil {
  154. return 0, 0, nil, err
  155. }
  156. groupPath, err := user.GetGroupPath()
  157. if err != nil {
  158. return 0, 0, nil, err
  159. }
  160. passwdFile, err := readUserFile(c, passwdPath)
  161. if err == nil {
  162. defer passwdFile.Close()
  163. }
  164. groupFile, err := readUserFile(c, groupPath)
  165. if err == nil {
  166. defer groupFile.Close()
  167. }
  168. execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
  169. if err != nil {
  170. return 0, 0, nil, err
  171. }
  172. // todo: fix this double read by a change to libcontainer/user pkg
  173. groupFile, err = readUserFile(c, groupPath)
  174. if err == nil {
  175. defer groupFile.Close()
  176. }
  177. var addGroups []int
  178. if len(c.HostConfig.GroupAdd) > 0 {
  179. addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
  180. if err != nil {
  181. return 0, 0, nil, err
  182. }
  183. }
  184. uid := uint32(execUser.Uid)
  185. gid := uint32(execUser.Gid)
  186. sgids := append(execUser.Sgids, addGroups...)
  187. var additionalGids []uint32
  188. for _, g := range sgids {
  189. additionalGids = append(additionalGids, uint32(g))
  190. }
  191. return uid, gid, additionalGids, nil
  192. }
  193. func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
  194. for i, n := range s.Linux.Namespaces {
  195. if n.Type == ns.Type {
  196. s.Linux.Namespaces[i] = ns
  197. return
  198. }
  199. }
  200. s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
  201. }
  // setNamespaces configures the user, network, ipc, pid and uts namespaces of
  // the spec s according to the container's HostConfig.
  //
  // Namespaces that are joined from another container are referenced through
  // that container's /proc/<pid>/ns/<type> path. When a user namespace was set
  // up for this container, joining another container's network, ipc or pid
  // namespace also replaces the user namespace entry with that container's.
  //
  // It returns an error if a referenced container cannot be resolved or if the
  // IPC mode is not one of the recognised values.
  func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error {
  	// userNS records whether a user namespace with ID mappings was configured.
  	userNS := false
  	// user
  	if c.HostConfig.UsernsMode.IsPrivate() {
  		uidMap := daemon.idMapping.UIDs()
  		if uidMap != nil {
  			userNS = true
  			ns := specs.LinuxNamespace{Type: "user"}
  			setNamespace(s, ns)
  			s.Linux.UIDMappings = specMapping(uidMap)
  			s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
  		}
  	}
  	// network
  	if !c.Config.NetworkDisabled {
  		ns := specs.LinuxNamespace{Type: "network"}
  		// A network mode of the form "container:<name|id>" joins that container.
  		parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
  		if parts[0] == "container" {
  			nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
  			if err != nil {
  				return err
  			}
  			ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
  			if userNS {
  				// to share a net namespace, they must also share a user namespace
  				nsUser := specs.LinuxNamespace{Type: "user"}
  				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
  				setNamespace(s, nsUser)
  			}
  		} else if c.HostConfig.NetworkMode.IsHost() {
  			// Host mode: use the sandbox key from the container's network settings
  			// as the namespace path.
  			ns.Path = c.NetworkSettings.SandboxKey
  		}
  		setNamespace(s, ns)
  	}
  	// ipc
  	ipcMode := c.HostConfig.IpcMode
  	switch {
  	case ipcMode.IsContainer():
  		ns := specs.LinuxNamespace{Type: "ipc"}
  		ic, err := daemon.getIpcContainer(ipcMode.Container())
  		if err != nil {
  			return err
  		}
  		ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
  		setNamespace(s, ns)
  		if userNS {
  			// to share an IPC namespace, they must also share a user namespace
  			nsUser := specs.LinuxNamespace{Type: "user"}
  			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
  			setNamespace(s, nsUser)
  		}
  	case ipcMode.IsHost():
  		// Host IPC: drop the ipc namespace entry from the spec.
  		oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
  	case ipcMode.IsEmpty():
  		// A container was created by an older version of the daemon.
  		// The default behavior used to be what is now called "shareable".
  		fallthrough
  	case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
  		// All of these get a fresh ipc namespace (no path).
  		ns := specs.LinuxNamespace{Type: "ipc"}
  		setNamespace(s, ns)
  	default:
  		return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
  	}
  	// pid
  	if c.HostConfig.PidMode.IsContainer() {
  		ns := specs.LinuxNamespace{Type: "pid"}
  		pc, err := daemon.getPidContainer(c)
  		if err != nil {
  			return err
  		}
  		ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
  		setNamespace(s, ns)
  		if userNS {
  			// to share a PID namespace, they must also share a user namespace
  			nsUser := specs.LinuxNamespace{Type: "user"}
  			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
  			setNamespace(s, nsUser)
  		}
  	} else if c.HostConfig.PidMode.IsHost() {
  		// Host PID: drop the pid namespace entry from the spec.
  		oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
  	} else {
  		ns := specs.LinuxNamespace{Type: "pid"}
  		setNamespace(s, ns)
  	}
  	// uts
  	if c.HostConfig.UTSMode.IsHost() {
  		// Host UTS: drop the uts namespace entry and clear the spec's hostname.
  		oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
  		s.Hostname = ""
  	}
  	return nil
  }
  293. func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
  294. var ids []specs.LinuxIDMapping
  295. for _, item := range s {
  296. ids = append(ids, specs.LinuxIDMapping{
  297. HostID: uint32(item.HostID),
  298. ContainerID: uint32(item.ContainerID),
  299. Size: uint32(item.Size),
  300. })
  301. }
  302. return ids
  303. }
  304. // Get the source mount point of directory passed in as argument. Also return
  305. // optional fields.
  306. func getSourceMount(source string) (string, string, error) {
  307. // Ensure any symlinks are resolved.
  308. sourcePath, err := filepath.EvalSymlinks(source)
  309. if err != nil {
  310. return "", "", err
  311. }
  312. mi, err := mount.GetMounts(mount.ParentsFilter(sourcePath))
  313. if err != nil {
  314. return "", "", err
  315. }
  316. if len(mi) < 1 {
  317. return "", "", fmt.Errorf("Can't find mount point of %s", source)
  318. }
  319. // find the longest mount point
  320. var idx, maxlen int
  321. for i := range mi {
  322. if len(mi[i].Mountpoint) > maxlen {
  323. maxlen = len(mi[i].Mountpoint)
  324. idx = i
  325. }
  326. }
  327. return mi[idx].Mountpoint, mi[idx].Optional, nil
  328. }
  // Prefixes looked for in a mount's optional fields (as returned by
  // getSourceMount) to decide whether it is a shared or a slave mount.
  const (
  	sharedPropagationOption = "shared:"
  	slavePropagationOption  = "master:"
  )
  // hasMountinfoOption reports whether any space-separated field of opts
  // starts with one of the given prefixes in vals. It returns false when no
  // prefixes are given.
  func hasMountinfoOption(opts string, vals ...string) bool {
  	fields := strings.Split(opts, " ")
  	for _, prefix := range vals {
  		for _, field := range fields {
  			if strings.HasPrefix(field, prefix) {
  				return true
  			}
  		}
  	}
  	return false
  }
  345. // Ensure mount point on which path is mounted, is shared.
  346. func ensureShared(path string) error {
  347. sourceMount, optionalOpts, err := getSourceMount(path)
  348. if err != nil {
  349. return err
  350. }
  351. // Make sure source mount point is shared.
  352. if !hasMountinfoOption(optionalOpts, sharedPropagationOption) {
  353. return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
  354. }
  355. return nil
  356. }
  357. // Ensure mount point on which path is mounted, is either shared or slave.
  358. func ensureSharedOrSlave(path string) error {
  359. sourceMount, optionalOpts, err := getSourceMount(path)
  360. if err != nil {
  361. return err
  362. }
  363. if !hasMountinfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
  364. return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
  365. }
  366. return nil
  367. }
  368. // Get the set of mount flags that are set on the mount that contains the given
  369. // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
  370. // bind-mounting "with options" will not fail with user namespaces, due to
  371. // kernel restrictions that require user namespace mounts to preserve
  372. // CL_UNPRIVILEGED locked flags.
  373. func getUnprivilegedMountFlags(path string) ([]string, error) {
  374. var statfs unix.Statfs_t
  375. if err := unix.Statfs(path, &statfs); err != nil {
  376. return nil, err
  377. }
  378. // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
  379. unprivilegedFlags := map[uint64]string{
  380. unix.MS_RDONLY: "ro",
  381. unix.MS_NODEV: "nodev",
  382. unix.MS_NOEXEC: "noexec",
  383. unix.MS_NOSUID: "nosuid",
  384. unix.MS_NOATIME: "noatime",
  385. unix.MS_RELATIME: "relatime",
  386. unix.MS_NODIRATIME: "nodiratime",
  387. }
  388. var flags []string
  389. for mask, flag := range unprivilegedFlags {
  390. if uint64(statfs.Flags)&mask == mask {
  391. flags = append(flags, flag)
  392. }
  393. }
  394. return flags, nil
  395. }
  var (
  	// mountPropagationMap maps a propagation mode name to its mount flag.
  	// Looking up an unknown or empty name yields 0.
  	mountPropagationMap = map[string]int{
  		"private":  mount.PRIVATE,
  		"rprivate": mount.RPRIVATE,
  		"shared":   mount.SHARED,
  		"rshared":  mount.RSHARED,
  		"slave":    mount.SLAVE,
  		"rslave":   mount.RSLAVE,
  	}
  	// mountPropagationReverseMap is the inverse of mountPropagationMap: it maps
  	// a propagation mount flag back to its name.
  	mountPropagationReverseMap = map[int]string{
  		mount.PRIVATE:  "private",
  		mount.RPRIVATE: "rprivate",
  		mount.SHARED:   "shared",
  		mount.RSHARED:  "rshared",
  		mount.SLAVE:    "slave",
  		mount.RSLAVE:   "rslave",
  	}
  )
  // inSlice reports whether s is an element of slice. The comparison is exact
  // (case sensitive).
  func inSlice(slice []string, s string) bool {
  	for i := 0; i < len(slice); i++ {
  		if slice[i] == s {
  			return true
  		}
  	}
  	return false
  }
  // setMounts merges the container mounts in mounts into the spec s.
  //
  // It first filters the spec's existing (default) mounts, then appends one
  // spec mount per entry of mounts (tmpfs or bind), adjusting the rootfs
  // propagation of the spec where a shared or slave bind mount requires it.
  // Finally it applies the read-only rootfs, privileged and user-namespace
  // adjustments to the resulting mount list.
  //
  // An error is returned when tmpfs options cannot be merged, when a requested
  // shared/slave propagation is not satisfied by the source mount, or when the
  // locked mount flags of a bind source cannot be determined.
  func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
  	// Set of destinations supplied through mounts.
  	userMounts := make(map[string]struct{})
  	for _, m := range mounts {
  		userMounts[m.Destination] = struct{}{}
  	}
  	// Copy all mounts from spec to defaultMounts, except for
  	// - mounts overridden by a user supplied mount;
  	// - all mounts under /dev if a user supplied /dev is present;
  	// - /dev/shm, in case IpcMode is none.
  	// While at it, also
  	// - set size for /dev/shm from shmsize.
  	//
  	// defaultMounts reuses the backing array of s.Mounts (in-place filter).
  	defaultMounts := s.Mounts[:0]
  	_, mountDev := userMounts["/dev"]
  	for _, m := range s.Mounts {
  		if _, ok := userMounts[m.Destination]; ok {
  			// filter out mount overridden by a user supplied mount
  			continue
  		}
  		if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
  			// filter out everything under /dev if /dev is user-mounted
  			continue
  		}
  		if m.Destination == "/dev/shm" {
  			if c.HostConfig.IpcMode.IsNone() {
  				// filter out /dev/shm for "none" IpcMode
  				continue
  			}
  			// set size for /dev/shm mount from spec
  			sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
  			m.Options = append(m.Options, sizeOpt)
  		}
  		defaultMounts = append(defaultMounts, m)
  	}
  	s.Mounts = defaultMounts
  	for _, m := range mounts {
  		if m.Source == "tmpfs" {
  			// tmpfs mount: start from the default options and let the mount's
  			// own comma-separated data be merged on top of them.
  			data := m.Data
  			parser := volumemounts.NewParser("linux")
  			options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
  			if data != "" {
  				options = append(options, strings.Split(data, ",")...)
  			}
  			merged, err := mount.MergeTmpfsOptions(options)
  			if err != nil {
  				return err
  			}
  			s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
  			continue
  		}
  		// Everything else is added as a bind mount.
  		mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
  		// Determine property of RootPropagation based on volume
  		// properties. If a volume is shared, then keep root propagation
  		// shared. This should work for slave and private volumes too.
  		//
  		// For slave volumes, it can be either [r]shared/[r]slave.
  		//
  		// For private volumes any root propagation value should work.
  		//
  		// pFlag is 0 when m.Propagation is empty or not a known mode.
  		pFlag := mountPropagationMap[m.Propagation]
  		switch pFlag {
  		case mount.SHARED, mount.RSHARED:
  			if err := ensureShared(m.Source); err != nil {
  				return err
  			}
  			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  			if rootpg != mount.SHARED && rootpg != mount.RSHARED {
  				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
  			}
  		case mount.SLAVE, mount.RSLAVE:
  			var fallback bool
  			if err := ensureSharedOrSlave(m.Source); err != nil {
  				// For backwards compatibility purposes, treat mounts from the daemon root
  				// as special since we automatically add rslave propagation to these mounts
  				// when the user did not set anything, so we should fallback to the old
  				// behavior which is to use private propagation which is normally the
  				// default.
  				if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
  					return err
  				}
  				cm, ok := c.MountPoints[m.Destination]
  				if !ok {
  					return err
  				}
  				if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
  					// This means the user explicitly set a propagation, do not fallback in that case.
  					return err
  				}
  				fallback = true
  				logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
  			}
  			if !fallback {
  				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  				if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
  					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
  				}
  			}
  		}
  		// Recursive bind unless the mount explicitly asks for a non-recursive one.
  		bindMode := "rbind"
  		if m.NonRecursive {
  			bindMode = "bind"
  		}
  		opts := []string{bindMode}
  		if !m.Writable {
  			opts = append(opts, "ro")
  		}
  		if pFlag != 0 {
  			opts = append(opts, mountPropagationReverseMap[pFlag])
  		}
  		// If we are using user namespaces, then we must make sure that we
  		// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
  		// "mount" when we bind-mount. The reason for this is that at the point
  		// when runc sets up the root filesystem, it is already inside a user
  		// namespace, and thus cannot change any flags that are locked.
  		if daemon.configStore.RemappedRoot != "" {
  			unprivOpts, err := getUnprivilegedMountFlags(m.Source)
  			if err != nil {
  				return err
  			}
  			opts = append(opts, unprivOpts...)
  		}
  		mt.Options = opts
  		s.Mounts = append(s.Mounts, mt)
  	}
  	if s.Root.Readonly {
  		// Read-only rootfs: mark every mount that is neither user supplied nor
  		// one of the destinations listed below as "ro" as well.
  		for i, m := range s.Mounts {
  			switch m.Destination {
  			case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
  				continue
  			}
  			if _, ok := userMounts[m.Destination]; !ok {
  				if !inSlice(m.Options, "ro") {
  					s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
  				}
  			}
  		}
  	}
  	if c.HostConfig.Privileged {
  		// clear readonly for /sys
  		for i := range s.Mounts {
  			if s.Mounts[i].Destination == "/sys" {
  				clearReadOnly(&s.Mounts[i])
  			}
  		}
  		// Privileged containers get no read-only or masked paths.
  		s.Linux.ReadonlyPaths = nil
  		s.Linux.MaskedPaths = nil
  	}
  	// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
  	// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
  	if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
  		for i, m := range s.Mounts {
  			if m.Type == "cgroup" {
  				clearReadOnly(&s.Mounts[i])
  			}
  		}
  	}
  	return nil
  }
  // populateCommonSpec fills in the parts of the spec s that derive directly
  // from the container configuration: the root filesystem, process arguments
  // (optionally wrapped by the init binary), working directory, environment,
  // terminal setting, hostname and the NIS domainname sysctl.
  //
  // It returns an error if the container has no BaseFS, if linked containers or
  // the working directory cannot be set up, or if the init binary is required
  // but cannot be located.
  func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error {
  	if c.BaseFS == nil {
  		return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
  	}
  	linkedEnv, err := daemon.setupLinkedContainers(c)
  	if err != nil {
  		return err
  	}
  	s.Root = &specs.Root{
  		Path:     c.BaseFS.Path(),
  		Readonly: c.HostConfig.ReadonlyRootfs,
  	}
  	if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
  		return err
  	}
  	// Default to "/" when no working directory is configured.
  	cwd := c.Config.WorkingDir
  	if len(cwd) == 0 {
  		cwd = "/"
  	}
  	s.Process.Args = append([]string{c.Path}, c.Args...)
  	// only add the custom init if it is specified and the container is running in its
  	// own private pid namespace. It does not make sense to add if it is running in the
  	// host namespace or another container's pid namespace where we already have an init
  	if c.HostConfig.PidMode.IsPrivate() {
  		// The per-container Init setting wins; the daemon-wide setting is only
  		// consulted when the container does not specify one.
  		if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
  			(c.HostConfig.Init == nil && daemon.configStore.Init) {
  			s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
  			// Use the configured init path, or look the default binary up in PATH.
  			path := daemon.configStore.InitPath
  			if path == "" {
  				path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
  				if err != nil {
  					return err
  				}
  			}
  			// Bind-mount the init binary read-only into the container.
  			s.Mounts = append(s.Mounts, specs.Mount{
  				Destination: inContainerInitPath,
  				Type:        "bind",
  				Source:      path,
  				Options:     []string{"bind", "ro"},
  			})
  		}
  	}
  	s.Process.Cwd = cwd
  	s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
  	s.Process.Terminal = c.Config.Tty
  	s.Hostname = c.Config.Hostname
  	// There isn't a field in the OCI for the NIS domainname, but luckily there
  	// is a sysctl which has an identical effect to setdomainname(2) so there's
  	// no explicit need for runtime support.
  	s.Linux.Sysctl = make(map[string]string)
  	if c.Config.Domainname != "" {
  		s.Linux.Sysctl["kernel.domainname"] = c.Config.Domainname
  	}
  	return nil
  }
  // createSpec builds the OCI runtime spec for container c.
  //
  // Starting from oci.DefaultSpec, it populates the common fields, computes the
  // cgroups path, and then applies resources, sysctls, devices, rlimits, user,
  // namespaces, capabilities, seccomp, mounts, the libnetwork prestart hook,
  // AppArmor/SELinux settings, masked/read-only paths and, for a rootless
  // daemon, the rootless conversion.
  //
  // On error a nil spec is returned. The error is the named result err so that
  // the deferred cleanup of the secret directory can observe it.
  func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
  	s := oci.DefaultSpec()
  	if err := daemon.populateCommonSpec(&s, c); err != nil {
  		return nil, err
  	}
  	// Work out the cgroup parent: the container's setting wins over the
  	// daemon's, which wins over the built-in default for the cgroup driver.
  	var cgroupsPath string
  	scopePrefix := "docker"
  	parent := "/docker"
  	useSystemd := UsingSystemd(daemon.configStore)
  	if useSystemd {
  		parent = "system.slice"
  	}
  	if c.HostConfig.CgroupParent != "" {
  		parent = c.HostConfig.CgroupParent
  	} else if daemon.configStore.CgroupParent != "" {
  		parent = daemon.configStore.CgroupParent
  	}
  	if useSystemd {
  		// systemd form: "<parent>:<prefix>:<container id>".
  		cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
  		logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
  	} else {
  		cgroupsPath = filepath.Join(parent, c.ID)
  	}
  	s.Linux.CgroupsPath = cgroupsPath
  	if err := setResources(&s, c.HostConfig.Resources); err != nil {
  		return nil, fmt.Errorf("linux runtime spec resources: %v", err)
  	}
  	// We merge the sysctls injected above with the HostConfig (latter takes
  	// precedence for backwards-compatibility reasons).
  	for k, v := range c.HostConfig.Sysctls {
  		s.Linux.Sysctl[k] = v
  	}
  	p := s.Linux.CgroupsPath
  	if useSystemd {
  		initPath, err := cgroups.GetInitCgroup("cpu")
  		if err != nil {
  			return nil, err
  		}
  		// Only the error of this call is used; the returned path is discarded.
  		_, err = cgroups.GetOwnCgroup("cpu")
  		if err != nil {
  			return nil, err
  		}
  		p = filepath.Join(initPath, s.Linux.CgroupsPath)
  	}
  	// Clean path to guard against things like ../../../BAD
  	parentPath := filepath.Dir(p)
  	if !filepath.IsAbs(parentPath) {
  		parentPath = filepath.Clean("/" + parentPath)
  	}
  	if err := daemon.initCgroupsPath(parentPath); err != nil {
  		return nil, fmt.Errorf("linux init cgroups path: %v", err)
  	}
  	if err := setDevices(&s, c); err != nil {
  		return nil, fmt.Errorf("linux runtime spec devices: %v", err)
  	}
  	if err := daemon.setRlimits(&s, c); err != nil {
  		return nil, fmt.Errorf("linux runtime spec rlimits: %v", err)
  	}
  	if err := setUser(&s, c); err != nil {
  		return nil, fmt.Errorf("linux spec user: %v", err)
  	}
  	if err := setNamespaces(daemon, &s, c); err != nil {
  		return nil, fmt.Errorf("linux spec namespaces: %v", err)
  	}
  	capabilities, err := caps.TweakCapabilities(oci.DefaultCapabilities(), c.HostConfig.CapAdd, c.HostConfig.CapDrop, c.HostConfig.Capabilities, c.HostConfig.Privileged)
  	if err != nil {
  		return nil, fmt.Errorf("linux spec capabilities: %v", err)
  	}
  	if err := oci.SetCapabilities(&s, capabilities); err != nil {
  		return nil, fmt.Errorf("linux spec capabilities: %v", err)
  	}
  	if err := setSeccomp(daemon, &s, c); err != nil {
  		return nil, fmt.Errorf("linux seccomp: %v", err)
  	}
  	if err := daemon.setupContainerMountsRoot(c); err != nil {
  		return nil, err
  	}
  	if err := daemon.setupIpcDirs(c); err != nil {
  		return nil, err
  	}
  	// err is the named result, so this cleanup runs for any error returned
  	// from this point on (including a failure of setupSecretDir itself).
  	defer func() {
  		if err != nil {
  			daemon.cleanupSecretDir(c)
  		}
  	}()
  	if err := daemon.setupSecretDir(c); err != nil {
  		return nil, err
  	}
  	// Collect every container mount: volumes/binds, IPC mounts (unless the IPC
  	// mode is private or empty), tmpfs mounts and secret mounts.
  	ms, err := daemon.setupMounts(c)
  	if err != nil {
  		return nil, err
  	}
  	if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
  		ms = append(ms, c.IpcMounts()...)
  	}
  	tmpfsMounts, err := c.TmpfsMounts()
  	if err != nil {
  		return nil, err
  	}
  	ms = append(ms, tmpfsMounts...)
  	secretMounts, err := c.SecretMounts()
  	if err != nil {
  		return nil, err
  	}
  	ms = append(ms, secretMounts...)
  	sort.Sort(mounts(ms))
  	if err := setMounts(daemon, &s, c, ms); err != nil {
  		return nil, fmt.Errorf("linux mounts: %v", err)
  	}
  	// For a network namespace created for this container (no path to join),
  	// install a prestart hook that re-executes the daemon binary via
  	// /proc/<pid>/exe with the "libnetwork-setkey" arguments.
  	for _, ns := range s.Linux.Namespaces {
  		if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
  			target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
  			s.Hooks = &specs.Hooks{
  				Prestart: []specs.Hook{{
  					Path: target,
  					Args: []string{"libnetwork-setkey", "-exec-root=" + daemon.configStore.GetExecRoot(), c.ID, daemon.netController.ID()},
  				}},
  			}
  		}
  	}
  	if apparmor.IsEnabled() {
  		// Profile precedence: explicit container profile, then "unconfined" for
  		// privileged containers, then the default profile.
  		var appArmorProfile string
  		if c.AppArmorProfile != "" {
  			appArmorProfile = c.AppArmorProfile
  		} else if c.HostConfig.Privileged {
  			appArmorProfile = "unconfined"
  		} else {
  			appArmorProfile = "docker-default"
  		}
  		if appArmorProfile == "docker-default" {
  			// Unattended upgrades and other fun services can unload AppArmor
  			// profiles inadvertently. Since we cannot store our profile in
  			// /etc/apparmor.d, nor can we practically add other ways of
  			// telling the system to keep our profile loaded, in order to make
  			// sure that we keep the default profile enabled we dynamically
  			// reload it if necessary.
  			if err := ensureDefaultAppArmorProfile(); err != nil {
  				return nil, err
  			}
  		}
  		s.Process.ApparmorProfile = appArmorProfile
  	}
  	s.Process.SelinuxLabel = c.GetProcessLabel()
  	s.Process.NoNewPrivileges = c.NoNewPrivileges
  	s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj
  	s.Linux.MountLabel = c.MountLabel
  	// Set the masked and readonly paths with regard to the host config options if they are set.
  	if c.HostConfig.MaskedPaths != nil {
  		s.Linux.MaskedPaths = c.HostConfig.MaskedPaths
  	}
  	if c.HostConfig.ReadonlyPaths != nil {
  		s.Linux.ReadonlyPaths = c.HostConfig.ReadonlyPaths
  	}
  	if daemon.configStore.Rootless {
  		if err := specconv.ToRootless(&s); err != nil {
  			return nil, err
  		}
  	}
  	return &s, nil
  }
  795. func clearReadOnly(m *specs.Mount) {
  796. var opt []string
  797. for _, o := range m.Options {
  798. if o != "ro" {
  799. opt = append(opt, o)
  800. }
  801. }
  802. m.Options = opt
  803. }
  804. // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  805. func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  806. ulimits := c.Ulimits
  807. // Merge ulimits with daemon defaults
  808. ulIdx := make(map[string]struct{})
  809. for _, ul := range ulimits {
  810. ulIdx[ul.Name] = struct{}{}
  811. }
  812. for name, ul := range daemon.configStore.Ulimits {
  813. if _, exists := ulIdx[name]; !exists {
  814. ulimits = append(ulimits, ul)
  815. }
  816. }
  817. c.Ulimits = ulimits
  818. }