oci_linux.go 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997
  1. package daemon // import "github.com/docker/docker/daemon"
  2. import (
  3. "context"
  4. "fmt"
  5. "io"
  6. "os"
  7. "os/exec"
  8. "path/filepath"
  9. "sort"
  10. "strconv"
  11. "strings"
  12. "github.com/containerd/containerd/containers"
  13. coci "github.com/containerd/containerd/oci"
  14. containertypes "github.com/docker/docker/api/types/container"
  15. "github.com/docker/docker/container"
  16. daemonconfig "github.com/docker/docker/daemon/config"
  17. "github.com/docker/docker/oci"
  18. "github.com/docker/docker/oci/caps"
  19. "github.com/docker/docker/pkg/idtools"
  20. "github.com/docker/docker/pkg/mount"
  21. "github.com/docker/docker/pkg/stringid"
  22. "github.com/docker/docker/rootless/specconv"
  23. volumemounts "github.com/docker/docker/volume/mounts"
  24. "github.com/opencontainers/runc/libcontainer/apparmor"
  25. "github.com/opencontainers/runc/libcontainer/cgroups"
  26. "github.com/opencontainers/runc/libcontainer/devices"
  27. rsystem "github.com/opencontainers/runc/libcontainer/system"
  28. "github.com/opencontainers/runc/libcontainer/user"
  29. specs "github.com/opencontainers/runtime-spec/specs-go"
  30. "github.com/pkg/errors"
  31. "github.com/sirupsen/logrus"
  32. "golang.org/x/sys/unix"
  33. )
// inContainerInitPath is the in-container destination at which the init
// binary is bind-mounted, and the program placed first in the process
// arguments, when a container is started with an init process.
const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary
  35. // WithRlimits sets the container's rlimits along with merging the daemon's rlimits
  36. func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
  37. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  38. var rlimits []specs.POSIXRlimit
  39. // We want to leave the original HostConfig alone so make a copy here
  40. hostConfig := *c.HostConfig
  41. // Merge with the daemon defaults
  42. daemon.mergeUlimits(&hostConfig)
  43. for _, ul := range hostConfig.Ulimits {
  44. rlimits = append(rlimits, specs.POSIXRlimit{
  45. Type: "RLIMIT_" + strings.ToUpper(ul.Name),
  46. Soft: uint64(ul.Soft),
  47. Hard: uint64(ul.Hard),
  48. })
  49. }
  50. s.Process.Rlimits = rlimits
  51. return nil
  52. }
  53. }
  54. // WithLibnetwork sets the libnetwork hook
  55. func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
  56. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  57. if s.Hooks == nil {
  58. s.Hooks = &specs.Hooks{}
  59. }
  60. for _, ns := range s.Linux.Namespaces {
  61. if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
  62. target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
  63. shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
  64. s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
  65. Path: target,
  66. Args: []string{
  67. "libnetwork-setkey",
  68. "-exec-root=" + daemon.configStore.GetExecRoot(),
  69. c.ID,
  70. shortNetCtlrID,
  71. },
  72. })
  73. }
  74. }
  75. return nil
  76. }
  77. }
// WithRootless sets the spec to the rootless configuration.
//
// It is itself a spec option (not a constructor returning one): it passes the
// spec to specconv.ToRootless and returns that call's error unchanged. The
// context, client and container arguments are unused.
func WithRootless(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
	return specconv.ToRootless(s)
}
  82. // WithOOMScore sets the oom score
  83. func WithOOMScore(score *int) coci.SpecOpts {
  84. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  85. s.Process.OOMScoreAdj = score
  86. return nil
  87. }
  88. }
  89. // WithSelinux sets the selinux labels
  90. func WithSelinux(c *container.Container) coci.SpecOpts {
  91. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  92. s.Process.SelinuxLabel = c.GetProcessLabel()
  93. s.Linux.MountLabel = c.MountLabel
  94. return nil
  95. }
  96. }
  97. // WithApparmor sets the apparmor profile
  98. func WithApparmor(c *container.Container) coci.SpecOpts {
  99. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  100. if apparmor.IsEnabled() {
  101. var appArmorProfile string
  102. if c.AppArmorProfile != "" {
  103. appArmorProfile = c.AppArmorProfile
  104. } else if c.HostConfig.Privileged {
  105. appArmorProfile = "unconfined"
  106. } else {
  107. appArmorProfile = "docker-default"
  108. }
  109. if appArmorProfile == "docker-default" {
  110. // Unattended upgrades and other fun services can unload AppArmor
  111. // profiles inadvertently. Since we cannot store our profile in
  112. // /etc/apparmor.d, nor can we practically add other ways of
  113. // telling the system to keep our profile loaded, in order to make
  114. // sure that we keep the default profile enabled we dynamically
  115. // reload it if necessary.
  116. if err := ensureDefaultAppArmorProfile(); err != nil {
  117. return err
  118. }
  119. }
  120. s.Process.ApparmorProfile = appArmorProfile
  121. }
  122. return nil
  123. }
  124. }
  125. // WithCapabilities sets the container's capabilties
  126. func WithCapabilities(c *container.Container) coci.SpecOpts {
  127. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  128. capabilities, err := caps.TweakCapabilities(
  129. oci.DefaultCapabilities(),
  130. c.HostConfig.CapAdd,
  131. c.HostConfig.CapDrop,
  132. c.HostConfig.Capabilities,
  133. c.HostConfig.Privileged,
  134. )
  135. if err != nil {
  136. return err
  137. }
  138. return oci.SetCapabilities(s, capabilities)
  139. }
  140. }
  141. func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
  142. fp, err := c.GetResourcePath(p)
  143. if err != nil {
  144. return nil, err
  145. }
  146. return os.Open(fp)
  147. }
  148. func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
  149. passwdPath, err := user.GetPasswdPath()
  150. if err != nil {
  151. return 0, 0, nil, err
  152. }
  153. groupPath, err := user.GetGroupPath()
  154. if err != nil {
  155. return 0, 0, nil, err
  156. }
  157. passwdFile, err := readUserFile(c, passwdPath)
  158. if err == nil {
  159. defer passwdFile.Close()
  160. }
  161. groupFile, err := readUserFile(c, groupPath)
  162. if err == nil {
  163. defer groupFile.Close()
  164. }
  165. execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
  166. if err != nil {
  167. return 0, 0, nil, err
  168. }
  169. // todo: fix this double read by a change to libcontainer/user pkg
  170. groupFile, err = readUserFile(c, groupPath)
  171. if err == nil {
  172. defer groupFile.Close()
  173. }
  174. var addGroups []int
  175. if len(c.HostConfig.GroupAdd) > 0 {
  176. addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
  177. if err != nil {
  178. return 0, 0, nil, err
  179. }
  180. }
  181. uid := uint32(execUser.Uid)
  182. gid := uint32(execUser.Gid)
  183. sgids := append(execUser.Sgids, addGroups...)
  184. var additionalGids []uint32
  185. for _, g := range sgids {
  186. additionalGids = append(additionalGids, uint32(g))
  187. }
  188. return uid, gid, additionalGids, nil
  189. }
  190. func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
  191. for i, n := range s.Linux.Namespaces {
  192. if n.Type == ns.Type {
  193. s.Linux.Namespaces[i] = ns
  194. return
  195. }
  196. }
  197. s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
  198. }
// WithNamespaces sets the container's namespaces
//
// The returned spec option configures, in order, the user, network, ipc, pid,
// uts and cgroup namespaces from the container's HostConfig. Whenever a
// namespace is joined from another container while user namespaces are in
// use, the user namespace entry is replaced by one pointing at that other
// container's user namespace; the last such join wins because setNamespace
// overwrites the existing entry of the same type.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// userNS records whether a private user namespace with id mappings
		// was configured; the sections below consult it.
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDs()
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			// A network mode of the form "container:<ref>" joins the network
			// namespace of the referenced container.
			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
			if parts[0] == "container" {
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				ns.Path = c.NetworkSettings.SandboxKey
			}
			// In every other mode the path stays empty.
			setNamespace(s, ns)
		}
		// ipc
		ipcMode := c.HostConfig.IpcMode
		switch {
		case ipcMode.IsContainer():
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			// Host mode: drop the ipc namespace entry from the spec.
			oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			// All of these get an ipc namespace entry with no path.
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		default:
			return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
		}
		// pid
		if c.HostConfig.PidMode.IsContainer() {
			ns := specs.LinuxNamespace{Type: "pid"}
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
		} else {
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts
		if c.HostConfig.UTSMode.IsHost() {
			// Host UTS mode: remove the uts namespace entry and clear the
			// hostname in the spec.
			oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
			s.Hostname = ""
		}
		// cgroup
		if !c.HostConfig.CgroupnsMode.IsEmpty() {
			cgroupNsMode := c.HostConfig.CgroupnsMode
			if !cgroupNsMode.Valid() {
				return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode)
			}
			// A cgroup namespace entry is only added for private mode on
			// non-privileged containers.
			if cgroupNsMode.IsPrivate() && !c.HostConfig.Privileged {
				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
				setNamespace(s, nsCgroup)
			}
		}
		return nil
	}
}
  304. func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
  305. var ids []specs.LinuxIDMapping
  306. for _, item := range s {
  307. ids = append(ids, specs.LinuxIDMapping{
  308. HostID: uint32(item.HostID),
  309. ContainerID: uint32(item.ContainerID),
  310. Size: uint32(item.Size),
  311. })
  312. }
  313. return ids
  314. }
  315. // Get the source mount point of directory passed in as argument. Also return
  316. // optional fields.
  317. func getSourceMount(source string) (string, string, error) {
  318. // Ensure any symlinks are resolved.
  319. sourcePath, err := filepath.EvalSymlinks(source)
  320. if err != nil {
  321. return "", "", err
  322. }
  323. mi, err := mount.GetMounts(mount.ParentsFilter(sourcePath))
  324. if err != nil {
  325. return "", "", err
  326. }
  327. if len(mi) < 1 {
  328. return "", "", fmt.Errorf("Can't find mount point of %s", source)
  329. }
  330. // find the longest mount point
  331. var idx, maxlen int
  332. for i := range mi {
  333. if len(mi[i].Mountpoint) > maxlen {
  334. maxlen = len(mi[i].Mountpoint)
  335. idx = i
  336. }
  337. }
  338. return mi[idx].Mountpoint, mi[idx].Optional, nil
  339. }
// Prefixes matched by hasMountinfoOption against a mount's optional fields
// to determine how the mount propagates.
const (
	// sharedPropagationOption is the prefix that marks a shared mount.
	sharedPropagationOption = "shared:"
	// slavePropagationOption is the prefix that marks a slave mount; note
	// that the field itself is spelled "master:".
	slavePropagationOption = "master:"
)
  344. // hasMountinfoOption checks if any of the passed any of the given option values
  345. // are set in the passed in option string.
  346. func hasMountinfoOption(opts string, vals ...string) bool {
  347. for _, opt := range strings.Split(opts, " ") {
  348. for _, val := range vals {
  349. if strings.HasPrefix(opt, val) {
  350. return true
  351. }
  352. }
  353. }
  354. return false
  355. }
  356. // Ensure mount point on which path is mounted, is shared.
  357. func ensureShared(path string) error {
  358. sourceMount, optionalOpts, err := getSourceMount(path)
  359. if err != nil {
  360. return err
  361. }
  362. // Make sure source mount point is shared.
  363. if !hasMountinfoOption(optionalOpts, sharedPropagationOption) {
  364. return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
  365. }
  366. return nil
  367. }
  368. // Ensure mount point on which path is mounted, is either shared or slave.
  369. func ensureSharedOrSlave(path string) error {
  370. sourceMount, optionalOpts, err := getSourceMount(path)
  371. if err != nil {
  372. return err
  373. }
  374. if !hasMountinfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
  375. return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
  376. }
  377. return nil
  378. }
  379. // Get the set of mount flags that are set on the mount that contains the given
  380. // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
  381. // bind-mounting "with options" will not fail with user namespaces, due to
  382. // kernel restrictions that require user namespace mounts to preserve
  383. // CL_UNPRIVILEGED locked flags.
  384. func getUnprivilegedMountFlags(path string) ([]string, error) {
  385. var statfs unix.Statfs_t
  386. if err := unix.Statfs(path, &statfs); err != nil {
  387. return nil, err
  388. }
  389. // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
  390. unprivilegedFlags := map[uint64]string{
  391. unix.MS_RDONLY: "ro",
  392. unix.MS_NODEV: "nodev",
  393. unix.MS_NOEXEC: "noexec",
  394. unix.MS_NOSUID: "nosuid",
  395. unix.MS_NOATIME: "noatime",
  396. unix.MS_RELATIME: "relatime",
  397. unix.MS_NODIRATIME: "nodiratime",
  398. }
  399. var flags []string
  400. for mask, flag := range unprivilegedFlags {
  401. if uint64(statfs.Flags)&mask == mask {
  402. flags = append(flags, flag)
  403. }
  404. }
  405. return flags, nil
  406. }
var (
	// mountPropagationMap translates a propagation mode name into the
	// corresponding mount package flag. Looking up an unknown (or empty)
	// name yields 0.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap: it
	// translates a mount package flag back into its propagation mode name.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
  425. // inSlice tests whether a string is contained in a slice of strings or not.
  426. // Comparison is case sensitive
  427. func inSlice(slice []string, s string) bool {
  428. for _, ss := range slice {
  429. if s == ss {
  430. return true
  431. }
  432. }
  433. return false
  434. }
// WithMounts sets the container's mounts
//
// The returned spec option prepares the container's mount root, IPC
// directories and secret directory, collects the container's mounts (plus
// IPC, tmpfs and secret mounts), filters the spec's default mounts against
// them, and then appends each collected mount to the spec as a tmpfs or bind
// mount. Finally it applies read-only rootfs, privileged and user-namespace
// adjustments to the resulting mount list.
//
// The error result is named so that the deferred function can see it: the
// secret directory is cleaned up whenever the option returns a non-nil error
// after the defer has been registered.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// Registered before setupSecretDir so that a failure there, or in
		// any later step, triggers the cleanup.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		// IPC mounts are added for every IPC mode except private and empty.
		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		sort.Sort(mounts(ms))

		// NOTE(review): from here on the local variable "mounts" shadows the
		// identifier used for the conversion in the sort call above.
		mounts := ms

		// userMounts is the set of destinations covered by the collected
		// mounts.
		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		//  - mounts overridden by a user supplied mount;
		//  - all mounts under /dev if a user supplied /dev is present;
		//  - /dev/shm, in case IpcMode is none.
		// While at it, also
		//  - set size for /dev/shm from shmsize.
		//
		// defaultMounts reuses the backing array of s.Mounts (in-place
		// filter).
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				// tmpfs mounts start from a fixed option set plus the
				// parser's default propagation mode; options carried in
				// m.Data are appended and the whole list is merged.
				data := m.Data
				parser := volumemounts.NewParser("linux")
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			// Everything else becomes a bind mount.
			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			// Binds are recursive unless the mount asks otherwise.
			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			// pFlag is 0 when m.Propagation is not a key of
			// mountPropagationMap; no propagation option is added then.
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		// With a read-only rootfs, every spec mount that is neither one of
		// the destinations skipped below nor covered by a collected mount
		// gets the "ro" option, unless it already has it.
		if s.Root.Readonly {
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			// Privileged containers get no read-only or masked paths.
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil
	}
}
  627. // WithCommonOptions sets common docker options
  628. func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
  629. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  630. if c.BaseFS == nil {
  631. return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
  632. }
  633. linkedEnv, err := daemon.setupLinkedContainers(c)
  634. if err != nil {
  635. return err
  636. }
  637. s.Root = &specs.Root{
  638. Path: c.BaseFS.Path(),
  639. Readonly: c.HostConfig.ReadonlyRootfs,
  640. }
  641. if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
  642. return err
  643. }
  644. cwd := c.Config.WorkingDir
  645. if len(cwd) == 0 {
  646. cwd = "/"
  647. }
  648. s.Process.Args = append([]string{c.Path}, c.Args...)
  649. // only add the custom init if it is specified and the container is running in its
  650. // own private pid namespace. It does not make sense to add if it is running in the
  651. // host namespace or another container's pid namespace where we already have an init
  652. if c.HostConfig.PidMode.IsPrivate() {
  653. if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
  654. (c.HostConfig.Init == nil && daemon.configStore.Init) {
  655. s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
  656. path := daemon.configStore.InitPath
  657. if path == "" {
  658. path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
  659. if err != nil {
  660. return err
  661. }
  662. }
  663. s.Mounts = append(s.Mounts, specs.Mount{
  664. Destination: inContainerInitPath,
  665. Type: "bind",
  666. Source: path,
  667. Options: []string{"bind", "ro"},
  668. })
  669. }
  670. }
  671. s.Process.Cwd = cwd
  672. s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
  673. s.Process.Terminal = c.Config.Tty
  674. s.Hostname = c.Config.Hostname
  675. setLinuxDomainname(c, s)
  676. return nil
  677. }
  678. }
  679. // WithCgroups sets the container's cgroups
  680. func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
  681. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  682. var cgroupsPath string
  683. scopePrefix := "docker"
  684. parent := "/docker"
  685. useSystemd := UsingSystemd(daemon.configStore)
  686. if useSystemd {
  687. parent = "system.slice"
  688. }
  689. if c.HostConfig.CgroupParent != "" {
  690. parent = c.HostConfig.CgroupParent
  691. } else if daemon.configStore.CgroupParent != "" {
  692. parent = daemon.configStore.CgroupParent
  693. }
  694. if useSystemd {
  695. cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
  696. logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
  697. } else {
  698. cgroupsPath = filepath.Join(parent, c.ID)
  699. }
  700. s.Linux.CgroupsPath = cgroupsPath
  701. p := cgroupsPath
  702. if useSystemd {
  703. initPath, err := cgroups.GetInitCgroup("cpu")
  704. if err != nil {
  705. return err
  706. }
  707. _, err = cgroups.GetOwnCgroup("cpu")
  708. if err != nil {
  709. return err
  710. }
  711. p = filepath.Join(initPath, s.Linux.CgroupsPath)
  712. }
  713. // Clean path to guard against things like ../../../BAD
  714. parentPath := filepath.Dir(p)
  715. if !filepath.IsAbs(parentPath) {
  716. parentPath = filepath.Clean("/" + parentPath)
  717. }
  718. if err := daemon.initCgroupsPath(parentPath); err != nil {
  719. return fmt.Errorf("linux init cgroups path: %v", err)
  720. }
  721. return nil
  722. }
  723. }
  724. // WithDevices sets the container's devices
  725. func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
  726. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  727. // Build lists of devices allowed and created within the container.
  728. var devs []specs.LinuxDevice
  729. devPermissions := s.Linux.Resources.Devices
  730. if c.HostConfig.Privileged && !rsystem.RunningInUserNS() {
  731. hostDevices, err := devices.HostDevices()
  732. if err != nil {
  733. return err
  734. }
  735. for _, d := range hostDevices {
  736. devs = append(devs, oci.Device(d))
  737. }
  738. devPermissions = []specs.LinuxDeviceCgroup{
  739. {
  740. Allow: true,
  741. Access: "rwm",
  742. },
  743. }
  744. } else {
  745. for _, deviceMapping := range c.HostConfig.Devices {
  746. d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
  747. if err != nil {
  748. return err
  749. }
  750. devs = append(devs, d...)
  751. devPermissions = append(devPermissions, dPermissions...)
  752. }
  753. var err error
  754. devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
  755. if err != nil {
  756. return err
  757. }
  758. }
  759. s.Linux.Devices = append(s.Linux.Devices, devs...)
  760. s.Linux.Resources.Devices = devPermissions
  761. for _, req := range c.HostConfig.DeviceRequests {
  762. if err := daemon.handleDevice(req, s); err != nil {
  763. return err
  764. }
  765. }
  766. return nil
  767. }
  768. }
  769. // WithResources applies the container resources
  770. func WithResources(c *container.Container) coci.SpecOpts {
  771. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  772. r := c.HostConfig.Resources
  773. weightDevices, err := getBlkioWeightDevices(r)
  774. if err != nil {
  775. return err
  776. }
  777. readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
  778. if err != nil {
  779. return err
  780. }
  781. writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
  782. if err != nil {
  783. return err
  784. }
  785. readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
  786. if err != nil {
  787. return err
  788. }
  789. writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
  790. if err != nil {
  791. return err
  792. }
  793. memoryRes := getMemoryResources(r)
  794. cpuRes, err := getCPUResources(r)
  795. if err != nil {
  796. return err
  797. }
  798. blkioWeight := r.BlkioWeight
  799. specResources := &specs.LinuxResources{
  800. Memory: memoryRes,
  801. CPU: cpuRes,
  802. BlockIO: &specs.LinuxBlockIO{
  803. Weight: &blkioWeight,
  804. WeightDevice: weightDevices,
  805. ThrottleReadBpsDevice: readBpsDevice,
  806. ThrottleWriteBpsDevice: writeBpsDevice,
  807. ThrottleReadIOPSDevice: readIOpsDevice,
  808. ThrottleWriteIOPSDevice: writeIOpsDevice,
  809. },
  810. Pids: getPidsLimit(r),
  811. }
  812. if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
  813. specResources.Devices = s.Linux.Resources.Devices
  814. }
  815. s.Linux.Resources = specResources
  816. return nil
  817. }
  818. }
  819. // WithSysctls sets the container's sysctls
  820. func WithSysctls(c *container.Container) coci.SpecOpts {
  821. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  822. // We merge the sysctls injected above with the HostConfig (latter takes
  823. // precedence for backwards-compatibility reasons).
  824. for k, v := range c.HostConfig.Sysctls {
  825. s.Linux.Sysctl[k] = v
  826. }
  827. return nil
  828. }
  829. }
  830. // WithUser sets the container's user
  831. func WithUser(c *container.Container) coci.SpecOpts {
  832. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  833. uid, gid, additionalGids, err := getUser(c, c.Config.User)
  834. if err != nil {
  835. return err
  836. }
  837. s.Process.User.UID = uid
  838. s.Process.User.GID = gid
  839. s.Process.User.AdditionalGids = additionalGids
  840. return nil
  841. }
  842. }
  843. func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
  844. var (
  845. opts []coci.SpecOpts
  846. s = oci.DefaultSpec()
  847. )
  848. opts = append(opts,
  849. WithCommonOptions(daemon, c),
  850. WithCgroups(daemon, c),
  851. WithResources(c),
  852. WithSysctls(c),
  853. WithDevices(daemon, c),
  854. WithUser(c),
  855. WithRlimits(daemon, c),
  856. WithNamespaces(daemon, c),
  857. WithCapabilities(c),
  858. WithSeccomp(daemon, c),
  859. WithMounts(daemon, c),
  860. WithLibnetwork(daemon, c),
  861. WithApparmor(c),
  862. WithSelinux(c),
  863. WithOOMScore(&c.HostConfig.OomScoreAdj),
  864. )
  865. if c.NoNewPrivileges {
  866. opts = append(opts, coci.WithNoNewPrivileges)
  867. }
  868. // Set the masked and readonly paths with regard to the host config options if they are set.
  869. if c.HostConfig.MaskedPaths != nil {
  870. opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  871. }
  872. if c.HostConfig.ReadonlyPaths != nil {
  873. opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  874. }
  875. if daemon.configStore.Rootless {
  876. opts = append(opts, WithRootless)
  877. }
  878. return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
  879. ID: c.ID,
  880. }, &s, opts...)
  881. }
  882. func clearReadOnly(m *specs.Mount) {
  883. var opt []string
  884. for _, o := range m.Options {
  885. if o != "ro" {
  886. opt = append(opt, o)
  887. }
  888. }
  889. m.Options = opt
  890. }
  891. // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  892. func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  893. ulimits := c.Ulimits
  894. // Merge ulimits with daemon defaults
  895. ulIdx := make(map[string]struct{})
  896. for _, ul := range ulimits {
  897. ulIdx[ul.Name] = struct{}{}
  898. }
  899. for name, ul := range daemon.configStore.Ulimits {
  900. if _, exists := ulIdx[name]; !exists {
  901. ulimits = append(ulimits, ul)
  902. }
  903. }
  904. c.Ulimits = ulimits
  905. }