oci_linux.go 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066
  1. package daemon // import "github.com/docker/docker/daemon"
  2. import (
  3. "context"
  4. "fmt"
  5. "os"
  6. "os/exec"
  7. "path/filepath"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. cdcgroups "github.com/containerd/cgroups"
  12. "github.com/containerd/containerd/containers"
  13. coci "github.com/containerd/containerd/oci"
  14. "github.com/containerd/containerd/pkg/apparmor"
  15. "github.com/containerd/containerd/pkg/userns"
  16. containertypes "github.com/docker/docker/api/types/container"
  17. "github.com/docker/docker/container"
  18. dconfig "github.com/docker/docker/daemon/config"
  19. "github.com/docker/docker/oci"
  20. "github.com/docker/docker/oci/caps"
  21. "github.com/docker/docker/pkg/idtools"
  22. "github.com/docker/docker/pkg/stringid"
  23. "github.com/docker/docker/rootless/specconv"
  24. volumemounts "github.com/docker/docker/volume/mounts"
  25. "github.com/moby/sys/mount"
  26. "github.com/moby/sys/mountinfo"
  27. "github.com/opencontainers/runc/libcontainer/cgroups"
  28. "github.com/opencontainers/runc/libcontainer/user"
  29. specs "github.com/opencontainers/runtime-spec/specs-go"
  30. "github.com/pkg/errors"
  31. "github.com/sirupsen/logrus"
  32. "golang.org/x/sys/unix"
  33. )
  // inContainerInitPath is the path inside the container at which the init
  // binary is bind-mounted and from which it is executed (see
  // WithCommonOptions).
  const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
  35. // WithRlimits sets the container's rlimits along with merging the daemon's rlimits
  36. func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
  37. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  38. var rlimits []specs.POSIXRlimit
  39. // We want to leave the original HostConfig alone so make a copy here
  40. hostConfig := *c.HostConfig
  41. // Merge with the daemon defaults
  42. daemon.mergeUlimits(&hostConfig)
  43. for _, ul := range hostConfig.Ulimits {
  44. rlimits = append(rlimits, specs.POSIXRlimit{
  45. Type: "RLIMIT_" + strings.ToUpper(ul.Name),
  46. Soft: uint64(ul.Soft),
  47. Hard: uint64(ul.Hard),
  48. })
  49. }
  50. s.Process.Rlimits = rlimits
  51. return nil
  52. }
  53. }
  54. // WithLibnetwork sets the libnetwork hook
  55. func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
  56. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  57. if s.Hooks == nil {
  58. s.Hooks = &specs.Hooks{}
  59. }
  60. for _, ns := range s.Linux.Namespaces {
  61. if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
  62. target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
  63. shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
  64. s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
  65. Path: target,
  66. Args: []string{
  67. "libnetwork-setkey",
  68. "-exec-root=" + daemon.configStore.GetExecRoot(),
  69. c.ID,
  70. shortNetCtlrID,
  71. },
  72. })
  73. }
  74. }
  75. return nil
  76. }
  77. }
  78. // WithRootless sets the spec to the rootless configuration
  79. func WithRootless(daemon *Daemon) coci.SpecOpts {
  80. return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  81. var v2Controllers []string
  82. if daemon.getCgroupDriver() == cgroupSystemdDriver {
  83. if cdcgroups.Mode() != cdcgroups.Unified {
  84. return errors.New("rootless systemd driver doesn't support cgroup v1")
  85. }
  86. rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
  87. if rootlesskitParentEUID == "" {
  88. return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
  89. }
  90. euid, err := strconv.Atoi(rootlesskitParentEUID)
  91. if err != nil {
  92. return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
  93. }
  94. controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
  95. controllersFile, err := os.ReadFile(controllersPath)
  96. if err != nil {
  97. return err
  98. }
  99. v2Controllers = strings.Fields(string(controllersFile))
  100. }
  101. return specconv.ToRootless(s, v2Controllers)
  102. }
  103. }
  104. // WithOOMScore sets the oom score
  105. func WithOOMScore(score *int) coci.SpecOpts {
  106. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  107. s.Process.OOMScoreAdj = score
  108. return nil
  109. }
  110. }
  111. // WithSelinux sets the selinux labels
  112. func WithSelinux(c *container.Container) coci.SpecOpts {
  113. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  114. s.Process.SelinuxLabel = c.GetProcessLabel()
  115. s.Linux.MountLabel = c.MountLabel
  116. return nil
  117. }
  118. }
  119. // WithApparmor sets the apparmor profile
  120. func WithApparmor(c *container.Container) coci.SpecOpts {
  121. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  122. if apparmor.HostSupports() {
  123. var appArmorProfile string
  124. if c.AppArmorProfile != "" {
  125. appArmorProfile = c.AppArmorProfile
  126. } else if c.HostConfig.Privileged {
  127. appArmorProfile = unconfinedAppArmorProfile
  128. } else {
  129. appArmorProfile = defaultAppArmorProfile
  130. }
  131. if appArmorProfile == defaultAppArmorProfile {
  132. // Unattended upgrades and other fun services can unload AppArmor
  133. // profiles inadvertently. Since we cannot store our profile in
  134. // /etc/apparmor.d, nor can we practically add other ways of
  135. // telling the system to keep our profile loaded, in order to make
  136. // sure that we keep the default profile enabled we dynamically
  137. // reload it if necessary.
  138. if err := ensureDefaultAppArmorProfile(); err != nil {
  139. return err
  140. }
  141. }
  142. s.Process.ApparmorProfile = appArmorProfile
  143. }
  144. return nil
  145. }
  146. }
  147. // WithCapabilities sets the container's capabilties
  148. func WithCapabilities(c *container.Container) coci.SpecOpts {
  149. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  150. capabilities, err := caps.TweakCapabilities(
  151. caps.DefaultCapabilities(),
  152. c.HostConfig.CapAdd,
  153. c.HostConfig.CapDrop,
  154. c.HostConfig.Privileged,
  155. )
  156. if err != nil {
  157. return err
  158. }
  159. return oci.SetCapabilities(s, capabilities)
  160. }
  161. }
  162. func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
  163. p, err := getPath()
  164. if err != nil {
  165. return "", err
  166. }
  167. return c.GetResourcePath(p)
  168. }
  169. func getUser(c *container.Container, username string) (specs.User, error) {
  170. var usr specs.User
  171. passwdPath, err := resourcePath(c, user.GetPasswdPath)
  172. if err != nil {
  173. return usr, err
  174. }
  175. groupPath, err := resourcePath(c, user.GetGroupPath)
  176. if err != nil {
  177. return usr, err
  178. }
  179. execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
  180. if err != nil {
  181. return usr, err
  182. }
  183. usr.UID = uint32(execUser.Uid)
  184. usr.GID = uint32(execUser.Gid)
  185. var addGroups []int
  186. if len(c.HostConfig.GroupAdd) > 0 {
  187. addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
  188. if err != nil {
  189. return usr, err
  190. }
  191. }
  192. for _, g := range append(execUser.Sgids, addGroups...) {
  193. usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
  194. }
  195. return usr, nil
  196. }
  197. func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
  198. for i, n := range s.Linux.Namespaces {
  199. if n.Type == ns.Type {
  200. s.Linux.Namespaces[i] = ns
  201. return
  202. }
  203. }
  204. s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
  205. }
  206. // WithNamespaces sets the container's namespaces
  207. func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
  208. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  209. userNS := false
  210. // user
  211. if c.HostConfig.UsernsMode.IsPrivate() {
  212. uidMap := daemon.idMapping.UIDMaps
  213. if uidMap != nil {
  214. userNS = true
  215. ns := specs.LinuxNamespace{Type: "user"}
  216. setNamespace(s, ns)
  217. s.Linux.UIDMappings = specMapping(uidMap)
  218. s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
  219. }
  220. }
  221. // network
  222. if !c.Config.NetworkDisabled {
  223. ns := specs.LinuxNamespace{Type: "network"}
  224. parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
  225. if parts[0] == "container" {
  226. nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
  227. if err != nil {
  228. return err
  229. }
  230. ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
  231. if userNS {
  232. // to share a net namespace, they must also share a user namespace
  233. nsUser := specs.LinuxNamespace{Type: "user"}
  234. nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
  235. setNamespace(s, nsUser)
  236. }
  237. } else if c.HostConfig.NetworkMode.IsHost() {
  238. ns.Path = c.NetworkSettings.SandboxKey
  239. }
  240. setNamespace(s, ns)
  241. }
  242. // ipc
  243. ipcMode := c.HostConfig.IpcMode
  244. switch {
  245. case ipcMode.IsContainer():
  246. ns := specs.LinuxNamespace{Type: "ipc"}
  247. ic, err := daemon.getIpcContainer(ipcMode.Container())
  248. if err != nil {
  249. return err
  250. }
  251. ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
  252. setNamespace(s, ns)
  253. if userNS {
  254. // to share an IPC namespace, they must also share a user namespace
  255. nsUser := specs.LinuxNamespace{Type: "user"}
  256. nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
  257. setNamespace(s, nsUser)
  258. }
  259. case ipcMode.IsHost():
  260. oci.RemoveNamespace(s, "ipc")
  261. case ipcMode.IsEmpty():
  262. // A container was created by an older version of the daemon.
  263. // The default behavior used to be what is now called "shareable".
  264. fallthrough
  265. case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
  266. ns := specs.LinuxNamespace{Type: "ipc"}
  267. setNamespace(s, ns)
  268. default:
  269. return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
  270. }
  271. // pid
  272. if c.HostConfig.PidMode.IsContainer() {
  273. pc, err := daemon.getPidContainer(c)
  274. if err != nil {
  275. return err
  276. }
  277. ns := specs.LinuxNamespace{
  278. Type: "pid",
  279. Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
  280. }
  281. setNamespace(s, ns)
  282. if userNS {
  283. // to share a PID namespace, they must also share a user namespace
  284. nsUser := specs.LinuxNamespace{
  285. Type: "user",
  286. Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
  287. }
  288. setNamespace(s, nsUser)
  289. }
  290. } else if c.HostConfig.PidMode.IsHost() {
  291. oci.RemoveNamespace(s, "pid")
  292. } else {
  293. ns := specs.LinuxNamespace{Type: "pid"}
  294. setNamespace(s, ns)
  295. }
  296. // uts
  297. if c.HostConfig.UTSMode.IsHost() {
  298. oci.RemoveNamespace(s, "uts")
  299. s.Hostname = ""
  300. }
  301. // cgroup
  302. if !c.HostConfig.CgroupnsMode.IsEmpty() {
  303. cgroupNsMode := c.HostConfig.CgroupnsMode
  304. if !cgroupNsMode.Valid() {
  305. return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode)
  306. }
  307. if cgroupNsMode.IsPrivate() {
  308. nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
  309. setNamespace(s, nsCgroup)
  310. }
  311. }
  312. return nil
  313. }
  314. }
  315. func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
  316. var ids []specs.LinuxIDMapping
  317. for _, item := range s {
  318. ids = append(ids, specs.LinuxIDMapping{
  319. HostID: uint32(item.HostID),
  320. ContainerID: uint32(item.ContainerID),
  321. Size: uint32(item.Size),
  322. })
  323. }
  324. return ids
  325. }
  326. // Get the source mount point of directory passed in as argument. Also return
  327. // optional fields.
  328. func getSourceMount(source string) (string, string, error) {
  329. // Ensure any symlinks are resolved.
  330. sourcePath, err := filepath.EvalSymlinks(source)
  331. if err != nil {
  332. return "", "", err
  333. }
  334. mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
  335. if err != nil {
  336. return "", "", err
  337. }
  338. if len(mi) < 1 {
  339. return "", "", fmt.Errorf("Can't find mount point of %s", source)
  340. }
  341. // find the longest mount point
  342. var idx, maxlen int
  343. for i := range mi {
  344. if len(mi[i].Mountpoint) > maxlen {
  345. maxlen = len(mi[i].Mountpoint)
  346. idx = i
  347. }
  348. }
  349. return mi[idx].Mountpoint, mi[idx].Optional, nil
  350. }
  // Prefixes looked for in a mount's optional fields (as returned by
  // getSourceMount) to determine its propagation.
  const (
  	// sharedPropagationOption is checked by ensureShared and ensureSharedOrSlave.
  	sharedPropagationOption = "shared:"
  	// slavePropagationOption is checked by ensureSharedOrSlave.
  	slavePropagationOption = "master:"
  )
  // hasMountInfoOption reports whether any space-separated field of opts
  // starts with one of the given vals.
  func hasMountInfoOption(opts string, vals ...string) bool {
  	fields := strings.Split(opts, " ")
  	for i := range fields {
  		for j := range vals {
  			if strings.HasPrefix(fields[i], vals[j]) {
  				return true
  			}
  		}
  	}
  	return false
  }
  367. // Ensure mount point on which path is mounted, is shared.
  368. func ensureShared(path string) error {
  369. sourceMount, optionalOpts, err := getSourceMount(path)
  370. if err != nil {
  371. return err
  372. }
  373. // Make sure source mount point is shared.
  374. if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
  375. return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
  376. }
  377. return nil
  378. }
  379. // Ensure mount point on which path is mounted, is either shared or slave.
  380. func ensureSharedOrSlave(path string) error {
  381. sourceMount, optionalOpts, err := getSourceMount(path)
  382. if err != nil {
  383. return err
  384. }
  385. if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
  386. return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
  387. }
  388. return nil
  389. }
  390. // Get the set of mount flags that are set on the mount that contains the given
  391. // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
  392. // bind-mounting "with options" will not fail with user namespaces, due to
  393. // kernel restrictions that require user namespace mounts to preserve
  394. // CL_UNPRIVILEGED locked flags.
  395. func getUnprivilegedMountFlags(path string) ([]string, error) {
  396. var statfs unix.Statfs_t
  397. if err := unix.Statfs(path, &statfs); err != nil {
  398. return nil, err
  399. }
  400. // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
  401. unprivilegedFlags := map[uint64]string{
  402. unix.MS_RDONLY: "ro",
  403. unix.MS_NODEV: "nodev",
  404. unix.MS_NOEXEC: "noexec",
  405. unix.MS_NOSUID: "nosuid",
  406. unix.MS_NOATIME: "noatime",
  407. unix.MS_RELATIME: "relatime",
  408. unix.MS_NODIRATIME: "nodiratime",
  409. }
  410. var flags []string
  411. for mask, flag := range unprivilegedFlags {
  412. if uint64(statfs.Flags)&mask == mask {
  413. flags = append(flags, flag)
  414. }
  415. }
  416. return flags, nil
  417. }
  var (
  	// mountPropagationMap maps a propagation mode name to its mount flag.
  	// Names that are not listed yield the zero value on lookup.
  	mountPropagationMap = map[string]int{
  		"private":  mount.PRIVATE,
  		"rprivate": mount.RPRIVATE,
  		"shared":   mount.SHARED,
  		"rshared":  mount.RSHARED,
  		"slave":    mount.SLAVE,
  		"rslave":   mount.RSLAVE,
  	}

  	// mountPropagationReverseMap is the inverse of mountPropagationMap: it
  	// maps a mount flag back to its propagation mode name.
  	mountPropagationReverseMap = map[int]string{
  		mount.PRIVATE:  "private",
  		mount.RPRIVATE: "rprivate",
  		mount.SHARED:   "shared",
  		mount.RSHARED:  "rshared",
  		mount.SLAVE:    "slave",
  		mount.RSLAVE:   "rslave",
  	}
  )
  // inSlice reports whether s is an element of slice. The comparison is
  // exact, and therefore case sensitive.
  func inSlice(slice []string, s string) bool {
  	for i := 0; i < len(slice); i++ {
  		if slice[i] == s {
  			return true
  		}
  	}
  	return false
  }
  446. // WithMounts sets the container's mounts
  447. func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
  448. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
  449. if err := daemon.setupContainerMountsRoot(c); err != nil {
  450. return err
  451. }
  452. if err := daemon.setupIpcDirs(c); err != nil {
  453. return err
  454. }
  455. defer func() {
  456. if err != nil {
  457. daemon.cleanupSecretDir(c)
  458. }
  459. }()
  460. if err := daemon.setupSecretDir(c); err != nil {
  461. return err
  462. }
  463. ms, err := daemon.setupMounts(c)
  464. if err != nil {
  465. return err
  466. }
  467. if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
  468. ms = append(ms, c.IpcMounts()...)
  469. }
  470. tmpfsMounts, err := c.TmpfsMounts()
  471. if err != nil {
  472. return err
  473. }
  474. ms = append(ms, tmpfsMounts...)
  475. secretMounts, err := c.SecretMounts()
  476. if err != nil {
  477. return err
  478. }
  479. ms = append(ms, secretMounts...)
  480. sort.Sort(mounts(ms))
  481. mounts := ms
  482. userMounts := make(map[string]struct{})
  483. for _, m := range mounts {
  484. userMounts[m.Destination] = struct{}{}
  485. }
  486. // Copy all mounts from spec to defaultMounts, except for
  487. // - mounts overridden by a user supplied mount;
  488. // - all mounts under /dev if a user supplied /dev is present;
  489. // - /dev/shm, in case IpcMode is none.
  490. // While at it, also
  491. // - set size for /dev/shm from shmsize.
  492. defaultMounts := s.Mounts[:0]
  493. _, mountDev := userMounts["/dev"]
  494. for _, m := range s.Mounts {
  495. if _, ok := userMounts[m.Destination]; ok {
  496. // filter out mount overridden by a user supplied mount
  497. continue
  498. }
  499. if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
  500. // filter out everything under /dev if /dev is user-mounted
  501. continue
  502. }
  503. if m.Destination == "/dev/shm" {
  504. if c.HostConfig.IpcMode.IsNone() {
  505. // filter out /dev/shm for "none" IpcMode
  506. continue
  507. }
  508. // set size for /dev/shm mount from spec
  509. sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
  510. m.Options = append(m.Options, sizeOpt)
  511. }
  512. defaultMounts = append(defaultMounts, m)
  513. }
  514. s.Mounts = defaultMounts
  515. for _, m := range mounts {
  516. if m.Source == "tmpfs" {
  517. data := m.Data
  518. parser := volumemounts.NewParser()
  519. options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
  520. if data != "" {
  521. options = append(options, strings.Split(data, ",")...)
  522. }
  523. merged, err := mount.MergeTmpfsOptions(options)
  524. if err != nil {
  525. return err
  526. }
  527. s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
  528. continue
  529. }
  530. mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
  531. // Determine property of RootPropagation based on volume
  532. // properties. If a volume is shared, then keep root propagation
  533. // shared. This should work for slave and private volumes too.
  534. //
  535. // For slave volumes, it can be either [r]shared/[r]slave.
  536. //
  537. // For private volumes any root propagation value should work.
  538. pFlag := mountPropagationMap[m.Propagation]
  539. switch pFlag {
  540. case mount.SHARED, mount.RSHARED:
  541. if err := ensureShared(m.Source); err != nil {
  542. return err
  543. }
  544. rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  545. if rootpg != mount.SHARED && rootpg != mount.RSHARED {
  546. s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
  547. }
  548. case mount.SLAVE, mount.RSLAVE:
  549. var fallback bool
  550. if err := ensureSharedOrSlave(m.Source); err != nil {
  551. // For backwards compatibility purposes, treat mounts from the daemon root
  552. // as special since we automatically add rslave propagation to these mounts
  553. // when the user did not set anything, so we should fallback to the old
  554. // behavior which is to use private propagation which is normally the
  555. // default.
  556. if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
  557. return err
  558. }
  559. cm, ok := c.MountPoints[m.Destination]
  560. if !ok {
  561. return err
  562. }
  563. if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
  564. // This means the user explicitly set a propagation, do not fallback in that case.
  565. return err
  566. }
  567. fallback = true
  568. logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
  569. }
  570. if !fallback {
  571. rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  572. if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
  573. s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
  574. }
  575. }
  576. }
  577. bindMode := "rbind"
  578. if m.NonRecursive {
  579. bindMode = "bind"
  580. }
  581. opts := []string{bindMode}
  582. if !m.Writable {
  583. opts = append(opts, "ro")
  584. }
  585. if pFlag != 0 {
  586. opts = append(opts, mountPropagationReverseMap[pFlag])
  587. }
  588. // If we are using user namespaces, then we must make sure that we
  589. // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
  590. // "mount" when we bind-mount. The reason for this is that at the point
  591. // when runc sets up the root filesystem, it is already inside a user
  592. // namespace, and thus cannot change any flags that are locked.
  593. if daemon.configStore.RemappedRoot != "" || userns.RunningInUserNS() {
  594. unprivOpts, err := getUnprivilegedMountFlags(m.Source)
  595. if err != nil {
  596. return err
  597. }
  598. opts = append(opts, unprivOpts...)
  599. }
  600. mt.Options = opts
  601. s.Mounts = append(s.Mounts, mt)
  602. }
  603. if s.Root.Readonly {
  604. for i, m := range s.Mounts {
  605. switch m.Destination {
  606. case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
  607. continue
  608. }
  609. if _, ok := userMounts[m.Destination]; !ok {
  610. if !inSlice(m.Options, "ro") {
  611. s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
  612. }
  613. }
  614. }
  615. }
  616. if c.HostConfig.Privileged {
  617. // clear readonly for /sys
  618. for i := range s.Mounts {
  619. if s.Mounts[i].Destination == "/sys" {
  620. clearReadOnly(&s.Mounts[i])
  621. }
  622. }
  623. s.Linux.ReadonlyPaths = nil
  624. s.Linux.MaskedPaths = nil
  625. }
  626. // TODO: until a kernel/mount solution exists for handling remount in a user namespace,
  627. // we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
  628. if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
  629. for i, m := range s.Mounts {
  630. if m.Type == "cgroup" {
  631. clearReadOnly(&s.Mounts[i])
  632. }
  633. }
  634. }
  635. return nil
  636. }
  637. }
  // sysctlExists reports whether the sysctl named s has an entry under
  // /proc/sys. runc will error if we add any sysctl that does not actually
  // exist, so callers use this to skip defaults on kernels that lack them.
  func sysctlExists(s string) bool {
  	rel := strings.ReplaceAll(s, ".", "/")
  	if _, err := os.Stat(filepath.Join("/proc", "sys", rel)); err != nil {
  		return false
  	}
  	return true
  }
  645. // WithCommonOptions sets common docker options
  646. func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
  647. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  648. if c.BaseFS == nil {
  649. return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
  650. }
  651. linkedEnv, err := daemon.setupLinkedContainers(c)
  652. if err != nil {
  653. return err
  654. }
  655. s.Root = &specs.Root{
  656. Path: c.BaseFS.Path(),
  657. Readonly: c.HostConfig.ReadonlyRootfs,
  658. }
  659. if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
  660. return err
  661. }
  662. cwd := c.Config.WorkingDir
  663. if len(cwd) == 0 {
  664. cwd = "/"
  665. }
  666. s.Process.Args = append([]string{c.Path}, c.Args...)
  667. // only add the custom init if it is specified and the container is running in its
  668. // own private pid namespace. It does not make sense to add if it is running in the
  669. // host namespace or another container's pid namespace where we already have an init
  670. if c.HostConfig.PidMode.IsPrivate() {
  671. if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
  672. (c.HostConfig.Init == nil && daemon.configStore.Init) {
  673. s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
  674. path := daemon.configStore.InitPath
  675. if path == "" {
  676. path, err = exec.LookPath(dconfig.DefaultInitBinary)
  677. if err != nil {
  678. return err
  679. }
  680. }
  681. s.Mounts = append(s.Mounts, specs.Mount{
  682. Destination: inContainerInitPath,
  683. Type: "bind",
  684. Source: path,
  685. Options: []string{"bind", "ro"},
  686. })
  687. }
  688. }
  689. s.Process.Cwd = cwd
  690. s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
  691. s.Process.Terminal = c.Config.Tty
  692. s.Hostname = c.Config.Hostname
  693. setLinuxDomainname(c, s)
  694. // Add default sysctls that are generally safe and useful; currently we
  695. // grant the capabilities to allow these anyway. You can override if
  696. // you want to restore the original behaviour.
  697. // We do not set network sysctls if network namespace is host, or if we are
  698. // joining an existing namespace, only if we create a new net namespace.
  699. if c.HostConfig.NetworkMode.IsPrivate() {
  700. // We cannot set up ping socket support in a user namespace
  701. userNS := daemon.configStore.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
  702. if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
  703. // allow unprivileged ICMP echo sockets without CAP_NET_RAW
  704. s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
  705. }
  706. // allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
  707. if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
  708. s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
  709. }
  710. }
  711. return nil
  712. }
  713. }
  714. // WithCgroups sets the container's cgroups
  715. func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
  716. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  717. var cgroupsPath string
  718. scopePrefix := "docker"
  719. parent := "/docker"
  720. useSystemd := UsingSystemd(daemon.configStore)
  721. if useSystemd {
  722. parent = "system.slice"
  723. if daemon.configStore.Rootless {
  724. parent = "user.slice"
  725. }
  726. }
  727. if c.HostConfig.CgroupParent != "" {
  728. parent = c.HostConfig.CgroupParent
  729. } else if daemon.configStore.CgroupParent != "" {
  730. parent = daemon.configStore.CgroupParent
  731. }
  732. if useSystemd {
  733. cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
  734. logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
  735. } else {
  736. cgroupsPath = filepath.Join(parent, c.ID)
  737. }
  738. s.Linux.CgroupsPath = cgroupsPath
  739. // the rest is only needed for CPU RT controller
  740. if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 {
  741. return nil
  742. }
  743. p := cgroupsPath
  744. if useSystemd {
  745. initPath, err := cgroups.GetInitCgroup("cpu")
  746. if err != nil {
  747. return errors.Wrap(err, "unable to init CPU RT controller")
  748. }
  749. _, err = cgroups.GetOwnCgroup("cpu")
  750. if err != nil {
  751. return errors.Wrap(err, "unable to init CPU RT controller")
  752. }
  753. p = filepath.Join(initPath, s.Linux.CgroupsPath)
  754. }
  755. // Clean path to guard against things like ../../../BAD
  756. parentPath := filepath.Dir(p)
  757. if !filepath.IsAbs(parentPath) {
  758. parentPath = filepath.Clean("/" + parentPath)
  759. }
  760. mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
  761. if err != nil {
  762. return errors.Wrap(err, "unable to init CPU RT controller")
  763. }
  764. // When docker is run inside docker, the root is based of the host cgroup.
  765. // Should this be handled in runc/libcontainer/cgroups ?
  766. if strings.HasPrefix(root, "/docker/") {
  767. root = "/"
  768. }
  769. mnt = filepath.Join(mnt, root)
  770. if err := daemon.initCPURtController(mnt, parentPath); err != nil {
  771. return errors.Wrap(err, "unable to init CPU RT controller")
  772. }
  773. return nil
  774. }
  775. }
  776. // WithDevices sets the container's devices
  777. func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
  778. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  779. // Build lists of devices allowed and created within the container.
  780. var devs []specs.LinuxDevice
  781. devPermissions := s.Linux.Resources.Devices
  782. if c.HostConfig.Privileged {
  783. hostDevices, err := coci.HostDevices()
  784. if err != nil {
  785. return err
  786. }
  787. devs = append(devs, hostDevices...)
  788. // adding device mappings in privileged containers
  789. for _, deviceMapping := range c.HostConfig.Devices {
  790. // issue a warning that custom cgroup permissions are ignored in privileged mode
  791. if deviceMapping.CgroupPermissions != "rwm" {
  792. logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
  793. }
  794. // issue a warning that the device path already exists via /dev mounting in privileged mode
  795. if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
  796. logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
  797. continue
  798. }
  799. d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
  800. if err != nil {
  801. return err
  802. }
  803. devs = append(devs, d...)
  804. }
  805. devPermissions = []specs.LinuxDeviceCgroup{
  806. {
  807. Allow: true,
  808. Access: "rwm",
  809. },
  810. }
  811. } else {
  812. for _, deviceMapping := range c.HostConfig.Devices {
  813. d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
  814. if err != nil {
  815. return err
  816. }
  817. devs = append(devs, d...)
  818. devPermissions = append(devPermissions, dPermissions...)
  819. }
  820. var err error
  821. devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
  822. if err != nil {
  823. return err
  824. }
  825. }
  826. s.Linux.Devices = append(s.Linux.Devices, devs...)
  827. s.Linux.Resources.Devices = devPermissions
  828. for _, req := range c.HostConfig.DeviceRequests {
  829. if err := daemon.handleDevice(req, s); err != nil {
  830. return err
  831. }
  832. }
  833. return nil
  834. }
  835. }
  836. // WithResources applies the container resources
  837. func WithResources(c *container.Container) coci.SpecOpts {
  838. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  839. r := c.HostConfig.Resources
  840. weightDevices, err := getBlkioWeightDevices(r)
  841. if err != nil {
  842. return err
  843. }
  844. readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
  845. if err != nil {
  846. return err
  847. }
  848. writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
  849. if err != nil {
  850. return err
  851. }
  852. readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
  853. if err != nil {
  854. return err
  855. }
  856. writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
  857. if err != nil {
  858. return err
  859. }
  860. memoryRes := getMemoryResources(r)
  861. cpuRes, err := getCPUResources(r)
  862. if err != nil {
  863. return err
  864. }
  865. blkioWeight := r.BlkioWeight
  866. specResources := &specs.LinuxResources{
  867. Memory: memoryRes,
  868. CPU: cpuRes,
  869. BlockIO: &specs.LinuxBlockIO{
  870. Weight: &blkioWeight,
  871. WeightDevice: weightDevices,
  872. ThrottleReadBpsDevice: readBpsDevice,
  873. ThrottleWriteBpsDevice: writeBpsDevice,
  874. ThrottleReadIOPSDevice: readIOpsDevice,
  875. ThrottleWriteIOPSDevice: writeIOpsDevice,
  876. },
  877. Pids: getPidsLimit(r),
  878. }
  879. if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
  880. specResources.Devices = s.Linux.Resources.Devices
  881. }
  882. s.Linux.Resources = specResources
  883. return nil
  884. }
  885. }
  886. // WithSysctls sets the container's sysctls
  887. func WithSysctls(c *container.Container) coci.SpecOpts {
  888. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  889. // We merge the sysctls injected above with the HostConfig (latter takes
  890. // precedence for backwards-compatibility reasons).
  891. for k, v := range c.HostConfig.Sysctls {
  892. s.Linux.Sysctl[k] = v
  893. }
  894. return nil
  895. }
  896. }
  897. // WithUser sets the container's user
  898. func WithUser(c *container.Container) coci.SpecOpts {
  899. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  900. var err error
  901. s.Process.User, err = getUser(c, c.Config.User)
  902. return err
  903. }
  904. }
  905. func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
  906. var (
  907. opts []coci.SpecOpts
  908. s = oci.DefaultSpec()
  909. )
  910. opts = append(opts,
  911. WithCommonOptions(daemon, c),
  912. WithCgroups(daemon, c),
  913. WithResources(c),
  914. WithSysctls(c),
  915. WithDevices(daemon, c),
  916. WithUser(c),
  917. WithRlimits(daemon, c),
  918. WithNamespaces(daemon, c),
  919. WithCapabilities(c),
  920. WithSeccomp(daemon, c),
  921. WithMounts(daemon, c),
  922. WithLibnetwork(daemon, c),
  923. WithApparmor(c),
  924. WithSelinux(c),
  925. WithOOMScore(&c.HostConfig.OomScoreAdj),
  926. )
  927. if c.NoNewPrivileges {
  928. opts = append(opts, coci.WithNoNewPrivileges)
  929. }
  930. // Set the masked and readonly paths with regard to the host config options if they are set.
  931. if c.HostConfig.MaskedPaths != nil {
  932. opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  933. }
  934. if c.HostConfig.ReadonlyPaths != nil {
  935. opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  936. }
  937. if daemon.configStore.Rootless {
  938. opts = append(opts, WithRootless(daemon))
  939. }
  940. return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
  941. ID: c.ID,
  942. }, &s, opts...)
  943. }
  944. func clearReadOnly(m *specs.Mount) {
  945. var opt []string
  946. for _, o := range m.Options {
  947. if o != "ro" {
  948. opt = append(opt, o)
  949. }
  950. }
  951. m.Options = opt
  952. }
  953. // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  954. func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  955. ulimits := c.Ulimits
  956. // Merge ulimits with daemon defaults
  957. ulIdx := make(map[string]struct{})
  958. for _, ul := range ulimits {
  959. ulIdx[ul.Name] = struct{}{}
  960. }
  961. for name, ul := range daemon.configStore.Ulimits {
  962. if _, exists := ulIdx[name]; !exists {
  963. ulimits = append(ulimits, ul)
  964. }
  965. }
  966. c.Ulimits = ulimits
  967. }