oci_linux.go 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150
  1. package daemon // import "github.com/docker/docker/daemon"
  2. import (
  3. "context"
  4. "fmt"
  5. "os"
  6. "path/filepath"
  7. "sort"
  8. "strconv"
  9. "strings"
  10. cdcgroups "github.com/containerd/cgroups/v3"
  11. "github.com/containerd/containerd/containers"
  12. "github.com/containerd/containerd/log"
  13. coci "github.com/containerd/containerd/oci"
  14. "github.com/containerd/containerd/pkg/apparmor"
  15. "github.com/containerd/containerd/pkg/userns"
  16. containertypes "github.com/docker/docker/api/types/container"
  17. "github.com/docker/docker/container"
  18. dconfig "github.com/docker/docker/daemon/config"
  19. "github.com/docker/docker/errdefs"
  20. "github.com/docker/docker/oci"
  21. "github.com/docker/docker/oci/caps"
  22. "github.com/docker/docker/pkg/idtools"
  23. "github.com/docker/docker/pkg/rootless/specconv"
  24. "github.com/docker/docker/pkg/stringid"
  25. volumemounts "github.com/docker/docker/volume/mounts"
  26. "github.com/moby/sys/mount"
  27. "github.com/moby/sys/mountinfo"
  28. "github.com/opencontainers/runc/libcontainer/cgroups"
  29. "github.com/opencontainers/runc/libcontainer/user"
  30. specs "github.com/opencontainers/runtime-spec/specs-go"
  31. "github.com/pkg/errors"
  32. "golang.org/x/sys/unix"
  33. )
// inContainerInitPath is the path inside the container at which the init
// binary is bind-mounted ("/sbin/" followed by dconfig.DefaultInitBinary).
// withCommonOptions uses it both as the mount destination and as the first
// element of the process arguments when an init process is requested.
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
  35. // withRlimits sets the container's rlimits along with merging the daemon's rlimits
  36. func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  37. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  38. var rlimits []specs.POSIXRlimit
  39. // We want to leave the original HostConfig alone so make a copy here
  40. hostConfig := *c.HostConfig
  41. // Merge with the daemon defaults
  42. daemon.mergeUlimits(&hostConfig, daemonCfg)
  43. for _, ul := range hostConfig.Ulimits {
  44. rlimits = append(rlimits, specs.POSIXRlimit{
  45. Type: "RLIMIT_" + strings.ToUpper(ul.Name),
  46. Soft: uint64(ul.Soft),
  47. Hard: uint64(ul.Hard),
  48. })
  49. }
  50. if s.Process == nil {
  51. s.Process = &specs.Process{}
  52. }
  53. s.Process.Rlimits = rlimits
  54. return nil
  55. }
  56. }
  57. // withLibnetwork sets the libnetwork hook
  58. func withLibnetwork(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  59. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  60. if s.Hooks == nil {
  61. s.Hooks = &specs.Hooks{}
  62. }
  63. for _, ns := range s.Linux.Namespaces {
  64. if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
  65. target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
  66. shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
  67. s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
  68. Path: target,
  69. Args: []string{
  70. "libnetwork-setkey",
  71. "-exec-root=" + daemonCfg.GetExecRoot(),
  72. c.ID,
  73. shortNetCtlrID,
  74. },
  75. })
  76. }
  77. }
  78. return nil
  79. }
  80. }
  81. // withRootless sets the spec to the rootless configuration
  82. func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
  83. return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  84. var v2Controllers []string
  85. if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
  86. if cdcgroups.Mode() != cdcgroups.Unified {
  87. return errors.New("rootless systemd driver doesn't support cgroup v1")
  88. }
  89. rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
  90. if rootlesskitParentEUID == "" {
  91. return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
  92. }
  93. euid, err := strconv.Atoi(rootlesskitParentEUID)
  94. if err != nil {
  95. return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
  96. }
  97. controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
  98. controllersFile, err := os.ReadFile(controllersPath)
  99. if err != nil {
  100. return err
  101. }
  102. v2Controllers = strings.Fields(string(controllersFile))
  103. }
  104. return specconv.ToRootless(s, v2Controllers)
  105. }
  106. }
  107. // WithOOMScore sets the oom score
  108. func WithOOMScore(score *int) coci.SpecOpts {
  109. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  110. if s.Process == nil {
  111. s.Process = &specs.Process{}
  112. }
  113. s.Process.OOMScoreAdj = score
  114. return nil
  115. }
  116. }
  117. // WithSelinux sets the selinux labels
  118. func WithSelinux(c *container.Container) coci.SpecOpts {
  119. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  120. if s.Process == nil {
  121. s.Process = &specs.Process{}
  122. }
  123. if s.Linux == nil {
  124. s.Linux = &specs.Linux{}
  125. }
  126. s.Process.SelinuxLabel = c.GetProcessLabel()
  127. s.Linux.MountLabel = c.MountLabel
  128. return nil
  129. }
  130. }
  131. // WithApparmor sets the apparmor profile
  132. func WithApparmor(c *container.Container) coci.SpecOpts {
  133. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  134. if apparmor.HostSupports() {
  135. var appArmorProfile string
  136. if c.AppArmorProfile != "" {
  137. appArmorProfile = c.AppArmorProfile
  138. } else if c.HostConfig.Privileged {
  139. appArmorProfile = unconfinedAppArmorProfile
  140. } else {
  141. appArmorProfile = defaultAppArmorProfile
  142. }
  143. if appArmorProfile == defaultAppArmorProfile {
  144. // Unattended upgrades and other fun services can unload AppArmor
  145. // profiles inadvertently. Since we cannot store our profile in
  146. // /etc/apparmor.d, nor can we practically add other ways of
  147. // telling the system to keep our profile loaded, in order to make
  148. // sure that we keep the default profile enabled we dynamically
  149. // reload it if necessary.
  150. if err := ensureDefaultAppArmorProfile(); err != nil {
  151. return err
  152. }
  153. }
  154. if s.Process == nil {
  155. s.Process = &specs.Process{}
  156. }
  157. s.Process.ApparmorProfile = appArmorProfile
  158. }
  159. return nil
  160. }
  161. }
  162. // WithCapabilities sets the container's capabilties
  163. func WithCapabilities(c *container.Container) coci.SpecOpts {
  164. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  165. capabilities, err := caps.TweakCapabilities(
  166. caps.DefaultCapabilities(),
  167. c.HostConfig.CapAdd,
  168. c.HostConfig.CapDrop,
  169. c.HostConfig.Privileged,
  170. )
  171. if err != nil {
  172. return err
  173. }
  174. return oci.SetCapabilities(s, capabilities)
  175. }
  176. }
  177. func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
  178. p, err := getPath()
  179. if err != nil {
  180. return "", err
  181. }
  182. return c.GetResourcePath(p)
  183. }
  184. func getUser(c *container.Container, username string) (specs.User, error) {
  185. var usr specs.User
  186. passwdPath, err := resourcePath(c, user.GetPasswdPath)
  187. if err != nil {
  188. return usr, err
  189. }
  190. groupPath, err := resourcePath(c, user.GetGroupPath)
  191. if err != nil {
  192. return usr, err
  193. }
  194. execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
  195. if err != nil {
  196. return usr, err
  197. }
  198. usr.UID = uint32(execUser.Uid)
  199. usr.GID = uint32(execUser.Gid)
  200. usr.AdditionalGids = []uint32{usr.GID}
  201. var addGroups []int
  202. if len(c.HostConfig.GroupAdd) > 0 {
  203. addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
  204. if err != nil {
  205. return usr, err
  206. }
  207. }
  208. for _, g := range append(execUser.Sgids, addGroups...) {
  209. usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
  210. }
  211. return usr, nil
  212. }
  213. func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
  214. if s.Linux == nil {
  215. s.Linux = &specs.Linux{}
  216. }
  217. for i, n := range s.Linux.Namespaces {
  218. if n.Type == ns.Type {
  219. s.Linux.Namespaces[i] = ns
  220. return
  221. }
  222. }
  223. s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
  224. }
  225. // WithNamespaces sets the container's namespaces
  226. func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
  227. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  228. userNS := false
  229. // user
  230. if c.HostConfig.UsernsMode.IsPrivate() {
  231. uidMap := daemon.idMapping.UIDMaps
  232. if uidMap != nil {
  233. userNS = true
  234. ns := specs.LinuxNamespace{Type: "user"}
  235. setNamespace(s, ns)
  236. s.Linux.UIDMappings = specMapping(uidMap)
  237. s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
  238. }
  239. }
  240. // network
  241. if !c.Config.NetworkDisabled {
  242. ns := specs.LinuxNamespace{Type: "network"}
  243. if c.HostConfig.NetworkMode.IsContainer() {
  244. nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
  245. if err != nil {
  246. return err
  247. }
  248. ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
  249. if userNS {
  250. // to share a net namespace, they must also share a user namespace
  251. nsUser := specs.LinuxNamespace{Type: "user"}
  252. nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
  253. setNamespace(s, nsUser)
  254. }
  255. } else if c.HostConfig.NetworkMode.IsHost() {
  256. ns.Path = c.NetworkSettings.SandboxKey
  257. }
  258. setNamespace(s, ns)
  259. }
  260. // ipc
  261. ipcMode := c.HostConfig.IpcMode
  262. if !ipcMode.Valid() {
  263. return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
  264. }
  265. switch {
  266. case ipcMode.IsContainer():
  267. ns := specs.LinuxNamespace{Type: "ipc"}
  268. ic, err := daemon.getIpcContainer(ipcMode.Container())
  269. if err != nil {
  270. return errdefs.InvalidParameter(errors.Wrapf(err, "invalid IPC mode: %v", ipcMode))
  271. }
  272. ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
  273. setNamespace(s, ns)
  274. if userNS {
  275. // to share an IPC namespace, they must also share a user namespace
  276. nsUser := specs.LinuxNamespace{Type: "user"}
  277. nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
  278. setNamespace(s, nsUser)
  279. }
  280. case ipcMode.IsHost():
  281. oci.RemoveNamespace(s, "ipc")
  282. case ipcMode.IsEmpty():
  283. // A container was created by an older version of the daemon.
  284. // The default behavior used to be what is now called "shareable".
  285. fallthrough
  286. case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
  287. ns := specs.LinuxNamespace{Type: "ipc"}
  288. setNamespace(s, ns)
  289. }
  290. // pid
  291. if !c.HostConfig.PidMode.Valid() {
  292. return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", c.HostConfig.PidMode))
  293. }
  294. if c.HostConfig.PidMode.IsContainer() {
  295. pc, err := daemon.getPidContainer(c)
  296. if err != nil {
  297. return err
  298. }
  299. ns := specs.LinuxNamespace{
  300. Type: "pid",
  301. Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
  302. }
  303. setNamespace(s, ns)
  304. if userNS {
  305. // to share a PID namespace, they must also share a user namespace
  306. nsUser := specs.LinuxNamespace{
  307. Type: "user",
  308. Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
  309. }
  310. setNamespace(s, nsUser)
  311. }
  312. } else if c.HostConfig.PidMode.IsHost() {
  313. oci.RemoveNamespace(s, "pid")
  314. } else {
  315. ns := specs.LinuxNamespace{Type: "pid"}
  316. setNamespace(s, ns)
  317. }
  318. // uts
  319. if !c.HostConfig.UTSMode.Valid() {
  320. return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
  321. }
  322. if c.HostConfig.UTSMode.IsHost() {
  323. oci.RemoveNamespace(s, "uts")
  324. s.Hostname = ""
  325. }
  326. // cgroup
  327. if !c.HostConfig.CgroupnsMode.Valid() {
  328. return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
  329. }
  330. if !c.HostConfig.CgroupnsMode.IsEmpty() {
  331. if c.HostConfig.CgroupnsMode.IsPrivate() {
  332. nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
  333. setNamespace(s, nsCgroup)
  334. }
  335. }
  336. return nil
  337. }
  338. }
  339. func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
  340. var ids []specs.LinuxIDMapping
  341. for _, item := range s {
  342. ids = append(ids, specs.LinuxIDMapping{
  343. HostID: uint32(item.HostID),
  344. ContainerID: uint32(item.ContainerID),
  345. Size: uint32(item.Size),
  346. })
  347. }
  348. return ids
  349. }
  350. // Get the source mount point of directory passed in as argument. Also return
  351. // optional fields.
  352. func getSourceMount(source string) (string, string, error) {
  353. // Ensure any symlinks are resolved.
  354. sourcePath, err := filepath.EvalSymlinks(source)
  355. if err != nil {
  356. return "", "", err
  357. }
  358. mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
  359. if err != nil {
  360. return "", "", err
  361. }
  362. if len(mi) < 1 {
  363. return "", "", fmt.Errorf("Can't find mount point of %s", source)
  364. }
  365. // find the longest mount point
  366. var idx, maxlen int
  367. for i := range mi {
  368. if len(mi[i].Mountpoint) > maxlen {
  369. maxlen = len(mi[i].Mountpoint)
  370. idx = i
  371. }
  372. }
  373. return mi[idx].Mountpoint, mi[idx].Optional, nil
  374. }
// Prefixes looked for in a mount's optional fields (as returned by
// getSourceMount) to determine its propagation type.
const (
	// sharedPropagationOption is the prefix checked by ensureShared and
	// ensureSharedOrSlave to recognise a shared mount.
	sharedPropagationOption = "shared:"
	// slavePropagationOption is the prefix checked by ensureSharedOrSlave to
	// recognise a slave mount.
	slavePropagationOption = "master:"
)
  379. // hasMountInfoOption checks if any of the passed any of the given option values
  380. // are set in the passed in option string.
  381. func hasMountInfoOption(opts string, vals ...string) bool {
  382. for _, opt := range strings.Split(opts, " ") {
  383. for _, val := range vals {
  384. if strings.HasPrefix(opt, val) {
  385. return true
  386. }
  387. }
  388. }
  389. return false
  390. }
  391. // Ensure mount point on which path is mounted, is shared.
  392. func ensureShared(path string) error {
  393. sourceMount, optionalOpts, err := getSourceMount(path)
  394. if err != nil {
  395. return err
  396. }
  397. // Make sure source mount point is shared.
  398. if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
  399. return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
  400. }
  401. return nil
  402. }
  403. // Ensure mount point on which path is mounted, is either shared or slave.
  404. func ensureSharedOrSlave(path string) error {
  405. sourceMount, optionalOpts, err := getSourceMount(path)
  406. if err != nil {
  407. return err
  408. }
  409. if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
  410. return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
  411. }
  412. return nil
  413. }
  414. // Get the set of mount flags that are set on the mount that contains the given
  415. // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
  416. // bind-mounting "with options" will not fail with user namespaces, due to
  417. // kernel restrictions that require user namespace mounts to preserve
  418. // CL_UNPRIVILEGED locked flags.
  419. func getUnprivilegedMountFlags(path string) ([]string, error) {
  420. var statfs unix.Statfs_t
  421. if err := unix.Statfs(path, &statfs); err != nil {
  422. return nil, err
  423. }
  424. // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
  425. unprivilegedFlags := map[uint64]string{
  426. unix.MS_RDONLY: "ro",
  427. unix.MS_NODEV: "nodev",
  428. unix.MS_NOEXEC: "noexec",
  429. unix.MS_NOSUID: "nosuid",
  430. unix.MS_NOATIME: "noatime",
  431. unix.MS_RELATIME: "relatime",
  432. unix.MS_NODIRATIME: "nodiratime",
  433. }
  434. var flags []string
  435. for mask, flag := range unprivilegedFlags {
  436. if uint64(statfs.Flags)&mask == mask {
  437. flags = append(flags, flag)
  438. }
  439. }
  440. return flags, nil
  441. }
var (
	// mountPropagationMap maps a propagation mode name to its mount flag
	// constant. Looking up a name that is not listed yields 0, which
	// withMounts treats as "no propagation option".
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap: it
	// maps a mount flag constant back to its propagation mode name.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
  460. // inSlice tests whether a string is contained in a slice of strings or not.
  461. // Comparison is case sensitive
  462. func inSlice(slice []string, s string) bool {
  463. for _, ss := range slice {
  464. if s == ss {
  465. return true
  466. }
  467. }
  468. return false
  469. }
  470. // withMounts sets the container's mounts
  471. func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container) coci.SpecOpts {
  472. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
  473. if err := daemon.setupContainerMountsRoot(c); err != nil {
  474. return err
  475. }
  476. if err := daemon.setupIpcDirs(c); err != nil {
  477. return err
  478. }
  479. defer func() {
  480. if err != nil {
  481. daemon.cleanupSecretDir(c)
  482. }
  483. }()
  484. if err := daemon.setupSecretDir(c); err != nil {
  485. return err
  486. }
  487. ms, err := daemon.setupMounts(c)
  488. if err != nil {
  489. return err
  490. }
  491. if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
  492. ms = append(ms, c.IpcMounts()...)
  493. }
  494. tmpfsMounts, err := c.TmpfsMounts()
  495. if err != nil {
  496. return err
  497. }
  498. ms = append(ms, tmpfsMounts...)
  499. secretMounts, err := c.SecretMounts()
  500. if err != nil {
  501. return err
  502. }
  503. ms = append(ms, secretMounts...)
  504. sort.Sort(mounts(ms))
  505. mounts := ms
  506. userMounts := make(map[string]struct{})
  507. for _, m := range mounts {
  508. userMounts[m.Destination] = struct{}{}
  509. }
  510. // Copy all mounts from spec to defaultMounts, except for
  511. // - mounts overridden by a user supplied mount;
  512. // - all mounts under /dev if a user supplied /dev is present;
  513. // - /dev/shm, in case IpcMode is none.
  514. // While at it, also
  515. // - set size for /dev/shm from shmsize.
  516. defaultMounts := s.Mounts[:0]
  517. _, mountDev := userMounts["/dev"]
  518. for _, m := range s.Mounts {
  519. if _, ok := userMounts[m.Destination]; ok {
  520. // filter out mount overridden by a user supplied mount
  521. continue
  522. }
  523. if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
  524. // filter out everything under /dev if /dev is user-mounted
  525. continue
  526. }
  527. if m.Destination == "/dev/shm" {
  528. if c.HostConfig.IpcMode.IsNone() {
  529. // filter out /dev/shm for "none" IpcMode
  530. continue
  531. }
  532. // set size for /dev/shm mount from spec
  533. sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
  534. m.Options = append(m.Options, sizeOpt)
  535. }
  536. defaultMounts = append(defaultMounts, m)
  537. }
  538. s.Mounts = defaultMounts
  539. for _, m := range mounts {
  540. if m.Source == "tmpfs" {
  541. data := m.Data
  542. parser := volumemounts.NewParser()
  543. options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
  544. if data != "" {
  545. options = append(options, strings.Split(data, ",")...)
  546. }
  547. merged, err := mount.MergeTmpfsOptions(options)
  548. if err != nil {
  549. return err
  550. }
  551. s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
  552. continue
  553. }
  554. mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
  555. // Determine property of RootPropagation based on volume
  556. // properties. If a volume is shared, then keep root propagation
  557. // shared. This should work for slave and private volumes too.
  558. //
  559. // For slave volumes, it can be either [r]shared/[r]slave.
  560. //
  561. // For private volumes any root propagation value should work.
  562. pFlag := mountPropagationMap[m.Propagation]
  563. switch pFlag {
  564. case mount.SHARED, mount.RSHARED:
  565. if err := ensureShared(m.Source); err != nil {
  566. return err
  567. }
  568. rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  569. if rootpg != mount.SHARED && rootpg != mount.RSHARED {
  570. if s.Linux == nil {
  571. s.Linux = &specs.Linux{}
  572. }
  573. s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
  574. }
  575. case mount.SLAVE, mount.RSLAVE:
  576. var fallback bool
  577. if err := ensureSharedOrSlave(m.Source); err != nil {
  578. // For backwards compatibility purposes, treat mounts from the daemon root
  579. // as special since we automatically add rslave propagation to these mounts
  580. // when the user did not set anything, so we should fallback to the old
  581. // behavior which is to use private propagation which is normally the
  582. // default.
  583. if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
  584. return err
  585. }
  586. cm, ok := c.MountPoints[m.Destination]
  587. if !ok {
  588. return err
  589. }
  590. if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
  591. // This means the user explicitly set a propagation, do not fallback in that case.
  592. return err
  593. }
  594. fallback = true
  595. log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
  596. }
  597. if !fallback {
  598. rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  599. if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
  600. if s.Linux == nil {
  601. s.Linux = &specs.Linux{}
  602. }
  603. s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
  604. }
  605. }
  606. }
  607. bindMode := "rbind"
  608. if m.NonRecursive {
  609. bindMode = "bind"
  610. }
  611. opts := []string{bindMode}
  612. if !m.Writable {
  613. rro := true
  614. if m.ReadOnlyNonRecursive {
  615. rro = false
  616. if m.ReadOnlyForceRecursive {
  617. return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
  618. }
  619. }
  620. if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil {
  621. rro = false
  622. if m.ReadOnlyForceRecursive {
  623. return rroErr
  624. }
  625. }
  626. if rro {
  627. opts = append(opts, "rro")
  628. } else {
  629. opts = append(opts, "ro")
  630. }
  631. }
  632. if pFlag != 0 {
  633. opts = append(opts, mountPropagationReverseMap[pFlag])
  634. }
  635. // If we are using user namespaces, then we must make sure that we
  636. // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
  637. // "mount" when we bind-mount. The reason for this is that at the point
  638. // when runc sets up the root filesystem, it is already inside a user
  639. // namespace, and thus cannot change any flags that are locked.
  640. if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() {
  641. unprivOpts, err := getUnprivilegedMountFlags(m.Source)
  642. if err != nil {
  643. return err
  644. }
  645. opts = append(opts, unprivOpts...)
  646. }
  647. mt.Options = opts
  648. s.Mounts = append(s.Mounts, mt)
  649. }
  650. if s.Root.Readonly {
  651. for i, m := range s.Mounts {
  652. switch m.Destination {
  653. case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
  654. continue
  655. }
  656. if _, ok := userMounts[m.Destination]; !ok {
  657. if !inSlice(m.Options, "ro") {
  658. s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
  659. }
  660. }
  661. }
  662. }
  663. if c.HostConfig.Privileged {
  664. // clear readonly for /sys
  665. for i := range s.Mounts {
  666. if s.Mounts[i].Destination == "/sys" {
  667. clearReadOnly(&s.Mounts[i])
  668. }
  669. }
  670. if s.Linux != nil {
  671. s.Linux.ReadonlyPaths = nil
  672. s.Linux.MaskedPaths = nil
  673. }
  674. }
  675. // TODO: until a kernel/mount solution exists for handling remount in a user namespace,
  676. // we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
  677. if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
  678. for i, m := range s.Mounts {
  679. if m.Type == "cgroup" {
  680. clearReadOnly(&s.Mounts[i])
  681. }
  682. }
  683. }
  684. return nil
  685. }
  686. }
  687. // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
  688. // exist, so do not add the default ones if running on an old kernel.
  689. func sysctlExists(s string) bool {
  690. f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
  691. _, err := os.Stat(f)
  692. return err == nil
  693. }
  694. // withCommonOptions sets common docker options
  695. func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  696. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  697. if c.BaseFS == "" {
  698. return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
  699. }
  700. linkedEnv, err := daemon.setupLinkedContainers(c)
  701. if err != nil {
  702. return err
  703. }
  704. s.Root = &specs.Root{
  705. Path: c.BaseFS,
  706. Readonly: c.HostConfig.ReadonlyRootfs,
  707. }
  708. if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
  709. return err
  710. }
  711. cwd := c.Config.WorkingDir
  712. if len(cwd) == 0 {
  713. cwd = "/"
  714. }
  715. if s.Process == nil {
  716. s.Process = &specs.Process{}
  717. }
  718. s.Process.Args = append([]string{c.Path}, c.Args...)
  719. // only add the custom init if it is specified and the container is running in its
  720. // own private pid namespace. It does not make sense to add if it is running in the
  721. // host namespace or another container's pid namespace where we already have an init
  722. if c.HostConfig.PidMode.IsPrivate() {
  723. if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
  724. (c.HostConfig.Init == nil && daemonCfg.Init) {
  725. s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
  726. path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
  727. if err != nil {
  728. return err
  729. }
  730. s.Mounts = append(s.Mounts, specs.Mount{
  731. Destination: inContainerInitPath,
  732. Type: "bind",
  733. Source: path,
  734. Options: []string{"bind", "ro"},
  735. })
  736. }
  737. }
  738. s.Process.Cwd = cwd
  739. s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
  740. s.Process.Terminal = c.Config.Tty
  741. s.Hostname = c.Config.Hostname
  742. setLinuxDomainname(c, s)
  743. // Add default sysctls that are generally safe and useful; currently we
  744. // grant the capabilities to allow these anyway. You can override if
  745. // you want to restore the original behaviour.
  746. // We do not set network sysctls if network namespace is host, or if we are
  747. // joining an existing namespace, only if we create a new net namespace.
  748. if c.HostConfig.NetworkMode.IsPrivate() {
  749. // We cannot set up ping socket support in a user namespace
  750. userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
  751. if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
  752. // allow unprivileged ICMP echo sockets without CAP_NET_RAW
  753. s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
  754. }
  755. // allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
  756. if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
  757. s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
  758. }
  759. }
  760. return nil
  761. }
  762. }
  763. // withCgroups sets the container's cgroups
  764. func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  765. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  766. var cgroupsPath string
  767. scopePrefix := "docker"
  768. parent := "/docker"
  769. useSystemd := UsingSystemd(daemonCfg)
  770. if useSystemd {
  771. parent = "system.slice"
  772. if daemonCfg.Rootless {
  773. parent = "user.slice"
  774. }
  775. }
  776. if c.HostConfig.CgroupParent != "" {
  777. parent = c.HostConfig.CgroupParent
  778. } else if daemonCfg.CgroupParent != "" {
  779. parent = daemonCfg.CgroupParent
  780. }
  781. if useSystemd {
  782. cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
  783. log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
  784. } else {
  785. cgroupsPath = filepath.Join(parent, c.ID)
  786. }
  787. if s.Linux == nil {
  788. s.Linux = &specs.Linux{}
  789. }
  790. s.Linux.CgroupsPath = cgroupsPath
  791. // the rest is only needed for CPU RT controller
  792. if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
  793. return nil
  794. }
  795. p := cgroupsPath
  796. if useSystemd {
  797. initPath, err := cgroups.GetInitCgroup("cpu")
  798. if err != nil {
  799. return errors.Wrap(err, "unable to init CPU RT controller")
  800. }
  801. _, err = cgroups.GetOwnCgroup("cpu")
  802. if err != nil {
  803. return errors.Wrap(err, "unable to init CPU RT controller")
  804. }
  805. p = filepath.Join(initPath, s.Linux.CgroupsPath)
  806. }
  807. // Clean path to guard against things like ../../../BAD
  808. parentPath := filepath.Dir(p)
  809. if !filepath.IsAbs(parentPath) {
  810. parentPath = filepath.Clean("/" + parentPath)
  811. }
  812. mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
  813. if err != nil {
  814. return errors.Wrap(err, "unable to init CPU RT controller")
  815. }
  816. // When docker is run inside docker, the root is based of the host cgroup.
  817. // Should this be handled in runc/libcontainer/cgroups ?
  818. if strings.HasPrefix(root, "/docker/") {
  819. root = "/"
  820. }
  821. mnt = filepath.Join(mnt, root)
  822. if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
  823. return errors.Wrap(err, "unable to init CPU RT controller")
  824. }
  825. return nil
  826. }
  827. }
  828. // WithDevices sets the container's devices
  829. func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
  830. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  831. // Build lists of devices allowed and created within the container.
  832. var devs []specs.LinuxDevice
  833. devPermissions := s.Linux.Resources.Devices
  834. if c.HostConfig.Privileged {
  835. hostDevices, err := coci.HostDevices()
  836. if err != nil {
  837. return err
  838. }
  839. devs = append(devs, hostDevices...)
  840. // adding device mappings in privileged containers
  841. for _, deviceMapping := range c.HostConfig.Devices {
  842. // issue a warning that custom cgroup permissions are ignored in privileged mode
  843. if deviceMapping.CgroupPermissions != "rwm" {
  844. log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
  845. }
  846. // issue a warning that the device path already exists via /dev mounting in privileged mode
  847. if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
  848. log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
  849. continue
  850. }
  851. d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
  852. if err != nil {
  853. return err
  854. }
  855. devs = append(devs, d...)
  856. }
  857. devPermissions = []specs.LinuxDeviceCgroup{
  858. {
  859. Allow: true,
  860. Access: "rwm",
  861. },
  862. }
  863. } else {
  864. for _, deviceMapping := range c.HostConfig.Devices {
  865. d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
  866. if err != nil {
  867. return err
  868. }
  869. devs = append(devs, d...)
  870. devPermissions = append(devPermissions, dPermissions...)
  871. }
  872. var err error
  873. devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
  874. if err != nil {
  875. return err
  876. }
  877. }
  878. if s.Linux == nil {
  879. s.Linux = &specs.Linux{}
  880. }
  881. if s.Linux.Resources == nil {
  882. s.Linux.Resources = &specs.LinuxResources{}
  883. }
  884. s.Linux.Devices = append(s.Linux.Devices, devs...)
  885. s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
  886. for _, req := range c.HostConfig.DeviceRequests {
  887. if err := daemon.handleDevice(req, s); err != nil {
  888. return err
  889. }
  890. }
  891. return nil
  892. }
  893. }
  894. // WithResources applies the container resources
  895. func WithResources(c *container.Container) coci.SpecOpts {
  896. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  897. r := c.HostConfig.Resources
  898. weightDevices, err := getBlkioWeightDevices(r)
  899. if err != nil {
  900. return err
  901. }
  902. readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
  903. if err != nil {
  904. return err
  905. }
  906. writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
  907. if err != nil {
  908. return err
  909. }
  910. readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
  911. if err != nil {
  912. return err
  913. }
  914. writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
  915. if err != nil {
  916. return err
  917. }
  918. memoryRes := getMemoryResources(r)
  919. cpuRes, err := getCPUResources(r)
  920. if err != nil {
  921. return err
  922. }
  923. if s.Linux == nil {
  924. s.Linux = &specs.Linux{}
  925. }
  926. if s.Linux.Resources == nil {
  927. s.Linux.Resources = &specs.LinuxResources{}
  928. }
  929. s.Linux.Resources.Memory = memoryRes
  930. s.Linux.Resources.CPU = cpuRes
  931. s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
  932. WeightDevice: weightDevices,
  933. ThrottleReadBpsDevice: readBpsDevice,
  934. ThrottleWriteBpsDevice: writeBpsDevice,
  935. ThrottleReadIOPSDevice: readIOpsDevice,
  936. ThrottleWriteIOPSDevice: writeIOpsDevice,
  937. }
  938. if r.BlkioWeight != 0 {
  939. w := r.BlkioWeight
  940. s.Linux.Resources.BlockIO.Weight = &w
  941. }
  942. s.Linux.Resources.Pids = getPidsLimit(r)
  943. return nil
  944. }
  945. }
  946. // WithSysctls sets the container's sysctls
  947. func WithSysctls(c *container.Container) coci.SpecOpts {
  948. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  949. if len(c.HostConfig.Sysctls) == 0 {
  950. return nil
  951. }
  952. if s.Linux == nil {
  953. s.Linux = &specs.Linux{}
  954. }
  955. if s.Linux.Sysctl == nil {
  956. s.Linux.Sysctl = make(map[string]string)
  957. }
  958. // We merge the sysctls injected above with the HostConfig (latter takes
  959. // precedence for backwards-compatibility reasons).
  960. for k, v := range c.HostConfig.Sysctls {
  961. s.Linux.Sysctl[k] = v
  962. }
  963. return nil
  964. }
  965. }
  966. // WithUser sets the container's user
  967. func WithUser(c *container.Container) coci.SpecOpts {
  968. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  969. if s.Process == nil {
  970. s.Process = &specs.Process{}
  971. }
  972. var err error
  973. s.Process.User, err = getUser(c, c.Config.User)
  974. return err
  975. }
  976. }
  977. func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container) (retSpec *specs.Spec, err error) {
  978. var (
  979. opts []coci.SpecOpts
  980. s = oci.DefaultSpec()
  981. )
  982. opts = append(opts,
  983. withCommonOptions(daemon, &daemonCfg.Config, c),
  984. withCgroups(daemon, &daemonCfg.Config, c),
  985. WithResources(c),
  986. WithSysctls(c),
  987. WithDevices(daemon, c),
  988. withRlimits(daemon, &daemonCfg.Config, c),
  989. WithNamespaces(daemon, c),
  990. WithCapabilities(c),
  991. WithSeccomp(daemon, c),
  992. withMounts(daemon, daemonCfg, c),
  993. withLibnetwork(daemon, &daemonCfg.Config, c),
  994. WithApparmor(c),
  995. WithSelinux(c),
  996. WithOOMScore(&c.HostConfig.OomScoreAdj),
  997. coci.WithAnnotations(c.HostConfig.Annotations),
  998. WithUser(c),
  999. )
  1000. if c.NoNewPrivileges {
  1001. opts = append(opts, coci.WithNoNewPrivileges)
  1002. }
  1003. if c.Config.Tty {
  1004. opts = append(opts, WithConsoleSize(c))
  1005. }
  1006. // Set the masked and readonly paths with regard to the host config options if they are set.
  1007. if c.HostConfig.MaskedPaths != nil {
  1008. opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  1009. }
  1010. if c.HostConfig.ReadonlyPaths != nil {
  1011. opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  1012. }
  1013. if daemonCfg.Rootless {
  1014. opts = append(opts, withRootless(daemon, &daemonCfg.Config))
  1015. }
  1016. var snapshotter, snapshotKey string
  1017. if daemon.UsesSnapshotter() {
  1018. snapshotter = daemon.imageService.StorageDriver()
  1019. snapshotKey = c.ID
  1020. }
  1021. return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{
  1022. ID: c.ID,
  1023. Snapshotter: snapshotter,
  1024. SnapshotKey: snapshotKey,
  1025. }, &s, opts...)
  1026. }
  1027. func clearReadOnly(m *specs.Mount) {
  1028. var opt []string
  1029. for _, o := range m.Options {
  1030. if o != "ro" {
  1031. opt = append(opt, o)
  1032. }
  1033. }
  1034. m.Options = opt
  1035. }
  1036. // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  1037. func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) {
  1038. ulimits := c.Ulimits
  1039. // Merge ulimits with daemon defaults
  1040. ulIdx := make(map[string]struct{})
  1041. for _, ul := range ulimits {
  1042. ulIdx[ul.Name] = struct{}{}
  1043. }
  1044. for name, ul := range daemonCfg.Ulimits {
  1045. if _, exists := ulIdx[name]; !exists {
  1046. ulimits = append(ulimits, ul)
  1047. }
  1048. }
  1049. c.Ulimits = ulimits
  1050. }