oci_linux.go 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159
  1. package daemon // import "github.com/docker/docker/daemon"
  2. import (
  3. "context"
  4. "fmt"
  5. "os"
  6. "path/filepath"
  7. "sort"
  8. "strconv"
  9. "strings"
  10. cdcgroups "github.com/containerd/cgroups/v3"
  11. "github.com/containerd/containerd/containers"
  12. "github.com/containerd/containerd/log"
  13. coci "github.com/containerd/containerd/oci"
  14. "github.com/containerd/containerd/pkg/apparmor"
  15. "github.com/containerd/containerd/pkg/userns"
  16. containertypes "github.com/docker/docker/api/types/container"
  17. "github.com/docker/docker/container"
  18. dconfig "github.com/docker/docker/daemon/config"
  19. "github.com/docker/docker/errdefs"
  20. "github.com/docker/docker/oci"
  21. "github.com/docker/docker/oci/caps"
  22. "github.com/docker/docker/pkg/idtools"
  23. "github.com/docker/docker/pkg/rootless/specconv"
  24. "github.com/docker/docker/pkg/stringid"
  25. volumemounts "github.com/docker/docker/volume/mounts"
  26. "github.com/moby/sys/mount"
  27. "github.com/moby/sys/mountinfo"
  28. "github.com/opencontainers/runc/libcontainer/cgroups"
  29. "github.com/opencontainers/runc/libcontainer/user"
  30. specs "github.com/opencontainers/runtime-spec/specs-go"
  31. "github.com/pkg/errors"
  32. "golang.org/x/sys/unix"
  33. )
// inContainerInitPath is the destination path inside the container at which
// the init binary is bind-mounted: "/sbin/" followed by
// dconfig.DefaultInitBinary.
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
  35. // withRlimits sets the container's rlimits along with merging the daemon's rlimits
  36. func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  37. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  38. var rlimits []specs.POSIXRlimit
  39. // We want to leave the original HostConfig alone so make a copy here
  40. hostConfig := *c.HostConfig
  41. // Merge with the daemon defaults
  42. daemon.mergeUlimits(&hostConfig, daemonCfg)
  43. for _, ul := range hostConfig.Ulimits {
  44. rlimits = append(rlimits, specs.POSIXRlimit{
  45. Type: "RLIMIT_" + strings.ToUpper(ul.Name),
  46. Soft: uint64(ul.Soft),
  47. Hard: uint64(ul.Hard),
  48. })
  49. }
  50. if s.Process == nil {
  51. s.Process = &specs.Process{}
  52. }
  53. s.Process.Rlimits = rlimits
  54. return nil
  55. }
  56. }
  57. // withLibnetwork sets the libnetwork hook
  58. func withLibnetwork(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  59. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  60. if c.Config.NetworkDisabled {
  61. return nil
  62. }
  63. for _, ns := range s.Linux.Namespaces {
  64. if ns.Type == specs.NetworkNamespace && ns.Path == "" {
  65. if s.Hooks == nil {
  66. s.Hooks = &specs.Hooks{}
  67. }
  68. shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
  69. s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
  70. Path: filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"),
  71. Args: []string{"libnetwork-setkey", "-exec-root=" + daemonCfg.GetExecRoot(), c.ID, shortNetCtlrID},
  72. })
  73. }
  74. }
  75. return nil
  76. }
  77. }
  78. // withRootless sets the spec to the rootless configuration
  79. func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
  80. return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  81. var v2Controllers []string
  82. if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
  83. if cdcgroups.Mode() != cdcgroups.Unified {
  84. return errors.New("rootless systemd driver doesn't support cgroup v1")
  85. }
  86. rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
  87. if rootlesskitParentEUID == "" {
  88. return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
  89. }
  90. euid, err := strconv.Atoi(rootlesskitParentEUID)
  91. if err != nil {
  92. return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
  93. }
  94. controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
  95. controllersFile, err := os.ReadFile(controllersPath)
  96. if err != nil {
  97. return err
  98. }
  99. v2Controllers = strings.Fields(string(controllersFile))
  100. }
  101. return specconv.ToRootless(s, v2Controllers)
  102. }
  103. }
  104. // WithOOMScore sets the oom score
  105. func WithOOMScore(score *int) coci.SpecOpts {
  106. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  107. if s.Process == nil {
  108. s.Process = &specs.Process{}
  109. }
  110. s.Process.OOMScoreAdj = score
  111. return nil
  112. }
  113. }
  114. // WithSelinux sets the selinux labels
  115. func WithSelinux(c *container.Container) coci.SpecOpts {
  116. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  117. if s.Process == nil {
  118. s.Process = &specs.Process{}
  119. }
  120. if s.Linux == nil {
  121. s.Linux = &specs.Linux{}
  122. }
  123. s.Process.SelinuxLabel = c.GetProcessLabel()
  124. s.Linux.MountLabel = c.MountLabel
  125. return nil
  126. }
  127. }
  128. // WithApparmor sets the apparmor profile
  129. func WithApparmor(c *container.Container) coci.SpecOpts {
  130. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  131. if apparmor.HostSupports() {
  132. var appArmorProfile string
  133. if c.AppArmorProfile != "" {
  134. appArmorProfile = c.AppArmorProfile
  135. } else if c.HostConfig.Privileged {
  136. appArmorProfile = unconfinedAppArmorProfile
  137. } else {
  138. appArmorProfile = defaultAppArmorProfile
  139. }
  140. if appArmorProfile == defaultAppArmorProfile {
  141. // Unattended upgrades and other fun services can unload AppArmor
  142. // profiles inadvertently. Since we cannot store our profile in
  143. // /etc/apparmor.d, nor can we practically add other ways of
  144. // telling the system to keep our profile loaded, in order to make
  145. // sure that we keep the default profile enabled we dynamically
  146. // reload it if necessary.
  147. if err := ensureDefaultAppArmorProfile(); err != nil {
  148. return err
  149. }
  150. }
  151. if s.Process == nil {
  152. s.Process = &specs.Process{}
  153. }
  154. s.Process.ApparmorProfile = appArmorProfile
  155. }
  156. return nil
  157. }
  158. }
  159. // WithCapabilities sets the container's capabilties
  160. func WithCapabilities(c *container.Container) coci.SpecOpts {
  161. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  162. capabilities, err := caps.TweakCapabilities(
  163. caps.DefaultCapabilities(),
  164. c.HostConfig.CapAdd,
  165. c.HostConfig.CapDrop,
  166. c.HostConfig.Privileged,
  167. )
  168. if err != nil {
  169. return err
  170. }
  171. return oci.SetCapabilities(s, capabilities)
  172. }
  173. }
  174. func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
  175. p, err := getPath()
  176. if err != nil {
  177. return "", err
  178. }
  179. return c.GetResourcePath(p)
  180. }
  181. func getUser(c *container.Container, username string) (specs.User, error) {
  182. var usr specs.User
  183. passwdPath, err := resourcePath(c, user.GetPasswdPath)
  184. if err != nil {
  185. return usr, err
  186. }
  187. groupPath, err := resourcePath(c, user.GetGroupPath)
  188. if err != nil {
  189. return usr, err
  190. }
  191. execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
  192. if err != nil {
  193. return usr, err
  194. }
  195. usr.UID = uint32(execUser.Uid)
  196. usr.GID = uint32(execUser.Gid)
  197. usr.AdditionalGids = []uint32{usr.GID}
  198. var addGroups []int
  199. if len(c.HostConfig.GroupAdd) > 0 {
  200. addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
  201. if err != nil {
  202. return usr, err
  203. }
  204. }
  205. for _, g := range append(execUser.Sgids, addGroups...) {
  206. usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
  207. }
  208. return usr, nil
  209. }
  210. func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
  211. if s.Linux == nil {
  212. s.Linux = &specs.Linux{}
  213. }
  214. for i, n := range s.Linux.Namespaces {
  215. if n.Type == ns.Type {
  216. s.Linux.Namespaces[i] = ns
  217. return
  218. }
  219. }
  220. s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
  221. }
  222. // WithNamespaces sets the container's namespaces
  223. func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
  224. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  225. userNS := false
  226. // user
  227. if c.HostConfig.UsernsMode.IsPrivate() {
  228. if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
  229. userNS = true
  230. setNamespace(s, specs.LinuxNamespace{
  231. Type: specs.UserNamespace,
  232. })
  233. s.Linux.UIDMappings = specMapping(uidMap)
  234. s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
  235. }
  236. }
  237. // network
  238. if !c.Config.NetworkDisabled {
  239. if c.HostConfig.NetworkMode.IsContainer() {
  240. nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
  241. if err != nil {
  242. return err
  243. }
  244. setNamespace(s, specs.LinuxNamespace{
  245. Type: specs.NetworkNamespace,
  246. Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
  247. })
  248. if userNS {
  249. // to share a net namespace, the containers must also share a user namespace.
  250. setNamespace(s, specs.LinuxNamespace{
  251. Type: specs.UserNamespace,
  252. Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
  253. })
  254. }
  255. } else if c.HostConfig.NetworkMode.IsHost() {
  256. setNamespace(s, specs.LinuxNamespace{
  257. Type: specs.NetworkNamespace,
  258. Path: c.NetworkSettings.SandboxKey,
  259. })
  260. } else {
  261. setNamespace(s, specs.LinuxNamespace{
  262. Type: specs.NetworkNamespace,
  263. })
  264. }
  265. }
  266. // ipc
  267. ipcMode := c.HostConfig.IpcMode
  268. if !ipcMode.Valid() {
  269. return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
  270. }
  271. switch {
  272. case ipcMode.IsContainer():
  273. ic, err := daemon.getIpcContainer(ipcMode.Container())
  274. if err != nil {
  275. return errdefs.InvalidParameter(errors.Wrapf(err, "invalid IPC mode: %v", ipcMode))
  276. }
  277. setNamespace(s, specs.LinuxNamespace{
  278. Type: specs.IPCNamespace,
  279. Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
  280. })
  281. if userNS {
  282. // to share a IPC namespace, the containers must also share a user namespace.
  283. setNamespace(s, specs.LinuxNamespace{
  284. Type: specs.UserNamespace,
  285. Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
  286. })
  287. }
  288. case ipcMode.IsHost():
  289. oci.RemoveNamespace(s, specs.IPCNamespace)
  290. case ipcMode.IsEmpty():
  291. // A container was created by an older version of the daemon.
  292. // The default behavior used to be what is now called "shareable".
  293. fallthrough
  294. case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
  295. setNamespace(s, specs.LinuxNamespace{
  296. Type: specs.IPCNamespace,
  297. })
  298. }
  299. // pid
  300. if !c.HostConfig.PidMode.Valid() {
  301. return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", c.HostConfig.PidMode))
  302. }
  303. if c.HostConfig.PidMode.IsContainer() {
  304. pc, err := daemon.getPidContainer(c)
  305. if err != nil {
  306. return err
  307. }
  308. setNamespace(s, specs.LinuxNamespace{
  309. Type: specs.PIDNamespace,
  310. Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
  311. })
  312. if userNS {
  313. // to share a PID namespace, the containers must also share a user namespace.
  314. setNamespace(s, specs.LinuxNamespace{
  315. Type: specs.UserNamespace,
  316. Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
  317. })
  318. }
  319. } else if c.HostConfig.PidMode.IsHost() {
  320. oci.RemoveNamespace(s, specs.PIDNamespace)
  321. } else {
  322. setNamespace(s, specs.LinuxNamespace{
  323. Type: specs.PIDNamespace,
  324. })
  325. }
  326. // uts
  327. if !c.HostConfig.UTSMode.Valid() {
  328. return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
  329. }
  330. if c.HostConfig.UTSMode.IsHost() {
  331. oci.RemoveNamespace(s, specs.UTSNamespace)
  332. s.Hostname = ""
  333. }
  334. // cgroup
  335. if !c.HostConfig.CgroupnsMode.Valid() {
  336. return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
  337. }
  338. if !c.HostConfig.CgroupnsMode.IsEmpty() {
  339. if c.HostConfig.CgroupnsMode.IsPrivate() {
  340. setNamespace(s, specs.LinuxNamespace{
  341. Type: specs.CgroupNamespace,
  342. })
  343. }
  344. }
  345. return nil
  346. }
  347. }
  348. func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
  349. var ids []specs.LinuxIDMapping
  350. for _, item := range s {
  351. ids = append(ids, specs.LinuxIDMapping{
  352. HostID: uint32(item.HostID),
  353. ContainerID: uint32(item.ContainerID),
  354. Size: uint32(item.Size),
  355. })
  356. }
  357. return ids
  358. }
  359. // Get the source mount point of directory passed in as argument. Also return
  360. // optional fields.
  361. func getSourceMount(source string) (string, string, error) {
  362. // Ensure any symlinks are resolved.
  363. sourcePath, err := filepath.EvalSymlinks(source)
  364. if err != nil {
  365. return "", "", err
  366. }
  367. mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
  368. if err != nil {
  369. return "", "", err
  370. }
  371. if len(mi) < 1 {
  372. return "", "", fmt.Errorf("Can't find mount point of %s", source)
  373. }
  374. // find the longest mount point
  375. var idx, maxlen int
  376. for i := range mi {
  377. if len(mi[i].Mountpoint) > maxlen {
  378. maxlen = len(mi[i].Mountpoint)
  379. idx = i
  380. }
  381. }
  382. return mi[idx].Mountpoint, mi[idx].Optional, nil
  383. }
// Prefixes looked for in the optional fields of a mount (as returned by
// getSourceMount) to determine its propagation type.
const (
	// sharedPropagationOption is the prefix that identifies a shared mount.
	sharedPropagationOption = "shared:"
	// slavePropagationOption is the prefix that identifies a slave mount.
	slavePropagationOption = "master:"
)
  388. // hasMountInfoOption checks if any of the passed any of the given option values
  389. // are set in the passed in option string.
  390. func hasMountInfoOption(opts string, vals ...string) bool {
  391. for _, opt := range strings.Split(opts, " ") {
  392. for _, val := range vals {
  393. if strings.HasPrefix(opt, val) {
  394. return true
  395. }
  396. }
  397. }
  398. return false
  399. }
  400. // Ensure mount point on which path is mounted, is shared.
  401. func ensureShared(path string) error {
  402. sourceMount, optionalOpts, err := getSourceMount(path)
  403. if err != nil {
  404. return err
  405. }
  406. // Make sure source mount point is shared.
  407. if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
  408. return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
  409. }
  410. return nil
  411. }
  412. // Ensure mount point on which path is mounted, is either shared or slave.
  413. func ensureSharedOrSlave(path string) error {
  414. sourceMount, optionalOpts, err := getSourceMount(path)
  415. if err != nil {
  416. return err
  417. }
  418. if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
  419. return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
  420. }
  421. return nil
  422. }
  423. // Get the set of mount flags that are set on the mount that contains the given
  424. // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
  425. // bind-mounting "with options" will not fail with user namespaces, due to
  426. // kernel restrictions that require user namespace mounts to preserve
  427. // CL_UNPRIVILEGED locked flags.
  428. func getUnprivilegedMountFlags(path string) ([]string, error) {
  429. var statfs unix.Statfs_t
  430. if err := unix.Statfs(path, &statfs); err != nil {
  431. return nil, err
  432. }
  433. // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
  434. unprivilegedFlags := map[uint64]string{
  435. unix.MS_RDONLY: "ro",
  436. unix.MS_NODEV: "nodev",
  437. unix.MS_NOEXEC: "noexec",
  438. unix.MS_NOSUID: "nosuid",
  439. unix.MS_NOATIME: "noatime",
  440. unix.MS_RELATIME: "relatime",
  441. unix.MS_NODIRATIME: "nodiratime",
  442. }
  443. var flags []string
  444. for mask, flag := range unprivilegedFlags {
  445. if uint64(statfs.Flags)&mask == mask {
  446. flags = append(flags, flag)
  447. }
  448. }
  449. return flags, nil
  450. }
var (
	// mountPropagationMap maps a mount propagation mode name to the
	// corresponding flag constant of the mount package. Looking up an
	// unknown name (including the empty string) yields 0.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap: it
	// maps a propagation flag constant back to its mode name.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
  469. // inSlice tests whether a string is contained in a slice of strings or not.
  470. // Comparison is case sensitive
  471. func inSlice(slice []string, s string) bool {
  472. for _, ss := range slice {
  473. if s == ss {
  474. return true
  475. }
  476. }
  477. return false
  478. }
  479. // withMounts sets the container's mounts
  480. func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container) coci.SpecOpts {
  481. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
  482. if err := daemon.setupContainerMountsRoot(c); err != nil {
  483. return err
  484. }
  485. if err := daemon.setupIpcDirs(c); err != nil {
  486. return err
  487. }
  488. defer func() {
  489. if err != nil {
  490. daemon.cleanupSecretDir(c)
  491. }
  492. }()
  493. if err := daemon.setupSecretDir(c); err != nil {
  494. return err
  495. }
  496. ms, err := daemon.setupMounts(c)
  497. if err != nil {
  498. return err
  499. }
  500. if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
  501. ms = append(ms, c.IpcMounts()...)
  502. }
  503. tmpfsMounts, err := c.TmpfsMounts()
  504. if err != nil {
  505. return err
  506. }
  507. ms = append(ms, tmpfsMounts...)
  508. secretMounts, err := c.SecretMounts()
  509. if err != nil {
  510. return err
  511. }
  512. ms = append(ms, secretMounts...)
  513. sort.Sort(mounts(ms))
  514. mounts := ms
  515. userMounts := make(map[string]struct{})
  516. for _, m := range mounts {
  517. userMounts[m.Destination] = struct{}{}
  518. }
  519. // Copy all mounts from spec to defaultMounts, except for
  520. // - mounts overridden by a user supplied mount;
  521. // - all mounts under /dev if a user supplied /dev is present;
  522. // - /dev/shm, in case IpcMode is none.
  523. // While at it, also
  524. // - set size for /dev/shm from shmsize.
  525. defaultMounts := s.Mounts[:0]
  526. _, mountDev := userMounts["/dev"]
  527. for _, m := range s.Mounts {
  528. if _, ok := userMounts[m.Destination]; ok {
  529. // filter out mount overridden by a user supplied mount
  530. continue
  531. }
  532. if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
  533. // filter out everything under /dev if /dev is user-mounted
  534. continue
  535. }
  536. if m.Destination == "/dev/shm" {
  537. if c.HostConfig.IpcMode.IsNone() {
  538. // filter out /dev/shm for "none" IpcMode
  539. continue
  540. }
  541. // set size for /dev/shm mount from spec
  542. sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
  543. m.Options = append(m.Options, sizeOpt)
  544. }
  545. defaultMounts = append(defaultMounts, m)
  546. }
  547. s.Mounts = defaultMounts
  548. for _, m := range mounts {
  549. if m.Source == "tmpfs" {
  550. data := m.Data
  551. parser := volumemounts.NewParser()
  552. options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
  553. if data != "" {
  554. options = append(options, strings.Split(data, ",")...)
  555. }
  556. merged, err := mount.MergeTmpfsOptions(options)
  557. if err != nil {
  558. return err
  559. }
  560. s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
  561. continue
  562. }
  563. mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
  564. // Determine property of RootPropagation based on volume
  565. // properties. If a volume is shared, then keep root propagation
  566. // shared. This should work for slave and private volumes too.
  567. //
  568. // For slave volumes, it can be either [r]shared/[r]slave.
  569. //
  570. // For private volumes any root propagation value should work.
  571. pFlag := mountPropagationMap[m.Propagation]
  572. switch pFlag {
  573. case mount.SHARED, mount.RSHARED:
  574. if err := ensureShared(m.Source); err != nil {
  575. return err
  576. }
  577. rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  578. if rootpg != mount.SHARED && rootpg != mount.RSHARED {
  579. if s.Linux == nil {
  580. s.Linux = &specs.Linux{}
  581. }
  582. s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
  583. }
  584. case mount.SLAVE, mount.RSLAVE:
  585. var fallback bool
  586. if err := ensureSharedOrSlave(m.Source); err != nil {
  587. // For backwards compatibility purposes, treat mounts from the daemon root
  588. // as special since we automatically add rslave propagation to these mounts
  589. // when the user did not set anything, so we should fallback to the old
  590. // behavior which is to use private propagation which is normally the
  591. // default.
  592. if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
  593. return err
  594. }
  595. cm, ok := c.MountPoints[m.Destination]
  596. if !ok {
  597. return err
  598. }
  599. if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
  600. // This means the user explicitly set a propagation, do not fallback in that case.
  601. return err
  602. }
  603. fallback = true
  604. log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
  605. }
  606. if !fallback {
  607. rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  608. if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
  609. if s.Linux == nil {
  610. s.Linux = &specs.Linux{}
  611. }
  612. s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
  613. }
  614. }
  615. }
  616. bindMode := "rbind"
  617. if m.NonRecursive {
  618. bindMode = "bind"
  619. }
  620. opts := []string{bindMode}
  621. if !m.Writable {
  622. rro := true
  623. if m.ReadOnlyNonRecursive {
  624. rro = false
  625. if m.ReadOnlyForceRecursive {
  626. return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
  627. }
  628. }
  629. if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil {
  630. rro = false
  631. if m.ReadOnlyForceRecursive {
  632. return rroErr
  633. }
  634. }
  635. if rro {
  636. opts = append(opts, "rro")
  637. } else {
  638. opts = append(opts, "ro")
  639. }
  640. }
  641. if pFlag != 0 {
  642. opts = append(opts, mountPropagationReverseMap[pFlag])
  643. }
  644. // If we are using user namespaces, then we must make sure that we
  645. // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
  646. // "mount" when we bind-mount. The reason for this is that at the point
  647. // when runc sets up the root filesystem, it is already inside a user
  648. // namespace, and thus cannot change any flags that are locked.
  649. if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() {
  650. unprivOpts, err := getUnprivilegedMountFlags(m.Source)
  651. if err != nil {
  652. return err
  653. }
  654. opts = append(opts, unprivOpts...)
  655. }
  656. mt.Options = opts
  657. s.Mounts = append(s.Mounts, mt)
  658. }
  659. if s.Root.Readonly {
  660. for i, m := range s.Mounts {
  661. switch m.Destination {
  662. case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
  663. continue
  664. }
  665. if _, ok := userMounts[m.Destination]; !ok {
  666. if !inSlice(m.Options, "ro") {
  667. s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
  668. }
  669. }
  670. }
  671. }
  672. if c.HostConfig.Privileged {
  673. // clear readonly for /sys
  674. for i := range s.Mounts {
  675. if s.Mounts[i].Destination == "/sys" {
  676. clearReadOnly(&s.Mounts[i])
  677. }
  678. }
  679. if s.Linux != nil {
  680. s.Linux.ReadonlyPaths = nil
  681. s.Linux.MaskedPaths = nil
  682. }
  683. }
  684. // TODO: until a kernel/mount solution exists for handling remount in a user namespace,
  685. // we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
  686. if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
  687. for i, m := range s.Mounts {
  688. if m.Type == "cgroup" {
  689. clearReadOnly(&s.Mounts[i])
  690. }
  691. }
  692. }
  693. return nil
  694. }
  695. }
  696. // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
  697. // exist, so do not add the default ones if running on an old kernel.
  698. func sysctlExists(s string) bool {
  699. f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
  700. _, err := os.Stat(f)
  701. return err == nil
  702. }
  703. // withCommonOptions sets common docker options
  704. func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  705. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  706. if c.BaseFS == "" {
  707. return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
  708. }
  709. linkedEnv, err := daemon.setupLinkedContainers(c)
  710. if err != nil {
  711. return err
  712. }
  713. s.Root = &specs.Root{
  714. Path: c.BaseFS,
  715. Readonly: c.HostConfig.ReadonlyRootfs,
  716. }
  717. if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
  718. return err
  719. }
  720. cwd := c.Config.WorkingDir
  721. if len(cwd) == 0 {
  722. cwd = "/"
  723. }
  724. if s.Process == nil {
  725. s.Process = &specs.Process{}
  726. }
  727. s.Process.Args = append([]string{c.Path}, c.Args...)
  728. // only add the custom init if it is specified and the container is running in its
  729. // own private pid namespace. It does not make sense to add if it is running in the
  730. // host namespace or another container's pid namespace where we already have an init
  731. if c.HostConfig.PidMode.IsPrivate() {
  732. if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
  733. (c.HostConfig.Init == nil && daemonCfg.Init) {
  734. s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
  735. path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
  736. if err != nil {
  737. return err
  738. }
  739. s.Mounts = append(s.Mounts, specs.Mount{
  740. Destination: inContainerInitPath,
  741. Type: "bind",
  742. Source: path,
  743. Options: []string{"bind", "ro"},
  744. })
  745. }
  746. }
  747. s.Process.Cwd = cwd
  748. s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
  749. s.Process.Terminal = c.Config.Tty
  750. s.Hostname = c.Config.Hostname
  751. setLinuxDomainname(c, s)
  752. // Add default sysctls that are generally safe and useful; currently we
  753. // grant the capabilities to allow these anyway. You can override if
  754. // you want to restore the original behaviour.
  755. // We do not set network sysctls if network namespace is host, or if we are
  756. // joining an existing namespace, only if we create a new net namespace.
  757. if c.HostConfig.NetworkMode.IsPrivate() {
  758. // We cannot set up ping socket support in a user namespace
  759. userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
  760. if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
  761. // allow unprivileged ICMP echo sockets without CAP_NET_RAW
  762. s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
  763. }
  764. // allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
  765. if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
  766. s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
  767. }
  768. }
  769. return nil
  770. }
  771. }
  772. // withCgroups sets the container's cgroups
  773. func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  774. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  775. var cgroupsPath string
  776. scopePrefix := "docker"
  777. parent := "/docker"
  778. useSystemd := UsingSystemd(daemonCfg)
  779. if useSystemd {
  780. parent = "system.slice"
  781. if daemonCfg.Rootless {
  782. parent = "user.slice"
  783. }
  784. }
  785. if c.HostConfig.CgroupParent != "" {
  786. parent = c.HostConfig.CgroupParent
  787. } else if daemonCfg.CgroupParent != "" {
  788. parent = daemonCfg.CgroupParent
  789. }
  790. if useSystemd {
  791. cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
  792. log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
  793. } else {
  794. cgroupsPath = filepath.Join(parent, c.ID)
  795. }
  796. if s.Linux == nil {
  797. s.Linux = &specs.Linux{}
  798. }
  799. s.Linux.CgroupsPath = cgroupsPath
  800. // the rest is only needed for CPU RT controller
  801. if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
  802. return nil
  803. }
  804. p := cgroupsPath
  805. if useSystemd {
  806. initPath, err := cgroups.GetInitCgroup("cpu")
  807. if err != nil {
  808. return errors.Wrap(err, "unable to init CPU RT controller")
  809. }
  810. _, err = cgroups.GetOwnCgroup("cpu")
  811. if err != nil {
  812. return errors.Wrap(err, "unable to init CPU RT controller")
  813. }
  814. p = filepath.Join(initPath, s.Linux.CgroupsPath)
  815. }
  816. // Clean path to guard against things like ../../../BAD
  817. parentPath := filepath.Dir(p)
  818. if !filepath.IsAbs(parentPath) {
  819. parentPath = filepath.Clean("/" + parentPath)
  820. }
  821. mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
  822. if err != nil {
  823. return errors.Wrap(err, "unable to init CPU RT controller")
  824. }
  825. // When docker is run inside docker, the root is based of the host cgroup.
  826. // Should this be handled in runc/libcontainer/cgroups ?
  827. if strings.HasPrefix(root, "/docker/") {
  828. root = "/"
  829. }
  830. mnt = filepath.Join(mnt, root)
  831. if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
  832. return errors.Wrap(err, "unable to init CPU RT controller")
  833. }
  834. return nil
  835. }
  836. }
  837. // WithDevices sets the container's devices
  838. func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
  839. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  840. // Build lists of devices allowed and created within the container.
  841. var devs []specs.LinuxDevice
  842. devPermissions := s.Linux.Resources.Devices
  843. if c.HostConfig.Privileged {
  844. hostDevices, err := coci.HostDevices()
  845. if err != nil {
  846. return err
  847. }
  848. devs = append(devs, hostDevices...)
  849. // adding device mappings in privileged containers
  850. for _, deviceMapping := range c.HostConfig.Devices {
  851. // issue a warning that custom cgroup permissions are ignored in privileged mode
  852. if deviceMapping.CgroupPermissions != "rwm" {
  853. log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
  854. }
  855. // issue a warning that the device path already exists via /dev mounting in privileged mode
  856. if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
  857. log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
  858. continue
  859. }
  860. d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
  861. if err != nil {
  862. return err
  863. }
  864. devs = append(devs, d...)
  865. }
  866. devPermissions = []specs.LinuxDeviceCgroup{
  867. {
  868. Allow: true,
  869. Access: "rwm",
  870. },
  871. }
  872. } else {
  873. for _, deviceMapping := range c.HostConfig.Devices {
  874. d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
  875. if err != nil {
  876. return err
  877. }
  878. devs = append(devs, d...)
  879. devPermissions = append(devPermissions, dPermissions...)
  880. }
  881. var err error
  882. devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
  883. if err != nil {
  884. return err
  885. }
  886. }
  887. if s.Linux == nil {
  888. s.Linux = &specs.Linux{}
  889. }
  890. if s.Linux.Resources == nil {
  891. s.Linux.Resources = &specs.LinuxResources{}
  892. }
  893. s.Linux.Devices = append(s.Linux.Devices, devs...)
  894. s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
  895. for _, req := range c.HostConfig.DeviceRequests {
  896. if err := daemon.handleDevice(req, s); err != nil {
  897. return err
  898. }
  899. }
  900. return nil
  901. }
  902. }
  903. // WithResources applies the container resources
  904. func WithResources(c *container.Container) coci.SpecOpts {
  905. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  906. r := c.HostConfig.Resources
  907. weightDevices, err := getBlkioWeightDevices(r)
  908. if err != nil {
  909. return err
  910. }
  911. readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
  912. if err != nil {
  913. return err
  914. }
  915. writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
  916. if err != nil {
  917. return err
  918. }
  919. readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
  920. if err != nil {
  921. return err
  922. }
  923. writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
  924. if err != nil {
  925. return err
  926. }
  927. memoryRes := getMemoryResources(r)
  928. cpuRes, err := getCPUResources(r)
  929. if err != nil {
  930. return err
  931. }
  932. if s.Linux == nil {
  933. s.Linux = &specs.Linux{}
  934. }
  935. if s.Linux.Resources == nil {
  936. s.Linux.Resources = &specs.LinuxResources{}
  937. }
  938. s.Linux.Resources.Memory = memoryRes
  939. s.Linux.Resources.CPU = cpuRes
  940. s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
  941. WeightDevice: weightDevices,
  942. ThrottleReadBpsDevice: readBpsDevice,
  943. ThrottleWriteBpsDevice: writeBpsDevice,
  944. ThrottleReadIOPSDevice: readIOpsDevice,
  945. ThrottleWriteIOPSDevice: writeIOpsDevice,
  946. }
  947. if r.BlkioWeight != 0 {
  948. w := r.BlkioWeight
  949. s.Linux.Resources.BlockIO.Weight = &w
  950. }
  951. s.Linux.Resources.Pids = getPidsLimit(r)
  952. return nil
  953. }
  954. }
  955. // WithSysctls sets the container's sysctls
  956. func WithSysctls(c *container.Container) coci.SpecOpts {
  957. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  958. if len(c.HostConfig.Sysctls) == 0 {
  959. return nil
  960. }
  961. if s.Linux == nil {
  962. s.Linux = &specs.Linux{}
  963. }
  964. if s.Linux.Sysctl == nil {
  965. s.Linux.Sysctl = make(map[string]string)
  966. }
  967. // We merge the sysctls injected above with the HostConfig (latter takes
  968. // precedence for backwards-compatibility reasons).
  969. for k, v := range c.HostConfig.Sysctls {
  970. s.Linux.Sysctl[k] = v
  971. }
  972. return nil
  973. }
  974. }
  975. // WithUser sets the container's user
  976. func WithUser(c *container.Container) coci.SpecOpts {
  977. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  978. if s.Process == nil {
  979. s.Process = &specs.Process{}
  980. }
  981. var err error
  982. s.Process.User, err = getUser(c, c.Config.User)
  983. return err
  984. }
  985. }
  986. func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container) (retSpec *specs.Spec, err error) {
  987. var (
  988. opts []coci.SpecOpts
  989. s = oci.DefaultSpec()
  990. )
  991. opts = append(opts,
  992. withCommonOptions(daemon, &daemonCfg.Config, c),
  993. withCgroups(daemon, &daemonCfg.Config, c),
  994. WithResources(c),
  995. WithSysctls(c),
  996. WithDevices(daemon, c),
  997. withRlimits(daemon, &daemonCfg.Config, c),
  998. WithNamespaces(daemon, c),
  999. WithCapabilities(c),
  1000. WithSeccomp(daemon, c),
  1001. withMounts(daemon, daemonCfg, c),
  1002. withLibnetwork(daemon, &daemonCfg.Config, c),
  1003. WithApparmor(c),
  1004. WithSelinux(c),
  1005. WithOOMScore(&c.HostConfig.OomScoreAdj),
  1006. coci.WithAnnotations(c.HostConfig.Annotations),
  1007. WithUser(c),
  1008. )
  1009. if c.NoNewPrivileges {
  1010. opts = append(opts, coci.WithNoNewPrivileges)
  1011. }
  1012. if c.Config.Tty {
  1013. opts = append(opts, WithConsoleSize(c))
  1014. }
  1015. // Set the masked and readonly paths with regard to the host config options if they are set.
  1016. if c.HostConfig.MaskedPaths != nil {
  1017. opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  1018. }
  1019. if c.HostConfig.ReadonlyPaths != nil {
  1020. opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  1021. }
  1022. if daemonCfg.Rootless {
  1023. opts = append(opts, withRootless(daemon, &daemonCfg.Config))
  1024. }
  1025. var snapshotter, snapshotKey string
  1026. if daemon.UsesSnapshotter() {
  1027. snapshotter = daemon.imageService.StorageDriver()
  1028. snapshotKey = c.ID
  1029. }
  1030. return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{
  1031. ID: c.ID,
  1032. Snapshotter: snapshotter,
  1033. SnapshotKey: snapshotKey,
  1034. }, &s, opts...)
  1035. }
  1036. func clearReadOnly(m *specs.Mount) {
  1037. var opt []string
  1038. for _, o := range m.Options {
  1039. if o != "ro" {
  1040. opt = append(opt, o)
  1041. }
  1042. }
  1043. m.Options = opt
  1044. }
  1045. // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  1046. func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) {
  1047. ulimits := c.Ulimits
  1048. // Merge ulimits with daemon defaults
  1049. ulIdx := make(map[string]struct{})
  1050. for _, ul := range ulimits {
  1051. ulIdx[ul.Name] = struct{}{}
  1052. }
  1053. for name, ul := range daemonCfg.Ulimits {
  1054. if _, exists := ulIdx[name]; !exists {
  1055. ulimits = append(ulimits, ul)
  1056. }
  1057. }
  1058. c.Ulimits = ulimits
  1059. }