oci_linux.go 34 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105
  1. package daemon // import "github.com/docker/docker/daemon"
  2. import (
  3. "context"
  4. "fmt"
  5. "os"
  6. "path/filepath"
  7. "sort"
  8. "strconv"
  9. "strings"
  10. cdcgroups "github.com/containerd/cgroups/v3"
  11. "github.com/containerd/containerd/containers"
  12. coci "github.com/containerd/containerd/oci"
  13. "github.com/containerd/containerd/pkg/apparmor"
  14. "github.com/containerd/containerd/pkg/userns"
  15. "github.com/containerd/log"
  16. containertypes "github.com/docker/docker/api/types/container"
  17. "github.com/docker/docker/container"
  18. dconfig "github.com/docker/docker/daemon/config"
  19. "github.com/docker/docker/errdefs"
  20. "github.com/docker/docker/internal/rootless/mountopts"
  21. "github.com/docker/docker/oci"
  22. "github.com/docker/docker/oci/caps"
  23. "github.com/docker/docker/pkg/idtools"
  24. "github.com/docker/docker/pkg/rootless/specconv"
  25. "github.com/docker/docker/pkg/stringid"
  26. volumemounts "github.com/docker/docker/volume/mounts"
  27. "github.com/moby/sys/mount"
  28. "github.com/moby/sys/mountinfo"
  29. "github.com/moby/sys/user"
  30. "github.com/opencontainers/runc/libcontainer/cgroups"
  31. specs "github.com/opencontainers/runtime-spec/specs-go"
  32. "github.com/pkg/errors"
  33. )
  34. const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
  35. // withRlimits sets the container's rlimits along with merging the daemon's rlimits
  36. func withRlimits(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  37. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  38. var rlimits []specs.POSIXRlimit
  39. // We want to leave the original HostConfig alone so make a copy here
  40. hostConfig := *c.HostConfig
  41. // Merge with the daemon defaults
  42. daemon.mergeUlimits(&hostConfig, daemonCfg)
  43. for _, ul := range hostConfig.Ulimits {
  44. rlimits = append(rlimits, specs.POSIXRlimit{
  45. Type: "RLIMIT_" + strings.ToUpper(ul.Name),
  46. Soft: uint64(ul.Soft),
  47. Hard: uint64(ul.Hard),
  48. })
  49. }
  50. if s.Process == nil {
  51. s.Process = &specs.Process{}
  52. }
  53. s.Process.Rlimits = rlimits
  54. return nil
  55. }
  56. }
  57. // withLibnetwork sets the libnetwork hook
  58. func withLibnetwork(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  59. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  60. if c.Config.NetworkDisabled {
  61. return nil
  62. }
  63. for _, ns := range s.Linux.Namespaces {
  64. if ns.Type == specs.NetworkNamespace && ns.Path == "" {
  65. if s.Hooks == nil {
  66. s.Hooks = &specs.Hooks{}
  67. }
  68. shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
  69. s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ //nolint:staticcheck // FIXME(thaJeztah); replace prestart hook with a non-deprecated one.
  70. Path: filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"),
  71. Args: []string{"libnetwork-setkey", "-exec-root=" + daemonCfg.GetExecRoot(), c.ID, shortNetCtlrID},
  72. })
  73. }
  74. }
  75. return nil
  76. }
  77. }
  78. // withRootless sets the spec to the rootless configuration
  79. func withRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
  80. return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  81. var v2Controllers []string
  82. if cgroupDriver(daemonCfg) == cgroupSystemdDriver {
  83. if cdcgroups.Mode() != cdcgroups.Unified {
  84. return errors.New("rootless systemd driver doesn't support cgroup v1")
  85. }
  86. rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
  87. if rootlesskitParentEUID == "" {
  88. return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
  89. }
  90. euid, err := strconv.Atoi(rootlesskitParentEUID)
  91. if err != nil {
  92. return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
  93. }
  94. controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
  95. controllersFile, err := os.ReadFile(controllersPath)
  96. if err != nil {
  97. return err
  98. }
  99. v2Controllers = strings.Fields(string(controllersFile))
  100. }
  101. return specconv.ToRootless(s, v2Controllers)
  102. }
  103. }
  104. // withRootfulInRootless is used for "rootful-in-rootless" dind;
  105. // the daemon is running in UserNS but has no access to RootlessKit API socket, host filesystem, etc.
  106. func withRootfulInRootless(daemon *Daemon, daemonCfg *dconfig.Config) coci.SpecOpts {
  107. return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  108. specconv.ToRootfulInRootless(s)
  109. return nil
  110. }
  111. }
  112. // WithOOMScore sets the oom score
  113. func WithOOMScore(score *int) coci.SpecOpts {
  114. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  115. if s.Process == nil {
  116. s.Process = &specs.Process{}
  117. }
  118. s.Process.OOMScoreAdj = score
  119. return nil
  120. }
  121. }
  122. // WithSelinux sets the selinux labels
  123. func WithSelinux(c *container.Container) coci.SpecOpts {
  124. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  125. if s.Process == nil {
  126. s.Process = &specs.Process{}
  127. }
  128. if s.Linux == nil {
  129. s.Linux = &specs.Linux{}
  130. }
  131. s.Process.SelinuxLabel = c.GetProcessLabel()
  132. s.Linux.MountLabel = c.MountLabel
  133. return nil
  134. }
  135. }
  136. // WithApparmor sets the apparmor profile
  137. func WithApparmor(c *container.Container) coci.SpecOpts {
  138. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  139. if apparmor.HostSupports() {
  140. var appArmorProfile string
  141. if c.AppArmorProfile != "" {
  142. appArmorProfile = c.AppArmorProfile
  143. } else if c.HostConfig.Privileged {
  144. appArmorProfile = unconfinedAppArmorProfile
  145. } else {
  146. appArmorProfile = defaultAppArmorProfile
  147. }
  148. if appArmorProfile == defaultAppArmorProfile {
  149. // Unattended upgrades and other fun services can unload AppArmor
  150. // profiles inadvertently. Since we cannot store our profile in
  151. // /etc/apparmor.d, nor can we practically add other ways of
  152. // telling the system to keep our profile loaded, in order to make
  153. // sure that we keep the default profile enabled we dynamically
  154. // reload it if necessary.
  155. if err := ensureDefaultAppArmorProfile(); err != nil {
  156. return err
  157. }
  158. }
  159. if s.Process == nil {
  160. s.Process = &specs.Process{}
  161. }
  162. s.Process.ApparmorProfile = appArmorProfile
  163. }
  164. return nil
  165. }
  166. }
  167. // WithCapabilities sets the container's capabilties
  168. func WithCapabilities(c *container.Container) coci.SpecOpts {
  169. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  170. capabilities, err := caps.TweakCapabilities(
  171. caps.DefaultCapabilities(),
  172. c.HostConfig.CapAdd,
  173. c.HostConfig.CapDrop,
  174. c.HostConfig.Privileged,
  175. )
  176. if err != nil {
  177. return err
  178. }
  179. return oci.SetCapabilities(s, capabilities)
  180. }
  181. }
  182. func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
  183. p, err := getPath()
  184. if err != nil {
  185. return "", err
  186. }
  187. return c.GetResourcePath(p)
  188. }
  189. func getUser(c *container.Container, username string) (specs.User, error) {
  190. var usr specs.User
  191. passwdPath, err := resourcePath(c, user.GetPasswdPath)
  192. if err != nil {
  193. return usr, err
  194. }
  195. groupPath, err := resourcePath(c, user.GetGroupPath)
  196. if err != nil {
  197. return usr, err
  198. }
  199. execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
  200. if err != nil {
  201. return usr, err
  202. }
  203. usr.UID = uint32(execUser.Uid)
  204. usr.GID = uint32(execUser.Gid)
  205. usr.AdditionalGids = []uint32{usr.GID}
  206. var addGroups []int
  207. if len(c.HostConfig.GroupAdd) > 0 {
  208. addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
  209. if err != nil {
  210. return usr, err
  211. }
  212. }
  213. for _, g := range append(execUser.Sgids, addGroups...) {
  214. usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
  215. }
  216. return usr, nil
  217. }
  218. func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
  219. if s.Linux == nil {
  220. s.Linux = &specs.Linux{}
  221. }
  222. for i, n := range s.Linux.Namespaces {
  223. if n.Type == ns.Type {
  224. s.Linux.Namespaces[i] = ns
  225. return
  226. }
  227. }
  228. s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
  229. }
  230. // WithNamespaces sets the container's namespaces
  231. func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
  232. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  233. userNS := false
  234. // user
  235. if c.HostConfig.UsernsMode.IsPrivate() {
  236. if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
  237. userNS = true
  238. setNamespace(s, specs.LinuxNamespace{
  239. Type: specs.UserNamespace,
  240. })
  241. s.Linux.UIDMappings = specMapping(uidMap)
  242. s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
  243. }
  244. }
  245. // network
  246. if !c.Config.NetworkDisabled {
  247. networkMode := c.HostConfig.NetworkMode
  248. switch {
  249. case networkMode.IsContainer():
  250. nc, err := daemon.getNetworkedContainer(c.ID, networkMode.ConnectedContainer())
  251. if err != nil {
  252. return err
  253. }
  254. setNamespace(s, specs.LinuxNamespace{
  255. Type: specs.NetworkNamespace,
  256. Path: fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()),
  257. })
  258. if userNS {
  259. // to share a net namespace, the containers must also share a user namespace.
  260. //
  261. // FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
  262. setNamespace(s, specs.LinuxNamespace{
  263. Type: specs.UserNamespace,
  264. Path: fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()),
  265. })
  266. }
  267. case networkMode.IsHost():
  268. oci.RemoveNamespace(s, specs.NetworkNamespace)
  269. default:
  270. setNamespace(s, specs.LinuxNamespace{
  271. Type: specs.NetworkNamespace,
  272. })
  273. }
  274. }
  275. // ipc
  276. ipcMode := c.HostConfig.IpcMode
  277. if !ipcMode.Valid() {
  278. return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
  279. }
  280. switch {
  281. case ipcMode.IsContainer():
  282. ic, err := daemon.getIPCContainer(ipcMode.Container())
  283. if err != nil {
  284. return errors.Wrap(err, "failed to join IPC namespace")
  285. }
  286. setNamespace(s, specs.LinuxNamespace{
  287. Type: specs.IPCNamespace,
  288. Path: fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()),
  289. })
  290. if userNS {
  291. // to share a IPC namespace, the containers must also share a user namespace.
  292. //
  293. // FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
  294. setNamespace(s, specs.LinuxNamespace{
  295. Type: specs.UserNamespace,
  296. Path: fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()),
  297. })
  298. }
  299. case ipcMode.IsHost():
  300. oci.RemoveNamespace(s, specs.IPCNamespace)
  301. case ipcMode.IsEmpty():
  302. // A container was created by an older version of the daemon.
  303. // The default behavior used to be what is now called "shareable".
  304. fallthrough
  305. case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
  306. setNamespace(s, specs.LinuxNamespace{
  307. Type: specs.IPCNamespace,
  308. })
  309. }
  310. // pid
  311. pidMode := c.HostConfig.PidMode
  312. if !pidMode.Valid() {
  313. return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", pidMode))
  314. }
  315. switch {
  316. case pidMode.IsContainer():
  317. pc, err := daemon.getPIDContainer(pidMode.Container())
  318. if err != nil {
  319. return errors.Wrap(err, "failed to join PID namespace")
  320. }
  321. setNamespace(s, specs.LinuxNamespace{
  322. Type: specs.PIDNamespace,
  323. Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
  324. })
  325. if userNS {
  326. // to share a PID namespace, the containers must also share a user namespace.
  327. //
  328. // FIXME(thaJeztah): this will silently overwrite an earlier user namespace when joining multiple containers: https://github.com/moby/moby/issues/46210
  329. setNamespace(s, specs.LinuxNamespace{
  330. Type: specs.UserNamespace,
  331. Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
  332. })
  333. }
  334. case pidMode.IsHost():
  335. oci.RemoveNamespace(s, specs.PIDNamespace)
  336. default:
  337. setNamespace(s, specs.LinuxNamespace{
  338. Type: specs.PIDNamespace,
  339. })
  340. }
  341. // uts
  342. if !c.HostConfig.UTSMode.Valid() {
  343. return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
  344. }
  345. if c.HostConfig.UTSMode.IsHost() {
  346. oci.RemoveNamespace(s, specs.UTSNamespace)
  347. s.Hostname = ""
  348. }
  349. // cgroup
  350. if !c.HostConfig.CgroupnsMode.Valid() {
  351. return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
  352. }
  353. if c.HostConfig.CgroupnsMode.IsPrivate() {
  354. setNamespace(s, specs.LinuxNamespace{
  355. Type: specs.CgroupNamespace,
  356. })
  357. }
  358. return nil
  359. }
  360. }
  361. func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
  362. var ids []specs.LinuxIDMapping
  363. for _, item := range s {
  364. ids = append(ids, specs.LinuxIDMapping{
  365. HostID: uint32(item.HostID),
  366. ContainerID: uint32(item.ContainerID),
  367. Size: uint32(item.Size),
  368. })
  369. }
  370. return ids
  371. }
  372. // Get the source mount point of directory passed in as argument. Also return
  373. // optional fields.
  374. func getSourceMount(source string) (string, string, error) {
  375. // Ensure any symlinks are resolved.
  376. sourcePath, err := filepath.EvalSymlinks(source)
  377. if err != nil {
  378. return "", "", err
  379. }
  380. mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
  381. if err != nil {
  382. return "", "", err
  383. }
  384. if len(mi) < 1 {
  385. return "", "", fmt.Errorf("Can't find mount point of %s", source)
  386. }
  387. // find the longest mount point
  388. var idx, maxlen int
  389. for i := range mi {
  390. if len(mi[i].Mountpoint) > maxlen {
  391. maxlen = len(mi[i].Mountpoint)
  392. idx = i
  393. }
  394. }
  395. return mi[idx].Mountpoint, mi[idx].Optional, nil
  396. }
  397. const (
  398. sharedPropagationOption = "shared:"
  399. slavePropagationOption = "master:"
  400. )
  401. // hasMountInfoOption checks if any of the passed any of the given option values
  402. // are set in the passed in option string.
  403. func hasMountInfoOption(opts string, vals ...string) bool {
  404. for _, opt := range strings.Split(opts, " ") {
  405. for _, val := range vals {
  406. if strings.HasPrefix(opt, val) {
  407. return true
  408. }
  409. }
  410. }
  411. return false
  412. }
  413. // Ensure mount point on which path is mounted, is shared.
  414. func ensureShared(path string) error {
  415. sourceMount, optionalOpts, err := getSourceMount(path)
  416. if err != nil {
  417. return err
  418. }
  419. // Make sure source mount point is shared.
  420. if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
  421. return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
  422. }
  423. return nil
  424. }
  425. // Ensure mount point on which path is mounted, is either shared or slave.
  426. func ensureSharedOrSlave(path string) error {
  427. sourceMount, optionalOpts, err := getSourceMount(path)
  428. if err != nil {
  429. return err
  430. }
  431. if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
  432. return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
  433. }
  434. return nil
  435. }
  436. var (
  437. mountPropagationMap = map[string]int{
  438. "private": mount.PRIVATE,
  439. "rprivate": mount.RPRIVATE,
  440. "shared": mount.SHARED,
  441. "rshared": mount.RSHARED,
  442. "slave": mount.SLAVE,
  443. "rslave": mount.RSLAVE,
  444. }
  445. mountPropagationReverseMap = map[int]string{
  446. mount.PRIVATE: "private",
  447. mount.RPRIVATE: "rprivate",
  448. mount.SHARED: "shared",
  449. mount.RSHARED: "rshared",
  450. mount.SLAVE: "slave",
  451. mount.RSLAVE: "rslave",
  452. }
  453. )
  454. // inSlice tests whether a string is contained in a slice of strings or not.
  455. // Comparison is case sensitive
  456. func inSlice(slice []string, s string) bool {
  457. for _, ss := range slice {
  458. if s == ss {
  459. return true
  460. }
  461. }
  462. return false
  463. }
  464. // withMounts sets the container's mounts
  465. func withMounts(daemon *Daemon, daemonCfg *configStore, c *container.Container, ms []container.Mount) coci.SpecOpts {
  466. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
  467. sort.Sort(mounts(ms))
  468. mounts := ms
  469. userMounts := make(map[string]struct{})
  470. for _, m := range mounts {
  471. userMounts[m.Destination] = struct{}{}
  472. }
  473. // Copy all mounts from spec to defaultMounts, except for
  474. // - mounts overridden by a user supplied mount;
  475. // - all mounts under /dev if a user supplied /dev is present;
  476. // - /dev/shm, in case IpcMode is none.
  477. // While at it, also
  478. // - set size for /dev/shm from shmsize.
  479. defaultMounts := s.Mounts[:0]
  480. _, mountDev := userMounts["/dev"]
  481. for _, m := range s.Mounts {
  482. if _, ok := userMounts[m.Destination]; ok {
  483. // filter out mount overridden by a user supplied mount
  484. continue
  485. }
  486. if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
  487. // filter out everything under /dev if /dev is user-mounted
  488. continue
  489. }
  490. if m.Destination == "/dev/shm" {
  491. if c.HostConfig.IpcMode.IsNone() {
  492. // filter out /dev/shm for "none" IpcMode
  493. continue
  494. }
  495. // set size for /dev/shm mount from spec
  496. sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
  497. m.Options = append(m.Options, sizeOpt)
  498. }
  499. defaultMounts = append(defaultMounts, m)
  500. }
  501. s.Mounts = defaultMounts
  502. for _, m := range mounts {
  503. if m.Source == "tmpfs" {
  504. data := m.Data
  505. parser := volumemounts.NewParser()
  506. options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
  507. if data != "" {
  508. options = append(options, strings.Split(data, ",")...)
  509. }
  510. merged, err := mount.MergeTmpfsOptions(options)
  511. if err != nil {
  512. return err
  513. }
  514. s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
  515. continue
  516. }
  517. mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
  518. // Determine property of RootPropagation based on volume
  519. // properties. If a volume is shared, then keep root propagation
  520. // shared. This should work for slave and private volumes too.
  521. //
  522. // For slave volumes, it can be either [r]shared/[r]slave.
  523. //
  524. // For private volumes any root propagation value should work.
  525. pFlag := mountPropagationMap[m.Propagation]
  526. switch pFlag {
  527. case mount.SHARED, mount.RSHARED:
  528. if err := ensureShared(m.Source); err != nil {
  529. return err
  530. }
  531. rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  532. if rootpg != mount.SHARED && rootpg != mount.RSHARED {
  533. if s.Linux == nil {
  534. s.Linux = &specs.Linux{}
  535. }
  536. s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
  537. }
  538. case mount.SLAVE, mount.RSLAVE:
  539. var fallback bool
  540. if err := ensureSharedOrSlave(m.Source); err != nil {
  541. // For backwards compatibility purposes, treat mounts from the daemon root
  542. // as special since we automatically add rslave propagation to these mounts
  543. // when the user did not set anything, so we should fallback to the old
  544. // behavior which is to use private propagation which is normally the
  545. // default.
  546. if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
  547. return err
  548. }
  549. cm, ok := c.MountPoints[m.Destination]
  550. if !ok {
  551. return err
  552. }
  553. if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
  554. // This means the user explicitly set a propagation, do not fallback in that case.
  555. return err
  556. }
  557. fallback = true
  558. log.G(ctx).WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
  559. }
  560. if !fallback {
  561. rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
  562. if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
  563. if s.Linux == nil {
  564. s.Linux = &specs.Linux{}
  565. }
  566. s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
  567. }
  568. }
  569. }
  570. bindMode := "rbind"
  571. if m.NonRecursive {
  572. bindMode = "bind"
  573. }
  574. opts := []string{bindMode}
  575. if !m.Writable {
  576. rro := true
  577. if m.ReadOnlyNonRecursive {
  578. rro = false
  579. if m.ReadOnlyForceRecursive {
  580. return errors.New("mount options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
  581. }
  582. }
  583. if rroErr := supportsRecursivelyReadOnly(daemonCfg, c.HostConfig.Runtime); rroErr != nil {
  584. rro = false
  585. if m.ReadOnlyForceRecursive {
  586. return rroErr
  587. }
  588. }
  589. if rro {
  590. opts = append(opts, "rro")
  591. } else {
  592. opts = append(opts, "ro")
  593. }
  594. }
  595. if pFlag != 0 {
  596. opts = append(opts, mountPropagationReverseMap[pFlag])
  597. }
  598. // If we are using user namespaces, then we must make sure that we
  599. // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
  600. // "mount" when we bind-mount. The reason for this is that at the point
  601. // when runc sets up the root filesystem, it is already inside a user
  602. // namespace, and thus cannot change any flags that are locked.
  603. if daemonCfg.RemappedRoot != "" || userns.RunningInUserNS() {
  604. unprivOpts, err := mountopts.UnprivilegedMountFlags(m.Source)
  605. if err != nil {
  606. return err
  607. }
  608. opts = append(opts, unprivOpts...)
  609. }
  610. mt.Options = opts
  611. s.Mounts = append(s.Mounts, mt)
  612. }
  613. if s.Root.Readonly {
  614. for i, m := range s.Mounts {
  615. switch m.Destination {
  616. case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
  617. continue
  618. }
  619. if _, ok := userMounts[m.Destination]; !ok {
  620. if !inSlice(m.Options, "ro") {
  621. s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
  622. }
  623. }
  624. }
  625. }
  626. if c.HostConfig.Privileged {
  627. // clear readonly for /sys
  628. for i := range s.Mounts {
  629. if s.Mounts[i].Destination == "/sys" {
  630. clearReadOnly(&s.Mounts[i])
  631. }
  632. }
  633. if s.Linux != nil {
  634. s.Linux.ReadonlyPaths = nil
  635. s.Linux.MaskedPaths = nil
  636. }
  637. }
  638. // TODO: until a kernel/mount solution exists for handling remount in a user namespace,
  639. // we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
  640. if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
  641. for i, m := range s.Mounts {
  642. if m.Type == "cgroup" {
  643. clearReadOnly(&s.Mounts[i])
  644. }
  645. }
  646. }
  647. return nil
  648. }
  649. }
  650. // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
  651. // exist, so do not add the default ones if running on an old kernel.
  652. func sysctlExists(s string) bool {
  653. f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
  654. _, err := os.Stat(f)
  655. return err == nil
  656. }
  657. // withCommonOptions sets common docker options
  658. func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  659. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  660. if c.BaseFS == "" {
  661. return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
  662. }
  663. linkedEnv, err := daemon.setupLinkedContainers(c)
  664. if err != nil {
  665. return err
  666. }
  667. s.Root = &specs.Root{
  668. Path: c.BaseFS,
  669. Readonly: c.HostConfig.ReadonlyRootfs,
  670. }
  671. if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
  672. return err
  673. }
  674. cwd := c.Config.WorkingDir
  675. if len(cwd) == 0 {
  676. cwd = "/"
  677. }
  678. if s.Process == nil {
  679. s.Process = &specs.Process{}
  680. }
  681. s.Process.Args = append([]string{c.Path}, c.Args...)
  682. // only add the custom init if it is specified and the container is running in its
  683. // own private pid namespace. It does not make sense to add if it is running in the
  684. // host namespace or another container's pid namespace where we already have an init
  685. if c.HostConfig.PidMode.IsPrivate() {
  686. if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
  687. (c.HostConfig.Init == nil && daemonCfg.Init) {
  688. s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
  689. path, err := daemonCfg.LookupInitPath() // this will fall back to DefaultInitBinary and return an absolute path
  690. if err != nil {
  691. return err
  692. }
  693. s.Mounts = append(s.Mounts, specs.Mount{
  694. Destination: inContainerInitPath,
  695. Type: "bind",
  696. Source: path,
  697. Options: []string{"bind", "ro"},
  698. })
  699. }
  700. }
  701. s.Process.Cwd = cwd
  702. s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
  703. s.Process.Terminal = c.Config.Tty
  704. s.Hostname = c.Config.Hostname
  705. setLinuxDomainname(c, s)
  706. // Add default sysctls that are generally safe and useful; currently we
  707. // grant the capabilities to allow these anyway. You can override if
  708. // you want to restore the original behaviour.
  709. // We do not set network sysctls if network namespace is host, or if we are
  710. // joining an existing namespace, only if we create a new net namespace.
  711. if c.HostConfig.NetworkMode.IsPrivate() {
  712. // We cannot set up ping socket support in a user namespace
  713. userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
  714. if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
  715. // allow unprivileged ICMP echo sockets without CAP_NET_RAW
  716. s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
  717. }
  718. // allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
  719. if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
  720. s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
  721. }
  722. }
  723. return nil
  724. }
  725. }
  726. // withCgroups sets the container's cgroups
  727. func withCgroups(daemon *Daemon, daemonCfg *dconfig.Config, c *container.Container) coci.SpecOpts {
  728. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  729. var cgroupsPath string
  730. scopePrefix := "docker"
  731. parent := "/docker"
  732. useSystemd := UsingSystemd(daemonCfg)
  733. if useSystemd {
  734. parent = "system.slice"
  735. if daemonCfg.Rootless {
  736. parent = "user.slice"
  737. }
  738. }
  739. if c.HostConfig.CgroupParent != "" {
  740. parent = c.HostConfig.CgroupParent
  741. } else if daemonCfg.CgroupParent != "" {
  742. parent = daemonCfg.CgroupParent
  743. }
  744. if useSystemd {
  745. cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
  746. log.G(ctx).Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
  747. } else {
  748. cgroupsPath = filepath.Join(parent, c.ID)
  749. }
  750. if s.Linux == nil {
  751. s.Linux = &specs.Linux{}
  752. }
  753. s.Linux.CgroupsPath = cgroupsPath
  754. // the rest is only needed for CPU RT controller
  755. if daemonCfg.CPURealtimePeriod == 0 && daemonCfg.CPURealtimeRuntime == 0 {
  756. return nil
  757. }
  758. p := cgroupsPath
  759. if useSystemd {
  760. initPath, err := cgroups.GetInitCgroup("cpu")
  761. if err != nil {
  762. return errors.Wrap(err, "unable to init CPU RT controller")
  763. }
  764. _, err = cgroups.GetOwnCgroup("cpu")
  765. if err != nil {
  766. return errors.Wrap(err, "unable to init CPU RT controller")
  767. }
  768. p = filepath.Join(initPath, s.Linux.CgroupsPath)
  769. }
  770. // Clean path to guard against things like ../../../BAD
  771. parentPath := filepath.Dir(p)
  772. if !filepath.IsAbs(parentPath) {
  773. parentPath = filepath.Clean("/" + parentPath)
  774. }
  775. mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
  776. if err != nil {
  777. return errors.Wrap(err, "unable to init CPU RT controller")
  778. }
  779. // When docker is run inside docker, the root is based of the host cgroup.
  780. // Should this be handled in runc/libcontainer/cgroups ?
  781. if strings.HasPrefix(root, "/docker/") {
  782. root = "/"
  783. }
  784. mnt = filepath.Join(mnt, root)
  785. if err := daemon.initCPURtController(daemonCfg, mnt, parentPath); err != nil {
  786. return errors.Wrap(err, "unable to init CPU RT controller")
  787. }
  788. return nil
  789. }
  790. }
  791. // WithDevices sets the container's devices
  792. func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
  793. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  794. // Build lists of devices allowed and created within the container.
  795. var devs []specs.LinuxDevice
  796. devPermissions := s.Linux.Resources.Devices
  797. if c.HostConfig.Privileged {
  798. hostDevices, err := coci.HostDevices()
  799. if err != nil {
  800. return err
  801. }
  802. devs = append(devs, hostDevices...)
  803. // adding device mappings in privileged containers
  804. for _, deviceMapping := range c.HostConfig.Devices {
  805. // issue a warning that custom cgroup permissions are ignored in privileged mode
  806. if deviceMapping.CgroupPermissions != "rwm" {
  807. log.G(ctx).WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
  808. }
  809. // issue a warning that the device path already exists via /dev mounting in privileged mode
  810. if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
  811. log.G(ctx).WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
  812. continue
  813. }
  814. d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
  815. if err != nil {
  816. return err
  817. }
  818. devs = append(devs, d...)
  819. }
  820. devPermissions = []specs.LinuxDeviceCgroup{
  821. {
  822. Allow: true,
  823. Access: "rwm",
  824. },
  825. }
  826. } else {
  827. for _, deviceMapping := range c.HostConfig.Devices {
  828. d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
  829. if err != nil {
  830. return err
  831. }
  832. devs = append(devs, d...)
  833. devPermissions = append(devPermissions, dPermissions...)
  834. }
  835. var err error
  836. devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
  837. if err != nil {
  838. return err
  839. }
  840. }
  841. if s.Linux == nil {
  842. s.Linux = &specs.Linux{}
  843. }
  844. if s.Linux.Resources == nil {
  845. s.Linux.Resources = &specs.LinuxResources{}
  846. }
  847. s.Linux.Devices = append(s.Linux.Devices, devs...)
  848. s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...)
  849. for _, req := range c.HostConfig.DeviceRequests {
  850. if err := daemon.handleDevice(req, s); err != nil {
  851. return err
  852. }
  853. }
  854. return nil
  855. }
  856. }
  857. // WithResources applies the container resources
  858. func WithResources(c *container.Container) coci.SpecOpts {
  859. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  860. r := c.HostConfig.Resources
  861. weightDevices, err := getBlkioWeightDevices(r)
  862. if err != nil {
  863. return err
  864. }
  865. readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
  866. if err != nil {
  867. return err
  868. }
  869. writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
  870. if err != nil {
  871. return err
  872. }
  873. readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
  874. if err != nil {
  875. return err
  876. }
  877. writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
  878. if err != nil {
  879. return err
  880. }
  881. memoryRes := getMemoryResources(r)
  882. cpuRes, err := getCPUResources(r)
  883. if err != nil {
  884. return err
  885. }
  886. if s.Linux == nil {
  887. s.Linux = &specs.Linux{}
  888. }
  889. if s.Linux.Resources == nil {
  890. s.Linux.Resources = &specs.LinuxResources{}
  891. }
  892. s.Linux.Resources.Memory = memoryRes
  893. s.Linux.Resources.CPU = cpuRes
  894. s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{
  895. WeightDevice: weightDevices,
  896. ThrottleReadBpsDevice: readBpsDevice,
  897. ThrottleWriteBpsDevice: writeBpsDevice,
  898. ThrottleReadIOPSDevice: readIOpsDevice,
  899. ThrottleWriteIOPSDevice: writeIOpsDevice,
  900. }
  901. if r.BlkioWeight != 0 {
  902. w := r.BlkioWeight
  903. s.Linux.Resources.BlockIO.Weight = &w
  904. }
  905. s.Linux.Resources.Pids = getPidsLimit(r)
  906. return nil
  907. }
  908. }
  909. // WithSysctls sets the container's sysctls
  910. func WithSysctls(c *container.Container) coci.SpecOpts {
  911. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  912. if len(c.HostConfig.Sysctls) == 0 {
  913. return nil
  914. }
  915. if s.Linux == nil {
  916. s.Linux = &specs.Linux{}
  917. }
  918. if s.Linux.Sysctl == nil {
  919. s.Linux.Sysctl = make(map[string]string)
  920. }
  921. // We merge the sysctls injected above with the HostConfig (latter takes
  922. // precedence for backwards-compatibility reasons).
  923. for k, v := range c.HostConfig.Sysctls {
  924. s.Linux.Sysctl[k] = v
  925. }
  926. return nil
  927. }
  928. }
  929. // WithUser sets the container's user
  930. func WithUser(c *container.Container) coci.SpecOpts {
  931. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  932. if s.Process == nil {
  933. s.Process = &specs.Process{}
  934. }
  935. var err error
  936. s.Process.User, err = getUser(c, c.Config.User)
  937. return err
  938. }
  939. }
  940. func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (retSpec *specs.Spec, err error) {
  941. var (
  942. opts []coci.SpecOpts
  943. s = oci.DefaultSpec()
  944. )
  945. opts = append(opts,
  946. withCommonOptions(daemon, &daemonCfg.Config, c),
  947. withCgroups(daemon, &daemonCfg.Config, c),
  948. WithResources(c),
  949. WithSysctls(c),
  950. WithDevices(daemon, c),
  951. withRlimits(daemon, &daemonCfg.Config, c),
  952. WithNamespaces(daemon, c),
  953. WithCapabilities(c),
  954. WithSeccomp(daemon, c),
  955. withMounts(daemon, daemonCfg, c, mounts),
  956. withLibnetwork(daemon, &daemonCfg.Config, c),
  957. WithApparmor(c),
  958. WithSelinux(c),
  959. WithOOMScore(&c.HostConfig.OomScoreAdj),
  960. coci.WithAnnotations(c.HostConfig.Annotations),
  961. WithUser(c),
  962. )
  963. if c.NoNewPrivileges {
  964. opts = append(opts, coci.WithNoNewPrivileges)
  965. }
  966. if c.Config.Tty {
  967. opts = append(opts, WithConsoleSize(c))
  968. }
  969. // Set the masked and readonly paths with regard to the host config options if they are set.
  970. if c.HostConfig.MaskedPaths != nil {
  971. opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  972. }
  973. if c.HostConfig.ReadonlyPaths != nil {
  974. opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  975. }
  976. if daemonCfg.Rootless {
  977. opts = append(opts, withRootless(daemon, &daemonCfg.Config))
  978. } else if userns.RunningInUserNS() {
  979. opts = append(opts, withRootfulInRootless(daemon, &daemonCfg.Config))
  980. }
  981. var snapshotter, snapshotKey string
  982. if daemon.UsesSnapshotter() {
  983. snapshotter = daemon.imageService.StorageDriver()
  984. snapshotKey = c.ID
  985. }
  986. return &s, coci.ApplyOpts(ctx, daemon.containerdClient, &containers.Container{
  987. ID: c.ID,
  988. Snapshotter: snapshotter,
  989. SnapshotKey: snapshotKey,
  990. }, &s, opts...)
  991. }
  992. func clearReadOnly(m *specs.Mount) {
  993. var opt []string
  994. for _, o := range m.Options {
  995. if o != "ro" {
  996. opt = append(opt, o)
  997. }
  998. }
  999. m.Options = opt
  1000. }
  1001. // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  1002. func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *dconfig.Config) {
  1003. ulimits := c.Ulimits
  1004. // Merge ulimits with daemon defaults
  1005. ulIdx := make(map[string]struct{})
  1006. for _, ul := range ulimits {
  1007. ulIdx[ul.Name] = struct{}{}
  1008. }
  1009. for name, ul := range daemonCfg.Ulimits {
  1010. if _, exists := ulIdx[name]; !exists {
  1011. ulimits = append(ulimits, ul)
  1012. }
  1013. }
  1014. c.Ulimits = ulimits
  1015. }