oci_linux.go 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085
  1. package daemon // import "github.com/docker/docker/daemon"
  2. import (
  3. "context"
  4. "fmt"
  5. "os"
  6. "os/exec"
  7. "path/filepath"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. cdcgroups "github.com/containerd/cgroups"
  12. "github.com/containerd/containerd/containers"
  13. coci "github.com/containerd/containerd/oci"
  14. "github.com/containerd/containerd/pkg/apparmor"
  15. "github.com/containerd/containerd/pkg/userns"
  16. containertypes "github.com/docker/docker/api/types/container"
  17. "github.com/docker/docker/container"
  18. dconfig "github.com/docker/docker/daemon/config"
  19. "github.com/docker/docker/errdefs"
  20. "github.com/docker/docker/oci"
  21. "github.com/docker/docker/oci/caps"
  22. "github.com/docker/docker/pkg/idtools"
  23. "github.com/docker/docker/pkg/stringid"
  24. "github.com/docker/docker/rootless/specconv"
  25. volumemounts "github.com/docker/docker/volume/mounts"
  26. "github.com/moby/sys/mount"
  27. "github.com/moby/sys/mountinfo"
  28. "github.com/opencontainers/runc/libcontainer/cgroups"
  29. "github.com/opencontainers/runc/libcontainer/user"
  30. specs "github.com/opencontainers/runtime-spec/specs-go"
  31. "github.com/pkg/errors"
  32. "github.com/sirupsen/logrus"
  33. "golang.org/x/sys/unix"
  34. )
// inContainerInitPath is the path inside the container where the init binary
// is bind-mounted (read-only) and from which it is executed as the first
// process argument when a custom init is enabled.
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
  36. // WithRlimits sets the container's rlimits along with merging the daemon's rlimits
  37. func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
  38. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  39. var rlimits []specs.POSIXRlimit
  40. // We want to leave the original HostConfig alone so make a copy here
  41. hostConfig := *c.HostConfig
  42. // Merge with the daemon defaults
  43. daemon.mergeUlimits(&hostConfig)
  44. for _, ul := range hostConfig.Ulimits {
  45. rlimits = append(rlimits, specs.POSIXRlimit{
  46. Type: "RLIMIT_" + strings.ToUpper(ul.Name),
  47. Soft: uint64(ul.Soft),
  48. Hard: uint64(ul.Hard),
  49. })
  50. }
  51. s.Process.Rlimits = rlimits
  52. return nil
  53. }
  54. }
  55. // WithLibnetwork sets the libnetwork hook
  56. func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
  57. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  58. if s.Hooks == nil {
  59. s.Hooks = &specs.Hooks{}
  60. }
  61. for _, ns := range s.Linux.Namespaces {
  62. if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
  63. target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
  64. shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
  65. s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
  66. Path: target,
  67. Args: []string{
  68. "libnetwork-setkey",
  69. "-exec-root=" + daemon.configStore.GetExecRoot(),
  70. c.ID,
  71. shortNetCtlrID,
  72. },
  73. })
  74. }
  75. }
  76. return nil
  77. }
  78. }
  79. // WithRootless sets the spec to the rootless configuration
  80. func WithRootless(daemon *Daemon) coci.SpecOpts {
  81. return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  82. var v2Controllers []string
  83. if daemon.getCgroupDriver() == cgroupSystemdDriver {
  84. if cdcgroups.Mode() != cdcgroups.Unified {
  85. return errors.New("rootless systemd driver doesn't support cgroup v1")
  86. }
  87. rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
  88. if rootlesskitParentEUID == "" {
  89. return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
  90. }
  91. euid, err := strconv.Atoi(rootlesskitParentEUID)
  92. if err != nil {
  93. return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
  94. }
  95. controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
  96. controllersFile, err := os.ReadFile(controllersPath)
  97. if err != nil {
  98. return err
  99. }
  100. v2Controllers = strings.Fields(string(controllersFile))
  101. }
  102. return specconv.ToRootless(s, v2Controllers)
  103. }
  104. }
  105. // WithOOMScore sets the oom score
  106. func WithOOMScore(score *int) coci.SpecOpts {
  107. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  108. s.Process.OOMScoreAdj = score
  109. return nil
  110. }
  111. }
  112. // WithSelinux sets the selinux labels
  113. func WithSelinux(c *container.Container) coci.SpecOpts {
  114. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  115. s.Process.SelinuxLabel = c.GetProcessLabel()
  116. s.Linux.MountLabel = c.MountLabel
  117. return nil
  118. }
  119. }
  120. // WithApparmor sets the apparmor profile
  121. func WithApparmor(c *container.Container) coci.SpecOpts {
  122. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  123. if apparmor.HostSupports() {
  124. var appArmorProfile string
  125. if c.AppArmorProfile != "" {
  126. appArmorProfile = c.AppArmorProfile
  127. } else if c.HostConfig.Privileged {
  128. appArmorProfile = unconfinedAppArmorProfile
  129. } else {
  130. appArmorProfile = defaultAppArmorProfile
  131. }
  132. if appArmorProfile == defaultAppArmorProfile {
  133. // Unattended upgrades and other fun services can unload AppArmor
  134. // profiles inadvertently. Since we cannot store our profile in
  135. // /etc/apparmor.d, nor can we practically add other ways of
  136. // telling the system to keep our profile loaded, in order to make
  137. // sure that we keep the default profile enabled we dynamically
  138. // reload it if necessary.
  139. if err := ensureDefaultAppArmorProfile(); err != nil {
  140. return err
  141. }
  142. }
  143. s.Process.ApparmorProfile = appArmorProfile
  144. }
  145. return nil
  146. }
  147. }
  148. // WithCapabilities sets the container's capabilties
  149. func WithCapabilities(c *container.Container) coci.SpecOpts {
  150. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  151. capabilities, err := caps.TweakCapabilities(
  152. caps.DefaultCapabilities(),
  153. c.HostConfig.CapAdd,
  154. c.HostConfig.CapDrop,
  155. c.HostConfig.Privileged,
  156. )
  157. if err != nil {
  158. return err
  159. }
  160. return oci.SetCapabilities(s, capabilities)
  161. }
  162. }
  163. func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
  164. p, err := getPath()
  165. if err != nil {
  166. return "", err
  167. }
  168. return c.GetResourcePath(p)
  169. }
  170. func getUser(c *container.Container, username string) (specs.User, error) {
  171. var usr specs.User
  172. passwdPath, err := resourcePath(c, user.GetPasswdPath)
  173. if err != nil {
  174. return usr, err
  175. }
  176. groupPath, err := resourcePath(c, user.GetGroupPath)
  177. if err != nil {
  178. return usr, err
  179. }
  180. execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
  181. if err != nil {
  182. return usr, err
  183. }
  184. usr.UID = uint32(execUser.Uid)
  185. usr.GID = uint32(execUser.Gid)
  186. usr.AdditionalGids = []uint32{usr.GID}
  187. var addGroups []int
  188. if len(c.HostConfig.GroupAdd) > 0 {
  189. addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
  190. if err != nil {
  191. return usr, err
  192. }
  193. }
  194. for _, g := range append(execUser.Sgids, addGroups...) {
  195. usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
  196. }
  197. return usr, nil
  198. }
  199. func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
  200. for i, n := range s.Linux.Namespaces {
  201. if n.Type == ns.Type {
  202. s.Linux.Namespaces[i] = ns
  203. return
  204. }
  205. }
  206. s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
  207. }
  208. // WithNamespaces sets the container's namespaces
  209. func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
  210. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  211. userNS := false
  212. // user
  213. if c.HostConfig.UsernsMode.IsPrivate() {
  214. uidMap := daemon.idMapping.UIDMaps
  215. if uidMap != nil {
  216. userNS = true
  217. ns := specs.LinuxNamespace{Type: "user"}
  218. setNamespace(s, ns)
  219. s.Linux.UIDMappings = specMapping(uidMap)
  220. s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
  221. }
  222. }
  223. // network
  224. if !c.Config.NetworkDisabled {
  225. ns := specs.LinuxNamespace{Type: "network"}
  226. if c.HostConfig.NetworkMode.IsContainer() {
  227. nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
  228. if err != nil {
  229. return err
  230. }
  231. ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
  232. if userNS {
  233. // to share a net namespace, they must also share a user namespace
  234. nsUser := specs.LinuxNamespace{Type: "user"}
  235. nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
  236. setNamespace(s, nsUser)
  237. }
  238. } else if c.HostConfig.NetworkMode.IsHost() {
  239. ns.Path = c.NetworkSettings.SandboxKey
  240. }
  241. setNamespace(s, ns)
  242. }
  243. // ipc
  244. ipcMode := c.HostConfig.IpcMode
  245. if !ipcMode.Valid() {
  246. return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
  247. }
  248. switch {
  249. case ipcMode.IsContainer():
  250. ns := specs.LinuxNamespace{Type: "ipc"}
  251. ic, err := daemon.getIpcContainer(ipcMode.Container())
  252. if err != nil {
  253. return errdefs.InvalidParameter(errors.Wrapf(err, "invalid IPC mode: %v", ipcMode))
  254. }
  255. ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
  256. setNamespace(s, ns)
  257. if userNS {
  258. // to share an IPC namespace, they must also share a user namespace
  259. nsUser := specs.LinuxNamespace{Type: "user"}
  260. nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
  261. setNamespace(s, nsUser)
  262. }
  263. case ipcMode.IsHost():
  264. oci.RemoveNamespace(s, "ipc")
  265. case ipcMode.IsEmpty():
  266. // A container was created by an older version of the daemon.
  267. // The default behavior used to be what is now called "shareable".
  268. fallthrough
  269. case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
  270. ns := specs.LinuxNamespace{Type: "ipc"}
  271. setNamespace(s, ns)
  272. }
  273. // pid
  274. if !c.HostConfig.PidMode.Valid() {
  275. return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", c.HostConfig.PidMode))
  276. }
  277. if c.HostConfig.PidMode.IsContainer() {
  278. pc, err := daemon.getPidContainer(c)
  279. if err != nil {
  280. return err
  281. }
  282. ns := specs.LinuxNamespace{
  283. Type: "pid",
  284. Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
  285. }
  286. setNamespace(s, ns)
  287. if userNS {
  288. // to share a PID namespace, they must also share a user namespace
  289. nsUser := specs.LinuxNamespace{
  290. Type: "user",
  291. Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
  292. }
  293. setNamespace(s, nsUser)
  294. }
  295. } else if c.HostConfig.PidMode.IsHost() {
  296. oci.RemoveNamespace(s, "pid")
  297. } else {
  298. ns := specs.LinuxNamespace{Type: "pid"}
  299. setNamespace(s, ns)
  300. }
  301. // uts
  302. if !c.HostConfig.UTSMode.Valid() {
  303. return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
  304. }
  305. if c.HostConfig.UTSMode.IsHost() {
  306. oci.RemoveNamespace(s, "uts")
  307. s.Hostname = ""
  308. }
  309. // cgroup
  310. if !c.HostConfig.CgroupnsMode.Valid() {
  311. return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
  312. }
  313. if !c.HostConfig.CgroupnsMode.IsEmpty() {
  314. if c.HostConfig.CgroupnsMode.IsPrivate() {
  315. nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
  316. setNamespace(s, nsCgroup)
  317. }
  318. }
  319. return nil
  320. }
  321. }
  322. func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
  323. var ids []specs.LinuxIDMapping
  324. for _, item := range s {
  325. ids = append(ids, specs.LinuxIDMapping{
  326. HostID: uint32(item.HostID),
  327. ContainerID: uint32(item.ContainerID),
  328. Size: uint32(item.Size),
  329. })
  330. }
  331. return ids
  332. }
  333. // Get the source mount point of directory passed in as argument. Also return
  334. // optional fields.
  335. func getSourceMount(source string) (string, string, error) {
  336. // Ensure any symlinks are resolved.
  337. sourcePath, err := filepath.EvalSymlinks(source)
  338. if err != nil {
  339. return "", "", err
  340. }
  341. mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
  342. if err != nil {
  343. return "", "", err
  344. }
  345. if len(mi) < 1 {
  346. return "", "", fmt.Errorf("Can't find mount point of %s", source)
  347. }
  348. // find the longest mount point
  349. var idx, maxlen int
  350. for i := range mi {
  351. if len(mi[i].Mountpoint) > maxlen {
  352. maxlen = len(mi[i].Mountpoint)
  353. idx = i
  354. }
  355. }
  356. return mi[idx].Mountpoint, mi[idx].Optional, nil
  357. }
// Prefixes looked for in a mount's optional mountinfo fields (see
// hasMountInfoOption) to decide how the mount propagates.
const (
	// sharedPropagationOption is the prefix checked for a shared mount.
	sharedPropagationOption = "shared:"
	// slavePropagationOption is the prefix checked for a slave mount.
	slavePropagationOption = "master:"
)
  362. // hasMountInfoOption checks if any of the passed any of the given option values
  363. // are set in the passed in option string.
  364. func hasMountInfoOption(opts string, vals ...string) bool {
  365. for _, opt := range strings.Split(opts, " ") {
  366. for _, val := range vals {
  367. if strings.HasPrefix(opt, val) {
  368. return true
  369. }
  370. }
  371. }
  372. return false
  373. }
  374. // Ensure mount point on which path is mounted, is shared.
  375. func ensureShared(path string) error {
  376. sourceMount, optionalOpts, err := getSourceMount(path)
  377. if err != nil {
  378. return err
  379. }
  380. // Make sure source mount point is shared.
  381. if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
  382. return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
  383. }
  384. return nil
  385. }
  386. // Ensure mount point on which path is mounted, is either shared or slave.
  387. func ensureSharedOrSlave(path string) error {
  388. sourceMount, optionalOpts, err := getSourceMount(path)
  389. if err != nil {
  390. return err
  391. }
  392. if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
  393. return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
  394. }
  395. return nil
  396. }
  397. // Get the set of mount flags that are set on the mount that contains the given
  398. // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
  399. // bind-mounting "with options" will not fail with user namespaces, due to
  400. // kernel restrictions that require user namespace mounts to preserve
  401. // CL_UNPRIVILEGED locked flags.
  402. func getUnprivilegedMountFlags(path string) ([]string, error) {
  403. var statfs unix.Statfs_t
  404. if err := unix.Statfs(path, &statfs); err != nil {
  405. return nil, err
  406. }
  407. // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
  408. unprivilegedFlags := map[uint64]string{
  409. unix.MS_RDONLY: "ro",
  410. unix.MS_NODEV: "nodev",
  411. unix.MS_NOEXEC: "noexec",
  412. unix.MS_NOSUID: "nosuid",
  413. unix.MS_NOATIME: "noatime",
  414. unix.MS_RELATIME: "relatime",
  415. unix.MS_NODIRATIME: "nodiratime",
  416. }
  417. var flags []string
  418. for mask, flag := range unprivilegedFlags {
  419. if uint64(statfs.Flags)&mask == mask {
  420. flags = append(flags, flag)
  421. }
  422. }
  423. return flags, nil
  424. }
var (
	// mountPropagationMap translates a propagation mode name into the
	// corresponding mount flag. Looking up an unknown (or empty) name yields
	// the zero value 0.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap: it
	// translates a propagation mount flag back into its mode name.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
  443. // inSlice tests whether a string is contained in a slice of strings or not.
  444. // Comparison is case sensitive
  445. func inSlice(slice []string, s string) bool {
  446. for _, ss := range slice {
  447. if s == ss {
  448. return true
  449. }
  450. }
  451. return false
  452. }
// WithMounts sets the container's mounts.
//
// The returned SpecOpts sets up the container's mounts root, IPC directories
// and secret directory, collects the container's mounts (volumes/binds, IPC
// mounts where applicable, tmpfs and secret mounts), removes the spec's
// default mounts that those override, and appends the collected mounts to the
// spec. Afterwards it applies the read-only rootfs, privileged and user
// namespace adjustments to the resulting mount list.
//
// The error result is named so that the deferred function can see it and
// clean up the secret directory when the option fails.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// Registered before setupSecretDir so that any failure from here on
		// (including setupSecretDir itself) triggers the cleanup.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		// IPC mounts are added for every IPC mode except private and empty.
		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		sort.Sort(mounts(ms))

		// NOTE: from here on the local variable "mounts" shadows the sort
		// helper type of the same name used just above.
		mounts := ms

		// Set of destinations that are provided by the container's own mounts.
		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		//  - mounts overridden by a user supplied mount;
		//  - all mounts under /dev if a user supplied /dev is present;
		//  - /dev/shm, in case IpcMode is none.
		// While at it, also
		//  - set size for /dev/shm from shmsize.
		//
		// defaultMounts reuses the backing array of s.Mounts (in-place filter).
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			// tmpfs mounts get a fixed set of default options merged with
			// the comma-separated options carried in m.Data.
			if m.Source == "tmpfs" {
				data := m.Data
				parser := volumemounts.NewParser()
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			// Everything else is added as a bind mount.
			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			// pFlag is 0 when m.Propagation is not a key of mountPropagationMap;
			// no propagation option is added in that case.
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" || userns.RunningInUserNS() {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		// With a read-only rootfs, every mount that is neither user supplied
		// nor one of the listed special destinations is made read-only too.
		if s.Root.Readonly {
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			// Privileged containers get no read-only or masked paths.
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil
	}
}
  645. // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
  646. // exist, so do not add the default ones if running on an old kernel.
  647. func sysctlExists(s string) bool {
  648. f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
  649. _, err := os.Stat(f)
  650. return err == nil
  651. }
  652. // WithCommonOptions sets common docker options
  653. func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
  654. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  655. if c.BaseFS == "" && !daemon.UsesSnapshotter() {
  656. return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly empty")
  657. }
  658. linkedEnv, err := daemon.setupLinkedContainers(c)
  659. if err != nil {
  660. return err
  661. }
  662. if !daemon.UsesSnapshotter() {
  663. s.Root = &specs.Root{
  664. Path: c.BaseFS,
  665. Readonly: c.HostConfig.ReadonlyRootfs,
  666. }
  667. }
  668. if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
  669. return err
  670. }
  671. cwd := c.Config.WorkingDir
  672. if len(cwd) == 0 {
  673. cwd = "/"
  674. }
  675. s.Process.Args = append([]string{c.Path}, c.Args...)
  676. // only add the custom init if it is specified and the container is running in its
  677. // own private pid namespace. It does not make sense to add if it is running in the
  678. // host namespace or another container's pid namespace where we already have an init
  679. if c.HostConfig.PidMode.IsPrivate() {
  680. if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
  681. (c.HostConfig.Init == nil && daemon.configStore.Init) {
  682. s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
  683. path := daemon.configStore.InitPath
  684. if path == "" {
  685. path, err = exec.LookPath(dconfig.DefaultInitBinary)
  686. if err != nil {
  687. return err
  688. }
  689. }
  690. s.Mounts = append(s.Mounts, specs.Mount{
  691. Destination: inContainerInitPath,
  692. Type: "bind",
  693. Source: path,
  694. Options: []string{"bind", "ro"},
  695. })
  696. }
  697. }
  698. s.Process.Cwd = cwd
  699. s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
  700. s.Process.Terminal = c.Config.Tty
  701. s.Hostname = c.Config.Hostname
  702. setLinuxDomainname(c, s)
  703. // Add default sysctls that are generally safe and useful; currently we
  704. // grant the capabilities to allow these anyway. You can override if
  705. // you want to restore the original behaviour.
  706. // We do not set network sysctls if network namespace is host, or if we are
  707. // joining an existing namespace, only if we create a new net namespace.
  708. if c.HostConfig.NetworkMode.IsPrivate() {
  709. // We cannot set up ping socket support in a user namespace
  710. userNS := daemon.configStore.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
  711. if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
  712. // allow unprivileged ICMP echo sockets without CAP_NET_RAW
  713. s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
  714. }
  715. // allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
  716. if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
  717. s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
  718. }
  719. }
  720. return nil
  721. }
  722. }
  723. // WithCgroups sets the container's cgroups
  724. func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
  725. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  726. var cgroupsPath string
  727. scopePrefix := "docker"
  728. parent := "/docker"
  729. useSystemd := UsingSystemd(daemon.configStore)
  730. if useSystemd {
  731. parent = "system.slice"
  732. if daemon.configStore.Rootless {
  733. parent = "user.slice"
  734. }
  735. }
  736. if c.HostConfig.CgroupParent != "" {
  737. parent = c.HostConfig.CgroupParent
  738. } else if daemon.configStore.CgroupParent != "" {
  739. parent = daemon.configStore.CgroupParent
  740. }
  741. if useSystemd {
  742. cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
  743. logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
  744. } else {
  745. cgroupsPath = filepath.Join(parent, c.ID)
  746. }
  747. s.Linux.CgroupsPath = cgroupsPath
  748. // the rest is only needed for CPU RT controller
  749. if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 {
  750. return nil
  751. }
  752. p := cgroupsPath
  753. if useSystemd {
  754. initPath, err := cgroups.GetInitCgroup("cpu")
  755. if err != nil {
  756. return errors.Wrap(err, "unable to init CPU RT controller")
  757. }
  758. _, err = cgroups.GetOwnCgroup("cpu")
  759. if err != nil {
  760. return errors.Wrap(err, "unable to init CPU RT controller")
  761. }
  762. p = filepath.Join(initPath, s.Linux.CgroupsPath)
  763. }
  764. // Clean path to guard against things like ../../../BAD
  765. parentPath := filepath.Dir(p)
  766. if !filepath.IsAbs(parentPath) {
  767. parentPath = filepath.Clean("/" + parentPath)
  768. }
  769. mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
  770. if err != nil {
  771. return errors.Wrap(err, "unable to init CPU RT controller")
  772. }
  773. // When docker is run inside docker, the root is based of the host cgroup.
  774. // Should this be handled in runc/libcontainer/cgroups ?
  775. if strings.HasPrefix(root, "/docker/") {
  776. root = "/"
  777. }
  778. mnt = filepath.Join(mnt, root)
  779. if err := daemon.initCPURtController(mnt, parentPath); err != nil {
  780. return errors.Wrap(err, "unable to init CPU RT controller")
  781. }
  782. return nil
  783. }
  784. }
  785. // WithDevices sets the container's devices
  786. func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
  787. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  788. // Build lists of devices allowed and created within the container.
  789. var devs []specs.LinuxDevice
  790. devPermissions := s.Linux.Resources.Devices
  791. if c.HostConfig.Privileged {
  792. hostDevices, err := coci.HostDevices()
  793. if err != nil {
  794. return err
  795. }
  796. devs = append(devs, hostDevices...)
  797. // adding device mappings in privileged containers
  798. for _, deviceMapping := range c.HostConfig.Devices {
  799. // issue a warning that custom cgroup permissions are ignored in privileged mode
  800. if deviceMapping.CgroupPermissions != "rwm" {
  801. logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
  802. }
  803. // issue a warning that the device path already exists via /dev mounting in privileged mode
  804. if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
  805. logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
  806. continue
  807. }
  808. d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
  809. if err != nil {
  810. return err
  811. }
  812. devs = append(devs, d...)
  813. }
  814. devPermissions = []specs.LinuxDeviceCgroup{
  815. {
  816. Allow: true,
  817. Access: "rwm",
  818. },
  819. }
  820. } else {
  821. for _, deviceMapping := range c.HostConfig.Devices {
  822. d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
  823. if err != nil {
  824. return err
  825. }
  826. devs = append(devs, d...)
  827. devPermissions = append(devPermissions, dPermissions...)
  828. }
  829. var err error
  830. devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
  831. if err != nil {
  832. return err
  833. }
  834. }
  835. s.Linux.Devices = append(s.Linux.Devices, devs...)
  836. s.Linux.Resources.Devices = devPermissions
  837. for _, req := range c.HostConfig.DeviceRequests {
  838. if err := daemon.handleDevice(req, s); err != nil {
  839. return err
  840. }
  841. }
  842. return nil
  843. }
  844. }
  845. // WithResources applies the container resources
  846. func WithResources(c *container.Container) coci.SpecOpts {
  847. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  848. r := c.HostConfig.Resources
  849. weightDevices, err := getBlkioWeightDevices(r)
  850. if err != nil {
  851. return err
  852. }
  853. readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
  854. if err != nil {
  855. return err
  856. }
  857. writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
  858. if err != nil {
  859. return err
  860. }
  861. readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
  862. if err != nil {
  863. return err
  864. }
  865. writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
  866. if err != nil {
  867. return err
  868. }
  869. memoryRes := getMemoryResources(r)
  870. cpuRes, err := getCPUResources(r)
  871. if err != nil {
  872. return err
  873. }
  874. blkioWeight := r.BlkioWeight
  875. specResources := &specs.LinuxResources{
  876. Memory: memoryRes,
  877. CPU: cpuRes,
  878. BlockIO: &specs.LinuxBlockIO{
  879. Weight: &blkioWeight,
  880. WeightDevice: weightDevices,
  881. ThrottleReadBpsDevice: readBpsDevice,
  882. ThrottleWriteBpsDevice: writeBpsDevice,
  883. ThrottleReadIOPSDevice: readIOpsDevice,
  884. ThrottleWriteIOPSDevice: writeIOpsDevice,
  885. },
  886. Pids: getPidsLimit(r),
  887. }
  888. if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
  889. specResources.Devices = s.Linux.Resources.Devices
  890. }
  891. s.Linux.Resources = specResources
  892. return nil
  893. }
  894. }
  895. // WithSysctls sets the container's sysctls
  896. func WithSysctls(c *container.Container) coci.SpecOpts {
  897. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  898. // We merge the sysctls injected above with the HostConfig (latter takes
  899. // precedence for backwards-compatibility reasons).
  900. for k, v := range c.HostConfig.Sysctls {
  901. s.Linux.Sysctl[k] = v
  902. }
  903. return nil
  904. }
  905. }
  906. // WithUser sets the container's user
  907. func WithUser(c *container.Container) coci.SpecOpts {
  908. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  909. var err error
  910. s.Process.User, err = getUser(c, c.Config.User)
  911. return err
  912. }
  913. }
  914. func (daemon *Daemon) createSpec(ctx context.Context, c *container.Container) (retSpec *specs.Spec, err error) {
  915. var (
  916. opts []coci.SpecOpts
  917. s = oci.DefaultSpec()
  918. )
  919. opts = append(opts,
  920. WithCommonOptions(daemon, c),
  921. WithCgroups(daemon, c),
  922. WithResources(c),
  923. WithSysctls(c),
  924. WithDevices(daemon, c),
  925. WithUser(c),
  926. WithRlimits(daemon, c),
  927. WithNamespaces(daemon, c),
  928. WithCapabilities(c),
  929. WithSeccomp(daemon, c),
  930. WithMounts(daemon, c),
  931. WithLibnetwork(daemon, c),
  932. WithApparmor(c),
  933. WithSelinux(c),
  934. WithOOMScore(&c.HostConfig.OomScoreAdj),
  935. )
  936. if c.NoNewPrivileges {
  937. opts = append(opts, coci.WithNoNewPrivileges)
  938. }
  939. if c.Config.Tty {
  940. opts = append(opts, WithConsoleSize(c))
  941. }
  942. // Set the masked and readonly paths with regard to the host config options if they are set.
  943. if c.HostConfig.MaskedPaths != nil {
  944. opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  945. }
  946. if c.HostConfig.ReadonlyPaths != nil {
  947. opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  948. }
  949. if daemon.configStore.Rootless {
  950. opts = append(opts, WithRootless(daemon))
  951. }
  952. var snapshotter, snapshotKey string
  953. if daemon.UsesSnapshotter() {
  954. snapshotter = daemon.imageService.StorageDriver()
  955. snapshotKey = c.ID
  956. }
  957. return &s, coci.ApplyOpts(ctx, nil, &containers.Container{
  958. ID: c.ID,
  959. Snapshotter: snapshotter,
  960. SnapshotKey: snapshotKey,
  961. }, &s, opts...)
  962. }
  963. func clearReadOnly(m *specs.Mount) {
  964. var opt []string
  965. for _, o := range m.Options {
  966. if o != "ro" {
  967. opt = append(opt, o)
  968. }
  969. }
  970. m.Options = opt
  971. }
  972. // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  973. func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  974. ulimits := c.Ulimits
  975. // Merge ulimits with daemon defaults
  976. ulIdx := make(map[string]struct{})
  977. for _, ul := range ulimits {
  978. ulIdx[ul.Name] = struct{}{}
  979. }
  980. for name, ul := range daemon.configStore.Ulimits {
  981. if _, exists := ulIdx[name]; !exists {
  982. ulimits = append(ulimits, ul)
  983. }
  984. }
  985. c.Ulimits = ulimits
  986. }