oci_linux.go 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043
  1. package daemon // import "github.com/docker/docker/daemon"
  2. import (
  3. "context"
  4. "fmt"
  5. "io"
  6. "io/ioutil"
  7. "os"
  8. "os/exec"
  9. "path/filepath"
  10. "sort"
  11. "strconv"
  12. "strings"
  13. "github.com/containerd/containerd/containers"
  14. coci "github.com/containerd/containerd/oci"
  15. containertypes "github.com/docker/docker/api/types/container"
  16. "github.com/docker/docker/container"
  17. daemonconfig "github.com/docker/docker/daemon/config"
  18. "github.com/docker/docker/oci"
  19. "github.com/docker/docker/oci/caps"
  20. "github.com/docker/docker/pkg/idtools"
  21. "github.com/docker/docker/pkg/stringid"
  22. "github.com/docker/docker/rootless/specconv"
  23. volumemounts "github.com/docker/docker/volume/mounts"
  24. "github.com/moby/sys/mount"
  25. "github.com/moby/sys/mountinfo"
  26. "github.com/opencontainers/runc/libcontainer/apparmor"
  27. "github.com/opencontainers/runc/libcontainer/cgroups"
  28. "github.com/opencontainers/runc/libcontainer/devices"
  29. rsystem "github.com/opencontainers/runc/libcontainer/system"
  30. "github.com/opencontainers/runc/libcontainer/user"
  31. specs "github.com/opencontainers/runtime-spec/specs-go"
  32. "github.com/pkg/errors"
  33. "github.com/sirupsen/logrus"
  34. "golang.org/x/sys/unix"
  35. )
// inContainerInitPath is the path inside the container at which the init
// binary (daemonconfig.DefaultInitBinary) is bind-mounted when a custom init
// is requested.
const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary
  37. // WithRlimits sets the container's rlimits along with merging the daemon's rlimits
  38. func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
  39. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  40. var rlimits []specs.POSIXRlimit
  41. // We want to leave the original HostConfig alone so make a copy here
  42. hostConfig := *c.HostConfig
  43. // Merge with the daemon defaults
  44. daemon.mergeUlimits(&hostConfig)
  45. for _, ul := range hostConfig.Ulimits {
  46. rlimits = append(rlimits, specs.POSIXRlimit{
  47. Type: "RLIMIT_" + strings.ToUpper(ul.Name),
  48. Soft: uint64(ul.Soft),
  49. Hard: uint64(ul.Hard),
  50. })
  51. }
  52. s.Process.Rlimits = rlimits
  53. return nil
  54. }
  55. }
  56. // WithLibnetwork sets the libnetwork hook
  57. func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
  58. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  59. if s.Hooks == nil {
  60. s.Hooks = &specs.Hooks{}
  61. }
  62. for _, ns := range s.Linux.Namespaces {
  63. if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
  64. target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
  65. shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
  66. s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
  67. Path: target,
  68. Args: []string{
  69. "libnetwork-setkey",
  70. "-exec-root=" + daemon.configStore.GetExecRoot(),
  71. c.ID,
  72. shortNetCtlrID,
  73. },
  74. })
  75. }
  76. }
  77. return nil
  78. }
  79. }
  80. // WithRootless sets the spec to the rootless configuration
  81. func WithRootless(daemon *Daemon) coci.SpecOpts {
  82. return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  83. var v2Controllers []string
  84. if daemon.getCgroupDriver() == cgroupSystemdDriver {
  85. if !cgroups.IsCgroup2UnifiedMode() {
  86. return errors.New("rootless systemd driver doesn't support cgroup v1")
  87. }
  88. rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
  89. if rootlesskitParentEUID == "" {
  90. return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
  91. }
  92. controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%s.slice/cgroup.controllers", rootlesskitParentEUID)
  93. controllersFile, err := ioutil.ReadFile(controllersPath)
  94. if err != nil {
  95. return err
  96. }
  97. v2Controllers = strings.Fields(string(controllersFile))
  98. }
  99. return specconv.ToRootless(s, v2Controllers)
  100. }
  101. }
  102. // WithOOMScore sets the oom score
  103. func WithOOMScore(score *int) coci.SpecOpts {
  104. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  105. s.Process.OOMScoreAdj = score
  106. return nil
  107. }
  108. }
  109. // WithSelinux sets the selinux labels
  110. func WithSelinux(c *container.Container) coci.SpecOpts {
  111. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  112. s.Process.SelinuxLabel = c.GetProcessLabel()
  113. s.Linux.MountLabel = c.MountLabel
  114. return nil
  115. }
  116. }
  117. // WithApparmor sets the apparmor profile
  118. func WithApparmor(c *container.Container) coci.SpecOpts {
  119. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  120. if apparmor.IsEnabled() {
  121. var appArmorProfile string
  122. if c.AppArmorProfile != "" {
  123. appArmorProfile = c.AppArmorProfile
  124. } else if c.HostConfig.Privileged {
  125. appArmorProfile = unconfinedAppArmorProfile
  126. } else {
  127. appArmorProfile = defaultAppArmorProfile
  128. }
  129. if appArmorProfile == defaultAppArmorProfile {
  130. // Unattended upgrades and other fun services can unload AppArmor
  131. // profiles inadvertently. Since we cannot store our profile in
  132. // /etc/apparmor.d, nor can we practically add other ways of
  133. // telling the system to keep our profile loaded, in order to make
  134. // sure that we keep the default profile enabled we dynamically
  135. // reload it if necessary.
  136. if err := ensureDefaultAppArmorProfile(); err != nil {
  137. return err
  138. }
  139. }
  140. s.Process.ApparmorProfile = appArmorProfile
  141. }
  142. return nil
  143. }
  144. }
  145. // WithCapabilities sets the container's capabilties
  146. func WithCapabilities(c *container.Container) coci.SpecOpts {
  147. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  148. capabilities, err := caps.TweakCapabilities(
  149. caps.DefaultCapabilities(),
  150. c.HostConfig.CapAdd,
  151. c.HostConfig.CapDrop,
  152. c.HostConfig.Capabilities,
  153. c.HostConfig.Privileged,
  154. )
  155. if err != nil {
  156. return err
  157. }
  158. return oci.SetCapabilities(s, capabilities)
  159. }
  160. }
  161. func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
  162. fp, err := c.GetResourcePath(p)
  163. if err != nil {
  164. return nil, err
  165. }
  166. return os.Open(fp)
  167. }
  168. func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
  169. passwdPath, err := user.GetPasswdPath()
  170. if err != nil {
  171. return 0, 0, nil, err
  172. }
  173. groupPath, err := user.GetGroupPath()
  174. if err != nil {
  175. return 0, 0, nil, err
  176. }
  177. passwdFile, err := readUserFile(c, passwdPath)
  178. if err == nil {
  179. defer passwdFile.Close()
  180. }
  181. groupFile, err := readUserFile(c, groupPath)
  182. if err == nil {
  183. defer groupFile.Close()
  184. }
  185. execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
  186. if err != nil {
  187. return 0, 0, nil, err
  188. }
  189. // todo: fix this double read by a change to libcontainer/user pkg
  190. groupFile, err = readUserFile(c, groupPath)
  191. if err == nil {
  192. defer groupFile.Close()
  193. }
  194. var addGroups []int
  195. if len(c.HostConfig.GroupAdd) > 0 {
  196. addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
  197. if err != nil {
  198. return 0, 0, nil, err
  199. }
  200. }
  201. uid := uint32(execUser.Uid)
  202. gid := uint32(execUser.Gid)
  203. sgids := append(execUser.Sgids, addGroups...)
  204. var additionalGids []uint32
  205. for _, g := range sgids {
  206. additionalGids = append(additionalGids, uint32(g))
  207. }
  208. return uid, gid, additionalGids, nil
  209. }
  210. func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
  211. for i, n := range s.Linux.Namespaces {
  212. if n.Type == ns.Type {
  213. s.Linux.Namespaces[i] = ns
  214. return
  215. }
  216. }
  217. s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
  218. }
// WithNamespaces sets the container's namespaces
//
// The returned option configures, in this order, the user, network, ipc, pid,
// uts and cgroup namespaces of the spec from c.HostConfig. The order matters:
// when a user namespace is in use, joining another container's network, ipc
// or pid namespace also overwrites the "user" entry with that container's
// user namespace path. An error is returned when a container to join cannot
// be resolved or when the IPC or cgroup namespace mode is not recognised.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// userNS records whether a private user namespace was configured below.
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDs()
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			// A "container:<id>" network mode joins that container's namespace.
			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
			if parts[0] == "container" {
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				// Host mode points the namespace at the container's sandbox key.
				ns.Path = c.NetworkSettings.SandboxKey
			}
			setNamespace(s, ns)
		}
		// ipc
		ipcMode := c.HostConfig.IpcMode
		switch {
		case ipcMode.IsContainer():
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			// Dropping the entry from the spec means no new IPC namespace is requested.
			oci.RemoveNamespace(s, "ipc")
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			// All of these modes get a fresh IPC namespace with no path.
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		default:
			return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
		}
		// pid
		if c.HostConfig.PidMode.IsContainer() {
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns := specs.LinuxNamespace{
				Type: "pid",
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			}
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{
					Type: "user",
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				}
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, "pid")
		} else {
			// Any other PID mode gets a fresh PID namespace.
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts
		if c.HostConfig.UTSMode.IsHost() {
			oci.RemoveNamespace(s, "uts")
			// The hostname is cleared together with the UTS namespace entry.
			s.Hostname = ""
		}
		// cgroup
		if !c.HostConfig.CgroupnsMode.IsEmpty() {
			cgroupNsMode := c.HostConfig.CgroupnsMode
			if !cgroupNsMode.Valid() {
				return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode)
			}
			// Only the private mode adds a namespace entry; other valid modes
			// leave the spec unchanged here.
			if cgroupNsMode.IsPrivate() {
				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
				setNamespace(s, nsCgroup)
			}
		}
		return nil
	}
}
  328. func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
  329. var ids []specs.LinuxIDMapping
  330. for _, item := range s {
  331. ids = append(ids, specs.LinuxIDMapping{
  332. HostID: uint32(item.HostID),
  333. ContainerID: uint32(item.ContainerID),
  334. Size: uint32(item.Size),
  335. })
  336. }
  337. return ids
  338. }
  339. // Get the source mount point of directory passed in as argument. Also return
  340. // optional fields.
  341. func getSourceMount(source string) (string, string, error) {
  342. // Ensure any symlinks are resolved.
  343. sourcePath, err := filepath.EvalSymlinks(source)
  344. if err != nil {
  345. return "", "", err
  346. }
  347. mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
  348. if err != nil {
  349. return "", "", err
  350. }
  351. if len(mi) < 1 {
  352. return "", "", fmt.Errorf("Can't find mount point of %s", source)
  353. }
  354. // find the longest mount point
  355. var idx, maxlen int
  356. for i := range mi {
  357. if len(mi[i].Mountpoint) > maxlen {
  358. maxlen = len(mi[i].Mountpoint)
  359. idx = i
  360. }
  361. }
  362. return mi[idx].Mountpoint, mi[idx].Optional, nil
  363. }
// Prefixes matched against the optional fields of a mountinfo entry to
// determine the mount's propagation type.
const (
	// sharedPropagationOption is the prefix checked for shared mounts.
	sharedPropagationOption = "shared:"
	// slavePropagationOption is the prefix checked for slave mounts.
	slavePropagationOption = "master:"
)
  368. // hasMountInfoOption checks if any of the passed any of the given option values
  369. // are set in the passed in option string.
  370. func hasMountInfoOption(opts string, vals ...string) bool {
  371. for _, opt := range strings.Split(opts, " ") {
  372. for _, val := range vals {
  373. if strings.HasPrefix(opt, val) {
  374. return true
  375. }
  376. }
  377. }
  378. return false
  379. }
  380. // Ensure mount point on which path is mounted, is shared.
  381. func ensureShared(path string) error {
  382. sourceMount, optionalOpts, err := getSourceMount(path)
  383. if err != nil {
  384. return err
  385. }
  386. // Make sure source mount point is shared.
  387. if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
  388. return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
  389. }
  390. return nil
  391. }
  392. // Ensure mount point on which path is mounted, is either shared or slave.
  393. func ensureSharedOrSlave(path string) error {
  394. sourceMount, optionalOpts, err := getSourceMount(path)
  395. if err != nil {
  396. return err
  397. }
  398. if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
  399. return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
  400. }
  401. return nil
  402. }
  403. // Get the set of mount flags that are set on the mount that contains the given
  404. // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
  405. // bind-mounting "with options" will not fail with user namespaces, due to
  406. // kernel restrictions that require user namespace mounts to preserve
  407. // CL_UNPRIVILEGED locked flags.
  408. func getUnprivilegedMountFlags(path string) ([]string, error) {
  409. var statfs unix.Statfs_t
  410. if err := unix.Statfs(path, &statfs); err != nil {
  411. return nil, err
  412. }
  413. // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
  414. unprivilegedFlags := map[uint64]string{
  415. unix.MS_RDONLY: "ro",
  416. unix.MS_NODEV: "nodev",
  417. unix.MS_NOEXEC: "noexec",
  418. unix.MS_NOSUID: "nosuid",
  419. unix.MS_NOATIME: "noatime",
  420. unix.MS_RELATIME: "relatime",
  421. unix.MS_NODIRATIME: "nodiratime",
  422. }
  423. var flags []string
  424. for mask, flag := range unprivilegedFlags {
  425. if uint64(statfs.Flags)&mask == mask {
  426. flags = append(flags, flag)
  427. }
  428. }
  429. return flags, nil
  430. }
var (
	// mountPropagationMap translates a propagation mode name into the
	// corresponding mount flag. Looking up an unknown name (including the
	// empty string) yields 0.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}
	// mountPropagationReverseMap is the inverse of mountPropagationMap and
	// translates a propagation mount flag back into its mode name.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
  449. // inSlice tests whether a string is contained in a slice of strings or not.
  450. // Comparison is case sensitive
  451. func inSlice(slice []string, s string) bool {
  452. for _, ss := range slice {
  453. if s == ss {
  454. return true
  455. }
  456. }
  457. return false
  458. }
// WithMounts sets the container's mounts
//
// The returned option prepares the container's mount root, IPC and secret
// directories, gathers the container's mounts (volumes, IPC, tmpfs and secret
// mounts), merges them with the default mounts already present in the spec,
// and then adjusts read-only and propagation settings. If the option fails
// after the secret directory cleanup has been registered, the secret
// directory is cleaned up again before returning.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	// err is a named result so that the deferred cleanup below can observe
	// the error the closure returns.
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}
		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}
		// Registered before setupSecretDir so that a failure there, or in any
		// later step, triggers the cleanup.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()
		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}
		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}
		// IPC mounts are only added for modes other than private and empty.
		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}
		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)
		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)
		sort.Sort(mounts(ms))
		// NOTE: from here on the local variable shadows the mounts sort type.
		mounts := ms
		// userMounts is the set of destinations covered by container mounts.
		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}
		// Copy all mounts from spec to defaultMounts, except for
		// - mounts overridden by a user supplied mount;
		// - all mounts under /dev if a user supplied /dev is present;
		// - /dev/shm, in case IpcMode is none.
		// While at it, also
		// - set size for /dev/shm from shmsize.
		// The filtering is done in place, reusing the backing array of s.Mounts.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}
			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}
			defaultMounts = append(defaultMounts, m)
		}
		s.Mounts = defaultMounts
		// Append the container's own mounts after the remaining defaults.
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				// tmpfs mounts start from a fixed set of options that the
				// mount's own comma-separated data is merged into.
				data := m.Data
				parser := volumemounts.NewParser("linux")
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}
				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}
				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}
			// Everything else is a bind mount.
			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			// pFlag is 0 when m.Propagation is not a key of mountPropagationMap.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}
					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				// Only raise the rootfs propagation when the source really is
				// a shared or slave mount.
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}
			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}
			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}
			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}
		// With a read-only rootfs, the remaining default mounts are made
		// read-only as well, except for the listed pseudo filesystems and any
		// destination covered by a container mount.
		if s.Root.Readonly {
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}
		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			// Privileged containers get no read-only or masked paths.
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}
		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}
		return nil
	}
}
  651. // WithCommonOptions sets common docker options
  652. func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
  653. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  654. if c.BaseFS == nil {
  655. return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
  656. }
  657. linkedEnv, err := daemon.setupLinkedContainers(c)
  658. if err != nil {
  659. return err
  660. }
  661. s.Root = &specs.Root{
  662. Path: c.BaseFS.Path(),
  663. Readonly: c.HostConfig.ReadonlyRootfs,
  664. }
  665. if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
  666. return err
  667. }
  668. cwd := c.Config.WorkingDir
  669. if len(cwd) == 0 {
  670. cwd = "/"
  671. }
  672. s.Process.Args = append([]string{c.Path}, c.Args...)
  673. // only add the custom init if it is specified and the container is running in its
  674. // own private pid namespace. It does not make sense to add if it is running in the
  675. // host namespace or another container's pid namespace where we already have an init
  676. if c.HostConfig.PidMode.IsPrivate() {
  677. if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
  678. (c.HostConfig.Init == nil && daemon.configStore.Init) {
  679. s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
  680. path := daemon.configStore.InitPath
  681. if path == "" {
  682. path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
  683. if err != nil {
  684. return err
  685. }
  686. }
  687. s.Mounts = append(s.Mounts, specs.Mount{
  688. Destination: inContainerInitPath,
  689. Type: "bind",
  690. Source: path,
  691. Options: []string{"bind", "ro"},
  692. })
  693. }
  694. }
  695. s.Process.Cwd = cwd
  696. s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
  697. s.Process.Terminal = c.Config.Tty
  698. s.Hostname = c.Config.Hostname
  699. setLinuxDomainname(c, s)
  700. return nil
  701. }
  702. }
  703. // WithCgroups sets the container's cgroups
  704. func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
  705. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  706. var cgroupsPath string
  707. scopePrefix := "docker"
  708. parent := "/docker"
  709. useSystemd := UsingSystemd(daemon.configStore)
  710. if useSystemd {
  711. parent = "system.slice"
  712. if daemon.configStore.Rootless {
  713. parent = "user.slice"
  714. }
  715. }
  716. if c.HostConfig.CgroupParent != "" {
  717. parent = c.HostConfig.CgroupParent
  718. } else if daemon.configStore.CgroupParent != "" {
  719. parent = daemon.configStore.CgroupParent
  720. }
  721. if useSystemd {
  722. cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
  723. logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
  724. } else {
  725. cgroupsPath = filepath.Join(parent, c.ID)
  726. }
  727. s.Linux.CgroupsPath = cgroupsPath
  728. p := cgroupsPath
  729. if useSystemd {
  730. initPath, err := cgroups.GetInitCgroup("cpu")
  731. if err != nil {
  732. return err
  733. }
  734. _, err = cgroups.GetOwnCgroup("cpu")
  735. if err != nil {
  736. return err
  737. }
  738. p = filepath.Join(initPath, s.Linux.CgroupsPath)
  739. }
  740. // Clean path to guard against things like ../../../BAD
  741. parentPath := filepath.Dir(p)
  742. if !filepath.IsAbs(parentPath) {
  743. parentPath = filepath.Clean("/" + parentPath)
  744. }
  745. if err := daemon.initCgroupsPath(parentPath); err != nil {
  746. return fmt.Errorf("linux init cgroups path: %v", err)
  747. }
  748. return nil
  749. }
  750. }
  751. // WithDevices sets the container's devices
  752. func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
  753. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  754. // Build lists of devices allowed and created within the container.
  755. var devs []specs.LinuxDevice
  756. devPermissions := s.Linux.Resources.Devices
  757. if c.HostConfig.Privileged && !rsystem.RunningInUserNS() {
  758. hostDevices, err := devices.HostDevices()
  759. if err != nil {
  760. return err
  761. }
  762. for _, d := range hostDevices {
  763. devs = append(devs, oci.Device(d))
  764. }
  765. // adding device mappings in privileged containers
  766. for _, deviceMapping := range c.HostConfig.Devices {
  767. // issue a warning that custom cgroup permissions are ignored in privileged mode
  768. if deviceMapping.CgroupPermissions != "rwm" {
  769. logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
  770. }
  771. // issue a warning that the device path already exists via /dev mounting in privileged mode
  772. if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
  773. logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
  774. continue
  775. }
  776. d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
  777. if err != nil {
  778. return err
  779. }
  780. devs = append(devs, d...)
  781. }
  782. devPermissions = []specs.LinuxDeviceCgroup{
  783. {
  784. Allow: true,
  785. Access: "rwm",
  786. },
  787. }
  788. } else {
  789. for _, deviceMapping := range c.HostConfig.Devices {
  790. d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
  791. if err != nil {
  792. return err
  793. }
  794. devs = append(devs, d...)
  795. devPermissions = append(devPermissions, dPermissions...)
  796. }
  797. var err error
  798. devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
  799. if err != nil {
  800. return err
  801. }
  802. }
  803. s.Linux.Devices = append(s.Linux.Devices, devs...)
  804. s.Linux.Resources.Devices = devPermissions
  805. for _, req := range c.HostConfig.DeviceRequests {
  806. if err := daemon.handleDevice(req, s); err != nil {
  807. return err
  808. }
  809. }
  810. return nil
  811. }
  812. }
  813. // WithResources applies the container resources
  814. func WithResources(c *container.Container) coci.SpecOpts {
  815. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  816. r := c.HostConfig.Resources
  817. weightDevices, err := getBlkioWeightDevices(r)
  818. if err != nil {
  819. return err
  820. }
  821. readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
  822. if err != nil {
  823. return err
  824. }
  825. writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
  826. if err != nil {
  827. return err
  828. }
  829. readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
  830. if err != nil {
  831. return err
  832. }
  833. writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
  834. if err != nil {
  835. return err
  836. }
  837. memoryRes := getMemoryResources(r)
  838. cpuRes, err := getCPUResources(r)
  839. if err != nil {
  840. return err
  841. }
  842. blkioWeight := r.BlkioWeight
  843. specResources := &specs.LinuxResources{
  844. Memory: memoryRes,
  845. CPU: cpuRes,
  846. BlockIO: &specs.LinuxBlockIO{
  847. Weight: &blkioWeight,
  848. WeightDevice: weightDevices,
  849. ThrottleReadBpsDevice: readBpsDevice,
  850. ThrottleWriteBpsDevice: writeBpsDevice,
  851. ThrottleReadIOPSDevice: readIOpsDevice,
  852. ThrottleWriteIOPSDevice: writeIOpsDevice,
  853. },
  854. Pids: getPidsLimit(r),
  855. }
  856. if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
  857. specResources.Devices = s.Linux.Resources.Devices
  858. }
  859. s.Linux.Resources = specResources
  860. return nil
  861. }
  862. }
  863. // WithSysctls sets the container's sysctls
  864. func WithSysctls(c *container.Container) coci.SpecOpts {
  865. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  866. // We merge the sysctls injected above with the HostConfig (latter takes
  867. // precedence for backwards-compatibility reasons).
  868. for k, v := range c.HostConfig.Sysctls {
  869. s.Linux.Sysctl[k] = v
  870. }
  871. return nil
  872. }
  873. }
  874. // WithUser sets the container's user
  875. func WithUser(c *container.Container) coci.SpecOpts {
  876. return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  877. uid, gid, additionalGids, err := getUser(c, c.Config.User)
  878. if err != nil {
  879. return err
  880. }
  881. s.Process.User.UID = uid
  882. s.Process.User.GID = gid
  883. s.Process.User.AdditionalGids = additionalGids
  884. return nil
  885. }
  886. }
  887. func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
  888. var (
  889. opts []coci.SpecOpts
  890. s = oci.DefaultSpec()
  891. )
  892. opts = append(opts,
  893. WithCommonOptions(daemon, c),
  894. WithCgroups(daemon, c),
  895. WithResources(c),
  896. WithSysctls(c),
  897. WithDevices(daemon, c),
  898. WithUser(c),
  899. WithRlimits(daemon, c),
  900. WithNamespaces(daemon, c),
  901. WithCapabilities(c),
  902. WithSeccomp(daemon, c),
  903. WithMounts(daemon, c),
  904. WithLibnetwork(daemon, c),
  905. WithApparmor(c),
  906. WithSelinux(c),
  907. WithOOMScore(&c.HostConfig.OomScoreAdj),
  908. )
  909. if c.NoNewPrivileges {
  910. opts = append(opts, coci.WithNoNewPrivileges)
  911. }
  912. // Set the masked and readonly paths with regard to the host config options if they are set.
  913. if c.HostConfig.MaskedPaths != nil {
  914. opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  915. }
  916. if c.HostConfig.ReadonlyPaths != nil {
  917. opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  918. }
  919. if daemon.configStore.Rootless {
  920. opts = append(opts, WithRootless(daemon))
  921. }
  922. return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
  923. ID: c.ID,
  924. }, &s, opts...)
  925. }
  926. func clearReadOnly(m *specs.Mount) {
  927. var opt []string
  928. for _, o := range m.Options {
  929. if o != "ro" {
  930. opt = append(opt, o)
  931. }
  932. }
  933. m.Options = opt
  934. }
  935. // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  936. func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  937. ulimits := c.Ulimits
  938. // Merge ulimits with daemon defaults
  939. ulIdx := make(map[string]struct{})
  940. for _, ul := range ulimits {
  941. ulIdx[ul.Name] = struct{}{}
  942. }
  943. for name, ul := range daemon.configStore.Ulimits {
  944. if _, exists := ulIdx[name]; !exists {
  945. ulimits = append(ulimits, ul)
  946. }
  947. }
  948. c.Ulimits = ulimits
  949. }