rootfs_linux.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. // +build linux
  2. package libcontainer
  3. import (
  4. "fmt"
  5. "io/ioutil"
  6. "os"
  7. "path/filepath"
  8. "strings"
  9. "syscall"
  10. "time"
  11. "github.com/docker/libcontainer/configs"
  12. "github.com/docker/libcontainer/label"
  13. )
  // defaultMountFlags is the flag set shared by the proc base mount and by the
  // read-only bind remount in remountReadonly: nosuid, nodev and noexec.
  const defaultMountFlags = syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_NOEXEC
  15. var baseMounts = []*configs.Mount{
  16. {
  17. Source: "proc",
  18. Destination: "/proc",
  19. Device: "proc",
  20. Flags: defaultMountFlags,
  21. },
  22. {
  23. Source: "tmpfs",
  24. Destination: "/dev",
  25. Device: "tmpfs",
  26. Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME,
  27. Data: "mode=755",
  28. },
  29. {
  30. Source: "devpts",
  31. Destination: "/dev/pts",
  32. Device: "devpts",
  33. Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC,
  34. Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
  35. },
  36. }
  37. // setupRootfs sets up the devices, mount points, and filesystems for use inside a
  38. // new mount namespace.
  39. func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
  40. if err := prepareRoot(config); err != nil {
  41. return newSystemError(err)
  42. }
  43. for _, m := range append(baseMounts, config.Mounts...) {
  44. if err := mount(m, config.Rootfs, config.MountLabel); err != nil {
  45. return newSystemError(err)
  46. }
  47. }
  48. if err := createDevices(config); err != nil {
  49. return newSystemError(err)
  50. }
  51. if err := setupPtmx(config, console); err != nil {
  52. return newSystemError(err)
  53. }
  54. // stdin, stdout and stderr could be pointing to /dev/null from parent namespace.
  55. // re-open them inside this namespace.
  56. if err := reOpenDevNull(config.Rootfs); err != nil {
  57. return newSystemError(err)
  58. }
  59. if err := setupDevSymlinks(config.Rootfs); err != nil {
  60. return newSystemError(err)
  61. }
  62. if err := syscall.Chdir(config.Rootfs); err != nil {
  63. return newSystemError(err)
  64. }
  65. if config.NoPivotRoot {
  66. err = msMoveRoot(config.Rootfs)
  67. } else {
  68. err = pivotRoot(config.Rootfs, config.PivotDir)
  69. }
  70. if err != nil {
  71. return newSystemError(err)
  72. }
  73. if config.Readonlyfs {
  74. if err := setReadonly(); err != nil {
  75. return newSystemError(err)
  76. }
  77. }
  78. syscall.Umask(0022)
  79. return nil
  80. }
  81. func mount(m *configs.Mount, rootfs, mountLabel string) error {
  82. var (
  83. dest = m.Destination
  84. data = label.FormatMountLabel(m.Data, mountLabel)
  85. )
  86. if !strings.HasPrefix(dest, rootfs) {
  87. dest = filepath.Join(rootfs, dest)
  88. }
  89. switch m.Device {
  90. case "proc", "mqueue", "sysfs":
  91. if err := os.MkdirAll(dest, 0755); err != nil && !os.IsExist(err) {
  92. return err
  93. }
  94. return syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), "")
  95. case "tmpfs":
  96. stat, err := os.Stat(dest)
  97. if err != nil {
  98. if err := os.MkdirAll(dest, 0755); err != nil && !os.IsExist(err) {
  99. return err
  100. }
  101. }
  102. if err := syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data); err != nil {
  103. return err
  104. }
  105. if stat != nil {
  106. if err = os.Chmod(dest, stat.Mode()); err != nil {
  107. return err
  108. }
  109. }
  110. return nil
  111. case "devpts":
  112. if err := os.MkdirAll(dest, 0755); err != nil && !os.IsExist(err) {
  113. return err
  114. }
  115. return syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data)
  116. case "bind":
  117. stat, err := os.Stat(m.Source)
  118. if err != nil {
  119. // error out if the source of a bind mount does not exist as we will be
  120. // unable to bind anything to it.
  121. return err
  122. }
  123. if err := createIfNotExists(dest, stat.IsDir()); err != nil {
  124. return err
  125. }
  126. if err := syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data); err != nil {
  127. return err
  128. }
  129. if m.Flags&syscall.MS_RDONLY != 0 {
  130. if err := syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags|syscall.MS_REMOUNT), ""); err != nil {
  131. return err
  132. }
  133. }
  134. if m.Relabel != "" {
  135. if err := label.Relabel(m.Source, mountLabel, m.Relabel); err != nil {
  136. return err
  137. }
  138. }
  139. if m.Flags&syscall.MS_PRIVATE != 0 {
  140. if err := syscall.Mount("", dest, "none", uintptr(syscall.MS_PRIVATE), ""); err != nil {
  141. return err
  142. }
  143. }
  144. default:
  145. return fmt.Errorf("unknown mount device %q to %q", m.Device, m.Destination)
  146. }
  147. return nil
  148. }
  // setupDevSymlinks creates the /dev/fd, /dev/stdin, /dev/stdout and
  // /dev/stderr symlinks (pointing into /proc/self/fd) underneath rootfs, plus
  // /dev/kcore when /proc/kcore exists. Links that already exist are left alone.
  func setupDevSymlinks(rootfs string) error {
  	type devLink struct {
  		target string // what the symlink points at
  		name   string // path of the symlink, relative to rootfs
  	}

  	links := []devLink{
  		{target: "/proc/self/fd", name: "/dev/fd"},
  		{target: "/proc/self/fd/0", name: "/dev/stdin"},
  		{target: "/proc/self/fd/1", name: "/dev/stdout"},
  		{target: "/proc/self/fd/2", name: "/dev/stderr"},
  	}
  	// kcore support can be toggled with CONFIG_PROC_KCORE; only link it into
  	// /dev when it is present in /proc.
  	if _, err := os.Stat("/proc/kcore"); err == nil {
  		links = append(links, devLink{target: "/proc/kcore", name: "/dev/kcore"})
  	}

  	for _, l := range links {
  		linkPath := filepath.Join(rootfs, l.name)
  		err := os.Symlink(l.target, linkPath)
  		if err == nil || os.IsExist(err) {
  			continue
  		}
  		return fmt.Errorf("symlink %s %s %s", l.target, linkPath, err)
  	}
  	return nil
  }
  // reOpenDevNull re-points stdin, stdout and stderr at the /dev/null found
  // under rootfs whenever they currently refer to a device with the same
  // device number, i.e. a /dev/null inherited from the global mount namespace.
  //
  // The replacement is opened read-write: the same descriptor is duplicated
  // over stdout and stderr as well as stdin, and a read-only descriptor would
  // make every later write to them fail with EBADF.
  func reOpenDevNull(rootfs string) error {
  	var stat, devNullStat syscall.Stat_t
  	file, err := os.OpenFile(filepath.Join(rootfs, "/dev/null"), os.O_RDWR, 0)
  	if err != nil {
  		return fmt.Errorf("Failed to open /dev/null - %s", err)
  	}
  	// The duplicated descriptors stay open after this one is closed.
  	defer file.Close()
  	if err := syscall.Fstat(int(file.Fd()), &devNullStat); err != nil {
  		return err
  	}
  	for fd := 0; fd < 3; fd++ {
  		if err := syscall.Fstat(fd, &stat); err != nil {
  			return err
  		}
  		if stat.Rdev == devNullStat.Rdev {
  			// dup2 closes fd and re-opens it as a copy of file.
  			if err := syscall.Dup2(int(file.Fd()), fd); err != nil {
  				return err
  			}
  		}
  	}
  	return nil
  }
  197. // Create the device nodes in the container.
  198. func createDevices(config *configs.Config) error {
  199. oldMask := syscall.Umask(0000)
  200. for _, node := range config.Devices {
  201. if err := createDeviceNode(config.Rootfs, node); err != nil {
  202. syscall.Umask(oldMask)
  203. return err
  204. }
  205. }
  206. syscall.Umask(oldMask)
  207. return nil
  208. }
  209. // Creates the device node in the rootfs of the container.
  210. func createDeviceNode(rootfs string, node *configs.Device) error {
  211. dest := filepath.Join(rootfs, node.Path)
  212. if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
  213. return err
  214. }
  215. if err := mknodDevice(dest, node); err != nil {
  216. if os.IsExist(err) {
  217. return nil
  218. }
  219. if err != syscall.EPERM {
  220. return err
  221. }
  222. // containers running in a user namespace are not allowed to mknod
  223. // devices so we can just bind mount it from the host.
  224. f, err := os.Create(dest)
  225. if err != nil && !os.IsExist(err) {
  226. return err
  227. }
  228. if f != nil {
  229. f.Close()
  230. }
  231. return syscall.Mount(node.Path, dest, "bind", syscall.MS_BIND, "")
  232. }
  233. return nil
  234. }
  235. func mknodDevice(dest string, node *configs.Device) error {
  236. fileMode := node.FileMode
  237. switch node.Type {
  238. case 'c':
  239. fileMode |= syscall.S_IFCHR
  240. case 'b':
  241. fileMode |= syscall.S_IFBLK
  242. default:
  243. return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
  244. }
  245. if err := syscall.Mknod(dest, uint32(fileMode), node.Mkdev()); err != nil {
  246. return err
  247. }
  248. return syscall.Chown(dest, int(node.Uid), int(node.Gid))
  249. }
  250. func prepareRoot(config *configs.Config) error {
  251. flag := syscall.MS_PRIVATE | syscall.MS_REC
  252. if config.NoPivotRoot {
  253. flag = syscall.MS_SLAVE | syscall.MS_REC
  254. }
  255. if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil {
  256. return err
  257. }
  258. return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "")
  259. }
  // setReadonly remounts "/" as a recursive, read-only bind mount.
  func setReadonly() error {
  	const flags = syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC
  	return syscall.Mount("/", "/", "bind", flags, "")
  }
  263. func setupPtmx(config *configs.Config, console *linuxConsole) error {
  264. ptmx := filepath.Join(config.Rootfs, "dev/ptmx")
  265. if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
  266. return err
  267. }
  268. if err := os.Symlink("pts/ptmx", ptmx); err != nil {
  269. return fmt.Errorf("symlink dev ptmx %s", err)
  270. }
  271. if console != nil {
  272. return console.mount(config.Rootfs, config.MountLabel, 0, 0)
  273. }
  274. return nil
  275. }
  // pivotRoot makes rootfs the new root via pivot_root(2).
  //
  // A temporary ".pivot_root*" directory is created under rootfs/pivotBaseDir
  // ("/" when pivotBaseDir is empty) to receive the old root. After the pivot
  // the working directory is changed to "/", the old root is lazily unmounted
  // and the temporary directory is removed.
  func pivotRoot(rootfs, pivotBaseDir string) error {
  	if pivotBaseDir == "" {
  		pivotBaseDir = "/"
  	}
  	tmpDir := filepath.Join(rootfs, pivotBaseDir)
  	if err := os.MkdirAll(tmpDir, 0755); err != nil {
  		return fmt.Errorf("can't create tmp dir %s, error %v", tmpDir, err)
  	}
  	pivotDir, err := ioutil.TempDir(tmpDir, ".pivot_root")
  	if err != nil {
  		// pivotDir is empty when TempDir fails, so report the parent
  		// directory the creation was attempted in.
  		return fmt.Errorf("can't create pivot_root dir under %s, error %v", tmpDir, err)
  	}
  	if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
  		// Best effort: do not leave the unused temporary directory behind
  		// in the rootfs. The pivot_root failure is the error worth reporting.
  		_ = os.Remove(pivotDir)
  		return fmt.Errorf("pivot_root %s", err)
  	}
  	if err := syscall.Chdir("/"); err != nil {
  		return fmt.Errorf("chdir / %s", err)
  	}
  	// The path to the pivot dir changed with the root; recompute it.
  	pivotDir = filepath.Join(pivotBaseDir, filepath.Base(pivotDir))
  	if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
  		return fmt.Errorf("unmount pivot_root dir %s", err)
  	}
  	return os.Remove(pivotDir)
  }
  // msMoveRoot switches root without pivot_root: it moves the rootfs mount
  // onto "/" with MS_MOVE, chroots into the current directory and then changes
  // the working directory to "/". The first failing step's error is returned.
  func msMoveRoot(rootfs string) error {
  	err := syscall.Mount(rootfs, "/", "", syscall.MS_MOVE, "")
  	if err == nil {
  		err = syscall.Chroot(".")
  	}
  	if err == nil {
  		err = syscall.Chdir("/")
  	}
  	return err
  }
  // createIfNotExists creates path only when a stat of it reports that it does
  // not exist: as a directory (with parents) when isDir is true, otherwise as
  // an empty file whose parent directories are created first. An existing
  // path, or a stat failure other than "not exist", results in nil.
  func createIfNotExists(path string, isDir bool) error {
  	_, err := os.Stat(path)
  	if !os.IsNotExist(err) {
  		return nil
  	}

  	if isDir {
  		return os.MkdirAll(path, 0755)
  	}

  	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
  		return err
  	}
  	created, err := os.OpenFile(path, os.O_CREATE, 0755)
  	if err != nil {
  		return err
  	}
  	created.Close()
  	return nil
  }
  329. // remountReadonly will bind over the top of an existing path and ensure that it is read-only.
  330. func remountReadonly(path string) error {
  331. for i := 0; i < 5; i++ {
  332. if err := syscall.Mount("", path, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil && !os.IsNotExist(err) {
  333. switch err {
  334. case syscall.EINVAL:
  335. // Probably not a mountpoint, use bind-mount
  336. if err := syscall.Mount(path, path, "", syscall.MS_BIND, ""); err != nil {
  337. return err
  338. }
  339. return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC|defaultMountFlags, "")
  340. case syscall.EBUSY:
  341. time.Sleep(100 * time.Millisecond)
  342. continue
  343. default:
  344. return err
  345. }
  346. }
  347. return nil
  348. }
  349. return fmt.Errorf("unable to mount %s as readonly max retries reached", path)
  350. }
  // maskFile bind mounts /dev/null over path inside a container so that
  // processes cannot read information from non-namespace-aware files such as
  // proc/kcore. A path that does not exist is silently skipped.
  func maskFile(path string) error {
  	err := syscall.Mount("/dev/null", path, "", syscall.MS_BIND, "")
  	if err == nil || os.IsNotExist(err) {
  		return nil
  	}
  	return err
  }
  358. }