init.go 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. // +build linux
  2. package namespaces
  3. import (
  4. "encoding/json"
  5. "fmt"
  6. "io/ioutil"
  7. "os"
  8. "strings"
  9. "syscall"
  10. "github.com/docker/libcontainer"
  11. "github.com/docker/libcontainer/apparmor"
  12. "github.com/docker/libcontainer/console"
  13. "github.com/docker/libcontainer/ipc"
  14. "github.com/docker/libcontainer/label"
  15. "github.com/docker/libcontainer/mount"
  16. "github.com/docker/libcontainer/netlink"
  17. "github.com/docker/libcontainer/network"
  18. "github.com/docker/libcontainer/security/capabilities"
  19. "github.com/docker/libcontainer/security/restrict"
  20. "github.com/docker/libcontainer/system"
  21. "github.com/docker/libcontainer/user"
  22. "github.com/docker/libcontainer/utils"
  23. )
  24. // TODO(vishh): This is part of the libcontainer API and it does much more than just namespaces related work.
  25. // Move this to libcontainer package.
  26. // Init is the init process that first runs inside a new namespace to setup mounts, users, networking,
  27. // and other options required for the new container.
  28. // The caller of Init function has to ensure that the go runtime is locked to an OS thread
  29. // (using runtime.LockOSThread) else system calls like setns called within Init may not work as intended.
  30. func Init(container *libcontainer.Config, uncleanRootfs, consolePath string, pipe *os.File, args []string) (err error) {
  31. defer func() {
  32. // if we have an error during the initialization of the container's init then send it back to the
  33. // parent process in the form of an initError.
  34. if err != nil {
  35. // ensure that any data sent from the parent is consumed so it doesn't
  36. // receive ECONNRESET when the child writes to the pipe.
  37. ioutil.ReadAll(pipe)
  38. if err := json.NewEncoder(pipe).Encode(initError{
  39. Message: err.Error(),
  40. }); err != nil {
  41. panic(err)
  42. }
  43. }
  44. // ensure that this pipe is always closed
  45. pipe.Close()
  46. }()
  47. rootfs, err := utils.ResolveRootfs(uncleanRootfs)
  48. if err != nil {
  49. return err
  50. }
  51. // clear the current processes env and replace it with the environment
  52. // defined on the container
  53. if err := LoadContainerEnvironment(container); err != nil {
  54. return err
  55. }
  56. // We always read this as it is a way to sync with the parent as well
  57. var networkState *network.NetworkState
  58. if err := json.NewDecoder(pipe).Decode(&networkState); err != nil {
  59. return err
  60. }
  61. if consolePath != "" {
  62. if err := console.OpenAndDup(consolePath); err != nil {
  63. return err
  64. }
  65. }
  66. if _, err := syscall.Setsid(); err != nil {
  67. return fmt.Errorf("setsid %s", err)
  68. }
  69. if consolePath != "" {
  70. if err := system.Setctty(); err != nil {
  71. return fmt.Errorf("setctty %s", err)
  72. }
  73. }
  74. if err := ipc.Initialize(container.IpcNsPath); err != nil {
  75. return fmt.Errorf("setup IPC %s", err)
  76. }
  77. if err := setupNetwork(container, networkState); err != nil {
  78. return fmt.Errorf("setup networking %s", err)
  79. }
  80. if err := setupRoute(container); err != nil {
  81. return fmt.Errorf("setup route %s", err)
  82. }
  83. if err := setupRlimits(container); err != nil {
  84. return fmt.Errorf("setup rlimits %s", err)
  85. }
  86. label.Init()
  87. if err := mount.InitializeMountNamespace(rootfs,
  88. consolePath,
  89. container.RestrictSys,
  90. (*mount.MountConfig)(container.MountConfig)); err != nil {
  91. return fmt.Errorf("setup mount namespace %s", err)
  92. }
  93. if container.Hostname != "" {
  94. if err := syscall.Sethostname([]byte(container.Hostname)); err != nil {
  95. return fmt.Errorf("unable to sethostname %q: %s", container.Hostname, err)
  96. }
  97. }
  98. if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil {
  99. return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err)
  100. }
  101. if err := label.SetProcessLabel(container.ProcessLabel); err != nil {
  102. return fmt.Errorf("set process label %s", err)
  103. }
  104. // TODO: (crosbymichael) make this configurable at the Config level
  105. if container.RestrictSys {
  106. if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
  107. return err
  108. }
  109. }
  110. pdeathSignal, err := system.GetParentDeathSignal()
  111. if err != nil {
  112. return fmt.Errorf("get parent death signal %s", err)
  113. }
  114. if err := FinalizeNamespace(container); err != nil {
  115. return fmt.Errorf("finalize namespace %s", err)
  116. }
  117. // FinalizeNamespace can change user/group which clears the parent death
  118. // signal, so we restore it here.
  119. if err := RestoreParentDeathSignal(pdeathSignal); err != nil {
  120. return fmt.Errorf("restore parent death signal %s", err)
  121. }
  122. return system.Execv(args[0], args[0:], os.Environ())
  123. }
  124. // RestoreParentDeathSignal sets the parent death signal to old.
  125. func RestoreParentDeathSignal(old int) error {
  126. if old == 0 {
  127. return nil
  128. }
  129. current, err := system.GetParentDeathSignal()
  130. if err != nil {
  131. return fmt.Errorf("get parent death signal %s", err)
  132. }
  133. if old == current {
  134. return nil
  135. }
  136. if err := system.ParentDeathSignal(uintptr(old)); err != nil {
  137. return fmt.Errorf("set parent death signal %s", err)
  138. }
  139. // Signal self if parent is already dead. Does nothing if running in a new
  140. // PID namespace, as Getppid will always return 0.
  141. if syscall.Getppid() == 1 {
  142. return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
  143. }
  144. return nil
  145. }
  146. // SetupUser changes the groups, gid, and uid for the user inside the container
  147. func SetupUser(u string) error {
  148. // Set up defaults.
  149. defaultExecUser := user.ExecUser{
  150. Uid: syscall.Getuid(),
  151. Gid: syscall.Getgid(),
  152. Home: "/",
  153. }
  154. passwdFile, err := user.GetPasswdFile()
  155. if err != nil {
  156. return err
  157. }
  158. groupFile, err := user.GetGroupFile()
  159. if err != nil {
  160. return err
  161. }
  162. execUser, err := user.GetExecUserFile(u, &defaultExecUser, passwdFile, groupFile)
  163. if err != nil {
  164. return fmt.Errorf("get supplementary groups %s", err)
  165. }
  166. if err := syscall.Setgroups(execUser.Sgids); err != nil {
  167. return fmt.Errorf("setgroups %s", err)
  168. }
  169. if err := system.Setgid(execUser.Gid); err != nil {
  170. return fmt.Errorf("setgid %s", err)
  171. }
  172. if err := system.Setuid(execUser.Uid); err != nil {
  173. return fmt.Errorf("setuid %s", err)
  174. }
  175. // if we didn't get HOME already, set it based on the user's HOME
  176. if envHome := os.Getenv("HOME"); envHome == "" {
  177. if err := os.Setenv("HOME", execUser.Home); err != nil {
  178. return fmt.Errorf("set HOME %s", err)
  179. }
  180. }
  181. return nil
  182. }
  183. // setupVethNetwork uses the Network config if it is not nil to initialize
  184. // the new veth interface inside the container for use by changing the name to eth0
  185. // setting the MTU and IP address along with the default gateway
  186. func setupNetwork(container *libcontainer.Config, networkState *network.NetworkState) error {
  187. for _, config := range container.Networks {
  188. strategy, err := network.GetStrategy(config.Type)
  189. if err != nil {
  190. return err
  191. }
  192. err1 := strategy.Initialize((*network.Network)(config), networkState)
  193. if err1 != nil {
  194. return err1
  195. }
  196. }
  197. return nil
  198. }
  199. func setupRoute(container *libcontainer.Config) error {
  200. for _, config := range container.Routes {
  201. if err := netlink.AddRoute(config.Destination, config.Source, config.Gateway, config.InterfaceName); err != nil {
  202. return err
  203. }
  204. }
  205. return nil
  206. }
  207. func setupRlimits(container *libcontainer.Config) error {
  208. for _, rlimit := range container.Rlimits {
  209. l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
  210. if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
  211. return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
  212. }
  213. }
  214. return nil
  215. }
  216. // FinalizeNamespace drops the caps, sets the correct user
  217. // and working dir, and closes any leaky file descriptors
  218. // before execing the command inside the namespace
  219. func FinalizeNamespace(container *libcontainer.Config) error {
  220. // Ensure that all non-standard fds we may have accidentally
  221. // inherited are marked close-on-exec so they stay out of the
  222. // container
  223. if err := utils.CloseExecFrom(3); err != nil {
  224. return fmt.Errorf("close open file descriptors %s", err)
  225. }
  226. // drop capabilities in bounding set before changing user
  227. if err := capabilities.DropBoundingSet(container.Capabilities); err != nil {
  228. return fmt.Errorf("drop bounding set %s", err)
  229. }
  230. // preserve existing capabilities while we change users
  231. if err := system.SetKeepCaps(); err != nil {
  232. return fmt.Errorf("set keep caps %s", err)
  233. }
  234. if err := SetupUser(container.User); err != nil {
  235. return fmt.Errorf("setup user %s", err)
  236. }
  237. if err := system.ClearKeepCaps(); err != nil {
  238. return fmt.Errorf("clear keep caps %s", err)
  239. }
  240. // drop all other capabilities
  241. if err := capabilities.DropCapabilities(container.Capabilities); err != nil {
  242. return fmt.Errorf("drop capabilities %s", err)
  243. }
  244. if container.WorkingDir != "" {
  245. if err := syscall.Chdir(container.WorkingDir); err != nil {
  246. return fmt.Errorf("chdir to %s %s", container.WorkingDir, err)
  247. }
  248. }
  249. return nil
  250. }
  251. func LoadContainerEnvironment(container *libcontainer.Config) error {
  252. os.Clearenv()
  253. for _, pair := range container.Env {
  254. p := strings.SplitN(pair, "=", 2)
  255. if len(p) < 2 {
  256. return fmt.Errorf("invalid environment '%v'", pair)
  257. }
  258. if err := os.Setenv(p[0], p[1]); err != nil {
  259. return err
  260. }
  261. }
  262. return nil
  263. }