nsexec.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863
  1. #define _GNU_SOURCE
  2. #include <endian.h>
  3. #include <errno.h>
  4. #include <fcntl.h>
  5. #include <grp.h>
  6. #include <sched.h>
  7. #include <setjmp.h>
  8. #include <signal.h>
  9. #include <stdarg.h>
  10. #include <stdbool.h>
  11. #include <stdint.h>
  12. #include <stdio.h>
  13. #include <stdlib.h>
  14. #include <stdbool.h>
  15. #include <string.h>
  16. #include <unistd.h>
  17. #include <sys/ioctl.h>
  18. #include <sys/prctl.h>
  19. #include <sys/socket.h>
  20. #include <sys/types.h>
  21. #include <linux/limits.h>
  22. #include <linux/netlink.h>
  23. #include <linux/types.h>
  24. /* Get all of the CLONE_NEW* flags. */
  25. #include "namespace.h"
/*
 * Synchronisation values.
 *
 * These are exchanged as raw `enum sync_t` objects over the sync socketpairs
 * created in nsexec(). SYNC_ERR is special: bail() follows it with an `int`
 * holding the exit code of the process that failed.
 */
enum sync_t {
	SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
	SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
	SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
	SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
	SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
	SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
	/* XXX: This doesn't help with segfaults and other such issues. */
	SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
};
/*
 * longjmp() arguments. nsexec() switches on the value returned by setjmp():
 * JUMP_PARENT is 0 because that is what the initial, direct setjmp() call
 * returns; the other two values are delivered by child_func() via longjmp().
 */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
#define JUMP_INIT 0xA1
/* JSON buffer: size of the buffer nsexec() uses to format the {"pid": N} reply. */
#define JSON_MAX 4096
/*
 * Argument block handed to clone() by clone_parent().
 *
 * Assume the stack grows down, so arguments should be above it.
 */
struct clone_t {
	/*
	 * Reserve some space for clone() to locate arguments
	 * and retcode in this place
	 */
	char stack[4096] __attribute__ ((aligned(16)));
	/*
	 * Zero-length marker placed directly after `stack`; its address is what
	 * clone_parent() passes to clone() as the child's stack pointer.
	 */
	char stack_ptr[0];
	/* There's two children. This is used to execute the different code. */
	jmp_buf *env;	/* Jump buffer child_func() longjmp()s to. */
	int jmpval;	/* Value child_func() passes to longjmp(). */
};
/*
 * Bootstrap configuration decoded by nl_parse().
 *
 * `data` owns the malloc()ed netlink payload (released by nl_free()); every
 * other pointer field points *into* that buffer, and each `*_len` field is the
 * payload length of the corresponding attribute.
 */
struct nlconfig_t {
	char *data;			/* Raw netlink payload; owner of all memory below. */
	uint32_t cloneflags;		/* CLONE_FLAGS_ATTR: flags passed to unshare(2). */
	char *uidmap;			/* UIDMAP_ATTR: contents for /proc/<pid>/uid_map. */
	size_t uidmap_len;
	char *gidmap;			/* GIDMAP_ATTR: contents for /proc/<pid>/gid_map. */
	size_t gidmap_len;
	char *namespaces;		/* NS_PATHS_ATTR: list consumed by join_namespaces(). */
	size_t namespaces_len;
	uint8_t is_setgroup;		/* SETGROUP_ATTR. */
	uint8_t is_rootless;		/* ROOTLESS_ATTR. */
	char *oom_score_adj;		/* OOM_SCORE_ADJ_ATTR: value for /proc/self/oom_score_adj. */
	size_t oom_score_adj_len;
};
/*
 * List of netlink message types sent to us as part of bootstrapping the init.
 * These constants are defined in libcontainer/message_linux.go.
 *
 * INIT_MSG is the only accepted nlmsg_type; the *_ATTR values are the
 * nla_type values nl_parse() recognises (anything else is fatal).
 */
#define INIT_MSG 62000
#define CLONE_FLAGS_ATTR 27281
#define NS_PATHS_ATTR 27282
#define UIDMAP_ATTR 27283
#define GIDMAP_ATTR 27284
#define SETGROUP_ATTR 27285
#define OOM_SCORE_ADJ_ATTR 27286
#define ROOTLESS_ATTR 27287
/*
 * Use the raw syscall for versions of glibc which don't include a function for
 * it, namely (glibc 2.12).
 *
 * The whole block is compiled only when the glibc version macros report
 * 2.x with x < 14; compilation fails outright if no syscall number for
 * setns can be found.
 */
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
# define _GNU_SOURCE
# include "syscall.h"
# if !defined(SYS_setns) && defined(__NR_setns)
# define SYS_setns __NR_setns
# endif
#ifndef SYS_setns
# error "setns(2) syscall not supported by glibc version"
#endif
/* Fallback setns(): forwards both arguments to the raw syscall and returns its result. */
int setns(int fd, int nstype)
{
	return syscall(SYS_setns, fd, nstype);
}
#endif
/*
 * XXX: This is ugly.
 *
 * Descriptor bail() reports fatal errors on. It is -1 (nothing is reported)
 * until a stage of nsexec() points it at its end of a sync socketpair.
 */
static int syncfd = -1;
/*
 * bail(fmt, ...): report a fatal error and terminate the process.
 *
 * Prints "nsenter: <formatted message>: %m" to stderr (%m is the glibc
 * extension that expands to the description of the current errno), then, if
 * syncfd is set, writes SYNC_ERR followed by the exit code to it, and finally
 * calls exit() with that code. The code is `__COUNTER__ + 1`, i.e. it differs
 * between expansions of the macro and is never 0.
 *
 * NOTE(review): the expansion declares locals named `ret` and `s`. An argument
 * expression that mentions `ret` would bind to the macro's own variable, so
 * callers must not pass a variable with that name.
 *
 * TODO(cyphar): Fix this so it correctly deals with syncT.
 */
#define bail(fmt, ...) \
	do { \
		int ret = __COUNTER__ + 1; \
		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
		if (syncfd >= 0) { \
			enum sync_t s = SYNC_ERR; \
			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \
				fprintf(stderr, "nsenter: failed: write(s)"); \
			if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \
				fprintf(stderr, "nsenter: failed: write(ret)"); \
		} \
		exit(ret); \
	} while(0)
  115. static int write_file(char *data, size_t data_len, char *pathfmt, ...)
  116. {
  117. int fd, len, ret = 0;
  118. char path[PATH_MAX];
  119. va_list ap;
  120. va_start(ap, pathfmt);
  121. len = vsnprintf(path, PATH_MAX, pathfmt, ap);
  122. va_end(ap);
  123. if (len < 0)
  124. return -1;
  125. fd = open(path, O_RDWR);
  126. if (fd < 0) {
  127. ret = -1;
  128. goto out;
  129. }
  130. len = write(fd, data, data_len);
  131. if (len != data_len) {
  132. ret = -1;
  133. goto out;
  134. }
  135. out:
  136. close(fd);
  137. return ret;
  138. }
/* Value update_setgroups() should write to /proc/<pid>/setgroups. */
enum policy_t {
	SETGROUPS_DEFAULT = 0,	/* Leave the file untouched. */
	SETGROUPS_ALLOW,	/* Write "allow". */
	SETGROUPS_DENY,		/* Write "deny". */
};
  144. /* This *must* be called before we touch gid_map. */
  145. static void update_setgroups(int pid, enum policy_t setgroup)
  146. {
  147. char *policy;
  148. switch (setgroup) {
  149. case SETGROUPS_ALLOW:
  150. policy = "allow";
  151. break;
  152. case SETGROUPS_DENY:
  153. policy = "deny";
  154. break;
  155. case SETGROUPS_DEFAULT:
  156. default:
  157. /* Nothing to do. */
  158. return;
  159. }
  160. if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
  161. /*
  162. * If the kernel is too old to support /proc/pid/setgroups,
  163. * open(2) or write(2) will return ENOENT. This is fine.
  164. */
  165. if (errno != ENOENT)
  166. bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
  167. }
  168. }
  169. static void update_uidmap(int pid, char *map, size_t map_len)
  170. {
  171. if (map == NULL || map_len <= 0)
  172. return;
  173. if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0)
  174. bail("failed to update /proc/%d/uid_map", pid);
  175. }
  176. static void update_gidmap(int pid, char *map, size_t map_len)
  177. {
  178. if (map == NULL || map_len <= 0)
  179. return;
  180. if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0)
  181. bail("failed to update /proc/%d/gid_map", pid);
  182. }
  183. static void update_oom_score_adj(char *data, size_t len)
  184. {
  185. if (data == NULL || len <= 0)
  186. return;
  187. if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
  188. bail("failed to update /proc/self/oom_score_adj");
  189. }
  190. /* A dummy function that just jumps to the given jumpval. */
  191. static int child_func(void *arg) __attribute__ ((noinline));
  192. static int child_func(void *arg)
  193. {
  194. struct clone_t *ca = (struct clone_t *)arg;
  195. longjmp(*ca->env, ca->jmpval);
  196. }
  197. static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
  198. static int clone_parent(jmp_buf *env, int jmpval)
  199. {
  200. struct clone_t ca = {
  201. .env = env,
  202. .jmpval = jmpval,
  203. };
  204. return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
  205. }
  206. /*
  207. * Gets the init pipe fd from the environment, which is used to read the
  208. * bootstrap data and tell the parent what the new pid is after we finish
  209. * setting up the environment.
  210. */
  211. static int initpipe(void)
  212. {
  213. int pipenum;
  214. char *initpipe, *endptr;
  215. initpipe = getenv("_LIBCONTAINER_INITPIPE");
  216. if (initpipe == NULL || *initpipe == '\0')
  217. return -1;
  218. pipenum = strtol(initpipe, &endptr, 10);
  219. if (*endptr != '\0')
  220. bail("unable to parse _LIBCONTAINER_INITPIPE");
  221. return pipenum;
  222. }
  223. /* Returns the clone(2) flag for a namespace, given the name of a namespace. */
  224. static int nsflag(char *name)
  225. {
  226. if (!strcmp(name, "cgroup"))
  227. return CLONE_NEWCGROUP;
  228. else if (!strcmp(name, "ipc"))
  229. return CLONE_NEWIPC;
  230. else if (!strcmp(name, "mnt"))
  231. return CLONE_NEWNS;
  232. else if (!strcmp(name, "net"))
  233. return CLONE_NEWNET;
  234. else if (!strcmp(name, "pid"))
  235. return CLONE_NEWPID;
  236. else if (!strcmp(name, "user"))
  237. return CLONE_NEWUSER;
  238. else if (!strcmp(name, "uts"))
  239. return CLONE_NEWUTS;
  240. /* If we don't recognise a name, fallback to 0. */
  241. return 0;
  242. }
  243. static uint32_t readint32(char *buf)
  244. {
  245. return *(uint32_t *) buf;
  246. }
  247. static uint8_t readint8(char *buf)
  248. {
  249. return *(uint8_t *) buf;
  250. }
  251. static void nl_parse(int fd, struct nlconfig_t *config)
  252. {
  253. size_t len, size;
  254. struct nlmsghdr hdr;
  255. char *data, *current;
  256. /* Retrieve the netlink header. */
  257. len = read(fd, &hdr, NLMSG_HDRLEN);
  258. if (len != NLMSG_HDRLEN)
  259. bail("invalid netlink header length %zu", len);
  260. if (hdr.nlmsg_type == NLMSG_ERROR)
  261. bail("failed to read netlink message");
  262. if (hdr.nlmsg_type != INIT_MSG)
  263. bail("unexpected msg type %d", hdr.nlmsg_type);
  264. /* Retrieve data. */
  265. size = NLMSG_PAYLOAD(&hdr, 0);
  266. current = data = malloc(size);
  267. if (!data)
  268. bail("failed to allocate %zu bytes of memory for nl_payload", size);
  269. len = read(fd, data, size);
  270. if (len != size)
  271. bail("failed to read netlink payload, %zu != %zu", len, size);
  272. /* Parse the netlink payload. */
  273. config->data = data;
  274. while (current < data + size) {
  275. struct nlattr *nlattr = (struct nlattr *)current;
  276. size_t payload_len = nlattr->nla_len - NLA_HDRLEN;
  277. /* Advance to payload. */
  278. current += NLA_HDRLEN;
  279. /* Handle payload. */
  280. switch (nlattr->nla_type) {
  281. case CLONE_FLAGS_ATTR:
  282. config->cloneflags = readint32(current);
  283. break;
  284. case ROOTLESS_ATTR:
  285. config->is_rootless = readint8(current);
  286. break;
  287. case OOM_SCORE_ADJ_ATTR:
  288. config->oom_score_adj = current;
  289. config->oom_score_adj_len = payload_len;
  290. break;
  291. case NS_PATHS_ATTR:
  292. config->namespaces = current;
  293. config->namespaces_len = payload_len;
  294. break;
  295. case UIDMAP_ATTR:
  296. config->uidmap = current;
  297. config->uidmap_len = payload_len;
  298. break;
  299. case GIDMAP_ATTR:
  300. config->gidmap = current;
  301. config->gidmap_len = payload_len;
  302. break;
  303. case SETGROUP_ATTR:
  304. config->is_setgroup = readint8(current);
  305. break;
  306. default:
  307. bail("unknown netlink message type %d", nlattr->nla_type);
  308. }
  309. current += NLA_ALIGN(payload_len);
  310. }
  311. }
  312. void nl_free(struct nlconfig_t *config)
  313. {
  314. free(config->data);
  315. }
  316. void join_namespaces(char *nslist)
  317. {
  318. int num = 0, i;
  319. char *saveptr = NULL;
  320. char *namespace = strtok_r(nslist, ",", &saveptr);
  321. struct namespace_t {
  322. int fd;
  323. int ns;
  324. char type[PATH_MAX];
  325. char path[PATH_MAX];
  326. } *namespaces = NULL;
  327. if (!namespace || !strlen(namespace) || !strlen(nslist))
  328. bail("ns paths are empty");
  329. /*
  330. * We have to open the file descriptors first, since after
  331. * we join the mnt namespace we might no longer be able to
  332. * access the paths.
  333. */
  334. do {
  335. int fd;
  336. char *path;
  337. struct namespace_t *ns;
  338. /* Resize the namespace array. */
  339. namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
  340. if (!namespaces)
  341. bail("failed to reallocate namespace array");
  342. ns = &namespaces[num - 1];
  343. /* Split 'ns:path'. */
  344. path = strstr(namespace, ":");
  345. if (!path)
  346. bail("failed to parse %s", namespace);
  347. *path++ = '\0';
  348. fd = open(path, O_RDONLY);
  349. if (fd < 0)
  350. bail("failed to open %s", path);
  351. ns->fd = fd;
  352. ns->ns = nsflag(namespace);
  353. strncpy(ns->path, path, PATH_MAX);
  354. } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
  355. /*
  356. * The ordering in which we join namespaces is important. We should
  357. * always join the user namespace *first*. This is all guaranteed
  358. * from the container_linux.go side of this, so we're just going to
  359. * follow the order given to us.
  360. */
  361. for (i = 0; i < num; i++) {
  362. struct namespace_t ns = namespaces[i];
  363. if (setns(ns.fd, ns.ns) < 0)
  364. bail("failed to setns to %s", ns.path);
  365. close(ns.fd);
  366. }
  367. free(namespaces);
  368. }
  369. void nsexec(void)
  370. {
  371. int pipenum;
  372. jmp_buf env;
  373. int sync_child_pipe[2], sync_grandchild_pipe[2];
  374. struct nlconfig_t config = {0};
  375. /*
  376. * If we don't have an init pipe, just return to the go routine.
  377. * We'll only get an init pipe for start or exec.
  378. */
  379. pipenum = initpipe();
  380. if (pipenum == -1)
  381. return;
  382. /* Parse all of the netlink configuration. */
  383. nl_parse(pipenum, &config);
  384. /* Set oom_score_adj. This has to be done before !dumpable because
  385. * /proc/self/oom_score_adj is not writeable unless you're an privileged
  386. * user (if !dumpable is set). All children inherit their parent's
  387. * oom_score_adj value on fork(2) so this will always be propagated
  388. * properly.
  389. */
  390. update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
  391. /*
  392. * Make the process non-dumpable, to avoid various race conditions that
  393. * could cause processes in namespaces we're joining to access host
  394. * resources (or potentially execute code).
  395. *
  396. * However, if the number of namespaces we are joining is 0, we are not
  397. * going to be switching to a different security context. Thus setting
  398. * ourselves to be non-dumpable only breaks things (like rootless
  399. * containers), which is the recommendation from the kernel folks.
  400. */
  401. if (config.namespaces) {
  402. if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
  403. bail("failed to set process as non-dumpable");
  404. }
  405. /* Pipe so we can tell the child when we've finished setting up. */
  406. if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
  407. bail("failed to setup sync pipe between parent and child");
  408. /*
  409. * We need a new socketpair to sync with grandchild so we don't have
  410. * race condition with child.
  411. */
  412. if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
  413. bail("failed to setup sync pipe between parent and grandchild");
  414. /* TODO: Currently we aren't dealing with child deaths properly. */
  415. /*
  416. * Okay, so this is quite annoying.
  417. *
  418. * In order for this unsharing code to be more extensible we need to split
  419. * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
  420. * would be if we did clone(CLONE_NEWUSER) and the other namespaces
  421. * separately, but because of SELinux issues we cannot really do that. But
  422. * we cannot just dump the namespace flags into clone(...) because several
  423. * usecases (such as rootless containers) require more granularity around
  424. * the namespace setup. In addition, some older kernels had issues where
  425. * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
  426. * handle this while also dealing with SELinux so we choose SELinux support
  427. * over broken kernel support).
  428. *
  429. * However, if we unshare(2) the user namespace *before* we clone(2), then
  430. * all hell breaks loose.
  431. *
  432. * The parent no longer has permissions to do many things (unshare(2) drops
  433. * all capabilities in your old namespace), and the container cannot be set
  434. * up to have more than one {uid,gid} mapping. This is obviously less than
  435. * ideal. In order to fix this, we have to first clone(2) and then unshare.
  436. *
  437. * Unfortunately, it's not as simple as that. We have to fork to enter the
  438. * PID namespace (the PID namespace only applies to children). Since we'll
  439. * have to double-fork, this clone_parent() call won't be able to get the
  440. * PID of the _actual_ init process (without doing more synchronisation than
  441. * I can deal with at the moment). So we'll just get the parent to send it
  442. * for us, the only job of this process is to update
  443. * /proc/pid/{setgroups,uid_map,gid_map}.
  444. *
  445. * And as a result of the above, we also need to setns(2) in the first child
  446. * because if we join a PID namespace in the topmost parent then our child
  447. * will be in that namespace (and it will not be able to give us a PID value
  448. * that makes sense without resorting to sending things with cmsg).
  449. *
  450. * This also deals with an older issue caused by dumping cloneflags into
  451. * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
  452. * we have to unshare(2) before clone(2) in order to do this. This was fixed
  453. * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
  454. * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
  455. * aware, the last mainline kernel which had this bug was Linux 3.12.
  456. * However, we cannot comment on which kernels the broken patch was
  457. * backported to.
  458. *
  459. * -- Aleksa "what has my life come to?" Sarai
  460. */
  461. switch (setjmp(env)) {
  462. /*
  463. * Stage 0: We're in the parent. Our job is just to create a new child
  464. * (stage 1: JUMP_CHILD) process and write its uid_map and
  465. * gid_map. That process will go on to create a new process, then
  466. * it will send us its PID which we will send to the bootstrap
  467. * process.
  468. */
  469. case JUMP_PARENT: {
  470. int len;
  471. pid_t child;
  472. char buf[JSON_MAX];
  473. bool ready = false;
  474. /* For debugging. */
  475. prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
  476. /* Start the process of getting a container. */
  477. child = clone_parent(&env, JUMP_CHILD);
  478. if (child < 0)
  479. bail("unable to fork: child_func");
  480. /*
  481. * State machine for synchronisation with the children.
  482. *
  483. * Father only return when both child and grandchild are
  484. * ready, so we can receive all possible error codes
  485. * generated by children.
  486. */
  487. while (!ready) {
  488. enum sync_t s;
  489. int ret;
  490. syncfd = sync_child_pipe[1];
  491. close(sync_child_pipe[0]);
  492. if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  493. bail("failed to sync with child: next state");
  494. switch (s) {
  495. case SYNC_ERR:
  496. /* We have to mirror the error code of the child. */
  497. if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
  498. bail("failed to sync with child: read(error code)");
  499. exit(ret);
  500. case SYNC_USERMAP_PLS:
  501. /*
  502. * Enable setgroups(2) if we've been asked to. But we also
  503. * have to explicitly disable setgroups(2) if we're
  504. * creating a rootless container (this is required since
  505. * Linux 3.19).
  506. */
  507. if (config.is_rootless && config.is_setgroup) {
  508. kill(child, SIGKILL);
  509. bail("cannot allow setgroup in an unprivileged user namespace setup");
  510. }
  511. if (config.is_setgroup)
  512. update_setgroups(child, SETGROUPS_ALLOW);
  513. if (config.is_rootless)
  514. update_setgroups(child, SETGROUPS_DENY);
  515. /* Set up mappings. */
  516. update_uidmap(child, config.uidmap, config.uidmap_len);
  517. update_gidmap(child, config.gidmap, config.gidmap_len);
  518. s = SYNC_USERMAP_ACK;
  519. if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  520. kill(child, SIGKILL);
  521. bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
  522. }
  523. break;
  524. case SYNC_RECVPID_PLS: {
  525. pid_t old = child;
  526. /* Get the init_func pid. */
  527. if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
  528. kill(old, SIGKILL);
  529. bail("failed to sync with child: read(childpid)");
  530. }
  531. /* Send ACK. */
  532. s = SYNC_RECVPID_ACK;
  533. if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  534. kill(old, SIGKILL);
  535. kill(child, SIGKILL);
  536. bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
  537. }
  538. }
  539. break;
  540. case SYNC_CHILD_READY:
  541. ready = true;
  542. break;
  543. default:
  544. bail("unexpected sync value: %u", s);
  545. }
  546. }
  547. /* Now sync with grandchild. */
  548. ready = false;
  549. while (!ready) {
  550. enum sync_t s;
  551. int ret;
  552. syncfd = sync_grandchild_pipe[1];
  553. close(sync_grandchild_pipe[0]);
  554. s = SYNC_GRANDCHILD;
  555. if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  556. kill(child, SIGKILL);
  557. bail("failed to sync with child: write(SYNC_GRANDCHILD)");
  558. }
  559. if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  560. bail("failed to sync with child: next state");
  561. switch (s) {
  562. case SYNC_ERR:
  563. /* We have to mirror the error code of the child. */
  564. if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
  565. bail("failed to sync with child: read(error code)");
  566. exit(ret);
  567. case SYNC_CHILD_READY:
  568. ready = true;
  569. break;
  570. default:
  571. bail("unexpected sync value: %u", s);
  572. }
  573. }
  574. /* Send the init_func pid back to our parent. */
  575. len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
  576. if (len < 0) {
  577. kill(child, SIGKILL);
  578. bail("unable to generate JSON for child pid");
  579. }
  580. if (write(pipenum, buf, len) != len) {
  581. kill(child, SIGKILL);
  582. bail("unable to send child pid to bootstrapper");
  583. }
  584. exit(0);
  585. }
  586. /*
  587. * Stage 1: We're in the first child process. Our job is to join any
  588. * provided namespaces in the netlink payload and unshare all
  589. * of the requested namespaces. If we've been asked to
  590. * CLONE_NEWUSER, we will ask our parent (stage 0) to set up
  591. * our user mappings for us. Then, we create a new child
  592. * (stage 2: JUMP_INIT) for PID namespace. We then send the
  593. * child's PID to our parent (stage 0).
  594. */
  595. case JUMP_CHILD: {
  596. pid_t child;
  597. enum sync_t s;
  598. /* We're in a child and thus need to tell the parent if we die. */
  599. syncfd = sync_child_pipe[0];
  600. close(sync_child_pipe[1]);
  601. /* For debugging. */
  602. prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
  603. /*
  604. * We need to setns first. We cannot do this earlier (in stage 0)
  605. * because of the fact that we forked to get here (the PID of
  606. * [stage 2: JUMP_INIT]) would be meaningless). We could send it
  607. * using cmsg(3) but that's just annoying.
  608. */
  609. if (config.namespaces)
  610. join_namespaces(config.namespaces);
  611. /*
  612. * Unshare all of the namespaces. Now, it should be noted that this
  613. * ordering might break in the future (especially with rootless
  614. * containers). But for now, it's not possible to split this into
  615. * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
  616. *
  617. * Note that we don't merge this with clone() because there were
  618. * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
  619. * was broken, so we'll just do it the long way anyway.
  620. */
  621. if (unshare(config.cloneflags) < 0)
  622. bail("failed to unshare namespaces");
  623. /*
  624. * Deal with user namespaces first. They are quite special, as they
  625. * affect our ability to unshare other namespaces and are used as
  626. * context for privilege checks.
  627. */
  628. if (config.cloneflags & CLONE_NEWUSER) {
  629. /*
  630. * We don't have the privileges to do any mapping here (see the
  631. * clone_parent rant). So signal our parent to hook us up.
  632. */
  633. /* Switching is only necessary if we joined namespaces. */
  634. if (config.namespaces) {
  635. if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
  636. bail("failed to set process as dumpable");
  637. }
  638. s = SYNC_USERMAP_PLS;
  639. if (write(syncfd, &s, sizeof(s)) != sizeof(s))
  640. bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
  641. /* ... wait for mapping ... */
  642. if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  643. bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
  644. if (s != SYNC_USERMAP_ACK)
  645. bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
  646. /* Switching is only necessary if we joined namespaces. */
  647. if (config.namespaces) {
  648. if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
  649. bail("failed to set process as dumpable");
  650. }
  651. }
  652. /*
  653. * TODO: What about non-namespace clone flags that we're dropping here?
  654. *
  655. * We fork again because of PID namespace, setns(2) or unshare(2) don't
  656. * change the PID namespace of the calling process, because doing so
  657. * would change the caller's idea of its own PID (as reported by getpid()),
  658. * which would break many applications and libraries, so we must fork
  659. * to actually enter the new PID namespace.
  660. */
  661. child = clone_parent(&env, JUMP_INIT);
  662. if (child < 0)
  663. bail("unable to fork: init_func");
  664. /* Send the child to our parent, which knows what it's doing. */
  665. s = SYNC_RECVPID_PLS;
  666. if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  667. kill(child, SIGKILL);
  668. bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
  669. }
  670. if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
  671. kill(child, SIGKILL);
  672. bail("failed to sync with parent: write(childpid)");
  673. }
  674. /* ... wait for parent to get the pid ... */
  675. if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
  676. kill(child, SIGKILL);
  677. bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
  678. }
  679. if (s != SYNC_RECVPID_ACK) {
  680. kill(child, SIGKILL);
  681. bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
  682. }
  683. s = SYNC_CHILD_READY;
  684. if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
  685. kill(child, SIGKILL);
  686. bail("failed to sync with parent: write(SYNC_CHILD_READY)");
  687. }
  688. /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
  689. exit(0);
  690. }
  691. /*
  692. * Stage 2: We're the final child process, and the only process that will
  693. * actually return to the Go runtime. Our job is to just do the
  694. * final cleanup steps and then return to the Go runtime to allow
  695. * init_linux.go to run.
  696. */
  697. case JUMP_INIT: {
  698. /*
  699. * We're inside the child now, having jumped from the
  700. * start_child() code after forking in the parent.
  701. */
  702. enum sync_t s;
  703. /* We're in a child and thus need to tell the parent if we die. */
  704. syncfd = sync_grandchild_pipe[0];
  705. close(sync_grandchild_pipe[1]);
  706. close(sync_child_pipe[0]);
  707. close(sync_child_pipe[1]);
  708. /* For debugging. */
  709. prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);
  710. if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  711. bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
  712. if (s != SYNC_GRANDCHILD)
  713. bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
  714. if (setsid() < 0)
  715. bail("setsid failed");
  716. if (setuid(0) < 0)
  717. bail("setuid failed");
  718. if (setgid(0) < 0)
  719. bail("setgid failed");
  720. if (!config.is_rootless && config.is_setgroup) {
  721. if (setgroups(0, NULL) < 0)
  722. bail("setgroups failed");
  723. }
  724. s = SYNC_CHILD_READY;
  725. if (write(syncfd, &s, sizeof(s)) != sizeof(s))
  726. bail("failed to sync with patent: write(SYNC_CHILD_READY)");
  727. /* Close sync pipes. */
  728. close(sync_grandchild_pipe[0]);
  729. /* Free netlink data. */
  730. nl_free(&config);
  731. /* Finish executing, let the Go runtime take over. */
  732. return;
  733. }
  734. default:
  735. bail("unexpected jump value");
  736. }
  737. /* Should never be reached. */
  738. bail("should never be reached");
  739. }