|
@@ -29,6 +29,8 @@
|
|
|
/* Get all of the CLONE_NEW* flags. */
|
|
|
#include "namespace.h"
|
|
|
|
|
|
+extern char *escape_json_string(char *str);
|
|
|
+
|
|
|
/* Synchronisation values. */
|
|
|
enum sync_t {
|
|
|
SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
|
|
@@ -36,7 +38,7 @@ enum sync_t {
|
|
|
SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
|
|
|
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
|
|
|
SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
|
|
|
- SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
|
|
|
+ SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */
|
|
|
};
|
|
|
|
|
|
/*
|
|
@@ -45,10 +47,14 @@ enum sync_t {
|
|
|
*/
|
|
|
#define CREATECGROUPNS 0x80
|
|
|
|
|
|
+#define STAGE_SETUP -1
|
|
|
/* longjmp() arguments. */
|
|
|
-#define JUMP_PARENT 0x00
|
|
|
-#define JUMP_CHILD 0xA0
|
|
|
-#define JUMP_INIT 0xA1
|
|
|
+#define STAGE_PARENT 0
|
|
|
+#define STAGE_CHILD 1
|
|
|
+#define STAGE_INIT 2
|
|
|
+
|
|
|
+/* Stores the current stage of nsexec. */
|
|
|
+int current_stage = STAGE_SETUP;
|
|
|
|
|
|
/* Assume the stack grows down, so arguments should be above it. */
|
|
|
struct clone_t {
|
|
@@ -56,7 +62,7 @@ struct clone_t {
|
|
|
* Reserve some space for clone() to locate arguments
|
|
|
* and retcode in this place
|
|
|
*/
|
|
|
- char stack[4096] __attribute__ ((aligned(16)));
|
|
|
+ char stack[4096] __attribute__((aligned(16)));
|
|
|
char stack_ptr[0];
|
|
|
|
|
|
/* There's two children. This is used to execute the different code. */
|
|
@@ -102,31 +108,31 @@ static int logfd = -1;
|
|
|
* List of netlink message types sent to us as part of bootstrapping the init.
|
|
|
* These constants are defined in libcontainer/message_linux.go.
|
|
|
*/
|
|
|
-#define INIT_MSG 62000
|
|
|
+#define INIT_MSG 62000
|
|
|
#define CLONE_FLAGS_ATTR 27281
|
|
|
#define NS_PATHS_ATTR 27282
|
|
|
-#define UIDMAP_ATTR 27283
|
|
|
-#define GIDMAP_ATTR 27284
|
|
|
+#define UIDMAP_ATTR 27283
|
|
|
+#define GIDMAP_ATTR 27284
|
|
|
#define SETGROUP_ATTR 27285
|
|
|
#define OOM_SCORE_ADJ_ATTR 27286
|
|
|
#define ROOTLESS_EUID_ATTR 27287
|
|
|
-#define UIDMAPPATH_ATTR 27288
|
|
|
-#define GIDMAPPATH_ATTR 27289
|
|
|
+#define UIDMAPPATH_ATTR 27288
|
|
|
+#define GIDMAPPATH_ATTR 27289
|
|
|
|
|
|
/*
|
|
|
* Use the raw syscall for versions of glibc which don't include a function for
|
|
|
* it, namely (glibc 2.12).
|
|
|
*/
|
|
|
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
|
|
|
-# define _GNU_SOURCE
|
|
|
-# include "syscall.h"
|
|
|
-# if !defined(SYS_setns) && defined(__NR_setns)
|
|
|
-# define SYS_setns __NR_setns
|
|
|
-# endif
|
|
|
-
|
|
|
-#ifndef SYS_setns
|
|
|
-# error "setns(2) syscall not supported by glibc version"
|
|
|
-#endif
|
|
|
+# define _GNU_SOURCE
|
|
|
+# include "syscall.h"
|
|
|
+# if !defined(SYS_setns) && defined(__NR_setns)
|
|
|
+# define SYS_setns __NR_setns
|
|
|
+# endif
|
|
|
+
|
|
|
+# ifndef SYS_setns
|
|
|
+# error "setns(2) syscall not supported by glibc version"
|
|
|
+# endif
|
|
|
|
|
|
int setns(int fd, int nstype)
|
|
|
{
|
|
@@ -134,33 +140,43 @@ int setns(int fd, int nstype)
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
-static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...)
|
|
|
+static void write_log(const char *level, const char *format, ...)
|
|
|
{
|
|
|
- char message[1024] = {};
|
|
|
-
|
|
|
+ char *message = NULL, *stage = NULL;
|
|
|
va_list args;
|
|
|
+ int ret;
|
|
|
|
|
|
if (logfd < 0 || level == NULL)
|
|
|
- return;
|
|
|
+ goto out;
|
|
|
|
|
|
va_start(args, format);
|
|
|
- if (vsnprintf(message, sizeof(message), format, args) < 0)
|
|
|
- goto done;
|
|
|
-
|
|
|
- dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
|
|
|
-done:
|
|
|
+ ret = vasprintf(&message, format, args);
|
|
|
va_end(args);
|
|
|
-}
|
|
|
+ if (ret < 0)
|
|
|
+ goto out;
|
|
|
|
|
|
-#define write_log(level, fmt, ...) \
|
|
|
- write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__)
|
|
|
+ message = escape_json_string(message);
|
|
|
+
|
|
|
+ if (current_stage == STAGE_SETUP)
|
|
|
+ stage = strdup("nsexec");
|
|
|
+ else
|
|
|
+ ret = asprintf(&stage, "nsexec-%d", current_stage);
|
|
|
+ if (ret < 0)
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message);
|
|
|
+
|
|
|
+out:
|
|
|
+ free(message);
|
|
|
+ free(stage);
|
|
|
+}
|
|
|
|
|
|
/* XXX: This is ugly. */
|
|
|
static int syncfd = -1;
|
|
|
|
|
|
#define bail(fmt, ...) \
|
|
|
do { \
|
|
|
- write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \
|
|
|
+ write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \
|
|
|
exit(1); \
|
|
|
} while(0)
|
|
|
|
|
@@ -187,7 +203,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
|
|
|
goto out;
|
|
|
}
|
|
|
|
|
|
- out:
|
|
|
+out:
|
|
|
close(fd);
|
|
|
return ret;
|
|
|
}
|
|
@@ -297,9 +313,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
|
|
|
if (map == NULL || map_len <= 0)
|
|
|
return;
|
|
|
|
|
|
+ write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
|
|
|
if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
|
|
|
if (errno != EPERM)
|
|
|
bail("failed to update /proc/%d/uid_map", pid);
|
|
|
+ write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
|
|
|
if (try_mapping_tool(path, pid, map, map_len))
|
|
|
bail("failed to use newuid map on %d", pid);
|
|
|
}
|
|
@@ -310,9 +328,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
|
|
|
if (map == NULL || map_len <= 0)
|
|
|
return;
|
|
|
|
|
|
+ write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
|
|
|
if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
|
|
|
if (errno != EPERM)
|
|
|
bail("failed to update /proc/%d/gid_map", pid);
|
|
|
+ write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
|
|
|
if (try_mapping_tool(path, pid, map, map_len))
|
|
|
bail("failed to use newgid map on %d", pid);
|
|
|
}
|
|
@@ -323,19 +343,20 @@ static void update_oom_score_adj(char *data, size_t len)
|
|
|
if (data == NULL || len <= 0)
|
|
|
return;
|
|
|
|
|
|
+ write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
|
|
|
if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
|
|
|
bail("failed to update /proc/self/oom_score_adj");
|
|
|
}
|
|
|
|
|
|
/* A dummy function that just jumps to the given jumpval. */
|
|
|
-static int child_func(void *arg) __attribute__ ((noinline));
|
|
|
+static int child_func(void *arg) __attribute__((noinline));
|
|
|
static int child_func(void *arg)
|
|
|
{
|
|
|
struct clone_t *ca = (struct clone_t *)arg;
|
|
|
longjmp(*ca->env, ca->jmpval);
|
|
|
}
|
|
|
|
|
|
-static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
|
|
|
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
|
|
|
static int clone_parent(jmp_buf *env, int jmpval)
|
|
|
{
|
|
|
struct clone_t ca = {
|
|
@@ -507,7 +528,6 @@ void join_namespaces(char *nslist)
|
|
|
char *namespace = strtok_r(nslist, ",", &saveptr);
|
|
|
struct namespace_t {
|
|
|
int fd;
|
|
|
- int ns;
|
|
|
char type[PATH_MAX];
|
|
|
char path[PATH_MAX];
|
|
|
} *namespaces = NULL;
|
|
@@ -542,7 +562,7 @@ void join_namespaces(char *nslist)
|
|
|
bail("failed to open %s", path);
|
|
|
|
|
|
ns->fd = fd;
|
|
|
- ns->ns = nsflag(namespace);
|
|
|
+ strncpy(ns->type, namespace, PATH_MAX - 1);
|
|
|
strncpy(ns->path, path, PATH_MAX - 1);
|
|
|
ns->path[PATH_MAX - 1] = '\0';
|
|
|
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
|
|
@@ -555,12 +575,14 @@ void join_namespaces(char *nslist)
|
|
|
*/
|
|
|
|
|
|
for (i = 0; i < num; i++) {
|
|
|
- struct namespace_t ns = namespaces[i];
|
|
|
+ struct namespace_t *ns = &namespaces[i];
|
|
|
+ int flag = nsflag(ns->type);
|
|
|
|
|
|
- if (setns(ns.fd, ns.ns) < 0)
|
|
|
- bail("failed to setns to %s", ns.path);
|
|
|
+ write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
|
|
|
+ if (setns(ns->fd, flag) < 0)
|
|
|
+ bail("failed to setns into %s namespace", ns->type);
|
|
|
|
|
|
- close(ns.fd);
|
|
|
+ close(ns->fd);
|
|
|
}
|
|
|
|
|
|
free(namespaces);
|
|
@@ -569,6 +591,14 @@ void join_namespaces(char *nslist)
|
|
|
/* Defined in cloned_binary.c. */
|
|
|
extern int ensure_cloned_binary(void);
|
|
|
|
|
|
+static inline int sane_kill(pid_t pid, int signum)
|
|
|
+{
|
|
|
+ if (pid > 0)
|
|
|
+ return kill(pid, signum);
|
|
|
+ else
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
void nsexec(void)
|
|
|
{
|
|
|
int pipenum;
|
|
@@ -598,7 +628,14 @@ void nsexec(void)
|
|
|
if (ensure_cloned_binary() < 0)
|
|
|
bail("could not ensure we are a cloned binary");
|
|
|
|
|
|
- write_log(DEBUG, "nsexec started");
|
|
|
+ /*
|
|
|
+ * Inform the parent we're past initial setup.
|
|
|
+ * For the other side of this, see initWaiter.
|
|
|
+ */
|
|
|
+ if (write(pipenum, "", 1) != 1)
|
|
|
+ bail("could not inform the parent we are past initial setup");
|
|
|
+
|
|
|
+ write_log(DEBUG, "=> nsexec container setup");
|
|
|
|
|
|
/* Parse all of the netlink configuration. */
|
|
|
nl_parse(pipenum, &config);
|
|
@@ -622,6 +659,7 @@ void nsexec(void)
|
|
|
* containers), which is the recommendation from the kernel folks.
|
|
|
*/
|
|
|
if (config.namespaces) {
|
|
|
+ write_log(DEBUG, "set process as non-dumpable");
|
|
|
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
|
|
|
bail("failed to set process as non-dumpable");
|
|
|
}
|
|
@@ -686,45 +724,49 @@ void nsexec(void)
|
|
|
* -- Aleksa "what has my life come to?" Sarai
|
|
|
*/
|
|
|
|
|
|
- switch (setjmp(env)) {
|
|
|
+ current_stage = setjmp(env);
|
|
|
+ switch (current_stage) {
|
|
|
/*
|
|
|
* Stage 0: We're in the parent. Our job is just to create a new child
|
|
|
- * (stage 1: JUMP_CHILD) process and write its uid_map and
|
|
|
+ * (stage 1: STAGE_CHILD) process and write its uid_map and
|
|
|
* gid_map. That process will go on to create a new process, then
|
|
|
* it will send us its PID which we will send to the bootstrap
|
|
|
* process.
|
|
|
*/
|
|
|
- case JUMP_PARENT:{
|
|
|
+ case STAGE_PARENT:{
|
|
|
int len;
|
|
|
- pid_t child, first_child = -1;
|
|
|
- bool ready = false;
|
|
|
+ pid_t stage1_pid = -1, stage2_pid = -1;
|
|
|
+ bool stage1_complete, stage2_complete;
|
|
|
|
|
|
/* For debugging. */
|
|
|
prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
|
|
|
+ write_log(DEBUG, "~> nsexec stage-0");
|
|
|
|
|
|
/* Start the process of getting a container. */
|
|
|
- child = clone_parent(&env, JUMP_CHILD);
|
|
|
- if (child < 0)
|
|
|
- bail("unable to fork: child_func");
|
|
|
+ write_log(DEBUG, "spawn stage-1");
|
|
|
+ stage1_pid = clone_parent(&env, STAGE_CHILD);
|
|
|
+ if (stage1_pid < 0)
|
|
|
+ bail("unable to spawn stage-1");
|
|
|
|
|
|
- /*
|
|
|
- * State machine for synchronisation with the children.
|
|
|
- *
|
|
|
- * Father only return when both child and grandchild are
|
|
|
- * ready, so we can receive all possible error codes
|
|
|
- * generated by children.
|
|
|
- */
|
|
|
syncfd = sync_child_pipe[1];
|
|
|
close(sync_child_pipe[0]);
|
|
|
|
|
|
- while (!ready) {
|
|
|
+ /*
|
|
|
+ * State machine for synchronisation with the children. We only
|
|
|
+ * return once both the child and grandchild are ready.
|
|
|
+ */
|
|
|
+ write_log(DEBUG, "-> stage-1 synchronisation loop");
|
|
|
+ stage1_complete = false;
|
|
|
+ while (!stage1_complete) {
|
|
|
enum sync_t s;
|
|
|
|
|
|
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
|
|
|
- bail("failed to sync with child: next state");
|
|
|
+ bail("failed to sync with stage-1: next state");
|
|
|
|
|
|
switch (s) {
|
|
|
case SYNC_USERMAP_PLS:
|
|
|
+ write_log(DEBUG, "stage-1 requested userns mappings");
|
|
|
+
|
|
|
/*
|
|
|
* Enable setgroups(2) if we've been asked to. But we also
|
|
|
* have to explicitly disable setgroups(2) if we're
|
|
@@ -735,70 +777,78 @@ void nsexec(void)
|
|
|
* For rootless multi-entry mapping, config.is_setgroup shall be true and
|
|
|
* newuidmap/newgidmap shall be used.
|
|
|
*/
|
|
|
-
|
|
|
if (config.is_rootless_euid && !config.is_setgroup)
|
|
|
- update_setgroups(child, SETGROUPS_DENY);
|
|
|
+ update_setgroups(stage1_pid, SETGROUPS_DENY);
|
|
|
|
|
|
/* Set up mappings. */
|
|
|
- update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
|
|
|
- update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
|
|
|
+ update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
|
|
|
+ update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
|
|
|
|
|
|
s = SYNC_USERMAP_ACK;
|
|
|
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
|
|
- kill(child, SIGKILL);
|
|
|
- bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
|
|
|
+ sane_kill(stage1_pid, SIGKILL);
|
|
|
+ sane_kill(stage2_pid, SIGKILL);
|
|
|
+ bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
|
|
|
}
|
|
|
break;
|
|
|
- case SYNC_RECVPID_PLS:{
|
|
|
- first_child = child;
|
|
|
-
|
|
|
- /* Get the init_func pid. */
|
|
|
- if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
|
|
|
- kill(first_child, SIGKILL);
|
|
|
- bail("failed to sync with child: read(childpid)");
|
|
|
- }
|
|
|
-
|
|
|
- /* Send ACK. */
|
|
|
- s = SYNC_RECVPID_ACK;
|
|
|
- if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
|
|
- kill(first_child, SIGKILL);
|
|
|
- kill(child, SIGKILL);
|
|
|
- bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
|
|
|
- }
|
|
|
-
|
|
|
- /* Send the init_func pid back to our parent.
|
|
|
- *
|
|
|
- * Send the init_func pid and the pid of the first child back to our parent.
|
|
|
- * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
|
|
|
- * It becomes the responsibility of our parent to reap the first child.
|
|
|
- */
|
|
|
- len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
|
|
|
- if (len < 0) {
|
|
|
- kill(child, SIGKILL);
|
|
|
- bail("unable to generate JSON for child pid");
|
|
|
- }
|
|
|
+ case SYNC_RECVPID_PLS:
|
|
|
+ write_log(DEBUG, "stage-1 requested pid to be forwarded");
|
|
|
+
|
|
|
+ /* Get the stage-2 pid. */
|
|
|
+ if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
|
|
|
+ sane_kill(stage1_pid, SIGKILL);
|
|
|
+ sane_kill(stage2_pid, SIGKILL);
|
|
|
+ bail("failed to sync with stage-1: read(stage2_pid)");
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Send ACK. */
|
|
|
+ s = SYNC_RECVPID_ACK;
|
|
|
+ if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
|
|
+ sane_kill(stage1_pid, SIGKILL);
|
|
|
+ sane_kill(stage2_pid, SIGKILL);
|
|
|
+ bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Send both the stage-1 and stage-2 pids back to runc.
|
|
|
+ * runc needs the stage-2 to continue process management,
|
|
|
+ * but because stage-1 was spawned with CLONE_PARENT we
|
|
|
+ * cannot reap it within stage-0 and thus we need to ask
|
|
|
+ * runc to reap the zombie for us.
|
|
|
+ */
|
|
|
+ write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
|
|
|
+ stage1_pid, stage2_pid);
|
|
|
+ len =
|
|
|
+ dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
|
|
|
+ stage2_pid);
|
|
|
+ if (len < 0) {
|
|
|
+ sane_kill(stage1_pid, SIGKILL);
|
|
|
+ sane_kill(stage2_pid, SIGKILL);
|
|
|
+ bail("failed to sync with runc: write(pid-JSON)");
|
|
|
}
|
|
|
break;
|
|
|
- case SYNC_CHILD_READY:
|
|
|
- ready = true;
|
|
|
+ case SYNC_CHILD_FINISH:
|
|
|
+ write_log(DEBUG, "stage-1 complete");
|
|
|
+ stage1_complete = true;
|
|
|
break;
|
|
|
default:
|
|
|
bail("unexpected sync value: %u", s);
|
|
|
}
|
|
|
}
|
|
|
+ write_log(DEBUG, "<- stage-1 synchronisation loop");
|
|
|
|
|
|
/* Now sync with grandchild. */
|
|
|
-
|
|
|
syncfd = sync_grandchild_pipe[1];
|
|
|
close(sync_grandchild_pipe[0]);
|
|
|
-
|
|
|
- ready = false;
|
|
|
- while (!ready) {
|
|
|
+ write_log(DEBUG, "-> stage-2 synchronisation loop");
|
|
|
+ stage2_complete = false;
|
|
|
+ while (!stage2_complete) {
|
|
|
enum sync_t s;
|
|
|
|
|
|
+ write_log(DEBUG, "signalling stage-2 to run");
|
|
|
s = SYNC_GRANDCHILD;
|
|
|
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
|
|
- kill(child, SIGKILL);
|
|
|
+ sane_kill(stage2_pid, SIGKILL);
|
|
|
bail("failed to sync with child: write(SYNC_GRANDCHILD)");
|
|
|
}
|
|
|
|
|
@@ -806,27 +856,31 @@ void nsexec(void)
|
|
|
bail("failed to sync with child: next state");
|
|
|
|
|
|
switch (s) {
|
|
|
- case SYNC_CHILD_READY:
|
|
|
- ready = true;
|
|
|
+ case SYNC_CHILD_FINISH:
|
|
|
+ write_log(DEBUG, "stage-2 complete");
|
|
|
+ stage2_complete = true;
|
|
|
break;
|
|
|
default:
|
|
|
bail("unexpected sync value: %u", s);
|
|
|
}
|
|
|
}
|
|
|
+ write_log(DEBUG, "<- stage-2 synchronisation loop");
|
|
|
+ write_log(DEBUG, "<~ nsexec stage-0");
|
|
|
exit(0);
|
|
|
}
|
|
|
+ break;
|
|
|
|
|
|
/*
|
|
|
* Stage 1: We're in the first child process. Our job is to join any
|
|
|
- * provided namespaces in the netlink payload and unshare all
|
|
|
- * of the requested namespaces. If we've been asked to
|
|
|
- * CLONE_NEWUSER, we will ask our parent (stage 0) to set up
|
|
|
- * our user mappings for us. Then, we create a new child
|
|
|
- * (stage 2: JUMP_INIT) for PID namespace. We then send the
|
|
|
- * child's PID to our parent (stage 0).
|
|
|
+ * provided namespaces in the netlink payload and unshare all of
|
|
|
+ * the requested namespaces. If we've been asked to CLONE_NEWUSER,
|
|
|
+ * we will ask our parent (stage 0) to set up our user mappings
|
|
|
+ * for us. Then, we create a new child (stage 2: STAGE_INIT) for
|
|
|
+ * PID namespace. We then send the child's PID to our parent
|
|
|
+ * (stage 0).
|
|
|
*/
|
|
|
- case JUMP_CHILD:{
|
|
|
- pid_t child;
|
|
|
+ case STAGE_CHILD:{
|
|
|
+ pid_t stage2_pid = -1;
|
|
|
enum sync_t s;
|
|
|
|
|
|
/* We're in a child and thus need to tell the parent if we die. */
|
|
@@ -835,11 +889,12 @@ void nsexec(void)
|
|
|
|
|
|
/* For debugging. */
|
|
|
prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
|
|
|
+ write_log(DEBUG, "~> nsexec stage-1");
|
|
|
|
|
|
/*
|
|
|
* We need to setns first. We cannot do this earlier (in stage 0)
|
|
|
* because of the fact that we forked to get here (the PID of
|
|
|
- * [stage 2: JUMP_INIT]) would be meaningless). We could send it
|
|
|
+ * [stage 2: STAGE_INIT]) would be meaningless). We could send it
|
|
|
* using cmsg(3) but that's just annoying.
|
|
|
*/
|
|
|
if (config.namespaces)
|
|
@@ -865,40 +920,50 @@ void nsexec(void)
|
|
|
* problem.
|
|
|
*/
|
|
|
if (config.cloneflags & CLONE_NEWUSER) {
|
|
|
+ write_log(DEBUG, "unshare user namespace");
|
|
|
if (unshare(CLONE_NEWUSER) < 0)
|
|
|
bail("failed to unshare user namespace");
|
|
|
config.cloneflags &= ~CLONE_NEWUSER;
|
|
|
|
|
|
/*
|
|
|
- * We don't have the privileges to do any mapping here (see the
|
|
|
- * clone_parent rant). So signal our parent to hook us up.
|
|
|
+ * We need to set ourselves as dumpable temporarily so that the
|
|
|
+ * parent process can write to our procfs files.
|
|
|
*/
|
|
|
-
|
|
|
- /* Switching is only necessary if we joined namespaces. */
|
|
|
if (config.namespaces) {
|
|
|
+ write_log(DEBUG, "temporarily set process as dumpable");
|
|
|
if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
|
|
|
- bail("failed to set process as dumpable");
|
|
|
+ bail("failed to temporarily set process as dumpable");
|
|
|
}
|
|
|
+
|
|
|
+ /*
|
|
|
+ * We don't have the privileges to do any mapping here (see the
|
|
|
+ * clone_parent rant). So signal stage-0 to do the mapping for
|
|
|
+ * us.
|
|
|
+ */
|
|
|
+ write_log(DEBUG, "request stage-0 to map user namespace");
|
|
|
s = SYNC_USERMAP_PLS;
|
|
|
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
|
|
|
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
|
|
|
|
|
|
/* ... wait for mapping ... */
|
|
|
-
|
|
|
+ write_log(DEBUG, "request stage-0 to map user namespace");
|
|
|
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
|
|
|
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
|
|
|
if (s != SYNC_USERMAP_ACK)
|
|
|
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
|
|
|
- /* Switching is only necessary if we joined namespaces. */
|
|
|
+
|
|
|
+ /* Revert temporary re-dumpable setting. */
|
|
|
if (config.namespaces) {
|
|
|
+ write_log(DEBUG, "re-set process as non-dumpable");
|
|
|
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
|
|
|
- bail("failed to set process as dumpable");
|
|
|
+ bail("failed to re-set process as non-dumpable");
|
|
|
}
|
|
|
|
|
|
/* Become root in the namespace proper. */
|
|
|
if (setresuid(0, 0, 0) < 0)
|
|
|
bail("failed to become root in user namespace");
|
|
|
}
|
|
|
+
|
|
|
/*
|
|
|
* Unshare all of the namespaces. Now, it should be noted that this
|
|
|
* ordering might break in the future (especially with rootless
|
|
@@ -909,8 +974,9 @@ void nsexec(void)
|
|
|
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
|
|
|
* was broken, so we'll just do it the long way anyway.
|
|
|
*/
|
|
|
+ write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
|
|
|
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
|
|
|
- bail("failed to unshare namespaces");
|
|
|
+ bail("failed to unshare remaining namespaces (except cgroupns)");
|
|
|
|
|
|
/*
|
|
|
* TODO: What about non-namespace clone flags that we're dropping here?
|
|
@@ -921,41 +987,45 @@ void nsexec(void)
|
|
|
* which would break many applications and libraries, so we must fork
|
|
|
* to actually enter the new PID namespace.
|
|
|
*/
|
|
|
- child = clone_parent(&env, JUMP_INIT);
|
|
|
- if (child < 0)
|
|
|
- bail("unable to fork: init_func");
|
|
|
+ write_log(DEBUG, "spawn stage-2");
|
|
|
+ stage2_pid = clone_parent(&env, STAGE_INIT);
|
|
|
+ if (stage2_pid < 0)
|
|
|
+ bail("unable to spawn stage-2");
|
|
|
|
|
|
/* Send the child to our parent, which knows what it's doing. */
|
|
|
+ write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
|
|
|
s = SYNC_RECVPID_PLS;
|
|
|
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
|
|
- kill(child, SIGKILL);
|
|
|
+ sane_kill(stage2_pid, SIGKILL);
|
|
|
bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
|
|
|
}
|
|
|
- if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
|
|
|
- kill(child, SIGKILL);
|
|
|
- bail("failed to sync with parent: write(childpid)");
|
|
|
+ if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
|
|
|
+ sane_kill(stage2_pid, SIGKILL);
|
|
|
+ bail("failed to sync with parent: write(stage2_pid)");
|
|
|
}
|
|
|
|
|
|
/* ... wait for parent to get the pid ... */
|
|
|
-
|
|
|
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
|
|
- kill(child, SIGKILL);
|
|
|
+ sane_kill(stage2_pid, SIGKILL);
|
|
|
bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
|
|
|
}
|
|
|
if (s != SYNC_RECVPID_ACK) {
|
|
|
- kill(child, SIGKILL);
|
|
|
+ sane_kill(stage2_pid, SIGKILL);
|
|
|
bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
|
|
|
}
|
|
|
|
|
|
- s = SYNC_CHILD_READY;
|
|
|
+ write_log(DEBUG, "signal completion to stage-0");
|
|
|
+ s = SYNC_CHILD_FINISH;
|
|
|
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
|
|
- kill(child, SIGKILL);
|
|
|
- bail("failed to sync with parent: write(SYNC_CHILD_READY)");
|
|
|
+ sane_kill(stage2_pid, SIGKILL);
|
|
|
+ bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
|
|
|
}
|
|
|
|
|
|
- /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
|
|
|
+ /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
|
|
|
+ write_log(DEBUG, "<~ nsexec stage-1");
|
|
|
exit(0);
|
|
|
}
|
|
|
+ break;
|
|
|
|
|
|
/*
|
|
|
* Stage 2: We're the final child process, and the only process that will
|
|
@@ -963,7 +1033,7 @@ void nsexec(void)
|
|
|
* final cleanup steps and then return to the Go runtime to allow
|
|
|
* init_linux.go to run.
|
|
|
*/
|
|
|
- case JUMP_INIT:{
|
|
|
+ case STAGE_INIT:{
|
|
|
/*
|
|
|
* We're inside the child now, having jumped from the
|
|
|
* start_child() code after forking in the parent.
|
|
@@ -978,6 +1048,7 @@ void nsexec(void)
|
|
|
|
|
|
/* For debugging. */
|
|
|
prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
|
|
|
+ write_log(DEBUG, "~> nsexec stage-2");
|
|
|
|
|
|
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
|
|
|
bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
|
|
@@ -998,21 +1069,30 @@ void nsexec(void)
|
|
|
bail("setgroups failed");
|
|
|
}
|
|
|
|
|
|
- /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
|
|
|
+ /*
|
|
|
+ * Wait until our topmost parent has finished cgroup setup in
|
|
|
+ * p.manager.Apply().
|
|
|
+ *
|
|
|
+ * TODO(cyphar): Check if this code is actually needed because we
|
|
|
+ * should be in the cgroup even from stage-0, so
|
|
|
+ * waiting until now might not make sense.
|
|
|
+ */
|
|
|
if (config.cloneflags & CLONE_NEWCGROUP) {
|
|
|
uint8_t value;
|
|
|
if (read(pipenum, &value, sizeof(value)) != sizeof(value))
|
|
|
bail("read synchronisation value failed");
|
|
|
if (value == CREATECGROUPNS) {
|
|
|
+ write_log(DEBUG, "unshare cgroup namespace");
|
|
|
if (unshare(CLONE_NEWCGROUP) < 0)
|
|
|
bail("failed to unshare cgroup namespace");
|
|
|
} else
|
|
|
bail("received unknown synchronisation value");
|
|
|
}
|
|
|
|
|
|
- s = SYNC_CHILD_READY;
|
|
|
+ write_log(DEBUG, "signal completion to stage-0");
|
|
|
+ s = SYNC_CHILD_FINISH;
|
|
|
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
|
|
|
- bail("failed to sync with patent: write(SYNC_CHILD_READY)");
|
|
|
+ bail("failed to sync with patent: write(SYNC_CHILD_FINISH)");
|
|
|
|
|
|
/* Close sync pipes. */
|
|
|
close(sync_grandchild_pipe[0]);
|
|
@@ -1021,10 +1101,13 @@ void nsexec(void)
|
|
|
nl_free(&config);
|
|
|
|
|
|
/* Finish executing, let the Go runtime take over. */
|
|
|
+ write_log(DEBUG, "<= nsexec container setup");
|
|
|
+ write_log(DEBUG, "booting up go runtime ...");
|
|
|
return;
|
|
|
}
|
|
|
+ break;
|
|
|
default:
|
|
|
- bail("unexpected jump value");
|
|
|
+ bail("unknown stage '%d' for jump value", current_stage);
|
|
|
}
|
|
|
|
|
|
/* Should never be reached. */
|