|
@@ -22,7 +22,6 @@
|
|
#include <sys/types.h>
|
|
#include <sys/types.h>
|
|
#include <sys/wait.h>
|
|
#include <sys/wait.h>
|
|
|
|
|
|
-
|
|
|
|
#include <linux/limits.h>
|
|
#include <linux/limits.h>
|
|
#include <linux/netlink.h>
|
|
#include <linux/netlink.h>
|
|
#include <linux/types.h>
|
|
#include <linux/types.h>
|
|
@@ -32,15 +31,15 @@
|
|
|
|
|
|
/* Synchronisation values. */
|
|
/* Synchronisation values. */
|
|
enum sync_t {
|
|
enum sync_t {
|
|
- SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
|
|
|
|
- SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
|
|
|
|
- SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
|
|
|
|
- SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
|
|
|
|
- SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
|
|
|
|
- SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
|
|
|
|
|
|
+ SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
|
|
|
|
+ SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
|
|
|
|
+ SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
|
|
|
|
+ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
|
|
|
|
+ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
|
|
|
|
+ SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
|
|
|
|
|
|
/* XXX: This doesn't help with segfaults and other such issues. */
|
|
/* XXX: This doesn't help with segfaults and other such issues. */
|
|
- SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
|
|
|
|
|
|
+ SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
|
|
};
|
|
};
|
|
|
|
|
|
/* longjmp() arguments. */
|
|
/* longjmp() arguments. */
|
|
@@ -73,7 +72,7 @@ struct nlconfig_t {
|
|
char *oom_score_adj;
|
|
char *oom_score_adj;
|
|
size_t oom_score_adj_len;
|
|
size_t oom_score_adj_len;
|
|
|
|
|
|
- /* User namespace settings.*/
|
|
|
|
|
|
+ /* User namespace settings. */
|
|
char *uidmap;
|
|
char *uidmap;
|
|
size_t uidmap_len;
|
|
size_t uidmap_len;
|
|
char *gidmap;
|
|
char *gidmap;
|
|
@@ -82,7 +81,7 @@ struct nlconfig_t {
|
|
size_t namespaces_len;
|
|
size_t namespaces_len;
|
|
uint8_t is_setgroup;
|
|
uint8_t is_setgroup;
|
|
|
|
|
|
- /* Rootless container settings.*/
|
|
|
|
|
|
+ /* Rootless container settings. */
|
|
uint8_t is_rootless;
|
|
uint8_t is_rootless;
|
|
char *uidmappath;
|
|
char *uidmappath;
|
|
size_t uidmappath_len;
|
|
size_t uidmappath_len;
|
|
@@ -167,7 +166,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
|
|
goto out;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
|
|
-out:
|
|
|
|
|
|
+ out:
|
|
close(fd);
|
|
close(fd);
|
|
return ret;
|
|
return ret;
|
|
}
|
|
}
|
|
@@ -184,16 +183,16 @@ static void update_setgroups(int pid, enum policy_t setgroup)
|
|
char *policy;
|
|
char *policy;
|
|
|
|
|
|
switch (setgroup) {
|
|
switch (setgroup) {
|
|
- case SETGROUPS_ALLOW:
|
|
|
|
- policy = "allow";
|
|
|
|
- break;
|
|
|
|
- case SETGROUPS_DENY:
|
|
|
|
- policy = "deny";
|
|
|
|
- break;
|
|
|
|
- case SETGROUPS_DEFAULT:
|
|
|
|
- default:
|
|
|
|
- /* Nothing to do. */
|
|
|
|
- return;
|
|
|
|
|
|
+ case SETGROUPS_ALLOW:
|
|
|
|
+ policy = "allow";
|
|
|
|
+ break;
|
|
|
|
+ case SETGROUPS_DENY:
|
|
|
|
+ policy = "deny";
|
|
|
|
+ break;
|
|
|
|
+ case SETGROUPS_DEFAULT:
|
|
|
|
+ default:
|
|
|
|
+ /* Nothing to do. */
|
|
|
|
+ return;
|
|
}
|
|
}
|
|
|
|
|
|
if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
|
|
if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
|
|
@@ -226,14 +225,14 @@ static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
|
|
if (!child) {
|
|
if (!child) {
|
|
#define MAX_ARGV 20
|
|
#define MAX_ARGV 20
|
|
char *argv[MAX_ARGV];
|
|
char *argv[MAX_ARGV];
|
|
- char *envp[] = {NULL};
|
|
|
|
|
|
+ char *envp[] = { NULL };
|
|
char pid_fmt[16];
|
|
char pid_fmt[16];
|
|
int argc = 0;
|
|
int argc = 0;
|
|
char *next;
|
|
char *next;
|
|
|
|
|
|
snprintf(pid_fmt, 16, "%d", pid);
|
|
snprintf(pid_fmt, 16, "%d", pid);
|
|
|
|
|
|
- argv[argc++] = (char *) app;
|
|
|
|
|
|
+ argv[argc++] = (char *)app;
|
|
argv[argc++] = pid_fmt;
|
|
argv[argc++] = pid_fmt;
|
|
/*
|
|
/*
|
|
* Convert the map string into a list of argument that
|
|
* Convert the map string into a list of argument that
|
|
@@ -319,7 +318,7 @@ static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
|
|
static int clone_parent(jmp_buf *env, int jmpval)
|
|
static int clone_parent(jmp_buf *env, int jmpval)
|
|
{
|
|
{
|
|
struct clone_t ca = {
|
|
struct clone_t ca = {
|
|
- .env = env,
|
|
|
|
|
|
+ .env = env,
|
|
.jmpval = jmpval,
|
|
.jmpval = jmpval,
|
|
};
|
|
};
|
|
|
|
|
|
@@ -533,7 +532,7 @@ void nsexec(void)
|
|
int pipenum;
|
|
int pipenum;
|
|
jmp_buf env;
|
|
jmp_buf env;
|
|
int sync_child_pipe[2], sync_grandchild_pipe[2];
|
|
int sync_child_pipe[2], sync_grandchild_pipe[2];
|
|
- struct nlconfig_t config = {0};
|
|
|
|
|
|
+ struct nlconfig_t config = { 0 };
|
|
|
|
|
|
/*
|
|
/*
|
|
* If we don't have an init pipe, just return to the go routine.
|
|
* If we don't have an init pipe, just return to the go routine.
|
|
@@ -630,21 +629,21 @@ void nsexec(void)
|
|
*/
|
|
*/
|
|
|
|
|
|
switch (setjmp(env)) {
|
|
switch (setjmp(env)) {
|
|
- /*
|
|
|
|
- * Stage 0: We're in the parent. Our job is just to create a new child
|
|
|
|
- * (stage 1: JUMP_CHILD) process and write its uid_map and
|
|
|
|
- * gid_map. That process will go on to create a new process, then
|
|
|
|
- * it will send us its PID which we will send to the bootstrap
|
|
|
|
- * process.
|
|
|
|
- */
|
|
|
|
- case JUMP_PARENT: {
|
|
|
|
|
|
+ /*
|
|
|
|
+ * Stage 0: We're in the parent. Our job is just to create a new child
|
|
|
|
+ * (stage 1: JUMP_CHILD) process and write its uid_map and
|
|
|
|
+ * gid_map. That process will go on to create a new process, then
|
|
|
|
+ * it will send us its PID which we will send to the bootstrap
|
|
|
|
+ * process.
|
|
|
|
+ */
|
|
|
|
+ case JUMP_PARENT:{
|
|
int len;
|
|
int len;
|
|
pid_t child, first_child = -1;
|
|
pid_t child, first_child = -1;
|
|
char buf[JSON_MAX];
|
|
char buf[JSON_MAX];
|
|
bool ready = false;
|
|
bool ready = false;
|
|
|
|
|
|
/* For debugging. */
|
|
/* For debugging. */
|
|
- prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
|
|
|
|
|
|
+ prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
|
|
|
|
|
|
/* Start the process of getting a container. */
|
|
/* Start the process of getting a container. */
|
|
child = clone_parent(&env, JUMP_CHILD);
|
|
child = clone_parent(&env, JUMP_CHILD);
|
|
@@ -702,7 +701,7 @@ void nsexec(void)
|
|
bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
|
|
bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
|
|
}
|
|
}
|
|
break;
|
|
break;
|
|
- case SYNC_RECVPID_PLS: {
|
|
|
|
|
|
+ case SYNC_RECVPID_PLS:{
|
|
first_child = child;
|
|
first_child = child;
|
|
|
|
|
|
/* Get the init_func pid. */
|
|
/* Get the init_func pid. */
|
|
@@ -781,16 +780,16 @@ void nsexec(void)
|
|
exit(0);
|
|
exit(0);
|
|
}
|
|
}
|
|
|
|
|
|
- /*
|
|
|
|
- * Stage 1: We're in the first child process. Our job is to join any
|
|
|
|
- * provided namespaces in the netlink payload and unshare all
|
|
|
|
- * of the requested namespaces. If we've been asked to
|
|
|
|
- * CLONE_NEWUSER, we will ask our parent (stage 0) to set up
|
|
|
|
- * our user mappings for us. Then, we create a new child
|
|
|
|
- * (stage 2: JUMP_INIT) for PID namespace. We then send the
|
|
|
|
- * child's PID to our parent (stage 0).
|
|
|
|
- */
|
|
|
|
- case JUMP_CHILD: {
|
|
|
|
|
|
+ /*
|
|
|
|
+ * Stage 1: We're in the first child process. Our job is to join any
|
|
|
|
+ * provided namespaces in the netlink payload and unshare all
|
|
|
|
+ * of the requested namespaces. If we've been asked to
|
|
|
|
+ * CLONE_NEWUSER, we will ask our parent (stage 0) to set up
|
|
|
|
+ * our user mappings for us. Then, we create a new child
|
|
|
|
+ * (stage 2: JUMP_INIT) for PID namespace. We then send the
|
|
|
|
+ * child's PID to our parent (stage 0).
|
|
|
|
+ */
|
|
|
|
+ case JUMP_CHILD:{
|
|
pid_t child;
|
|
pid_t child;
|
|
enum sync_t s;
|
|
enum sync_t s;
|
|
|
|
|
|
@@ -799,7 +798,7 @@ void nsexec(void)
|
|
close(sync_child_pipe[1]);
|
|
close(sync_child_pipe[1]);
|
|
|
|
|
|
/* For debugging. */
|
|
/* For debugging. */
|
|
- prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
|
|
|
|
|
|
+ prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
|
|
|
|
|
|
/*
|
|
/*
|
|
* We need to setns first. We cannot do this earlier (in stage 0)
|
|
* We need to setns first. We cannot do this earlier (in stage 0)
|
|
@@ -901,13 +900,13 @@ void nsexec(void)
|
|
exit(0);
|
|
exit(0);
|
|
}
|
|
}
|
|
|
|
|
|
- /*
|
|
|
|
- * Stage 2: We're the final child process, and the only process that will
|
|
|
|
- * actually return to the Go runtime. Our job is to just do the
|
|
|
|
- * final cleanup steps and then return to the Go runtime to allow
|
|
|
|
- * init_linux.go to run.
|
|
|
|
- */
|
|
|
|
- case JUMP_INIT: {
|
|
|
|
|
|
+ /*
|
|
|
|
+ * Stage 2: We're the final child process, and the only process that will
|
|
|
|
+ * actually return to the Go runtime. Our job is to just do the
|
|
|
|
+ * final cleanup steps and then return to the Go runtime to allow
|
|
|
|
+ * init_linux.go to run.
|
|
|
|
+ */
|
|
|
|
+ case JUMP_INIT:{
|
|
/*
|
|
/*
|
|
* We're inside the child now, having jumped from the
|
|
* We're inside the child now, having jumped from the
|
|
* start_child() code after forking in the parent.
|
|
* start_child() code after forking in the parent.
|
|
@@ -921,7 +920,7 @@ void nsexec(void)
|
|
close(sync_child_pipe[1]);
|
|
close(sync_child_pipe[1]);
|
|
|
|
|
|
/* For debugging. */
|
|
/* For debugging. */
|
|
- prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);
|
|
|
|
|
|
+ prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
|
|
|
|
|
|
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
|
|
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
|
|
bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
|
|
bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
|