From c5251f7116e3d9095a7169fc31bd170dff997c2e Mon Sep 17 00:00:00 2001 From: Justin Cormack Date: Fri, 21 Oct 2016 13:34:37 +0100 Subject: [PATCH] Use runc version built without ambient capabilities Until we can support existing behaviour with `sudo` disable ambient capabilities in runc build. Add tests that non root user cannot use default capabilities, and that capabilities are working as expected. Test for #27590 Update runc. Signed-off-by: Justin Cormack --- contrib/syscall-test/Dockerfile | 6 +- contrib/syscall-test/raw.c | 14 + contrib/syscall-test/setgid.c | 11 + contrib/syscall-test/setuid.c | 11 + contrib/syscall-test/socket.c | 30 + hack/dockerfile/install-binaries.sh | 5 +- integration-cli/docker_cli_run_unix_test.go | 183 ++++- integration-cli/fixtures_linux_daemon.go | 14 +- vendor.conf | 2 +- .../runc/libcontainer/configs/cgroup_unix.go | 4 +- .../runc/libcontainer/configs/config.go | 5 - .../runc/libcontainer/configs/mount.go | 9 + .../libcontainer/configs/namespaces_unix.go | 8 +- .../runc/libcontainer/label/label.go | 4 + .../runc/libcontainer/label/label_selinux.go | 22 +- .../runc/libcontainer/nsenter/namespace.h | 32 + .../runc/libcontainer/nsenter/nsenter.go | 12 + .../libcontainer/nsenter/nsenter_gccgo.go | 25 + .../nsenter/nsenter_unsupported.go | 5 + .../runc/libcontainer/nsenter/nsexec.c | 753 ++++++++++++++++++ .../runc/libcontainer/selinux/selinux.go | 12 + .../runc/libcontainer/system/proc.go | 20 +- 22 files changed, 1145 insertions(+), 42 deletions(-) create mode 100644 contrib/syscall-test/raw.c create mode 100644 contrib/syscall-test/setgid.c create mode 100644 contrib/syscall-test/setuid.c create mode 100644 contrib/syscall-test/socket.c create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go create mode 100644 vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c diff --git a/contrib/syscall-test/Dockerfile b/contrib/syscall-test/Dockerfile index a2f96b45e6..f95f1758c0 100644 --- a/contrib/syscall-test/Dockerfile +++ b/contrib/syscall-test/Dockerfile @@ -6,6 +6,10 @@ WORKDIR /usr/src/ RUN gcc -g -Wall -static userns.c -o /usr/bin/userns-test \ && gcc -g -Wall -static ns.c -o /usr/bin/ns-test \ - && gcc -g -Wall -static acct.c -o /usr/bin/acct-test + && gcc -g -Wall -static acct.c -o /usr/bin/acct-test \ + && gcc -g -Wall -static setuid.c -o /usr/bin/setuid-test \ + && gcc -g -Wall -static setgid.c -o /usr/bin/setgid-test \ + && gcc -g -Wall -static socket.c -o /usr/bin/socket-test \ + && gcc -g -Wall -static raw.c -o /usr/bin/raw-test RUN [ "$(uname -m)" = "x86_64" ] && gcc -s -m32 -nostdlib exit32.s -o /usr/bin/exit32-test || true diff --git a/contrib/syscall-test/raw.c b/contrib/syscall-test/raw.c new file mode 100644 index 0000000000..7995a0d3a5 --- /dev/null +++ b/contrib/syscall-test/raw.c @@ -0,0 +1,14 @@ +#include +#include +#include +#include +#include + +int main() { + if (socket(PF_INET, SOCK_RAW, IPPROTO_UDP) == -1) { + perror("socket"); + return 1; + } + + return 0; +} diff --git a/contrib/syscall-test/setgid.c b/contrib/syscall-test/setgid.c new file mode 100644 index 0000000000..df9680c869 --- /dev/null +++ b/contrib/syscall-test/setgid.c @@ -0,0 +1,11 @@ +#include +#include +#include + +int main() { + if (setgid(1) == -1) { + perror("setgid"); + return 1; + } + return 0; +} diff --git a/contrib/syscall-test/setuid.c b/contrib/syscall-test/setuid.c new file mode 100644 index 0000000000..5b939677e9 --- /dev/null +++ b/contrib/syscall-test/setuid.c @@ -0,0 +1,11 @@ +#include +#include +#include + +int main() { + if (setuid(1) == -1) { + perror("setuid"); + return 1; + } + return 0; +} diff --git a/contrib/syscall-test/socket.c b/contrib/syscall-test/socket.c new file mode 100644 index 0000000000..d26c82f00f --- /dev/null +++ b/contrib/syscall-test/socket.c @@ -0,0 +1,30 @@ +#include +#include +#include +#include +#include +#include + +int main() { + int s; + struct sockaddr_in sin; + + s = socket(AF_INET, SOCK_STREAM, 0); + if (s == -1) { + perror("socket"); + return 1; + } + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = INADDR_ANY; + sin.sin_port = htons(80); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1) { + perror("bind"); + return 1; + } + + close(s); + + return 0; +} diff --git a/hack/dockerfile/install-binaries.sh b/hack/dockerfile/install-binaries.sh index e8c0236434..567c5dffb5 100755 --- a/hack/dockerfile/install-binaries.sh +++ b/hack/dockerfile/install-binaries.sh @@ -3,7 +3,7 @@ set -e set -x TOMLV_COMMIT=9baf8a8a9f2ed20a8e54160840c492f937eeaf9a -RUNC_COMMIT=02f8fa7863dd3f82909a73e2061897828460d52f +RUNC_COMMIT=ac031b5bf1cc92239461125f4c1ffb760522bbf2 CONTAINERD_COMMIT=52ef1ceb4b660c42cf4ea9013180a5663968d4c7 GRIMES_COMMIT=fe069a03affd2547fdb05e5b8b07202d2e41735b LIBNETWORK_COMMIT=0f534354b813003a754606689722fe253101bc4e @@ -20,11 +20,12 @@ else export GOPATH="$TMP_GOPATH" fi +# Do not build with ambient capabilities support RUNC_BUILDTAGS="${RUNC_BUILDTAGS:-"seccomp apparmor selinux"}" install_runc() { echo "Install runc version $RUNC_COMMIT" - git clone https://github.com/opencontainers/runc.git "$GOPATH/src/github.com/opencontainers/runc" + git clone https://github.com/docker/runc.git "$GOPATH/src/github.com/opencontainers/runc" cd "$GOPATH/src/github.com/opencontainers/runc" git checkout -q "$RUNC_COMMIT" make BUILDTAGS="$RUNC_BUILDTAGS" $1 diff --git a/integration-cli/docker_cli_run_unix_test.go b/integration-cli/docker_cli_run_unix_test.go index 46f2d0677f..710cd3acf0 100644 --- a/integration-cli/docker_cli_run_unix_test.go +++ b/integration-cli/docker_cli_run_unix_test.go @@ -1155,24 +1155,185 @@ func (s *DockerSuite) TestRunNoNewPrivSetuid(c *check.C) { } } -func (s *DockerSuite) TestRunAmbientCapabilities(c *check.C) { - testRequires(c, DaemonIsLinux, ambientCapabilities) +func (s *DockerSuite) TestUserNoEffectiveCapabilitiesChown(c *check.C) { + testRequires(c, DaemonIsLinux) + ensureSyscallTest(c) - // test that a non root user can gain capabilities - runCmd := exec.Command(dockerBinary, "run", "--user", "1000", "--cap-add", "chown", "busybox", "chown", "100", "/tmp") + // test that a root user has default capability CAP_CHOWN + runCmd := exec.Command(dockerBinary, "run", "busybox", "chown", "100", "/tmp") _, _, err := runCommandWithOutput(runCmd) c.Assert(err, check.IsNil) - // test that non root user has default capabilities - runCmd = exec.Command(dockerBinary, "run", "--user", "1000", "busybox", "chown", "100", "/tmp") - _, _, err = runCommandWithOutput(runCmd) - c.Assert(err, check.IsNil) - // test this fails without cap_chown - runCmd = exec.Command(dockerBinary, "run", "--user", "1000", "--cap-drop", "chown", "busybox", "chown", "100", "/tmp") + // test that non root user does not have default capability CAP_CHOWN + runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "chown", "100", "/tmp") out, _, err := runCommandWithOutput(runCmd) c.Assert(err, checker.NotNil, check.Commentf(out)) - c.Assert(strings.TrimSpace(out), checker.Equals, "chown: /tmp: Operation not permitted") + c.Assert(out, checker.Contains, "Operation not permitted") + // test that root user can drop default capability CAP_CHOWN + runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "chown", "busybox", "chown", "100", "/tmp") + out, _, err = runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") } +func (s *DockerSuite) TestUserNoEffectiveCapabilitiesDacOverride(c *check.C) { + testRequires(c, DaemonIsLinux) + ensureSyscallTest(c) + + // test that a root user has default capability CAP_DAC_OVERRIDE + runCmd := exec.Command(dockerBinary, "run", "busybox", "sh", "-c", "echo test > /etc/passwd") + _, _, err := runCommandWithOutput(runCmd) + c.Assert(err, check.IsNil) + // test that non root user does not have default capability CAP_DAC_OVERRIDE + runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "sh", "-c", "echo test > /etc/passwd") + out, _, err := runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Permission denied") + // TODO test that root user can drop default capability CAP_DAC_OVERRIDE +} + +func (s *DockerSuite) TestUserNoEffectiveCapabilitiesFowner(c *check.C) { + testRequires(c, DaemonIsLinux) + ensureSyscallTest(c) + + // test that a root user has default capability CAP_FOWNER + runCmd := exec.Command(dockerBinary, "run", "busybox", "chmod", "777", "/etc/passwd") + _, _, err := runCommandWithOutput(runCmd) + c.Assert(err, check.IsNil) + // test that non root user does not have default capability CAP_FOWNER + runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "chmod", "777", "/etc/passwd") + out, _, err := runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") + // TODO test that root user can drop default capability CAP_FOWNER +} + +// TODO CAP_KILL + +func (s *DockerSuite) TestUserNoEffectiveCapabilitiesSetuid(c *check.C) { + testRequires(c, DaemonIsLinux) + ensureSyscallTest(c) + + // test that a root user has default capability CAP_SETUID + runCmd := exec.Command(dockerBinary, "run", "syscall-test", "setuid-test") + _, _, err := runCommandWithOutput(runCmd) + c.Assert(err, check.IsNil) + // test that non root user does not have default capability CAP_SETUID + runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "setuid-test") + out, _, err := runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") + // test that root user can drop default capability CAP_SETUID + runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "setuid", "syscall-test", "setuid-test") + out, _, err = runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") +} + +func (s *DockerSuite) TestUserNoEffectiveCapabilitiesSetgid(c *check.C) { + testRequires(c, DaemonIsLinux) + ensureSyscallTest(c) + + // test that a root user has default capability CAP_SETGID + runCmd := exec.Command(dockerBinary, "run", "syscall-test", "setgid-test") + _, _, err := runCommandWithOutput(runCmd) + c.Assert(err, check.IsNil) + // test that non root user does not have default capability CAP_SETGID + runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "setgid-test") + out, _, err := runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") + // test that root user can drop default capability CAP_SETGID + runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "setgid", "syscall-test", "setgid-test") + out, _, err = runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") +} + +// TODO CAP_SETPCAP + +func (s *DockerSuite) TestUserNoEffectiveCapabilitiesNetBindService(c *check.C) { + testRequires(c, DaemonIsLinux) + ensureSyscallTest(c) + + // test that a root user has default capability CAP_NET_BIND_SERVICE + runCmd := exec.Command(dockerBinary, "run", "syscall-test", "socket-test") + _, _, err := runCommandWithOutput(runCmd) + c.Assert(err, check.IsNil) + // test that non root user does not have default capability CAP_NET_BIND_SERVICE + runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "socket-test") + out, _, err := runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Permission denied") + // test that root user can drop default capability CAP_NET_BIND_SERVICE + runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "net_bind_service", "syscall-test", "socket-test") + out, _, err = runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Permission denied") +} + +func (s *DockerSuite) TestUserNoEffectiveCapabilitiesNetRaw(c *check.C) { + testRequires(c, DaemonIsLinux) + ensureSyscallTest(c) + + // test that a root user has default capability CAP_NET_RAW + runCmd := exec.Command(dockerBinary, "run", "syscall-test", "raw-test") + _, _, err := runCommandWithOutput(runCmd) + c.Assert(err, check.IsNil) + // test that non root user does not have default capability CAP_NET_RAW + runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "raw-test") + out, _, err := runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") + // test that root user can drop default capability CAP_NET_RAW + runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "net_raw", "syscall-test", "raw-test") + out, _, err = runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") +} + +func (s *DockerSuite) TestUserNoEffectiveCapabilitiesChroot(c *check.C) { + testRequires(c, DaemonIsLinux) + ensureSyscallTest(c) + + // test that a root user has default capability CAP_SYS_CHROOT + runCmd := exec.Command(dockerBinary, "run", "busybox", "chroot", "/", "/bin/true") + _, _, err := runCommandWithOutput(runCmd) + c.Assert(err, check.IsNil) + // test that non root user does not have default capability CAP_SYS_CHROOT + runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "chroot", "/", "/bin/true") + out, _, err := runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") + // test that root user can drop default capability CAP_SYS_CHROOT + runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "sys_chroot", "busybox", "chroot", "/", "/bin/true") + out, _, err = runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") +} + +func (s *DockerSuite) TestUserNoEffectiveCapabilitiesMknod(c *check.C) { + testRequires(c, DaemonIsLinux) + ensureSyscallTest(c) + + // test that a root user has default capability CAP_MKNOD + runCmd := exec.Command(dockerBinary, "run", "busybox", "mknod", "/tmp/node", "b", "1", "2") + _, _, err := runCommandWithOutput(runCmd) + c.Assert(err, check.IsNil) + // test that non root user does not have default capability CAP_MKNOD + runCmd = exec.Command(dockerBinary, "run", "--user", "1000:1000", "busybox", "mknod", "/tmp/node", "b", "1", "2") + out, _, err := runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") + // test that root user can drop default capability CAP_MKNOD + runCmd = exec.Command(dockerBinary, "run", "--cap-drop", "mknod", "busybox", "mknod", "/tmp/node", "b", "1", "2") + out, _, err = runCommandWithOutput(runCmd) + c.Assert(err, checker.NotNil, check.Commentf(out)) + c.Assert(out, checker.Contains, "Operation not permitted") +} + +// TODO CAP_AUDIT_WRITE +// TODO CAP_SETFCAP + func (s *DockerSuite) TestRunApparmorProcDirectory(c *check.C) { testRequires(c, SameHostDaemon, Apparmor) diff --git a/integration-cli/fixtures_linux_daemon.go b/integration-cli/fixtures_linux_daemon.go index 94661180d5..b9484f3a2a 100644 --- a/integration-cli/fixtures_linux_daemon.go +++ b/integration-cli/fixtures_linux_daemon.go @@ -1,6 +1,7 @@ package main import ( + "fmt" "io/ioutil" "os" "os/exec" @@ -53,15 +54,14 @@ func ensureSyscallTest(c *check.C) { gcc, err := exec.LookPath("gcc") c.Assert(err, checker.IsNil, check.Commentf("could not find gcc")) - out, err := exec.Command(gcc, "-g", "-Wall", "-static", "../contrib/syscall-test/userns.c", "-o", tmp+"/"+"userns-test").CombinedOutput() - c.Assert(err, checker.IsNil, check.Commentf(string(out))) - out, err = exec.Command(gcc, "-g", "-Wall", "-static", "../contrib/syscall-test/ns.c", "-o", tmp+"/"+"ns-test").CombinedOutput() - c.Assert(err, checker.IsNil, check.Commentf(string(out))) - out, err = exec.Command(gcc, "-g", "-Wall", "-static", "../contrib/syscall-test/acct.c", "-o", tmp+"/"+"acct-test").CombinedOutput() - c.Assert(err, checker.IsNil, check.Commentf(string(out))) + tests := []string{"userns", "ns", "acct", "setuid", "setgid", "socket", "raw"} + for _, test := range tests { + out, err := exec.Command(gcc, "-g", "-Wall", "-static", fmt.Sprintf("../contrib/syscall-test/%s.c", test), "-o", fmt.Sprintf("%s/%s-test", tmp, test)).CombinedOutput() + c.Assert(err, checker.IsNil, check.Commentf(string(out))) + } if runtime.GOOS == "linux" && runtime.GOARCH == "amd64" { - out, err = exec.Command(gcc, "-s", "-m32", "-nostdlib", "../contrib/syscall-test/exit32.s", "-o", tmp+"/"+"exit32-test").CombinedOutput() + out, err := exec.Command(gcc, "-s", "-m32", "-nostdlib", "../contrib/syscall-test/exit32.s", "-o", tmp+"/"+"exit32-test").CombinedOutput() c.Assert(err, checker.IsNil, check.Commentf(string(out))) } diff --git a/vendor.conf b/vendor.conf index 10b559b40b..a511ce0b7b 100644 --- a/vendor.conf +++ b/vendor.conf @@ -59,7 +59,7 @@ github.com/miekg/pkcs11 df8ae6ca730422dba20c768ff38ef7d79077a59f github.com/docker/go v1.5.1-1-1-gbaf439e github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c -github.com/opencontainers/runc 02f8fa7863dd3f82909a73e2061897828460d52f # libcontainer +github.com/opencontainers/runc ac031b5bf1cc92239461125f4c1ffb760522bbf2 # libcontainer github.com/opencontainers/runtime-spec 1c7c27d043c2a5e513a44084d2b10d77d1402b8c # specs github.com/seccomp/libseccomp-golang 32f571b70023028bd57d9288c20efbcb237f3ce0 # libcontainer deps (see src/github.com/opencontainers/runc/Godeps/Godeps.json) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go index 94b38879ed..14d6289816 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go @@ -22,7 +22,7 @@ type Cgroup struct { // The path is assumed to be relative to the host system cgroup mountpoint. Path string `json:"path"` - // ScopePrefix decribes prefix for the scope name + // ScopePrefix describes prefix for the scope name ScopePrefix string `json:"scope_prefix"` // Paths represent the absolute cgroups paths to join. @@ -95,7 +95,7 @@ type Resources struct { // IO read rate limit per cgroup per device, bytes per second. BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` - // IO write rate limit per cgroup per divice, bytes per second. + // IO write rate limit per cgroup per device, bytes per second. BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` // IO read rate limit per cgroup per device, IO per second. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index a56d12bdb9..fdfe248c73 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -85,11 +85,6 @@ type Config struct { // that the parent process dies. ParentDeathSignal int `json:"parent_death_signal"` - // PivotDir allows a custom directory inside the container's root filesystem to be used as pivot, when NoPivotRoot is not set. - // When a custom PivotDir not set, a temporary dir inside the root filesystem will be used. The pivot dir needs to be writeable. - // This is required when using read only root filesystems. In these cases, a read/writeable path can be (bind) mounted somewhere inside the root filesystem to act as pivot. - PivotDir string `json:"pivot_dir"` - // Path to a directory containing the container's root filesystem. Rootfs string `json:"rootfs"` diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go index cc770c916f..670757ddb5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go @@ -1,5 +1,11 @@ package configs +const ( + // EXT_COPYUP is a directive to copy up the contents of a directory when + // a tmpfs is mounted over it. + EXT_COPYUP = 1 << iota +) + type Mount struct { // Source path for the mount. Source string `json:"source"` @@ -22,6 +28,9 @@ type Mount struct { // Relabel source if set, "z" indicates shared, "Z" indicates unshared. Relabel string `json:"relabel"` + // Extensions are additional flags that are specific to runc. + Extensions int `json:"extensions"` + // Optional Command to be run before Source is mounted. PremountCmds []Command `json:"premount_cmds"` diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go index b9c820d062..8beba9d300 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unix.go @@ -22,8 +22,8 @@ var ( supportedNamespaces = make(map[NamespaceType]bool) ) -// nsToFile converts the namespace type to its filename -func nsToFile(ns NamespaceType) string { +// NsName converts the namespace type to its filename +func NsName(ns NamespaceType) string { switch ns { case NEWNET: return "net" @@ -50,7 +50,7 @@ func IsNamespaceSupported(ns NamespaceType) bool { if ok { return supported } - nsFile := nsToFile(ns) + nsFile := NsName(ns) // if the namespace type is unknown, just return false if nsFile == "" { return false @@ -84,7 +84,7 @@ func (n *Namespace) GetPath(pid int) string { if n.Path != "" { return n.Path } - return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type)) + return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type)) } func (n *Namespaces) Remove(t NamespaceType) bool { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/label/label.go b/vendor/github.com/opencontainers/runc/libcontainer/label/label.go index 684bb41bdc..fddec46334 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/label/label.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/label/label.go @@ -9,6 +9,10 @@ func InitLabels(options []string) (string, string, error) { return "", "", nil } +func GetROMountLabel() string { + return "" +} + func GenLabels(options string) (string, string, error) { return "", "", nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go b/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go index 1d9d78a390..d76846eafb 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/label/label_selinux.go @@ -33,15 +33,19 @@ func InitLabels(options []string) (string, string, error) { pcon := selinux.NewContext(processLabel) mcon := selinux.NewContext(mountLabel) for _, opt := range options { - if opt == "disable" { + val := strings.SplitN(opt, "=", 2) + if val[0] != "label" { + continue + } + if len(val) < 2 { + return "", "", fmt.Errorf("bad label option %q, valid options 'disable' or \n'user, role, level, type' followed by ':' and a value", opt) + } + if val[1] == "disable" { return "", "", nil } - if i := strings.Index(opt, ":"); i == -1 { - return "", "", fmt.Errorf("Bad label option %q, valid options 'disable' or \n'user, role, level, type' followed by ':' and a value", opt) - } - con := strings.SplitN(opt, ":", 2) - if !validOptions[con[0]] { - return "", "", fmt.Errorf("Bad label option %q, valid options 'disable, user, role, level, type'", con[0]) + con := strings.SplitN(val[1], ":", 2) + if len(con) < 2 || !validOptions[con[0]] { + return "", "", fmt.Errorf("bad label option %q, valid options 'disable, user, role, level, type'", con[0]) } pcon[con[0]] = con[1] @@ -55,6 +59,10 @@ func InitLabels(options []string) (string, string, error) { return processLabel, mountLabel, nil } +func GetROMountLabel() string { + return selinux.GetROFileLabel() +} + // DEPRECATED: The GenLabels function is only to be used during the transition to the official API. func GenLabels(options string) (string, string, error) { return InitLabels(strings.Fields(options)) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h new file mode 100644 index 0000000000..9e9bdca05e --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/namespace.h @@ -0,0 +1,32 @@ +#ifndef NSENTER_NAMESPACE_H +#define NSENTER_NAMESPACE_H + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif +#include + +/* All of these are taken from include/uapi/linux/sched.h */ +#ifndef CLONE_NEWNS +# define CLONE_NEWNS 0x00020000 /* New mount namespace group */ +#endif +#ifndef CLONE_NEWCGROUP +# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ +#endif +#ifndef CLONE_NEWUTS +# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#endif +#ifndef CLONE_NEWIPC +# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ +#endif +#ifndef CLONE_NEWUSER +# define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#endif +#ifndef CLONE_NEWPID +# define CLONE_NEWPID 0x20000000 /* New pid namespace */ +#endif +#ifndef CLONE_NEWNET +# define CLONE_NEWNET 0x40000000 /* New network namespace */ +#endif + +#endif /* NSENTER_NAMESPACE_H */ diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go new file mode 100644 index 0000000000..07f4d63e43 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go @@ -0,0 +1,12 @@ +// +build linux,!gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ +import "C" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go new file mode 100644 index 0000000000..63c7a3ec22 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_gccgo.go @@ -0,0 +1,25 @@ +// +build linux,gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ +import "C" + +// AlwaysFalse is here to stay false +// (and be exported so the compiler doesn't optimize out its reference) +var AlwaysFalse bool + +func init() { + if AlwaysFalse { + // by referencing this C init() in a noop test, it will ensure the compiler + // links in the C function. + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134 + C.init() + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go new file mode 100644 index 0000000000..ac701ca393 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsenter_unsupported.go @@ -0,0 +1,5 @@ +// +build !linux !cgo + +package nsenter + +import "C" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c new file mode 100644 index 0000000000..97c070859c --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -0,0 +1,753 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +/* Get all of the CLONE_NEW* flags. */ +#include "namespace.h" + +/* Synchronisation values. */ +enum sync_t { + SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ + SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ + SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ + SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ + + /* XXX: This doesn't help with segfaults and other such issues. */ + SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ +}; + +/* longjmp() arguments. */ +#define JUMP_PARENT 0x00 +#define JUMP_CHILD 0xA0 +#define JUMP_INIT 0xA1 + +/* JSON buffer. */ +#define JSON_MAX 4096 + +/* Assume the stack grows down, so arguments should be above it. */ +struct clone_t { + /* + * Reserve some space for clone() to locate arguments + * and retcode in this place + */ + char stack[4096] __attribute__ ((aligned(16))); + char stack_ptr[0]; + + /* There's two children. This is used to execute the different code. */ + jmp_buf *env; + int jmpval; +}; + +struct nlconfig_t { + char *data; + uint32_t cloneflags; + char *uidmap; + size_t uidmap_len; + char *gidmap; + size_t gidmap_len; + char *namespaces; + size_t namespaces_len; + uint8_t is_setgroup; + int consolefd; +}; + +/* + * List of netlink message types sent to us as part of bootstrapping the init. + * These constants are defined in libcontainer/message_linux.go. + */ +#define INIT_MSG 62000 +#define CLONE_FLAGS_ATTR 27281 +#define CONSOLE_PATH_ATTR 27282 +#define NS_PATHS_ATTR 27283 +#define UIDMAP_ATTR 27284 +#define GIDMAP_ATTR 27285 +#define SETGROUP_ATTR 27286 + +/* + * Use the raw syscall for versions of glibc which don't include a function for + * it, namely (glibc 2.12). + */ +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 +# define _GNU_SOURCE +# include "syscall.h" +# if !defined(SYS_setns) && defined(__NR_setns) +# define SYS_setns __NR_setns +# endif + +#ifndef SYS_setns +# error "setns(2) syscall not supported by glibc version" +#endif + +int setns(int fd, int nstype) +{ + return syscall(SYS_setns, fd, nstype); +} +#endif + +/* XXX: This is ugly. */ +static int syncfd = -1; + +/* TODO(cyphar): Fix this so it correctly deals with syncT. */ +#define bail(fmt, ...) \ + do { \ + int ret = __COUNTER__ + 1; \ + fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ + if (syncfd >= 0) { \ + enum sync_t s = SYNC_ERR; \ + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \ + fprintf(stderr, "nsenter: failed: write(s)"); \ + if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \ + fprintf(stderr, "nsenter: failed: write(ret)"); \ + } \ + exit(ret); \ + } while(0) + +static int write_file(char *data, size_t data_len, char *pathfmt, ...) +{ + int fd, len, ret = 0; + char path[PATH_MAX]; + + va_list ap; + va_start(ap, pathfmt); + len = vsnprintf(path, PATH_MAX, pathfmt, ap); + va_end(ap); + if (len < 0) + return -1; + + fd = open(path, O_RDWR); + if (fd < 0) { + ret = -1; + goto out; + } + + len = write(fd, data, data_len); + if (len != data_len) { + ret = -1; + goto out; + } + +out: + close(fd); + return ret; +} + +enum policy_t { + SETGROUPS_DEFAULT = 0, + SETGROUPS_ALLOW, + SETGROUPS_DENY, +}; + +/* This *must* be called before we touch gid_map. */ +static void update_setgroups(int pid, enum policy_t setgroup) +{ + char *policy; + + switch (setgroup) { + case SETGROUPS_ALLOW: + policy = "allow"; + break; + case SETGROUPS_DENY: + policy = "deny"; + break; + case SETGROUPS_DEFAULT: + /* Nothing to do. */ + return; + } + + if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { + /* + * If the kernel is too old to support /proc/pid/setgroups, + * open(2) or write(2) will return ENOENT. This is fine. + */ + if (errno != ENOENT) + bail("failed to write '%s' to /proc/%d/setgroups", policy, pid); + } +} + +static void update_uidmap(int pid, char *map, int map_len) +{ + if (map == NULL || map_len <= 0) + return; + + if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) + bail("failed to update /proc/%d/uid_map", pid); +} + +static void update_gidmap(int pid, char *map, int map_len) +{ + if (map == NULL || map_len <= 0) + return; + + if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) + bail("failed to update /proc/%d/gid_map", pid); +} + +/* A dummy function that just jumps to the given jumpval. */ +static int child_func(void *arg) __attribute__ ((noinline)); +static int child_func(void *arg) +{ + struct clone_t *ca = (struct clone_t *)arg; + longjmp(*ca->env, ca->jmpval); +} + +static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); +static int clone_parent(jmp_buf *env, int jmpval) +{ + struct clone_t ca = { + .env = env, + .jmpval = jmpval, + }; + + return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); +} + +/* + * Gets the init pipe fd from the environment, which is used to read the + * bootstrap data and tell the parent what the new pid is after we finish + * setting up the environment. + */ +static int initpipe(void) +{ + int pipenum; + char *initpipe, *endptr; + + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL || *initpipe == '\0') + return -1; + + pipenum = strtol(initpipe, &endptr, 10); + if (*endptr != '\0') + bail("unable to parse _LIBCONTAINER_INITPIPE"); + + return pipenum; +} + +/* Returns the clone(2) flag for a namespace, given the name of a namespace. */ +static int nsflag(char *name) +{ + if (!strcmp(name, "cgroup")) + return CLONE_NEWCGROUP; + else if (!strcmp(name, "ipc")) + return CLONE_NEWIPC; + else if (!strcmp(name, "mnt")) + return CLONE_NEWNS; + else if (!strcmp(name, "net")) + return CLONE_NEWNET; + else if (!strcmp(name, "pid")) + return CLONE_NEWPID; + else if (!strcmp(name, "user")) + return CLONE_NEWUSER; + else if (!strcmp(name, "uts")) + return CLONE_NEWUTS; + + /* If we don't recognise a name, fallback to 0. */ + return 0; +} + +static uint32_t readint32(char *buf) +{ + return *(uint32_t *) buf; +} + +static uint8_t readint8(char *buf) +{ + return *(uint8_t *) buf; +} + +static void nl_parse(int fd, struct nlconfig_t *config) +{ + size_t len, size; + struct nlmsghdr hdr; + char *data, *current; + + /* Retrieve the netlink header. */ + len = read(fd, &hdr, NLMSG_HDRLEN); + if (len != NLMSG_HDRLEN) + bail("invalid netlink header length %lu", len); + + if (hdr.nlmsg_type == NLMSG_ERROR) + bail("failed to read netlink message"); + + if (hdr.nlmsg_type != INIT_MSG) + bail("unexpected msg type %d", hdr.nlmsg_type); + + /* Retrieve data. */ + size = NLMSG_PAYLOAD(&hdr, 0); + current = data = malloc(size); + if (!data) + bail("failed to allocate %zu bytes of memory for nl_payload", size); + + len = read(fd, data, size); + if (len != size) + bail("failed to read netlink payload, %lu != %lu", len, size); + + /* Parse the netlink payload. */ + config->data = data; + config->consolefd = -1; + while (current < data + size) { + struct nlattr *nlattr = (struct nlattr *)current; + size_t payload_len = nlattr->nla_len - NLA_HDRLEN; + + /* Advance to payload. */ + current += NLA_HDRLEN; + + /* Handle payload. */ + switch (nlattr->nla_type) { + case CLONE_FLAGS_ATTR: + config->cloneflags = readint32(current); + break; + case CONSOLE_PATH_ATTR: + /* + * We open the console here because we currently evaluate console + * paths from the *host* namespaces. + */ + config->consolefd = open(current, O_RDWR); + if (config->consolefd < 0) + bail("failed to open console %s", current); + break; + case NS_PATHS_ATTR: + config->namespaces = current; + config->namespaces_len = payload_len; + break; + case UIDMAP_ATTR: + config->uidmap = current; + config->uidmap_len = payload_len; + break; + case GIDMAP_ATTR: + config->gidmap = current; + config->gidmap_len = payload_len; + break; + case SETGROUP_ATTR: + config->is_setgroup = readint8(current); + break; + default: + bail("unknown netlink message type %d", nlattr->nla_type); + } + + current += NLA_ALIGN(payload_len); + } +} + +void nl_free(struct nlconfig_t *config) +{ + free(config->data); +} + +void join_namespaces(char *nslist) +{ + int num = 0, i; + char *saveptr = NULL; + char *namespace = strtok_r(nslist, ",", &saveptr); + struct namespace_t { + int fd; + int ns; + char type[PATH_MAX]; + char path[PATH_MAX]; + } *namespaces = NULL; + + if (!namespace || !strlen(namespace) || !strlen(nslist)) + bail("ns paths are empty"); + + /* + * We have to open the file descriptors first, since after + * we join the mnt namespace we might no longer be able to + * access the paths. + */ + do { + int fd; + char *path; + struct namespace_t *ns; + + /* Resize the namespace array. */ + namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); + if (!namespaces) + bail("failed to reallocate namespace array"); + ns = &namespaces[num - 1]; + + /* Split 'ns:path'. */ + path = strstr(namespace, ":"); + if (!path) + bail("failed to parse %s", namespace); + *path++ = '\0'; + + fd = open(path, O_RDONLY); + if (fd < 0) + bail("failed to open %s", namespace); + + ns->fd = fd; + ns->ns = nsflag(namespace); + strncpy(ns->path, path, PATH_MAX); + } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); + + /* + * The ordering in which we join namespaces is important. We should + * always join the user namespace *first*. This is all guaranteed + * from the container_linux.go side of this, so we're just going to + * follow the order given to us. + */ + + for (i = 0; i < num; i++) { + struct namespace_t ns = namespaces[i]; + + if (setns(ns.fd, ns.ns) < 0) + bail("failed to setns to %s", ns.path); + + close(ns.fd); + } + + free(namespaces); +} + +void nsexec(void) +{ + int pipenum; + jmp_buf env; + int syncpipe[2]; + struct nlconfig_t config = {0}; + + /* + * If we don't have an init pipe, just return to the go routine. + * We'll only get an init pipe for start or exec. + */ + pipenum = initpipe(); + if (pipenum == -1) + return; + + /* Parse all of the netlink configuration. */ + nl_parse(pipenum, &config); + + /* Pipe so we can tell the child when we've finished setting up. */ + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0) + bail("failed to setup sync pipe between parent and child"); + + /* TODO: Currently we aren't dealing with child deaths properly. */ + + /* + * Okay, so this is quite annoying. + * + * In order for this unsharing code to be more extensible we need to split + * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case + * would be if we did clone(CLONE_NEWUSER) and the other namespaces + * separately, but because of SELinux issues we cannot really do that. But + * we cannot just dump the namespace flags into clone(...) because several + * usecases (such as rootless containers) require more granularity around + * the namespace setup. In addition, some older kernels had issues where + * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot + * handle this while also dealing with SELinux so we choose SELinux support + * over broken kernel support). + * + * However, if we unshare(2) the user namespace *before* we clone(2), then + * all hell breaks loose. + * + * The parent no longer has permissions to do many things (unshare(2) drops + * all capabilities in your old namespace), and the container cannot be set + * up to have more than one {uid,gid} mapping. This is obviously less than + * ideal. In order to fix this, we have to first clone(2) and then unshare. + * + * Unfortunately, it's not as simple as that. We have to fork to enter the + * PID namespace (the PID namespace only applies to children). Since we'll + * have to double-fork, this clone_parent() call won't be able to get the + * PID of the _actual_ init process (without doing more synchronisation than + * I can deal with at the moment). So we'll just get the parent to send it + * for us, the only job of this process is to update + * /proc/pid/{setgroups,uid_map,gid_map}. + * + * And as a result of the above, we also need to setns(2) in the first child + * because if we join a PID namespace in the topmost parent then our child + * will be in that namespace (and it will not be able to give us a PID value + * that makes sense without resorting to sending things with cmsg). + * + * This also deals with an older issue caused by dumping cloneflags into + * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so + * we have to unshare(2) before clone(2) in order to do this. This was fixed + * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was + * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're + * aware, the last mainline kernel which had this bug was Linux 3.12. + * However, we cannot comment on which kernels the broken patch was + * backported to. + * + * -- Aleksa "what has my life come to?" Sarai + */ + + switch (setjmp(env)) { + /* + * Stage 0: We're in the parent. Our job is just to create a new child + * (stage 1: JUMP_CHILD) process and write its uid_map and + * gid_map. That process will go on to create a new process, then + * it will send us its PID which we will send to the bootstrap + * process. + */ + case JUMP_PARENT: { + int len; + pid_t child; + char buf[JSON_MAX]; + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0); + + /* Start the process of getting a container. */ + child = clone_parent(&env, JUMP_CHILD); + if (child < 0) + bail("unable to fork: child_func"); + + /* State machine for synchronisation with the children. */ + while (true) { + enum sync_t s; + + /* This doesn't need to be global, we're in the parent. */ + int syncfd = syncpipe[1]; + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with child: next state"); + + switch (s) { + case SYNC_ERR: { + /* We have to mirror the error code of the child. */ + int ret; + + if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) + bail("failed to sync with child: read(error code)"); + + exit(ret); + } + break; + case SYNC_USERMAP_PLS: + /* Enable setgroups(2) if we've been asked to. */ + if (config.is_setgroup) + update_setgroups(child, SETGROUPS_ALLOW); + + /* Set up mappings. */ + update_uidmap(child, config.uidmap, config.uidmap_len); + update_gidmap(child, config.gidmap, config.gidmap_len); + + s = SYNC_USERMAP_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); + } + break; + case SYNC_USERMAP_ACK: + /* We should _never_ receive acks. */ + kill(child, SIGKILL); + bail("failed to sync with child: unexpected SYNC_USERMAP_ACK"); + break; + case SYNC_RECVPID_PLS: { + pid_t old = child; + + /* Get the init_func pid. */ + if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(old, SIGKILL); + bail("failed to sync with child: read(childpid)"); + } + + /* Send ACK. */ + s = SYNC_RECVPID_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(old, SIGKILL); + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); + } + } + + /* Leave the loop. */ + goto out; + case SYNC_RECVPID_ACK: + /* We should _never_ receive acks. */ + kill(child, SIGKILL); + bail("failed to sync with child: unexpected SYNC_RECVPID_ACK"); + break; + } + } + + out: + /* Send the init_func pid back to our parent. */ + len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child); + if (len < 0) { + kill(child, SIGKILL); + bail("unable to generate JSON for child pid"); + } + if (write(pipenum, buf, len) != len) { + kill(child, SIGKILL); + bail("unable to send child pid to bootstrapper"); + } + + exit(0); + } + + /* + * Stage 1: We're in the first child process. Our job is to join any + * provided user namespaces in the netlink payload. If we've been + * asked to CLONE_NEWUSER, we will unshare the user namespace and + * ask our parent (stage 0) to set up our user mappings for us. + * Then, we unshare the rest of the requested namespaces and + * create a new child (stage 2: JUMP_INIT). We then send the + * child's PID to our parent (stage 0). + */ + case JUMP_CHILD: { + pid_t child; + enum sync_t s; + + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = syncpipe[0]; + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0); + + /* + * We need to setns first. We cannot do this earlier (in stage 0) + * because of the fact that we forked to get here (the PID of + * [stage 2: JUMP_INIT]) would be meaningless). We could send it + * using cmsg(3) but that's just annoying. + */ + if (config.namespaces) + join_namespaces(config.namespaces); + + /* + * Unshare all of the namespaces. Now, it should be noted that this + * ordering might break in the future (especially with rootless + * containers). But for now, it's not possible to split this into + * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. + * + * Note that we don't merge this with clone() because there were + * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) + * was broken, so we'll just do it the long way anyway. + */ + if (unshare(config.cloneflags) < 0) + bail("failed to unshare namespaces"); + + /* + * Deal with user namespaces first. They are quite special, as they + * affect our ability to unshare other namespaces and are used as + * context for privilege checks. + */ + if (config.cloneflags & CLONE_NEWUSER) { + /* + * We don't have the privileges to do any mapping here (see the + * clone_parent rant). So signal our parent to hook us up. + */ + + s = SYNC_USERMAP_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); + + /* ... wait for mapping ... */ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); + if (s != SYNC_USERMAP_ACK) + bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); + } + + /* TODO: What about non-namespace clone flags that we're dropping here? */ + child = clone_parent(&env, JUMP_INIT); + if (child < 0) + bail("unable to fork: init_func"); + + /* Send the child to our parent, which knows what it's doing. */ + s = SYNC_RECVPID_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); + } + if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(childpid)"); + } + + /* ... wait for parent to get the pid ... */ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); + } + if (s != SYNC_RECVPID_ACK) { + kill(child, SIGKILL); + bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); + } + + /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ + exit(0); + } + + /* + * Stage 2: We're the final child process, and the only process that will + * actually return to the Go runtime. Our job is to just do the + * final cleanup steps and then return to the Go runtime to allow + * init_linux.go to run. + */ + case JUMP_INIT: { + /* + * We're inside the child now, having jumped from the + * start_child() code after forking in the parent. + */ + int consolefd = config.consolefd; + + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = syncpipe[0]; + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0); + + if (setsid() < 0) + bail("setsid failed"); + + if (setuid(0) < 0) + bail("setuid failed"); + + if (setgid(0) < 0) + bail("setgid failed"); + + if (setgroups(0, NULL) < 0) + bail("setgroups failed"); + + if (consolefd != -1) { + if (ioctl(consolefd, TIOCSCTTY, 0) < 0) + bail("ioctl TIOCSCTTY failed"); + if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) + bail("failed to dup stdin"); + if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) + bail("failed to dup stdout"); + if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) + bail("failed to dup stderr"); + } + + /* Close sync pipes. */ + close(syncpipe[0]); + close(syncpipe[1]); + + /* Free netlink data. */ + nl_free(&config); + + /* Finish executing, let the Go runtime take over. */ + return; + } + default: + bail("unexpected jump value"); + break; + } + + /* Should never be reached. */ + bail("should never be reached"); +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go b/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go index 2a18e2ad89..fcaba1d29e 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/selinux/selinux.go @@ -355,6 +355,12 @@ func FreeLxcContexts(scon string) { } } +var roFileLabel string + +func GetROFileLabel() (fileLabel string) { + return roFileLabel +} + func GetLxcContexts() (processLabel string, fileLabel string) { var ( val, key string @@ -399,6 +405,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) { if key == "file" { fileLabel = strings.Trim(val, "\"") } + if key == "ro_file" { + roFileLabel = strings.Trim(val, "\"") + } } } @@ -406,6 +415,9 @@ func GetLxcContexts() (processLabel string, fileLabel string) { return "", "" } + if roFileLabel == "" { + roFileLabel = fileLabel + } exit: // mcs := IntToMcs(os.Getpid(), 1024) mcs := uniqMcs(1024) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go index 37808a29f6..a0e9637199 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go @@ -14,8 +14,10 @@ func GetProcessStartTime(pid int) (string, error) { if err != nil { return "", err } + return parseStartTime(string(data)) +} - parts := strings.Split(string(data), " ") +func parseStartTime(stat string) (string, error) { // the starttime is located at pos 22 // from the man page // @@ -23,5 +25,19 @@ func GetProcessStartTime(pid int) (string, error) { // (22) The time the process started after system boot. In kernels before Linux 2.6, this // value was expressed in jiffies. Since Linux 2.6, the value is expressed in clock ticks // (divide by sysconf(_SC_CLK_TCK)). - return parts[22-1], nil // starts at 1 + // + // NOTE: + // pos 2 could contain space and is inside `(` and `)`: + // (2) comm %s + // The filename of the executable, in parentheses. + // This is visible whether or not the executable is + // swapped out. + // + // the following is an example: + // 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0 + + // get parts after last `)`: + s := strings.Split(stat, ")") + parts := strings.Split(strings.TrimSpace(s[len(s)-1]), " ") + return parts[22-3], nil // starts at 3 (after the filename pos `2`) }