DNM: PoC Always set a userns

With this change, all containers are placed into a new user namespace unless they specifically request the host namespace with `--privileged` or `--userns=host`. UIDs and GIDs are mapped 1:1, so uid 0 in the userns is uid 0 on the host. The main thing this buys us is that the containerized process no longer has privileges in the root userns, which simultaneously strips it of any root-namespace privileges and allows us to safely grant (barring bugs and oversights, of course) extra privileges to the container. Specifically, what's nice about this is that a containerized process can now do bind and FUSE mounts, and possibly overlay mounts depending on which kernel you are running (the Ubuntu kernel allows unprivileged overlay mounts). This *should* actually be more secure than allowing the container to run in the host namespace, and even if we don't grant CAP_SYS_ADMIN in the user namespace by default, I think this would be a good change to make because the container process should no longer have any capabilities in the root namespace. Signed-off-by: Brian Goff <cpuguy83@gmail.com>
2023-10-23 17:35:13 +00:00 · 2023-10-23 17:35:13 +00:00 · fb64cc3cce
commit fb64cc3cce
parent 452ca90fe5
3 changed files with 75 additions and 5 deletions
--- a/daemon/oci_linux.go
+++ b/daemon/oci_linux.go
@ -255,11 +255,21 @@ func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
 		if c.HostConfig.UsernsMode.IsPrivate() {
 			if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
 				userNS = true
-				setNamespace(s, specs.LinuxNamespace{
-					Type: specs.UserNamespace,
-				})
 				s.Linux.UIDMappings = specMapping(uidMap)
 				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
+			} else {
+				if !c.HostConfig.Privileged {
+					setNamespace(s, specs.LinuxNamespace{
+						Type: specs.UserNamespace,
+					})
+					userNS = true
+					s.Linux.UIDMappings = []specs.LinuxIDMapping{
+						{Size: 65536},
+					}
+					s.Linux.GIDMappings = []specs.LinuxIDMapping{
+						{Size: 65536},
+					}
+				}
 			}
 		}
 		// network
@ -845,10 +855,11 @@ func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.C
 		// joining an existing namespace, only if we create a new net namespace.
 		if c.HostConfig.NetworkMode.IsPrivate() {
 			// We cannot set up ping socket support in a user namespace
-			userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
+			// userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
+			userNS := !c.HostConfig.Privileged
 			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
 				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
-				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
+				// s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
 			}
 			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
 			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
--- a/integration/container/default_userns_privs_test.go
+++ b/integration/container/default_userns_privs_test.go
@ -0,0 +1,58 @@
+package container // import "github.com/docker/docker/integration/container"
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/docker/docker/integration/internal/container"
+	"gotest.tools/v3/assert"
+	"gotest.tools/v3/icmd"
+)
+
+// TestDefaultUsernsPrivs checks the user namespace a container ends up in:
+// privileged containers must share one user namespace (the daemon host's,
+// when the daemon is local), while a default container must get a different
+// one and be able to perform a bind mount inside it.
+func TestDefaultUsernsPrivs(t *testing.T) {
+	ctx := setupTest(t)
+
+	apiClient := testEnv.APIClient()
+
+	// Make sure that 2 privileged containers have the same user namespace
+	hostNs1Res := container.RunAttach(ctx, t, apiClient, container.WithPrivileged(true), container.WithCmd("readlink", "/proc/self/ns/user"))
+	assert.Equal(t, hostNs1Res.ExitCode, 0, hostNs1Res.Stderr)
+	hostns1 := strings.TrimSpace(hostNs1Res.Stdout.String())
+	assert.Assert(t, hostns1 != "", "user namespace should not be empty")
+
+	hostNs2Res := container.RunAttach(ctx, t, apiClient, container.WithPrivileged(true), container.WithCmd("readlink", "/proc/self/ns/user"))
+	assert.Equal(t, hostNs2Res.ExitCode, 0, hostNs2Res.Stderr)
+	// Read the SECOND container's output; reading the first result again
+	// would make the equality check below trivially true.
+	hostns2 := strings.TrimSpace(hostNs2Res.Stdout.String())
+	assert.Assert(t, hostns2 != "", "user namespace should not be empty")
+
+	assert.Equal(t, hostns1, hostns2, "privileged user namespaces should be the same")
+
+	if testEnv.IsLocalDaemon() {
+		// Make sure the privileged container has the same user namespace as the host.
+		// Only meaningful for a local daemon, where this test process runs on the daemon's host.
+		res := icmd.RunCommand("readlink", "/proc/self/ns/user")
+		res.Assert(t, icmd.Success)
+
+		out := strings.TrimSpace(res.Combined())
+		assert.NilError(t, res.Error, out)
+		assert.Equal(t, hostns1, out, "privileged user namespace should be the same as the host")
+	}
+
+	// A default (unprivileged) container must land in a different user namespace.
+	res := container.RunAttach(ctx, t, apiClient, container.WithCmd("readlink", "/proc/self/ns/user"))
+	assert.Equal(t, res.ExitCode, 0, res.Stderr)
+	cUserns := strings.TrimSpace(res.Stdout.String())
+	assert.Assert(t, cUserns != "", "user namespace should not be empty")
+	assert.Assert(t, cUserns != hostns1, "user namespace should not be the same as the host")
+
+	// Bind-mount one directory over another and verify the file shows up
+	// through the mount; `set -e` makes any failing step fail the script.
+	cmd := `
+set -e
+mkdir /test1
+mkdir /test2
+touch /test1/hello
+mount --bind /test1 /test2
+[ -f /test2/hello ]
+`
+
+	// TODO: For some reason this is failing in the test env but works just fine when running manually.
+	res = container.RunAttach(ctx, t, apiClient, container.WithCmd("sh", "-c", cmd))
+	assert.Equal(t, res.ExitCode, 0, res.Stderr)
+}
--- a/oci/caps/defaults.go
+++ b/oci/caps/defaults.go
@ -3,6 +3,7 @@ package caps // import "github.com/docker/docker/oci/caps"
 // DefaultCapabilities returns a Linux kernel default capabilities
 func DefaultCapabilities() []string {
 	return []string{
+		"CAP_SYS_ADMIN",
 		"CAP_CHOWN",
 		"CAP_DAC_OVERRIDE",
 		"CAP_FSETID",