Browse Source

DNM: PoC Always set a userns

This makes it so all containers, unless specifically requesting the host
namespace with `--privileged` or `--userns=host`, are placed into a new
user namespace. UIDs and GIDs are mapped 1:1, so uid 0 in the userns is
uid 0 on the host.

The main thing this buys us is the containerized process no longer has
privileges in the root userns which simultaneously makes it lose any root
namespace privileges and also allows us to safely grant (barring bugs
and oversights, of course) extra privileges to the container.

Specifically what's nice about this is now a containerized process can
do bind and fuse mounts, and possibly overlay depending on which kernel
you are running (ubuntu kernel allows unprivileged overlay mounts).

This *should* actually be more secure than allowing the container to run
in the host namespace, and even if we don't grant CAP_SYS_ADMIN in the
user namespace by default, I think this would be a good change to make
because the container process should no longer have any capabilities in
the root namespace.

Signed-off-by: Brian Goff <cpuguy83@gmail.com>
Brian Goff 1 year ago
parent
commit
fb64cc3cce
3 changed files with 75 additions and 5 deletions
  1. 16 5
      daemon/oci_linux.go
  2. 58 0
      integration/container/default_userns_privs_test.go
  3. 1 0
      oci/caps/defaults.go

+ 16 - 5
daemon/oci_linux.go

@@ -255,11 +255,21 @@ func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
 		if c.HostConfig.UsernsMode.IsPrivate() {
 			if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
 				userNS = true
-				setNamespace(s, specs.LinuxNamespace{
-					Type: specs.UserNamespace,
-				})
 				s.Linux.UIDMappings = specMapping(uidMap)
 				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
+			} else {
+				if !c.HostConfig.Privileged {
+					setNamespace(s, specs.LinuxNamespace{
+						Type: specs.UserNamespace,
+					})
+					userNS = true
+					s.Linux.UIDMappings = []specs.LinuxIDMapping{
+						{Size: 65536},
+					}
+					s.Linux.GIDMappings = []specs.LinuxIDMapping{
+						{Size: 65536},
+					}
+				}
 			}
 		}
 		// network
@@ -845,10 +855,11 @@ func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.C
 		// joining an existing namespace, only if we create a new net namespace.
 		if c.HostConfig.NetworkMode.IsPrivate() {
 			// We cannot set up ping socket support in a user namespace
-			userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
+			// userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
+			userNS := !c.HostConfig.Privileged
 			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
 				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
-				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
+				// s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
 			}
 			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
 			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {

+ 58 - 0
integration/container/default_userns_privs_test.go

@@ -0,0 +1,58 @@
+package container // import "github.com/docker/docker/integration/container"
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/docker/docker/integration/internal/container"
+	"gotest.tools/v3/assert"
+	"gotest.tools/v3/icmd"
+)
+
+// TestDefaultUsernsPrivs checks the user namespace a container ends up in:
+//
+//   - two privileged containers report the same user namespace,
+//   - when the daemon is local, that namespace is also the one the test
+//     process itself runs in,
+//   - a non-privileged container reports a different user namespace, and
+//   - a non-privileged container is able to perform a bind mount.
+//
+// Namespaces are compared by the link target of /proc/self/ns/user.
+func TestDefaultUsernsPrivs(t *testing.T) {
+	ctx := setupTest(t)
+
+	apiClient := testEnv.APIClient()
+
+	// Make sure that 2 privileged containers have the same user namespace
+	hostNs1Res := container.RunAttach(ctx, t, apiClient, container.WithPrivileged(true), container.WithCmd("readlink", "/proc/self/ns/user"))
+	assert.Equal(t, hostNs1Res.ExitCode, 0, hostNs1Res.Stderr)
+	hostns1 := strings.TrimSpace(hostNs1Res.Stdout.String())
+	assert.Assert(t, hostns1 != "", "user namespace should not be empty")
+
+	hostNs2Res := container.RunAttach(ctx, t, apiClient, container.WithPrivileged(true), container.WithCmd("readlink", "/proc/self/ns/user"))
+	assert.Equal(t, hostNs2Res.ExitCode, 0, hostNs2Res.Stderr)
+	// Read the SECOND container's output; reading the first result again
+	// would make the equality check below compare a value with itself.
+	hostns2 := strings.TrimSpace(hostNs2Res.Stdout.String())
+	assert.Assert(t, hostns2 != "", "user namespace should not be empty")
+
+	assert.Equal(t, hostns1, hostns2, "privileged user namespaces should be the same")
+
+	if testEnv.IsLocalDaemon() {
+		// Make sure the privileged container has the same user namespace as the host
+		res := icmd.RunCommand("readlink", "/proc/self/ns/user")
+		res.Assert(t, icmd.Success)
+
+		out := strings.TrimSpace(res.Combined())
+		assert.NilError(t, res.Error, out)
+		assert.Equal(t, hostns1, out, "privileged user namespace should be the same as the host")
+	}
+
+	// A container started without --privileged must not share the namespace
+	// observed by the privileged containers above.
+	res := container.RunAttach(ctx, t, apiClient, container.WithCmd("readlink", "/proc/self/ns/user"))
+	assert.Equal(t, res.ExitCode, 0, res.Stderr)
+	cUserns := strings.TrimSpace(res.Stdout.String())
+	assert.Assert(t, cUserns != "", "user namespace should not be empty")
+	assert.Assert(t, cUserns != hostns1, "user namespace should not be the same as the host")
+
+	// Shell script run inside a non-privileged container: bind-mount one
+	// directory over another and check the file is visible through the mount.
+	cmd := `
+set -e
+mkdir /test1
+mkdir /test2
+touch /test1/hello
+mount --bind /test1 /test2
+[ -f /test2/hello ]
+`
+
+	// TODO: For some reason this is failing in the test env but works just fine when running manually.
+	res = container.RunAttach(ctx, t, apiClient, container.WithCmd("sh", "-c", cmd))
+	assert.Equal(t, res.ExitCode, 0, res.Stderr)
+}

+ 1 - 0
oci/caps/defaults.go

@@ -3,6 +3,7 @@ package caps // import "github.com/docker/docker/oci/caps"
 // DefaultCapabilities returns a Linux kernel default capabilities
 func DefaultCapabilities() []string {
 	return []string{
+		"CAP_SYS_ADMIN",
 		"CAP_CHOWN",
 		"CAP_DAC_OVERRIDE",
 		"CAP_FSETID",