DNM: PoC Always set a userns

This makes it so all containers, unless specifically requesting the host
namespace with `--privileged` or `--userns=host`, are placed into a new
user namespace. UIDs and GIDs are mapped 1:1, so uid 0 in the userns is
uid 0 on the host.

The main thing this buys us is that the containerized process no longer
has privileges in the root userns. That simultaneously makes it lose any
root namespace privileges and allows us to safely grant (barring bugs
and oversights, of course) extra privileges to the container.

Specifically what's nice about this is now a containerized process can
do bind and fuse mounts, and possibly overlay depending on which kernel
you are running (ubuntu kernel allows unprivileged overlay mounts).

This *should* actually be more secure than allowing the container to run
in the host namespace. Even if we don't grant CAP_SYS_ADMIN in the
user namespace by default, I think this would be a good change to make,
because the container process should no longer have any capabilities in
the root namespace.

Signed-off-by: Brian Goff <cpuguy83@gmail.com>
This commit is contained in:
Brian Goff 2023-10-23 17:35:13 +00:00
parent 452ca90fe5
commit fb64cc3cce
3 changed files with 75 additions and 5 deletions

View file

@ -255,11 +255,21 @@ func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
if c.HostConfig.UsernsMode.IsPrivate() {
if uidMap := daemon.idMapping.UIDMaps; uidMap != nil {
userNS = true
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
})
s.Linux.UIDMappings = specMapping(uidMap)
s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
} else {
if !c.HostConfig.Privileged {
setNamespace(s, specs.LinuxNamespace{
Type: specs.UserNamespace,
})
userNS = true
s.Linux.UIDMappings = []specs.LinuxIDMapping{
{Size: 65536},
}
s.Linux.GIDMappings = []specs.LinuxIDMapping{
{Size: 65536},
}
}
}
}
// network
@ -845,10 +855,11 @@ func withCommonOptions(daemon *Daemon, daemonCfg *dconfig.Config, c *container.C
// joining an existing namespace, only if we create a new net namespace.
if c.HostConfig.NetworkMode.IsPrivate() {
// We cannot set up ping socket support in a user namespace
userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
// userNS := daemonCfg.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
userNS := !c.HostConfig.Privileged
if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
// allow unprivileged ICMP echo sockets without CAP_NET_RAW
s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
// s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
}
// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
if sysctlExists("net.ipv4.ip_unprivileged_port_start") {

View file

@ -0,0 +1,58 @@
package container // import "github.com/docker/docker/integration/container"
import (
"strings"
"testing"
"github.com/docker/docker/integration/internal/container"
"gotest.tools/v3/assert"
"gotest.tools/v3/icmd"
)
// TestDefaultUsernsPrivs checks which user namespace containers end up in by
// reading the /proc/self/ns/user link from inside them:
//
//   - two privileged containers must report the same user namespace, and (when
//     the daemon is local to the test process) that namespace must match the
//     one the test process itself reports;
//   - a container started without --privileged must report a different user
//     namespace;
//   - a container started without --privileged must be able to perform a bind
//     mount.
func TestDefaultUsernsPrivs(t *testing.T) {
	ctx := setupTest(t)
	apiClient := testEnv.APIClient()

	// Make sure that 2 privileged containers have the same user namespace
	hostNs1Res := container.RunAttach(ctx, t, apiClient, container.WithPrivileged(true), container.WithCmd("readlink", "/proc/self/ns/user"))
	assert.Equal(t, hostNs1Res.ExitCode, 0)
	hostns1 := strings.TrimSpace(hostNs1Res.Stdout.String())
	assert.Assert(t, hostns1 != "", "user namespace should not be empty")

	hostNs2Res := container.RunAttach(ctx, t, apiClient, container.WithPrivileged(true), container.WithCmd("readlink", "/proc/self/ns/user"))
	assert.Equal(t, hostNs2Res.ExitCode, 0)
	// Read the output of the *second* container; reading the first result
	// again would make the equality check below compare a value with itself.
	hostns2 := strings.TrimSpace(hostNs2Res.Stdout.String())
	assert.Assert(t, hostns2 != "", "user namespace should not be empty")
	assert.Equal(t, hostns1, hostns2, "privileged user namespaces should be the same")

	if testEnv.IsLocalDaemon() {
		// Make sure the privileged container has the same user namespace as the host
		res := icmd.RunCommand("readlink", "/proc/self/ns/user")
		res.Assert(t, icmd.Success)
		out := strings.TrimSpace(res.Combined())
		assert.NilError(t, res.Error, out)
		assert.Equal(t, hostns1, out, "privileged user namespace should be the same as the host")
	}

	// A container without --privileged must not share the namespace reported
	// by the privileged containers.
	res := container.RunAttach(ctx, t, apiClient, container.WithCmd("readlink", "/proc/self/ns/user"))
	assert.Equal(t, res.ExitCode, 0, res.Stderr)
	cUserns := strings.TrimSpace(res.Stdout.String())
	assert.Assert(t, cUserns != "", "user namespace should not be empty")
	assert.Assert(t, cUserns != hostns1, "user namespace should not be the same as the host")

	// Shell script that bind-mounts one directory onto another and verifies a
	// file created in the source is visible through the mount point.
	cmd := `
set -e
mkdir /test1
mkdir /test2
touch /test1/hello
mount --bind /test1 /test2
[ -f /test2/hello ]
`
	// TODO: For some reason this is failing in the test env but works just fine when running manually.
	res = container.RunAttach(ctx, t, apiClient, container.WithCmd("sh", "-c", cmd))
	assert.Equal(t, res.ExitCode, 0, res.Stderr)
}

View file

@ -3,6 +3,7 @@ package caps // import "github.com/docker/docker/oci/caps"
// DefaultCapabilities returns a Linux kernel default capabilities
func DefaultCapabilities() []string {
return []string{
"CAP_SYS_ADMIN",
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",