Jelajahi Sumber

Run privileged containers when userns are specified

Following #19995 and #17409, this PR enables skipping userns re-mapping
when creating a container (or when executing a command), thus allowing
privileged containers to run side by side with userns-remapped
containers.

The feature is enabled by specifying ```--userns=host```, which will not
remap the user if userns are applied. If this flag is not specified,
the existing behavior (which blocks specific privileged operations)
remains.

Signed-off-by: Liron Levin <liron@twistlock.com>
Liron Levin 9 tahun lalu
induk
melakukan
6993e891d1

+ 7 - 4
daemon/container_operations_unix.go

@@ -218,11 +218,14 @@ func (daemon *Daemon) populateCommand(c *container.Container, env []string) erro
 	processConfig.Env = env
 	processConfig.Env = env
 
 
 	remappedRoot := &execdriver.User{}
 	remappedRoot := &execdriver.User{}
-	rootUID, rootGID := daemon.GetRemappedUIDGID()
-	if rootUID != 0 {
-		remappedRoot.UID = rootUID
-		remappedRoot.GID = rootGID
+	if c.HostConfig.UsernsMode.IsPrivate() {
+		rootUID, rootGID := daemon.GetRemappedUIDGID()
+		if rootUID != 0 {
+			remappedRoot.UID = rootUID
+			remappedRoot.GID = rootGID
+		}
 	}
 	}
+
 	uidMap, gidMap := daemon.GetUIDGIDMaps()
 	uidMap, gidMap := daemon.GetUIDGIDMaps()
 
 
 	if !daemon.seccompEnabled {
 	if !daemon.seccompEnabled {

+ 1 - 1
daemon/daemon_unix.go

@@ -429,7 +429,7 @@ func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes.
 		logrus.Warnf("IPv4 forwarding is disabled. Networking will not work")
 		logrus.Warnf("IPv4 forwarding is disabled. Networking will not work")
 	}
 	}
 	// check for various conflicting options with user namespaces
 	// check for various conflicting options with user namespaces
-	if daemon.configStore.RemappedRoot != "" {
+	if daemon.configStore.RemappedRoot != "" && hostConfig.UsernsMode.IsPrivate() {
 		if hostConfig.Privileged {
 		if hostConfig.Privileged {
 			return warnings, fmt.Errorf("Privileged mode is incompatible with user namespaces")
 			return warnings, fmt.Errorf("Privileged mode is incompatible with user namespaces")
 		}
 		}

+ 1 - 0
docs/reference/api/docker_remote_api.md

@@ -125,6 +125,7 @@ This section lists each version from latest to oldest.  Each listing includes a
 * `GET /info` now returns `KernelMemory` field, showing if "kernel memory limit" is supported.
 * `GET /info` now returns `KernelMemory` field, showing if "kernel memory limit" is supported.
 * `POST /containers/create` now takes `PidsLimit` field, if the kernel is >= 4.3 and the pids cgroup is supported.
 * `POST /containers/create` now takes `PidsLimit` field, if the kernel is >= 4.3 and the pids cgroup is supported.
 * `GET /containers/(id or name)/stats` now returns `pids_stats`, if the kernel is >= 4.3 and the pids cgroup is supported.
 * `GET /containers/(id or name)/stats` now returns `pids_stats`, if the kernel is >= 4.3 and the pids cgroup is supported.
+* `POST /containers/create` now allows you to override user namespace remapping and use privileged options for the container.
 * `POST /auth` now returns an `IdentityToken` when supported by a registry.
 * `POST /auth` now returns an `IdentityToken` when supported by a registry.
 
 
 ### v1.22 API changes
 ### v1.22 API changes

+ 2 - 0
docs/reference/api/docker_remote_api_v1.23.md

@@ -431,6 +431,8 @@ Json Parameters:
             The default is not to restart. (optional)
             The default is not to restart. (optional)
             An ever increasing delay (double the previous delay, starting at 100mS)
             An ever increasing delay (double the previous delay, starting at 100mS)
             is added before each restart to prevent flooding the server.
             is added before each restart to prevent flooding the server.
+    -   **UsernsMode**  - Sets the user namespace mode for the container when the user namespace remapping option is enabled.
+           Supported values are: `host`.
     -   **NetworkMode** - Sets the networking mode for the container. Supported
     -   **NetworkMode** - Sets the networking mode for the container. Supported
           standard values are: `bridge`, `host`, `none`, and `container:<name|id>`. Any other value is taken
           standard values are: `bridge`, `host`, `none`, and `container:<name|id>`. Any other value is taken
           as a custom network's name to which this container should connect to.
           as a custom network's name to which this container should connect to.

+ 3 - 0
docs/reference/commandline/create.md

@@ -83,6 +83,9 @@ Creates a new container.
       --shm-size=[]                 Size of `/dev/shm`. The format is `<number><unit>`. `number` must be greater than `0`.  Unit is optional and can be `b` (bytes), `k` (kilobytes), `m` (megabytes), or `g` (gigabytes). If you omit the unit, the system uses bytes. If you omit the size entirely, the system uses `64m`.
       --shm-size=[]                 Size of `/dev/shm`. The format is `<number><unit>`. `number` must be greater than `0`.  Unit is optional and can be `b` (bytes), `k` (kilobytes), `m` (megabytes), or `g` (gigabytes). If you omit the unit, the system uses bytes. If you omit the size entirely, the system uses `64m`.
       -t, --tty                     Allocate a pseudo-TTY
       -t, --tty                     Allocate a pseudo-TTY
       -u, --user=""                 Username or UID
       -u, --user=""                 Username or UID
+      --userns=""                   Container user namespace
+                                    'host': Use the Docker host user namespace
+                                    '': Use the Docker daemon user namespace specified by `--userns-remap` option.
       --ulimit=[]                   Ulimit options
       --ulimit=[]                   Ulimit options
       --uts=""                      UTS namespace to use
       --uts=""                      UTS namespace to use
       -v, --volume=[host-src:]container-dest[:<options>]
       -v, --volume=[host-src:]container-dest[:<options>]

+ 10 - 0
docs/reference/commandline/daemon.md

@@ -750,6 +750,16 @@ following algorithm to create the mapping ranges:
 2. Map segments will be created from each range in increasing value with a length matching the length of each segment. Therefore the range segment with the lowest numeric starting value will be equal to the remapped root, and continue up through host uid/gid equal to the range segment length. As an example, if the lowest segment starts at ID 1000 and has a length of 100, then a map of 1000 -> 0 (the remapped root) up through 1100 -> 100 will be created from this segment. If the next segment starts at ID 10000, then the next map will start with mapping 10000 -> 101 up to the length of this second segment. This will continue until no more segments are found in the subordinate files for this user.
 2. Map segments will be created from each range in increasing value with a length matching the length of each segment. Therefore the range segment with the lowest numeric starting value will be equal to the remapped root, and continue up through host uid/gid equal to the range segment length. As an example, if the lowest segment starts at ID 1000 and has a length of 100, then a map of 1000 -> 0 (the remapped root) up through 1100 -> 100 will be created from this segment. If the next segment starts at ID 10000, then the next map will start with mapping 10000 -> 101 up to the length of this second segment. This will continue until no more segments are found in the subordinate files for this user.
 3. If more than five range segments exist for a single user, only the first five will be utilized, matching the kernel's limitation of only five entries in `/proc/self/uid_map` and `proc/self/gid_map`.
 3. If more than five range segments exist for a single user, only the first five will be utilized, matching the kernel's limitation of only five entries in `/proc/self/uid_map` and `proc/self/gid_map`.
 
 
+### Disable user namespace for a container
+
+If you enable user namespaces on the daemon, all containers are started
+with user namespaces enabled. In some situations you might want to disable
+this feature for a container, for example, to start a privileged container (see
+[user namespace known restrictions](#user-namespace-known-restrictions)).
+To enable those advanced features for a specific container use `--userns=host`
+in the `run/exec/create` command.
+This option will completely disable user namespace mapping for the container's user.
+
 ### User namespace known restrictions
 ### User namespace known restrictions
 
 
 The following standard Docker features are currently incompatible when
 The following standard Docker features are currently incompatible when

+ 3 - 0
docs/reference/commandline/run.md

@@ -85,6 +85,9 @@ parent = "smn_cli"
       --stop-signal="SIGTERM"       Signal to stop a container
       --stop-signal="SIGTERM"       Signal to stop a container
       -t, --tty                     Allocate a pseudo-TTY
       -t, --tty                     Allocate a pseudo-TTY
       -u, --user=""                 Username or UID (format: <name|uid>[:<group|gid>])
       -u, --user=""                 Username or UID (format: <name|uid>[:<group|gid>])
+      --userns=""                   Container user namespace
+                                    'host': Use the Docker host user namespace
+                                    '': Use the Docker daemon user namespace specified by `--userns-remap` option.
       --ulimit=[]                   Ulimit options
       --ulimit=[]                   Ulimit options
       --uts=""                      UTS namespace to use
       --uts=""                      UTS namespace to use
       -v, --volume=[host-src:]container-dest[:<options>]
       -v, --volume=[host-src:]container-dest[:<options>]

+ 22 - 1
integration-cli/docker_cli_userns_test.go

@@ -37,11 +37,13 @@ func (s *DockerDaemonSuite) TestDaemonUserNamespaceRootSetting(c *check.C) {
 	gid, err := strconv.Atoi(uidgid[1])
 	gid, err := strconv.Atoi(uidgid[1])
 	c.Assert(err, checker.IsNil, check.Commentf("Can't parse gid"))
 	c.Assert(err, checker.IsNil, check.Commentf("Can't parse gid"))
 
 
-	//writeable by the remapped root UID/GID pair
+	// writable by the remapped root UID/GID pair
 	c.Assert(os.Chown(tmpDir, uid, gid), checker.IsNil)
 	c.Assert(os.Chown(tmpDir, uid, gid), checker.IsNil)
 
 
 	out, err := s.d.Cmd("run", "-d", "--name", "userns", "-v", tmpDir+":/goofy", "busybox", "sh", "-c", "touch /goofy/testfile; top")
 	out, err := s.d.Cmd("run", "-d", "--name", "userns", "-v", tmpDir+":/goofy", "busybox", "sh", "-c", "touch /goofy/testfile; top")
 	c.Assert(err, checker.IsNil, check.Commentf("Output: %s", out))
 	c.Assert(err, checker.IsNil, check.Commentf("Output: %s", out))
+	user := s.findUser(c, "userns")
+	c.Assert(uidgid[0], checker.Equals, user)
 
 
 	pid, err := s.d.Cmd("inspect", "--format='{{.State.Pid}}'", "userns")
 	pid, err := s.d.Cmd("inspect", "--format='{{.State.Pid}}'", "userns")
 	c.Assert(err, checker.IsNil, check.Commentf("Could not inspect running container: out: %q", pid))
 	c.Assert(err, checker.IsNil, check.Commentf("Could not inspect running container: out: %q", pid))
@@ -62,4 +64,23 @@ func (s *DockerDaemonSuite) TestDaemonUserNamespaceRootSetting(c *check.C) {
 	c.Assert(err, checker.IsNil)
 	c.Assert(err, checker.IsNil)
 	c.Assert(stat.UID(), checker.Equals, uint32(uid), check.Commentf("Touched file not owned by remapped root UID"))
 	c.Assert(stat.UID(), checker.Equals, uint32(uid), check.Commentf("Touched file not owned by remapped root UID"))
 	c.Assert(stat.GID(), checker.Equals, uint32(gid), check.Commentf("Touched file not owned by remapped root GID"))
 	c.Assert(stat.GID(), checker.Equals, uint32(gid), check.Commentf("Touched file not owned by remapped root GID"))
+
+	// use host usernamespace
+	out, err = s.d.Cmd("run", "-d", "--name", "userns_skip", "--userns", "host", "busybox", "sh", "-c", "touch /goofy/testfile; top")
+	c.Assert(err, checker.IsNil, check.Commentf("Output: %s", out))
+	user = s.findUser(c, "userns_skip")
+	// userns are skipped, user is root
+	c.Assert(user, checker.Equals, "root")
+}
+
+// findUser finds the uid or name of the user of the first process that runs in a container
+func (s *DockerDaemonSuite) findUser(c *check.C, container string) string {
+	out, err := s.d.Cmd("top", container)
+	c.Assert(err, checker.IsNil, check.Commentf("Output: %s", out))
+	rows := strings.Split(out, "\n")
+	if len(rows) < 2 {
+		// No process rows founds
+		c.FailNow()
+	}
+	return strings.Fields(rows[1])[0]
 }
 }

+ 5 - 0
man/docker-create.1.md

@@ -58,6 +58,7 @@ docker-create - Create a new container
 [**-P**|**--publish-all**]
 [**-P**|**--publish-all**]
 [**-p**|**--publish**[=*[]*]]
 [**-p**|**--publish**[=*[]*]]
 [**--pid**[=*[]*]]
 [**--pid**[=*[]*]]
+[**--userns**[=*[]*]]
 [**--pids-limit**[=*PIDS_LIMIT*]]
 [**--pids-limit**[=*PIDS_LIMIT*]]
 [**--privileged**]
 [**--privileged**]
 [**--read-only**]
 [**--read-only**]
@@ -291,6 +292,10 @@ unit, `b` is used. Set LIMIT to `-1` to enable unlimited swap.
      **host**: use the host's PID namespace inside the container.
      **host**: use the host's PID namespace inside the container.
      Note: the host mode gives the container full access to local PID and is therefore considered insecure.
      Note: the host mode gives the container full access to local PID and is therefore considered insecure.
 
 
+**--userns**=""
+   Set the usernamespace mode for the container when `userns-remap` option is enabled.
+     **host**: use the host usernamespace and enable all privileged options (e.g., `pid=host` or `--privileged`).
+
 **--pids-limit**=""
 **--pids-limit**=""
    Tune the container's pids limit. Set `-1` to have unlimited pids for the container.
    Tune the container's pids limit. Set `-1` to have unlimited pids for the container.
 
 

+ 5 - 0
man/docker-run.1.md

@@ -60,6 +60,7 @@ docker-run - Run a command in a new container
 [**-P**|**--publish-all**]
 [**-P**|**--publish-all**]
 [**-p**|**--publish**[=*[]*]]
 [**-p**|**--publish**[=*[]*]]
 [**--pid**[=*[]*]]
 [**--pid**[=*[]*]]
+[**--userns**[=*[]*]]
 [**--pids-limit**[=*PIDS_LIMIT*]]
 [**--pids-limit**[=*PIDS_LIMIT*]]
 [**--privileged**]
 [**--privileged**]
 [**--read-only**]
 [**--read-only**]
@@ -421,6 +422,10 @@ Use `docker port` to see the actual mapping: `docker port CONTAINER $CONTAINERPO
      **host**: use the host's PID namespace inside the container.
      **host**: use the host's PID namespace inside the container.
      Note: the host mode gives the container full access to local PID and is therefore considered insecure.
      Note: the host mode gives the container full access to local PID and is therefore considered insecure.
 
 
+**--userns**=""
+   Set the usernamespace mode for the container when `userns-remap` option is enabled.
+     **host**: use the host usernamespace and enable all privileged options (e.g., `pid=host` or `--privileged`).
+
 **--pids-limit**=""
 **--pids-limit**=""
    Tune the container's pids limit. Set `-1` to have unlimited pids for the container.
    Tune the container's pids limit. Set `-1` to have unlimited pids for the container.
 
 

+ 21 - 0
runconfig/hostconfig_test.go

@@ -121,6 +121,27 @@ func TestUTSModeTest(t *testing.T) {
 	}
 	}
 }
 }
 
 
+func TestUsernsModeTest(t *testing.T) {
+	usrensMode := map[container.UsernsMode][]bool{
+		// private, host, valid
+		"":                {true, false, true},
+		"something:weird": {true, false, false},
+		"host":            {false, true, true},
+		"host:name":       {true, false, true},
+	}
+	for usernsMode, state := range usrensMode {
+		if usernsMode.IsPrivate() != state[0] {
+			t.Fatalf("UsernsMode.IsPrivate for %v should have been %v but was %v", usernsMode, state[0], usernsMode.IsPrivate())
+		}
+		if usernsMode.IsHost() != state[1] {
+			t.Fatalf("UsernsMode.IsHost for %v should have been %v but was %v", usernsMode, state[1], usernsMode.IsHost())
+		}
+		if usernsMode.Valid() != state[2] {
+			t.Fatalf("UsernsMode.Valid for %v should have been %v but was %v", usernsMode, state[2], usernsMode.Valid())
+		}
+	}
+}
+
 func TestPidModeTest(t *testing.T) {
 func TestPidModeTest(t *testing.T) {
 	pidModes := map[container.PidMode][]bool{
 	pidModes := map[container.PidMode][]bool{
 		// private, host, valid
 		// private, host, valid

+ 7 - 0
runconfig/opts/parse.go

@@ -59,6 +59,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		flPrivileged        = cmd.Bool([]string{"-privileged"}, false, "Give extended privileges to this container")
 		flPrivileged        = cmd.Bool([]string{"-privileged"}, false, "Give extended privileges to this container")
 		flPidMode           = cmd.String([]string{"-pid"}, "", "PID namespace to use")
 		flPidMode           = cmd.String([]string{"-pid"}, "", "PID namespace to use")
 		flUTSMode           = cmd.String([]string{"-uts"}, "", "UTS namespace to use")
 		flUTSMode           = cmd.String([]string{"-uts"}, "", "UTS namespace to use")
+		flUsernsMode        = cmd.String([]string{"-userns"}, "", "User namespace to use")
 		flPublishAll        = cmd.Bool([]string{"P", "-publish-all"}, false, "Publish all exposed ports to random ports")
 		flPublishAll        = cmd.Bool([]string{"P", "-publish-all"}, false, "Publish all exposed ports to random ports")
 		flStdin             = cmd.Bool([]string{"i", "-interactive"}, false, "Keep STDIN open even if not attached")
 		flStdin             = cmd.Bool([]string{"i", "-interactive"}, false, "Keep STDIN open even if not attached")
 		flTty               = cmd.Bool([]string{"t", "-tty"}, false, "Allocate a pseudo-TTY")
 		flTty               = cmd.Bool([]string{"t", "-tty"}, false, "Allocate a pseudo-TTY")
@@ -316,6 +317,11 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		return nil, nil, nil, cmd, fmt.Errorf("--uts: invalid UTS mode")
 		return nil, nil, nil, cmd, fmt.Errorf("--uts: invalid UTS mode")
 	}
 	}
 
 
+	usernsMode := container.UsernsMode(*flUsernsMode)
+	if !usernsMode.Valid() {
+		return nil, nil, nil, cmd, fmt.Errorf("--userns: invalid USER mode")
+	}
+
 	restartPolicy, err := ParseRestartPolicy(*flRestartPolicy)
 	restartPolicy, err := ParseRestartPolicy(*flRestartPolicy)
 	if err != nil {
 	if err != nil {
 		return nil, nil, nil, cmd, err
 		return nil, nil, nil, cmd, err
@@ -404,6 +410,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		IpcMode:        ipcMode,
 		IpcMode:        ipcMode,
 		PidMode:        pidMode,
 		PidMode:        pidMode,
 		UTSMode:        utsMode,
 		UTSMode:        utsMode,
+		UsernsMode:     usernsMode,
 		CapAdd:         strslice.StrSlice(flCapAdd.GetAll()),
 		CapAdd:         strslice.StrSlice(flCapAdd.GetAll()),
 		CapDrop:        strslice.StrSlice(flCapDrop.GetAll()),
 		CapDrop:        strslice.StrSlice(flCapDrop.GetAll()),
 		GroupAdd:       flGroupAdd.GetAll(),
 		GroupAdd:       flGroupAdd.GetAll(),