
vendor: github.com/opencontainers/runc v1.0.0-rc95

full diff: https://github.com/opencontainers/runc/compare/v1.0.0-rc92...v1.0.0-rc95

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
Sebastiaan van Stijn 4 years ago
parent
commit
a927fc7831
34 changed files with 1342 additions and 602 deletions
  1. vendor.conf (+1 -1)
  2. vendor/github.com/opencontainers/runc/README.md (+25 -13)
  3. vendor/github.com/opencontainers/runc/go.mod (+16 -14)
  4. vendor/github.com/opencontainers/runc/libcontainer/README.md (+87 -83)
  5. vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go (+23 -13)
  6. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go (+51 -0)
  7. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go (+120 -0)
  8. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go (+122 -0)
  9. vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go (+28 -0)
  10. vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go (+115 -42)
  11. vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go (+41 -59)
  12. vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go (+5 -7)
  13. vendor/github.com/opencontainers/runc/libcontainer/configs/config.go (+15 -10)
  14. vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go (+9 -0)
  15. vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go (+0 -16)
  16. vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go (+0 -5)
  17. vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go (+17 -0)
  18. vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go (+1 -1)
  19. vendor/github.com/opencontainers/runc/libcontainer/devices/device.go (+33 -29)
  20. vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go (+22 -14)
  21. vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c (+51 -29)
  22. vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c (+142 -0)
  23. vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c (+222 -139)
  24. vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c (+1 -0)
  25. vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go (+53 -0)
  26. vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go (+0 -41)
  27. vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go (+16 -4)
  28. vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go (+0 -40)
  29. vendor/github.com/opencontainers/runc/libcontainer/user/user.go (+10 -42)
  30. vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go (+42 -0)
  31. vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go (+5 -0)
  32. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go (+15 -0)
  33. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go (+37 -0)
  34. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go (+17 -0)

+ 1 - 1
vendor.conf

@@ -91,7 +91,7 @@ google.golang.org/grpc                              f495f5b15ae7ccda3b38c53a1bfc
 # the containerd project first, and update both after that is merged.
 # This commit does not need to match RUNC_COMMIT as it is used for helper
 # packages but should be newer or equal.
-github.com/opencontainers/runc                      ff819c7e9184c13b7c2607fe6c30ae19403a7aff # v1.0.0-rc92
+github.com/opencontainers/runc                      b9ee9c6314599f1b4a7f497e1f1f856fe433d3b7 # v1.0.0-rc95
 github.com/opencontainers/runtime-spec              1c3f411f041711bbeecf35ff7e93461ea6789220 # v1.0.3-0.20210326190908-1c3f411f0417
 github.com/opencontainers/image-spec                d60099175f88c47cd379c4738d158884749ed235 # v1.0.1
 github.com/cyphar/filepath-securejoin               a261ee33d7a517f054effbf451841abaafe3e0fd # v0.2.2

+ 25 - 13
vendor/github.com/opencontainers/runc/README.md

@@ -1,9 +1,10 @@
 # runc
 
-[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc)
 [![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
 [![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
 [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
+[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
+[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)
 
 ## Introduction
 
@@ -17,10 +18,6 @@ This means that `runc` 1.0.0 should implement the 1.0 version of the specificati
 
 You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
 
-Currently, the following features are not considered to be production-ready:
-
-* [Support for cgroup v2](./docs/cgroup-v2.md)
-
 ## Security
 
 The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
@@ -64,19 +61,20 @@ sudo make install
 with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`).
 
 To change build tags from the default, set the `BUILDTAGS` variable for make,
-e.g.
+e.g. to disable seccomp:
 
 ```bash
-make BUILDTAGS='seccomp apparmor'
+make BUILDTAGS=""
 ```
 
 | Build Tag | Feature                            | Enabled by default | Dependency |
 |-----------|------------------------------------|--------------------|------------|
 | seccomp   | Syscall filtering                  | yes                | libseccomp |
-| selinux   | selinux process and mount labeling | yes                | <none>     |
-| apparmor  | apparmor profile support           | yes                | <none>     |
-| nokmem    | disable kernel memory accounting   | no                 | <none>     |
 
+The following build tags were used earlier, but are now obsoleted:
+ - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
+ - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
+ - **selinux**  (since runc v1.0.0-rc93 the feature is always enabled)
 
 ### Running the test suite
 
@@ -128,6 +126,14 @@ make verify-dependencies
 
 ## Using runc
 
+Please note that runc is a low level tool not designed with an end user
+in mind. It is mostly employed by other higher level container software.
+
+Therefore, unless there is some specific use case that prevents the use
+of tools like Docker or Podman, it is not recommended to use runc directly.
+
+If you still want to use runc, here's how.
+
 ### Creating an OCI Bundle
 
 In order to use runc you must have your container in the format of an OCI bundle.
@@ -169,7 +175,9 @@ If you used the unmodified `runc spec` template this should give you a `sh` sess
 
 The second way to start a container is using the specs lifecycle operations.
 This gives you more power over how the container is created and managed while it is running.
-This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here.
+This will also launch the container in the background so you will have to edit
+the `config.json` to remove the `terminal` setting for the simple examples
+below (see more details about [runc terminal handling](docs/terminals.md)).
 Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`.
 
 
@@ -292,8 +300,12 @@ PIDFile=/run/mycontainerid.pid
 WantedBy=multi-user.target
 ```
 
-#### cgroup v2
-See [`./docs/cgroup-v2.md`](./docs/cgroup-v2.md).
+## More documentation
+
+* [cgroup v2](./docs/cgroup-v2.md)
+* [Checkpoint and restore](./docs/checkpoint-restore.md)
+* [systemd cgroup driver](./docs/systemd.md)
+* [Terminals and standard IO](./docs/terminals.md)
 
 ## License
 

+ 16 - 14
vendor/github.com/opencontainers/runc/go.mod

@@ -1,26 +1,28 @@
 module github.com/opencontainers/runc
 
-go 1.14
+go 1.13
 
 require (
-	github.com/checkpoint-restore/go-criu/v4 v4.1.0
-	github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775
-	github.com/containerd/console v1.0.0
-	github.com/coreos/go-systemd/v22 v22.1.0
+	github.com/checkpoint-restore/go-criu/v5 v5.0.0
+	github.com/cilium/ebpf v0.5.0
+	github.com/containerd/console v1.0.2
+	github.com/coreos/go-systemd/v22 v22.3.1
 	github.com/cyphar/filepath-securejoin v0.2.2
 	github.com/docker/go-units v0.4.0
-	github.com/godbus/dbus/v5 v5.0.3
-	github.com/golang/protobuf v1.4.2
-	github.com/moby/sys/mountinfo v0.1.3
-	github.com/mrunalp/fileutils v0.0.0-20200520151820-abd8a0e76976
-	github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6
-	github.com/opencontainers/selinux v1.6.0
+	github.com/godbus/dbus/v5 v5.0.4
+	github.com/moby/sys/mountinfo v0.4.1
+	github.com/mrunalp/fileutils v0.5.0
+	github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
+	github.com/opencontainers/selinux v1.8.0
 	github.com/pkg/errors v0.9.1
 	github.com/seccomp/libseccomp-golang v0.9.1
-	github.com/sirupsen/logrus v1.6.0
-	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+	github.com/sirupsen/logrus v1.7.0
+	github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
 	// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
 	github.com/urfave/cli v1.22.1
 	github.com/vishvananda/netlink v1.1.0
-	golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1
+	github.com/willf/bitset v1.1.11
+	golang.org/x/net v0.0.0-20201224014010-6772e930b67b
+	golang.org/x/sys v0.0.0-20210426230700-d19ff857e887
+	google.golang.org/protobuf v1.25.0
 )

+ 87 - 83
vendor/github.com/opencontainers/runc/libcontainer/README.md

@@ -57,90 +57,94 @@ struct describing how the container is to be created. A sample would look simila
 
 ```go
 defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+var devices []*configs.DeviceRule
+for _, device := range specconv.AllowedDevices {
+	devices = append(devices, &device.Rule)
+}
 config := &configs.Config{
 	Rootfs: "/your/path/to/rootfs",
 	Capabilities: &configs.Capabilities{
-                Bounding: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Effective: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Inheritable: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Permitted: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Ambient: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-        },
+		Bounding: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Effective: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Inheritable: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Permitted: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Ambient: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+	},
 	Namespaces: configs.Namespaces([]configs.Namespace{
 		{Type: configs.NEWNS},
 		{Type: configs.NEWUTS},
@@ -155,7 +159,7 @@ config := &configs.Config{
 		Parent: "system",
 		Parent: "system",
 		Resources: &configs.Resources{
 		Resources: &configs.Resources{
 			MemorySwappiness: nil,
 			MemorySwappiness: nil,
-			Devices:          specconv.AllowedDevices,
+			Devices:          devices,
 		},
 		},
 	},
 	},
 	MaskPaths: []string{
 	MaskPaths: []string{
@@ -313,7 +317,7 @@ state, err := container.State()
 #### Checkpoint & Restore
 #### Checkpoint & Restore
 
 
 libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
 libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
-This let's you save the state of a process running inside a container to disk, and then restore
+This lets you save the state of a process running inside a container to disk, and then restore
 that state into a new process, on the same machine or on another machine.
 that state into a new process, on the same machine or on another machine.
 
 
 `criu` version 1.5.2 or higher is required to use checkpoint and restore.
 `criu` version 1.5.2 or higher is required to use checkpoint and restore.

+ 23 - 13
vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go

@@ -7,37 +7,44 @@ import (
 )
 
 type Manager interface {
-	// Applies cgroup configuration to the process with the specified pid
+	// Apply creates a cgroup, if not yet created, and adds a process
+	// with the specified pid into that cgroup.  A special value of -1
+	// can be used to merely create a cgroup.
 	Apply(pid int) error
 
-	// Returns the PIDs inside the cgroup set
+	// GetPids returns the PIDs of all processes inside the cgroup.
 	GetPids() ([]int, error)
 
-	// Returns the PIDs inside the cgroup set & all sub-cgroups
+	// GetAllPids returns the PIDs of all processes inside the cgroup
+	// any all its sub-cgroups.
 	GetAllPids() ([]int, error)
 
-	// Returns statistics for the cgroup set
+	// GetStats returns cgroups statistics.
 	GetStats() (*Stats, error)
 
-	// Toggles the freezer cgroup according with specified state
+	// Freeze sets the freezer cgroup to the specified state.
 	Freeze(state configs.FreezerState) error
 
-	// Destroys the cgroup set
+	// Destroy removes cgroup.
 	Destroy() error
 
 	// Path returns a cgroup path to the specified controller/subsystem.
 	// For cgroupv2, the argument is unused and can be empty.
 	Path(string) string
 
-	// Sets the cgroup as configured.
-	Set(container *configs.Config) error
+	// Set sets cgroup resources parameters/limits. If the argument is nil,
+	// the resources specified during Manager creation (or the previous call
+	// to Set) are used.
+	Set(r *configs.Resources) error
 
-	// GetPaths returns cgroup path(s) to save in a state file in order to restore later.
+	// GetPaths returns cgroup path(s) to save in a state file in order to
+	// restore later.
 	//
-	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
-	// to the cgroup for this subsystem.
+	// For cgroup v1, a key is cgroup subsystem name, and the value is the
+	// path to the cgroup for this subsystem.
 	//
-	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
+	// For cgroup v2 unified hierarchy, a key is "", and the value is the
+	// unified path.
 	GetPaths() map[string]string
 
 	// GetCgroups returns the cgroup data as configured.
@@ -46,6 +53,9 @@ type Manager interface {
 	// GetFreezerState retrieves the current FreezerState of the cgroup.
 	GetFreezerState() (configs.FreezerState, error)
 
-	// Whether the cgroup path exists or not
+	// Exists returns whether the cgroup path exists or not.
 	Exists() bool
+
+	// OOMKillCount reports OOM kill count for the cgroup.
+	OOMKillCount() (uint64, error)
 }
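For code that drives libcontainer's cgroup managers, the practical effect of the interface change above is that `Set` now takes `*configs.Resources` instead of `*configs.Config`, and `OOMKillCount` is new. A minimal caller-side sketch, assuming a manager has already been constructed elsewhere (the function name and logging below are illustrative, not part of this diff):

```go
package example

import (
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/sirupsen/logrus"
)

// applyLimits adapts to the new Manager signature: previously the call
// was mgr.Set(cfg); now the Resources sub-struct is passed directly.
func applyLimits(mgr cgroups.Manager, cfg *configs.Config) error {
	if err := mgr.Set(cfg.Cgroups.Resources); err != nil {
		return err
	}
	// OOMKillCount is new in this version of the interface.
	if n, err := mgr.OOMKillCount(); err == nil && n > 0 {
		logrus.Warnf("cgroup reported %d OOM kills", n)
	}
	return nil
}
```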

+ 51 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go

@@ -0,0 +1,51 @@
+// +build linux
+
+package fscommon
+
+import (
+	"bytes"
+	"os"
+
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+// WriteFile writes data to a cgroup file in dir.
+// It is supposed to be used for cgroup files only.
+func WriteFile(dir, file, data string) error {
+	fd, err := OpenFile(dir, file, unix.O_WRONLY)
+	if err != nil {
+		return err
+	}
+	defer fd.Close()
+	if err := retryingWriteFile(fd, data); err != nil {
+		return errors.Wrapf(err, "failed to write %q", data)
+	}
+	return nil
+}
+
+// ReadFile reads data from a cgroup file in dir.
+// It is supposed to be used for cgroup files only.
+func ReadFile(dir, file string) (string, error) {
+	fd, err := OpenFile(dir, file, unix.O_RDONLY)
+	if err != nil {
+		return "", err
+	}
+	defer fd.Close()
+	var buf bytes.Buffer
+
+	_, err = buf.ReadFrom(fd)
+	return buf.String(), err
+}
+
+func retryingWriteFile(fd *os.File, data string) error {
+	for {
+		_, err := fd.Write([]byte(data))
+		if errors.Is(err, unix.EINTR) {
+			logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
+			continue
+		}
+		return err
+	}
+}
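A brief usage sketch for the new helpers, assuming an existing cgroup v1 directory; the freezer file below is only an example of a cgroup control file, not something this diff touches:

```go
package example

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)

// freezeExample writes and reads a cgroup v1 freezer file through the new
// helpers; dir would normally come from the cgroup manager, e.g.
// "/sys/fs/cgroup/freezer/mycontainer" (illustrative path).
func freezeExample(dir string) error {
	if err := fscommon.WriteFile(dir, "freezer.state", "FROZEN"); err != nil {
		return err
	}
	state, err := fscommon.ReadFile(dir, "freezer.state")
	if err != nil {
		return err
	}
	fmt.Println("freezer.state:", state)
	return nil
}
```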

+ 120 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go

@@ -0,0 +1,120 @@
+package fscommon
+
+import (
+	"os"
+	"strings"
+	"sync"
+
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+const (
+	cgroupfsDir    = "/sys/fs/cgroup"
+	cgroupfsPrefix = cgroupfsDir + "/"
+)
+
+var (
+	// TestMode is set to true by unit tests that need "fake" cgroupfs.
+	TestMode bool
+
+	cgroupFd     int = -1
+	prepOnce     sync.Once
+	prepErr      error
+	resolveFlags uint64
+)
+
+func prepareOpenat2() error {
+	prepOnce.Do(func() {
+		fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
+			Flags: unix.O_DIRECTORY | unix.O_PATH})
+		if err != nil {
+			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
+			if err != unix.ENOSYS {
+				logrus.Warnf("falling back to securejoin: %s", prepErr)
+			} else {
+				logrus.Debug("openat2 not available, falling back to securejoin")
+			}
+			return
+		}
+		var st unix.Statfs_t
+		if err = unix.Fstatfs(fd, &st); err != nil {
+			prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
+			logrus.Warnf("falling back to securejoin: %s", prepErr)
+			return
+		}
+
+		cgroupFd = fd
+
+		resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
+		if st.Type == unix.CGROUP2_SUPER_MAGIC {
+			// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
+			resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
+		}
+
+	})
+
+	return prepErr
+}
+
+// OpenFile opens a cgroup file in a given dir with given flags.
+// It is supposed to be used for cgroup files only.
+func OpenFile(dir, file string, flags int) (*os.File, error) {
+	if dir == "" {
+		return nil, errors.Errorf("no directory specified for %s", file)
+	}
+	mode := os.FileMode(0)
+	if TestMode && flags&os.O_WRONLY != 0 {
+		// "emulate" cgroup fs for unit tests
+		flags |= os.O_TRUNC | os.O_CREATE
+		mode = 0o600
+	}
+	if prepareOpenat2() != nil {
+		return openFallback(dir, file, flags, mode)
+	}
+	reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
+	if len(reldir) == len(dir) { // non-standard path, old system?
+		return openFallback(dir, file, flags, mode)
+	}
+
+	relname := reldir + "/" + file
+	fd, err := unix.Openat2(cgroupFd, relname,
+		&unix.OpenHow{
+			Resolve: resolveFlags,
+			Flags:   uint64(flags) | unix.O_CLOEXEC,
+			Mode:    uint64(mode),
+		})
+	if err != nil {
+		return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err}
+	}
+
+	return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
+}
+
+var errNotCgroupfs = errors.New("not a cgroup file")
+
+// openFallback is used when openat2(2) is not available. It checks the opened
+// file is on cgroupfs, returning an error otherwise.
+func openFallback(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
+	path := dir + "/" + file
+	fd, err := os.OpenFile(path, flags, mode)
+	if err != nil {
+		return nil, err
+	}
+	if TestMode {
+		return fd, nil
+	}
+	// Check this is a cgroupfs file.
+	var st unix.Statfs_t
+	if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
+		_ = fd.Close()
+		return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
+	}
+	if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
+		_ = fd.Close()
+		return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
+	}
+
+	return fd, nil
+}

+ 122 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go

@@ -0,0 +1,122 @@
+// +build linux
+
+package fscommon
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"strconv"
+	"strings"
+)
+
+var (
+	ErrNotValidFormat = errors.New("line is not a valid key value format")
+)
+
+// ParseUint converts a string to an uint64 integer.
+// Negative values are returned at zero as, due to kernel bugs,
+// some of the memory cgroup stats can be negative.
+func ParseUint(s string, base, bitSize int) (uint64, error) {
+	value, err := strconv.ParseUint(s, base, bitSize)
+	if err != nil {
+		intValue, intErr := strconv.ParseInt(s, base, bitSize)
+		// 1. Handle negative values greater than MinInt64 (and)
+		// 2. Handle negative values lesser than MinInt64
+		if intErr == nil && intValue < 0 {
+			return 0, nil
+		} else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
+			return 0, nil
+		}
+
+		return value, err
+	}
+
+	return value, nil
+}
+
+// ParseKeyValue parses a space-separated "name value" kind of cgroup
+// parameter and returns its key as a string, and its value as uint64
+// (ParseUint is used to convert the value). For example,
+// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
+func ParseKeyValue(t string) (string, uint64, error) {
+	parts := strings.SplitN(t, " ", 3)
+	if len(parts) != 2 {
+		return "", 0, fmt.Errorf("line %q is not in key value format", t)
+	}
+
+	value, err := ParseUint(parts[1], 10, 64)
+	if err != nil {
+		return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
+	}
+
+	return parts[0], value, nil
+}
+
+// GetValueByKey reads a key-value pairs from the specified cgroup file,
+// and returns a value of the specified key. ParseUint is used for value
+// conversion.
+func GetValueByKey(path, file, key string) (uint64, error) {
+	content, err := ReadFile(path, file)
+	if err != nil {
+		return 0, err
+	}
+
+	lines := strings.Split(string(content), "\n")
+	for _, line := range lines {
+		arr := strings.Split(line, " ")
+		if len(arr) == 2 && arr[0] == key {
+			return ParseUint(arr[1], 10, 64)
+		}
+	}
+
+	return 0, nil
+}
+
+// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.
+// If the value read is "max", the math.MaxUint64 is returned.
+func GetCgroupParamUint(path, file string) (uint64, error) {
+	contents, err := GetCgroupParamString(path, file)
+	if err != nil {
+		return 0, err
+	}
+	contents = strings.TrimSpace(contents)
+	if contents == "max" {
+		return math.MaxUint64, nil
+	}
+
+	res, err := ParseUint(contents, 10, 64)
+	if err != nil {
+		return res, fmt.Errorf("unable to parse file %q", path+"/"+file)
+	}
+	return res, nil
+}
+
+// GetCgroupParamInt reads a single int64 value from specified cgroup file.
+// If the value read is "max", the math.MaxInt64 is returned.
+func GetCgroupParamInt(path, file string) (int64, error) {
+	contents, err := ReadFile(path, file)
+	if err != nil {
+		return 0, err
+	}
+	contents = strings.TrimSpace(contents)
+	if contents == "max" {
+		return math.MaxInt64, nil
+	}
+
+	res, err := strconv.ParseInt(contents, 10, 64)
+	if err != nil {
+		return res, fmt.Errorf("unable to parse %q as a int from Cgroup file %q", contents, path+"/"+file)
+	}
+	return res, nil
+}
+
+// GetCgroupParamString reads a string from the specified cgroup file.
+func GetCgroupParamString(path, file string) (string, error) {
+	contents, err := ReadFile(path, file)
+	if err != nil {
+		return "", err
+	}
+
+	return strings.TrimSpace(contents), nil
+}
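For orientation, these parsing helpers are typically combined as follows; the memory file names are standard cgroup v1 names and serve purely as an example:

```go
package example

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)

// readMemoryStats shows the intended use of the new helpers; dir is a
// cgroup directory such as "/sys/fs/cgroup/memory/mycontainer".
func readMemoryStats(dir string) error {
	// Single-value file; "max" is mapped to math.MaxUint64 by the helper.
	limit, err := fscommon.GetCgroupParamUint(dir, "memory.limit_in_bytes")
	if err != nil {
		return err
	}
	// Key/value file such as memory.stat, one "name value" pair per line.
	cache, err := fscommon.GetValueByKey(dir, "memory.stat", "cache")
	if err != nil {
		return err
	}
	fmt.Printf("limit=%d cache=%d\n", limit, cache)
	return nil
}
```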

+ 28 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go

@@ -39,6 +39,33 @@ type CpuStats struct {
 	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
 }
 
+type CPUSetStats struct {
+	// List of the physical numbers of the CPUs on which processes
+	// in that cpuset are allowed to execute
+	CPUs []uint16 `json:"cpus,omitempty"`
+	// cpu_exclusive flag
+	CPUExclusive uint64 `json:"cpu_exclusive"`
+	// List of memory nodes on which processes in that cpuset
+	// are allowed to allocate memory
+	Mems []uint16 `json:"mems,omitempty"`
+	// mem_hardwall flag
+	MemHardwall uint64 `json:"mem_hardwall"`
+	// mem_exclusive flag
+	MemExclusive uint64 `json:"mem_exclusive"`
+	// memory_migrate flag
+	MemoryMigrate uint64 `json:"memory_migrate"`
+	// memory_spread page flag
+	MemorySpreadPage uint64 `json:"memory_spread_page"`
+	// memory_spread slab flag
+	MemorySpreadSlab uint64 `json:"memory_spread_slab"`
+	// memory_pressure
+	MemoryPressure uint64 `json:"memory_pressure"`
+	// sched_load balance flag
+	SchedLoadBalance uint64 `json:"sched_load_balance"`
+	// sched_relax_domain_level
+	SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
+}
+
 type MemoryData struct {
 	Usage    uint64 `json:"usage,omitempty"`
 	MaxUsage uint64 `json:"max_usage,omitempty"`
@@ -121,6 +148,7 @@ type HugetlbStats struct {
 
 type Stats struct {
 	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
+	CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
 	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
 	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
 	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`
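The new data surfaces as a field on the existing `Stats` struct, so a consumer that already calls `Manager.GetStats` can read cpuset information without further changes. A hedged sketch:

```go
package example

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups"
)

// printCpusetStats shows where the new field appears; how the Stats value
// is obtained (Manager.GetStats) is unchanged by this diff.
func printCpusetStats(st *cgroups.Stats) {
	fmt.Printf("cpus=%v mems=%v load_balance=%d\n",
		st.CPUSetStats.CPUs, st.CPUSetStats.Mems, st.CPUSetStats.SchedLoadBalance)
}
```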

+ 115 - 42
vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go

@@ -15,7 +15,9 @@ import (
 	"sync"
 	"sync"
 	"time"
 	"time"
 
 
-	units "github.com/docker/go-units"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/userns"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 	"golang.org/x/sys/unix"
 )
 )
 
 
@@ -29,19 +31,19 @@ var (
 	isUnified     bool
 )
 
-// HugePageSizeUnitList is a list of the units used by the linux kernel when
-// naming the HugePage control files.
-// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
-// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed,
-// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
-var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
-
 // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
 func IsCgroup2UnifiedMode() bool {
 	isUnifiedOnce.Do(func() {
 		var st unix.Statfs_t
-		if err := unix.Statfs(unifiedMountpoint, &st); err != nil {
-			panic("cannot statfs cgroup root")
+		err := unix.Statfs(unifiedMountpoint, &st)
+		if err != nil {
+			if os.IsNotExist(err) && userns.RunningInUserNS() {
+				// ignore the "not found" error if running in userns
+				logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
+				isUnified = false
+				return
+			}
+			panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
 		}
 		isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
 	})
@@ -86,11 +88,11 @@ func GetAllSubsystems() ([]string, error) {
 		// - freezer: implemented in kernel 5.2
 		// We assume these are always available, as it is hard to detect availability.
 		pseudo := []string{"devices", "freezer"}
-		data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
+		data, err := fscommon.ReadFile("/sys/fs/cgroup", "cgroup.controllers")
 		if err != nil {
 			return nil, err
 		}
-		subsystems := append(pseudo, strings.Fields(string(data))...)
+		subsystems := append(pseudo, strings.Fields(data)...)
 		return subsystems, nil
 	}
 	f, err := os.Open("/proc/cgroups")
@@ -207,20 +209,66 @@ func EnterPid(cgroupPaths map[string]string, pid int) error {
 	return nil
 }
 
+func rmdir(path string) error {
+	err := unix.Rmdir(path)
+	if err == nil || err == unix.ENOENT {
+		return nil
+	}
+	return &os.PathError{Op: "rmdir", Path: path, Err: err}
+}
+
+// RemovePath aims to remove cgroup path. It does so recursively,
+// by removing any subdirectories (sub-cgroups) first.
+func RemovePath(path string) error {
+	// try the fast path first
+	if err := rmdir(path); err == nil {
+		return nil
+	}
+
+	infos, err := ioutil.ReadDir(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			err = nil
+		}
+		return err
+	}
+	for _, info := range infos {
+		if info.IsDir() {
+			// We should remove subcgroups dir first
+			if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
+				break
+			}
+		}
+	}
+	if err == nil {
+		err = rmdir(path)
+	}
+	return err
+}
+
 // RemovePaths iterates over the provided paths removing them.
 // We trying to remove all paths five times with increasing delay between tries.
 // If after all there are not removed cgroups - appropriate error will be
 // returned.
 func RemovePaths(paths map[string]string) (err error) {
+	const retries = 5
 	delay := 10 * time.Millisecond
-	for i := 0; i < 5; i++ {
+	for i := 0; i < retries; i++ {
 		if i != 0 {
 			time.Sleep(delay)
 			delay *= 2
 		}
 		for s, p := range paths {
-			os.RemoveAll(p)
-			// TODO: here probably should be logging
+			if err := RemovePath(p); err != nil {
+				// do not log intermediate iterations
+				switch i {
+				case 0:
+					logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
+				case retries - 1:
+					logrus.WithError(err).Error("Failed to remove cgroup")
+				}
+
+			}
 			_, err := os.Stat(p)
 			// We need this strange way of checking cgroups existence because
 			// RemoveAll almost always returns error, even on already removed
@@ -230,6 +278,8 @@ func RemovePaths(paths map[string]string) (err error) {
 			}
 		}
 		if len(paths) == 0 {
+			//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
+			paths = make(map[string]string)
 			return nil
 		}
 	}
@@ -237,27 +287,50 @@ func RemovePaths(paths map[string]string) (err error) {
 }
 
 func GetHugePageSize() ([]string, error) {
-	files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
+	dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
 	if err != nil {
-		return []string{}, err
+		return nil, err
 	}
-	var fileNames []string
-	for _, st := range files {
-		fileNames = append(fileNames, st.Name())
+	files, err := dir.Readdirnames(0)
+	dir.Close()
+	if err != nil {
+		return nil, err
 	}
-	return getHugePageSizeFromFilenames(fileNames)
+
+	return getHugePageSizeFromFilenames(files)
 }
 
 func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
-	var pageSizes []string
-	for _, fileName := range fileNames {
-		nameArray := strings.Split(fileName, "-")
-		pageSize, err := units.RAMInBytes(nameArray[1])
+	pageSizes := make([]string, 0, len(fileNames))
+
+	for _, file := range fileNames {
+		// example: hugepages-1048576kB
+		val := strings.TrimPrefix(file, "hugepages-")
+		if len(val) == len(file) {
+			// unexpected file name: no prefix found
+			continue
+		}
+		// The suffix is always "kB" (as of Linux 5.9)
+		eLen := len(val) - 2
+		val = strings.TrimSuffix(val, "kB")
+		if len(val) != eLen {
+			logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file)
+			continue
+		}
+		size, err := strconv.Atoi(val)
 		if err != nil {
-			return []string{}, err
+			return nil, err
 		}
-		sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
-		pageSizes = append(pageSizes, sizeString)
+		// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
+		// but in our case the size is in KB already.
+		if size >= (1 << 20) {
+			val = strconv.Itoa(size>>20) + "GB"
+		} else if size >= (1 << 10) {
+			val = strconv.Itoa(size>>10) + "MB"
+		} else {
+			val += "KB"
+		}
+		pageSizes = append(pageSizes, val)
 	}
 
 	return pageSizes, nil
@@ -303,14 +376,14 @@ func WriteCgroupProc(dir string, pid int) error {
 		return nil
 	}
 
-	cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
+	file, err := fscommon.OpenFile(dir, CgroupProcesses, os.O_WRONLY)
 	if err != nil {
 		return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
 	}
-	defer cgroupProcessesFile.Close()
+	defer file.Close()
 
 	for i := 0; i < 5; i++ {
-		_, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
+		_, err = file.WriteString(strconv.Itoa(pid))
 		if err == nil {
 			return nil
 		}
@@ -327,17 +400,6 @@ func WriteCgroupProc(dir string, pid int) error {
 	return err
 }
 
-// Since the OCI spec is designed for cgroup v1, in some cases
-// there is need to convert from the cgroup v1 configuration to cgroup v2
-// the formula for BlkIOWeight is y = (1 + (x - 10) * 9999 / 990)
-// convert linearly from [10-1000] to [1-10000]
-func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 {
-	if blkIoWeight == 0 {
-		return 0
-	}
-	return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
-}
-
 // Since the OCI spec is designed for cgroup v1, in some cases
 // there is need to convert from the cgroup v1 configuration to cgroup v2
 // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
@@ -377,3 +439,14 @@ func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
 
 	return memorySwap - memory, nil
 }
+
+// Since the OCI spec is designed for cgroup v1, in some cases
+// there is need to convert from the cgroup v1 configuration to cgroup v2
+// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
+// convert linearly from [10-1000] to [1-10000]
+func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
+	if blkIoWeight == 0 {
+		return 0
+	}
+	return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
+}
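As a worked example of the renamed conversion helper: a cgroup v1 `BlkIOWeight` of 500 maps to a cgroup v2 `io.weight` of 1 + (500-10)*9999/990 = 4950 (integer arithmetic), the minimum 10 maps to 1, the maximum 1000 maps to 10000, and 0 is passed through unchanged.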

+ 41 - 59
vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go

@@ -1,16 +1,16 @@
 package cgroups
 
 import (
-	"bufio"
 	"errors"
 	"fmt"
-	"io"
 	"os"
 	"path/filepath"
 	"strings"
+	"sync"
 	"syscall"
 
 	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/moby/sys/mountinfo"
 	"golang.org/x/sys/unix"
 )
 
@@ -23,7 +23,12 @@ const (
 )
 
 var (
-	errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
+	errUnified     = errors.New("not implemented for cgroup v2 unified hierarchy")
+	ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
+
+	readMountinfoOnce sync.Once
+	readMountinfoErr  error
+	cgroupMountinfo   []*mountinfo.Info
 )
 
 type NotFoundError struct {
@@ -90,6 +95,21 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
 	return path
 }
 
+// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
+// with fstype of "cgroup") for the current running process.
+//
+// The results are cached (to avoid re-reading mountinfo which is relatively
+// expensive), so it is assumed that cgroup mounts are not being changed.
+func readCgroupMountinfo() ([]*mountinfo.Info, error) {
+	readMountinfoOnce.Do(func() {
+		cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
+			mountinfo.FSTypeFilter("cgroup"),
+		)
+	})
+
+	return cgroupMountinfo, readMountinfoErr
+}
+
 // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
 func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
 	if IsCgroup2UnifiedMode() {
@@ -110,56 +130,28 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string,
 		return "", "", errUnified
 		return "", "", errUnified
 	}
 	}
 
 
-	// Avoid parsing mountinfo by checking if subsystem is valid/available.
-	if !isSubsystemAvailable(subsystem) {
-		return "", "", NewNotFoundError(subsystem)
-	}
-
-	f, err := os.Open("/proc/self/mountinfo")
+	mi, err := readCgroupMountinfo()
 	if err != nil {
 	if err != nil {
 		return "", "", err
 		return "", "", err
 	}
 	}
-	defer f.Close()
 
 
-	return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
+	return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
 }
 }
 
 
-func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
-	scanner := bufio.NewScanner(reader)
-	for scanner.Scan() {
-		txt := scanner.Text()
-		fields := strings.Fields(txt)
-		if len(fields) < 9 {
-			continue
-		}
-		if strings.HasPrefix(fields[4], cgroupPath) {
-			for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
+	for _, mi := range mounts {
+		if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
+			for _, opt := range strings.Split(mi.VFSOptions, ",") {
 				if opt == subsystem {
-					return fields[4], fields[3], nil
+					return mi.Mountpoint, mi.Root, nil
 				}
 			}
 		}
 	}
-	if err := scanner.Err(); err != nil {
-		return "", "", err
-	}
 
 	return "", "", NewNotFoundError(subsystem)
 }
 
-func isSubsystemAvailable(subsystem string) bool {
-	if IsCgroup2UnifiedMode() {
-		panic("don't call isSubsystemAvailable from cgroupv2 code")
-	}
-
-	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
-	if err != nil {
-		return false
-	}
-	_, avail := cgroups[subsystem]
-	return avail
-}
-
 func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
 	if len(m.Subsystems) == 0 {
 		return "", fmt.Errorf("no subsystem for mount")
@@ -168,25 +160,15 @@ func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
 	return getControllerPath(m.Subsystems[0], cgroups)
 }
 
-func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
+func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
 	res := make([]Mount, 0, len(ss))
-	scanner := bufio.NewScanner(mi)
 	numFound := 0
-	for scanner.Scan() && numFound < len(ss) {
-		txt := scanner.Text()
-		sepIdx := strings.Index(txt, " - ")
-		if sepIdx == -1 {
-			return nil, fmt.Errorf("invalid mountinfo format")
-		}
-		if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
-			continue
-		}
-		fields := strings.Split(txt, " ")
+	for _, mi := range mounts {
 		m := Mount{
-			Mountpoint: fields[4],
-			Root:       fields[3],
+			Mountpoint: mi.Mountpoint,
+			Root:       mi.Root,
 		}
-		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+		for _, opt := range strings.Split(mi.VFSOptions, ",") {
 			seen, known := ss[opt]
 			if !known || (!all && seen) {
 				continue
@@ -199,19 +181,18 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
 		if len(m.Subsystems) > 0 || all {
 			res = append(res, m)
 		}
-	}
-	if err := scanner.Err(); err != nil {
-		return nil, err
+		if !all && numFound >= len(ss) {
+			break
+		}
 	}
 	return res, nil
 }
 
 func getCgroupMountsV1(all bool) ([]Mount, error) {
-	f, err := os.Open("/proc/self/mountinfo")
+	mi, err := readCgroupMountinfo()
 	if err != nil {
 		return nil, err
 	}
-	defer f.Close()
 
 	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
@@ -222,7 +203,8 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
 	for s := range allSubsystems {
 		allMap[s] = false
 	}
-	return getCgroupMountsHelper(allMap, f, all)
+
+	return getCgroupMountsHelper(allMap, mi, all)
 }
 
 // GetOwnCgroup returns the relative path to the cgroup docker is running in.

+ 5 - 7
vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go

@@ -2,6 +2,7 @@ package configs
 
 import (
 	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	"github.com/opencontainers/runc/libcontainer/devices"
 )
 
 type FreezerState string
@@ -42,7 +43,7 @@ type Cgroup struct {
 
 type Resources struct {
 	// Devices is the set of access rules for devices in the container.
-	Devices []*DeviceRule `json:"devices"`
+	Devices []*devices.Rule `json:"devices"`
 
 	// Memory limit (in bytes)
 	Memory int64 `json:"memory"`
@@ -53,12 +54,6 @@ type Resources struct {
 	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
 	MemorySwap int64 `json:"memory_swap"`
 
-	// Kernel memory limit (in bytes)
-	KernelMemory int64 `json:"kernel_memory"`
-
-	// Kernel memory limit for TCP use (in bytes)
-	KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
-
 	// CPU shares (relative weight vs. other containers)
 	CpuShares uint64 `json:"cpu_shares"`
 
@@ -127,6 +122,9 @@ type Resources struct {
 	// CpuWeight sets a proportional bandwidth limit.
 	CpuWeight uint64 `json:"cpu_weight"`
 
+	// Unified is cgroupv2-only key-value map.
+	Unified map[string]string `json:"unified"`
+
 	// SkipDevices allows to skip configuring device permissions.
 	// Used by e.g. kubelet while creating a parent cgroup (kubepods)
 	// common for many containers.

+ 15 - 10
vendor/github.com/opencontainers/runc/libcontainer/configs/config.go

@@ -7,6 +7,7 @@ import (
 	"os/exec"
 	"os/exec"
 	"time"
 	"time"
 
 
+	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/pkg/errors"
 	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
 	"github.com/sirupsen/logrus"
@@ -30,9 +31,10 @@ type IDMap struct {
 // for syscalls. Additional architectures can be added by specifying them in
 // for syscalls. Additional architectures can be added by specifying them in
 // Architectures.
 // Architectures.
 type Seccomp struct {
 type Seccomp struct {
-	DefaultAction Action     `json:"default_action"`
-	Architectures []string   `json:"architectures"`
-	Syscalls      []*Syscall `json:"syscalls"`
+	DefaultAction   Action     `json:"default_action"`
+	Architectures   []string   `json:"architectures"`
+	Syscalls        []*Syscall `json:"syscalls"`
+	DefaultErrnoRet *uint      `json:"default_errno_ret"`
 }
 }
 
 
 // Action is taken upon rule match in Seccomp
 // Action is taken upon rule match in Seccomp
@@ -92,6 +94,9 @@ type Config struct {
 	// Path to a directory containing the container's root filesystem.
 	// Path to a directory containing the container's root filesystem.
 	Rootfs string `json:"rootfs"`
 	Rootfs string `json:"rootfs"`
 
 
+	// Umask is the umask to use inside of the container.
+	Umask *uint32 `json:"umask"`
+
 	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
 	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
 	// bind mounts are writtable.
 	// bind mounts are writtable.
 	Readonlyfs bool `json:"readonlyfs"`
 	Readonlyfs bool `json:"readonlyfs"`
@@ -104,7 +109,7 @@ type Config struct {
 	Mounts []*Mount `json:"mounts"`
 	Mounts []*Mount `json:"mounts"`
 
 
 	// The device nodes that should be automatically created within the container upon container start.  Note, make sure that the node is marked as allowed in the cgroup as well!
 	// The device nodes that should be automatically created within the container upon container start.  Note, make sure that the node is marked as allowed in the cgroup as well!
-	Devices []*Device `json:"devices"`
+	Devices []*devices.Device `json:"devices"`
 
 
 	MountLabel string `json:"mount_label"`
 	MountLabel string `json:"mount_label"`
 
 
@@ -218,25 +223,25 @@ const (
 	// the runtime environment has been created but before the pivot_root has been executed.
 	// the runtime environment has been created but before the pivot_root has been executed.
 	// CreateRuntime is called immediately after the deprecated Prestart hook.
 	// CreateRuntime is called immediately after the deprecated Prestart hook.
 	// CreateRuntime commands are called in the Runtime Namespace.
 	// CreateRuntime commands are called in the Runtime Namespace.
-	CreateRuntime = "createRuntime"
+	CreateRuntime HookName = "createRuntime"
 
 
 	// CreateContainer commands MUST be called as part of the create operation after
 	// CreateContainer commands MUST be called as part of the create operation after
 	// the runtime environment has been created but before the pivot_root has been executed.
 	// the runtime environment has been created but before the pivot_root has been executed.
 	// CreateContainer commands are called in the Container namespace.
 	// CreateContainer commands are called in the Container namespace.
-	CreateContainer = "createContainer"
+	CreateContainer HookName = "createContainer"
 
 
 	// StartContainer commands MUST be called as part of the start operation and before
 	// StartContainer commands MUST be called as part of the start operation and before
 	// the container process is started.
 	// the container process is started.
 	// StartContainer commands are called in the Container namespace.
 	// StartContainer commands are called in the Container namespace.
-	StartContainer = "startContainer"
+	StartContainer HookName = "startContainer"
 
 
 	// Poststart commands are executed after the container init process starts.
 	// Poststart commands are executed after the container init process starts.
 	// Poststart commands are called in the Runtime Namespace.
 	// Poststart commands are called in the Runtime Namespace.
-	Poststart = "poststart"
+	Poststart HookName = "poststart"
 
 
 	// Poststop commands are executed after the container init process exits.
 	// Poststop commands are executed after the container init process exits.
 	// Poststop commands are called in the Runtime Namespace.
 	// Poststop commands are called in the Runtime Namespace.
-	Poststop = "poststop"
+	Poststop HookName = "poststop"
 )
 )
 
 
 type Capabilities struct {
 type Capabilities struct {
@@ -383,7 +388,7 @@ func (c Command) Run(s *specs.State) error {
 		return err
 		return err
 	case <-timerCh:
 	case <-timerCh:
 		cmd.Process.Kill()
 		cmd.Process.Kill()
-		cmd.Wait()
+		<-errC
 		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
 		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
 	}
 	}
 }
 }
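
The cmd.Wait() -> <-errC change in Command.Run above is the interesting one: errC is fed by the goroutine that already calls cmd.Wait, so calling Wait a second time after Kill() races with that goroutine, while draining errC reaps the hook process exactly once. A minimal, self-contained sketch of the corrected pattern (the run helper, the sleep command and the timeout value are illustrative, not runc's hook code):

package main

import (
	"fmt"
	"os/exec"
	"time"
)

// run starts cmd and enforces a timeout: a single cmd.Wait runs in a
// goroutine, and on timeout the parent kills the process and then drains
// errC instead of calling Wait again.
func run(cmd *exec.Cmd, timeout time.Duration) error {
	if err := cmd.Start(); err != nil {
		return err
	}
	errC := make(chan error, 1)
	go func() {
		errC <- cmd.Wait()
	}()
	select {
	case err := <-errC:
		return err
	case <-time.After(timeout):
		cmd.Process.Kill()
		<-errC // reap via the existing Wait; a second Wait would race with the goroutine
		return fmt.Errorf("hook ran past specified timeout of %.1fs", timeout.Seconds())
	}
}

func main() {
	err := run(exec.Command("sleep", "5"), time.Second)
	fmt.Println(err) // hook ran past specified timeout of 1.0s
}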

+ 9 - 0
vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go

@@ -0,0 +1,9 @@
+// +build gofuzz
+
+package configs
+
+func FuzzUnmarshalJSON(data []byte) int {
+	hooks := Hooks{}
+	_ = hooks.UnmarshalJSON(data)
+	return 1
+}
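
FuzzUnmarshalJSON is a go-fuzz target (note the gofuzz build tag): it only checks that Hooks.UnmarshalJSON survives arbitrary input without panicking, and returning 1 marks the input as interesting to go-fuzz. The same path could also be exercised with Go's native fuzzing; a hypothetical wrapper, not shipped with runc and requiring Go 1.18 or later, might look like this:

// hooks_fuzz_test.go -- hypothetical wrapper, not part of this vendoring.
package configs

import "testing"

func FuzzHooksUnmarshalJSON(f *testing.F) {
	// Seed with a plausible hooks document; the fuzzer mutates it from here.
	f.Add([]byte(`{"poststart": [{"path": "/bin/true"}]}`))
	f.Fuzz(func(t *testing.T, data []byte) {
		hooks := Hooks{}
		_ = hooks.UnmarshalJSON(data) // must never panic; errors are fine
	})
}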

+ 0 - 16
vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go

@@ -1,16 +0,0 @@
-// +build !windows
-
-package configs
-
-import (
-	"errors"
-
-	"golang.org/x/sys/unix"
-)
-
-func (d *DeviceRule) Mkdev() (uint64, error) {
-	if d.Major == Wildcard || d.Minor == Wildcard {
-		return 0, errors.New("cannot mkdev() device with wildcards")
-	}
-	return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
-}

+ 0 - 5
vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go

@@ -1,5 +0,0 @@
-package configs
-
-func (d *DeviceRule) Mkdev() (uint64, error) {
-	return 0, nil
-}

+ 17 - 0
vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go

@@ -0,0 +1,17 @@
+package configs
+
+import "github.com/opencontainers/runc/libcontainer/devices"
+
+type (
+	// Deprecated: use libcontainer/devices.Device
+	Device = devices.Device
+
+	// Deprecated: use libcontainer/devices.Rule
+	DeviceRule = devices.Rule
+
+	// Deprecated: use libcontainer/devices.Type
+	DeviceType = devices.Type
+
+	// Deprecated: use libcontainer/devices.Permissions
+	DevicePermissions = devices.Permissions
+)
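
Because these are type aliases (note the '=' form), values built with the new libcontainer/devices package still satisfy any API written against the old configs types, so existing callers keep compiling. An illustrative round trip:

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	// Built against the new package...
	dev := &devices.Device{
		Rule: devices.Rule{
			Type:        devices.CharDevice,
			Major:       1,
			Minor:       3,
			Permissions: devices.Permissions("rwm"),
			Allow:       true,
		},
		Path: "/dev/null",
	}

	// ...and usable wherever the deprecated configs.Device is still expected,
	// because the alias makes them the same type.
	var legacy *configs.Device = dev
	fmt.Println(legacy.Path, legacy.CgroupString()) // /dev/null c 1:3 rwm
}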

+ 1 - 1
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go

@@ -56,7 +56,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
 	if nsFile == "" {
 	if nsFile == "" {
 		return false
 		return false
 	}
 	}
-	_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
+	_, err := os.Stat("/proc/self/ns/" + nsFile)
 	// a namespace is supported if it exists and we have permissions to read it
 	// a namespace is supported if it exists and we have permissions to read it
 	supported = err == nil
 	supported = err == nil
 	supportedNamespaces[ns] = supported
 	supportedNamespaces[ns] = supported
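
Functionally nothing changes here (plain string concatenation replaces fmt.Sprintf); the helper still stats /proc/self/ns/<name> and caches the answer. A quick illustrative call, assuming the package's NEWUSER namespace constant (it is not shown in this hunk):

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/configs"
)

func main() {
	// true when /proc/self/ns/user exists and is readable by this process
	fmt.Println(configs.IsNamespaceSupported(configs.NEWUSER))
}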

+ 33 - 29
vendor/github.com/opencontainers/runc/libcontainer/configs/device.go → vendor/github.com/opencontainers/runc/libcontainer/devices/device.go

@@ -1,4 +1,4 @@
-package configs
+package devices
 
 
 import (
 import (
 	"fmt"
 	"fmt"
@@ -11,7 +11,7 @@ const (
 )
 )
 
 
 type Device struct {
 type Device struct {
-	DeviceRule
+	Rule
 
 
 	// Path to the device.
 	// Path to the device.
 	Path string `json:"path"`
 	Path string `json:"path"`
@@ -26,10 +26,10 @@ type Device struct {
 	Gid uint32 `json:"gid"`
 	Gid uint32 `json:"gid"`
 }
 }
 
 
-// DevicePermissions is a cgroupv1-style string to represent device access. It
+// Permissions is a cgroupv1-style string to represent device access. It
 // has to be a string for backward compatibility reasons, hence why it has
 // has to be a string for backward compatibility reasons, hence why it has
 // methods to do set operations.
 // methods to do set operations.
-type DevicePermissions string
+type Permissions string
 
 
 const (
 const (
 	deviceRead uint = (1 << iota)
 	deviceRead uint = (1 << iota)
@@ -37,7 +37,7 @@ const (
 	deviceMknod
 	deviceMknod
 )
 )
 
 
-func (p DevicePermissions) toSet() uint {
+func (p Permissions) toSet() uint {
 	var set uint
 	var set uint
 	for _, perm := range p {
 	for _, perm := range p {
 		switch perm {
 		switch perm {
@@ -52,7 +52,7 @@ func (p DevicePermissions) toSet() uint {
 	return set
 	return set
 }
 }
 
 
-func fromSet(set uint) DevicePermissions {
+func fromSet(set uint) Permissions {
 	var perm string
 	var perm string
 	if set&deviceRead == deviceRead {
 	if set&deviceRead == deviceRead {
 		perm += "r"
 		perm += "r"
@@ -63,53 +63,53 @@ func fromSet(set uint) DevicePermissions {
 	if set&deviceMknod == deviceMknod {
 	if set&deviceMknod == deviceMknod {
 		perm += "m"
 		perm += "m"
 	}
 	}
-	return DevicePermissions(perm)
+	return Permissions(perm)
 }
 }
 
 
-// Union returns the union of the two sets of DevicePermissions.
-func (p DevicePermissions) Union(o DevicePermissions) DevicePermissions {
+// Union returns the union of the two sets of Permissions.
+func (p Permissions) Union(o Permissions) Permissions {
 	lhs := p.toSet()
 	lhs := p.toSet()
 	rhs := o.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs | rhs)
 	return fromSet(lhs | rhs)
 }
 }
 
 
-// Difference returns the set difference of the two sets of DevicePermissions.
+// Difference returns the set difference of the two sets of Permissions.
 // In set notation, A.Difference(B) gives you A\B.
 // In set notation, A.Difference(B) gives you A\B.
-func (p DevicePermissions) Difference(o DevicePermissions) DevicePermissions {
+func (p Permissions) Difference(o Permissions) Permissions {
 	lhs := p.toSet()
 	lhs := p.toSet()
 	rhs := o.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs &^ rhs)
 	return fromSet(lhs &^ rhs)
 }
 }
 
 
-// Intersection computes the intersection of the two sets of DevicePermissions.
-func (p DevicePermissions) Intersection(o DevicePermissions) DevicePermissions {
+// Intersection computes the intersection of the two sets of Permissions.
+func (p Permissions) Intersection(o Permissions) Permissions {
 	lhs := p.toSet()
 	lhs := p.toSet()
 	rhs := o.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs & rhs)
 	return fromSet(lhs & rhs)
 }
 }
 
 
-// IsEmpty returns whether the set of permissions in a DevicePermissions is
+// IsEmpty returns whether the set of permissions in a Permissions is
 // empty.
 // empty.
-func (p DevicePermissions) IsEmpty() bool {
-	return p == DevicePermissions("")
+func (p Permissions) IsEmpty() bool {
+	return p == Permissions("")
 }
 }
 
 
 // IsValid returns whether the set of permissions is a subset of valid
 // IsValid returns whether the set of permissions is a subset of valid
 // permissions (namely, {r,w,m}).
 // permissions (namely, {r,w,m}).
-func (p DevicePermissions) IsValid() bool {
+func (p Permissions) IsValid() bool {
 	return p == fromSet(p.toSet())
 	return p == fromSet(p.toSet())
 }
 }
 
 
-type DeviceType rune
+type Type rune
 
 
 const (
 const (
-	WildcardDevice DeviceType = 'a'
-	BlockDevice    DeviceType = 'b'
-	CharDevice     DeviceType = 'c' // or 'u'
-	FifoDevice     DeviceType = 'p'
+	WildcardDevice Type = 'a'
+	BlockDevice    Type = 'b'
+	CharDevice     Type = 'c' // or 'u'
+	FifoDevice     Type = 'p'
 )
 )
 
 
-func (t DeviceType) IsValid() bool {
+func (t Type) IsValid() bool {
 	switch t {
 	switch t {
 	case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
 	case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
 		return true
 		return true
@@ -118,7 +118,7 @@ func (t DeviceType) IsValid() bool {
 	}
 	}
 }
 }
 
 
-func (t DeviceType) CanMknod() bool {
+func (t Type) CanMknod() bool {
 	switch t {
 	switch t {
 	case BlockDevice, CharDevice, FifoDevice:
 	case BlockDevice, CharDevice, FifoDevice:
 		return true
 		return true
@@ -127,7 +127,7 @@ func (t DeviceType) CanMknod() bool {
 	}
 	}
 }
 }
 
 
-func (t DeviceType) CanCgroup() bool {
+func (t Type) CanCgroup() bool {
 	switch t {
 	switch t {
 	case WildcardDevice, BlockDevice, CharDevice:
 	case WildcardDevice, BlockDevice, CharDevice:
 		return true
 		return true
@@ -136,10 +136,10 @@ func (t DeviceType) CanCgroup() bool {
 	}
 	}
 }
 }
 
 
-type DeviceRule struct {
+type Rule struct {
 	// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
 	// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
 	// acts as a wildcard and all fields other than Allow are ignored.
 	// acts as a wildcard and all fields other than Allow are ignored.
-	Type DeviceType `json:"type"`
+	Type Type `json:"type"`
 
 
 	// Major is the device's major number.
 	// Major is the device's major number.
 	Major int64 `json:"major"`
 	Major int64 `json:"major"`
@@ -149,13 +149,13 @@ type DeviceRule struct {
 
 
 	// Permissions is the set of permissions that this rule applies to (in the
 	// Permissions is the set of permissions that this rule applies to (in the
 	// cgroupv1 format -- any combination of "rwm").
 	// cgroupv1 format -- any combination of "rwm").
-	Permissions DevicePermissions `json:"permissions"`
+	Permissions Permissions `json:"permissions"`
 
 
 	// Allow specifies whether this rule is allowed.
 	// Allow specifies whether this rule is allowed.
 	Allow bool `json:"allow"`
 	Allow bool `json:"allow"`
 }
 }
 
 
-func (d *DeviceRule) CgroupString() string {
+func (d *Rule) CgroupString() string {
 	var (
 	var (
 		major = strconv.FormatInt(d.Major, 10)
 		major = strconv.FormatInt(d.Major, 10)
 		minor = strconv.FormatInt(d.Minor, 10)
 		minor = strconv.FormatInt(d.Minor, 10)
@@ -168,3 +168,7 @@ func (d *DeviceRule) CgroupString() string {
 	}
 	}
 	return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
 	return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
 }
 }
+
+func (d *Rule) Mkdev() (uint64, error) {
+	return mkDev(d)
+}
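
The rename drops the Device prefix now that these types live in their own devices package: DevicePermissions becomes Permissions (still a cgroupv1-style "rwm" string with set semantics), DeviceType becomes Type, DeviceRule becomes Rule, and Rule gains a Mkdev method backed by the platform-specific mkDev shown in the next file. A short illustrative use of the set operations and the cgroup string:

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	all := devices.Permissions("rwm")
	ro := devices.Permissions("r")

	fmt.Println(all.Difference(ro))                 // wm
	fmt.Println(all.Intersection(ro))               // r
	fmt.Println(ro.Union(devices.Permissions("w"))) // rw

	rule := devices.Rule{
		Type:        devices.CharDevice,
		Major:       1,
		Minor:       8, // /dev/random
		Permissions: all,
		Allow:       true,
	}
	fmt.Println(rule.CgroupString()) // c 1:8 rwm

	dev, err := rule.Mkdev() // errors out for wildcard major/minor
	fmt.Println(dev, err)
}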

+ 22 - 14
vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go → vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go

@@ -1,3 +1,5 @@
+// +build !windows
+
 package devices
 package devices
 
 
 import (
 import (
@@ -6,7 +8,6 @@ import (
 	"os"
 	"os"
 	"path/filepath"
 	"path/filepath"
 
 
-	"github.com/opencontainers/runc/libcontainer/configs"
 	"golang.org/x/sys/unix"
 	"golang.org/x/sys/unix"
 )
 )
 
 
@@ -21,9 +22,16 @@ var (
 	ioutilReadDir = ioutil.ReadDir
 	ioutilReadDir = ioutil.ReadDir
 )
 )
 
 
+func mkDev(d *Rule) (uint64, error) {
+	if d.Major == Wildcard || d.Minor == Wildcard {
+		return 0, errors.New("cannot mkdev() device with wildcards")
+	}
+	return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
+}
+
 // Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the
 // Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the
 // information about a linux device and return that information as a Device struct.
 // information about a linux device and return that information as a Device struct.
-func DeviceFromPath(path, permissions string) (*configs.Device, error) {
+func DeviceFromPath(path, permissions string) (*Device, error) {
 	var stat unix.Stat_t
 	var stat unix.Stat_t
 	err := unixLstat(path, &stat)
 	err := unixLstat(path, &stat)
 	if err != nil {
 	if err != nil {
@@ -31,7 +39,7 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
 	}
 	}
 
 
 	var (
 	var (
-		devType   configs.DeviceType
+		devType   Type
 		mode      = stat.Mode
 		mode      = stat.Mode
 		devNumber = uint64(stat.Rdev)
 		devNumber = uint64(stat.Rdev)
 		major     = unix.Major(devNumber)
 		major     = unix.Major(devNumber)
@@ -39,41 +47,41 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
 	)
 	)
 	switch mode & unix.S_IFMT {
 	switch mode & unix.S_IFMT {
 	case unix.S_IFBLK:
 	case unix.S_IFBLK:
-		devType = configs.BlockDevice
+		devType = BlockDevice
 	case unix.S_IFCHR:
 	case unix.S_IFCHR:
-		devType = configs.CharDevice
+		devType = CharDevice
 	case unix.S_IFIFO:
 	case unix.S_IFIFO:
-		devType = configs.FifoDevice
+		devType = FifoDevice
 	default:
 	default:
 		return nil, ErrNotADevice
 		return nil, ErrNotADevice
 	}
 	}
-	return &configs.Device{
-		DeviceRule: configs.DeviceRule{
+	return &Device{
+		Rule: Rule{
 			Type:        devType,
 			Type:        devType,
 			Major:       int64(major),
 			Major:       int64(major),
 			Minor:       int64(minor),
 			Minor:       int64(minor),
-			Permissions: configs.DevicePermissions(permissions),
+			Permissions: Permissions(permissions),
 		},
 		},
 		Path:     path,
 		Path:     path,
-		FileMode: os.FileMode(mode),
+		FileMode: os.FileMode(mode &^ unix.S_IFMT),
 		Uid:      stat.Uid,
 		Uid:      stat.Uid,
 		Gid:      stat.Gid,
 		Gid:      stat.Gid,
 	}, nil
 	}, nil
 }
 }
 
 
 // HostDevices returns all devices that can be found under /dev directory.
 // HostDevices returns all devices that can be found under /dev directory.
-func HostDevices() ([]*configs.Device, error) {
+func HostDevices() ([]*Device, error) {
 	return GetDevices("/dev")
 	return GetDevices("/dev")
 }
 }
 
 
 // GetDevices recursively traverses a directory specified by path
 // GetDevices recursively traverses a directory specified by path
 // and returns all devices found there.
 // and returns all devices found there.
-func GetDevices(path string) ([]*configs.Device, error) {
+func GetDevices(path string) ([]*Device, error) {
 	files, err := ioutilReadDir(path)
 	files, err := ioutilReadDir(path)
 	if err != nil {
 	if err != nil {
 		return nil, err
 		return nil, err
 	}
 	}
-	var out []*configs.Device
+	var out []*Device
 	for _, f := range files {
 	for _, f := range files {
 		switch {
 		switch {
 		case f.IsDir():
 		case f.IsDir():
@@ -104,7 +112,7 @@ func GetDevices(path string) ([]*configs.Device, error) {
 			}
 			}
 			return nil, err
 			return nil, err
 		}
 		}
-		if device.Type == configs.FifoDevice {
+		if device.Type == FifoDevice {
 			continue
 			continue
 		}
 		}
 		out = append(out, device)
 		out = append(out, device)
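
Besides the configs -> devices type moves, the behavioural change in DeviceFromPath is the FileMode field: it previously stored the raw stat mode (file-type bits included) and now masks off unix.S_IFMT, so it carries only the permission bits while the device class stays in Rule.Type. A small sketch of what callers now see (output depends on the host; 0666 is typical for /dev/null):

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	dev, err := devices.DeviceFromPath("/dev/null", "rwm")
	if err != nil {
		panic(err)
	}
	// FileMode no longer includes unix.S_IFCHR, just the permission bits.
	fmt.Printf("type=%c major=%d minor=%d mode=%#o\n",
		dev.Type, dev.Major, dev.Minor, dev.FileMode)
	// e.g. type=c major=1 minor=3 mode=0666
}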

+ 51 - 29
vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c

@@ -59,14 +59,38 @@
 #include <sys/syscall.h>
 #include <sys/syscall.h>
 
 
 /* Use our own wrapper for memfd_create. */
 /* Use our own wrapper for memfd_create. */
-#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
-#  define SYS_memfd_create __NR_memfd_create
+#ifndef SYS_memfd_create
+#  ifdef __NR_memfd_create
+#    define SYS_memfd_create __NR_memfd_create
+#  else
+/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
+#    warning "libc is outdated -- using hard-coded SYS_memfd_create"
+#    if defined(__x86_64__)
+#      define SYS_memfd_create 319
+#    elif defined(__i386__)
+#      define SYS_memfd_create 356
+#    elif defined(__ia64__)
+#      define SYS_memfd_create 1340
+#    elif defined(__arm__)
+#      define SYS_memfd_create 385
+#    elif defined(__aarch64__)
+#      define SYS_memfd_create 279
+#    elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
+#      define SYS_memfd_create 360
+#    elif defined(__s390__) || defined(__s390x__)
+#      define SYS_memfd_create 350
+#    else
+#      warning "unknown architecture -- cannot hard-code SYS_memfd_create"
+#    endif
+#  endif
 #endif
 #endif
+
 /* memfd_create(2) flags -- copied from <linux/memfd.h>. */
 /* memfd_create(2) flags -- copied from <linux/memfd.h>. */
 #ifndef MFD_CLOEXEC
 #ifndef MFD_CLOEXEC
 #  define MFD_CLOEXEC       0x0001U
 #  define MFD_CLOEXEC       0x0001U
 #  define MFD_ALLOW_SEALING 0x0002U
 #  define MFD_ALLOW_SEALING 0x0002U
 #endif
 #endif
+
 int memfd_create(const char *name, unsigned int flags)
 int memfd_create(const char *name, unsigned int flags)
 {
 {
 #ifdef SYS_memfd_create
 #ifdef SYS_memfd_create
@@ -77,7 +101,6 @@ int memfd_create(const char *name, unsigned int flags)
 #endif
 #endif
 }
 }
 
 
-
 /* This comes directly from <linux/fcntl.h>. */
 /* This comes directly from <linux/fcntl.h>. */
 #ifndef F_LINUX_SPECIFIC_BASE
 #ifndef F_LINUX_SPECIFIC_BASE
 #  define F_LINUX_SPECIFIC_BASE 1024
 #  define F_LINUX_SPECIFIC_BASE 1024
@@ -103,7 +126,7 @@ static void *must_realloc(void *ptr, size_t size)
 	void *old = ptr;
 	void *old = ptr;
 	do {
 	do {
 		ptr = realloc(old, size);
 		ptr = realloc(old, size);
-	} while(!ptr);
+	} while (!ptr);
 	return ptr;
 	return ptr;
 }
 }
 
 
@@ -115,10 +138,10 @@ static void *must_realloc(void *ptr, size_t size)
 static int is_self_cloned(void)
 static int is_self_cloned(void)
 {
 {
 	int fd, ret, is_cloned = 0;
 	int fd, ret, is_cloned = 0;
-	struct stat statbuf = {};
-	struct statfs fsbuf = {};
+	struct stat statbuf = { };
+	struct statfs fsbuf = { };
 
 
-	fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
+	fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
 	if (fd < 0) {
 	if (fd < 0) {
 		fprintf(stderr, "you have no read access to runc binary file\n");
 		fprintf(stderr, "you have no read access to runc binary file\n");
 		return -ENOTRECOVERABLE;
 		return -ENOTRECOVERABLE;
@@ -274,7 +297,7 @@ enum {
 static int make_execfd(int *fdtype)
 static int make_execfd(int *fdtype)
 {
 {
 	int fd = -1;
 	int fd = -1;
-	char template[PATH_MAX] = {0};
+	char template[PATH_MAX] = { 0 };
 	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
 	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
 
 
 	if (!prefix || *prefix != '/')
 	if (!prefix || *prefix != '/')
@@ -303,7 +326,7 @@ static int make_execfd(int *fdtype)
 	*fdtype = EFD_FILE;
 	*fdtype = EFD_FILE;
 	fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
 	fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
 	if (fd >= 0) {
 	if (fd >= 0) {
-		struct stat statbuf = {};
+		struct stat statbuf = { };
 		bool working_otmpfile = false;
 		bool working_otmpfile = false;
 
 
 		/*
 		/*
@@ -348,27 +371,27 @@ static int seal_execfd(int *fd, int fdtype)
 	switch (fdtype) {
 	switch (fdtype) {
 	case EFD_MEMFD:
 	case EFD_MEMFD:
 		return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
 		return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
-	case EFD_FILE: {
-		/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
-		int newfd;
-		char fdpath[PATH_MAX] = {0};
+	case EFD_FILE:{
+			/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
+			int newfd;
+			char fdpath[PATH_MAX] = { 0 };
 
 
-		if (fchmod(*fd, 0100) < 0)
-			return -1;
+			if (fchmod(*fd, 0100) < 0)
+				return -1;
 
 
-		if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
-			return -1;
+			if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
+				return -1;
 
 
-		newfd = open(fdpath, O_PATH | O_CLOEXEC);
-		if (newfd < 0)
-			return -1;
+			newfd = open(fdpath, O_PATH | O_CLOEXEC);
+			if (newfd < 0)
+				return -1;
 
 
-		close(*fd);
-		*fd = newfd;
-		return 0;
-	}
+			close(*fd);
+			*fd = newfd;
+			return 0;
+		}
 	default:
 	default:
-	   break;
+		break;
 	}
 	}
 	return -1;
 	return -1;
 }
 }
@@ -376,7 +399,7 @@ static int seal_execfd(int *fd, int fdtype)
 static int try_bindfd(void)
 static int try_bindfd(void)
 {
 {
 	int fd, ret = -1;
 	int fd, ret = -1;
-	char template[PATH_MAX] = {0};
+	char template[PATH_MAX] = { 0 };
 	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
 	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
 
 
 	if (!prefix || *prefix != '/')
 	if (!prefix || *prefix != '/')
@@ -404,7 +427,6 @@ static int try_bindfd(void)
 	if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
 	if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
 		goto out_umount;
 		goto out_umount;
 
 
-
 	/* Get read-only handle that we're sure can't be made read-write. */
 	/* Get read-only handle that we're sure can't be made read-write. */
 	ret = open(template, O_PATH | O_CLOEXEC);
 	ret = open(template, O_PATH | O_CLOEXEC);
 
 
@@ -448,7 +470,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
 			if (n < 0)
 			if (n < 0)
 				return -1;
 				return -1;
 			nwritten += n;
 			nwritten += n;
-		} while(nwritten < nread);
+		} while (nwritten < nread);
 
 
 		total += nwritten;
 		total += nwritten;
 	}
 	}
@@ -459,7 +481,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
 static int clone_binary(void)
 static int clone_binary(void)
 {
 {
 	int binfd, execfd;
 	int binfd, execfd;
-	struct stat statbuf = {};
+	struct stat statbuf = { };
 	size_t sent = 0;
 	size_t sent = 0;
 	int fdtype = EFD_NONE;
 	int fdtype = EFD_NONE;
 
 

+ 142 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c

@@ -0,0 +1,142 @@
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef ESCAPE_TEST
+#  include <assert.h>
+#  define test_assert(arg) assert(arg)
+#else
+#  define test_assert(arg)
+#endif
+
+#define DEL '\x7f'
+
+/*
+ * Poor man version of itoa with base=16 and input number from 0 to 15,
+ * represented by a char. Converts it to a single hex digit ('0' to 'f').
+ */
+static char hex(char i)
+{
+	test_assert(i >= 0 && i < 16);
+
+	if (i >= 0 && i < 10) {
+		return '0' + i;
+	}
+	if (i >= 10 && i < 16) {
+		return 'a' + i - 10;
+	}
+	return '?';
+}
+
+/*
+ * Given the character, tells how many _extra_ characters are needed
+ * to JSON-escape it. If 0 is returned, the character does not need to
+ * be escaped.
+ */
+static int need_escape(char c)
+{
+	switch (c) {
+	case '\\':
+	case '"':
+	case '\b':
+	case '\n':
+	case '\r':
+	case '\t':
+	case '\f':
+		return 1;
+	case DEL:		// -> \u007f
+		return 5;
+	default:
+		if (c > 0 && c < ' ') {
+			// ASCII decimal 01 to 31 -> \u00xx
+			return 5;
+		}
+		return 0;
+	}
+}
+
+/*
+ * Escape the string so it can be used as a JSON string (per RFC4627,
+ * section 2.5 minimal requirements, plus the DEL (0x7f) character).
+ *
+ * It is expected that the argument is a string allocated via malloc.
+ * In case no escaping is needed, the original string is returned as is;
+ * otherwise, the original string is free'd, and the newly allocated
+ * escaped string is returned. Thus, in any case, the value returned
+ * need to be free'd by the caller.
+ */
+char *escape_json_string(char *s)
+{
+	int i, j, len;
+	char *c, *out;
+
+	/*
+	 * First, check if escaping is at all needed -- if not, we can avoid
+	 * malloc and return the argument as is.  While at it, count how much
+	 * extra space is required.
+	 *
+	 * XXX: the counting code must be in sync with the escaping code
+	 * (checked by test_assert()s below).
+	 */
+	for (i = j = 0; s[i] != '\0'; i++) {
+		j += need_escape(s[i]);
+	}
+	if (j == 0) {
+		// nothing to escape
+		return s;
+	}
+
+	len = i + j + 1;
+	out = malloc(len);
+	if (!out) {
+		free(s);
+		// As malloc failed, strdup can fail, too, so in the worst case
+		// scenario NULL will be returned from here.
+		return strdup("escape_json_string: out of memory");
+	}
+	for (c = s, j = 0; *c != '\0'; c++) {
+		switch (*c) {
+		case '"':
+		case '\\':
+			test_assert(need_escape(*c) == 1);
+			out[j++] = '\\';
+			out[j++] = *c;
+			continue;
+		}
+		if ((*c < 0 || *c >= ' ') && (*c != DEL)) {
+			// no escape needed
+			test_assert(need_escape(*c) == 0);
+			out[j++] = *c;
+			continue;
+		}
+		out[j++] = '\\';
+		switch (*c) {
+		case '\b':
+			out[j++] = 'b';
+			break;
+		case '\n':
+			out[j++] = 'n';
+			break;
+		case '\r':
+			out[j++] = 'r';
+			break;
+		case '\t':
+			out[j++] = 't';
+			break;
+		case '\f':
+			out[j++] = 'f';
+			break;
+		default:
+			test_assert(need_escape(*c) == 5);
+			out[j++] = 'u';
+			out[j++] = '0';
+			out[j++] = '0';
+			out[j++] = hex(*c >> 4);
+			out[j++] = hex(*c & 0x0f);
+		}
+	}
+	test_assert(j + 1 == len);
+	out[j] = '\0';
+
+	free(s);
+	return out;
+}

+ 222 - 139
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c

@@ -29,6 +29,8 @@
 /* Get all of the CLONE_NEW* flags. */
 /* Get all of the CLONE_NEW* flags. */
 #include "namespace.h"
 #include "namespace.h"
 
 
+extern char *escape_json_string(char *str);
+
 /* Synchronisation values. */
 /* Synchronisation values. */
 enum sync_t {
 enum sync_t {
 	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
 	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
@@ -36,7 +38,7 @@ enum sync_t {
 	SYNC_RECVPID_PLS = 0x42,	/* Tell parent we're sending the PID. */
 	SYNC_RECVPID_PLS = 0x42,	/* Tell parent we're sending the PID. */
 	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
 	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
 	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
 	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
-	SYNC_CHILD_READY = 0x45,	/* The child or grandchild is ready to return. */
+	SYNC_CHILD_FINISH = 0x45,	/* The child or grandchild has finished. */
 };
 };
 
 
 /*
 /*
@@ -45,10 +47,14 @@ enum sync_t {
  */
  */
 #define CREATECGROUPNS 0x80
 #define CREATECGROUPNS 0x80
 
 
+#define STAGE_SETUP  -1
 /* longjmp() arguments. */
 /* longjmp() arguments. */
-#define JUMP_PARENT 0x00
-#define JUMP_CHILD  0xA0
-#define JUMP_INIT   0xA1
+#define STAGE_PARENT  0
+#define STAGE_CHILD   1
+#define STAGE_INIT    2
+
+/* Stores the current stage of nsexec. */
+int current_stage = STAGE_SETUP;
 
 
 /* Assume the stack grows down, so arguments should be above it. */
 /* Assume the stack grows down, so arguments should be above it. */
 struct clone_t {
 struct clone_t {
@@ -56,7 +62,7 @@ struct clone_t {
 	 * Reserve some space for clone() to locate arguments
 	 * Reserve some space for clone() to locate arguments
 	 * and retcode in this place
 	 * and retcode in this place
 	 */
 	 */
-	char stack[4096] __attribute__ ((aligned(16)));
+	char stack[4096] __attribute__((aligned(16)));
 	char stack_ptr[0];
 	char stack_ptr[0];
 
 
 	/* There's two children. This is used to execute the different code. */
 	/* There's two children. This is used to execute the different code. */
@@ -102,31 +108,31 @@ static int logfd = -1;
  * List of netlink message types sent to us as part of bootstrapping the init.
  * List of netlink message types sent to us as part of bootstrapping the init.
  * These constants are defined in libcontainer/message_linux.go.
  * These constants are defined in libcontainer/message_linux.go.
  */
  */
-#define INIT_MSG			62000
+#define INIT_MSG		62000
 #define CLONE_FLAGS_ATTR	27281
 #define CLONE_FLAGS_ATTR	27281
 #define NS_PATHS_ATTR		27282
 #define NS_PATHS_ATTR		27282
-#define UIDMAP_ATTR			27283
-#define GIDMAP_ATTR			27284
+#define UIDMAP_ATTR		27283
+#define GIDMAP_ATTR		27284
 #define SETGROUP_ATTR		27285
 #define SETGROUP_ATTR		27285
 #define OOM_SCORE_ADJ_ATTR	27286
 #define OOM_SCORE_ADJ_ATTR	27286
 #define ROOTLESS_EUID_ATTR	27287
 #define ROOTLESS_EUID_ATTR	27287
-#define UIDMAPPATH_ATTR	    27288
-#define GIDMAPPATH_ATTR	    27289
+#define UIDMAPPATH_ATTR		27288
+#define GIDMAPPATH_ATTR		27289
 
 
 /*
 /*
  * Use the raw syscall for versions of glibc which don't include a function for
  * Use the raw syscall for versions of glibc which don't include a function for
  * it, namely (glibc 2.12).
  * it, namely (glibc 2.12).
  */
  */
 #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
 #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
-#	define _GNU_SOURCE
-#	include "syscall.h"
-#	if !defined(SYS_setns) && defined(__NR_setns)
-#		define SYS_setns __NR_setns
-#	endif
-
-#ifndef SYS_setns
-#	error "setns(2) syscall not supported by glibc version"
-#endif
+#  define _GNU_SOURCE
+#  include "syscall.h"
+#  if !defined(SYS_setns) && defined(__NR_setns)
+#    define SYS_setns __NR_setns
+#  endif
+
+#  ifndef SYS_setns
+#    error "setns(2) syscall not supported by glibc version"
+#  endif
 
 
 int setns(int fd, int nstype)
 int setns(int fd, int nstype)
 {
 {
@@ -134,33 +140,43 @@ int setns(int fd, int nstype)
 }
 }
 #endif
 #endif
 
 
-static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...)
+static void write_log(const char *level, const char *format, ...)
 {
 {
-	char message[1024] = {};
-
+	char *message = NULL, *stage = NULL;
 	va_list args;
 	va_list args;
+	int ret;
 
 
 	if (logfd < 0 || level == NULL)
 	if (logfd < 0 || level == NULL)
-		return;
+		goto out;
 
 
 	va_start(args, format);
 	va_start(args, format);
-	if (vsnprintf(message, sizeof(message), format, args) < 0)
-		goto done;
-
-	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
-done:
+	ret = vasprintf(&message, format, args);
 	va_end(args);
 	va_end(args);
-}
+	if (ret < 0)
+		goto out;
 
 
-#define write_log(level, fmt, ...) \
-	write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__)
+	message = escape_json_string(message);
+
+	if (current_stage == STAGE_SETUP)
+		stage = strdup("nsexec");
+	else
+		ret = asprintf(&stage, "nsexec-%d", current_stage);
+	if (ret < 0)
+		goto out;
+
+	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message);
+
+out:
+	free(message);
+	free(stage);
+}
 
 
 /* XXX: This is ugly. */
 /* XXX: This is ugly. */
 static int syncfd = -1;
 static int syncfd = -1;
 
 
 #define bail(fmt, ...)                                       \
 #define bail(fmt, ...)                                       \
 	do {                                                       \
 	do {                                                       \
-		write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \
+		write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \
 		exit(1);                                                 \
 		exit(1);                                                 \
 	} while(0)
 	} while(0)
 
 
@@ -187,7 +203,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
 		goto out;
 		goto out;
 	}
 	}
 
 
- out:
+out:
 	close(fd);
 	close(fd);
 	return ret;
 	return ret;
 }
 }
@@ -297,9 +313,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
 	if (map == NULL || map_len <= 0)
 	if (map == NULL || map_len <= 0)
 		return;
 		return;
 
 
+	write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
 	if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
 		if (errno != EPERM)
 		if (errno != EPERM)
 			bail("failed to update /proc/%d/uid_map", pid);
 			bail("failed to update /proc/%d/uid_map", pid);
+		write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
 		if (try_mapping_tool(path, pid, map, map_len))
 		if (try_mapping_tool(path, pid, map, map_len))
 			bail("failed to use newuid map on %d", pid);
 			bail("failed to use newuid map on %d", pid);
 	}
 	}
@@ -310,9 +328,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
 	if (map == NULL || map_len <= 0)
 	if (map == NULL || map_len <= 0)
 		return;
 		return;
 
 
+	write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
 	if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
 		if (errno != EPERM)
 		if (errno != EPERM)
 			bail("failed to update /proc/%d/gid_map", pid);
 			bail("failed to update /proc/%d/gid_map", pid);
+		write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
 		if (try_mapping_tool(path, pid, map, map_len))
 		if (try_mapping_tool(path, pid, map, map_len))
 			bail("failed to use newgid map on %d", pid);
 			bail("failed to use newgid map on %d", pid);
 	}
 	}
@@ -323,19 +343,20 @@ static void update_oom_score_adj(char *data, size_t len)
 	if (data == NULL || len <= 0)
 	if (data == NULL || len <= 0)
 		return;
 		return;
 
 
+	write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
 	if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
 	if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
 		bail("failed to update /proc/self/oom_score_adj");
 		bail("failed to update /proc/self/oom_score_adj");
 }
 }
 
 
 /* A dummy function that just jumps to the given jumpval. */
 /* A dummy function that just jumps to the given jumpval. */
-static int child_func(void *arg) __attribute__ ((noinline));
+static int child_func(void *arg) __attribute__((noinline));
 static int child_func(void *arg)
 static int child_func(void *arg)
 {
 {
 	struct clone_t *ca = (struct clone_t *)arg;
 	struct clone_t *ca = (struct clone_t *)arg;
 	longjmp(*ca->env, ca->jmpval);
 	longjmp(*ca->env, ca->jmpval);
 }
 }
 
 
-static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
 static int clone_parent(jmp_buf *env, int jmpval)
 static int clone_parent(jmp_buf *env, int jmpval)
 {
 {
 	struct clone_t ca = {
 	struct clone_t ca = {
@@ -507,7 +528,6 @@ void join_namespaces(char *nslist)
 	char *namespace = strtok_r(nslist, ",", &saveptr);
 	char *namespace = strtok_r(nslist, ",", &saveptr);
 	struct namespace_t {
 	struct namespace_t {
 		int fd;
 		int fd;
-		int ns;
 		char type[PATH_MAX];
 		char type[PATH_MAX];
 		char path[PATH_MAX];
 		char path[PATH_MAX];
 	} *namespaces = NULL;
 	} *namespaces = NULL;
@@ -542,7 +562,7 @@ void join_namespaces(char *nslist)
 			bail("failed to open %s", path);
 			bail("failed to open %s", path);
 
 
 		ns->fd = fd;
 		ns->fd = fd;
-		ns->ns = nsflag(namespace);
+		strncpy(ns->type, namespace, PATH_MAX - 1);
 		strncpy(ns->path, path, PATH_MAX - 1);
 		strncpy(ns->path, path, PATH_MAX - 1);
 		ns->path[PATH_MAX - 1] = '\0';
 		ns->path[PATH_MAX - 1] = '\0';
 	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
 	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
@@ -555,12 +575,14 @@ void join_namespaces(char *nslist)
 	 */
 	 */
 
 
 	for (i = 0; i < num; i++) {
 	for (i = 0; i < num; i++) {
-		struct namespace_t ns = namespaces[i];
+		struct namespace_t *ns = &namespaces[i];
+		int flag = nsflag(ns->type);
 
 
-		if (setns(ns.fd, ns.ns) < 0)
-			bail("failed to setns to %s", ns.path);
+		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
+		if (setns(ns->fd, flag) < 0)
+			bail("failed to setns into %s namespace", ns->type);
 
 
-		close(ns.fd);
+		close(ns->fd);
 	}
 	}
 
 
 	free(namespaces);
 	free(namespaces);
@@ -569,6 +591,14 @@ void join_namespaces(char *nslist)
 /* Defined in cloned_binary.c. */
 /* Defined in cloned_binary.c. */
 extern int ensure_cloned_binary(void);
 extern int ensure_cloned_binary(void);
 
 
+static inline int sane_kill(pid_t pid, int signum)
+{
+	if (pid > 0)
+		return kill(pid, signum);
+	else
+		return 0;
+}
+
 void nsexec(void)
 void nsexec(void)
 {
 {
 	int pipenum;
 	int pipenum;
@@ -598,7 +628,14 @@ void nsexec(void)
 	if (ensure_cloned_binary() < 0)
 	if (ensure_cloned_binary() < 0)
 		bail("could not ensure we are a cloned binary");
 		bail("could not ensure we are a cloned binary");
 
 
-	write_log(DEBUG, "nsexec started");
+	/*
+	 * Inform the parent we're past initial setup.
+	 * For the other side of this, see initWaiter.
+	 */
+	if (write(pipenum, "", 1) != 1)
+		bail("could not inform the parent we are past initial setup");
+
+	write_log(DEBUG, "=> nsexec container setup");
 
 
 	/* Parse all of the netlink configuration. */
 	/* Parse all of the netlink configuration. */
 	nl_parse(pipenum, &config);
 	nl_parse(pipenum, &config);
@@ -622,6 +659,7 @@ void nsexec(void)
 	 * containers), which is the recommendation from the kernel folks.
 	 * containers), which is the recommendation from the kernel folks.
 	 */
 	 */
 	if (config.namespaces) {
 	if (config.namespaces) {
+		write_log(DEBUG, "set process as non-dumpable");
 		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
 		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
 			bail("failed to set process as non-dumpable");
 			bail("failed to set process as non-dumpable");
 	}
 	}
@@ -686,45 +724,49 @@ void nsexec(void)
 	 * -- Aleksa "what has my life come to?" Sarai
 	 * -- Aleksa "what has my life come to?" Sarai
 	 */
 	 */
 
 
-	switch (setjmp(env)) {
+	current_stage = setjmp(env);
+	switch (current_stage) {
 		/*
 		/*
 		 * Stage 0: We're in the parent. Our job is just to create a new child
 		 * Stage 0: We're in the parent. Our job is just to create a new child
-		 *          (stage 1: JUMP_CHILD) process and write its uid_map and
+		 *          (stage 1: STAGE_CHILD) process and write its uid_map and
 		 *          gid_map. That process will go on to create a new process, then
 		 *          gid_map. That process will go on to create a new process, then
 		 *          it will send us its PID which we will send to the bootstrap
 		 *          it will send us its PID which we will send to the bootstrap
 		 *          process.
 		 *          process.
 		 */
 		 */
-	case JUMP_PARENT:{
+	case STAGE_PARENT:{
 			int len;
 			int len;
-			pid_t child, first_child = -1;
-			bool ready = false;
+			pid_t stage1_pid = -1, stage2_pid = -1;
+			bool stage1_complete, stage2_complete;
 
 
 			/* For debugging. */
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
 			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-0");
 
 
 			/* Start the process of getting a container. */
 			/* Start the process of getting a container. */
-			child = clone_parent(&env, JUMP_CHILD);
-			if (child < 0)
-				bail("unable to fork: child_func");
+			write_log(DEBUG, "spawn stage-1");
+			stage1_pid = clone_parent(&env, STAGE_CHILD);
+			if (stage1_pid < 0)
+				bail("unable to spawn stage-1");
 
 
-			/*
-			 * State machine for synchronisation with the children.
-			 *
-			 * Father only return when both child and grandchild are
-			 * ready, so we can receive all possible error codes
-			 * generated by children.
-			 */
 			syncfd = sync_child_pipe[1];
 			syncfd = sync_child_pipe[1];
 			close(sync_child_pipe[0]);
 			close(sync_child_pipe[0]);
 
 
-			while (!ready) {
+			/*
+			 * State machine for synchronisation with the children. We only
+			 * return once both the child and grandchild are ready.
+			 */
+			write_log(DEBUG, "-> stage-1 synchronisation loop");
+			stage1_complete = false;
+			while (!stage1_complete) {
 				enum sync_t s;
 				enum sync_t s;
 
 
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with child: next state");
+					bail("failed to sync with stage-1: next state");
 
 
 				switch (s) {
 				switch (s) {
 				case SYNC_USERMAP_PLS:
 				case SYNC_USERMAP_PLS:
+					write_log(DEBUG, "stage-1 requested userns mappings");
+
 					/*
 					/*
 					 * Enable setgroups(2) if we've been asked to. But we also
 					 * Enable setgroups(2) if we've been asked to. But we also
 					 * have to explicitly disable setgroups(2) if we're
 					 * have to explicitly disable setgroups(2) if we're
@@ -735,70 +777,78 @@ void nsexec(void)
 					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
 					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
 					 * newuidmap/newgidmap shall be used.
 					 * newuidmap/newgidmap shall be used.
 					 */
 					 */
-
 					if (config.is_rootless_euid && !config.is_setgroup)
 					if (config.is_rootless_euid && !config.is_setgroup)
-						update_setgroups(child, SETGROUPS_DENY);
+						update_setgroups(stage1_pid, SETGROUPS_DENY);
 
 
 					/* Set up mappings. */
 					/* Set up mappings. */
-					update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
-					update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
+					update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
+					update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
 
 
 					s = SYNC_USERMAP_ACK;
 					s = SYNC_USERMAP_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						kill(child, SIGKILL);
-						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
 					}
 					}
 					break;
 					break;
-				case SYNC_RECVPID_PLS:{
-						first_child = child;
-
-						/* Get the init_func pid. */
-						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
-							kill(first_child, SIGKILL);
-							bail("failed to sync with child: read(childpid)");
-						}
-
-						/* Send ACK. */
-						s = SYNC_RECVPID_ACK;
-						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-							kill(first_child, SIGKILL);
-							kill(child, SIGKILL);
-							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
-						}
-
-						/* Send the init_func pid back to our parent.
-						 *
-						 * Send the init_func pid and the pid of the first child back to our parent.
-						 * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
-						 * It becomes the responsibility of our parent to reap the first child.
-						 */
-						len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
-						if (len < 0) {
-							kill(child, SIGKILL);
-							bail("unable to generate JSON for child pid");
-						}
+				case SYNC_RECVPID_PLS:
+					write_log(DEBUG, "stage-1 requested pid to be forwarded");
+
+					/* Get the stage-2 pid. */
+					if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: read(stage2_pid)");
+					}
+
+					/* Send ACK. */
+					s = SYNC_RECVPID_ACK;
+					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
+					}
+
+					/*
+					 * Send both the stage-1 and stage-2 pids back to runc.
+					 * runc needs the stage-2 to continue process management,
+					 * but because stage-1 was spawned with CLONE_PARENT we
+					 * cannot reap it within stage-0 and thus we need to ask
+					 * runc to reap the zombie for us.
+					 */
+					write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
+						  stage1_pid, stage2_pid);
+					len =
+					    dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
+						    stage2_pid);
+					if (len < 0) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with runc: write(pid-JSON)");
 					}
 					}
 					break;
 					break;
-				case SYNC_CHILD_READY:
-					ready = true;
+				case SYNC_CHILD_FINISH:
+					write_log(DEBUG, "stage-1 complete");
+					stage1_complete = true;
 					break;
 					break;
 				default:
 				default:
 					bail("unexpected sync value: %u", s);
 					bail("unexpected sync value: %u", s);
 				}
 				}
 			}
 			}
+			write_log(DEBUG, "<- stage-1 synchronisation loop");
 
 
 			/* Now sync with grandchild. */
 			/* Now sync with grandchild. */
-
 			syncfd = sync_grandchild_pipe[1];
 			syncfd = sync_grandchild_pipe[1];
 			close(sync_grandchild_pipe[0]);
 			close(sync_grandchild_pipe[0]);
-
-			ready = false;
-			while (!ready) {
+			write_log(DEBUG, "-> stage-2 synchronisation loop");
+			stage2_complete = false;
+			while (!stage2_complete) {
 				enum sync_t s;
 				enum sync_t s;
 
 
+				write_log(DEBUG, "signalling stage-2 to run");
 				s = SYNC_GRANDCHILD;
 				s = SYNC_GRANDCHILD;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-					kill(child, SIGKILL);
+					sane_kill(stage2_pid, SIGKILL);
 					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
 					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
 				}
 				}
 
 
@@ -806,27 +856,31 @@ void nsexec(void)
 					bail("failed to sync with child: next state");
 					bail("failed to sync with child: next state");
 
 
 				switch (s) {
 				switch (s) {
-				case SYNC_CHILD_READY:
-					ready = true;
+				case SYNC_CHILD_FINISH:
+					write_log(DEBUG, "stage-2 complete");
+					stage2_complete = true;
 					break;
 					break;
 				default:
 				default:
 					bail("unexpected sync value: %u", s);
 					bail("unexpected sync value: %u", s);
 				}
 				}
 			}
 			}
+			write_log(DEBUG, "<- stage-2 synchronisation loop");
+			write_log(DEBUG, "<~ nsexec stage-0");
 			exit(0);
 			exit(0);
 		}
 		}
+		break;
 
 
 		/*
 		/*
 		 * Stage 1: We're in the first child process. Our job is to join any
 		 * Stage 1: We're in the first child process. Our job is to join any
-		 *          provided namespaces in the netlink payload and unshare all
-		 *          of the requested namespaces. If we've been asked to
-		 *          CLONE_NEWUSER, we will ask our parent (stage 0) to set up
-		 *          our user mappings for us. Then, we create a new child
-		 *          (stage 2: JUMP_INIT) for PID namespace. We then send the
-		 *          child's PID to our parent (stage 0).
+		 *          provided namespaces in the netlink payload and unshare all of
+		 *          the requested namespaces. If we've been asked to CLONE_NEWUSER,
+		 *          we will ask our parent (stage 0) to set up our user mappings
+		 *          for us. Then, we create a new child (stage 2: STAGE_INIT) for
+		 *          PID namespace. We then send the child's PID to our parent
+		 *          (stage 0).
 		 */
 		 */
-	case JUMP_CHILD:{
-			pid_t child;
+	case STAGE_CHILD:{
+			pid_t stage2_pid = -1;
 			enum sync_t s;
 			enum sync_t s;
 
 
 			/* We're in a child and thus need to tell the parent if we die. */
 			/* We're in a child and thus need to tell the parent if we die. */
@@ -835,11 +889,12 @@ void nsexec(void)
 
 
 			/* For debugging. */
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
 			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-1");
 
 
 			/*
 			/*
 			 * We need to setns first. We cannot do this earlier (in stage 0)
 			 * We need to setns first. We cannot do this earlier (in stage 0)
 			 * because of the fact that we forked to get here (the PID of
 			 * because of the fact that we forked to get here (the PID of
-			 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
+			 * [stage 2: STAGE_INIT]) would be meaningless). We could send it
 			 * using cmsg(3) but that's just annoying.
 			 * using cmsg(3) but that's just annoying.
 			 */
 			 */
 			if (config.namespaces)
 			if (config.namespaces)
@@ -865,40 +920,50 @@ void nsexec(void)
 			 * problem.
 			 * problem.
 			 */
 			 */
 			if (config.cloneflags & CLONE_NEWUSER) {
 			if (config.cloneflags & CLONE_NEWUSER) {
+				write_log(DEBUG, "unshare user namespace");
 				if (unshare(CLONE_NEWUSER) < 0)
 				if (unshare(CLONE_NEWUSER) < 0)
 					bail("failed to unshare user namespace");
 					bail("failed to unshare user namespace");
 				config.cloneflags &= ~CLONE_NEWUSER;
 				config.cloneflags &= ~CLONE_NEWUSER;
 
 
 				/*
 				/*
-				 * We don't have the privileges to do any mapping here (see the
-				 * clone_parent rant). So signal our parent to hook us up.
+				 * We need to set ourselves as dumpable temporarily so that the
+				 * parent process can write to our procfs files.
 				 */
 				 */
-
-				/* Switching is only necessary if we joined namespaces. */
 				if (config.namespaces) {
 				if (config.namespaces) {
+					write_log(DEBUG, "temporarily set process as dumpable");
 					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
 					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
-						bail("failed to set process as dumpable");
+						bail("failed to temporarily set process as dumpable");
 				}
 				}
+
+				/*
+				 * We don't have the privileges to do any mapping here (see the
+				 * clone_parent rant). So signal stage-0 to do the mapping for
+				 * us.
+				 */
+				write_log(DEBUG, "request stage-0 to map user namespace");
 				s = SYNC_USERMAP_PLS;
 				s = SYNC_USERMAP_PLS;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
 					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
 
 
 				/* ... wait for mapping ... */
 				/* ... wait for mapping ... */
-
+				write_log(DEBUG, "request stage-0 to map user namespace");
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
 					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
 				if (s != SYNC_USERMAP_ACK)
 				if (s != SYNC_USERMAP_ACK)
 					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
 					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
-				/* Switching is only necessary if we joined namespaces. */
+
+				/* Revert temporary re-dumpable setting. */
 				if (config.namespaces) {
 				if (config.namespaces) {
+					write_log(DEBUG, "re-set process as non-dumpable");
 					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
 					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
-						bail("failed to set process as dumpable");
+						bail("failed to re-set process as non-dumpable");
 				}
 				}
 
 
 				/* Become root in the namespace proper. */
 				/* Become root in the namespace proper. */
 				if (setresuid(0, 0, 0) < 0)
 				if (setresuid(0, 0, 0) < 0)
 					bail("failed to become root in user namespace");
 					bail("failed to become root in user namespace");
 			}
 			}
+
 			/*
 			/*
 			 * Unshare all of the namespaces. Now, it should be noted that this
 			 * Unshare all of the namespaces. Now, it should be noted that this
 			 * ordering might break in the future (especially with rootless
 			 * ordering might break in the future (especially with rootless
@@ -909,8 +974,9 @@ void nsexec(void)
 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
 			 * was broken, so we'll just do it the long way anyway.
 			 * was broken, so we'll just do it the long way anyway.
 			 */
 			 */
+			write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
 			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
 			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
-				bail("failed to unshare namespaces");
+				bail("failed to unshare remaining namespaces (except cgroupns)");
 
 
 			/*
 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
 			 * TODO: What about non-namespace clone flags that we're dropping here?
@@ -921,41 +987,45 @@ void nsexec(void)
 			 * which would break many applications and libraries, so we must fork
 			 * which would break many applications and libraries, so we must fork
 			 * to actually enter the new PID namespace.
 			 * to actually enter the new PID namespace.
 			 */
 			 */
-			child = clone_parent(&env, JUMP_INIT);
-			if (child < 0)
-				bail("unable to fork: init_func");
+			write_log(DEBUG, "spawn stage-2");
+			stage2_pid = clone_parent(&env, STAGE_INIT);
+			if (stage2_pid < 0)
+				bail("unable to spawn stage-2");
 
 
 			/* Send the child to our parent, which knows what it's doing. */
 			/* Send the child to our parent, which knows what it's doing. */
+			write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
 			s = SYNC_RECVPID_PLS;
 			s = SYNC_RECVPID_PLS;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
 				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
 			}
 			}
-			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
-				kill(child, SIGKILL);
-				bail("failed to sync with parent: write(childpid)");
+			if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
+				sane_kill(stage2_pid, SIGKILL);
+				bail("failed to sync with parent: write(stage2_pid)");
 			}
 			}
 
 
 			/* ... wait for parent to get the pid ... */
 			/* ... wait for parent to get the pid ... */
-
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
 				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
 			}
 			}
 			if (s != SYNC_RECVPID_ACK) {
 			if (s != SYNC_RECVPID_ACK) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
 				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
 			}
 			}
 
 
-			s = SYNC_CHILD_READY;
+			write_log(DEBUG, "signal completion to stage-0");
+			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
-				bail("failed to sync with parent: write(SYNC_CHILD_READY)");
+				sane_kill(stage2_pid, SIGKILL);
+				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
 			}
 			}
 
 
-			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
+			/* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
+			write_log(DEBUG, "<~ nsexec stage-1");
 			exit(0);
 		}
+		break;
 
 		/*
 		 * Stage 2: We're the final child process, and the only process that will
@@ -963,7 +1033,7 @@ void nsexec(void)
 		 *          final cleanup steps and then return to the Go runtime to allow
 		 *          init_linux.go to run.
 		 */
-	case JUMP_INIT:{
+	case STAGE_INIT:{
 			/*
 			 * We're inside the child now, having jumped from the
 			 * start_child() code after forking in the parent.
@@ -978,6 +1048,7 @@ void nsexec(void)
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-2");
 
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
@@ -998,21 +1069,30 @@ void nsexec(void)
 					bail("setgroups failed");
 			}
 
-			/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
+			/*
+			 * Wait until our topmost parent has finished cgroup setup in
+			 * p.manager.Apply().
+			 *
+			 * TODO(cyphar): Check if this code is actually needed because we
+			 *               should be in the cgroup even from stage-0, so
+			 *               waiting until now might not make sense.
+			 */
 			if (config.cloneflags & CLONE_NEWCGROUP) {
 				uint8_t value;
 				if (read(pipenum, &value, sizeof(value)) != sizeof(value))
 					bail("read synchronisation value failed");
 				if (value == CREATECGROUPNS) {
+					write_log(DEBUG, "unshare cgroup namespace");
 					if (unshare(CLONE_NEWCGROUP) < 0)
 						bail("failed to unshare cgroup namespace");
 				} else
 					bail("received unknown synchronisation value");
 			}
 
-			s = SYNC_CHILD_READY;
+			write_log(DEBUG, "signal completion to stage-0");
+			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
-				bail("failed to sync with patent: write(SYNC_CHILD_READY)");
+				bail("failed to sync with patent: write(SYNC_CHILD_FINISH)");
 
 			/* Close sync pipes. */
 			close(sync_grandchild_pipe[0]);
@@ -1021,10 +1101,13 @@ void nsexec(void)
 			nl_free(&config);
 
 			/* Finish executing, let the Go runtime take over. */
+			write_log(DEBUG, "<= nsexec container setup");
+			write_log(DEBUG, "booting up go runtime ...");
 			return;
 		}
+		break;
 	default:
-		bail("unexpected jump value");
+		bail("unknown stage '%d' for jump value", current_stage);
 	}
 
 	/* Should never be reached. */
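The stage-2 hunk above unshares the cgroup namespace only after the parent signals it with a one-byte value over the sync pipe. As a rough Go-side illustration of that same pair of steps (not how runc itself does it; runc keeps this in C before the Go runtime starts, and the sentinel value used here is an assumption for the sketch), using golang.org/x/sys/unix:

package nsexample

import (
	"fmt"
	"io"
	"os"

	"golang.org/x/sys/unix"
)

// createCgroupNS is a stand-in for runc's CREATECGROUPNS sync byte; the
// exact value is an assumption made for this sketch.
const createCgroupNS = 0x80

// maybeUnshareCgroupNS mirrors the C logic above: read one byte from the
// sync pipe and only unshare the cgroup namespace when the parent asked
// for it.
func maybeUnshareCgroupNS(sync *os.File) error {
	buf := make([]byte, 1)
	if _, err := io.ReadFull(sync, buf); err != nil {
		return fmt.Errorf("read synchronisation value: %w", err)
	}
	if buf[0] != createCgroupNS {
		return fmt.Errorf("received unknown synchronisation value %#x", buf[0])
	}
	return unix.Unshare(unix.CLONE_NEWCGROUP)
}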

+ 1 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c

@@ -0,0 +1 @@
+../escape.c

+ 53 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go

@@ -0,0 +1,53 @@
+package escapetest
+
+// This file is part of escape_json_string unit test.
+// It is in a separate package so cgo can be used together
+// with go test.
+
+// #include <stdlib.h>
+// extern char *escape_json_string(char *str);
+// #cgo CFLAGS: -DESCAPE_TEST=1
+import "C"
+
+import (
+	"testing"
+	"unsafe"
+)
+
+func testEscapeJsonString(t *testing.T, input, want string) {
+	in := C.CString(input)
+	out := C.escape_json_string(in)
+	got := C.GoString(out)
+	C.free(unsafe.Pointer(out))
+	t.Logf("input: %q, output: %q", input, got)
+	if got != want {
+		t.Errorf("Failed on input: %q, want %q, got %q", input, want, got)
+	}
+}
+
+func testEscapeJson(t *testing.T) {
+	testCases := []struct {
+		input, output string
+	}{
+		{"", ""},
+		{"abcdef", "abcdef"},
+		{`\\\\\\`, `\\\\\\\\\\\\`},
+		{`with"quote`, `with\"quote`},
+		{"\n\r\b\t\f\\", `\n\r\b\t\f\\`},
+		{"\007", "\\u0007"},
+		{"\017 \020 \037", "\\u000f \\u0010 \\u001f"},
+		{"\033", "\\u001b"},
+		{`<->`, `<->`},
+		{"\176\177\200", "~\\u007f\200"},
+		{"\000", ""},
+		{"a\x7fxc", "a\\u007fxc"},
+		{"a\033xc", "a\\u001bxc"},
+		{"a\nxc", "a\\nxc"},
+		{"a\\xc", "a\\\\xc"},
+		{"Barney B\303\244r", "Barney B\303\244r"},
+	}
+
+	for _, tc := range testCases {
+		testEscapeJsonString(t, tc.input, tc.output)
+	}
+}
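The cgo harness above only exercises the C escape_json_string function; as a way of reading the expected outputs in the test table, here is a rough pure-Go sketch of the same escaping rules (a hypothetical helper, not part of the vendored package):

package escapesketch

import (
	"fmt"
	"strings"
)

// escapeJSONString approximates the escaping rules the test table exercises;
// the vendored implementation is the C function escape_json_string in escape.c.
func escapeJSONString(s string) string {
	var b strings.Builder
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c == 0x00:
			// A C string ends at the first NUL byte.
			return b.String()
		case c == '"':
			b.WriteString(`\"`)
		case c == '\\':
			b.WriteString(`\\`)
		case c == '\n':
			b.WriteString(`\n`)
		case c == '\r':
			b.WriteString(`\r`)
		case c == '\b':
			b.WriteString(`\b`)
		case c == '\t':
			b.WriteString(`\t`)
		case c == '\f':
			b.WriteString(`\f`)
		case c < 0x20 || c == 0x7f:
			// Remaining control characters become \u00xx escapes.
			fmt.Fprintf(&b, `\u%04x`, c)
		default:
			// Printable ASCII and raw high bytes (>= 0x80) pass through.
			b.WriteByte(c)
		}
	}
	return b.String()
}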

+ 0 - 41
vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go

@@ -1,41 +0,0 @@
-package user
-
-import (
-	"errors"
-)
-
-var (
-	// The current operating system does not provide the required data for user lookups.
-	ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
-	// No matching entries found in file.
-	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
-	ErrNoGroupEntries  = errors.New("no matching entries in group file")
-)
-
-// LookupUser looks up a user by their username in /etc/passwd. If the user
-// cannot be found (or there is no /etc/passwd file on the filesystem), then
-// LookupUser returns an error.
-func LookupUser(username string) (User, error) {
-	return lookupUser(username)
-}
-
-// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
-// be found (or there is no /etc/passwd file on the filesystem), then LookupId
-// returns an error.
-func LookupUid(uid int) (User, error) {
-	return lookupUid(uid)
-}
-
-// LookupGroup looks up a group by its name in /etc/group. If the group cannot
-// be found (or there is no /etc/group file on the filesystem), then LookupGroup
-// returns an error.
-func LookupGroup(groupname string) (Group, error) {
-	return lookupGroup(groupname)
-}
-
-// LookupGid looks up a group by its group id in /etc/group. If the group cannot
-// be found (or there is no /etc/group file on the filesystem), then LookupGid
-// returns an error.
-func LookupGid(gid int) (Group, error) {
-	return lookupGid(gid)
-}

+ 16 - 4
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go

@@ -16,13 +16,19 @@ const (
 	unixGroupPath  = "/etc/group"
 )
 
-func lookupUser(username string) (User, error) {
+// LookupUser looks up a user by their username in /etc/passwd. If the user
+// cannot be found (or there is no /etc/passwd file on the filesystem), then
+// LookupUser returns an error.
+func LookupUser(username string) (User, error) {
 	return lookupUserFunc(func(u User) bool {
 		return u.Name == username
 	})
 }
 
-func lookupUid(uid int) (User, error) {
+// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
+// be found (or there is no /etc/passwd file on the filesystem), then LookupId
+// returns an error.
+func LookupUid(uid int) (User, error) {
 	return lookupUserFunc(func(u User) bool {
 		return u.Uid == uid
 	})
@@ -51,13 +57,19 @@ func lookupUserFunc(filter func(u User) bool) (User, error) {
 	return users[0], nil
 }
 
-func lookupGroup(groupname string) (Group, error) {
+// LookupGroup looks up a group by its name in /etc/group. If the group cannot
+// be found (or there is no /etc/group file on the filesystem), then LookupGroup
+// returns an error.
+func LookupGroup(groupname string) (Group, error) {
 	return lookupGroupFunc(func(g Group) bool {
 		return g.Name == groupname
 	})
 }
 
-func lookupGid(gid int) (Group, error) {
+// LookupGid looks up a group by its group id in /etc/group. If the group cannot
+// be found (or there is no /etc/group file on the filesystem), then LookupGid
+// returns an error.
+func LookupGid(gid int) (Group, error) {
 	return lookupGroupFunc(func(g Group) bool {
 		return g.Gid == gid
 	})
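Now that the lookup helpers are exported directly from the Unix implementation, a minimal usage sketch (usage example only, assuming readable /etc/passwd and /etc/group) might look like:

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/user"
)

func main() {
	// Both helpers read the standard files (/etc/passwd, /etc/group) directly.
	u, err := user.LookupUser("root")
	if err != nil {
		fmt.Println("lookup user:", err)
		return
	}
	g, err := user.LookupGid(u.Gid)
	if err != nil {
		fmt.Println("lookup group:", err)
		return
	}
	fmt.Printf("%s: uid=%d gid=%d (%s) home=%s\n", u.Name, u.Uid, u.Gid, g.Name, u.Home)
}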

+ 0 - 40
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go

@@ -1,40 +0,0 @@
-// +build windows
-
-package user
-
-import (
-	"fmt"
-	"os/user"
-)
-
-func lookupUser(username string) (User, error) {
-	u, err := user.Lookup(username)
-	if err != nil {
-		return User{}, err
-	}
-	return userFromOS(u)
-}
-
-func lookupUid(uid int) (User, error) {
-	u, err := user.LookupId(fmt.Sprintf("%d", uid))
-	if err != nil {
-		return User{}, err
-	}
-	return userFromOS(u)
-}
-
-func lookupGroup(groupname string) (Group, error) {
-	g, err := user.LookupGroup(groupname)
-	if err != nil {
-		return Group{}, err
-	}
-	return groupFromOS(g)
-}
-
-func lookupGid(gid int) (Group, error) {
-	g, err := user.LookupGroupId(fmt.Sprintf("%d", gid))
-	if err != nil {
-		return Group{}, err
-	}
-	return groupFromOS(g)
-}

+ 10 - 42
vendor/github.com/opencontainers/runc/libcontainer/user/user.go

@@ -2,10 +2,10 @@ package user
 
 import (
 	"bufio"
+	"errors"
 	"fmt"
 	"io"
 	"os"
-	"os/user"
 	"strconv"
 	"strings"
 )
@@ -16,6 +16,13 @@ const (
 )
 
 var (
+	// The current operating system does not provide the required data for user lookups.
+	ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
+
+	// No matching entries found in file.
+	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
+	ErrNoGroupEntries  = errors.New("no matching entries in group file")
+
 	ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
 )
 
@@ -29,28 +36,6 @@ type User struct {
 	Shell string
 }
 
-// userFromOS converts an os/user.(*User) to local User
-//
-// (This does not include Pass, Shell or Gecos)
-func userFromOS(u *user.User) (User, error) {
-	newUser := User{
-		Name: u.Username,
-		Home: u.HomeDir,
-	}
-	id, err := strconv.Atoi(u.Uid)
-	if err != nil {
-		return newUser, err
-	}
-	newUser.Uid = id
-
-	id, err = strconv.Atoi(u.Gid)
-	if err != nil {
-		return newUser, err
-	}
-	newUser.Gid = id
-	return newUser, nil
-}
-
 type Group struct {
 	Name string
 	Pass string
@@ -58,23 +43,6 @@ type Group struct {
 	List []string
 }
 
-// groupFromOS converts an os/user.(*Group) to local Group
-//
-// (This does not include Pass or List)
-func groupFromOS(g *user.Group) (Group, error) {
-	newGroup := Group{
-		Name: g.Name,
-	}
-
-	id, err := strconv.Atoi(g.Gid)
-	if err != nil {
-		return newGroup, err
-	}
-	newGroup.Gid = id
-
-	return newGroup, nil
-}
-
 // SubID represents an entry in /etc/sub{u,g}id
 type SubID struct {
 	Name  string
@@ -466,7 +434,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
 		// we asked for a group but didn't find it. let's check to see
 		// if we wanted a numeric group
 		if !found {
-			gid, err := strconv.Atoi(ag)
+			gid, err := strconv.ParseInt(ag, 10, 64)
 			if err != nil {
 				return nil, fmt.Errorf("Unable to find group %s", ag)
 			}
@@ -474,7 +442,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
 			if gid < minId || gid > maxId {
 				return nil, ErrRange
 			}
-			gidMap[gid] = struct{}{}
+			gidMap[int(gid)] = struct{}{}
 		}
 	}
 	gids := []int{}
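The GetAdditionalGroups change swaps strconv.Atoi for strconv.ParseInt on the numeric fallback path. A small hedged usage sketch, with a made-up in-memory group file, exercises both the named and the numeric paths:

package main

import (
	"fmt"
	"strings"

	"github.com/opencontainers/runc/libcontainer/user"
)

func main() {
	// A tiny fake /etc/group; "video" resolves by name, while "99999" is not
	// present and falls back to the numeric path shown in the hunk above.
	groupFile := strings.NewReader("video:x:44:alice\naudio:x:29:alice\n")
	gids, err := user.GetAdditionalGroups([]string{"video", "99999"}, groupFile)
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println(gids) // typically [44 99999]; order is not guaranteed
}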

+ 42 - 0
vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go

@@ -0,0 +1,42 @@
+// +build gofuzz
+
+package user
+
+import (
+	"io"
+	"strings"
+)
+
+func IsDivisbleBy(n int, divisibleby int) bool {
+	return (n % divisibleby) == 0
+}
+
+func FuzzUser(data []byte) int {
+	if len(data) == 0 {
+		return -1
+	}
+	if !IsDivisbleBy(len(data), 5) {
+		return -1
+	}
+
+	var divided [][]byte
+
+	chunkSize := len(data) / 5
+
+	for i := 0; i < len(data); i += chunkSize {
+		end := i + chunkSize
+
+		divided = append(divided, data[i:end])
+	}
+
+	_, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil)
+
+	var passwd, group io.Reader
+
+	group = strings.NewReader(string(divided[1]))
+	_, _ = GetAdditionalGroups([]string{string(divided[2])}, group)
+
+	passwd = strings.NewReader(string(divided[3]))
+	_, _ = GetExecUser(string(divided[4]), nil, passwd, group)
+	return 1
+}

+ 5 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go

@@ -0,0 +1,5 @@
+package userns
+
+// RunningInUserNS detects whether we are currently running in a user namespace.
+// Originally copied from github.com/lxc/lxd/shared/util.go
+var RunningInUserNS = runningInUserNS

+ 15 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go

@@ -0,0 +1,15 @@
+// +build gofuzz
+
+package userns
+
+import (
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/user"
+)
+
+func FuzzUIDMap(data []byte) int {
+	uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
+	_ = uidMapInUserNS(uidmap)
+	return 1
+}

+ 37 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go

@@ -0,0 +1,37 @@
+package userns
+
+import (
+	"sync"
+
+	"github.com/opencontainers/runc/libcontainer/user"
+)
+
+var (
+	inUserNS bool
+	nsOnce   sync.Once
+)
+
+// runningInUserNS detects whether we are currently running in a user namespace.
+// Originally copied from github.com/lxc/lxd/shared/util.go
+func runningInUserNS() bool {
+	nsOnce.Do(func() {
+		uidmap, err := user.CurrentProcessUIDMap()
+		if err != nil {
+			// This kernel-provided file only exists if user namespaces are supported
+			return
+		}
+		inUserNS = uidMapInUserNS(uidmap)
+	})
+	return inUserNS
+}
+
+func uidMapInUserNS(uidmap []user.IDMap) bool {
+	/*
+	 * We assume we are in the initial user namespace if we have a full
+	 * range - 4294967295 uids starting at uid 0.
+	 */
+	if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
+		return false
+	}
+	return true
+}
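RunningInUserNS is the package-level entry point backed by this Linux implementation. A minimal caller sketch (usage example only):

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/userns"
)

func main() {
	// On Linux this parses /proc/self/uid_map once (guarded by sync.Once) and
	// reports whether the mapping differs from the full 0..4294967294 range.
	if userns.RunningInUserNS() {
		fmt.Println("running inside a user namespace")
	} else {
		fmt.Println("running in the initial user namespace")
	}
}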

+ 17 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go

@@ -0,0 +1,17 @@
+// +build !linux
+
+package userns
+
+import "github.com/opencontainers/runc/libcontainer/user"
+
+// runningInUserNS is a stub for non-Linux systems
+// Always returns false
+func runningInUserNS() bool {
+	return false
+}
+
+// uidMapInUserNS is a stub for non-Linux systems
+// Always returns false
+func uidMapInUserNS(uidmap []user.IDMap) bool {
+	return false
+}