
vendor: github.com/opencontainers/runc v1.0.0-rc95

full diff: https://github.com/opencontainers/runc/compare/v1.0.0-rc92...v1.0.0-rc95

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
Sebastiaan van Stijn, 4 years ago
parent
commit
a927fc7831
34 changed files with 1342 additions and 602 deletions
  1. vendor.conf (+1 -1)
  2. vendor/github.com/opencontainers/runc/README.md (+25 -13)
  3. vendor/github.com/opencontainers/runc/go.mod (+16 -14)
  4. vendor/github.com/opencontainers/runc/libcontainer/README.md (+87 -83)
  5. vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go (+23 -13)
  6. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go (+51 -0)
  7. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go (+120 -0)
  8. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go (+122 -0)
  9. vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go (+28 -0)
  10. vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go (+115 -42)
  11. vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go (+41 -59)
  12. vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go (+5 -7)
  13. vendor/github.com/opencontainers/runc/libcontainer/configs/config.go (+15 -10)
  14. vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go (+9 -0)
  15. vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go (+0 -16)
  16. vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go (+0 -5)
  17. vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go (+17 -0)
  18. vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go (+1 -1)
  19. vendor/github.com/opencontainers/runc/libcontainer/devices/device.go (+33 -29)
  20. vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go (+22 -14)
  21. vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c (+51 -29)
  22. vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c (+142 -0)
  23. vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c (+222 -139)
  24. vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c (+1 -0)
  25. vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go (+53 -0)
  26. vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go (+0 -41)
  27. vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go (+16 -4)
  28. vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go (+0 -40)
  29. vendor/github.com/opencontainers/runc/libcontainer/user/user.go (+10 -42)
  30. vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go (+42 -0)
  31. vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go (+5 -0)
  32. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go (+15 -0)
  33. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go (+37 -0)
  34. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go (+17 -0)

+ 1 - 1
vendor.conf

@@ -91,7 +91,7 @@ google.golang.org/grpc                              f495f5b15ae7ccda3b38c53a1bfc
 # the containerd project first, and update both after that is merged.
 # This commit does not need to match RUNC_COMMIT as it is used for helper
 # packages but should be newer or equal.
-github.com/opencontainers/runc                      ff819c7e9184c13b7c2607fe6c30ae19403a7aff # v1.0.0-rc92
+github.com/opencontainers/runc                      b9ee9c6314599f1b4a7f497e1f1f856fe433d3b7 # v1.0.0-rc95
 github.com/opencontainers/runtime-spec              1c3f411f041711bbeecf35ff7e93461ea6789220 # v1.0.3-0.20210326190908-1c3f411f0417
 github.com/opencontainers/image-spec                d60099175f88c47cd379c4738d158884749ed235 # v1.0.1
 github.com/cyphar/filepath-securejoin               a261ee33d7a517f054effbf451841abaafe3e0fd # v0.2.2

+ 25 - 13
vendor/github.com/opencontainers/runc/README.md

@@ -1,9 +1,10 @@
 # runc
 
-[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc)
 [![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
 [![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
 [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
+[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
+[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)
 
 ## Introduction
 
@@ -17,10 +18,6 @@ This means that `runc` 1.0.0 should implement the 1.0 version of the specificati
 
 You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
 
-Currently, the following features are not considered to be production-ready:
-
-* [Support for cgroup v2](./docs/cgroup-v2.md)
-
 ## Security
 
 The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
@@ -64,19 +61,20 @@ sudo make install
 with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`).
 
 To change build tags from the default, set the `BUILDTAGS` variable for make,
-e.g.
+e.g. to disable seccomp:
 
 ```bash
-make BUILDTAGS='seccomp apparmor'
+make BUILDTAGS=""
 ```
 
 | Build Tag | Feature                            | Enabled by default | Dependency |
 |-----------|------------------------------------|--------------------|------------|
 | seccomp   | Syscall filtering                  | yes                | libseccomp |
-| selinux   | selinux process and mount labeling | yes                | <none>     |
-| apparmor  | apparmor profile support           | yes                | <none>     |
-| nokmem    | disable kernel memory accounting   | no                 | <none>     |
 
+The following build tags were used earlier, but are now obsoleted:
+ - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
+ - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
+ - **selinux**  (since runc v1.0.0-rc93 the feature is always enabled)
 
 ### Running the test suite
 
@@ -128,6 +126,14 @@ make verify-dependencies
 
 ## Using runc
 
+Please note that runc is a low level tool not designed with an end user
+in mind. It is mostly employed by other higher level container software.
+
+Therefore, unless there is some specific use case that prevents the use
+of tools like Docker or Podman, it is not recommended to use runc directly.
+
+If you still want to use runc, here's how.
+
 ### Creating an OCI Bundle
 
 In order to use runc you must have your container in the format of an OCI bundle.
@@ -169,7 +175,9 @@ If you used the unmodified `runc spec` template this should give you a `sh` sess
 
 The second way to start a container is using the specs lifecycle operations.
 This gives you more power over how the container is created and managed while it is running.
-This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here.
+This will also launch the container in the background so you will have to edit
+the `config.json` to remove the `terminal` setting for the simple examples
+below (see more details about [runc terminal handling](docs/terminals.md)).
 Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`.
 
 
@@ -292,8 +300,12 @@ PIDFile=/run/mycontainerid.pid
 WantedBy=multi-user.target
 ```
 
-#### cgroup v2
-See [`./docs/cgroup-v2.md`](./docs/cgroup-v2.md).
+## More documentation
+
+* [cgroup v2](./docs/cgroup-v2.md)
+* [Checkpoint and restore](./docs/checkpoint-restore.md)
+* [systemd cgroup driver](./docs/systemd.md)
+* [Terminals and standard IO](./docs/terminals.md)
 
 ## License
 

+ 16 - 14
vendor/github.com/opencontainers/runc/go.mod

@@ -1,26 +1,28 @@
 module github.com/opencontainers/runc
 
-go 1.14
+go 1.13
 
 require (
-	github.com/checkpoint-restore/go-criu/v4 v4.1.0
-	github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775
-	github.com/containerd/console v1.0.0
-	github.com/coreos/go-systemd/v22 v22.1.0
+	github.com/checkpoint-restore/go-criu/v5 v5.0.0
+	github.com/cilium/ebpf v0.5.0
+	github.com/containerd/console v1.0.2
+	github.com/coreos/go-systemd/v22 v22.3.1
 	github.com/cyphar/filepath-securejoin v0.2.2
 	github.com/docker/go-units v0.4.0
-	github.com/godbus/dbus/v5 v5.0.3
-	github.com/golang/protobuf v1.4.2
-	github.com/moby/sys/mountinfo v0.1.3
-	github.com/mrunalp/fileutils v0.0.0-20200520151820-abd8a0e76976
-	github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6
-	github.com/opencontainers/selinux v1.6.0
+	github.com/godbus/dbus/v5 v5.0.4
+	github.com/moby/sys/mountinfo v0.4.1
+	github.com/mrunalp/fileutils v0.5.0
+	github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
+	github.com/opencontainers/selinux v1.8.0
 	github.com/pkg/errors v0.9.1
 	github.com/seccomp/libseccomp-golang v0.9.1
-	github.com/sirupsen/logrus v1.6.0
-	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+	github.com/sirupsen/logrus v1.7.0
+	github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
 	// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
 	github.com/urfave/cli v1.22.1
 	github.com/vishvananda/netlink v1.1.0
-	golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1
+	github.com/willf/bitset v1.1.11
+	golang.org/x/net v0.0.0-20201224014010-6772e930b67b
+	golang.org/x/sys v0.0.0-20210426230700-d19ff857e887
+	google.golang.org/protobuf v1.25.0
 )

+ 87 - 83
vendor/github.com/opencontainers/runc/libcontainer/README.md

@@ -57,90 +57,94 @@ struct describing how the container is to be created. A sample would look simila
 
 ```go
 defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+var devices []*configs.DeviceRule
+for _, device := range specconv.AllowedDevices {
+	devices = append(devices, &device.Rule)
+}
 config := &configs.Config{
 	Rootfs: "/your/path/to/rootfs",
 	Capabilities: &configs.Capabilities{
-                Bounding: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Effective: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Inheritable: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Permitted: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Ambient: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-        },
+		Bounding: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Effective: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Inheritable: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Permitted: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Ambient: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+	},
 	Namespaces: configs.Namespaces([]configs.Namespace{
 		{Type: configs.NEWNS},
 		{Type: configs.NEWUTS},
@@ -155,7 +159,7 @@ config := &configs.Config{
 		Parent: "system",
 		Resources: &configs.Resources{
 			MemorySwappiness: nil,
-			Devices:          specconv.AllowedDevices,
+			Devices:          devices,
 		},
 	},
 	MaskPaths: []string{
@@ -313,7 +317,7 @@ state, err := container.State()
 #### Checkpoint & Restore
 
 libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
-This let's you save the state of a process running inside a container to disk, and then restore
+This lets you save the state of a process running inside a container to disk, and then restore
 that state into a new process, on the same machine or on another machine.
 
 `criu` version 1.5.2 or higher is required to use checkpoint and restore.

+ 23 - 13
vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go

@@ -7,37 +7,44 @@ import (
 )
 
 type Manager interface {
-	// Applies cgroup configuration to the process with the specified pid
+	// Apply creates a cgroup, if not yet created, and adds a process
+	// with the specified pid into that cgroup.  A special value of -1
+	// can be used to merely create a cgroup.
 	Apply(pid int) error
 
-	// Returns the PIDs inside the cgroup set
+	// GetPids returns the PIDs of all processes inside the cgroup.
 	GetPids() ([]int, error)
 
-	// Returns the PIDs inside the cgroup set & all sub-cgroups
+	// GetAllPids returns the PIDs of all processes inside the cgroup
+	// any all its sub-cgroups.
 	GetAllPids() ([]int, error)
 
-	// Returns statistics for the cgroup set
+	// GetStats returns cgroups statistics.
 	GetStats() (*Stats, error)
 
-	// Toggles the freezer cgroup according with specified state
+	// Freeze sets the freezer cgroup to the specified state.
 	Freeze(state configs.FreezerState) error
 
-	// Destroys the cgroup set
+	// Destroy removes cgroup.
 	Destroy() error
 
 	// Path returns a cgroup path to the specified controller/subsystem.
 	// For cgroupv2, the argument is unused and can be empty.
 	Path(string) string
 
-	// Sets the cgroup as configured.
-	Set(container *configs.Config) error
+	// Set sets cgroup resources parameters/limits. If the argument is nil,
+	// the resources specified during Manager creation (or the previous call
+	// to Set) are used.
+	Set(r *configs.Resources) error
 
-	// GetPaths returns cgroup path(s) to save in a state file in order to restore later.
+	// GetPaths returns cgroup path(s) to save in a state file in order to
+	// restore later.
 	//
-	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
-	// to the cgroup for this subsystem.
+	// For cgroup v1, a key is cgroup subsystem name, and the value is the
+	// path to the cgroup for this subsystem.
 	//
-	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
+	// For cgroup v2 unified hierarchy, a key is "", and the value is the
+	// unified path.
 	GetPaths() map[string]string
 
 	// GetCgroups returns the cgroup data as configured.
@@ -46,6 +53,9 @@ type Manager interface {
 	// GetFreezerState retrieves the current FreezerState of the cgroup.
 	GetFreezerState() (configs.FreezerState, error)
 
-	// Whether the cgroup path exists or not
+	// Exists returns whether the cgroup path exists or not.
 	Exists() bool
+
+	// OOMKillCount reports OOM kill count for the cgroup.
+	OOMKillCount() (uint64, error)
 }
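
The interface change above replaces `Set(container *configs.Config)` with `Set(r *configs.Resources)`. A minimal sketch of a caller adapting to the new signature, assuming a `cgroups.Manager` obtained elsewhere:

```go
package example

import (
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/configs"
)

// updateMemoryLimit applies a new memory limit; with v1.0.0-rc95 the
// argument to Set is *configs.Resources rather than the whole *configs.Config.
func updateMemoryLimit(m cgroups.Manager, limit int64) error {
	r := &configs.Resources{
		Memory: limit, // memory limit in bytes
	}
	return m.Set(r)
}
```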

+ 51 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go

@@ -0,0 +1,51 @@
+// +build linux
+
+package fscommon
+
+import (
+	"bytes"
+	"os"
+
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+// WriteFile writes data to a cgroup file in dir.
+// It is supposed to be used for cgroup files only.
+func WriteFile(dir, file, data string) error {
+	fd, err := OpenFile(dir, file, unix.O_WRONLY)
+	if err != nil {
+		return err
+	}
+	defer fd.Close()
+	if err := retryingWriteFile(fd, data); err != nil {
+		return errors.Wrapf(err, "failed to write %q", data)
+	}
+	return nil
+}
+
+// ReadFile reads data from a cgroup file in dir.
+// It is supposed to be used for cgroup files only.
+func ReadFile(dir, file string) (string, error) {
+	fd, err := OpenFile(dir, file, unix.O_RDONLY)
+	if err != nil {
+		return "", err
+	}
+	defer fd.Close()
+	var buf bytes.Buffer
+
+	_, err = buf.ReadFrom(fd)
+	return buf.String(), err
+}
+
+func retryingWriteFile(fd *os.File, data string) error {
+	for {
+		_, err := fd.Write([]byte(data))
+		if errors.Is(err, unix.EINTR) {
+			logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
+			continue
+		}
+		return err
+	}
+}
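
For illustration, a small sketch of the new `fscommon.ReadFile`/`WriteFile` helpers in use; `pids.max` is just an example control file, and the cgroup directory is assumed to exist:

```go
package example

import (
	"strings"

	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)

// readPidsMax reads the "pids.max" control file from a cgroup directory.
func readPidsMax(dir string) (string, error) {
	val, err := fscommon.ReadFile(dir, "pids.max")
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(val), nil
}

// setPidsMax writes a new limit, retrying transparently on EINTR as
// implemented by retryingWriteFile above.
func setPidsMax(dir, limit string) error {
	return fscommon.WriteFile(dir, "pids.max", limit)
}
```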

+ 120 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go

@@ -0,0 +1,120 @@
+package fscommon
+
+import (
+	"os"
+	"strings"
+	"sync"
+
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+const (
+	cgroupfsDir    = "/sys/fs/cgroup"
+	cgroupfsPrefix = cgroupfsDir + "/"
+)
+
+var (
+	// TestMode is set to true by unit tests that need "fake" cgroupfs.
+	TestMode bool
+
+	cgroupFd     int = -1
+	prepOnce     sync.Once
+	prepErr      error
+	resolveFlags uint64
+)
+
+func prepareOpenat2() error {
+	prepOnce.Do(func() {
+		fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
+			Flags: unix.O_DIRECTORY | unix.O_PATH})
+		if err != nil {
+			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
+			if err != unix.ENOSYS {
+				logrus.Warnf("falling back to securejoin: %s", prepErr)
+			} else {
+				logrus.Debug("openat2 not available, falling back to securejoin")
+			}
+			return
+		}
+		var st unix.Statfs_t
+		if err = unix.Fstatfs(fd, &st); err != nil {
+			prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
+			logrus.Warnf("falling back to securejoin: %s", prepErr)
+			return
+		}
+
+		cgroupFd = fd
+
+		resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
+		if st.Type == unix.CGROUP2_SUPER_MAGIC {
+			// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
+			resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
+		}
+
+	})
+
+	return prepErr
+}
+
+// OpenFile opens a cgroup file in a given dir with given flags.
+// It is supposed to be used for cgroup files only.
+func OpenFile(dir, file string, flags int) (*os.File, error) {
+	if dir == "" {
+		return nil, errors.Errorf("no directory specified for %s", file)
+	}
+	mode := os.FileMode(0)
+	if TestMode && flags&os.O_WRONLY != 0 {
+		// "emulate" cgroup fs for unit tests
+		flags |= os.O_TRUNC | os.O_CREATE
+		mode = 0o600
+	}
+	if prepareOpenat2() != nil {
+		return openFallback(dir, file, flags, mode)
+	}
+	reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
+	if len(reldir) == len(dir) { // non-standard path, old system?
+		return openFallback(dir, file, flags, mode)
+	}
+
+	relname := reldir + "/" + file
+	fd, err := unix.Openat2(cgroupFd, relname,
+		&unix.OpenHow{
+			Resolve: resolveFlags,
+			Flags:   uint64(flags) | unix.O_CLOEXEC,
+			Mode:    uint64(mode),
+		})
+	if err != nil {
+		return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err}
+	}
+
+	return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
+}
+
+var errNotCgroupfs = errors.New("not a cgroup file")
+
+// openFallback is used when openat2(2) is not available. It checks the opened
+// file is on cgroupfs, returning an error otherwise.
+func openFallback(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
+	path := dir + "/" + file
+	fd, err := os.OpenFile(path, flags, mode)
+	if err != nil {
+		return nil, err
+	}
+	if TestMode {
+		return fd, nil
+	}
+	// Check this is a cgroupfs file.
+	var st unix.Statfs_t
+	if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
+		_ = fd.Close()
+		return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
+	}
+	if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
+		_ = fd.Close()
+		return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
+	}
+
+	return fd, nil
+}

+ 122 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go

@@ -0,0 +1,122 @@
+// +build linux
+
+package fscommon
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"strconv"
+	"strings"
+)
+
+var (
+	ErrNotValidFormat = errors.New("line is not a valid key value format")
+)
+
+// ParseUint converts a string to an uint64 integer.
+// Negative values are returned at zero as, due to kernel bugs,
+// some of the memory cgroup stats can be negative.
+func ParseUint(s string, base, bitSize int) (uint64, error) {
+	value, err := strconv.ParseUint(s, base, bitSize)
+	if err != nil {
+		intValue, intErr := strconv.ParseInt(s, base, bitSize)
+		// 1. Handle negative values greater than MinInt64 (and)
+		// 2. Handle negative values lesser than MinInt64
+		if intErr == nil && intValue < 0 {
+			return 0, nil
+		} else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
+			return 0, nil
+		}
+
+		return value, err
+	}
+
+	return value, nil
+}
+
+// ParseKeyValue parses a space-separated "name value" kind of cgroup
+// parameter and returns its key as a string, and its value as uint64
+// (ParseUint is used to convert the value). For example,
+// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
+func ParseKeyValue(t string) (string, uint64, error) {
+	parts := strings.SplitN(t, " ", 3)
+	if len(parts) != 2 {
+		return "", 0, fmt.Errorf("line %q is not in key value format", t)
+	}
+
+	value, err := ParseUint(parts[1], 10, 64)
+	if err != nil {
+		return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
+	}
+
+	return parts[0], value, nil
+}
+
+// GetValueByKey reads a key-value pairs from the specified cgroup file,
+// and returns a value of the specified key. ParseUint is used for value
+// conversion.
+func GetValueByKey(path, file, key string) (uint64, error) {
+	content, err := ReadFile(path, file)
+	if err != nil {
+		return 0, err
+	}
+
+	lines := strings.Split(string(content), "\n")
+	for _, line := range lines {
+		arr := strings.Split(line, " ")
+		if len(arr) == 2 && arr[0] == key {
+			return ParseUint(arr[1], 10, 64)
+		}
+	}
+
+	return 0, nil
+}
+
+// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.
+// If the value read is "max", the math.MaxUint64 is returned.
+func GetCgroupParamUint(path, file string) (uint64, error) {
+	contents, err := GetCgroupParamString(path, file)
+	if err != nil {
+		return 0, err
+	}
+	contents = strings.TrimSpace(contents)
+	if contents == "max" {
+		return math.MaxUint64, nil
+	}
+
+	res, err := ParseUint(contents, 10, 64)
+	if err != nil {
+		return res, fmt.Errorf("unable to parse file %q", path+"/"+file)
+	}
+	return res, nil
+}
+
+// GetCgroupParamInt reads a single int64 value from specified cgroup file.
+// If the value read is "max", the math.MaxInt64 is returned.
+func GetCgroupParamInt(path, file string) (int64, error) {
+	contents, err := ReadFile(path, file)
+	if err != nil {
+		return 0, err
+	}
+	contents = strings.TrimSpace(contents)
+	if contents == "max" {
+		return math.MaxInt64, nil
+	}
+
+	res, err := strconv.ParseInt(contents, 10, 64)
+	if err != nil {
+		return res, fmt.Errorf("unable to parse %q as a int from Cgroup file %q", contents, path+"/"+file)
+	}
+	return res, nil
+}
+
+// GetCgroupParamString reads a string from the specified cgroup file.
+func GetCgroupParamString(path, file string) (string, error) {
+	contents, err := ReadFile(path, file)
+	if err != nil {
+		return "", err
+	}
+
+	return strings.TrimSpace(contents), nil
+}
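
A short sketch of the new parsing helpers; the input strings are made up for illustration:

```go
package example

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)

func parseExamples() {
	// A "key value" line such as "io_service_bytes 1234" splits into its
	// key and a uint64 value.
	key, val, err := fscommon.ParseKeyValue("io_service_bytes 1234")
	if err == nil {
		fmt.Println(key, val) // io_service_bytes 1234
	}

	// ParseUint returns 0 (not an error) for negative input, working
	// around kernel bugs that can produce negative memory stats.
	n, _ := fscommon.ParseUint("-42", 10, 64)
	fmt.Println(n) // 0
}
```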

+ 28 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go

@@ -39,6 +39,33 @@ type CpuStats struct {
 	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
 }
 
+type CPUSetStats struct {
+	// List of the physical numbers of the CPUs on which processes
+	// in that cpuset are allowed to execute
+	CPUs []uint16 `json:"cpus,omitempty"`
+	// cpu_exclusive flag
+	CPUExclusive uint64 `json:"cpu_exclusive"`
+	// List of memory nodes on which processes in that cpuset
+	// are allowed to allocate memory
+	Mems []uint16 `json:"mems,omitempty"`
+	// mem_hardwall flag
+	MemHardwall uint64 `json:"mem_hardwall"`
+	// mem_exclusive flag
+	MemExclusive uint64 `json:"mem_exclusive"`
+	// memory_migrate flag
+	MemoryMigrate uint64 `json:"memory_migrate"`
+	// memory_spread page flag
+	MemorySpreadPage uint64 `json:"memory_spread_page"`
+	// memory_spread slab flag
+	MemorySpreadSlab uint64 `json:"memory_spread_slab"`
+	// memory_pressure
+	MemoryPressure uint64 `json:"memory_pressure"`
+	// sched_load balance flag
+	SchedLoadBalance uint64 `json:"sched_load_balance"`
+	// sched_relax_domain_level
+	SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
+}
+
 type MemoryData struct {
 	Usage    uint64 `json:"usage,omitempty"`
 	MaxUsage uint64 `json:"max_usage,omitempty"`
@@ -121,6 +148,7 @@ type HugetlbStats struct {
 
 type Stats struct {
 	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
+	CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
 	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
 	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
 	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`

+ 115 - 42
vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go

@@ -15,7 +15,9 @@ import (
 	"sync"
 	"time"
 
-	units "github.com/docker/go-units"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/userns"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -29,19 +31,19 @@ var (
 	isUnified     bool
 )
 
-// HugePageSizeUnitList is a list of the units used by the linux kernel when
-// naming the HugePage control files.
-// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
-// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed,
-// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
-var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
-
 // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
 func IsCgroup2UnifiedMode() bool {
 	isUnifiedOnce.Do(func() {
 		var st unix.Statfs_t
-		if err := unix.Statfs(unifiedMountpoint, &st); err != nil {
-			panic("cannot statfs cgroup root")
+		err := unix.Statfs(unifiedMountpoint, &st)
+		if err != nil {
+			if os.IsNotExist(err) && userns.RunningInUserNS() {
+				// ignore the "not found" error if running in userns
+				logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
+				isUnified = false
+				return
+			}
+			panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
 		}
 		isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
 	})
@@ -86,11 +88,11 @@ func GetAllSubsystems() ([]string, error) {
 		// - freezer: implemented in kernel 5.2
 		// We assume these are always available, as it is hard to detect availability.
 		pseudo := []string{"devices", "freezer"}
-		data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
+		data, err := fscommon.ReadFile("/sys/fs/cgroup", "cgroup.controllers")
 		if err != nil {
 			return nil, err
 		}
-		subsystems := append(pseudo, strings.Fields(string(data))...)
+		subsystems := append(pseudo, strings.Fields(data)...)
 		return subsystems, nil
 	}
 	f, err := os.Open("/proc/cgroups")
@@ -207,20 +209,66 @@ func EnterPid(cgroupPaths map[string]string, pid int) error {
 	return nil
 }
 
+func rmdir(path string) error {
+	err := unix.Rmdir(path)
+	if err == nil || err == unix.ENOENT {
+		return nil
+	}
+	return &os.PathError{Op: "rmdir", Path: path, Err: err}
+}
+
+// RemovePath aims to remove cgroup path. It does so recursively,
+// by removing any subdirectories (sub-cgroups) first.
+func RemovePath(path string) error {
+	// try the fast path first
+	if err := rmdir(path); err == nil {
+		return nil
+	}
+
+	infos, err := ioutil.ReadDir(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			err = nil
+		}
+		return err
+	}
+	for _, info := range infos {
+		if info.IsDir() {
+			// We should remove subcgroups dir first
+			if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
+				break
+			}
+		}
+	}
+	if err == nil {
+		err = rmdir(path)
+	}
+	return err
+}
+
 // RemovePaths iterates over the provided paths removing them.
 // We trying to remove all paths five times with increasing delay between tries.
 // If after all there are not removed cgroups - appropriate error will be
 // returned.
 func RemovePaths(paths map[string]string) (err error) {
+	const retries = 5
 	delay := 10 * time.Millisecond
-	for i := 0; i < 5; i++ {
+	for i := 0; i < retries; i++ {
 		if i != 0 {
 			time.Sleep(delay)
 			delay *= 2
 		}
 		for s, p := range paths {
-			os.RemoveAll(p)
-			// TODO: here probably should be logging
+			if err := RemovePath(p); err != nil {
+				// do not log intermediate iterations
+				switch i {
+				case 0:
+					logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
+				case retries - 1:
+					logrus.WithError(err).Error("Failed to remove cgroup")
+				}
+
+			}
 			_, err := os.Stat(p)
 			// We need this strange way of checking cgroups existence because
 			// RemoveAll almost always returns error, even on already removed
@@ -230,6 +278,8 @@ func RemovePaths(paths map[string]string) (err error) {
 			}
 		}
 		if len(paths) == 0 {
+			//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
+			paths = make(map[string]string)
 			return nil
 		}
 	}
@@ -237,27 +287,50 @@ func RemovePaths(paths map[string]string) (err error) {
 }
 
 func GetHugePageSize() ([]string, error) {
-	files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
+	dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
 	if err != nil {
-		return []string{}, err
+		return nil, err
 	}
-	var fileNames []string
-	for _, st := range files {
-		fileNames = append(fileNames, st.Name())
+	files, err := dir.Readdirnames(0)
+	dir.Close()
+	if err != nil {
+		return nil, err
 	}
-	return getHugePageSizeFromFilenames(fileNames)
+
+	return getHugePageSizeFromFilenames(files)
 }
 
 func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
-	var pageSizes []string
-	for _, fileName := range fileNames {
-		nameArray := strings.Split(fileName, "-")
-		pageSize, err := units.RAMInBytes(nameArray[1])
+	pageSizes := make([]string, 0, len(fileNames))
+
+	for _, file := range fileNames {
+		// example: hugepages-1048576kB
+		val := strings.TrimPrefix(file, "hugepages-")
+		if len(val) == len(file) {
+			// unexpected file name: no prefix found
+			continue
+		}
+		// The suffix is always "kB" (as of Linux 5.9)
+		eLen := len(val) - 2
+		val = strings.TrimSuffix(val, "kB")
+		if len(val) != eLen {
+			logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file)
+			continue
+		}
+		size, err := strconv.Atoi(val)
 		if err != nil {
-			return []string{}, err
+			return nil, err
 		}
-		sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
-		pageSizes = append(pageSizes, sizeString)
+		// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
+		// but in our case the size is in KB already.
+		if size >= (1 << 20) {
+			val = strconv.Itoa(size>>20) + "GB"
+		} else if size >= (1 << 10) {
+			val = strconv.Itoa(size>>10) + "MB"
+		} else {
+			val += "KB"
+		}
+		pageSizes = append(pageSizes, val)
 	}
 
 	return pageSizes, nil
@@ -303,14 +376,14 @@ func WriteCgroupProc(dir string, pid int) error {
 		return nil
 	}
 
-	cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
+	file, err := fscommon.OpenFile(dir, CgroupProcesses, os.O_WRONLY)
 	if err != nil {
 		return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
 	}
-	defer cgroupProcessesFile.Close()
+	defer file.Close()
 
 	for i := 0; i < 5; i++ {
-		_, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
+		_, err = file.WriteString(strconv.Itoa(pid))
 		if err == nil {
 			return nil
 		}
@@ -327,17 +400,6 @@ func WriteCgroupProc(dir string, pid int) error {
 	return err
 }
 
-// Since the OCI spec is designed for cgroup v1, in some cases
-// there is need to convert from the cgroup v1 configuration to cgroup v2
-// the formula for BlkIOWeight is y = (1 + (x - 10) * 9999 / 990)
-// convert linearly from [10-1000] to [1-10000]
-func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 {
-	if blkIoWeight == 0 {
-		return 0
-	}
-	return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
-}
-
 // Since the OCI spec is designed for cgroup v1, in some cases
 // there is need to convert from the cgroup v1 configuration to cgroup v2
 // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
@@ -377,3 +439,14 @@ func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
 
 	return memorySwap - memory, nil
 }
+
+// Since the OCI spec is designed for cgroup v1, in some cases
+// there is need to convert from the cgroup v1 configuration to cgroup v2
+// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
+// convert linearly from [10-1000] to [1-10000]
+func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
+	if blkIoWeight == 0 {
+		return 0
+	}
+	return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
+}
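
For reference, a few sample conversions through `ConvertBlkIOToIOWeightValue` (the renamed `ConvertBlkIOToCgroupV2Value`); the printed values follow directly from the formula in the comment above:

```go
package example

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups"
)

func weightExamples() {
	// y = 1 + (x-10)*9999/990 maps the blkio range [10..1000]
	// onto the cgroup v2 io.weight range [1..10000].
	fmt.Println(cgroups.ConvertBlkIOToIOWeightValue(10))   // 1
	fmt.Println(cgroups.ConvertBlkIOToIOWeightValue(500))  // 4950
	fmt.Println(cgroups.ConvertBlkIOToIOWeightValue(1000)) // 10000
	fmt.Println(cgroups.ConvertBlkIOToIOWeightValue(0))    // 0 (unset)
}
```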

+ 41 - 59
vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go

@@ -1,16 +1,16 @@
 package cgroups
 
 import (
-	"bufio"
 	"errors"
 	"fmt"
-	"io"
 	"os"
 	"path/filepath"
 	"strings"
+	"sync"
 	"syscall"
 
 	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/moby/sys/mountinfo"
 	"golang.org/x/sys/unix"
 )
 
@@ -23,7 +23,12 @@ const (
 )
 
 var (
-	errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
+	errUnified     = errors.New("not implemented for cgroup v2 unified hierarchy")
+	ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
+
+	readMountinfoOnce sync.Once
+	readMountinfoErr  error
+	cgroupMountinfo   []*mountinfo.Info
 )
 
 type NotFoundError struct {
@@ -90,6 +95,21 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
 	return path
 }
 
+// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
+// with fstype of "cgroup") for the current running process.
+//
+// The results are cached (to avoid re-reading mountinfo which is relatively
+// expensive), so it is assumed that cgroup mounts are not being changed.
+func readCgroupMountinfo() ([]*mountinfo.Info, error) {
+	readMountinfoOnce.Do(func() {
+		cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
+			mountinfo.FSTypeFilter("cgroup"),
+		)
+	})
+
+	return cgroupMountinfo, readMountinfoErr
+}
+
 // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
 func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
 	if IsCgroup2UnifiedMode() {
@@ -110,56 +130,28 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string,
 		return "", "", errUnified
 	}
 
-	// Avoid parsing mountinfo by checking if subsystem is valid/available.
-	if !isSubsystemAvailable(subsystem) {
-		return "", "", NewNotFoundError(subsystem)
-	}
-
-	f, err := os.Open("/proc/self/mountinfo")
+	mi, err := readCgroupMountinfo()
 	if err != nil {
 		return "", "", err
 	}
-	defer f.Close()
 
-	return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
+	return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
 }
 
-func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
-	scanner := bufio.NewScanner(reader)
-	for scanner.Scan() {
-		txt := scanner.Text()
-		fields := strings.Fields(txt)
-		if len(fields) < 9 {
-			continue
-		}
-		if strings.HasPrefix(fields[4], cgroupPath) {
-			for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
+	for _, mi := range mounts {
+		if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
+			for _, opt := range strings.Split(mi.VFSOptions, ",") {
 				if opt == subsystem {
-					return fields[4], fields[3], nil
+					return mi.Mountpoint, mi.Root, nil
 				}
 			}
 		}
 	}
-	if err := scanner.Err(); err != nil {
-		return "", "", err
-	}
 
 	return "", "", NewNotFoundError(subsystem)
 }
 
-func isSubsystemAvailable(subsystem string) bool {
-	if IsCgroup2UnifiedMode() {
-		panic("don't call isSubsystemAvailable from cgroupv2 code")
-	}
-
-	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
-	if err != nil {
-		return false
-	}
-	_, avail := cgroups[subsystem]
-	return avail
-}
-
 func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
 	if len(m.Subsystems) == 0 {
 		return "", fmt.Errorf("no subsystem for mount")
@@ -168,25 +160,15 @@ func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
 	return getControllerPath(m.Subsystems[0], cgroups)
 }
 
-func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
+func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
 	res := make([]Mount, 0, len(ss))
-	scanner := bufio.NewScanner(mi)
 	numFound := 0
-	for scanner.Scan() && numFound < len(ss) {
-		txt := scanner.Text()
-		sepIdx := strings.Index(txt, " - ")
-		if sepIdx == -1 {
-			return nil, fmt.Errorf("invalid mountinfo format")
-		}
-		if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
-			continue
-		}
-		fields := strings.Split(txt, " ")
+	for _, mi := range mounts {
 		m := Mount{
-			Mountpoint: fields[4],
-			Root:       fields[3],
+			Mountpoint: mi.Mountpoint,
+			Root:       mi.Root,
 		}
-		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+		for _, opt := range strings.Split(mi.VFSOptions, ",") {
 			seen, known := ss[opt]
 			if !known || (!all && seen) {
 				continue
@@ -199,19 +181,18 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
 		if len(m.Subsystems) > 0 || all {
 			res = append(res, m)
 		}
-	}
-	if err := scanner.Err(); err != nil {
-		return nil, err
+		if !all && numFound >= len(ss) {
+			break
+		}
 	}
 	return res, nil
 }
 
 func getCgroupMountsV1(all bool) ([]Mount, error) {
-	f, err := os.Open("/proc/self/mountinfo")
+	mi, err := readCgroupMountinfo()
 	if err != nil {
 		return nil, err
 	}
-	defer f.Close()
 
 	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
@@ -222,7 +203,8 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
 	for s := range allSubsystems {
 		allMap[s] = false
 	}
-	return getCgroupMountsHelper(allMap, f, all)
+
+	return getCgroupMountsHelper(allMap, mi, all)
 }
 
 // GetOwnCgroup returns the relative path to the cgroup docker is running in.
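
A minimal sketch of the `moby/sys/mountinfo` usage this diff switches to, mirroring `readCgroupMountinfo` above and assuming the library version pinned in go.mod:

```go
package example

import (
	"fmt"

	"github.com/moby/sys/mountinfo"
)

// listCgroupV1Mounts lets the library parse /proc/self/mountinfo and
// filter by filesystem type, instead of hand-scanning the file.
func listCgroupV1Mounts() error {
	mounts, err := mountinfo.GetMounts(mountinfo.FSTypeFilter("cgroup"))
	if err != nil {
		return err
	}
	for _, m := range mounts {
		fmt.Println(m.Mountpoint, m.Root, m.VFSOptions)
	}
	return nil
}
```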

+ 5 - 7
vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go

@@ -2,6 +2,7 @@ package configs
 
 import (
 	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	"github.com/opencontainers/runc/libcontainer/devices"
 )
 
 type FreezerState string
@@ -42,7 +43,7 @@ type Cgroup struct {
 
 type Resources struct {
 	// Devices is the set of access rules for devices in the container.
-	Devices []*DeviceRule `json:"devices"`
+	Devices []*devices.Rule `json:"devices"`
 
 	// Memory limit (in bytes)
 	Memory int64 `json:"memory"`
@@ -53,12 +54,6 @@ type Resources struct {
 	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
 	MemorySwap int64 `json:"memory_swap"`
 
-	// Kernel memory limit (in bytes)
-	KernelMemory int64 `json:"kernel_memory"`
-
-	// Kernel memory limit for TCP use (in bytes)
-	KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
-
 	// CPU shares (relative weight vs. other containers)
 	CpuShares uint64 `json:"cpu_shares"`
 
@@ -127,6 +122,9 @@ type Resources struct {
 	// CpuWeight sets a proportional bandwidth limit.
 	CpuWeight uint64 `json:"cpu_weight"`
 
+	// Unified is cgroupv2-only key-value map.
+	Unified map[string]string `json:"unified"`
+
 	// SkipDevices allows to skip configuring device permissions.
 	// Used by e.g. kubelet while creating a parent cgroup (kubepods)
 	// common for many containers.
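
A hedged sketch of the new `Unified` field in use; `memory.high` is only an illustrative cgroup v2 key, not something this diff prescribes:

```go
package example

import "github.com/opencontainers/runc/libcontainer/configs"

// resourcesWithUnified builds a Resources value that combines a regular
// field with a raw cgroup v2 key/value passed through the Unified map.
func resourcesWithUnified() *configs.Resources {
	return &configs.Resources{
		Memory: 256 * 1024 * 1024, // 256 MiB hard limit
		Unified: map[string]string{
			"memory.high": "200M", // illustrative cgroup v2 control file
		},
	}
}
```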

+ 15 - 10
vendor/github.com/opencontainers/runc/libcontainer/configs/config.go

@@ -7,6 +7,7 @@ import (
 	"os/exec"
 	"time"
 
+	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
@@ -30,9 +31,10 @@ type IDMap struct {
 // for syscalls. Additional architectures can be added by specifying them in
 // Architectures.
 type Seccomp struct {
-	DefaultAction Action     `json:"default_action"`
-	Architectures []string   `json:"architectures"`
-	Syscalls      []*Syscall `json:"syscalls"`
+	DefaultAction   Action     `json:"default_action"`
+	Architectures   []string   `json:"architectures"`
+	Syscalls        []*Syscall `json:"syscalls"`
+	DefaultErrnoRet *uint      `json:"default_errno_ret"`
 }
 
 // Action is taken upon rule match in Seccomp
@@ -92,6 +94,9 @@ type Config struct {
 	// Path to a directory containing the container's root filesystem.
 	Rootfs string `json:"rootfs"`
 
+	// Umask is the umask to use inside of the container.
+	Umask *uint32 `json:"umask"`
+
 	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
 	// bind mounts are writtable.
 	Readonlyfs bool `json:"readonlyfs"`
@@ -104,7 +109,7 @@ type Config struct {
 	Mounts []*Mount `json:"mounts"`
 
 	// The device nodes that should be automatically created within the container upon container start.  Note, make sure that the node is marked as allowed in the cgroup as well!
-	Devices []*Device `json:"devices"`
+	Devices []*devices.Device `json:"devices"`
 
 	MountLabel string `json:"mount_label"`
 
@@ -218,25 +223,25 @@ const (
 	// the runtime environment has been created but before the pivot_root has been executed.
 	// CreateRuntime is called immediately after the deprecated Prestart hook.
 	// CreateRuntime commands are called in the Runtime Namespace.
-	CreateRuntime = "createRuntime"
+	CreateRuntime HookName = "createRuntime"
 
 	// CreateContainer commands MUST be called as part of the create operation after
 	// the runtime environment has been created but before the pivot_root has been executed.
 	// CreateContainer commands are called in the Container namespace.
-	CreateContainer = "createContainer"
+	CreateContainer HookName = "createContainer"
 
 	// StartContainer commands MUST be called as part of the start operation and before
 	// the container process is started.
 	// StartContainer commands are called in the Container namespace.
-	StartContainer = "startContainer"
+	StartContainer HookName = "startContainer"
 
 	// Poststart commands are executed after the container init process starts.
 	// Poststart commands are called in the Runtime Namespace.
-	Poststart = "poststart"
+	Poststart HookName = "poststart"
 
 	// Poststop commands are executed after the container init process exits.
 	// Poststop commands are called in the Runtime Namespace.
-	Poststop = "poststop"
+	Poststop HookName = "poststop"
 )
 
 type Capabilities struct {
@@ -383,7 +388,7 @@ func (c Command) Run(s *specs.State) error {
 		return err
 	case <-timerCh:
 		cmd.Process.Kill()
-		cmd.Wait()
+		<-errC
 		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
 	}
 }

+ 9 - 0
vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go

@@ -0,0 +1,9 @@
+// +build gofuzz
+
+package configs
+
+func FuzzUnmarshalJSON(data []byte) int {
+	hooks := Hooks{}
+	_ = hooks.UnmarshalJSON(data)
+	return 1
+}

+ 0 - 16
vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go

@@ -1,16 +0,0 @@
-// +build !windows
-
-package configs
-
-import (
-	"errors"
-
-	"golang.org/x/sys/unix"
-)
-
-func (d *DeviceRule) Mkdev() (uint64, error) {
-	if d.Major == Wildcard || d.Minor == Wildcard {
-		return 0, errors.New("cannot mkdev() device with wildcards")
-	}
-	return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
-}

+ 0 - 5
vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go

@@ -1,5 +0,0 @@
-package configs
-
-func (d *DeviceRule) Mkdev() (uint64, error) {
-	return 0, nil
-}

+ 17 - 0
vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go

@@ -0,0 +1,17 @@
+package configs
+
+import "github.com/opencontainers/runc/libcontainer/devices"
+
+type (
+	// Deprecated: use libcontainer/devices.Device
+	Device = devices.Device
+
+	// Deprecated: use libcontainer/devices.Rule
+	DeviceRule = devices.Rule
+
+	// Deprecated: use libcontainer/devices.Type
+	DeviceType = devices.Type
+
+	// Deprecated: use libcontainer/devices.Permissions
+	DevicePermissions = devices.Permissions
+)
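
A small sketch of the renamed device types in caller code; the `/dev/null` rule is purely illustrative:

```go
package example

import "github.com/opencontainers/runc/libcontainer/devices"

// allowNullDevice shows the renamed types in use: what used to be
// configs.DeviceRule / configs.CharDevice is now devices.Rule /
// devices.CharDevice, with identical fields.
func allowNullDevice() *devices.Rule {
	return &devices.Rule{
		Type:        devices.CharDevice,
		Major:       1,
		Minor:       3, // /dev/null is char 1:3
		Permissions: "rwm",
		Allow:       true,
	}
}
```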

+ 1 - 1
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go

@@ -56,7 +56,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
 	if nsFile == "" {
 		return false
 	}
-	_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
+	_, err := os.Stat("/proc/self/ns/" + nsFile)
 	// a namespace is supported if it exists and we have permissions to read it
 	supported = err == nil
 	supportedNamespaces[ns] = supported

+ 33 - 29
vendor/github.com/opencontainers/runc/libcontainer/configs/device.go → vendor/github.com/opencontainers/runc/libcontainer/devices/device.go

@@ -1,4 +1,4 @@
-package configs
+package devices
 
 import (
 	"fmt"
@@ -11,7 +11,7 @@ const (
 )
 
 type Device struct {
-	DeviceRule
+	Rule
 
 	// Path to the device.
 	Path string `json:"path"`
@@ -26,10 +26,10 @@ type Device struct {
 	Gid uint32 `json:"gid"`
 }
 
-// DevicePermissions is a cgroupv1-style string to represent device access. It
+// Permissions is a cgroupv1-style string to represent device access. It
 // has to be a string for backward compatibility reasons, hence why it has
 // methods to do set operations.
-type DevicePermissions string
+type Permissions string
 
 const (
 	deviceRead uint = (1 << iota)
@@ -37,7 +37,7 @@ const (
 	deviceMknod
 )
 
-func (p DevicePermissions) toSet() uint {
+func (p Permissions) toSet() uint {
 	var set uint
 	for _, perm := range p {
 		switch perm {
@@ -52,7 +52,7 @@ func (p DevicePermissions) toSet() uint {
 	return set
 }
 
-func fromSet(set uint) DevicePermissions {
+func fromSet(set uint) Permissions {
 	var perm string
 	if set&deviceRead == deviceRead {
 		perm += "r"
@@ -63,53 +63,53 @@ func fromSet(set uint) DevicePermissions {
 	if set&deviceMknod == deviceMknod {
 		perm += "m"
 	}
-	return DevicePermissions(perm)
+	return Permissions(perm)
 }
 
-// Union returns the union of the two sets of DevicePermissions.
-func (p DevicePermissions) Union(o DevicePermissions) DevicePermissions {
+// Union returns the union of the two sets of Permissions.
+func (p Permissions) Union(o Permissions) Permissions {
 	lhs := p.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs | rhs)
 }
 
-// Difference returns the set difference of the two sets of DevicePermissions.
+// Difference returns the set difference of the two sets of Permissions.
 // In set notation, A.Difference(B) gives you A\B.
-func (p DevicePermissions) Difference(o DevicePermissions) DevicePermissions {
+func (p Permissions) Difference(o Permissions) Permissions {
 	lhs := p.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs &^ rhs)
 }
 
-// Intersection computes the intersection of the two sets of DevicePermissions.
-func (p DevicePermissions) Intersection(o DevicePermissions) DevicePermissions {
+// Intersection computes the intersection of the two sets of Permissions.
+func (p Permissions) Intersection(o Permissions) Permissions {
 	lhs := p.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs & rhs)
 }
 
-// IsEmpty returns whether the set of permissions in a DevicePermissions is
+// IsEmpty returns whether the set of permissions in a Permissions is
 // empty.
-func (p DevicePermissions) IsEmpty() bool {
-	return p == DevicePermissions("")
+func (p Permissions) IsEmpty() bool {
+	return p == Permissions("")
 }
 
 // IsValid returns whether the set of permissions is a subset of valid
 // permissions (namely, {r,w,m}).
-func (p DevicePermissions) IsValid() bool {
+func (p Permissions) IsValid() bool {
 	return p == fromSet(p.toSet())
 }
 
-type DeviceType rune
+type Type rune
 
 const (
-	WildcardDevice DeviceType = 'a'
-	BlockDevice    DeviceType = 'b'
-	CharDevice     DeviceType = 'c' // or 'u'
-	FifoDevice     DeviceType = 'p'
+	WildcardDevice Type = 'a'
+	BlockDevice    Type = 'b'
+	CharDevice     Type = 'c' // or 'u'
+	FifoDevice     Type = 'p'
 )
 
-func (t DeviceType) IsValid() bool {
+func (t Type) IsValid() bool {
 	switch t {
 	case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
 		return true
@@ -118,7 +118,7 @@ func (t DeviceType) IsValid() bool {
 	}
 }
 
-func (t DeviceType) CanMknod() bool {
+func (t Type) CanMknod() bool {
 	switch t {
 	case BlockDevice, CharDevice, FifoDevice:
 		return true
@@ -127,7 +127,7 @@ func (t DeviceType) CanMknod() bool {
 	}
 }
 
-func (t DeviceType) CanCgroup() bool {
+func (t Type) CanCgroup() bool {
 	switch t {
 	case WildcardDevice, BlockDevice, CharDevice:
 		return true
@@ -136,10 +136,10 @@ func (t DeviceType) CanCgroup() bool {
 	}
 }
 
-type DeviceRule struct {
+type Rule struct {
 	// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
 	// acts as a wildcard and all fields other than Allow are ignored.
-	Type DeviceType `json:"type"`
+	Type Type `json:"type"`
 
 	// Major is the device's major number.
 	Major int64 `json:"major"`
@@ -149,13 +149,13 @@ type DeviceRule struct {
 
 	// Permissions is the set of permissions that this rule applies to (in the
 	// cgroupv1 format -- any combination of "rwm").
-	Permissions DevicePermissions `json:"permissions"`
+	Permissions Permissions `json:"permissions"`
 
 	// Allow specifies whether this rule is allowed.
 	Allow bool `json:"allow"`
 }
 
-func (d *DeviceRule) CgroupString() string {
+func (d *Rule) CgroupString() string {
 	var (
 		major = strconv.FormatInt(d.Major, 10)
 		minor = strconv.FormatInt(d.Minor, 10)
@@ -168,3 +168,7 @@ func (d *DeviceRule) CgroupString() string {
 	}
 	return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
 }
+
+func (d *Rule) Mkdev() (uint64, error) {
+	return mkDev(d)
+}

+ 22 - 14
vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go → vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go

@@ -1,3 +1,5 @@
+// +build !windows
+
 package devices
 
 import (
@@ -6,7 +8,6 @@ import (
 	"os"
 	"path/filepath"
 
-	"github.com/opencontainers/runc/libcontainer/configs"
 	"golang.org/x/sys/unix"
 )
 
@@ -21,9 +22,16 @@ var (
 	ioutilReadDir = ioutil.ReadDir
 )
 
+func mkDev(d *Rule) (uint64, error) {
+	if d.Major == Wildcard || d.Minor == Wildcard {
+		return 0, errors.New("cannot mkdev() device with wildcards")
+	}
+	return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
+}
+
 // Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the
 // information about a linux device and return that information as a Device struct.
-func DeviceFromPath(path, permissions string) (*configs.Device, error) {
+func DeviceFromPath(path, permissions string) (*Device, error) {
 	var stat unix.Stat_t
 	err := unixLstat(path, &stat)
 	if err != nil {
@@ -31,7 +39,7 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
 	}
 
 	var (
-		devType   configs.DeviceType
+		devType   Type
 		mode      = stat.Mode
 		devNumber = uint64(stat.Rdev)
 		major     = unix.Major(devNumber)
@@ -39,41 +47,41 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
 	)
 	switch mode & unix.S_IFMT {
 	case unix.S_IFBLK:
-		devType = configs.BlockDevice
+		devType = BlockDevice
 	case unix.S_IFCHR:
-		devType = configs.CharDevice
+		devType = CharDevice
 	case unix.S_IFIFO:
-		devType = configs.FifoDevice
+		devType = FifoDevice
 	default:
 		return nil, ErrNotADevice
 	}
-	return &configs.Device{
-		DeviceRule: configs.DeviceRule{
+	return &Device{
+		Rule: Rule{
 			Type:        devType,
 			Major:       int64(major),
 			Minor:       int64(minor),
-			Permissions: configs.DevicePermissions(permissions),
+			Permissions: Permissions(permissions),
 		},
 		Path:     path,
-		FileMode: os.FileMode(mode),
+		FileMode: os.FileMode(mode &^ unix.S_IFMT),
 		Uid:      stat.Uid,
 		Gid:      stat.Gid,
 	}, nil
 }
 
 // HostDevices returns all devices that can be found under /dev directory.
-func HostDevices() ([]*configs.Device, error) {
+func HostDevices() ([]*Device, error) {
 	return GetDevices("/dev")
 }
 
 // GetDevices recursively traverses a directory specified by path
 // and returns all devices found there.
-func GetDevices(path string) ([]*configs.Device, error) {
+func GetDevices(path string) ([]*Device, error) {
 	files, err := ioutilReadDir(path)
 	if err != nil {
 		return nil, err
 	}
-	var out []*configs.Device
+	var out []*Device
 	for _, f := range files {
 		switch {
 		case f.IsDir():
@@ -104,7 +112,7 @@ func GetDevices(path string) ([]*configs.Device, error) {
 			}
 			return nil, err
 		}
-		if device.Type == configs.FifoDevice {
+		if device.Type == FifoDevice {
 			continue
 		}
 		out = append(out, device)
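
Note: a usage sketch (not part of this diff) for the relocated DeviceFromPath helper; the path and permission string are examples.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	// Stat /dev/null and build a *devices.Device from it; "rwm" is the
	// cgroup permission set the resulting rule should carry.
	d, err := devices.DeviceFromPath("/dev/null", "rwm")
	if err != nil {
		panic(err)
	}
	// The embedded Rule provides the cgroup representation directly.
	fmt.Println(d.Path, d.FileMode, d.Rule.CgroupString())
}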

+ 51 - 29
vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c

@@ -59,14 +59,38 @@
 #include <sys/syscall.h>
 
 /* Use our own wrapper for memfd_create. */
-#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
-#  define SYS_memfd_create __NR_memfd_create
+#ifndef SYS_memfd_create
+#  ifdef __NR_memfd_create
+#    define SYS_memfd_create __NR_memfd_create
+#  else
+/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
+#    warning "libc is outdated -- using hard-coded SYS_memfd_create"
+#    if defined(__x86_64__)
+#      define SYS_memfd_create 319
+#    elif defined(__i386__)
+#      define SYS_memfd_create 356
+#    elif defined(__ia64__)
+#      define SYS_memfd_create 1340
+#    elif defined(__arm__)
+#      define SYS_memfd_create 385
+#    elif defined(__aarch64__)
+#      define SYS_memfd_create 279
+#    elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
+#      define SYS_memfd_create 360
+#    elif defined(__s390__) || defined(__s390x__)
+#      define SYS_memfd_create 350
+#    else
+#      warning "unknown architecture -- cannot hard-code SYS_memfd_create"
+#    endif
+#  endif
 #endif
+
 /* memfd_create(2) flags -- copied from <linux/memfd.h>. */
 #ifndef MFD_CLOEXEC
 #  define MFD_CLOEXEC       0x0001U
 #  define MFD_ALLOW_SEALING 0x0002U
 #endif
+
 int memfd_create(const char *name, unsigned int flags)
 {
 #ifdef SYS_memfd_create
@@ -77,7 +101,6 @@ int memfd_create(const char *name, unsigned int flags)
 #endif
 }
 
-
 /* This comes directly from <linux/fcntl.h>. */
 #ifndef F_LINUX_SPECIFIC_BASE
 #  define F_LINUX_SPECIFIC_BASE 1024
@@ -103,7 +126,7 @@ static void *must_realloc(void *ptr, size_t size)
 	void *old = ptr;
 	do {
 		ptr = realloc(old, size);
-	} while(!ptr);
+	} while (!ptr);
 	return ptr;
 }
 
@@ -115,10 +138,10 @@ static void *must_realloc(void *ptr, size_t size)
 static int is_self_cloned(void)
 {
 	int fd, ret, is_cloned = 0;
-	struct stat statbuf = {};
-	struct statfs fsbuf = {};
+	struct stat statbuf = { };
+	struct statfs fsbuf = { };
 
-	fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
+	fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
 	if (fd < 0) {
 		fprintf(stderr, "you have no read access to runc binary file\n");
 		return -ENOTRECOVERABLE;
@@ -274,7 +297,7 @@ enum {
 static int make_execfd(int *fdtype)
 {
 	int fd = -1;
-	char template[PATH_MAX] = {0};
+	char template[PATH_MAX] = { 0 };
 	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
 
 	if (!prefix || *prefix != '/')
@@ -303,7 +326,7 @@ static int make_execfd(int *fdtype)
 	*fdtype = EFD_FILE;
 	fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
 	if (fd >= 0) {
-		struct stat statbuf = {};
+		struct stat statbuf = { };
 		bool working_otmpfile = false;
 
 		/*
@@ -348,27 +371,27 @@ static int seal_execfd(int *fd, int fdtype)
 	switch (fdtype) {
 	case EFD_MEMFD:
 		return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
-	case EFD_FILE: {
-		/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
-		int newfd;
-		char fdpath[PATH_MAX] = {0};
+	case EFD_FILE:{
+			/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
+			int newfd;
+			char fdpath[PATH_MAX] = { 0 };
 
-		if (fchmod(*fd, 0100) < 0)
-			return -1;
+			if (fchmod(*fd, 0100) < 0)
+				return -1;
 
-		if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
-			return -1;
+			if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
+				return -1;
 
-		newfd = open(fdpath, O_PATH | O_CLOEXEC);
-		if (newfd < 0)
-			return -1;
+			newfd = open(fdpath, O_PATH | O_CLOEXEC);
+			if (newfd < 0)
+				return -1;
 
-		close(*fd);
-		*fd = newfd;
-		return 0;
-	}
+			close(*fd);
+			*fd = newfd;
+			return 0;
+		}
 	default:
-	   break;
+		break;
 	}
 	return -1;
 }
@@ -376,7 +399,7 @@ static int seal_execfd(int *fd, int fdtype)
 static int try_bindfd(void)
 {
 	int fd, ret = -1;
-	char template[PATH_MAX] = {0};
+	char template[PATH_MAX] = { 0 };
 	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
 
 	if (!prefix || *prefix != '/')
@@ -404,7 +427,6 @@ static int try_bindfd(void)
 	if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
 		goto out_umount;
 
-
 	/* Get read-only handle that we're sure can't be made read-write. */
 	ret = open(template, O_PATH | O_CLOEXEC);
 
@@ -448,7 +470,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
 			if (n < 0)
 				return -1;
 			nwritten += n;
-		} while(nwritten < nread);
+		} while (nwritten < nread);
 
 		total += nwritten;
 	}
@@ -459,7 +481,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
 static int clone_binary(void)
 {
 	int binfd, execfd;
-	struct stat statbuf = {};
+	struct stat statbuf = { };
 	size_t sent = 0;
 	int fdtype = EFD_NONE;
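
Note: cloned_binary.c copies /proc/self/exe into a memfd (or a fallback file) and seals it. Purely as an illustration of that memfd-and-seals pattern, a Go sketch using golang.org/x/sys/unix; the helper name and error handling are invented for this example.

package main

import (
	"fmt"
	"io"
	"os"

	"golang.org/x/sys/unix"
)

// sealedSelfCopy copies /proc/self/exe into an anonymous memfd and adds
// write/resize seals similar to what cloned_binary.c applies, so the copy
// can no longer be grown, shrunk, or written to.
func sealedSelfCopy() (*os.File, error) {
	fd, err := unix.MemfdCreate("runc-clone", unix.MFD_CLOEXEC|unix.MFD_ALLOW_SEALING)
	if err != nil {
		return nil, err
	}
	dst := os.NewFile(uintptr(fd), "memfd:runc-clone")

	src, err := os.Open("/proc/self/exe")
	if err != nil {
		dst.Close()
		return nil, err
	}
	defer src.Close()

	if _, err := io.Copy(dst, src); err != nil {
		dst.Close()
		return nil, err
	}
	seals := unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE
	if _, err := unix.FcntlInt(dst.Fd(), unix.F_ADD_SEALS, seals); err != nil {
		dst.Close()
		return nil, err
	}
	return dst, nil
}

func main() {
	f, err := sealedSelfCopy()
	if err != nil {
		panic(err)
	}
	defer f.Close()
	fmt.Printf("sealed copy available as /proc/self/fd/%d\n", f.Fd())
}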
 

+ 142 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c

@@ -0,0 +1,142 @@
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef ESCAPE_TEST
+#  include <assert.h>
+#  define test_assert(arg) assert(arg)
+#else
+#  define test_assert(arg)
+#endif
+
+#define DEL '\x7f'
+
+/*
+ * Poor man version of itoa with base=16 and input number from 0 to 15,
+ * represented by a char. Converts it to a single hex digit ('0' to 'f').
+ */
+static char hex(char i)
+{
+	test_assert(i >= 0 && i < 16);
+
+	if (i >= 0 && i < 10) {
+		return '0' + i;
+	}
+	if (i >= 10 && i < 16) {
+		return 'a' + i - 10;
+	}
+	return '?';
+}
+
+/*
+ * Given the character, tells how many _extra_ characters are needed
+ * to JSON-escape it. If 0 is returned, the character does not need to
+ * be escaped.
+ */
+static int need_escape(char c)
+{
+	switch (c) {
+	case '\\':
+	case '"':
+	case '\b':
+	case '\n':
+	case '\r':
+	case '\t':
+	case '\f':
+		return 1;
+	case DEL:		// -> \u007f
+		return 5;
+	default:
+		if (c > 0 && c < ' ') {
+			// ASCII decimal 01 to 31 -> \u00xx
+			return 5;
+		}
+		return 0;
+	}
+}
+
+/*
+ * Escape the string so it can be used as a JSON string (per RFC4627,
+ * section 2.5 minimal requirements, plus the DEL (0x7f) character).
+ *
+ * It is expected that the argument is a string allocated via malloc.
+ * In case no escaping is needed, the original string is returned as is;
+ * otherwise, the original string is free'd, and the newly allocated
+ * escaped string is returned. Thus, in any case, the value returned
+ * need to be free'd by the caller.
+ */
+char *escape_json_string(char *s)
+{
+	int i, j, len;
+	char *c, *out;
+
+	/*
+	 * First, check if escaping is at all needed -- if not, we can avoid
+	 * malloc and return the argument as is.  While at it, count how much
+	 * extra space is required.
+	 *
+	 * XXX: the counting code must be in sync with the escaping code
+	 * (checked by test_assert()s below).
+	 */
+	for (i = j = 0; s[i] != '\0'; i++) {
+		j += need_escape(s[i]);
+	}
+	if (j == 0) {
+		// nothing to escape
+		return s;
+	}
+
+	len = i + j + 1;
+	out = malloc(len);
+	if (!out) {
+		free(s);
+		// As malloc failed, strdup can fail, too, so in the worst case
+		// scenario NULL will be returned from here.
+		return strdup("escape_json_string: out of memory");
+	}
+	for (c = s, j = 0; *c != '\0'; c++) {
+		switch (*c) {
+		case '"':
+		case '\\':
+			test_assert(need_escape(*c) == 1);
+			out[j++] = '\\';
+			out[j++] = *c;
+			continue;
+		}
+		if ((*c < 0 || *c >= ' ') && (*c != DEL)) {
+			// no escape needed
+			test_assert(need_escape(*c) == 0);
+			out[j++] = *c;
+			continue;
+		}
+		out[j++] = '\\';
+		switch (*c) {
+		case '\b':
+			out[j++] = 'b';
+			break;
+		case '\n':
+			out[j++] = 'n';
+			break;
+		case '\r':
+			out[j++] = 'r';
+			break;
+		case '\t':
+			out[j++] = 't';
+			break;
+		case '\f':
+			out[j++] = 'f';
+			break;
+		default:
+			test_assert(need_escape(*c) == 5);
+			out[j++] = 'u';
+			out[j++] = '0';
+			out[j++] = '0';
+			out[j++] = hex(*c >> 4);
+			out[j++] = hex(*c & 0x0f);
+		}
+	}
+	test_assert(j + 1 == len);
+	out[j] = '\0';
+
+	free(s);
+	return out;
+}
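
Note: the same escaping rules expressed in Go for readability only; runc uses the C implementation above, and NUL plus bytes above 0x7f are handled slightly differently here because Go strings are not NUL-terminated.

package main

import (
	"fmt"
	"strings"
)

// escapeJSONString mirrors the rules in escape.c: backslash and double quote
// get a backslash prefix, the common control characters use their short
// escapes, and every other byte below 0x20 (plus DEL, 0x7f) becomes \u00xx.
func escapeJSONString(s string) string {
	var b strings.Builder
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch c {
		case '"', '\\':
			b.WriteByte('\\')
			b.WriteByte(c)
		case '\b':
			b.WriteString(`\b`)
		case '\n':
			b.WriteString(`\n`)
		case '\r':
			b.WriteString(`\r`)
		case '\t':
			b.WriteString(`\t`)
		case '\f':
			b.WriteString(`\f`)
		default:
			if c < 0x20 || c == 0x7f {
				fmt.Fprintf(&b, `\u%04x`, c)
			} else {
				b.WriteByte(c)
			}
		}
	}
	return b.String()
}

func main() {
	fmt.Println(escapeJSONString("with\"quote and\ttab \x7f"))
}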

+ 222 - 139
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c

@@ -29,6 +29,8 @@
 /* Get all of the CLONE_NEW* flags. */
 #include "namespace.h"
 
+extern char *escape_json_string(char *str);
+
 /* Synchronisation values. */
 enum sync_t {
 	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
@@ -36,7 +38,7 @@ enum sync_t {
 	SYNC_RECVPID_PLS = 0x42,	/* Tell parent we're sending the PID. */
 	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
 	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
-	SYNC_CHILD_READY = 0x45,	/* The child or grandchild is ready to return. */
+	SYNC_CHILD_FINISH = 0x45,	/* The child or grandchild has finished. */
 };
 
 /*
@@ -45,10 +47,14 @@ enum sync_t {
  */
 #define CREATECGROUPNS 0x80
 
+#define STAGE_SETUP  -1
 /* longjmp() arguments. */
-#define JUMP_PARENT 0x00
-#define JUMP_CHILD  0xA0
-#define JUMP_INIT   0xA1
+#define STAGE_PARENT  0
+#define STAGE_CHILD   1
+#define STAGE_INIT    2
+
+/* Stores the current stage of nsexec. */
+int current_stage = STAGE_SETUP;
 
 /* Assume the stack grows down, so arguments should be above it. */
 struct clone_t {
@@ -56,7 +62,7 @@ struct clone_t {
 	 * Reserve some space for clone() to locate arguments
 	 * and retcode in this place
 	 */
-	char stack[4096] __attribute__ ((aligned(16)));
+	char stack[4096] __attribute__((aligned(16)));
 	char stack_ptr[0];
 
 	/* There's two children. This is used to execute the different code. */
@@ -102,31 +108,31 @@ static int logfd = -1;
  * List of netlink message types sent to us as part of bootstrapping the init.
  * These constants are defined in libcontainer/message_linux.go.
  */
-#define INIT_MSG			62000
+#define INIT_MSG		62000
 #define CLONE_FLAGS_ATTR	27281
 #define NS_PATHS_ATTR		27282
-#define UIDMAP_ATTR			27283
-#define GIDMAP_ATTR			27284
+#define UIDMAP_ATTR		27283
+#define GIDMAP_ATTR		27284
 #define SETGROUP_ATTR		27285
 #define OOM_SCORE_ADJ_ATTR	27286
 #define ROOTLESS_EUID_ATTR	27287
-#define UIDMAPPATH_ATTR	    27288
-#define GIDMAPPATH_ATTR	    27289
+#define UIDMAPPATH_ATTR		27288
+#define GIDMAPPATH_ATTR		27289
 
 /*
  * Use the raw syscall for versions of glibc which don't include a function for
  * it, namely (glibc 2.12).
  */
 #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
-#	define _GNU_SOURCE
-#	include "syscall.h"
-#	if !defined(SYS_setns) && defined(__NR_setns)
-#		define SYS_setns __NR_setns
-#	endif
-
-#ifndef SYS_setns
-#	error "setns(2) syscall not supported by glibc version"
-#endif
+#  define _GNU_SOURCE
+#  include "syscall.h"
+#  if !defined(SYS_setns) && defined(__NR_setns)
+#    define SYS_setns __NR_setns
+#  endif
+
+#  ifndef SYS_setns
+#    error "setns(2) syscall not supported by glibc version"
+#  endif
 
 int setns(int fd, int nstype)
 {
@@ -134,33 +140,43 @@ int setns(int fd, int nstype)
 }
 #endif
 
-static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...)
+static void write_log(const char *level, const char *format, ...)
 {
-	char message[1024] = {};
-
+	char *message = NULL, *stage = NULL;
 	va_list args;
+	int ret;
 
 	if (logfd < 0 || level == NULL)
-		return;
+		goto out;
 
 	va_start(args, format);
-	if (vsnprintf(message, sizeof(message), format, args) < 0)
-		goto done;
-
-	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
-done:
+	ret = vasprintf(&message, format, args);
 	va_end(args);
-}
+	if (ret < 0)
+		goto out;
 
-#define write_log(level, fmt, ...) \
-	write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__)
+	message = escape_json_string(message);
+
+	if (current_stage == STAGE_SETUP)
+		stage = strdup("nsexec");
+	else
+		ret = asprintf(&stage, "nsexec-%d", current_stage);
+	if (ret < 0)
+		goto out;
+
+	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message);
+
+out:
+	free(message);
+	free(stage);
+}
 
 /* XXX: This is ugly. */
 static int syncfd = -1;
 
 #define bail(fmt, ...)                                       \
 	do {                                                       \
-		write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \
+		write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \
 		exit(1);                                                 \
 	} while(0)
 
@@ -187,7 +203,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
 		goto out;
 	}
 
- out:
+out:
 	close(fd);
 	return ret;
 }
@@ -297,9 +313,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
 	if (map == NULL || map_len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
 		if (errno != EPERM)
 			bail("failed to update /proc/%d/uid_map", pid);
+		write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
 		if (try_mapping_tool(path, pid, map, map_len))
 			bail("failed to use newuid map on %d", pid);
 	}
@@ -310,9 +328,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
 	if (map == NULL || map_len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
 		if (errno != EPERM)
 			bail("failed to update /proc/%d/gid_map", pid);
+		write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
 		if (try_mapping_tool(path, pid, map, map_len))
 			bail("failed to use newgid map on %d", pid);
 	}
@@ -323,19 +343,20 @@ static void update_oom_score_adj(char *data, size_t len)
 	if (data == NULL || len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
 	if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
 		bail("failed to update /proc/self/oom_score_adj");
 }
 
 /* A dummy function that just jumps to the given jumpval. */
-static int child_func(void *arg) __attribute__ ((noinline));
+static int child_func(void *arg) __attribute__((noinline));
 static int child_func(void *arg)
 {
 	struct clone_t *ca = (struct clone_t *)arg;
 	longjmp(*ca->env, ca->jmpval);
 }
 
-static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
 static int clone_parent(jmp_buf *env, int jmpval)
 {
 	struct clone_t ca = {
@@ -507,7 +528,6 @@ void join_namespaces(char *nslist)
 	char *namespace = strtok_r(nslist, ",", &saveptr);
 	struct namespace_t {
 		int fd;
-		int ns;
 		char type[PATH_MAX];
 		char path[PATH_MAX];
 	} *namespaces = NULL;
@@ -542,7 +562,7 @@ void join_namespaces(char *nslist)
 			bail("failed to open %s", path);
 
 		ns->fd = fd;
-		ns->ns = nsflag(namespace);
+		strncpy(ns->type, namespace, PATH_MAX - 1);
 		strncpy(ns->path, path, PATH_MAX - 1);
 		ns->path[PATH_MAX - 1] = '\0';
 	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
@@ -555,12 +575,14 @@ void join_namespaces(char *nslist)
 	 */
 
 	for (i = 0; i < num; i++) {
-		struct namespace_t ns = namespaces[i];
+		struct namespace_t *ns = &namespaces[i];
+		int flag = nsflag(ns->type);
 
-		if (setns(ns.fd, ns.ns) < 0)
-			bail("failed to setns to %s", ns.path);
+		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
+		if (setns(ns->fd, flag) < 0)
+			bail("failed to setns into %s namespace", ns->type);
 
-		close(ns.fd);
+		close(ns->fd);
 	}
 
 	free(namespaces);
@@ -569,6 +591,14 @@ void join_namespaces(char *nslist)
 /* Defined in cloned_binary.c. */
 extern int ensure_cloned_binary(void);
 
+static inline int sane_kill(pid_t pid, int signum)
+{
+	if (pid > 0)
+		return kill(pid, signum);
+	else
+		return 0;
+}
+
 void nsexec(void)
 {
 	int pipenum;
@@ -598,7 +628,14 @@ void nsexec(void)
 	if (ensure_cloned_binary() < 0)
 		bail("could not ensure we are a cloned binary");
 
-	write_log(DEBUG, "nsexec started");
+	/*
+	 * Inform the parent we're past initial setup.
+	 * For the other side of this, see initWaiter.
+	 */
+	if (write(pipenum, "", 1) != 1)
+		bail("could not inform the parent we are past initial setup");
+
+	write_log(DEBUG, "=> nsexec container setup");
 
 	/* Parse all of the netlink configuration. */
 	nl_parse(pipenum, &config);
@@ -622,6 +659,7 @@ void nsexec(void)
 	 * containers), which is the recommendation from the kernel folks.
 	 */
 	if (config.namespaces) {
+		write_log(DEBUG, "set process as non-dumpable");
 		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
 			bail("failed to set process as non-dumpable");
 	}
@@ -686,45 +724,49 @@ void nsexec(void)
 	 * -- Aleksa "what has my life come to?" Sarai
 	 */
 
-	switch (setjmp(env)) {
+	current_stage = setjmp(env);
+	switch (current_stage) {
 		/*
 		 * Stage 0: We're in the parent. Our job is just to create a new child
-		 *          (stage 1: JUMP_CHILD) process and write its uid_map and
+		 *          (stage 1: STAGE_CHILD) process and write its uid_map and
 		 *          gid_map. That process will go on to create a new process, then
 		 *          it will send us its PID which we will send to the bootstrap
 		 *          process.
 		 */
-	case JUMP_PARENT:{
+	case STAGE_PARENT:{
 			int len;
-			pid_t child, first_child = -1;
-			bool ready = false;
+			pid_t stage1_pid = -1, stage2_pid = -1;
+			bool stage1_complete, stage2_complete;
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-0");
 
 			/* Start the process of getting a container. */
-			child = clone_parent(&env, JUMP_CHILD);
-			if (child < 0)
-				bail("unable to fork: child_func");
+			write_log(DEBUG, "spawn stage-1");
+			stage1_pid = clone_parent(&env, STAGE_CHILD);
+			if (stage1_pid < 0)
+				bail("unable to spawn stage-1");
 
-			/*
-			 * State machine for synchronisation with the children.
-			 *
-			 * Father only return when both child and grandchild are
-			 * ready, so we can receive all possible error codes
-			 * generated by children.
-			 */
 			syncfd = sync_child_pipe[1];
 			close(sync_child_pipe[0]);
 
-			while (!ready) {
+			/*
+			 * State machine for synchronisation with the children. We only
+			 * return once both the child and grandchild are ready.
+			 */
+			write_log(DEBUG, "-> stage-1 synchronisation loop");
+			stage1_complete = false;
+			while (!stage1_complete) {
 				enum sync_t s;
 
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with child: next state");
+					bail("failed to sync with stage-1: next state");
 
 				switch (s) {
 				case SYNC_USERMAP_PLS:
+					write_log(DEBUG, "stage-1 requested userns mappings");
+
 					/*
 					 * Enable setgroups(2) if we've been asked to. But we also
 					 * have to explicitly disable setgroups(2) if we're
@@ -735,70 +777,78 @@ void nsexec(void)
 					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
 					 * newuidmap/newgidmap shall be used.
 					 */
-
 					if (config.is_rootless_euid && !config.is_setgroup)
-						update_setgroups(child, SETGROUPS_DENY);
+						update_setgroups(stage1_pid, SETGROUPS_DENY);
 
 					/* Set up mappings. */
-					update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
-					update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
+					update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
+					update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
 
 					s = SYNC_USERMAP_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						kill(child, SIGKILL);
-						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
 					}
 					break;
-				case SYNC_RECVPID_PLS:{
-						first_child = child;
-
-						/* Get the init_func pid. */
-						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
-							kill(first_child, SIGKILL);
-							bail("failed to sync with child: read(childpid)");
-						}
-
-						/* Send ACK. */
-						s = SYNC_RECVPID_ACK;
-						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-							kill(first_child, SIGKILL);
-							kill(child, SIGKILL);
-							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
-						}
-
-						/* Send the init_func pid back to our parent.
-						 *
-						 * Send the init_func pid and the pid of the first child back to our parent.
-						 * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
-						 * It becomes the responsibility of our parent to reap the first child.
-						 */
-						len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
-						if (len < 0) {
-							kill(child, SIGKILL);
-							bail("unable to generate JSON for child pid");
-						}
+				case SYNC_RECVPID_PLS:
+					write_log(DEBUG, "stage-1 requested pid to be forwarded");
+
+					/* Get the stage-2 pid. */
+					if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: read(stage2_pid)");
+					}
+
+					/* Send ACK. */
+					s = SYNC_RECVPID_ACK;
+					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
+					}
+
+					/*
+					 * Send both the stage-1 and stage-2 pids back to runc.
+					 * runc needs the stage-2 to continue process management,
+					 * but because stage-1 was spawned with CLONE_PARENT we
+					 * cannot reap it within stage-0 and thus we need to ask
+					 * runc to reap the zombie for us.
+					 */
+					write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
+						  stage1_pid, stage2_pid);
+					len =
+					    dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
+						    stage2_pid);
+					if (len < 0) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with runc: write(pid-JSON)");
 					}
 					break;
-				case SYNC_CHILD_READY:
-					ready = true;
+				case SYNC_CHILD_FINISH:
+					write_log(DEBUG, "stage-1 complete");
+					stage1_complete = true;
 					break;
 				default:
 					bail("unexpected sync value: %u", s);
 				}
 			}
+			write_log(DEBUG, "<- stage-1 synchronisation loop");
 
 			/* Now sync with grandchild. */
-
 			syncfd = sync_grandchild_pipe[1];
 			close(sync_grandchild_pipe[0]);
-
-			ready = false;
-			while (!ready) {
+			write_log(DEBUG, "-> stage-2 synchronisation loop");
+			stage2_complete = false;
+			while (!stage2_complete) {
 				enum sync_t s;
 
+				write_log(DEBUG, "signalling stage-2 to run");
 				s = SYNC_GRANDCHILD;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-					kill(child, SIGKILL);
+					sane_kill(stage2_pid, SIGKILL);
 					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
 				}
 
@@ -806,27 +856,31 @@ void nsexec(void)
 					bail("failed to sync with child: next state");
 
 				switch (s) {
-				case SYNC_CHILD_READY:
-					ready = true;
+				case SYNC_CHILD_FINISH:
+					write_log(DEBUG, "stage-2 complete");
+					stage2_complete = true;
 					break;
 				default:
 					bail("unexpected sync value: %u", s);
 				}
 			}
+			write_log(DEBUG, "<- stage-2 synchronisation loop");
+			write_log(DEBUG, "<~ nsexec stage-0");
 			exit(0);
 		}
+		break;
 
 		/*
 		 * Stage 1: We're in the first child process. Our job is to join any
-		 *          provided namespaces in the netlink payload and unshare all
-		 *          of the requested namespaces. If we've been asked to
-		 *          CLONE_NEWUSER, we will ask our parent (stage 0) to set up
-		 *          our user mappings for us. Then, we create a new child
-		 *          (stage 2: JUMP_INIT) for PID namespace. We then send the
-		 *          child's PID to our parent (stage 0).
+		 *          provided namespaces in the netlink payload and unshare all of
+		 *          the requested namespaces. If we've been asked to CLONE_NEWUSER,
+		 *          we will ask our parent (stage 0) to set up our user mappings
+		 *          for us. Then, we create a new child (stage 2: STAGE_INIT) for
+		 *          PID namespace. We then send the child's PID to our parent
+		 *          (stage 0).
 		 */
-	case JUMP_CHILD:{
-			pid_t child;
+	case STAGE_CHILD:{
+			pid_t stage2_pid = -1;
 			enum sync_t s;
 
 			/* We're in a child and thus need to tell the parent if we die. */
@@ -835,11 +889,12 @@ void nsexec(void)
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-1");
 
 			/*
 			 * We need to setns first. We cannot do this earlier (in stage 0)
 			 * because of the fact that we forked to get here (the PID of
-			 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
+			 * [stage 2: STAGE_INIT]) would be meaningless). We could send it
 			 * using cmsg(3) but that's just annoying.
 			 */
 			if (config.namespaces)
@@ -865,40 +920,50 @@ void nsexec(void)
 			 * problem.
 			 */
 			if (config.cloneflags & CLONE_NEWUSER) {
+				write_log(DEBUG, "unshare user namespace");
 				if (unshare(CLONE_NEWUSER) < 0)
 					bail("failed to unshare user namespace");
 				config.cloneflags &= ~CLONE_NEWUSER;
 
 				/*
-				 * We don't have the privileges to do any mapping here (see the
-				 * clone_parent rant). So signal our parent to hook us up.
+				 * We need to set ourselves as dumpable temporarily so that the
+				 * parent process can write to our procfs files.
 				 */
-
-				/* Switching is only necessary if we joined namespaces. */
 				if (config.namespaces) {
+					write_log(DEBUG, "temporarily set process as dumpable");
 					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
-						bail("failed to set process as dumpable");
+						bail("failed to temporarily set process as dumpable");
 				}
+
+				/*
+				 * We don't have the privileges to do any mapping here (see the
+				 * clone_parent rant). So signal stage-0 to do the mapping for
+				 * us.
+				 */
+				write_log(DEBUG, "request stage-0 to map user namespace");
 				s = SYNC_USERMAP_PLS;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
 
 				/* ... wait for mapping ... */
-
+				write_log(DEBUG, "request stage-0 to map user namespace");
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
 				if (s != SYNC_USERMAP_ACK)
 					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
-				/* Switching is only necessary if we joined namespaces. */
+
+				/* Revert temporary re-dumpable setting. */
 				if (config.namespaces) {
+					write_log(DEBUG, "re-set process as non-dumpable");
 					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
-						bail("failed to set process as dumpable");
+						bail("failed to re-set process as non-dumpable");
 				}
 
 				/* Become root in the namespace proper. */
 				if (setresuid(0, 0, 0) < 0)
 					bail("failed to become root in user namespace");
 			}
+
 			/*
 			 * Unshare all of the namespaces. Now, it should be noted that this
 			 * ordering might break in the future (especially with rootless
@@ -909,8 +974,9 @@ void nsexec(void)
 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
 			 * was broken, so we'll just do it the long way anyway.
 			 */
+			write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
 			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
-				bail("failed to unshare namespaces");
+				bail("failed to unshare remaining namespaces (except cgroupns)");
 
 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
@@ -921,41 +987,45 @@ void nsexec(void)
 			 * which would break many applications and libraries, so we must fork
 			 * to actually enter the new PID namespace.
 			 */
-			child = clone_parent(&env, JUMP_INIT);
-			if (child < 0)
-				bail("unable to fork: init_func");
+			write_log(DEBUG, "spawn stage-2");
+			stage2_pid = clone_parent(&env, STAGE_INIT);
+			if (stage2_pid < 0)
+				bail("unable to spawn stage-2");
 
 			/* Send the child to our parent, which knows what it's doing. */
+			write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
 			s = SYNC_RECVPID_PLS;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
 			}
-			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
-				kill(child, SIGKILL);
-				bail("failed to sync with parent: write(childpid)");
+			if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
+				sane_kill(stage2_pid, SIGKILL);
+				bail("failed to sync with parent: write(stage2_pid)");
 			}
 
 			/* ... wait for parent to get the pid ... */
-
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
 			}
 			if (s != SYNC_RECVPID_ACK) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
 			}
 
-			s = SYNC_CHILD_READY;
+			write_log(DEBUG, "signal completion to stage-0");
+			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
-				bail("failed to sync with parent: write(SYNC_CHILD_READY)");
+				sane_kill(stage2_pid, SIGKILL);
+				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
 			}
 
-			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
+			/* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
+			write_log(DEBUG, "<~ nsexec stage-1");
 			exit(0);
 		}
+		break;
 
 		/*
 		 * Stage 2: We're the final child process, and the only process that will
@@ -963,7 +1033,7 @@ void nsexec(void)
 		 *          final cleanup steps and then return to the Go runtime to allow
 		 *          init_linux.go to run.
 		 */
-	case JUMP_INIT:{
+	case STAGE_INIT:{
 			/*
 			 * We're inside the child now, having jumped from the
 			 * start_child() code after forking in the parent.
@@ -978,6 +1048,7 @@ void nsexec(void)
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-2");
 
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
@@ -998,21 +1069,30 @@ void nsexec(void)
 					bail("setgroups failed");
 			}
 
-			/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
+			/*
+			 * Wait until our topmost parent has finished cgroup setup in
+			 * p.manager.Apply().
+			 *
+			 * TODO(cyphar): Check if this code is actually needed because we
+			 *               should be in the cgroup even from stage-0, so
+			 *               waiting until now might not make sense.
+			 */
 			if (config.cloneflags & CLONE_NEWCGROUP) {
 				uint8_t value;
 				if (read(pipenum, &value, sizeof(value)) != sizeof(value))
 					bail("read synchronisation value failed");
 				if (value == CREATECGROUPNS) {
+					write_log(DEBUG, "unshare cgroup namespace");
 					if (unshare(CLONE_NEWCGROUP) < 0)
 						bail("failed to unshare cgroup namespace");
 				} else
 					bail("received unknown synchronisation value");
 			}
 
-			s = SYNC_CHILD_READY;
+			write_log(DEBUG, "signal completion to stage-0");
+			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
-				bail("failed to sync with patent: write(SYNC_CHILD_READY)");
+				bail("failed to sync with patent: write(SYNC_CHILD_FINISH)");
 
 			/* Close sync pipes. */
 			close(sync_grandchild_pipe[0]);
@@ -1021,10 +1101,13 @@ void nsexec(void)
 			nl_free(&config);
 
 			/* Finish executing, let the Go runtime take over. */
+			write_log(DEBUG, "<= nsexec container setup");
+			write_log(DEBUG, "booting up go runtime ...");
 			return;
 		}
+		break;
 	default:
-		bail("unexpected jump value");
+		bail("unknown stage '%d' for jump value", current_stage);
 	}
 
 	/* Should never be reached. */
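
Note: stage-0 now reports both pids as {"stage1_pid":N,"stage2_pid":M} on the bootstrap pipe. A sketch of how the Go side could decode that line; the struct and field names below are illustrative, not the actual libcontainer types.

package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

// stagePids matches the JSON written by stage-0 to the bootstrap pipe.
type stagePids struct {
	Stage1 int `json:"stage1_pid"`
	Stage2 int `json:"stage2_pid"`
}

func main() {
	// Pretend this line was read back from the pipe shared with nsexec.
	line := `{"stage1_pid":1234,"stage2_pid":1235}`
	var p stagePids
	if err := json.NewDecoder(strings.NewReader(line)).Decode(&p); err != nil {
		panic(err)
	}
	// stage-1 was created with CLONE_PARENT, so the caller must reap it;
	// stage-2 is the process that becomes the container init.
	fmt.Println("reap:", p.Stage1, "manage:", p.Stage2)
}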

+ 1 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c

@@ -0,0 +1 @@
+../escape.c

+ 53 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go

@@ -0,0 +1,53 @@
+package escapetest
+
+// This file is part of escape_json_string unit test.
+// It is in a separate package so cgo can be used together
+// with go test.
+
+// #include <stdlib.h>
+// extern char *escape_json_string(char *str);
+// #cgo CFLAGS: -DESCAPE_TEST=1
+import "C"
+
+import (
+	"testing"
+	"unsafe"
+)
+
+func testEscapeJsonString(t *testing.T, input, want string) {
+	in := C.CString(input)
+	out := C.escape_json_string(in)
+	got := C.GoString(out)
+	C.free(unsafe.Pointer(out))
+	t.Logf("input: %q, output: %q", input, got)
+	if got != want {
+		t.Errorf("Failed on input: %q, want %q, got %q", input, want, got)
+	}
+}
+
+func testEscapeJson(t *testing.T) {
+	testCases := []struct {
+		input, output string
+	}{
+		{"", ""},
+		{"abcdef", "abcdef"},
+		{`\\\\\\`, `\\\\\\\\\\\\`},
+		{`with"quote`, `with\"quote`},
+		{"\n\r\b\t\f\\", `\n\r\b\t\f\\`},
+		{"\007", "\\u0007"},
+		{"\017 \020 \037", "\\u000f \\u0010 \\u001f"},
+		{"\033", "\\u001b"},
+		{`<->`, `<->`},
+		{"\176\177\200", "~\\u007f\200"},
+		{"\000", ""},
+		{"a\x7fxc", "a\\u007fxc"},
+		{"a\033xc", "a\\u001bxc"},
+		{"a\nxc", "a\\nxc"},
+		{"a\\xc", "a\\\\xc"},
+		{"Barney B\303\244r", "Barney B\303\244r"},
+	}
+
+	for _, tc := range testCases {
+		testEscapeJsonString(t, tc.input, tc.output)
+	}
+}

+ 0 - 41
vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go

@@ -1,41 +0,0 @@
-package user
-
-import (
-	"errors"
-)
-
-var (
-	// The current operating system does not provide the required data for user lookups.
-	ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
-	// No matching entries found in file.
-	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
-	ErrNoGroupEntries  = errors.New("no matching entries in group file")
-)
-
-// LookupUser looks up a user by their username in /etc/passwd. If the user
-// cannot be found (or there is no /etc/passwd file on the filesystem), then
-// LookupUser returns an error.
-func LookupUser(username string) (User, error) {
-	return lookupUser(username)
-}
-
-// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
-// be found (or there is no /etc/passwd file on the filesystem), then LookupId
-// returns an error.
-func LookupUid(uid int) (User, error) {
-	return lookupUid(uid)
-}
-
-// LookupGroup looks up a group by its name in /etc/group. If the group cannot
-// be found (or there is no /etc/group file on the filesystem), then LookupGroup
-// returns an error.
-func LookupGroup(groupname string) (Group, error) {
-	return lookupGroup(groupname)
-}
-
-// LookupGid looks up a group by its group id in /etc/group. If the group cannot
-// be found (or there is no /etc/group file on the filesystem), then LookupGid
-// returns an error.
-func LookupGid(gid int) (Group, error) {
-	return lookupGid(gid)
-}

+ 16 - 4
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go

@@ -16,13 +16,19 @@ const (
 	unixGroupPath  = "/etc/group"
 )
 
-func lookupUser(username string) (User, error) {
+// LookupUser looks up a user by their username in /etc/passwd. If the user
+// cannot be found (or there is no /etc/passwd file on the filesystem), then
+// LookupUser returns an error.
+func LookupUser(username string) (User, error) {
 	return lookupUserFunc(func(u User) bool {
 		return u.Name == username
 	})
 }
 
-func lookupUid(uid int) (User, error) {
+// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
+// be found (or there is no /etc/passwd file on the filesystem), then LookupId
+// returns an error.
+func LookupUid(uid int) (User, error) {
 	return lookupUserFunc(func(u User) bool {
 		return u.Uid == uid
 	})
@@ -51,13 +57,19 @@ func lookupUserFunc(filter func(u User) bool) (User, error) {
 	return users[0], nil
 }
 
-func lookupGroup(groupname string) (Group, error) {
+// LookupGroup looks up a group by its name in /etc/group. If the group cannot
+// be found (or there is no /etc/group file on the filesystem), then LookupGroup
+// returns an error.
+func LookupGroup(groupname string) (Group, error) {
 	return lookupGroupFunc(func(g Group) bool {
 		return g.Name == groupname
 	})
 }
 
-func lookupGid(gid int) (Group, error) {
+// LookupGid looks up a group by its group id in /etc/group. If the group cannot
+// be found (or there is no /etc/group file on the filesystem), then LookupGid
+// returns an error.
+func LookupGid(gid int) (Group, error) {
 	return lookupGroupFunc(func(g Group) bool {
 		return g.Gid == gid
 	})
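
Note: with the lookup functions now exported directly from lookup_unix.go, callers use them as before. A small usage sketch (it reads the real /etc/passwd and /etc/group); "root" is just an example.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/user"
)

func main() {
	// Resolve a user by name from /etc/passwd.
	u, err := user.LookupUser("root")
	if err != nil {
		panic(err)
	}
	// Resolve that user's primary group by id from /etc/group.
	g, err := user.LookupGid(u.Gid)
	if err != nil {
		panic(err)
	}
	fmt.Printf("uid=%d gid=%d(%s) home=%s\n", u.Uid, u.Gid, g.Name, u.Home)
}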

+ 0 - 40
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go

@@ -1,40 +0,0 @@
-// +build windows
-
-package user
-
-import (
-	"fmt"
-	"os/user"
-)
-
-func lookupUser(username string) (User, error) {
-	u, err := user.Lookup(username)
-	if err != nil {
-		return User{}, err
-	}
-	return userFromOS(u)
-}
-
-func lookupUid(uid int) (User, error) {
-	u, err := user.LookupId(fmt.Sprintf("%d", uid))
-	if err != nil {
-		return User{}, err
-	}
-	return userFromOS(u)
-}
-
-func lookupGroup(groupname string) (Group, error) {
-	g, err := user.LookupGroup(groupname)
-	if err != nil {
-		return Group{}, err
-	}
-	return groupFromOS(g)
-}
-
-func lookupGid(gid int) (Group, error) {
-	g, err := user.LookupGroupId(fmt.Sprintf("%d", gid))
-	if err != nil {
-		return Group{}, err
-	}
-	return groupFromOS(g)
-}

+ 10 - 42
vendor/github.com/opencontainers/runc/libcontainer/user/user.go

@@ -2,10 +2,10 @@ package user
 
 import (
 	"bufio"
+	"errors"
 	"fmt"
 	"io"
 	"os"
-	"os/user"
 	"strconv"
 	"strings"
 )
@@ -16,6 +16,13 @@ const (
 )
 
 var (
+	// The current operating system does not provide the required data for user lookups.
+	ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
+
+	// No matching entries found in file.
+	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
+	ErrNoGroupEntries  = errors.New("no matching entries in group file")
+
 	ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
 )
 
@@ -29,28 +36,6 @@ type User struct {
 	Shell string
 }
 
-// userFromOS converts an os/user.(*User) to local User
-//
-// (This does not include Pass, Shell or Gecos)
-func userFromOS(u *user.User) (User, error) {
-	newUser := User{
-		Name: u.Username,
-		Home: u.HomeDir,
-	}
-	id, err := strconv.Atoi(u.Uid)
-	if err != nil {
-		return newUser, err
-	}
-	newUser.Uid = id
-
-	id, err = strconv.Atoi(u.Gid)
-	if err != nil {
-		return newUser, err
-	}
-	newUser.Gid = id
-	return newUser, nil
-}
-
 type Group struct {
 	Name string
 	Pass string
@@ -58,23 +43,6 @@ type Group struct {
 	List []string
 }
 
-// groupFromOS converts an os/user.(*Group) to local Group
-//
-// (This does not include Pass or List)
-func groupFromOS(g *user.Group) (Group, error) {
-	newGroup := Group{
-		Name: g.Name,
-	}
-
-	id, err := strconv.Atoi(g.Gid)
-	if err != nil {
-		return newGroup, err
-	}
-	newGroup.Gid = id
-
-	return newGroup, nil
-}
-
 // SubID represents an entry in /etc/sub{u,g}id
 type SubID struct {
 	Name  string
@@ -466,7 +434,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
 		// we asked for a group but didn't find it. let's check to see
 		// if we wanted a numeric group
 		if !found {
-			gid, err := strconv.Atoi(ag)
+			gid, err := strconv.ParseInt(ag, 10, 64)
 			if err != nil {
 				return nil, fmt.Errorf("Unable to find group %s", ag)
 			}
@@ -474,7 +442,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
 			if gid < minId || gid > maxId {
 				return nil, ErrRange
 			}
-			gidMap[gid] = struct{}{}
+			gidMap[int(gid)] = struct{}{}
 		}
 	}
 	gids := []int{}
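
Note: GetAdditionalGroups still accepts group names and, with this change, parses purely numeric groups via ParseInt before converting to int. A sketch with an in-memory group file; the names and ids are invented, and result order is not guaranteed.

package main

import (
	"fmt"
	"strings"

	"github.com/opencontainers/runc/libcontainer/user"
)

func main() {
	// A tiny in-memory /etc/group.
	groupFile := strings.NewReader("docker:x:999:alice\nwheel:x:10:alice,bob\n")

	// Mix of a named group and a purely numeric one that is not in the file.
	gids, err := user.GetAdditionalGroups([]string{"docker", "101"}, groupFile)
	if err != nil {
		panic(err)
	}
	fmt.Println(gids) // e.g. [999 101]
}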

+ 42 - 0
vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go

@@ -0,0 +1,42 @@
+// +build gofuzz
+
+package user
+
+import (
+	"io"
+	"strings"
+)
+
+func IsDivisbleBy(n int, divisibleby int) bool {
+	return (n % divisibleby) == 0
+}
+
+func FuzzUser(data []byte) int {
+	if len(data) == 0 {
+		return -1
+	}
+	if !IsDivisbleBy(len(data), 5) {
+		return -1
+	}
+
+	var divided [][]byte
+
+	chunkSize := len(data) / 5
+
+	for i := 0; i < len(data); i += chunkSize {
+		end := i + chunkSize
+
+		divided = append(divided, data[i:end])
+	}
+
+	_, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil)
+
+	var passwd, group io.Reader
+
+	group = strings.NewReader(string(divided[1]))
+	_, _ = GetAdditionalGroups([]string{string(divided[2])}, group)
+
+	passwd = strings.NewReader(string(divided[3]))
+	_, _ = GetExecUser(string(divided[4]), nil, passwd, group)
+	return 1
+}

+ 5 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go

@@ -0,0 +1,5 @@
+package userns
+
+// RunningInUserNS detects whether we are currently running in a user namespace.
+// Originally copied from github.com/lxc/lxd/shared/util.go
+var RunningInUserNS = runningInUserNS

+ 15 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go

@@ -0,0 +1,15 @@
+// +build gofuzz
+
+package userns
+
+import (
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/user"
+)
+
+func FuzzUIDMap(data []byte) int {
+	uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
+	_ = uidMapInUserNS(uidmap)
+	return 1
+}

+ 37 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go

@@ -0,0 +1,37 @@
+package userns
+
+import (
+	"sync"
+
+	"github.com/opencontainers/runc/libcontainer/user"
+)
+
+var (
+	inUserNS bool
+	nsOnce   sync.Once
+)
+
+// runningInUserNS detects whether we are currently running in a user namespace.
+// Originally copied from github.com/lxc/lxd/shared/util.go
+func runningInUserNS() bool {
+	nsOnce.Do(func() {
+		uidmap, err := user.CurrentProcessUIDMap()
+		if err != nil {
+			// This kernel-provided file only exists if user namespaces are supported
+			return
+		}
+		inUserNS = uidMapInUserNS(uidmap)
+	})
+	return inUserNS
+}
+
+func uidMapInUserNS(uidmap []user.IDMap) bool {
+	/*
+	 * We assume we are in the initial user namespace if we have a full
+	 * range - 4294967295 uids starting at uid 0.
+	 */
+	if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
+		return false
+	}
+	return true
+}
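
Note: a minimal caller of the new package; the exported RunningInUserNS wraps the memoized runningInUserNS shown above, and the non-Linux stub always reports false.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/userns"
)

func main() {
	// RunningInUserNS caches its answer via sync.Once, so repeated calls are cheap.
	if userns.RunningInUserNS() {
		fmt.Println("running inside a user namespace (e.g. rootless)")
	} else {
		fmt.Println("running in the initial user namespace")
	}
}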

+ 17 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go

@@ -0,0 +1,17 @@
+// +build !linux
+
+package userns
+
+import "github.com/opencontainers/runc/libcontainer/user"
+
+// runningInUserNS is a stub for non-Linux systems
+// Always returns false
+func runningInUserNS() bool {
+	return false
+}
+
+// uidMapInUserNS is a stub for non-Linux systems
+// Always returns false
+func uidMapInUserNS(uidmap []user.IDMap) bool {
+	return false
+}