瀏覽代碼

Merge pull request #2311 from andrewhsu/updt

vndr runc 96ec217
Flavio Crisciani 6 年之前
父節點
當前提交
650280a057
共有 26 個文件被更改,包括 268 次插入和 1148 次删除
  1. 4 4
      libnetwork/sandbox_externalkey_unix.go
  2. 1 1
      libnetwork/vendor.conf
  3. 17 0
      libnetwork/vendor/github.com/opencontainers/runc/README.md
  4. 5 3
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/README.md
  5. 0 61
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go
  6. 0 122
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go
  7. 0 6
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go
  8. 0 348
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
  9. 0 61
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go
  10. 0 57
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go
  11. 0 111
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go
  12. 0 9
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go
  13. 0 7
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go
  14. 0 14
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go
  15. 0 39
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go
  16. 0 5
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go
  17. 0 122
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go
  18. 0 31
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go
  19. 0 13
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go
  20. 0 8
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go
  21. 0 72
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go
  22. 3 3
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md
  23. 79 47
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
  24. 28 0
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go
  25. 130 3
      libnetwork/vendor/github.com/opencontainers/runc/libcontainer/user/user.go
  26. 1 1
      libnetwork/vendor/github.com/opencontainers/runc/vendor.conf

+ 4 - 4
libnetwork/sandbox_externalkey_unix.go

@@ -13,7 +13,7 @@ import (
 	"path/filepath"
 
 	"github.com/docker/libnetwork/types"
-	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
 )
 
@@ -25,7 +25,7 @@ const (
 
 // processSetKeyReexec is a private function that must be called only on an reexec path
 // It expects 3 args { [0] = "libnetwork-setkey", [1] = <container-id>, [2] = <controller-id> }
-// It also expects configs.HookState as a json string in <stdin>
+// It also expects specs.State as a json string in <stdin>
 // Refer to https://github.com/opencontainers/runc/pull/160/ for more information
 // The docker exec-root can be specified as "-exec-root" flag. The default value is "/run/docker".
 func processSetKeyReexec() {
@@ -50,12 +50,12 @@ func processSetKeyReexec() {
 	}
 	containerID, controllerID := args[0], args[1]
 
-	// We expect configs.HookState as a json string in <stdin>
+	// We expect specs.State as a json string in <stdin>
 	stateBuf, err := ioutil.ReadAll(os.Stdin)
 	if err != nil {
 		return
 	}
-	var state configs.HookState
+	var state specs.State
 	if err = json.Unmarshal(stateBuf, &state); err != nil {
 		return
 	}

+ 1 - 1
libnetwork/vendor.conf

@@ -32,7 +32,7 @@ github.com/mattn/go-shellwords v1.0.3
 github.com/miekg/dns v1.0.7
 github.com/opencontainers/go-digest v1.0.0-rc1
 github.com/opencontainers/image-spec v1.0.1
-github.com/opencontainers/runc 69663f0bd4b60df09991c08812a60108003fa340
+github.com/opencontainers/runc 96ec2177ae841256168fcf76954f7177af9446eb
 github.com/opencontainers/runtime-spec v1.0.1
 github.com/samuel/go-zookeeper d0e0d8e11f318e000a8cc434616d69e329edc374
 github.com/sirupsen/logrus v1.0.3

+ 17 - 0
libnetwork/vendor/github.com/opencontainers/runc/README.md

@@ -68,6 +68,7 @@ make BUILDTAGS='seccomp apparmor'
 | selinux   | selinux process and mount labeling | <none>      |
 | apparmor  | apparmor profile support           | <none>      |
 | ambient   | ambient capability support         | kernel 4.3  |
+| nokmem    | disable kernel memory account      | <none>      |
 
 
 ### Running the test suite
@@ -87,6 +88,18 @@ You can run a specific test case by setting the `TESTFLAGS` variable.
 # make test TESTFLAGS="-run=SomeTestFunction"
 ```
 
+You can run a specific integration test by setting the `TESTPATH` variable.
+
+```bash
+# make test TESTPATH="/checkpoint.bats"
+```
+
+You can run a test in your proxy environment by setting `DOCKER_BUILD_PROXY` and `DOCKER_RUN_PROXY` variables.
+
+```bash
+# make test DOCKER_BUILD_PROXY="--build-arg HTTP_PROXY=http://yourproxy/" DOCKER_RUN_PROXY="-e HTTP_PROXY=http://yourproxy/"
+```
+
 ### Dependencies Management
 
 `runc` uses [vndr](https://github.com/LK4D4/vndr) for dependencies management.
@@ -251,3 +264,7 @@ PIDFile=/run/mycontainerid.pid
 [Install]
 WantedBy=multi-user.target
 ```
+
+## License
+
+The code and docs are released under the [Apache 2.0 license](LICENSE).

+ 5 - 3
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/README.md

@@ -148,6 +148,7 @@ config := &configs.Config{
 		{Type: configs.NEWPID},
 		{Type: configs.NEWUSER},
 		{Type: configs.NEWNET},
+		{Type: configs.NEWCGROUP},
 	}),
 	Cgroups: &configs.Cgroup{
 		Name:   "test-container",
@@ -323,6 +324,7 @@ generated when building libcontainer with docker.
 
 ## Copyright and license
 
-Code and documentation copyright 2014 Docker, inc. Code released under the Apache 2.0 license.
-Docs released under Creative commons.
-
+Code and documentation copyright 2014 Docker, inc.
+The code and documentation are released under the [Apache 2.0 license](../LICENSE).
+The documentation is also released under Creative Commons Attribution 4.0 International License.
+You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.

+ 0 - 61
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go

@@ -1,61 +0,0 @@
-package configs
-
-import "fmt"
-
-// blockIODevice holds major:minor format supported in blkio cgroup
-type blockIODevice struct {
-	// Major is the device's major number
-	Major int64 `json:"major"`
-	// Minor is the device's minor number
-	Minor int64 `json:"minor"`
-}
-
-// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
-type WeightDevice struct {
-	blockIODevice
-	// Weight is the bandwidth rate for the device, range is from 10 to 1000
-	Weight uint16 `json:"weight"`
-	// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
-	LeafWeight uint16 `json:"leafWeight"`
-}
-
-// NewWeightDevice returns a configured WeightDevice pointer
-func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice {
-	wd := &WeightDevice{}
-	wd.Major = major
-	wd.Minor = minor
-	wd.Weight = weight
-	wd.LeafWeight = leafWeight
-	return wd
-}
-
-// WeightString formats the struct to be writable to the cgroup specific file
-func (wd *WeightDevice) WeightString() string {
-	return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight)
-}
-
-// LeafWeightString formats the struct to be writable to the cgroup specific file
-func (wd *WeightDevice) LeafWeightString() string {
-	return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight)
-}
-
-// ThrottleDevice struct holds a `major:minor rate_per_second` pair
-type ThrottleDevice struct {
-	blockIODevice
-	// Rate is the IO rate limit per cgroup per device
-	Rate uint64 `json:"rate"`
-}
-
-// NewThrottleDevice returns a configured ThrottleDevice pointer
-func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice {
-	td := &ThrottleDevice{}
-	td.Major = major
-	td.Minor = minor
-	td.Rate = rate
-	return td
-}
-
-// String formats the struct to be writable to the cgroup specific file
-func (td *ThrottleDevice) String() string {
-	return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)
-}

+ 0 - 122
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go

@@ -1,122 +0,0 @@
-package configs
-
-type FreezerState string
-
-const (
-	Undefined FreezerState = ""
-	Frozen    FreezerState = "FROZEN"
-	Thawed    FreezerState = "THAWED"
-)
-
-type Cgroup struct {
-	// Deprecated, use Path instead
-	Name string `json:"name,omitempty"`
-
-	// name of parent of cgroup or slice
-	// Deprecated, use Path instead
-	Parent string `json:"parent,omitempty"`
-
-	// Path specifies the path to cgroups that are created and/or joined by the container.
-	// The path is assumed to be relative to the host system cgroup mountpoint.
-	Path string `json:"path"`
-
-	// ScopePrefix describes prefix for the scope name
-	ScopePrefix string `json:"scope_prefix"`
-
-	// Paths represent the absolute cgroups paths to join.
-	// This takes precedence over Path.
-	Paths map[string]string
-
-	// Resources contains various cgroups settings to apply
-	*Resources
-}
-
-type Resources struct {
-	// If this is true allow access to any kind of device within the container.  If false, allow access only to devices explicitly listed in the allowed_devices list.
-	// Deprecated
-	AllowAllDevices *bool `json:"allow_all_devices,omitempty"`
-	// Deprecated
-	AllowedDevices []*Device `json:"allowed_devices,omitempty"`
-	// Deprecated
-	DeniedDevices []*Device `json:"denied_devices,omitempty"`
-
-	Devices []*Device `json:"devices"`
-
-	// Memory limit (in bytes)
-	Memory int64 `json:"memory"`
-
-	// Memory reservation or soft_limit (in bytes)
-	MemoryReservation int64 `json:"memory_reservation"`
-
-	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
-	MemorySwap int64 `json:"memory_swap"`
-
-	// Kernel memory limit (in bytes)
-	KernelMemory int64 `json:"kernel_memory"`
-
-	// Kernel memory limit for TCP use (in bytes)
-	KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
-
-	// CPU shares (relative weight vs. other containers)
-	CpuShares uint64 `json:"cpu_shares"`
-
-	// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
-	CpuQuota int64 `json:"cpu_quota"`
-
-	// CPU period to be used for hardcapping (in usecs). 0 to use system default.
-	CpuPeriod uint64 `json:"cpu_period"`
-
-	// How many time CPU will use in realtime scheduling (in usecs).
-	CpuRtRuntime int64 `json:"cpu_rt_quota"`
-
-	// CPU period to be used for realtime scheduling (in usecs).
-	CpuRtPeriod uint64 `json:"cpu_rt_period"`
-
-	// CPU to use
-	CpusetCpus string `json:"cpuset_cpus"`
-
-	// MEM to use
-	CpusetMems string `json:"cpuset_mems"`
-
-	// Process limit; set <= `0' to disable limit.
-	PidsLimit int64 `json:"pids_limit"`
-
-	// Specifies per cgroup weight, range is from 10 to 1000.
-	BlkioWeight uint16 `json:"blkio_weight"`
-
-	// Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
-	BlkioLeafWeight uint16 `json:"blkio_leaf_weight"`
-
-	// Weight per cgroup per device, can override BlkioWeight.
-	BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"`
-
-	// IO read rate limit per cgroup per device, bytes per second.
-	BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
-
-	// IO write rate limit per cgroup per device, bytes per second.
-	BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
-
-	// IO read rate limit per cgroup per device, IO per second.
-	BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"`
-
-	// IO write rate limit per cgroup per device, IO per second.
-	BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"`
-
-	// set the freeze value for the process
-	Freezer FreezerState `json:"freezer"`
-
-	// Hugetlb limit (in bytes)
-	HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"`
-
-	// Whether to disable OOM Killer
-	OomKillDisable bool `json:"oom_kill_disable"`
-
-	// Tuning swappiness behaviour per cgroup
-	MemorySwappiness *uint64 `json:"memory_swappiness"`
-
-	// Set priority of network traffic for container
-	NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
-
-	// Set class identifier for container's network packets
-	NetClsClassid uint32 `json:"net_cls_classid_u"`
-}

+ 0 - 6
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go

@@ -1,6 +0,0 @@
-package configs
-
-// TODO Windows: This can ultimately be entirely factored out on Windows as
-// cgroups are a Unix-specific construct.
-type Cgroup struct {
-}

+ 0 - 348
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go

@@ -1,348 +0,0 @@
-package configs
-
-import (
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"os/exec"
-	"time"
-
-	"github.com/opencontainers/runtime-spec/specs-go"
-
-	"github.com/sirupsen/logrus"
-)
-
-type Rlimit struct {
-	Type int    `json:"type"`
-	Hard uint64 `json:"hard"`
-	Soft uint64 `json:"soft"`
-}
-
-// IDMap represents UID/GID Mappings for User Namespaces.
-type IDMap struct {
-	ContainerID int `json:"container_id"`
-	HostID      int `json:"host_id"`
-	Size        int `json:"size"`
-}
-
-// Seccomp represents syscall restrictions
-// By default, only the native architecture of the kernel is allowed to be used
-// for syscalls. Additional architectures can be added by specifying them in
-// Architectures.
-type Seccomp struct {
-	DefaultAction Action     `json:"default_action"`
-	Architectures []string   `json:"architectures"`
-	Syscalls      []*Syscall `json:"syscalls"`
-}
-
-// Action is taken upon rule match in Seccomp
-type Action int
-
-const (
-	Kill Action = iota + 1
-	Errno
-	Trap
-	Allow
-	Trace
-)
-
-// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
-type Operator int
-
-const (
-	EqualTo Operator = iota + 1
-	NotEqualTo
-	GreaterThan
-	GreaterThanOrEqualTo
-	LessThan
-	LessThanOrEqualTo
-	MaskEqualTo
-)
-
-// Arg is a rule to match a specific syscall argument in Seccomp
-type Arg struct {
-	Index    uint     `json:"index"`
-	Value    uint64   `json:"value"`
-	ValueTwo uint64   `json:"value_two"`
-	Op       Operator `json:"op"`
-}
-
-// Syscall is a rule to match a syscall in Seccomp
-type Syscall struct {
-	Name   string `json:"name"`
-	Action Action `json:"action"`
-	Args   []*Arg `json:"args"`
-}
-
-// TODO Windows. Many of these fields should be factored out into those parts
-// which are common across platforms, and those which are platform specific.
-
-// Config defines configuration options for executing a process inside a contained environment.
-type Config struct {
-	// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
-	// This is a common option when the container is running in ramdisk
-	NoPivotRoot bool `json:"no_pivot_root"`
-
-	// ParentDeathSignal specifies the signal that is sent to the container's process in the case
-	// that the parent process dies.
-	ParentDeathSignal int `json:"parent_death_signal"`
-
-	// Path to a directory containing the container's root filesystem.
-	Rootfs string `json:"rootfs"`
-
-	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
-	// bind mounts are writtable.
-	Readonlyfs bool `json:"readonlyfs"`
-
-	// Specifies the mount propagation flags to be applied to /.
-	RootPropagation int `json:"rootPropagation"`
-
-	// Mounts specify additional source and destination paths that will be mounted inside the container's
-	// rootfs and mount namespace if specified
-	Mounts []*Mount `json:"mounts"`
-
-	// The device nodes that should be automatically created within the container upon container start.  Note, make sure that the node is marked as allowed in the cgroup as well!
-	Devices []*Device `json:"devices"`
-
-	MountLabel string `json:"mount_label"`
-
-	// Hostname optionally sets the container's hostname if provided
-	Hostname string `json:"hostname"`
-
-	// Namespaces specifies the container's namespaces that it should setup when cloning the init process
-	// If a namespace is not provided that namespace is shared from the container's parent process
-	Namespaces Namespaces `json:"namespaces"`
-
-	// Capabilities specify the capabilities to keep when executing the process inside the container
-	// All capabilities not specified will be dropped from the processes capability mask
-	Capabilities *Capabilities `json:"capabilities"`
-
-	// Networks specifies the container's network setup to be created
-	Networks []*Network `json:"networks"`
-
-	// Routes can be specified to create entries in the route table as the container is started
-	Routes []*Route `json:"routes"`
-
-	// Cgroups specifies specific cgroup settings for the various subsystems that the container is
-	// placed into to limit the resources the container has available
-	Cgroups *Cgroup `json:"cgroups"`
-
-	// AppArmorProfile specifies the profile to apply to the process running in the container and is
-	// change at the time the process is execed
-	AppArmorProfile string `json:"apparmor_profile,omitempty"`
-
-	// ProcessLabel specifies the label to apply to the process running in the container.  It is
-	// commonly used by selinux
-	ProcessLabel string `json:"process_label,omitempty"`
-
-	// Rlimits specifies the resource limits, such as max open files, to set in the container
-	// If Rlimits are not set, the container will inherit rlimits from the parent process
-	Rlimits []Rlimit `json:"rlimits,omitempty"`
-
-	// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
-	// for a process. Valid values are between the range [-1000, '1000'], where processes with
-	// higher scores are preferred for being killed.
-	// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
-	OomScoreAdj int `json:"oom_score_adj"`
-
-	// UidMappings is an array of User ID mappings for User Namespaces
-	UidMappings []IDMap `json:"uid_mappings"`
-
-	// GidMappings is an array of Group ID mappings for User Namespaces
-	GidMappings []IDMap `json:"gid_mappings"`
-
-	// MaskPaths specifies paths within the container's rootfs to mask over with a bind
-	// mount pointing to /dev/null as to prevent reads of the file.
-	MaskPaths []string `json:"mask_paths"`
-
-	// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
-	// so that these files prevent any writes.
-	ReadonlyPaths []string `json:"readonly_paths"`
-
-	// Sysctl is a map of properties and their values. It is the equivalent of using
-	// sysctl -w my.property.name value in Linux.
-	Sysctl map[string]string `json:"sysctl"`
-
-	// Seccomp allows actions to be taken whenever a syscall is made within the container.
-	// A number of rules are given, each having an action to be taken if a syscall matches it.
-	// A default action to be taken if no rules match is also given.
-	Seccomp *Seccomp `json:"seccomp"`
-
-	// NoNewPrivileges controls whether processes in the container can gain additional privileges.
-	NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
-
-	// Hooks are a collection of actions to perform at various container lifecycle events.
-	// CommandHooks are serialized to JSON, but other hooks are not.
-	Hooks *Hooks
-
-	// Version is the version of opencontainer specification that is supported.
-	Version string `json:"version"`
-
-	// Labels are user defined metadata that is stored in the config and populated on the state
-	Labels []string `json:"labels"`
-
-	// NoNewKeyring will not allocated a new session keyring for the container.  It will use the
-	// callers keyring in this case.
-	NoNewKeyring bool `json:"no_new_keyring"`
-
-	// Rootless specifies whether the container is a rootless container.
-	Rootless bool `json:"rootless"`
-
-	// IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into
-	// to limit the resources (e.g., L3 cache) the container has available
-	IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
-}
-
-type Hooks struct {
-	// Prestart commands are executed after the container namespaces are created,
-	// but before the user supplied command is executed from init.
-	Prestart []Hook
-
-	// Poststart commands are executed after the container init process starts.
-	Poststart []Hook
-
-	// Poststop commands are executed after the container init process exits.
-	Poststop []Hook
-}
-
-type Capabilities struct {
-	// Bounding is the set of capabilities checked by the kernel.
-	Bounding []string
-	// Effective is the set of capabilities checked by the kernel.
-	Effective []string
-	// Inheritable is the capabilities preserved across execve.
-	Inheritable []string
-	// Permitted is the limiting superset for effective capabilities.
-	Permitted []string
-	// Ambient is the ambient set of capabilities that are kept.
-	Ambient []string
-}
-
-func (hooks *Hooks) UnmarshalJSON(b []byte) error {
-	var state struct {
-		Prestart  []CommandHook
-		Poststart []CommandHook
-		Poststop  []CommandHook
-	}
-
-	if err := json.Unmarshal(b, &state); err != nil {
-		return err
-	}
-
-	deserialize := func(shooks []CommandHook) (hooks []Hook) {
-		for _, shook := range shooks {
-			hooks = append(hooks, shook)
-		}
-
-		return hooks
-	}
-
-	hooks.Prestart = deserialize(state.Prestart)
-	hooks.Poststart = deserialize(state.Poststart)
-	hooks.Poststop = deserialize(state.Poststop)
-	return nil
-}
-
-func (hooks Hooks) MarshalJSON() ([]byte, error) {
-	serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
-		for _, hook := range hooks {
-			switch chook := hook.(type) {
-			case CommandHook:
-				serializableHooks = append(serializableHooks, chook)
-			default:
-				logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
-			}
-		}
-
-		return serializableHooks
-	}
-
-	return json.Marshal(map[string]interface{}{
-		"prestart":  serialize(hooks.Prestart),
-		"poststart": serialize(hooks.Poststart),
-		"poststop":  serialize(hooks.Poststop),
-	})
-}
-
-// HookState is the payload provided to a hook on execution.
-type HookState specs.State
-
-type Hook interface {
-	// Run executes the hook with the provided state.
-	Run(HookState) error
-}
-
-// NewFunctionHook will call the provided function when the hook is run.
-func NewFunctionHook(f func(HookState) error) FuncHook {
-	return FuncHook{
-		run: f,
-	}
-}
-
-type FuncHook struct {
-	run func(HookState) error
-}
-
-func (f FuncHook) Run(s HookState) error {
-	return f.run(s)
-}
-
-type Command struct {
-	Path    string         `json:"path"`
-	Args    []string       `json:"args"`
-	Env     []string       `json:"env"`
-	Dir     string         `json:"dir"`
-	Timeout *time.Duration `json:"timeout"`
-}
-
-// NewCommandHook will execute the provided command when the hook is run.
-func NewCommandHook(cmd Command) CommandHook {
-	return CommandHook{
-		Command: cmd,
-	}
-}
-
-type CommandHook struct {
-	Command
-}
-
-func (c Command) Run(s HookState) error {
-	b, err := json.Marshal(s)
-	if err != nil {
-		return err
-	}
-	var stdout, stderr bytes.Buffer
-	cmd := exec.Cmd{
-		Path:   c.Path,
-		Args:   c.Args,
-		Env:    c.Env,
-		Stdin:  bytes.NewReader(b),
-		Stdout: &stdout,
-		Stderr: &stderr,
-	}
-	if err := cmd.Start(); err != nil {
-		return err
-	}
-	errC := make(chan error, 1)
-	go func() {
-		err := cmd.Wait()
-		if err != nil {
-			err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
-		}
-		errC <- err
-	}()
-	var timerCh <-chan time.Time
-	if c.Timeout != nil {
-		timer := time.NewTimer(*c.Timeout)
-		defer timer.Stop()
-		timerCh = timer.C
-	}
-	select {
-	case err := <-errC:
-		return err
-	case <-timerCh:
-		cmd.Process.Kill()
-		cmd.Wait()
-		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
-	}
-}

+ 0 - 61
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go

@@ -1,61 +0,0 @@
-package configs
-
-import "fmt"
-
-// HostUID gets the translated uid for the process on host which could be
-// different when user namespaces are enabled.
-func (c Config) HostUID(containerId int) (int, error) {
-	if c.Namespaces.Contains(NEWUSER) {
-		if c.UidMappings == nil {
-			return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.")
-		}
-		id, found := c.hostIDFromMapping(containerId, c.UidMappings)
-		if !found {
-			return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.")
-		}
-		return id, nil
-	}
-	// Return unchanged id.
-	return containerId, nil
-}
-
-// HostRootUID gets the root uid for the process on host which could be non-zero
-// when user namespaces are enabled.
-func (c Config) HostRootUID() (int, error) {
-	return c.HostUID(0)
-}
-
-// HostGID gets the translated gid for the process on host which could be
-// different when user namespaces are enabled.
-func (c Config) HostGID(containerId int) (int, error) {
-	if c.Namespaces.Contains(NEWUSER) {
-		if c.GidMappings == nil {
-			return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.")
-		}
-		id, found := c.hostIDFromMapping(containerId, c.GidMappings)
-		if !found {
-			return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.")
-		}
-		return id, nil
-	}
-	// Return unchanged id.
-	return containerId, nil
-}
-
-// HostRootGID gets the root gid for the process on host which could be non-zero
-// when user namespaces are enabled.
-func (c Config) HostRootGID() (int, error) {
-	return c.HostGID(0)
-}
-
-// Utility function that gets a host ID for a container ID from user namespace map
-// if that ID is present in the map.
-func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
-	for _, m := range uMap {
-		if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
-			hostID := m.HostID + (containerID - m.ContainerID)
-			return hostID, true
-		}
-	}
-	return -1, false
-}

+ 0 - 57
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go

@@ -1,57 +0,0 @@
-package configs
-
-import (
-	"fmt"
-	"os"
-)
-
-const (
-	Wildcard = -1
-)
-
-// TODO Windows: This can be factored out in the future
-
-type Device struct {
-	// Device type, block, char, etc.
-	Type rune `json:"type"`
-
-	// Path to the device.
-	Path string `json:"path"`
-
-	// Major is the device's major number.
-	Major int64 `json:"major"`
-
-	// Minor is the device's minor number.
-	Minor int64 `json:"minor"`
-
-	// Cgroup permissions format, rwm.
-	Permissions string `json:"permissions"`
-
-	// FileMode permission bits for the device.
-	FileMode os.FileMode `json:"file_mode"`
-
-	// Uid of the device.
-	Uid uint32 `json:"uid"`
-
-	// Gid of the device.
-	Gid uint32 `json:"gid"`
-
-	// Write the file to the allowed list
-	Allow bool `json:"allow"`
-}
-
-func (d *Device) CgroupString() string {
-	return fmt.Sprintf("%c %s:%s %s", d.Type, deviceNumberString(d.Major), deviceNumberString(d.Minor), d.Permissions)
-}
-
-func (d *Device) Mkdev() int {
-	return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12))
-}
-
-// deviceNumberString converts the device number to a string return result.
-func deviceNumberString(number int64) string {
-	if number == Wildcard {
-		return "*"
-	}
-	return fmt.Sprint(number)
-}

+ 0 - 111
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go

@@ -1,111 +0,0 @@
-// +build linux
-
-package configs
-
-var (
-	// DefaultSimpleDevices are devices that are to be both allowed and created.
-	DefaultSimpleDevices = []*Device{
-		// /dev/null and zero
-		{
-			Path:        "/dev/null",
-			Type:        'c',
-			Major:       1,
-			Minor:       3,
-			Permissions: "rwm",
-			FileMode:    0666,
-		},
-		{
-			Path:        "/dev/zero",
-			Type:        'c',
-			Major:       1,
-			Minor:       5,
-			Permissions: "rwm",
-			FileMode:    0666,
-		},
-
-		{
-			Path:        "/dev/full",
-			Type:        'c',
-			Major:       1,
-			Minor:       7,
-			Permissions: "rwm",
-			FileMode:    0666,
-		},
-
-		// consoles and ttys
-		{
-			Path:        "/dev/tty",
-			Type:        'c',
-			Major:       5,
-			Minor:       0,
-			Permissions: "rwm",
-			FileMode:    0666,
-		},
-
-		// /dev/urandom,/dev/random
-		{
-			Path:        "/dev/urandom",
-			Type:        'c',
-			Major:       1,
-			Minor:       9,
-			Permissions: "rwm",
-			FileMode:    0666,
-		},
-		{
-			Path:        "/dev/random",
-			Type:        'c',
-			Major:       1,
-			Minor:       8,
-			Permissions: "rwm",
-			FileMode:    0666,
-		},
-	}
-	DefaultAllowedDevices = append([]*Device{
-		// allow mknod for any device
-		{
-			Type:        'c',
-			Major:       Wildcard,
-			Minor:       Wildcard,
-			Permissions: "m",
-		},
-		{
-			Type:        'b',
-			Major:       Wildcard,
-			Minor:       Wildcard,
-			Permissions: "m",
-		},
-
-		{
-			Path:        "/dev/console",
-			Type:        'c',
-			Major:       5,
-			Minor:       1,
-			Permissions: "rwm",
-		},
-		// /dev/pts/ - pts namespaces are "coming soon"
-		{
-			Path:        "",
-			Type:        'c',
-			Major:       136,
-			Minor:       Wildcard,
-			Permissions: "rwm",
-		},
-		{
-			Path:        "",
-			Type:        'c',
-			Major:       5,
-			Minor:       2,
-			Permissions: "rwm",
-		},
-
-		// tuntap
-		{
-			Path:        "",
-			Type:        'c',
-			Major:       10,
-			Minor:       200,
-			Permissions: "rwm",
-		},
-	}, DefaultSimpleDevices...)
-	DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...)
-)

+ 0 - 9
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go

@@ -1,9 +0,0 @@
-package configs
-
-type HugepageLimit struct {
-	// which type of hugepage to limit.
-	Pagesize string `json:"page_size"`
-
-	// usage limit for hugepage.
-	Limit uint64 `json:"limit"`
-}

+ 0 - 7
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go

@@ -1,7 +0,0 @@
-package configs
-
-type IntelRdt struct {
-	// The schema for L3 cache id and capacity bitmask (CBM)
-	// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
-	L3CacheSchema string `json:"l3_cache_schema,omitempty"`
-}

+ 0 - 14
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go

@@ -1,14 +0,0 @@
-package configs
-
-import (
-	"fmt"
-)
-
-type IfPrioMap struct {
-	Interface string `json:"interface"`
-	Priority  int64  `json:"priority"`
-}
-
-func (i *IfPrioMap) CgroupString() string {
-	return fmt.Sprintf("%s %d", i.Interface, i.Priority)
-}

+ 0 - 39
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go

@@ -1,39 +0,0 @@
-package configs
-
-const (
-	// EXT_COPYUP is a directive to copy up the contents of a directory when
-	// a tmpfs is mounted over it.
-	EXT_COPYUP = 1 << iota
-)
-
-type Mount struct {
-	// Source path for the mount.
-	Source string `json:"source"`
-
-	// Destination path for the mount inside the container.
-	Destination string `json:"destination"`
-
-	// Device the mount is for.
-	Device string `json:"device"`
-
-	// Mount flags.
-	Flags int `json:"flags"`
-
-	// Propagation Flags
-	PropagationFlags []int `json:"propagation_flags"`
-
-	// Mount data applied to the mount.
-	Data string `json:"data"`
-
-	// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
-	Relabel string `json:"relabel"`
-
-	// Extensions are additional flags that are specific to runc.
-	Extensions int `json:"extensions"`
-
-	// Optional Command to be run before Source is mounted.
-	PremountCmds []Command `json:"premount_cmds"`
-
-	// Optional Command to be run after Source is mounted.
-	PostmountCmds []Command `json:"postmount_cmds"`
-}

+ 0 - 5
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go

@@ -1,5 +0,0 @@
-package configs
-
-type NamespaceType string
-
-type Namespaces []Namespace

+ 0 - 122
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go

@@ -1,122 +0,0 @@
-package configs
-
-import (
-	"fmt"
-	"os"
-	"sync"
-)
-
-const (
-	NEWNET  NamespaceType = "NEWNET"
-	NEWPID  NamespaceType = "NEWPID"
-	NEWNS   NamespaceType = "NEWNS"
-	NEWUTS  NamespaceType = "NEWUTS"
-	NEWIPC  NamespaceType = "NEWIPC"
-	NEWUSER NamespaceType = "NEWUSER"
-)
-
-var (
-	nsLock              sync.Mutex
-	supportedNamespaces = make(map[NamespaceType]bool)
-)
-
-// NsName converts the namespace type to its filename
-func NsName(ns NamespaceType) string {
-	switch ns {
-	case NEWNET:
-		return "net"
-	case NEWNS:
-		return "mnt"
-	case NEWPID:
-		return "pid"
-	case NEWIPC:
-		return "ipc"
-	case NEWUSER:
-		return "user"
-	case NEWUTS:
-		return "uts"
-	}
-	return ""
-}
-
-// IsNamespaceSupported returns whether a namespace is available or
-// not
-func IsNamespaceSupported(ns NamespaceType) bool {
-	nsLock.Lock()
-	defer nsLock.Unlock()
-	supported, ok := supportedNamespaces[ns]
-	if ok {
-		return supported
-	}
-	nsFile := NsName(ns)
-	// if the namespace type is unknown, just return false
-	if nsFile == "" {
-		return false
-	}
-	_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
-	// a namespace is supported if it exists and we have permissions to read it
-	supported = err == nil
-	supportedNamespaces[ns] = supported
-	return supported
-}
-
-func NamespaceTypes() []NamespaceType {
-	return []NamespaceType{
-		NEWUSER, // Keep user NS always first, don't move it.
-		NEWIPC,
-		NEWUTS,
-		NEWNET,
-		NEWPID,
-		NEWNS,
-	}
-}
-
-// Namespace defines configuration for each namespace.  It specifies an
-// alternate path that is able to be joined via setns.
-type Namespace struct {
-	Type NamespaceType `json:"type"`
-	Path string        `json:"path"`
-}
-
-func (n *Namespace) GetPath(pid int) string {
-	return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
-}
-
-func (n *Namespaces) Remove(t NamespaceType) bool {
-	i := n.index(t)
-	if i == -1 {
-		return false
-	}
-	*n = append((*n)[:i], (*n)[i+1:]...)
-	return true
-}
-
-func (n *Namespaces) Add(t NamespaceType, path string) {
-	i := n.index(t)
-	if i == -1 {
-		*n = append(*n, Namespace{Type: t, Path: path})
-		return
-	}
-	(*n)[i].Path = path
-}
-
-func (n *Namespaces) index(t NamespaceType) int {
-	for i, ns := range *n {
-		if ns.Type == t {
-			return i
-		}
-	}
-	return -1
-}
-
-func (n *Namespaces) Contains(t NamespaceType) bool {
-	return n.index(t) != -1
-}
-
-func (n *Namespaces) PathOf(t NamespaceType) string {
-	i := n.index(t)
-	if i == -1 {
-		return ""
-	}
-	return (*n)[i].Path
-}

+ 0 - 31
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go

@@ -1,31 +0,0 @@
-// +build linux
-
-package configs
-
-import "golang.org/x/sys/unix"
-
-func (n *Namespace) Syscall() int {
-	return namespaceInfo[n.Type]
-}
-
-var namespaceInfo = map[NamespaceType]int{
-	NEWNET:  unix.CLONE_NEWNET,
-	NEWNS:   unix.CLONE_NEWNS,
-	NEWUSER: unix.CLONE_NEWUSER,
-	NEWIPC:  unix.CLONE_NEWIPC,
-	NEWUTS:  unix.CLONE_NEWUTS,
-	NEWPID:  unix.CLONE_NEWPID,
-}
-
-// CloneFlags parses the container's Namespaces options to set the correct
-// flags on clone, unshare. This function returns flags only for new namespaces.
-func (n *Namespaces) CloneFlags() uintptr {
-	var flag int
-	for _, v := range *n {
-		if v.Path != "" {
-			continue
-		}
-		flag |= namespaceInfo[v.Type]
-	}
-	return uintptr(flag)
-}

+ 0 - 13
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go

@@ -1,13 +0,0 @@
-// +build !linux,!windows
-
-package configs
-
-func (n *Namespace) Syscall() int {
-	panic("No namespace syscall support")
-}
-
-// CloneFlags parses the container's Namespaces options to set the correct
-// flags on clone, unshare. This function returns flags only for new namespaces.
-func (n *Namespaces) CloneFlags() uintptr {
-	panic("No namespace syscall support")
-}

+ 0 - 8
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go

@@ -1,8 +0,0 @@
-// +build !linux
-
-package configs
-
-// Namespace defines configuration for each namespace.  It specifies an
-// alternate path that is able to be joined via setns.
-type Namespace struct {
-}

+ 0 - 72
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go

@@ -1,72 +0,0 @@
-package configs
-
-// Network defines configuration for a container's networking stack
-//
-// The network configuration can be omitted from a container causing the
-// container to be setup with the host's networking stack
-type Network struct {
-	// Type sets the networks type, commonly veth and loopback
-	Type string `json:"type"`
-
-	// Name of the network interface
-	Name string `json:"name"`
-
-	// The bridge to use.
-	Bridge string `json:"bridge"`
-
-	// MacAddress contains the MAC address to set on the network interface
-	MacAddress string `json:"mac_address"`
-
-	// Address contains the IPv4 and mask to set on the network interface
-	Address string `json:"address"`
-
-	// Gateway sets the gateway address that is used as the default for the interface
-	Gateway string `json:"gateway"`
-
-	// IPv6Address contains the IPv6 and mask to set on the network interface
-	IPv6Address string `json:"ipv6_address"`
-
-	// IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface
-	IPv6Gateway string `json:"ipv6_gateway"`
-
-	// Mtu sets the mtu value for the interface and will be mirrored on both the host and
-	// container's interfaces if a pair is created, specifically in the case of type veth
-	// Note: This does not apply to loopback interfaces.
-	Mtu int `json:"mtu"`
-
-	// TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and
-	// container's interfaces if a pair is created, specifically in the case of type veth
-	// Note: This does not apply to loopback interfaces.
-	TxQueueLen int `json:"txqueuelen"`
-
-	// HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the
-	// container.
-	HostInterfaceName string `json:"host_interface_name"`
-
-	// HairpinMode specifies if hairpin NAT should be enabled on the virtual interface
-	// bridge port in the case of type veth
-	// Note: This is unsupported on some systems.
-	// Note: This does not apply to loopback interfaces.
-	HairpinMode bool `json:"hairpin_mode"`
-}
-
-// Routes can be specified to create entries in the route table as the container is started
-//
-// All of destination, source, and gateway should be either IPv4 or IPv6.
-// One of the three options must be present, and omitted entries will use their
-// IP family default for the route table.  For IPv4 for example, setting the
-// gateway to 1.2.3.4 and the interface to eth0 will set up a standard
-// destination of 0.0.0.0(or *) when viewed in the route table.
-type Route struct {
-	// Sets the destination and mask, should be a CIDR.  Accepts IPv4 and IPv6
-	Destination string `json:"destination"`
-
-	// Sets the source and mask, should be a CIDR.  Accepts IPv4 and IPv6
-	Source string `json:"source"`
-
-	// Sets the gateway.  Accepts IPv4 and IPv6
-	Gateway string `json:"gateway"`
-
-	// The device to set this route up for, for example: eth0
-	InterfaceName string `json:"interface_name"`
-}

+ 3 - 3
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/nsenter/README.md

@@ -10,8 +10,8 @@ The `nsenter` package will `import "C"` and it uses [cgo](https://golang.org/cmd
 package. In cgo, if the import of "C" is immediately preceded by a comment, that comment, 
 called the preamble, is used as a header when compiling the C parts of the package.
 So every time we  import package `nsenter`, the C code function `nsexec()` would be 
-called. And package `nsenter` is now only imported in `main_unix.go`, so every time
-before we call `cmd.Start` on linux, that C code would run.
+called. And package `nsenter` is only imported in `init.go`, so every time the runc
+`init` command is invoked, that C code is run.
 
 Because `nsexec()` must be run before the Go runtime in order to use the
 Linux kernel namespace, you must `import` this library into a package if
@@ -37,7 +37,7 @@ the parent `nsexec()` will exit and the child `nsexec()` process will
 return to allow the Go runtime to take over.
 
 NOTE: We do both `setns(2)` and `clone(2)` even if we don't have any
-CLONE_NEW* clone flags because we must fork a new process in order to
+`CLONE_NEW*` clone flags because we must fork a new process in order to
 enter the PID namespace.
 
 

+ 79 - 47
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c

@@ -42,6 +42,12 @@ enum sync_t {
 	SYNC_ERR = 0xFF,	/* Fatal error, no turning back. The error code follows. */
 };
 
+/*
+ * Synchronisation value for cgroup namespace setup.
+ * The same constant is defined in process_linux.go as "createCgroupns".
+ */
+#define CREATECGROUPNS 0x80
+
 /* longjmp() arguments. */
 #define JUMP_PARENT 0x00
 #define JUMP_CHILD  0xA0
@@ -82,7 +88,7 @@ struct nlconfig_t {
 	uint8_t is_setgroup;
 
 	/* Rootless container settings. */
-	uint8_t is_rootless;
+	uint8_t is_rootless_euid;	/* boolean */
 	char *uidmappath;
 	size_t uidmappath_len;
 	char *gidmappath;
@@ -100,7 +106,7 @@ struct nlconfig_t {
 #define GIDMAP_ATTR			27284
 #define SETGROUP_ATTR		27285
 #define OOM_SCORE_ADJ_ATTR	27286
-#define ROOTLESS_ATTR	    27287
+#define ROOTLESS_EUID_ATTR	27287
 #define UIDMAPPATH_ATTR	    27288
 #define GIDMAPPATH_ATTR	    27289
 
@@ -211,7 +217,7 @@ static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
 
 	/*
 	 * If @app is NULL, execve will segfault. Just check it here and bail (if
-	 * we're in this path, the caller is already getting desparate and there
+	 * we're in this path, the caller is already getting desperate and there
 	 * isn't a backup to this failing). This usually would be a configuration
 	 * or programming issue.
 	 */
@@ -419,8 +425,8 @@ static void nl_parse(int fd, struct nlconfig_t *config)
 		case CLONE_FLAGS_ATTR:
 			config->cloneflags = readint32(current);
 			break;
-		case ROOTLESS_ATTR:
-			config->is_rootless = readint8(current);
+		case ROOTLESS_EUID_ATTR:
+			config->is_rootless_euid = readint8(current);	/* boolean */
 			break;
 		case OOM_SCORE_ADJ_ATTR:
 			config->oom_score_adj = current;
@@ -505,7 +511,8 @@ void join_namespaces(char *nslist)
 
 		ns->fd = fd;
 		ns->ns = nsflag(namespace);
-		strncpy(ns->path, path, PATH_MAX);
+		strncpy(ns->path, path, PATH_MAX - 1);
+		ns->path[PATH_MAX - 1] = '\0';
 	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
 
 	/*
@@ -639,7 +646,6 @@ void nsexec(void)
 	case JUMP_PARENT:{
 			int len;
 			pid_t child, first_child = -1;
-			char buf[JSON_MAX];
 			bool ready = false;
 
 			/* For debugging. */
@@ -678,17 +684,15 @@ void nsexec(void)
 					/*
 					 * Enable setgroups(2) if we've been asked to. But we also
 					 * have to explicitly disable setgroups(2) if we're
-					 * creating a rootless container (this is required since
-					 * Linux 3.19).
+					 * creating a rootless container for single-entry mapping.
+					 * i.e. config.is_setgroup == false.
+					 * (this is required since Linux 3.19).
+					 *
+					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
+					 * newuidmap/newgidmap shall be used.
 					 */
-					if (config.is_rootless && config.is_setgroup) {
-						kill(child, SIGKILL);
-						bail("cannot allow setgroup in an unprivileged user namespace setup");
-					}
 
-					if (config.is_setgroup)
-						update_setgroups(child, SETGROUPS_ALLOW);
-					if (config.is_rootless)
+					if (config.is_rootless_euid && !config.is_setgroup)
 						update_setgroups(child, SETGROUPS_DENY);
 
 					/* Set up mappings. */
@@ -717,6 +721,18 @@ void nsexec(void)
 							kill(child, SIGKILL);
 							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
 						}
+
+						/*
+						 * Send the init_func pid and the pid of the first child back to our parent.
+						 *
+						 * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
+						 * It becomes the responsibility of our parent to reap the first child.
+						 */
+						len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
+						if (len < 0) {
+							kill(child, SIGKILL);
+							bail("unable to generate JSON for child pid");
+						}
 					}
 					break;
 				case SYNC_CHILD_READY:
@@ -760,23 +776,6 @@ void nsexec(void)
 					bail("unexpected sync value: %u", s);
 				}
 			}
-
-			/*
-			 * Send the init_func pid and the pid of the first child back to our parent.
-			 *
-			 * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
-			 * It becomes the responsibility of our parent to reap the first child.
-			 */
-			len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
-			if (len < 0) {
-				kill(child, SIGKILL);
-				bail("unable to generate JSON for child pid");
-			}
-			if (write(pipenum, buf, len) != len) {
-				kill(child, SIGKILL);
-				bail("unable to send child pid to bootstrapper");
-			}
-
 			exit(0);
 		}
 
@@ -809,25 +808,30 @@ void nsexec(void)
 			if (config.namespaces)
 				join_namespaces(config.namespaces);
 
-			/*
-			 * Unshare all of the namespaces. Now, it should be noted that this
-			 * ordering might break in the future (especially with rootless
-			 * containers). But for now, it's not possible to split this into
-			 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
-			 *
-			 * Note that we don't merge this with clone() because there were
-			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
-			 * was broken, so we'll just do it the long way anyway.
-			 */
-			if (unshare(config.cloneflags) < 0)
-				bail("failed to unshare namespaces");
-
 			/*
 			 * Deal with user namespaces first. They are quite special, as they
 			 * affect our ability to unshare other namespaces and are used as
 			 * context for privilege checks.
+			 *
+			 * We don't unshare all namespaces in one go. The reason for this
+			 * is that, while the kernel documentation may claim otherwise,
+			 * there are certain cases where unsharing all namespaces at once
+			 * will result in namespace objects being owned incorrectly.
+			 * Ideally we should just fix these kernel bugs, but it's better to
+			 * be safe than sorry, and fix them separately.
+			 *
+			 * A specific case of this is that the SELinux label of the
+			 * internal kern-mount that mqueue uses will be incorrect if the
+			 * UTS namespace is cloned before the USER namespace is mapped.
+			 * I've also heard of similar problems with the network namespace
+			 * in some scenarios. This also mirrors how LXC deals with this
+			 * problem.
 			 */
 			if (config.cloneflags & CLONE_NEWUSER) {
+				if (unshare(CLONE_NEWUSER) < 0)
+					bail("failed to unshare user namespace");
+				config.cloneflags &= ~CLONE_NEWUSER;
+
 				/*
 				 * We don't have the privileges to do any mapping here (see the
 				 * clone_parent rant). So signal our parent to hook us up.
@@ -853,7 +857,23 @@ void nsexec(void)
 					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
 						bail("failed to set process as dumpable");
 				}
+
+				/* Become root in the namespace proper. */
+				if (setresuid(0, 0, 0) < 0)
+					bail("failed to become root in user namespace");
 			}
+			/*
+			 * Unshare the remaining namespaces. CLONE_NEWUSER (if requested)
+			 * was already unshared and cleared from cloneflags above, and
+			 * CLONE_NEWCGROUP is masked out here because the cgroup namespace
+			 * is unshared later, after CREATECGROUPNS is read from the pipe.
+			 *
+			 * Note that we don't merge this with clone() because there were
+			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
+			 * was broken, so we'll just do it the long way anyway.
+			 */
+			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
+				bail("failed to unshare namespaces");
 
 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
@@ -936,11 +956,23 @@ void nsexec(void)
 			if (setgid(0) < 0)
 				bail("setgid failed");
 
-			if (!config.is_rootless && config.is_setgroup) {
+			if (!config.is_rootless_euid && config.is_setgroup) {
 				if (setgroups(0, NULL) < 0)
 					bail("setgroups failed");
 			}
 
+			/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
+			if (config.cloneflags & CLONE_NEWCGROUP) {
+				uint8_t value;
+				if (read(pipenum, &value, sizeof(value)) != sizeof(value))
+					bail("read synchronisation value failed");
+				if (value == CREATECGROUPNS) {
+					if (unshare(CLONE_NEWCGROUP) < 0)
+						bail("failed to unshare cgroup namespace");
+				} else
+					bail("received unknown synchronisation value");
+			}
+
 			s = SYNC_CHILD_READY;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 				bail("failed to sync with patent: write(SYNC_CHILD_READY)");

+ 28 - 0
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go

@@ -5,6 +5,7 @@ package user
 import (
 	"io"
 	"os"
+	"strconv"
 
 	"golang.org/x/sys/unix"
 )
@@ -114,3 +115,30 @@ func CurrentUser() (User, error) {
 func CurrentGroup() (Group, error) {
 	return LookupGid(unix.Getgid())
 }
+
+func currentUserSubIDs(fileName string) ([]SubID, error) {
+	u, err := CurrentUser()
+	if err != nil {
+		return nil, err
+	}
+	filter := func(entry SubID) bool {
+		return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid)
+	}
+	return ParseSubIDFileFilter(fileName, filter)
+}
+
+func CurrentUserSubUIDs() ([]SubID, error) {
+	return currentUserSubIDs("/etc/subuid")
+}
+
+func CurrentUserSubGIDs() ([]SubID, error) {
+	return currentUserSubIDs("/etc/subgid")
+}
+
+func CurrentProcessUIDMap() ([]IDMap, error) {
+	return ParseIDMapFile("/proc/self/uid_map")
+}
+
+func CurrentProcessGIDMap() ([]IDMap, error) {
+	return ParseIDMapFile("/proc/self/gid_map")
+}

+ 130 - 3
libnetwork/vendor/github.com/opencontainers/runc/libcontainer/user/user.go

@@ -75,12 +75,29 @@ func groupFromOS(g *user.Group) (Group, error) {
 	return newGroup, nil
 }
 
+// SubID represents an entry in /etc/sub{u,g}id
+type SubID struct {
+	Name  string
+	SubID int64
+	Count int64
+}
+
+// IDMap represents an entry in /proc/PID/{u,g}id_map
+type IDMap struct {
+	ID       int64
+	ParentID int64
+	Count    int64
+}
+
 func parseLine(line string, v ...interface{}) {
-	if line == "" {
+	parseParts(strings.Split(line, ":"), v...)
+}
+
+func parseParts(parts []string, v ...interface{}) {
+	if len(parts) == 0 {
 		return
 	}
 
-	parts := strings.Split(line, ":")
 	for i, p := range parts {
 		// Ignore cases where we don't have enough fields to populate the arguments.
 		// Some configuration files like to misbehave.
@@ -96,6 +113,8 @@ func parseLine(line string, v ...interface{}) {
 		case *int:
 			// "numbers", with conversion errors ignored because of some misbehaving configuration files.
 			*e, _ = strconv.Atoi(p)
+		case *int64:
+			*e, _ = strconv.ParseInt(p, 10, 64)
 		case *[]string:
 			// Comma-separated lists.
 			if p != "" {
@@ -105,7 +124,7 @@ func parseLine(line string, v ...interface{}) {
 			}
 		default:
 			// Someone goof'd when writing code using this function. Scream so they can hear us.
-			panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e))
+			panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e))
 		}
 	}
 }
@@ -479,3 +498,111 @@ func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int
 	}
 	return GetAdditionalGroups(additionalGroups, group)
 }
+
+func ParseSubIDFile(path string) ([]SubID, error) {
+	subid, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer subid.Close()
+	return ParseSubID(subid)
+}
+
+func ParseSubID(subid io.Reader) ([]SubID, error) {
+	return ParseSubIDFilter(subid, nil)
+}
+
+func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) {
+	subid, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer subid.Close()
+	return ParseSubIDFilter(subid, filter)
+}
+
+func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) {
+	if r == nil {
+		return nil, fmt.Errorf("nil source for subid-formatted data")
+	}
+
+	var (
+		s   = bufio.NewScanner(r)
+		out = []SubID{}
+	)
+
+	for s.Scan() {
+		if err := s.Err(); err != nil {
+			return nil, err
+		}
+
+		line := strings.TrimSpace(s.Text())
+		if line == "" {
+			continue
+		}
+
+		// see: man 5 subuid
+		p := SubID{}
+		parseLine(line, &p.Name, &p.SubID, &p.Count)
+
+		if filter == nil || filter(p) {
+			out = append(out, p)
+		}
+	}
+
+	return out, nil
+}
+
+func ParseIDMapFile(path string) ([]IDMap, error) {
+	r, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer r.Close()
+	return ParseIDMap(r)
+}
+
+func ParseIDMap(r io.Reader) ([]IDMap, error) {
+	return ParseIDMapFilter(r, nil)
+}
+
+func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) {
+	r, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer r.Close()
+	return ParseIDMapFilter(r, filter)
+}
+
+func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) {
+	if r == nil {
+		return nil, fmt.Errorf("nil source for idmap-formatted data")
+	}
+
+	var (
+		s   = bufio.NewScanner(r)
+		out = []IDMap{}
+	)
+
+	for s.Scan() {
+		if err := s.Err(); err != nil {
+			return nil, err
+		}
+
+		line := strings.TrimSpace(s.Text())
+		if line == "" {
+			continue
+		}
+
+		// see: man 7 user_namespaces
+		p := IDMap{}
+		parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count)
+
+		if filter == nil || filter(p) {
+			out = append(out, p)
+		}
+	}
+
+	return out, nil
+}

+ 1 - 1
libnetwork/vendor/github.com/opencontainers/runc/vendor.conf

@@ -1,7 +1,7 @@
 # OCI runtime-spec. When updating this, make sure you use a version tag rather
 # than a commit ID so it's much more obvious what version of the spec we are
 # using.
-github.com/opencontainers/runtime-spec v1.0.0
+github.com/opencontainers/runtime-spec 5684b8af48c1ac3b1451fa499724e30e3c20a294
 # Core libcontainer functionality.
 github.com/mrunalp/fileutils ed869b029674c0e9ce4c0dfa781405c2d9946d08
 github.com/opencontainers/selinux v1.0.0-rc1