Browse Source

Merge pull request #20187 from cyphar/vendor-runc

[carry 19752] vendor: update runc/libcontainer to v0.0.8
Alexander Morozov 9 years ago
parent
commit
929f62e64d
41 changed files with 1326 additions and 530 deletions
  1. 1 1
      hack/vendor.sh
  2. 153 67
      vendor/src/github.com/opencontainers/runc/libcontainer/README.md
  3. 3 2
      vendor/src/github.com/opencontainers/runc/libcontainer/SPEC.md
  4. 3 0
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go
  5. 75 53
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go
  6. 1 6
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go
  7. 1 6
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go
  8. 2 6
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go
  9. 14 6
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go
  10. 1 6
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go
  11. 1 6
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go
  12. 22 8
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go
  13. 1 6
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go
  14. 1 6
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go
  15. 57 0
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go
  16. 8 0
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/stats.go
  17. 4 0
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go
  18. 105 144
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go
  19. 55 24
      vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
  20. 24 8
      vendor/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go
  21. 3 0
      vendor/src/github.com/opencontainers/runc/libcontainer/configs/config.go
  22. 3 0
      vendor/src/github.com/opencontainers/runc/libcontainer/configs/device.go
  23. 0 14
      vendor/src/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go
  24. 28 2
      vendor/src/github.com/opencontainers/runc/libcontainer/container.go
  25. 155 114
      vendor/src/github.com/opencontainers/runc/libcontainer/container_linux.go
  26. 6 1
      vendor/src/github.com/opencontainers/runc/libcontainer/error.go
  27. 24 9
      vendor/src/github.com/opencontainers/runc/libcontainer/factory_linux.go
  28. 12 0
      vendor/src/github.com/opencontainers/runc/libcontainer/generic_error.go
  29. 24 1
      vendor/src/github.com/opencontainers/runc/libcontainer/init_linux.go
  30. 67 0
      vendor/src/github.com/opencontainers/runc/libcontainer/keys/keyctl.go
  31. 40 14
      vendor/src/github.com/opencontainers/runc/libcontainer/notify_linux.go
  32. 1 0
      vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
  33. 3 3
      vendor/src/github.com/opencontainers/runc/libcontainer/process.go
  34. 52 11
      vendor/src/github.com/opencontainers/runc/libcontainer/process_linux.go
  35. 25 3
      vendor/src/github.com/opencontainers/runc/libcontainer/rootfs_linux.go
  36. 9 1
      vendor/src/github.com/opencontainers/runc/libcontainer/selinux/selinux.go
  37. 10 0
      vendor/src/github.com/opencontainers/runc/libcontainer/setns_init_linux.go
  38. 31 2
      vendor/src/github.com/opencontainers/runc/libcontainer/standard_init_linux.go
  39. 226 0
      vendor/src/github.com/opencontainers/runc/libcontainer/state_linux.go
  40. 45 0
      vendor/src/github.com/opencontainers/runc/libcontainer/system/linux.go
  41. 30 0
      vendor/src/github.com/opencontainers/runc/libcontainer/utils/utils.go

+ 1 - 1
hack/vendor.sh

@@ -59,7 +59,7 @@ clone git github.com/miekg/pkcs11 80f102b5cac759de406949c47f0928b99bd64cdf
 clone git github.com/docker/go v1.5.1-1-1-gbaf439e
 clone git github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c
 
-clone git github.com/opencontainers/runc 3d8a20bb772defc28c355534d83486416d1719b4 # libcontainer
+clone git github.com/opencontainers/runc ce72f86a2b54bc114d6ffb51f6500479b2d42154 # libcontainer
 clone git github.com/seccomp/libseccomp-golang 1b506fc7c24eec5a3693cdcbed40d9c226cfc6a1
 # libcontainer deps (see src/github.com/opencontainers/runc/Godeps/Godeps.json)
 clone git github.com/coreos/go-systemd v4

+ 153 - 67
vendor/src/github.com/opencontainers/runc/libcontainer/README.md

@@ -10,80 +10,165 @@ host system and which is (optionally) isolated from other containers in the syst
 
 #### Using libcontainer
 
-To create a container you first have to initialize an instance of a factory
-that will handle the creation and initialization for a container.
+Because containers are spawned in a two-step process, you will need a binary that
+will be executed as the init process for the container. In libcontainer, we use
+the current binary (/proc/self/exe) as the init process and pass it the argument
+"init". We call this first step "bootstrap", so you always need an "init"
+function as the entry point of "bootstrap".
 
-Because containers are spawned in a two step process you will need to provide
-arguments to a binary that will be executed as the init process for the container.
-To use the current binary that is spawning the containers and acting as the parent
-you can use `os.Args[0]` and we have a command called `init` setup.
+```go
+func init() {
+	if len(os.Args) > 1 && os.Args[1] == "init" {
+		runtime.GOMAXPROCS(1)
+		runtime.LockOSThread()
+		factory, _ := libcontainer.New("")
+		if err := factory.StartInitialization(); err != nil {
+			logrus.Fatal(err)
+		}
+		panic("--this line should have never been executed, congratulations--")
+	}
+}
+```
+
+Then to create a container you first have to initialize an instance of a factory
+that will handle the creation and initialization for a container.
 
 ```go
-root, err := libcontainer.New("/var/lib/container", libcontainer.InitArgs(os.Args[0], "init"))
+factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
 if err != nil {
-    log.Fatal(err)
+	logrus.Fatal(err)
+	return
 }
 ```
 
 Once you have an instance of the factory created we can create a configuration
-struct describing how the container is to be created.  A sample would look similar to this:
+struct describing how the container is to be created. A sample would look similar to this:
 
 ```go
+defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
 config := &configs.Config{
-    Rootfs: rootfs,
-    Capabilities: []string{
-        "CAP_CHOWN",
-        "CAP_DAC_OVERRIDE",
-        "CAP_FSETID",
-        "CAP_FOWNER",
-        "CAP_MKNOD",
-        "CAP_NET_RAW",
-        "CAP_SETGID",
-        "CAP_SETUID",
-        "CAP_SETFCAP",
-        "CAP_SETPCAP",
-        "CAP_NET_BIND_SERVICE",
-        "CAP_SYS_CHROOT",
-        "CAP_KILL",
-        "CAP_AUDIT_WRITE",
-    },
-    Namespaces: configs.Namespaces([]configs.Namespace{
-        {Type: configs.NEWNS},
-        {Type: configs.NEWUTS},
-        {Type: configs.NEWIPC},
-        {Type: configs.NEWPID},
-        {Type: configs.NEWNET},
-    }),
-    Cgroups: &configs.Cgroup{
-        Name:            "test-container",
-        Parent:          "system",
-        AllowAllDevices: false,
-        AllowedDevices:  configs.DefaultAllowedDevices,
-    },
-
-    Devices:  configs.DefaultAutoCreatedDevices,
-    Hostname: "testing",
-    Networks: []*configs.Network{
-        {
-            Type:    "loopback",
-            Address: "127.0.0.1/0",
-            Gateway: "localhost",
-        },
-    },
-    Rlimits: []configs.Rlimit{
-        {
-            Type: syscall.RLIMIT_NOFILE,
-            Hard: uint64(1024),
-            Soft: uint64(1024),
-        },
-    },
+	Rootfs: "/your/path/to/rootfs",
+	Capabilities: []string{
+		"CAP_CHOWN",
+		"CAP_DAC_OVERRIDE",
+		"CAP_FSETID",
+		"CAP_FOWNER",
+		"CAP_MKNOD",
+		"CAP_NET_RAW",
+		"CAP_SETGID",
+		"CAP_SETUID",
+		"CAP_SETFCAP",
+		"CAP_SETPCAP",
+		"CAP_NET_BIND_SERVICE",
+		"CAP_SYS_CHROOT",
+		"CAP_KILL",
+		"CAP_AUDIT_WRITE",
+	},
+	Namespaces: configs.Namespaces([]configs.Namespace{
+		{Type: configs.NEWNS},
+		{Type: configs.NEWUTS},
+		{Type: configs.NEWIPC},
+		{Type: configs.NEWPID},
+		{Type: configs.NEWUSER},
+		{Type: configs.NEWNET},
+	}),
+	Cgroups: &configs.Cgroup{
+		Name:   "test-container",
+		Parent: "system",
+		Resources: &configs.Resources{
+			MemorySwappiness: -1,
+			AllowAllDevices:  false,
+			AllowedDevices:   configs.DefaultAllowedDevices,
+		},
+	},
+	MaskPaths: []string{
+		"/proc/kcore",
+	},
+	ReadonlyPaths: []string{
+		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
+	},
+	Devices:  configs.DefaultAutoCreatedDevices,
+	Hostname: "testing",
+	Mounts: []*configs.Mount{
+		{
+			Source:      "proc",
+			Destination: "/proc",
+			Device:      "proc",
+			Flags:       defaultMountFlags,
+		},
+		{
+			Source:      "tmpfs",
+			Destination: "/dev",
+			Device:      "tmpfs",
+			Flags:       syscall.MS_NOSUID | syscall.MS_STRICTATIME,
+			Data:        "mode=755",
+		},
+		{
+			Source:      "devpts",
+			Destination: "/dev/pts",
+			Device:      "devpts",
+			Flags:       syscall.MS_NOSUID | syscall.MS_NOEXEC,
+			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
+		},
+		{
+			Device:      "tmpfs",
+			Source:      "shm",
+			Destination: "/dev/shm",
+			Data:        "mode=1777,size=65536k",
+			Flags:       defaultMountFlags,
+		},
+		{
+			Source:      "mqueue",
+			Destination: "/dev/mqueue",
+			Device:      "mqueue",
+			Flags:       defaultMountFlags,
+		},
+		{
+			Source:      "sysfs",
+			Destination: "/sys",
+			Device:      "sysfs",
+			Flags:       defaultMountFlags | syscall.MS_RDONLY,
+		},
+	},
+	UidMappings: []configs.IDMap{
+		{
+			ContainerID: 0,
+			HostID: 1000,
+			Size: 65536,
+		},
+	},
+	GidMappings: []configs.IDMap{
+		{
+			ContainerID: 0,
+			HostID: 1000,
+			Size: 65536,
+		},
+	},
+	Networks: []*configs.Network{
+		{
+			Type:    "loopback",
+			Address: "127.0.0.1/0",
+			Gateway: "localhost",
+		},
+	},
+	Rlimits: []configs.Rlimit{
+		{
+			Type: syscall.RLIMIT_NOFILE,
+			Hard: uint64(1025),
+			Soft: uint64(1025),
+		},
+	},
 }
 ```
 
 Once you have the configuration populated you can create a container:
 
 ```go
-container, err := root.Create("container-id", config)
+container, err := factory.Create("container-id", config)
+if err != nil {
+	logrus.Fatal(err)
+	return
+}
 ```
 
 To spawn bash as the initial process inside the container and have the
@@ -91,23 +176,25 @@ processes pid returned in order to wait, signal, or kill the process:
 
 ```go
 process := &libcontainer.Process{
-    Args:   []string{"/bin/bash"},
-    Env:    []string{"PATH=/bin"},
-    User:   "daemon",
-    Stdin:  os.Stdin,
-    Stdout: os.Stdout,
-    Stderr: os.Stderr,
+	Args:   []string{"/bin/bash"},
+	Env:    []string{"PATH=/bin"},
+	User:   "daemon",
+	Stdin:  os.Stdin,
+	Stdout: os.Stdout,
+	Stderr: os.Stderr,
 }
 
 err := container.Start(process)
 if err != nil {
-    log.Fatal(err)
+	logrus.Fatal(err)
+	container.Destroy()
+	return
 }
 
 // wait for the process to finish.
-status, err := process.Wait()
+_, err := process.Wait()
 if err != nil {
-    log.Fatal(err)
+	logrus.Fatal(err)
 }
 
 // destroy the container.
@@ -124,7 +211,6 @@ processes, err := container.Processes()
 // its processes.
 stats, err := container.Stats()
 
-
 // pause all processes inside the container.
 container.Pause()
 

+ 3 - 2
vendor/src/github.com/opencontainers/runc/libcontainer/SPEC.md

@@ -60,7 +60,7 @@ are required to be mounted within the rootfs that the runtime will setup.
 After a container's filesystems are mounted within the newly created 
 mount namespace `/dev` will need to be populated with a set of device nodes.
 It is expected that a rootfs does not need to have any device nodes specified
-for `/dev` witin the rootfs as the container will setup the correct devices
+for `/dev` within the rootfs as the container will setup the correct devices
 that are required for executing a container's process.
 
 |      Path    | Mode |   Access   |
@@ -142,6 +142,7 @@ system resources like cpu, memory, and device access.
 | perf_event | 1       |
 | freezer    | 1       |
 | hugetlb    | 1       |
+| pids       | 1       |
 
 
 All cgroup subsystems are joined so that statistics can be collected from
@@ -199,7 +200,7 @@ provide a good default for security and flexibility for the applications.
 | CAP_SYS_BOOT         | 0       |
 | CAP_LEASE            | 0       |
 | CAP_WAKE_ALARM       | 0       |
-| CAP_BLOCK_SUSPE      | 0       |
+| CAP_BLOCK_SUSPEND    | 0       |
 
 
 Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)

+ 3 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go

@@ -15,6 +15,9 @@ type Manager interface {
 	// Returns the PIDs inside the cgroup set
 	GetPids() ([]int, error)
 
+	// Returns the PIDs inside the cgroup set & all sub-cgroups
+	GetAllPids() ([]int, error)
+
 	// Returns statistics for the cgroup set
 	GetStats() (*Stats, error)
 

+ 75 - 53
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go

@@ -14,6 +14,7 @@ import (
 
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
 )
 
 var (
@@ -23,6 +24,7 @@ var (
 		&MemoryGroup{},
 		&CpuGroup{},
 		&CpuacctGroup{},
+		&PidsGroup{},
 		&BlkioGroup{},
 		&HugetlbGroup{},
 		&NetClsGroup{},
@@ -93,11 +95,10 @@ func getCgroupRoot() (string, error) {
 }
 
 type cgroupData struct {
-	root   string
-	parent string
-	name   string
-	config *configs.Cgroup
-	pid    int
+	root      string
+	innerPath string
+	config    *configs.Cgroup
+	pid       int
 }
 
 func (m *Manager) Apply(pid int) (err error) {
@@ -112,6 +113,22 @@ func (m *Manager) Apply(pid int) (err error) {
 		return err
 	}
 
+	if c.Paths != nil {
+		paths := make(map[string]string)
+		for name, path := range c.Paths {
+			_, err := d.path(name)
+			if err != nil {
+				if cgroups.IsNotFound(err) {
+					continue
+				}
+				return err
+			}
+			paths[name] = path
+		}
+		m.Paths = paths
+		return cgroups.EnterPid(m.Paths, pid)
+	}
+
 	paths := make(map[string]string)
 	defer func() {
 		if err != nil {
@@ -135,17 +152,13 @@ func (m *Manager) Apply(pid int) (err error) {
 		paths[sys.Name()] = p
 	}
 	m.Paths = paths
-
-	if paths["cpu"] != "" {
-		if err := CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil {
-			return err
-		}
-	}
-
 	return nil
 }
 
 func (m *Manager) Destroy() error {
+	if m.Cgroups.Paths != nil {
+		return nil
+	}
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	if err := cgroups.RemovePaths(m.Paths); err != nil {
@@ -179,15 +192,28 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
 }
 
 func (m *Manager) Set(container *configs.Config) error {
-	for name, path := range m.Paths {
-		sys, err := subsystems.Get(name)
-		if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
-			continue
+	for _, sys := range subsystems {
+		// Generate fake cgroup data.
+		d, err := getCgroupData(container.Cgroups, -1)
+		if err != nil {
+			return err
+		}
+		// Get the path, but don't error out if the cgroup wasn't found.
+		path, err := d.path(sys.Name())
+		if err != nil && !cgroups.IsNotFound(err) {
+			return err
 		}
+
 		if err := sys.Set(path, container.Cgroups); err != nil {
 			return err
 		}
 	}
+
+	if m.Paths["cpu"] != "" {
+		if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
+			return err
+		}
+	}
 	return nil
 }
 
@@ -217,41 +243,28 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
 }
 
 func (m *Manager) GetPids() ([]int, error) {
-	d, err := getCgroupData(m.Cgroups, 0)
+	dir, err := getCgroupPath(m.Cgroups)
 	if err != nil {
 		return nil, err
 	}
+	return cgroups.GetPids(dir)
+}
 
-	dir, err := d.path("devices")
+func (m *Manager) GetAllPids() ([]int, error) {
+	dir, err := getCgroupPath(m.Cgroups)
 	if err != nil {
 		return nil, err
 	}
-
-	return cgroups.GetPids(dir)
+	return cgroups.GetAllPids(dir)
 }
 
-// pathClean makes a path safe for use with filepath.Join. This is done by not
-// only cleaning the path, but also (if the path is relative) adding a leading
-// '/' and cleaning it (then removing the leading '/'). This ensures that a
-// path resulting from prepending another path will always resolve to lexically
-// be a subdirectory of the prefixed path. This is all done lexically, so paths
-// that include symlinks won't be safe as a result of using pathClean.
-func pathClean(path string) string {
-	// Ensure that all paths are cleaned (especially problematic ones like
-	// "/../../../../../" which can cause lots of issues).
-	path = filepath.Clean(path)
-
-	// If the path isn't absolute, we need to do more processing to fix paths
-	// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
-	// paths to relative ones.
-	if !filepath.IsAbs(path) {
-		path = filepath.Clean(string(os.PathSeparator) + path)
-		// This can't fail, as (by definition) all paths are relative to root.
-		path, _ = filepath.Rel(string(os.PathSeparator), path)
-	}
-
-	// Clean the path again for good measure.
-	return filepath.Clean(path)
+func getCgroupPath(c *configs.Cgroup) (string, error) {
+	d, err := getCgroupData(c, 0)
+	if err != nil {
+		return "", err
+	}
+
+	return d.path("devices")
 }
 
 func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
@@ -260,15 +273,25 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
 		return nil, err
 	}
 
-	// Clean the parent slice path.
-	c.Parent = pathClean(c.Parent)
+	if (c.Name != "" || c.Parent != "") && c.Path != "" {
+		return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
+	}
+
+	// XXX: Do not remove this code. Path safety is important! -- cyphar
+	cgPath := libcontainerUtils.CleanPath(c.Path)
+	cgParent := libcontainerUtils.CleanPath(c.Parent)
+	cgName := libcontainerUtils.CleanPath(c.Name)
+
+	innerPath := cgPath
+	if innerPath == "" {
+		innerPath = filepath.Join(cgParent, cgName)
+	}
 
 	return &cgroupData{
-		root:   root,
-		parent: c.Parent,
-		name:   c.Name,
-		config: c,
-		pid:    pid,
+		root:      root,
+		innerPath: innerPath,
+		config:    c,
+		pid:       pid,
 	}, nil
 }
 
@@ -296,11 +319,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
 		return "", err
 	}
 
-	cgPath := filepath.Join(raw.parent, raw.name)
 	// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
-	if filepath.IsAbs(cgPath) {
+	if filepath.IsAbs(raw.innerPath) {
 		// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
-		return filepath.Join(raw.root, filepath.Base(mnt), cgPath), nil
+		return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
 	}
 
 	parentPath, err := raw.parentPath(subsystem, mnt, root)
@@ -308,7 +330,7 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
 		return "", err
 	}
 
-	return filepath.Join(parentPath, cgPath), nil
+	return filepath.Join(parentPath, raw.innerPath), nil
 }
 
 func (raw *cgroupData) join(subsystem string) (string, error) {

+ 1 - 6
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go

@@ -22,15 +22,10 @@ func (s *BlkioGroup) Name() string {
 }
 
 func (s *BlkioGroup) Apply(d *cgroupData) error {
-	dir, err := d.join("blkio")
+	_, err := d.join("blkio")
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-
-	if err := s.Set(dir, d.config); err != nil {
-		return err
-	}
-
 	return nil
 }
 

+ 1 - 6
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go

@@ -22,15 +22,10 @@ func (s *CpuGroup) Name() string {
 func (s *CpuGroup) Apply(d *cgroupData) error {
 	// We always want to join the cpu group, to allow fair cpu scheduling
 	// on a container basis
-	dir, err := d.join("cpu")
+	_, err := d.join("cpu")
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-
-	if err := s.Set(dir, d.config); err != nil {
-		return err
-	}
-
 	return nil
 }
 

+ 2 - 6
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go

@@ -12,6 +12,7 @@ import (
 
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
 )
 
 type CpusetGroup struct {
@@ -64,11 +65,6 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
 	if err := s.ensureParent(dir, root); err != nil {
 		return err
 	}
-	// the default values inherit from parent cgroup are already set in
-	// s.ensureParent, cover these if we have our own
-	if err := s.Set(dir, cgroup); err != nil {
-		return err
-	}
 	// because we are not using d.join we need to place the pid into the procs file
 	// unlike the other subsystems
 	if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
@@ -93,7 +89,7 @@ func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []b
 // its parent.
 func (s *CpusetGroup) ensureParent(current, root string) error {
 	parent := filepath.Dir(current)
-	if filepath.Clean(parent) == root {
+	if libcontainerUtils.CleanPath(parent) == root {
 		return nil
 	}
 	// Avoid infinite recursion.

+ 14 - 6
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go

@@ -15,21 +15,29 @@ func (s *DevicesGroup) Name() string {
 }
 
 func (s *DevicesGroup) Apply(d *cgroupData) error {
-	dir, err := d.join("devices")
+	_, err := d.join("devices")
 	if err != nil {
 		// We will return an error even if it's a `not found` error; the devices
 		// cgroup is a hard requirement for the container's security.
 		return err
 	}
-
-	if err := s.Set(dir, d.config); err != nil {
-		return err
-	}
-
 	return nil
 }
 
 func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
+	devices := cgroup.Resources.Devices
+	if len(devices) > 0 {
+		for _, dev := range devices {
+			file := "devices.deny"
+			if dev.Allow {
+				file = "devices.allow"
+			}
+			if err := writeFile(path, file, dev.CgroupString()); err != nil {
+				return err
+			}
+		}
+		return nil
+	}
 	if !cgroup.Resources.AllowAllDevices {
 		if err := writeFile(path, "devices.deny", "a"); err != nil {
 			return err

+ 1 - 6
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go

@@ -19,15 +19,10 @@ func (s *FreezerGroup) Name() string {
 }
 
 func (s *FreezerGroup) Apply(d *cgroupData) error {
-	dir, err := d.join("freezer")
+	_, err := d.join("freezer")
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-
-	if err := s.Set(dir, d.config); err != nil {
-		return err
-	}
-
 	return nil
 }
 

+ 1 - 6
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go

@@ -19,15 +19,10 @@ func (s *HugetlbGroup) Name() string {
 }
 
 func (s *HugetlbGroup) Apply(d *cgroupData) error {
-	dir, err := d.join("hugetlb")
+	_, err := d.join("hugetlb")
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-
-	if err := s.Set(dir, d.config); err != nil {
-		return err
-	}
-
 	return nil
 }
 

+ 22 - 8
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go

@@ -32,8 +32,9 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
 				return err
 			}
 		}
-
-		if err := s.Set(path, d.config); err != nil {
+		// We have to set kernel memory here, as we can't change it once
+		// processes have been attached.
+		if err := s.SetKernelMemory(path, d.config); err != nil {
 			return err
 		}
 	}
@@ -50,7 +51,17 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
+	return nil
+}
 
+func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
+	// This has to be done separately because it has special constraints (it
+	// can't be done after there are processes attached to the cgroup).
+	if cgroup.Resources.KernelMemory > 0 {
+		if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
+			return err
+		}
+	}
 	return nil
 }
 
@@ -70,12 +81,6 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
 			return err
 		}
 	}
-	if cgroup.Resources.KernelMemory > 0 {
-		if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
-			return err
-		}
-	}
-
 	if cgroup.Resources.OomKillDisable {
 		if err := writeFile(path, "memory.oom_control", "1"); err != nil {
 			return err
@@ -157,6 +162,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
 	usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
 	maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
 	failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
+	limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
 
 	value, err := getCgroupParamUint(path, usage)
 	if err != nil {
@@ -182,6 +188,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
 		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
 	}
 	memoryData.Failcnt = value
+	value, err = getCgroupParamUint(path, limit)
+	if err != nil {
+		if moduleName != "memory" && os.IsNotExist(err) {
+			return cgroups.MemoryData{}, nil
+		}
+		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
+	}
+	memoryData.Limit = value
 
 	return memoryData, nil
 }

+ 1 - 6
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go

@@ -15,15 +15,10 @@ func (s *NetClsGroup) Name() string {
 }
 
 func (s *NetClsGroup) Apply(d *cgroupData) error {
-	dir, err := d.join("net_cls")
+	_, err := d.join("net_cls")
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-
-	if err := s.Set(dir, d.config); err != nil {
-		return err
-	}
-
 	return nil
 }
 

+ 1 - 6
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go

@@ -15,15 +15,10 @@ func (s *NetPrioGroup) Name() string {
 }
 
 func (s *NetPrioGroup) Apply(d *cgroupData) error {
-	dir, err := d.join("net_prio")
+	_, err := d.join("net_prio")
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-
-	if err := s.Set(dir, d.config); err != nil {
-		return err
-	}
-
 	return nil
 }
 

+ 57 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go

@@ -0,0 +1,57 @@
+// +build linux
+
+package fs
+
+import (
+	"fmt"
+	"strconv"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+type PidsGroup struct {
+}
+
+func (s *PidsGroup) Name() string {
+	return "pids"
+}
+
+func (s *PidsGroup) Apply(d *cgroupData) error {
+	_, err := d.join("pids")
+	if err != nil && !cgroups.IsNotFound(err) {
+		return err
+	}
+	return nil
+}
+
+func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
+	if cgroup.Resources.PidsLimit != 0 {
+		// "max" is the fallback value.
+		limit := "max"
+
+		if cgroup.Resources.PidsLimit > 0 {
+			limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
+		}
+
+		if err := writeFile(path, "pids.max", limit); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (s *PidsGroup) Remove(d *cgroupData) error {
+	return removePath(d.path("pids"))
+}
+
+func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
+	value, err := getCgroupParamUint(path, "pids.current")
+	if err != nil {
+		return fmt.Errorf("failed to parse pids.current - %s", err)
+	}
+
+	stats.PidsStats.Current = value
+	return nil
+}

+ 8 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/stats.go

@@ -36,7 +36,9 @@ type MemoryData struct {
 	Usage    uint64 `json:"usage,omitempty"`
 	MaxUsage uint64 `json:"max_usage,omitempty"`
 	Failcnt  uint64 `json:"failcnt"`
+	Limit    uint64 `json:"limit"`
 }
+
 type MemoryStats struct {
 	// memory used for cache
 	Cache uint64 `json:"cache,omitempty"`
@@ -49,6 +51,11 @@ type MemoryStats struct {
 	Stats       map[string]uint64 `json:"stats,omitempty"`
 }
 
+type PidsStats struct {
+	// number of pids in the cgroup
+	Current uint64 `json:"current,omitempty"`
+}
+
 type BlkioStatEntry struct {
 	Major uint64 `json:"major,omitempty"`
 	Minor uint64 `json:"minor,omitempty"`
@@ -80,6 +87,7 @@ type HugetlbStats struct {
 type Stats struct {
 	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
 	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
+	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
 	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`
 	// the map is in the format "size of hugepage: stats of the hugepage"
 	HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`

+ 4 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go

@@ -26,6 +26,10 @@ func (m *Manager) GetPids() ([]int, error) {
 	return nil, fmt.Errorf("Systemd not supported")
 }
 
+func (m *Manager) GetAllPids() ([]int, error) {
+	return nil, fmt.Errorf("Systemd not supported")
+}
+
 func (m *Manager) Destroy() error {
 	return fmt.Errorf("Systemd not supported")
 }

+ 105 - 144
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go

@@ -55,6 +55,7 @@ var subsystems = subsystemSet{
 	&fs.MemoryGroup{},
 	&fs.CpuGroup{},
 	&fs.CpuacctGroup{},
+	&fs.PidsGroup{},
 	&fs.BlkioGroup{},
 	&fs.HugetlbGroup{},
 	&fs.PerfEventGroup{},
@@ -167,6 +168,23 @@ func (m *Manager) Apply(pid int) error {
 		properties []systemdDbus.Property
 	)
 
+	if c.Paths != nil {
+		paths := make(map[string]string)
+		for name, path := range c.Paths {
+			_, err := getSubsystemPath(m.Cgroups, name)
+			if err != nil {
+				// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
+				if cgroups.IsNotFound(err) {
+					continue
+				}
+				return err
+			}
+			paths[name] = path
+		}
+		m.Paths = paths
+		return cgroups.EnterPid(m.Paths, pid)
+	}
+
 	if c.Parent != "" {
 		slice = c.Parent
 	}
@@ -233,7 +251,7 @@ func (m *Manager) Apply(pid int) error {
 		return err
 	}
 
-	// we need to manually join the freezer, net_cls, net_prio and cpuset cgroup in systemd
+	// we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
 	// because it does not currently support it via the dbus api.
 	if err := joinFreezer(c, pid); err != nil {
 		return err
@@ -246,6 +264,10 @@ func (m *Manager) Apply(pid int) error {
 		return err
 	}
 
+	if err := joinPids(c, pid); err != nil {
+		return err
+	}
+
 	if err := joinCpuset(c, pid); err != nil {
 		return err
 	}
@@ -277,17 +299,13 @@ func (m *Manager) Apply(pid int) error {
 		paths[s.Name()] = subsystemPath
 	}
 	m.Paths = paths
-
-	if paths["cpu"] != "" {
-		if err := fs.CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil {
-			return err
-		}
-	}
-
 	return nil
 }
 
 func (m *Manager) Destroy() error {
+	if m.Cgroups.Paths != nil {
+		return nil
+	}
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
@@ -330,68 +348,74 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
 }
 
 func joinCpu(c *configs.Cgroup, pid int) error {
-	path, err := getSubsystemPath(c, "cpu")
+	_, err := join(c, "cpu", pid)
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-	if c.Resources.CpuQuota != 0 {
-		if err = writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(c.Resources.CpuQuota, 10)); err != nil {
-			return err
-		}
-	}
-	if c.Resources.CpuPeriod != 0 {
-		if err = writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(c.Resources.CpuPeriod, 10)); err != nil {
-			return err
-		}
-	}
-	if c.Resources.CpuRtPeriod != 0 {
-		if err = writeFile(path, "cpu.rt_period_us", strconv.FormatInt(c.Resources.CpuRtPeriod, 10)); err != nil {
-			return err
-		}
-	}
-	if c.Resources.CpuRtRuntime != 0 {
-		if err = writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(c.Resources.CpuRtRuntime, 10)); err != nil {
-			return err
-		}
-	}
-
 	return nil
 }
 
 func joinFreezer(c *configs.Cgroup, pid int) error {
-	path, err := join(c, "freezer", pid)
+	_, err := join(c, "freezer", pid)
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-	freezer, err := subsystems.Get("freezer")
-	if err != nil {
-		return err
-	}
-	return freezer.Set(path, c)
+	return nil
 }
 
 func joinNetPrio(c *configs.Cgroup, pid int) error {
-	path, err := join(c, "net_prio", pid)
+	_, err := join(c, "net_prio", pid)
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-	netPrio, err := subsystems.Get("net_prio")
-	if err != nil {
-		return err
-	}
-	return netPrio.Set(path, c)
+	return nil
 }
 
 func joinNetCls(c *configs.Cgroup, pid int) error {
-	path, err := join(c, "net_cls", pid)
+	_, err := join(c, "net_cls", pid)
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-	netcls, err := subsystems.Get("net_cls")
-	if err != nil {
+	return nil
+}
+
+func joinPids(c *configs.Cgroup, pid int) error {
+	_, err := join(c, "pids", pid)
+	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-	return netcls.Set(path, c)
+	return nil
+}
+
+// expandSlice converts a systemd slice name into the cgroup path of that slice.
+// systemd represents slice hierarchy using `-`, so we need to follow suit when
+// generating the path: test-a-b.slice becomes
+// test.slice/test-a.slice/test-a-b.slice/ (every level, including the last
+// one, is followed by a "/").
+//
+// An error is returned when the name does not end with ".slice", is exactly
+// ".slice", contains a path separator, or has an empty `-`-separated component.
+func expandSlice(slice string) (string, error) {
+	suffix := ".slice"
+	// Name has to end with ".slice", but can't be just ".slice".
+	if len(slice) <= len(suffix) || !strings.HasSuffix(slice, suffix) {
+		return "", fmt.Errorf("invalid slice name: %s", slice)
+	}
+
+	// Path-separators are not allowed.
+	if strings.Contains(slice, "/") {
+		return "", fmt.Errorf("invalid slice name: %s", slice)
+	}
+
+	var path, prefix string
+	sliceName := strings.TrimSuffix(slice, suffix)
+	for _, component := range strings.Split(sliceName, "-") {
+		// test--a.slice isn't permitted, nor is -test.slice.
+		if component == "" {
+			return "", fmt.Errorf("invalid slice name: %s", slice)
+		}
+
+		// Append the component to the path and to the prefix, so that each
+		// level is named with all of its ancestors' components.
+		path += prefix + component + suffix + "/"
+		prefix += component + "-"
+	}
+
+	return path, nil
 }
 
 func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
@@ -410,6 +434,11 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
 		slice = c.Parent
 	}
 
+	slice, err = expandSlice(slice)
+	if err != nil {
+		return "", err
+	}
+
 	return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
 }
 
@@ -440,6 +469,14 @@ func (m *Manager) GetPids() ([]int, error) {
 	return cgroups.GetPids(path)
 }
 
+// GetAllPids resolves the path of the "devices" subsystem for this manager's
+// cgroup and returns whatever cgroups.GetAllPids reports for that path. A
+// failure to resolve the path is returned as-is with a nil slice.
+func (m *Manager) GetAllPids() ([]int, error) {
+	devicesPath, err := getSubsystemPath(m.Cgroups, "devices")
+	if err == nil {
+		return cgroups.GetAllPids(devicesPath)
+	}
+	return nil, err
+}
+
 func (m *Manager) GetStats() (*cgroups.Stats, error) {
 	m.mu.Lock()
 	defer m.mu.Unlock()
@@ -458,16 +495,23 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
 }
 
 func (m *Manager) Set(container *configs.Config) error {
-	for name, path := range m.Paths {
-		sys, err := subsystems.Get(name)
-		if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
-			continue
+	for _, sys := range subsystems {
+		// Get the subsystem path, but don't error out for not found cgroups.
+		path, err := getSubsystemPath(container.Cgroups, sys.Name())
+		if err != nil && !cgroups.IsNotFound(err) {
+			return err
 		}
+
 		if err := sys.Set(path, container.Cgroups); err != nil {
 			return err
 		}
 	}
 
+	if m.Paths["cpu"] != "" {
+		if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
+			return err
+		}
+	}
 	return nil
 }
 
@@ -487,17 +531,13 @@ func getUnitName(c *configs.Cgroup) string {
 // because systemd will re-write the device settings if it needs to re-apply the cgroup context.
 // This happens at least for v208 when any sibling unit is started.
 func joinDevices(c *configs.Cgroup, pid int) error {
-	path, err := join(c, "devices", pid)
+	_, err := join(c, "devices", pid)
 	// Even if it's `not found` error, we'll return err because devices cgroup
 	// is hard requirement for container security.
 	if err != nil {
 		return err
 	}
-	devices, err := subsystems.Get("devices")
-	if err != nil {
-		return err
-	}
-	return devices.Set(path, c)
+	return nil
 }
 
 func setKernelMemory(c *configs.Cgroup) error {
@@ -510,52 +550,16 @@ func setKernelMemory(c *configs.Cgroup) error {
 		return err
 	}
 
-	if c.Resources.KernelMemory > 0 {
-		err = writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.Resources.KernelMemory, 10))
-		if err != nil {
-			return err
-		}
-	}
-
-	return nil
+	// This doesn't get called by manager.Set, so we need to do it here.
+	s := &fs.MemoryGroup{}
+	return s.SetKernelMemory(path, c)
 }
 
 func joinMemory(c *configs.Cgroup, pid int) error {
-	path, err := getSubsystemPath(c, "memory")
+	_, err := join(c, "memory", pid)
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-
-	// -1 disables memoryswap
-	if c.Resources.MemorySwap > 0 {
-		err = writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Resources.MemorySwap, 10))
-		if err != nil {
-			return err
-		}
-	}
-	if c.Resources.MemoryReservation > 0 {
-		err = writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Resources.MemoryReservation, 10))
-		if err != nil {
-			return err
-		}
-	}
-	if c.Resources.OomKillDisable {
-		if err := writeFile(path, "memory.oom_control", "1"); err != nil {
-			return err
-		}
-	}
-
-	if c.Resources.MemorySwappiness >= 0 && c.Resources.MemorySwappiness <= 100 {
-		err = writeFile(path, "memory.swappiness", strconv.FormatInt(c.Resources.MemorySwappiness, 10))
-		if err != nil {
-			return err
-		}
-	} else if c.Resources.MemorySwappiness == -1 {
-		return nil
-	} else {
-		return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", c.Resources.MemorySwappiness)
-	}
-
 	return nil
 }
 
@@ -577,68 +581,25 @@ func joinCpuset(c *configs.Cgroup, pid int) error {
 // expects device path instead of major minor numbers, which is also confusing
 // for users. So we use fs work around for now.
 func joinBlkio(c *configs.Cgroup, pid int) error {
-	path, err := getSubsystemPath(c, "blkio")
+	_, err := join(c, "blkio", pid)
 	if err != nil {
 		return err
 	}
-	// systemd doesn't directly support this in the dbus properties
-	if c.Resources.BlkioLeafWeight != 0 {
-		if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(c.Resources.BlkioLeafWeight), 10)); err != nil {
-			return err
-		}
-	}
-	for _, wd := range c.Resources.BlkioWeightDevice {
-		if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
-			return err
-		}
-		if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
-			return err
-		}
-	}
-	for _, td := range c.Resources.BlkioThrottleReadBpsDevice {
-		if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
-			return err
-		}
-	}
-	for _, td := range c.Resources.BlkioThrottleWriteBpsDevice {
-		if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
-			return err
-		}
-	}
-	for _, td := range c.Resources.BlkioThrottleReadIOPSDevice {
-		if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
-			return err
-		}
-	}
-	for _, td := range c.Resources.BlkioThrottleWriteIOPSDevice {
-		if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
-			return err
-		}
-	}
-
 	return nil
 }
 
 func joinHugetlb(c *configs.Cgroup, pid int) error {
-	path, err := join(c, "hugetlb", pid)
+	_, err := join(c, "hugetlb", pid)
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-	hugetlb, err := subsystems.Get("hugetlb")
-	if err != nil {
-		return err
-	}
-	return hugetlb.Set(path, c)
+	return nil
 }
 
 func joinPerfEvent(c *configs.Cgroup, pid int) error {
-	path, err := join(c, "perf_event", pid)
+	_, err := join(c, "perf_event", pid)
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
 	}
-	perfEvent, err := subsystems.Get("perf_event")
-	if err != nil {
-		return err
-	}
-	return perfEvent.Set(path, c)
+	return nil
 }

+ 55 - 24
vendor/src/github.com/opencontainers/runc/libcontainer/cgroups/utils.go

@@ -5,6 +5,7 @@ package cgroups
 import (
 	"bufio"
 	"fmt"
+	"io"
 	"io/ioutil"
 	"os"
 	"path/filepath"
@@ -12,7 +13,6 @@ import (
 	"strings"
 	"time"
 
-	"github.com/docker/docker/pkg/mount"
 	"github.com/docker/go-units"
 )
 
@@ -84,10 +84,19 @@ func FindCgroupMountpointDir() (string, error) {
 		// Safe as mountinfo encodes mountpoints with spaces as \040.
 		index := strings.Index(text, " - ")
 		postSeparatorFields := strings.Fields(text[index+3:])
-		if len(postSeparatorFields) < 3 {
-			return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
+		numPostFields := len(postSeparatorFields)
+
+		// This is an error as we can't detect if the mount is for "cgroup"
+		if numPostFields == 0 {
+			return "", fmt.Errorf("Found no fields post '-' in %q", text)
 		}
+
 		if postSeparatorFields[0] == "cgroup" {
+			// Check that the mount is properly formatted.
+			if numPostFields < 3 {
+				return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
+			}
+
 			return filepath.Dir(fields[4]), nil
 		}
 	}
@@ -112,11 +121,45 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
 	return getControllerPath(m.Subsystems[0], cgroups)
 }
 
+// getCgroupMountsHelper parses mountinfo-formatted text read from mi and
+// returns one Mount for every line whose filesystem type is "cgroup".
+//
+// ss is the set of known subsystem names. Every super option that is a key of
+// ss is recorded in Mount.Subsystems, as is the remainder of any option that
+// starts with cgroupNamePrefix.
+//
+// An error is returned for a line without the " - " separator, for a cgroup
+// line with fewer than five space-separated fields, and for a scanner failure.
+func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
+	res := make([]Mount, 0, len(ss))
+	scanner := bufio.NewScanner(mi)
+	for scanner.Scan() {
+		txt := scanner.Text()
+		// Look for the " - " separator rather than the first '-' byte: a bare
+		// '-' can also occur inside a mount path (e.g. /var/lib/my-data).
+		// Safe as mountinfo encodes mountpoints with spaces as \040.
+		sepIdx := strings.Index(txt, " - ")
+		if sepIdx == -1 {
+			return nil, fmt.Errorf("invalid mountinfo format")
+		}
+		// The filesystem type is the first field after the separator. Comparing
+		// the whole field matches only the exact type "cgroup" and cannot slice
+		// past the end of a short line.
+		postSeparatorFields := strings.Fields(txt[sepIdx+3:])
+		if len(postSeparatorFields) == 0 || postSeparatorFields[0] != "cgroup" {
+			continue
+		}
+		fields := strings.Split(txt, " ")
+		// fields[3] and fields[4] are read below; reject lines that are too
+		// short instead of panicking on them.
+		if len(fields) < 5 {
+			return nil, fmt.Errorf("invalid mountinfo format")
+		}
+		m := Mount{
+			Mountpoint: fields[4],
+			Root:       fields[3],
+		}
+		// The options to inspect are taken from the last field of the line.
+		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+			if strings.HasPrefix(opt, cgroupNamePrefix) {
+				m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
+			}
+			if ss[opt] {
+				m.Subsystems = append(m.Subsystems, opt)
+			}
+		}
+		res = append(res, m)
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return res, nil
+}
+
 func GetCgroupMounts() ([]Mount, error) {
-	mounts, err := mount.GetMounts()
+	f, err := os.Open("/proc/self/mountinfo")
 	if err != nil {
 		return nil, err
 	}
+	defer f.Close()
 
 	all, err := GetAllSubsystems()
 	if err != nil {
@@ -127,24 +170,7 @@ func GetCgroupMounts() ([]Mount, error) {
 	for _, s := range all {
 		allMap[s] = true
 	}
-
-	res := []Mount{}
-	for _, mount := range mounts {
-		if mount.Fstype == "cgroup" {
-			m := Mount{Mountpoint: mount.Mountpoint, Root: mount.Root}
-
-			for _, opt := range strings.Split(mount.VfsOpts, ",") {
-				if strings.HasPrefix(opt, cgroupNamePrefix) {
-					m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
-				}
-				if allMap[opt] {
-					m.Subsystems = append(m.Subsystems, opt)
-				}
-			}
-			res = append(res, m)
-		}
-	}
-	return res, nil
+	return getCgroupMountsHelper(allMap, f)
 }
 
 // Returns all the cgroup subsystems supported by the kernel
@@ -323,9 +349,14 @@ func GetHugePageSize() ([]string, error) {
 	return pageSizes, nil
 }
 
-// GetPids returns all pids, that were added to cgroup at path and to all its
-// subcgroups.
+// GetPids returns all pids, that were added to cgroup at path.
 func GetPids(path string) ([]int, error) {
+	return readProcsFile(path)
+}
+
+// GetAllPids returns all pids, that were added to cgroup at path and to all its
+// subcgroups.
+func GetAllPids(path string) ([]int, error) {
 	var pids []int
 	// collect pids from all sub-cgroups
 	err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {

+ 24 - 8
vendor/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go

@@ -11,25 +11,38 @@ const (
 )
 
 type Cgroup struct {
-	Name string `json:"name"`
+	// Deprecated, use Path instead
+	Name string `json:"name,omitempty"`
 
-	// name of parent cgroup or slice
-	Parent string `json:"parent"`
+	// name of parent of cgroup or slice
+	// Deprecated, use Path instead
+	Parent string `json:"parent,omitempty"`
+
+	// Path specifies the path to cgroups that are created and/or joined by the container.
+	// The path is assumed to be relative to the host system cgroup mountpoint.
+	Path string `json:"path"`
 
 	// ScopePrefix describes prefix for the scope name
 	ScopePrefix string `json:"scope_prefix"`
 
+	// Paths represent the absolute cgroups paths to join.
+	// This takes precedence over Path.
+	Paths map[string]string
+
 	// Resources contains various cgroups settings to apply
 	*Resources
 }
 
 type Resources struct {
 	// If this is true allow access to any kind of device within the container.  If false, allow access only to devices explicitly listed in the allowed_devices list.
-	AllowAllDevices bool `json:"allow_all_devices"`
+	// Deprecated
+	AllowAllDevices bool `json:"allow_all_devices,omitempty"`
+	// Deprecated
+	AllowedDevices []*Device `json:"allowed_devices,omitempty"`
+	// Deprecated
+	DeniedDevices []*Device `json:"denied_devices,omitempty"`
 
-	AllowedDevices []*Device `json:"allowed_devices"`
-
-	DeniedDevices []*Device `json:"denied_devices"`
+	Devices []*Device `json:"devices"`
 
 	// Memory limit (in bytes)
 	Memory int64 `json:"memory"`
@@ -37,7 +50,7 @@ type Resources struct {
 	// Memory reservation or soft_limit (in bytes)
 	MemoryReservation int64 `json:"memory_reservation"`
 
-	// Total memory usage (memory + swap); set `-1' to disable swap
+	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
 	MemorySwap int64 `json:"memory_swap"`
 
 	// Kernel memory limit (in bytes)
@@ -64,6 +77,9 @@ type Resources struct {
 	// MEM to use
 	CpusetMems string `json:"cpuset_mems"`
 
+	// Process limit; set <= `0` to disable limit.
+	PidsLimit int64 `json:"pids_limit"`
+
 	// Specifies per cgroup weight, range is from 10 to 1000.
 	BlkioWeight uint16 `json:"blkio_weight"`
 

+ 3 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/configs/config.go

@@ -171,6 +171,9 @@ type Config struct {
 	// A default action to be taken if no rules match is also given.
 	Seccomp *Seccomp `json:"seccomp"`
 
+	// NoNewPrivileges controls whether processes in the container can gain additional privileges.
+	NoNewPrivileges bool `json:"no_new_privileges"`
+
 	// Hooks are a collection of actions to perform at various container lifecycle events.
 	// Hooks are not able to be marshaled to json but they are also not needed to.
 	Hooks *Hooks `json:"-"`

+ 3 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/configs/device.go

@@ -35,6 +35,9 @@ type Device struct {
 
 	// Gid of the device.
 	Gid uint32 `json:"gid"`
+
+	// Write the file to the allowed list
+	Allow bool `json:"allow"`
 }
 
 func (d *Device) CgroupString() string {

+ 0 - 14
vendor/src/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go

@@ -82,20 +82,6 @@ var (
 			Minor:       1,
 			Permissions: "rwm",
 		},
-		{
-			Path:        "/dev/tty0",
-			Type:        'c',
-			Major:       4,
-			Minor:       0,
-			Permissions: "rwm",
-		},
-		{
-			Path:        "/dev/tty1",
-			Type:        'c',
-			Major:       4,
-			Minor:       1,
-			Permissions: "rwm",
-		},
 		// /dev/pts/ - pts namespaces are "coming soon"
 		{
 			Path:        "",

+ 28 - 2
vendor/src/github.com/opencontainers/runc/libcontainer/container.go

@@ -6,6 +6,7 @@ package libcontainer
 
 import (
 	"os"
+	"time"
 
 	"github.com/opencontainers/runc/libcontainer/configs"
 )
@@ -14,8 +15,11 @@ import (
 type Status int
 
 const (
+	// The container exists but has not been run yet
+	Created Status = iota
+
 	// The container exists and is running.
-	Running Status = iota + 1
+	Running
 
 	// The container exists, it is in the process of being paused.
 	Pausing
@@ -30,6 +34,25 @@ const (
 	Destroyed
 )
 
+// String returns the lowercase name of the status ("created", "running",
+// "pausing", "paused", "checkpointed" or "destroyed"). Any other value yields
+// "unknown".
+func (s Status) String() string {
+	name := "unknown"
+	switch s {
+	case Created:
+		name = "created"
+	case Running:
+		name = "running"
+	case Pausing:
+		name = "pausing"
+	case Paused:
+		name = "paused"
+	case Checkpointed:
+		name = "checkpointed"
+	case Destroyed:
+		name = "destroyed"
+	}
+	return name
+}
+
 // BaseState represents the platform agnostic pieces relating to a
 // running container's state
 type BaseState struct {
@@ -39,9 +62,12 @@ type BaseState struct {
 	// InitProcessPid is the init process id in the parent namespace.
 	InitProcessPid int `json:"init_process_pid"`
 
-	// InitProcessStartTime is the init process start time.
+	// InitProcessStartTime is the init process start time in clock cycles since boot time.
 	InitProcessStartTime string `json:"init_process_start"`
 
+	// Created is the creation time of the container in UTC.
+	Created time.Time `json:"created"`
+
 	// Config is the container's configuration.
 	Config configs.Config `json:"config"`
 }

+ 155 - 114
vendor/src/github.com/opencontainers/runc/libcontainer/container_linux.go

@@ -15,6 +15,7 @@ import (
 	"strings"
 	"sync"
 	"syscall"
+	"time"
 
 	"github.com/Sirupsen/logrus"
 	"github.com/golang/protobuf/proto"
@@ -38,6 +39,8 @@ type linuxContainer struct {
 	criuPath      string
 	m             sync.Mutex
 	criuVersion   int
+	state         containerState
+	created       time.Time
 }
 
 // State represents a running container's state
@@ -104,6 +107,12 @@ type Container interface {
 	// errors:
 	// Systemerror - System error.
 	NotifyOOM() (<-chan struct{}, error)
+
+	// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
+	//
+	// errors:
+	// Systemerror - System error.
+	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
 }
 
 // ID returns the container's unique ID
@@ -129,7 +138,7 @@ func (c *linuxContainer) State() (*State, error) {
 }
 
 func (c *linuxContainer) Processes() ([]int, error) {
-	pids, err := c.cgroupManager.GetPids()
+	pids, err := c.cgroupManager.GetAllPids()
 	if err != nil {
 		return nil, newSystemError(err)
 	}
@@ -183,22 +192,30 @@ func (c *linuxContainer) Start(process *Process) error {
 		}
 		return newSystemError(err)
 	}
+	// generate a timestamp indicating when the container was started
+	c.created = time.Now().UTC()
+
+	c.state = &runningState{
+		c: c,
+	}
 	if doInit {
-		c.updateState(parent)
-	}
-	if c.config.Hooks != nil {
-		s := configs.HookState{
-			Version: c.config.Version,
-			ID:      c.id,
-			Pid:     parent.pid(),
-			Root:    c.config.Rootfs,
+		if err := c.updateState(parent); err != nil {
+			return err
 		}
-		for _, hook := range c.config.Hooks.Poststart {
-			if err := hook.Run(s); err != nil {
-				if err := parent.terminate(); err != nil {
-					logrus.Warn(err)
+		if c.config.Hooks != nil {
+			s := configs.HookState{
+				Version: c.config.Version,
+				ID:      c.id,
+				Pid:     parent.pid(),
+				Root:    c.config.Rootfs,
+			}
+			for _, hook := range c.config.Hooks.Poststart {
+				if err := hook.Run(s); err != nil {
+					if err := parent.terminate(); err != nil {
+						logrus.Warn(err)
+					}
+					return newSystemError(err)
 				}
-				return newSystemError(err)
 			}
 		}
 	}
@@ -251,7 +268,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
 }
 
 func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
-	t := "_LIBCONTAINER_INITTYPE=standard"
+	t := "_LIBCONTAINER_INITTYPE=" + string(initStandard)
 	cloneFlags := c.config.Namespaces.CloneFlags()
 	if cloneFlags&syscall.CLONE_NEWUSER != 0 {
 		if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
@@ -278,7 +295,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
 }
 
 func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
-	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE=setns")
+	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
 	// for setns process, we dont have to set cloneflags as the process namespaces
 	// will only be set via setns syscall
 	data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
@@ -319,56 +336,55 @@ func newPipe() (parent *os.File, child *os.File, err error) {
 }
 
 func (c *linuxContainer) Destroy() error {
+	c.m.Lock()
+	defer c.m.Unlock()
+	return c.state.destroy()
+}
+
+func (c *linuxContainer) Pause() error {
 	c.m.Lock()
 	defer c.m.Unlock()
 	status, err := c.currentStatus()
 	if err != nil {
 		return err
 	}
-	if status != Destroyed {
-		return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
+	if status != Running {
+		return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
 	}
-	if !c.config.Namespaces.Contains(configs.NEWPID) {
-		if err := killCgroupProcesses(c.cgroupManager); err != nil {
-			logrus.Warn(err)
-		}
-	}
-	err = c.cgroupManager.Destroy()
-	if rerr := os.RemoveAll(c.root); err == nil {
-		err = rerr
-	}
-	c.initProcess = nil
-	if c.config.Hooks != nil {
-		s := configs.HookState{
-			Version: c.config.Version,
-			ID:      c.id,
-			Root:    c.config.Rootfs,
-		}
-		for _, hook := range c.config.Hooks.Poststop {
-			if err := hook.Run(s); err != nil {
-				return err
-			}
-		}
+	if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
+		return err
 	}
-	return err
-}
-
-func (c *linuxContainer) Pause() error {
-	c.m.Lock()
-	defer c.m.Unlock()
-	return c.cgroupManager.Freeze(configs.Frozen)
+	return c.state.transition(&pausedState{
+		c: c,
+	})
 }
 
 func (c *linuxContainer) Resume() error {
 	c.m.Lock()
 	defer c.m.Unlock()
-	return c.cgroupManager.Freeze(configs.Thawed)
+	status, err := c.currentStatus()
+	if err != nil {
+		return err
+	}
+	if status != Paused {
+		return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
+	}
+	if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
+		return err
+	}
+	return c.state.transition(&runningState{
+		c: c,
+	})
 }
 
 func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
 	return notifyOnOOM(c.cgroupManager.GetPaths())
 }
 
+// NotifyMemoryPressure passes the cgroup paths reported by the container's
+// cgroup manager, together with level, to notifyMemoryPressure and returns its
+// result unchanged.
+func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
+	cgroupPaths := c.cgroupManager.GetPaths()
+	return notifyMemoryPressure(cgroupPaths, level)
+}
+
 // XXX debug support, remove when debugging done.
 func addArgsFromEnv(evar string, args *[]string) {
 	if e := os.Getenv(evar); e != "" {
@@ -460,7 +476,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
 	}
 
 	if criuOpts.ImagesDirectory == "" {
-		criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image")
+		return fmt.Errorf("invalid directory to save checkpoint")
 	}
 
 	// Since a container can be C/R'ed multiple times,
@@ -579,11 +595,9 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
 func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
 	c.m.Lock()
 	defer c.m.Unlock()
-
 	if err := c.checkCriuVersion("1.5.2"); err != nil {
 		return err
 	}
-
 	if criuOpts.WorkDirectory == "" {
 		criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
 	}
@@ -592,22 +606,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
 	if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
 		return err
 	}
-
 	workDir, err := os.Open(criuOpts.WorkDirectory)
 	if err != nil {
 		return err
 	}
 	defer workDir.Close()
-
 	if criuOpts.ImagesDirectory == "" {
-		criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image")
+		return fmt.Errorf("invalid directory to restore checkpoint")
 	}
 	imageDir, err := os.Open(criuOpts.ImagesDirectory)
 	if err != nil {
 		return err
 	}
 	defer imageDir.Close()
-
 	// CRIU has a few requirements for a root directory:
 	// * it must be a mount point
 	// * its parent must not be overmounted
@@ -618,18 +629,15 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
 		return err
 	}
 	defer os.Remove(root)
-
 	root, err = filepath.EvalSymlinks(root)
 	if err != nil {
 		return err
 	}
-
 	err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
 	if err != nil {
 		return err
 	}
 	defer syscall.Unmount(root, syscall.MNT_DETACH)
-
 	t := criurpc.CriuReqType_RESTORE
 	req := &criurpc.CriuReq{
 		Type: &t,
@@ -697,15 +705,13 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
 		fds    []string
 		fdJSON []byte
 	)
-
 	if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
 		return err
 	}
 
-	if err = json.Unmarshal(fdJSON, &fds); err != nil {
+	if err := json.Unmarshal(fdJSON, &fds); err != nil {
 		return err
 	}
-
 	for i := range fds {
 		if s := fds[i]; strings.Contains(s, "pipe:") {
 			inheritFd := new(criurpc.InheritFd)
@@ -714,12 +720,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
 			req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
 		}
 	}
-
-	err = c.criuSwrk(process, req, criuOpts, true)
-	if err != nil {
-		return err
-	}
-	return nil
+	return c.criuSwrk(process, req, criuOpts, true)
 }
 
 func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
@@ -914,46 +915,43 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
 	if notify == nil {
 		return fmt.Errorf("invalid response: %s", resp.String())
 	}
-
 	switch {
 	case notify.GetScript() == "post-dump":
-		if !opts.LeaveRunning {
-			f, err := os.Create(filepath.Join(c.root, "checkpoint"))
-			if err != nil {
-				return err
-			}
-			f.Close()
+		f, err := os.Create(filepath.Join(c.root, "checkpoint"))
+		if err != nil {
+			return err
 		}
-		break
-
+		f.Close()
 	case notify.GetScript() == "network-unlock":
 		if err := unlockNetwork(c.config); err != nil {
 			return err
 		}
-		break
-
 	case notify.GetScript() == "network-lock":
 		if err := lockNetwork(c.config); err != nil {
 			return err
 		}
-		break
-
 	case notify.GetScript() == "post-restore":
 		pid := notify.GetPid()
 		r, err := newRestoredProcess(int(pid), fds)
 		if err != nil {
 			return err
 		}
-
-		// TODO: crosbymichael restore previous process information by saving the init process information in
-		// the container's state file or separate process state files.
+		process.ops = r
+		if err := c.state.transition(&restoredState{
+			imageDir: opts.ImagesDirectory,
+			c:        c,
+		}); err != nil {
+			return err
+		}
 		if err := c.updateState(r); err != nil {
 			return err
 		}
-		process.ops = r
-		break
+		if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
+			if !os.IsNotExist(err) {
+				logrus.Error(err)
+			}
+		}
 	}
-
 	return nil
 }
 
@@ -963,65 +961,108 @@ func (c *linuxContainer) updateState(process parentProcess) error {
 	if err != nil {
 		return err
 	}
+	return c.saveState(state)
+}
+
+func (c *linuxContainer) saveState(s *State) error {
 	f, err := os.Create(filepath.Join(c.root, stateFilename))
 	if err != nil {
 		return err
 	}
 	defer f.Close()
-	os.Remove(filepath.Join(c.root, "checkpoint"))
-	return utils.WriteJSON(f, state)
+	return utils.WriteJSON(f, s)
+}
+
+// deleteState removes the file named stateFilename from the container's root
+// directory and returns the error from os.Remove, if any.
+func (c *linuxContainer) deleteState() error {
+	statePath := filepath.Join(c.root, stateFilename)
+	return os.Remove(statePath)
 }
 
 func (c *linuxContainer) currentStatus() (Status, error) {
-	if _, err := os.Stat(filepath.Join(c.root, "checkpoint")); err == nil {
-		return Checkpointed, nil
+	if err := c.refreshState(); err != nil {
+		return -1, err
 	}
+	return c.state.status(), nil
+}
+
+// refreshState re-derives the container's state from runtime information and
+// transitions c.state to match. Consumers of libcontainer can use it out of
+// process, so the in-process state cannot be relied upon on its own.
+//
+// The paused check runs first; the running check only runs when the container
+// is not paused. A container that is neither paused nor running is moved to
+// the stopped state. An error from either check is returned without a
+// transition.
+func (c *linuxContainer) refreshState() error {
+	frozen, err := c.isPaused()
+	if err != nil {
+		return err
+	}
+	var observed containerState = &pausedState{c: c}
+	if !frozen {
+		alive, err := c.isRunning()
+		if err != nil {
+			return err
+		}
+		if alive {
+			observed = &runningState{c: c}
+		} else {
+			observed = &stoppedState{c: c}
+		}
+	}
+	return c.state.transition(observed)
+}
+
+func (c *linuxContainer) isRunning() (bool, error) {
 	if c.initProcess == nil {
-		return Destroyed, nil
+		return false, nil
 	}
 	// return Running if the init process is alive
 	if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
 		if err == syscall.ESRCH {
-			return Destroyed, nil
+			return false, nil
 		}
-		return 0, newSystemError(err)
+		return false, newSystemError(err)
 	}
-	if c.config.Cgroups != nil && c.config.Cgroups.Resources != nil && c.config.Cgroups.Resources.Freezer == configs.Frozen {
-		return Paused, nil
-	}
-	return Running, nil
+	return true, nil
 }
 
-func (c *linuxContainer) currentState() (*State, error) {
-	status, err := c.currentStatus()
+func (c *linuxContainer) isPaused() (bool, error) {
+	data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
 	if err != nil {
-		return nil, err
-	}
-	if status == Destroyed {
-		return nil, newGenericError(fmt.Errorf("container destroyed"), ContainerNotExists)
+		if os.IsNotExist(err) {
+			return false, nil
+		}
+		return false, newSystemError(err)
 	}
-	startTime, err := c.initProcess.startTime()
-	if err != nil {
-		return nil, newSystemError(err)
+	return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
+}
+
+func (c *linuxContainer) currentState() (*State, error) {
+	var (
+		startTime           string
+		externalDescriptors []string
+		pid                 = -1
+	)
+	if c.initProcess != nil {
+		pid = c.initProcess.pid()
+		startTime, _ = c.initProcess.startTime()
+		externalDescriptors = c.initProcess.externalDescriptors()
 	}
 	state := &State{
 		BaseState: BaseState{
 			ID:                   c.ID(),
 			Config:               *c.config,
-			InitProcessPid:       c.initProcess.pid(),
+			InitProcessPid:       pid,
 			InitProcessStartTime: startTime,
+			Created:              c.created,
 		},
 		CgroupPaths:         c.cgroupManager.GetPaths(),
 		NamespacePaths:      make(map[configs.NamespaceType]string),
-		ExternalDescriptors: c.initProcess.externalDescriptors(),
+		ExternalDescriptors: externalDescriptors,
 	}
-	for _, ns := range c.config.Namespaces {
-		state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid())
-	}
-	for _, nsType := range configs.NamespaceTypes() {
-		if _, ok := state.NamespacePaths[nsType]; !ok {
-			ns := configs.Namespace{Type: nsType}
-			state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid())
+	if pid > 0 {
+		for _, ns := range c.config.Namespaces {
+			state.NamespacePaths[ns.Type] = ns.GetPath(pid)
+		}
+		for _, nsType := range configs.NamespaceTypes() {
+			if _, ok := state.NamespacePaths[nsType]; !ok {
+				ns := configs.Namespace{Type: nsType}
+				state.NamespacePaths[ns.Type] = ns.GetPath(pid)
+			}
 		}
 	}
 	return state, nil

+ 6 - 1
vendor/src/github.com/opencontainers/runc/libcontainer/error.go

@@ -16,9 +16,10 @@ const (
 	ContainerPaused
 	ContainerNotStopped
 	ContainerNotRunning
+	ContainerNotPaused
 
 	// Process errors
-	ProcessNotExecuted
+	NoProcessOps
 
 	// Common errors
 	ConfigInvalid
@@ -46,6 +47,10 @@ func (c ErrorCode) String() string {
 		return "Container is not running"
 	case ConsoleExists:
 		return "Console exists for process"
+	case ContainerNotPaused:
+		return "Container is not paused"
+	case NoProcessOps:
+		return "No process operations"
 	default:
 		return "Unknown error"
 	}

+ 24 - 9
vendor/src/github.com/opencontainers/runc/libcontainer/factory_linux.go

@@ -166,7 +166,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
 	if err := os.MkdirAll(containerRoot, 0700); err != nil {
 		return nil, newGenericError(err, SystemError)
 	}
-	return &linuxContainer{
+	c := &linuxContainer{
 		id:            id,
 		root:          containerRoot,
 		config:        config,
@@ -174,7 +174,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
 		initArgs:      l.InitArgs,
 		criuPath:      l.CriuPath,
 		cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
-	}, nil
+	}
+	c.state = &stoppedState{c: c}
+	return c, nil
 }
 
 func (l *LinuxFactory) Load(id string) (Container, error) {
@@ -191,7 +193,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
 		processStartTime: state.InitProcessStartTime,
 		fds:              state.ExternalDescriptors,
 	}
-	return &linuxContainer{
+	c := &linuxContainer{
 		initProcess:   r,
 		id:            id,
 		config:        &state.Config,
@@ -200,7 +202,13 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
 		criuPath:      l.CriuPath,
 		cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
 		root:          containerRoot,
-	}, nil
+		created:       state.Created,
+	}
+	c.state = &createdState{c: c, s: Created}
+	if err := c.refreshState(); err != nil {
+		return nil, err
+	}
+	return c, nil
 }
 
 func (l *LinuxFactory) Type() string {
@@ -222,18 +230,25 @@ func (l *LinuxFactory) StartInitialization() (err error) {
 	// clear the current process's environment to clean any libcontainer
 	// specific env vars.
 	os.Clearenv()
+	var i initer
 	defer func() {
-		// if we have an error during the initialization of the container's init then send it back to the
-		// parent process in the form of an initError.
-		if err != nil {
-			if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
+		// We have an error during the initialization of the container's init,
+		// send it back to the parent process in the form of an initError.
+		// If the container's init succeeded, syscall.Exec will not return, hence
+		// this defer function will never be called.
+		if _, ok := i.(*linuxStandardInit); ok {
+			//  Synchronisation only necessary for standard init.
+			if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
 				panic(err)
 			}
 		}
+		if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
+			panic(err)
+		}
 		// ensure that this pipe is always closed
 		pipe.Close()
 	}()
-	i, err := newContainerInit(it, pipe)
+	i, err = newContainerInit(it, pipe)
 	if err != nil {
 		return err
 	}

+ 12 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/generic_error.go

@@ -9,6 +9,18 @@ import (
 	"github.com/opencontainers/runc/libcontainer/stacktrace"
 )
 
+// syncType identifies the kind of synchronisation message exchanged over the
+// pipe between a container's init process and its parent.
+type syncType uint8
+
+const (
+	// procReady is sent by the init process once it is ready to Exec.
+	procReady syncType = iota
+	// procError is sent by the init process ahead of an error payload.
+	procError
+	// procRun is sent by the parent to clear the init process to Exec.
+	procRun
+)
+
+// syncT is the JSON payload that carries a syncType over the pipe.
+type syncT struct {
+	Type syncType `json:"type"`
+}
+
 var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
 Code: {{.ECode}}
 {{if .Message }}

+ 24 - 1
vendor/src/github.com/opencontainers/runc/libcontainer/init_linux.go

@@ -5,6 +5,7 @@ package libcontainer
 import (
 	"encoding/json"
 	"fmt"
+	"io"
 	"io/ioutil"
 	"net"
 	"os"
@@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
 		}, nil
 	case initStandard:
 		return &linuxStandardInit{
+			pipe:      pipe,
 			parentPid: syscall.Getppid(),
 			config:    config,
 		}, nil
@@ -140,6 +142,27 @@ func finalizeNamespace(config *initConfig) error {
 	return nil
 }
 
+// syncParentReady sends to the given pipe a JSON payload which indicates that
+// the init is ready to Exec the child process. It then waits for the parent to
+// indicate that it is cleared to Exec.
+//
+// An error is returned if the ready message cannot be written, if the parent
+// closes the pipe before answering, if the answer cannot be decoded, or if the
+// answer is anything other than procRun.
+func syncParentReady(pipe io.ReadWriter) error {
+	// Tell parent.
+	if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
+		return err
+	}
+	// Wait for parent to give the all-clear.
+	var procSync syncT
+	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
+		if err == io.EOF {
+			return fmt.Errorf("parent closed synchronisation channel")
+		}
+		return err
+	}
+	// The flag must be validated on the success path: a reply that decoded
+	// cleanly but is not procRun must not let the init continue to Exec.
+	if procSync.Type != procRun {
+		return fmt.Errorf("invalid synchronisation flag from parent")
+	}
+	return nil
+}
+
 // joinExistingNamespaces gets all the namespace paths specified for the container and
 // does a setns on the namespace fd so that the current process joins the namespace.
 func joinExistingNamespaces(namespaces []configs.Namespace) error {
@@ -309,7 +332,7 @@ func killCgroupProcesses(m cgroups.Manager) error {
 	if err := m.Freeze(configs.Frozen); err != nil {
 		logrus.Warn(err)
 	}
-	pids, err := m.GetPids()
+	pids, err := m.GetAllPids()
 	if err != nil {
 		m.Freeze(configs.Thawed)
 		return err

+ 67 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/keys/keyctl.go

@@ -0,0 +1,67 @@
+// +build linux
+
+package keyctl
+
+import (
+	"fmt"
+	"syscall"
+	"strings"
+	"strconv"
+	"unsafe"
+)
+
+// Operation codes passed as the first argument of the keyctl system call.
+const KEYCTL_JOIN_SESSION_KEYRING = 1
+const KEYCTL_SETPERM = 5
+const KEYCTL_DESCRIBE = 6
+
+// KeySerial is the serial number of a key or keyring, as returned by
+// JoinSessionKeyring.
+type KeySerial uint32
+
+// JoinSessionKeyring issues the KEYCTL_JOIN_SESSION_KEYRING keyctl operation
+// and returns the serial reported by the kernel. An empty name is handed to
+// the kernel as a nil pointer; any other name is passed as a C string.
+func JoinSessionKeyring(name string) (KeySerial, error) {
+	var namePtr *byte
+	if name != "" {
+		p, err := syscall.BytePtrFromString(name)
+		if err != nil {
+			return 0, err
+		}
+		namePtr = p
+	}
+
+	serial, _, errno := syscall.Syscall(syscall.SYS_KEYCTL, KEYCTL_JOIN_SESSION_KEYRING, uintptr(unsafe.Pointer(namePtr)), 0)
+	if errno != 0 {
+		return 0, fmt.Errorf("could not create session key: %v", errno)
+	}
+	return KeySerial(serial), nil
+}
+
+// ModKeyringPerm rewrites the permissions of the keyring ringId: the current
+// permission word is read from the key description, ANDed with mask (which
+// clears bits) and ORed with setbits (which sets bits) before being stored
+// with KEYCTL_SETPERM.
+func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
+	desc := make([]byte, 1024)
+	descPtr := unsafe.Pointer(&desc[0])
+
+	_, _, errno := syscall.Syscall6(syscall.SYS_KEYCTL, uintptr(KEYCTL_DESCRIBE), uintptr(ringId), uintptr(descPtr), uintptr(len(desc)), 0, 0)
+	if errno != 0 {
+		return errno
+	}
+
+	// The permission word is the fourth ';'-separated field of the description.
+	fields := strings.Split(string(desc), ";")
+	if len(fields) < 5 {
+		return fmt.Errorf("Destination buffer for key description is too small")
+	}
+	current, err := strconv.ParseUint(fields[3], 16, 32)
+	if err != nil {
+		return err
+	}
+
+	updated := (uint32(current) & mask) | setbits
+	_, _, errno = syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(updated))
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+

+ 40 - 14
vendor/src/github.com/opencontainers/runc/libcontainer/notify_linux.go

@@ -12,31 +12,32 @@ import (
 
 const oomCgroupName = "memory"
 
-// notifyOnOOM returns channel on which you can expect event about OOM,
-// if process died without OOM this channel will be closed.
-// s is current *libcontainer.State for container.
-func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
-	dir := paths[oomCgroupName]
-	if dir == "" {
-		return nil, fmt.Errorf("There is no path for %q in state", oomCgroupName)
-	}
-	oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control"))
+// PressureLevel selects the memory pressure level passed to
+// notifyMemoryPressure.
+type PressureLevel uint
+
+const (
+	// LowPressure is registered with the "low" argument.
+	LowPressure PressureLevel = iota
+	// MediumPressure is registered with the "medium" argument.
+	MediumPressure
+	// CriticalPressure is registered with the "critical" argument; it is the
+	// highest level notifyMemoryPressure accepts.
+	CriticalPressure
+)
+
+func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
+	evFile, err := os.Open(filepath.Join(cgDir, evName))
 	if err != nil {
 		return nil, err
 	}
 	fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0)
 	if syserr != 0 {
-		oomControl.Close()
+		evFile.Close()
 		return nil, syserr
 	}
 
 	eventfd := os.NewFile(fd, "eventfd")
 
-	eventControlPath := filepath.Join(dir, "cgroup.event_control")
-	data := fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd())
+	eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
+	data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
 	if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
 		eventfd.Close()
-		oomControl.Close()
+		evFile.Close()
 		return nil, err
 	}
 	ch := make(chan struct{})
@@ -44,7 +45,7 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
 		defer func() {
 			close(ch)
 			eventfd.Close()
-			oomControl.Close()
+			evFile.Close()
 		}()
 		buf := make([]byte, 8)
 		for {
@@ -61,3 +62,28 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
 	}()
 	return ch, nil
 }
+
+// notifyOnOOM registers for OOM events on the "memory" cgroup found in paths
+// and returns the channel on which those events are delivered; if the process
+// died without OOM this channel will be closed. An error is returned when
+// paths has no entry for the memory cgroup.
+func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
+	if memDir := paths[oomCgroupName]; memDir != "" {
+		return registerMemoryEvent(memDir, "memory.oom_control", "")
+	}
+	return nil, fmt.Errorf("path %q missing", oomCgroupName)
+}
+
+// notifyMemoryPressure registers for memory pressure events of the given
+// level on the "memory" cgroup found in paths. It fails when paths has no
+// entry for the memory cgroup or when level is above CriticalPressure.
+func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) {
+	memDir := paths[oomCgroupName]
+	switch {
+	case memDir == "":
+		return nil, fmt.Errorf("path %q missing", oomCgroupName)
+	case level > CriticalPressure:
+		return nil, fmt.Errorf("invalid pressure level %d", level)
+	}
+
+	// Indexed by PressureLevel: LowPressure, MediumPressure, CriticalPressure.
+	levelNames := [...]string{"low", "medium", "critical"}
+	return registerMemoryEvent(memDir, "memory.pressure_level", levelNames[level])
+}

+ 1 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c

@@ -17,6 +17,7 @@
 #include <sched.h>
 #include <signal.h>
 
+#include <bits/sockaddr.h>
 #include <linux/netlink.h>
 #include <linux/types.h>
 #include <stdint.h>

+ 3 - 3
vendor/src/github.com/opencontainers/runc/libcontainer/process.go

@@ -55,7 +55,7 @@ type Process struct {
 // Wait releases any resources associated with the Process
 func (p Process) Wait() (*os.ProcessState, error) {
 	if p.ops == nil {
-		return nil, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
+		return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
 	}
 	return p.ops.wait()
 }
@@ -65,7 +65,7 @@ func (p Process) Pid() (int, error) {
 	// math.MinInt32 is returned here, because it's invalid value
 	// for the kill() system call.
 	if p.ops == nil {
-		return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
+		return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
 	}
 	return p.ops.pid(), nil
 }
@@ -73,7 +73,7 @@ func (p Process) Pid() (int, error) {
 // Signal sends a signal to the Process.
 func (p Process) Signal(sig os.Signal) error {
 	if p.ops == nil {
-		return newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
+		return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
 	}
 	return p.ops.signal(sig)
 }

+ 52 - 11
vendor/src/github.com/opencontainers/runc/libcontainer/process_linux.go

@@ -5,6 +5,7 @@ package libcontainer
 import (
 	"encoding/json"
 	"errors"
+	"fmt"
 	"io"
 	"os"
 	"os/exec"
@@ -87,6 +88,7 @@ func (p *setnsProcess) start() (err error) {
 	if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
 		return newSystemError(err)
 	}
+
 	if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
 		return newSystemError(err)
 	}
@@ -96,6 +98,7 @@ func (p *setnsProcess) start() (err error) {
 	if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
 		return newSystemError(err)
 	}
+	// Must be done after Shutdown so the child will exit and we can wait for it.
 	if ierr != nil {
 		p.wait()
 		return newSystemError(ierr)
@@ -199,7 +202,6 @@ func (p *initProcess) start() (err error) {
 		return newSystemError(err)
 	}
 	p.setExternalDescriptors(fds)
-
 	// Do this before syncing with child so that no children
 	// can escape the cgroup
 	if err := p.manager.Apply(p.pid()); err != nil {
@@ -230,13 +232,54 @@ func (p *initProcess) start() (err error) {
 	if err := p.sendConfig(); err != nil {
 		return newSystemError(err)
 	}
-	// wait for the child process to fully complete and receive an error message
-	// if one was encoutered
-	var ierr *genericError
-	if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
+	var (
+		procSync syncT
+		sentRun  bool
+		ierr     *genericError
+	)
+
+loop:
+	for {
+		if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
+			if err == io.EOF {
+				break loop
+			}
+			return newSystemError(err)
+		}
+		switch procSync.Type {
+		case procReady:
+			if err := p.manager.Set(p.config.Config); err != nil {
+				return newSystemError(err)
+			}
+			// Sync with child.
+			if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
+				return newSystemError(err)
+			}
+			sentRun = true
+		case procError:
+			// wait for the child process to fully complete and receive an error message
+			// if one was encountered
+			if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
+				return newSystemError(err)
+			}
+			if ierr != nil {
+				break loop
+			}
+			// Programmer error.
+			panic("No error following JSON procError payload.")
+		default:
+			return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
+		}
+	}
+	if !sentRun {
+		return newSystemError(fmt.Errorf("could not synchronise with container process"))
+	}
+	if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
 		return newSystemError(err)
 	}
+	// Must be done after Shutdown so the child will exit and we can wait for it.
 	if ierr != nil {
+		p.wait()
 		return newSystemError(ierr)
 	}
 	return nil
@@ -270,12 +313,10 @@ func (p *initProcess) startTime() (string, error) {
 }
 
 func (p *initProcess) sendConfig() error {
-	// send the state to the container's init process then shutdown writes for the parent
-	if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
-		return err
-	}
-	// shutdown writes for the parent side of the pipe
-	return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR)
+	// send the config to the container's init process, we don't use JSON Encode
+	// here because there might be a problem in JSON decoder in some cases, see:
+	// https://github.com/docker/docker/issues/14203#issuecomment-174177790
+	return utils.WriteJSON(p.parentPipe, p.config)
 }
 
 func (p *initProcess) createNetworkInterfaces() error {

+ 25 - 3
vendor/src/github.com/opencontainers/runc/libcontainer/rootfs_linux.go

@@ -18,6 +18,8 @@ import (
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/label"
+	"github.com/opencontainers/runc/libcontainer/system"
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
 )
 
 const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
@@ -293,12 +295,31 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
 // checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
 // dest is required to be an abs path and have any symlinks resolved before calling this function.
 func checkMountDestination(rootfs, dest string) error {
-	if filepath.Clean(rootfs) == filepath.Clean(dest) {
+	if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
 		return fmt.Errorf("mounting into / is prohibited")
 	}
 	invalidDestinations := []string{
 		"/proc",
 	}
+	// White list, it should be sub directories of invalid destinations
+	validDestinations := []string{
+		// These entries can be bind mounted by files emulated by fuse,
+		// so commands like top, free displays stats in container.
+		"/proc/cpuinfo",
+		"/proc/diskstats",
+		"/proc/meminfo",
+		"/proc/stat",
+		"/proc/net/dev",
+	}
+	for _, valid := range validDestinations {
+		path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
+		if err != nil {
+			return err
+		}
+		if path == "." {
+			return nil
+		}
+	}
 	for _, invalid := range invalidDestinations {
 		path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest)
 		if err != nil {
@@ -321,7 +342,7 @@ func setupDevSymlinks(rootfs string) error {
 	// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
 	// in /dev if it exists in /proc.
 	if _, err := os.Stat("/proc/kcore"); err == nil {
-		links = append(links, [2]string{"/proc/kcore", "/dev/kcore"})
+		links = append(links, [2]string{"/proc/kcore", "/dev/core"})
 	}
 	for _, link := range links {
 		var (
@@ -365,11 +386,12 @@ func reOpenDevNull() error {
 
 // Create the device nodes in the container.
 func createDevices(config *configs.Config) error {
+	useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
 	oldMask := syscall.Umask(0000)
 	for _, node := range config.Devices {
 		// containers running in a user namespace are not allowed to mknod
 		// devices so we can just bind mount it from the host.
-		if err := createDeviceNode(config.Rootfs, node, config.Namespaces.Contains(configs.NEWUSER)); err != nil {
+		if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
 			syscall.Umask(oldMask)
 			return err
 		}

+ 9 - 1
vendor/src/github.com/opencontainers/runc/libcontainer/selinux/selinux.go

@@ -231,10 +231,14 @@ func ReserveLabel(scon string) {
 	}
 }
 
+// selinuxEnforcePath returns the path of the "enforce" file under selinuxPath.
+func selinuxEnforcePath() string {
+	return fmt.Sprintf("%s/enforce", selinuxPath)
+}
+
 func SelinuxGetEnforce() int {
 	var enforce int
 
-	enforceS, err := readCon(fmt.Sprintf("%s/enforce", selinuxPath))
+	enforceS, err := readCon(selinuxEnforcePath())
 	if err != nil {
 		return -1
 	}
@@ -246,6 +250,10 @@ func SelinuxGetEnforce() int {
 	return enforce
 }
 
+// SelinuxSetEnforce writes mode, formatted as a decimal integer, to the file
+// named by selinuxEnforcePath using writeCon and returns writeCon's error.
+func SelinuxSetEnforce(mode int) error {
+	return writeCon(selinuxEnforcePath(), fmt.Sprintf("%d", mode))
+}
+
 func SelinuxGetEnforceMode() int {
 	switch readConfig(selinuxTag) {
 	case "enforcing":

+ 10 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/setns_init_linux.go

@@ -6,6 +6,7 @@ import (
 	"os"
 
 	"github.com/opencontainers/runc/libcontainer/apparmor"
+	"github.com/opencontainers/runc/libcontainer/keys"
 	"github.com/opencontainers/runc/libcontainer/label"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
 	"github.com/opencontainers/runc/libcontainer/system"
@@ -18,12 +19,21 @@ type linuxSetnsInit struct {
 }
 
 func (l *linuxSetnsInit) Init() error {
+	// do not inherit the parent's session keyring
+	if _, err := keyctl.JoinSessionKeyring("_ses"); err != nil {
+		return err
+	}
 	if err := setupRlimits(l.config.Config); err != nil {
 		return err
 	}
 	if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
 		return err
 	}
+	if l.config.Config.NoNewPrivileges {
+		if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
+			return err
+		}
+	}
 	if l.config.Config.Seccomp != nil {
 		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
 			return err

+ 31 - 2
vendor/src/github.com/opencontainers/runc/libcontainer/standard_init_linux.go

@@ -3,22 +3,41 @@
 package libcontainer
 
 import (
+	"io"
 	"os"
 	"syscall"
 
 	"github.com/opencontainers/runc/libcontainer/apparmor"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/keys"
 	"github.com/opencontainers/runc/libcontainer/label"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
 	"github.com/opencontainers/runc/libcontainer/system"
 )
 
 type linuxStandardInit struct {
+	pipe      io.ReadWriter
 	parentPid int
 	config    *initConfig
 }
 
+// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves, copying the value
+// from the kernel.
+const PR_SET_NO_NEW_PRIVS = 0x26
+
 func (l *linuxStandardInit) Init() error {
+	// do not inherit the parent's session keyring
+	sessKeyId, err := keyctl.JoinSessionKeyring("")
+	if err != nil {
+		return err
+	}
+	// make session keyring searchable
+	// without user ns we need 'UID' search permissions
+	// with user ns we need 'other' search permissions
+	if err := keyctl.ModKeyringPerm(sessKeyId, 0xffffffff, 0x080008); err != nil {
+		return err
+	}
+
 	// join any namespaces via a path to the namespace fd if provided
 	if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil {
 		return err
@@ -50,7 +69,6 @@ func (l *linuxStandardInit) Init() error {
 	if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
 		return err
 	}
-
 	label.Init()
 	// InitializeMountNamespace() can be executed only for a new mount namespace
 	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
@@ -75,7 +93,6 @@ func (l *linuxStandardInit) Init() error {
 			return err
 		}
 	}
-
 	for _, path := range l.config.Config.ReadonlyPaths {
 		if err := remountReadonly(path); err != nil {
 			return err
@@ -90,6 +107,17 @@ func (l *linuxStandardInit) Init() error {
 	if err != nil {
 		return err
 	}
+	if l.config.Config.NoNewPrivileges {
+		if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
+			return err
+		}
+	}
+	// Tell our parent that we're ready to Execv. This must be done before the
+	// Seccomp rules have been applied, because we need to be able to read and
+	// write to a socket.
+	if err := syncParentReady(l.pipe); err != nil {
+		return err
+	}
 	if l.config.Config.Seccomp != nil {
 		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
 			return err
@@ -109,5 +137,6 @@ func (l *linuxStandardInit) Init() error {
 	if syscall.Getppid() != l.parentPid {
 		return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
 	}
+
 	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
 }

+ 226 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/state_linux.go

@@ -0,0 +1,226 @@
+// +build linux
+
+package libcontainer
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/Sirupsen/logrus"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// newStateTransitionError builds a stateTransitionError naming the status of
+// the state being left and of the state that was requested.
+func newStateTransitionError(from, to containerState) error {
+	transErr := new(stateTransitionError)
+	transErr.From = from.status().String()
+	transErr.To = to.status().String()
+	return transErr
+}
+
+// stateTransitionError reports an attempt to move a container between two
+// states for which no transition is allowed. From and To hold the textual
+// status of the two states.
+type stateTransitionError struct {
+	From string
+	To   string
+}
+
+// Error implements the error interface.
+func (s *stateTransitionError) Error() string {
+	return "invalid state transition from " + s.From + " to " + s.To
+}
+
+// containerState is implemented by each state a linuxContainer can be in.
+type containerState interface {
+	// transition attempts to move the container into the given state.
+	transition(containerState) error
+	// destroy tears the container down from the current state.
+	destroy() error
+	// status reports the Status value corresponding to the state.
+	status() Status
+}
+
+// destroy tears down the container's resources and leaves it in the stopped
+// state. The first error produced by cgroup removal, root directory removal or
+// the poststop hooks is returned; the later steps still run.
+func destroy(c *linuxContainer) error {
+	// When the container does not use a new PID namespace, kill the cgroup's
+	// processes; a failure here is only logged.
+	if !c.config.Namespaces.Contains(configs.NEWPID) {
+		if err := killCgroupProcesses(c.cgroupManager); err != nil {
+			logrus.Warn(err)
+		}
+	}
+	err := c.cgroupManager.Destroy()
+	// Keep the earliest error: later failures only fill err if it is still nil.
+	if rerr := os.RemoveAll(c.root); err == nil {
+		err = rerr
+	}
+	c.initProcess = nil
+	if herr := runPoststopHooks(c); err == nil {
+		err = herr
+	}
+	// The container ends up stopped even when one of the steps above failed.
+	c.state = &stoppedState{c: c}
+	return err
+}
+
+// runPoststopHooks runs the container's Poststop hooks in order, handing each
+// one the container's version, ID and rootfs. It stops at, and returns, the
+// first hook error. A container without hooks is a no-op.
+func runPoststopHooks(c *linuxContainer) error {
+	hooks := c.config.Hooks
+	if hooks == nil {
+		return nil
+	}
+	hookState := configs.HookState{
+		Version: c.config.Version,
+		ID:      c.id,
+		Root:    c.config.Rootfs,
+	}
+	for _, h := range hooks.Poststop {
+		if err := h.Run(hookState); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// stoppedState represents a container in a stopped/destroyed state.
+type stoppedState struct {
+	c *linuxContainer
+}
+
+// status always reports Destroyed.
+func (b *stoppedState) status() Status {
+	return Destroyed
+}
+
+// transition accepts a move to the running or restored state, treats a move
+// to the stopped state as a no-op and rejects everything else.
+func (b *stoppedState) transition(s containerState) error {
+	switch s.(type) {
+	case *stoppedState:
+		return nil
+	case *runningState, *restoredState:
+		b.c.state = s
+		return nil
+	default:
+		return newStateTransitionError(b, s)
+	}
+}
+
+// destroy runs the shared teardown for the container.
+func (b *stoppedState) destroy() error {
+	return destroy(b.c)
+}
+
+// runningState represents a container that is currently running.
+type runningState struct {
+	c *linuxContainer
+}
+
+// status always reports Running.
+func (r *runningState) status() Status {
+	return Running
+}
+
+// transition accepts a move to the paused state unconditionally and a move to
+// the stopped state only once the container is no longer running. A move to
+// the running state is a no-op; anything else is rejected.
+func (r *runningState) transition(s containerState) error {
+	switch s.(type) {
+	case *runningState:
+		return nil
+	case *pausedState:
+		r.c.state = s
+		return nil
+	case *stoppedState:
+		alive, err := r.c.isRunning()
+		if err != nil {
+			return err
+		}
+		if alive {
+			return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
+		}
+		r.c.state = s
+		return nil
+	default:
+		return newStateTransitionError(r, s)
+	}
+}
+
+// destroy refuses to tear down a container that is still running and
+// otherwise runs the shared teardown.
+func (r *runningState) destroy() error {
+	alive, err := r.c.isRunning()
+	switch {
+	case err != nil:
+		return err
+	case alive:
+		return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
+	}
+	return destroy(r.c)
+}
+
+// pausedState represents a container that is currently paused. It cannot be
+// destroyed while its processes are still running and must transition back to
+// running first.
+type pausedState struct {
+	c *linuxContainer
+}
+
+// status always reports Paused.
+func (p *pausedState) status() Status {
+	return Paused
+}
+
+// transition accepts a move to the running or stopped state, treats a move to
+// the paused state as a no-op and rejects everything else.
+func (p *pausedState) transition(s containerState) error {
+	switch s.(type) {
+	case *pausedState:
+		return nil
+	case *runningState, *stoppedState:
+		p.c.state = s
+		return nil
+	default:
+		return newStateTransitionError(p, s)
+	}
+}
+
+// destroy fails with ContainerPaused while the container is still running.
+// Otherwise it thaws the cgroup and runs the shared teardown.
+func (p *pausedState) destroy() error {
+	alive, err := p.c.isRunning()
+	if err != nil {
+		return err
+	}
+	if alive {
+		return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
+	}
+	if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
+		return err
+	}
+	return destroy(p.c)
+}
+
+// restoredState is the same as the running state but also has associated
+// checkpoint information that may need to be destroyed when the container is
+// stopped and destroy is called.
+type restoredState struct {
+	imageDir string
+	c        *linuxContainer
+}
+
+// status always reports Running.
+func (r *restoredState) status() Status {
+	return Running
+}
+
+// transition reports success for a move to the stopped or running state and
+// rejects everything else. It does not itself update the container's state.
+func (r *restoredState) transition(s containerState) error {
+	switch s.(type) {
+	case *stoppedState, *runningState:
+		return nil
+	default:
+		return newStateTransitionError(r, s)
+	}
+}
+
+// destroy stats the "checkpoint" directory under the container root, failing
+// on any error other than the directory being absent, and then runs the
+// shared teardown.
+func (r *restoredState) destroy() error {
+	checkpointDir := filepath.Join(r.c.root, "checkpoint")
+	if _, err := os.Stat(checkpointDir); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	return destroy(r.c)
+}
+
+// createdState is used whenever a container is restored, loaded, or has
+// additional processes set up inside it, and it should not be destroyed when
+// it is exiting. It reports whatever Status it was constructed with.
+type createdState struct {
+	c *linuxContainer
+	s Status
+}
+
+// status returns the stored Status.
+func (n *createdState) status() Status {
+	return n.s
+}
+
+// transition accepts every target state.
+func (n *createdState) transition(s containerState) error {
+	n.c.state = s
+	return nil
+}
+
+// destroy refreshes the container's state and then delegates to the destroy
+// method of whichever state the refresh installed.
+func (n *createdState) destroy() error {
+	err := n.c.refreshState()
+	if err != nil {
+		return err
+	}
+	return n.c.state.destroy()
+}

+ 45 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/system/linux.go

@@ -3,6 +3,9 @@
 package system
 
 import (
+	"bufio"
+	"fmt"
+	"os"
 	"os/exec"
 	"syscall"
 	"unsafe"
@@ -75,3 +78,45 @@ func Setctty() error {
 	}
 	return nil
 }
+
+// RunningInUserNS detects whether we are currently running in a user
+// namespace by inspecting the first line of /proc/self/uid_map.
+// Copied from github.com/lxc/lxd/shared/util.go
+func RunningInUserNS() bool {
+	uidMap, err := os.Open("/proc/self/uid_map")
+	if err != nil {
+		// This kernel-provided file only exists if user namespaces are
+		// supported.
+		return false
+	}
+	defer uidMap.Close()
+
+	first, _, err := bufio.NewReader(uidMap).ReadLine()
+	if err != nil {
+		return false
+	}
+
+	var nsStart, hostStart, length int64
+	fmt.Sscanf(string(first), "%d %d %d", &nsStart, &hostStart, &length)
+
+	// We assume we are in the initial user namespace if we have a full
+	// range - 4294967295 uids starting at uid 0.
+	fullRange := nsStart == 0 && hostStart == 0 && length == 4294967295
+	return !fullRange
+}
+
+// Prctl invokes the prctl system call with the given option and arguments and
+// returns the resulting errno as an error, or nil when the call succeeds.
+func Prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) {
+	if _, _, errno := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0); errno != 0 {
+		return errno
+	}
+	return nil
+}

+ 30 - 0
vendor/src/github.com/opencontainers/runc/libcontainer/utils/utils.go

@@ -5,6 +5,7 @@ import (
 	"encoding/hex"
 	"encoding/json"
 	"io"
+	"os"
 	"path/filepath"
 	"syscall"
 )
@@ -54,3 +55,32 @@ func WriteJSON(w io.Writer, v interface{}) error {
 	_, err = w.Write(data)
 	return err
 }
+
+// CleanPath makes a path safe for use with filepath.Join. Absolute paths are
+// simply cleaned. Relative paths are cleaned, anchored at the root so that any
+// leading ".." components collapse, and then made relative again. The result
+// therefore always resolves lexically to a subdirectory of whatever path it is
+// later appended to. Everything is done lexically, so paths that include
+// symlinks are not made safe by CleanPath. An empty path is returned as is.
+func CleanPath(path string) string {
+	if path == "" {
+		return ""
+	}
+
+	// Clean first, so problematic inputs such as "/../../../../../" are
+	// normalised before the absolute/relative decision is made.
+	cleaned := filepath.Clean(path)
+	if filepath.IsAbs(cleaned) {
+		// Absolute paths must stay absolute.
+		return filepath.Clean(cleaned)
+	}
+
+	// Relative path: resolve it against the root to fix paths such as
+	// "../../../../<etc>/some/path", then strip the root again.
+	root := string(os.PathSeparator)
+	rooted := filepath.Clean(root + cleaned)
+	// This can't fail, as (by definition) all paths are relative to root.
+	rel, _ := filepath.Rel(root, rooted)
+	return filepath.Clean(rel)
+}