diff --git a/hack/dockerfile/install/containerd.installer b/hack/dockerfile/install/containerd.installer index 678077417b..d7e371e9ac 100755 --- a/hack/dockerfile/install/containerd.installer +++ b/hack/dockerfile/install/containerd.installer @@ -4,7 +4,7 @@ # containerd is also pinned in vendor.conf. When updating the binary # version you may also need to update the vendor version to pick up bug # fixes or new APIs. -CONTAINERD_COMMIT=e6b3f5632f50dbc4e9cb6288d911bf4f5e95b18e # v1.2.4 +CONTAINERD_COMMIT=bb71b10fd8f58240ca47fbb579b9d1028eea7c84 # v1.2.5 install_containerd() { echo "Install containerd version $CONTAINERD_COMMIT" diff --git a/hack/dockerfile/install/runc.installer b/hack/dockerfile/install/runc.installer index a787e3022f..d4782ce18b 100755 --- a/hack/dockerfile/install/runc.installer +++ b/hack/dockerfile/install/runc.installer @@ -4,7 +4,7 @@ # The version of runc should match the version that is used by the containerd # version that is used. If you need to update runc, open a pull request in # the containerd project first, and update both after that is merged. -RUNC_COMMIT=6635b4f0c6af3810594d2770f662f34ddc15b40d +RUNC_COMMIT=2b18fe1d885ee5083ef9f0838fee39b62d653e30 install_runc() { # If using RHEL7 kernels (3.10.0 el7), disable kmem accounting/limiting diff --git a/vendor.conf b/vendor.conf index eb5dac67d6..f11b15495e 100644 --- a/vendor.conf +++ b/vendor.conf @@ -79,7 +79,7 @@ google.golang.org/grpc v1.12.0 # the containerd project first, and update both after that is merged. # This commit does not need to match RUNC_COMMIT as it is used for helper # packages but should be newer or equal. -github.com/opencontainers/runc 12f6a991201fdb8f82579582d5e00e28fba06d0a +github.com/opencontainers/runc 2b18fe1d885ee5083ef9f0838fee39b62d653e30 github.com/opencontainers/runtime-spec 29686dbc5559d93fb1ef402eeda3e35c38d75af4 # v1.0.1-59-g29686db github.com/opencontainers/image-spec v1.0.1 github.com/seccomp/libseccomp-golang 32f571b70023028bd57d9288c20efbcb237f3ce0 @@ -118,10 +118,10 @@ github.com/googleapis/gax-go v2.0.0 google.golang.org/genproto 694d95ba50e67b2e363f3483057db5d4910c18f9 # containerd -github.com/containerd/containerd e6b3f5632f50dbc4e9cb6288d911bf4f5e95b18e # v1.2.4 +github.com/containerd/containerd bb71b10fd8f58240ca47fbb579b9d1028eea7c84 # v1.2.5 github.com/containerd/fifo 3d5202aec260678c48179c56f40e6f38a095738c github.com/containerd/continuity 004b46473808b3e7a4a3049c20e4376c91eb966d -github.com/containerd/cgroups 5e610833b72089b37d0e615de9a92dfc043757c2 +github.com/containerd/cgroups dbea6f2bd41658b84b00417ceefa416b979cbf10 github.com/containerd/console c12b1e7919c14469339a5d38f2f8ed9b64a9de23 github.com/containerd/go-runc 5a6d9f37cfa36b15efba46dc7ea349fa9b7143c3 github.com/containerd/typeurl a93fcdb778cd272c6e9b3028b2f42d813e785d40 diff --git a/vendor/github.com/containerd/cgroups/README.md b/vendor/github.com/containerd/cgroups/README.md index 69e932a9f7..81ad11cc7f 100644 --- a/vendor/github.com/containerd/cgroups/README.md +++ b/vendor/github.com/containerd/cgroups/README.md @@ -1,8 +1,9 @@ # cgroups [![Build Status](https://travis-ci.org/containerd/cgroups.svg?branch=master)](https://travis-ci.org/containerd/cgroups) - [![codecov](https://codecov.io/gh/containerd/cgroups/branch/master/graph/badge.svg)](https://codecov.io/gh/containerd/cgroups) +[![GoDoc](https://godoc.org/github.com/containerd/cgroups?status.svg)](https://godoc.org/github.com/containerd/cgroups) +[![Go Report Card](https://goreportcard.com/badge/github.com/containerd/cgroups)](https://goreportcard.com/report/github.com/containerd/cgroups) Go package for creating, managing, inspecting, and destroying cgroups. The resources format for settings on the cgroup uses the OCI runtime-spec found @@ -110,3 +111,14 @@ err := control.MoveTo(destination) ```go subCgroup, err := control.New("child", resources) ``` + +## Project details + +Cgroups is a containerd sub-project, licensed under the [Apache 2.0 license](./LICENSE). +As a containerd sub-project, you will find the: + + * [Project governance](https://github.com/containerd/project/blob/master/GOVERNANCE.md), + * [Maintainers](https://github.com/containerd/project/blob/master/MAINTAINERS), + * and [Contributing guidelines](https://github.com/containerd/project/blob/master/CONTRIBUTING.md) + +information in our [`containerd/project`](https://github.com/containerd/project) repository. diff --git a/vendor/github.com/containerd/cgroups/blkio.go b/vendor/github.com/containerd/cgroups/blkio.go index fc1e689cbd..875fb55465 100644 --- a/vendor/github.com/containerd/cgroups/blkio.go +++ b/vendor/github.com/containerd/cgroups/blkio.go @@ -191,31 +191,42 @@ func (b *blkioController) readEntry(devices map[deviceKey]string, path, name str } func createBlkioSettings(blkio *specs.LinuxBlockIO) []blkioSettings { - settings := []blkioSettings{ - { - name: "weight", - value: blkio.Weight, - format: uintf, - }, - { - name: "leaf_weight", - value: blkio.LeafWeight, - format: uintf, - }, - } - for _, wd := range blkio.WeightDevice { + settings := []blkioSettings{} + + if blkio.Weight != nil { settings = append(settings, blkioSettings{ - name: "weight_device", - value: wd, - format: weightdev, - }, - blkioSettings{ - name: "leaf_weight_device", - value: wd, - format: weightleafdev, + name: "weight", + value: blkio.Weight, + format: uintf, }) } + if blkio.LeafWeight != nil { + settings = append(settings, + blkioSettings{ + name: "leaf_weight", + value: blkio.LeafWeight, + format: uintf, + }) + } + for _, wd := range blkio.WeightDevice { + if wd.Weight != nil { + settings = append(settings, + blkioSettings{ + name: "weight_device", + value: wd, + format: weightdev, + }) + } + if wd.LeafWeight != nil { + settings = append(settings, + blkioSettings{ + name: "leaf_weight_device", + value: wd, + format: weightleafdev, + }) + } + } for _, t := range []struct { name string list []specs.LinuxThrottleDevice @@ -265,12 +276,12 @@ func uintf(v interface{}) []byte { func weightdev(v interface{}) []byte { wd := v.(specs.LinuxWeightDevice) - return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight)) + return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, *wd.Weight)) } func weightleafdev(v interface{}) []byte { wd := v.(specs.LinuxWeightDevice) - return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight)) + return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, *wd.LeafWeight)) } func throttleddev(v interface{}) []byte { diff --git a/vendor/github.com/containerd/cgroups/cgroup.go b/vendor/github.com/containerd/cgroups/cgroup.go index 7959feb490..9fbea82499 100644 --- a/vendor/github.com/containerd/cgroups/cgroup.go +++ b/vendor/github.com/containerd/cgroups/cgroup.go @@ -30,47 +30,84 @@ import ( ) // New returns a new control via the cgroup cgroups interface -func New(hierarchy Hierarchy, path Path, resources *specs.LinuxResources) (Cgroup, error) { +func New(hierarchy Hierarchy, path Path, resources *specs.LinuxResources, opts ...InitOpts) (Cgroup, error) { + config := newInitConfig() + for _, o := range opts { + if err := o(config); err != nil { + return nil, err + } + } subsystems, err := hierarchy() if err != nil { return nil, err } + var active []Subsystem for _, s := range subsystems { + // check if subsystem exists if err := initializeSubsystem(s, path, resources); err != nil { + if err == ErrControllerNotActive { + if config.InitCheck != nil { + if skerr := config.InitCheck(s, path, err); skerr != nil { + if skerr != ErrIgnoreSubsystem { + return nil, skerr + } + } + } + continue + } return nil, err } + active = append(active, s) } return &cgroup{ path: path, - subsystems: subsystems, + subsystems: active, }, nil } // Load will load an existing cgroup and allow it to be controlled -func Load(hierarchy Hierarchy, path Path) (Cgroup, error) { +func Load(hierarchy Hierarchy, path Path, opts ...InitOpts) (Cgroup, error) { + config := newInitConfig() + for _, o := range opts { + if err := o(config); err != nil { + return nil, err + } + } + var activeSubsystems []Subsystem subsystems, err := hierarchy() if err != nil { return nil, err } - // check the the subsystems still exist + // check that the subsystems still exist, and keep only those that actually exist for _, s := range pathers(subsystems) { p, err := path(s.Name()) if err != nil { if os.IsNotExist(errors.Cause(err)) { return nil, ErrCgroupDeleted } + if err == ErrControllerNotActive { + if config.InitCheck != nil { + if skerr := config.InitCheck(s, path, err); skerr != nil { + if skerr != ErrIgnoreSubsystem { + return nil, skerr + } + } + } + continue + } return nil, err } if _, err := os.Lstat(s.Path(p)); err != nil { if os.IsNotExist(err) { - return nil, ErrCgroupDeleted + continue } return nil, err } + activeSubsystems = append(activeSubsystems, s) } return &cgroup{ path: path, - subsystems: subsystems, + subsystems: activeSubsystems, }, nil } @@ -319,6 +356,49 @@ func (c *cgroup) processes(subsystem Name, recursive bool) ([]Process, error) { return processes, err } +// Tasks returns the tasks running inside the cgroup along +// with the subsystem used, pid, and path +func (c *cgroup) Tasks(subsystem Name, recursive bool) ([]Task, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return nil, c.err + } + return c.tasks(subsystem, recursive) +} + +func (c *cgroup) tasks(subsystem Name, recursive bool) ([]Task, error) { + s := c.getSubsystem(subsystem) + sp, err := c.path(subsystem) + if err != nil { + return nil, err + } + path := s.(pather).Path(sp) + var tasks []Task + err = filepath.Walk(path, func(p string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if !recursive && info.IsDir() { + if p == path { + return nil + } + return filepath.SkipDir + } + dir, name := filepath.Split(p) + if name != cgroupTasks { + return nil + } + procs, err := readTasksPids(dir, subsystem) + if err != nil { + return err + } + tasks = append(tasks, procs...) + return nil + }) + return tasks, err +} + // Freeze freezes the entire cgroup and all the processes inside it func (c *cgroup) Freeze() error { c.mu.Lock() diff --git a/vendor/github.com/containerd/cgroups/control.go b/vendor/github.com/containerd/cgroups/control.go index 63e2df93dd..1f62c54f3b 100644 --- a/vendor/github.com/containerd/cgroups/control.go +++ b/vendor/github.com/containerd/cgroups/control.go @@ -44,6 +44,15 @@ type Process struct { Path string } +type Task struct { + // Subsystem is the name of the subsystem that the task is in + Subsystem Name + // Pid is the process id of the task + Pid int + // Path is the full path of the subsystem and location that the task is in + Path string +} + // Cgroup handles interactions with the individual groups to perform // actions on them as them main interface to this cgroup package type Cgroup interface { @@ -64,6 +73,8 @@ type Cgroup interface { Update(resources *specs.LinuxResources) error // Processes returns all the processes in a select subsystem for the cgroup Processes(Name, bool) ([]Process, error) + // Tasks returns all the tasks in a select subsystem for the cgroup + Tasks(Name, bool) ([]Task, error) // Freeze freezes or pauses all processes inside the cgroup Freeze() error // Thaw thaw or resumes all processes inside the cgroup diff --git a/vendor/github.com/containerd/cgroups/cpuset.go b/vendor/github.com/containerd/cgroups/cpuset.go index f182aa68c1..30208515e6 100644 --- a/vendor/github.com/containerd/cgroups/cpuset.go +++ b/vendor/github.com/containerd/cgroups/cpuset.go @@ -57,21 +57,21 @@ func (c *cpusetController) Create(path string, resources *specs.LinuxResources) if resources.CPU != nil { for _, t := range []struct { name string - value *string + value string }{ { name: "cpus", - value: &resources.CPU.Cpus, + value: resources.CPU.Cpus, }, { name: "mems", - value: &resources.CPU.Mems, + value: resources.CPU.Mems, }, } { - if t.value != nil { + if t.value != "" { if err := ioutil.WriteFile( filepath.Join(c.Path(path), fmt.Sprintf("cpuset.%s", t.name)), - []byte(*t.value), + []byte(t.value), defaultFilePerm, ); err != nil { return err diff --git a/vendor/github.com/containerd/cgroups/devices.go b/vendor/github.com/containerd/cgroups/devices.go index f9a118b227..f6a3b1947d 100644 --- a/vendor/github.com/containerd/cgroups/devices.go +++ b/vendor/github.com/containerd/cgroups/devices.go @@ -58,6 +58,9 @@ func (d *devicesController) Create(path string, resources *specs.LinuxResources) if device.Allow { file = allowDeviceFile } + if device.Type == "" { + device.Type = "a" + } if err := ioutil.WriteFile( filepath.Join(d.Path(path), file), []byte(deviceString(device)), diff --git a/vendor/github.com/containerd/cgroups/net_prio.go b/vendor/github.com/containerd/cgroups/net_prio.go index c77169215c..612e1bcd26 100644 --- a/vendor/github.com/containerd/cgroups/net_prio.go +++ b/vendor/github.com/containerd/cgroups/net_prio.go @@ -50,7 +50,7 @@ func (n *netprioController) Create(path string, resources *specs.LinuxResources) if resources.Network != nil { for _, prio := range resources.Network.Priorities { if err := ioutil.WriteFile( - filepath.Join(n.Path(path), "net_prio_ifpriomap"), + filepath.Join(n.Path(path), "net_prio.ifpriomap"), formatPrio(prio.Name, prio.Priority), defaultFilePerm, ); err != nil { diff --git a/vendor/github.com/containerd/cgroups/opts.go b/vendor/github.com/containerd/cgroups/opts.go new file mode 100644 index 0000000000..7c5d9fb9c2 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/opts.go @@ -0,0 +1,61 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "github.com/pkg/errors" +) + +var ( + // ErrIgnoreSubsystem allows the specific subsystem to be skipped + ErrIgnoreSubsystem = errors.New("skip subsystem") + // ErrDevicesRequired is returned when the devices subsystem is required but + // does not exist or is not active + ErrDevicesRequired = errors.New("devices subsystem is required") +) + +// InitOpts allows configuration for the creation or loading of a cgroup +type InitOpts func(*InitConfig) error + +// InitConfig provides configuration options for the creation +// or loading of a cgroup and its subsystems +type InitConfig struct { + // InitCheck can be used to check initialization errors from the subsystem + InitCheck InitCheck +} + +func newInitConfig() *InitConfig { + return &InitConfig{ + InitCheck: RequireDevices, + } +} + +// InitCheck allows subsystems errors to be checked when initialized or loaded +type InitCheck func(Subsystem, Path, error) error + +// AllowAny allows any subsystem errors to be skipped +func AllowAny(s Subsystem, p Path, err error) error { + return ErrIgnoreSubsystem +} + +// RequireDevices requires the device subsystem but no others +func RequireDevices(s Subsystem, p Path, err error) error { + if s.Name() == Devices { + return ErrDevicesRequired + } + return ErrIgnoreSubsystem +} diff --git a/vendor/github.com/containerd/cgroups/paths.go b/vendor/github.com/containerd/cgroups/paths.go index 455ce857f9..f45fd42564 100644 --- a/vendor/github.com/containerd/cgroups/paths.go +++ b/vendor/github.com/containerd/cgroups/paths.go @@ -57,6 +57,9 @@ func PidPath(pid int) Path { return existingPath(paths, "") } +// ErrControllerNotActive is returned when a controller is not supported or enabled +var ErrControllerNotActive = errors.New("controller is not supported") + func existingPath(paths map[string]string, suffix string) Path { // localize the paths based on the root mount dest for nested cgroups for n, p := range paths { @@ -77,7 +80,7 @@ func existingPath(paths map[string]string, suffix string) Path { root, ok := paths[string(name)] if !ok { if root, ok = paths[fmt.Sprintf("name=%s", name)]; !ok { - return "", fmt.Errorf("unable to find %q in controller set", name) + return "", ErrControllerNotActive } } if suffix != "" { diff --git a/vendor/github.com/containerd/cgroups/subsystem.go b/vendor/github.com/containerd/cgroups/subsystem.go index 933a6c38d6..23de04d494 100644 --- a/vendor/github.com/containerd/cgroups/subsystem.go +++ b/vendor/github.com/containerd/cgroups/subsystem.go @@ -42,7 +42,7 @@ const ( ) // Subsystems returns a complete list of the default cgroups -// avaliable on most linux systems +// available on most linux systems func Subsystems() []Name { n := []Name{ Hugetlb, diff --git a/vendor/github.com/containerd/cgroups/systemd.go b/vendor/github.com/containerd/cgroups/systemd.go index 8153d744ce..c5d4e30811 100644 --- a/vendor/github.com/containerd/cgroups/systemd.go +++ b/vendor/github.com/containerd/cgroups/systemd.go @@ -32,6 +32,11 @@ const ( defaultSlice = "system.slice" ) +var ( + canDelegate bool + once sync.Once +) + func Systemd() ([]Subsystem, error) { root, err := v1MountPoint() if err != nil { @@ -54,7 +59,7 @@ func Slice(slice, name string) Path { slice = defaultSlice } return func(subsystem Name) (string, error) { - return filepath.Join(slice, unitName(name)), nil + return filepath.Join(slice, name), nil } } @@ -80,15 +85,39 @@ func (s *SystemdController) Create(path string, resources *specs.LinuxResources) } defer conn.Close() slice, name := splitName(path) + // We need to see if systemd can handle the delegate property + // Systemd will return an error if it cannot handle delegate regardless + // of its bool setting. + checkDelegate := func() { + canDelegate = true + dlSlice := newProperty("Delegate", true) + if _, err := conn.StartTransientUnit(slice, "testdelegate", []systemdDbus.Property{dlSlice}, nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + // Starting with systemd v237, Delegate is not even a property of slices anymore, + // so the D-Bus call fails with "InvalidArgs" error. + if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") || strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.InvalidArgs") { + canDelegate = false + } + } + } + + conn.StopUnit(slice, "testDelegate", nil) + } + once.Do(checkDelegate) properties := []systemdDbus.Property{ systemdDbus.PropDescription(fmt.Sprintf("cgroup %s", name)), systemdDbus.PropWants(slice), newProperty("DefaultDependencies", false), - newProperty("Delegate", true), newProperty("MemoryAccounting", true), newProperty("CPUAccounting", true), newProperty("BlockIOAccounting", true), } + + // If we can delegate, we add the property back in + if canDelegate { + properties = append(properties, newProperty("Delegate", true)) + } + ch := make(chan string) _, err = conn.StartTransientUnit(name, "replace", properties, ch) if err != nil { diff --git a/vendor/github.com/containerd/cgroups/utils.go b/vendor/github.com/containerd/cgroups/utils.go index 345be4e463..f3129b1a3a 100644 --- a/vendor/github.com/containerd/cgroups/utils.go +++ b/vendor/github.com/containerd/cgroups/utils.go @@ -111,7 +111,7 @@ func remove(path string) error { return fmt.Errorf("cgroups: unable to remove path %q", path) } -// readPids will read all the pids in a cgroup by the provided path +// readPids will read all the pids of processes in a cgroup by the provided path func readPids(path string, subsystem Name) ([]Process, error) { f, err := os.Open(filepath.Join(path, cgroupProcs)) if err != nil { @@ -138,6 +138,33 @@ func readPids(path string, subsystem Name) ([]Process, error) { return out, nil } +// readTasksPids will read all the pids of tasks in a cgroup by the provided path +func readTasksPids(path string, subsystem Name) ([]Task, error) { + f, err := os.Open(filepath.Join(path, cgroupTasks)) + if err != nil { + return nil, err + } + defer f.Close() + var ( + out []Task + s = bufio.NewScanner(f) + ) + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, Task{ + Pid: pid, + Subsystem: subsystem, + Path: path, + }) + } + } + return out, nil +} + func hugePageSizes() ([]string, error) { var ( pageSizes []string diff --git a/vendor/github.com/containerd/containerd/archive/tar.go b/vendor/github.com/containerd/containerd/archive/tar.go index 0d9e0e7501..fae023c557 100644 --- a/vendor/github.com/containerd/containerd/archive/tar.go +++ b/vendor/github.com/containerd/containerd/archive/tar.go @@ -194,7 +194,7 @@ func applyNaive(ctx context.Context, root string, tr *tar.Reader, options ApplyO parentPath = filepath.Dir(path) } if _, err := os.Lstat(parentPath); err != nil && os.IsNotExist(err) { - err = mkdirAll(parentPath, 0700) + err = mkdirAll(parentPath, 0755) if err != nil { return 0, err } diff --git a/vendor/github.com/containerd/containerd/image.go b/vendor/github.com/containerd/containerd/image.go index 62fba9de75..14bfea91b9 100644 --- a/vendor/github.com/containerd/containerd/image.go +++ b/vendor/github.com/containerd/containerd/image.go @@ -170,26 +170,22 @@ func (i *image) Unpack(ctx context.Context, snapshotterName string) error { chain = append(chain, layer.Diff.Digest) } - if unpacked { - desc, err := i.i.Config(ctx, cs, i.platform) - if err != nil { - return err - } - - rootfs := identity.ChainID(chain).String() - - cinfo := content.Info{ - Digest: desc.Digest, - Labels: map[string]string{ - fmt.Sprintf("containerd.io/gc.ref.snapshot.%s", snapshotterName): rootfs, - }, - } - if _, err := cs.Update(ctx, cinfo, fmt.Sprintf("labels.containerd.io/gc.ref.snapshot.%s", snapshotterName)); err != nil { - return err - } + desc, err := i.i.Config(ctx, cs, i.platform) + if err != nil { + return err } - return nil + rootfs := identity.ChainID(chain).String() + + cinfo := content.Info{ + Digest: desc.Digest, + Labels: map[string]string{ + fmt.Sprintf("containerd.io/gc.ref.snapshot.%s", snapshotterName): rootfs, + }, + } + + _, err = cs.Update(ctx, cinfo, fmt.Sprintf("labels.containerd.io/gc.ref.snapshot.%s", snapshotterName)) + return err } func (i *image) getLayers(ctx context.Context, platform platforms.MatchComparer) ([]rootfs.Layer, error) { diff --git a/vendor/github.com/containerd/containerd/metadata/gc.go b/vendor/github.com/containerd/containerd/metadata/gc.go index 99503fad7d..6afaa17729 100644 --- a/vendor/github.com/containerd/containerd/metadata/gc.go +++ b/vendor/github.com/containerd/containerd/metadata/gc.go @@ -64,6 +64,18 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error { // iterate through each namespace v1c := v1bkt.Cursor() + // cerr indicates the scan did not successfully send all + // the roots. The scan does not need to be cancelled but + // must return error at the end. + var cerr error + fn := func(n gc.Node) { + select { + case nc <- n: + case <-ctx.Done(): + cerr = ctx.Err() + } + } + for k, v := v1c.First(); k != nil; k, v = v1c.Next() { if v != nil { continue @@ -92,11 +104,7 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error { } } - select { - case nc <- gcnode(ResourceLease, ns, string(k)): - case <-ctx.Done(): - return ctx.Err() - } + fn(gcnode(ResourceLease, ns, string(k))) // Emit content and snapshots as roots instead of implementing // in references. Since leases cannot be referenced there is @@ -106,11 +114,7 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error { cbkt := libkt.Bucket(bucketKeyObjectContent) if cbkt != nil { if err := cbkt.ForEach(func(k, v []byte) error { - select { - case nc <- gcnode(ResourceContent, ns, string(k)): - case <-ctx.Done(): - return ctx.Err() - } + fn(gcnode(ResourceContent, ns, string(k))) return nil }); err != nil { return err @@ -126,11 +130,7 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error { snbkt := sbkt.Bucket(sk) return snbkt.ForEach(func(k, v []byte) error { - select { - case nc <- gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", sk, k)): - case <-ctx.Done(): - return ctx.Err() - } + fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", sk, k))) return nil }) }); err != nil { @@ -141,11 +141,7 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error { ibkt := libkt.Bucket(bucketKeyObjectIngests) if ibkt != nil { if err := ibkt.ForEach(func(k, v []byte) error { - select { - case nc <- gcnode(ResourceIngest, ns, string(k)): - case <-ctx.Done(): - return ctx.Err() - } + fn(gcnode(ResourceIngest, ns, string(k))) return nil }); err != nil { return err @@ -168,18 +164,9 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error { target := ibkt.Bucket(k).Bucket(bucketKeyTarget) if target != nil { contentKey := string(target.Get(bucketKeyDigest)) - select { - case nc <- gcnode(ResourceContent, ns, contentKey): - case <-ctx.Done(): - return ctx.Err() - } + fn(gcnode(ResourceContent, ns, contentKey)) } - return sendSnapshotRefs(ns, ibkt.Bucket(k), func(n gc.Node) { - select { - case nc <- n: - case <-ctx.Done(): - } - }) + return sendLabelRefs(ns, ibkt.Bucket(k), fn) }); err != nil { return err } @@ -200,11 +187,7 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error { if ea == nil || expThreshold.After(*ea) { return nil } - select { - case nc <- gcnode(ResourceIngest, ns, string(k)): - case <-ctx.Done(): - return ctx.Err() - } + fn(gcnode(ResourceIngest, ns, string(k))) return nil }); err != nil { return err @@ -216,7 +199,12 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error { if v != nil { return nil } - return sendRootRef(ctx, nc, gcnode(ResourceContent, ns, string(k)), cbkt.Bucket(k)) + + if isRootRef(cbkt.Bucket(k)) { + fn(gcnode(ResourceContent, ns, string(k))) + } + + return nil }); err != nil { return err } @@ -229,23 +217,15 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error { if v != nil { return nil } - snapshotter := string(cbkt.Bucket(k).Get(bucketKeySnapshotter)) + + cibkt := cbkt.Bucket(k) + snapshotter := string(cibkt.Get(bucketKeySnapshotter)) if snapshotter != "" { - ss := string(cbkt.Bucket(k).Get(bucketKeySnapshotKey)) - select { - case nc <- gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", snapshotter, ss)): - case <-ctx.Done(): - return ctx.Err() - } + ss := string(cibkt.Get(bucketKeySnapshotKey)) + fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", snapshotter, ss))) } - // TODO: Send additional snapshot refs through labels - return sendSnapshotRefs(ns, cbkt.Bucket(k), func(n gc.Node) { - select { - case nc <- n: - case <-ctx.Done(): - } - }) + return sendLabelRefs(ns, cibkt, fn) }); err != nil { return err } @@ -263,15 +243,17 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error { if v != nil { return nil } - - return sendRootRef(ctx, nc, gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", sk, k)), snbkt.Bucket(k)) + if isRootRef(snbkt.Bucket(k)) { + fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", sk, k))) + } + return nil }) }); err != nil { return err } } } - return nil + return cerr } func references(ctx context.Context, tx *bolt.Tx, node gc.Node, fn func(gc.Node)) error { @@ -282,10 +264,7 @@ func references(ctx context.Context, tx *bolt.Tx, node gc.Node, fn func(gc.Node) return nil } - if err := sendSnapshotRefs(node.Namespace, bkt, fn); err != nil { - return err - } - return sendContentRefs(node.Namespace, bkt, fn) + return sendLabelRefs(node.Namespace, bkt, fn) } else if node.Type == ResourceSnapshot { parts := strings.SplitN(node.Key, "/", 2) if len(parts) != 2 { @@ -304,7 +283,7 @@ func references(ctx context.Context, tx *bolt.Tx, node gc.Node, fn func(gc.Node) fn(gcnode(ResourceSnapshot, node.Namespace, fmt.Sprintf("%s/%s", ss, pv))) } - return sendSnapshotRefs(node.Namespace, bkt, fn) + return sendLabelRefs(node.Namespace, bkt, fn) } else if node.Type == ResourceIngest { // Send expected value bkt := getBucket(tx, bucketKeyVersion, []byte(node.Namespace), bucketKeyObjectContent, bucketKeyObjectIngests, []byte(node.Key)) @@ -456,25 +435,8 @@ func remove(ctx context.Context, tx *bolt.Tx, node gc.Node) error { return nil } -// sendSnapshotRefs sends all snapshot references referred to by the labels in the bkt -func sendSnapshotRefs(ns string, bkt *bolt.Bucket, fn func(gc.Node)) error { - lbkt := bkt.Bucket(bucketKeyObjectLabels) - if lbkt != nil { - lc := lbkt.Cursor() - - for k, v := lc.Seek(labelGCSnapRef); k != nil && strings.HasPrefix(string(k), string(labelGCSnapRef)); k, v = lc.Next() { - snapshotter := k[len(labelGCSnapRef):] - if i := bytes.IndexByte(snapshotter, '/'); i >= 0 { - snapshotter = snapshotter[:i] - } - fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", snapshotter, v))) - } - } - return nil -} - -// sendContentRefs sends all content references referred to by the labels in the bkt -func sendContentRefs(ns string, bkt *bolt.Bucket, fn func(gc.Node)) error { +// sendLabelRefs sends all snapshot and content references referred to by the labels in the bkt +func sendLabelRefs(ns string, bkt *bolt.Bucket, fn func(gc.Node)) error { lbkt := bkt.Bucket(bucketKeyObjectLabels) if lbkt != nil { lc := lbkt.Cursor() @@ -490,6 +452,15 @@ func sendContentRefs(ns string, bkt *bolt.Bucket, fn func(gc.Node)) error { fn(gcnode(ResourceContent, ns, string(v))) } + + for k, v := lc.Seek(labelGCSnapRef); k != nil && strings.HasPrefix(string(k), string(labelGCSnapRef)); k, v = lc.Next() { + snapshotter := k[len(labelGCSnapRef):] + if i := bytes.IndexByte(snapshotter, '/'); i >= 0 { + snapshotter = snapshotter[:i] + } + fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", snapshotter, v))) + } + } return nil } @@ -506,17 +477,6 @@ func isRootRef(bkt *bolt.Bucket) bool { return false } -func sendRootRef(ctx context.Context, nc chan<- gc.Node, n gc.Node, bkt *bolt.Bucket) error { - if isRootRef(bkt) { - select { - case nc <- n: - case <-ctx.Done(): - return ctx.Err() - } - } - return nil -} - func gcnode(t gc.ResourceType, ns, key string) gc.Node { return gc.Node{ Type: t, diff --git a/vendor/github.com/containerd/containerd/vendor.conf b/vendor/github.com/containerd/containerd/vendor.conf index 264e15b718..b2261cf463 100644 --- a/vendor/github.com/containerd/containerd/vendor.conf +++ b/vendor/github.com/containerd/containerd/vendor.conf @@ -1,6 +1,6 @@ github.com/containerd/go-runc 5a6d9f37cfa36b15efba46dc7ea349fa9b7143c3 github.com/containerd/console c12b1e7919c14469339a5d38f2f8ed9b64a9de23 -github.com/containerd/cgroups 5e610833b72089b37d0e615de9a92dfc043757c2 +github.com/containerd/cgroups dbea6f2bd41658b84b00417ceefa416b979cbf10 github.com/containerd/typeurl a93fcdb778cd272c6e9b3028b2f42d813e785d40 github.com/containerd/fifo 3d5202aec260678c48179c56f40e6f38a095738c github.com/containerd/btrfs 2e1aa0ddf94f91fa282b6ed87c23bf0d64911244 @@ -20,7 +20,7 @@ github.com/gogo/protobuf v1.0.0 github.com/gogo/googleapis 08a7655d27152912db7aaf4f983275eaf8d128ef github.com/golang/protobuf v1.1.0 github.com/opencontainers/runtime-spec eba862dc2470385a233c7507392675cbeadf7353 # v1.0.1-45-geba862d -github.com/opencontainers/runc 6635b4f0c6af3810594d2770f662f34ddc15b40d +github.com/opencontainers/runc 2b18fe1d885ee5083ef9f0838fee39b62d653e30 github.com/sirupsen/logrus v1.0.0 github.com/urfave/cli 7bc6a0acffa589f415f88aca16cc1de5ffd66f9c golang.org/x/net b3756b4b77d7b13260a0a2ec658753cf48922eac @@ -43,7 +43,7 @@ github.com/google/go-cmp v0.1.0 go.etcd.io/bbolt v1.3.1-etcd.8 # cri dependencies -github.com/containerd/cri da0c016c830b2ea97fd1d737c49a568a816bf964 # release/1.2 branch +github.com/containerd/cri a92c40017473cbe0239ce180125f12669757e44f # release/1.2 branch github.com/containerd/go-cni 40bcf8ec8acd7372be1d77031d585d5d8e561c90 github.com/blang/semver v3.1.0 github.com/containernetworking/cni v0.6.0 diff --git a/vendor/github.com/opencontainers/runc/README.md b/vendor/github.com/opencontainers/runc/README.md index e755fb7bcd..11fa4138b4 100644 --- a/vendor/github.com/opencontainers/runc/README.md +++ b/vendor/github.com/opencontainers/runc/README.md @@ -16,10 +16,9 @@ This means that `runc` 1.0.0 should implement the 1.0 version of the specificati You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page. -### Security +## Security -If you wish to report a security issue, please disclose the issue responsibly -to security@opencontainers.org. +Reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/) ## Building diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index ea571ad937..9717acc729 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -14,6 +14,7 @@ import ( "time" units "github.com/docker/go-units" + "golang.org/x/sys/unix" ) const ( @@ -463,11 +464,40 @@ func WriteCgroupProc(dir string, pid int) error { return fmt.Errorf("no such directory for %s", CgroupProcesses) } - // Don't attach any pid to the cgroup if -1 is specified as a pid - if pid != -1 { - if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil { - return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) - } + // Dont attach any pid to the cgroup if -1 is specified as a pid + if pid == -1 { + return nil + } + + cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700) + if err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + } + defer cgroupProcessesFile.Close() + + for i := 0; i < 5; i++ { + _, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid)) + if err == nil { + return nil + } + + // EINVAL might mean that the task being added to cgroup.procs is in state + // TASK_NEW. We should attempt to do so again. + if isEINVAL(err) { + time.Sleep(30 * time.Millisecond) + continue + } + + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + } + return err +} + +func isEINVAL(err error) bool { + switch err := err.(type) { + case *os.PathError: + return err.Err == unix.EINVAL + default: + return false } - return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c new file mode 100644 index 0000000000..b410e29517 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c @@ -0,0 +1,516 @@ +/* + * Copyright (C) 2019 Aleksa Sarai + * Copyright (C) 2019 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Use our own wrapper for memfd_create. */ +#if !defined(SYS_memfd_create) && defined(__NR_memfd_create) +# define SYS_memfd_create __NR_memfd_create +#endif +/* memfd_create(2) flags -- copied from . */ +#ifndef MFD_CLOEXEC +# define MFD_CLOEXEC 0x0001U +# define MFD_ALLOW_SEALING 0x0002U +#endif +int memfd_create(const char *name, unsigned int flags) +{ +#ifdef SYS_memfd_create + return syscall(SYS_memfd_create, name, flags); +#else + errno = ENOSYS; + return -1; +#endif +} + + +/* This comes directly from . */ +#ifndef F_LINUX_SPECIFIC_BASE +# define F_LINUX_SPECIFIC_BASE 1024 +#endif +#ifndef F_ADD_SEALS +# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) +#endif +#ifndef F_SEAL_SEAL +# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +# define F_SEAL_GROW 0x0004 /* prevent file from growing */ +# define F_SEAL_WRITE 0x0008 /* prevent writes */ +#endif + +#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY" +#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" +#define RUNC_MEMFD_SEALS \ + (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) + +static void *must_realloc(void *ptr, size_t size) +{ + void *old = ptr; + do { + ptr = realloc(old, size); + } while(!ptr); + return ptr; +} + +/* + * Verify whether we are currently in a self-cloned program (namely, is + * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather + * for shmem files), and we want to be sure it's actually sealed. + */ +static int is_self_cloned(void) +{ + int fd, ret, is_cloned = 0; + struct stat statbuf = {}; + struct statfs fsbuf = {}; + + fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -ENOTRECOVERABLE; + + /* + * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for + * this, because you cannot write to a sealed memfd no matter what (so + * sharing it isn't a bad thing -- and an admin could bind-mount a sealed + * memfd to /usr/bin/runc to allow re-use). + */ + ret = fcntl(fd, F_GET_SEALS); + if (ret >= 0) { + is_cloned = (ret == RUNC_MEMFD_SEALS); + goto out; + } + + /* + * All other forms require CLONED_BINARY_ENV, since they are potentially + * writeable (or we can't tell if they're fully safe) and thus we must + * check the environment as an extra layer of defence. + */ + if (!getenv(CLONED_BINARY_ENV)) { + is_cloned = false; + goto out; + } + + /* + * Is the binary on a read-only filesystem? We can't detect bind-mounts in + * particular (in-kernel they are identical to regular mounts) but we can + * at least be sure that it's read-only. In addition, to make sure that + * it's *our* bind-mount we check CLONED_BINARY_ENV. + */ + if (fstatfs(fd, &fsbuf) >= 0) + is_cloned |= (fsbuf.f_flags & MS_RDONLY); + + /* + * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 + * which appears to have a borked backport of F_GET_SEALS. Either way, + * having a file which has no hardlinks indicates that we aren't using + * a host-side "runc" binary and this is something that a container + * cannot fake (because unlinking requires being able to resolve the + * path that you want to unlink). + */ + if (fstat(fd, &statbuf) >= 0) + is_cloned |= (statbuf.st_nlink == 0); + +out: + close(fd); + return is_cloned; +} + +/* Read a given file into a new buffer, and providing the length. */ +static char *read_file(char *path, size_t *length) +{ + int fd; + char buf[4096], *copy = NULL; + + if (!length) + return NULL; + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return NULL; + + *length = 0; + for (;;) { + ssize_t n; + + n = read(fd, buf, sizeof(buf)); + if (n < 0) + goto error; + if (!n) + break; + + copy = must_realloc(copy, (*length + n) * sizeof(*copy)); + memcpy(copy + *length, buf, n); + *length += n; + } + close(fd); + return copy; + +error: + close(fd); + free(copy); + return NULL; +} + +/* + * A poor-man's version of "xargs -0". Basically parses a given block of + * NUL-delimited data, within the given length and adds a pointer to each entry + * to the array of pointers. + */ +static int parse_xargs(char *data, int data_length, char ***output) +{ + int num = 0; + char *cur = data; + + if (!data || *output != NULL) + return -1; + + while (cur < data + data_length) { + num++; + *output = must_realloc(*output, (num + 1) * sizeof(**output)); + (*output)[num - 1] = cur; + cur += strlen(cur) + 1; + } + (*output)[num] = NULL; + return num; +} + +/* + * "Parse" out argv from /proc/self/cmdline. + * This is necessary because we are running in a context where we don't have a + * main() that we can just get the arguments from. + */ +static int fetchve(char ***argv) +{ + char *cmdline = NULL; + size_t cmdline_size; + + cmdline = read_file("/proc/self/cmdline", &cmdline_size); + if (!cmdline) + goto error; + + if (parse_xargs(cmdline, cmdline_size, argv) <= 0) + goto error; + + return 0; + +error: + free(cmdline); + return -EINVAL; +} + +enum { + EFD_NONE = 0, + EFD_MEMFD, + EFD_FILE, +}; + +/* + * This comes from . We can't hard-code __O_TMPFILE because it + * changes depending on the architecture. If we don't have O_TMPFILE we always + * have the mkostemp(3) fallback. + */ +#ifndef O_TMPFILE +# if defined(__O_TMPFILE) && defined(O_DIRECTORY) +# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) +# endif +#endif + +static int make_execfd(int *fdtype) +{ + int fd = -1; + char template[PATH_MAX] = {0}; + char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return -1; + + /* + * Now try memfd, it's much nicer than actually creating a file in STATEDIR + * since it's easily detected thanks to sealing and also doesn't require + * assumptions about STATEDIR. + */ + *fdtype = EFD_MEMFD; + fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); + if (fd >= 0) + return fd; + if (errno != ENOSYS && errno != EINVAL) + goto error; + +#ifdef O_TMPFILE + /* + * Try O_TMPFILE to avoid races where someone might snatch our file. Note + * that O_EXCL isn't actually a security measure here (since you can just + * fd re-open it and clear O_EXCL). + */ + *fdtype = EFD_FILE; + fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); + if (fd >= 0) { + struct stat statbuf = {}; + bool working_otmpfile = false; + + /* + * open(2) ignores unknown O_* flags -- yeah, I was surprised when I + * found this out too. As a result we can't check for EINVAL. However, + * if we get nlink != 0 (or EISDIR) then we know that this kernel + * doesn't support O_TMPFILE. + */ + if (fstat(fd, &statbuf) >= 0) + working_otmpfile = (statbuf.st_nlink == 0); + + if (working_otmpfile) + return fd; + + /* Pretend that we got EISDIR since O_TMPFILE failed. */ + close(fd); + errno = EISDIR; + } + if (errno != EISDIR) + goto error; +#endif /* defined(O_TMPFILE) */ + + /* + * Our final option is to create a temporary file the old-school way, and + * then unlink it so that nothing else sees it by accident. + */ + *fdtype = EFD_FILE; + fd = mkostemp(template, O_CLOEXEC); + if (fd >= 0) { + if (unlink(template) >= 0) + return fd; + close(fd); + } + +error: + *fdtype = EFD_NONE; + return -1; +} + +static int seal_execfd(int *fd, int fdtype) +{ + switch (fdtype) { + case EFD_MEMFD: + return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS); + case EFD_FILE: { + /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ + int newfd; + char fdpath[PATH_MAX] = {0}; + + if (fchmod(*fd, 0100) < 0) + return -1; + + if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) + return -1; + + newfd = open(fdpath, O_PATH | O_CLOEXEC); + if (newfd < 0) + return -1; + + close(*fd); + *fd = newfd; + return 0; + } + default: + break; + } + return -1; +} + +static int try_bindfd(void) +{ + int fd, ret = -1; + char template[PATH_MAX] = {0}; + char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return ret; + + /* + * We need somewhere to mount it, mounting anything over /proc/self is a + * BAD idea on the host -- even if we do it temporarily. + */ + fd = mkstemp(template); + if (fd < 0) + return ret; + close(fd); + + /* + * For obvious reasons this won't work in rootless mode because we haven't + * created a userns+mntns -- but getting that to work will be a bit + * complicated and it's only worth doing if someone actually needs it. + */ + ret = -EPERM; + if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0) + goto out; + if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) + goto out_umount; + + + /* Get read-only handle that we're sure can't be made read-write. */ + ret = open(template, O_PATH | O_CLOEXEC); + +out_umount: + /* + * Make sure the MNT_DETACH works, otherwise we could get remounted + * read-write and that would be quite bad (the fd would be made read-write + * too, invalidating the protection). + */ + if (umount2(template, MNT_DETACH) < 0) { + if (ret >= 0) + close(ret); + ret = -ENOTRECOVERABLE; + } + +out: + /* + * We don't care about unlink errors, the worst that happens is that + * there's an empty file left around in STATEDIR. + */ + unlink(template); + return ret; +} + +static ssize_t fd_to_fd(int outfd, int infd) +{ + ssize_t total = 0; + char buffer[4096]; + + for (;;) { + ssize_t nread, nwritten = 0; + + nread = read(infd, buffer, sizeof(buffer)); + if (nread < 0) + return -1; + if (!nread) + break; + + do { + ssize_t n = write(outfd, buffer + nwritten, nread - nwritten); + if (n < 0) + return -1; + nwritten += n; + } while(nwritten < nread); + + total += nwritten; + } + + return total; +} + +static int clone_binary(void) +{ + int binfd, execfd; + struct stat statbuf = {}; + size_t sent = 0; + int fdtype = EFD_NONE; + + /* + * Before we resort to copying, let's try creating an ro-binfd in one shot + * by getting a handle for a read-only bind-mount of the execfd. + */ + execfd = try_bindfd(); + if (execfd >= 0) + return execfd; + + /* + * Dammit, that didn't work -- time to copy the binary to a safe place we + * can seal the contents. + */ + execfd = make_execfd(&fdtype); + if (execfd < 0 || fdtype == EFD_NONE) + return -ENOTRECOVERABLE; + + binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); + if (binfd < 0) + goto error; + + if (fstat(binfd, &statbuf) < 0) + goto error_binfd; + + while (sent < statbuf.st_size) { + int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); + if (n < 0) { + /* sendfile can fail so we fallback to a dumb user-space copy. */ + n = fd_to_fd(execfd, binfd); + if (n < 0) + goto error_binfd; + } + sent += n; + } + close(binfd); + if (sent != statbuf.st_size) + goto error; + + if (seal_execfd(&execfd, fdtype) < 0) + goto error; + + return execfd; + +error_binfd: + close(binfd); +error: + close(execfd); + return -EIO; +} + +/* Get cheap access to the environment. */ +extern char **environ; + +int ensure_cloned_binary(void) +{ + int execfd; + char **argv = NULL; + + /* Check that we're not self-cloned, and if we are then bail. */ + int cloned = is_self_cloned(); + if (cloned > 0 || cloned == -ENOTRECOVERABLE) + return cloned; + + if (fetchve(&argv) < 0) + return -EINVAL; + + execfd = clone_binary(); + if (execfd < 0) + return -EIO; + + if (putenv(CLONED_BINARY_ENV "=1")) + goto error; + + fexecve(execfd, argv, environ); +error: + close(execfd); + return -ENOEXEC; +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c index 28269dfc02..7750af35ea 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -534,6 +534,9 @@ void join_namespaces(char *nslist) free(namespaces); } +/* Defined in cloned_binary.c. */ +extern int ensure_cloned_binary(void); + void nsexec(void) { int pipenum; @@ -549,6 +552,14 @@ void nsexec(void) if (pipenum == -1) return; + /* + * We need to re-exec if we are not in a cloned binary. This is necessary + * to ensure that containers won't be able to access the host binary + * through /proc/self/exe. See CVE-2019-5736. + */ + if (ensure_cloned_binary() < 0) + bail("could not ensure we are a cloned binary"); + /* Parse all of the netlink configuration. */ nl_parse(pipenum, &config); diff --git a/vendor/github.com/opencontainers/runc/vendor.conf b/vendor/github.com/opencontainers/runc/vendor.conf index fadbe07071..3f5a35115b 100644 --- a/vendor/github.com/opencontainers/runc/vendor.conf +++ b/vendor/github.com/opencontainers/runc/vendor.conf @@ -1,8 +1,9 @@ # OCI runtime-spec. When updating this, make sure you use a version tag rather # than a commit ID so it's much more obvious what version of the spec we are # using. -github.com/opencontainers/runtime-spec 5684b8af48c1ac3b1451fa499724e30e3c20a294 +github.com/opencontainers/runtime-spec 29686dbc5559d93fb1ef402eeda3e35c38d75af4 # Core libcontainer functionality. +github.com/checkpoint-restore/go-criu v3.11 github.com/mrunalp/fileutils ed869b029674c0e9ce4c0dfa781405c2d9946d08 github.com/opencontainers/selinux v1.0.0-rc1 github.com/seccomp/libseccomp-golang 84e90a91acea0f4e51e62bc1a75de18b1fc0790f @@ -18,7 +19,7 @@ github.com/golang/protobuf 18c9bb3261723cd5401db4d0c9fbc5c3b6c70fe8 github.com/cyphar/filepath-securejoin v0.2.1 github.com/docker/go-units v0.2.0 github.com/urfave/cli d53eb991652b1d438abdd34ce4bfa3ef1539108e -golang.org/x/sys 7ddbeae9ae08c6a06a59597f0c9edbc5ff2444ce https://github.com/golang/sys +golang.org/x/sys 41f3e6584952bb034a481797859f6ab34b6803bd https://github.com/golang/sys # console dependencies github.com/containerd/console 2748ece16665b45a47f884001d5831ec79703880