diff --git a/vendor.mod b/vendor.mod index d4a628475b..fed7ee0873 100644 --- a/vendor.mod +++ b/vendor.mod @@ -77,7 +77,7 @@ require ( github.com/morikuni/aec v1.0.0 github.com/opencontainers/go-digest v1.0.0 github.com/opencontainers/image-spec v1.1.0-rc5 - github.com/opencontainers/runc v1.1.12 + github.com/opencontainers/runc v1.2.0-rc.1 github.com/opencontainers/runtime-spec v1.2.0 github.com/opencontainers/selinux v1.11.0 github.com/pelletier/go-toml v1.9.5 diff --git a/vendor.sum b/vendor.sum index 8b8f04a15b..9900b6d503 100644 --- a/vendor.sum +++ b/vendor.sum @@ -539,8 +539,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0-rc5 h1:Ygwkfw9bpDvs+c9E34SdgGOj41dX/cbdlwvlWt0pnFI= github.com/opencontainers/image-spec v1.1.0-rc5/go.mod h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8= -github.com/opencontainers/runc v1.1.12 h1:BOIssBaW1La0/qbNZHXOOa71dZfZEQOzW7dqQf3phss= -github.com/opencontainers/runc v1.1.12/go.mod h1:S+lQwSfncpBha7XTy/5lBwWgm5+y5Ma/O44Ekby9FK8= +github.com/opencontainers/runc v1.2.0-rc.1 h1:SMjop2pxxYRTfKdsigna/8xRoaoCfIQfD2cVuOb64/o= +github.com/opencontainers/runc v1.2.0-rc.1/go.mod h1:m9JwxfHzXz5YTTXBQr7EY9KTuazFAGPyMQx2nRR3vTw= github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk= github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= diff --git a/vendor/github.com/opencontainers/runc/NOTICE b/vendor/github.com/opencontainers/runc/NOTICE index 5c97abce4b..c29775c0d9 100644 --- a/vendor/github.com/opencontainers/runc/NOTICE +++ b/vendor/github.com/opencontainers/runc/NOTICE @@ -8,9 +8,9 @@ The following is courtesy of our legal counsel: Use and transfer of Docker may be subject to certain restrictions by the -United States and other governments. +United States and other governments. It is your responsibility to ensure that your use and/or transfer does not -violate applicable laws. +violate applicable laws. For more information, please see http://www.bis.doc.gov diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go index ba2b2266c9..b9ba889b7a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go @@ -1,9 +1,24 @@ package cgroups import ( + "errors" + "github.com/opencontainers/runc/libcontainer/configs" ) +var ( + // ErrDevicesUnsupported is an error returned when a cgroup manager + // is not configured to set device rules. + ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules") + + // DevicesSetV1 and DevicesSetV2 are functions to set devices for + // cgroup v1 and v2, respectively. Unless libcontainer/cgroups/devices + // package is imported, it is set to nil, so cgroup managers can't + // manage devices. + DevicesSetV1 func(path string, r *configs.Resources) error + DevicesSetV2 func(path string, r *configs.Resources) error +) + type Manager interface { // Apply creates a cgroup, if not yet created, and adds a process // with the specified pid into that cgroup. A special value of -1 diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go index f6e1b73bd9..16aae5a3b7 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go @@ -50,24 +50,13 @@ func WriteFile(dir, file, data string) error { return err } defer fd.Close() - if err := retryingWriteFile(fd, data); err != nil { + if _, err := fd.WriteString(data); err != nil { // Having data in the error message helps in debugging. return fmt.Errorf("failed to write %q: %w", data, err) } return nil } -func retryingWriteFile(fd *os.File, data string) error { - for { - _, err := fd.Write([]byte(data)) - if errors.Is(err, unix.EINTR) { - logrus.Infof("interrupted while writing %s to %s", data, fd.Name()) - continue - } - return err - } -} - const ( cgroupfsDir = "/sys/fs/cgroup" cgroupfsPrefix = cgroupfsDir + "/" @@ -90,7 +79,7 @@ func prepareOpenat2() error { }) if err != nil { prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} - if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare + if err != unix.ENOSYS { logrus.Warnf("falling back to securejoin: %s", prepErr) } else { logrus.Debug("openat2 not available, falling back to securejoin") @@ -148,8 +137,9 @@ func openFile(dir, file string, flags int) (*os.File, error) { // // TODO: if such usage will ever be common, amend this // to reopen cgroupRootHandle and retry openat2. - fdStr := strconv.Itoa(int(cgroupRootHandle.Fd())) - fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr) + fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(int(cgroupRootHandle.Fd()))) + defer closer() + fdDest, _ := os.Readlink(fdPath) if fdDest != cgroupfsDir { // Wrap the error so it is clear that cgroupRootHandle // is opened to an unexpected/wrong directory. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go index 0d8371b05f..b475567d82 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go @@ -32,9 +32,22 @@ type CpuUsage struct { UsageInUsermode uint64 `json:"usage_in_usermode"` } +type PSIData struct { + Avg10 float64 `json:"avg10"` + Avg60 float64 `json:"avg60"` + Avg300 float64 `json:"avg300"` + Total uint64 `json:"total"` +} + +type PSIStats struct { + Some PSIData `json:"some,omitempty"` + Full PSIData `json:"full,omitempty"` +} + type CpuStats struct { CpuUsage CpuUsage `json:"cpu_usage,omitempty"` ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` + PSI *PSIStats `json:"psi,omitempty"` } type CPUSetStats struct { @@ -91,6 +104,7 @@ type MemoryStats struct { UseHierarchy bool `json:"use_hierarchy"` Stats map[string]uint64 `json:"stats,omitempty"` + PSI *PSIStats `json:"psi,omitempty"` } type PageUsageByNUMA struct { @@ -135,6 +149,7 @@ type BlkioStats struct { IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"` IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"` SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"` + PSI *PSIStats `json:"psi,omitempty"` } type HugetlbStats struct { @@ -157,6 +172,13 @@ type RdmaStats struct { RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"` } +type MiscStats struct { + // current resource usage for a key in misc + Usage uint64 `json:"usage,omitempty"` + // number of times the resource usage was about to go over the max boundary + Events uint64 `json:"events,omitempty"` +} + type Stats struct { CpuStats CpuStats `json:"cpu_stats,omitempty"` CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` @@ -166,10 +188,13 @@ type Stats struct { // the map is in the format "size of hugepage: stats of the hugepage" HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` RdmaStats RdmaStats `json:"rdma_stats,omitempty"` + // the map is in the format "misc resource name: stats of the key" + MiscStats map[string]MiscStats `json:"misc_stats,omitempty"` } func NewStats() *Stats { memoryStats := MemoryStats{Stats: make(map[string]uint64)} hugetlbStats := make(map[string]HugetlbStats) - return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats} + miscStats := make(map[string]MiscStats) + return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats} } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index fc4ae44a48..186cbc6413 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -36,13 +36,13 @@ func IsCgroup2UnifiedMode() bool { var st unix.Statfs_t err := unix.Statfs(unifiedMountpoint, &st) if err != nil { + level := logrus.WarnLevel if os.IsNotExist(err) && userns.RunningInUserNS() { - // ignore the "not found" error if running in userns - logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint) - isUnified = false - return + // For rootless containers, sweep it under the rug. + level = logrus.DebugLevel } - panic(fmt.Sprintf("cannot statfs cgroup root: %s", err)) + logrus.StandardLogger().Logf(level, + "statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err) } isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC }) @@ -217,21 +217,26 @@ func PathExists(path string) bool { return true } -func EnterPid(cgroupPaths map[string]string, pid int) error { - for _, path := range cgroupPaths { - if PathExists(path) { - if err := WriteCgroupProc(path, pid); err != nil { - return err - } - } - } - return nil -} +// rmdir tries to remove a directory, optionally retrying on EBUSY. +func rmdir(path string, retry bool) error { + delay := time.Millisecond + tries := 10 -func rmdir(path string) error { +again: err := unix.Rmdir(path) - if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare + switch err { // nolint:errorlint // unix errors are bare + case nil, unix.ENOENT: return nil + case unix.EINTR: + goto again + case unix.EBUSY: + if retry && tries > 0 { + time.Sleep(delay) + delay *= 2 + tries-- + goto again + + } } return &os.PathError{Op: "rmdir", Path: path, Err: err} } @@ -239,68 +244,42 @@ func rmdir(path string) error { // RemovePath aims to remove cgroup path. It does so recursively, // by removing any subdirectories (sub-cgroups) first. func RemovePath(path string) error { - // try the fast path first - if err := rmdir(path); err == nil { + // Try the fast path first. + if err := rmdir(path, false); err == nil { return nil } infos, err := os.ReadDir(path) - if err != nil { - if os.IsNotExist(err) { - err = nil - } + if err != nil && !os.IsNotExist(err) { return err } for _, info := range infos { if info.IsDir() { - // We should remove subcgroups dir first + // We should remove subcgroup first. if err = RemovePath(filepath.Join(path, info.Name())); err != nil { break } } } if err == nil { - err = rmdir(path) + err = rmdir(path, true) } return err } // RemovePaths iterates over the provided paths removing them. -// We trying to remove all paths five times with increasing delay between tries. -// If after all there are not removed cgroups - appropriate error will be -// returned. func RemovePaths(paths map[string]string) (err error) { - const retries = 5 - delay := 10 * time.Millisecond - for i := 0; i < retries; i++ { - if i != 0 { - time.Sleep(delay) - delay *= 2 - } - for s, p := range paths { - if err := RemovePath(p); err != nil { - // do not log intermediate iterations - switch i { - case 0: - logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)") - case retries - 1: - logrus.WithError(err).Error("Failed to remove cgroup") - } - } - _, err := os.Stat(p) - // We need this strange way of checking cgroups existence because - // RemoveAll almost always returns error, even on already removed - // cgroups - if os.IsNotExist(err) { - delete(paths, s) - } - } - if len(paths) == 0 { - //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 - paths = make(map[string]string) - return nil + for s, p := range paths { + if err := RemovePath(p); err == nil { + delete(paths, s) } } + if len(paths) == 0 { + //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 + // TODO: switch to clear once Go < 1.21 is not supported. + paths = make(map[string]string) + return nil + } return fmt.Errorf("Failed to remove paths: %v", paths) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go index 47c75f22b4..81193e2098 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go @@ -99,11 +99,12 @@ func tryDefaultPath(cgroupPath, subsystem string) string { // expensive), so it is assumed that cgroup mounts are not being changed. func readCgroupMountinfo() ([]*mountinfo.Info, error) { readMountinfoOnce.Do(func() { + // mountinfo.GetMounts uses /proc/thread-self, so we can use it without + // issues. cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts( mountinfo.FSTypeFilter("cgroup"), ) }) - return cgroupMountinfo, readMountinfoErr } @@ -196,6 +197,9 @@ func getCgroupMountsV1(all bool) ([]Mount, error) { return nil, err } + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return nil, err @@ -214,6 +218,10 @@ func GetOwnCgroup(subsystem string) (string, error) { if IsCgroup2UnifiedMode() { return "", errUnified } + + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return "", err @@ -236,27 +244,6 @@ func GetOwnCgroupPath(subsystem string) (string, error) { return getCgroupPathHelper(subsystem, cgroup) } -func GetInitCgroup(subsystem string) (string, error) { - if IsCgroup2UnifiedMode() { - return "", errUnified - } - cgroups, err := ParseCgroupFile("/proc/1/cgroup") - if err != nil { - return "", err - } - - return getControllerPath(subsystem, cgroups) -} - -func GetInitCgroupPath(subsystem string) (string, error) { - cgroup, err := GetInitCgroup(subsystem) - if err != nil { - return "", err - } - - return getCgroupPathHelper(subsystem, cgroup) -} - func getCgroupPathHelper(subsystem, cgroup string) (string, error) { mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) if err != nil { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go index fa195bf90f..865344f99c 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go @@ -2,8 +2,8 @@ package configs import "fmt" -// blockIODevice holds major:minor format supported in blkio cgroup -type blockIODevice struct { +// BlockIODevice holds major:minor format supported in blkio cgroup. +type BlockIODevice struct { // Major is the device's major number Major int64 `json:"major"` // Minor is the device's minor number @@ -12,7 +12,7 @@ type blockIODevice struct { // WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair type WeightDevice struct { - blockIODevice + BlockIODevice // Weight is the bandwidth rate for the device, range is from 10 to 1000 Weight uint16 `json:"weight"` // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only @@ -41,7 +41,7 @@ func (wd *WeightDevice) LeafWeightString() string { // ThrottleDevice struct holds a `major:minor rate_per_second` pair type ThrottleDevice struct { - blockIODevice + BlockIODevice // Rate is the IO rate limit per cgroup per device Rate uint64 `json:"rate"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go index 2d4a898710..4a34cf76fc 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go @@ -69,6 +69,9 @@ type Resources struct { // CPU hardcap limit (in usecs). Allowed cpu time in a given period. CpuQuota int64 `json:"cpu_quota"` + // CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a given period. + CpuBurst *uint64 `json:"cpu_burst"` //nolint:revive + // CPU period to be used for hardcapping (in usecs). 0 to use system default. CpuPeriod uint64 `json:"cpu_period"` @@ -84,6 +87,9 @@ type Resources struct { // MEM to use CpusetMems string `json:"cpuset_mems"` + // cgroup SCHED_IDLE + CPUIdle *int64 `json:"cpu_idle,omitempty"` + // Process limit; set <= `0' to disable limit. PidsLimit int64 `json:"pids_limit"` @@ -155,4 +161,9 @@ type Resources struct { // during Set() to figure out whether the freeze is required. Those // methods may be relatively slow, thus this flag. SkipFreezeOnSet bool `json:"-"` + + // MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check + // if the new memory limits (Memory and MemorySwap) being set are lower + // than the current memory usage, and reject if so. + MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 6ebf5ec7b6..22fe0f9b4c 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -8,6 +8,7 @@ import ( "time" "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runtime-spec/specs-go" @@ -31,12 +32,13 @@ type IDMap struct { // for syscalls. Additional architectures can be added by specifying them in // Architectures. type Seccomp struct { - DefaultAction Action `json:"default_action"` - Architectures []string `json:"architectures"` - Syscalls []*Syscall `json:"syscalls"` - DefaultErrnoRet *uint `json:"default_errno_ret"` - ListenerPath string `json:"listener_path,omitempty"` - ListenerMetadata string `json:"listener_metadata,omitempty"` + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Flags []specs.LinuxSeccompFlag `json:"flags"` + Syscalls []*Syscall `json:"syscalls"` + DefaultErrnoRet *uint `json:"default_errno_ret"` + ListenerPath string `json:"listener_path,omitempty"` + ListenerMetadata string `json:"listener_metadata,omitempty"` } // Action is taken upon rule match in Seccomp @@ -83,9 +85,6 @@ type Syscall struct { Args []*Arg `json:"args"` } -// TODO Windows. Many of these fields should be factored out into those parts -// which are common across platforms, and those which are platform specific. - // Config defines configuration options for executing a process inside a contained environment. type Config struct { // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs @@ -121,6 +120,9 @@ type Config struct { // Hostname optionally sets the container's hostname if provided Hostname string `json:"hostname"` + // Domainname optionally sets the container's domainname if provided + Domainname string `json:"domainname"` + // Namespaces specifies the container's namespaces that it should setup when cloning the init process // If a namespace is not provided that namespace is shared from the container's parent process Namespaces Namespaces `json:"namespaces"` @@ -158,11 +160,11 @@ type Config struct { // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ OomScoreAdj *int `json:"oom_score_adj,omitempty"` - // UidMappings is an array of User ID mappings for User Namespaces - UidMappings []IDMap `json:"uid_mappings"` + // UIDMappings is an array of User ID mappings for User Namespaces + UIDMappings []IDMap `json:"uid_mappings"` - // GidMappings is an array of Group ID mappings for User Namespaces - GidMappings []IDMap `json:"gid_mappings"` + // GIDMappings is an array of Group ID mappings for User Namespaces + GIDMappings []IDMap `json:"gid_mappings"` // MaskPaths specifies paths within the container's rootfs to mask over with a bind // mount pointing to /dev/null as to prevent reads of the file. @@ -211,8 +213,87 @@ type Config struct { // RootlessCgroups is set when unlikely to have the full access to cgroups. // When RootlessCgroups is set, cgroups errors are ignored. RootlessCgroups bool `json:"rootless_cgroups,omitempty"` + + // TimeOffsets specifies the offset for supporting time namespaces. + TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"` + + // Scheduler represents the scheduling attributes for a process. + Scheduler *Scheduler `json:"scheduler,omitempty"` + + // Personality contains configuration for the Linux personality syscall. + Personality *LinuxPersonality `json:"personality,omitempty"` + + // IOPriority is the container's I/O priority. + IOPriority *IOPriority `json:"io_priority,omitempty"` } +// Scheduler is based on the Linux sched_setattr(2) syscall. +type Scheduler = specs.Scheduler + +// ToSchedAttr is to convert *configs.Scheduler to *unix.SchedAttr. +func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) { + var policy uint32 + switch scheduler.Policy { + case specs.SchedOther: + policy = 0 + case specs.SchedFIFO: + policy = 1 + case specs.SchedRR: + policy = 2 + case specs.SchedBatch: + policy = 3 + case specs.SchedISO: + policy = 4 + case specs.SchedIdle: + policy = 5 + case specs.SchedDeadline: + policy = 6 + default: + return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy) + } + + var flags uint64 + for _, flag := range scheduler.Flags { + switch flag { + case specs.SchedFlagResetOnFork: + flags |= 0x01 + case specs.SchedFlagReclaim: + flags |= 0x02 + case specs.SchedFlagDLOverrun: + flags |= 0x04 + case specs.SchedFlagKeepPolicy: + flags |= 0x08 + case specs.SchedFlagKeepParams: + flags |= 0x10 + case specs.SchedFlagUtilClampMin: + flags |= 0x20 + case specs.SchedFlagUtilClampMax: + flags |= 0x40 + default: + return nil, fmt.Errorf("invalid scheduler flag: %s", flag) + } + } + + return &unix.SchedAttr{ + Size: unix.SizeofSchedAttr, + Policy: policy, + Flags: flags, + Nice: scheduler.Nice, + Priority: uint32(scheduler.Priority), + Runtime: scheduler.Runtime, + Deadline: scheduler.Deadline, + Period: scheduler.Period, + }, nil +} + +var IOPrioClassMapping = map[specs.IOPriorityClass]int{ + specs.IOPRIO_CLASS_RT: 1, + specs.IOPRIO_CLASS_BE: 2, + specs.IOPRIO_CLASS_IDLE: 3, +} + +type IOPriority = specs.LinuxIOPriority + type ( HookName string HookList []Hook @@ -277,6 +358,7 @@ type Capabilities struct { Ambient []string } +// Deprecated: use (Hooks).Run instead. func (hooks HookList) RunHooks(state *specs.State) error { for i, h := range hooks { if err := h.Run(state); err != nil { @@ -333,6 +415,18 @@ func (hooks *Hooks) MarshalJSON() ([]byte, error) { }) } +// Run executes all hooks for the given hook name. +func (hooks Hooks) Run(name HookName, state *specs.State) error { + list := hooks[name] + for i, h := range list { + if err := h.Run(state); err != nil { + return fmt.Errorf("error running %s hook #%d: %w", name, i, err) + } + } + + return nil +} + type Hook interface { // Run executes the hook with the provided state. Run(*specs.State) error @@ -393,7 +487,7 @@ func (c Command) Run(s *specs.State) error { go func() { err := cmd.Wait() if err != nil { - err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) + err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) } errC <- err }() diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go index 51fe940748..e401f5331b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go @@ -7,22 +7,33 @@ import ( ) var ( - errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.") - errNoUserMap = errors.New("User namespaces enabled, but no user mapping found.") - errNoGIDMap = errors.New("User namespaces enabled, but no gid mappings found.") - errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.") + errNoUIDMap = errors.New("user namespaces enabled, but no uid mappings found") + errNoGIDMap = errors.New("user namespaces enabled, but no gid mappings found") ) +// Please check https://man7.org/linux/man-pages/man2/personality.2.html for const details. +// https://raw.githubusercontent.com/torvalds/linux/master/include/uapi/linux/personality.h +const ( + PerLinux = 0x0000 + PerLinux32 = 0x0008 +) + +type LinuxPersonality struct { + // Domain for the personality + // can only contain values "LINUX" and "LINUX32" + Domain int `json:"domain"` +} + // HostUID gets the translated uid for the process on host which could be // different when user namespaces are enabled. func (c Config) HostUID(containerId int) (int, error) { if c.Namespaces.Contains(NEWUSER) { - if c.UidMappings == nil { + if len(c.UIDMappings) == 0 { return -1, errNoUIDMap } - id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings) + id, found := c.hostIDFromMapping(int64(containerId), c.UIDMappings) if !found { - return -1, errNoUserMap + return -1, fmt.Errorf("user namespaces enabled, but no mapping found for uid %d", containerId) } // If we are a 32-bit binary running on a 64-bit system, it's possible // the mapped user is too large to store in an int, which means we @@ -47,12 +58,12 @@ func (c Config) HostRootUID() (int, error) { // different when user namespaces are enabled. func (c Config) HostGID(containerId int) (int, error) { if c.Namespaces.Contains(NEWUSER) { - if c.GidMappings == nil { + if len(c.GIDMappings) == 0 { return -1, errNoGIDMap } - id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings) + id, found := c.hostIDFromMapping(int64(containerId), c.GIDMappings) if !found { - return -1, errNoGroupMap + return -1, fmt.Errorf("user namespaces enabled, but no mapping found for gid %d", containerId) } // If we are a 32-bit binary running on a 64-bit system, it's possible // the mapped user is too large to store in an int, which means we diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go index 784c618205..bfd356e497 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go @@ -1,48 +1,7 @@ package configs -import "golang.org/x/sys/unix" - const ( // EXT_COPYUP is a directive to copy up the contents of a directory when // a tmpfs is mounted over it. - EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning + EXT_COPYUP = 1 << iota //nolint:golint,revive // ignore "don't use ALL_CAPS" warning ) - -type Mount struct { - // Source path for the mount. - Source string `json:"source"` - - // Destination path for the mount inside the container. - Destination string `json:"destination"` - - // Device the mount is for. - Device string `json:"device"` - - // Mount flags. - Flags int `json:"flags"` - - // Propagation Flags - PropagationFlags []int `json:"propagation_flags"` - - // Mount data applied to the mount. - Data string `json:"data"` - - // Relabel source if set, "z" indicates shared, "Z" indicates unshared. - Relabel string `json:"relabel"` - - // RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2). - RecAttr *unix.MountAttr `json:"rec_attr"` - - // Extensions are additional flags that are specific to runc. - Extensions int `json:"extensions"` - - // Optional Command to be run before Source is mounted. - PremountCmds []Command `json:"premount_cmds"` - - // Optional Command to be run after Source is mounted. - PostmountCmds []Command `json:"postmount_cmds"` -} - -func (m *Mount) IsBind() bool { - return m.Flags&unix.MS_BIND != 0 -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_linux.go new file mode 100644 index 0000000000..b69e9ab238 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_linux.go @@ -0,0 +1,66 @@ +package configs + +import "golang.org/x/sys/unix" + +type MountIDMapping struct { + // Recursive indicates if the mapping needs to be recursive. + Recursive bool `json:"recursive"` + + // UserNSPath is a path to a user namespace that indicates the necessary + // id-mappings for MOUNT_ATTR_IDMAP. If set to non-"", UIDMappings and + // GIDMappings must be set to nil. + UserNSPath string `json:"userns_path,omitempty"` + + // UIDMappings is the uid mapping set for this mount, to be used with + // MOUNT_ATTR_IDMAP. + UIDMappings []IDMap `json:"uid_mappings,omitempty"` + + // GIDMappings is the gid mapping set for this mount, to be used with + // MOUNT_ATTR_IDMAP. + GIDMappings []IDMap `json:"gid_mappings,omitempty"` +} + +type Mount struct { + // Source path for the mount. + Source string `json:"source"` + + // Destination path for the mount inside the container. + Destination string `json:"destination"` + + // Device the mount is for. + Device string `json:"device"` + + // Mount flags. + Flags int `json:"flags"` + + // Mount flags that were explicitly cleared in the configuration (meaning + // the user explicitly requested that these flags *not* be set). + ClearedFlags int `json:"cleared_flags"` + + // Propagation Flags + PropagationFlags []int `json:"propagation_flags"` + + // Mount data applied to the mount. + Data string `json:"data"` + + // Relabel source if set, "z" indicates shared, "Z" indicates unshared. + Relabel string `json:"relabel"` + + // RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2). + RecAttr *unix.MountAttr `json:"rec_attr"` + + // Extensions are additional flags that are specific to runc. + Extensions int `json:"extensions"` + + // Mapping is the MOUNT_ATTR_IDMAP configuration for the mount. If non-nil, + // the mount is configured to use MOUNT_ATTR_IDMAP-style id mappings. + IDMapping *MountIDMapping `json:"id_mapping,omitempty"` +} + +func (m *Mount) IsBind() bool { + return m.Flags&unix.MS_BIND != 0 +} + +func (m *Mount) IsIDMapped() bool { + return m.IDMapping != nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_unsupported.go new file mode 100644 index 0000000000..2154191215 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_unsupported.go @@ -0,0 +1,10 @@ +//go:build !linux +// +build !linux + +package configs + +type Mount struct{} + +func (m *Mount) IsBind() bool { + return false +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go index d52d6fcd14..898f96fd0f 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go @@ -14,6 +14,7 @@ const ( NEWIPC NamespaceType = "NEWIPC" NEWUSER NamespaceType = "NEWUSER" NEWCGROUP NamespaceType = "NEWCGROUP" + NEWTIME NamespaceType = "NEWTIME" ) var ( @@ -38,6 +39,8 @@ func NsName(ns NamespaceType) string { return "uts" case NEWCGROUP: return "cgroup" + case NEWTIME: + return "time" } return "" } @@ -56,6 +59,9 @@ func IsNamespaceSupported(ns NamespaceType) bool { if nsFile == "" { return false } + // We don't need to use /proc/thread-self here because the list of + // namespace types is unrelated to the thread. This lets us avoid having to + // do runtime.LockOSThread. _, err := os.Stat("/proc/self/ns/" + nsFile) // a namespace is supported if it exists and we have permissions to read it supported = err == nil @@ -72,6 +78,7 @@ func NamespaceTypes() []NamespaceType { NEWPID, NEWNS, NEWCGROUP, + NEWTIME, } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go index 0516dba8d0..15d8046f3d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go @@ -17,6 +17,7 @@ var namespaceInfo = map[NamespaceType]int{ NEWUTS: unix.CLONE_NEWUTS, NEWPID: unix.CLONE_NEWPID, NEWCGROUP: unix.CLONE_NEWCGROUP, + NEWTIME: unix.CLONE_NEWTIME, } // CloneFlags parses the container's Namespaces options to set the correct @@ -31,3 +32,15 @@ func (n *Namespaces) CloneFlags() uintptr { } return uintptr(flag) } + +// IsPrivate tells whether the namespace of type t is configured as private +// (i.e. it exists and is not shared). +func (n Namespaces) IsPrivate(t NamespaceType) bool { + for _, v := range n { + if v.Type == t { + return v.Path == "" + } + } + // Not found, so implicitly sharing a parent namespace. + return false +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go deleted file mode 100644 index f95c1409fc..0000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go +++ /dev/null @@ -1,157 +0,0 @@ -//go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build darwin dragonfly freebsd linux netbsd openbsd solaris - -package user - -import ( - "io" - "os" - "strconv" - - "golang.org/x/sys/unix" -) - -// Unix-specific path to the passwd and group formatted files. -const ( - unixPasswdPath = "/etc/passwd" - unixGroupPath = "/etc/group" -) - -// LookupUser looks up a user by their username in /etc/passwd. If the user -// cannot be found (or there is no /etc/passwd file on the filesystem), then -// LookupUser returns an error. -func LookupUser(username string) (User, error) { - return lookupUserFunc(func(u User) bool { - return u.Name == username - }) -} - -// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot -// be found (or there is no /etc/passwd file on the filesystem), then LookupId -// returns an error. -func LookupUid(uid int) (User, error) { - return lookupUserFunc(func(u User) bool { - return u.Uid == uid - }) -} - -func lookupUserFunc(filter func(u User) bool) (User, error) { - // Get operating system-specific passwd reader-closer. - passwd, err := GetPasswd() - if err != nil { - return User{}, err - } - defer passwd.Close() - - // Get the users. - users, err := ParsePasswdFilter(passwd, filter) - if err != nil { - return User{}, err - } - - // No user entries found. - if len(users) == 0 { - return User{}, ErrNoPasswdEntries - } - - // Assume the first entry is the "correct" one. - return users[0], nil -} - -// LookupGroup looks up a group by its name in /etc/group. If the group cannot -// be found (or there is no /etc/group file on the filesystem), then LookupGroup -// returns an error. -func LookupGroup(groupname string) (Group, error) { - return lookupGroupFunc(func(g Group) bool { - return g.Name == groupname - }) -} - -// LookupGid looks up a group by its group id in /etc/group. If the group cannot -// be found (or there is no /etc/group file on the filesystem), then LookupGid -// returns an error. -func LookupGid(gid int) (Group, error) { - return lookupGroupFunc(func(g Group) bool { - return g.Gid == gid - }) -} - -func lookupGroupFunc(filter func(g Group) bool) (Group, error) { - // Get operating system-specific group reader-closer. - group, err := GetGroup() - if err != nil { - return Group{}, err - } - defer group.Close() - - // Get the users. - groups, err := ParseGroupFilter(group, filter) - if err != nil { - return Group{}, err - } - - // No user entries found. - if len(groups) == 0 { - return Group{}, ErrNoGroupEntries - } - - // Assume the first entry is the "correct" one. - return groups[0], nil -} - -func GetPasswdPath() (string, error) { - return unixPasswdPath, nil -} - -func GetPasswd() (io.ReadCloser, error) { - return os.Open(unixPasswdPath) -} - -func GetGroupPath() (string, error) { - return unixGroupPath, nil -} - -func GetGroup() (io.ReadCloser, error) { - return os.Open(unixGroupPath) -} - -// CurrentUser looks up the current user by their user id in /etc/passwd. If the -// user cannot be found (or there is no /etc/passwd file on the filesystem), -// then CurrentUser returns an error. -func CurrentUser() (User, error) { - return LookupUid(unix.Getuid()) -} - -// CurrentGroup looks up the current user's group by their primary group id's -// entry in /etc/passwd. If the group cannot be found (or there is no -// /etc/group file on the filesystem), then CurrentGroup returns an error. -func CurrentGroup() (Group, error) { - return LookupGid(unix.Getgid()) -} - -func currentUserSubIDs(fileName string) ([]SubID, error) { - u, err := CurrentUser() - if err != nil { - return nil, err - } - filter := func(entry SubID) bool { - return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid) - } - return ParseSubIDFileFilter(fileName, filter) -} - -func CurrentUserSubUIDs() ([]SubID, error) { - return currentUserSubIDs("/etc/subuid") -} - -func CurrentUserSubGIDs() ([]SubID, error) { - return currentUserSubIDs("/etc/subgid") -} - -func CurrentProcessUIDMap() ([]IDMap, error) { - return ParseIDMapFile("/proc/self/uid_map") -} - -func CurrentProcessGIDMap() ([]IDMap, error) { - return ParseIDMapFile("/proc/self/gid_map") -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go deleted file mode 100644 index 984466d1ab..0000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go +++ /dev/null @@ -1,605 +0,0 @@ -package user - -import ( - "bufio" - "bytes" - "errors" - "fmt" - "io" - "os" - "strconv" - "strings" -) - -const ( - minID = 0 - maxID = 1<<31 - 1 // for 32-bit systems compatibility -) - -var ( - // ErrNoPasswdEntries is returned if no matching entries were found in /etc/group. - ErrNoPasswdEntries = errors.New("no matching entries in passwd file") - // ErrNoGroupEntries is returned if no matching entries were found in /etc/passwd. - ErrNoGroupEntries = errors.New("no matching entries in group file") - // ErrRange is returned if a UID or GID is outside of the valid range. - ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minID, maxID) -) - -type User struct { - Name string - Pass string - Uid int - Gid int - Gecos string - Home string - Shell string -} - -type Group struct { - Name string - Pass string - Gid int - List []string -} - -// SubID represents an entry in /etc/sub{u,g}id -type SubID struct { - Name string - SubID int64 - Count int64 -} - -// IDMap represents an entry in /proc/PID/{u,g}id_map -type IDMap struct { - ID int64 - ParentID int64 - Count int64 -} - -func parseLine(line []byte, v ...interface{}) { - parseParts(bytes.Split(line, []byte(":")), v...) -} - -func parseParts(parts [][]byte, v ...interface{}) { - if len(parts) == 0 { - return - } - - for i, p := range parts { - // Ignore cases where we don't have enough fields to populate the arguments. - // Some configuration files like to misbehave. - if len(v) <= i { - break - } - - // Use the type of the argument to figure out how to parse it, scanf() style. - // This is legit. - switch e := v[i].(type) { - case *string: - *e = string(p) - case *int: - // "numbers", with conversion errors ignored because of some misbehaving configuration files. - *e, _ = strconv.Atoi(string(p)) - case *int64: - *e, _ = strconv.ParseInt(string(p), 10, 64) - case *[]string: - // Comma-separated lists. - if len(p) != 0 { - *e = strings.Split(string(p), ",") - } else { - *e = []string{} - } - default: - // Someone goof'd when writing code using this function. Scream so they can hear us. - panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e)) - } - } -} - -func ParsePasswdFile(path string) ([]User, error) { - passwd, err := os.Open(path) - if err != nil { - return nil, err - } - defer passwd.Close() - return ParsePasswd(passwd) -} - -func ParsePasswd(passwd io.Reader) ([]User, error) { - return ParsePasswdFilter(passwd, nil) -} - -func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) { - passwd, err := os.Open(path) - if err != nil { - return nil, err - } - defer passwd.Close() - return ParsePasswdFilter(passwd, filter) -} - -func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) { - if r == nil { - return nil, errors.New("nil source for passwd-formatted data") - } - - var ( - s = bufio.NewScanner(r) - out = []User{} - ) - - for s.Scan() { - line := bytes.TrimSpace(s.Bytes()) - if len(line) == 0 { - continue - } - - // see: man 5 passwd - // name:password:UID:GID:GECOS:directory:shell - // Name:Pass:Uid:Gid:Gecos:Home:Shell - // root:x:0:0:root:/root:/bin/bash - // adm:x:3:4:adm:/var/adm:/bin/false - p := User{} - parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell) - - if filter == nil || filter(p) { - out = append(out, p) - } - } - if err := s.Err(); err != nil { - return nil, err - } - - return out, nil -} - -func ParseGroupFile(path string) ([]Group, error) { - group, err := os.Open(path) - if err != nil { - return nil, err - } - - defer group.Close() - return ParseGroup(group) -} - -func ParseGroup(group io.Reader) ([]Group, error) { - return ParseGroupFilter(group, nil) -} - -func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) { - group, err := os.Open(path) - if err != nil { - return nil, err - } - defer group.Close() - return ParseGroupFilter(group, filter) -} - -func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) { - if r == nil { - return nil, errors.New("nil source for group-formatted data") - } - rd := bufio.NewReader(r) - out := []Group{} - - // Read the file line-by-line. - for { - var ( - isPrefix bool - wholeLine []byte - err error - ) - - // Read the next line. We do so in chunks (as much as reader's - // buffer is able to keep), check if we read enough columns - // already on each step and store final result in wholeLine. - for { - var line []byte - line, isPrefix, err = rd.ReadLine() - - if err != nil { - // We should return no error if EOF is reached - // without a match. - if err == io.EOF { - err = nil - } - return out, err - } - - // Simple common case: line is short enough to fit in a - // single reader's buffer. - if !isPrefix && len(wholeLine) == 0 { - wholeLine = line - break - } - - wholeLine = append(wholeLine, line...) - - // Check if we read the whole line already. - if !isPrefix { - break - } - } - - // There's no spec for /etc/passwd or /etc/group, but we try to follow - // the same rules as the glibc parser, which allows comments and blank - // space at the beginning of a line. - wholeLine = bytes.TrimSpace(wholeLine) - if len(wholeLine) == 0 || wholeLine[0] == '#' { - continue - } - - // see: man 5 group - // group_name:password:GID:user_list - // Name:Pass:Gid:List - // root:x:0:root - // adm:x:4:root,adm,daemon - p := Group{} - parseLine(wholeLine, &p.Name, &p.Pass, &p.Gid, &p.List) - - if filter == nil || filter(p) { - out = append(out, p) - } - } -} - -type ExecUser struct { - Uid int - Gid int - Sgids []int - Home string -} - -// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the -// given file paths and uses that data as the arguments to GetExecUser. If the -// files cannot be opened for any reason, the error is ignored and a nil -// io.Reader is passed instead. -func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) { - var passwd, group io.Reader - - if passwdFile, err := os.Open(passwdPath); err == nil { - passwd = passwdFile - defer passwdFile.Close() - } - - if groupFile, err := os.Open(groupPath); err == nil { - group = groupFile - defer groupFile.Close() - } - - return GetExecUser(userSpec, defaults, passwd, group) -} - -// GetExecUser parses a user specification string (using the passwd and group -// readers as sources for /etc/passwd and /etc/group data, respectively). In -// the case of blank fields or missing data from the sources, the values in -// defaults is used. -// -// GetExecUser will return an error if a user or group literal could not be -// found in any entry in passwd and group respectively. -// -// Examples of valid user specifications are: -// - "" -// - "user" -// - "uid" -// - "user:group" -// - "uid:gid -// - "user:gid" -// - "uid:group" -// -// It should be noted that if you specify a numeric user or group id, they will -// not be evaluated as usernames (only the metadata will be filled). So attempting -// to parse a user with user.Name = "1337" will produce the user with a UID of -// 1337. -func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) { - if defaults == nil { - defaults = new(ExecUser) - } - - // Copy over defaults. - user := &ExecUser{ - Uid: defaults.Uid, - Gid: defaults.Gid, - Sgids: defaults.Sgids, - Home: defaults.Home, - } - - // Sgids slice *cannot* be nil. - if user.Sgids == nil { - user.Sgids = []int{} - } - - // Allow for userArg to have either "user" syntax, or optionally "user:group" syntax - var userArg, groupArg string - parseLine([]byte(userSpec), &userArg, &groupArg) - - // Convert userArg and groupArg to be numeric, so we don't have to execute - // Atoi *twice* for each iteration over lines. - uidArg, uidErr := strconv.Atoi(userArg) - gidArg, gidErr := strconv.Atoi(groupArg) - - // Find the matching user. - users, err := ParsePasswdFilter(passwd, func(u User) bool { - if userArg == "" { - // Default to current state of the user. - return u.Uid == user.Uid - } - - if uidErr == nil { - // If the userArg is numeric, always treat it as a UID. - return uidArg == u.Uid - } - - return u.Name == userArg - }) - - // If we can't find the user, we have to bail. - if err != nil && passwd != nil { - if userArg == "" { - userArg = strconv.Itoa(user.Uid) - } - return nil, fmt.Errorf("unable to find user %s: %w", userArg, err) - } - - var matchedUserName string - if len(users) > 0 { - // First match wins, even if there's more than one matching entry. - matchedUserName = users[0].Name - user.Uid = users[0].Uid - user.Gid = users[0].Gid - user.Home = users[0].Home - } else if userArg != "" { - // If we can't find a user with the given username, the only other valid - // option is if it's a numeric username with no associated entry in passwd. - - if uidErr != nil { - // Not numeric. - return nil, fmt.Errorf("unable to find user %s: %w", userArg, ErrNoPasswdEntries) - } - user.Uid = uidArg - - // Must be inside valid uid range. - if user.Uid < minID || user.Uid > maxID { - return nil, ErrRange - } - - // Okay, so it's numeric. We can just roll with this. - } - - // On to the groups. If we matched a username, we need to do this because of - // the supplementary group IDs. - if groupArg != "" || matchedUserName != "" { - groups, err := ParseGroupFilter(group, func(g Group) bool { - // If the group argument isn't explicit, we'll just search for it. - if groupArg == "" { - // Check if user is a member of this group. - for _, u := range g.List { - if u == matchedUserName { - return true - } - } - return false - } - - if gidErr == nil { - // If the groupArg is numeric, always treat it as a GID. - return gidArg == g.Gid - } - - return g.Name == groupArg - }) - if err != nil && group != nil { - return nil, fmt.Errorf("unable to find groups for spec %v: %w", matchedUserName, err) - } - - // Only start modifying user.Gid if it is in explicit form. - if groupArg != "" { - if len(groups) > 0 { - // First match wins, even if there's more than one matching entry. - user.Gid = groups[0].Gid - } else { - // If we can't find a group with the given name, the only other valid - // option is if it's a numeric group name with no associated entry in group. - - if gidErr != nil { - // Not numeric. - return nil, fmt.Errorf("unable to find group %s: %w", groupArg, ErrNoGroupEntries) - } - user.Gid = gidArg - - // Must be inside valid gid range. - if user.Gid < minID || user.Gid > maxID { - return nil, ErrRange - } - - // Okay, so it's numeric. We can just roll with this. - } - } else if len(groups) > 0 { - // Supplementary group ids only make sense if in the implicit form. - user.Sgids = make([]int, len(groups)) - for i, group := range groups { - user.Sgids[i] = group.Gid - } - } - } - - return user, nil -} - -// GetAdditionalGroups looks up a list of groups by name or group id -// against the given /etc/group formatted data. If a group name cannot -// be found, an error will be returned. If a group id cannot be found, -// or the given group data is nil, the id will be returned as-is -// provided it is in the legal range. -func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) { - groups := []Group{} - if group != nil { - var err error - groups, err = ParseGroupFilter(group, func(g Group) bool { - for _, ag := range additionalGroups { - if g.Name == ag || strconv.Itoa(g.Gid) == ag { - return true - } - } - return false - }) - if err != nil { - return nil, fmt.Errorf("Unable to find additional groups %v: %w", additionalGroups, err) - } - } - - gidMap := make(map[int]struct{}) - for _, ag := range additionalGroups { - var found bool - for _, g := range groups { - // if we found a matched group either by name or gid, take the - // first matched as correct - if g.Name == ag || strconv.Itoa(g.Gid) == ag { - if _, ok := gidMap[g.Gid]; !ok { - gidMap[g.Gid] = struct{}{} - found = true - break - } - } - } - // we asked for a group but didn't find it. let's check to see - // if we wanted a numeric group - if !found { - gid, err := strconv.ParseInt(ag, 10, 64) - if err != nil { - // Not a numeric ID either. - return nil, fmt.Errorf("Unable to find group %s: %w", ag, ErrNoGroupEntries) - } - // Ensure gid is inside gid range. - if gid < minID || gid > maxID { - return nil, ErrRange - } - gidMap[int(gid)] = struct{}{} - } - } - gids := []int{} - for gid := range gidMap { - gids = append(gids, gid) - } - return gids, nil -} - -// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups -// that opens the groupPath given and gives it as an argument to -// GetAdditionalGroups. -func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) { - var group io.Reader - - if groupFile, err := os.Open(groupPath); err == nil { - group = groupFile - defer groupFile.Close() - } - return GetAdditionalGroups(additionalGroups, group) -} - -func ParseSubIDFile(path string) ([]SubID, error) { - subid, err := os.Open(path) - if err != nil { - return nil, err - } - defer subid.Close() - return ParseSubID(subid) -} - -func ParseSubID(subid io.Reader) ([]SubID, error) { - return ParseSubIDFilter(subid, nil) -} - -func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) { - subid, err := os.Open(path) - if err != nil { - return nil, err - } - defer subid.Close() - return ParseSubIDFilter(subid, filter) -} - -func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { - if r == nil { - return nil, errors.New("nil source for subid-formatted data") - } - - var ( - s = bufio.NewScanner(r) - out = []SubID{} - ) - - for s.Scan() { - line := bytes.TrimSpace(s.Bytes()) - if len(line) == 0 { - continue - } - - // see: man 5 subuid - p := SubID{} - parseLine(line, &p.Name, &p.SubID, &p.Count) - - if filter == nil || filter(p) { - out = append(out, p) - } - } - if err := s.Err(); err != nil { - return nil, err - } - - return out, nil -} - -func ParseIDMapFile(path string) ([]IDMap, error) { - r, err := os.Open(path) - if err != nil { - return nil, err - } - defer r.Close() - return ParseIDMap(r) -} - -func ParseIDMap(r io.Reader) ([]IDMap, error) { - return ParseIDMapFilter(r, nil) -} - -func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) { - r, err := os.Open(path) - if err != nil { - return nil, err - } - defer r.Close() - return ParseIDMapFilter(r, filter) -} - -func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) { - if r == nil { - return nil, errors.New("nil source for idmap-formatted data") - } - - var ( - s = bufio.NewScanner(r) - out = []IDMap{} - ) - - for s.Scan() { - line := bytes.TrimSpace(s.Bytes()) - if len(line) == 0 { - continue - } - - // see: man 7 user_namespaces - p := IDMap{} - parseParts(bytes.Fields(line), &p.ID, &p.ParentID, &p.Count) - - if filter == nil || filter(p) { - out = append(out, p) - } - } - if err := s.Err(); err != nil { - return nil, err - } - - return out, nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go deleted file mode 100644 index e018eae614..0000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go +++ /dev/null @@ -1,43 +0,0 @@ -//go:build gofuzz -// +build gofuzz - -package user - -import ( - "io" - "strings" -) - -func IsDivisbleBy(n int, divisibleby int) bool { - return (n % divisibleby) == 0 -} - -func FuzzUser(data []byte) int { - if len(data) == 0 { - return -1 - } - if !IsDivisbleBy(len(data), 5) { - return -1 - } - - var divided [][]byte - - chunkSize := len(data) / 5 - - for i := 0; i < len(data); i += chunkSize { - end := i + chunkSize - - divided = append(divided, data[i:end]) - } - - _, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil) - - var passwd, group io.Reader - - group = strings.NewReader(string(divided[1])) - _, _ = GetAdditionalGroups([]string{string(divided[2])}, group) - - passwd = strings.NewReader(string(divided[3])) - _, _ = GetExecUser(string(divided[4]), nil, passwd, group) - return 1 -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go index f6cb98e5e4..b225f18f2e 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go @@ -1,5 +1,4 @@ package userns // RunningInUserNS detects whether we are currently running in a user namespace. -// Originally copied from github.com/lxc/lxd/shared/util.go var RunningInUserNS = runningInUserNS diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go index 1e00ab8b50..bff03f8d85 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go @@ -3,14 +3,7 @@ package userns -import ( - "strings" - - "github.com/opencontainers/runc/libcontainer/user" -) - -func FuzzUIDMap(data []byte) int { - uidmap, _ := user.ParseIDMap(strings.NewReader(string(data))) - _ = uidMapInUserNS(uidmap) +func FuzzUIDMap(uidmap []byte) int { + _ = uidMapInUserNS(string(uidmap)) return 1 } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go index 724e6df012..a6710b321b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go @@ -1,9 +1,10 @@ package userns import ( + "bufio" + "fmt" + "os" "sync" - - "github.com/opencontainers/runc/libcontainer/user" ) var ( @@ -12,26 +13,43 @@ var ( ) // runningInUserNS detects whether we are currently running in a user namespace. -// Originally copied from github.com/lxc/lxd/shared/util.go +// +// Originally copied from https://github.com/lxc/incus/blob/e45085dd42f826b3c8c3228e9733c0b6f998eafe/shared/util.go#L678-L700. func runningInUserNS() bool { nsOnce.Do(func() { - uidmap, err := user.CurrentProcessUIDMap() + file, err := os.Open("/proc/self/uid_map") if err != nil { - // This kernel-provided file only exists if user namespaces are supported + // This kernel-provided file only exists if user namespaces are supported. return } - inUserNS = uidMapInUserNS(uidmap) + defer file.Close() + + buf := bufio.NewReader(file) + l, _, err := buf.ReadLine() + if err != nil { + return + } + + inUserNS = uidMapInUserNS(string(l)) }) return inUserNS } -func uidMapInUserNS(uidmap []user.IDMap) bool { - /* - * We assume we are in the initial user namespace if we have a full - * range - 4294967295 uids starting at uid 0. - */ - if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { +func uidMapInUserNS(uidMap string) bool { + if uidMap == "" { + // File exist but empty (the initial state when userns is created, + // see user_namespaces(7)). + return true + } + + var a, b, c int64 + if _, err := fmt.Sscanf(uidMap, "%d %d %d", &a, &b, &c); err != nil { + // Assume we are in a regular, non user namespace. return false } - return true + + // As per user_namespaces(7), /proc/self/uid_map of + // the initial user namespace shows 0 0 4294967295. + initNS := a == 0 && b == 0 && c == 4294967295 + return !initNS } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go index f35c13a10e..391c811c68 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go @@ -3,8 +3,6 @@ package userns -import "github.com/opencontainers/runc/libcontainer/user" - // runningInUserNS is a stub for non-Linux systems // Always returns false func runningInUserNS() bool { @@ -13,6 +11,6 @@ func runningInUserNS() bool { // uidMapInUserNS is a stub for non-Linux systems // Always returns false -func uidMapInUserNS(uidmap []user.IDMap) bool { +func uidMapInUserNS(uidMap string) bool { return false } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/usernsfd_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/usernsfd_linux.go new file mode 100644 index 0000000000..2eb64cf76c --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/usernsfd_linux.go @@ -0,0 +1,156 @@ +package userns + +import ( + "fmt" + "os" + "sort" + "strings" + "sync" + "syscall" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Mapping struct { + UIDMappings []configs.IDMap + GIDMappings []configs.IDMap +} + +func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) { + for _, uid := range m.UIDMappings { + uids = append(uids, syscall.SysProcIDMap{ + ContainerID: int(uid.ContainerID), + HostID: int(uid.HostID), + Size: int(uid.Size), + }) + } + for _, gid := range m.GIDMappings { + gids = append(gids, syscall.SysProcIDMap{ + ContainerID: int(gid.ContainerID), + HostID: int(gid.HostID), + Size: int(gid.Size), + }) + } + return +} + +// id returns a unique identifier for this mapping, agnostic of the order of +// the uid and gid mappings (because the order doesn't matter to the kernel). +// The set of userns handles is indexed using this ID. +func (m Mapping) id() string { + var uids, gids []string + for _, idmap := range m.UIDMappings { + uids = append(uids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size)) + } + for _, idmap := range m.GIDMappings { + gids = append(gids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size)) + } + // We don't care about the sort order -- just sort them. + sort.Strings(uids) + sort.Strings(gids) + return "uid=" + strings.Join(uids, ",") + ";gid=" + strings.Join(gids, ",") +} + +type Handles struct { + m sync.Mutex + maps map[string]*os.File +} + +// Release all resources associated with this Handle. All existing files +// returned from Get() will continue to work even after calling Release(). The +// same Handles can be re-used after calling Release(). +func (hs *Handles) Release() { + hs.m.Lock() + defer hs.m.Unlock() + + // Close the files for good measure, though GC will do that for us anyway. + for _, file := range hs.maps { + _ = file.Close() + } + hs.maps = nil +} + +func spawnProc(req Mapping) (*os.Process, error) { + // We need to spawn a subprocess with the requested mappings, which is + // unfortunately quite expensive. The "safe" way of doing this is natively + // with Go (and then spawning something like "sleep infinity"), but + // execve() is a waste of cycles because we just need some process to have + // the right mapping, we don't care what it's executing. The "unsafe" + // option of doing a clone() behind the back of Go is probably okay in + // theory as long as we just do kill(getpid(), SIGSTOP). However, if we + // tell Go to put the new process into PTRACE_TRACEME mode, we can avoid + // the exec and not have to faff around with the mappings. + // + // Note that Go's stdlib does not support newuidmap, but in the case of + // id-mapped mounts, it seems incredibly unlikely that the user will be + // requesting us to do a remapping as an unprivileged user with mappings + // they have privileges over. + logrus.Debugf("spawning dummy process for id-mapping %s", req.id()) + uidMappings, gidMappings := req.toSys() + // We don't need to use /proc/thread-self here because the exe mm of a + // thread-group is guaranteed to be the same for all threads by definition. + // This lets us avoid having to do runtime.LockOSThread. + return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{ + Sys: &syscall.SysProcAttr{ + Cloneflags: unix.CLONE_NEWUSER, + UidMappings: uidMappings, + GidMappings: gidMappings, + GidMappingsEnableSetgroups: false, + // Put the process into PTRACE_TRACEME mode to allow us to get the + // userns without having a proper execve() target. + Ptrace: true, + }, + }) +} + +func dupFile(f *os.File) (*os.File, error) { + newFd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0) + if err != nil { + return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err) + } + return os.NewFile(uintptr(newFd), f.Name()), nil +} + +// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested +// mapping. The processes spawned to produce userns nsfds are cached, so if +// equivalent user namespace mappings are requested, the same user namespace +// will be returned. The caller is responsible for closing the returned file +// descriptor. +func (hs *Handles) Get(req Mapping) (file *os.File, err error) { + hs.m.Lock() + defer hs.m.Unlock() + + if hs.maps == nil { + hs.maps = make(map[string]*os.File) + } + + file, ok := hs.maps[req.id()] + if !ok { + proc, err := spawnProc(req) + if err != nil { + return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", req.id(), err) + } + // Make sure we kill the helper process. We ignore errors because + // there's not much we can do about them anyway, and ultimately + defer func() { + _ = proc.Kill() + _, _ = proc.Wait() + }() + + // Stash away a handle to the userns file. This is neater than keeping + // the process alive, because Go's GC can handle files much better than + // leaked processes, and having long-living useless processes seems + // less than ideal. + file, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid)) + if err != nil { + return nil, err + } + hs.maps[req.id()] = file + } + // Duplicate the file, to make sure the lifecycle of each *os.File we + // return is independent. + return dupFile(file) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go index 7ef9da21fd..2edd1417af 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go @@ -19,13 +19,14 @@ package utils import ( "fmt" "os" + "runtime" "golang.org/x/sys/unix" ) -// MaxSendfdLen is the maximum length of the name of a file descriptor being -// sent using SendFd. The name of the file handle returned by RecvFd will never -// be larger than this value. +// MaxNameLen is the maximum length of the name of a file descriptor being sent +// using SendFile. The name of the file handle returned by RecvFile will never be +// larger than this value. const MaxNameLen = 4096 // oobSpace is the size of the oob slice required to store a single FD. Note @@ -33,26 +34,21 @@ const MaxNameLen = 4096 // so sizeof(fd) = 4. var oobSpace = unix.CmsgSpace(4) -// RecvFd waits for a file descriptor to be sent over the given AF_UNIX +// RecvFile waits for a file descriptor to be sent over the given AF_UNIX // socket. The file name of the remote file descriptor will be recreated // locally (it is sent as non-auxiliary data in the same payload). -func RecvFd(socket *os.File) (*os.File, error) { - // For some reason, unix.Recvmsg uses the length rather than the capacity - // when passing the msg_controllen and other attributes to recvmsg. So we - // have to actually set the length. +func RecvFile(socket *os.File) (_ *os.File, Err error) { name := make([]byte, MaxNameLen) oob := make([]byte, oobSpace) sockfd := socket.Fd() - n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0) + n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC) if err != nil { return nil, err } - if n >= MaxNameLen || oobn != oobSpace { - return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) + return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) } - // Truncate. name = name[:n] oob = oob[:oobn] @@ -61,36 +57,63 @@ func RecvFd(socket *os.File) (*os.File, error) { if err != nil { return nil, err } + + // We cannot control how many SCM_RIGHTS we receive, and upon receiving + // them all of the descriptors are installed in our fd table, so we need to + // parse all of the SCM_RIGHTS we received in order to close all of the + // descriptors on error. + var fds []int + defer func() { + for i, fd := range fds { + if i == 0 && Err == nil { + // Only close the first one on error. + continue + } + // Always close extra ones. + _ = unix.Close(fd) + } + }() + var lastErr error + for _, scm := range scms { + if scm.Header.Type == unix.SCM_RIGHTS { + scmFds, err := unix.ParseUnixRights(&scm) + if err != nil { + lastErr = err + } else { + fds = append(fds, scmFds...) + } + } + } + if lastErr != nil { + return nil, lastErr + } + + // We do this after collecting the fds to make sure we close them all when + // returning an error here. if len(scms) != 1 { return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) } - scm := scms[0] - - fds, err := unix.ParseUnixRights(&scm) - if err != nil { - return nil, err - } if len(fds) != 1 { return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds)) } - fd := uintptr(fds[0]) - - return os.NewFile(fd, string(name)), nil + return os.NewFile(uintptr(fds[0]), string(name)), nil } -// SendFd sends a file descriptor over the given AF_UNIX socket. In -// addition, the file.Name() of the given file will also be sent as -// non-auxiliary data in the same payload (allowing to send contextual -// information for a file descriptor). -func SendFd(socket *os.File, name string, fd uintptr) error { +// SendFile sends a file over the given AF_UNIX socket. file.Name() is also +// included so that if the other end uses RecvFile, the file will have the same +// name information. +func SendFile(socket *os.File, file *os.File) error { + name := file.Name() if len(name) >= MaxNameLen { return fmt.Errorf("sendfd: filename too long: %s", name) } - return SendFds(socket, []byte(name), int(fd)) + err := SendRawFd(socket, name, file.Fd()) + runtime.KeepAlive(file) + return err } -// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket. -func SendFds(socket *os.File, msg []byte, fds ...int) error { - oob := unix.UnixRights(fds...) - return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0) +// SendRawFd sends a specific file descriptor over the given AF_UNIX socket. +func SendRawFd(socket *os.File, msg string, fd uintptr) error { + oob := unix.UnixRights(int(fd)) + return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go index 6b9fc34352..1b523d8ac5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -3,15 +3,12 @@ package utils import ( "encoding/binary" "encoding/json" - "fmt" "io" "os" "path/filepath" - "strconv" "strings" "unsafe" - securejoin "github.com/cyphar/filepath-securejoin" "golang.org/x/sys/unix" ) @@ -43,6 +40,9 @@ func ExitStatus(status unix.WaitStatus) int { } // WriteJSON writes the provided struct v to w using standard json marshaling +// without a trailing newline. This is used instead of json.Encoder because +// there might be a problem in json decoder in some cases, see: +// https://github.com/docker/docker/issues/14203#issuecomment-174177790 func WriteJSON(w io.Writer, v interface{}) error { data, err := json.Marshal(v) if err != nil { @@ -99,52 +99,16 @@ func stripRoot(root, path string) string { return CleanPath("/" + path) } -// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) -// corresponding to the unsafePath resolved within the root. Before passing the -// fd, this path is verified to have been inside the root -- so operating on it -// through the passed fdpath should be safe. Do not access this path through -// the original path strings, and do not attempt to use the pathname outside of -// the passed closure (the file handle will be freed once the closure returns). -func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { - // Remove the root then forcefully resolve inside the root. - unsafePath = stripRoot(root, unsafePath) - path, err := securejoin.SecureJoin(root, unsafePath) - if err != nil { - return fmt.Errorf("resolving path inside rootfs failed: %w", err) - } - - // Open the target path. - fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) - if err != nil { - return fmt.Errorf("open o_path procfd: %w", err) - } - defer fh.Close() - - // Double-check the path is the one we expected. - procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd())) - if realpath, err := os.Readlink(procfd); err != nil { - return fmt.Errorf("procfd verification failed: %w", err) - } else if realpath != path { - return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) - } - - // Run the closure. - return fn(procfd) -} - -// SearchLabels searches a list of key-value pairs for the provided key and -// returns the corresponding value. The pairs must be separated with '='. -func SearchLabels(labels []string, query string) string { - for _, l := range labels { - parts := strings.SplitN(l, "=", 2) - if len(parts) < 2 { - continue - } - if parts[0] == query { - return parts[1] +// SearchLabels searches through a list of key=value pairs for a given key, +// returning its value, and the binary flag telling whether the key exist. +func SearchLabels(labels []string, key string) (string, bool) { + key += "=" + for _, s := range labels { + if strings.HasPrefix(s, key) { + return s[len(key):], true } } - return "" + return "", false } // Annotations returns the bundle path and user defined annotations from the diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go index bf3237a291..f57f0874a0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -5,10 +5,16 @@ package utils import ( "fmt" + "math" "os" + "path/filepath" + "runtime" "strconv" + "sync" _ "unsafe" // for go:linkname + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -24,12 +30,39 @@ func EnsureProcHandle(fh *os.File) error { return nil } +var ( + haveCloseRangeCloexecBool bool + haveCloseRangeCloexecOnce sync.Once +) + +func haveCloseRangeCloexec() bool { + haveCloseRangeCloexecOnce.Do(func() { + // Make sure we're not closing a random file descriptor. + tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0) + if err != nil { + return + } + defer unix.Close(tmpFd) + + err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC) + // Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC). + // -ENOSYS and -EINVAL ultimately mean we don't have support, but any + // other potential error would imply that even the most basic close + // operation wouldn't work. + haveCloseRangeCloexecBool = err == nil + }) + return haveCloseRangeCloexecBool +} + type fdFunc func(fd int) // fdRangeFrom calls the passed fdFunc for each file descriptor that is open in // the current process. func fdRangeFrom(minFd int, fn fdFunc) error { - fdDir, err := os.Open("/proc/self/fd") + procSelfFd, closer := ProcThreadSelf("fd") + defer closer() + + fdDir, err := os.Open(procSelfFd) if err != nil { return err } @@ -67,6 +100,12 @@ func fdRangeFrom(minFd int, fn fdFunc) error { // CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or // equal to minFd in the current process. func CloseExecFrom(minFd int) error { + // Use close_range(CLOSE_RANGE_CLOEXEC) if possible. + if haveCloseRangeCloexec() { + err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC) + return os.NewSyscallError("close_range", err) + } + // Otherwise, fall back to the standard loop. return fdRangeFrom(minFd, unix.CloseOnExec) } @@ -89,7 +128,8 @@ func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive // *os.File operations would apply to the wrong file). This function is only // intended to be called from the last stage of runc init. func UnsafeCloseFrom(minFd int) error { - // We must not close some file descriptors. + // We cannot use close_range(2) even if it is available, because we must + // not close some file descriptors. return fdRangeFrom(minFd, func(fd int) { if runtime_IsPollDescriptor(uintptr(fd)) { // These are the Go runtimes internal netpoll file descriptors. @@ -107,11 +147,117 @@ func UnsafeCloseFrom(minFd int) error { }) } -// NewSockPair returns a new unix socket pair -func NewSockPair(name string) (parent *os.File, child *os.File, err error) { +// NewSockPair returns a new SOCK_STREAM unix socket pair. +func NewSockPair(name string) (parent, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, nil, err } return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil } + +// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) +// corresponding to the unsafePath resolved within the root. Before passing the +// fd, this path is verified to have been inside the root -- so operating on it +// through the passed fdpath should be safe. Do not access this path through +// the original path strings, and do not attempt to use the pathname outside of +// the passed closure (the file handle will be freed once the closure returns). +func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { + // Remove the root then forcefully resolve inside the root. + unsafePath = stripRoot(root, unsafePath) + path, err := securejoin.SecureJoin(root, unsafePath) + if err != nil { + return fmt.Errorf("resolving path inside rootfs failed: %w", err) + } + + procSelfFd, closer := ProcThreadSelf("fd/") + defer closer() + + // Open the target path. + fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("open o_path procfd: %w", err) + } + defer fh.Close() + + procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd()))) + // Double-check the path is the one we expected. + if realpath, err := os.Readlink(procfd); err != nil { + return fmt.Errorf("procfd verification failed: %w", err) + } else if realpath != path { + return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) + } + + return fn(procfd) +} + +type ProcThreadSelfCloser func() + +var ( + haveProcThreadSelf bool + haveProcThreadSelfOnce sync.Once +) + +// ProcThreadSelf returns a string that is equivalent to +// /proc/thread-self/, with a graceful fallback on older kernels where +// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin, +// meaning that the passed string needs to be trusted. The caller _must_ call +// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread) +// *only once* after it has finished using the returned path string. +func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) { + haveProcThreadSelfOnce.Do(func() { + if _, err := os.Stat("/proc/thread-self/"); err == nil { + haveProcThreadSelf = true + } else { + logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/", err) + } + }) + + // We need to lock our thread until the caller is done with the path string + // because any non-atomic operation on the path (such as opening a file, + // then reading it) could be interrupted by the Go runtime where the + // underlying thread is swapped out and the original thread is killed, + // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In + // addition, the pre-3.17 fallback makes everything non-atomic because the + // same thing could happen between unix.Gettid() and the path operations. + // + // In theory, we don't need to lock in the atomic user case when using + // /proc/thread-self/, but it's better to be safe than sorry (and there are + // only one or two truly atomic users of /proc/thread-self/). + runtime.LockOSThread() + + threadSelf := "/proc/thread-self/" + if !haveProcThreadSelf { + // Pre-3.17 kernels did not have /proc/thread-self, so do it manually. + threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/" + if _, err := os.Stat(threadSelf); err != nil { + // Unfortunately, this code is called from rootfs_linux.go where we + // are running inside the pid namespace of the container but /proc + // is the host's procfs. Unfortunately there is no real way to get + // the correct tid to use here (the kernel age means we cannot do + // things like set up a private fsopen("proc") -- even scanning + // NSpid in all of the tasks in /proc/self/task/*/status requires + // Linux 4.1). + // + // So, we just have to assume that /proc/self is acceptable in this + // one specific case. + if os.Getpid() == 1 { + logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err) + } else { + // This should never happen, but the fallback should work in most cases... + logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err) + } + threadSelf = "/proc/self/" + } + } + return threadSelf + subpath, runtime.UnlockOSThread +} + +// ProcThreadSelfFd is small wrapper around ProcThreadSelf to make it easier to +// create a /proc/thread-self handle for given file descriptor. +// +// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but +// without using fmt.Sprintf to avoid unneeded overhead. +func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) { + return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10)) +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 3b85d75d33..1ff7c4fd32 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -986,12 +986,11 @@ github.com/opencontainers/go-digest/digestset github.com/opencontainers/image-spec/identity github.com/opencontainers/image-spec/specs-go github.com/opencontainers/image-spec/specs-go/v1 -# github.com/opencontainers/runc v1.1.12 -## explicit; go 1.17 +# github.com/opencontainers/runc v1.2.0-rc.1 +## explicit; go 1.20 github.com/opencontainers/runc/libcontainer/cgroups github.com/opencontainers/runc/libcontainer/configs github.com/opencontainers/runc/libcontainer/devices -github.com/opencontainers/runc/libcontainer/user github.com/opencontainers/runc/libcontainer/userns github.com/opencontainers/runc/libcontainer/utils # github.com/opencontainers/runtime-spec v1.2.0