vendor: github.com/opencontainers/runc v1.2.0-rc.1

full diff: https://github.com/opencontainers/runc/compare/v1.1.12...v1.2.0-rc.1

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
This commit is contained in:
Sebastiaan van Stijn 2024-04-03 14:25:31 +02:00
parent 5890b67b7f
commit 2dc62475f8
No known key found for this signature in database
GPG key ID: 76698F39D527CE8C
29 changed files with 744 additions and 1086 deletions

View file

@ -77,7 +77,7 @@ require (
github.com/morikuni/aec v1.0.0
github.com/opencontainers/go-digest v1.0.0
github.com/opencontainers/image-spec v1.1.0-rc5
github.com/opencontainers/runc v1.1.12
github.com/opencontainers/runc v1.2.0-rc.1
github.com/opencontainers/runtime-spec v1.2.0
github.com/opencontainers/selinux v1.11.0
github.com/pelletier/go-toml v1.9.5

View file

@ -539,8 +539,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.0-rc5 h1:Ygwkfw9bpDvs+c9E34SdgGOj41dX/cbdlwvlWt0pnFI=
github.com/opencontainers/image-spec v1.1.0-rc5/go.mod h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8=
github.com/opencontainers/runc v1.1.12 h1:BOIssBaW1La0/qbNZHXOOa71dZfZEQOzW7dqQf3phss=
github.com/opencontainers/runc v1.1.12/go.mod h1:S+lQwSfncpBha7XTy/5lBwWgm5+y5Ma/O44Ekby9FK8=
github.com/opencontainers/runc v1.2.0-rc.1 h1:SMjop2pxxYRTfKdsigna/8xRoaoCfIQfD2cVuOb64/o=
github.com/opencontainers/runc v1.2.0-rc.1/go.mod h1:m9JwxfHzXz5YTTXBQr7EY9KTuazFAGPyMQx2nRR3vTw=
github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=

View file

@ -8,9 +8,9 @@ The following is courtesy of our legal counsel:
Use and transfer of Docker may be subject to certain restrictions by the
United States and other governments.
United States and other governments.
It is your responsibility to ensure that your use and/or transfer does not
violate applicable laws.
violate applicable laws.
For more information, please see http://www.bis.doc.gov

View file

@ -1,9 +1,24 @@
package cgroups
import (
"errors"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
// ErrDevicesUnsupported is the error returned when a cgroup manager
// is not configured to set device rules.
ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules")
// DevicesSetV1 and DevicesSetV2 are the functions used to set device
// rules for cgroup v1 and v2, respectively. Unless the
// libcontainer/cgroups/devices package is imported, they stay nil, so
// cgroup managers can't manage devices.
DevicesSetV1 func(path string, r *configs.Resources) error
DevicesSetV2 func(path string, r *configs.Resources) error
)
type Manager interface {
// Apply creates a cgroup, if not yet created, and adds a process
// with the specified pid into that cgroup. A special value of -1

View file

@ -50,24 +50,13 @@ func WriteFile(dir, file, data string) error {
return err
}
defer fd.Close()
if err := retryingWriteFile(fd, data); err != nil {
if _, err := fd.WriteString(data); err != nil {
// Having data in the error message helps in debugging.
return fmt.Errorf("failed to write %q: %w", data, err)
}
return nil
}
// retryingWriteFile writes data to fd, repeating the write for as long as it
// fails with EINTR. Every interrupted attempt is logged at info level; the
// result of the first attempt that is not interrupted is returned as-is.
func retryingWriteFile(fd *os.File, data string) error {
	payload := []byte(data)
	_, err := fd.Write(payload)
	for errors.Is(err, unix.EINTR) {
		logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
		_, err = fd.Write(payload)
	}
	return err
}
const (
cgroupfsDir = "/sys/fs/cgroup"
cgroupfsPrefix = cgroupfsDir + "/"
@ -90,7 +79,7 @@ func prepareOpenat2() error {
})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare
if err != unix.ENOSYS {
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")
@ -148,8 +137,9 @@ func openFile(dir, file string, flags int) (*os.File, error) {
//
// TODO: if such usage will ever be common, amend this
// to reopen cgroupRootHandle and retry openat2.
fdStr := strconv.Itoa(int(cgroupRootHandle.Fd()))
fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(int(cgroupRootHandle.Fd())))
defer closer()
fdDest, _ := os.Readlink(fdPath)
if fdDest != cgroupfsDir {
// Wrap the error so it is clear that cgroupRootHandle
// is opened to an unexpected/wrong directory.

View file

@ -32,9 +32,22 @@ type CpuUsage struct {
UsageInUsermode uint64 `json:"usage_in_usermode"`
}
// PSIData holds three averages (avg10, avg60, avg300) and a total counter.
// NOTE(review): presumably these mirror one line of a kernel pressure stall
// information (PSI) file — confirm against the code that fills it in.
type PSIData struct {
Avg10 float64 `json:"avg10"`
Avg60 float64 `json:"avg60"`
Avg300 float64 `json:"avg300"`
Total uint64 `json:"total"`
}
// PSIStats groups two PSIData records, serialized as "some" and "full".
type PSIStats struct {
Some PSIData `json:"some,omitempty"`
Full PSIData `json:"full,omitempty"`
}
type CpuStats struct {
CpuUsage CpuUsage `json:"cpu_usage,omitempty"`
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type CPUSetStats struct {
@ -91,6 +104,7 @@ type MemoryStats struct {
UseHierarchy bool `json:"use_hierarchy"`
Stats map[string]uint64 `json:"stats,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type PageUsageByNUMA struct {
@ -135,6 +149,7 @@ type BlkioStats struct {
IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"`
SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type HugetlbStats struct {
@ -157,6 +172,13 @@ type RdmaStats struct {
RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"`
}
// MiscStats holds the usage and event counters for a single misc resource key.
type MiscStats struct {
// current resource usage for a key in misc
Usage uint64 `json:"usage,omitempty"`
// number of times the resource usage was about to go over the max boundary
Events uint64 `json:"events,omitempty"`
}
type Stats struct {
CpuStats CpuStats `json:"cpu_stats,omitempty"`
CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
@ -166,10 +188,13 @@ type Stats struct {
// the map is in the format "size of hugepage: stats of the hugepage"
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
RdmaStats RdmaStats `json:"rdma_stats,omitempty"`
// the map is in the format "misc resource name: stats of the key"
MiscStats map[string]MiscStats `json:"misc_stats,omitempty"`
}
func NewStats() *Stats {
memoryStats := MemoryStats{Stats: make(map[string]uint64)}
hugetlbStats := make(map[string]HugetlbStats)
return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats}
miscStats := make(map[string]MiscStats)
return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats}
}

View file

@ -36,13 +36,13 @@ func IsCgroup2UnifiedMode() bool {
var st unix.Statfs_t
err := unix.Statfs(unifiedMountpoint, &st)
if err != nil {
level := logrus.WarnLevel
if os.IsNotExist(err) && userns.RunningInUserNS() {
// ignore the "not found" error if running in userns
logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
isUnified = false
return
// For rootless containers, sweep it under the rug.
level = logrus.DebugLevel
}
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
logrus.StandardLogger().Logf(level,
"statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err)
}
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
})
@ -217,21 +217,26 @@ func PathExists(path string) bool {
return true
}
// EnterPid calls WriteCgroupProc with pid for every path in cgroupPaths for
// which PathExists reports true. Paths that do not exist are skipped. It
// stops at, and returns, the first error from WriteCgroupProc.
func EnterPid(cgroupPaths map[string]string, pid int) error {
	for _, dir := range cgroupPaths {
		if !PathExists(dir) {
			continue
		}
		if err := WriteCgroupProc(dir, pid); err != nil {
			return err
		}
	}
	return nil
}
// rmdir tries to remove a directory, optionally retrying on EBUSY.
func rmdir(path string, retry bool) error {
delay := time.Millisecond
tries := 10
func rmdir(path string) error {
again:
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
switch err { // nolint:errorlint // unix errors are bare
case nil, unix.ENOENT:
return nil
case unix.EINTR:
goto again
case unix.EBUSY:
if retry && tries > 0 {
time.Sleep(delay)
delay *= 2
tries--
goto again
}
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
}
@ -239,68 +244,42 @@ func rmdir(path string) error {
// RemovePath aims to remove cgroup path. It does so recursively,
// by removing any subdirectories (sub-cgroups) first.
func RemovePath(path string) error {
// try the fast path first
if err := rmdir(path); err == nil {
// Try the fast path first.
if err := rmdir(path, false); err == nil {
return nil
}
infos, err := os.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
err = nil
}
if err != nil && !os.IsNotExist(err) {
return err
}
for _, info := range infos {
if info.IsDir() {
// We should remove subcgroups dir first
// We should remove subcgroup first.
if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
break
}
}
}
if err == nil {
err = rmdir(path)
err = rmdir(path, true)
}
return err
}
// RemovePaths iterates over the provided paths removing them.
// We trying to remove all paths five times with increasing delay between tries.
// If after all there are not removed cgroups - appropriate error will be
// returned.
func RemovePaths(paths map[string]string) (err error) {
const retries = 5
delay := 10 * time.Millisecond
for i := 0; i < retries; i++ {
if i != 0 {
time.Sleep(delay)
delay *= 2
}
for s, p := range paths {
if err := RemovePath(p); err != nil {
// do not log intermediate iterations
switch i {
case 0:
logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
case retries - 1:
logrus.WithError(err).Error("Failed to remove cgroup")
}
}
_, err := os.Stat(p)
// We need this strange way of checking cgroups existence because
// RemoveAll almost always returns error, even on already removed
// cgroups
if os.IsNotExist(err) {
delete(paths, s)
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
paths = make(map[string]string)
return nil
for s, p := range paths {
if err := RemovePath(p); err == nil {
delete(paths, s)
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
// TODO: switch to clear once Go < 1.21 is not supported.
paths = make(map[string]string)
return nil
}
return fmt.Errorf("Failed to remove paths: %v", paths)
}

View file

@ -99,11 +99,12 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
// expensive), so it is assumed that cgroup mounts are not being changed.
func readCgroupMountinfo() ([]*mountinfo.Info, error) {
readMountinfoOnce.Do(func() {
// mountinfo.GetMounts uses /proc/thread-self, so we can use it without
// issues.
cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
mountinfo.FSTypeFilter("cgroup"),
)
})
return cgroupMountinfo, readMountinfoErr
}
@ -196,6 +197,9 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
return nil, err
}
// We don't need to use /proc/thread-self here because runc always runs
// with every thread in the same cgroup. This lets us avoid having to do
// runtime.LockOSThread.
allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
@ -214,6 +218,10 @@ func GetOwnCgroup(subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
return "", errUnified
}
// We don't need to use /proc/thread-self here because runc always runs
// with every thread in the same cgroup. This lets us avoid having to do
// runtime.LockOSThread.
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return "", err
@ -236,27 +244,6 @@ func GetOwnCgroupPath(subsystem string) (string, error) {
return getCgroupPathHelper(subsystem, cgroup)
}
// GetInitCgroup parses /proc/1/cgroup and returns the entry for subsystem as
// resolved by getControllerPath. It returns errUnified when
// IsCgroup2UnifiedMode reports true.
func GetInitCgroup(subsystem string) (string, error) {
	const initCgroupFile = "/proc/1/cgroup"

	if unified := IsCgroup2UnifiedMode(); unified {
		return "", errUnified
	}

	controllers, parseErr := ParseCgroupFile(initCgroupFile)
	if parseErr != nil {
		return "", parseErr
	}
	return getControllerPath(subsystem, controllers)
}
// GetInitCgroupPath looks up the subsystem cgroup of PID 1 with GetInitCgroup
// and converts it with getCgroupPathHelper. A lookup error is returned
// unchanged together with an empty string.
func GetInitCgroupPath(subsystem string) (string, error) {
	initCgroup, err := GetInitCgroup(subsystem)
	if err == nil {
		return getCgroupPathHelper(subsystem, initCgroup)
	}
	return "", err
}
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
if err != nil {

View file

@ -2,8 +2,8 @@ package configs
import "fmt"
// blockIODevice holds major:minor format supported in blkio cgroup
type blockIODevice struct {
// BlockIODevice holds major:minor format supported in blkio cgroup.
type BlockIODevice struct {
// Major is the device's major number
Major int64 `json:"major"`
// Minor is the device's minor number
@ -12,7 +12,7 @@ type blockIODevice struct {
// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
type WeightDevice struct {
blockIODevice
BlockIODevice
// Weight is the bandwidth rate for the device, range is from 10 to 1000
Weight uint16 `json:"weight"`
// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
@ -41,7 +41,7 @@ func (wd *WeightDevice) LeafWeightString() string {
// ThrottleDevice struct holds a `major:minor rate_per_second` pair
type ThrottleDevice struct {
blockIODevice
BlockIODevice
// Rate is the IO rate limit per cgroup per device
Rate uint64 `json:"rate"`
}

View file

@ -69,6 +69,9 @@ type Resources struct {
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
CpuQuota int64 `json:"cpu_quota"`
// CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a given period.
CpuBurst *uint64 `json:"cpu_burst"` //nolint:revive
// CPU period to be used for hardcapping (in usecs). 0 to use system default.
CpuPeriod uint64 `json:"cpu_period"`
@ -84,6 +87,9 @@ type Resources struct {
// MEM to use
CpusetMems string `json:"cpuset_mems"`
// cgroup SCHED_IDLE
CPUIdle *int64 `json:"cpu_idle,omitempty"`
// Process limit; set <= `0' to disable limit.
PidsLimit int64 `json:"pids_limit"`
@ -155,4 +161,9 @@ type Resources struct {
// during Set() to figure out whether the freeze is required. Those
// methods may be relatively slow, thus this flag.
SkipFreezeOnSet bool `json:"-"`
// MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check
// if the new memory limits (Memory and MemorySwap) being set are lower
// than the current memory usage, and reject if so.
MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"`
}

View file

@ -8,6 +8,7 @@ import (
"time"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runtime-spec/specs-go"
@ -31,12 +32,13 @@ type IDMap struct {
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
ListenerPath string `json:"listener_path,omitempty"`
ListenerMetadata string `json:"listener_metadata,omitempty"`
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Flags []specs.LinuxSeccompFlag `json:"flags"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
ListenerPath string `json:"listener_path,omitempty"`
ListenerMetadata string `json:"listener_metadata,omitempty"`
}
// Action is taken upon rule match in Seccomp
@ -83,9 +85,6 @@ type Syscall struct {
Args []*Arg `json:"args"`
}
// TODO Windows. Many of these fields should be factored out into those parts
// which are common across platforms, and those which are platform specific.
// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
@ -121,6 +120,9 @@ type Config struct {
// Hostname optionally sets the container's hostname if provided
Hostname string `json:"hostname"`
// Domainname optionally sets the container's domainname if provided
Domainname string `json:"domainname"`
// Namespaces specifies the container's namespaces that it should setup when cloning the init process
// If a namespace is not provided that namespace is shared from the container's parent process
Namespaces Namespaces `json:"namespaces"`
@ -158,11 +160,11 @@ type Config struct {
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj *int `json:"oom_score_adj,omitempty"`
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"`
// UIDMappings is an array of User ID mappings for User Namespaces
UIDMappings []IDMap `json:"uid_mappings"`
// GidMappings is an array of Group ID mappings for User Namespaces
GidMappings []IDMap `json:"gid_mappings"`
// GIDMappings is an array of Group ID mappings for User Namespaces
GIDMappings []IDMap `json:"gid_mappings"`
// MaskPaths specifies paths within the container's rootfs to mask over with a bind
// mount pointing to /dev/null as to prevent reads of the file.
@ -211,8 +213,87 @@ type Config struct {
// RootlessCgroups is set when unlikely to have the full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
// TimeOffsets specifies the offset for supporting time namespaces.
TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`
// Scheduler represents the scheduling attributes for a process.
Scheduler *Scheduler `json:"scheduler,omitempty"`
// Personality contains configuration for the Linux personality syscall.
Personality *LinuxPersonality `json:"personality,omitempty"`
// IOPriority is the container's I/O priority.
IOPriority *IOPriority `json:"io_priority,omitempty"`
}
// Scheduler is based on the Linux sched_setattr(2) syscall.
type Scheduler = specs.Scheduler
// ToSchedAttr converts a *configs.Scheduler into a *unix.SchedAttr.
//
// The policy name and every flag name are translated to fixed numeric values;
// an unrecognized policy or flag yields a nil result and an error naming the
// offending value. Nice, Priority, Runtime, Deadline and Period are copied
// over, with Priority converted to uint32.
// NOTE(review): the numeric values presumably match the kernel's SCHED_* and
// SCHED_FLAG_* constants — confirm against sched_setattr(2).
func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {
	attr := &unix.SchedAttr{
		Size:     unix.SizeofSchedAttr,
		Nice:     scheduler.Nice,
		Priority: uint32(scheduler.Priority),
		Runtime:  scheduler.Runtime,
		Deadline: scheduler.Deadline,
		Period:   scheduler.Period,
	}

	switch scheduler.Policy {
	case specs.SchedOther:
		attr.Policy = 0
	case specs.SchedFIFO:
		attr.Policy = 1
	case specs.SchedRR:
		attr.Policy = 2
	case specs.SchedBatch:
		attr.Policy = 3
	case specs.SchedISO:
		attr.Policy = 4
	case specs.SchedIdle:
		attr.Policy = 5
	case specs.SchedDeadline:
		attr.Policy = 6
	default:
		return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy)
	}

	// Accumulate one bit per requested flag.
	for _, name := range scheduler.Flags {
		var bit uint64
		switch name {
		case specs.SchedFlagResetOnFork:
			bit = 0x01
		case specs.SchedFlagReclaim:
			bit = 0x02
		case specs.SchedFlagDLOverrun:
			bit = 0x04
		case specs.SchedFlagKeepPolicy:
			bit = 0x08
		case specs.SchedFlagKeepParams:
			bit = 0x10
		case specs.SchedFlagUtilClampMin:
			bit = 0x20
		case specs.SchedFlagUtilClampMax:
			bit = 0x40
		default:
			return nil, fmt.Errorf("invalid scheduler flag: %s", name)
		}
		attr.Flags |= bit
	}

	return attr, nil
}
// IOPrioClassMapping maps each specs.IOPriorityClass to an integer class value.
// NOTE(review): the values presumably match the kernel's IOPRIO_CLASS_*
// numbering — confirm against the code that consumes this map.
var IOPrioClassMapping = map[specs.IOPriorityClass]int{
specs.IOPRIO_CLASS_RT: 1,
specs.IOPRIO_CLASS_BE: 2,
specs.IOPRIO_CLASS_IDLE: 3,
}
type IOPriority = specs.LinuxIOPriority
type (
HookName string
HookList []Hook
@ -277,6 +358,7 @@ type Capabilities struct {
Ambient []string
}
// Deprecated: use (Hooks).Run instead.
func (hooks HookList) RunHooks(state *specs.State) error {
for i, h := range hooks {
if err := h.Run(state); err != nil {
@ -333,6 +415,18 @@ func (hooks *Hooks) MarshalJSON() ([]byte, error) {
})
}
// Run executes, in order, every hook registered under name, passing state to
// each. It stops at the first failing hook and returns its error wrapped with
// the hook name and the hook's index in the list.
func (hooks Hooks) Run(name HookName, state *specs.State) error {
	for idx, hook := range hooks[name] {
		err := hook.Run(state)
		if err == nil {
			continue
		}
		return fmt.Errorf("error running %s hook #%d: %w", name, idx, err)
	}
	return nil
}
type Hook interface {
// Run executes the hook with the provided state.
Run(*specs.State) error
@ -393,7 +487,7 @@ func (c Command) Run(s *specs.State) error {
go func() {
err := cmd.Wait()
if err != nil {
err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
}
errC <- err
}()

View file

@ -7,22 +7,33 @@ import (
)
var (
errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.")
errNoUserMap = errors.New("User namespaces enabled, but no user mapping found.")
errNoGIDMap = errors.New("User namespaces enabled, but no gid mappings found.")
errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.")
errNoUIDMap = errors.New("user namespaces enabled, but no uid mappings found")
errNoGIDMap = errors.New("user namespaces enabled, but no gid mappings found")
)
// Personality domain values. See personality(2) for details on each constant:
// https://man7.org/linux/man-pages/man2/personality.2.html
// https://raw.githubusercontent.com/torvalds/linux/master/include/uapi/linux/personality.h
const (
PerLinux = 0x0000
PerLinux32 = 0x0008
)
// LinuxPersonality holds the personality domain to configure.
type LinuxPersonality struct {
// Domain for the personality.
// NOTE(review): the field is an int, so the "LINUX" and "LINUX32" names
// presumably correspond to the PerLinux and PerLinux32 constants — confirm.
Domain int `json:"domain"`
}
// HostUID gets the translated uid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostUID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.UidMappings == nil {
if len(c.UIDMappings) == 0 {
return -1, errNoUIDMap
}
id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings)
id, found := c.hostIDFromMapping(int64(containerId), c.UIDMappings)
if !found {
return -1, errNoUserMap
return -1, fmt.Errorf("user namespaces enabled, but no mapping found for uid %d", containerId)
}
// If we are a 32-bit binary running on a 64-bit system, it's possible
// the mapped user is too large to store in an int, which means we
@ -47,12 +58,12 @@ func (c Config) HostRootUID() (int, error) {
// different when user namespaces are enabled.
func (c Config) HostGID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.GidMappings == nil {
if len(c.GIDMappings) == 0 {
return -1, errNoGIDMap
}
id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings)
id, found := c.hostIDFromMapping(int64(containerId), c.GIDMappings)
if !found {
return -1, errNoGroupMap
return -1, fmt.Errorf("user namespaces enabled, but no mapping found for gid %d", containerId)
}
// If we are a 32-bit binary running on a 64-bit system, it's possible
// the mapped user is too large to store in an int, which means we

View file

@ -1,48 +1,7 @@
package configs
import "golang.org/x/sys/unix"
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning
EXT_COPYUP = 1 << iota //nolint:golint,revive // ignore "don't use ALL_CAPS" warning
)
// Mount describes a single mount: its source, its destination inside the
// container, and the flags, data and commands that go with it.
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
// Destination path for the mount inside the container.
Destination string `json:"destination"`
// Device the mount is for.
Device string `json:"device"`
// Mount flags.
Flags int `json:"flags"`
// Propagation flags.
PropagationFlags []int `json:"propagation_flags"`
// Mount data applied to the mount.
Data string `json:"data"`
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
RecAttr *unix.MountAttr `json:"rec_attr"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// Optional Command to be run before Source is mounted.
PremountCmds []Command `json:"premount_cmds"`
// Optional Command to be run after Source is mounted.
PostmountCmds []Command `json:"postmount_cmds"`
}
// IsBind reports whether the MS_BIND bit is set in the mount's Flags.
func (m *Mount) IsBind() bool {
	return m.Flags&unix.MS_BIND == unix.MS_BIND
}

View file

@ -0,0 +1,66 @@
package configs
import "golang.org/x/sys/unix"
// MountIDMapping holds the id-mapping configuration used with
// MOUNT_ATTR_IDMAP: either a user namespace path or explicit uid/gid mappings.
type MountIDMapping struct {
// Recursive indicates if the mapping needs to be recursive.
Recursive bool `json:"recursive"`
// UserNSPath is a path to a user namespace that indicates the necessary
// id-mappings for MOUNT_ATTR_IDMAP. If set to non-"", UIDMappings and
// GIDMappings must be set to nil.
UserNSPath string `json:"userns_path,omitempty"`
// UIDMappings is the uid mapping set for this mount, to be used with
// MOUNT_ATTR_IDMAP.
UIDMappings []IDMap `json:"uid_mappings,omitempty"`
// GIDMappings is the gid mapping set for this mount, to be used with
// MOUNT_ATTR_IDMAP.
GIDMappings []IDMap `json:"gid_mappings,omitempty"`
}
// Mount describes a single mount: its source, its destination inside the
// container, its flags and data, and an optional id-mapping configuration.
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
// Destination path for the mount inside the container.
Destination string `json:"destination"`
// Device the mount is for.
Device string `json:"device"`
// Mount flags.
Flags int `json:"flags"`
// Mount flags that were explicitly cleared in the configuration (meaning
// the user explicitly requested that these flags *not* be set).
ClearedFlags int `json:"cleared_flags"`
// Propagation flags.
PropagationFlags []int `json:"propagation_flags"`
// Mount data applied to the mount.
Data string `json:"data"`
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
RecAttr *unix.MountAttr `json:"rec_attr"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// IDMapping is the MOUNT_ATTR_IDMAP configuration for the mount. If non-nil,
// the mount is configured to use MOUNT_ATTR_IDMAP-style id mappings.
IDMapping *MountIDMapping `json:"id_mapping,omitempty"`
}
// IsBind reports whether the MS_BIND bit is set in the mount's Flags.
func (m *Mount) IsBind() bool {
	return m.Flags&unix.MS_BIND == unix.MS_BIND
}
// IsIDMapped reports whether the mount carries an id-mapping configuration,
// i.e. whether its IDMapping field is non-nil.
func (m *Mount) IsIDMapped() bool {
	hasMapping := m.IDMapping != nil
	return hasMapping
}

View file

@ -0,0 +1,10 @@
//go:build !linux
// +build !linux
package configs
// Mount is an empty placeholder type for builds that do not target Linux.
type Mount struct{}
// IsBind always reports false in this build.
func (m *Mount) IsBind() bool {
return false
}

View file

@ -14,6 +14,7 @@ const (
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWCGROUP NamespaceType = "NEWCGROUP"
NEWTIME NamespaceType = "NEWTIME"
)
var (
@ -38,6 +39,8 @@ func NsName(ns NamespaceType) string {
return "uts"
case NEWCGROUP:
return "cgroup"
case NEWTIME:
return "time"
}
return ""
}
@ -56,6 +59,9 @@ func IsNamespaceSupported(ns NamespaceType) bool {
if nsFile == "" {
return false
}
// We don't need to use /proc/thread-self here because the list of
// namespace types is unrelated to the thread. This lets us avoid having to
// do runtime.LockOSThread.
_, err := os.Stat("/proc/self/ns/" + nsFile)
// a namespace is supported if it exists and we have permissions to read it
supported = err == nil
@ -72,6 +78,7 @@ func NamespaceTypes() []NamespaceType {
NEWPID,
NEWNS,
NEWCGROUP,
NEWTIME,
}
}

View file

@ -17,6 +17,7 @@ var namespaceInfo = map[NamespaceType]int{
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
NEWCGROUP: unix.CLONE_NEWCGROUP,
NEWTIME: unix.CLONE_NEWTIME,
}
// CloneFlags parses the container's Namespaces options to set the correct
@ -31,3 +32,15 @@ func (n *Namespaces) CloneFlags() uintptr {
}
return uintptr(flag)
}
// IsPrivate reports whether the namespace of type t is configured as private:
// an entry of that type exists and its Path is empty. Only the first entry of
// type t is considered. When no entry matches, the namespace is implicitly
// shared with the parent and the result is false.
func (n Namespaces) IsPrivate(t NamespaceType) bool {
	for _, ns := range n {
		if ns.Type != t {
			continue
		}
		return ns.Path == ""
	}
	return false
}

View file

@ -1,157 +0,0 @@
//go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
// +build darwin dragonfly freebsd linux netbsd openbsd solaris
package user
import (
"io"
"os"
"strconv"
"golang.org/x/sys/unix"
)
// Unix-specific path to the passwd and group formatted files.
const (
unixPasswdPath = "/etc/passwd"
unixGroupPath = "/etc/group"
)
// LookupUser returns the first passwd entry whose Name equals username. It
// returns an error if the passwd file cannot be read or no entry matches.
func LookupUser(username string) (User, error) {
	byName := func(candidate User) bool {
		return candidate.Name == username
	}
	return lookupUserFunc(byName)
}
// LookupUid returns the first passwd entry whose Uid equals uid. It returns
// an error if the passwd file cannot be read or no entry matches.
func LookupUid(uid int) (User, error) {
	byUID := func(candidate User) bool {
		return candidate.Uid == uid
	}
	return lookupUserFunc(byUID)
}
// lookupUserFunc opens the passwd source via GetPasswd, parses it with
// ParsePasswdFilter using filter, and returns the first matching entry.
// It returns ErrNoPasswdEntries when nothing matches.
func lookupUserFunc(filter func(u User) bool) (User, error) {
	src, err := GetPasswd()
	if err != nil {
		return User{}, err
	}
	defer src.Close()

	matches, err := ParsePasswdFilter(src, filter)
	switch {
	case err != nil:
		return User{}, err
	case len(matches) == 0:
		// Nothing passed the filter.
		return User{}, ErrNoPasswdEntries
	}

	// When several entries match, the first one wins.
	return matches[0], nil
}
// LookupGroup returns the first group entry whose Name equals groupname. It
// returns an error if the group file cannot be read or no entry matches.
func LookupGroup(groupname string) (Group, error) {
	byName := func(candidate Group) bool {
		return candidate.Name == groupname
	}
	return lookupGroupFunc(byName)
}
// LookupGid returns the first group entry whose Gid equals gid. It returns an
// error if the group file cannot be read or no entry matches.
func LookupGid(gid int) (Group, error) {
	byGID := func(candidate Group) bool {
		return candidate.Gid == gid
	}
	return lookupGroupFunc(byGID)
}
// lookupGroupFunc opens the group source via GetGroup, parses it with
// ParseGroupFilter using filter, and returns the first matching entry.
// It returns ErrNoGroupEntries when nothing matches.
func lookupGroupFunc(filter func(g Group) bool) (Group, error) {
	src, err := GetGroup()
	if err != nil {
		return Group{}, err
	}
	defer src.Close()

	matches, err := ParseGroupFilter(src, filter)
	switch {
	case err != nil:
		return Group{}, err
	case len(matches) == 0:
		// Nothing passed the filter.
		return Group{}, ErrNoGroupEntries
	}

	// When several entries match, the first one wins.
	return matches[0], nil
}
// GetPasswdPath returns the path of the passwd file (unixPasswdPath). The
// returned error is always nil.
func GetPasswdPath() (string, error) {
return unixPasswdPath, nil
}
// GetPasswd opens the passwd file (unixPasswdPath) for reading. The caller is
// responsible for closing the returned reader.
//
// On failure the returned reader is a true nil interface value, so callers
// may safely compare it against nil.
func GetPasswd() (io.ReadCloser, error) {
	f, err := os.Open(unixPasswdPath)
	if err != nil {
		// Return a literal nil: forwarding the nil *os.File would produce a
		// non-nil io.ReadCloser wrapping a nil pointer.
		return nil, err
	}
	return f, nil
}
// GetGroupPath returns the path of the group file (unixGroupPath). The
// returned error is always nil.
func GetGroupPath() (string, error) {
return unixGroupPath, nil
}
// GetGroup opens the group file (unixGroupPath) for reading. The caller is
// responsible for closing the returned reader.
//
// On failure the returned reader is a true nil interface value, so callers
// may safely compare it against nil.
func GetGroup() (io.ReadCloser, error) {
	f, err := os.Open(unixGroupPath)
	if err != nil {
		// Return a literal nil: forwarding the nil *os.File would produce a
		// non-nil io.ReadCloser wrapping a nil pointer.
		return nil, err
	}
	return f, nil
}
// CurrentUser returns the passwd entry for the uid reported by unix.Getuid,
// as found by LookupUid. It fails if the passwd file cannot be read or holds
// no entry for that uid.
func CurrentUser() (User, error) {
	uid := unix.Getuid()
	return LookupUid(uid)
}
// CurrentGroup returns the group entry for the gid reported by unix.Getgid,
// as found by LookupGid. It fails if the group file cannot be read or holds
// no entry for that gid.
func CurrentGroup() (Group, error) {
	gid := unix.Getgid()
	return LookupGid(gid)
}
// currentUserSubIDs parses fileName with ParseSubIDFileFilter and keeps the
// entries that belong to the current user, i.e. those whose Name is either
// the user's name or the user's uid in decimal form.
func currentUserSubIDs(fileName string) ([]SubID, error) {
	me, err := CurrentUser()
	if err != nil {
		return nil, err
	}

	uidText := strconv.Itoa(me.Uid)
	ownedByMe := func(entry SubID) bool {
		switch entry.Name {
		case me.Name, uidText:
			return true
		}
		return false
	}
	return ParseSubIDFileFilter(fileName, ownedByMe)
}
// CurrentUserSubUIDs returns the /etc/subuid entries of the current user.
func CurrentUserSubUIDs() ([]SubID, error) {
	const subUIDFile = "/etc/subuid"
	return currentUserSubIDs(subUIDFile)
}
// CurrentUserSubGIDs returns the /etc/subgid entries of the current user.
func CurrentUserSubGIDs() ([]SubID, error) {
	const subGIDFile = "/etc/subgid"
	return currentUserSubIDs(subGIDFile)
}
// CurrentProcessUIDMap parses /proc/self/uid_map with ParseIDMapFile and
// returns the resulting entries.
func CurrentProcessUIDMap() ([]IDMap, error) {
	const uidMapFile = "/proc/self/uid_map"
	return ParseIDMapFile(uidMapFile)
}
// CurrentProcessGIDMap parses /proc/self/gid_map with ParseIDMapFile and
// returns the resulting entries.
func CurrentProcessGIDMap() ([]IDMap, error) {
	const gidMapFile = "/proc/self/gid_map"
	return ParseIDMapFile(gidMapFile)
}

View file

@ -1,605 +0,0 @@
package user
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"os"
"strconv"
"strings"
)
const (
minID = 0
maxID = 1<<31 - 1 // for 32-bit systems compatibility
)
var (
	// ErrNoPasswdEntries is returned if no matching entries were found in /etc/passwd.
	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
	// ErrNoGroupEntries is returned if no matching entries were found in /etc/group.
	ErrNoGroupEntries = errors.New("no matching entries in group file")
	// ErrRange is returned if a UID or GID is outside of the valid range.
	ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minID, maxID)
)
// User is a single entry of a passwd-formatted file (see passwd(5)). The
// fields follow the on-disk column order:
//
//	name:password:UID:GID:GECOS:directory:shell
type User struct {
	Name  string // login name
	Pass  string // password column
	Uid   int    // numeric user id
	Gid   int    // numeric primary group id
	Gecos string // GECOS (comment) column
	Home  string // home directory
	Shell string // login shell
}
// Group is a single entry of a group-formatted file (see group(5)). The
// fields follow the on-disk column order:
//
//	group_name:password:GID:user_list
type Group struct {
	Name string   // group name
	Pass string   // password column
	Gid  int      // numeric group id
	List []string // comma-separated user list, split into its members
}
// SubID represents an entry in /etc/sub{u,g}id. The fields are filled from
// the first, second and third colon-separated column of a line, in that order
// (see subuid(5)).
type SubID struct {
	Name  string
	SubID int64
	Count int64
}
// IDMap represents an entry in /proc/PID/{u,g}id_map. The fields are filled
// from the first, second and third whitespace-separated column of a line, in
// that order (see user_namespaces(7)).
type IDMap struct {
	ID       int64
	ParentID int64
	Count    int64
}
// parseLine splits line on ':' and hands the resulting fields to parseParts,
// which stores them, in order, into the destinations v.
func parseLine(line []byte, v ...interface{}) {
	fields := bytes.Split(line, []byte{':'})
	parseParts(fields, v...)
}
// parseParts stores each element of parts into the destination pointer at the
// same position in v, converting according to the destination's type:
//
//   - *string receives the raw text;
//   - *int and *int64 receive the result of the decimal conversion, with
//     conversion errors ignored because some configuration files misbehave;
//   - *[]string receives the comma-separated items (an empty, non-nil slice
//     when the field is empty).
//
// Fields without a matching destination are ignored and destinations without
// a matching field are left untouched. Any other destination type is a
// programming error and causes a panic.
func parseParts(parts [][]byte, v ...interface{}) {
	for idx := 0; idx < len(parts) && idx < len(v); idx++ {
		field := string(parts[idx])

		// The destination's type selects the conversion, scanf() style.
		switch dst := v[idx].(type) {
		case *string:
			*dst = field
		case *int:
			n, _ := strconv.Atoi(field)
			*dst = n
		case *int64:
			n, _ := strconv.ParseInt(field, 10, 64)
			*dst = n
		case *[]string:
			if field == "" {
				*dst = []string{}
			} else {
				*dst = strings.Split(field, ",")
			}
		default:
			// Someone passed an unsupported destination; fail loudly.
			panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", dst))
		}
	}
}
// ParsePasswdFile opens the passwd-formatted file at path and returns every
// entry in it. It is the unfiltered form of ParsePasswdFileFilter.
func ParsePasswdFile(path string) ([]User, error) {
	return ParsePasswdFileFilter(path, nil)
}
// ParsePasswd parses passwd-formatted data from passwd and returns every
// entry. It is ParsePasswdFilter with a nil filter.
func ParsePasswd(passwd io.Reader) ([]User, error) {
	return ParsePasswdFilter(passwd, nil)
}
// ParsePasswdFileFilter opens the passwd-formatted file at path and returns
// the entries accepted by filter (all entries when filter is nil). The file is
// closed before returning; an error opening it is returned unchanged.
func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) {
	f, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	return ParsePasswdFilter(f, filter)
}
// ParsePasswdFilter reads passwd-formatted data from r line by line and
// returns the entries for which filter returns true (every entry when filter
// is nil). Lines that are empty after trimming surrounding whitespace are
// skipped. A nil r is an error, and a scanner error discards all entries read
// so far.
//
// Each line has the layout described in passwd(5):
//
//	name:password:UID:GID:GECOS:directory:shell
//	root:x:0:0:root:/root:/bin/bash
//	adm:x:3:4:adm:/var/adm:/bin/false
func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
	if r == nil {
		return nil, errors.New("nil source for passwd-formatted data")
	}

	users := []User{}
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		entry := bytes.TrimSpace(scanner.Bytes())
		if len(entry) == 0 {
			continue
		}

		var u User
		parseLine(entry, &u.Name, &u.Pass, &u.Uid, &u.Gid, &u.Gecos, &u.Home, &u.Shell)
		if filter != nil && !filter(u) {
			continue
		}
		users = append(users, u)
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return users, nil
}
// ParseGroupFile opens the group-formatted file at path and returns every
// entry in it. It is the unfiltered form of ParseGroupFileFilter.
func ParseGroupFile(path string) ([]Group, error) {
	return ParseGroupFileFilter(path, nil)
}
// ParseGroup parses group-formatted data from group and returns every entry.
// It is ParseGroupFilter with a nil filter.
func ParseGroup(group io.Reader) ([]Group, error) {
	return ParseGroupFilter(group, nil)
}
// ParseGroupFileFilter opens the group-formatted file at path and returns the
// entries accepted by filter (all entries when filter is nil). The file is
// closed before returning; an error opening it is returned unchanged.
func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) {
	f, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	return ParseGroupFilter(f, filter)
}
// ParseGroupFilter reads group-formatted data from r line by line and returns
// the entries for which filter returns true (every entry when filter is nil).
// Blank lines and lines whose first non-blank character is '#' are skipped.
//
// A nil r is an error. Reaching EOF is not an error. On any other read error
// the entries collected so far are returned together with that error.
func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
	if r == nil {
		return nil, errors.New("nil source for group-formatted data")
	}
	rd := bufio.NewReader(r)
	out := []Group{}

	// Read the file line-by-line.
	for {
		var (
			isPrefix  bool
			wholeLine []byte
			err       error
		)

		// Read the next line. A line longer than the reader's buffer is
		// delivered in several chunks (isPrefix is true for all but the
		// last one); the chunks are accumulated in wholeLine.
		for {
			var line []byte
			line, isPrefix, err = rd.ReadLine()
			if err != nil {
				// We should return no error if EOF is reached
				// without a match.
				if err == io.EOF {
					err = nil
				}
				return out, err
			}

			// Simple common case: line is short enough to fit in a
			// single reader's buffer. No copy is made here; the line
			// is parsed below before the next ReadLine call.
			if !isPrefix && len(wholeLine) == 0 {
				wholeLine = line
				break
			}

			wholeLine = append(wholeLine, line...)

			// Check if we read the whole line already.
			if !isPrefix {
				break
			}
		}

		// There's no spec for /etc/passwd or /etc/group, but we try to follow
		// the same rules as the glibc parser, which allows comments and blank
		// space at the beginning of a line.
		wholeLine = bytes.TrimSpace(wholeLine)
		if len(wholeLine) == 0 || wholeLine[0] == '#' {
			continue
		}

		// see: man 5 group
		//  group_name:password:GID:user_list
		// Name:Pass:Gid:List
		//  root:x:0:root
		//  adm:x:4:root,adm,daemon
		p := Group{}
		parseLine(wholeLine, &p.Name, &p.Pass, &p.Gid, &p.List)

		if filter == nil || filter(p) {
			out = append(out, p)
		}
	}
}
// ExecUser is the result of resolving a user specification with GetExecUser:
// the numeric ids and home directory to use for a process.
type ExecUser struct {
	Uid   int   // numeric user id
	Gid   int   // numeric primary group id
	Sgids []int // supplementary group ids
	Home  string
}
// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the
// given file paths and uses that data as the arguments to GetExecUser. If the
// files cannot be opened for any reason, the error is ignored and a nil
// io.Reader is passed instead.
func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) {
	var passwd, group io.Reader

	// The readers are assigned only when the open succeeded, so a failed
	// open leaves a truly nil io.Reader rather than an interface wrapping a
	// nil *os.File.
	if passwdFile, err := os.Open(passwdPath); err == nil {
		passwd = passwdFile
		defer passwdFile.Close()
	}

	if groupFile, err := os.Open(groupPath); err == nil {
		group = groupFile
		defer groupFile.Close()
	}

	return GetExecUser(userSpec, defaults, passwd, group)
}
// GetExecUser parses a user specification string (using the passwd and group
// readers as sources for /etc/passwd and /etc/group data, respectively). In
// the case of blank fields or missing data from the sources, the values in
// defaults are used.
//
// GetExecUser will return an error if a user or group literal could not be
// found in any entry in passwd and group respectively.
//
// Examples of valid user specifications are:
//   - ""
//   - "user"
//   - "uid"
//   - "user:group"
//   - "uid:gid"
//   - "user:gid"
//   - "uid:group"
//
// It should be noted that if you specify a numeric user or group id, they will
// not be evaluated as usernames (only the metadata will be filled). So attempting
// to parse a user with user.Name = "1337" will produce the user with a UID of
// 1337.
func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) {
	if defaults == nil {
		defaults = new(ExecUser)
	}

	// Copy over defaults.
	user := &ExecUser{
		Uid:   defaults.Uid,
		Gid:   defaults.Gid,
		Sgids: defaults.Sgids,
		Home:  defaults.Home,
	}

	// Sgids slice *cannot* be nil.
	if user.Sgids == nil {
		user.Sgids = []int{}
	}

	// Allow for userArg to have either "user" syntax, or optionally "user:group" syntax
	var userArg, groupArg string
	parseLine([]byte(userSpec), &userArg, &groupArg)

	// Convert userArg and groupArg to be numeric, so we don't have to execute
	// Atoi *twice* for each iteration over lines.
	uidArg, uidErr := strconv.Atoi(userArg)
	gidArg, gidErr := strconv.Atoi(groupArg)

	// Find the matching user.
	users, err := ParsePasswdFilter(passwd, func(u User) bool {
		if userArg == "" {
			// Default to current state of the user.
			return u.Uid == user.Uid
		}

		if uidErr == nil {
			// If the userArg is numeric, always treat it as a UID.
			return uidArg == u.Uid
		}

		return u.Name == userArg
	})

	// If we can't find the user, we have to bail. A parse error is only
	// fatal when a passwd source was supplied; with a nil source the lookup
	// simply yields no entries.
	if err != nil && passwd != nil {
		if userArg == "" {
			userArg = strconv.Itoa(user.Uid)
		}
		return nil, fmt.Errorf("unable to find user %s: %w", userArg, err)
	}

	var matchedUserName string
	if len(users) > 0 {
		// First match wins, even if there's more than one matching entry.
		matchedUserName = users[0].Name
		user.Uid = users[0].Uid
		user.Gid = users[0].Gid
		user.Home = users[0].Home
	} else if userArg != "" {
		// If we can't find a user with the given username, the only other valid
		// option is if it's a numeric username with no associated entry in passwd.

		if uidErr != nil {
			// Not numeric.
			return nil, fmt.Errorf("unable to find user %s: %w", userArg, ErrNoPasswdEntries)
		}
		user.Uid = uidArg

		// Must be inside valid uid range.
		if user.Uid < minID || user.Uid > maxID {
			return nil, ErrRange
		}

		// Okay, so it's numeric. We can just roll with this.
	}

	// On to the groups. If we matched a username, we need to do this because of
	// the supplementary group IDs.
	if groupArg != "" || matchedUserName != "" {
		groups, err := ParseGroupFilter(group, func(g Group) bool {
			// If the group argument isn't explicit, we'll just search for it.
			if groupArg == "" {
				// Check if user is a member of this group.
				for _, u := range g.List {
					if u == matchedUserName {
						return true
					}
				}
				return false
			}

			if gidErr == nil {
				// If the groupArg is numeric, always treat it as a GID.
				return gidArg == g.Gid
			}

			return g.Name == groupArg
		})
		// As with passwd above, errors only count when a group source was
		// actually supplied.
		if err != nil && group != nil {
			return nil, fmt.Errorf("unable to find groups for spec %v: %w", matchedUserName, err)
		}

		// Only start modifying user.Gid if it is in explicit form.
		if groupArg != "" {
			if len(groups) > 0 {
				// First match wins, even if there's more than one matching entry.
				user.Gid = groups[0].Gid
			} else {
				// If we can't find a group with the given name, the only other valid
				// option is if it's a numeric group name with no associated entry in group.

				if gidErr != nil {
					// Not numeric.
					return nil, fmt.Errorf("unable to find group %s: %w", groupArg, ErrNoGroupEntries)
				}
				user.Gid = gidArg

				// Must be inside valid gid range.
				if user.Gid < minID || user.Gid > maxID {
					return nil, ErrRange
				}

				// Okay, so it's numeric. We can just roll with this.
			}
		} else if len(groups) > 0 {
			// Supplementary group ids only make sense if in the implicit form.
			user.Sgids = make([]int, len(groups))
			for i, group := range groups {
				user.Sgids[i] = group.Gid
			}
		}
	}

	return user, nil
}
// GetAdditionalGroups looks up a list of groups by name or group id
// against the given /etc/group formatted data. If a group name cannot
// be found, an error will be returned. If a group id cannot be found,
// or the given group data is nil, the id will be returned as-is
// provided it is in the legal range.
//
// Each requested group resolves to the first matching entry of the group
// data. Requests that resolve to the same gid (the same name given twice, or
// a name together with its numeric id) contribute that gid once. The order of
// the returned gids is unspecified.
func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) {
	groups := []Group{}
	if group != nil {
		var err error
		groups, err = ParseGroupFilter(group, func(g Group) bool {
			for _, ag := range additionalGroups {
				if g.Name == ag || strconv.Itoa(g.Gid) == ag {
					return true
				}
			}
			return false
		})
		if err != nil {
			return nil, fmt.Errorf("Unable to find additional groups %v: %w", additionalGroups, err)
		}
	}

	gidMap := make(map[int]struct{})
	for _, ag := range additionalGroups {
		var found bool
		for _, g := range groups {
			// if we found a matched group either by name or gid, take the
			// first matched as correct
			if g.Name == ag || strconv.Itoa(g.Gid) == ag {
				// The request is satisfied even when this gid was already
				// collected for an earlier request; treating that case as
				// "not found" would wrongly reject a repeated group name.
				gidMap[g.Gid] = struct{}{}
				found = true
				break
			}
		}
		// we asked for a group but didn't find it. let's check to see
		// if we wanted a numeric group
		if !found {
			gid, err := strconv.ParseInt(ag, 10, 64)
			if err != nil {
				// Not a numeric ID either.
				return nil, fmt.Errorf("Unable to find group %s: %w", ag, ErrNoGroupEntries)
			}
			// Ensure gid is inside gid range.
			if gid < minID || gid > maxID {
				return nil, ErrRange
			}
			gidMap[int(gid)] = struct{}{}
		}
	}

	gids := []int{}
	for gid := range gidMap {
		gids = append(gids, gid)
	}
	return gids, nil
}
// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups
// that opens the groupPath given and gives it as an argument to
// GetAdditionalGroups. If groupPath cannot be opened, the error is ignored
// and a nil reader is passed instead.
func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) {
	var group io.Reader

	// Assign only on success so that a failed open leaves a truly nil
	// io.Reader, which GetAdditionalGroups checks for.
	if groupFile, err := os.Open(groupPath); err == nil {
		group = groupFile
		defer groupFile.Close()
	}
	return GetAdditionalGroups(additionalGroups, group)
}
// ParseSubIDFile opens the subid-formatted file at path and returns every
// entry in it. It is the unfiltered form of ParseSubIDFileFilter.
func ParseSubIDFile(path string) ([]SubID, error) {
	return ParseSubIDFileFilter(path, nil)
}
// ParseSubID parses subid-formatted data from subid and returns every entry.
// It is ParseSubIDFilter with a nil filter.
func ParseSubID(subid io.Reader) ([]SubID, error) {
	return ParseSubIDFilter(subid, nil)
}
// ParseSubIDFileFilter opens the subid-formatted file at path and returns the
// entries accepted by filter (all entries when filter is nil). The file is
// closed before returning; an error opening it is returned unchanged.
func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) {
	f, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	return ParseSubIDFilter(f, filter)
}
// ParseSubIDFilter reads subid-formatted data (see subuid(5)) from r line by
// line and returns the entries for which filter returns true (every entry
// when filter is nil). Lines that are empty after trimming surrounding
// whitespace are skipped. A nil r is an error, and a scanner error discards
// all entries read so far.
func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) {
	if r == nil {
		return nil, errors.New("nil source for subid-formatted data")
	}

	entries := []SubID{}
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		raw := bytes.TrimSpace(scanner.Bytes())
		if len(raw) == 0 {
			continue
		}

		var entry SubID
		parseLine(raw, &entry.Name, &entry.SubID, &entry.Count)
		if filter != nil && !filter(entry) {
			continue
		}
		entries = append(entries, entry)
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return entries, nil
}
// ParseIDMapFile opens the idmap-formatted file at path and returns every
// entry in it. It is the unfiltered form of ParseIDMapFileFilter.
func ParseIDMapFile(path string) ([]IDMap, error) {
	return ParseIDMapFileFilter(path, nil)
}
// ParseIDMap parses idmap-formatted data from r and returns every entry. It
// is ParseIDMapFilter with a nil filter.
func ParseIDMap(r io.Reader) ([]IDMap, error) {
	return ParseIDMapFilter(r, nil)
}
// ParseIDMapFileFilter opens the idmap-formatted file at path and returns the
// entries accepted by filter (all entries when filter is nil). The file is
// closed before returning; an error opening it is returned unchanged.
func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) {
	f, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}
	defer f.Close()

	return ParseIDMapFilter(f, filter)
}
// ParseIDMapFilter reads idmap-formatted data (see user_namespaces(7)) from r
// line by line and returns the entries for which filter returns true (every
// entry when filter is nil). Columns are separated by whitespace. Lines that
// are empty after trimming are skipped. A nil r is an error, and a scanner
// error discards all entries read so far.
func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) {
	if r == nil {
		return nil, errors.New("nil source for idmap-formatted data")
	}

	mappings := []IDMap{}
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		raw := bytes.TrimSpace(scanner.Bytes())
		if len(raw) == 0 {
			continue
		}

		var entry IDMap
		columns := bytes.Fields(raw)
		parseParts(columns, &entry.ID, &entry.ParentID, &entry.Count)
		if filter != nil && !filter(entry) {
			continue
		}
		mappings = append(mappings, entry)
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return mappings, nil
}

View file

@ -1,43 +0,0 @@
//go:build gofuzz
// +build gofuzz
package user
import (
"io"
"strings"
)
// IsDivisbleBy reports whether n is an exact multiple of divisibleby.
// A zero divisibleby causes a run-time panic (integer division by zero).
func IsDivisbleBy(n int, divisibleby int) bool {
	remainder := n % divisibleby
	return remainder == 0
}
// FuzzUser is a fuzzing entry point. The input is cut into five equally sized
// chunks which feed, in order: ParsePasswdFilter, the group data and the
// requested group of GetAdditionalGroups, and the passwd data and user spec of
// GetExecUser. Inputs that are empty or whose length is not a multiple of five
// are rejected with -1; otherwise 1 is returned. All results of the fuzzed
// calls are deliberately discarded.
func FuzzUser(data []byte) int {
	const pieces = 5
	if len(data) == 0 || !IsDivisbleBy(len(data), pieces) {
		return -1
	}

	size := len(data) / pieces
	var chunks [][]byte
	for off := 0; off < len(data); off += size {
		chunks = append(chunks, data[off:off+size])
	}

	_, _ = ParsePasswdFilter(strings.NewReader(string(chunks[0])), nil)

	var passwdSrc, groupSrc io.Reader

	// The same group reader is handed to both calls below.
	groupSrc = strings.NewReader(string(chunks[1]))
	_, _ = GetAdditionalGroups([]string{string(chunks[2])}, groupSrc)

	passwdSrc = strings.NewReader(string(chunks[3]))
	_, _ = GetExecUser(string(chunks[4]), nil, passwdSrc, groupSrc)
	return 1
}

View file

@ -1,5 +1,4 @@
package userns
// RunningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
var RunningInUserNS = runningInUserNS

View file

@ -3,14 +3,7 @@
package userns
import (
"strings"
"github.com/opencontainers/runc/libcontainer/user"
)
func FuzzUIDMap(data []byte) int {
uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
_ = uidMapInUserNS(uidmap)
func FuzzUIDMap(uidmap []byte) int {
_ = uidMapInUserNS(string(uidmap))
return 1
}

View file

@ -1,9 +1,10 @@
package userns
import (
"bufio"
"fmt"
"os"
"sync"
"github.com/opencontainers/runc/libcontainer/user"
)
var (
@ -12,26 +13,43 @@ var (
)
// runningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
//
// Originally copied from https://github.com/lxc/incus/blob/e45085dd42f826b3c8c3228e9733c0b6f998eafe/shared/util.go#L678-L700.
func runningInUserNS() bool {
nsOnce.Do(func() {
uidmap, err := user.CurrentProcessUIDMap()
file, err := os.Open("/proc/self/uid_map")
if err != nil {
// This kernel-provided file only exists if user namespaces are supported
// This kernel-provided file only exists if user namespaces are supported.
return
}
inUserNS = uidMapInUserNS(uidmap)
defer file.Close()
buf := bufio.NewReader(file)
l, _, err := buf.ReadLine()
if err != nil {
return
}
inUserNS = uidMapInUserNS(string(l))
})
return inUserNS
}
func uidMapInUserNS(uidmap []user.IDMap) bool {
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
func uidMapInUserNS(uidMap string) bool {
if uidMap == "" {
// File exist but empty (the initial state when userns is created,
// see user_namespaces(7)).
return true
}
var a, b, c int64
if _, err := fmt.Sscanf(uidMap, "%d %d %d", &a, &b, &c); err != nil {
// Assume we are in a regular, non user namespace.
return false
}
return true
// As per user_namespaces(7), /proc/self/uid_map of
// the initial user namespace shows 0 0 4294967295.
initNS := a == 0 && b == 0 && c == 4294967295
return !initNS
}

View file

@ -3,8 +3,6 @@
package userns
import "github.com/opencontainers/runc/libcontainer/user"
// runningInUserNS is a stub for non-Linux systems
// Always returns false
func runningInUserNS() bool {
@ -13,6 +11,6 @@ func runningInUserNS() bool {
// uidMapInUserNS is a stub for non-Linux systems
// Always returns false
func uidMapInUserNS(uidmap []user.IDMap) bool {
func uidMapInUserNS(uidMap string) bool {
return false
}

View file

@ -0,0 +1,156 @@
package userns
import (
"fmt"
"os"
"sort"
"strings"
"sync"
"syscall"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
)
// Mapping is a set of uid and gid mappings describing a user namespace. It is
// the request type passed to Handles.Get.
type Mapping struct {
	UIDMappings []configs.IDMap
	GIDMappings []configs.IDMap
}
// toSys converts the uid and gid mappings into the syscall.SysProcIDMap form
// used by syscall.SysProcAttr. An empty mapping list converts to a nil slice.
func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) {
	// The result is built with append on a nil slice on purpose, so that
	// an empty input yields nil rather than an empty non-nil slice.
	convert := func(mappings []configs.IDMap) (sys []syscall.SysProcIDMap) {
		for _, entry := range mappings {
			sys = append(sys, syscall.SysProcIDMap{
				ContainerID: int(entry.ContainerID),
				HostID:      int(entry.HostID),
				Size:        int(entry.Size),
			})
		}
		return sys
	}
	return convert(m.UIDMappings), convert(m.GIDMappings)
}
// id returns a unique identifier for this mapping, agnostic of the order of
// the uid and gid mappings (because the order doesn't matter to the kernel).
// The set of userns handles is indexed using this ID.
//
// The identifier has the form "uid=<list>;gid=<list>", where each list holds
// the sorted, comma-joined "container:host:size" triples.
func (m Mapping) id() string {
	encode := func(mappings []configs.IDMap) string {
		var triples []string
		for _, entry := range mappings {
			triples = append(triples, fmt.Sprintf("%d:%d:%d", entry.ContainerID, entry.HostID, entry.Size))
		}
		// The particular order is irrelevant; sorting just makes it stable.
		sort.Strings(triples)
		return strings.Join(triples, ",")
	}
	return "uid=" + encode(m.UIDMappings) + ";gid=" + encode(m.GIDMappings)
}
// Handles is a cache of user namespace files, keyed by Mapping.id(). The zero
// value is ready to use; the map is allocated on first Get.
type Handles struct {
	m    sync.Mutex          // guards maps
	maps map[string]*os.File // cached userns files, indexed by Mapping.id()
}
// Release closes every cached file and empties the cache. Files previously
// returned from Get() are independent duplicates and keep working after
// Release(). The same Handles can be used again afterwards.
func (hs *Handles) Release() {
	hs.m.Lock()
	defer hs.m.Unlock()

	// GC would eventually close these anyway; closing them eagerly is just
	// tidier. Close errors are deliberately ignored.
	for key := range hs.maps {
		_ = hs.maps[key].Close()
	}
	hs.maps = nil
}
// spawnProc starts a helper process from /proc/self/exe in a new user
// namespace (CLONE_NEWUSER) configured with the uid and gid mappings of req,
// with Ptrace enabled in its SysProcAttr. It returns the started process; the
// caller is responsible for killing and reaping it.
func spawnProc(req Mapping) (*os.Process, error) {
	// We need to spawn a subprocess with the requested mappings, which is
	// unfortunately quite expensive. The "safe" way of doing this is natively
	// with Go (and then spawning something like "sleep infinity"), but
	// execve() is a waste of cycles because we just need some process to have
	// the right mapping, we don't care what it's executing. The "unsafe"
	// option of doing a clone() behind the back of Go is probably okay in
	// theory as long as we just do kill(getpid(), SIGSTOP). However, if we
	// tell Go to put the new process into PTRACE_TRACEME mode, we can avoid
	// the exec and not have to faff around with the mappings.
	//
	// Note that Go's stdlib does not support newuidmap, but in the case of
	// id-mapped mounts, it seems incredibly unlikely that the user will be
	// requesting us to do a remapping as an unprivileged user with mappings
	// they have privileges over.
	logrus.Debugf("spawning dummy process for id-mapping %s", req.id())
	uidMappings, gidMappings := req.toSys()
	// We don't need to use /proc/thread-self here because the exe mm of a
	// thread-group is guaranteed to be the same for all threads by definition.
	// This lets us avoid having to do runtime.LockOSThread.
	return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{
		Sys: &syscall.SysProcAttr{
			Cloneflags:                 unix.CLONE_NEWUSER,
			UidMappings:                uidMappings,
			GidMappings:                gidMappings,
			GidMappingsEnableSetgroups: false,
			// Put the process into PTRACE_TRACEME mode to allow us to get the
			// userns without having a proper execve() target.
			Ptrace: true,
		},
	})
}
// dupFile returns a new *os.File that refers to a duplicate of f's file
// descriptor, created with fcntl(F_DUPFD_CLOEXEC) so the duplicate is
// close-on-exec. The new file carries the same name as f. A failing fcntl is
// reported as an *os.SyscallError.
func dupFile(f *os.File) (*os.File, error) {
	newFd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0)
	if err != nil {
		return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err)
	}
	return os.NewFile(uintptr(newFd), f.Name()), nil
}
// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested
// mapping. The user namespace files obtained from the helper processes are
// cached, so if equivalent user namespace mappings are requested, the same
// user namespace will be returned. The caller is responsible for closing the
// returned file descriptor.
func (hs *Handles) Get(req Mapping) (file *os.File, err error) {
	hs.m.Lock()
	defer hs.m.Unlock()

	if hs.maps == nil {
		hs.maps = make(map[string]*os.File)
	}

	key := req.id()

	// Cache hit: hand out an independent duplicate of the stored file.
	if cached, ok := hs.maps[key]; ok {
		return dupFile(cached)
	}

	proc, err := spawnProc(req)
	if err != nil {
		return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", key, err)
	}
	// Make sure the helper process is killed and reaped once we are done.
	// Errors are ignored because there is not much we can do about them.
	defer func() {
		_ = proc.Kill()
		_, _ = proc.Wait()
	}()

	// Stash away a handle to the userns file. This is neater than keeping
	// the process alive, because Go's GC can handle files much better than
	// leaked processes, and having long-living useless processes seems
	// less than ideal.
	nsFile, err := os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
	if err != nil {
		return nil, err
	}
	hs.maps[key] = nsFile

	// Duplicate the file, to make sure the lifecycle of each *os.File we
	// return is independent.
	return dupFile(nsFile)
}

View file

@ -19,13 +19,14 @@ package utils
import (
"fmt"
"os"
"runtime"
"golang.org/x/sys/unix"
)
// MaxSendfdLen is the maximum length of the name of a file descriptor being
// sent using SendFd. The name of the file handle returned by RecvFd will never
// be larger than this value.
// MaxNameLen is the maximum length of the name of a file descriptor being sent
// using SendFile. The name of the file handle returned by RecvFile will never be
// larger than this value.
const MaxNameLen = 4096
// oobSpace is the size of the oob slice required to store a single FD. Note
@ -33,26 +34,21 @@ const MaxNameLen = 4096
// so sizeof(fd) = 4.
var oobSpace = unix.CmsgSpace(4)
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
// For some reason, unix.Recvmsg uses the length rather than the capacity
// when passing the msg_controllen and other attributes to recvmsg. So we
// have to actually set the length.
func RecvFile(socket *os.File) (_ *os.File, Err error) {
name := make([]byte, MaxNameLen)
oob := make([]byte, oobSpace)
sockfd := socket.Fd()
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
if err != nil {
return nil, err
}
if n >= MaxNameLen || oobn != oobSpace {
return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
}
// Truncate.
name = name[:n]
oob = oob[:oobn]
@ -61,36 +57,63 @@ func RecvFd(socket *os.File) (*os.File, error) {
if err != nil {
return nil, err
}
// We cannot control how many SCM_RIGHTS we receive, and upon receiving
// them all of the descriptors are installed in our fd table, so we need to
// parse all of the SCM_RIGHTS we received in order to close all of the
// descriptors on error.
var fds []int
defer func() {
for i, fd := range fds {
if i == 0 && Err == nil {
// Only close the first one on error.
continue
}
// Always close extra ones.
_ = unix.Close(fd)
}
}()
var lastErr error
for _, scm := range scms {
if scm.Header.Type == unix.SCM_RIGHTS {
scmFds, err := unix.ParseUnixRights(&scm)
if err != nil {
lastErr = err
} else {
fds = append(fds, scmFds...)
}
}
}
if lastErr != nil {
return nil, lastErr
}
// We do this after collecting the fds to make sure we close them all when
// returning an error here.
if len(scms) != 1 {
return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
}
scm := scms[0]
fds, err := unix.ParseUnixRights(&scm)
if err != nil {
return nil, err
}
if len(fds) != 1 {
return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
}
fd := uintptr(fds[0])
return os.NewFile(fd, string(name)), nil
return os.NewFile(uintptr(fds[0]), string(name)), nil
}
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
func SendFd(socket *os.File, name string, fd uintptr) error {
// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
// included so that if the other end uses RecvFile, the file will have the same
// name information.
func SendFile(socket *os.File, file *os.File) error {
name := file.Name()
if len(name) >= MaxNameLen {
return fmt.Errorf("sendfd: filename too long: %s", name)
}
return SendFds(socket, []byte(name), int(fd))
err := SendRawFd(socket, name, file.Fd())
runtime.KeepAlive(file)
return err
}
// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket.
func SendFds(socket *os.File, msg []byte, fds ...int) error {
oob := unix.UnixRights(fds...)
return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
// msg is sent as the regular payload of the message, and fd is attached as
// the control message built by unix.UnixRights. The error from unix.Sendmsg is
// returned unchanged.
func SendRawFd(socket *os.File, msg string, fd uintptr) error {
	oob := unix.UnixRights(int(fd))
	return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
}

View file

@ -3,15 +3,12 @@ package utils
import (
"encoding/binary"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"strconv"
"strings"
"unsafe"
securejoin "github.com/cyphar/filepath-securejoin"
"golang.org/x/sys/unix"
)
@ -43,6 +40,9 @@ func ExitStatus(status unix.WaitStatus) int {
}
// WriteJSON writes the provided struct v to w using standard json marshaling
// without a trailing newline. This is used instead of json.Encoder because
// there might be a problem in json decoder in some cases, see:
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
func WriteJSON(w io.Writer, v interface{}) error {
data, err := json.Marshal(v)
if err != nil {
@ -99,52 +99,16 @@ func stripRoot(root, path string) string {
return CleanPath("/" + path)
}
// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
// corresponding to the unsafePath resolved within the root. Before passing the
// fd, this path is verified to have been inside the root -- so operating on it
// through the passed fdpath should be safe. Do not access this path through
// the original path strings, and do not attempt to use the pathname outside of
// the passed closure (the file handle will be freed once the closure returns).
func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
// Remove the root then forcefully resolve inside the root.
unsafePath = stripRoot(root, unsafePath)
path, err := securejoin.SecureJoin(root, unsafePath)
if err != nil {
return fmt.Errorf("resolving path inside rootfs failed: %w", err)
}
// Open the target path.
fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
if err != nil {
return fmt.Errorf("open o_path procfd: %w", err)
}
defer fh.Close()
// Double-check the path is the one we expected.
procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
if realpath, err := os.Readlink(procfd); err != nil {
return fmt.Errorf("procfd verification failed: %w", err)
} else if realpath != path {
return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
}
// Run the closure.
return fn(procfd)
}
// SearchLabels searches a list of key-value pairs for the provided key and
// returns the corresponding value. The pairs must be separated with '='.
func SearchLabels(labels []string, query string) string {
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == query {
return parts[1]
// SearchLabels searches through a list of key=value pairs for a given key,
// returning its value, and the binary flag telling whether the key exist.
func SearchLabels(labels []string, key string) (string, bool) {
key += "="
for _, s := range labels {
if strings.HasPrefix(s, key) {
return s[len(key):], true
}
}
return ""
return "", false
}
// Annotations returns the bundle path and user defined annotations from the

View file

@ -5,10 +5,16 @@ package utils
import (
"fmt"
"math"
"os"
"path/filepath"
"runtime"
"strconv"
"sync"
_ "unsafe" // for go:linkname
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@ -24,12 +30,39 @@ func EnsureProcHandle(fh *os.File) error {
return nil
}
var (
haveCloseRangeCloexecBool bool
haveCloseRangeCloexecOnce sync.Once
)
// haveCloseRangeCloexec reports whether close_range(2) with
// CLOSE_RANGE_CLOEXEC works on this system. The probe runs only once (guarded
// by haveCloseRangeCloexecOnce) and its result is cached in
// haveCloseRangeCloexecBool. If the probe cannot even be set up, the result
// is false.
func haveCloseRangeCloexec() bool {
	haveCloseRangeCloexecOnce.Do(func() {
		// Make sure we're not closing a random file descriptor.
		tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
		if err != nil {
			return
		}
		defer unix.Close(tmpFd)

		// Probe on the single-descriptor range [tmpFd, tmpFd].
		err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
		// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
		// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
		// other potential error would imply that even the most basic close
		// operation wouldn't work.
		haveCloseRangeCloexecBool = err == nil
	})
	return haveCloseRangeCloexecBool
}
type fdFunc func(fd int)
// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
// the current process.
func fdRangeFrom(minFd int, fn fdFunc) error {
fdDir, err := os.Open("/proc/self/fd")
procSelfFd, closer := ProcThreadSelf("fd")
defer closer()
fdDir, err := os.Open(procSelfFd)
if err != nil {
return err
}
@ -67,6 +100,12 @@ func fdRangeFrom(minFd int, fn fdFunc) error {
// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
// equal to minFd in the current process.
//
// When close_range(CLOSE_RANGE_CLOEXEC) is usable (see haveCloseRangeCloexec)
// a single call covers the range from minFd to math.MaxUint; otherwise every
// open descriptor is visited through fdRangeFrom and marked with
// unix.CloseOnExec.
func CloseExecFrom(minFd int) error {
	// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
	if haveCloseRangeCloexec() {
		err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
		// os.NewSyscallError returns nil for a nil err, so this is also the
		// success path.
		return os.NewSyscallError("close_range", err)
	}
	// Otherwise, fall back to the standard loop.
	return fdRangeFrom(minFd, unix.CloseOnExec)
}
@ -89,7 +128,8 @@ func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
// *os.File operations would apply to the wrong file). This function is only
// intended to be called from the last stage of runc init.
func UnsafeCloseFrom(minFd int) error {
// We must not close some file descriptors.
// We cannot use close_range(2) even if it is available, because we must
// not close some file descriptors.
return fdRangeFrom(minFd, func(fd int) {
if runtime_IsPollDescriptor(uintptr(fd)) {
// These are the Go runtimes internal netpoll file descriptors.
@ -107,11 +147,117 @@ func UnsafeCloseFrom(minFd int) error {
})
}
// NewSockPair returns a new unix socket pair
func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
// NewSockPair creates a new SOCK_STREAM unix socket pair with close-on-exec
// set on both ends and wraps each end in an *os.File. The parent file is
// named name+"-p" and the child file name+"-c". On failure both files are nil
// and the socketpair error is returned unchanged.
func NewSockPair(name string) (parent, child *os.File, err error) {
	pair, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
	if err != nil {
		return nil, nil, err
	}

	// The second descriptor goes to the parent, the first to the child.
	parent = os.NewFile(uintptr(pair[1]), name+"-p")
	child = os.NewFile(uintptr(pair[0]), name+"-c")
	return parent, child, nil
}
// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
// corresponding to the unsafePath resolved within the root. Before passing the
// fd, this path is verified to have been inside the root -- so operating on it
// through the passed fdpath should be safe. Do not access this path through
// the original path strings, and do not attempt to use the pathname outside of
// the passed closure (the file handle will be freed once the closure returns).
func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
// Remove the root then forcefully resolve inside the root.
unsafePath = stripRoot(root, unsafePath)
path, err := securejoin.SecureJoin(root, unsafePath)
if err != nil {
return fmt.Errorf("resolving path inside rootfs failed: %w", err)
}
procSelfFd, closer := ProcThreadSelf("fd/")
defer closer()
// Open the target path.
fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
if err != nil {
return fmt.Errorf("open o_path procfd: %w", err)
}
defer fh.Close()
procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
// Double-check the path is the one we expected.
if realpath, err := os.Readlink(procfd); err != nil {
return fmt.Errorf("procfd verification failed: %w", err)
} else if realpath != path {
return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
}
return fn(procfd)
}
// ProcThreadSelfCloser is the cleanup callback returned by ProcThreadSelf and
// ProcThreadSelfFd. It must be called once the caller has finished using the
// returned path string.
type ProcThreadSelfCloser func()
var (
	// haveProcThreadSelf records whether /proc/thread-self/ could be stat'd.
	// It is written only inside the haveProcThreadSelfOnce probe.
	haveProcThreadSelf bool
	// haveProcThreadSelfOnce ensures the /proc/thread-self/ probe in
	// ProcThreadSelf runs a single time.
	haveProcThreadSelfOnce sync.Once
)
// ProcThreadSelf returns a path equivalent to /proc/thread-self/<subpath>,
// degrading gracefully on older kernels where /proc/thread-self does not
// exist. The subpath is appended verbatim -- SecureJoin is NOT used -- so the
// passed string needs to be trusted.
//
// The calling goroutine is locked to its OS thread before the path is built.
// The caller _must_ call the returned ProcThreadSelfCloser (which is
// runtime.UnlockOSThread) *exactly once* after it has finished using the
// returned path string.
func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
	// Probe for /proc/thread-self a single time and remember the answer.
	haveProcThreadSelfOnce.Do(func() {
		_, err := os.Stat("/proc/thread-self/")
		if err != nil {
			logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
			return
		}
		haveProcThreadSelf = true
	})

	// The thread has to stay locked until the caller is done with the path:
	// any non-atomic use of it (for example opening a file and then reading
	// it) could otherwise be interrupted by the Go runtime swapping out the
	// underlying thread and killing the original one, which produces
	// extremely hard-to-debug failures in the caller. The pre-3.17 fallback
	// below makes every use non-atomic, because the same swap could happen
	// between unix.Gettid() and the path operations.
	//
	// Strictly speaking, truly atomic users of /proc/thread-self/ would not
	// need the lock, but there are only one or two of those and it is better
	// to be safe than sorry.
	runtime.LockOSThread()

	if haveProcThreadSelf {
		return "/proc/thread-self/" + subpath, runtime.UnlockOSThread
	}

	// Pre-3.17 kernels did not have /proc/thread-self, so emulate it by hand.
	prefix := "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
	if _, err := os.Stat(prefix); err != nil {
		// This code is called from rootfs_linux.go, where we run inside the
		// container's pid namespace while /proc is still the host's procfs.
		// There is no real way to obtain the correct tid here: the kernel age
		// rules out a private fsopen("proc"), and even scanning NSpid in all
		// of /proc/self/task/*/status requires Linux 4.1.
		//
		// So /proc/self has to be accepted in this one specific case.
		switch os.Getpid() {
		case 1:
			logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
		default:
			// This should never happen, but the fallback should work in most cases...
			logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
		}
		prefix = "/proc/self/"
	}

	return prefix + subpath, runtime.UnlockOSThread
}
// ProcThreadSelfFd is a small wrapper around ProcThreadSelf that makes it
// easier to build a /proc/thread-self handle for a given file descriptor.
//
// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
// formats the number with strconv instead of fmt.Sprintf to avoid unneeded
// overhead. The returned closer has the same contract as ProcThreadSelf's.
func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
	fdName := strconv.FormatUint(uint64(fd), 10)
	return ProcThreadSelf("fd/" + fdName)
}

5
vendor/modules.txt vendored
View file

@ -986,12 +986,11 @@ github.com/opencontainers/go-digest/digestset
github.com/opencontainers/image-spec/identity
github.com/opencontainers/image-spec/specs-go
github.com/opencontainers/image-spec/specs-go/v1
# github.com/opencontainers/runc v1.1.12
## explicit; go 1.17
# github.com/opencontainers/runc v1.2.0-rc.1
## explicit; go 1.20
github.com/opencontainers/runc/libcontainer/cgroups
github.com/opencontainers/runc/libcontainer/configs
github.com/opencontainers/runc/libcontainer/devices
github.com/opencontainers/runc/libcontainer/user
github.com/opencontainers/runc/libcontainer/userns
github.com/opencontainers/runc/libcontainer/utils
# github.com/opencontainers/runtime-spec v1.2.0