567c01f6d1
This is a backport of 9f6b562dd1, adapted to avoid the refactoring that happened in d92739713c.

Original commit message is as follows:

> If no seccomp policy is requested, then the built-in default policy in
> dockerd applies. This has no rule for "clone3" defined, nor any default
> errno defined. So when runc receives the config it attempts to determine
> a default errno, using logic defined in its commit:
>
>   opencontainers/runc@7a8d716
>
> As explained in the above commit message, runc uses a heuristic to
> decide which errno to return by default:
>
> [quote]
>   The solution applied here is to prepend a "stub" filter which returns
>   -ENOSYS if the requested syscall has a larger syscall number than any
>   syscall mentioned in the filter. The reason for this specific rule is
>   that syscall numbers are (roughly) allocated sequentially and thus newer
>   syscalls will (usually) have a larger syscall number -- thus causing our
>   filters to produce -ENOSYS if the filter was written before the syscall
>   existed.
> [/quote]
>
> Unfortunately clone3 appears to be one of the edge cases that does not
> result in use of ENOSYS, instead ending up with the historical EPERM
> errno.
>
> Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use
> clone3 by default. If it sees ENOSYS then it will automatically
> fall back to using clone. Any other errno is treated as a fatal
> error. Thus when docker seccomp policy triggers EPERM from clone3,
> no fallback occurs and programs are thus unable to spawn threads.
>
> The clone3 syscall is much more complicated than clone, most notably its
> flags are not exposed as a direct argument any more. Instead they are
> hidden inside a struct. This means that seccomp filters are unable to
> apply policy based on values seen in flags. Thus we can't directly
> replicate the current "clone" filtering for "clone3". We can at least
> ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone"
> at which point we can filter on flags.

Signed-off-by: Tianon Gravi <admwiggin@gmail.com>
Co-authored-by: Daniel P. Berrangé <berrange@redhat.com>
102 lines
3.5 KiB
Go
102 lines
3.5 KiB
Go
package seccomp // import "github.com/docker/docker/profiles/seccomp"
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
)
|
|
|
|
// Seccomp represents the config for a seccomp profile for syscall restriction.
|
|
type Seccomp struct {
|
|
DefaultAction specs.LinuxSeccompAction `json:"defaultAction"`
|
|
// Architectures is kept to maintain backward compatibility with the old
|
|
// seccomp profile.
|
|
Architectures []specs.Arch `json:"architectures,omitempty"`
|
|
ArchMap []Architecture `json:"archMap,omitempty"`
|
|
Syscalls []*Syscall `json:"syscalls"`
|
|
}
|
|
|
|
// Architecture is used to represent a specific architecture
|
|
// and its sub-architectures
|
|
type Architecture struct {
|
|
Arch specs.Arch `json:"architecture"`
|
|
SubArches []specs.Arch `json:"subArchitectures"`
|
|
}
|
|
|
|
// Filter is used to conditionally apply Seccomp rules
|
|
type Filter struct {
|
|
Caps []string `json:"caps,omitempty"`
|
|
Arches []string `json:"arches,omitempty"`
|
|
|
|
// MinKernel describes the minimum kernel version the rule must be applied
|
|
// on, in the format "<kernel version>.<major revision>" (e.g. "3.12").
|
|
//
|
|
// When matching the kernel version of the host, minor revisions, and distro-
|
|
// specific suffixes are ignored, which means that "3.12.25-gentoo", "3.12-1-amd64",
|
|
// "3.12", and "3.12-rc5" are considered equal (kernel 3, major revision 12).
|
|
MinKernel *KernelVersion `json:"minKernel,omitempty"`
|
|
}
|
|
|
|
// Syscall is used to match a group of syscalls in Seccomp
|
|
type Syscall struct {
|
|
Name string `json:"name,omitempty"`
|
|
Names []string `json:"names,omitempty"`
|
|
Action specs.LinuxSeccompAction `json:"action"`
|
|
ErrnoRet *uint `json:"errnoRet,omitempty"`
|
|
Args []*specs.LinuxSeccompArg `json:"args"`
|
|
Comment string `json:"comment"`
|
|
Includes Filter `json:"includes"`
|
|
Excludes Filter `json:"excludes"`
|
|
}
|
|
|
|
// KernelVersion holds information about the kernel: its version and major
// revision, as in "<kernel>.<major>".
type KernelVersion struct {
	Kernel uint64 // Version of the Kernel (i.e., the "4" in "4.1.2-generic")
	Major  uint64 // Major revision of the Kernel (i.e., the "1" in "4.1.2-generic")
}
|
|
|
|
// String implements fmt.Stringer for KernelVersion
|
|
func (k *KernelVersion) String() string {
|
|
if k.Kernel > 0 || k.Major > 0 {
|
|
return fmt.Sprintf("%d.%d", k.Kernel, k.Major)
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// MarshalJSON implements json.Unmarshaler for KernelVersion
|
|
func (k *KernelVersion) MarshalJSON() ([]byte, error) {
|
|
return json.Marshal(k.String())
|
|
}
|
|
|
|
// UnmarshalJSON implements json.Marshaler for KernelVersion
|
|
func (k *KernelVersion) UnmarshalJSON(version []byte) error {
|
|
var (
|
|
ver string
|
|
err error
|
|
)
|
|
|
|
// make sure we have a string
|
|
if err = json.Unmarshal(version, &ver); err != nil {
|
|
return fmt.Errorf(`invalid kernel version: %s, expected "<kernel>.<major>": %v`, string(version), err)
|
|
}
|
|
if ver == "" {
|
|
return nil
|
|
}
|
|
parts := strings.SplitN(ver, ".", 3)
|
|
if len(parts) != 2 {
|
|
return fmt.Errorf(`invalid kernel version: %s, expected "<kernel>.<major>"`, string(version))
|
|
}
|
|
if k.Kernel, err = strconv.ParseUint(parts[0], 10, 8); err != nil {
|
|
return fmt.Errorf(`invalid kernel version: %s, expected "<kernel>.<major>": %v`, string(version), err)
|
|
}
|
|
if k.Major, err = strconv.ParseUint(parts[1], 10, 8); err != nil {
|
|
return fmt.Errorf(`invalid kernel version: %s, expected "<kernel>.<major>": %v`, string(version), err)
|
|
}
|
|
if k.Kernel == 0 && k.Major == 0 {
|
|
return fmt.Errorf(`invalid kernel version: %s, expected "<kernel>.<major>": version cannot be 0.0`, string(version))
|
|
}
|
|
return nil
|
|
}
|