moby/profiles/seccomp/seccomp.go
Tianon Gravi 567c01f6d1 seccomp: add support for "clone3" syscall in default policy
This is a backport of 9f6b562dd1, adapted to avoid the refactoring that happened in d92739713c.

Original commit message is as follows:

> If no seccomp policy is requested, then the built-in default policy in
> dockerd applies. This has no rule for "clone3" defined, nor any default
> errno defined. So when runc receives the config it attempts to determine
> a default errno, using logic defined in its commit:
>
>   opencontainers/runc@7a8d716
>
> As explained in the above commit message, runc uses a heuristic to
> decide which errno to return by default:
>
> [quote]
>   The solution applied here is to prepend a "stub" filter which returns
>   -ENOSYS if the requested syscall has a larger syscall number than any
>   syscall mentioned in the filter. The reason for this specific rule is
>   that syscall numbers are (roughly) allocated sequentially and thus newer
>   syscalls will (usually) have a larger syscall number -- thus causing our
>   filters to produce -ENOSYS if the filter was written before the syscall
>   existed.
> [/quote]
>
> Unfortunately clone3 appears to one of the edge cases that does not
> result in use of ENOSYS, instead ending up with the historical EPERM
> errno.
>
> Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use
> clone3 by default. If it sees ENOSYS then it will automatically
> fallback to using clone. Any other errno is treated as a fatal
> error. Thus when docker seccomp policy triggers EPERM from clone3,
> no fallback occurs and programs are thus unable to spawn threads.
>
> The clone3 syscall is much more complicated than clone, most notably its
> flags are not exposed as a directly argument any more. Instead they are
> hidden inside a struct. This means that seccomp filters are unable to
> apply policy based on values seen in flags. Thus we can't directly
> replicate the current "clone" filtering for "clone3". We can at least
> ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone"
> at which point we can filter on flags.

Signed-off-by: Tianon Gravi <admwiggin@gmail.com>
Co-authored-by: Daniel P. Berrangé <berrange@redhat.com>
2021-09-13 08:56:21 -07:00

102 lines
3.5 KiB
Go

package seccomp // import "github.com/docker/docker/profiles/seccomp"
import (
"encoding/json"
"fmt"
"strconv"
"strings"
"github.com/opencontainers/runtime-spec/specs-go"
)
// Seccomp represents the config for a seccomp profile for syscall restriction.
type Seccomp struct {
DefaultAction specs.LinuxSeccompAction `json:"defaultAction"`
// Architectures is kept to maintain backward compatibility with the old
// seccomp profile.
Architectures []specs.Arch `json:"architectures,omitempty"`
ArchMap []Architecture `json:"archMap,omitempty"`
Syscalls []*Syscall `json:"syscalls"`
}
// Architecture is used to represent a specific architecture
// and its sub-architectures
type Architecture struct {
Arch specs.Arch `json:"architecture"`
SubArches []specs.Arch `json:"subArchitectures"`
}
// Filter is used to conditionally apply Seccomp rules
type Filter struct {
Caps []string `json:"caps,omitempty"`
Arches []string `json:"arches,omitempty"`
// MinKernel describes the minimum kernel version the rule must be applied
// on, in the format "<kernel version>.<major revision>" (e.g. "3.12").
//
// When matching the kernel version of the host, minor revisions, and distro-
// specific suffixes are ignored, which means that "3.12.25-gentoo", "3.12-1-amd64",
// "3.12", and "3.12-rc5" are considered equal (kernel 3, major revision 12).
MinKernel *KernelVersion `json:"minKernel,omitempty"`
}
// Syscall is used to match a group of syscalls in Seccomp
type Syscall struct {
Name string `json:"name,omitempty"`
Names []string `json:"names,omitempty"`
Action specs.LinuxSeccompAction `json:"action"`
ErrnoRet *uint `json:"errnoRet,omitempty"`
Args []*specs.LinuxSeccompArg `json:"args"`
Comment string `json:"comment"`
Includes Filter `json:"includes"`
Excludes Filter `json:"excludes"`
}
// KernelVersion holds information about the kernel.
type KernelVersion struct {
Kernel uint64 // Version of the Kernel (i.e., the "4" in "4.1.2-generic")
Major uint64 // Major revision of the Kernel (i.e., the "1" in "4.1.2-generic")
}
// String implements fmt.Stringer for KernelVersion
func (k *KernelVersion) String() string {
if k.Kernel > 0 || k.Major > 0 {
return fmt.Sprintf("%d.%d", k.Kernel, k.Major)
}
return ""
}
// MarshalJSON implements json.Unmarshaler for KernelVersion
func (k *KernelVersion) MarshalJSON() ([]byte, error) {
return json.Marshal(k.String())
}
// UnmarshalJSON implements json.Marshaler for KernelVersion
func (k *KernelVersion) UnmarshalJSON(version []byte) error {
var (
ver string
err error
)
// make sure we have a string
if err = json.Unmarshal(version, &ver); err != nil {
return fmt.Errorf(`invalid kernel version: %s, expected "<kernel>.<major>": %v`, string(version), err)
}
if ver == "" {
return nil
}
parts := strings.SplitN(ver, ".", 3)
if len(parts) != 2 {
return fmt.Errorf(`invalid kernel version: %s, expected "<kernel>.<major>"`, string(version))
}
if k.Kernel, err = strconv.ParseUint(parts[0], 10, 8); err != nil {
return fmt.Errorf(`invalid kernel version: %s, expected "<kernel>.<major>": %v`, string(version), err)
}
if k.Major, err = strconv.ParseUint(parts[1], 10, 8); err != nil {
return fmt.Errorf(`invalid kernel version: %s, expected "<kernel>.<major>": %v`, string(version), err)
}
if k.Kernel == 0 && k.Major == 0 {
return fmt.Errorf(`invalid kernel version: %s, expected "<kernel>.<major>": version cannot be 0.0`, string(version))
}
return nil
}