moby/profiles/seccomp/seccomp_linux.go
Tianon Gravi 567c01f6d1 seccomp: add support for "clone3" syscall in default policy
This is a backport of 9f6b562dd1, adapted to avoid the refactoring that happened in d92739713c.

Original commit message is as follows:

> If no seccomp policy is requested, then the built-in default policy in
> dockerd applies. This has no rule for "clone3" defined, nor any default
> errno defined. So when runc receives the config it attempts to determine
> a default errno, using logic defined in its commit:
>
>   opencontainers/runc@7a8d716
>
> As explained in the above commit message, runc uses a heuristic to
> decide which errno to return by default:
>
> [quote]
>   The solution applied here is to prepend a "stub" filter which returns
>   -ENOSYS if the requested syscall has a larger syscall number than any
>   syscall mentioned in the filter. The reason for this specific rule is
>   that syscall numbers are (roughly) allocated sequentially and thus newer
>   syscalls will (usually) have a larger syscall number -- thus causing our
>   filters to produce -ENOSYS if the filter was written before the syscall
>   existed.
> [/quote]
>
> Unfortunately clone3 appears to one of the edge cases that does not
> result in use of ENOSYS, instead ending up with the historical EPERM
> errno.
>
> Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use
> clone3 by default. If it sees ENOSYS then it will automatically
> fallback to using clone. Any other errno is treated as a fatal
> error. Thus when docker seccomp policy triggers EPERM from clone3,
> no fallback occurs and programs are thus unable to spawn threads.
>
> The clone3 syscall is much more complicated than clone, most notably its
> flags are not exposed as a directly argument any more. Instead they are
> hidden inside a struct. This means that seccomp filters are unable to
> apply policy based on values seen in flags. Thus we can't directly
> replicate the current "clone" filtering for "clone3". We can at least
> ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone"
> at which point we can filter on flags.

Signed-off-by: Tianon Gravi <admwiggin@gmail.com>
Co-authored-by: Daniel P. Berrangé <berrange@redhat.com>
2021-09-13 08:56:21 -07:00

174 lines
4.7 KiB
Go

//go:generate go run -tags 'seccomp' generate.go
package seccomp // import "github.com/docker/docker/profiles/seccomp"
import (
"encoding/json"
"errors"
"fmt"
"runtime"
specs "github.com/opencontainers/runtime-spec/specs-go"
)
// GetDefaultProfile returns the default seccomp profile.
func GetDefaultProfile(rs *specs.Spec) (*specs.LinuxSeccomp, error) {
return setupSeccomp(DefaultProfile(), rs)
}
// LoadProfile takes a json string and decodes the seccomp profile.
func LoadProfile(body string, rs *specs.Spec) (*specs.LinuxSeccomp, error) {
var config Seccomp
if err := json.Unmarshal([]byte(body), &config); err != nil {
return nil, fmt.Errorf("Decoding seccomp profile failed: %v", err)
}
return setupSeccomp(&config, rs)
}
// libseccomp string => seccomp arch
var nativeToSeccomp = map[string]specs.Arch{
"x86": specs.ArchX86,
"amd64": specs.ArchX86_64,
"arm": specs.ArchARM,
"arm64": specs.ArchAARCH64,
"mips64": specs.ArchMIPS64,
"mips64n32": specs.ArchMIPS64N32,
"mipsel64": specs.ArchMIPSEL64,
"mips3l64n32": specs.ArchMIPSEL64N32,
"mipsle": specs.ArchMIPSEL,
"ppc": specs.ArchPPC,
"ppc64": specs.ArchPPC64,
"ppc64le": specs.ArchPPC64LE,
"s390": specs.ArchS390,
"s390x": specs.ArchS390X,
}
// GOARCH => libseccomp string
var goToNative = map[string]string{
"386": "x86",
"amd64": "amd64",
"arm": "arm",
"arm64": "arm64",
"mips64": "mips64",
"mips64p32": "mips64n32",
"mips64le": "mipsel64",
"mips64p32le": "mips3l64n32",
"mipsle": "mipsel",
"ppc": "ppc",
"ppc64": "ppc64",
"ppc64le": "ppc64le",
"s390": "s390",
"s390x": "s390x",
}
// inSlice tests whether a string is contained in a slice of strings or not.
// Comparison is case sensitive
func inSlice(slice []string, s string) bool {
for _, ss := range slice {
if s == ss {
return true
}
}
return false
}
func setupSeccomp(config *Seccomp, rs *specs.Spec) (*specs.LinuxSeccomp, error) {
if config == nil {
return nil, nil
}
// No default action specified, no syscalls listed, assume seccomp disabled
if config.DefaultAction == "" && len(config.Syscalls) == 0 {
return nil, nil
}
newConfig := &specs.LinuxSeccomp{}
if len(config.Architectures) != 0 && len(config.ArchMap) != 0 {
return nil, errors.New("'architectures' and 'archMap' were specified in the seccomp profile, use either 'architectures' or 'archMap'")
}
// if config.Architectures == 0 then libseccomp will figure out the architecture to use
if len(config.Architectures) != 0 {
newConfig.Architectures = config.Architectures
}
arch := goToNative[runtime.GOARCH]
seccompArch, archExists := nativeToSeccomp[arch]
if len(config.ArchMap) != 0 && archExists {
for _, a := range config.ArchMap {
if a.Arch == seccompArch {
newConfig.Architectures = append(newConfig.Architectures, a.Arch)
newConfig.Architectures = append(newConfig.Architectures, a.SubArches...)
break
}
}
}
newConfig.DefaultAction = config.DefaultAction
Loop:
// Loop through all syscall blocks and convert them to libcontainer format after filtering them
for _, call := range config.Syscalls {
if len(call.Excludes.Arches) > 0 {
if inSlice(call.Excludes.Arches, arch) {
continue Loop
}
}
if len(call.Excludes.Caps) > 0 {
for _, c := range call.Excludes.Caps {
if inSlice(rs.Process.Capabilities.Bounding, c) {
continue Loop
}
}
}
if call.Excludes.MinKernel != nil {
if ok, err := kernelGreaterEqualThan(*call.Excludes.MinKernel); err != nil {
return nil, err
} else if ok {
continue Loop
}
}
if len(call.Includes.Arches) > 0 {
if !inSlice(call.Includes.Arches, arch) {
continue Loop
}
}
if len(call.Includes.Caps) > 0 {
for _, c := range call.Includes.Caps {
if !inSlice(rs.Process.Capabilities.Bounding, c) {
continue Loop
}
}
}
if call.Includes.MinKernel != nil {
if ok, err := kernelGreaterEqualThan(*call.Includes.MinKernel); err != nil {
return nil, err
} else if !ok {
continue Loop
}
}
newCall := specs.LinuxSyscall{
Action: call.Action,
ErrnoRet: call.ErrnoRet,
}
if call.Name != "" && len(call.Names) != 0 {
return nil, errors.New("'name' and 'names' were specified in the seccomp profile, use either 'name' or 'names'")
}
if call.Name != "" {
newCall.Names = []string{call.Name}
} else {
newCall.Names = call.Names
}
// Loop through all the arguments of the syscall and convert them
for _, arg := range call.Args {
newCall.Args = append(newCall.Args, *arg)
}
newConfig.Syscalls = append(newConfig.Syscalls, newCall)
}
return newConfig, nil
}