567c01f6d1
This is a backport of 9f6b562dd1, adapted to avoid the refactoring that happened in d92739713c
. Original commit message is as follows: > If no seccomp policy is requested, then the built-in default policy in > dockerd applies. This has no rule for "clone3" defined, nor any default > errno defined. So when runc receives the config it attempts to determine > a default errno, using logic defined in its commit: > > opencontainers/runc@7a8d716 > > As explained in the above commit message, runc uses a heuristic to > decide which errno to return by default: > > [quote] > The solution applied here is to prepend a "stub" filter which returns > -ENOSYS if the requested syscall has a larger syscall number than any > syscall mentioned in the filter. The reason for this specific rule is > that syscall numbers are (roughly) allocated sequentially and thus newer > syscalls will (usually) have a larger syscall number -- thus causing our > filters to produce -ENOSYS if the filter was written before the syscall > existed. > [/quote] > > Unfortunately clone3 appears to be one of the edge cases that does not > result in use of ENOSYS, instead ending up with the historical EPERM > errno. > > Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use > clone3 by default. If it sees ENOSYS then it will automatically > fall back to using clone. Any other errno is treated as a fatal > error. Thus when docker seccomp policy triggers EPERM from clone3, > no fallback occurs and programs are thus unable to spawn threads. > > The clone3 syscall is much more complicated than clone, most notably its > flags are not exposed as a direct argument any more. Instead they are > hidden inside a struct. This means that seccomp filters are unable to > apply policy based on values seen in flags. Thus we can't directly > replicate the current "clone" filtering for "clone3". We can at least > ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone" > at which point we can filter on flags. Signed-off-by: Tianon Gravi <admwiggin@gmail.com> Co-authored-by: Daniel P. 
Berrangé <berrange@redhat.com>
722 lines
13 KiB
Go
722 lines
13 KiB
Go
// +build seccomp
|
|
|
|
package seccomp // import "github.com/docker/docker/profiles/seccomp"
|
|
|
|
import (
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
func arches() []Architecture {
|
|
return []Architecture{
|
|
{
|
|
Arch: specs.ArchX86_64,
|
|
SubArches: []specs.Arch{specs.ArchX86, specs.ArchX32},
|
|
},
|
|
{
|
|
Arch: specs.ArchAARCH64,
|
|
SubArches: []specs.Arch{specs.ArchARM},
|
|
},
|
|
{
|
|
Arch: specs.ArchMIPS64,
|
|
SubArches: []specs.Arch{specs.ArchMIPS, specs.ArchMIPS64N32},
|
|
},
|
|
{
|
|
Arch: specs.ArchMIPS64N32,
|
|
SubArches: []specs.Arch{specs.ArchMIPS, specs.ArchMIPS64},
|
|
},
|
|
{
|
|
Arch: specs.ArchMIPSEL64,
|
|
SubArches: []specs.Arch{specs.ArchMIPSEL, specs.ArchMIPSEL64N32},
|
|
},
|
|
{
|
|
Arch: specs.ArchMIPSEL64N32,
|
|
SubArches: []specs.Arch{specs.ArchMIPSEL, specs.ArchMIPSEL64},
|
|
},
|
|
{
|
|
Arch: specs.ArchS390X,
|
|
SubArches: []specs.Arch{specs.ArchS390},
|
|
},
|
|
}
|
|
}
|
|
|
|
// DefaultProfile defines the allowed syscalls for the default seccomp profile.
|
|
func DefaultProfile() *Seccomp {
|
|
nosys := uint(unix.ENOSYS)
|
|
syscalls := []*Syscall{
|
|
{
|
|
Names: []string{
|
|
"accept",
|
|
"accept4",
|
|
"access",
|
|
"adjtimex",
|
|
"alarm",
|
|
"bind",
|
|
"brk",
|
|
"capget",
|
|
"capset",
|
|
"chdir",
|
|
"chmod",
|
|
"chown",
|
|
"chown32",
|
|
"clock_adjtime",
|
|
"clock_adjtime64",
|
|
"clock_getres",
|
|
"clock_getres_time64",
|
|
"clock_gettime",
|
|
"clock_gettime64",
|
|
"clock_nanosleep",
|
|
"clock_nanosleep_time64",
|
|
"close",
|
|
"close_range",
|
|
"connect",
|
|
"copy_file_range",
|
|
"creat",
|
|
"dup",
|
|
"dup2",
|
|
"dup3",
|
|
"epoll_create",
|
|
"epoll_create1",
|
|
"epoll_ctl",
|
|
"epoll_ctl_old",
|
|
"epoll_pwait",
|
|
"epoll_pwait2",
|
|
"epoll_wait",
|
|
"epoll_wait_old",
|
|
"eventfd",
|
|
"eventfd2",
|
|
"execve",
|
|
"execveat",
|
|
"exit",
|
|
"exit_group",
|
|
"faccessat",
|
|
"faccessat2",
|
|
"fadvise64",
|
|
"fadvise64_64",
|
|
"fallocate",
|
|
"fanotify_mark",
|
|
"fchdir",
|
|
"fchmod",
|
|
"fchmodat",
|
|
"fchown",
|
|
"fchown32",
|
|
"fchownat",
|
|
"fcntl",
|
|
"fcntl64",
|
|
"fdatasync",
|
|
"fgetxattr",
|
|
"flistxattr",
|
|
"flock",
|
|
"fork",
|
|
"fremovexattr",
|
|
"fsetxattr",
|
|
"fstat",
|
|
"fstat64",
|
|
"fstatat64",
|
|
"fstatfs",
|
|
"fstatfs64",
|
|
"fsync",
|
|
"ftruncate",
|
|
"ftruncate64",
|
|
"futex",
|
|
"futex_time64",
|
|
"futimesat",
|
|
"getcpu",
|
|
"getcwd",
|
|
"getdents",
|
|
"getdents64",
|
|
"getegid",
|
|
"getegid32",
|
|
"geteuid",
|
|
"geteuid32",
|
|
"getgid",
|
|
"getgid32",
|
|
"getgroups",
|
|
"getgroups32",
|
|
"getitimer",
|
|
"getpeername",
|
|
"getpgid",
|
|
"getpgrp",
|
|
"getpid",
|
|
"getppid",
|
|
"getpriority",
|
|
"getrandom",
|
|
"getresgid",
|
|
"getresgid32",
|
|
"getresuid",
|
|
"getresuid32",
|
|
"getrlimit",
|
|
"get_robust_list",
|
|
"getrusage",
|
|
"getsid",
|
|
"getsockname",
|
|
"getsockopt",
|
|
"get_thread_area",
|
|
"gettid",
|
|
"gettimeofday",
|
|
"getuid",
|
|
"getuid32",
|
|
"getxattr",
|
|
"inotify_add_watch",
|
|
"inotify_init",
|
|
"inotify_init1",
|
|
"inotify_rm_watch",
|
|
"io_cancel",
|
|
"ioctl",
|
|
"io_destroy",
|
|
"io_getevents",
|
|
"io_pgetevents",
|
|
"io_pgetevents_time64",
|
|
"ioprio_get",
|
|
"ioprio_set",
|
|
"io_setup",
|
|
"io_submit",
|
|
"io_uring_enter",
|
|
"io_uring_register",
|
|
"io_uring_setup",
|
|
"ipc",
|
|
"kill",
|
|
"lchown",
|
|
"lchown32",
|
|
"lgetxattr",
|
|
"link",
|
|
"linkat",
|
|
"listen",
|
|
"listxattr",
|
|
"llistxattr",
|
|
"_llseek",
|
|
"lremovexattr",
|
|
"lseek",
|
|
"lsetxattr",
|
|
"lstat",
|
|
"lstat64",
|
|
"madvise",
|
|
"membarrier",
|
|
"memfd_create",
|
|
"mincore",
|
|
"mkdir",
|
|
"mkdirat",
|
|
"mknod",
|
|
"mknodat",
|
|
"mlock",
|
|
"mlock2",
|
|
"mlockall",
|
|
"mmap",
|
|
"mmap2",
|
|
"mprotect",
|
|
"mq_getsetattr",
|
|
"mq_notify",
|
|
"mq_open",
|
|
"mq_timedreceive",
|
|
"mq_timedreceive_time64",
|
|
"mq_timedsend",
|
|
"mq_timedsend_time64",
|
|
"mq_unlink",
|
|
"mremap",
|
|
"msgctl",
|
|
"msgget",
|
|
"msgrcv",
|
|
"msgsnd",
|
|
"msync",
|
|
"munlock",
|
|
"munlockall",
|
|
"munmap",
|
|
"nanosleep",
|
|
"newfstatat",
|
|
"_newselect",
|
|
"open",
|
|
"openat",
|
|
"openat2",
|
|
"pause",
|
|
"pidfd_open",
|
|
"pidfd_send_signal",
|
|
"pipe",
|
|
"pipe2",
|
|
"poll",
|
|
"ppoll",
|
|
"ppoll_time64",
|
|
"prctl",
|
|
"pread64",
|
|
"preadv",
|
|
"preadv2",
|
|
"prlimit64",
|
|
"pselect6",
|
|
"pselect6_time64",
|
|
"pwrite64",
|
|
"pwritev",
|
|
"pwritev2",
|
|
"read",
|
|
"readahead",
|
|
"readlink",
|
|
"readlinkat",
|
|
"readv",
|
|
"recv",
|
|
"recvfrom",
|
|
"recvmmsg",
|
|
"recvmmsg_time64",
|
|
"recvmsg",
|
|
"remap_file_pages",
|
|
"removexattr",
|
|
"rename",
|
|
"renameat",
|
|
"renameat2",
|
|
"restart_syscall",
|
|
"rmdir",
|
|
"rseq",
|
|
"rt_sigaction",
|
|
"rt_sigpending",
|
|
"rt_sigprocmask",
|
|
"rt_sigqueueinfo",
|
|
"rt_sigreturn",
|
|
"rt_sigsuspend",
|
|
"rt_sigtimedwait",
|
|
"rt_sigtimedwait_time64",
|
|
"rt_tgsigqueueinfo",
|
|
"sched_getaffinity",
|
|
"sched_getattr",
|
|
"sched_getparam",
|
|
"sched_get_priority_max",
|
|
"sched_get_priority_min",
|
|
"sched_getscheduler",
|
|
"sched_rr_get_interval",
|
|
"sched_rr_get_interval_time64",
|
|
"sched_setaffinity",
|
|
"sched_setattr",
|
|
"sched_setparam",
|
|
"sched_setscheduler",
|
|
"sched_yield",
|
|
"seccomp",
|
|
"select",
|
|
"semctl",
|
|
"semget",
|
|
"semop",
|
|
"semtimedop",
|
|
"semtimedop_time64",
|
|
"send",
|
|
"sendfile",
|
|
"sendfile64",
|
|
"sendmmsg",
|
|
"sendmsg",
|
|
"sendto",
|
|
"setfsgid",
|
|
"setfsgid32",
|
|
"setfsuid",
|
|
"setfsuid32",
|
|
"setgid",
|
|
"setgid32",
|
|
"setgroups",
|
|
"setgroups32",
|
|
"setitimer",
|
|
"setpgid",
|
|
"setpriority",
|
|
"setregid",
|
|
"setregid32",
|
|
"setresgid",
|
|
"setresgid32",
|
|
"setresuid",
|
|
"setresuid32",
|
|
"setreuid",
|
|
"setreuid32",
|
|
"setrlimit",
|
|
"set_robust_list",
|
|
"setsid",
|
|
"setsockopt",
|
|
"set_thread_area",
|
|
"set_tid_address",
|
|
"setuid",
|
|
"setuid32",
|
|
"setxattr",
|
|
"shmat",
|
|
"shmctl",
|
|
"shmdt",
|
|
"shmget",
|
|
"shutdown",
|
|
"sigaltstack",
|
|
"signalfd",
|
|
"signalfd4",
|
|
"sigprocmask",
|
|
"sigreturn",
|
|
"socket",
|
|
"socketcall",
|
|
"socketpair",
|
|
"splice",
|
|
"stat",
|
|
"stat64",
|
|
"statfs",
|
|
"statfs64",
|
|
"statx",
|
|
"symlink",
|
|
"symlinkat",
|
|
"sync",
|
|
"sync_file_range",
|
|
"syncfs",
|
|
"sysinfo",
|
|
"tee",
|
|
"tgkill",
|
|
"time",
|
|
"timer_create",
|
|
"timer_delete",
|
|
"timer_getoverrun",
|
|
"timer_gettime",
|
|
"timer_gettime64",
|
|
"timer_settime",
|
|
"timer_settime64",
|
|
"timerfd_create",
|
|
"timerfd_gettime",
|
|
"timerfd_gettime64",
|
|
"timerfd_settime",
|
|
"timerfd_settime64",
|
|
"times",
|
|
"tkill",
|
|
"truncate",
|
|
"truncate64",
|
|
"ugetrlimit",
|
|
"umask",
|
|
"uname",
|
|
"unlink",
|
|
"unlinkat",
|
|
"utime",
|
|
"utimensat",
|
|
"utimensat_time64",
|
|
"utimes",
|
|
"vfork",
|
|
"vmsplice",
|
|
"wait4",
|
|
"waitid",
|
|
"waitpid",
|
|
"write",
|
|
"writev",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
},
|
|
{
|
|
Names: []string{"ptrace"},
|
|
Action: specs.ActAllow,
|
|
Includes: Filter{
|
|
MinKernel: &KernelVersion{4, 8},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{"personality"},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{
|
|
{
|
|
Index: 0,
|
|
Value: 0x0,
|
|
Op: specs.OpEqualTo,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{"personality"},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{
|
|
{
|
|
Index: 0,
|
|
Value: 0x0008,
|
|
Op: specs.OpEqualTo,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{"personality"},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{
|
|
{
|
|
Index: 0,
|
|
Value: 0x20000,
|
|
Op: specs.OpEqualTo,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{"personality"},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{
|
|
{
|
|
Index: 0,
|
|
Value: 0x20008,
|
|
Op: specs.OpEqualTo,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{"personality"},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{
|
|
{
|
|
Index: 0,
|
|
Value: 0xffffffff,
|
|
Op: specs.OpEqualTo,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"sync_file_range2",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Arches: []string{"ppc64le"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"arm_fadvise64_64",
|
|
"arm_sync_file_range",
|
|
"sync_file_range2",
|
|
"breakpoint",
|
|
"cacheflush",
|
|
"set_tls",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Arches: []string{"arm", "arm64"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"arch_prctl",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Arches: []string{"amd64", "x32"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"modify_ldt",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Arches: []string{"amd64", "x32", "x86"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"s390_pci_mmio_read",
|
|
"s390_pci_mmio_write",
|
|
"s390_runtime_instr",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Arches: []string{"s390", "s390x"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"open_by_handle_at",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_DAC_READ_SEARCH"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"bpf",
|
|
"clone",
|
|
"clone3",
|
|
"fanotify_init",
|
|
"fsconfig",
|
|
"fsmount",
|
|
"fsopen",
|
|
"fspick",
|
|
"lookup_dcookie",
|
|
"mount",
|
|
"move_mount",
|
|
"name_to_handle_at",
|
|
"open_tree",
|
|
"perf_event_open",
|
|
"quotactl",
|
|
"setdomainname",
|
|
"sethostname",
|
|
"setns",
|
|
"syslog",
|
|
"umount",
|
|
"umount2",
|
|
"unshare",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_SYS_ADMIN"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"clone",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{
|
|
{
|
|
Index: 0,
|
|
Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
|
|
ValueTwo: 0,
|
|
Op: specs.OpMaskedEqual,
|
|
},
|
|
},
|
|
Excludes: Filter{
|
|
Caps: []string{"CAP_SYS_ADMIN"},
|
|
Arches: []string{"s390", "s390x"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"clone",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{
|
|
{
|
|
Index: 1,
|
|
Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
|
|
ValueTwo: 0,
|
|
Op: specs.OpMaskedEqual,
|
|
},
|
|
},
|
|
Comment: "s390 parameter ordering for clone is different",
|
|
Includes: Filter{
|
|
Arches: []string{"s390", "s390x"},
|
|
},
|
|
Excludes: Filter{
|
|
Caps: []string{"CAP_SYS_ADMIN"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"clone3",
|
|
},
|
|
Action: specs.ActErrno,
|
|
ErrnoRet: &nosys,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Excludes: Filter{
|
|
Caps: []string{"CAP_SYS_ADMIN"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"reboot",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_SYS_BOOT"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"chroot",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_SYS_CHROOT"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"delete_module",
|
|
"init_module",
|
|
"finit_module",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_SYS_MODULE"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"acct",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_SYS_PACCT"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"kcmp",
|
|
"pidfd_getfd",
|
|
"process_madvise",
|
|
"process_vm_readv",
|
|
"process_vm_writev",
|
|
"ptrace",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_SYS_PTRACE"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"iopl",
|
|
"ioperm",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_SYS_RAWIO"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"settimeofday",
|
|
"stime",
|
|
"clock_settime",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_SYS_TIME"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"vhangup",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_SYS_TTY_CONFIG"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"get_mempolicy",
|
|
"mbind",
|
|
"set_mempolicy",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_SYS_NICE"},
|
|
},
|
|
},
|
|
{
|
|
Names: []string{
|
|
"syslog",
|
|
},
|
|
Action: specs.ActAllow,
|
|
Args: []*specs.LinuxSeccompArg{},
|
|
Includes: Filter{
|
|
Caps: []string{"CAP_SYSLOG"},
|
|
},
|
|
},
|
|
}
|
|
|
|
return &Seccomp{
|
|
DefaultAction: specs.ActErrno,
|
|
ArchMap: arches(),
|
|
Syscalls: syscalls,
|
|
}
|
|
}
|