// Source: moby/profiles/seccomp/default_linux.go (826 lines, 15 KiB, Go)

package seccomp // import "github.com/docker/docker/profiles/seccomp"
import (
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
)
// arches returns the architecture table used by the default profile: each
// entry names one primary architecture and the sub-architectures listed
// alongside it.
func arches() []Architecture {
	var (
		x86Family = Architecture{
			Arch:      specs.ArchX86_64,
			SubArches: []specs.Arch{specs.ArchX86, specs.ArchX32},
		}
		armFamily = Architecture{
			Arch:      specs.ArchAARCH64,
			SubArches: []specs.Arch{specs.ArchARM},
		}
		mips64 = Architecture{
			Arch:      specs.ArchMIPS64,
			SubArches: []specs.Arch{specs.ArchMIPS, specs.ArchMIPS64N32},
		}
		mips64n32 = Architecture{
			Arch:      specs.ArchMIPS64N32,
			SubArches: []specs.Arch{specs.ArchMIPS, specs.ArchMIPS64},
		}
		mipsel64 = Architecture{
			Arch:      specs.ArchMIPSEL64,
			SubArches: []specs.Arch{specs.ArchMIPSEL, specs.ArchMIPSEL64N32},
		}
		mipsel64n32 = Architecture{
			Arch:      specs.ArchMIPSEL64N32,
			SubArches: []specs.Arch{specs.ArchMIPSEL, specs.ArchMIPSEL64},
		}
		s390Family = Architecture{
			Arch:      specs.ArchS390X,
			SubArches: []specs.Arch{specs.ArchS390},
		}
		// riscv64 has no sub-architectures; SubArches stays nil.
		riscv64 = Architecture{
			Arch:      specs.ArchRISCV64,
			SubArches: nil,
		}
	)

	// The order of the entries is part of the returned value; keep it stable.
	return []Architecture{
		x86Family,
		armFamily,
		mips64,
		mips64n32,
		mipsel64,
		mipsel64n32,
		s390Family,
		riscv64,
	}
}
// DefaultProfile defines the allowed syscalls for the default seccomp profile.
func DefaultProfile() *Seccomp {
seccomp: add support for "clone3" syscall in default policy If no seccomp policy is requested, then the built-in default policy in dockerd applies. This has no rule for "clone3" defined, nor any default errno defined. So when runc receives the config it attempts to determine a default errno, using logic defined in its commit: https://github.com/opencontainers/runc/commit/7a8d7162f9d72f20d83eaa36aeb5426deecd58f2 As explained in the above commit message, runc uses a heuristic to decide which errno to return by default: [quote] The solution applied here is to prepend a "stub" filter which returns -ENOSYS if the requested syscall has a larger syscall number than any syscall mentioned in the filter. The reason for this specific rule is that syscall numbers are (roughly) allocated sequentially and thus newer syscalls will (usually) have a larger syscall number -- thus causing our filters to produce -ENOSYS if the filter was written before the syscall existed. [/quote] Unfortunately clone3 appears to one of the edge cases that does not result in use of ENOSYS, instead ending up with the historical EPERM errno. Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use clone3 by default. If it sees ENOSYS then it will automatically fallback to using clone. Any other errno is treated as a fatal error. Thus when docker seccomp policy triggers EPERM from clone3, no fallback occurs and programs are thus unable to spawn threads. The clone3 syscall is much more complicated than clone, most notably its flags are not exposed as a directly argument any more. Instead they are hidden inside a struct. This means that seccomp filters are unable to apply policy based on values seen in flags. Thus we can't directly replicate the current "clone" filtering for "clone3". We can at least ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone" at which point we can filter on flags. Fixes: https://github.com/moby/moby/issues/42680 Signed-off-by: Daniel P. 
Berrangé <berrange@redhat.com>
2021-07-26 18:10:01 +00:00
nosys := uint(unix.ENOSYS)
syscalls := []*Syscall{
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"accept",
"accept4",
"access",
"adjtimex",
"alarm",
"bind",
"brk",
"capget",
"capset",
"chdir",
"chmod",
"chown",
"chown32",
"clock_adjtime",
"clock_adjtime64",
"clock_getres",
"clock_getres_time64",
"clock_gettime",
"clock_gettime64",
"clock_nanosleep",
"clock_nanosleep_time64",
"close",
"close_range",
"connect",
"copy_file_range",
"creat",
"dup",
"dup2",
"dup3",
"epoll_create",
"epoll_create1",
"epoll_ctl",
"epoll_ctl_old",
"epoll_pwait",
"epoll_pwait2",
"epoll_wait",
"epoll_wait_old",
"eventfd",
"eventfd2",
"execve",
"execveat",
"exit",
"exit_group",
"faccessat",
"faccessat2",
"fadvise64",
"fadvise64_64",
"fallocate",
"fanotify_mark",
"fchdir",
"fchmod",
"fchmodat",
"fchown",
"fchown32",
"fchownat",
"fcntl",
"fcntl64",
"fdatasync",
"fgetxattr",
"flistxattr",
"flock",
"fork",
"fremovexattr",
"fsetxattr",
"fstat",
"fstat64",
"fstatat64",
"fstatfs",
"fstatfs64",
"fsync",
"ftruncate",
"ftruncate64",
"futex",
"futex_time64",
"futex_waitv",
"futimesat",
"getcpu",
"getcwd",
"getdents",
"getdents64",
"getegid",
"getegid32",
"geteuid",
"geteuid32",
"getgid",
"getgid32",
"getgroups",
"getgroups32",
"getitimer",
"getpeername",
"getpgid",
"getpgrp",
"getpid",
"getppid",
"getpriority",
"getrandom",
"getresgid",
"getresgid32",
"getresuid",
"getresuid32",
"getrlimit",
"get_robust_list",
"getrusage",
"getsid",
"getsockname",
"getsockopt",
"get_thread_area",
"gettid",
"gettimeofday",
"getuid",
"getuid32",
"getxattr",
"inotify_add_watch",
"inotify_init",
"inotify_init1",
"inotify_rm_watch",
"io_cancel",
"ioctl",
"io_destroy",
"io_getevents",
"io_pgetevents",
"io_pgetevents_time64",
"ioprio_get",
"ioprio_set",
"io_setup",
"io_submit",
"io_uring_enter",
"io_uring_register",
"io_uring_setup",
"ipc",
"kill",
"landlock_add_rule",
"landlock_create_ruleset",
"landlock_restrict_self",
"lchown",
"lchown32",
"lgetxattr",
"link",
"linkat",
"listen",
"listxattr",
"llistxattr",
"_llseek",
"lremovexattr",
"lseek",
"lsetxattr",
"lstat",
"lstat64",
"madvise",
"membarrier",
"memfd_create",
"memfd_secret",
"mincore",
"mkdir",
"mkdirat",
"mknod",
"mknodat",
"mlock",
"mlock2",
"mlockall",
"mmap",
"mmap2",
"mprotect",
"mq_getsetattr",
"mq_notify",
"mq_open",
"mq_timedreceive",
"mq_timedreceive_time64",
"mq_timedsend",
"mq_timedsend_time64",
"mq_unlink",
"mremap",
"msgctl",
"msgget",
"msgrcv",
"msgsnd",
"msync",
"munlock",
"munlockall",
"munmap",
"name_to_handle_at",
"nanosleep",
"newfstatat",
"_newselect",
"open",
"openat",
"openat2",
"pause",
"pidfd_open",
"pidfd_send_signal",
"pipe",
"pipe2",
"pkey_alloc",
"pkey_free",
"pkey_mprotect",
"poll",
"ppoll",
"ppoll_time64",
"prctl",
"pread64",
"preadv",
"preadv2",
"prlimit64",
"process_mrelease",
"pselect6",
"pselect6_time64",
"pwrite64",
"pwritev",
"pwritev2",
"read",
"readahead",
"readlink",
"readlinkat",
"readv",
"recv",
"recvfrom",
"recvmmsg",
"recvmmsg_time64",
"recvmsg",
"remap_file_pages",
"removexattr",
"rename",
"renameat",
"renameat2",
"restart_syscall",
"rmdir",
"rseq",
"rt_sigaction",
"rt_sigpending",
"rt_sigprocmask",
"rt_sigqueueinfo",
"rt_sigreturn",
"rt_sigsuspend",
"rt_sigtimedwait",
"rt_sigtimedwait_time64",
"rt_tgsigqueueinfo",
"sched_getaffinity",
"sched_getattr",
"sched_getparam",
"sched_get_priority_max",
"sched_get_priority_min",
"sched_getscheduler",
"sched_rr_get_interval",
"sched_rr_get_interval_time64",
"sched_setaffinity",
"sched_setattr",
"sched_setparam",
"sched_setscheduler",
"sched_yield",
"seccomp",
"select",
"semctl",
"semget",
"semop",
"semtimedop",
"semtimedop_time64",
"send",
"sendfile",
"sendfile64",
"sendmmsg",
"sendmsg",
"sendto",
"setfsgid",
"setfsgid32",
"setfsuid",
"setfsuid32",
"setgid",
"setgid32",
"setgroups",
"setgroups32",
"setitimer",
"setpgid",
"setpriority",
"setregid",
"setregid32",
"setresgid",
"setresgid32",
"setresuid",
"setresuid32",
"setreuid",
"setreuid32",
"setrlimit",
"set_robust_list",
"setsid",
"setsockopt",
"set_thread_area",
"set_tid_address",
"setuid",
"setuid32",
"setxattr",
"shmat",
"shmctl",
"shmdt",
"shmget",
"shutdown",
"sigaltstack",
"signalfd",
"signalfd4",
"sigprocmask",
"sigreturn",
"socketcall",
"socketpair",
"splice",
"stat",
"stat64",
"statfs",
"statfs64",
"statx",
"symlink",
"symlinkat",
"sync",
"sync_file_range",
"syncfs",
"sysinfo",
"tee",
"tgkill",
"time",
"timer_create",
"timer_delete",
"timer_getoverrun",
"timer_gettime",
"timer_gettime64",
"timer_settime",
"timer_settime64",
"timerfd_create",
"timerfd_gettime",
"timerfd_gettime64",
"timerfd_settime",
"timerfd_settime64",
"times",
"tkill",
"truncate",
"truncate64",
"ugetrlimit",
"umask",
"uname",
"unlink",
"unlinkat",
"utime",
"utimensat",
"utimensat_time64",
"utimes",
"vfork",
"vmsplice",
"wait4",
"waitid",
"waitpid",
"write",
"writev",
},
Action: specs.ActAllow,
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"process_vm_readv",
"process_vm_writev",
"ptrace",
},
Action: specs.ActAllow,
},
Includes: &Filter{
MinKernel: &KernelVersion{4, 8},
},
},
seccomp: block socket calls to AF_VSOCK in default profile This syncs the seccomp-profile with the latest changes in containerd's profile, applying the same changes as https://github.com/containerd/containerd/commit/17a93240359b406c78e5f08a20013f759ea230bf Some background from the associated ticket: > We want to use vsock for guest-host communication on KubeVirt > (https://github.com/kubevirt/kubevirt). In KubeVirt we run VMs in pods. > > However since anyone can just connect from any pod to any VM with the > default seccomp settings, we cannot limit connection attempts to our > privileged node-agent. > > ### Describe the solution you'd like > We want to deny the `socket` syscall for the `AF_VSOCK` family by default. > > I see in [1] and [2] that AF_VSOCK was actually already blocked for some > time, but that got reverted since some architectures support the `socketcall` > syscall which can't be restricted properly. However we are mostly interested > in `arm64` and `amd64` where limiting `socket` would probably be enough. > > ### Additional context > I know that in theory we could use our own seccomp profiles, but we would want > to provide security for as many users as possible which use KubeVirt, and there > it would be very helpful if this protection could be added by being part of the > DefaultRuntime profile to easily ensure that it is active for all pods [3]. > > Impact on existing workloads: It is unlikely that this will disturb any existing > workload, becuase VSOCK is almost exclusively used for host-guest commmunication. > However if someone would still use it: Privileged pods would still be able to > use `socket` for `AF_VSOCK`, custom seccomp policies could be applied too. > Further it was already blocked for quite some time and the blockade got lifted > due to reasons not related to AF_VSOCK. 
> > The PR in KubeVirt which adds VSOCK support for additional context: [4] > > [1]: https://github.com/moby/moby/pull/29076#commitcomment-21831387 > [2]: https://github.com/moby/moby/commit/dcf2632945b87acedeea989a5aa36c084a20ae88 > [3]: https://kubernetes.io/docs/tutorials/security/seccomp/#enable-the-use-of-runtimedefault-as-the-default-seccomp-profile-for-all-workloads > [4]: https://github.com/kubevirt/kubevirt/pull/8546 Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-12-01 13:06:37 +00:00
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"socket"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: unix.AF_VSOCK,
Op: specs.OpNotEqual,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x0,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x0008,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x20000,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x20008,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0xffffffff,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"sync_file_range2",
"swapcontext",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"ppc64le"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"arm_fadvise64_64",
"arm_sync_file_range",
"sync_file_range2",
"breakpoint",
"cacheflush",
"set_tls",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"arm", "arm64"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"arch_prctl",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"amd64", "x32"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"modify_ldt",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"amd64", "x32", "x86"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"s390_pci_mmio_read",
"s390_pci_mmio_write",
"s390_runtime_instr",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"s390", "s390x"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"riscv_flush_icache",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"riscv64"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"open_by_handle_at",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_DAC_READ_SEARCH"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"bpf",
"clone",
seccomp: add support for "clone3" syscall in default policy If no seccomp policy is requested, then the built-in default policy in dockerd applies. This has no rule for "clone3" defined, nor any default errno defined. So when runc receives the config it attempts to determine a default errno, using logic defined in its commit: https://github.com/opencontainers/runc/commit/7a8d7162f9d72f20d83eaa36aeb5426deecd58f2 As explained in the above commit message, runc uses a heuristic to decide which errno to return by default: [quote] The solution applied here is to prepend a "stub" filter which returns -ENOSYS if the requested syscall has a larger syscall number than any syscall mentioned in the filter. The reason for this specific rule is that syscall numbers are (roughly) allocated sequentially and thus newer syscalls will (usually) have a larger syscall number -- thus causing our filters to produce -ENOSYS if the filter was written before the syscall existed. [/quote] Unfortunately clone3 appears to one of the edge cases that does not result in use of ENOSYS, instead ending up with the historical EPERM errno. Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use clone3 by default. If it sees ENOSYS then it will automatically fallback to using clone. Any other errno is treated as a fatal error. Thus when docker seccomp policy triggers EPERM from clone3, no fallback occurs and programs are thus unable to spawn threads. The clone3 syscall is much more complicated than clone, most notably its flags are not exposed as a directly argument any more. Instead they are hidden inside a struct. This means that seccomp filters are unable to apply policy based on values seen in flags. Thus we can't directly replicate the current "clone" filtering for "clone3". We can at least ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone" at which point we can filter on flags. Fixes: https://github.com/moby/moby/issues/42680 Signed-off-by: Daniel P. 
Berrangé <berrange@redhat.com>
2021-07-26 18:10:01 +00:00
"clone3",
"fanotify_init",
"fsconfig",
"fsmount",
"fsopen",
"fspick",
"lookup_dcookie",
"mount",
"mount_setattr",
"move_mount",
"open_tree",
"perf_event_open",
"quotactl",
"quotactl_fd",
"setdomainname",
"sethostname",
"setns",
"syslog",
"umount",
"umount2",
"unshare",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"clone",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
ValueTwo: 0,
Op: specs.OpMaskedEqual,
},
},
},
Excludes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
Arches: []string{"s390", "s390x"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"clone",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 1,
Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
ValueTwo: 0,
Op: specs.OpMaskedEqual,
},
},
},
Comment: "s390 parameter ordering for clone is different",
Includes: &Filter{
Arches: []string{"s390", "s390x"},
},
Excludes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
},
},
seccomp: add support for "clone3" syscall in default policy If no seccomp policy is requested, then the built-in default policy in dockerd applies. This has no rule for "clone3" defined, nor any default errno defined. So when runc receives the config it attempts to determine a default errno, using logic defined in its commit: https://github.com/opencontainers/runc/commit/7a8d7162f9d72f20d83eaa36aeb5426deecd58f2 As explained in the above commit message, runc uses a heuristic to decide which errno to return by default: [quote] The solution applied here is to prepend a "stub" filter which returns -ENOSYS if the requested syscall has a larger syscall number than any syscall mentioned in the filter. The reason for this specific rule is that syscall numbers are (roughly) allocated sequentially and thus newer syscalls will (usually) have a larger syscall number -- thus causing our filters to produce -ENOSYS if the filter was written before the syscall existed. [/quote] Unfortunately clone3 appears to one of the edge cases that does not result in use of ENOSYS, instead ending up with the historical EPERM errno. Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use clone3 by default. If it sees ENOSYS then it will automatically fallback to using clone. Any other errno is treated as a fatal error. Thus when docker seccomp policy triggers EPERM from clone3, no fallback occurs and programs are thus unable to spawn threads. The clone3 syscall is much more complicated than clone, most notably its flags are not exposed as a directly argument any more. Instead they are hidden inside a struct. This means that seccomp filters are unable to apply policy based on values seen in flags. Thus we can't directly replicate the current "clone" filtering for "clone3". We can at least ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone" at which point we can filter on flags. Fixes: https://github.com/moby/moby/issues/42680 Signed-off-by: Daniel P. 
Berrangé <berrange@redhat.com>
2021-07-26 18:10:01 +00:00
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"clone3",
},
Action: specs.ActErrno,
ErrnoRet: &nosys,
},
Excludes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"reboot",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_BOOT"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"chroot",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_CHROOT"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"delete_module",
"init_module",
"finit_module",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_MODULE"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"acct",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_PACCT"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"kcmp",
"pidfd_getfd",
"process_madvise",
"process_vm_readv",
"process_vm_writev",
"ptrace",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_PTRACE"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"iopl",
"ioperm",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_RAWIO"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"settimeofday",
"stime",
"clock_settime",
"clock_settime64",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_TIME"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"vhangup",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_TTY_CONFIG"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"get_mempolicy",
"mbind",
"set_mempolicy",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_NICE"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"syslog",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYSLOG"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"bpf",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_BPF"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"perf_event_open",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_PERFMON"},
},
},
}
errnoRet := uint(unix.EPERM)
return &Seccomp{
LinuxSeccomp: specs.LinuxSeccomp{
DefaultAction: specs.ActErrno,
DefaultErrnoRet: &errnoRet,
},
ArchMap: arches(),
Syscalls: syscalls,
}
}