diff --git a/api/swagger.yaml b/api/swagger.yaml index da30227383..8d2da6f873 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -4047,6 +4047,13 @@ definitions: enum: ["cgroupfs", "systemd", "none"] default: "cgroupfs" example: "cgroupfs" + CgroupVersion: + description: | + The version of the cgroup. + type: "string" + enum: ["1", "2"] + default: "1" + example: "1" NEventsListener: description: "Number of event listeners subscribed." type: "integer" diff --git a/api/types/types.go b/api/types/types.go index 79e6dd8436..a6ed75de3e 100644 --- a/api/types/types.go +++ b/api/types/types.go @@ -175,6 +175,7 @@ type Info struct { SystemTime string LoggingDriver string CgroupDriver string + CgroupVersion string `json:",omitempty"` NEventsListener int KernelVersion string OperatingSystem string diff --git a/cmd/dockerd/daemon.go b/cmd/dockerd/daemon.go index ca6e2a0d26..9ca4a77655 100644 --- a/cmd/dockerd/daemon.go +++ b/cmd/dockerd/daemon.go @@ -45,6 +45,7 @@ import ( "github.com/docker/docker/pkg/pidfile" "github.com/docker/docker/pkg/plugingetter" "github.com/docker/docker/pkg/signal" + "github.com/docker/docker/pkg/sysinfo" "github.com/docker/docker/pkg/system" "github.com/docker/docker/plugin" "github.com/docker/docker/rootless" @@ -456,7 +457,11 @@ func warnOnDeprecatedConfigOptions(config *config.Config) { } func initRouter(opts routerOptions) { - decoder := runconfig.ContainerDecoder{} + decoder := runconfig.ContainerDecoder{ + GetSysInfo: func() *sysinfo.SysInfo { + return opts.daemon.RawSysInfo(true) + }, + } routers := []router.Router{ // we need to add the checkpoint router before the container router or the DELETE gets masked diff --git a/daemon/daemon.go b/daemon/daemon.go index 4ece4bef5e..f3015040b5 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -42,6 +42,7 @@ import ( "github.com/docker/docker/errdefs" bkconfig "github.com/moby/buildkit/cmd/buildkitd/config" "github.com/moby/buildkit/util/resolver" + rsystem 
"github.com/opencontainers/runc/libcontainer/system" "github.com/sirupsen/logrus" // register graph drivers @@ -56,7 +57,6 @@ import ( "github.com/docker/docker/pkg/idtools" "github.com/docker/docker/pkg/locker" "github.com/docker/docker/pkg/plugingetter" - "github.com/docker/docker/pkg/sysinfo" "github.com/docker/docker/pkg/system" "github.com/docker/docker/pkg/truncindex" "github.com/docker/docker/plugin" @@ -1026,10 +1026,10 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S return nil, err } - sysInfo := sysinfo.New(false) + sysInfo := d.RawSysInfo(false) // Check if Devices cgroup is mounted, it is hard requirement for container security, // on Linux. - if runtime.GOOS == "linux" && !sysInfo.CgroupDevicesEnabled { + if runtime.GOOS == "linux" && !sysInfo.CgroupDevicesEnabled && !rsystem.RunningInUserNS() { return nil, errors.New("Devices cgroup isn't mounted") } diff --git a/daemon/daemon_unix.go b/daemon/daemon_unix.go index e3e4cb52b1..71e167361c 100644 --- a/daemon/daemon_unix.go +++ b/daemon/daemon_unix.go @@ -644,7 +644,7 @@ func verifyPlatformContainerSettings(daemon *Daemon, hostConfig *containertypes. if hostConfig == nil { return nil, nil } - sysInfo := sysinfo.New(true) + sysInfo := daemon.RawSysInfo(true) w, err := verifyPlatformContainerResources(&hostConfig.Resources, sysInfo, update) @@ -1745,7 +1745,7 @@ func (daemon *Daemon) initCgroupsPath(path string) error { } path = filepath.Join(mnt, root, path) - sysInfo := sysinfo.New(true) + sysInfo := daemon.RawSysInfo(true) if err := maybeCreateCPURealTimeFile(sysInfo.CPURealtimePeriod, daemon.configStore.CPURealtimePeriod, "cpu.rt_period_us", path); err != nil { return err } @@ -1779,3 +1779,16 @@ func (daemon *Daemon) setupSeccompProfile() error { func (daemon *Daemon) useShimV2() bool { return cgroups.IsCgroup2UnifiedMode() } + +// RawSysInfo returns *sysinfo.SysInfo . 
+func (daemon *Daemon) RawSysInfo(quiet bool) *sysinfo.SysInfo { + var opts []sysinfo.Opt + if daemon.getCgroupDriver() == cgroupSystemdDriver { + rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID") + if rootlesskitParentEUID != "" { + groupPath := fmt.Sprintf("/user.slice/user-%s.slice", rootlesskitParentEUID) + opts = append(opts, sysinfo.WithCgroup2GroupPath(groupPath)) + } + } + return sysinfo.New(quiet, opts...) +} diff --git a/daemon/daemon_unsupported.go b/daemon/daemon_unsupported.go index 6d8ac6224b..4c2476edcf 100644 --- a/daemon/daemon_unsupported.go +++ b/daemon/daemon_unsupported.go @@ -1,9 +1,18 @@ // +build !linux,!freebsd,!windows package daemon // import "github.com/docker/docker/daemon" -import "github.com/docker/docker/daemon/config" + +import ( + "github.com/docker/docker/daemon/config" + "github.com/docker/docker/pkg/sysinfo" +) const platformSupported = false func setupResolvConf(config *config.Config) { } + +// RawSysInfo returns *sysinfo.SysInfo . +func (daemon *Daemon) RawSysInfo(quiet bool) *sysinfo.SysInfo { + return sysinfo.New(quiet) +} diff --git a/daemon/daemon_windows.go b/daemon/daemon_windows.go index b7b64e4923..5222b05e0f 100644 --- a/daemon/daemon_windows.go +++ b/daemon/daemon_windows.go @@ -657,3 +657,8 @@ func setupResolvConf(config *config.Config) { func (daemon *Daemon) useShimV2() bool { return true } + +// RawSysInfo returns *sysinfo.SysInfo . 
+func (daemon *Daemon) RawSysInfo(quiet bool) *sysinfo.SysInfo { + return sysinfo.New(quiet) +} diff --git a/daemon/info.go b/daemon/info.go index 75541d0cfb..72f30cdd00 100644 --- a/daemon/info.go +++ b/daemon/info.go @@ -28,7 +28,7 @@ import ( func (daemon *Daemon) SystemInfo() *types.Info { defer metrics.StartTimer(hostInfoFunctions.WithValues("system_info"))() - sysInfo := sysinfo.New(true) + sysInfo := daemon.RawSysInfo(true) cRunning, cPaused, cStopped := stateCtr.get() v := &types.Info{ @@ -47,7 +47,6 @@ func (daemon *Daemon) SystemInfo() *types.Info { NGoroutines: runtime.NumGoroutine(), SystemTime: time.Now().Format(time.RFC3339Nano), LoggingDriver: daemon.defaultLogConfig.Type, - CgroupDriver: daemon.getCgroupDriver(), NEventsListener: daemon.EventsService.SubscribersCount(), KernelVersion: kernelVersion(), OperatingSystem: operatingSystem(), diff --git a/daemon/info_unix.go b/daemon/info_unix.go index 06a67271b5..b9d54101b0 100644 --- a/daemon/info_unix.go +++ b/daemon/info_unix.go @@ -19,6 +19,12 @@ import ( // fillPlatformInfo fills the platform related info. 
func (daemon *Daemon) fillPlatformInfo(v *types.Info, sysInfo *sysinfo.SysInfo) { + v.CgroupDriver = daemon.getCgroupDriver() + v.CgroupVersion = "1" + if sysInfo.CgroupUnified { + v.CgroupVersion = "2" + } + v.MemoryLimit = sysInfo.MemoryLimit v.SwapLimit = sysInfo.SwapLimit v.KernelMemory = sysInfo.KernelMemory @@ -81,32 +87,43 @@ func (daemon *Daemon) fillPlatformInfo(v *types.Info, sysInfo *sysinfo.SysInfo) v.InitCommit.ID = "N/A" } - if !v.MemoryLimit { - v.Warnings = append(v.Warnings, "WARNING: No memory limit support") - } - if !v.SwapLimit { - v.Warnings = append(v.Warnings, "WARNING: No swap limit support") - } - if !v.KernelMemory { - v.Warnings = append(v.Warnings, "WARNING: No kernel memory limit support") - } - if !v.KernelMemoryTCP { - v.Warnings = append(v.Warnings, "WARNING: No kernel memory TCP limit support") - } - if !v.OomKillDisable { - v.Warnings = append(v.Warnings, "WARNING: No oom kill disable support") - } - if !v.CPUCfsQuota { - v.Warnings = append(v.Warnings, "WARNING: No cpu cfs quota support") - } - if !v.CPUCfsPeriod { - v.Warnings = append(v.Warnings, "WARNING: No cpu cfs period support") - } - if !v.CPUShares { - v.Warnings = append(v.Warnings, "WARNING: No cpu shares support") - } - if !v.CPUSet { - v.Warnings = append(v.Warnings, "WARNING: No cpuset support") + if v.CgroupDriver == cgroupNoneDriver { + if v.CgroupVersion == "2" { + v.Warnings = append(v.Warnings, "WARNING: Running in rootless-mode without cgroup. To enable cgroup in rootless-mode, you need to set exec-opt \"native.cgroupdriver=systemd\".") + } else { + v.Warnings = append(v.Warnings, "WARNING: Running in rootless-mode without cgroup. 
To enable cgroup in rootless-mode, you need to boot the system in cgroup v2 mode and set exec-opt \"native.cgroupdriver=systemd\".") + } + } else { + if !v.MemoryLimit { + v.Warnings = append(v.Warnings, "WARNING: No memory limit support") + } + if !v.SwapLimit { + v.Warnings = append(v.Warnings, "WARNING: No swap limit support") + } + if !v.KernelMemory { + v.Warnings = append(v.Warnings, "WARNING: No kernel memory limit support") + } + if !v.KernelMemoryTCP { + v.Warnings = append(v.Warnings, "WARNING: No kernel memory TCP limit support") + } + if !v.OomKillDisable { + v.Warnings = append(v.Warnings, "WARNING: No oom kill disable support") + } + if !v.CPUCfsQuota { + v.Warnings = append(v.Warnings, "WARNING: No cpu cfs quota support") + } + if !v.CPUCfsPeriod { + v.Warnings = append(v.Warnings, "WARNING: No cpu cfs period support") + } + if !v.CPUShares { + v.Warnings = append(v.Warnings, "WARNING: No cpu shares support") + } + if !v.CPUSet { + v.Warnings = append(v.Warnings, "WARNING: No cpuset support") + } + if v.CgroupVersion == "2" { + v.Warnings = append(v.Warnings, "WARNING: Support for cgroup v2 is experimental") + } + } if !v.IPv4Forwarding { v.Warnings = append(v.Warnings, "WARNING: IPv4 forwarding is disabled") diff --git a/docs/api/version-history.md b/docs/api/version-history.md index 8ca3c36007..a4f8cddff1 100644 --- a/docs/api/version-history.md +++ b/docs/api/version-history.md @@ -17,6 +17,7 @@ keywords: "API, Docker, rcli, REST, documentation" [Docker Engine API v1.41](https://docs.docker.com/engine/api/v1.41/) documentation +* `GET /info` now returns a `CgroupVersion` field, containing the cgroup version. * `POST /services/create` and `POST /services/{id}/update` now supports `BindOptions.NonRecursive`. * The `ClusterStore` and `ClusterAdvertise` fields in `GET /info` are deprecated and are now omitted if they contain an empty value. 
This change is not versioned, diff --git a/pkg/sysinfo/cgroup2_linux.go b/pkg/sysinfo/cgroup2_linux.go new file mode 100644 index 0000000000..584f4f9652 --- /dev/null +++ b/pkg/sysinfo/cgroup2_linux.go @@ -0,0 +1,151 @@ +package sysinfo // import "github.com/docker/docker/pkg/sysinfo" + +import ( + "io/ioutil" + "path" + "strings" + + cgroupsV2 "github.com/containerd/cgroups/v2" + rsystem "github.com/opencontainers/runc/libcontainer/system" + "github.com/sirupsen/logrus" +) + +type infoCollectorV2 func(info *SysInfo, controllers map[string]struct{}, dirPath string) (warnings []string) + +func newV2(quiet bool, opts *opts) *SysInfo { + var warnings []string + sysInfo := &SysInfo{ + CgroupUnified: true, + } + g := opts.cg2GroupPath + if g == "" { + g = "/" + } + m, err := cgroupsV2.LoadManager("/sys/fs/cgroup", g) + if err != nil { + logrus.Warn(err) + } else { + controllersM := make(map[string]struct{}) + controllers, err := m.Controllers() + if err != nil { + logrus.Warn(err) + } + for _, c := range controllers { + controllersM[c] = struct{}{} + } + opsV2 := []infoCollectorV2{ + applyMemoryCgroupInfoV2, + applyCPUCgroupInfoV2, + applyIOCgroupInfoV2, + applyCPUSetCgroupInfoV2, + applyPIDSCgroupInfoV2, + applyDevicesCgroupInfoV2, + } + dirPath := path.Join("/sys/fs/cgroup", path.Clean(g)) + for _, o := range opsV2 { + w := o(sysInfo, controllersM, dirPath) + warnings = append(warnings, w...) + } + } + + ops := []infoCollector{ + applyNetworkingInfo, + applyAppArmorInfo, + applySeccompInfo, + applyCgroupNsInfo, + } + for _, o := range ops { + w := o(sysInfo, nil) + warnings = append(warnings, w...) 
+ } + if !quiet { + for _, w := range warnings { + logrus.Warn(w) + } + } + return sysInfo +} + +func applyMemoryCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, _ string) []string { + var warnings []string + if _, ok := controllers["memory"]; !ok { + warnings = append(warnings, "Unable to find memory controller") + return warnings + } + + info.MemoryLimit = true + info.SwapLimit = true + info.MemoryReservation = true + info.OomKillDisable = false + info.MemorySwappiness = false + info.KernelMemory = false + info.KernelMemoryTCP = false + return warnings +} + +func applyCPUCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, _ string) []string { + var warnings []string + if _, ok := controllers["cpu"]; !ok { + warnings = append(warnings, "Unable to find cpu controller") + return warnings + } + info.CPUShares = true + info.CPUCfsPeriod = true + info.CPUCfsQuota = true + info.CPURealtimePeriod = false + info.CPURealtimeRuntime = false + return warnings +} + +func applyIOCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, _ string) []string { + var warnings []string + if _, ok := controllers["io"]; !ok { + warnings = append(warnings, "Unable to find io controller") + return warnings + } + + info.BlkioWeight = true + info.BlkioWeightDevice = true + info.BlkioReadBpsDevice = true + info.BlkioWriteBpsDevice = true + info.BlkioReadIOpsDevice = true + info.BlkioWriteIOpsDevice = true + return warnings +} + +func applyCPUSetCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, dirPath string) []string { + var warnings []string + if _, ok := controllers["cpuset"]; !ok { + warnings = append(warnings, "Unable to find cpuset controller") + return warnings + } + info.Cpuset = true + + cpus, err := ioutil.ReadFile(path.Join(dirPath, "cpuset.cpus.effective")) + if err != nil { + return warnings + } + info.Cpus = strings.TrimSpace(string(cpus)) + + mems, err := ioutil.ReadFile(path.Join(dirPath, "cpuset.mems.effective")) + if err != nil { + 
return warnings + } + info.Mems = strings.TrimSpace(string(mems)) + return warnings +} + +func applyPIDSCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, _ string) []string { + var warnings []string + if _, ok := controllers["pids"]; !ok { + warnings = append(warnings, "Unable to find pids controller") + return warnings + } + info.PidsLimit = true + return warnings +} + +func applyDevicesCgroupInfoV2(info *SysInfo, controllers map[string]struct{}, _ string) []string { + info.CgroupDevicesEnabled = !rsystem.RunningInUserNS() + return nil +} diff --git a/pkg/sysinfo/sysinfo.go b/pkg/sysinfo/sysinfo.go index c0d29fb9e2..f64801c4da 100644 --- a/pkg/sysinfo/sysinfo.go +++ b/pkg/sysinfo/sysinfo.go @@ -30,6 +30,9 @@ type SysInfo struct { // Whether the cgroup has the mountpoint of "devices" or not CgroupDevicesEnabled bool + + // Whether the cgroup is in unified mode (v2). + CgroupUnified bool } type cgroupMemInfo struct { diff --git a/pkg/sysinfo/sysinfo_linux.go b/pkg/sysinfo/sysinfo_linux.go index a488ef0e4f..9fe2d68d16 100644 --- a/pkg/sysinfo/sysinfo_linux.go +++ b/pkg/sysinfo/sysinfo_linux.go @@ -28,10 +28,37 @@ func findCgroupMountpoints() (map[string]string, error) { type infoCollector func(info *SysInfo, cgMounts map[string]string) (warnings []string) +type opts struct { + cg2GroupPath string +} + +// Opt for New(). +type Opt func(*opts) + +// WithCgroup2GroupPath specifies the cgroup v2 group path to inspect availability +// of the controllers. +// +// WithCgroup2GroupPath is expected to be used for rootless mode with systemd driver. +// +// e.g. g = "/user.slice/user-1000.slice/user@1000.service" +func WithCgroup2GroupPath(g string) Opt { + return func(o *opts) { + o.cg2GroupPath = path.Clean(g) + } +} + // New returns a new SysInfo, using the filesystem to detect which features // the kernel supports. If `quiet` is `false` warnings are printed in logs // whenever an error occurs or misconfigurations are present. 
-func New(quiet bool) *SysInfo { +func New(quiet bool, options ...Opt) *SysInfo { + var opts opts + for _, o := range options { + o(&opts) + } + if cgroups.IsCgroup2UnifiedMode() { + return newV2(quiet, &opts) + } + var ops []infoCollector var warnings []string sysInfo := &SysInfo{} @@ -60,9 +87,6 @@ func New(quiet bool) *SysInfo { w := o(sysInfo, cgMounts) warnings = append(warnings, w...) } - if cgroups.IsCgroup2UnifiedMode() { - warnings = append(warnings, "Your system is running cgroup v2 (unsupported)") - } if !quiet { for _, w := range warnings { logrus.Warn(w) @@ -73,15 +97,6 @@ func New(quiet bool) *SysInfo { // applyMemoryCgroupInfo reads the memory information from the memory cgroup mount point. func applyMemoryCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { - if cgroups.IsCgroup2UnifiedMode() { - // TODO: check cgroup2 info correctly - info.MemoryLimit = true - info.SwapLimit = true - info.MemoryReservation = true - info.OomKillDisable = true - info.MemorySwappiness = true - return nil - } var warnings []string mountPoint, ok := cgMounts["memory"] if !ok { @@ -120,15 +135,6 @@ func applyMemoryCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { // applyCPUCgroupInfo reads the cpu information from the cpu cgroup mount point. func applyCPUCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { - if cgroups.IsCgroup2UnifiedMode() { - // TODO: check cgroup2 info correctly - info.CPUShares = true - info.CPUCfsPeriod = true - info.CPUCfsQuota = true - info.CPURealtimePeriod = true - info.CPURealtimeRuntime = true - return nil - } var warnings []string mountPoint, ok := cgMounts["cpu"] if !ok { @@ -166,15 +172,6 @@ func applyCPUCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { // applyBlkioCgroupInfo reads the blkio information from the blkio cgroup mount point. 
func applyBlkioCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { - if cgroups.IsCgroup2UnifiedMode() { - // TODO: check cgroup2 info correctly - info.BlkioWeight = true - info.BlkioReadBpsDevice = true - info.BlkioWriteBpsDevice = true - info.BlkioReadIOpsDevice = true - info.BlkioWriteIOpsDevice = true - return nil - } var warnings []string mountPoint, ok := cgMounts["blkio"] if !ok { @@ -216,11 +213,6 @@ func applyBlkioCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { // applyCPUSetCgroupInfo reads the cpuset information from the cpuset cgroup mount point. func applyCPUSetCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { - if cgroups.IsCgroup2UnifiedMode() { - // TODO: check cgroup2 info correctly - info.Cpuset = true - return nil - } var warnings []string mountPoint, ok := cgMounts["cpuset"] if !ok { @@ -248,11 +240,6 @@ func applyCPUSetCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { // applyPIDSCgroupInfo reads the pids information from the pids cgroup mount point. func applyPIDSCgroupInfo(info *SysInfo, _ map[string]string) []string { - if cgroups.IsCgroup2UnifiedMode() { - // TODO: check cgroup2 info correctly - info.PidsLimit = true - return nil - } var warnings []string _, err := cgroups.FindCgroupMountpoint("", "pids") if err != nil { @@ -265,11 +252,6 @@ func applyPIDSCgroupInfo(info *SysInfo, _ map[string]string) []string { // applyDevicesCgroupInfo reads the pids information from the devices cgroup mount point. 
func applyDevicesCgroupInfo(info *SysInfo, cgMounts map[string]string) []string { - if cgroups.IsCgroup2UnifiedMode() { - // TODO: check cgroup2 info correctly - info.CgroupDevicesEnabled = true - return nil - } var warnings []string _, ok := cgMounts["devices"] info.CgroupDevicesEnabled = ok diff --git a/pkg/sysinfo/sysinfo_unix.go b/pkg/sysinfo/sysinfo_unix.go index 23cc695fb8..47a131bc87 100644 --- a/pkg/sysinfo/sysinfo_unix.go +++ b/pkg/sysinfo/sysinfo_unix.go @@ -2,8 +2,13 @@ package sysinfo // import "github.com/docker/docker/pkg/sysinfo" +type opts struct{} + +// Opt for New(). +type Opt func(*opts) + // New returns an empty SysInfo for non linux for now. -func New(quiet bool) *SysInfo { +func New(quiet bool, options ...Opt) *SysInfo { sysInfo := &SysInfo{} return sysInfo } diff --git a/pkg/sysinfo/sysinfo_windows.go b/pkg/sysinfo/sysinfo_windows.go index 5f68524e7e..372e84af54 100644 --- a/pkg/sysinfo/sysinfo_windows.go +++ b/pkg/sysinfo/sysinfo_windows.go @@ -1,7 +1,12 @@ package sysinfo // import "github.com/docker/docker/pkg/sysinfo" +type opts struct{} + +// Opt for New(). +type Opt func(*opts) + // New returns an empty SysInfo for windows for now. -func New(quiet bool) *SysInfo { +func New(quiet bool, options ...Opt) *SysInfo { sysInfo := &SysInfo{} return sysInfo } diff --git a/runconfig/config.go b/runconfig/config.go index cbacf47df3..3d435f54ae 100644 --- a/runconfig/config.go +++ b/runconfig/config.go @@ -11,11 +11,20 @@ import ( // ContainerDecoder implements httputils.ContainerDecoder // calling DecodeContainerConfig. 
-type ContainerDecoder struct{} +type ContainerDecoder struct { + GetSysInfo func() *sysinfo.SysInfo +} // DecodeConfig makes ContainerDecoder to implement httputils.ContainerDecoder func (r ContainerDecoder) DecodeConfig(src io.Reader) (*container.Config, *container.HostConfig, *networktypes.NetworkingConfig, error) { - return decodeContainerConfig(src) + var si *sysinfo.SysInfo + if r.GetSysInfo != nil { + si = r.GetSysInfo() + } else { + si = sysinfo.New(true) + } + + return decodeContainerConfig(src, si) } // DecodeHostConfig makes ContainerDecoder to implement httputils.ContainerDecoder @@ -27,7 +36,7 @@ func (r ContainerDecoder) DecodeHostConfig(src io.Reader) (*container.HostConfig // struct and returns both a Config and a HostConfig struct // Be aware this function is not checking whether the resulted structs are nil, // it's your business to do so -func decodeContainerConfig(src io.Reader) (*container.Config, *container.HostConfig, *networktypes.NetworkingConfig, error) { +func decodeContainerConfig(src io.Reader, si *sysinfo.SysInfo) (*container.Config, *container.HostConfig, *networktypes.NetworkingConfig, error) { var w ContainerConfigWrapper decoder := json.NewDecoder(src) @@ -63,7 +72,7 @@ func decodeContainerConfig(src io.Reader) (*container.Config, *container.HostCon } // Validate Resources - if err := validateResources(hc, sysinfo.New(true)); err != nil { + if err := validateResources(hc, si); err != nil { return nil, nil, nil, err } diff --git a/runconfig/config_test.go b/runconfig/config_test.go index 22506126a5..8b316e66e7 100644 --- a/runconfig/config_test.go +++ b/runconfig/config_test.go @@ -12,6 +12,7 @@ import ( "github.com/docker/docker/api/types/container" networktypes "github.com/docker/docker/api/types/network" "github.com/docker/docker/api/types/strslice" + "github.com/docker/docker/pkg/sysinfo" ) type f struct { @@ -46,7 +47,7 @@ func TestDecodeContainerConfig(t *testing.T) { t.Fatal(err) } - c, h, _, err := 
decodeContainerConfig(bytes.NewReader(b)) + c, h, _, err := decodeContainerConfig(bytes.NewReader(b), sysinfo.New(true)) if err != nil { t.Fatal(fmt.Errorf("Error parsing %s: %v", f, err)) } @@ -130,5 +131,5 @@ func callDecodeContainerConfigIsolation(isolation string) (*container.Config, *c if b, err = json.Marshal(w); err != nil { return nil, nil, nil, fmt.Errorf("Error on marshal %s", err.Error()) } - return decodeContainerConfig(bytes.NewReader(b)) + return decodeContainerConfig(bytes.NewReader(b), sysinfo.New(true)) } diff --git a/vendor.conf b/vendor.conf index 2b217120b6..7366df7cd1 100644 --- a/vendor.conf +++ b/vendor.conf @@ -129,6 +129,7 @@ github.com/containerd/go-runc 7016d3ce2328dd2cb1192b2076eb github.com/containerd/typeurl b45ef1f1f737e10bd45b25b669df25f0da8b9ba0 github.com/containerd/ttrpc 0be804eadb152bc3b3c20c5edc314c4633833398 github.com/gogo/googleapis 01e0f9cca9b92166042241267ee2a5cdf5cff46c # v1.3.2 +github.com/cilium/ebpf 60c3aa43f488292fe2ee50fb8b833b383ca8ebbb # cluster github.com/docker/swarmkit ebe39a32e3ed4c3a3783a02c11cccf388818694c diff --git a/vendor/github.com/cilium/ebpf/LICENSE b/vendor/github.com/cilium/ebpf/LICENSE new file mode 100644 index 0000000000..c637ae99c2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/LICENSE @@ -0,0 +1,23 @@ +MIT License + +Copyright (c) 2017 Nathan Sweet +Copyright (c) 2018, 2019 Cloudflare +Copyright (c) 2019 Authors of Cilium + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/cilium/ebpf/abi.go b/vendor/github.com/cilium/ebpf/abi.go new file mode 100644 index 0000000000..3cde33f7c2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/abi.go @@ -0,0 +1,203 @@ +package ebpf + +import ( + "bufio" + "bytes" + "fmt" + "io" + "os" + "syscall" + + "github.com/cilium/ebpf/internal" + "github.com/pkg/errors" +) + +// MapABI are the attributes of a Map which are available across all supported kernels. +type MapABI struct { + Type MapType + KeySize uint32 + ValueSize uint32 + MaxEntries uint32 + Flags uint32 +} + +func newMapABIFromSpec(spec *MapSpec) *MapABI { + return &MapABI{ + spec.Type, + spec.KeySize, + spec.ValueSize, + spec.MaxEntries, + spec.Flags, + } +} + +func newMapABIFromFd(fd *bpfFD) (string, *MapABI, error) { + info, err := bpfGetMapInfoByFD(fd) + if err != nil { + if errors.Cause(err) == syscall.EINVAL { + abi, err := newMapABIFromProc(fd) + return "", abi, err + } + return "", nil, err + } + + return "", &MapABI{ + MapType(info.mapType), + info.keySize, + info.valueSize, + info.maxEntries, + info.flags, + }, nil +} + +func newMapABIFromProc(fd *bpfFD) (*MapABI, error) { + var abi MapABI + err := scanFdInfo(fd, map[string]interface{}{ + "map_type": &abi.Type, + "key_size": &abi.KeySize, + "value_size": &abi.ValueSize, + "max_entries": &abi.MaxEntries, + "map_flags": &abi.Flags, + }) + if err != nil { + return nil, err + } + return &abi, nil +} + +// Equal returns true if two ABIs have the same values. 
+func (abi *MapABI) Equal(other *MapABI) bool { + switch { + case abi.Type != other.Type: + return false + case abi.KeySize != other.KeySize: + return false + case abi.ValueSize != other.ValueSize: + return false + case abi.MaxEntries != other.MaxEntries: + return false + case abi.Flags != other.Flags: + return false + default: + return true + } +} + +// ProgramABI are the attributes of a Program which are available across all supported kernels. +type ProgramABI struct { + Type ProgramType +} + +func newProgramABIFromSpec(spec *ProgramSpec) *ProgramABI { + return &ProgramABI{ + spec.Type, + } +} + +func newProgramABIFromFd(fd *bpfFD) (string, *ProgramABI, error) { + info, err := bpfGetProgInfoByFD(fd) + if err != nil { + if errors.Cause(err) == syscall.EINVAL { + return newProgramABIFromProc(fd) + } + + return "", nil, err + } + + var name string + if bpfName := convertCString(info.name[:]); bpfName != "" { + name = bpfName + } else { + name = convertCString(info.tag[:]) + } + + return name, &ProgramABI{ + Type: ProgramType(info.progType), + }, nil +} + +func newProgramABIFromProc(fd *bpfFD) (string, *ProgramABI, error) { + var ( + abi ProgramABI + name string + ) + + err := scanFdInfo(fd, map[string]interface{}{ + "prog_type": &abi.Type, + "prog_tag": &name, + }) + if errors.Cause(err) == errMissingFields { + return "", nil, &internal.UnsupportedFeatureError{ + Name: "reading ABI from /proc/self/fdinfo", + MinimumVersion: internal.Version{4, 11, 0}, + } + } + if err != nil { + return "", nil, err + } + + return name, &abi, nil +} + +func scanFdInfo(fd *bpfFD, fields map[string]interface{}) error { + raw, err := fd.value() + if err != nil { + return err + } + + fh, err := os.Open(fmt.Sprintf("/proc/self/fdinfo/%d", raw)) + if err != nil { + return err + } + defer fh.Close() + + return errors.Wrap(scanFdInfoReader(fh, fields), fh.Name()) +} + +var errMissingFields = errors.New("missing fields") + +func scanFdInfoReader(r io.Reader, fields map[string]interface{}) 
error { + var ( + scanner = bufio.NewScanner(r) + scanned int + ) + + for scanner.Scan() { + parts := bytes.SplitN(scanner.Bytes(), []byte("\t"), 2) + if len(parts) != 2 { + continue + } + + name := bytes.TrimSuffix(parts[0], []byte(":")) + field, ok := fields[string(name)] + if !ok { + continue + } + + if n, err := fmt.Fscanln(bytes.NewReader(parts[1]), field); err != nil || n != 1 { + return errors.Wrapf(err, "can't parse field %s", name) + } + + scanned++ + } + + if err := scanner.Err(); err != nil { + return err + } + + if scanned != len(fields) { + return errMissingFields + } + + return nil +} + +// Equal returns true if two ABIs have the same values. +func (abi *ProgramABI) Equal(other *ProgramABI) bool { + switch { + case abi.Type != other.Type: + return false + default: + return true + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/alu.go b/vendor/github.com/cilium/ebpf/asm/alu.go new file mode 100644 index 0000000000..70ccc4d151 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/alu.go @@ -0,0 +1,149 @@ +package asm + +//go:generate stringer -output alu_string.go -type=Source,Endianness,ALUOp + +// Source of ALU / ALU64 / Branch operations +// +// msb lsb +// +----+-+---+ +// |op |S|cls| +// +----+-+---+ +type Source uint8 + +const sourceMask OpCode = 0x08 + +// Source bitmask +const ( + // InvalidSource is returned by getters when invoked + // on non ALU / branch OpCodes. + InvalidSource Source = 0xff + // ImmSource src is from constant + ImmSource Source = 0x00 + // RegSource src is from register + RegSource Source = 0x08 +) + +// The Endianness of a byte swap instruction. 
+type Endianness uint8 + +const endianMask = sourceMask + +// Endian flags +const ( + InvalidEndian Endianness = 0xff + // Convert to little endian + LE Endianness = 0x00 + // Convert to big endian + BE Endianness = 0x08 +) + +// ALUOp are ALU / ALU64 operations +// +// msb lsb +// +----+-+---+ +// |OP |s|cls| +// +----+-+---+ +type ALUOp uint8 + +const aluMask OpCode = 0xf0 + +const ( + // InvalidALUOp is returned by getters when invoked + // on non ALU OpCodes + InvalidALUOp ALUOp = 0xff + // Add - addition + Add ALUOp = 0x00 + // Sub - subtraction + Sub ALUOp = 0x10 + // Mul - multiplication + Mul ALUOp = 0x20 + // Div - division + Div ALUOp = 0x30 + // Or - bitwise or + Or ALUOp = 0x40 + // And - bitwise and + And ALUOp = 0x50 + // LSh - bitwise shift left + LSh ALUOp = 0x60 + // RSh - bitwise shift right + RSh ALUOp = 0x70 + // Neg - sign/unsign signing bit + Neg ALUOp = 0x80 + // Mod - modulo + Mod ALUOp = 0x90 + // Xor - bitwise xor + Xor ALUOp = 0xa0 + // Mov - move value from one place to another + Mov ALUOp = 0xb0 + // ArSh - arithmatic shift + ArSh ALUOp = 0xc0 + // Swap - endian conversions + Swap ALUOp = 0xd0 +) + +// HostTo converts from host to another endianness. +func HostTo(endian Endianness, dst Register, size Size) Instruction { + var imm int64 + switch size { + case Half: + imm = 16 + case Word: + imm = 32 + case DWord: + imm = 64 + default: + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: OpCode(ALUClass).SetALUOp(Swap).SetSource(Source(endian)), + Dst: dst, + Constant: imm, + } +} + +// Op returns the OpCode for an ALU operation with a given source. +func (op ALUOp) Op(source Source) OpCode { + return OpCode(ALU64Class).SetALUOp(op).SetSource(source) +} + +// Reg emits `dst (op) src`. +func (op ALUOp) Reg(dst, src Register) Instruction { + return Instruction{ + OpCode: op.Op(RegSource), + Dst: dst, + Src: src, + } +} + +// Imm emits `dst (op) value`. 
+func (op ALUOp) Imm(dst Register, value int32) Instruction { + return Instruction{ + OpCode: op.Op(ImmSource), + Dst: dst, + Constant: int64(value), + } +} + +// Op32 returns the OpCode for a 32-bit ALU operation with a given source. +func (op ALUOp) Op32(source Source) OpCode { + return OpCode(ALUClass).SetALUOp(op).SetSource(source) +} + +// Reg32 emits `dst (op) src`, zeroing the upper 32 bit of dst. +func (op ALUOp) Reg32(dst, src Register) Instruction { + return Instruction{ + OpCode: op.Op32(RegSource), + Dst: dst, + Src: src, + } +} + +// Imm32 emits `dst (op) value`, zeroing the upper 32 bit of dst. +func (op ALUOp) Imm32(dst Register, value int32) Instruction { + return Instruction{ + OpCode: op.Op32(ImmSource), + Dst: dst, + Constant: int64(value), + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/alu_string.go b/vendor/github.com/cilium/ebpf/asm/alu_string.go new file mode 100644 index 0000000000..72d3fe6292 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/alu_string.go @@ -0,0 +1,107 @@ +// Code generated by "stringer -output alu_string.go -type=Source,Endianness,ALUOp"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidSource-255] + _ = x[ImmSource-0] + _ = x[RegSource-8] +} + +const ( + _Source_name_0 = "ImmSource" + _Source_name_1 = "RegSource" + _Source_name_2 = "InvalidSource" +) + +func (i Source) String() string { + switch { + case i == 0: + return _Source_name_0 + case i == 8: + return _Source_name_1 + case i == 255: + return _Source_name_2 + default: + return "Source(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[InvalidEndian-255] + _ = x[LE-0] + _ = x[BE-8] +} + +const ( + _Endianness_name_0 = "LE" + _Endianness_name_1 = "BE" + _Endianness_name_2 = "InvalidEndian" +) + +func (i Endianness) String() string { + switch { + case i == 0: + return _Endianness_name_0 + case i == 8: + return _Endianness_name_1 + case i == 255: + return _Endianness_name_2 + default: + return "Endianness(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidALUOp-255] + _ = x[Add-0] + _ = x[Sub-16] + _ = x[Mul-32] + _ = x[Div-48] + _ = x[Or-64] + _ = x[And-80] + _ = x[LSh-96] + _ = x[RSh-112] + _ = x[Neg-128] + _ = x[Mod-144] + _ = x[Xor-160] + _ = x[Mov-176] + _ = x[ArSh-192] + _ = x[Swap-208] +} + +const _ALUOp_name = "AddSubMulDivOrAndLShRShNegModXorMovArShSwapInvalidALUOp" + +var _ALUOp_map = map[ALUOp]string{ + 0: _ALUOp_name[0:3], + 16: _ALUOp_name[3:6], + 32: _ALUOp_name[6:9], + 48: _ALUOp_name[9:12], + 64: _ALUOp_name[12:14], + 80: _ALUOp_name[14:17], + 96: _ALUOp_name[17:20], + 112: _ALUOp_name[20:23], + 128: _ALUOp_name[23:26], + 144: _ALUOp_name[26:29], + 160: _ALUOp_name[29:32], + 176: _ALUOp_name[32:35], + 192: _ALUOp_name[35:39], + 208: _ALUOp_name[39:43], + 255: _ALUOp_name[43:55], +} + +func (i ALUOp) String() string { + if str, ok := _ALUOp_map[i]; ok { + return str + } + return "ALUOp(" + strconv.FormatInt(int64(i), 10) + ")" +} diff --git a/vendor/github.com/cilium/ebpf/asm/doc.go b/vendor/github.com/cilium/ebpf/asm/doc.go new file mode 100644 index 0000000000..7031bdc276 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/doc.go @@ -0,0 +1,2 @@ +// Package asm is an assembler for eBPF bytecode. 
+package asm diff --git a/vendor/github.com/cilium/ebpf/asm/func.go b/vendor/github.com/cilium/ebpf/asm/func.go new file mode 100644 index 0000000000..97f794cdb2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/func.go @@ -0,0 +1,143 @@ +package asm + +//go:generate stringer -output func_string.go -type=BuiltinFunc + +// BuiltinFunc is a built-in eBPF function. +type BuiltinFunc int32 + +// eBPF built-in functions +// +// You can renegerate this list using the following gawk script: +// +// /FN\(.+\),/ { +// match($1, /\((.+)\)/, r) +// split(r[1], p, "_") +// printf "Fn" +// for (i in p) { +// printf "%s%s", toupper(substr(p[i], 1, 1)), substr(p[i], 2) +// } +// print "" +// } +// +// The script expects include/uapi/linux/bpf.h as it's input. +const ( + FnUnspec BuiltinFunc = iota + FnMapLookupElem + FnMapUpdateElem + FnMapDeleteElem + FnProbeRead + FnKtimeGetNs + FnTracePrintk + FnGetPrandomU32 + FnGetSmpProcessorId + FnSkbStoreBytes + FnL3CsumReplace + FnL4CsumReplace + FnTailCall + FnCloneRedirect + FnGetCurrentPidTgid + FnGetCurrentUidGid + FnGetCurrentComm + FnGetCgroupClassid + FnSkbVlanPush + FnSkbVlanPop + FnSkbGetTunnelKey + FnSkbSetTunnelKey + FnPerfEventRead + FnRedirect + FnGetRouteRealm + FnPerfEventOutput + FnSkbLoadBytes + FnGetStackid + FnCsumDiff + FnSkbGetTunnelOpt + FnSkbSetTunnelOpt + FnSkbChangeProto + FnSkbChangeType + FnSkbUnderCgroup + FnGetHashRecalc + FnGetCurrentTask + FnProbeWriteUser + FnCurrentTaskUnderCgroup + FnSkbChangeTail + FnSkbPullData + FnCsumUpdate + FnSetHashInvalid + FnGetNumaNodeId + FnSkbChangeHead + FnXdpAdjustHead + FnProbeReadStr + FnGetSocketCookie + FnGetSocketUid + FnSetHash + FnSetsockopt + FnSkbAdjustRoom + FnRedirectMap + FnSkRedirectMap + FnSockMapUpdate + FnXdpAdjustMeta + FnPerfEventReadValue + FnPerfProgReadValue + FnGetsockopt + FnOverrideReturn + FnSockOpsCbFlagsSet + FnMsgRedirectMap + FnMsgApplyBytes + FnMsgCorkBytes + FnMsgPullData + FnBind + FnXdpAdjustTail + FnSkbGetXfrmState + FnGetStack + 
FnSkbLoadBytesRelative + FnFibLookup + FnSockHashUpdate + FnMsgRedirectHash + FnSkRedirectHash + FnLwtPushEncap + FnLwtSeg6StoreBytes + FnLwtSeg6AdjustSrh + FnLwtSeg6Action + FnRcRepeat + FnRcKeydown + FnSkbCgroupId + FnGetCurrentCgroupId + FnGetLocalStorage + FnSkSelectReuseport + FnSkbAncestorCgroupId + FnSkLookupTcp + FnSkLookupUdp + FnSkRelease + FnMapPushElem + FnMapPopElem + FnMapPeekElem + FnMsgPushData + FnMsgPopData + FnRcPointerRel + FnSpinLock + FnSpinUnlock + FnSkFullsock + FnTcpSock + FnSkbEcnSetCe + FnGetListenerSock + FnSkcLookupTcp + FnTcpCheckSyncookie + FnSysctlGetName + FnSysctlGetCurrentValue + FnSysctlGetNewValue + FnSysctlSetNewValue + FnStrtol + FnStrtoul + FnSkStorageGet + FnSkStorageDelete + FnSendSignal + FnTcpGenSyncookie +) + +// Call emits a function call. +func (fn BuiltinFunc) Call() Instruction { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Call), + Constant: int64(fn), + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/func_string.go b/vendor/github.com/cilium/ebpf/asm/func_string.go new file mode 100644 index 0000000000..8860b9fdb4 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/func_string.go @@ -0,0 +1,133 @@ +// Code generated by "stringer -output func_string.go -type=BuiltinFunc"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[FnUnspec-0] + _ = x[FnMapLookupElem-1] + _ = x[FnMapUpdateElem-2] + _ = x[FnMapDeleteElem-3] + _ = x[FnProbeRead-4] + _ = x[FnKtimeGetNs-5] + _ = x[FnTracePrintk-6] + _ = x[FnGetPrandomU32-7] + _ = x[FnGetSmpProcessorId-8] + _ = x[FnSkbStoreBytes-9] + _ = x[FnL3CsumReplace-10] + _ = x[FnL4CsumReplace-11] + _ = x[FnTailCall-12] + _ = x[FnCloneRedirect-13] + _ = x[FnGetCurrentPidTgid-14] + _ = x[FnGetCurrentUidGid-15] + _ = x[FnGetCurrentComm-16] + _ = x[FnGetCgroupClassid-17] + _ = x[FnSkbVlanPush-18] + _ = x[FnSkbVlanPop-19] + _ = x[FnSkbGetTunnelKey-20] + _ = x[FnSkbSetTunnelKey-21] + _ = x[FnPerfEventRead-22] + _ = x[FnRedirect-23] + _ = x[FnGetRouteRealm-24] + _ = x[FnPerfEventOutput-25] + _ = x[FnSkbLoadBytes-26] + _ = x[FnGetStackid-27] + _ = x[FnCsumDiff-28] + _ = x[FnSkbGetTunnelOpt-29] + _ = x[FnSkbSetTunnelOpt-30] + _ = x[FnSkbChangeProto-31] + _ = x[FnSkbChangeType-32] + _ = x[FnSkbUnderCgroup-33] + _ = x[FnGetHashRecalc-34] + _ = x[FnGetCurrentTask-35] + _ = x[FnProbeWriteUser-36] + _ = x[FnCurrentTaskUnderCgroup-37] + _ = x[FnSkbChangeTail-38] + _ = x[FnSkbPullData-39] + _ = x[FnCsumUpdate-40] + _ = x[FnSetHashInvalid-41] + _ = x[FnGetNumaNodeId-42] + _ = x[FnSkbChangeHead-43] + _ = x[FnXdpAdjustHead-44] + _ = x[FnProbeReadStr-45] + _ = x[FnGetSocketCookie-46] + _ = x[FnGetSocketUid-47] + _ = x[FnSetHash-48] + _ = x[FnSetsockopt-49] + _ = x[FnSkbAdjustRoom-50] + _ = x[FnRedirectMap-51] + _ = x[FnSkRedirectMap-52] + _ = x[FnSockMapUpdate-53] + _ = x[FnXdpAdjustMeta-54] + _ = x[FnPerfEventReadValue-55] + _ = x[FnPerfProgReadValue-56] + _ = x[FnGetsockopt-57] + _ = x[FnOverrideReturn-58] + _ = x[FnSockOpsCbFlagsSet-59] + _ = x[FnMsgRedirectMap-60] + _ = x[FnMsgApplyBytes-61] + _ = x[FnMsgCorkBytes-62] + _ = x[FnMsgPullData-63] + _ = x[FnBind-64] + _ = x[FnXdpAdjustTail-65] + _ = x[FnSkbGetXfrmState-66] + _ = x[FnGetStack-67] + _ = x[FnSkbLoadBytesRelative-68] + _ = x[FnFibLookup-69] + _ = x[FnSockHashUpdate-70] + _ = 
x[FnMsgRedirectHash-71] + _ = x[FnSkRedirectHash-72] + _ = x[FnLwtPushEncap-73] + _ = x[FnLwtSeg6StoreBytes-74] + _ = x[FnLwtSeg6AdjustSrh-75] + _ = x[FnLwtSeg6Action-76] + _ = x[FnRcRepeat-77] + _ = x[FnRcKeydown-78] + _ = x[FnSkbCgroupId-79] + _ = x[FnGetCurrentCgroupId-80] + _ = x[FnGetLocalStorage-81] + _ = x[FnSkSelectReuseport-82] + _ = x[FnSkbAncestorCgroupId-83] + _ = x[FnSkLookupTcp-84] + _ = x[FnSkLookupUdp-85] + _ = x[FnSkRelease-86] + _ = x[FnMapPushElem-87] + _ = x[FnMapPopElem-88] + _ = x[FnMapPeekElem-89] + _ = x[FnMsgPushData-90] + _ = x[FnMsgPopData-91] + _ = x[FnRcPointerRel-92] + _ = x[FnSpinLock-93] + _ = x[FnSpinUnlock-94] + _ = x[FnSkFullsock-95] + _ = x[FnTcpSock-96] + _ = x[FnSkbEcnSetCe-97] + _ = x[FnGetListenerSock-98] + _ = x[FnSkcLookupTcp-99] + _ = x[FnTcpCheckSyncookie-100] + _ = x[FnSysctlGetName-101] + _ = x[FnSysctlGetCurrentValue-102] + _ = x[FnSysctlGetNewValue-103] + _ = x[FnSysctlSetNewValue-104] + _ = x[FnStrtol-105] + _ = x[FnStrtoul-106] + _ = x[FnSkStorageGet-107] + _ = x[FnSkStorageDelete-108] + _ = x[FnSendSignal-109] + _ = x[FnTcpGenSyncookie-110] +} + +const _BuiltinFunc_name = 
"FnUnspecFnMapLookupElemFnMapUpdateElemFnMapDeleteElemFnProbeReadFnKtimeGetNsFnTracePrintkFnGetPrandomU32FnGetSmpProcessorIdFnSkbStoreBytesFnL3CsumReplaceFnL4CsumReplaceFnTailCallFnCloneRedirectFnGetCurrentPidTgidFnGetCurrentUidGidFnGetCurrentCommFnGetCgroupClassidFnSkbVlanPushFnSkbVlanPopFnSkbGetTunnelKeyFnSkbSetTunnelKeyFnPerfEventReadFnRedirectFnGetRouteRealmFnPerfEventOutputFnSkbLoadBytesFnGetStackidFnCsumDiffFnSkbGetTunnelOptFnSkbSetTunnelOptFnSkbChangeProtoFnSkbChangeTypeFnSkbUnderCgroupFnGetHashRecalcFnGetCurrentTaskFnProbeWriteUserFnCurrentTaskUnderCgroupFnSkbChangeTailFnSkbPullDataFnCsumUpdateFnSetHashInvalidFnGetNumaNodeIdFnSkbChangeHeadFnXdpAdjustHeadFnProbeReadStrFnGetSocketCookieFnGetSocketUidFnSetHashFnSetsockoptFnSkbAdjustRoomFnRedirectMapFnSkRedirectMapFnSockMapUpdateFnXdpAdjustMetaFnPerfEventReadValueFnPerfProgReadValueFnGetsockoptFnOverrideReturnFnSockOpsCbFlagsSetFnMsgRedirectMapFnMsgApplyBytesFnMsgCorkBytesFnMsgPullDataFnBindFnXdpAdjustTailFnSkbGetXfrmStateFnGetStackFnSkbLoadBytesRelativeFnFibLookupFnSockHashUpdateFnMsgRedirectHashFnSkRedirectHashFnLwtPushEncapFnLwtSeg6StoreBytesFnLwtSeg6AdjustSrhFnLwtSeg6ActionFnRcRepeatFnRcKeydownFnSkbCgroupIdFnGetCurrentCgroupIdFnGetLocalStorageFnSkSelectReuseportFnSkbAncestorCgroupIdFnSkLookupTcpFnSkLookupUdpFnSkReleaseFnMapPushElemFnMapPopElemFnMapPeekElemFnMsgPushDataFnMsgPopDataFnRcPointerRelFnSpinLockFnSpinUnlockFnSkFullsockFnTcpSockFnSkbEcnSetCeFnGetListenerSockFnSkcLookupTcpFnTcpCheckSyncookieFnSysctlGetNameFnSysctlGetCurrentValueFnSysctlGetNewValueFnSysctlSetNewValueFnStrtolFnStrtoulFnSkStorageGetFnSkStorageDeleteFnSendSignalFnTcpGenSyncookie" + +var _BuiltinFunc_index = [...]uint16{0, 8, 23, 38, 53, 64, 76, 89, 104, 123, 138, 153, 168, 178, 193, 212, 230, 246, 264, 277, 289, 306, 323, 338, 348, 363, 380, 394, 406, 416, 433, 450, 466, 481, 497, 512, 528, 544, 568, 583, 596, 608, 624, 639, 654, 669, 683, 700, 714, 723, 735, 750, 763, 778, 793, 808, 828, 847, 859, 875, 894, 910, 925, 939, 952, 958, 973, 
990, 1000, 1022, 1033, 1049, 1066, 1082, 1096, 1115, 1133, 1148, 1158, 1169, 1182, 1202, 1219, 1238, 1259, 1272, 1285, 1296, 1309, 1321, 1334, 1347, 1359, 1373, 1383, 1395, 1407, 1416, 1429, 1446, 1460, 1479, 1494, 1517, 1536, 1555, 1563, 1572, 1586, 1603, 1615, 1632} + +func (i BuiltinFunc) String() string { + if i < 0 || i >= BuiltinFunc(len(_BuiltinFunc_index)-1) { + return "BuiltinFunc(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _BuiltinFunc_name[_BuiltinFunc_index[i]:_BuiltinFunc_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/asm/instruction.go b/vendor/github.com/cilium/ebpf/asm/instruction.go new file mode 100644 index 0000000000..c8ed6cfb49 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/instruction.go @@ -0,0 +1,416 @@ +package asm + +import ( + "encoding/binary" + "fmt" + "io" + "math" + "strings" + + "github.com/pkg/errors" +) + +// InstructionSize is the size of a BPF instruction in bytes +const InstructionSize = 8 + +// Instruction is a single eBPF instruction. +type Instruction struct { + OpCode OpCode + Dst Register + Src Register + Offset int16 + Constant int64 + Reference string + Symbol string +} + +// Sym creates a symbol. +func (ins Instruction) Sym(name string) Instruction { + ins.Symbol = name + return ins +} + +// Unmarshal decodes a BPF instruction. 
+func (ins *Instruction) Unmarshal(r io.Reader, bo binary.ByteOrder) (uint64, error) { + var bi bpfInstruction + err := binary.Read(r, bo, &bi) + if err != nil { + return 0, err + } + + ins.OpCode = bi.OpCode + ins.Dst = bi.Registers.Dst() + ins.Src = bi.Registers.Src() + ins.Offset = bi.Offset + ins.Constant = int64(bi.Constant) + + if !bi.OpCode.isDWordLoad() { + return InstructionSize, nil + } + + var bi2 bpfInstruction + if err := binary.Read(r, bo, &bi2); err != nil { + // No Wrap, to avoid io.EOF clash + return 0, errors.New("64bit immediate is missing second half") + } + if bi2.OpCode != 0 || bi2.Offset != 0 || bi2.Registers != 0 { + return 0, errors.New("64bit immediate has non-zero fields") + } + ins.Constant = int64(uint64(uint32(bi2.Constant))<<32 | uint64(uint32(bi.Constant))) + + return 2 * InstructionSize, nil +} + +// Marshal encodes a BPF instruction. +func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error) { + if ins.OpCode == InvalidOpCode { + return 0, errors.New("invalid opcode") + } + + isDWordLoad := ins.OpCode.isDWordLoad() + + cons := int32(ins.Constant) + if isDWordLoad { + // Encode least significant 32bit first for 64bit operations. + cons = int32(uint32(ins.Constant)) + } + + bpfi := bpfInstruction{ + ins.OpCode, + newBPFRegisters(ins.Dst, ins.Src), + ins.Offset, + cons, + } + + if err := binary.Write(w, bo, &bpfi); err != nil { + return 0, err + } + + if !isDWordLoad { + return InstructionSize, nil + } + + bpfi = bpfInstruction{ + Constant: int32(ins.Constant >> 32), + } + + if err := binary.Write(w, bo, &bpfi); err != nil { + return 0, err + } + + return 2 * InstructionSize, nil +} + +// RewriteMapPtr changes an instruction to use a new map fd. +// +// Returns an error if the fd is invalid, or the instruction +// is incorrect. 
+func (ins *Instruction) RewriteMapPtr(fd int) error { + if !ins.OpCode.isDWordLoad() { + return errors.Errorf("%s is not a 64 bit load", ins.OpCode) + } + + if fd < 0 { + return errors.New("invalid fd") + } + + ins.Src = R1 + ins.Constant = int64(fd) + return nil +} + +// Format implements fmt.Formatter. +func (ins Instruction) Format(f fmt.State, c rune) { + if c != 'v' { + fmt.Fprintf(f, "{UNRECOGNIZED: %c}", c) + return + } + + op := ins.OpCode + + if op == InvalidOpCode { + fmt.Fprint(f, "INVALID") + return + } + + // Omit trailing space for Exit + if op.JumpOp() == Exit { + fmt.Fprint(f, op) + return + } + + fmt.Fprintf(f, "%v ", op) + switch cls := op.Class(); cls { + case LdClass, LdXClass, StClass, StXClass: + switch op.Mode() { + case ImmMode: + fmt.Fprintf(f, "dst: %s imm: %d", ins.Dst, ins.Constant) + case AbsMode: + fmt.Fprintf(f, "imm: %d", ins.Constant) + case IndMode: + fmt.Fprintf(f, "dst: %s src: %s imm: %d", ins.Dst, ins.Src, ins.Constant) + case MemMode: + fmt.Fprintf(f, "dst: %s src: %s off: %d imm: %d", ins.Dst, ins.Src, ins.Offset, ins.Constant) + case XAddMode: + fmt.Fprintf(f, "dst: %s src: %s", ins.Dst, ins.Src) + } + + case ALU64Class, ALUClass: + fmt.Fprintf(f, "dst: %s ", ins.Dst) + if op.ALUOp() == Swap || op.Source() == ImmSource { + fmt.Fprintf(f, "imm: %d", ins.Constant) + } else { + fmt.Fprintf(f, "src: %s", ins.Src) + } + + case JumpClass: + switch jop := op.JumpOp(); jop { + case Call: + if ins.Src == R1 { + // bpf-to-bpf call + fmt.Fprint(f, ins.Constant) + } else { + fmt.Fprint(f, BuiltinFunc(ins.Constant)) + } + + default: + fmt.Fprintf(f, "dst: %s off: %d ", ins.Dst, ins.Offset) + if op.Source() == ImmSource { + fmt.Fprintf(f, "imm: %d", ins.Constant) + } else { + fmt.Fprintf(f, "src: %s", ins.Src) + } + } + } + + if ins.Reference != "" { + fmt.Fprintf(f, " <%s>", ins.Reference) + } +} + +// Instructions is an eBPF program. 
+type Instructions []Instruction + +func (insns Instructions) String() string { + return fmt.Sprint(insns) +} + +// RewriteMapPtr rewrites all loads of a specific map pointer to a new fd. +// +// Returns an error if the symbol isn't used, see IsUnreferencedSymbol. +func (insns Instructions) RewriteMapPtr(symbol string, fd int) error { + if symbol == "" { + return errors.New("empty symbol") + } + + found := false + for i := range insns { + ins := &insns[i] + if ins.Reference != symbol { + continue + } + + if err := ins.RewriteMapPtr(fd); err != nil { + return err + } + + found = true + } + + if !found { + return &unreferencedSymbolError{symbol} + } + + return nil +} + +// SymbolOffsets returns the set of symbols and their offset in +// the instructions. +func (insns Instructions) SymbolOffsets() (map[string]int, error) { + offsets := make(map[string]int) + + for i, ins := range insns { + if ins.Symbol == "" { + continue + } + + if _, ok := offsets[ins.Symbol]; ok { + return nil, errors.Errorf("duplicate symbol %s", ins.Symbol) + } + + offsets[ins.Symbol] = i + } + + return offsets, nil +} + +// ReferenceOffsets returns the set of references and their offset in +// the instructions. +func (insns Instructions) ReferenceOffsets() map[string][]int { + offsets := make(map[string][]int) + + for i, ins := range insns { + if ins.Reference == "" { + continue + } + + offsets[ins.Reference] = append(offsets[ins.Reference], i) + } + + return offsets +} + +func (insns Instructions) marshalledOffsets() (map[string]int, error) { + symbols := make(map[string]int) + + marshalledPos := 0 + for _, ins := range insns { + currentPos := marshalledPos + marshalledPos += ins.OpCode.marshalledInstructions() + + if ins.Symbol == "" { + continue + } + + if _, ok := symbols[ins.Symbol]; ok { + return nil, errors.Errorf("duplicate symbol %s", ins.Symbol) + } + + symbols[ins.Symbol] = currentPos + } + + return symbols, nil +} + +// Format implements fmt.Formatter. 
+// +// You can control indentation of symbols by +// specifying a width. Setting a precision controls the indentation of +// instructions. +// The default character is a tab, which can be overriden by specifying +// the ' ' space flag. +func (insns Instructions) Format(f fmt.State, c rune) { + if c != 's' && c != 'v' { + fmt.Fprintf(f, "{UNKNOWN FORMAT '%c'}", c) + return + } + + // Precision is better in this case, because it allows + // specifying 0 padding easily. + padding, ok := f.Precision() + if !ok { + padding = 1 + } + + indent := strings.Repeat("\t", padding) + if f.Flag(' ') { + indent = strings.Repeat(" ", padding) + } + + symPadding, ok := f.Width() + if !ok { + symPadding = padding - 1 + } + if symPadding < 0 { + symPadding = 0 + } + + symIndent := strings.Repeat("\t", symPadding) + if f.Flag(' ') { + symIndent = strings.Repeat(" ", symPadding) + } + + // Figure out how many digits we need to represent the highest + // offset. + highestOffset := 0 + for _, ins := range insns { + highestOffset += ins.OpCode.marshalledInstructions() + } + offsetWidth := int(math.Ceil(math.Log10(float64(highestOffset)))) + + offset := 0 + for _, ins := range insns { + if ins.Symbol != "" { + fmt.Fprintf(f, "%s%s:\n", symIndent, ins.Symbol) + } + fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, offset, ins) + offset += ins.OpCode.marshalledInstructions() + } + + return +} + +// Marshal encodes a BPF program into the kernel format. 
+func (insns Instructions) Marshal(w io.Writer, bo binary.ByteOrder) error { + absoluteOffsets, err := insns.marshalledOffsets() + if err != nil { + return err + } + + num := 0 + for i, ins := range insns { + switch { + case ins.OpCode.JumpOp() == Call && ins.Constant == -1: + // Rewrite bpf to bpf call + offset, ok := absoluteOffsets[ins.Reference] + if !ok { + return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference) + } + + ins.Constant = int64(offset - num - 1) + + case ins.OpCode.Class() == JumpClass && ins.Offset == -1: + // Rewrite jump to label + offset, ok := absoluteOffsets[ins.Reference] + if !ok { + return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference) + } + + ins.Offset = int16(offset - num - 1) + } + + n, err := ins.Marshal(w, bo) + if err != nil { + return errors.Wrapf(err, "instruction %d", i) + } + + num += int(n / InstructionSize) + } + return nil +} + +type bpfInstruction struct { + OpCode OpCode + Registers bpfRegisters + Offset int16 + Constant int32 +} + +type bpfRegisters uint8 + +func newBPFRegisters(dst, src Register) bpfRegisters { + return bpfRegisters((src << 4) | (dst & 0xF)) +} + +func (r bpfRegisters) Dst() Register { + return Register(r & 0xF) +} + +func (r bpfRegisters) Src() Register { + return Register(r >> 4) +} + +type unreferencedSymbolError struct { + symbol string +} + +func (use *unreferencedSymbolError) Error() string { + return fmt.Sprintf("unreferenced symbol %s", use.symbol) +} + +// IsUnreferencedSymbol returns true if err was caused by +// an unreferenced symbol. 
+func IsUnreferencedSymbol(err error) bool { + _, ok := err.(*unreferencedSymbolError) + return ok +} diff --git a/vendor/github.com/cilium/ebpf/asm/jump.go b/vendor/github.com/cilium/ebpf/asm/jump.go new file mode 100644 index 0000000000..33c9b56562 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/jump.go @@ -0,0 +1,109 @@ +package asm + +//go:generate stringer -output jump_string.go -type=JumpOp + +// JumpOp affect control flow. +// +// msb lsb +// +----+-+---+ +// |OP |s|cls| +// +----+-+---+ +type JumpOp uint8 + +const jumpMask OpCode = aluMask + +const ( + // InvalidJumpOp is returned by getters when invoked + // on non branch OpCodes + InvalidJumpOp JumpOp = 0xff + // Ja jumps by offset unconditionally + Ja JumpOp = 0x00 + // JEq jumps by offset if r == imm + JEq JumpOp = 0x10 + // JGT jumps by offset if r > imm + JGT JumpOp = 0x20 + // JGE jumps by offset if r >= imm + JGE JumpOp = 0x30 + // JSet jumps by offset if r & imm + JSet JumpOp = 0x40 + // JNE jumps by offset if r != imm + JNE JumpOp = 0x50 + // JSGT jumps by offset if signed r > signed imm + JSGT JumpOp = 0x60 + // JSGE jumps by offset if signed r >= signed imm + JSGE JumpOp = 0x70 + // Call builtin or user defined function from imm + Call JumpOp = 0x80 + // Exit ends execution, with value in r0 + Exit JumpOp = 0x90 + // JLT jumps by offset if r < imm + JLT JumpOp = 0xa0 + // JLE jumps by offset if r <= imm + JLE JumpOp = 0xb0 + // JSLT jumps by offset if signed r < signed imm + JSLT JumpOp = 0xc0 + // JSLE jumps by offset if signed r <= signed imm + JSLE JumpOp = 0xd0 +) + +// Return emits an exit instruction. +// +// Requires a return value in R0. +func Return() Instruction { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Exit), + } +} + +// Op returns the OpCode for a given jump source. 
+func (op JumpOp) Op(source Source) OpCode { + return OpCode(JumpClass).SetJumpOp(op).SetSource(source) +} + +// Imm compares dst to value, and adjusts PC by offset if the condition is fulfilled. +func (op JumpOp) Imm(dst Register, value int32, label string) Instruction { + if op == Exit || op == Call || op == Ja { + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(op).SetSource(ImmSource), + Dst: dst, + Offset: -1, + Constant: int64(value), + Reference: label, + } +} + +// Reg compares dst to src, and adjusts PC by offset if the condition is fulfilled. +func (op JumpOp) Reg(dst, src Register, label string) Instruction { + if op == Exit || op == Call || op == Ja { + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(op).SetSource(RegSource), + Dst: dst, + Src: src, + Offset: -1, + Reference: label, + } +} + +// Label adjusts PC to the address of the label. +func (op JumpOp) Label(label string) Instruction { + if op == Call { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Call), + Src: R1, + Constant: -1, + Reference: label, + } + } + + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(op), + Offset: -1, + Reference: label, + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/jump_string.go b/vendor/github.com/cilium/ebpf/asm/jump_string.go new file mode 100644 index 0000000000..85a4aaffa5 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/jump_string.go @@ -0,0 +1,53 @@ +// Code generated by "stringer -output jump_string.go -type=JumpOp"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[InvalidJumpOp-255] + _ = x[Ja-0] + _ = x[JEq-16] + _ = x[JGT-32] + _ = x[JGE-48] + _ = x[JSet-64] + _ = x[JNE-80] + _ = x[JSGT-96] + _ = x[JSGE-112] + _ = x[Call-128] + _ = x[Exit-144] + _ = x[JLT-160] + _ = x[JLE-176] + _ = x[JSLT-192] + _ = x[JSLE-208] +} + +const _JumpOp_name = "JaJEqJGTJGEJSetJNEJSGTJSGECallExitJLTJLEJSLTJSLEInvalidJumpOp" + +var _JumpOp_map = map[JumpOp]string{ + 0: _JumpOp_name[0:2], + 16: _JumpOp_name[2:5], + 32: _JumpOp_name[5:8], + 48: _JumpOp_name[8:11], + 64: _JumpOp_name[11:15], + 80: _JumpOp_name[15:18], + 96: _JumpOp_name[18:22], + 112: _JumpOp_name[22:26], + 128: _JumpOp_name[26:30], + 144: _JumpOp_name[30:34], + 160: _JumpOp_name[34:37], + 176: _JumpOp_name[37:40], + 192: _JumpOp_name[40:44], + 208: _JumpOp_name[44:48], + 255: _JumpOp_name[48:61], +} + +func (i JumpOp) String() string { + if str, ok := _JumpOp_map[i]; ok { + return str + } + return "JumpOp(" + strconv.FormatInt(int64(i), 10) + ")" +} diff --git a/vendor/github.com/cilium/ebpf/asm/load_store.go b/vendor/github.com/cilium/ebpf/asm/load_store.go new file mode 100644 index 0000000000..ab0e92fc3c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/load_store.go @@ -0,0 +1,189 @@ +package asm + +//go:generate stringer -output load_store_string.go -type=Mode,Size + +// Mode for load and store operations +// +// msb lsb +// +---+--+---+ +// |MDE|sz|cls| +// +---+--+---+ +type Mode uint8 + +const modeMask OpCode = 0xe0 + +const ( + // InvalidMode is returned by getters when invoked + // on non load / store OpCodes + InvalidMode Mode = 0xff + // ImmMode - immediate value + ImmMode Mode = 0x00 + // AbsMode - immediate value + offset + AbsMode Mode = 0x20 + // IndMode - indirect (imm+src) + IndMode Mode = 0x40 + // MemMode - load from memory + MemMode Mode = 0x60 + // XAddMode - add atomically across processors. 
+ XAddMode Mode = 0xc0 +) + +// Size of load and store operations +// +// msb lsb +// +---+--+---+ +// |mde|SZ|cls| +// +---+--+---+ +type Size uint8 + +const sizeMask OpCode = 0x18 + +const ( + // InvalidSize is returned by getters when invoked + // on non load / store OpCodes + InvalidSize Size = 0xff + // DWord - double word; 64 bits + DWord Size = 0x18 + // Word - word; 32 bits + Word Size = 0x00 + // Half - half-word; 16 bits + Half Size = 0x08 + // Byte - byte; 8 bits + Byte Size = 0x10 +) + +// Sizeof returns the size in bytes. +func (s Size) Sizeof() int { + switch s { + case DWord: + return 8 + case Word: + return 4 + case Half: + return 2 + case Byte: + return 1 + default: + return -1 + } +} + +// LoadMemOp returns the OpCode to load a value of given size from memory. +func LoadMemOp(size Size) OpCode { + return OpCode(LdXClass).SetMode(MemMode).SetSize(size) +} + +// LoadMem emits `dst = *(size *)(src + offset)`. +func LoadMem(dst, src Register, offset int16, size Size) Instruction { + return Instruction{ + OpCode: LoadMemOp(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// LoadImmOp returns the OpCode to load an immediate of given size. +// +// As of kernel 4.20, only DWord size is accepted. +func LoadImmOp(size Size) OpCode { + return OpCode(LdClass).SetMode(ImmMode).SetSize(size) +} + +// LoadImm emits `dst = (size)value`. +// +// As of kernel 4.20, only DWord size is accepted. +func LoadImm(dst Register, value int64, size Size) Instruction { + return Instruction{ + OpCode: LoadImmOp(size), + Dst: dst, + Constant: value, + } +} + +// LoadMapPtr stores a pointer to a map in dst. +func LoadMapPtr(dst Register, fd int) Instruction { + if fd < 0 { + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: LoadImmOp(DWord), + Dst: dst, + Src: R1, + Constant: int64(fd), + } +} + +// LoadIndOp returns the OpCode for loading a value of given size from an sk_buff. 
+func LoadIndOp(size Size) OpCode { + return OpCode(LdClass).SetMode(IndMode).SetSize(size) +} + +// LoadInd emits `dst = ntoh(*(size *)(((sk_buff *)R6)->data + src + offset))`. +func LoadInd(dst, src Register, offset int32, size Size) Instruction { + return Instruction{ + OpCode: LoadIndOp(size), + Dst: dst, + Src: src, + Constant: int64(offset), + } +} + +// LoadAbsOp returns the OpCode for loading a value of given size from an sk_buff. +func LoadAbsOp(size Size) OpCode { + return OpCode(LdClass).SetMode(AbsMode).SetSize(size) +} + +// LoadAbs emits `r0 = ntoh(*(size *)(((sk_buff *)R6)->data + offset))`. +func LoadAbs(offset int32, size Size) Instruction { + return Instruction{ + OpCode: LoadAbsOp(size), + Dst: R0, + Constant: int64(offset), + } +} + +// StoreMemOp returns the OpCode for storing a register of given size in memory. +func StoreMemOp(size Size) OpCode { + return OpCode(StXClass).SetMode(MemMode).SetSize(size) +} + +// StoreMem emits `*(size *)(dst + offset) = src` +func StoreMem(dst Register, offset int16, src Register, size Size) Instruction { + return Instruction{ + OpCode: StoreMemOp(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// StoreImmOp returns the OpCode for storing an immediate of given size in memory. +func StoreImmOp(size Size) OpCode { + return OpCode(StClass).SetMode(MemMode).SetSize(size) +} + +// StoreImm emits `*(size *)(dst + offset) = value`. +func StoreImm(dst Register, offset int16, value int64, size Size) Instruction { + return Instruction{ + OpCode: StoreImmOp(size), + Dst: dst, + Offset: offset, + Constant: value, + } +} + +// StoreXAddOp returns the OpCode to atomically add a register to a value in memory. +func StoreXAddOp(size Size) OpCode { + return OpCode(StXClass).SetMode(XAddMode).SetSize(size) +} + +// StoreXAdd atomically adds src to *dst. 
+func StoreXAdd(dst, src Register, size Size) Instruction { + return Instruction{ + OpCode: StoreXAddOp(size), + Dst: dst, + Src: src, + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/load_store_string.go b/vendor/github.com/cilium/ebpf/asm/load_store_string.go new file mode 100644 index 0000000000..76d29a0756 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/load_store_string.go @@ -0,0 +1,80 @@ +// Code generated by "stringer -output load_store_string.go -type=Mode,Size"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidMode-255] + _ = x[ImmMode-0] + _ = x[AbsMode-32] + _ = x[IndMode-64] + _ = x[MemMode-96] + _ = x[XAddMode-192] +} + +const ( + _Mode_name_0 = "ImmMode" + _Mode_name_1 = "AbsMode" + _Mode_name_2 = "IndMode" + _Mode_name_3 = "MemMode" + _Mode_name_4 = "XAddMode" + _Mode_name_5 = "InvalidMode" +) + +func (i Mode) String() string { + switch { + case i == 0: + return _Mode_name_0 + case i == 32: + return _Mode_name_1 + case i == 64: + return _Mode_name_2 + case i == 96: + return _Mode_name_3 + case i == 192: + return _Mode_name_4 + case i == 255: + return _Mode_name_5 + default: + return "Mode(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[InvalidSize-255] + _ = x[DWord-24] + _ = x[Word-0] + _ = x[Half-8] + _ = x[Byte-16] +} + +const ( + _Size_name_0 = "Word" + _Size_name_1 = "Half" + _Size_name_2 = "Byte" + _Size_name_3 = "DWord" + _Size_name_4 = "InvalidSize" +) + +func (i Size) String() string { + switch { + case i == 0: + return _Size_name_0 + case i == 8: + return _Size_name_1 + case i == 16: + return _Size_name_2 + case i == 24: + return _Size_name_3 + case i == 255: + return _Size_name_4 + default: + return "Size(" + strconv.FormatInt(int64(i), 10) + ")" + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/opcode.go b/vendor/github.com/cilium/ebpf/asm/opcode.go new file mode 100644 index 0000000000..d796de3fe0 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/opcode.go @@ -0,0 +1,237 @@ +package asm + +import ( + "fmt" + "strings" +) + +//go:generate stringer -output opcode_string.go -type=Class + +type encoding int + +const ( + unknownEncoding encoding = iota + loadOrStore + jumpOrALU +) + +// Class of operations +// +// msb lsb +// +---+--+---+ +// | ?? |CLS| +// +---+--+---+ +type Class uint8 + +const classMask OpCode = 0x07 + +const ( + // LdClass load memory + LdClass Class = 0x00 + // LdXClass load memory from constant + LdXClass Class = 0x01 + // StClass load register from memory + StClass Class = 0x02 + // StXClass load register from constant + StXClass Class = 0x03 + // ALUClass arithmetic operators + ALUClass Class = 0x04 + // JumpClass jump operators + JumpClass Class = 0x05 + // ALU64Class arithmetic in 64 bit mode + ALU64Class Class = 0x07 +) + +func (cls Class) encoding() encoding { + switch cls { + case LdClass, LdXClass, StClass, StXClass: + return loadOrStore + case ALU64Class, ALUClass, JumpClass: + return jumpOrALU + default: + return unknownEncoding + } +} + +// OpCode is a packed eBPF opcode. +// +// Its encoding is defined by a Class value: +// +// msb lsb +// +----+-+---+ +// | ???? 
|CLS| +// +----+-+---+ +type OpCode uint8 + +// InvalidOpCode is returned by setters on OpCode +const InvalidOpCode OpCode = 0xff + +// marshalledInstructions returns the number of BPF instructions required +// to encode this opcode. +func (op OpCode) marshalledInstructions() int { + if op == LoadImmOp(DWord) { + return 2 + } + return 1 +} + +func (op OpCode) isDWordLoad() bool { + return op == LoadImmOp(DWord) +} + +// Class returns the class of operation. +func (op OpCode) Class() Class { + return Class(op & classMask) +} + +// Mode returns the mode for load and store operations. +func (op OpCode) Mode() Mode { + if op.Class().encoding() != loadOrStore { + return InvalidMode + } + return Mode(op & modeMask) +} + +// Size returns the size for load and store operations. +func (op OpCode) Size() Size { + if op.Class().encoding() != loadOrStore { + return InvalidSize + } + return Size(op & sizeMask) +} + +// Source returns the source for branch and ALU operations. +func (op OpCode) Source() Source { + if op.Class().encoding() != jumpOrALU || op.ALUOp() == Swap { + return InvalidSource + } + return Source(op & sourceMask) +} + +// ALUOp returns the ALUOp. +func (op OpCode) ALUOp() ALUOp { + if op.Class().encoding() != jumpOrALU { + return InvalidALUOp + } + return ALUOp(op & aluMask) +} + +// Endianness returns the Endianness for a byte swap instruction. +func (op OpCode) Endianness() Endianness { + if op.ALUOp() != Swap { + return InvalidEndian + } + return Endianness(op & endianMask) +} + +// JumpOp returns the JumpOp. +func (op OpCode) JumpOp() JumpOp { + if op.Class().encoding() != jumpOrALU { + return InvalidJumpOp + } + return JumpOp(op & jumpMask) +} + +// SetMode sets the mode on load and store operations. +// +// Returns InvalidOpCode if op is of the wrong class. 
+func (op OpCode) SetMode(mode Mode) OpCode { + if op.Class().encoding() != loadOrStore || !valid(OpCode(mode), modeMask) { + return InvalidOpCode + } + return (op & ^modeMask) | OpCode(mode) +} + +// SetSize sets the size on load and store operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetSize(size Size) OpCode { + if op.Class().encoding() != loadOrStore || !valid(OpCode(size), sizeMask) { + return InvalidOpCode + } + return (op & ^sizeMask) | OpCode(size) +} + +// SetSource sets the source on jump and ALU operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetSource(source Source) OpCode { + if op.Class().encoding() != jumpOrALU || !valid(OpCode(source), sourceMask) { + return InvalidOpCode + } + return (op & ^sourceMask) | OpCode(source) +} + +// SetALUOp sets the ALUOp on ALU operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetALUOp(alu ALUOp) OpCode { + class := op.Class() + if (class != ALUClass && class != ALU64Class) || !valid(OpCode(alu), aluMask) { + return InvalidOpCode + } + return (op & ^aluMask) | OpCode(alu) +} + +// SetJumpOp sets the JumpOp on jump operations. +// +// Returns InvalidOpCode if op is of the wrong class. 
+func (op OpCode) SetJumpOp(jump JumpOp) OpCode { + if op.Class() != JumpClass || !valid(OpCode(jump), jumpMask) { + return InvalidOpCode + } + return (op & ^jumpMask) | OpCode(jump) +} + +func (op OpCode) String() string { + var f strings.Builder + + switch class := op.Class(); class { + case LdClass, LdXClass, StClass, StXClass: + f.WriteString(strings.TrimSuffix(class.String(), "Class")) + + mode := op.Mode() + f.WriteString(strings.TrimSuffix(mode.String(), "Mode")) + + switch op.Size() { + case DWord: + f.WriteString("DW") + case Word: + f.WriteString("W") + case Half: + f.WriteString("H") + case Byte: + f.WriteString("B") + } + + case ALU64Class, ALUClass: + f.WriteString(op.ALUOp().String()) + + if op.ALUOp() == Swap { + // Width for Endian is controlled by Constant + f.WriteString(op.Endianness().String()) + } else { + if class == ALUClass { + f.WriteString("32") + } + + f.WriteString(strings.TrimSuffix(op.Source().String(), "Source")) + } + + case JumpClass: + f.WriteString(op.JumpOp().String()) + if jop := op.JumpOp(); jop != Exit && jop != Call { + f.WriteString(strings.TrimSuffix(op.Source().String(), "Source")) + } + + default: + fmt.Fprintf(&f, "%#x", op) + } + + return f.String() +} + +// valid returns true if all bits in value are covered by mask. +func valid(value, mask OpCode) bool { + return value & ^mask == 0 +} diff --git a/vendor/github.com/cilium/ebpf/asm/opcode_string.go b/vendor/github.com/cilium/ebpf/asm/opcode_string.go new file mode 100644 index 0000000000..079ce1db0b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/opcode_string.go @@ -0,0 +1,38 @@ +// Code generated by "stringer -output opcode_string.go -type=Class"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[LdClass-0] + _ = x[LdXClass-1] + _ = x[StClass-2] + _ = x[StXClass-3] + _ = x[ALUClass-4] + _ = x[JumpClass-5] + _ = x[ALU64Class-7] +} + +const ( + _Class_name_0 = "LdClassLdXClassStClassStXClassALUClassJumpClass" + _Class_name_1 = "ALU64Class" +) + +var ( + _Class_index_0 = [...]uint8{0, 7, 15, 22, 30, 38, 47} +) + +func (i Class) String() string { + switch { + case 0 <= i && i <= 5: + return _Class_name_0[_Class_index_0[i]:_Class_index_0[i+1]] + case i == 7: + return _Class_name_1 + default: + return "Class(" + strconv.FormatInt(int64(i), 10) + ")" + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/register.go b/vendor/github.com/cilium/ebpf/asm/register.go new file mode 100644 index 0000000000..4f284fbe7d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/register.go @@ -0,0 +1,42 @@ +package asm + +import ( + "fmt" +) + +// Register is the source or destination of most operations. +type Register uint8 + +// R0 contains return values. +const R0 Register = 0 + +// Registers for function arguments. +const ( + R1 Register = R0 + 1 + iota + R2 + R3 + R4 + R5 +) + +// Callee saved registers preserved by function calls. +const ( + R6 Register = R5 + 1 + iota + R7 + R8 + R9 +) + +// Read-only frame pointer to access stack. +const ( + R10 Register = R9 + 1 + RFP = R10 +) + +func (r Register) String() string { + v := uint8(r) + if v == 10 { + return "rfp" + } + return fmt.Sprintf("r%d", v) +} diff --git a/vendor/github.com/cilium/ebpf/collection.go b/vendor/github.com/cilium/ebpf/collection.go new file mode 100644 index 0000000000..5ad1a5ec4b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/collection.go @@ -0,0 +1,148 @@ +package ebpf + +import ( + "github.com/cilium/ebpf/asm" + "github.com/pkg/errors" +) + +// CollectionOptions control loading a collection into the kernel. +type CollectionOptions struct { + Programs ProgramOptions +} + +// CollectionSpec describes a collection. 
+type CollectionSpec struct { + Maps map[string]*MapSpec + Programs map[string]*ProgramSpec +} + +// Copy returns a recursive copy of the spec. +func (cs *CollectionSpec) Copy() *CollectionSpec { + if cs == nil { + return nil + } + + cpy := CollectionSpec{ + Maps: make(map[string]*MapSpec, len(cs.Maps)), + Programs: make(map[string]*ProgramSpec, len(cs.Programs)), + } + + for name, spec := range cs.Maps { + cpy.Maps[name] = spec.Copy() + } + + for name, spec := range cs.Programs { + cpy.Programs[name] = spec.Copy() + } + + return &cpy +} + +// Collection is a collection of Programs and Maps associated +// with their symbols +type Collection struct { + Programs map[string]*Program + Maps map[string]*Map +} + +// NewCollection creates a Collection from a specification. +// +// Only maps referenced by at least one of the programs are initialized. +func NewCollection(spec *CollectionSpec) (*Collection, error) { + return NewCollectionWithOptions(spec, CollectionOptions{}) +} + +// NewCollectionWithOptions creates a Collection from a specification. +// +// Only maps referenced by at least one of the programs are initialized. +func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (*Collection, error) { + maps := make(map[string]*Map) + for mapName, mapSpec := range spec.Maps { + m, err := NewMap(mapSpec) + if err != nil { + return nil, errors.Wrapf(err, "map %s", mapName) + } + maps[mapName] = m + } + + progs := make(map[string]*Program) + for progName, origProgSpec := range spec.Programs { + progSpec := origProgSpec.Copy() + + // Rewrite any reference to a valid map. 
+ for i := range progSpec.Instructions { + var ( + ins = &progSpec.Instructions[i] + m = maps[ins.Reference] + ) + + if ins.Reference == "" || m == nil { + continue + } + + if ins.Src == asm.R1 { + // Don't overwrite maps already rewritten, users can + // rewrite programs in the spec themselves + continue + } + + if err := ins.RewriteMapPtr(m.FD()); err != nil { + return nil, errors.Wrapf(err, "progam %s: map %s", progName, ins.Reference) + } + } + + prog, err := NewProgramWithOptions(progSpec, opts.Programs) + if err != nil { + return nil, errors.Wrapf(err, "program %s", progName) + } + progs[progName] = prog + } + + return &Collection{ + progs, + maps, + }, nil +} + +// LoadCollection parses an object file and converts it to a collection. +func LoadCollection(file string) (*Collection, error) { + spec, err := LoadCollectionSpec(file) + if err != nil { + return nil, err + } + return NewCollection(spec) +} + +// Close frees all maps and programs associated with the collection. +// +// The collection mustn't be used afterwards. +func (coll *Collection) Close() { + for _, prog := range coll.Programs { + prog.Close() + } + for _, m := range coll.Maps { + m.Close() + } +} + +// DetachMap removes the named map from the Collection. +// +// This means that a later call to Close() will not affect this map. +// +// Returns nil if no map of that name exists. +func (coll *Collection) DetachMap(name string) *Map { + m := coll.Maps[name] + delete(coll.Maps, name) + return m +} + +// DetachProgram removes the named program from the Collection. +// +// This means that a later call to Close() will not affect this program. +// +// Returns nil if no program of that name exists. 
+func (coll *Collection) DetachProgram(name string) *Program { + p := coll.Programs[name] + delete(coll.Programs, name) + return p +} diff --git a/vendor/github.com/cilium/ebpf/doc.go b/vendor/github.com/cilium/ebpf/doc.go new file mode 100644 index 0000000000..d96e6b1e6d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/doc.go @@ -0,0 +1,17 @@ +// Package ebpf is a toolkit for working with eBPF programs. +// +// eBPF programs are small snippets of code which are executed directly +// in a VM in the Linux kernel, which makes them very fast and flexible. +// Many Linux subsystems now accept eBPF programs. This makes it possible +// to implement highly application specific logic inside the kernel, +// without having to modify the actual kernel itself. +// +// This package is designed for long-running processes which +// want to use eBPF to implement part of their application logic. It has no +// run-time dependencies outside of the library and the Linux kernel itself. +// eBPF code should be compiled ahead of time using clang, and shipped with +// your application as any other resource. +// +// This package doesn't include code required to attach eBPF to Linux +// subsystems, since this varies per subsystem. +package ebpf diff --git a/vendor/github.com/cilium/ebpf/elf_reader.go b/vendor/github.com/cilium/ebpf/elf_reader.go new file mode 100644 index 0000000000..3bdc0849bd --- /dev/null +++ b/vendor/github.com/cilium/ebpf/elf_reader.go @@ -0,0 +1,392 @@ +package ebpf + +import ( + "bytes" + "debug/elf" + "encoding/binary" + "fmt" + "io" + "os" + "strings" + + "github.com/cilium/ebpf/asm" + + "github.com/pkg/errors" +) + +type elfCode struct { + *elf.File + symbols []elf.Symbol + symbolsPerSection map[elf.SectionIndex]map[uint64]string +} + +// LoadCollectionSpec parses an ELF file into a CollectionSpec. 
+func LoadCollectionSpec(file string) (*CollectionSpec, error) { + f, err := os.Open(file) + if err != nil { + return nil, err + } + defer f.Close() + + spec, err := LoadCollectionSpecFromReader(f) + return spec, errors.Wrapf(err, "file %s", file) +} + +// LoadCollectionSpecFromReader parses an ELF file into a CollectionSpec. +func LoadCollectionSpecFromReader(code io.ReaderAt) (*CollectionSpec, error) { + f, err := elf.NewFile(code) + if err != nil { + return nil, err + } + defer f.Close() + + symbols, err := f.Symbols() + if err != nil { + return nil, errors.Wrap(err, "load symbols") + } + + ec := &elfCode{f, symbols, symbolsPerSection(symbols)} + + var licenseSection, versionSection *elf.Section + progSections := make(map[elf.SectionIndex]*elf.Section) + relSections := make(map[elf.SectionIndex]*elf.Section) + mapSections := make(map[elf.SectionIndex]*elf.Section) + for i, sec := range ec.Sections { + switch { + case strings.HasPrefix(sec.Name, "license"): + licenseSection = sec + case strings.HasPrefix(sec.Name, "version"): + versionSection = sec + case strings.HasPrefix(sec.Name, "maps"): + mapSections[elf.SectionIndex(i)] = sec + case sec.Type == elf.SHT_REL: + if int(sec.Info) >= len(ec.Sections) { + return nil, errors.Errorf("found relocation section %v for missing section %v", i, sec.Info) + } + + // Store relocations under the section index of the target + idx := elf.SectionIndex(sec.Info) + if relSections[idx] != nil { + return nil, errors.Errorf("section %d has multiple relocation sections", idx) + } + relSections[idx] = sec + case sec.Type == elf.SHT_PROGBITS && (sec.Flags&elf.SHF_EXECINSTR) != 0 && sec.Size > 0: + progSections[elf.SectionIndex(i)] = sec + } + } + + license, err := loadLicense(licenseSection) + if err != nil { + return nil, errors.Wrap(err, "load license") + } + + version, err := loadVersion(versionSection, ec.ByteOrder) + if err != nil { + return nil, errors.Wrap(err, "load version") + } + + maps, err := ec.loadMaps(mapSections) + if 
err != nil { + return nil, errors.Wrap(err, "load maps") + } + + progs, libs, err := ec.loadPrograms(progSections, relSections, license, version) + if err != nil { + return nil, errors.Wrap(err, "load programs") + } + + if len(libs) > 0 { + for name, prog := range progs { + prog.Instructions, err = link(prog.Instructions, libs...) + if err != nil { + return nil, errors.Wrapf(err, "program %s", name) + } + } + } + + return &CollectionSpec{maps, progs}, nil +} + +func loadLicense(sec *elf.Section) (string, error) { + if sec == nil { + return "", errors.Errorf("missing license section") + } + data, err := sec.Data() + if err != nil { + return "", errors.Wrapf(err, "section %s", sec.Name) + } + return string(bytes.TrimRight(data, "\000")), nil +} + +func loadVersion(sec *elf.Section, bo binary.ByteOrder) (uint32, error) { + if sec == nil { + return 0, nil + } + + var version uint32 + err := binary.Read(sec.Open(), bo, &version) + return version, errors.Wrapf(err, "section %s", sec.Name) +} + +func (ec *elfCode) loadPrograms(progSections, relSections map[elf.SectionIndex]*elf.Section, license string, version uint32) (map[string]*ProgramSpec, []asm.Instructions, error) { + var ( + progs = make(map[string]*ProgramSpec) + libs []asm.Instructions + ) + for idx, prog := range progSections { + syms := ec.symbolsPerSection[idx] + if len(syms) == 0 { + return nil, nil, errors.Errorf("section %v: missing symbols", prog.Name) + } + + funcSym := syms[0] + if funcSym == "" { + return nil, nil, errors.Errorf("section %v: no label at start", prog.Name) + } + + rels, err := ec.loadRelocations(relSections[idx]) + if err != nil { + return nil, nil, errors.Wrapf(err, "program %s: can't load relocations", funcSym) + } + + insns, err := ec.loadInstructions(prog, syms, rels) + if err != nil { + return nil, nil, errors.Wrapf(err, "program %s: can't unmarshal instructions", funcSym) + } + + if progType, attachType := getProgType(prog.Name); progType == UnspecifiedProgram { + // There is no 
single name we can use for "library" sections, + // since they may contain multiple functions. We'll decode the + // labels they contain later on, and then link sections that way. + libs = append(libs, insns) + } else { + progs[funcSym] = &ProgramSpec{ + Name: funcSym, + Type: progType, + AttachType: attachType, + License: license, + KernelVersion: version, + Instructions: insns, + } + } + } + return progs, libs, nil +} + +func (ec *elfCode) loadInstructions(section *elf.Section, symbols, relocations map[uint64]string) (asm.Instructions, error) { + var ( + r = section.Open() + insns asm.Instructions + ins asm.Instruction + offset uint64 + ) + for { + n, err := ins.Unmarshal(r, ec.ByteOrder) + if err == io.EOF { + return insns, nil + } + if err != nil { + return nil, errors.Wrapf(err, "offset %d", offset) + } + + ins.Symbol = symbols[offset] + ins.Reference = relocations[offset] + + insns = append(insns, ins) + offset += n + } +} + +func (ec *elfCode) loadMaps(mapSections map[elf.SectionIndex]*elf.Section) (map[string]*MapSpec, error) { + var ( + maps = make(map[string]*MapSpec) + b = make([]byte, 1) + ) + for idx, sec := range mapSections { + syms := ec.symbolsPerSection[idx] + if len(syms) == 0 { + return nil, errors.Errorf("section %v: no symbols", sec.Name) + } + + if sec.Size%uint64(len(syms)) != 0 { + return nil, errors.Errorf("section %v: map descriptors are not of equal size", sec.Name) + } + + var ( + r = sec.Open() + size = sec.Size / uint64(len(syms)) + ) + for i, offset := 0, uint64(0); i < len(syms); i, offset = i+1, offset+size { + mapSym := syms[offset] + if mapSym == "" { + fmt.Println(syms) + return nil, errors.Errorf("section %s: missing symbol for map at offset %d", sec.Name, offset) + } + + if maps[mapSym] != nil { + return nil, errors.Errorf("section %v: map %v already exists", sec.Name, mapSym) + } + + lr := io.LimitReader(r, int64(size)) + + var spec MapSpec + switch { + case binary.Read(lr, ec.ByteOrder, &spec.Type) != nil: + return nil, 
errors.Errorf("map %v: missing type", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.KeySize) != nil: + return nil, errors.Errorf("map %v: missing key size", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.ValueSize) != nil: + return nil, errors.Errorf("map %v: missing value size", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.MaxEntries) != nil: + return nil, errors.Errorf("map %v: missing max entries", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.Flags) != nil: + return nil, errors.Errorf("map %v: missing flags", mapSym) + } + + for { + _, err := lr.Read(b) + if err == io.EOF { + break + } + if err != nil { + return nil, err + } + if b[0] != 0 { + return nil, errors.Errorf("map %v: unknown and non-zero fields in definition", mapSym) + } + } + + maps[mapSym] = &spec + } + } + return maps, nil +} + +func getProgType(v string) (ProgramType, AttachType) { + types := map[string]ProgramType{ + // From https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/libbpf.c#n3568 + "socket": SocketFilter, + "seccomp": SocketFilter, + "kprobe/": Kprobe, + "kretprobe/": Kprobe, + "tracepoint/": TracePoint, + "xdp": XDP, + "perf_event": PerfEvent, + "sockops": SockOps, + "sk_skb": SkSKB, + "sk_msg": SkMsg, + "lirc_mode2": LircMode2, + "flow_dissector": FlowDissector, + + "cgroup_skb/": CGroupSKB, + "cgroup/dev": CGroupDevice, + "cgroup/skb": CGroupSKB, + "cgroup/sock": CGroupSock, + "cgroup/post_bind": CGroupSock, + "cgroup/bind": CGroupSockAddr, + "cgroup/connect": CGroupSockAddr, + "cgroup/sendmsg": CGroupSockAddr, + "cgroup/recvmsg": CGroupSockAddr, + "cgroup/sysctl": CGroupSysctl, + "cgroup/getsockopt": CGroupSockopt, + "cgroup/setsockopt": CGroupSockopt, + "classifier": SchedCLS, + "action": SchedACT, + } + attachTypes := map[string]AttachType{ + "cgroup_skb/ingress": AttachCGroupInetIngress, + "cgroup_skb/egress": AttachCGroupInetEgress, + "cgroup/sock": AttachCGroupInetSockCreate, + "cgroup/post_bind4": 
AttachCGroupInet4PostBind, + "cgroup/post_bind6": AttachCGroupInet6PostBind, + "cgroup/dev": AttachCGroupDevice, + "sockops": AttachCGroupSockOps, + "sk_skb/stream_parser": AttachSkSKBStreamParser, + "sk_skb/stream_verdict": AttachSkSKBStreamVerdict, + "sk_msg": AttachSkSKBStreamVerdict, + "lirc_mode2": AttachLircMode2, + "flow_dissector": AttachFlowDissector, + "cgroup/bind4": AttachCGroupInet4Bind, + "cgroup/bind6": AttachCGroupInet6Bind, + "cgroup/connect4": AttachCGroupInet4Connect, + "cgroup/connect6": AttachCGroupInet6Connect, + "cgroup/sendmsg4": AttachCGroupUDP4Sendmsg, + "cgroup/sendmsg6": AttachCGroupUDP6Sendmsg, + "cgroup/recvmsg4": AttachCGroupUDP4Recvmsg, + "cgroup/recvmsg6": AttachCGroupUDP6Recvmsg, + "cgroup/sysctl": AttachCGroupSysctl, + "cgroup/getsockopt": AttachCGroupGetsockopt, + "cgroup/setsockopt": AttachCGroupSetsockopt, + } + attachType := AttachNone + for k, t := range attachTypes { + if strings.HasPrefix(v, k) { + attachType = t + } + } + + for k, t := range types { + if strings.HasPrefix(v, k) { + return t, attachType + } + } + return UnspecifiedProgram, AttachNone +} + +func (ec *elfCode) loadRelocations(sec *elf.Section) (map[uint64]string, error) { + rels := make(map[uint64]string) + if sec == nil { + return rels, nil + } + + if sec.Entsize < 16 { + return nil, errors.New("rels are less than 16 bytes") + } + + r := sec.Open() + for off := uint64(0); off < sec.Size; off += sec.Entsize { + ent := io.LimitReader(r, int64(sec.Entsize)) + + var rel elf.Rel64 + if binary.Read(ent, ec.ByteOrder, &rel) != nil { + return nil, errors.Errorf("can't parse relocation at offset %v", off) + } + + symNo := int(elf.R_SYM64(rel.Info) - 1) + if symNo >= len(ec.symbols) { + return nil, errors.Errorf("relocation at offset %d: symbol %v doesnt exist", off, symNo) + } + + rels[rel.Off] = ec.symbols[symNo].Name + } + return rels, nil +} + +func symbolsPerSection(symbols []elf.Symbol) map[elf.SectionIndex]map[uint64]string { + result := 
make(map[elf.SectionIndex]map[uint64]string) + for i, sym := range symbols { + switch elf.ST_TYPE(sym.Info) { + case elf.STT_NOTYPE: + // Older versions of LLVM doesn't tag + // symbols correctly. + break + case elf.STT_OBJECT: + break + case elf.STT_FUNC: + break + default: + continue + } + + if sym.Name == "" { + continue + } + + idx := sym.Section + if _, ok := result[idx]; !ok { + result[idx] = make(map[uint64]string) + } + result[idx][sym.Value] = symbols[i].Name + } + return result +} diff --git a/vendor/github.com/cilium/ebpf/go.mod b/vendor/github.com/cilium/ebpf/go.mod new file mode 100644 index 0000000000..687bdec9f6 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/go.mod @@ -0,0 +1,8 @@ +module github.com/cilium/ebpf + +go 1.12 + +require ( + github.com/pkg/errors v0.8.1 + golang.org/x/sys v0.0.0-20191022100944-742c48ecaeb7 +) diff --git a/vendor/github.com/cilium/ebpf/internal/cpu.go b/vendor/github.com/cilium/ebpf/internal/cpu.go new file mode 100644 index 0000000000..ce3cab730a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/cpu.go @@ -0,0 +1,64 @@ +package internal + +import ( + "fmt" + "os" + "sync" + + "github.com/pkg/errors" +) + +var sysCPU struct { + once sync.Once + err error + num int +} + +// PossibleCPUs returns the max number of CPUs a system may possibly have +// Logical CPU numbers must be of the form 0-n +func PossibleCPUs() (int, error) { + sysCPU.once.Do(func() { + sysCPU.num, sysCPU.err = parseCPUs("/sys/devices/system/cpu/possible") + }) + + return sysCPU.num, sysCPU.err +} + +var onlineCPU struct { + once sync.Once + err error + num int +} + +// OnlineCPUs returns the number of currently online CPUs +// Logical CPU numbers must be of the form 0-n +func OnlineCPUs() (int, error) { + onlineCPU.once.Do(func() { + onlineCPU.num, onlineCPU.err = parseCPUs("/sys/devices/system/cpu/online") + }) + + return onlineCPU.num, onlineCPU.err +} + +// parseCPUs parses the number of cpus from sysfs, +// in the format of 
"/sys/devices/system/cpu/{possible,online,..}. +// Logical CPU numbers must be of the form 0-n +func parseCPUs(path string) (int, error) { + file, err := os.Open(path) + if err != nil { + return 0, err + } + defer file.Close() + + var low, high int + n, _ := fmt.Fscanf(file, "%d-%d", &low, &high) + if n < 1 || low != 0 { + return 0, errors.Wrapf(err, "%s has unknown format", path) + } + if n == 1 { + high = low + } + + // cpus is 0 indexed + return high + 1, nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/endian.go b/vendor/github.com/cilium/ebpf/internal/endian.go new file mode 100644 index 0000000000..ac8a94e512 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/endian.go @@ -0,0 +1,24 @@ +package internal + +import ( + "encoding/binary" + "unsafe" +) + +// NativeEndian is set to either binary.BigEndian or binary.LittleEndian, +// depending on the host's endianness. +var NativeEndian binary.ByteOrder + +func init() { + if isBigEndian() { + NativeEndian = binary.BigEndian + } else { + NativeEndian = binary.LittleEndian + } +} + +func isBigEndian() (ret bool) { + i := int(0x1) + bs := (*[int(unsafe.Sizeof(i))]byte)(unsafe.Pointer(&i)) + return bs[0] == 0 +} diff --git a/vendor/github.com/cilium/ebpf/internal/feature.go b/vendor/github.com/cilium/ebpf/internal/feature.go new file mode 100644 index 0000000000..f7497d37f1 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/feature.go @@ -0,0 +1,85 @@ +package internal + +import ( + "fmt" + "sync" + + "github.com/pkg/errors" +) + +// UnsupportedFeatureError is returned by FeatureTest() functions. +type UnsupportedFeatureError struct { + // The minimum Linux mainline version required for this feature. + // Used for the error string, and for sanity checking during testing. + MinimumVersion Version + + // The name of the feature that isn't supported. 
+ Name string +} + +func (ufe *UnsupportedFeatureError) Error() string { + return fmt.Sprintf("%s not supported (requires >= %s)", ufe.Name, ufe.MinimumVersion) +} + +// FeatureTest wraps a function so that it is run at most once. +// +// name should identify the tested feature, while version must be in the +// form Major.Minor[.Patch]. +// +// Returns a descriptive UnsupportedFeatureError if the feature is not available. +func FeatureTest(name, version string, fn func() bool) func() error { + v, err := NewVersion(version) + if err != nil { + return func() error { return err } + } + + var ( + once sync.Once + result error + ) + + return func() error { + once.Do(func() { + if !fn() { + result = &UnsupportedFeatureError{ + MinimumVersion: v, + Name: name, + } + } + }) + return result + } +} + +// A Version in the form Major.Minor.Patch. +type Version [3]uint16 + +// NewVersion creates a version from a string like "Major.Minor.Patch". +// +// Patch is optional. +func NewVersion(ver string) (Version, error) { + var major, minor, patch uint16 + n, _ := fmt.Sscanf(ver, "%d.%d.%d", &major, &minor, &patch) + if n < 2 { + return Version{}, errors.Errorf("invalid version: %s", ver) + } + return Version{major, minor, patch}, nil +} + +func (v Version) String() string { + if v[2] == 0 { + return fmt.Sprintf("v%d.%d", v[0], v[1]) + } + return fmt.Sprintf("v%d.%d.%d", v[0], v[1], v[2]) +} + +// Less returns true if the version is less than another version. 
+func (v Version) Less(other Version) bool { + for i, a := range v { + if a == other[i] { + continue + } + return a < other[i] + } + return false +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go b/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go new file mode 100644 index 0000000000..1693096674 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go @@ -0,0 +1,127 @@ +// +build linux + +package unix + +import ( + "syscall" + + linux "golang.org/x/sys/unix" +) + +const ( + ENOENT = linux.ENOENT + EAGAIN = linux.EAGAIN + ENOSPC = linux.ENOSPC + EINVAL = linux.EINVAL + EPOLLIN = linux.EPOLLIN + BPF_OBJ_NAME_LEN = linux.BPF_OBJ_NAME_LEN + BPF_TAG_SIZE = linux.BPF_TAG_SIZE + SYS_BPF = linux.SYS_BPF + F_DUPFD_CLOEXEC = linux.F_DUPFD_CLOEXEC + EPOLL_CTL_ADD = linux.EPOLL_CTL_ADD + EPOLL_CLOEXEC = linux.EPOLL_CLOEXEC + O_CLOEXEC = linux.O_CLOEXEC + O_NONBLOCK = linux.O_NONBLOCK + PROT_READ = linux.PROT_READ + PROT_WRITE = linux.PROT_WRITE + MAP_SHARED = linux.MAP_SHARED + PERF_TYPE_SOFTWARE = linux.PERF_TYPE_SOFTWARE + PERF_COUNT_SW_BPF_OUTPUT = linux.PERF_COUNT_SW_BPF_OUTPUT + PerfBitWatermark = linux.PerfBitWatermark + PERF_SAMPLE_RAW = linux.PERF_SAMPLE_RAW + PERF_FLAG_FD_CLOEXEC = linux.PERF_FLAG_FD_CLOEXEC + RLIM_INFINITY = linux.RLIM_INFINITY +) + +// Statfs_t is a wrapper +type Statfs_t = linux.Statfs_t + +// Rlimit is a wrapper +type Rlimit = linux.Rlimit + +// Setrlimit is a wrapper +func Setrlimit(resource int, rlim *Rlimit) (err error) { + return linux.Setrlimit(resource, rlim) +} + +// Syscall is a wrapper +func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) { + return linux.Syscall(trap, a1, a2, a3) +} + +// FcntlInt is a wrapper +func FcntlInt(fd uintptr, cmd, arg int) (int, error) { + return linux.FcntlInt(fd, cmd, arg) +} + +// Statfs is a wrapper +func Statfs(path string, buf *Statfs_t) (err error) { + return linux.Statfs(path, buf) +} + +// Close is a wrapper +func Close(fd 
int) (err error) { + return linux.Close(fd) +} + +// EpollEvent is a wrapper +type EpollEvent = linux.EpollEvent + +// EpollWait is a wrapper +func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) { + return linux.EpollWait(epfd, events, msec) +} + +// EpollCtl is a wrapper +func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) { + return linux.EpollCtl(epfd, op, fd, event) +} + +// Eventfd is a wrapper +func Eventfd(initval uint, flags int) (fd int, err error) { + return linux.Eventfd(initval, flags) +} + +// Write is a wrapper +func Write(fd int, p []byte) (n int, err error) { + return linux.Write(fd, p) +} + +// EpollCreate1 is a wrapper +func EpollCreate1(flag int) (fd int, err error) { + return linux.EpollCreate1(flag) +} + +// PerfEventMmapPage is a wrapper +type PerfEventMmapPage linux.PerfEventMmapPage + +// SetNonblock is a wrapper +func SetNonblock(fd int, nonblocking bool) (err error) { + return linux.SetNonblock(fd, nonblocking) +} + +// Mmap is a wrapper +func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) { + return linux.Mmap(fd, offset, length, prot, flags) +} + +// Munmap is a wrapper +func Munmap(b []byte) (err error) { + return linux.Munmap(b) +} + +// PerfEventAttr is a wrapper +type PerfEventAttr = linux.PerfEventAttr + +// PerfEventOpen is a wrapper +func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) { + return linux.PerfEventOpen(attr, pid, cpu, groupFd, flags) +} + +// Utsname is a wrapper +type Utsname = linux.Utsname + +// Uname is a wrapper +func Uname(buf *Utsname) (err error) { + return linux.Uname(buf) +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/types_other.go b/vendor/github.com/cilium/ebpf/internal/unix/types_other.go new file mode 100644 index 0000000000..57a514da7a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/types_other.go @@ -0,0 +1,193 @@ +// +build !linux + +package 
unix + +import ( + "fmt" + "runtime" + "syscall" +) + +var errNonLinux = fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH) + +const ( + ENOENT = syscall.ENOENT + EAGAIN = syscall.EAGAIN + ENOSPC = syscall.ENOSPC + EINVAL = syscall.EINVAL + BPF_OBJ_NAME_LEN = 0x10 + BPF_TAG_SIZE = 0x8 + SYS_BPF = 321 + F_DUPFD_CLOEXEC = 0x406 + EPOLLIN = 0x1 + EPOLL_CTL_ADD = 0x1 + EPOLL_CLOEXEC = 0x80000 + O_CLOEXEC = 0x80000 + O_NONBLOCK = 0x800 + PROT_READ = 0x1 + PROT_WRITE = 0x2 + MAP_SHARED = 0x1 + PERF_TYPE_SOFTWARE = 0x1 + PERF_COUNT_SW_BPF_OUTPUT = 0xa + PerfBitWatermark = 0x4000 + PERF_SAMPLE_RAW = 0x400 + PERF_FLAG_FD_CLOEXEC = 0x8 +) + +// Statfs_t is a wrapper +type Statfs_t struct { + Type int64 + Bsize int64 + Blocks uint64 + Bfree uint64 + Bavail uint64 + Files uint64 + Ffree uint64 + Fsid [2]int32 + Namelen int64 + Frsize int64 + Flags int64 + Spare [4]int64 +} + +// Rlimit is a wrapper +type Rlimit struct { + Cur uint64 + Max uint64 +} + +// Setrlimit is a wrapper +func Setrlimit(resource int, rlim *Rlimit) (err error) { + return errNonLinux +} + +// Syscall is a wrapper +func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) { + return 0, 0, syscall.Errno(1) +} + +// FcntlInt is a wrapper +func FcntlInt(fd uintptr, cmd, arg int) (int, error) { + return -1, errNonLinux +} + +// Statfs is a wrapper +func Statfs(path string, buf *Statfs_t) error { + return errNonLinux +} + +// Close is a wrapper +func Close(fd int) (err error) { + return errNonLinux +} + +// EpollEvent is a wrapper +type EpollEvent struct { + Events uint32 + Fd int32 + Pad int32 +} + +// EpollWait is a wrapper +func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) { + return 0, errNonLinux +} + +// EpollCtl is a wrapper +func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) { + return errNonLinux +} + +// Eventfd is a wrapper +func Eventfd(initval uint, flags int) (fd int, err error) { + return 0, errNonLinux +} + +// 
Write is a wrapper +func Write(fd int, p []byte) (n int, err error) { + return 0, errNonLinux +} + +// EpollCreate1 is a wrapper +func EpollCreate1(flag int) (fd int, err error) { + return 0, errNonLinux +} + +// PerfEventMmapPage is a wrapper +type PerfEventMmapPage struct { + Version uint32 + Compat_version uint32 + Lock uint32 + Index uint32 + Offset int64 + Time_enabled uint64 + Time_running uint64 + Capabilities uint64 + Pmc_width uint16 + Time_shift uint16 + Time_mult uint32 + Time_offset uint64 + Time_zero uint64 + Size uint32 + + Data_head uint64 + Data_tail uint64 + Data_offset uint64 + Data_size uint64 + Aux_head uint64 + Aux_tail uint64 + Aux_offset uint64 + Aux_size uint64 +} + +// SetNonblock is a wrapper +func SetNonblock(fd int, nonblocking bool) (err error) { + return errNonLinux +} + +// Mmap is a wrapper +func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) { + return []byte{}, errNonLinux +} + +// Munmap is a wrapper +func Munmap(b []byte) (err error) { + return errNonLinux +} + +// PerfEventAttr is a wrapper +type PerfEventAttr struct { + Type uint32 + Size uint32 + Config uint64 + Sample uint64 + Sample_type uint64 + Read_format uint64 + Bits uint64 + Wakeup uint32 + Bp_type uint32 + Ext1 uint64 + Ext2 uint64 + Branch_sample_type uint64 + Sample_regs_user uint64 + Sample_stack_user uint32 + Clockid int32 + Sample_regs_intr uint64 + Aux_watermark uint32 + Sample_max_stack uint16 +} + +// PerfEventOpen is a wrapper +func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) { + return 0, errNonLinux +} + +// Utsname is a wrapper +type Utsname struct { + Release [65]byte +} + +// Uname is a wrapper +func Uname(buf *Utsname) (err error) { + return errNonLinux +} diff --git a/vendor/github.com/cilium/ebpf/linker.go b/vendor/github.com/cilium/ebpf/linker.go new file mode 100644 index 0000000000..da556c2269 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/linker.go @@ 
-0,0 +1,58 @@ +package ebpf + +import ( + "github.com/cilium/ebpf/asm" +) + +// link resolves bpf-to-bpf calls. +// +// Each section may contain multiple functions / labels, and is only linked +// if the program being edited references one of these functions. +// +// Sections must not require linking themselves. +func link(insns asm.Instructions, sections ...asm.Instructions) (asm.Instructions, error) { + for _, section := range sections { + var err error + insns, err = linkSection(insns, section) + if err != nil { + return nil, err + } + } + return insns, nil +} + +func linkSection(insns, section asm.Instructions) (asm.Instructions, error) { + // A map of symbols to the libraries which contain them. + symbols, err := section.SymbolOffsets() + if err != nil { + return nil, err + } + + for _, ins := range insns { + if ins.Reference == "" { + continue + } + + if ins.OpCode.JumpOp() != asm.Call || ins.Src != asm.R1 { + continue + } + + if ins.Constant != -1 { + // This is already a valid call, no need to link again. + continue + } + + if _, ok := symbols[ins.Reference]; !ok { + // Symbol isn't available in this section + continue + } + + // At this point we know that at least one function in the + // library is called from insns. Merge the two sections. + // The rewrite of ins.Constant happens in asm.Instruction.Marshal. + return append(insns, section...), nil + } + + // None of the functions in the section are called. Do nothing. + return insns, nil +} diff --git a/vendor/github.com/cilium/ebpf/map.go b/vendor/github.com/cilium/ebpf/map.go new file mode 100644 index 0000000000..a8e1eab341 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/map.go @@ -0,0 +1,604 @@ +package ebpf + +import ( + "fmt" + "unsafe" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" + + "github.com/pkg/errors" +) + +// MapSpec defines a Map. +type MapSpec struct { + // Name is passed to the kernel as a debug aid. 
Must only contain + // alpha numeric and '_' characters. + Name string + Type MapType + KeySize uint32 + ValueSize uint32 + MaxEntries uint32 + Flags uint32 + // InnerMap is used as a template for ArrayOfMaps and HashOfMaps + InnerMap *MapSpec +} + +func (ms *MapSpec) String() string { + return fmt.Sprintf("%s(keySize=%d, valueSize=%d, maxEntries=%d, flags=%d)", ms.Type, ms.KeySize, ms.ValueSize, ms.MaxEntries, ms.Flags) +} + +// Copy returns a copy of the spec. +func (ms *MapSpec) Copy() *MapSpec { + if ms == nil { + return nil + } + + cpy := *ms + cpy.InnerMap = ms.InnerMap.Copy() + return &cpy +} + +// Map represents a Map file descriptor. +// +// It is not safe to close a map which is used by other goroutines. +// +// Methods which take interface{} arguments by default encode +// them using binary.Read/Write in the machine's native endianness. +// +// Implement encoding.BinaryMarshaler or encoding.BinaryUnmarshaler +// if you require custom encoding. +type Map struct { + name string + fd *bpfFD + abi MapABI + // Per CPU maps return values larger than the size in the spec + fullValueSize int +} + +// NewMapFromFD creates a map from a raw fd. +// +// You should not use fd after calling this function. +func NewMapFromFD(fd int) (*Map, error) { + if fd < 0 { + return nil, errors.New("invalid fd") + } + bpfFd := newBPFFD(uint32(fd)) + + name, abi, err := newMapABIFromFd(bpfFd) + if err != nil { + bpfFd.forget() + return nil, err + } + return newMap(bpfFd, name, abi) +} + +// NewMap creates a new Map. +// +// Creating a map for the first time will perform feature detection +// by creating small, temporary maps. 
+func NewMap(spec *MapSpec) (*Map, error) { + if spec.Type != ArrayOfMaps && spec.Type != HashOfMaps { + return createMap(spec, nil) + } + + if spec.InnerMap == nil { + return nil, errors.Errorf("%s requires InnerMap", spec.Type) + } + + template, err := createMap(spec.InnerMap, nil) + if err != nil { + return nil, err + } + defer template.Close() + + return createMap(spec, template.fd) +} + +func createMap(spec *MapSpec, inner *bpfFD) (*Map, error) { + spec = spec.Copy() + + switch spec.Type { + case ArrayOfMaps: + fallthrough + case HashOfMaps: + if err := haveNestedMaps(); err != nil { + return nil, err + } + + if spec.ValueSize != 0 && spec.ValueSize != 4 { + return nil, errors.Errorf("ValueSize must be zero or four for map of map") + } + spec.ValueSize = 4 + + case PerfEventArray: + if spec.KeySize != 0 { + return nil, errors.Errorf("KeySize must be zero for perf event array") + } + if spec.ValueSize != 0 { + return nil, errors.Errorf("ValueSize must be zero for perf event array") + } + if spec.MaxEntries == 0 { + n, err := internal.OnlineCPUs() + if err != nil { + return nil, errors.Wrap(err, "perf event array") + } + spec.MaxEntries = uint32(n) + } + + spec.KeySize = 4 + spec.ValueSize = 4 + } + + attr := bpfMapCreateAttr{ + mapType: spec.Type, + keySize: spec.KeySize, + valueSize: spec.ValueSize, + maxEntries: spec.MaxEntries, + flags: spec.Flags, + } + + if inner != nil { + var err error + attr.innerMapFd, err = inner.value() + if err != nil { + return nil, errors.Wrap(err, "map create") + } + } + + name, err := newBPFObjName(spec.Name) + if err != nil { + return nil, errors.Wrap(err, "map create") + } + + if haveObjName() == nil { + attr.mapName = name + } + + fd, err := bpfMapCreate(&attr) + if err != nil { + return nil, errors.Wrap(err, "map create") + } + + return newMap(fd, spec.Name, newMapABIFromSpec(spec)) +} + +func newMap(fd *bpfFD, name string, abi *MapABI) (*Map, error) { + m := &Map{ + name, + fd, + *abi, + int(abi.ValueSize), + } + + if 
!abi.Type.hasPerCPUValue() { + return m, nil + } + + possibleCPUs, err := internal.PossibleCPUs() + if err != nil { + return nil, err + } + + m.fullValueSize = align(int(abi.ValueSize), 8) * possibleCPUs + return m, nil +} + +func (m *Map) String() string { + if m.name != "" { + return fmt.Sprintf("%s(%s)#%v", m.abi.Type, m.name, m.fd) + } + return fmt.Sprintf("%s#%v", m.abi.Type, m.fd) +} + +// ABI gets the ABI of the Map +func (m *Map) ABI() MapABI { + return m.abi +} + +// Lookup retrieves a value from a Map. +// +// Calls Close() on valueOut if it is of type **Map or **Program, +// and *valueOut is not nil. +// +// Returns an error if the key doesn't exist, see IsNotExist. +func (m *Map) Lookup(key, valueOut interface{}) error { + valuePtr, valueBytes := makeBuffer(valueOut, m.fullValueSize) + + if err := m.lookup(key, valuePtr); err != nil { + return err + } + + if valueBytes == nil { + return nil + } + + if m.abi.Type.hasPerCPUValue() { + return unmarshalPerCPUValue(valueOut, int(m.abi.ValueSize), valueBytes) + } + + switch value := valueOut.(type) { + case **Map: + m, err := unmarshalMap(valueBytes) + if err != nil { + return err + } + + (*value).Close() + *value = m + return nil + case *Map: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil)) + case Map: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil)) + + case **Program: + p, err := unmarshalProgram(valueBytes) + if err != nil { + return err + } + + (*value).Close() + *value = p + return nil + case *Program: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil)) + case Program: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil)) + + default: + return unmarshalBytes(valueOut, valueBytes) + } +} + +// LookupBytes gets a value from Map. +// +// Returns a nil value if a key doesn't exist. 
+func (m *Map) LookupBytes(key interface{}) ([]byte, error) { + valueBytes := make([]byte, m.fullValueSize) + valuePtr := newPtr(unsafe.Pointer(&valueBytes[0])) + + err := m.lookup(key, valuePtr) + if IsNotExist(err) { + return nil, nil + } + + return valueBytes, err +} + +func (m *Map) lookup(key interface{}, valueOut syscallPtr) error { + keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + + err = bpfMapLookupElem(m.fd, keyPtr, valueOut) + return errors.WithMessage(err, "lookup failed") +} + +// MapUpdateFlags controls the behaviour of the Map.Update call. +// +// The exact semantics depend on the specific MapType. +type MapUpdateFlags uint64 + +const ( + // UpdateAny creates a new element or update an existing one. + UpdateAny MapUpdateFlags = iota + // UpdateNoExist creates a new element. + UpdateNoExist MapUpdateFlags = 1 << (iota - 1) + // UpdateExist updates an existing element. + UpdateExist +) + +// Put replaces or creates a value in map. +// +// It is equivalent to calling Update with UpdateAny. +func (m *Map) Put(key, value interface{}) error { + return m.Update(key, value, UpdateAny) +} + +// Update changes the value of a key. +func (m *Map) Update(key, value interface{}, flags MapUpdateFlags) error { + keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + + var valuePtr syscallPtr + if m.abi.Type.hasPerCPUValue() { + valuePtr, err = marshalPerCPUValue(value, int(m.abi.ValueSize)) + } else { + valuePtr, err = marshalPtr(value, int(m.abi.ValueSize)) + } + if err != nil { + return errors.WithMessage(err, "can't marshal value") + } + + return bpfMapUpdateElem(m.fd, keyPtr, valuePtr, uint64(flags)) +} + +// Delete removes a value. +// +// Returns an error if the key does not exist, see IsNotExist. 
+func (m *Map) Delete(key interface{}) error { + keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + + err = bpfMapDeleteElem(m.fd, keyPtr) + return errors.WithMessage(err, "can't delete key") +} + +// NextKey finds the key following an initial key. +// +// See NextKeyBytes for details. +func (m *Map) NextKey(key, nextKeyOut interface{}) error { + nextKeyPtr, nextKeyBytes := makeBuffer(nextKeyOut, int(m.abi.KeySize)) + + if err := m.nextKey(key, nextKeyPtr); err != nil { + return err + } + + if nextKeyBytes == nil { + return nil + } + + err := unmarshalBytes(nextKeyOut, nextKeyBytes) + return errors.WithMessage(err, "can't unmarshal next key") +} + +// NextKeyBytes returns the key following an initial key as a byte slice. +// +// Passing nil will return the first key. +// +// Use Iterate if you want to traverse all entries in the map. +func (m *Map) NextKeyBytes(key interface{}) ([]byte, error) { + nextKey := make([]byte, m.abi.KeySize) + nextKeyPtr := newPtr(unsafe.Pointer(&nextKey[0])) + + err := m.nextKey(key, nextKeyPtr) + if IsNotExist(err) { + return nil, nil + } + + return nextKey, err +} + +func (m *Map) nextKey(key interface{}, nextKeyOut syscallPtr) error { + var ( + keyPtr syscallPtr + err error + ) + + if key != nil { + keyPtr, err = marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + } + + err = bpfMapGetNextKey(m.fd, keyPtr, nextKeyOut) + return errors.WithMessage(err, "can't get next key") +} + +// Iterate traverses a map. +// +// It's safe to create multiple iterators at the same time. +// +// It's not possible to guarantee that all keys in a map will be +// returned if there are concurrent modifications to the map. 
+func (m *Map) Iterate() *MapIterator { + return newMapIterator(m) +} + +// Close removes a Map +func (m *Map) Close() error { + if m == nil { + // This makes it easier to clean up when iterating maps + // of maps / programs. + return nil + } + + return m.fd.close() +} + +// FD gets the file descriptor of the Map. +// +// Calling this function is invalid after Close has been called. +func (m *Map) FD() int { + fd, err := m.fd.value() + if err != nil { + // Best effort: -1 is the number most likely to be an + // invalid file descriptor. + return -1 + } + + return int(fd) +} + +// Clone creates a duplicate of the Map. +// +// Closing the duplicate does not affect the original, and vice versa. +// Changes made to the map are reflected by both instances however. +// +// Cloning a nil Map returns nil. +func (m *Map) Clone() (*Map, error) { + if m == nil { + return nil, nil + } + + dup, err := m.fd.dup() + if err != nil { + return nil, errors.Wrap(err, "can't clone map") + } + + return newMap(dup, m.name, &m.abi) +} + +// Pin persists the map past the lifetime of the process that created it. +// +// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional +func (m *Map) Pin(fileName string) error { + return bpfPinObject(fileName, m.fd) +} + +// LoadPinnedMap load a Map from a BPF file. +// +// The function is not compatible with nested maps. +// Use LoadPinnedMapExplicit in these situations. +func LoadPinnedMap(fileName string) (*Map, error) { + fd, err := bpfGetObject(fileName) + if err != nil { + return nil, err + } + name, abi, err := newMapABIFromFd(fd) + if err != nil { + _ = fd.close() + return nil, err + } + return newMap(fd, name, abi) +} + +// LoadPinnedMapExplicit loads a map with explicit parameters. 
+func LoadPinnedMapExplicit(fileName string, abi *MapABI) (*Map, error) { + fd, err := bpfGetObject(fileName) + if err != nil { + return nil, err + } + return newMap(fd, "", abi) +} + +func unmarshalMap(buf []byte) (*Map, error) { + if len(buf) != 4 { + return nil, errors.New("map id requires 4 byte value") + } + + // Looking up an entry in a nested map or prog array returns an id, + // not an fd. + id := internal.NativeEndian.Uint32(buf) + fd, err := bpfGetMapFDByID(id) + if err != nil { + return nil, err + } + + name, abi, err := newMapABIFromFd(fd) + if err != nil { + _ = fd.close() + return nil, err + } + + return newMap(fd, name, abi) +} + +// MarshalBinary implements BinaryMarshaler. +func (m *Map) MarshalBinary() ([]byte, error) { + fd, err := m.fd.value() + if err != nil { + return nil, err + } + + buf := make([]byte, 4) + internal.NativeEndian.PutUint32(buf, fd) + return buf, nil +} + +// MapIterator iterates a Map. +// +// See Map.Iterate. +type MapIterator struct { + target *Map + prevKey interface{} + prevBytes []byte + count, maxEntries uint32 + done bool + err error +} + +func newMapIterator(target *Map) *MapIterator { + return &MapIterator{ + target: target, + maxEntries: target.abi.MaxEntries, + prevBytes: make([]byte, int(target.abi.KeySize)), + } +} + +var errIterationAborted = errors.New("iteration aborted") + +// Next decodes the next key and value. +// +// Iterating a hash map from which keys are being deleted is not +// safe. You may see the same key multiple times. Iteration may +// also abort with an error, see IsIterationAborted. +// +// Returns false if there are no more entries. You must check +// the result of Err afterwards. +// +// See Map.Get for further caveats around valueOut. 
+func (mi *MapIterator) Next(keyOut, valueOut interface{}) bool { + if mi.err != nil || mi.done { + return false + } + + for ; mi.count < mi.maxEntries; mi.count++ { + var nextBytes []byte + nextBytes, mi.err = mi.target.NextKeyBytes(mi.prevKey) + if mi.err != nil { + return false + } + + if nextBytes == nil { + mi.done = true + return false + } + + // The user can get access to nextBytes since unmarshalBytes + // does not copy when unmarshaling into a []byte. + // Make a copy to prevent accidental corruption of + // iterator state. + copy(mi.prevBytes, nextBytes) + mi.prevKey = mi.prevBytes + + mi.err = mi.target.Lookup(nextBytes, valueOut) + if IsNotExist(mi.err) { + // Even though the key should be valid, we couldn't look up + // its value. If we're iterating a hash map this is probably + // because a concurrent delete removed the value before we + // could get it. This means that the next call to NextKeyBytes + // is very likely to restart iteration. + // If we're iterating one of the fd maps like + // ProgramArray it means that a given slot doesn't have + // a valid fd associated. It's OK to continue to the next slot. + continue + } + if mi.err != nil { + return false + } + + mi.err = unmarshalBytes(keyOut, nextBytes) + return mi.err == nil + } + + mi.err = errIterationAborted + return false +} + +// Err returns any encountered error. +// +// The method must be called after Next returns nil. +func (mi *MapIterator) Err() error { + return mi.err +} + +// IsNotExist returns true if the error indicates that a +// key doesn't exist. +func IsNotExist(err error) bool { + return errors.Cause(err) == unix.ENOENT +} + +// IsIterationAborted returns true if the iteration was aborted. +// +// This occurs when keys are deleted from a hash map during iteration. 
+func IsIterationAborted(err error) bool { + return errors.Cause(err) == errIterationAborted +} diff --git a/vendor/github.com/cilium/ebpf/marshalers.go b/vendor/github.com/cilium/ebpf/marshalers.go new file mode 100644 index 0000000000..44ba273305 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/marshalers.go @@ -0,0 +1,192 @@ +package ebpf + +import ( + "bytes" + "encoding" + "encoding/binary" + "reflect" + "runtime" + "unsafe" + + "github.com/cilium/ebpf/internal" + + "github.com/pkg/errors" +) + +func marshalPtr(data interface{}, length int) (syscallPtr, error) { + if ptr, ok := data.(unsafe.Pointer); ok { + return newPtr(ptr), nil + } + + buf, err := marshalBytes(data, length) + if err != nil { + return syscallPtr{}, err + } + + return newPtr(unsafe.Pointer(&buf[0])), nil +} + +func marshalBytes(data interface{}, length int) (buf []byte, err error) { + switch value := data.(type) { + case encoding.BinaryMarshaler: + buf, err = value.MarshalBinary() + case string: + buf = []byte(value) + case []byte: + buf = value + case unsafe.Pointer: + err = errors.New("can't marshal from unsafe.Pointer") + default: + var wr bytes.Buffer + err = binary.Write(&wr, internal.NativeEndian, value) + err = errors.Wrapf(err, "encoding %T", value) + buf = wr.Bytes() + } + if err != nil { + return nil, err + } + + if len(buf) != length { + return nil, errors.Errorf("%T doesn't marshal to %d bytes", data, length) + } + return buf, nil +} + +func makeBuffer(dst interface{}, length int) (syscallPtr, []byte) { + if ptr, ok := dst.(unsafe.Pointer); ok { + return newPtr(ptr), nil + } + + buf := make([]byte, length) + return newPtr(unsafe.Pointer(&buf[0])), buf +} + +func unmarshalBytes(data interface{}, buf []byte) error { + switch value := data.(type) { + case unsafe.Pointer: + sh := &reflect.SliceHeader{ + Data: uintptr(value), + Len: len(buf), + Cap: len(buf), + } + + dst := *(*[]byte)(unsafe.Pointer(sh)) + copy(dst, buf) + runtime.KeepAlive(value) + return nil + case 
encoding.BinaryUnmarshaler: + return value.UnmarshalBinary(buf) + case *string: + *value = string(buf) + return nil + case *[]byte: + *value = buf + return nil + case string: + return errors.New("require pointer to string") + case []byte: + return errors.New("require pointer to []byte") + default: + rd := bytes.NewReader(buf) + err := binary.Read(rd, internal.NativeEndian, value) + return errors.Wrapf(err, "decoding %T", value) + } +} + +// marshalPerCPUValue encodes a slice containing one value per +// possible CPU into a buffer of bytes. +// +// Values are initialized to zero if the slice has less elements than CPUs. +// +// slice must have a type like []elementType. +func marshalPerCPUValue(slice interface{}, elemLength int) (syscallPtr, error) { + sliceType := reflect.TypeOf(slice) + if sliceType.Kind() != reflect.Slice { + return syscallPtr{}, errors.New("per-CPU value requires slice") + } + + possibleCPUs, err := internal.PossibleCPUs() + if err != nil { + return syscallPtr{}, err + } + + sliceValue := reflect.ValueOf(slice) + sliceLen := sliceValue.Len() + if sliceLen > possibleCPUs { + return syscallPtr{}, errors.Errorf("per-CPU value exceeds number of CPUs") + } + + alignedElemLength := align(elemLength, 8) + buf := make([]byte, alignedElemLength*possibleCPUs) + + for i := 0; i < sliceLen; i++ { + elem := sliceValue.Index(i).Interface() + elemBytes, err := marshalBytes(elem, elemLength) + if err != nil { + return syscallPtr{}, err + } + + offset := i * alignedElemLength + copy(buf[offset:offset+elemLength], elemBytes) + } + + return newPtr(unsafe.Pointer(&buf[0])), nil +} + +// unmarshalPerCPUValue decodes a buffer into a slice containing one value per +// possible CPU. 
+// +// valueOut must have a type like *[]elementType +func unmarshalPerCPUValue(slicePtr interface{}, elemLength int, buf []byte) error { + slicePtrType := reflect.TypeOf(slicePtr) + if slicePtrType.Kind() != reflect.Ptr || slicePtrType.Elem().Kind() != reflect.Slice { + return errors.Errorf("per-cpu value requires pointer to slice") + } + + possibleCPUs, err := internal.PossibleCPUs() + if err != nil { + return err + } + + sliceType := slicePtrType.Elem() + slice := reflect.MakeSlice(sliceType, possibleCPUs, possibleCPUs) + + sliceElemType := sliceType.Elem() + sliceElemIsPointer := sliceElemType.Kind() == reflect.Ptr + if sliceElemIsPointer { + sliceElemType = sliceElemType.Elem() + } + + step := len(buf) / possibleCPUs + if step < elemLength { + return errors.Errorf("per-cpu element length is larger than available data") + } + for i := 0; i < possibleCPUs; i++ { + var elem interface{} + if sliceElemIsPointer { + newElem := reflect.New(sliceElemType) + slice.Index(i).Set(newElem) + elem = newElem.Interface() + } else { + elem = slice.Index(i).Addr().Interface() + } + + // Make a copy, since unmarshal can hold on to itemBytes + elemBytes := make([]byte, elemLength) + copy(elemBytes, buf[:elemLength]) + + err := unmarshalBytes(elem, elemBytes) + if err != nil { + return errors.Wrapf(err, "cpu %d", i) + } + + buf = buf[step:] + } + + reflect.ValueOf(slicePtr).Elem().Set(slice) + return nil +} + +func align(n, alignment int) int { + return (int(n) + alignment - 1) / alignment * alignment +} diff --git a/vendor/github.com/cilium/ebpf/prog.go b/vendor/github.com/cilium/ebpf/prog.go new file mode 100644 index 0000000000..c69cc2b05a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/prog.go @@ -0,0 +1,504 @@ +package ebpf + +import ( + "bytes" + "fmt" + "math" + "strings" + "time" + "unsafe" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" + + "github.com/pkg/errors" +) + +const ( + // Number of bytes to pad 
the output buffer for BPF_PROG_TEST_RUN. + // This is currently the maximum of spare space allocated for SKB + // and XDP programs, and equal to XDP_PACKET_HEADROOM + NET_IP_ALIGN. + outputPad = 256 + 2 +) + +// DefaultVerifierLogSize is the default number of bytes allocated for the +// verifier log. +const DefaultVerifierLogSize = 64 * 1024 + +// ProgramOptions control loading a program into the kernel. +type ProgramOptions struct { + // Controls the detail emitted by the kernel verifier. Set to non-zero + // to enable logging. + LogLevel uint32 + // Controls the output buffer size for the verifier. Defaults to + // DefaultVerifierLogSize. + LogSize int +} + +// ProgramSpec defines a Program +type ProgramSpec struct { + // Name is passed to the kernel as a debug aid. Must only contain + // alpha numeric and '_' characters. + Name string + Type ProgramType + AttachType AttachType + Instructions asm.Instructions + License string + KernelVersion uint32 +} + +// Copy returns a copy of the spec. +func (ps *ProgramSpec) Copy() *ProgramSpec { + if ps == nil { + return nil + } + + cpy := *ps + cpy.Instructions = make(asm.Instructions, len(ps.Instructions)) + copy(cpy.Instructions, ps.Instructions) + return &cpy +} + +// Program represents BPF program loaded into the kernel. +// +// It is not safe to close a Program which is used by other goroutines. +type Program struct { + // Contains the output of the kernel verifier if enabled, + // otherwise it is empty. + VerifierLog string + + fd *bpfFD + name string + abi ProgramABI +} + +// NewProgram creates a new Program. +// +// Loading a program for the first time will perform +// feature detection by loading small, temporary programs. +func NewProgram(spec *ProgramSpec) (*Program, error) { + return NewProgramWithOptions(spec, ProgramOptions{}) +} + +// NewProgramWithOptions creates a new Program. +// +// Loading a program for the first time will perform +// feature detection by loading small, temporary programs. 
+func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) { + attr, err := convertProgramSpec(spec) + if err != nil { + return nil, err + } + + logSize := DefaultVerifierLogSize + if opts.LogSize > 0 { + logSize = opts.LogSize + } + + var logBuf []byte + if opts.LogLevel > 0 { + logBuf = make([]byte, logSize) + attr.logLevel = opts.LogLevel + attr.logSize = uint32(len(logBuf)) + attr.logBuf = newPtr(unsafe.Pointer(&logBuf[0])) + } + + fd, err := bpfProgLoad(attr) + if err == nil { + prog := newProgram(fd, spec.Name, &ProgramABI{spec.Type}) + prog.VerifierLog = convertCString(logBuf) + return prog, nil + } + + truncated := errors.Cause(err) == unix.ENOSPC + if opts.LogLevel == 0 { + // Re-run with the verifier enabled to get better error messages. + logBuf = make([]byte, logSize) + attr.logLevel = 1 + attr.logSize = uint32(len(logBuf)) + attr.logBuf = newPtr(unsafe.Pointer(&logBuf[0])) + + _, nerr := bpfProgLoad(attr) + truncated = errors.Cause(nerr) == unix.ENOSPC + } + + logs := convertCString(logBuf) + if truncated { + logs += "\n(truncated...)" + } + + return nil, &loadError{err, logs} +} + +// NewProgramFromFD creates a program from a raw fd. +// +// You should not use fd after calling this function. +// +// Requires at least Linux 4.11. 
+func NewProgramFromFD(fd int) (*Program, error) { + if fd < 0 { + return nil, errors.New("invalid fd") + } + bpfFd := newBPFFD(uint32(fd)) + + name, abi, err := newProgramABIFromFd(bpfFd) + if err != nil { + bpfFd.forget() + return nil, err + } + + return newProgram(bpfFd, name, abi), nil +} + +func newProgram(fd *bpfFD, name string, abi *ProgramABI) *Program { + return &Program{ + name: name, + fd: fd, + abi: *abi, + } +} + +func convertProgramSpec(spec *ProgramSpec) (*bpfProgLoadAttr, error) { + if len(spec.Instructions) == 0 { + return nil, errors.New("Instructions cannot be empty") + } + + if len(spec.License) == 0 { + return nil, errors.New("License cannot be empty") + } + + buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize)) + err := spec.Instructions.Marshal(buf, internal.NativeEndian) + if err != nil { + return nil, err + } + + bytecode := buf.Bytes() + insCount := uint32(len(bytecode) / asm.InstructionSize) + lic := []byte(spec.License) + attr := &bpfProgLoadAttr{ + progType: spec.Type, + expectedAttachType: spec.AttachType, + insCount: insCount, + instructions: newPtr(unsafe.Pointer(&bytecode[0])), + license: newPtr(unsafe.Pointer(&lic[0])), + } + + name, err := newBPFObjName(spec.Name) + if err != nil { + return nil, err + } + + if haveObjName() == nil { + attr.progName = name + } + + return attr, nil +} + +func (p *Program) String() string { + if p.name != "" { + return fmt.Sprintf("%s(%s)#%v", p.abi.Type, p.name, p.fd) + } + return fmt.Sprintf("%s#%v", p.abi.Type, p.fd) +} + +// ABI gets the ABI of the Program +func (p *Program) ABI() ProgramABI { + return p.abi +} + +// FD gets the file descriptor of the Program. +// +// It is invalid to call this function after Close has been called. +func (p *Program) FD() int { + fd, err := p.fd.value() + if err != nil { + // Best effort: -1 is the number most likely to be an + // invalid file descriptor. 
+ return -1 + } + + return int(fd) +} + +// Clone creates a duplicate of the Program. +// +// Closing the duplicate does not affect the original, and vice versa. +// +// Cloning a nil Program returns nil. +func (p *Program) Clone() (*Program, error) { + if p == nil { + return nil, nil + } + + dup, err := p.fd.dup() + if err != nil { + return nil, errors.Wrap(err, "can't clone program") + } + + return newProgram(dup, p.name, &p.abi), nil +} + +// Pin persists the Program past the lifetime of the process that created it +// +// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional +func (p *Program) Pin(fileName string) error { + return errors.Wrap(bpfPinObject(fileName, p.fd), "can't pin program") +} + +// Close unloads the program from the kernel. +func (p *Program) Close() error { + if p == nil { + return nil + } + + return p.fd.close() +} + +// Test runs the Program in the kernel with the given input and returns the +// value returned by the eBPF program. outLen may be zero. +// +// Note: the kernel expects at least 14 bytes input for an ethernet header for +// XDP and SKB programs. +// +// This function requires at least Linux 4.12. +func (p *Program) Test(in []byte) (uint32, []byte, error) { + ret, out, _, err := p.testRun(in, 1) + return ret, out, errors.Wrap(err, "can't test program") +} + +// Benchmark runs the Program with the given input for a number of times +// and returns the time taken per iteration. +// +// The returned value is the return value of the last execution of +// the program. +// +// This function requires at least Linux 4.12. 
+func (p *Program) Benchmark(in []byte, repeat int) (uint32, time.Duration, error) { + ret, _, total, err := p.testRun(in, repeat) + return ret, total, errors.Wrap(err, "can't benchmark program") +} + +var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() bool { + prog, err := NewProgram(&ProgramSpec{ + Type: SocketFilter, + Instructions: asm.Instructions{ + asm.LoadImm(asm.R0, 0, asm.DWord), + asm.Return(), + }, + License: "MIT", + }) + if err != nil { + // This may be because we lack sufficient permissions, etc. + return false + } + defer prog.Close() + + fd, err := prog.fd.value() + if err != nil { + return false + } + + // Programs require at least 14 bytes input + in := make([]byte, 14) + attr := bpfProgTestRunAttr{ + fd: fd, + dataSizeIn: uint32(len(in)), + dataIn: newPtr(unsafe.Pointer(&in[0])), + } + + _, err = bpfCall(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + + // Check for EINVAL specifically, rather than err != nil since we + // otherwise misdetect due to insufficient permissions. + return errors.Cause(err) != unix.EINVAL +}) + +func (p *Program) testRun(in []byte, repeat int) (uint32, []byte, time.Duration, error) { + if uint(repeat) > math.MaxUint32 { + return 0, nil, 0, fmt.Errorf("repeat is too high") + } + + if len(in) == 0 { + return 0, nil, 0, fmt.Errorf("missing input") + } + + if uint(len(in)) > math.MaxUint32 { + return 0, nil, 0, fmt.Errorf("input is too long") + } + + if err := haveProgTestRun(); err != nil { + return 0, nil, 0, err + } + + // Older kernels ignore the dataSizeOut argument when copying to user space. + // Combined with things like bpf_xdp_adjust_head() we don't really know what the final + // size will be. Hence we allocate an output buffer which we hope will always be large + // enough, and panic if the kernel wrote past the end of the allocation. 
+ // See https://patchwork.ozlabs.org/cover/1006822/ + out := make([]byte, len(in)+outputPad) + + fd, err := p.fd.value() + if err != nil { + return 0, nil, 0, err + } + + attr := bpfProgTestRunAttr{ + fd: fd, + dataSizeIn: uint32(len(in)), + dataSizeOut: uint32(len(out)), + dataIn: newPtr(unsafe.Pointer(&in[0])), + dataOut: newPtr(unsafe.Pointer(&out[0])), + repeat: uint32(repeat), + } + + _, err = bpfCall(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return 0, nil, 0, errors.Wrap(err, "can't run test") + } + + if int(attr.dataSizeOut) > cap(out) { + // Houston, we have a problem. The program created more data than we allocated, + // and the kernel wrote past the end of our buffer. + panic("kernel wrote past end of output buffer") + } + out = out[:int(attr.dataSizeOut)] + + total := time.Duration(attr.duration) * time.Nanosecond + return attr.retval, out, total, nil +} + +func unmarshalProgram(buf []byte) (*Program, error) { + if len(buf) != 4 { + return nil, errors.New("program id requires 4 byte value") + } + + // Looking up an entry in a nested map or prog array returns an id, + // not an fd. + id := internal.NativeEndian.Uint32(buf) + fd, err := bpfGetProgramFDByID(id) + if err != nil { + return nil, err + } + + name, abi, err := newProgramABIFromFd(fd) + if err != nil { + _ = fd.close() + return nil, err + } + + return newProgram(fd, name, abi), nil +} + +// MarshalBinary implements BinaryMarshaler. 
+func (p *Program) MarshalBinary() ([]byte, error) { + value, err := p.fd.value() + if err != nil { + return nil, err + } + + buf := make([]byte, 4) + internal.NativeEndian.PutUint32(buf, value) + return buf, nil +} + +// Attach a Program to a container object fd +func (p *Program) Attach(fd int, typ AttachType, flags AttachFlags) error { + if fd < 0 { + return errors.New("invalid fd") + } + + pfd, err := p.fd.value() + if err != nil { + return err + } + + attr := bpfProgAlterAttr{ + targetFd: uint32(fd), + attachBpfFd: pfd, + attachType: uint32(typ), + attachFlags: uint32(flags), + } + + return bpfProgAlter(_ProgAttach, &attr) +} + +// Detach a Program from a container object fd +func (p *Program) Detach(fd int, typ AttachType, flags AttachFlags) error { + if fd < 0 { + return errors.New("invalid fd") + } + + pfd, err := p.fd.value() + if err != nil { + return err + } + + attr := bpfProgAlterAttr{ + targetFd: uint32(fd), + attachBpfFd: pfd, + attachType: uint32(typ), + attachFlags: uint32(flags), + } + + return bpfProgAlter(_ProgDetach, &attr) +} + +// LoadPinnedProgram loads a Program from a BPF file. +// +// Requires at least Linux 4.11. +func LoadPinnedProgram(fileName string) (*Program, error) { + fd, err := bpfGetObject(fileName) + if err != nil { + return nil, err + } + + name, abi, err := newProgramABIFromFd(fd) + if err != nil { + _ = fd.close() + return nil, errors.Wrapf(err, "can't get ABI for %s", fileName) + } + + return newProgram(fd, name, abi), nil +} + +// SanitizeName replaces all invalid characters in name. +// +// Use this to automatically generate valid names for maps and +// programs at run time. +// +// Passing a negative value for replacement will delete characters +// instead of replacing them. 
+func SanitizeName(name string, replacement rune) string { + return strings.Map(func(char rune) rune { + if invalidBPFObjNameChar(char) { + return replacement + } + return char + }, name) +} + +type loadError struct { + cause error + verifierLog string +} + +func (le *loadError) Error() string { + if le.verifierLog == "" { + return fmt.Sprintf("failed to load program: %s", le.cause) + } + return fmt.Sprintf("failed to load program: %s: %s", le.cause, le.verifierLog) +} + +func (le *loadError) Cause() error { + return le.cause +} + +// IsNotSupported returns true if an error occurred because +// the kernel does not have support for a specific feature. +func IsNotSupported(err error) bool { + _, notSupported := errors.Cause(err).(*internal.UnsupportedFeatureError) + return notSupported +} diff --git a/vendor/github.com/cilium/ebpf/ptr_32_be.go b/vendor/github.com/cilium/ebpf/ptr_32_be.go new file mode 100644 index 0000000000..775774436a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ptr_32_be.go @@ -0,0 +1,14 @@ +// +build armbe mips mips64p32 + +package ebpf + +import ( + "unsafe" +) + +// ptr wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. +type syscallPtr struct { + pad uint32 + ptr unsafe.Pointer +} diff --git a/vendor/github.com/cilium/ebpf/ptr_32_le.go b/vendor/github.com/cilium/ebpf/ptr_32_le.go new file mode 100644 index 0000000000..14b805e92c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ptr_32_le.go @@ -0,0 +1,14 @@ +// +build 386 amd64p32 arm mipsle mips64p32le + +package ebpf + +import ( + "unsafe" +) + +// ptr wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. 
+type syscallPtr struct { + ptr unsafe.Pointer + pad uint32 +} diff --git a/vendor/github.com/cilium/ebpf/ptr_64.go b/vendor/github.com/cilium/ebpf/ptr_64.go new file mode 100644 index 0000000000..c897d7273b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ptr_64.go @@ -0,0 +1,14 @@ +// +build !386,!amd64p32,!arm,!mipsle,!mips64p32le +// +build !armbe,!mips,!mips64p32 + +package ebpf + +import ( + "unsafe" +) + +// ptr wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. +type syscallPtr struct { + ptr unsafe.Pointer +} diff --git a/vendor/github.com/cilium/ebpf/readme.md b/vendor/github.com/cilium/ebpf/readme.md new file mode 100644 index 0000000000..26ab2b9045 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/readme.md @@ -0,0 +1,20 @@ +eBPF +------- +[![](https://godoc.org/github.com/cilium/ebpf?status.svg)](https://godoc.org/github.com/cilium/ebpf) + +eBPF is a pure Go library that provides utilities for loading, compiling, and debugging eBPF programs. It has minimal external dependencies and is intended to be used in long running processes. + +[ebpf/asm](https://godoc.org/github.com/cilium/ebpf/asm) contains a basic assembler. + +The library is maintained by [Cloudflare](https://www.cloudflare.com) and [Cilium](https://www.cilium.io). Feel free to [join](https://cilium.herokuapp.com/) the [libbpf-go](https://cilium.slack.com/messages/libbpf-go) channel on Slack. + +## Current status + +The package is production ready, but **the API is explicitly unstable +right now**. Expect to update your code if you want to follow along. 
+ +## Useful resources + +* [Cilium eBPF documentation](https://cilium.readthedocs.io/en/latest/bpf/#bpf-guide) (recommended) +* [Linux documentation on BPF](http://elixir.free-electrons.com/linux/latest/source/Documentation/networking/filter.txt) +* [eBPF features by Linux version](https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md) diff --git a/vendor/github.com/cilium/ebpf/syscalls.go b/vendor/github.com/cilium/ebpf/syscalls.go new file mode 100644 index 0000000000..d00a03fb21 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/syscalls.go @@ -0,0 +1,447 @@ +package ebpf + +import ( + "bytes" + "path/filepath" + "runtime" + "strconv" + "strings" + "unsafe" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" + + "github.com/pkg/errors" +) + +var errClosedFd = errors.New("use of closed file descriptor") + +type bpfFD struct { + raw int64 +} + +func newBPFFD(value uint32) *bpfFD { + fd := &bpfFD{int64(value)} + runtime.SetFinalizer(fd, (*bpfFD).close) + return fd +} + +func (fd *bpfFD) String() string { + return strconv.FormatInt(fd.raw, 10) +} + +func (fd *bpfFD) value() (uint32, error) { + if fd.raw < 0 { + return 0, errClosedFd + } + + return uint32(fd.raw), nil +} + +func (fd *bpfFD) close() error { + if fd.raw < 0 { + return nil + } + + value := int(fd.raw) + fd.raw = -1 + + fd.forget() + return unix.Close(value) +} + +func (fd *bpfFD) forget() { + runtime.SetFinalizer(fd, nil) +} + +func (fd *bpfFD) dup() (*bpfFD, error) { + if fd.raw < 0 { + return nil, errClosedFd + } + + dup, err := unix.FcntlInt(uintptr(fd.raw), unix.F_DUPFD_CLOEXEC, 0) + if err != nil { + return nil, errors.Wrap(err, "can't dup fd") + } + + return newBPFFD(uint32(dup)), nil +} + +// bpfObjName is a null-terminated string made up of +// 'A-Za-z0-9_' characters. +type bpfObjName [unix.BPF_OBJ_NAME_LEN]byte + +// newBPFObjName truncates the result if it is too long. 
+func newBPFObjName(name string) (bpfObjName, error) { + idx := strings.IndexFunc(name, invalidBPFObjNameChar) + if idx != -1 { + return bpfObjName{}, errors.Errorf("invalid character '%c' in name '%s'", name[idx], name) + } + + var result bpfObjName + copy(result[:unix.BPF_OBJ_NAME_LEN-1], name) + return result, nil +} + +func invalidBPFObjNameChar(char rune) bool { + switch { + case char >= 'A' && char <= 'Z': + fallthrough + case char >= 'a' && char <= 'z': + fallthrough + case char >= '0' && char <= '9': + fallthrough + case char == '_': + return false + default: + return true + } +} + +type bpfMapCreateAttr struct { + mapType MapType + keySize uint32 + valueSize uint32 + maxEntries uint32 + flags uint32 + innerMapFd uint32 // since 4.12 56f668dfe00d + numaNode uint32 // since 4.14 96eabe7a40aa + mapName bpfObjName // since 4.15 ad5b177bd73f +} + +type bpfMapOpAttr struct { + mapFd uint32 + padding uint32 + key syscallPtr + value syscallPtr + flags uint64 +} + +type bpfMapInfo struct { + mapType uint32 + id uint32 + keySize uint32 + valueSize uint32 + maxEntries uint32 + flags uint32 + mapName bpfObjName // since 4.15 ad5b177bd73f +} + +type bpfPinObjAttr struct { + fileName syscallPtr + fd uint32 + padding uint32 +} + +type bpfProgLoadAttr struct { + progType ProgramType + insCount uint32 + instructions syscallPtr + license syscallPtr + logLevel uint32 + logSize uint32 + logBuf syscallPtr + kernelVersion uint32 // since 4.1 2541517c32be + progFlags uint32 // since 4.11 e07b98d9bffe + progName bpfObjName // since 4.15 067cae47771c + progIfIndex uint32 // since 4.15 1f6f4cb7ba21 + expectedAttachType AttachType // since 4.17 5e43f899b03a +} + +type bpfProgInfo struct { + progType uint32 + id uint32 + tag [unix.BPF_TAG_SIZE]byte + jitedLen uint32 + xlatedLen uint32 + jited syscallPtr + xlated syscallPtr + loadTime uint64 // since 4.15 cb4d2b3f03d8 + createdByUID uint32 + nrMapIDs uint32 + mapIds syscallPtr + name bpfObjName +} + +type bpfProgTestRunAttr struct { + 
fd uint32 + retval uint32 + dataSizeIn uint32 + dataSizeOut uint32 + dataIn syscallPtr + dataOut syscallPtr + repeat uint32 + duration uint32 +} + +type bpfProgAlterAttr struct { + targetFd uint32 + attachBpfFd uint32 + attachType uint32 + attachFlags uint32 +} + +type bpfObjGetInfoByFDAttr struct { + fd uint32 + infoLen uint32 + info syscallPtr // May be either bpfMapInfo or bpfProgInfo +} + +type bpfGetFDByIDAttr struct { + id uint32 + next uint32 +} + +func newPtr(ptr unsafe.Pointer) syscallPtr { + return syscallPtr{ptr: ptr} +} + +func bpfProgLoad(attr *bpfProgLoadAttr) (*bpfFD, error) { + for { + fd, err := bpfCall(_ProgLoad, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + // As of ~4.20 the verifier can be interrupted by a signal, + // and returns EAGAIN in that case. + if err == unix.EAGAIN { + continue + } + + if err != nil { + return nil, err + } + + return newBPFFD(uint32(fd)), nil + } +} + +func bpfProgAlter(cmd int, attr *bpfProgAlterAttr) error { + _, err := bpfCall(cmd, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +func bpfMapCreate(attr *bpfMapCreateAttr) (*bpfFD, error) { + fd, err := bpfCall(_MapCreate, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + + return newBPFFD(uint32(fd)), nil +} + +var haveNestedMaps = internal.FeatureTest("nested maps", "4.12", func() bool { + inner, err := bpfMapCreate(&bpfMapCreateAttr{ + mapType: Array, + keySize: 4, + valueSize: 4, + maxEntries: 1, + }) + if err != nil { + return false + } + defer inner.close() + + innerFd, _ := inner.value() + nested, err := bpfMapCreate(&bpfMapCreateAttr{ + mapType: ArrayOfMaps, + keySize: 4, + valueSize: 4, + maxEntries: 1, + innerMapFd: innerFd, + }) + if err != nil { + return false + } + + _ = nested.close() + return true +}) + +func bpfMapLookupElem(m *bpfFD, key, valueOut syscallPtr) error { + fd, err := m.value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + value: valueOut, + } + _, 
err = bpfCall(_MapLookupElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +func bpfMapUpdateElem(m *bpfFD, key, valueOut syscallPtr, flags uint64) error { + fd, err := m.value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + value: valueOut, + flags: flags, + } + _, err = bpfCall(_MapUpdateElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +func bpfMapDeleteElem(m *bpfFD, key syscallPtr) error { + fd, err := m.value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + } + _, err = bpfCall(_MapDeleteElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +func bpfMapGetNextKey(m *bpfFD, key, nextKeyOut syscallPtr) error { + fd, err := m.value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + value: nextKeyOut, + } + _, err = bpfCall(_MapGetNextKey, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +const bpfFSType = 0xcafe4a11 + +func bpfPinObject(fileName string, fd *bpfFD) error { + dirName := filepath.Dir(fileName) + var statfs unix.Statfs_t + if err := unix.Statfs(dirName, &statfs); err != nil { + return err + } + if uint64(statfs.Type) != bpfFSType { + return errors.Errorf("%s is not on a bpf filesystem", fileName) + } + + value, err := fd.value() + if err != nil { + return err + } + + _, err = bpfCall(_ObjPin, unsafe.Pointer(&bpfPinObjAttr{ + fileName: newPtr(unsafe.Pointer(&[]byte(fileName)[0])), + fd: value, + }), 16) + return errors.Wrapf(err, "pin object %s", fileName) +} + +func bpfGetObject(fileName string) (*bpfFD, error) { + ptr, err := bpfCall(_ObjGet, unsafe.Pointer(&bpfPinObjAttr{ + fileName: newPtr(unsafe.Pointer(&[]byte(fileName)[0])), + }), 16) + if err != nil { + return nil, errors.Wrapf(err, "get object %s", fileName) + } + return newBPFFD(uint32(ptr)), nil +} + +func bpfGetObjectInfoByFD(fd *bpfFD, info unsafe.Pointer, size uintptr) error { + value, err := fd.value() + 
if err != nil { + return err + } + + // available from 4.13 + attr := bpfObjGetInfoByFDAttr{ + fd: value, + infoLen: uint32(size), + info: newPtr(info), + } + _, err = bpfCall(_ObjGetInfoByFD, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return errors.Wrapf(err, "fd %d", value) +} + +func bpfGetProgInfoByFD(fd *bpfFD) (*bpfProgInfo, error) { + var info bpfProgInfo + err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)) + return &info, errors.Wrap(err, "can't get program info") +} + +func bpfGetMapInfoByFD(fd *bpfFD) (*bpfMapInfo, error) { + var info bpfMapInfo + err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)) + return &info, errors.Wrap(err, "can't get map info") +} + +var haveObjName = internal.FeatureTest("object names", "4.15", func() bool { + name, err := newBPFObjName("feature_test") + if err != nil { + // This really is a fatal error, but it should be caught + // by the unit tests not working. + return false + } + + attr := bpfMapCreateAttr{ + mapType: Array, + keySize: 4, + valueSize: 4, + maxEntries: 1, + mapName: name, + } + + fd, err := bpfMapCreate(&attr) + if err != nil { + return false + } + + _ = fd.close() + return true +}) + +func bpfGetMapFDByID(id uint32) (*bpfFD, error) { + // available from 4.13 + attr := bpfGetFDByIDAttr{ + id: id, + } + ptr, err := bpfCall(_MapGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return nil, errors.Wrapf(err, "can't get fd for map id %d", id) + } + return newBPFFD(uint32(ptr)), nil +} + +func bpfGetProgramFDByID(id uint32) (*bpfFD, error) { + // available from 4.13 + attr := bpfGetFDByIDAttr{ + id: id, + } + ptr, err := bpfCall(_ProgGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return nil, errors.Wrapf(err, "can't get fd for program id %d", id) + } + return newBPFFD(uint32(ptr)), nil +} + +func bpfCall(cmd int, attr unsafe.Pointer, size uintptr) (uintptr, error) { + r1, _, errNo := unix.Syscall(unix.SYS_BPF, 
uintptr(cmd), uintptr(attr), size) + runtime.KeepAlive(attr) + + var err error + if errNo != 0 { + err = errNo + } + + return r1, err +} + +func convertCString(in []byte) string { + inLen := bytes.IndexByte(in, 0) + if inLen == -1 { + return "" + } + return string(in[:inLen]) +} diff --git a/vendor/github.com/cilium/ebpf/types.go b/vendor/github.com/cilium/ebpf/types.go new file mode 100644 index 0000000000..0daf9a7152 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/types.go @@ -0,0 +1,189 @@ +package ebpf + +//go:generate stringer -output types_string.go -type=MapType,ProgramType + +// MapType indicates the type map structure +// that will be initialized in the kernel. +type MapType uint32 + +// All the various map types that can be created +const ( + UnspecifiedMap MapType = iota + // Hash is a hash map + Hash + // Array is an array map + Array + // ProgramArray - A program array map is a special kind of array map whose map + // values contain only file descriptors referring to other eBPF + // programs. Thus, both the key_size and value_size must be + // exactly four bytes. This map is used in conjunction with the + // TailCall helper. + ProgramArray + // PerfEventArray - A perf event array is used in conjunction with PerfEventRead + // and PerfEventOutput calls, to read the raw bpf_perf_data from the registers. + PerfEventArray + // PerCPUHash - This data structure is useful for people who have high performance + // network needs and can reconcile adds at the end of some cycle, so that + // hashes can be lock free without the use of XAdd, which can be costly. + PerCPUHash + // PerCPUArray - This data structure is useful for people who have high performance + // network needs and can reconcile adds at the end of some cycle, so that + // hashes can be lock free without the use of XAdd, which can be costly. + // Each CPU gets a copy of this hash, the contents of all of which can be reconciled + // later. 
+ PerCPUArray + // StackTrace - This holds whole user and kernel stack traces, it can be retrieved with + // GetStackID + StackTrace + // CGroupArray - This is a very niche structure used to help SKBInCGroup determine + // if an skb is from a socket belonging to a specific cgroup + CGroupArray + // LRUHash - This allows you to create a small hash structure that will purge the + // least recently used items rather than thow an error when you run out of memory + LRUHash + // LRUCPUHash - This is NOT like PerCPUHash, this structure is shared among the CPUs, + // it has more to do with including the CPU id with the LRU calculation so that if a + // particular CPU is using a value over-and-over again, then it will be saved, but if + // a value is being retrieved a lot but sparsely across CPUs it is not as important, basically + // giving weight to CPU locality over overall usage. + LRUCPUHash + // LPMTrie - This is an implementation of Longest-Prefix-Match Trie structure. It is useful, + // for storing things like IP addresses which can be bit masked allowing for keys of differing + // values to refer to the same reference based on their masks. See wikipedia for more details. + LPMTrie + // ArrayOfMaps - Each item in the array is another map. The inner map mustn't be a map of maps + // itself. + ArrayOfMaps + // HashOfMaps - Each item in the hash map is another map. The inner map mustn't be a map of maps + // itself. + HashOfMaps +) + +// hasPerCPUValue returns true if the Map stores a value per CPU. 
+func (mt MapType) hasPerCPUValue() bool { + if mt == PerCPUHash || mt == PerCPUArray { + return true + } + return false +} + +const ( + _MapCreate = iota + _MapLookupElem + _MapUpdateElem + _MapDeleteElem + _MapGetNextKey + _ProgLoad + _ObjPin + _ObjGet + _ProgAttach + _ProgDetach + _ProgTestRun + _ProgGetNextID + _MapGetNextID + _ProgGetFDByID + _MapGetFDByID + _ObjGetInfoByFD +) + +const ( + _Any = iota + _NoExist + _Exist +) + +// ProgramType of the eBPF program +type ProgramType uint32 + +// eBPF program types +const ( + // Unrecognized program type + UnspecifiedProgram ProgramType = iota + // SocketFilter socket or seccomp filter + SocketFilter + // Kprobe program + Kprobe + // SchedCLS traffic control shaper + SchedCLS + // SchedACT routing control shaper + SchedACT + // TracePoint program + TracePoint + // XDP program + XDP + // PerfEvent program + PerfEvent + // CGroupSKB program + CGroupSKB + // CGroupSock program + CGroupSock + // LWTIn program + LWTIn + // LWTOut program + LWTOut + // LWTXmit program + LWTXmit + // SockOps program + SockOps + // SkSKB program + SkSKB + // CGroupDevice program + CGroupDevice + // SkMsg program + SkMsg + // RawTracepoint program + RawTracepoint + // CGroupSockAddr program + CGroupSockAddr + // LWTSeg6Local program + LWTSeg6Local + // LircMode2 program + LircMode2 + // SkReuseport program + SkReuseport + // FlowDissector program + FlowDissector + // CGroupSysctl program + CGroupSysctl + // RawTracepointWritable program + RawTracepointWritable + // CGroupSockopt program + CGroupSockopt +) + +// AttachType of the eBPF program, needed to differentiate allowed context accesses in +// some newer program types like CGroupSockAddr. Should be set to AttachNone if not required. +// Will cause invalid argument (EINVAL) at program load time if set incorrectly. 
+type AttachType uint32 + +// AttachNone is an alias for AttachCGroupInetIngress for readability reasons +const AttachNone AttachType = 0 + +const ( + AttachCGroupInetIngress AttachType = iota + AttachCGroupInetEgress + AttachCGroupInetSockCreate + AttachCGroupSockOps + AttachSkSKBStreamParser + AttachSkSKBStreamVerdict + AttachCGroupDevice + AttachSkMsgVerdict + AttachCGroupInet4Bind + AttachCGroupInet6Bind + AttachCGroupInet4Connect + AttachCGroupInet6Connect + AttachCGroupInet4PostBind + AttachCGroupInet6PostBind + AttachCGroupUDP4Sendmsg + AttachCGroupUDP6Sendmsg + AttachLircMode2 + AttachFlowDissector + AttachCGroupSysctl + AttachCGroupUDP4Recvmsg + AttachCGroupUDP6Recvmsg + AttachCGroupGetsockopt + AttachCGroupSetsockopt +) + +// AttachFlags of the eBPF program used in BPF_PROG_ATTACH command +type AttachFlags uint32 diff --git a/vendor/github.com/cilium/ebpf/types_string.go b/vendor/github.com/cilium/ebpf/types_string.go new file mode 100644 index 0000000000..4813437ec2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/types_string.go @@ -0,0 +1,78 @@ +// Code generated by "stringer -output types_string.go -type=MapType,ProgramType"; DO NOT EDIT. + +package ebpf + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[UnspecifiedMap-0] + _ = x[Hash-1] + _ = x[Array-2] + _ = x[ProgramArray-3] + _ = x[PerfEventArray-4] + _ = x[PerCPUHash-5] + _ = x[PerCPUArray-6] + _ = x[StackTrace-7] + _ = x[CGroupArray-8] + _ = x[LRUHash-9] + _ = x[LRUCPUHash-10] + _ = x[LPMTrie-11] + _ = x[ArrayOfMaps-12] + _ = x[HashOfMaps-13] +} + +const _MapType_name = "UnspecifiedMapHashArrayProgramArrayPerfEventArrayPerCPUHashPerCPUArrayStackTraceCGroupArrayLRUHashLRUCPUHashLPMTrieArrayOfMapsHashOfMaps" + +var _MapType_index = [...]uint8{0, 14, 18, 23, 35, 49, 59, 70, 80, 91, 98, 108, 115, 126, 136} + +func (i MapType) String() string { + if i >= MapType(len(_MapType_index)-1) { + return "MapType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _MapType_name[_MapType_index[i]:_MapType_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[UnspecifiedProgram-0] + _ = x[SocketFilter-1] + _ = x[Kprobe-2] + _ = x[SchedCLS-3] + _ = x[SchedACT-4] + _ = x[TracePoint-5] + _ = x[XDP-6] + _ = x[PerfEvent-7] + _ = x[CGroupSKB-8] + _ = x[CGroupSock-9] + _ = x[LWTIn-10] + _ = x[LWTOut-11] + _ = x[LWTXmit-12] + _ = x[SockOps-13] + _ = x[SkSKB-14] + _ = x[CGroupDevice-15] + _ = x[SkMsg-16] + _ = x[RawTracepoint-17] + _ = x[CGroupSockAddr-18] + _ = x[LWTSeg6Local-19] + _ = x[LircMode2-20] + _ = x[SkReuseport-21] + _ = x[FlowDissector-22] + _ = x[CGroupSysctl-23] + _ = x[RawTracepointWritable-24] + _ = x[CGroupSockopt-25] +} + +const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockopt" + +var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 
188, 199, 212, 224, 245, 258} + +func (i ProgramType) String() string { + if i >= ProgramType(len(_ProgramType_index)-1) { + return "ProgramType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _ProgramType_name[_ProgramType_index[i]:_ProgramType_index[i+1]] +} diff --git a/vendor/github.com/containerd/cgroups/v2/cpu.go b/vendor/github.com/containerd/cgroups/v2/cpu.go new file mode 100644 index 0000000000..65282ff082 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/cpu.go @@ -0,0 +1,83 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package v2 + +import ( + "math" + "strconv" + "strings" +) + +type CPUMax string + +func NewCPUMax(quota *int64, period *uint64) CPUMax { + max := "max" + if quota != nil { + max = strconv.FormatInt(*quota, 10) + } + return CPUMax(strings.Join([]string{max, strconv.FormatUint(*period, 10)}, " ")) +} + +type CPU struct { + Weight *uint64 + Max CPUMax + Cpus string + Mems string +} + +func (c CPUMax) extractQuotaAndPeriod() (int64, uint64) { + var ( + quota int64 + period uint64 + ) + values := strings.Split(string(c), " ") + if values[0] == "max" { + quota = math.MaxInt64 + } else { + quota, _ = strconv.ParseInt(values[0], 10, 64) + } + period, _ = strconv.ParseUint(values[1], 10, 64) + return quota, period +} + +func (r *CPU) Values() (o []Value) { + if r.Weight != nil { + o = append(o, Value{ + filename: "cpu.weight", + value: *r.Weight, + }) + } + if r.Max != "" { + o = append(o, Value{ + filename: "cpu.max", + value: r.Max, + }) + } + if r.Cpus != "" { + o = append(o, Value{ + filename: "cpuset.cpus", + value: r.Cpus, + }) + } + if r.Mems != "" { + o = append(o, Value{ + filename: "cpuset.mems", + value: r.Mems, + }) + } + return o +} diff --git a/vendor/github.com/containerd/cgroups/v2/devicefilter.go b/vendor/github.com/containerd/cgroups/v2/devicefilter.go new file mode 100644 index 0000000000..4b8c32be92 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/devicefilter.go @@ -0,0 +1,199 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Devicefilter containes eBPF device filter program +// +// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c +// +// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) +// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 +// +// This particular Go implementation based on runc version +// https://github.com/opencontainers/runc/blob/master/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go +package v2 + +import ( + "fmt" + "math" + + "github.com/cilium/ebpf/asm" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +const ( + // license string format is same as kernel MODULE_LICENSE macro + license = "Apache" +) + +// DeviceFilter returns eBPF device filter program and its license string +func DeviceFilter(devices []specs.LinuxDeviceCgroup) (asm.Instructions, string, error) { + p := &program{} + p.init() + for i := len(devices) - 1; i >= 0; i-- { + if err := p.appendDevice(devices[i]); err != nil { + return nil, "", err + } + } + insts, err := p.finalize() + return insts, license, err +} + +type program struct { + insts asm.Instructions + hasWildCard bool + blockID int +} + +func (p *program) init() { + // struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423 + /* + u32 access_type + u32 major + u32 minor + */ + // R2 <- type (lower 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R2, asm.R1, 0, asm.Half)) + + // R3 <- access (upper 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R3, asm.R1, 0, asm.Word), + // RSh: bitwise shift right + asm.RSh.Imm32(asm.R3, 16)) + + // R4 <- major (u32 major at R1[4]) + p.insts = 
append(p.insts, + asm.LoadMem(asm.R4, asm.R1, 4, asm.Word)) + + // R5 <- minor (u32 minor at R1[8]) + p.insts = append(p.insts, + asm.LoadMem(asm.R5, asm.R1, 8, asm.Word)) +} + +// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element. +func (p *program) appendDevice(dev specs.LinuxDeviceCgroup) error { + if p.blockID < 0 { + return errors.New("the program is finalized") + } + if p.hasWildCard { + // All entries after wildcard entry are ignored + return nil + } + + bpfType := int32(-1) + hasType := true + switch dev.Type { + case string('c'): + bpfType = int32(unix.BPF_DEVCG_DEV_CHAR) + case string('b'): + bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK) + case string('a'): + hasType = false + default: + // if not specified in OCI json, typ is set to DeviceTypeAll + return errors.Errorf("invalid DeviceType %q", dev.Type) + } + if *dev.Major > math.MaxUint32 { + return errors.Errorf("invalid major %d", *dev.Major) + } + if *dev.Minor > math.MaxUint32 { + return errors.Errorf("invalid minor %d", *dev.Major) + } + hasMajor := *dev.Major >= 0 // if not specified in OCI json, major is set to -1 + hasMinor := *dev.Minor >= 0 + bpfAccess := int32(0) + for _, r := range dev.Access { + switch r { + case 'r': + bpfAccess |= unix.BPF_DEVCG_ACC_READ + case 'w': + bpfAccess |= unix.BPF_DEVCG_ACC_WRITE + case 'm': + bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD + default: + return errors.Errorf("unknown device access %v", r) + } + } + // If the access is rwm, skip the check. 
+ hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) + + blockSym := fmt.Sprintf("block-%d", p.blockID) + nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1) + prevBlockLastIdx := len(p.insts) - 1 + if hasType { + p.insts = append(p.insts, + // if (R2 != bpfType) goto next + asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), + ) + } + if hasAccess { + p.insts = append(p.insts, + // if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next + asm.Mov.Reg32(asm.R1, asm.R3), + asm.And.Imm32(asm.R1, bpfAccess), + asm.JEq.Imm(asm.R1, 0, nextBlockSym), + ) + } + if hasMajor { + p.insts = append(p.insts, + // if (R4 != major) goto next + asm.JNE.Imm(asm.R4, int32(*dev.Major), nextBlockSym), + ) + } + if hasMinor { + p.insts = append(p.insts, + // if (R5 != minor) goto next + asm.JNE.Imm(asm.R5, int32(*dev.Minor), nextBlockSym), + ) + } + if !hasType && !hasAccess && !hasMajor && !hasMinor { + p.hasWildCard = true + } + p.insts = append(p.insts, acceptBlock(dev.Allow)...) 
+ // set blockSym to the first instruction we added in this iteration + p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym) + p.blockID++ + return nil +} + +func (p *program) finalize() (asm.Instructions, error) { + if p.hasWildCard { + // acceptBlock with asm.Return() is already inserted + return p.insts, nil + } + blockSym := fmt.Sprintf("block-%d", p.blockID) + p.insts = append(p.insts, + // R0 <- 0 + asm.Mov.Imm32(asm.R0, 0).Sym(blockSym), + asm.Return(), + ) + p.blockID = -1 + return p.insts, nil +} + +func acceptBlock(accept bool) asm.Instructions { + v := int32(0) + if accept { + v = 1 + } + return []asm.Instruction{ + // R0 <- v + asm.Mov.Imm32(asm.R0, v), + asm.Return(), + } +} diff --git a/vendor/github.com/containerd/cgroups/v2/ebpf.go b/vendor/github.com/containerd/cgroups/v2/ebpf.go new file mode 100644 index 0000000000..adda755089 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/ebpf.go @@ -0,0 +1,83 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package v2 + +import ( + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. +// +// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . 
+// +// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 +func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) { + nilCloser := func() error { + return nil + } + spec := &ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + Instructions: insts, + License: license, + } + prog, err := ebpf.NewProgram(spec) + if err != nil { + return nilCloser, err + } + if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { + return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") + } + closer := func() error { + if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { + return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") + } + return nil + } + return closer, nil +} + +func isRWM(cgroupPermissions string) bool { + r := false + w := false + m := false + for _, rn := range cgroupPermissions { + switch rn { + case 'r': + r = true + case 'w': + w = true + case 'm': + m = true + } + } + return r && w && m +} + +// the logic is from runc +// https://github.com/opencontainers/runc/blob/master/libcontainer/cgroups/fs/devices_v2.go#L44 +func canSkipEBPFError(devices []specs.LinuxDeviceCgroup) bool { + for _, dev := range devices { + if dev.Allow || !isRWM(dev.Access) { + return false + } + } + return true +} diff --git a/vendor/github.com/containerd/cgroups/v2/errors.go b/vendor/github.com/containerd/cgroups/v2/errors.go new file mode 100644 index 0000000000..46d2d9c2e1 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/errors.go @@ -0,0 +1,50 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
// Sentinel errors returned by this package.

// ErrInvalidPid is the error for a pid that is not greater than 0.
var ErrInvalidPid = errors.New("cgroups: pid must be greater than 0")

// ErrMountPointNotExist is the error for a missing cgroup mountpoint.
var ErrMountPointNotExist = errors.New("cgroups: cgroup mountpoint does not exist")

// ErrInvalidFormat is the error for a file or value with an invalid format.
var ErrInvalidFormat = errors.New("cgroups: parsing file with invalid format failed")

// ErrFreezerNotSupported is the error for an unsupported freezer cgroup (v2).
var ErrFreezerNotSupported = errors.New("cgroups: freezer cgroup (v2) not supported on this system")

// ErrMemoryNotSupported is the error for an unsupported memory cgroup (v2).
var ErrMemoryNotSupported = errors.New("cgroups: memory cgroup (v2) not supported on this system")

// ErrPidsNotSupported is the error for an unsupported pids cgroup (v2).
var ErrPidsNotSupported = errors.New("cgroups: pids cgroup (v2) not supported on this system")

// ErrCPUNotSupported is the error for an unsupported cpu cgroup (v2).
var ErrCPUNotSupported = errors.New("cgroups: cpu cgroup (v2) not supported on this system")

// ErrCgroupDeleted is the error for a cgroup that has been deleted.
var ErrCgroupDeleted = errors.New("cgroups: cgroup deleted")

// ErrNoCgroupMountDestination is the error for a cgroup mount destination
// that cannot be found.
var ErrNoCgroupMountDestination = errors.New("cgroups: cannot find cgroup mount destination")

// ErrInvalidGroupPath is the error for a malformed group path.
var ErrInvalidGroupPath = errors.New("cgroups: invalid group path")

// ErrorHandler is a function that handles and acts on errors
type ErrorHandler func(err error) error

// IgnoreNotExist returns nil for errors that os.IsNotExist recognises and
// passes every other error (including nil) through unchanged.
func IgnoreNotExist(err error) error {
	if !os.IsNotExist(err) {
		return err
	}
	return nil
}

// errPassthrough is the identity ErrorHandler: it returns err as is.
func errPassthrough(err error) error {
	return err
}
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package v2 + +import "strings" + +type HugeTlb []HugeTlbEntry + +type HugeTlbEntry struct { + HugePageSize string + Limit uint64 +} + +func (r *HugeTlb) Values() (o []Value) { + for _, e := range *r { + o = append(o, Value{ + filename: strings.Join([]string{"hugetlb", e.HugePageSize, "max"}, "."), + value: e.Limit, + }) + } + + return o +} diff --git a/vendor/github.com/containerd/cgroups/v2/io.go b/vendor/github.com/containerd/cgroups/v2/io.go new file mode 100644 index 0000000000..70078d576e --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/io.go @@ -0,0 +1,64 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package v2 + +import "fmt" + +type IOType string + +const ( + ReadBPS IOType = "rbps" + WriteBPS IOType = "wbps" + ReadIOPS IOType = "riops" + WriteIOPS IOType = "wiops" +) + +type BFQ struct { + Weight uint16 +} + +type Entry struct { + Type IOType + Major int64 + Minor int64 + Rate uint64 +} + +func (e Entry) String() string { + return fmt.Sprintf("%d:%d %s=%d", e.Major, e.Minor, e.Type, e.Rate) +} + +type IO struct { + BFQ BFQ + Max []Entry +} + +func (i *IO) Values() (o []Value) { + if i.BFQ.Weight != 0 { + o = append(o, Value{ + filename: "io.bfq.weight", + value: i.BFQ.Weight, + }) + } + for _, e := range i.Max { + o = append(o, Value{ + filename: "io.max", + value: e.String(), + }) + } + return o +} diff --git a/vendor/github.com/containerd/cgroups/v2/manager.go b/vendor/github.com/containerd/cgroups/v2/manager.go new file mode 100644 index 0000000000..09bf908b85 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/manager.go @@ -0,0 +1,739 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package v2 + +import ( + "bufio" + "fmt" + "io/ioutil" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" + "time" + + "golang.org/x/sys/unix" + + "github.com/containerd/cgroups/v2/stats" + "github.com/godbus/dbus/v5" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" +) + +const ( + subtreeControl = "cgroup.subtree_control" + controllersFile = "cgroup.controllers" + defaultCgroup2Path = "/sys/fs/cgroup" + defaultSlice = "system.slice" +) + +var ( + canDelegate bool + once sync.Once +) + +type cgValuer interface { + Values() []Value +} + +type Event struct { + Low uint64 + High uint64 + Max uint64 + OOM uint64 + OOMKill uint64 +} + +// Resources for a cgroups v2 unified hierarchy +type Resources struct { + CPU *CPU + Memory *Memory + Pids *Pids + IO *IO + RDMA *RDMA + HugeTlb *HugeTlb + // When len(Devices) is zero, devices are not controlled + Devices []specs.LinuxDeviceCgroup +} + +// Values returns the raw filenames and values that +// can be written to the unified hierarchy +func (r *Resources) Values() (o []Value) { + if r.CPU != nil { + o = append(o, r.CPU.Values()...) + } + if r.Memory != nil { + o = append(o, r.Memory.Values()...) + } + if r.Pids != nil { + o = append(o, r.Pids.Values()...) + } + if r.IO != nil { + o = append(o, r.IO.Values()...) + } + if r.RDMA != nil { + o = append(o, r.RDMA.Values()...) + } + if r.HugeTlb != nil { + o = append(o, r.HugeTlb.Values()...) 
+ } + return o +} + +// EnabledControllers returns the list of all not nil resource controllers +func (r *Resources) EnabledControllers() (c []string) { + if r.CPU != nil { + c = append(c, "cpu") + c = append(c, "cpuset") + } + if r.Memory != nil { + c = append(c, "memory") + } + if r.Pids != nil { + c = append(c, "pids") + } + if r.IO != nil { + c = append(c, "io") + } + if r.RDMA != nil { + c = append(c, "rdma") + } + if r.HugeTlb != nil { + c = append(c, "hugetlb") + } + return +} + +// Value of a cgroup setting +type Value struct { + filename string + value interface{} +} + +// write the value to the full, absolute path, of a unified hierarchy +func (c *Value) write(path string, perm os.FileMode) error { + var data []byte + switch t := c.value.(type) { + case uint64: + data = []byte(strconv.FormatUint(t, 10)) + case uint16: + data = []byte(strconv.FormatUint(uint64(t), 10)) + case int64: + data = []byte(strconv.FormatInt(t, 10)) + case []byte: + data = t + case string: + data = []byte(t) + case CPUMax: + data = []byte(t) + default: + return ErrInvalidFormat + } + return ioutil.WriteFile( + filepath.Join(path, c.filename), + data, + perm, + ) +} + +func writeValues(path string, values []Value) error { + for _, o := range values { + if err := o.write(path, defaultFilePerm); err != nil { + return err + } + } + return nil +} + +func NewManager(mountpoint string, group string, resources *Resources) (*Manager, error) { + if err := VerifyGroupPath(group); err != nil { + return nil, err + } + path := filepath.Join(mountpoint, group) + if err := os.MkdirAll(path, defaultDirPerm); err != nil { + return nil, err + } + m := Manager{ + unifiedMountpoint: mountpoint, + path: path, + } + if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil { + // clean up cgroup dir on failure + os.Remove(path) + return nil, err + } + if err := setResources(path, resources); err != nil { + os.Remove(path) + return nil, err + } + return &m, nil +} + +func 
LoadManager(mountpoint string, group string) (*Manager, error) { + if err := VerifyGroupPath(group); err != nil { + return nil, err + } + path := filepath.Join(mountpoint, group) + return &Manager{ + unifiedMountpoint: mountpoint, + path: path, + }, nil +} + +type Manager struct { + unifiedMountpoint string + path string +} + +func setResources(path string, resources *Resources) error { + if resources != nil { + if err := writeValues(path, resources.Values()); err != nil { + return err + } + if err := setDevices(path, resources.Devices); err != nil { + return err + } + } + return nil +} + +func (c *Manager) RootControllers() ([]string, error) { + b, err := ioutil.ReadFile(filepath.Join(c.unifiedMountpoint, controllersFile)) + if err != nil { + return nil, err + } + return strings.Fields(string(b)), nil +} + +func (c *Manager) Controllers() ([]string, error) { + b, err := ioutil.ReadFile(filepath.Join(c.path, controllersFile)) + if err != nil { + return nil, err + } + return strings.Fields(string(b)), nil +} + +type ControllerToggle int + +const ( + Enable ControllerToggle = iota + 1 + Disable +) + +func toggleFunc(controllers []string, prefix string) []string { + out := make([]string, len(controllers)) + for i, c := range controllers { + out[i] = prefix + c + } + return out +} + +func (c *Manager) ToggleControllers(controllers []string, t ControllerToggle) error { + // when c.path is like /foo/bar/baz, the following files need to be written: + // * /sys/fs/cgroup/cgroup.subtree_control + // * /sys/fs/cgroup/foo/cgroup.subtree_control + // * /sys/fs/cgroup/foo/bar/cgroup.subtree_control + // Note that /sys/fs/cgroup/foo/bar/baz/cgroup.subtree_control does not need to be written. 
+ split := strings.Split(c.path, "/") + var lastErr error + for i, _ := range split { + f := strings.Join(split[:i], "/") + if !strings.HasPrefix(f, c.unifiedMountpoint) || f == c.path { + continue + } + filePath := filepath.Join(f, subtreeControl) + if err := c.writeSubtreeControl(filePath, controllers, t); err != nil { + // When running as rootless, the user may face EPERM on parent groups, but it is neglible when the + // controller is already written. + // So we only return the last error. + lastErr = errors.Wrapf(err, "failed to write subtree controllers %+v to %q", controllers, filePath) + } + } + return lastErr +} + +func (c *Manager) writeSubtreeControl(filePath string, controllers []string, t ControllerToggle) error { + f, err := os.OpenFile(filePath, os.O_WRONLY, 0) + if err != nil { + return err + } + defer f.Close() + switch t { + case Enable: + controllers = toggleFunc(controllers, "+") + case Disable: + controllers = toggleFunc(controllers, "-") + } + _, err = f.WriteString(strings.Join(controllers, " ")) + return err +} + +func (c *Manager) NewChild(name string, resources *Resources) (*Manager, error) { + if strings.HasPrefix(name, "/") { + return nil, errors.New("name must be relative") + } + path := filepath.Join(c.path, name) + if err := os.MkdirAll(path, defaultDirPerm); err != nil { + return nil, err + } + if err := setResources(path, resources); err != nil { + // clean up cgroup dir on failure + os.Remove(path) + return nil, err + } + return &Manager{ + unifiedMountpoint: c.unifiedMountpoint, + path: path, + }, nil +} + +func (c *Manager) AddProc(pid uint64) error { + v := Value{ + filename: cgroupProcs, + value: pid, + } + return writeValues(c.path, []Value{v}) +} + +func (c *Manager) Delete() error { + return remove(c.path) +} + +func (c *Manager) Procs(recursive bool) ([]uint64, error) { + var processes []uint64 + err := filepath.Walk(c.path, func(p string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if 
!recursive && info.IsDir() { + if p == c.path { + return nil + } + return filepath.SkipDir + } + _, name := filepath.Split(p) + if name != cgroupProcs { + return nil + } + procs, err := parseCgroupProcsFile(p) + if err != nil { + return err + } + processes = append(processes, procs...) + return nil + }) + return processes, err +} + +var singleValueFiles = []string{ + "pids.current", + "pids.max", +} + +func (c *Manager) Stat() (*stats.Metrics, error) { + controllers, err := c.Controllers() + if err != nil { + return nil, err + } + out := make(map[string]interface{}) + for _, controller := range controllers { + switch controller { + case "cpu", "memory": + filename := fmt.Sprintf("%s.stat", controller) + if err := readKVStatsFile(c.path, filename, out); err != nil { + if os.IsNotExist(err) { + continue + } + return nil, err + } + } + } + for _, name := range singleValueFiles { + if err := readSingleFile(c.path, name, out); err != nil { + if os.IsNotExist(err) { + continue + } + return nil, err + } + } + var metrics stats.Metrics + + metrics.Pids = &stats.PidsStat{ + Current: getPidValue("pids.current", out), + Limit: getPidValue("pids.max", out), + } + metrics.CPU = &stats.CPUStat{ + UsageUsec: getUint64Value("usage_usec", out), + UserUsec: getUint64Value("user_usec", out), + SystemUsec: getUint64Value("system_usec", out), + NrPeriods: getUint64Value("nr_periods", out), + NrThrottled: getUint64Value("nr_throttled", out), + ThrottledUsec: getUint64Value("throttled_usec", out), + } + metrics.Memory = &stats.MemoryStat{ + Anon: getUint64Value("anon", out), + File: getUint64Value("file", out), + KernelStack: getUint64Value("kernel_stack", out), + Slab: getUint64Value("slab", out), + Sock: getUint64Value("sock", out), + Shmem: getUint64Value("shmem", out), + FileMapped: getUint64Value("file_mapped", out), + FileDirty: getUint64Value("file_dirty", out), + FileWriteback: getUint64Value("file_writeback", out), + AnonThp: getUint64Value("anon_thp", out), + InactiveAnon: 
getUint64Value("inactive_anon", out), + ActiveAnon: getUint64Value("active_anon", out), + InactiveFile: getUint64Value("inactive_file", out), + ActiveFile: getUint64Value("active_file", out), + Unevictable: getUint64Value("unevictable", out), + SlabReclaimable: getUint64Value("slab_reclaimable", out), + SlabUnreclaimable: getUint64Value("slab_unreclaimable", out), + Pgfault: getUint64Value("pgfault", out), + Pgmajfault: getUint64Value("pgmajfault", out), + WorkingsetRefault: getUint64Value("workingset_refault", out), + WorkingsetActivate: getUint64Value("workingset_activate", out), + WorkingsetNodereclaim: getUint64Value("workingset_nodereclaim", out), + Pgrefill: getUint64Value("pgrefill", out), + Pgscan: getUint64Value("pgscan", out), + Pgsteal: getUint64Value("pgsteal", out), + Pgactivate: getUint64Value("pgactivate", out), + Pgdeactivate: getUint64Value("pgdeactivate", out), + Pglazyfree: getUint64Value("pglazyfree", out), + Pglazyfreed: getUint64Value("pglazyfreed", out), + ThpFaultAlloc: getUint64Value("thp_fault_alloc", out), + ThpCollapseAlloc: getUint64Value("thp_collapse_alloc", out), + Usage: getStatFileContentUint64(filepath.Join(c.path, "memory.current")), + UsageLimit: getStatFileContentUint64(filepath.Join(c.path, "memory.max")), + SwapUsage: getStatFileContentUint64(filepath.Join(c.path, "memory.swap.current")), + SwapLimit: getStatFileContentUint64(filepath.Join(c.path, "memory.swap.max")), + } + + metrics.Io = &stats.IOStat{Usage: readIoStats(c.path)} + metrics.Rdma = &stats.RdmaStat{ + Current: rdmaStats(filepath.Join(c.path, "rdma.current")), + Limit: rdmaStats(filepath.Join(c.path, "rdma.max")), + } + metrics.Hugetlb = readHugeTlbStats(c.path) + + return &metrics, nil +} + +func getUint64Value(key string, out map[string]interface{}) uint64 { + v, ok := out[key] + if !ok { + return 0 + } + switch t := v.(type) { + case uint64: + return t + } + return 0 +} + +func getPidValue(key string, out map[string]interface{}) uint64 { + v, ok := out[key] + 
if !ok { + return 0 + } + switch t := v.(type) { + case uint64: + return t + case string: + if t == "max" { + return math.MaxUint64 + } + } + return 0 +} + +func readSingleFile(path string, file string, out map[string]interface{}) error { + f, err := os.Open(filepath.Join(path, file)) + if err != nil { + return err + } + defer f.Close() + data, err := ioutil.ReadAll(f) + if err != nil { + return err + } + s := strings.TrimSpace(string(data)) + v, err := parseUint(s, 10, 64) + if err != nil { + // if we cannot parse as a uint, parse as a string + out[file] = s + return nil + } + out[file] = v + return nil +} + +func readKVStatsFile(path string, file string, out map[string]interface{}) error { + f, err := os.Open(filepath.Join(path, file)) + if err != nil { + return err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + if err := s.Err(); err != nil { + return err + } + name, value, err := parseKV(s.Text()) + if err != nil { + return errors.Wrapf(err, "error while parsing %s (line=%q)", filepath.Join(path, file), s.Text()) + } + out[name] = value + } + return nil +} + +func (c *Manager) Freeze() error { + return c.freeze(c.path, Frozen) +} + +func (c *Manager) Thaw() error { + return c.freeze(c.path, Thawed) +} + +func (c *Manager) freeze(path string, state State) error { + values := state.Values() + for { + if err := writeValues(path, values); err != nil { + return err + } + current, err := fetchState(path) + if err != nil { + return err + } + if current == state { + return nil + } + time.Sleep(1 * time.Millisecond) + } +} + +// MemoryEventFD returns inotify file descriptor and 'memory.events' inotify watch descriptor +func (c *Manager) MemoryEventFD() (int, uint32, error) { + fpath := filepath.Join(c.path, "memory.events") + fd, err := syscall.InotifyInit() + if err != nil { + return 0, 0, errors.Errorf("Failed to create inotify fd") + } + wd, err := syscall.InotifyAddWatch(fd, fpath, unix.IN_MODIFY) + if wd < 0 { + syscall.Close(fd) + return 0, 
0, errors.Errorf("Failed to add inotify watch for %q", fpath) + } + + return fd, uint32(wd), nil +} + +func (c *Manager) EventChan() (<-chan Event, <-chan error) { + ec := make(chan Event) + errCh := make(chan error) + go c.waitForEvents(ec, errCh) + + return ec, nil +} + +func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) { + fd, wd, err := c.MemoryEventFD() + + defer syscall.InotifyRmWatch(fd, wd) + defer syscall.Close(fd) + + if err != nil { + errCh <- err + return + } + + for { + buffer := make([]byte, syscall.SizeofInotifyEvent*10) + bytesRead, err := syscall.Read(fd, buffer) + if err != nil { + errCh <- err + return + } + var out map[string]interface{} + if bytesRead >= syscall.SizeofInotifyEvent { + if err := readKVStatsFile(c.path, "memory.events", out); err != nil { + e := Event{ + High: out["high"].(uint64), + Low: out["low"].(uint64), + Max: out["max"].(uint64), + OOM: out["oom"].(uint64), + OOMKill: out["oom_kill"].(uint64), + } + ec <- e + } else { + errCh <- err + return + } + } + } +} + +func setDevices(path string, devices []specs.LinuxDeviceCgroup) error { + if len(devices) == 0 { + return nil + } + insts, license, err := DeviceFilter(devices) + if err != nil { + return err + } + dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY, 0600) + if err != nil { + return errors.Errorf("cannot get dir FD for %s", path) + } + defer unix.Close(dirFD) + if _, err := LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { + if !canSkipEBPFError(devices) { + return err + } + } + return nil +} + +func NewSystemd(slice, group string, pid int, resources *Resources) (*Manager, error) { + if slice == "" { + slice = defaultSlice + } + path := filepath.Join(defaultCgroup2Path, slice, group) + conn, err := systemdDbus.New() + if err != nil { + return &Manager{}, err + } + defer conn.Close() + + properties := []systemdDbus.Property{ + systemdDbus.PropDescription(fmt.Sprintf("cgroup %s", group)), + 
newSystemdProperty("DefaultDependencies", false), + newSystemdProperty("MemoryAccounting", true), + newSystemdProperty("CPUAccounting", true), + newSystemdProperty("IOAccounting", true), + } + + // if we create a slice, the parent is defined via a Wants= + if strings.HasSuffix(group, ".slice") { + properties = append(properties, systemdDbus.PropWants(defaultSlice)) + } else { + // otherwise, we use Slice= + properties = append(properties, systemdDbus.PropSlice(defaultSlice)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newSystemdProperty("PIDs", []uint32{uint32(pid)})) + } + + if resources.Memory != nil && *resources.Memory.Max != 0 { + properties = append(properties, + newSystemdProperty("MemoryMax", uint64(*resources.Memory.Max))) + } + + if resources.CPU != nil && *resources.CPU.Weight != 0 { + properties = append(properties, + newSystemdProperty("CPUWeight", *resources.CPU.Weight)) + } + + if resources.CPU != nil && resources.CPU.Max != "" { + quota, period := resources.CPU.Max.extractQuotaAndPeriod() + // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. + // corresponds to USEC_INFINITY in systemd + // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd + // always setting a property value ensures we can apply a quota and remove it later + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if quota > 0 { + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. 
+ cpuQuotaPerSecUSec = uint64(quota*1000000) / period + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } + properties = append(properties, + newSystemdProperty("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) + } + + // If we can delegate, we add the property back in + if canDelegate { + properties = append(properties, newSystemdProperty("Delegate", true)) + } + + if resources.Pids != nil && resources.Pids.Max > 0 { + properties = append(properties, + newSystemdProperty("TasksAccounting", true), + newSystemdProperty("TasksMax", uint64(resources.Pids.Max))) + } + + statusChan := make(chan string, 1) + if _, err := conn.StartTransientUnit(group, "replace", properties, statusChan); err == nil { + select { + case <-statusChan: + case <-time.After(time.Second): + logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", group) + } + } else if !isUnitExists(err) { + return &Manager{}, err + } + + return &Manager{ + path: path, + }, nil +} + +func LoadSystemd(slice, group string) (*Manager, error) { + if slice == "" { + slice = defaultSlice + } + group = filepath.Join(defaultCgroup2Path, slice, group) + return &Manager{ + path: group, + }, nil +} + +func (c *Manager) DeleteSystemd() error { + conn, err := systemdDbus.New() + if err != nil { + return err + } + defer conn.Close() + group := systemdUnitFromPath(c.path) + ch := make(chan string) + _, err = conn.StopUnit(group, "replace", ch) + if err != nil { + return err + } + <-ch + return nil +} + +func newSystemdProperty(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} diff --git a/vendor/github.com/containerd/cgroups/v2/memory.go b/vendor/github.com/containerd/cgroups/v2/memory.go new file mode 100644 index 0000000000..72f94b738b --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/memory.go @@ -0,0 +1,52 @@ +/* + 
Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package v2 + +type Memory struct { + Swap *int64 + Max *int64 + Low *int64 + High *int64 +} + +func (r *Memory) Values() (o []Value) { + if r.Swap != nil { + o = append(o, Value{ + filename: "memory.swap.max", + value: *r.Swap, + }) + } + if r.Max != nil { + o = append(o, Value{ + filename: "memory.max", + value: *r.Max, + }) + } + if r.Low != nil { + o = append(o, Value{ + filename: "memory.low", + value: *r.Low, + }) + } + if r.High != nil { + o = append(o, Value{ + filename: "memory.high", + value: *r.High, + }) + } + return o +} diff --git a/vendor/github.com/containerd/cgroups/v2/paths.go b/vendor/github.com/containerd/cgroups/v2/paths.go new file mode 100644 index 0000000000..6f2f5edb3d --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/paths.go @@ -0,0 +1,60 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package v2 + +import ( + "fmt" + "path/filepath" + "strings" +) + +// NestedGroupPath will nest the cgroups based on the calling processes cgroup +// placing its child processes inside its own path +func NestedGroupPath(suffix string) (string, error) { + path, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + return filepath.Join(string(path), suffix), nil +} + +// PidGroupPath will return the correct cgroup paths for an existing process running inside a cgroup +// This is commonly used for the Load function to restore an existing container +func PidGroupPath(pid int) (string, error) { + p := fmt.Sprintf("/proc/%d/cgroup", pid) + return parseCgroupFile(p) +} + +// VerifyGroupPath verifies the format of group path string g. +// The format is same as the third field in /proc/PID/cgroup. +// e.g. "/user.slice/user-1001.slice/session-1.scope" +// +// g must be a "clean" absolute path starts with "/", and must not contain "/sys/fs/cgroup" prefix. +// +// VerifyGroupPath doesn't verify whether g actually exists on the system. +func VerifyGroupPath(g string) error { + if !strings.HasPrefix(g, "/") { + return ErrInvalidGroupPath + } + if filepath.Clean(g) != g { + return ErrInvalidGroupPath + } + if strings.HasPrefix(g, "/sys/fs/cgroup") { + return ErrInvalidGroupPath + } + return nil +} diff --git a/vendor/github.com/containerd/cgroups/v2/pids.go b/vendor/github.com/containerd/cgroups/v2/pids.go new file mode 100644 index 0000000000..0b5aa0c3bf --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/pids.go @@ -0,0 +1,37 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package v2 + +import "strconv" + +type Pids struct { + Max int64 +} + +func (r *Pids) Values() (o []Value) { + if r.Max != 0 { + limit := "max" + if r.Max > 0 { + limit = strconv.FormatInt(r.Max, 10) + } + o = append(o, Value{ + filename: "pids.max", + value: limit, + }) + } + return o +} diff --git a/vendor/github.com/containerd/cgroups/v2/rdma.go b/vendor/github.com/containerd/cgroups/v2/rdma.go new file mode 100644 index 0000000000..44caa4f57a --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/rdma.go @@ -0,0 +1,46 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package v2 + +import ( + "fmt" +) + +type RDMA struct { + Limit []RDMAEntry +} + +type RDMAEntry struct { + Device string + HcaHandles uint32 + HcaObjects uint32 +} + +func (r RDMAEntry) String() string { + return fmt.Sprintf("%s hca_handle=%d hca_object=%d", r.Device, r.HcaHandles, r.HcaObjects) +} + +func (r *RDMA) Values() (o []Value) { + for _, e := range r.Limit { + o = append(o, Value{ + filename: "rdma.max", + value: e.String(), + }) + } + + return o +} diff --git a/vendor/github.com/containerd/cgroups/v2/state.go b/vendor/github.com/containerd/cgroups/v2/state.go new file mode 100644 index 0000000000..09b75b6c3d --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/state.go @@ -0,0 +1,65 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package v2 + +import ( + "io/ioutil" + "path/filepath" + "strings" +) + +// State is a type that represents the state of the current cgroup +type State string + +const ( + Unknown State = "" + Thawed State = "thawed" + Frozen State = "frozen" + Deleted State = "deleted" + + cgroupFreeze = "cgroup.freeze" +) + +func (s State) Values() []Value { + v := Value{ + filename: cgroupFreeze, + } + switch s { + case Frozen: + v.value = "1" + case Thawed: + v.value = "0" + } + return []Value{ + v, + } +} + +func fetchState(path string) (State, error) { + current, err := ioutil.ReadFile(filepath.Join(path, cgroupFreeze)) + if err != nil { + return Unknown, err + } + switch strings.TrimSpace(string(current)) { + case "1": + return Frozen, nil + case "0": + return Thawed, nil + default: + return Unknown, nil + } +} diff --git a/vendor/github.com/containerd/cgroups/v2/utils.go b/vendor/github.com/containerd/cgroups/v2/utils.go new file mode 100644 index 0000000000..befe1d049c --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v2/utils.go @@ -0,0 +1,442 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package v2 + +import ( + "bufio" + "fmt" + "io" + "io/ioutil" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/godbus/dbus/v5" + + "github.com/containerd/cgroups/v2/stats" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +const ( + cgroupProcs = "cgroup.procs" + defaultDirPerm = 0755 +) + +// defaultFilePerm is a var so that the test framework can change the filemode +// of all files created when the tests are running. The difference between the +// tests and real world use is that files like "cgroup.procs" will exist when writing +// to a read cgroup filesystem and do not exist prior when running in the tests. +// this is set to a non 0 value in the test code +var defaultFilePerm = os.FileMode(0) + +// remove will remove a cgroup path handling EAGAIN and EBUSY errors and +// retrying the remove after a exp timeout +func remove(path string) error { + var err error + delay := 10 * time.Millisecond + for i := 0; i < 5; i++ { + if i != 0 { + time.Sleep(delay) + delay *= 2 + } + if err = os.RemoveAll(path); err == nil { + return nil + } + } + return errors.Wrapf(err, "cgroups: unable to remove path %q", path) +} + +// parseCgroupProcsFile parses /sys/fs/cgroup/$GROUPPATH/cgroup.procs +func parseCgroupProcsFile(path string) ([]uint64, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + var ( + out []uint64 + s = bufio.NewScanner(f) + ) + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.ParseUint(t, 10, 0) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + return out, nil +} + +func parseKV(raw string) (string, interface{}, error) { + parts := strings.Fields(raw) + switch len(parts) { + case 2: + v, err := parseUint(parts[1], 10, 64) + if err != nil { + // if we cannot parse as a uint, parse as a string + return parts[0], parts[1], nil + } + return parts[0], v, nil + default: + 
return "", 0, ErrInvalidFormat + } +} + +func readUint(path string) (uint64, error) { + v, err := ioutil.ReadFile(path) + if err != nil { + return 0, err + } + return parseUint(strings.TrimSpace(string(v)), 10, 64) +} + +func parseUint(s string, base, bitSize int) (uint64, error) { + v, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && + intErr.(*strconv.NumError).Err == strconv.ErrRange && + intValue < 0 { + return 0, nil + } + return 0, err + } + return v, nil +} + +// parseCgroupFile parses /proc/PID/cgroup file and return string +func parseCgroupFile(path string) (string, error) { + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + return parseCgroupFromReader(f) +} + +func parseCgroupFromReader(r io.Reader) (string, error) { + var ( + s = bufio.NewScanner(r) + ) + for s.Scan() { + if err := s.Err(); err != nil { + return "", err + } + var ( + text = s.Text() + parts = strings.SplitN(text, ":", 3) + ) + if len(parts) < 3 { + return "", fmt.Errorf("invalid cgroup entry: %q", text) + } + // text is like "0::/user.slice/user-1001.slice/session-1.scope" + if parts[0] == "0" && parts[1] == "" { + return parts[2], nil + } + } + return "", fmt.Errorf("cgroup path not found") +} + +// ToResources converts the oci LinuxResources struct into a +// v2 Resources type for use with this package. 
+// +// converting cgroups configuration from v1 to v2 +// ref: https://github.com/containers/crun/blob/master/crun.1.md#cgroup-v2 +func ToResources(spec *specs.LinuxResources) *Resources { + var resources Resources + if cpu := spec.CPU; cpu != nil { + resources.CPU = &CPU{ + Cpus: cpu.Cpus, + Mems: cpu.Mems, + } + if shares := cpu.Shares; shares != nil { + convertedWeight := (1 + ((*shares-2)*9999)/262142) + resources.CPU.Weight = &convertedWeight + } + if period := cpu.Period; period != nil { + resources.CPU.Max = NewCPUMax(cpu.Quota, period) + } + } + if mem := spec.Memory; mem != nil { + resources.Memory = &Memory{} + if swap := mem.Swap; swap != nil { + resources.Memory.Swap = swap + } + if l := mem.Limit; l != nil { + resources.Memory.Max = l + } + if l := mem.Reservation; l != nil { + resources.Memory.Low = l + } + } + if hugetlbs := spec.HugepageLimits; hugetlbs != nil { + hugeTlbUsage := HugeTlb{} + for _, hugetlb := range hugetlbs { + hugeTlbUsage = append(hugeTlbUsage, HugeTlbEntry{ + HugePageSize: hugetlb.Pagesize, + Limit: hugetlb.Limit, + }) + } + resources.HugeTlb = &hugeTlbUsage + } + if pids := spec.Pids; pids != nil { + resources.Pids = &Pids{ + Max: pids.Limit, + } + } + if i := spec.BlockIO; i != nil { + resources.IO = &IO{} + if i.Weight != nil { + resources.IO.BFQ.Weight = 1 + (*i.Weight-10)*9999/990 + } + for t, devices := range map[IOType][]specs.LinuxThrottleDevice{ + ReadBPS: i.ThrottleReadBpsDevice, + WriteBPS: i.ThrottleWriteBpsDevice, + ReadIOPS: i.ThrottleReadIOPSDevice, + WriteIOPS: i.ThrottleWriteIOPSDevice, + } { + for _, d := range devices { + resources.IO.Max = append(resources.IO.Max, Entry{ + Type: t, + Major: d.Major, + Minor: d.Minor, + Rate: d.Rate, + }) + } + } + } + if i := spec.Rdma; i != nil { + resources.RDMA = &RDMA{} + for device, value := range spec.Rdma { + if device != "" && (value.HcaHandles != nil || value.HcaObjects != nil) { + resources.RDMA.Limit = append(resources.RDMA.Limit, RDMAEntry{ + Device: device, + 
HcaHandles: *value.HcaHandles, + HcaObjects: *value.HcaObjects, + }) + } + } + } + + return &resources +} + +// Gets uint64 parsed content of single value cgroup stat file +func getStatFileContentUint64(filePath string) uint64 { + contents, err := ioutil.ReadFile(filePath) + if err != nil { + return 0 + } + trimmed := strings.TrimSpace(string(contents)) + if trimmed == "max" { + return math.MaxUint64 + } + + res, err := parseUint(trimmed, 10, 64) + if err != nil { + logrus.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), filePath) + return res + } + + return res +} + +func readIoStats(path string) []*stats.IOEntry { + // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt + var usage []*stats.IOEntry + fpath := filepath.Join(path, "io.stat") + currentData, err := ioutil.ReadFile(fpath) + if err != nil { + return usage + } + entries := strings.Split(string(currentData), "\n") + + for _, entry := range entries { + parts := strings.Split(entry, " ") + if len(parts) < 2 { + continue + } + majmin := strings.Split(parts[0], ":") + if len(majmin) != 2 { + continue + } + major, err := strconv.ParseUint(majmin[0], 10, 0) + if err != nil { + return usage + } + minor, err := strconv.ParseUint(majmin[1], 10, 0) + if err != nil { + return usage + } + parts = parts[1:] + ioEntry := stats.IOEntry{ + Major: major, + Minor: minor, + } + for _, stats := range parts { + keyPairValue := strings.Split(stats, "=") + if len(keyPairValue) != 2 { + continue + } + v, err := strconv.ParseUint(keyPairValue[1], 10, 0) + if err != nil { + continue + } + switch keyPairValue[0] { + case "rbytes": + ioEntry.Rbytes = v + case "wbytes": + ioEntry.Wbytes = v + case "rios": + ioEntry.Rios = v + case "wios": + ioEntry.Wios = v + } + } + usage = append(usage, &ioEntry) + } + return usage +} + +func rdmaStats(filepath string) []*stats.RdmaEntry { + currentData, err := ioutil.ReadFile(filepath) + if err != nil { + return 
[]*stats.RdmaEntry{} + } + return toRdmaEntry(strings.Split(string(currentData), "\n")) +} + +func parseRdmaKV(raw string, entry *stats.RdmaEntry) { + var value uint64 + var err error + + parts := strings.Split(raw, "=") + switch len(parts) { + case 2: + if parts[1] == "max" { + value = math.MaxUint32 + } else { + value, err = parseUint(parts[1], 10, 32) + if err != nil { + return + } + } + if parts[0] == "hca_handle" { + entry.HcaHandles = uint32(value) + } else if parts[0] == "hca_object" { + entry.HcaObjects = uint32(value) + } + } +} + +func toRdmaEntry(strEntries []string) []*stats.RdmaEntry { + var rdmaEntries []*stats.RdmaEntry + for i := range strEntries { + parts := strings.Fields(strEntries[i]) + switch len(parts) { + case 3: + entry := new(stats.RdmaEntry) + entry.Device = parts[0] + parseRdmaKV(parts[1], entry) + parseRdmaKV(parts[2], entry) + + rdmaEntries = append(rdmaEntries, entry) + default: + continue + } + } + return rdmaEntries +} + +// isUnitExists returns true if the error is that a systemd unit already exists. 
+func isUnitExists(err error) bool { + if err != nil { + if dbusError, ok := err.(dbus.Error); ok { + return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists") + } + } + return false +} + +func systemdUnitFromPath(path string) string { + _, unit := filepath.Split(path) + return unit +} + +func readHugeTlbStats(path string) []*stats.HugeTlbStat { + var usage = []*stats.HugeTlbStat{} + var keyUsage = make(map[string]*stats.HugeTlbStat) + f, err := os.Open(path) + if err != nil { + return usage + } + files, err := f.Readdir(-1) + f.Close() + if err != nil { + return usage + } + + for _, file := range files { + if strings.Contains(file.Name(), "hugetlb") && + (strings.HasSuffix(file.Name(), "max") || strings.HasSuffix(file.Name(), "current")) { + var hugeTlb *stats.HugeTlbStat + var ok bool + fileName := strings.Split(file.Name(), ".") + pageSize := fileName[1] + if hugeTlb, ok = keyUsage[pageSize]; !ok { + hugeTlb = &stats.HugeTlbStat{} + } + hugeTlb.Pagesize = pageSize + out, err := ioutil.ReadFile(filepath.Join(path, file.Name())) + if err != nil { + continue + } + var value uint64 + stringVal := strings.TrimSpace(string(out)) + if stringVal == "max" { + value = math.MaxUint64 + } else { + value, err = strconv.ParseUint(stringVal, 10, 64) + } + if err != nil { + continue + } + switch fileName[2] { + case "max": + hugeTlb.Max = value + case "current": + hugeTlb.Current = value + } + keyUsage[pageSize] = hugeTlb + } + } + for _, entry := range keyUsage { + usage = append(usage, entry) + } + return usage +}