Merge pull request #42143 from thaJeztah/check_libcontainer

vendor: github.com/opencontainers/runc v1.0.0-rc95
Akihiro Suda 4 years ago
parent commit 33c332ad19
100 changed files with 11377 additions and 2052 deletions
  1. oci/devices_linux.go (+3 -4)
  2. oci/devices_linux_test.go (+2 -2)
  3. oci/devices_unsupported.go (+0 -20)
  4. vendor.conf (+5 -5)
  5. vendor/github.com/cilium/ebpf/README.md (+62 -0)
  6. vendor/github.com/cilium/ebpf/abi.go (+0 -206)
  7. vendor/github.com/cilium/ebpf/asm/func.go (+1 -1)
  8. vendor/github.com/cilium/ebpf/asm/instruction.go (+84 -64)
  9. vendor/github.com/cilium/ebpf/asm/opcode.go (+3 -3)
  10. vendor/github.com/cilium/ebpf/collection.go (+346 -51)
  11. vendor/github.com/cilium/ebpf/doc.go (+1 -2)
  12. vendor/github.com/cilium/ebpf/elf_reader.go (+456 -265)
  13. vendor/github.com/cilium/ebpf/elf_reader_fuzz.go (+21 -0)
  14. vendor/github.com/cilium/ebpf/examples/README.md (+6 -0)
  15. vendor/github.com/cilium/ebpf/examples/go.mod (+9 -0)
  16. vendor/github.com/cilium/ebpf/examples/headers/bpf_helper_defs.h (+3265 -0)
  17. vendor/github.com/cilium/ebpf/examples/headers/bpf_helpers.h (+80 -0)
  18. vendor/github.com/cilium/ebpf/examples/headers/common.h (+107 -0)
  19. vendor/github.com/cilium/ebpf/examples/kprobe/bpf/kprobe_example.c (+26 -0)
  20. vendor/github.com/cilium/ebpf/examples/uprobe/bpf/uprobe_example.c (+25 -0)
  21. vendor/github.com/cilium/ebpf/go.mod (+6 -2)
  22. vendor/github.com/cilium/ebpf/info.go (+239 -0)
  23. vendor/github.com/cilium/ebpf/internal/btf/btf.go (+175 -100)
  24. vendor/github.com/cilium/ebpf/internal/btf/btf_types.go (+17 -5)
  25. vendor/github.com/cilium/ebpf/internal/btf/core.go (+388 -0)
  26. vendor/github.com/cilium/ebpf/internal/btf/ext_info.go (+126 -27)
  27. vendor/github.com/cilium/ebpf/internal/btf/fuzz.go (+49 -0)
  28. vendor/github.com/cilium/ebpf/internal/btf/types.go (+360 -76)
  29. vendor/github.com/cilium/ebpf/internal/elf.go (+52 -0)
  30. vendor/github.com/cilium/ebpf/internal/feature.go (+30 -52)
  31. vendor/github.com/cilium/ebpf/internal/pinning.go (+44 -0)
  32. vendor/github.com/cilium/ebpf/internal/ptr.go (+10 -5)
  33. vendor/github.com/cilium/ebpf/internal/syscall.go (+43 -2)
  34. vendor/github.com/cilium/ebpf/internal/unix/types_linux.go (+61 -10)
  35. vendor/github.com/cilium/ebpf/internal/unix/types_other.go (+52 -9)
  36. vendor/github.com/cilium/ebpf/internal/version.go (+163 -0)
  37. vendor/github.com/cilium/ebpf/linker.go (+47 -0)
  38. vendor/github.com/cilium/ebpf/map.go (+558 -151)
  39. vendor/github.com/cilium/ebpf/marshalers.go (+23 -8)
  40. vendor/github.com/cilium/ebpf/prog.go (+240 -147)
  41. vendor/github.com/cilium/ebpf/readme.md (+0 -25)
  42. vendor/github.com/cilium/ebpf/syscalls.go (+142 -89)
  43. vendor/github.com/cilium/ebpf/types.go (+81 -33)
  44. vendor/github.com/cilium/ebpf/types_string.go (+36 -5)
  45. vendor/github.com/opencontainers/runc/README.md (+25 -13)
  46. vendor/github.com/opencontainers/runc/go.mod (+16 -14)
  47. vendor/github.com/opencontainers/runc/libcontainer/README.md (+87 -83)
  48. vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go (+23 -13)
  49. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go (+51 -0)
  50. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go (+120 -0)
  51. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go (+122 -0)
  52. vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go (+28 -0)
  53. vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go (+115 -42)
  54. vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go (+41 -59)
  55. vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go (+5 -7)
  56. vendor/github.com/opencontainers/runc/libcontainer/configs/config.go (+15 -10)
  57. vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go (+9 -0)
  58. vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go (+0 -16)
  59. vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go (+0 -5)
  60. vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go (+17 -0)
  61. vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go (+1 -1)
  62. vendor/github.com/opencontainers/runc/libcontainer/devices/device.go (+33 -29)
  63. vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go (+22 -14)
  64. vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c (+51 -29)
  65. vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c (+142 -0)
  66. vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c (+222 -139)
  67. vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c (+1 -0)
  68. vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go (+53 -0)
  69. vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go (+0 -41)
  70. vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go (+16 -4)
  71. vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go (+0 -40)
  72. vendor/github.com/opencontainers/runc/libcontainer/user/user.go (+10 -42)
  73. vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go (+42 -0)
  74. vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go (+5 -0)
  75. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go (+15 -0)
  76. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go (+37 -0)
  77. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go (+17 -0)
  78. vendor/github.com/opencontainers/runtime-spec/README.md (+1 -1)
  79. vendor/github.com/opencontainers/runtime-spec/specs-go/config.go (+16 -7)
  80. vendor/github.com/opencontainers/runtime-spec/specs-go/state.go (+25 -4)
  81. vendor/golang.org/x/net/README.md (+2 -0)
  82. vendor/golang.org/x/net/go.mod (+3 -3)
  83. vendor/golang.org/x/net/http2/server.go (+8 -4)
  84. vendor/golang.org/x/net/http2/transport.go (+34 -6)
  85. vendor/golang.org/x/net/idna/tables12.0.0.go (+1 -1)
  86. vendor/golang.org/x/net/idna/tables13.0.0.go (+2394 -0)
  87. vendor/golang.org/x/net/internal/socket/cmsghdr.go (+1 -1)
  88. vendor/golang.org/x/net/internal/socket/cmsghdr_stub.go (+13 -3)
  89. vendor/golang.org/x/net/internal/socket/cmsghdr_unix.go (+21 -0)
  90. vendor/golang.org/x/net/internal/socket/cmsghdr_zos_s390x.go (+25 -0)
  91. vendor/golang.org/x/net/internal/socket/error_unix.go (+1 -1)
  92. vendor/golang.org/x/net/internal/socket/iovec_64bit.go (+1 -1)
  93. vendor/golang.org/x/net/internal/socket/iovec_stub.go (+1 -1)
  94. vendor/golang.org/x/net/internal/socket/msghdr_stub.go (+1 -1)
  95. vendor/golang.org/x/net/internal/socket/msghdr_zos_s390x.go (+36 -0)
  96. vendor/golang.org/x/net/internal/socket/rawconn_msg.go (+4 -3)
  97. vendor/golang.org/x/net/internal/socket/rawconn_nomsg.go (+1 -1)
  98. vendor/golang.org/x/net/internal/socket/socket.go (+1 -9)
  99. vendor/golang.org/x/net/internal/socket/sys.go (+2 -12)
  100. vendor/golang.org/x/net/internal/socket/sys_bsdvar.go (+0 -23)

+ 3 - 4
oci/devices_linux.go

@@ -6,14 +6,13 @@ import (
 	"path/filepath"
 	"path/filepath"
 	"strings"
 	"strings"
 
 
-	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runc/libcontainer/devices"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 	"golang.org/x/sys/unix"
 )
 )
 
 
-// Device transforms a libcontainer configs.Device to a specs.LinuxDevice object.
-func Device(d *configs.Device) specs.LinuxDevice {
+// Device transforms a libcontainer devices.Device to a specs.LinuxDevice object.
+func Device(d *devices.Device) specs.LinuxDevice {
 	return specs.LinuxDevice{
 	return specs.LinuxDevice{
 		Type:     string(d.Type),
 		Type:     string(d.Type),
 		Path:     d.Path,
 		Path:     d.Path,
@@ -25,7 +24,7 @@ func Device(d *configs.Device) specs.LinuxDevice {
 	}
 	}
 }
 }
 
 
-func deviceCgroup(d *configs.Device) specs.LinuxDeviceCgroup {
+func deviceCgroup(d *devices.Device) specs.LinuxDeviceCgroup {
 	return specs.LinuxDeviceCgroup{
 	return specs.LinuxDeviceCgroup{
 		Allow:  true,
 		Allow:  true,
 		Type:   string(d.Type),
 		Type:   string(d.Type),
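
The type change above tracks runc's move of the device types from `libcontainer/configs` to `libcontainer/devices`. A minimal sketch of a call site after this bump, assuming the vendored rc95 layout where `devices.Device` embeds `devices.Rule` (the /dev/null values are illustrative, not taken from this diff):

```go
package main

import (
	"fmt"
	"os"

	"github.com/docker/docker/oci"
	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	// Illustrative device definition; field names follow the vendored
	// runc v1.0.0-rc95 devices package.
	d := &devices.Device{
		Rule: devices.Rule{
			Type:  devices.CharDevice,
			Major: 1,
			Minor: 3,
		},
		Path:     "/dev/null",
		FileMode: os.FileMode(0666),
	}
	fmt.Printf("%+v\n", oci.Device(d))
}
```

Since the `!linux` stub in oci/devices_unsupported.go is deleted below, this helper now only compiles on Linux.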

+ 2 - 2
oci/devices_linux_test.go

@@ -4,7 +4,7 @@ import (
 	"os"
 	"os"
 	"testing"
 	"testing"
 
 
-	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/devices"
 	"golang.org/x/sys/unix"
 	"golang.org/x/sys/unix"
 	"gotest.tools/v3/assert"
 	"gotest.tools/v3/assert"
 )
 )
@@ -24,7 +24,7 @@ func TestDeviceMode(t *testing.T) {
 	for _, tc := range tests {
 	for _, tc := range tests {
 		tc := tc
 		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
-			d := Device(&configs.Device{FileMode: tc.in})
+			d := Device(&devices.Device{FileMode: tc.in})
 			assert.Equal(t, *d.FileMode, tc.out)
 			assert.Equal(t, *d.FileMode, tc.out)
 		})
 		})
 	}
 	}

+ 0 - 20
oci/devices_unsupported.go

@@ -1,20 +0,0 @@
-// +build !linux
-
-package oci // import "github.com/docker/docker/oci"
-
-import (
-	"errors"
-
-	"github.com/opencontainers/runc/libcontainer/configs"
-	specs "github.com/opencontainers/runtime-spec/specs-go"
-)
-
-// Device transforms a libcontainer configs.Device to a specs.Device object.
-// Not implemented
-func Device(d *configs.Device) specs.LinuxDevice { return specs.LinuxDevice{} }
-
-// DevicesFromPath computes a list of devices and device permissions from paths (pathOnHost and pathInContainer) and cgroup permissions.
-// Not implemented
-func DevicesFromPath(pathOnHost, pathInContainer, cgroupPermissions string) (devs []specs.LinuxDevice, devPermissions []specs.LinuxDeviceCgroup, err error) {
-	return nil, nil, errors.New("oci/devices: unsupported platform")
-}

+ 5 - 5
vendor.conf

@@ -19,8 +19,8 @@ github.com/moby/sys                                 b0f1fd7235275d01bd35cc4421e8
 github.com/creack/pty                               2a38352e8b4d7ab6c336eef107e42a55e72e7fbc # v1.1.11
 github.com/sirupsen/logrus                          6699a89a232f3db797f2e280639854bbc4b89725 # v1.7.0
 github.com/tchap/go-patricia                        a7f0089c6f496e8e70402f61733606daa326cac5 # v2.3.0
-golang.org/x/net                                    ab34263943818b32f575efc978a3d24e80b04bd7
-golang.org/x/sys                                    b64e53b001e413bd5067f36d4e439eded3827374
+golang.org/x/net                                    6772e930b67bb09bf22262c7378e7d2f67cf59d1
+golang.org/x/sys                                    d19ff857e887eacb631721f188c7d365c2331456
 github.com/docker/go-units                          519db1ee28dcc9fd2474ae59fca29a810482bfb1 # v0.4.0
 github.com/docker/go-connections                    7395e3f8aa162843a74ed6d48e79627d9792ac55 # v0.4.0
 golang.org/x/text                                   23ae387dee1f90d29a23c0e87ee0b46038fbed0e # v0.3.3
@@ -92,8 +92,8 @@ google.golang.org/grpc                              f495f5b15ae7ccda3b38c53a1bfc
 # the containerd project first, and update both after that is merged.
 # This commit does not need to match RUNC_COMMIT as it is used for helper
 # packages but should be newer or equal.
-github.com/opencontainers/runc                      ff819c7e9184c13b7c2607fe6c30ae19403a7aff # v1.0.0-rc92
-github.com/opencontainers/runtime-spec              4d89ac9fbff6c455f46a5bb59c6b1bb7184a5e43 # v1.0.3-0.20200728170252-4d89ac9fbff6
+github.com/opencontainers/runc                      b9ee9c6314599f1b4a7f497e1f1f856fe433d3b7 # v1.0.0-rc95
+github.com/opencontainers/runtime-spec              1c3f411f041711bbeecf35ff7e93461ea6789220 # v1.0.3-0.20210326190908-1c3f411f0417
 github.com/opencontainers/image-spec                d60099175f88c47cd379c4738d158884749ed235 # v1.0.1
 github.com/cyphar/filepath-securejoin               a261ee33d7a517f054effbf451841abaafe3e0fd # v0.2.2
 
@@ -141,7 +141,7 @@ github.com/containerd/go-runc                       16b287bc67d069a60fa48db15f33
 github.com/containerd/typeurl                       cd3ce7159eae562a4f60ceff37dada11a939d247 # v1.0.1
 github.com/containerd/ttrpc                         bfba540dc45464586c106b1f31c8547933c1eb41 # v1.0.2
 github.com/gogo/googleapis                          01e0f9cca9b92166042241267ee2a5cdf5cff46c # v1.3.2
-github.com/cilium/ebpf                              1c8d4c9ef7759622653a1d319284a44652333b28
+github.com/cilium/ebpf                              ef54c303d1fff1e80a9bf20f00a378fde5419d61 # v0.5.0
 github.com/klauspost/compress                       a3b7545c88eea469c2246bee0e6c130525d56190 # v1.11.13
 github.com/pelletier/go-toml                        65ca8064882c8c308e5c804c5d5443d409e0738c # v1.8.1
 

+ 62 - 0
vendor/github.com/cilium/ebpf/README.md

@@ -0,0 +1,62 @@
+# eBPF
+
+[![PkgGoDev](https://pkg.go.dev/badge/github.com/cilium/ebpf)](https://pkg.go.dev/github.com/cilium/ebpf)
+
+eBPF is a pure Go library that provides utilities for loading, compiling, and
+debugging eBPF programs. It has minimal external dependencies and is intended to
+be used in long running processes.
+
+* [asm](https://pkg.go.dev/github.com/cilium/ebpf/asm) contains a basic
+  assembler
+* [link](https://pkg.go.dev/github.com/cilium/ebpf/link) allows attaching eBPF
+  to various hooks
+* [perf](https://pkg.go.dev/github.com/cilium/ebpf/perf) allows reading from a
+  `PERF_EVENT_ARRAY`
+* [cmd/bpf2go](https://pkg.go.dev/github.com/cilium/ebpf/cmd/bpf2go) allows
+  compiling and embedding eBPF programs in Go code
+
+The library is maintained by [Cloudflare](https://www.cloudflare.com) and
+[Cilium](https://www.cilium.io). Feel free to
+[join](https://cilium.herokuapp.com/) the
+[#libbpf-go](https://cilium.slack.com/messages/libbpf-go) channel on Slack.
+
+## Current status
+
+The package is production ready, but **the API is explicitly unstable right
+now**. Expect to update your code if you want to follow along.
+
+## Getting Started
+
+A small collection of Go and eBPF programs that serve as examples for building
+your own tools can be found under [examples/](examples/).
+
+Contributions are highly encouraged, as they highlight certain use cases of
+eBPF and the library, and help shape the future of the project.
+
+## Requirements
+
+* A version of Go that is [supported by
+  upstream](https://golang.org/doc/devel/release.html#policy)
+* Linux 4.9, 4.19 or 5.4 (versions in-between should work, but are not tested)
+
+## Useful resources
+
+* [eBPF.io](https://ebpf.io) (recommended)
+* [Cilium eBPF documentation](https://docs.cilium.io/en/latest/bpf/#bpf-guide)
+  (recommended)
+* [Linux documentation on
+  BPF](https://www.kernel.org/doc/html/latest/networking/filter.html)
+* [eBPF features by Linux
+  version](https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md)
+
+## Regenerating Testdata
+
+Run `make` in the root of this repository to rebuild testdata in all
+subpackages. This requires Docker, as it relies on a standardized build
+environment to keep the build output stable.
+
+The toolchain image build files are kept in [testdata/docker/](testdata/docker/).
+
+## License
+
+MIT
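
For orientation while reviewing the rest of this vendor bump: loading a pre-compiled object with the vendored v0.5.0 API looks roughly like the sketch below. The object path and program name are placeholders, not files in this PR.

```go
package main

import (
	"log"

	"github.com/cilium/ebpf"
)

func main() {
	// Parse a clang-compiled ELF object. "program.o" and "xdp_prog"
	// are placeholder names.
	spec, err := ebpf.LoadCollectionSpec("program.o")
	if err != nil {
		log.Fatal(err)
	}

	// Create the maps and programs in the kernel.
	coll, err := ebpf.NewCollection(spec)
	if err != nil {
		log.Fatal(err)
	}
	defer coll.Close()

	prog := coll.Programs["xdp_prog"]
	if prog == nil {
		log.Fatal("program xdp_prog not found in object")
	}
	log.Printf("loaded program with fd %d", prog.FD())
}
```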

+ 0 - 206
vendor/github.com/cilium/ebpf/abi.go

@@ -1,206 +0,0 @@
-package ebpf
-
-import (
-	"bufio"
-	"bytes"
-	"errors"
-	"fmt"
-	"io"
-	"os"
-	"syscall"
-
-	"github.com/cilium/ebpf/internal"
-)
-
-// MapABI are the attributes of a Map which are available across all supported kernels.
-type MapABI struct {
-	Type       MapType
-	KeySize    uint32
-	ValueSize  uint32
-	MaxEntries uint32
-	Flags      uint32
-}
-
-func newMapABIFromSpec(spec *MapSpec) *MapABI {
-	return &MapABI{
-		spec.Type,
-		spec.KeySize,
-		spec.ValueSize,
-		spec.MaxEntries,
-		spec.Flags,
-	}
-}
-
-func newMapABIFromFd(fd *internal.FD) (string, *MapABI, error) {
-	info, err := bpfGetMapInfoByFD(fd)
-	if err != nil {
-		if errors.Is(err, syscall.EINVAL) {
-			abi, err := newMapABIFromProc(fd)
-			return "", abi, err
-		}
-		return "", nil, err
-	}
-
-	return "", &MapABI{
-		MapType(info.mapType),
-		info.keySize,
-		info.valueSize,
-		info.maxEntries,
-		info.flags,
-	}, nil
-}
-
-func newMapABIFromProc(fd *internal.FD) (*MapABI, error) {
-	var abi MapABI
-	err := scanFdInfo(fd, map[string]interface{}{
-		"map_type":    &abi.Type,
-		"key_size":    &abi.KeySize,
-		"value_size":  &abi.ValueSize,
-		"max_entries": &abi.MaxEntries,
-		"map_flags":   &abi.Flags,
-	})
-	if err != nil {
-		return nil, err
-	}
-	return &abi, nil
-}
-
-// Equal returns true if two ABIs have the same values.
-func (abi *MapABI) Equal(other *MapABI) bool {
-	switch {
-	case abi.Type != other.Type:
-		return false
-	case abi.KeySize != other.KeySize:
-		return false
-	case abi.ValueSize != other.ValueSize:
-		return false
-	case abi.MaxEntries != other.MaxEntries:
-		return false
-	case abi.Flags != other.Flags:
-		return false
-	default:
-		return true
-	}
-}
-
-// ProgramABI are the attributes of a Program which are available across all supported kernels.
-type ProgramABI struct {
-	Type ProgramType
-}
-
-func newProgramABIFromSpec(spec *ProgramSpec) *ProgramABI {
-	return &ProgramABI{
-		spec.Type,
-	}
-}
-
-func newProgramABIFromFd(fd *internal.FD) (string, *ProgramABI, error) {
-	info, err := bpfGetProgInfoByFD(fd)
-	if err != nil {
-		if errors.Is(err, syscall.EINVAL) {
-			return newProgramABIFromProc(fd)
-		}
-
-		return "", nil, err
-	}
-
-	var name string
-	if bpfName := internal.CString(info.name[:]); bpfName != "" {
-		name = bpfName
-	} else {
-		name = internal.CString(info.tag[:])
-	}
-
-	return name, &ProgramABI{
-		Type: ProgramType(info.progType),
-	}, nil
-}
-
-func newProgramABIFromProc(fd *internal.FD) (string, *ProgramABI, error) {
-	var (
-		abi  ProgramABI
-		name string
-	)
-
-	err := scanFdInfo(fd, map[string]interface{}{
-		"prog_type": &abi.Type,
-		"prog_tag":  &name,
-	})
-	if errors.Is(err, errMissingFields) {
-		return "", nil, &internal.UnsupportedFeatureError{
-			Name:           "reading ABI from /proc/self/fdinfo",
-			MinimumVersion: internal.Version{4, 11, 0},
-		}
-	}
-	if err != nil {
-		return "", nil, err
-	}
-
-	return name, &abi, nil
-}
-
-func scanFdInfo(fd *internal.FD, fields map[string]interface{}) error {
-	raw, err := fd.Value()
-	if err != nil {
-		return err
-	}
-
-	fh, err := os.Open(fmt.Sprintf("/proc/self/fdinfo/%d", raw))
-	if err != nil {
-		return err
-	}
-	defer fh.Close()
-
-	if err := scanFdInfoReader(fh, fields); err != nil {
-		return fmt.Errorf("%s: %w", fh.Name(), err)
-	}
-	return nil
-}
-
-var errMissingFields = errors.New("missing fields")
-
-func scanFdInfoReader(r io.Reader, fields map[string]interface{}) error {
-	var (
-		scanner = bufio.NewScanner(r)
-		scanned int
-	)
-
-	for scanner.Scan() {
-		parts := bytes.SplitN(scanner.Bytes(), []byte("\t"), 2)
-		if len(parts) != 2 {
-			continue
-		}
-
-		name := bytes.TrimSuffix(parts[0], []byte(":"))
-		field, ok := fields[string(name)]
-		if !ok {
-			continue
-		}
-
-		if n, err := fmt.Fscanln(bytes.NewReader(parts[1]), field); err != nil || n != 1 {
-			return fmt.Errorf("can't parse field %s: %v", name, err)
-		}
-
-		scanned++
-	}
-
-	if err := scanner.Err(); err != nil {
-		return err
-	}
-
-	if scanned != len(fields) {
-		return errMissingFields
-	}
-
-	return nil
-}
-
-// Equal returns true if two ABIs have the same values.
-func (abi *ProgramABI) Equal(other *ProgramABI) bool {
-	switch {
-	case abi.Type != other.Type:
-		return false
-	default:
-		return true
-	}
-}

+ 1 - 1
vendor/github.com/cilium/ebpf/asm/func.go

@@ -7,7 +7,7 @@ type BuiltinFunc int32
 
 // eBPF built-in functions
 //
-// You can renegerate this list using the following gawk script:
+// You can regenerate this list using the following gawk script:
 //
 //    /FN\(.+\),/ {
 //      match($1, /\((.+)\)/, r)

+ 84 - 64
vendor/github.com/cilium/ebpf/asm/instruction.go

@@ -1,17 +1,29 @@
 package asm
 
 import (
+	"crypto/sha1"
 	"encoding/binary"
+	"encoding/hex"
 	"errors"
 	"fmt"
 	"io"
 	"math"
 	"strings"
+
+	"github.com/cilium/ebpf/internal/unix"
 )
 
 // InstructionSize is the size of a BPF instruction in bytes
 const InstructionSize = 8
 
+// RawInstructionOffset is an offset in units of raw BPF instructions.
+type RawInstructionOffset uint64
+
+// Bytes returns the offset of an instruction in bytes.
+func (rio RawInstructionOffset) Bytes() uint64 {
+	return uint64(rio) * InstructionSize
+}
+
 // Instruction is a single eBPF instruction.
 type Instruction struct {
 	OpCode    OpCode
@@ -151,10 +163,20 @@ func (ins *Instruction) mapOffset() uint32 {
 	return uint32(uint64(ins.Constant) >> 32)
 }
 
+// isLoadFromMap returns true if the instruction loads from a map.
+//
+// This covers both loading the map pointer and direct map value loads.
 func (ins *Instruction) isLoadFromMap() bool {
 	return ins.OpCode == LoadImmOp(DWord) && (ins.Src == PseudoMapFD || ins.Src == PseudoMapValue)
 }
 
+// IsFunctionCall returns true if the instruction calls another BPF function.
+//
+// This is not the same thing as a BPF helper call.
+func (ins *Instruction) IsFunctionCall() bool {
+	return ins.OpCode.JumpOp() == Call && ins.Src == PseudoCall
+}
+
 // Format implements fmt.Formatter.
 func (ins Instruction) Format(f fmt.State, c rune) {
 	if c != 'v' {
@@ -310,34 +332,12 @@ func (insns Instructions) ReferenceOffsets() map[string][]int {
 	return offsets
 }
 
-func (insns Instructions) marshalledOffsets() (map[string]int, error) {
-	symbols := make(map[string]int)
-
-	marshalledPos := 0
-	for _, ins := range insns {
-		currentPos := marshalledPos
-		marshalledPos += ins.OpCode.marshalledInstructions()
-
-		if ins.Symbol == "" {
-			continue
-		}
-
-		if _, ok := symbols[ins.Symbol]; ok {
-			return nil, fmt.Errorf("duplicate symbol %s", ins.Symbol)
-		}
-
-		symbols[ins.Symbol] = currentPos
-	}
-
-	return symbols, nil
-}
-
 // Format implements fmt.Formatter.
 //
 // You can control indentation of symbols by
 // specifying a width. Setting a precision controls the indentation of
 // instructions.
-// The default character is a tab, which can be overriden by specifying
+// The default character is a tab, which can be overridden by specifying
 // the ' ' space flag.
 func (insns Instructions) Format(f fmt.State, c rune) {
 	if c != 's' && c != 'v' {
@@ -370,63 +370,83 @@ func (insns Instructions) Format(f fmt.State, c rune) {
 		symIndent = strings.Repeat(" ", symPadding)
 	}
 
-	// Figure out how many digits we need to represent the highest
-	// offset.
-	highestOffset := 0
-	for _, ins := range insns {
-		highestOffset += ins.OpCode.marshalledInstructions()
-	}
+	// Guess how many digits we need at most, by assuming that all instructions
+	// are double wide.
+	highestOffset := len(insns) * 2
 	offsetWidth := int(math.Ceil(math.Log10(float64(highestOffset))))
 
-	offset := 0
-	for _, ins := range insns {
-		if ins.Symbol != "" {
-			fmt.Fprintf(f, "%s%s:\n", symIndent, ins.Symbol)
+	iter := insns.Iterate()
+	for iter.Next() {
+		if iter.Ins.Symbol != "" {
+			fmt.Fprintf(f, "%s%s:\n", symIndent, iter.Ins.Symbol)
 		}
-		fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, offset, ins)
-		offset += ins.OpCode.marshalledInstructions()
+		fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, iter.Offset, iter.Ins)
 	}
-
-	return
 }
 
 // Marshal encodes a BPF program into the kernel format.
 func (insns Instructions) Marshal(w io.Writer, bo binary.ByteOrder) error {
-	absoluteOffsets, err := insns.marshalledOffsets()
-	if err != nil {
-		return err
+	for i, ins := range insns {
+		_, err := ins.Marshal(w, bo)
+		if err != nil {
+			return fmt.Errorf("instruction %d: %w", i, err)
+		}
 	}
+	return nil
+}
 
-	num := 0
+// Tag calculates the kernel tag for a series of instructions.
+//
+// It mirrors bpf_prog_calc_tag in the kernel and so can be compared
+// to ProgramInfo.Tag to figure out whether a loaded program matches
+// certain instructions.
+func (insns Instructions) Tag(bo binary.ByteOrder) (string, error) {
+	h := sha1.New()
 	for i, ins := range insns {
-		switch {
-		case ins.OpCode.JumpOp() == Call && ins.Src == PseudoCall && ins.Constant == -1:
-			// Rewrite bpf to bpf call
-			offset, ok := absoluteOffsets[ins.Reference]
-			if !ok {
-				return fmt.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference)
-			}
-
-			ins.Constant = int64(offset - num - 1)
-
-		case ins.OpCode.Class() == JumpClass && ins.Offset == -1:
-			// Rewrite jump to label
-			offset, ok := absoluteOffsets[ins.Reference]
-			if !ok {
-				return fmt.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference)
-			}
-
-			ins.Offset = int16(offset - num - 1)
+		if ins.isLoadFromMap() {
+			ins.Constant = 0
 		}
-
-		n, err := ins.Marshal(w, bo)
+		_, err := ins.Marshal(h, bo)
 		if err != nil {
-			return fmt.Errorf("instruction %d: %w", i, err)
+			return "", fmt.Errorf("instruction %d: %w", i, err)
 		}
+	}
+	return hex.EncodeToString(h.Sum(nil)[:unix.BPF_TAG_SIZE]), nil
+}
+
+// Iterate allows iterating a BPF program while keeping track of
+// various offsets.
+//
+// Modifying the instruction slice will lead to undefined behaviour.
+func (insns Instructions) Iterate() *InstructionIterator {
+	return &InstructionIterator{insns: insns}
+}
+
+// InstructionIterator iterates over a BPF program.
+type InstructionIterator struct {
+	insns Instructions
+	// The instruction in question.
+	Ins *Instruction
+	// The index of the instruction in the original instruction slice.
+	Index int
+	// The offset of the instruction in raw BPF instructions. This accounts
+	// for double-wide instructions.
+	Offset RawInstructionOffset
+}
 
-		num += int(n / InstructionSize)
+// Next returns true as long as there are any instructions remaining.
+func (iter *InstructionIterator) Next() bool {
+	if len(iter.insns) == 0 {
+		return false
 	}
-	return nil
+
+	if iter.Ins != nil {
+		iter.Index++
+		iter.Offset += RawInstructionOffset(iter.Ins.OpCode.rawInstructions())
+	}
+	iter.Ins = &iter.insns[0]
+	iter.insns = iter.insns[1:]
+	return true
 }
 
 type bpfInstruction struct {
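
The Iterate and Tag APIs added above can be exercised roughly as follows; a sketch in which the two-instruction program is a placeholder:

```go
package main

import (
	"encoding/binary"
	"fmt"

	"github.com/cilium/ebpf/asm"
)

func main() {
	// A trivial placeholder program: r0 = 0; return.
	insns := asm.Instructions{
		asm.Mov.Imm(asm.R0, 0),
		asm.Return(),
	}

	// Iterate tracks the raw instruction offset, which diverges from the
	// slice index once double-wide (dword load) instructions appear.
	iter := insns.Iterate()
	for iter.Next() {
		fmt.Printf("index %d, raw offset %d: %v\n", iter.Index, iter.Offset, iter.Ins)
	}

	// Tag mirrors the kernel's bpf_prog_calc_tag, so the result can be
	// compared with the tag of an already-loaded program.
	tag, err := insns.Tag(binary.LittleEndian)
	if err != nil {
		panic(err)
	}
	fmt.Println("tag:", tag)
}
```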

+ 3 - 3
vendor/github.com/cilium/ebpf/asm/opcode.go

@@ -66,10 +66,10 @@ type OpCode uint8
 // InvalidOpCode is returned by setters on OpCode
 const InvalidOpCode OpCode = 0xff
 
-// marshalledInstructions returns the number of BPF instructions required
+// rawInstructions returns the number of BPF instructions required
 // to encode this opcode.
-func (op OpCode) marshalledInstructions() int {
-	if op == LoadImmOp(DWord) {
+func (op OpCode) rawInstructions() int {
+	if op.isDWordLoad() {
 		return 2
 	}
 	return 1

+ 346 - 51
vendor/github.com/cilium/ebpf/collection.go

@@ -4,6 +4,8 @@ import (
 	"errors"
 	"errors"
 	"fmt"
 	"fmt"
 	"math"
 	"math"
+	"reflect"
+	"strings"
 
 
 	"github.com/cilium/ebpf/asm"
 	"github.com/cilium/ebpf/asm"
 	"github.com/cilium/ebpf/internal"
 	"github.com/cilium/ebpf/internal"
@@ -11,7 +13,10 @@ import (
 )
 )
 
 
 // CollectionOptions control loading a collection into the kernel.
 // CollectionOptions control loading a collection into the kernel.
+//
+// Maps and Programs are passed to NewMapWithOptions and NewProgramsWithOptions.
 type CollectionOptions struct {
 type CollectionOptions struct {
+	Maps     MapOptions
 	Programs ProgramOptions
 	Programs ProgramOptions
 }
 }
 
 
@@ -126,6 +131,106 @@ func (cs *CollectionSpec) RewriteConstants(consts map[string]interface{}) error
 	return nil
 	return nil
 }
 }
 
 
+// Assign the contents of a CollectionSpec to a struct.
+//
+// This function is a short-cut to manually checking the presence
+// of maps and programs in a collection spec. Consider using bpf2go if this
+// sounds useful.
+//
+// The argument to must be a pointer to a struct. A field of the
+// struct is updated with values from Programs or Maps if it
+// has an `ebpf` tag and its type is *ProgramSpec or *MapSpec.
+// The tag gives the name of the program or map as found in
+// the CollectionSpec.
+//
+//    struct {
+//        Foo     *ebpf.ProgramSpec `ebpf:"xdp_foo"`
+//        Bar     *ebpf.MapSpec     `ebpf:"bar_map"`
+//        Ignored int
+//    }
+//
+// Returns an error if any of the fields can't be found, or
+// if the same map or program is assigned multiple times.
+func (cs *CollectionSpec) Assign(to interface{}) error {
+	valueOf := func(typ reflect.Type, name string) (reflect.Value, error) {
+		switch typ {
+		case reflect.TypeOf((*ProgramSpec)(nil)):
+			p := cs.Programs[name]
+			if p == nil {
+				return reflect.Value{}, fmt.Errorf("missing program %q", name)
+			}
+			return reflect.ValueOf(p), nil
+		case reflect.TypeOf((*MapSpec)(nil)):
+			m := cs.Maps[name]
+			if m == nil {
+				return reflect.Value{}, fmt.Errorf("missing map %q", name)
+			}
+			return reflect.ValueOf(m), nil
+		default:
+			return reflect.Value{}, fmt.Errorf("unsupported type %s", typ)
+		}
+	}
+
+	return assignValues(to, valueOf)
+}
+
+// LoadAndAssign maps and programs into the kernel and assign them to a struct.
+//
+// This function is a short-cut to manually checking the presence
+// of maps and programs in a collection spec. Consider using bpf2go if this
+// sounds useful.
+//
+// The argument to must be a pointer to a struct. A field of the
+// struct is updated with values from Programs or Maps if it
+// has an `ebpf` tag and its type is *Program or *Map.
+// The tag gives the name of the program or map as found in
+// the CollectionSpec.
+//
+//    struct {
+//        Foo     *ebpf.Program `ebpf:"xdp_foo"`
+//        Bar     *ebpf.Map     `ebpf:"bar_map"`
+//        Ignored int
+//    }
+//
+// opts may be nil.
+//
+// Returns an error if any of the fields can't be found, or
+// if the same map or program is assigned multiple times.
+func (cs *CollectionSpec) LoadAndAssign(to interface{}, opts *CollectionOptions) error {
+	if opts == nil {
+		opts = &CollectionOptions{}
+	}
+
+	loadMap, loadProgram, done, cleanup := lazyLoadCollection(cs, opts)
+	defer cleanup()
+
+	valueOf := func(typ reflect.Type, name string) (reflect.Value, error) {
+		switch typ {
+		case reflect.TypeOf((*Program)(nil)):
+			p, err := loadProgram(name)
+			if err != nil {
+				return reflect.Value{}, err
+			}
+			return reflect.ValueOf(p), nil
+		case reflect.TypeOf((*Map)(nil)):
+			m, err := loadMap(name)
+			if err != nil {
+				return reflect.Value{}, err
+			}
+			return reflect.ValueOf(m), nil
+		default:
+			return reflect.Value{}, fmt.Errorf("unsupported type %s", typ)
+		}
+	}
+
+	if err := assignValues(to, valueOf); err != nil {
+		return err
+	}
+
+	done()
+	return nil
+}
+
 // Collection is a collection of Programs and Maps associated
 // Collection is a collection of Programs and Maps associated
 // with their symbols
 // with their symbols
 type Collection struct {
 type Collection struct {
@@ -134,28 +239,75 @@ type Collection struct {
 }
 }
 
 
 // NewCollection creates a Collection from a specification.
 // NewCollection creates a Collection from a specification.
-//
-// Only maps referenced by at least one of the programs are initialized.
 func NewCollection(spec *CollectionSpec) (*Collection, error) {
 func NewCollection(spec *CollectionSpec) (*Collection, error) {
 	return NewCollectionWithOptions(spec, CollectionOptions{})
 	return NewCollectionWithOptions(spec, CollectionOptions{})
 }
 }
 
 
 // NewCollectionWithOptions creates a Collection from a specification.
 // NewCollectionWithOptions creates a Collection from a specification.
-//
-// Only maps referenced by at least one of the programs are initialized.
-func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (coll *Collection, err error) {
+func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (*Collection, error) {
+	loadMap, loadProgram, done, cleanup := lazyLoadCollection(spec, &opts)
+	defer cleanup()
+
+	for mapName := range spec.Maps {
+		_, err := loadMap(mapName)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	for progName := range spec.Programs {
+		_, err := loadProgram(progName)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	maps, progs := done()
+	return &Collection{
+		progs,
+		maps,
+	}, nil
+}
+
+type btfHandleCache map[*btf.Spec]*btf.Handle
+
+func (btfs btfHandleCache) load(spec *btf.Spec) (*btf.Handle, error) {
+	if btfs[spec] != nil {
+		return btfs[spec], nil
+	}
+
+	handle, err := btf.NewHandle(spec)
+	if err != nil {
+		return nil, err
+	}
+
+	btfs[spec] = handle
+	return handle, nil
+}
+
+func (btfs btfHandleCache) close() {
+	for _, handle := range btfs {
+		handle.Close()
+	}
+}
+
+func lazyLoadCollection(coll *CollectionSpec, opts *CollectionOptions) (
+	loadMap func(string) (*Map, error),
+	loadProgram func(string) (*Program, error),
+	done func() (map[string]*Map, map[string]*Program),
+	cleanup func(),
+) {
 	var (
 	var (
-		maps  = make(map[string]*Map)
-		progs = make(map[string]*Program)
-		btfs  = make(map[*btf.Spec]*btf.Handle)
+		maps             = make(map[string]*Map)
+		progs            = make(map[string]*Program)
+		btfs             = make(btfHandleCache)
+		skipMapsAndProgs = false
 	)
 	)
 
 
-	defer func() {
-		for _, btf := range btfs {
-			btf.Close()
-		}
+	cleanup = func() {
+		btfs.close()
 
 
-		if err == nil {
+		if skipMapsAndProgs {
 			return
 			return
 		}
 		}
 
 
@@ -166,40 +318,43 @@ func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (col
 		for _, p := range progs {
 		for _, p := range progs {
 			p.Close()
 			p.Close()
 		}
 		}
-	}()
+	}
 
 
-	loadBTF := func(spec *btf.Spec) (*btf.Handle, error) {
-		if btfs[spec] != nil {
-			return btfs[spec], nil
-		}
+	done = func() (map[string]*Map, map[string]*Program) {
+		skipMapsAndProgs = true
+		return maps, progs
+	}
 
 
-		handle, err := btf.NewHandle(spec)
-		if err != nil {
-			return nil, err
+	loadMap = func(mapName string) (*Map, error) {
+		if m := maps[mapName]; m != nil {
+			return m, nil
 		}
 		}
 
 
-		btfs[spec] = handle
-		return handle, nil
-	}
-
-	for mapName, mapSpec := range spec.Maps {
-		var handle *btf.Handle
-		if mapSpec.BTF != nil {
-			handle, err = loadBTF(btf.MapSpec(mapSpec.BTF))
-			if err != nil && !errors.Is(err, btf.ErrNotSupported) {
-				return nil, err
-			}
+		mapSpec := coll.Maps[mapName]
+		if mapSpec == nil {
+			return nil, fmt.Errorf("missing map %s", mapName)
 		}
 		}
 
 
-		m, err := newMapWithBTF(mapSpec, handle)
+		m, err := newMapWithOptions(mapSpec, opts.Maps, btfs)
 		if err != nil {
 		if err != nil {
 			return nil, fmt.Errorf("map %s: %w", mapName, err)
 			return nil, fmt.Errorf("map %s: %w", mapName, err)
 		}
 		}
+
 		maps[mapName] = m
 		maps[mapName] = m
+		return m, nil
 	}
 	}
 
 
-	for progName, origProgSpec := range spec.Programs {
-		progSpec := origProgSpec.Copy()
+	loadProgram = func(progName string) (*Program, error) {
+		if prog := progs[progName]; prog != nil {
+			return prog, nil
+		}
+
+		progSpec := coll.Programs[progName]
+		if progSpec == nil {
+			return nil, fmt.Errorf("unknown program %s", progName)
+		}
+
+		progSpec = progSpec.Copy()
 
 
 		// Rewrite any reference to a valid map.
 		// Rewrite any reference to a valid map.
 		for i := range progSpec.Instructions {
 		for i := range progSpec.Instructions {
@@ -215,9 +370,9 @@ func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (col
 				continue
 				continue
 			}
 			}
 
 
-			m := maps[ins.Reference]
-			if m == nil {
-				return nil, fmt.Errorf("program %s: missing map %s", progName, ins.Reference)
+			m, err := loadMap(ins.Reference)
+			if err != nil {
+				return nil, fmt.Errorf("program %s: %s", progName, err)
 			}
 			}
 
 
 			fd := m.FD()
 			fd := m.FD()
@@ -229,25 +384,16 @@ func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (col
 			}
 			}
 		}
 		}
 
 
-		var handle *btf.Handle
-		if progSpec.BTF != nil {
-			handle, err = loadBTF(btf.ProgramSpec(progSpec.BTF))
-			if err != nil && !errors.Is(err, btf.ErrNotSupported) {
-				return nil, err
-			}
-		}
-
-		prog, err := newProgramWithBTF(progSpec, handle, opts.Programs)
+		prog, err := newProgramWithOptions(progSpec, opts.Programs, btfs)
 		if err != nil {
 		if err != nil {
 			return nil, fmt.Errorf("program %s: %w", progName, err)
 			return nil, fmt.Errorf("program %s: %w", progName, err)
 		}
 		}
+
 		progs[progName] = prog
 		progs[progName] = prog
+		return prog, nil
 	}
 	}
 
 
-	return &Collection{
-		progs,
-		maps,
-	}, nil
+	return
 }
 }
 
 
 // LoadCollection parses an object file and converts it to a collection.
 // LoadCollection parses an object file and converts it to a collection.
@@ -292,3 +438,152 @@ func (coll *Collection) DetachProgram(name string) *Program {
 	delete(coll.Programs, name)
 	delete(coll.Programs, name)
 	return p
 	return p
 }
 }
+
+// Assign the contents of a collection to a struct.
+//
+// Deprecated: use CollectionSpec.Assign instead. It provides the same
+// functionality but creates only the maps and programs requested.
+func (coll *Collection) Assign(to interface{}) error {
+	assignedMaps := make(map[string]struct{})
+	assignedPrograms := make(map[string]struct{})
+	valueOf := func(typ reflect.Type, name string) (reflect.Value, error) {
+		switch typ {
+		case reflect.TypeOf((*Program)(nil)):
+			p := coll.Programs[name]
+			if p == nil {
+				return reflect.Value{}, fmt.Errorf("missing program %q", name)
+			}
+			assignedPrograms[name] = struct{}{}
+			return reflect.ValueOf(p), nil
+		case reflect.TypeOf((*Map)(nil)):
+			m := coll.Maps[name]
+			if m == nil {
+				return reflect.Value{}, fmt.Errorf("missing map %q", name)
+			}
+			assignedMaps[name] = struct{}{}
+			return reflect.ValueOf(m), nil
+		default:
+			return reflect.Value{}, fmt.Errorf("unsupported type %s", typ)
+		}
+	}
+
+	if err := assignValues(to, valueOf); err != nil {
+		return err
+	}
+
+	for name := range assignedPrograms {
+		coll.DetachProgram(name)
+	}
+
+	for name := range assignedMaps {
+		coll.DetachMap(name)
+	}
+
+	return nil
+}
+
+func assignValues(to interface{}, valueOf func(reflect.Type, string) (reflect.Value, error)) error {
+	type structField struct {
+		reflect.StructField
+		value reflect.Value
+	}
+
+	var (
+		fields        []structField
+		visitedTypes  = make(map[reflect.Type]bool)
+		flattenStruct func(reflect.Value) error
+	)
+
+	flattenStruct = func(structVal reflect.Value) error {
+		structType := structVal.Type()
+		if structType.Kind() != reflect.Struct {
+			return fmt.Errorf("%s is not a struct", structType)
+		}
+
+		if visitedTypes[structType] {
+			return fmt.Errorf("recursion on type %s", structType)
+		}
+
+		for i := 0; i < structType.NumField(); i++ {
+			field := structField{structType.Field(i), structVal.Field(i)}
+
+			name := field.Tag.Get("ebpf")
+			if name != "" {
+				fields = append(fields, field)
+				continue
+			}
+
+			var err error
+			switch field.Type.Kind() {
+			case reflect.Ptr:
+				if field.Type.Elem().Kind() != reflect.Struct {
+					continue
+				}
+
+				if field.value.IsNil() {
+					return fmt.Errorf("nil pointer to %s", structType)
+				}
+
+				err = flattenStruct(field.value.Elem())
+
+			case reflect.Struct:
+				err = flattenStruct(field.value)
+
+			default:
+				continue
+			}
+
+			if err != nil {
+				return fmt.Errorf("field %s: %s", field.Name, err)
+			}
+		}
+
+		return nil
+	}
+
+	toValue := reflect.ValueOf(to)
+	if toValue.Type().Kind() != reflect.Ptr {
+		return fmt.Errorf("%T is not a pointer to struct", to)
+	}
+
+	if toValue.IsNil() {
+		return fmt.Errorf("nil pointer to %T", to)
+	}
+
+	if err := flattenStruct(toValue.Elem()); err != nil {
+		return err
+	}
+
+	type elem struct {
+		// Either *Map or *Program
+		typ  reflect.Type
+		name string
+	}
+
+	assignedTo := make(map[elem]string)
+	for _, field := range fields {
+		name := field.Tag.Get("ebpf")
+		if strings.Contains(name, ",") {
+			return fmt.Errorf("field %s: ebpf tag contains a comma", field.Name)
+		}
+
+		e := elem{field.Type, name}
+		if assignedField := assignedTo[e]; assignedField != "" {
+			return fmt.Errorf("field %s: %q was already assigned to %s", field.Name, name, assignedField)
+		}
+
+		value, err := valueOf(field.Type, name)
+		if err != nil {
+			return fmt.Errorf("field %s: %w", field.Name, err)
+		}
+
+		if !field.value.CanSet() {
+			return fmt.Errorf("field %s: can't set value", field.Name)
+		}
+
+		field.value.Set(value)
+		assignedTo[e] = field.Name
+	}
+
+	return nil
+}
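
The Assign/LoadAndAssign API added above is driven by `ebpf` struct tags. A sketch of the intended usage, with placeholder object, program and map names:

```go
package main

import (
	"log"

	"github.com/cilium/ebpf"
)

// Fields must be *ebpf.Program or *ebpf.Map; the tag names an entry in
// the CollectionSpec. "xdp_foo" and "bar_map" are placeholders.
type objects struct {
	Prog *ebpf.Program `ebpf:"xdp_foo"`
	Map  *ebpf.Map     `ebpf:"bar_map"`
}

func main() {
	spec, err := ebpf.LoadCollectionSpec("program.o")
	if err != nil {
		log.Fatal(err)
	}

	var objs objects
	// Unlike NewCollection, only the tagged map and program are created
	// in the kernel. A nil *CollectionOptions is allowed.
	if err := spec.LoadAndAssign(&objs, nil); err != nil {
		log.Fatal(err)
	}
	defer objs.Prog.Close()
	defer objs.Map.Close()
}
```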

+ 1 - 2
vendor/github.com/cilium/ebpf/doc.go

@@ -12,6 +12,5 @@
 // eBPF code should be compiled ahead of time using clang, and shipped with
 // your application as any other resource.
 //
-// This package doesn't include code required to attach eBPF to Linux
-// subsystems, since this varies per subsystem.
+// Use the link subpackage to attach a loaded program to a hook in the kernel.
 package ebpf

+ 456 - 265
vendor/github.com/cilium/ebpf/elf_reader.go

@@ -1,6 +1,7 @@
 package ebpf
 package ebpf
 
 
 import (
 import (
+	"bufio"
 	"bytes"
 	"bytes"
 	"debug/elf"
 	"debug/elf"
 	"encoding/binary"
 	"encoding/binary"
@@ -17,12 +18,14 @@ import (
 	"github.com/cilium/ebpf/internal/unix"
 	"github.com/cilium/ebpf/internal/unix"
 )
 )
 
 
+// elfCode is a convenience to reduce the amount of arguments that have to
+// be passed around explicitly. You should treat it's contents as immutable.
 type elfCode struct {
 type elfCode struct {
-	*elf.File
-	symbols           []elf.Symbol
-	symbolsPerSection map[elf.SectionIndex]map[uint64]elf.Symbol
-	license           string
-	version           uint32
+	*internal.SafeELFFile
+	sections map[elf.SectionIndex]*elfSection
+	license  string
+	version  uint32
+	btf      *btf.Spec
 }
 }
 
 
 // LoadCollectionSpec parses an ELF file into a CollectionSpec.
 // LoadCollectionSpec parses an ELF file into a CollectionSpec.
@@ -42,63 +45,52 @@ func LoadCollectionSpec(file string) (*CollectionSpec, error) {
 
 
 // LoadCollectionSpecFromReader parses an ELF file into a CollectionSpec.
 // LoadCollectionSpecFromReader parses an ELF file into a CollectionSpec.
 func LoadCollectionSpecFromReader(rd io.ReaderAt) (*CollectionSpec, error) {
 func LoadCollectionSpecFromReader(rd io.ReaderAt) (*CollectionSpec, error) {
-	f, err := elf.NewFile(rd)
+	f, err := internal.NewSafeELFFile(rd)
 	if err != nil {
 	if err != nil {
 		return nil, err
 		return nil, err
 	}
 	}
 	defer f.Close()
 	defer f.Close()
 
 
-	symbols, err := f.Symbols()
-	if err != nil {
-		return nil, fmt.Errorf("load symbols: %v", err)
-	}
-
-	ec := &elfCode{f, symbols, symbolsPerSection(symbols), "", 0}
-
 	var (
 	var (
 		licenseSection *elf.Section
 		licenseSection *elf.Section
 		versionSection *elf.Section
 		versionSection *elf.Section
-		btfMaps        = make(map[elf.SectionIndex]*elf.Section)
-		progSections   = make(map[elf.SectionIndex]*elf.Section)
+		sections       = make(map[elf.SectionIndex]*elfSection)
 		relSections    = make(map[elf.SectionIndex]*elf.Section)
 		relSections    = make(map[elf.SectionIndex]*elf.Section)
-		mapSections    = make(map[elf.SectionIndex]*elf.Section)
-		dataSections   = make(map[elf.SectionIndex]*elf.Section)
 	)
 	)
 
 
-	for i, sec := range ec.Sections {
+	// This is the target of relocations generated by inline assembly.
+	sections[elf.SHN_UNDEF] = newElfSection(new(elf.Section), undefSection)
+
+	// Collect all the sections we're interested in. This includes relocations
+	// which we parse later.
+	for i, sec := range f.Sections {
+		idx := elf.SectionIndex(i)
+
 		switch {
 		switch {
 		case strings.HasPrefix(sec.Name, "license"):
 		case strings.HasPrefix(sec.Name, "license"):
 			licenseSection = sec
 			licenseSection = sec
 		case strings.HasPrefix(sec.Name, "version"):
 		case strings.HasPrefix(sec.Name, "version"):
 			versionSection = sec
 			versionSection = sec
 		case strings.HasPrefix(sec.Name, "maps"):
 		case strings.HasPrefix(sec.Name, "maps"):
-			mapSections[elf.SectionIndex(i)] = sec
+			sections[idx] = newElfSection(sec, mapSection)
 		case sec.Name == ".maps":
 		case sec.Name == ".maps":
-			btfMaps[elf.SectionIndex(i)] = sec
-		case sec.Name == ".bss" || sec.Name == ".rodata" || sec.Name == ".data":
-			dataSections[elf.SectionIndex(i)] = sec
+			sections[idx] = newElfSection(sec, btfMapSection)
+		case sec.Name == ".bss" || sec.Name == ".data" || strings.HasPrefix(sec.Name, ".rodata"):
+			sections[idx] = newElfSection(sec, dataSection)
 		case sec.Type == elf.SHT_REL:
 		case sec.Type == elf.SHT_REL:
-			if int(sec.Info) >= len(ec.Sections) {
-				return nil, fmt.Errorf("found relocation section %v for missing section %v", i, sec.Info)
-			}
-
 			// Store relocations under the section index of the target
 			// Store relocations under the section index of the target
-			idx := elf.SectionIndex(sec.Info)
-			if relSections[idx] != nil {
-				return nil, fmt.Errorf("section %d has multiple relocation sections", sec.Info)
-			}
-			relSections[idx] = sec
+			relSections[elf.SectionIndex(sec.Info)] = sec
 		case sec.Type == elf.SHT_PROGBITS && (sec.Flags&elf.SHF_EXECINSTR) != 0 && sec.Size > 0:
 		case sec.Type == elf.SHT_PROGBITS && (sec.Flags&elf.SHF_EXECINSTR) != 0 && sec.Size > 0:
-			progSections[elf.SectionIndex(i)] = sec
+			sections[idx] = newElfSection(sec, programSection)
 		}
 		}
 	}
 	}
 
 
-	ec.license, err = loadLicense(licenseSection)
+	license, err := loadLicense(licenseSection)
 	if err != nil {
 	if err != nil {
 		return nil, fmt.Errorf("load license: %w", err)
 		return nil, fmt.Errorf("load license: %w", err)
 	}
 	}
 
 
-	ec.version, err = loadVersion(versionSection, ec.ByteOrder)
+	version, err := loadVersion(versionSection, f.ByteOrder)
 	if err != nil {
 	if err != nil {
 		return nil, fmt.Errorf("load version: %w", err)
 		return nil, fmt.Errorf("load version: %w", err)
 	}
 	}
@@ -108,37 +100,90 @@ func LoadCollectionSpecFromReader(rd io.ReaderAt) (*CollectionSpec, error) {
 		return nil, fmt.Errorf("load BTF: %w", err)
 		return nil, fmt.Errorf("load BTF: %w", err)
 	}
 	}
 
 
-	relocations, referencedSections, err := ec.loadRelocations(relSections)
+	// Assign symbols to all the sections we're interested in.
+	symbols, err := f.Symbols()
 	if err != nil {
 	if err != nil {
-		return nil, fmt.Errorf("load relocations: %w", err)
+		return nil, fmt.Errorf("load symbols: %v", err)
 	}
 	}
 
 
-	maps := make(map[string]*MapSpec)
-	if err := ec.loadMaps(maps, mapSections); err != nil {
-		return nil, fmt.Errorf("load maps: %w", err)
-	}
+	for _, symbol := range symbols {
+		idx := symbol.Section
+		symType := elf.ST_TYPE(symbol.Info)
 
 
-	if len(btfMaps) > 0 {
-		if err := ec.loadBTFMaps(maps, btfMaps, btfSpec); err != nil {
-			return nil, fmt.Errorf("load BTF maps: %w", err)
+		section := sections[idx]
+		if section == nil {
+			continue
 		}
 		}
+
+		// Older versions of LLVM don't tag symbols correctly, so keep
+		// all NOTYPE ones.
+		keep := symType == elf.STT_NOTYPE
+		switch section.kind {
+		case mapSection, btfMapSection, dataSection:
+			keep = keep || symType == elf.STT_OBJECT
+		case programSection:
+			keep = keep || symType == elf.STT_FUNC
+		}
+		if !keep || symbol.Name == "" {
+			continue
+		}
+
+		section.symbols[symbol.Value] = symbol
 	}
 	}
 
 
-	if len(dataSections) > 0 {
-		for idx := range dataSections {
-			if !referencedSections[idx] {
-				// Prune data sections which are not referenced by any
-				// instructions.
-				delete(dataSections, idx)
-			}
+	ec := &elfCode{
+		SafeELFFile: f,
+		sections:    sections,
+		license:     license,
+		version:     version,
+		btf:         btfSpec,
+	}
+
+	// Go through relocation sections, and parse the ones for sections we're
+	// interested in. Make sure that relocations point at valid sections.
+	for idx, relSection := range relSections {
+		section := sections[idx]
+		if section == nil {
+			continue
 		}
 		}
 
 
-		if err := ec.loadDataSections(maps, dataSections, btfSpec); err != nil {
-			return nil, fmt.Errorf("load data sections: %w", err)
+		rels, err := ec.loadRelocations(relSection, symbols)
+		if err != nil {
+			return nil, fmt.Errorf("relocation for section %q: %w", section.Name, err)
+		}
+
+		for _, rel := range rels {
+			target := sections[rel.Section]
+			if target == nil {
+				return nil, fmt.Errorf("section %q: reference to %q in section %s: %w", section.Name, rel.Name, rel.Section, ErrNotSupported)
+			}
+
+			if target.Flags&elf.SHF_STRINGS > 0 {
+				return nil, fmt.Errorf("section %q: string %q is not stack allocated: %w", section.Name, rel.Name, ErrNotSupported)
+			}
+
+			target.references++
 		}
 		}
+
+		section.relocations = rels
+	}
+
+	// Collect all the various ways to define maps.
+	maps := make(map[string]*MapSpec)
+	if err := ec.loadMaps(maps); err != nil {
+		return nil, fmt.Errorf("load maps: %w", err)
 	}
 	}
 
 
-	progs, err := ec.loadPrograms(progSections, relocations, btfSpec)
+	if err := ec.loadBTFMaps(maps); err != nil {
+		return nil, fmt.Errorf("load BTF maps: %w", err)
+	}
+
+	if err := ec.loadDataSections(maps); err != nil {
+		return nil, fmt.Errorf("load data sections: %w", err)
+	}
+
+	// Finally, collect programs and link them.
+	progs, err := ec.loadPrograms()
 	if err != nil {
 	if err != nil {
 		return nil, fmt.Errorf("load programs: %w", err)
 		return nil, fmt.Errorf("load programs: %w", err)
 	}
 	}
@@ -170,33 +215,69 @@ func loadVersion(sec *elf.Section, bo binary.ByteOrder) (uint32, error) {
 	return version, nil
 	return version, nil
 }
 }
 
 
-func (ec *elfCode) loadPrograms(progSections map[elf.SectionIndex]*elf.Section, relocations map[elf.SectionIndex]map[uint64]elf.Symbol, btfSpec *btf.Spec) (map[string]*ProgramSpec, error) {
+type elfSectionKind int
+
+const (
+	undefSection elfSectionKind = iota
+	mapSection
+	btfMapSection
+	programSection
+	dataSection
+)
+
+type elfSection struct {
+	*elf.Section
+	kind elfSectionKind
+	// Offset from the start of the section to a symbol
+	symbols map[uint64]elf.Symbol
+	// Offset from the start of the section to a relocation, which points at
+	// a symbol in another section.
+	relocations map[uint64]elf.Symbol
+	// The number of relocations pointing at this section.
+	references int
+}
+
+func newElfSection(section *elf.Section, kind elfSectionKind) *elfSection {
+	return &elfSection{
+		section,
+		kind,
+		make(map[uint64]elf.Symbol),
+		make(map[uint64]elf.Symbol),
+		0,
+	}
+}
+
+func (ec *elfCode) loadPrograms() (map[string]*ProgramSpec, error) {
 	var (
 	var (
 		progs []*ProgramSpec
 		progs []*ProgramSpec
 		libs  []*ProgramSpec
 		libs  []*ProgramSpec
 	)
 	)
 
 
-	for idx, sec := range progSections {
-		syms := ec.symbolsPerSection[idx]
-		if len(syms) == 0 {
+	for _, sec := range ec.sections {
+		if sec.kind != programSection {
+			continue
+		}
+
+		if len(sec.symbols) == 0 {
 			return nil, fmt.Errorf("section %v: missing symbols", sec.Name)
 			return nil, fmt.Errorf("section %v: missing symbols", sec.Name)
 		}
 		}
 
 
-		funcSym, ok := syms[0]
+		funcSym, ok := sec.symbols[0]
 		if !ok {
 		if !ok {
 			return nil, fmt.Errorf("section %v: no label at start", sec.Name)
 			return nil, fmt.Errorf("section %v: no label at start", sec.Name)
 		}
 		}
 
 
-		insns, length, err := ec.loadInstructions(sec, syms, relocations[idx])
+		insns, length, err := ec.loadInstructions(sec)
 		if err != nil {
 		if err != nil {
-			return nil, fmt.Errorf("program %s: can't unmarshal instructions: %w", funcSym.Name, err)
+			return nil, fmt.Errorf("program %s: %w", funcSym.Name, err)
 		}
 		}
 
 
-		progType, attachType, attachTo := getProgType(sec.Name)
+		progType, attachType, progFlags, attachTo := getProgType(sec.Name)
 
 
 		spec := &ProgramSpec{
 		spec := &ProgramSpec{
 			Name:          funcSym.Name,
 			Name:          funcSym.Name,
 			Type:          progType,
 			Type:          progType,
+			Flags:         progFlags,
 			AttachType:    attachType,
 			AttachType:    attachType,
 			AttachTo:      attachTo,
 			AttachTo:      attachTo,
 			License:       ec.license,
 			License:       ec.license,
@@ -205,8 +286,8 @@ func (ec *elfCode) loadPrograms(progSections map[elf.SectionIndex]*elf.Section,
 			ByteOrder:     ec.ByteOrder,
 			ByteOrder:     ec.ByteOrder,
 		}
 		}
 
 
-		if btfSpec != nil {
-			spec.BTF, err = btfSpec.Program(sec.Name, length)
+		if ec.btf != nil {
+			spec.BTF, err = ec.btf.Program(sec.Name, length)
 			if err != nil && !errors.Is(err, btf.ErrNoExtendedInfo) {
 			if err != nil && !errors.Is(err, btf.ErrNoExtendedInfo) {
 				return nil, fmt.Errorf("program %s: %w", funcSym.Name, err)
 				return nil, fmt.Errorf("program %s: %w", funcSym.Name, err)
 			}
 			}
@@ -234,9 +315,9 @@ func (ec *elfCode) loadPrograms(progSections map[elf.SectionIndex]*elf.Section,
 	return res, nil
 	return res, nil
 }
 }
 
 
-func (ec *elfCode) loadInstructions(section *elf.Section, symbols, relocations map[uint64]elf.Symbol) (asm.Instructions, uint64, error) {
+func (ec *elfCode) loadInstructions(section *elfSection) (asm.Instructions, uint64, error) {
 	var (
 	var (
-		r      = section.Open()
+		r      = bufio.NewReader(section.Open())
 		insns  asm.Instructions
 		insns  asm.Instructions
 		offset uint64
 		offset uint64
 	)
 	)
@@ -250,11 +331,11 @@ func (ec *elfCode) loadInstructions(section *elf.Section, symbols, relocations m
 			return nil, 0, fmt.Errorf("offset %d: %w", offset, err)
 		}
 
-		ins.Symbol = symbols[offset].Name
+		ins.Symbol = section.symbols[offset].Name
 
-		if rel, ok := relocations[offset]; ok {
+		if rel, ok := section.relocations[offset]; ok {
 			if err = ec.relocateInstruction(&ins, rel); err != nil {
-				return nil, 0, fmt.Errorf("offset %d: can't relocate instruction: %w", offset, err)
+				return nil, 0, fmt.Errorf("offset %d: relocate instruction: %w", offset, err)
 			}
 		}
 
@@ -270,69 +351,66 @@ func (ec *elfCode) relocateInstruction(ins *asm.Instruction, rel elf.Symbol) err
 		name = rel.Name
 	)
 
-	if typ == elf.STT_SECTION {
-		// Symbols with section type do not have a name set. Get it
-		// from the section itself.
-		idx := int(rel.Section)
-		if idx > len(ec.Sections) {
-			return errors.New("out-of-bounds section index")
+	target := ec.sections[rel.Section]
+
+	switch target.kind {
+	case mapSection, btfMapSection:
+		if bind != elf.STB_GLOBAL {
+			return fmt.Errorf("possible erroneous static qualifier on map definition: found reference to %q", name)
 		}
 
-		name = ec.Sections[idx].Name
-	}
+		if typ != elf.STT_OBJECT && typ != elf.STT_NOTYPE {
+			// STT_NOTYPE is generated on clang < 8 which doesn't tag
+			// relocations appropriately.
+			return fmt.Errorf("map load: incorrect relocation type %v", typ)
+		}
 
-outer:
-	switch {
-	case ins.OpCode == asm.LoadImmOp(asm.DWord):
-		// There are two distinct types of a load from a map:
-		// a direct one, where the value is extracted without
-		// a call to map_lookup_elem in eBPF, and an indirect one
-		// that goes via the helper. They are distinguished by
-		// different relocations.
+		ins.Src = asm.PseudoMapFD
+
+		// Mark the instruction as needing an update when creating the
+		// collection.
+		if err := ins.RewriteMapPtr(-1); err != nil {
+			return err
+		}
+
+	case dataSection:
 		switch typ {
 		case elf.STT_SECTION:
-			// This is a direct load since the referenced symbol is a
-			// section. Weirdly, the offset of the real symbol in the
-			// section is encoded in the instruction stream.
 			if bind != elf.STB_LOCAL {
 				return fmt.Errorf("direct load: %s: unsupported relocation %s", name, bind)
 			}
 
-			// For some reason, clang encodes the offset of the symbol its
-			// section in the first basic BPF instruction, while the kernel
-			// expects it in the second one.
-			ins.Constant <<= 32
-			ins.Src = asm.PseudoMapValue
-
-		case elf.STT_NOTYPE:
-			if bind == elf.STB_GLOBAL && rel.Section == elf.SHN_UNDEF {
-				// This is a relocation generated by inline assembly.
-				// We can't do more than assigning ins.Reference.
-				break outer
-			}
-
-			// This is an ELF generated on clang < 8, which doesn't tag
-			// relocations appropriately.
-			fallthrough
-
 		case elf.STT_OBJECT:
 			if bind != elf.STB_GLOBAL {
-				return fmt.Errorf("load: %s: unsupported binding: %s", name, bind)
+				return fmt.Errorf("direct load: %s: unsupported relocation %s", name, bind)
 			}
 
-			ins.Src = asm.PseudoMapFD
-
 		default:
-			return fmt.Errorf("load: %s: unsupported relocation: %s", name, typ)
+			return fmt.Errorf("incorrect relocation type %v for direct map load", typ)
 		}
 
+		// We rely on using the name of the data section as the reference. It
+		// would be nicer to keep the real name in case of an STT_OBJECT, but
+		// it's not clear how to encode that into Instruction.
+		name = target.Name
+
+		// For some reason, clang encodes the offset of the symbol in its
+		// section in the first basic BPF instruction, while the kernel
+		// expects it in the second one.
+		ins.Constant <<= 32
+		ins.Src = asm.PseudoMapValue
+
 		// Mark the instruction as needing an update when creating the
 		// collection.
 		if err := ins.RewriteMapPtr(-1); err != nil {
 			return err
 		}
 
-	case ins.OpCode.JumpOp() == asm.Call:
+	case programSection:
+		if ins.OpCode.JumpOp() != asm.Call {
+			return fmt.Errorf("not a call instruction: %s", ins)
+		}
+
 		if ins.Src != asm.PseudoCall {
 			return fmt.Errorf("call: %s: incorrect source register", name)
 		}
@@ -357,7 +435,7 @@ outer:
 				return fmt.Errorf("call: %s: invalid offset %d", name, offset)
 			}
 
-			sym, ok := ec.symbolsPerSection[rel.Section][uint64(offset)]
+			sym, ok := target.symbols[uint64(offset)]
 			if !ok {
 				return fmt.Errorf("call: %s: no symbol at offset %d", name, offset)
 			}
@@ -369,31 +447,46 @@ outer:
 			return fmt.Errorf("call: %s: invalid symbol type %s", name, typ)
 		}
 
+	case undefSection:
+		if bind != elf.STB_GLOBAL {
+			return fmt.Errorf("asm relocation: %s: unsupported binding: %s", name, bind)
+		}
+
+		if typ != elf.STT_NOTYPE {
+			return fmt.Errorf("asm relocation: %s: unsupported type %s", name, typ)
+		}
+
+		// There is nothing to do here but set ins.Reference.
+
 	default:
-		return fmt.Errorf("relocation for unsupported instruction: %s", ins.OpCode)
+		return fmt.Errorf("relocation to %q: %w", target.Name, ErrNotSupported)
 	}
 
 	ins.Reference = name
 	return nil
 }
 
-func (ec *elfCode) loadMaps(maps map[string]*MapSpec, mapSections map[elf.SectionIndex]*elf.Section) error {
-	for idx, sec := range mapSections {
-		syms := ec.symbolsPerSection[idx]
-		if len(syms) == 0 {
+func (ec *elfCode) loadMaps(maps map[string]*MapSpec) error {
+	for _, sec := range ec.sections {
+		if sec.kind != mapSection {
+			continue
+		}
+
+		nSym := len(sec.symbols)
+		if nSym == 0 {
 			return fmt.Errorf("section %v: no symbols", sec.Name)
 		}
 
-		if sec.Size%uint64(len(syms)) != 0 {
+		if sec.Size%uint64(nSym) != 0 {
 			return fmt.Errorf("section %v: map descriptors are not of equal size", sec.Name)
 		}
 
 		var (
-			r    = sec.Open()
-			size = sec.Size / uint64(len(syms))
+			r    = bufio.NewReader(sec.Open())
+			size = sec.Size / uint64(nSym)
 		)
-		for i, offset := 0, uint64(0); i < len(syms); i, offset = i+1, offset+size {
-			mapSym, ok := syms[offset]
+		for i, offset := 0, uint64(0); i < nSym; i, offset = i+1, offset+size {
+			mapSym, ok := sec.symbols[offset]
 			if !ok {
 				return fmt.Errorf("section %s: missing symbol for map at offset %d", sec.Name, offset)
 			}
@@ -431,24 +524,43 @@ func (ec *elfCode) loadMaps(maps map[string]*MapSpec, mapSections map[elf.Sectio
 	return nil
 }
 
-func (ec *elfCode) loadBTFMaps(maps map[string]*MapSpec, mapSections map[elf.SectionIndex]*elf.Section, spec *btf.Spec) error {
-	if spec == nil {
-		return fmt.Errorf("missing BTF")
-	}
+func (ec *elfCode) loadBTFMaps(maps map[string]*MapSpec) error {
+	for _, sec := range ec.sections {
+		if sec.kind != btfMapSection {
+			continue
+		}
 
-	for idx, sec := range mapSections {
-		syms := ec.symbolsPerSection[idx]
-		if len(syms) == 0 {
-			return fmt.Errorf("section %v: no symbols", sec.Name)
+		if ec.btf == nil {
+			return fmt.Errorf("missing BTF")
+		}
+
+		_, err := io.Copy(internal.DiscardZeroes{}, bufio.NewReader(sec.Open()))
+		if err != nil {
+			return fmt.Errorf("section %v: initializing BTF map definitions: %w", sec.Name, internal.ErrNotSupported)
+		}
+
+		var ds btf.Datasec
+		if err := ec.btf.FindType(sec.Name, &ds); err != nil {
+			return fmt.Errorf("cannot find section '%s' in BTF: %w", sec.Name, err)
 		}
 
-		for _, sym := range syms {
-			name := sym.Name
+		for _, vs := range ds.Vars {
+			v, ok := vs.Type.(*btf.Var)
+			if !ok {
+				return fmt.Errorf("section %v: unexpected type %s", sec.Name, vs.Type)
+			}
+			name := string(v.Name)
+
 			if maps[name] != nil {
-				return fmt.Errorf("section %v: map %v already exists", sec.Name, sym)
+				return fmt.Errorf("section %v: map %s already exists", sec.Name, name)
+			}
+
+			mapStruct, ok := v.Type.(*btf.Struct)
+			if !ok {
+				return fmt.Errorf("expected struct, got %s", v.Type)
 			}
 
-			mapSpec, err := mapSpecFromBTF(spec, name)
+			mapSpec, err := mapSpecFromBTF(name, mapStruct, false, ec.btf)
 			if err != nil {
 				return fmt.Errorf("map %v: %w", name, err)
 			}
@@ -460,30 +572,21 @@ func (ec *elfCode) loadBTFMaps(maps map[string]*MapSpec, mapSections map[elf.Sec
 	return nil
 }
 
-func mapSpecFromBTF(spec *btf.Spec, name string) (*MapSpec, error) {
-	btfMap, btfMapMembers, err := spec.Map(name)
-	if err != nil {
-		return nil, fmt.Errorf("can't get BTF: %w", err)
-	}
-
-	keyType := btf.MapKey(btfMap)
-	size, err := btf.Sizeof(keyType)
-	if err != nil {
-		return nil, fmt.Errorf("can't get size of BTF key: %w", err)
-	}
-	keySize := uint32(size)
-
-	valueType := btf.MapValue(btfMap)
-	size, err = btf.Sizeof(valueType)
-	if err != nil {
-		return nil, fmt.Errorf("can't get size of BTF value: %w", err)
-	}
-	valueSize := uint32(size)
+// mapSpecFromBTF produces a MapSpec based on a btf.Struct def representing
+// a BTF map definition. The name and spec arguments will be copied to the
+// resulting MapSpec, and inner must be true on any recursive invocations.
+func mapSpecFromBTF(name string, def *btf.Struct, inner bool, spec *btf.Spec) (*MapSpec, error) {
 
 	var (
+		key, value                 btf.Type
+		keySize, valueSize         uint32
 		mapType, flags, maxEntries uint32
+		pinType                    PinType
+		innerMapSpec               *MapSpec
+		err                        error
 	)
-	for _, member := range btfMapMembers {
+
+	for i, member := range def.Members {
 		switch member.Name {
 		case "type":
 			mapType, err = uintFromBTF(member.Type)
@@ -503,8 +606,48 @@ func mapSpecFromBTF(spec *btf.Spec, name string) (*MapSpec, error) {
 				return nil, fmt.Errorf("can't get BTF map max entries: %w", err)
 			}
 
+		case "key":
+			if keySize != 0 {
+				return nil, errors.New("both key and key_size given")
+			}
+
+			pk, ok := member.Type.(*btf.Pointer)
+			if !ok {
+				return nil, fmt.Errorf("key type is not a pointer: %T", member.Type)
+			}
+
+			key = pk.Target
+
+			size, err := btf.Sizeof(pk.Target)
+			if err != nil {
+				return nil, fmt.Errorf("can't get size of BTF key: %w", err)
+			}
+
+			keySize = uint32(size)
+
+		case "value":
+			if valueSize != 0 {
+				return nil, errors.New("both value and value_size given")
+			}
+
+			vk, ok := member.Type.(*btf.Pointer)
+			if !ok {
+				return nil, fmt.Errorf("value type is not a pointer: %T", member.Type)
+			}
+
+			value = vk.Target
+
+			size, err := btf.Sizeof(vk.Target)
+			if err != nil {
+				return nil, fmt.Errorf("can't get size of BTF value: %w", err)
+			}
+
+			valueSize = uint32(size)
+
 		case "key_size":
-			if _, isVoid := keyType.(*btf.Void); !isVoid {
+			// Key needs to be nil and keySize needs to be 0 for key_size to be
+			// considered a valid member.
+			if key != nil || keySize != 0 {
 				return nil, errors.New("both key and key_size given")
 			}
 
@@ -514,7 +657,9 @@ func mapSpecFromBTF(spec *btf.Spec, name string) (*MapSpec, error) {
 			}
 
 		case "value_size":
-			if _, isVoid := valueType.(*btf.Void); !isVoid {
+			// Value needs to be nil and valueSize needs to be 0 for value_size to be
+			// considered a valid member.
+			if value != nil || valueSize != 0 {
 				return nil, errors.New("both value and value_size given")
 			}
 
@@ -524,28 +669,79 @@ func mapSpecFromBTF(spec *btf.Spec, name string) (*MapSpec, error) {
 			}
 
 		case "pinning":
+			if inner {
+				return nil, errors.New("inner maps can't be pinned")
+			}
+
 			pinning, err := uintFromBTF(member.Type)
 			if err != nil {
 				return nil, fmt.Errorf("can't get pinning: %w", err)
 			}
 
-			if pinning != 0 {
-				return nil, fmt.Errorf("'pinning' attribute not supported: %w", ErrNotSupported)
+			pinType = PinType(pinning)
+
+		case "values":
+			// The 'values' field in BTF map definitions is used for declaring map
+			// value types that are references to other BPF objects, like other maps
+			// or programs. It is always expected to be an array of pointers.
+			if i != len(def.Members)-1 {
+				return nil, errors.New("'values' must be the last member in a BTF map definition")
+			}
+
+			if valueSize != 0 && valueSize != 4 {
+				return nil, errors.New("value_size must be 0 or 4")
+			}
+			valueSize = 4
+
+			valueType, err := resolveBTFArrayMacro(member.Type)
+			if err != nil {
+				return nil, fmt.Errorf("can't resolve type of member 'values': %w", err)
+			}
+
+			switch t := valueType.(type) {
+			case *btf.Struct:
+				// The values member pointing to an array of structs means we're expecting
+				// a map-in-map declaration.
+				if MapType(mapType) != ArrayOfMaps && MapType(mapType) != HashOfMaps {
+					return nil, errors.New("outer map needs to be an array or a hash of maps")
+				}
+				if inner {
+					return nil, fmt.Errorf("nested inner maps are not supported")
+				}
+
+				// This inner map spec is used as a map template, but it needs to be
+				// created as a traditional map before it can be used to do so.
+				// libbpf names the inner map template '<outer_name>.inner', but we
+				// opted for _inner to simplify validation logic. (dots only supported
+				// on kernels 5.2 and up)
+				// Pass the BTF spec from the parent object, since both parent and
+				// child must be created from the same BTF blob (on kernels that support BTF).
+				innerMapSpec, err = mapSpecFromBTF(name+"_inner", t, true, spec)
+				if err != nil {
+					return nil, fmt.Errorf("can't parse BTF map definition of inner map: %w", err)
+				}
+
+			default:
+				return nil, fmt.Errorf("unsupported value type %q in 'values' field", t)
 			}
 
-		case "key", "value":
 		default:
 			return nil, fmt.Errorf("unrecognized field %s in BTF map definition", member.Name)
 		}
 	}
 
+	bm := btf.NewMap(spec, key, value)
+
 	return &MapSpec{
+		Name:       SanitizeName(name, -1),
 		Type:       MapType(mapType),
 		KeySize:    keySize,
 		ValueSize:  valueSize,
 		MaxEntries: maxEntries,
 		Flags:      flags,
-		BTF:        btfMap,
+		BTF:        &bm,
+		Pinning:    pinType,
+		InnerMap:   innerMapSpec,
 	}, nil
 }
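
The 'pinning' and 'values' members handled above surface as the new MapSpec fields Pinning and InnerMap. A rough, hand-written Go equivalent of what mapSpecFromBTF produces for an outer map-in-map (a sketch only; every name and size here is hypothetical, not taken from a real object file):

    // import "github.com/cilium/ebpf"
    outer := &ebpf.MapSpec{
    	Name:       "outer_map",
    	Type:       ebpf.ArrayOfMaps,
    	KeySize:    4,
    	ValueSize:  4, // forced to 4 when a 'values' array is present
    	MaxEntries: 8,
    	Pinning:    ebpf.PinByName,
    	InnerMap: &ebpf.MapSpec{ // the "<name>_inner" template described above
    		Name:       "outer_map_inner",
    		Type:       ebpf.Hash,
    		KeySize:    4,
    		ValueSize:  8,
    		MaxEntries: 1024,
    	},
    }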
 
@@ -565,13 +761,40 @@ func uintFromBTF(typ btf.Type) (uint32, error) {
 	return arr.Nelems, nil
 }
 
-func (ec *elfCode) loadDataSections(maps map[string]*MapSpec, dataSections map[elf.SectionIndex]*elf.Section, spec *btf.Spec) error {
-	if spec == nil {
-		return errors.New("data sections require BTF, make sure all consts are marked as static")
+// resolveBTFArrayMacro resolves the __array macro, which declares an array
+// of pointers to a given type. This function returns the target Type of
+// the pointers in the array.
+func resolveBTFArrayMacro(typ btf.Type) (btf.Type, error) {
+	arr, ok := typ.(*btf.Array)
+	if !ok {
+		return nil, fmt.Errorf("not an array: %v", typ)
+	}
+
+	ptr, ok := arr.Type.(*btf.Pointer)
+	if !ok {
+		return nil, fmt.Errorf("not an array of pointers: %v", typ)
 	}
 
-	for _, sec := range dataSections {
-		btfMap, err := spec.Datasec(sec.Name)
+	return ptr.Target, nil
+}
+
+func (ec *elfCode) loadDataSections(maps map[string]*MapSpec) error {
+	for _, sec := range ec.sections {
+		if sec.kind != dataSection {
+			continue
+		}
+
+		if sec.references == 0 {
+			// Prune data sections which are not referenced by any
+			// instructions.
+			continue
+		}
+
+		if ec.btf == nil {
+			return errors.New("data sections require BTF, make sure all consts are marked as static")
+		}
+
+		btfMap, err := ec.btf.Datasec(sec.Name)
 		if err != nil {
 			return err
 		}
@@ -609,54 +832,61 @@ func (ec *elfCode) loadDataSections(maps map[string]*MapSpec, dataSections map[e
 	return nil
 }
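
loadDataSections above only materializes the .data/.rodata/.bss sections that programs actually reference, and each becomes a single-entry array map described by BTF. On the caller's side this is what enables overriding globals before load; a minimal sketch, assuming an object file with a global constant named debug_level (both names hypothetical):

    spec, err := ebpf.LoadCollectionSpec("program.o")
    if err != nil {
    	panic(err)
    }
    // Patches the constant inside the spec's .rodata map contents.
    err = spec.RewriteConstants(map[string]interface{}{
    	"debug_level": uint32(1),
    })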
 
-func getProgType(sectionName string) (ProgramType, AttachType, string) {
+func getProgType(sectionName string) (ProgramType, AttachType, uint32, string) {
 	types := map[string]struct {
 		progType   ProgramType
 		attachType AttachType
+		progFlags  uint32
 	}{
 		// From https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/libbpf.c
-		"socket":                {SocketFilter, AttachNone},
-		"seccomp":               {SocketFilter, AttachNone},
-		"kprobe/":               {Kprobe, AttachNone},
-		"uprobe/":               {Kprobe, AttachNone},
-		"kretprobe/":            {Kprobe, AttachNone},
-		"uretprobe/":            {Kprobe, AttachNone},
-		"tracepoint/":           {TracePoint, AttachNone},
-		"raw_tracepoint/":       {RawTracepoint, AttachNone},
-		"xdp":                   {XDP, AttachNone},
-		"perf_event":            {PerfEvent, AttachNone},
-		"lwt_in":                {LWTIn, AttachNone},
-		"lwt_out":               {LWTOut, AttachNone},
-		"lwt_xmit":              {LWTXmit, AttachNone},
-		"lwt_seg6local":         {LWTSeg6Local, AttachNone},
-		"sockops":               {SockOps, AttachCGroupSockOps},
-		"sk_skb/stream_parser":  {SkSKB, AttachSkSKBStreamParser},
-		"sk_skb/stream_verdict": {SkSKB, AttachSkSKBStreamParser},
-		"sk_msg":                {SkMsg, AttachSkSKBStreamVerdict},
-		"lirc_mode2":            {LircMode2, AttachLircMode2},
-		"flow_dissector":        {FlowDissector, AttachFlowDissector},
-		"iter/":                 {Tracing, AttachTraceIter},
-
-		"cgroup_skb/ingress": {CGroupSKB, AttachCGroupInetIngress},
-		"cgroup_skb/egress":  {CGroupSKB, AttachCGroupInetEgress},
-		"cgroup/dev":         {CGroupDevice, AttachCGroupDevice},
-		"cgroup/skb":         {CGroupSKB, AttachNone},
-		"cgroup/sock":        {CGroupSock, AttachCGroupInetSockCreate},
-		"cgroup/post_bind4":  {CGroupSock, AttachCGroupInet4PostBind},
-		"cgroup/post_bind6":  {CGroupSock, AttachCGroupInet6PostBind},
-		"cgroup/bind4":       {CGroupSockAddr, AttachCGroupInet4Bind},
-		"cgroup/bind6":       {CGroupSockAddr, AttachCGroupInet6Bind},
-		"cgroup/connect4":    {CGroupSockAddr, AttachCGroupInet4Connect},
-		"cgroup/connect6":    {CGroupSockAddr, AttachCGroupInet6Connect},
-		"cgroup/sendmsg4":    {CGroupSockAddr, AttachCGroupUDP4Sendmsg},
-		"cgroup/sendmsg6":    {CGroupSockAddr, AttachCGroupUDP6Sendmsg},
-		"cgroup/recvmsg4":    {CGroupSockAddr, AttachCGroupUDP4Recvmsg},
-		"cgroup/recvmsg6":    {CGroupSockAddr, AttachCGroupUDP6Recvmsg},
-		"cgroup/sysctl":      {CGroupSysctl, AttachCGroupSysctl},
-		"cgroup/getsockopt":  {CGroupSockopt, AttachCGroupGetsockopt},
-		"cgroup/setsockopt":  {CGroupSockopt, AttachCGroupSetsockopt},
-		"classifier":         {SchedCLS, AttachNone},
-		"action":             {SchedACT, AttachNone},
+		"socket":                {SocketFilter, AttachNone, 0},
+		"seccomp":               {SocketFilter, AttachNone, 0},
+		"kprobe/":               {Kprobe, AttachNone, 0},
+		"uprobe/":               {Kprobe, AttachNone, 0},
+		"kretprobe/":            {Kprobe, AttachNone, 0},
+		"uretprobe/":            {Kprobe, AttachNone, 0},
+		"tracepoint/":           {TracePoint, AttachNone, 0},
+		"raw_tracepoint/":       {RawTracepoint, AttachNone, 0},
+		"xdp":                   {XDP, AttachNone, 0},
+		"perf_event":            {PerfEvent, AttachNone, 0},
+		"lwt_in":                {LWTIn, AttachNone, 0},
+		"lwt_out":               {LWTOut, AttachNone, 0},
+		"lwt_xmit":              {LWTXmit, AttachNone, 0},
+		"lwt_seg6local":         {LWTSeg6Local, AttachNone, 0},
+		"sockops":               {SockOps, AttachCGroupSockOps, 0},
+		"sk_skb/stream_parser":  {SkSKB, AttachSkSKBStreamParser, 0},
+		"sk_skb/stream_verdict": {SkSKB, AttachSkSKBStreamParser, 0},
+		"sk_msg":                {SkMsg, AttachSkSKBStreamVerdict, 0},
+		"lirc_mode2":            {LircMode2, AttachLircMode2, 0},
+		"flow_dissector":        {FlowDissector, AttachFlowDissector, 0},
+		"iter/":                 {Tracing, AttachTraceIter, 0},
+		"fentry.s/":             {Tracing, AttachTraceFEntry, unix.BPF_F_SLEEPABLE},
+		"fmod_ret.s/":           {Tracing, AttachModifyReturn, unix.BPF_F_SLEEPABLE},
+		"fexit.s/":              {Tracing, AttachTraceFExit, unix.BPF_F_SLEEPABLE},
+		"sk_lookup/":            {SkLookup, AttachSkLookup, 0},
+		"lsm/":                  {LSM, AttachLSMMac, 0},
+		"lsm.s/":                {LSM, AttachLSMMac, unix.BPF_F_SLEEPABLE},
+
+		"cgroup_skb/ingress": {CGroupSKB, AttachCGroupInetIngress, 0},
+		"cgroup_skb/egress":  {CGroupSKB, AttachCGroupInetEgress, 0},
+		"cgroup/dev":         {CGroupDevice, AttachCGroupDevice, 0},
+		"cgroup/skb":         {CGroupSKB, AttachNone, 0},
+		"cgroup/sock":        {CGroupSock, AttachCGroupInetSockCreate, 0},
+		"cgroup/post_bind4":  {CGroupSock, AttachCGroupInet4PostBind, 0},
+		"cgroup/post_bind6":  {CGroupSock, AttachCGroupInet6PostBind, 0},
+		"cgroup/bind4":       {CGroupSockAddr, AttachCGroupInet4Bind, 0},
+		"cgroup/bind6":       {CGroupSockAddr, AttachCGroupInet6Bind, 0},
+		"cgroup/connect4":    {CGroupSockAddr, AttachCGroupInet4Connect, 0},
+		"cgroup/connect6":    {CGroupSockAddr, AttachCGroupInet6Connect, 0},
+		"cgroup/sendmsg4":    {CGroupSockAddr, AttachCGroupUDP4Sendmsg, 0},
+		"cgroup/sendmsg6":    {CGroupSockAddr, AttachCGroupUDP6Sendmsg, 0},
+		"cgroup/recvmsg4":    {CGroupSockAddr, AttachCGroupUDP4Recvmsg, 0},
+		"cgroup/recvmsg6":    {CGroupSockAddr, AttachCGroupUDP6Recvmsg, 0},
+		"cgroup/sysctl":      {CGroupSysctl, AttachCGroupSysctl, 0},
+		"cgroup/getsockopt":  {CGroupSockopt, AttachCGroupGetsockopt, 0},
+		"cgroup/setsockopt":  {CGroupSockopt, AttachCGroupSetsockopt, 0},
+		"classifier":         {SchedCLS, AttachNone, 0},
+		"action":             {SchedACT, AttachNone, 0},
 	}
 	}
 
 	for prefix, t := range types {
 		}
 		}
 
 		if !strings.HasSuffix(prefix, "/") {
+			return t.progType, t.attachType, t.progFlags, ""
 		}
 		}
 
+		return t.progType, t.attachType, t.progFlags, sectionName[len(prefix):]
 	}
 	}
 
+	return UnspecifiedProgram, AttachNone, 0, ""
 }
 }
 
-	result := make(map[elf.SectionIndex]map[uint64]elf.Symbol)
-	targets := make(map[elf.SectionIndex]bool)
-	for idx, sec := range sections {
-		rels := make(map[uint64]elf.Symbol)
-
-		if sec.Entsize < 16 {
-			return nil, nil, fmt.Errorf("section %s: relocations are less than 16 bytes", sec.Name)
-		}
-
-		r := sec.Open()
-		for off := uint64(0); off < sec.Size; off += sec.Entsize {
-			ent := io.LimitReader(r, int64(sec.Entsize))
+func (ec *elfCode) loadRelocations(sec *elf.Section, symbols []elf.Symbol) (map[uint64]elf.Symbol, error) {
+	rels := make(map[uint64]elf.Symbol)
 
-			var rel elf.Rel64
-			if binary.Read(ent, ec.ByteOrder, &rel) != nil {
-				return nil, nil, fmt.Errorf("can't parse relocation at offset %v", off)
-			}
-
-			symNo := int(elf.R_SYM64(rel.Info) - 1)
-			if symNo >= len(ec.symbols) {
-				return nil, nil, fmt.Errorf("relocation at offset %d: symbol %v doesnt exist", off, symNo)
-			}
-
-			symbol := ec.symbols[symNo]
-			targets[symbol.Section] = true
-			rels[rel.Off] = ec.symbols[symNo]
-		}
-
-		result[idx] = rels
+	if sec.Entsize < 16 {
+		return nil, fmt.Errorf("section %s: relocations are less than 16 bytes", sec.Name)
 	}
-	return result, targets, nil
-}
 
-func symbolsPerSection(symbols []elf.Symbol) map[elf.SectionIndex]map[uint64]elf.Symbol {
-	result := make(map[elf.SectionIndex]map[uint64]elf.Symbol)
-	for _, sym := range symbols {
-		switch elf.ST_TYPE(sym.Info) {
-		case elf.STT_NOTYPE:
-			// Older versions of LLVM doesn't tag
-			// symbols correctly.
-			break
-		case elf.STT_OBJECT:
-			break
-		case elf.STT_FUNC:
-			break
-		default:
-			continue
-		}
+	r := bufio.NewReader(sec.Open())
+	for off := uint64(0); off < sec.Size; off += sec.Entsize {
+		ent := io.LimitReader(r, int64(sec.Entsize))
 
-		if sym.Section == elf.SHN_UNDEF || sym.Section >= elf.SHN_LORESERVE {
-			continue
+		var rel elf.Rel64
+		if binary.Read(ent, ec.ByteOrder, &rel) != nil {
+			return nil, fmt.Errorf("can't parse relocation at offset %v", off)
 		}
 
-		if sym.Name == "" {
-			continue
+		symNo := int(elf.R_SYM64(rel.Info) - 1)
+		if symNo >= len(symbols) {
+			return nil, fmt.Errorf("offset %d: symbol %d doesn't exist", off, symNo)
 		}
 
-		idx := sym.Section
-		if _, ok := result[idx]; !ok {
-			result[idx] = make(map[uint64]elf.Symbol)
-		}
-		result[idx][sym.Value] = sym
+		symbol := symbols[symNo]
+		rels[rel.Off] = symbol
 	}
-	return result
+
+	return rels, nil
 }

+ 21 - 0
vendor/github.com/cilium/ebpf/elf_reader_fuzz.go

@@ -0,0 +1,21 @@
+// +build gofuzz
+
+// Use with https://github.com/dvyukov/go-fuzz
+
+package ebpf
+
+import "bytes"
+
+func FuzzLoadCollectionSpec(data []byte) int {
+	spec, err := LoadCollectionSpecFromReader(bytes.NewReader(data))
+	if err != nil {
+		if spec != nil {
+			panic("spec is not nil")
+		}
+		return 0
+	}
+	if spec == nil {
+		panic("spec is nil")
+	}
+	return 1
+}
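
Outside of fuzzing, the same entry point parses any compiled BPF object; a minimal sketch (the file name is a placeholder):

    f, err := os.Open("bpf_program.o")
    if err != nil {
    	panic(err)
    }
    defer f.Close()
    spec, err := ebpf.LoadCollectionSpecFromReader(f)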

+ 6 - 0
vendor/github.com/cilium/ebpf/examples/README.md

@@ -0,0 +1,6 @@
+# eBPF Examples
+
+- [kprobe](kprobe/) - Attach a program to the entry or exit of an arbitrary kernel symbol (function).
+- [uprobe](uprobe/) - Like a kprobe, but for symbols in userspace binaries (e.g. `bash`).
+- [tracepoint](tracepoint/) - Attach a program to predetermined kernel tracepoints.
+- Add your use case(s) here!
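
For orientation, the kprobe example boils down to loading a collection and attaching one of its programs. A rough sketch assuming the link.Kprobe helper from later releases of this library (its signature has changed across versions, and the program and symbol names below are placeholders):

    coll, err := ebpf.LoadCollection("kprobe_example.o")
    if err != nil {
    	panic(err)
    }
    defer coll.Close()
    kp, err := link.Kprobe("sys_execve", coll.Programs["kprobe_execve"], nil)
    if err != nil {
    	panic(err)
    }
    defer kp.Close()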

+ 9 - 0
vendor/github.com/cilium/ebpf/examples/go.mod

@@ -0,0 +1,9 @@
+module github.com/cilium/ebpf/examples
+
+go 1.15
+
+require (
+	github.com/cilium/ebpf v0.4.1-0.20210401155455-cb5b8b6084b4 // indirect
+	github.com/elastic/go-perf v0.0.0-20191212140718-9c656876f595
+	golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c
+)

+ 3265 - 0
vendor/github.com/cilium/ebpf/examples/headers/bpf_helper_defs.h

@@ -0,0 +1,3265 @@
+/* This is an auto-generated file. See bpf_helpers_doc.py for details. */
+
+/* Forward declarations of BPF structs */
+struct bpf_fib_lookup;
+struct bpf_sk_lookup;
+struct bpf_perf_event_data;
+struct bpf_perf_event_value;
+struct bpf_pidns_info;
+struct bpf_sock;
+struct bpf_sock_addr;
+struct bpf_sock_ops;
+struct bpf_sock_tuple;
+struct bpf_spin_lock;
+struct bpf_sysctl;
+struct bpf_tcp_sock;
+struct bpf_tunnel_key;
+struct bpf_xfrm_state;
+struct pt_regs;
+struct sk_reuseport_md;
+struct sockaddr;
+struct tcphdr;
+struct seq_file;
+struct tcp6_sock;
+struct tcp_sock;
+struct tcp_timewait_sock;
+struct tcp_request_sock;
+struct udp6_sock;
+struct task_struct;
+struct __sk_buff;
+struct sk_msg_md;
+struct xdp_md;
+
+/*
+ * bpf_map_lookup_elem
+ *
+ * 	Perform a lookup in *map* for an entry associated to *key*.
+ *
+ * Returns
+ * 	Map value associated to *key*, or **NULL** if no entry was
+ * 	found.
+ */
+static void *(*bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1;
+
+/*
+ * bpf_map_update_elem
+ *
+ * 	Add or update the value of the entry associated to *key* in
+ * 	*map* with *value*. *flags* is one of:
+ *
+ * 	**BPF_NOEXIST**
+ * 		The entry for *key* must not exist in the map.
+ * 	**BPF_EXIST**
+ * 		The entry for *key* must already exist in the map.
+ * 	**BPF_ANY**
+ * 		No condition on the existence of the entry for *key*.
+ *
+ * 	Flag value **BPF_NOEXIST** cannot be used for maps of types
+ * 	**BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY**  (all
+ * 	elements always exist), the helper would return an error.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_map_update_elem)(void *map, const void *key, const void *value, __u64 flags) = (void *) 2;
+
+/*
+ * bpf_map_delete_elem
+ *
+ * 	Delete entry with *key* from *map*.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_map_delete_elem)(void *map, const void *key) = (void *) 3;
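
The userspace counterparts of these three helpers in this package are Map.Lookup, Map.Put and Map.Delete. A minimal sketch against an already-open *ebpf.Map m with 4-byte keys and 8-byte values (an assumed layout):

    var value uint64
    if err := m.Lookup(uint32(0), &value); err != nil {
    	// ebpf.ErrKeyNotExist signals that no entry was found.
    }
    if err := m.Put(uint32(0), value+1); err != nil {
    	panic(err)
    }
    if err := m.Delete(uint32(0)); err != nil {
    	panic(err)
    }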
+
+/*
+ * bpf_probe_read
+ *
+ * 	For tracing programs, safely attempt to read *size* bytes from
+ * 	kernel space address *unsafe_ptr* and store the data in *dst*.
+ *
+ * 	Generally, use **bpf_probe_read_user**\ () or
+ * 	**bpf_probe_read_kernel**\ () instead.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_probe_read)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 4;
+
+/*
+ * bpf_ktime_get_ns
+ *
+ * 	Return the time elapsed since system boot, in nanoseconds.
+ * 	Does not include time the system was suspended.
+ * 	See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
+ *
+ * Returns
+ * 	Current *ktime*.
+ */
+static __u64 (*bpf_ktime_get_ns)(void) = (void *) 5;
+
+/*
+ * bpf_trace_printk
+ *
+ * 	This helper is a "printk()-like" facility for debugging. It
+ * 	prints a message defined by format *fmt* (of size *fmt_size*)
+ * 	to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 	available. It can take up to three additional **u64**
+ * 	arguments (as with all eBPF helpers, the total number of arguments is
+ * 	limited to five).
+ *
+ * 	Each time the helper is called, it appends a line to the trace.
+ * 	Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
+ * 	open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
+ * 	The format of the trace is customizable, and the exact output
+ * 	one will get depends on the options set in
+ * 	*\/sys/kernel/debug/tracing/trace_options* (see also the
+ * 	*README* file under the same directory). However, it usually
+ * 	defaults to something like:
+ *
+ * 	::
+ *
+ * 		telnet-470   [001] .N.. 419421.045894: 0x00000001: <formatted msg>
+ *
+ * 	In the above:
+ *
+ * 		* ``telnet`` is the name of the current task.
+ * 		* ``470`` is the PID of the current task.
+ * 		* ``001`` is the CPU number on which the task is
+ * 		  running.
+ * 		* In ``.N..``, each character refers to a set of
+ * 		  options (whether irqs are enabled, scheduling
+ * 		  options, whether hard/softirqs are running, level of
+ * 		  preempt_disabled respectively). **N** means that
+ * 		  **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
+ * 		  are set.
+ * 		* ``419421.045894`` is a timestamp.
+ * 		* ``0x00000001`` is a fake value used by BPF for the
+ * 		  instruction pointer register.
+ * 		* ``<formatted msg>`` is the message formatted with
+ * 		  *fmt*.
+ *
+ * 	The conversion specifiers supported by *fmt* are similar, but
+ * 	more limited than for printk(). They are **%d**, **%i**,
+ * 	**%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
+ * 	**%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
+ * 	of field, padding with zeroes, etc.) is available, and the
+ * 	helper will return **-EINVAL** (but print nothing) if it
+ * 	encounters an unknown specifier.
+ *
+ * 	Also, note that **bpf_trace_printk**\ () is slow, and should
+ * 	only be used for debugging purposes. For this reason, a notice
+ * 	block (spanning several lines) is printed to kernel logs and
+ * 	states that the helper should not be used "for production use"
+ * 	the first time this helper is used (or more precisely, when
+ * 	**trace_printk**\ () buffers are allocated). For passing values
+ * 	to user space, perf events should be preferred.
+ *
+ * Returns
+ * 	The number of bytes written to the buffer, or a negative error
+ * 	in case of failure.
+ */
+static long (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) = (void *) 6;
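
The messages land in the tracing pipe described above; a tiny Go reader for them (assumes debugfs is mounted at the usual path and typically requires root):

    f, err := os.Open("/sys/kernel/debug/tracing/trace_pipe")
    if err != nil {
    	panic(err)
    }
    defer f.Close()
    sc := bufio.NewScanner(f)
    for sc.Scan() {
    	fmt.Println(sc.Text()) // one line per bpf_trace_printk call
    }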
+
+/*
+ * bpf_get_prandom_u32
+ *
+ * 	Get a pseudo-random number.
+ *
+ * 	From a security point of view, this helper uses its own
+ * 	pseudo-random internal state, and cannot be used to infer the
+ * 	seed of other random functions in the kernel. However, it is
+ * 	essential to note that the generator used by the helper is not
+ * 	cryptographically secure.
+ *
+ * Returns
+ * 	A random 32-bit unsigned value.
+ */
+static __u32 (*bpf_get_prandom_u32)(void) = (void *) 7;
+
+/*
+ * bpf_get_smp_processor_id
+ *
+ * 	Get the SMP (symmetric multiprocessing) processor id. Note that
+ * 	all programs run with preemption disabled, which means that the
+ * 	SMP processor id is stable during all the execution of the
+ * 	program.
+ *
+ * Returns
+ * 	The SMP id of the processor running the program.
+ */
+static __u32 (*bpf_get_smp_processor_id)(void) = (void *) 8;
+
+/*
+ * bpf_skb_store_bytes
+ *
+ * 	Store *len* bytes from address *from* into the packet
+ * 	associated to *skb*, at *offset*. *flags* are a combination of
+ * 	**BPF_F_RECOMPUTE_CSUM** (automatically recompute the
+ * 	checksum for the packet after storing the bytes) and
+ * 	**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ * 	**->swhash** and *skb*\ **->l4hash** to 0).
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len, __u64 flags) = (void *) 9;
+
+/*
+ * bpf_l3_csum_replace
+ *
+ * 	Recompute the layer 3 (e.g. IP) checksum for the packet
+ * 	associated to *skb*. Computation is incremental, so the helper
+ * 	must know the former value of the header field that was
+ * 	modified (*from*), the new value of this field (*to*), and the
+ * 	number of bytes (2 or 4) for this field, stored in *size*.
+ * 	Alternatively, it is possible to store the difference between
+ * 	the previous and the new values of the header field in *to*, by
+ * 	setting *from* and *size* to 0. For both methods, *offset*
+ * 	indicates the location of the IP checksum within the packet.
+ *
+ * 	This helper works in combination with **bpf_csum_diff**\ (),
+ * 	which does not update the checksum in-place, but offers more
+ * 	flexibility and can handle sizes larger than 2 or 4 for the
+ * 	checksum to update.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_l3_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 size) = (void *) 10;
+
+/*
+ * bpf_l4_csum_replace
+ *
+ * 	Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
+ * 	packet associated to *skb*. Computation is incremental, so the
+ * 	helper must know the former value of the header field that was
+ * 	modified (*from*), the new value of this field (*to*), and the
+ * 	number of bytes (2 or 4) for this field, stored on the lowest
+ * 	four bits of *flags*. Alternatively, it is possible to store
+ * 	the difference between the previous and the new values of the
+ * 	header field in *to*, by setting *from* and the four lowest
+ * 	bits of *flags* to 0. For both methods, *offset* indicates the
+ * 	location of the IP checksum within the packet. In addition to
+ * 	the size of the field, *flags* can be added (bitwise OR) actual
+ * 	flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left
+ * 	untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
+ * 	for updates resulting in a null checksum the value is set to
+ * 	**CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
+ * 	the checksum is to be computed against a pseudo-header.
+ *
+ * 	This helper works in combination with **bpf_csum_diff**\ (),
+ * 	which does not update the checksum in-place, but offers more
+ * 	flexibility and can handle sizes larger than 2 or 4 for the
+ * 	checksum to update.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_l4_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 flags) = (void *) 11;
+
+/*
+ * bpf_tail_call
+ *
+ * 	This special helper is used to trigger a "tail call", or in
+ * 	other words, to jump into another eBPF program. The same stack
+ * 	frame is used (but values on stack and in registers for the
+ * 	caller are not accessible to the callee). This mechanism allows
+ * 	for program chaining, either for raising the maximum number of
+ * 	available eBPF instructions, or to execute given programs in
+ * 	conditional blocks. For security reasons, there is an upper
+ * 	limit to the number of successive tail calls that can be
+ * 	performed.
+ *
+ * 	Upon call of this helper, the program attempts to jump into a
+ * 	program referenced at index *index* in *prog_array_map*, a
+ * 	special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
+ * 	*ctx*, a pointer to the context.
+ *
+ * 	If the call succeeds, the kernel immediately runs the first
+ * 	instruction of the new program. This is not a function call,
+ * 	and it never returns to the previous program. If the call
+ * 	fails, then the helper has no effect, and the caller continues
+ * 	to run its subsequent instructions. A call can fail if the
+ * 	destination program for the jump does not exist (i.e. *index*
+ * 	is greater than or equal to the number of entries in
+ * 	*prog_array_map*), or
+ * 	if the maximum number of tail calls has been reached for this
+ * 	chain of programs. This limit is defined in the kernel by the
+ * 	macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
+ * 	which is currently set to 32.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) = (void *) 12;
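
The prog array consulted by this helper is filled from userspace; with this package a loaded *ebpf.Program can be written straight into a ProgramArray map. A minimal sketch (tailProg stands for an already-loaded program):

    progArray, err := ebpf.NewMap(&ebpf.MapSpec{
    	Type:       ebpf.ProgramArray,
    	KeySize:    4,
    	ValueSize:  4,
    	MaxEntries: 8,
    })
    if err != nil {
    	panic(err)
    }
    // Index 0 becomes a valid bpf_tail_call target.
    if err := progArray.Put(uint32(0), tailProg); err != nil {
    	panic(err)
    }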
+
+/*
+ * bpf_clone_redirect
+ *
+ * 	Clone and redirect the packet associated to *skb* to another
+ * 	net device of index *ifindex*. Both ingress and egress
+ * 	interfaces can be used for redirection. The **BPF_F_INGRESS**
+ * 	value in *flags* is used to make the distinction (ingress path
+ * 	is selected if the flag is present, egress path otherwise).
+ * 	This is the only flag supported for now.
+ *
+ * 	In comparison with **bpf_redirect**\ () helper,
+ * 	**bpf_clone_redirect**\ () has the associated cost of
+ * 	duplicating the packet buffer, but this can be executed out of
+ * 	the eBPF program. Conversely, **bpf_redirect**\ () is more
+ * 	efficient, but it is handled through an action code where the
+ * 	redirection happens only after the eBPF program has returned.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_clone_redirect)(struct __sk_buff *skb, __u32 ifindex, __u64 flags) = (void *) 13;
+
+/*
+ * bpf_get_current_pid_tgid
+ *
+ *
+ * Returns
+ * 	A 64-bit integer containing the current tgid and pid, and
+ * 	created as such:
+ * 	*current_task*\ **->tgid << 32 \|**
+ * 	*current_task*\ **->pid**.
+ */
+static __u64 (*bpf_get_current_pid_tgid)(void) = (void *) 14;
+
+/*
+ * bpf_get_current_uid_gid
+ *
+ *
+ * Returns
+ * 	A 64-bit integer containing the current GID and UID, and
+ * 	created as such: *current_gid* **<< 32 \|** *current_uid*.
+ */
+static __u64 (*bpf_get_current_uid_gid)(void) = (void *) 15;
+
+/*
+ * bpf_get_current_comm
+ *
+ * 	Copy the **comm** attribute of the current task into *buf* of
+ * 	*size_of_buf*. The **comm** attribute contains the name of
+ * 	the executable (excluding the path) for the current task. The
+ * 	*size_of_buf* must be strictly positive. On success, the
+ * 	helper makes sure that the *buf* is NUL-terminated. On failure,
+ * 	it is filled with zeroes.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_get_current_comm)(void *buf, __u32 size_of_buf) = (void *) 16;
+
+/*
+ * bpf_get_cgroup_classid
+ *
+ * 	Retrieve the classid for the current task, i.e. for the net_cls
+ * 	cgroup to which *skb* belongs.
+ *
+ * 	This helper can be used on TC egress path, but not on ingress.
+ *
+ * 	The net_cls cgroup provides an interface to tag network packets
+ * 	based on a user-provided identifier for all traffic coming from
+ * 	the tasks belonging to the related cgroup. See also the related
+ * 	kernel documentation, available from the Linux sources in file
+ * 	*Documentation/admin-guide/cgroup-v1/net_cls.rst*.
+ *
+ * 	The Linux kernel has two versions for cgroups: there are
+ * 	cgroups v1 and cgroups v2. Both are available to users, who can
+ * 	use a mixture of them, but note that the net_cls cgroup is for
+ * 	cgroup v1 only. This makes it incompatible with BPF programs
+ * 	run on cgroups, which is a cgroup-v2-only feature (a socket can
+ * 	only hold data for one version of cgroups at a time).
+ *
+ * 	This helper is only available if the kernel was compiled with
+ * 	the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
+ * 	"**y**" or to "**m**".
+ *
+ * Returns
+ * 	The classid, or 0 for the default unconfigured classid.
+ */
+static __u32 (*bpf_get_cgroup_classid)(struct __sk_buff *skb) = (void *) 17;
+
+/*
+ * bpf_skb_vlan_push
+ *
+ * 	Push a *vlan_tci* (VLAN tag control information) of protocol
+ * 	*vlan_proto* to the packet associated to *skb*, then update
+ * 	the checksum. Note that if *vlan_proto* is different from
+ * 	**ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ * 	be **ETH_P_8021Q**.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_vlan_push)(struct __sk_buff *skb, __be16 vlan_proto, __u16 vlan_tci) = (void *) 18;
+
+/*
+ * bpf_skb_vlan_pop
+ *
+ * 	Pop a VLAN header from the packet associated to *skb*.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_vlan_pop)(struct __sk_buff *skb) = (void *) 19;
+
+/*
+ * bpf_skb_get_tunnel_key
+ *
+ * 	Get tunnel metadata. This helper takes a pointer *key* to an
+ * 	empty **struct bpf_tunnel_key** of **size**, that will be
+ * 	filled with tunnel metadata for the packet associated to *skb*.
+ * 	The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
+ * 	indicates that the tunnel is based on IPv6 protocol instead of
+ * 	IPv4.
+ *
+ * 	The **struct bpf_tunnel_key** is an object that generalizes the
+ * 	principal parameters used by various tunneling protocols into a
+ * 	single struct. This way, it can be used to easily make a
+ * 	decision based on the contents of the encapsulation header,
+ * 	"summarized" in this struct. In particular, it holds the IP
+ * 	address of the remote end (IPv4 or IPv6, depending on the case)
+ * 	in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
+ * 	this struct exposes the *key*\ **->tunnel_id**, which is
+ * 	generally mapped to a VNI (Virtual Network Identifier), making
+ * 	it programmable together with the **bpf_skb_set_tunnel_key**\
+ * 	() helper.
+ *
+ * 	Let's imagine that the following code is part of a program
+ * 	attached to the TC ingress interface, on one end of a GRE
+ * 	tunnel, and is supposed to filter out all messages coming from
+ * 	remote ends with IPv4 address other than 10.0.0.1:
+ *
+ * 	::
+ *
+ * 		int ret;
+ * 		struct bpf_tunnel_key key = {};
+ * 		
+ * 		ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ * 		if (ret < 0)
+ * 			return TC_ACT_SHOT;	// drop packet
+ * 		
+ * 		if (key.remote_ipv4 != 0x0a000001)
+ * 			return TC_ACT_SHOT;	// drop packet
+ * 		
+ * 		return TC_ACT_OK;		// accept packet
+ *
+ * 	This interface can also be used with all encapsulation devices
+ * 	that can operate in "collect metadata" mode: instead of having
+ * 	one network device per specific configuration, the "collect
+ * 	metadata" mode only requires a single device where the
+ * 	configuration can be extracted from this helper.
+ *
+ * 	This can be used together with various tunnels such as VXLan,
+ * 	Geneve, GRE or IP in IP (IPIP).
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_get_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 20;
+
+/*
+ * bpf_skb_set_tunnel_key
+ *
+ * 	Populate tunnel metadata for packet associated to *skb.* The
+ * 	tunnel metadata is set to the contents of *key*, of *size*. The
+ * 	*flags* can be set to a combination of the following values:
+ *
+ * 	**BPF_F_TUNINFO_IPV6**
+ * 		Indicate that the tunnel is based on IPv6 protocol
+ * 		instead of IPv4.
+ * 	**BPF_F_ZERO_CSUM_TX**
+ * 		For IPv4 packets, add a flag to tunnel metadata
+ * 		indicating that checksum computation should be skipped
+ * 		and checksum set to zeroes.
+ * 	**BPF_F_DONT_FRAGMENT**
+ * 		Add a flag to tunnel metadata indicating that the
+ * 		packet should not be fragmented.
+ * 	**BPF_F_SEQ_NUMBER**
+ * 		Add a flag to tunnel metadata indicating that a
+ * 		sequence number should be added to tunnel header before
+ * 		sending the packet. This flag was added for GRE
+ * 		encapsulation, but might be used with other protocols
+ * 		as well in the future.
+ *
+ * 	Here is a typical usage on the transmit path:
+ *
+ * 	::
+ *
+ * 		struct bpf_tunnel_key key;
+ * 		     populate key ...
+ * 		bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ * 		bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
+ *
+ * 	See also the description of the **bpf_skb_get_tunnel_key**\ ()
+ * 	helper for additional information.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_set_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 21;
+
+/*
+ * bpf_perf_event_read
+ *
+ * 	Read the value of a perf event counter. This helper relies on a
+ * 	*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
+ * 	the perf event counter is selected when *map* is updated with
+ * 	perf event file descriptors. The *map* is an array whose size
+ * 	is the number of available CPUs, and each cell contains a value
+ * 	relative to one CPU. The value to retrieve is indicated by
+ * 	*flags*, that contains the index of the CPU to look up, masked
+ * 	with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 	**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 	current CPU should be retrieved.
+ *
+ * 	Note that before Linux 4.13, only hardware perf events can be
+ * 	retrieved.
+ *
+ * 	Also, be aware that the newer helper
+ * 	**bpf_perf_event_read_value**\ () is recommended over
+ * 	**bpf_perf_event_read**\ () in general. The latter has some ABI
+ * 	quirks where error and counter value are used as a return code
+ * 	(which is wrong to do since ranges may overlap). This issue is
+ * 	fixed with **bpf_perf_event_read_value**\ (), which at the same
+ * 	time provides more features over the **bpf_perf_event_read**\
+ * 	() interface. Please refer to the description of
+ * 	**bpf_perf_event_read_value**\ () for details.
+ *
+ * Returns
+ * 	The value of the perf event counter read from the map, or a
+ * 	negative error code in case of failure.
+ */
+static __u64 (*bpf_perf_event_read)(void *map, __u64 flags) = (void *) 22;
+
+/*
+ * bpf_redirect
+ *
+ * 	Redirect the packet to another net device of index *ifindex*.
+ * 	This helper is somewhat similar to **bpf_clone_redirect**\
+ * 	(), except that the packet is not cloned, which provides
+ * 	increased performance.
+ *
+ * 	Except for XDP, both ingress and egress interfaces can be used
+ * 	for redirection. The **BPF_F_INGRESS** value in *flags* is used
+ * 	to make the distinction (ingress path is selected if the flag
+ * 	is present, egress path otherwise). Currently, XDP only
+ * 	supports redirection to the egress interface, and accepts no
+ * 	flag at all.
+ *
+ * 	The same effect can also be attained with the more generic
+ * 	**bpf_redirect_map**\ (), which uses a BPF map to store the
+ * 	redirect target instead of providing it directly to the helper.
+ *
+ * Returns
+ * 	For XDP, the helper returns **XDP_REDIRECT** on success or
+ * 	**XDP_ABORTED** on error. For other program types, the values
+ * 	are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ * 	error.
+ */
+static long (*bpf_redirect)(__u32 ifindex, __u64 flags) = (void *) 23;
+
+/*
+ * bpf_get_route_realm
+ *
+ * 	Retrieve the realm of the route, that is to say the
+ * 	**tclassid** field of the destination for the *skb*. The
+ * 	identifier retrieved is a user-provided tag, similar to the
+ * 	one used with the net_cls cgroup (see description for
+ * 	**bpf_get_cgroup_classid**\ () helper), but here this tag is
+ * 	held by a route (a destination entry), not by a task.
+ *
+ * 	Retrieving this identifier works with the clsact TC egress hook
+ * 	(see also **tc-bpf(8)**), or alternatively on conventional
+ * 	classful egress qdiscs, but not on TC ingress path. In case of
+ * 	clsact TC egress hook, this has the advantage that, internally,
+ * 	the destination entry has not been dropped yet in the transmit
+ * 	path. Therefore, the destination entry does not need to be
+ * 	artificially held via **netif_keep_dst**\ () for a classful
+ * 	qdisc until the *skb* is freed.
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	**CONFIG_IP_ROUTE_CLASSID** configuration option.
+ *
+ * Returns
+ * 	The realm of the route for the packet associated to *skb*, or 0
+ * 	if none was found.
+ */
+static __u32 (*bpf_get_route_realm)(struct __sk_buff *skb) = (void *) 24;
+
+/*
+ * bpf_perf_event_output
+ *
+ * 	Write raw *data* blob into a special BPF perf event held by
+ * 	*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 	event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 	as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 	**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 	The *flags* are used to indicate the index in *map* for which
+ * 	the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 	Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 	to indicate that the index of the current CPU core should be
+ * 	used.
+ *
+ * 	The value to write, of *size*, is passed through eBPF stack and
+ * 	pointed by *data*.
+ *
+ * 	The context of the program *ctx* needs also be passed to the
+ * 	helper.
+ *
+ * 	On user space, a program willing to read the values needs to
+ * 	call **perf_event_open**\ () on the perf event (either for
+ * 	one or for all CPUs) and to store the file descriptor into the
+ * 	*map*. This must be done before the eBPF program can send data
+ * 	into it. An example is available in file
+ * 	*samples/bpf/trace_output_user.c* in the Linux kernel source
+ * 	tree (the eBPF program counterpart is in
+ * 	*samples/bpf/trace_output_kern.c*).
+ *
+ * 	**bpf_perf_event_output**\ () achieves better performance
+ * 	than **bpf_trace_printk**\ () for sharing data with user
+ * 	space, and is much better suited for streaming data from eBPF
+ * 	programs.
+ *
+ * 	Note that this helper is not restricted to tracing use cases
+ * 	and can be used with programs attached to TC or XDP as well,
+ * 	where it allows for passing data to user space listeners. Data
+ * 	can be:
+ *
+ * 	* Only custom structs,
+ * 	* Only the packet payload, or
+ * 	* A combination of both.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_perf_event_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 25;
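
On the Go side, this library's perf subpackage consumes these events; a minimal sketch, assuming eventsMap is the PerfEventArray map from a loaded collection:

    // import "github.com/cilium/ebpf/perf"
    rd, err := perf.NewReader(eventsMap, os.Getpagesize())
    if err != nil {
    	panic(err)
    }
    defer rd.Close()
    for {
    	rec, err := rd.Read()
    	if err != nil {
    		return
    	}
    	_ = rec.RawSample // raw bytes written by bpf_perf_event_output
    }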
+
+/*
+ * bpf_skb_load_bytes
+ *
+ * 	This helper was provided as an easy way to load data from a
+ * 	packet. It can be used to load *len* bytes from *offset* from
+ * 	the packet associated to *skb*, into the buffer pointed by
+ * 	*to*.
+ *
+ * 	Since Linux 4.7, usage of this helper has mostly been replaced
+ * 	by "direct packet access", enabling packet data to be
+ * 	manipulated with *skb*\ **->data** and *skb*\ **->data_end**
+ * 	pointing respectively to the first byte of packet data and to
+ * 	the byte after the last byte of packet data. However, it
+ * 	remains useful if one wishes to read large quantities of data
+ * 	at once from a packet into the eBPF stack.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_load_bytes)(const void *skb, __u32 offset, void *to, __u32 len) = (void *) 26;
+
+/*
+ * bpf_get_stackid
+ *
+ * 	Walk a user or a kernel stack and return its id. To achieve
+ * 	this, the helper needs *ctx*, which is a pointer to the context
+ * 	on which the tracing program is executed, and a pointer to a
+ * 	*map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ * 	The last argument, *flags*, holds the number of stack frames to
+ * 	skip (from 0 to 255), masked with
+ * 	**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 	a combination of the following flags:
+ *
+ * 	**BPF_F_USER_STACK**
+ * 		Collect a user space stack instead of a kernel stack.
+ * 	**BPF_F_FAST_STACK_CMP**
+ * 		Compare stacks by hash only.
+ * 	**BPF_F_REUSE_STACKID**
+ * 		If two different stacks hash into the same *stackid*,
+ * 		discard the old one.
+ *
+ * 	The stack id retrieved is a 32 bit long integer handle which
+ * 	can be further combined with other data (including other stack
+ * 	ids) and used as a key into maps. This can be useful for
+ * 	generating a variety of graphs (such as flame graphs or off-cpu
+ * 	graphs).
+ *
+ * 	For walking a stack, this helper is an improvement over
+ * 	**bpf_probe_read**\ (), which can be used with unrolled loops
+ * 	but is not efficient and consumes a lot of eBPF instructions.
+ * 	Instead, **bpf_get_stackid**\ () can collect both kernel and
+ * 	user frames, up to **PERF_MAX_STACK_DEPTH**. Note that
+ * 	this limit can be controlled with the **sysctl** program, and
+ * 	that it should be manually increased in order to profile long
+ * 	user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 	::
+ *
+ * 		# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * Returns
+ * 	The positive or null stack id on success, or a negative error
+ * 	in case of failure.
+ */
+static long (*bpf_get_stackid)(void *ctx, void *map, __u64 flags) = (void *) 27;
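+
+/*
+ * A usage sketch for the helper above: record the kernel stack at a
+ * scheduler kprobe, assuming a map named "stack_traces" of type
+ * BPF_MAP_TYPE_STACK_TRACE is defined elsewhere. The returned id can
+ * then serve as a key into other maps (e.g. to count occurrences):
+ *
+ * 	SEC("kprobe/finish_task_switch")
+ * 	int record(struct pt_regs *ctx)
+ * 	{
+ * 		long id = bpf_get_stackid(ctx, &stack_traces, 0);
+ *
+ * 		if (id < 0)
+ * 			return 0;
+ * 		// id identifies this kernel stack in stack_traces.
+ * 		return 0;
+ * 	}
+ */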
+
+/*
+ * bpf_csum_diff
+ *
+ * 	Compute a checksum difference, from the raw buffer pointed by
+ * 	*from*, of length *from_size* (that must be a multiple of 4),
+ * 	towards the raw buffer pointed by *to*, of size *to_size*
+ * 	(same remark). An optional *seed* can be added to the value
+ * 	(this can be cascaded, the seed may come from a previous call
+ * 	to the helper).
+ *
+ * 	This is flexible enough to be used in several ways:
+ *
+ * 	* With *from_size* == 0, *to_size* > 0 and *seed* set to
+ * 	  checksum, it can be used when pushing new data.
+ * 	* With *from_size* > 0, *to_size* == 0 and *seed* set to
+ * 	  checksum, it can be used when removing data from a packet.
+ * 	* With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ * 	  can be used to compute a diff. Note that *from_size* and
+ * 	  *to_size* do not need to be equal.
+ *
+ * 	This helper can be used in combination with
+ * 	**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
+ * 	which one can feed in the difference computed with
+ * 	**bpf_csum_diff**\ ().
+ *
+ * Returns
+ * 	The checksum result, or a negative error code in case of
+ * 	failure.
+ */
+static __s64 (*bpf_csum_diff)(__be32 *from, __u32 from_size, __be32 *to, __u32 to_size, __wsum seed) = (void *) 28;
+
+/*
+ * bpf_skb_get_tunnel_opt
+ *
+ * 	Retrieve tunnel options metadata for the packet associated to
+ * 	*skb*, and store the raw tunnel option data to the buffer *opt*
+ * 	of *size*.
+ *
+ * 	This helper can be used with encapsulation devices that can
+ * 	operate in "collect metadata" mode (please refer to the related
+ * 	note in the description of **bpf_skb_get_tunnel_key**\ () for
+ * 	more details). A particular example where this can be used is
+ * 	in combination with the Geneve encapsulation protocol, where it
+ * 	allows for pushing (with the **bpf_skb_set_tunnel_opt**\ () helper)
+ * 	and retrieving arbitrary TLVs (Type-Length-Value headers) from
+ * 	the eBPF program. This allows for full customization of these
+ * 	headers.
+ *
+ * Returns
+ * 	The size of the option data retrieved.
+ */
+static long (*bpf_skb_get_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 29;
+
+/*
+ * bpf_skb_set_tunnel_opt
+ *
+ * 	Set tunnel options metadata for the packet associated to *skb*
+ * 	to the option data contained in the raw buffer *opt* of *size*.
+ *
+ * 	See also the description of the **bpf_skb_get_tunnel_opt**\ ()
+ * 	helper for additional information.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_set_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 30;
+
+/*
+ * bpf_skb_change_proto
+ *
+ * 	Change the protocol of the *skb* to *proto*. Currently
+ * 	supported are transitions from IPv4 to IPv6, and from IPv6 to
+ * 	IPv4. The helper takes care of the groundwork for the
+ * 	transition, including resizing the socket buffer. The eBPF
+ * 	program is expected to fill the new headers, if any, via
+ * 	**skb_store_bytes**\ () and to recompute the checksums with
+ * 	**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
+ * 	(). The main case for this helper is to perform NAT64
+ * 	operations out of an eBPF program.
+ *
+ * 	Internally, the GSO type is marked as dodgy so that headers are
+ * 	checked and segments are recalculated by the GSO/GRO engine.
+ * 	The size for GSO target is adapted as well.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_change_proto)(struct __sk_buff *skb, __be16 proto, __u64 flags) = (void *) 31;
+
+/*
+ * bpf_skb_change_type
+ *
+ * 	Change the packet type for the packet associated to *skb*. This
+ * 	comes down to setting *skb*\ **->pkt_type** to *type*, except
+ * 	the eBPF program does not have a write access to *skb*\
+ * 	**->pkt_type** beside this helper. Using a helper here allows
+ * 	for graceful handling of errors.
+ *
+ * 	The major use case is to change incoming *skb*s to
+ * 	**PACKET_HOST** in a programmatic way instead of having to
+ * 	recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
+ * 	example.
+ *
+ * 	Note that *type* only allows certain values. At this time, they
+ * 	are:
+ *
+ * 	**PACKET_HOST**
+ * 		Packet is for us.
+ * 	**PACKET_BROADCAST**
+ * 		Send packet to all.
+ * 	**PACKET_MULTICAST**
+ * 		Send packet to group.
+ * 	**PACKET_OTHERHOST**
+ * 		Send packet to someone else.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_change_type)(struct __sk_buff *skb, __u32 type) = (void *) 32;
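+
+/*
+ * A usage sketch for the helper above: a TC ingress classifier that
+ * re-marks frames as destined to the local host. PACKET_HOST comes
+ * from <linux/if_packet.h>:
+ *
+ * 	SEC("classifier")
+ * 	int to_host(struct __sk_buff *skb)
+ * 	{
+ * 		bpf_skb_change_type(skb, PACKET_HOST);
+ * 		return 0;	// TC_ACT_OK
+ * 	}
+ */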
+
+/*
+ * bpf_skb_under_cgroup
+ *
+ * 	Check whether *skb* is a descendant of the cgroup2 held by
+ * 	*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ *
+ * Returns
+ * 	The return value depends on the result of the test, and can be:
+ *
+ * 	* 0, if the *skb* failed the cgroup2 descendant test.
+ * 	* 1, if the *skb* succeeded the cgroup2 descendant test.
+ * 	* A negative error code, if an error occurred.
+ */
+static long (*bpf_skb_under_cgroup)(struct __sk_buff *skb, void *map, __u32 index) = (void *) 33;
+
+/*
+ * bpf_get_hash_recalc
+ *
+ * 	Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ * 	not set, in particular if the hash was cleared due to mangling,
+ * 	recompute this hash. Later accesses to the hash can be done
+ * 	directly with *skb*\ **->hash**.
+ *
+ * 	Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * 	prototype with **bpf_skb_change_proto**\ (), or calling
+ * 	**bpf_skb_store_bytes**\ () with the
+ * 	**BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * 	the hash and to trigger a new computation for the next call to
+ * 	**bpf_get_hash_recalc**\ ().
+ *
+ * Returns
+ * 	The 32-bit hash.
+ */
+static __u32 (*bpf_get_hash_recalc)(struct __sk_buff *skb) = (void *) 34;
+
+/*
+ * bpf_get_current_task
+ *
+ *
+ * Returns
+ * 	A pointer to the current task struct.
+ */
+static __u64 (*bpf_get_current_task)(void) = (void *) 35;
+
+/*
+ * bpf_probe_write_user
+ *
+ * 	Attempt in a safe way to write *len* bytes from the buffer
+ * 	*src* to *dst* in memory. It only works for threads that are in
+ * 	user context, and *dst* must be a valid user space address.
+ *
+ * 	This helper should not be used to implement any kind of
+ * 	security mechanism because of TOC-TOU attacks, but rather to
+ * 	debug, divert, and manipulate execution of semi-cooperative
+ * 	processes.
+ *
+ * 	Keep in mind that this feature is meant for experiments, and it
+ * 	has a risk of crashing the system and running programs.
+ * 	Therefore, when an eBPF program using this helper is attached,
+ * 	a warning including PID and process name is printed to kernel
+ * 	logs.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_probe_write_user)(void *dst, const void *src, __u32 len) = (void *) 36;
+
+/*
+ * bpf_current_task_under_cgroup
+ *
+ * 	Check whether the probe is being run in the context of a given
+ * 	subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * 	*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ *
+ * Returns
+ * 	The return value depends on the result of the test, and can be:
+ *
+ * 	* 1, if current task belongs to the cgroup2.
+ * 	* 0, if current task does not belong to the cgroup2.
+ * 	* A negative error code, if an error occurred.
+ */
+static long (*bpf_current_task_under_cgroup)(void *map, __u32 index) = (void *) 37;
+
+/*
+ * bpf_skb_change_tail
+ *
+ * 	Resize (trim or grow) the packet associated to *skb* to the
+ * 	new *len*. The *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	The basic idea is that the helper performs the needed work to
+ * 	change the size of the packet, then the eBPF program rewrites
+ * 	the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * 	**bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ ()
+ * 	and others. This helper is a slow path utility intended for
+ * 	replies with control messages. And because it is targeted for
+ * 	slow path, the helper itself can afford to be slow: it
+ * 	implicitly linearizes, unclones and drops offloads from the
+ * 	*skb*.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_change_tail)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 38;
+
+/*
+ * bpf_skb_pull_data
+ *
+ * 	Pull in non-linear data in case the *skb* is non-linear and not
+ * 	all of *len* are part of the linear section. Make *len* bytes
+ * 	from *skb* readable and writable. If a zero value is passed for
+ * 	*len*, then the whole length of the *skb* is pulled.
+ *
+ * 	This helper is only needed for reading and writing with direct
+ * 	packet access.
+ *
+ * 	For direct packet access, testing that offsets to access
+ * 	are within packet boundaries (test on *skb*\ **->data_end**) is
+ * 	susceptible to fail if offsets are invalid, or if the requested
+ * 	data is in non-linear parts of the *skb*. On failure the
+ * 	program can just bail out, or in the case of a non-linear
+ * 	buffer, use a helper to make the data available. The
+ * 	**bpf_skb_load_bytes**\ () helper is a first solution to access
+ * 	the data. Another one consists in using **bpf_skb_pull_data**\ ()
+ * 	to pull in the non-linear parts once, then to retest and
+ * 	eventually access the data.
+ *
+ * 	At the same time, this also makes sure the *skb* is uncloned,
+ * 	which is a necessary condition for direct write. As this needs
+ * 	to be an invariant for the write part only, the verifier
+ * 	detects writes and adds a prologue that calls
+ * 	**bpf_skb_pull_data**\ () to effectively unclone the *skb* from
+ * 	the very beginning in case it is indeed cloned.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_pull_data)(struct __sk_buff *skb, __u32 len) = (void *) 39;
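+
+/*
+ * A usage sketch of the "pull, then retest" pattern described above,
+ * making the first 64 bytes of a possibly non-linear *skb* directly
+ * readable and writable from a TC program:
+ *
+ * 	SEC("classifier")
+ * 	int parse(struct __sk_buff *skb)
+ * 	{
+ * 		void *data, *data_end;
+ *
+ * 		if (bpf_skb_pull_data(skb, 64) < 0)
+ * 			return 0;
+ * 		// Reload the pointers: the pull may have moved the buffer.
+ * 		data = (void *)(long)skb->data;
+ * 		data_end = (void *)(long)skb->data_end;
+ * 		if (data + 64 > data_end)
+ * 			return 0;
+ * 		// The first 64 bytes are now accessible directly.
+ * 		return 0;
+ * 	}
+ */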
+
+/*
+ * bpf_csum_update
+ *
+ * 	Add the checksum *csum* into *skb*\ **->csum** in case the
+ * 	driver has supplied a checksum for the entire packet into that
+ * 	field. Return an error otherwise. This helper is intended to be
+ * 	used in combination with **bpf_csum_diff**\ (), in particular
+ * 	when the checksum needs to be updated after data has been
+ * 	written into the packet through direct packet access.
+ *
+ * Returns
+ * 	The checksum on success, or a negative error code in case of
+ * 	failure.
+ */
+static __s64 (*bpf_csum_update)(struct __sk_buff *skb, __wsum csum) = (void *) 40;
+
+/*
+ * bpf_set_hash_invalid
+ *
+ * 	Invalidate the current *skb*\ **->hash**. It can be used after
+ * 	mangling on headers through direct packet access, in order to
+ * 	indicate that the hash is outdated and to trigger a
+ * 	recalculation the next time the kernel tries to access this
+ * 	hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ *
+ */
+static void (*bpf_set_hash_invalid)(struct __sk_buff *skb) = (void *) 41;
+
+/*
+ * bpf_get_numa_node_id
+ *
+ * 	Return the id of the current NUMA node. The primary use case
+ * 	for this helper is the selection of sockets for the local NUMA
+ * 	node, when the program is attached to sockets using the
+ * 	**SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
+ * 	but the helper is also available to other eBPF program types,
+ * 	similarly to **bpf_get_smp_processor_id**\ ().
+ *
+ * Returns
+ * 	The id of current NUMA node.
+ */
+static long (*bpf_get_numa_node_id)(void) = (void *) 42;
+
+/*
+ * bpf_skb_change_head
+ *
+ * 	Grows headroom of packet associated to *skb* and adjusts the
+ * 	offset of the MAC header accordingly, adding *len* bytes of
+ * 	space. It automatically extends and reallocates memory as
+ * 	required.
+ *
+ * 	This helper can be used on a layer 3 *skb* to push a MAC header
+ * 	for redirection into a layer 2 device.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_change_head)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 43;
+
+/*
+ * bpf_xdp_adjust_head
+ *
+ * 	Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ * 	it is possible to use a negative value for *delta*. This helper
+ * 	can be used to prepare the packet for pushing or popping
+ * 	headers.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) 44;
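+
+/*
+ * A usage sketch for the helper above: pop an assumed 4-byte
+ * encapsulation header from the front of an XDP frame. A positive
+ * *delta* moves **data** forward; pointers must be reloaded and
+ * re-checked afterwards:
+ *
+ * 	SEC("xdp")
+ * 	int pop_hdr(struct xdp_md *ctx)
+ * 	{
+ * 		if (bpf_xdp_adjust_head(ctx, 4) < 0)
+ * 			return XDP_ABORTED;
+ * 		void *data = (void *)(long)ctx->data;
+ * 		void *data_end = (void *)(long)ctx->data_end;
+ * 		if (data + 14 > data_end)	// room for an Ethernet header?
+ * 			return XDP_DROP;
+ * 		return XDP_PASS;
+ * 	}
+ */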
+
+/*
+ * bpf_probe_read_str
+ *
+ * 	Copy a NUL terminated string from an unsafe kernel address
+ * 	*unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
+ * 	more details.
+ *
+ * 	Generally, use **bpf_probe_read_user_str**\ () or
+ * 	**bpf_probe_read_kernel_str**\ () instead.
+ *
+ * Returns
+ * 	On success, the strictly positive length of the string,
+ * 	including the trailing NUL character. On error, a negative
+ * 	value.
+ */
+static long (*bpf_probe_read_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 45;
+
+/*
+ * bpf_get_socket_cookie
+ *
+ * 	If the **struct sk_buff** pointed by *skb* has a known socket,
+ * 	retrieve the cookie (generated by the kernel) of this socket.
+ * 	If no cookie has been set yet, generate a new cookie. Once
+ * 	generated, the socket cookie remains stable for the life of the
+ * 	socket. This helper can be useful for monitoring per socket
+ * 	networking traffic statistics as it provides a global socket
+ * 	identifier that can be assumed unique.
+ *
+ * Returns
+ * 	An 8-byte long non-decreasing number on success, or 0 if the
+ * 	socket field is missing inside *skb*.
+ */
+static __u64 (*bpf_get_socket_cookie)(void *ctx) = (void *) 46;
+
+/*
+ * bpf_get_socket_uid
+ *
+ *
+ * Returns
+ * 	The owner UID of the socket associated to *skb*. If the socket
+ * 	is **NULL**, or if it is not a full socket (i.e. if it is a
+ * 	time-wait or a request socket instead), **overflowuid** value
+ * 	is returned (note that **overflowuid** might also be the actual
+ * 	UID value for the socket).
+ */
+static __u32 (*bpf_get_socket_uid)(struct __sk_buff *skb) = (void *) 47;
+
+/*
+ * bpf_set_hash
+ *
+ * 	Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * 	to value *hash*.
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48;
+
+/*
+ * bpf_setsockopt
+ *
+ * 	Emulate a call to **setsockopt()** on the socket associated to
+ * 	*bpf_socket*, which must be a full socket. The *level* at
+ * 	which the option resides and the name *optname* of the option
+ * 	must be specified, see **setsockopt(2)** for more information.
+ * 	The option value of length *optlen* is pointed by *optval*.
+ *
+ * 	*bpf_socket* should be one of the following:
+ *
+ * 	* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * 	* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * 	  and **BPF_CGROUP_INET6_CONNECT**.
+ *
+ * 	This helper actually implements a subset of **setsockopt()**.
+ * 	It supports the following *level*\ s:
+ *
+ * 	* **SOL_SOCKET**, which supports the following *optname*\ s:
+ * 	  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ * 	  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**,
+ * 	  **SO_BINDTODEVICE**, **SO_KEEPALIVE**.
+ * 	* **IPPROTO_TCP**, which supports the following *optname*\ s:
+ * 	  **TCP_CONGESTION**, **TCP_BPF_IW**,
+ * 	  **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
+ * 	  **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
+ * 	  **TCP_SYNCNT**, **TCP_USER_TIMEOUT**.
+ * 	* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 	* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 49;
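+
+/*
+ * A usage sketch for the helper above: switch established connections
+ * to a different congestion control algorithm from a sock_ops program.
+ * TCP_CONGESTION, IPPROTO_TCP and the BPF_SOCK_OPS_* constants are
+ * assumed to come from the kernel headers:
+ *
+ * 	SEC("sockops")
+ * 	int set_cc(struct bpf_sock_ops *skops)
+ * 	{
+ * 		char cc[] = "reno";
+ *
+ * 		if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
+ * 			bpf_setsockopt(skops, IPPROTO_TCP, TCP_CONGESTION,
+ * 				       cc, sizeof(cc));
+ * 		return 1;
+ * 	}
+ */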
+
+/*
+ * bpf_skb_adjust_room
+ *
+ * 	Grow or shrink the room for data in the packet associated to
+ * 	*skb* by *len_diff*, and according to the selected *mode*.
+ *
+ * 	By default, the helper will reset any offloaded checksum
+ * 	indicator of the skb to CHECKSUM_NONE. This can be avoided
+ * 	by the following flag:
+ *
+ * 	* **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded
+ * 	  checksum data of the skb to CHECKSUM_NONE.
+ *
+ * 	There are two supported modes at this time:
+ *
+ * 	* **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
+ * 	  (room space is added or removed below the layer 2 header).
+ *
+ * 	* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ * 	  (room space is added or removed below the layer 3 header).
+ *
+ * 	The following flags are supported at this time:
+ *
+ * 	* **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
+ * 	  Adjusting mss in this way is not allowed for datagrams.
+ *
+ * 	* **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**,
+ * 	  **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**:
+ * 	  Any new space is reserved to hold a tunnel header.
+ * 	  Configure skb offsets and other fields accordingly.
+ *
+ * 	* **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**,
+ * 	  **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**:
+ * 	  Use with ENCAP_L3 flags to further specify the tunnel type.
+ *
+ * 	* **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*):
+ * 	  Use with ENCAP_L3/L4 flags to further specify the tunnel
+ * 	  type; *len* is the length of the inner MAC header.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_adjust_room)(struct __sk_buff *skb, __s32 len_diff, __u32 mode, __u64 flags) = (void *) 50;
+
+/*
+ * bpf_redirect_map
+ *
+ * 	Redirect the packet to the endpoint referenced by *map* at
+ * 	index *key*. Depending on its type, this *map* can contain
+ * 	references to net devices (for forwarding packets through other
+ * 	ports), or to CPUs (for redirecting XDP frames to another CPU;
+ * 	but this is only implemented for native XDP (with driver
+ * 	support) as of this writing).
+ *
+ * 	The lower two bits of *flags* are used as the return code if
+ * 	the map lookup fails. This is so that the return value can be
+ * 	one of the XDP program return codes up to **XDP_TX**, as chosen
+ * 	by the caller. Any higher bits in the *flags* argument must be
+ * 	unset.
+ *
+ * 	See also **bpf_redirect**\ (), which only supports redirecting
+ * 	to an ifindex, but doesn't require a map to do so.
+ *
+ * Returns
+ * 	**XDP_REDIRECT** on success, or the value of the two lower bits
+ * 	of the *flags* argument on error.
+ */
+static long (*bpf_redirect_map)(void *map, __u32 key, __u64 flags) = (void *) 51;
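+
+/*
+ * A usage sketch for the helper above: forward XDP frames through a
+ * device map assumed to be named "tx_ports" (BPF_MAP_TYPE_DEVMAP,
+ * defined elsewhere). Putting **XDP_PASS** in the low bits of *flags*
+ * makes a failed lookup fall through to the regular network stack:
+ *
+ * 	SEC("xdp")
+ * 	int forward(struct xdp_md *ctx)
+ * 	{
+ * 		__u32 key = 0;	// index of the egress port in tx_ports
+ *
+ * 		return bpf_redirect_map(&tx_ports, key, XDP_PASS);
+ * 	}
+ */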
+
+/*
+ * bpf_sk_redirect_map
+ *
+ * 	Redirect the packet to the socket referenced by *map* (of type
+ * 	**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 	egress interfaces can be used for redirection. The
+ * 	**BPF_F_INGRESS** value in *flags* is used to make the
+ * 	distinction (ingress path is selected if the flag is present,
+ * 	egress path otherwise). This is the only flag supported for now.
+ *
+ * Returns
+ * 	**SK_PASS** on success, or **SK_DROP** on error.
+ */
+static long (*bpf_sk_redirect_map)(struct __sk_buff *skb, void *map, __u32 key, __u64 flags) = (void *) 52;
+
+/*
+ * bpf_sock_map_update
+ *
+ * 	Add an entry to, or update a *map* referencing sockets. The
+ * 	*skops* is used as a new value for the entry associated to
+ * 	*key*. *flags* is one of:
+ *
+ * 	**BPF_NOEXIST**
+ * 		The entry for *key* must not exist in the map.
+ * 	**BPF_EXIST**
+ * 		The entry for *key* must already exist in the map.
+ * 	**BPF_ANY**
+ * 		No condition on the existence of the entry for *key*.
+ *
+ * 	If the *map* has eBPF programs (parser and verdict), those will
+ * 	be inherited by the socket being added. If the socket is
+ * 	already attached to eBPF programs, this results in an error.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_sock_map_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 53;
+
+/*
+ * bpf_xdp_adjust_meta
+ *
+ * 	Adjust the address pointed by *xdp_md*\ **->data_meta** by
+ * 	*delta* (which can be positive or negative). Note that this
+ * 	operation modifies the address stored in *xdp_md*\ **->data**,
+ * 	so the latter must be loaded only after the helper has been
+ * 	called.
+ *
+ * 	The use of *xdp_md*\ **->data_meta** is optional and programs
+ * 	are not required to use it. The rationale is that when the
+ * 	packet is processed with XDP (e.g. as a DoS filter), it is
+ * 	possible to push further meta data along with it before passing
+ * 	to the stack, and to give the guarantee that an ingress eBPF
+ * 	program attached as a TC classifier on the same device can pick
+ * 	this up for further post-processing. Since TC works with socket
+ * 	buffers, it remains possible to set from XDP the **mark** or
+ * 	**priority** pointers, or other pointers for the socket buffer.
+ * 	Having this scratch space generic and programmable allows for
+ * 	more flexibility as the user is free to store whatever meta
+ * 	data they need.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 54;
+
+/*
+ * bpf_perf_event_read_value
+ *
+ * 	Read the value of a perf event counter, and store it into *buf*
+ * 	of size *buf_size*. This helper relies on a *map* of type
+ * 	**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
+ * 	counter is selected when *map* is updated with perf event file
+ * 	descriptors. The *map* is an array whose size is the number of
+ * 	available CPUs, and each cell contains a value relative to one
+ * 	CPU. The value to retrieve is indicated by *flags*, that
+ * 	contains the index of the CPU to look up, masked with
+ * 	**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 	**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 	current CPU should be retrieved.
+ *
+ * 	This helper behaves in a way close to
+ * 	**bpf_perf_event_read**\ () helper, save that instead of
+ * 	just returning the value observed, it fills the *buf*
+ * 	structure. This allows for additional data to be retrieved: in
+ * 	particular, the enabled and running times (in *buf*\
+ * 	**->enabled** and *buf*\ **->running**, respectively) are
+ * 	copied. In general, **bpf_perf_event_read_value**\ () is
+ * 	recommended over **bpf_perf_event_read**\ (), which has some
+ * 	ABI issues and provides fewer functionalities.
+ *
+ * 	These values are interesting, because hardware PMU (Performance
+ * 	Monitoring Unit) counters are limited resources. When there are
+ * 	more PMU based perf events opened than available counters,
+ * 	the kernel will multiplex these events so that each event gets
+ * 	a certain percentage (but not all) of the PMU time. When
+ * 	multiplexing happens, the number of samples or the counter
+ * 	value will not match what would have been observed without
+ * 	multiplexing, which makes comparison between different runs
+ * 	difficult.
+ * 	Typically, the counter value should be normalized before
+ * 	comparing to other experiments. The usual normalization is done
+ * 	as follows.
+ *
+ * 	::
+ *
+ * 		normalized_counter = counter * t_enabled / t_running
+ *
+ * 	Where *t_enabled* is the time the event has been enabled and
+ * 	*t_running* the time it has been running since the last
+ * 	normalization. The
+ * 	enabled and running times are accumulated since the perf event
+ * 	open. To achieve scaling factor between two invocations of an
+ * 	eBPF program, users can use CPU id as the key (which is
+ * 	typical for perf array usage model) to remember the previous
+ * 	value and do the calculation inside the eBPF program.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_perf_event_read_value)(void *map, __u64 flags, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 55;
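+
+/*
+ * A usage sketch for the helper above, applying the normalization just
+ * described. It assumes a BPF_MAP_TYPE_PERF_EVENT_ARRAY map named
+ * "counters" that user space has populated with perf event file
+ * descriptors:
+ *
+ * 	struct bpf_perf_event_value val = {};
+ * 	__u64 normalized = 0;
+ *
+ * 	if (!bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU,
+ * 				       &val, sizeof(val)) && val.running)
+ * 		normalized = val.counter * val.enabled / val.running;
+ */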
+
+/*
+ * bpf_perf_prog_read_value
+ *
+ * 	For an eBPF program attached to a perf event, retrieve the
+ * 	value of the event counter associated to *ctx* and store it in
+ * 	the structure pointed by *buf* and of size *buf_size*. Enabled
+ * 	and running times are also stored in the structure (see
+ * 	description of helper **bpf_perf_event_read_value**\ () for
+ * 	more details).
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 56;
+
+/*
+ * bpf_getsockopt
+ *
+ * 	Emulate a call to **getsockopt()** on the socket associated to
+ * 	*bpf_socket*, which must be a full socket. The *level* at
+ * 	which the option resides and the name *optname* of the option
+ * 	must be specified, see **getsockopt(2)** for more information.
+ * 	The retrieved value is stored in the structure pointed by
+ * 	*optval* and of length *optlen*.
+ *
+ * 	*bpf_socket* should be one of the following:
+ *
+ * 	* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * 	* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * 	  and **BPF_CGROUP_INET6_CONNECT**.
+ *
+ * 	This helper actually implements a subset of **getsockopt()**.
+ * 	It supports the following *level*\ s:
+ *
+ * 	* **IPPROTO_TCP**, which supports *optname*
+ * 	  **TCP_CONGESTION**.
+ * 	* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 	* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_getsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 57;
+
+/*
+ * bpf_override_return
+ *
+ * 	Used for error injection, this helper uses kprobes to override
+ * 	the return value of the probed function, and to set it to *rc*.
+ * 	The first argument is the context *regs* on which the kprobe
+ * 	works.
+ *
+ * 	This helper works by setting the PC (program counter)
+ * 	to an override function which is run in place of the original
+ * 	probed function. This means the probed function is not run at
+ * 	all. The replacement function just returns with the required
+ * 	value.
+ *
+ * 	This helper has security implications, and thus is subject to
+ * 	restrictions. It is only available if the kernel was compiled
+ * 	with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
+ * 	option, and in this case it only works on functions tagged with
+ * 	**ALLOW_ERROR_INJECTION** in the kernel code.
+ *
+ * 	Also, the helper is only available for the architectures having
+ * 	the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
+ * 	x86 architecture is the only one to support this feature.
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_override_return)(struct pt_regs *regs, __u64 rc) = (void *) 58;
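+
+/*
+ * A usage sketch for the helper above: inject -ENOMEM into a function
+ * tagged with ALLOW_ERROR_INJECTION (btrfs' open_ctree() is a common
+ * example in the kernel tree). ENOMEM is assumed to come from the
+ * kernel headers:
+ *
+ * 	SEC("kprobe/open_ctree")
+ * 	int fail_open(struct pt_regs *ctx)
+ * 	{
+ * 		bpf_override_return(ctx, -ENOMEM);
+ * 		return 0;
+ * 	}
+ */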
+
+/*
+ * bpf_sock_ops_cb_flags_set
+ *
+ * 	Attempt to set the value of the **bpf_sock_ops_cb_flags** field
+ * 	for the full TCP socket associated to *bpf_sock_ops* to
+ * 	*argval*.
+ *
+ * 	The primary use of this field is to determine if there should
+ * 	be calls to eBPF programs of type
+ * 	**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
+ * 	code. A program of the same type can change its value, per
+ * 	connection and as necessary, when the connection is
+ * 	established. This field is directly accessible for reading, but
+ * 	this helper must be used for updates in order to return an
+ * 	error if an eBPF program tries to set a callback that is not
+ * 	supported in the current kernel.
+ *
+ * 	*argval* is a flag array which can combine these flags:
+ *
+ * 	* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
+ * 	* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
+ * 	* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ * 	* **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
+ *
+ * 	Therefore, this function can be used to clear a callback flag by
+ * 	setting the appropriate bit to zero. For example, to disable
+ * 	the RTO callback:
+ *
+ * 	**bpf_sock_ops_cb_flags_set(bpf_sock,**
+ * 		**bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)**
+ *
+ * 	Here are some examples of where one could call such eBPF
+ * 	program:
+ *
+ * 	* When RTO fires.
+ * 	* When a packet is retransmitted.
+ * 	* When the connection terminates.
+ * 	* When a packet is sent.
+ * 	* When a packet is received.
+ *
+ * Returns
+ * 	Code **-EINVAL** if the socket is not a full TCP socket;
+ * 	otherwise, a positive number containing the bits that could not
+ * 	be set is returned (which comes down to 0 if all bits were set
+ * 	as required).
+ */
+static long (*bpf_sock_ops_cb_flags_set)(struct bpf_sock_ops *bpf_sock, int argval) = (void *) 59;
+
+/*
+ * bpf_msg_redirect_map
+ *
+ * 	This helper is used in programs implementing policies at the
+ * 	socket level. If the message *msg* is allowed to pass (i.e. if
+ * 	the verdict eBPF program returns **SK_PASS**), redirect it to
+ * 	the socket referenced by *map* (of type
+ * 	**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 	egress interfaces can be used for redirection. The
+ * 	**BPF_F_INGRESS** value in *flags* is used to make the
+ * 	distinction (ingress path is selected if the flag is present,
+ * 	egress path otherwise). This is the only flag supported for now.
+ *
+ * Returns
+ * 	**SK_PASS** on success, or **SK_DROP** on error.
+ */
+static long (*bpf_msg_redirect_map)(struct sk_msg_md *msg, void *map, __u32 key, __u64 flags) = (void *) 60;
+
+/*
+ * bpf_msg_apply_bytes
+ *
+ * 	For socket policies, apply the verdict of the eBPF program to
+ * 	the next *bytes* (number of bytes) of message *msg*.
+ *
+ * 	For example, this helper can be used in the following cases:
+ *
+ * 	* A single **sendmsg**\ () or **sendfile**\ () system call
+ * 	  contains multiple logical messages that the eBPF program is
+ * 	  supposed to read and for which it should apply a verdict.
+ * 	* An eBPF program only cares to read the first *bytes* of a
+ * 	  *msg*. If the message has a large payload, then setting up
+ * 	  and calling the eBPF program repeatedly for all bytes, even
+ * 	  though the verdict is already known, would create unnecessary
+ * 	  overhead.
+ *
+ * 	When called from within an eBPF program, the helper sets a
+ * 	counter internal to the BPF infrastructure, which is used to
+ * 	apply the last verdict to the next *bytes*. If *bytes* is
+ * 	smaller than the current data being processed from a
+ * 	**sendmsg**\ () or **sendfile**\ () system call, the first
+ * 	*bytes* will be sent and the eBPF program will be re-run with
+ * 	the pointer for start of data pointing to byte number *bytes*
+ * 	**+ 1**. If *bytes* is larger than the current data being
+ * 	processed, then the eBPF verdict will be applied to multiple
+ * 	**sendmsg**\ () or **sendfile**\ () calls until *bytes* are
+ * 	consumed.
+ *
+ * 	Note that if a socket closes with the internal counter holding
+ * 	a non-zero value, this is not a problem because data is not
+ * 	being buffered for *bytes* and is sent as it is received.
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_msg_apply_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 61;
+
+/*
+ * bpf_msg_cork_bytes
+ *
+ * 	For socket policies, prevent the execution of the verdict eBPF
+ * 	program for message *msg* until *bytes* (byte number) have been
+ * 	accumulated.
+ *
+ * 	This can be used when one needs a specific number of bytes
+ * 	before a verdict can be assigned, even if the data spans
+ * 	multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
+ * 	case would be a user calling **sendmsg**\ () repeatedly with
+ * 	1-byte long message segments. Obviously, this is bad for
+ * 	performance, but it is still valid. If the eBPF program needs
+ * 	*bytes* bytes to validate a header, this helper can be used to
+ * 	prevent the eBPF program from being called again until *bytes*
+ * 	have been accumulated.
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_msg_cork_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 62;
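+
+/*
+ * A usage sketch combining the two helpers above: wait until an
+ * assumed 8-byte application header is visible, then apply the verdict
+ * to it in one go:
+ *
+ * 	SEC("sk_msg")
+ * 	int verdict(struct sk_msg_md *msg)
+ * 	{
+ * 		void *data = (void *)(long)msg->data;
+ * 		void *data_end = (void *)(long)msg->data_end;
+ *
+ * 		if (data + 8 > data_end) {
+ * 			bpf_msg_cork_bytes(msg, 8);	// wait for more data
+ * 			return SK_PASS;
+ * 		}
+ * 		// Header visible: let the next 8 bytes share this verdict.
+ * 		bpf_msg_apply_bytes(msg, 8);
+ * 		return SK_PASS;
+ * 	}
+ */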
+
+/*
+ * bpf_msg_pull_data
+ *
+ * 	For socket policies, pull in non-linear data from user space
+ * 	for *msg* and set pointers *msg*\ **->data** and *msg*\
+ * 	**->data_end** to *start* and *end* bytes offsets into *msg*,
+ * 	respectively.
+ *
+ * 	If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * 	*msg* it can only parse data that the (**data**, **data_end**)
+ * 	pointers have already consumed. For **sendmsg**\ () hooks this
+ * 	is likely the first scatterlist element. But for calls relying
+ * 	on the **sendpage** handler (e.g. **sendfile**\ ()) this will
+ * 	be the range (**0**, **0**) because the data is shared with
+ * 	user space and by default the objective is to avoid allowing
+ * 	user space to modify data while (or after) eBPF verdict is
+ * 	being decided. This helper can be used to pull in data and to
+ * 	set the start and end pointer to given values. Data will be
+ * 	copied if necessary (i.e. if data was not linear and if start
+ * 	and end pointers do not point to the same chunk).
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_msg_pull_data)(struct sk_msg_md *msg, __u32 start, __u32 end, __u64 flags) = (void *) 63;
+
+/*
+ * bpf_bind
+ *
+ * 	Bind the socket associated to *ctx* to the address pointed by
+ * 	*addr*, of length *addr_len*. This allows for making outgoing
+ * 	connection from the desired IP address, which can be useful for
+ * 	example when all processes inside a cgroup should use one
+ * 	single IP address on a host that has multiple IP addresses
+ * 	configured.
+ *
+ * 	This helper works for IPv4 and IPv6, TCP and UDP sockets. The
+ * 	domain (*addr*\ **->sa_family**) must be **AF_INET** (or
+ * 	**AF_INET6**). It's advised to pass zero port (**sin_port**
+ * 	or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like
+ * 	behavior and lets the kernel efficiently pick an unused port,
+ * 	as long as the 4-tuple is unique. Passing a non-zero port
+ * 	might lead to degraded performance.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_bind)(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) = (void *) 64;
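+
+/*
+ * A usage sketch for the helper above: pin outgoing IPv4 connections
+ * from a cgroup to a fixed source address, leaving the port at zero so
+ * the kernel picks one. struct sockaddr_in and AF_INET are assumed to
+ * come from the kernel headers; 0xC0A80001 is 192.168.0.1 in host byte
+ * order, so a byte swap is needed on little-endian hosts:
+ *
+ * 	SEC("cgroup/connect4")
+ * 	int bind_src(struct bpf_sock_addr *ctx)
+ * 	{
+ * 		struct sockaddr_in sa = {};
+ *
+ * 		sa.sin_family = AF_INET;
+ * 		sa.sin_addr.s_addr = __builtin_bswap32(0xC0A80001);
+ * 		bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa));
+ * 		return 1;	// allow the connect() to proceed
+ * 	}
+ */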
+
+/*
+ * bpf_xdp_adjust_tail
+ *
+ * 	Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
+ * 	possible to both shrink and grow the packet tail.
+ * 	Shrinking is done by passing a negative value for *delta*.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_xdp_adjust_tail)(struct xdp_md *xdp_md, int delta) = (void *) 65;
+
+/*
+ * bpf_skb_get_xfrm_state
+ *
+ * 	Retrieve the XFRM state (IP transform framework, see also
+ * 	**ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
+ *
+ * 	The retrieved value is stored in the **struct bpf_xfrm_state**
+ * 	pointed by *xfrm_state* and of length *size*.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	**CONFIG_XFRM** configuration option.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_get_xfrm_state)(struct __sk_buff *skb, __u32 index, struct bpf_xfrm_state *xfrm_state, __u32 size, __u64 flags) = (void *) 66;
+
+/*
+ * bpf_get_stack
+ *
+ * 	Return a user or a kernel stack in bpf program provided buffer.
+ * 	To achieve this, the helper needs *ctx*, which is a pointer
+ * 	to the context on which the tracing program is executed.
+ * 	To store the stacktrace, the bpf program provides *buf* with
+ * 	a nonnegative *size*.
+ *
+ * 	The last argument, *flags*, holds the number of stack frames to
+ * 	skip (from 0 to 255), masked with
+ * 	**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 	the following flags:
+ *
+ * 	**BPF_F_USER_STACK**
+ * 		Collect a user space stack instead of a kernel stack.
+ * 	**BPF_F_USER_BUILD_ID**
+ * 		Collect buildid+offset instead of ips for user stack,
+ * 		only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * 	**bpf_get_stack**\ () can collect both kernel and user frames,
+ * 	up to **PERF_MAX_STACK_DEPTH**, provided the buffer is
+ * 	sufficiently large. Note that
+ * 	this limit can be controlled with the **sysctl** program, and
+ * 	that it should be manually increased in order to profile long
+ * 	user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 	::
+ *
+ * 		# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * Returns
+ * 	A non-negative value equal to or less than *size* on success,
+ * 	or a negative error in case of failure.
+ */
+static long (*bpf_get_stack)(void *ctx, void *buf, __u32 size, __u64 flags) = (void *) 67;
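+
+/*
+ * A usage sketch for the helper above: capture the user-space stack
+ * into a plain buffer from a perf_event program. On success, *n* holds
+ * the number of bytes of instruction pointers written to *buf*:
+ *
+ * 	SEC("perf_event")
+ * 	int sample(struct bpf_perf_event_data *ctx)
+ * 	{
+ * 		__u64 buf[32];
+ * 		long n = bpf_get_stack(ctx, buf, sizeof(buf),
+ * 				       BPF_F_USER_STACK);
+ *
+ * 		if (n < 0)
+ * 			return 0;
+ * 		// buf[0 .. n/8 - 1] are user-space return addresses.
+ * 		return 0;
+ * 	}
+ */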
+
+/*
+ * bpf_skb_load_bytes_relative
+ *
+ * 	This helper is similar to **bpf_skb_load_bytes**\ () in that
+ * 	it provides an easy way to load *len* bytes from *offset*
+ * 	from the packet associated to *skb*, into the buffer pointed
+ * 	by *to*. The difference to **bpf_skb_load_bytes**\ () is that
+ * 	a fifth argument *start_header* exists in order to select a
+ * 	base offset to start from. *start_header* can be one of:
+ *
+ * 	**BPF_HDR_START_MAC**
+ * 		Base offset to load data from is *skb*'s mac header.
+ * 	**BPF_HDR_START_NET**
+ * 		Base offset to load data from is *skb*'s network header.
+ *
+ * 	In general, "direct packet access" is the preferred method to
+ * 	access packet data, however, this helper is in particular useful
+ * 	in socket filters where *skb*\ **->data** does not always point
+ * 	to the start of the mac header and where "direct packet access"
+ * 	is not available.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void *to, __u32 len, __u32 start_header) = (void *) 68;
+
+/*
+ * bpf_fib_lookup
+ *
+ * 	Do FIB lookup in kernel tables using parameters in *params*.
+ * 	If lookup is successful and result shows packet is to be
+ * 	forwarded, the neighbor tables are searched for the nexthop.
+ * 	If successful (i.e., FIB lookup shows forwarding and nexthop
+ * 	is resolved), the nexthop address is returned in ipv4_dst
+ * 	or ipv6_dst based on family, smac is set to mac address of
+ * 	egress device, dmac is set to nexthop mac address, rt_metric
+ * 	is set to metric from route (IPv4/IPv6 only), and ifindex
+ * 	is set to the device index of the nexthop from the FIB lookup.
+ *
+ * 	*plen* argument is the size of the passed in struct.
+ * 	*flags* argument can be a combination of one or more of the
+ * 	following values:
+ *
+ * 	**BPF_FIB_LOOKUP_DIRECT**
+ * 		Do a direct table lookup vs full lookup using FIB
+ * 		rules.
+ * 	**BPF_FIB_LOOKUP_OUTPUT**
+ * 		Perform lookup from an egress perspective (default is
+ * 		ingress).
+ *
+ * 	*ctx* is either **struct xdp_md** for XDP programs or
+ * 	**struct sk_buff** for tc cls_act programs.
+ *
+ * Returns
+ * 	* < 0 if any input argument is invalid
+ * 	*   0 on success (packet is forwarded, nexthop neighbor exists)
+ * 	* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
+ * 	  packet is not forwarded or needs assist from full stack
+ */
+static long (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, int plen, __u32 flags) = (void *) 69;
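+
+/*
+ * A usage sketch for the helper above, from XDP: look up the route for
+ * a packet whose addresses have already been parsed into *params*
+ * (filling them in is elided here), and redirect on success.
+ * BPF_FIB_LKUP_RET_SUCCESS is one of the result codes from
+ * <linux/bpf.h>:
+ *
+ * 	struct bpf_fib_lookup params = {};
+ *
+ * 	// ... set params.family, addresses and params.ifindex ...
+ * 	if (bpf_fib_lookup(ctx, &params, sizeof(params), 0) ==
+ * 	    BPF_FIB_LKUP_RET_SUCCESS) {
+ * 		// params.smac and params.dmac hold the new MAC addresses.
+ * 		return bpf_redirect(params.ifindex, 0);
+ * 	}
+ * 	return XDP_PASS;
+ */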
+
+/*
+ * bpf_sock_hash_update
+ *
+ * 	Add an entry to, or update a sockhash *map* referencing sockets.
+ * 	The *skops* is used as a new value for the entry associated to
+ * 	*key*. *flags* is one of:
+ *
+ * 	**BPF_NOEXIST**
+ * 		The entry for *key* must not exist in the map.
+ * 	**BPF_EXIST**
+ * 		The entry for *key* must already exist in the map.
+ * 	**BPF_ANY**
+ * 		No condition on the existence of the entry for *key*.
+ *
+ * 	If the *map* has eBPF programs (parser and verdict), those will
+ * 	be inherited by the socket being added. If the socket is
+ * 	already attached to eBPF programs, this results in an error.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_sock_hash_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 70;
+
+/*
+ * bpf_msg_redirect_hash
+ *
+ * 	This helper is used in programs implementing policies at the
+ * 	socket level. If the message *msg* is allowed to pass (i.e. if
+ * 	the verdict eBPF program returns **SK_PASS**), redirect it to
+ * 	the socket referenced by *map* (of type
+ * 	**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ * 	egress interfaces can be used for redirection. The
+ * 	**BPF_F_INGRESS** value in *flags* is used to make the
+ * 	distinction (ingress path is selected if the flag is present,
+ * 	egress path otherwise). This is the only flag supported for now.
+ *
+ * Returns
+ * 	**SK_PASS** on success, or **SK_DROP** on error.
+ */
+static long (*bpf_msg_redirect_hash)(struct sk_msg_md *msg, void *map, void *key, __u64 flags) = (void *) 71;
+
+/*
+ * bpf_sk_redirect_hash
+ *
+ * 	This helper is used in programs implementing policies at the
+ * 	skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ * 	if the verdict eBPF program returns **SK_PASS**), redirect it
+ * 	to the socket referenced by *map* (of type
+ * 	**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ * 	egress interfaces can be used for redirection. The
+ * 	**BPF_F_INGRESS** value in *flags* is used to make the
+ * 	distinction (ingress path is selected if the flag is present,
+ * 	egress otherwise). This is the only flag supported for now.
+ *
+ * Returns
+ * 	**SK_PASS** on success, or **SK_DROP** on error.
+ */
+static long (*bpf_sk_redirect_hash)(struct __sk_buff *skb, void *map, void *key, __u64 flags) = (void *) 72;
+
+/*
+ * bpf_lwt_push_encap
+ *
+ * 	Encapsulate the packet associated to *skb* within a Layer 3
+ * 	protocol header. This header is provided in the buffer at
+ * 	address *hdr*, with *len* its size in bytes. *type* indicates
+ * 	the protocol of the header and can be one of:
+ *
+ * 	**BPF_LWT_ENCAP_SEG6**
+ * 		IPv6 encapsulation with Segment Routing Header
+ * 		(**struct ipv6_sr_hdr**). *hdr* only contains the SRH,
+ * 		the IPv6 header is computed by the kernel.
+ * 	**BPF_LWT_ENCAP_SEG6_INLINE**
+ * 		Only works if *skb* contains an IPv6 packet. Insert a
+ * 		Segment Routing Header (**struct ipv6_sr_hdr**) inside
+ * 		the IPv6 header.
+ * 	**BPF_LWT_ENCAP_IP**
+ * 		IP encapsulation (GRE/GUE/IPIP/etc). The outer header
+ * 		must be IPv4 or IPv6, followed by zero or more
+ * 		additional headers, up to **LWT_BPF_MAX_HEADROOM**
+ * 		total bytes in all prepended headers. Please note that
+ * 		if **skb_is_gso**\ (*skb*) is true, no more than two
+ * 		headers can be prepended, and the inner header, if
+ * 		present, should be either GRE or UDP/GUE.
+ *
+ * 	**BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs
+ * 	of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can
+ * 	be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and
+ * 	**BPF_PROG_TYPE_LWT_XMIT**.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_lwt_push_encap)(struct __sk_buff *skb, __u32 type, void *hdr, __u32 len) = (void *) 73;
+
+/*
+ * bpf_lwt_seg6_store_bytes
+ *
+ * 	Store *len* bytes from address *from* into the packet
+ * 	associated to *skb*, at *offset*. Only the flags, tag and TLVs
+ * 	inside the outermost IPv6 Segment Routing Header can be
+ * 	modified through this helper.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_lwt_seg6_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len) = (void *) 74;
+
+/*
+ * bpf_lwt_seg6_adjust_srh
+ *
+ * 	Adjust the size allocated to TLVs in the outermost IPv6
+ * 	Segment Routing Header contained in the packet associated to
+ * 	*skb*, at position *offset* by *delta* bytes. Only offsets
+ * 	after the segments are accepted. *delta* can be positive
+ * 	(growing) as well as negative (shrinking).
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_lwt_seg6_adjust_srh)(struct __sk_buff *skb, __u32 offset, __s32 delta) = (void *) 75;
+
+/*
+ * bpf_lwt_seg6_action
+ *
+ * 	Apply an IPv6 Segment Routing action of type *action* to the
+ * 	packet associated to *skb*. Each action takes a parameter
+ * 	contained at address *param*, and of length *param_len* bytes.
+ * 	*action* can be one of:
+ *
+ * 	**SEG6_LOCAL_ACTION_END_X**
+ * 		End.X action: Endpoint with Layer-3 cross-connect.
+ * 		Type of *param*: **struct in6_addr**.
+ * 	**SEG6_LOCAL_ACTION_END_T**
+ * 		End.T action: Endpoint with specific IPv6 table lookup.
+ * 		Type of *param*: **int**.
+ * 	**SEG6_LOCAL_ACTION_END_B6**
+ * 		End.B6 action: Endpoint bound to an SRv6 policy.
+ * 		Type of *param*: **struct ipv6_sr_hdr**.
+ * 	**SEG6_LOCAL_ACTION_END_B6_ENCAP**
+ * 		End.B6.Encap action: Endpoint bound to an SRv6
+ * 		encapsulation policy.
+ * 		Type of *param*: **struct ipv6_sr_hdr**.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_lwt_seg6_action)(struct __sk_buff *skb, __u32 action, void *param, __u32 param_len) = (void *) 76;
+
+/*
+ * bpf_rc_repeat
+ *
+ * 	This helper is used in programs implementing IR decoding, to
+ * 	report a successfully decoded repeat key message. This delays
+ * 	the generation of a key up event for the previously generated
+ * 	key down event.
+ *
+ * 	Some IR protocols like NEC have a special IR message for
+ * 	repeating last button, for when a button is held down.
+ *
+ * 	The *ctx* should point to the lirc sample as passed into
+ * 	the program.
+ *
+ * 	This helper is only available if the kernel was compiled with
+ * 	the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ * 	"**y**".
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_rc_repeat)(void *ctx) = (void *) 77;
+
+/*
+ * bpf_rc_keydown
+ *
+ * 	This helper is used in programs implementing IR decoding, to
+ * 	report a successfully decoded key press with *scancode*,
+ * 	*toggle* value in the given *protocol*. The scancode will be
+ * 	translated to a keycode using the rc keymap, and reported as
+ * 	an input key down event. After a period a key up event is
+ * 	generated. This period can be extended by calling either
+ * 	**bpf_rc_keydown**\ () again with the same values, or calling
+ * 	**bpf_rc_repeat**\ ().
+ *
+ * 	Some protocols include a toggle bit, in case the button was
+ * 	released and pressed again between consecutive scancodes.
+ *
+ * 	The *ctx* should point to the lirc sample as passed into
+ * 	the program.
+ *
+ * 	The *protocol* is the decoded protocol number (see
+ * 	**enum rc_proto** for some predefined values).
+ *
+ * 	This helper is only available if the kernel was compiled with
+ * 	the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ * 	"**y**".
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_rc_keydown)(void *ctx, __u32 protocol, __u64 scancode, __u32 toggle) = (void *) 78;
+
+/*
+ * bpf_skb_cgroup_id
+ *
+ * 	Return the cgroup v2 id of the socket associated with the *skb*.
+ * 	This is roughly similar to the **bpf_get_cgroup_classid**\ ()
+ * 	helper for cgroup v1 by providing a tag or identifier that
+ * 	can be matched on or used for map lookups, e.g. to implement
+ * 	policy. The cgroup v2 id of a given path in the hierarchy is
+ * 	exposed in user space through the f_handle API in order to get
+ * 	to the same 64-bit id.
+ *
+ * 	This helper can be used on TC egress path, but not on ingress,
+ * 	and is available only if the kernel was compiled with the
+ * 	**CONFIG_SOCK_CGROUP_DATA** configuration option.
+ *
+ * Returns
+ * 	The id is returned or 0 in case the id could not be retrieved.
+ */
+static __u64 (*bpf_skb_cgroup_id)(struct __sk_buff *skb) = (void *) 79;
+
+/*
+ * bpf_get_current_cgroup_id
+ *
+ *
+ * Returns
+ * 	A 64-bit integer containing the current cgroup id based
+ * 	on the cgroup within which the current task is running.
+ */
+static __u64 (*bpf_get_current_cgroup_id)(void) = (void *) 80;
+
+/*
+ * bpf_get_local_storage
+ *
+ * 	Get the pointer to the local storage area.
+ * 	The type and the size of the local storage are defined
+ * 	by the *map* argument.
+ * 	The *flags* meaning is specific for each map type,
+ * 	and has to be 0 for cgroup local storage.
+ *
+ * 	Depending on the BPF program type, a local storage area
+ * 	can be shared between multiple instances of the BPF program,
+ * 	running simultaneously.
+ *
+ * 	Users are responsible for synchronization themselves, for
+ * 	example by using the **BPF_STX_XADD** instruction to alter
+ * 	the shared data.
+ *
+ * Returns
+ * 	A pointer to the local storage area.
+ */
+static void *(*bpf_get_local_storage)(void *map, __u64 flags) = (void *) 81;
+
+/*
+ * bpf_sk_select_reuseport
+ *
+ * 	Select a **SO_REUSEPORT** socket from a
+ * 	**BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*.
+ * 	It checks that the selected socket matches the incoming
+ * 	request in the socket buffer.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_sk_select_reuseport)(struct sk_reuseport_md *reuse, void *map, void *key, __u64 flags) = (void *) 82;
+
+/*
+ * bpf_skb_ancestor_cgroup_id
+ *
+ * 	Return id of cgroup v2 that is ancestor of cgroup associated
+ * 	with the *skb* at the *ancestor_level*.  The root cgroup is at
+ * 	*ancestor_level* zero and each step down the hierarchy
+ * 	increments the level. If *ancestor_level* == level of cgroup
+ * 	associated with *skb*, then return value will be same as that
+ * 	of **bpf_skb_cgroup_id**\ ().
+ *
+ * 	The helper is useful to implement policies based on cgroups
+ * 	that are higher in the hierarchy than the immediate cgroup
+ * 	associated with *skb*.
+ *
+ * 	The format of the returned id and the helper limitations are
+ * 	the same as in **bpf_skb_cgroup_id**\ ().
+ *
+ * Returns
+ * 	The id is returned or 0 in case the id could not be retrieved.
+ */
+static __u64 (*bpf_skb_ancestor_cgroup_id)(struct __sk_buff *skb, int ancestor_level) = (void *) 83;
+
+/*
+ * bpf_sk_lookup_tcp
+ *
+ * 	Look for TCP socket matching *tuple*, optionally in a child
+ * 	network namespace *netns*. The return value must be checked,
+ * 	and if non-**NULL**, released via **bpf_sk_release**\ ().
+ *
+ * 	The *ctx* should point to the context of the program, such as
+ * 	the skb or socket (depending on the hook in use). This is used
+ * 	to determine the base network namespace for the lookup.
+ *
+ * 	*tuple_size* must be one of:
+ *
+ * 	**sizeof**\ (*tuple*\ **->ipv4**)
+ * 		Look for an IPv4 socket.
+ * 	**sizeof**\ (*tuple*\ **->ipv6**)
+ * 		Look for an IPv6 socket.
+ *
+ * 	If the *netns* is a negative signed 32-bit integer, then the
+ * 	socket lookup table in the netns associated with the *ctx*
+ * 	will be used. For the TC hooks, this is the netns of the device
+ * 	in the skb. For socket hooks, this is the netns of the socket.
+ * 	If *netns* is any other signed 32-bit value greater than or
+ * 	equal to zero then it specifies the ID of the netns relative to
+ * 	the netns associated with the *ctx*. *netns* values beyond the
+ * 	range of 32-bit integers are reserved for future use.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	**CONFIG_NET** configuration option.
+ *
+ * Returns
+ * 	Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ * 	For sockets with reuseport option, the **struct bpf_sock**
+ * 	result is from *reuse*\ **->socks**\ [] using the hash of the
+ * 	tuple.
+ */
+static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 84;
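+
+/*
+ * A usage sketch for the helper above, showing the mandatory pairing
+ * with **bpf_sk_release**\ (). The tuple is assumed to have been
+ * filled from the packet headers; BPF_F_CURRENT_NETNS (a negative
+ * value, from <linux/bpf.h>) selects the netns of the *skb*:
+ *
+ * 	struct bpf_sock_tuple tuple = {};
+ * 	struct bpf_sock *sk;
+ *
+ * 	// ... fill tuple.ipv4 from the packet headers ...
+ * 	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
+ * 			       BPF_F_CURRENT_NETNS, 0);
+ * 	if (sk) {
+ * 		// ... inspect sk->state, sk->src_port, etc. ...
+ * 		bpf_sk_release(sk);
+ * 	}
+ */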
+
+/*
+ * bpf_sk_lookup_udp
+ *
+ * 	Look for UDP socket matching *tuple*, optionally in a child
+ * 	network namespace *netns*. The return value must be checked,
+ * 	and if non-**NULL**, released via **bpf_sk_release**\ ().
+ *
+ * 	The *ctx* should point to the context of the program, such as
+ * 	the skb or socket (depending on the hook in use). This is used
+ * 	to determine the base network namespace for the lookup.
+ *
+ * 	*tuple_size* must be one of:
+ *
+ * 	**sizeof**\ (*tuple*\ **->ipv4**)
+ * 		Look for an IPv4 socket.
+ * 	**sizeof**\ (*tuple*\ **->ipv6**)
+ * 		Look for an IPv6 socket.
+ *
+ * 	If the *netns* is a negative signed 32-bit integer, then the
+ * 	socket lookup table in the netns associated with the *ctx*
+ * 	will be used. For the TC hooks, this is the netns of the device
+ * 	in the skb. For socket hooks, this is the netns of the socket.
+ * 	If *netns* is any other signed 32-bit value greater than or
+ * 	equal to zero then it specifies the ID of the netns relative to
+ * 	the netns associated with the *ctx*. *netns* values beyond the
+ * 	range of 32-bit integers are reserved for future use.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	**CONFIG_NET** configuration option.
+ *
+ * Returns
+ * 	Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ * 	For sockets with reuseport option, the **struct bpf_sock**
+ * 	result is from *reuse*\ **->socks**\ [] using the hash of the
+ * 	tuple.
+ */
+static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 85;
+
+/*
+ * bpf_sk_release
+ *
+ * 	Release the reference held by *sock*. *sock* must be a
+ * 	non-**NULL** pointer that was returned from
+ * 	**bpf_sk_lookup_xxx**\ ().
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_sk_release)(struct bpf_sock *sock) = (void *) 86;
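+
+/*
+ * A minimal usage sketch (illustrative only) of the lookup/release
+ * contract in a TC classifier; assumes *tuple* was already extracted and
+ * bounds-checked against the packet. A *netns* of -1 selects the netns
+ * of the skb's device:
+ *
+ * 	struct bpf_sock *sk;
+ *
+ * 	sk = bpf_sk_lookup_tcp(skb, tuple, sizeof(tuple->ipv4), -1, 0);
+ * 	if (sk) {
+ * 		// inspect sk here; every non-NULL result must be released
+ * 		bpf_sk_release(sk);
+ * 	}
+ */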
+
+/*
+ * bpf_map_push_elem
+ *
+ * 	Push an element *value* into *map*. *flags* is one of:
+ *
+ * 	**BPF_EXIST**
+ * 		If the queue/stack is full, the oldest element is
+ * 		removed to make room for it.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_map_push_elem)(void *map, const void *value, __u64 flags) = (void *) 87;
+
+/*
+ * bpf_map_pop_elem
+ *
+ * 	Pop an element from *map*.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_map_pop_elem)(void *map, void *value) = (void *) 88;
+
+/*
+ * bpf_map_peek_elem
+ *
+ * 	Get an element from *map* without removing it.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_map_peek_elem)(void *map, void *value) = (void *) 89;
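+
+/*
+ * A minimal usage sketch (illustrative only; assumes a BPF_MAP_TYPE_QUEUE
+ * map named events with value_size == sizeof(__u64) declared elsewhere):
+ *
+ * 	__u64 in = 42, out;
+ *
+ * 	bpf_map_push_elem(&events, &in, BPF_EXIST); // evict oldest if full
+ * 	if (bpf_map_peek_elem(&events, &out) == 0) {
+ * 		// out now holds the head element, still queued
+ * 	}
+ * 	if (bpf_map_pop_elem(&events, &out) == 0) {
+ * 		// out now holds the head element, removed from the queue
+ * 	}
+ */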
+
+/*
+ * bpf_msg_push_data
+ *
+ * 	For socket policies, insert *len* bytes into *msg* at offset
+ * 	*start*.
+ *
+ * 	If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * 	*msg*, it may want to insert metadata or options into the *msg*.
+ * 	This can later be read and used by any of the lower layer BPF
+ * 	hooks.
+ *
+ * 	This helper may fail if it is under memory pressure (a malloc
+ * 	fails); in these cases the BPF program will get an appropriate
+ * 	error and will need to handle it.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_msg_push_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 90;
+
+/*
+ * bpf_msg_pop_data
+ *
+ * 	Will remove *len* bytes from a *msg* starting at byte *start*.
+ * 	This may result in **ENOMEM** errors under certain situations if
+ * 	an allocation and copy are required due to a full ring buffer.
+ * 	However, the helper will try to avoid doing the allocation
+ * 	if possible. Other errors can occur if input parameters are
+ * 	invalid, either due to the *start* byte not being a valid part
+ * 	of the *msg* payload and/or the *pop* value being too large.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_msg_pop_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 91;
+
+/*
+ * bpf_rc_pointer_rel
+ *
+ * 	This helper is used in programs implementing IR decoding, to
+ * 	report a successfully decoded pointer movement.
+ *
+ * 	The *ctx* should point to the lirc sample as passed into
+ * 	the program.
+ *
+ * 	This helper is only available if the kernel was compiled with
+ * 	the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ * 	"**y**".
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_rc_pointer_rel)(void *ctx, __s32 rel_x, __s32 rel_y) = (void *) 92;
+
+/*
+ * bpf_spin_lock
+ *
+ * 	Acquire a spinlock represented by the pointer *lock*, which is
+ * 	stored as part of a map value. Taking the lock makes it possible
+ * 	to safely update the rest of the fields in that value. The
+ * 	spinlock can (and must) later be released with a call to
+ * 	**bpf_spin_unlock**\ (\ *lock*\ ).
+ *
+ * 	Spinlocks in BPF programs come with a number of restrictions
+ * 	and constraints:
+ *
+ * 	* **bpf_spin_lock** objects are only allowed inside maps of
+ * 	  types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this
+ * 	  list could be extended in the future).
+ * 	* BTF description of the map is mandatory.
+ * 	* The BPF program can take ONE lock at a time, since taking two
+ * 	  or more could cause deadlocks.
+ * 	* Only one **struct bpf_spin_lock** is allowed per map element.
+ * 	* When the lock is taken, calls (either BPF to BPF or helpers)
+ * 	  are not allowed.
+ * 	* The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not
+ * 	  allowed inside a spinlock-ed region.
+ * 	* The BPF program MUST call **bpf_spin_unlock**\ () to release
+ * 	  the lock, on all execution paths, before it returns.
+ * 	* The BPF program can access **struct bpf_spin_lock** only via
+ * 	  the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ ()
+ * 	  helpers. Loading or storing data into the **struct
+ * 	  bpf_spin_lock** *lock*\ **;** field of a map is not allowed.
+ * 	* To use the **bpf_spin_lock**\ () helper, the BTF description
+ * 	  of the map value must be a struct and have a **struct
+ * 	  bpf_spin_lock** *anyname*\ **;** field at the top level.
+ * 	  Nesting the lock inside another struct is not allowed.
+ * 	* The **struct bpf_spin_lock** *lock* field in a map value must
+ * 	  be aligned on a multiple of 4 bytes in that value.
+ * 	* Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy
+ * 	  the **bpf_spin_lock** field to user space.
+ * 	* Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from
+ * 	  a BPF program, do not update the **bpf_spin_lock** field.
+ * 	* **bpf_spin_lock** cannot be on the stack or inside a
+ * 	  networking packet (it can only be inside of a map value).
+ * 	* **bpf_spin_lock** is available to root only.
+ * 	* Tracing programs and socket filter programs cannot use
+ * 	  **bpf_spin_lock**\ () due to insufficient preemption checks
+ * 	  (but this may change in the future).
+ * 	* **bpf_spin_lock** is not allowed in inner maps of map-in-map.
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) 93;
+
+/*
+ * bpf_spin_unlock
+ *
+ * 	Release the *lock* previously locked by a call to
+ * 	**bpf_spin_lock**\ (\ *lock*\ ).
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) 94;
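+
+/*
+ * A minimal usage sketch (illustrative only; assumes a BTF-described hash
+ * map named my_map whose value type is the struct below):
+ *
+ * 	struct val {
+ * 		struct bpf_spin_lock lock; // top-level, 4-byte aligned
+ * 		__u64 counter;
+ * 	};
+ *
+ * 	struct val *v = bpf_map_lookup_elem(&my_map, &key);
+ * 	if (v) {
+ * 		bpf_spin_lock(&v->lock);
+ * 		v->counter++; // no helper calls allowed while locked
+ * 		bpf_spin_unlock(&v->lock);
+ * 	}
+ */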
+
+/*
+ * bpf_sk_fullsock
+ *
+ * 	This helper gets a **struct bpf_sock** pointer such
+ * 	that all the fields in this **bpf_sock** can be accessed.
+ *
+ * Returns
+ * 	A **struct bpf_sock** pointer on success, or **NULL** in
+ * 	case of failure.
+ */
+static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) 95;
+
+/*
+ * bpf_tcp_sock
+ *
+ * 	This helper gets a **struct bpf_tcp_sock** pointer from a
+ * 	**struct bpf_sock** pointer.
+ *
+ * Returns
+ * 	A **struct bpf_tcp_sock** pointer on success, or **NULL** in
+ * 	case of failure.
+ */
+static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96;
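+
+/*
+ * A minimal usage sketch (illustrative only) chaining the two casts above
+ * in a cgroup/skb program to reach TCP-only fields:
+ *
+ * 	struct bpf_sock *sk = skb->sk ? bpf_sk_fullsock(skb->sk) : NULL;
+ * 	struct bpf_tcp_sock *tp = sk ? bpf_tcp_sock(sk) : NULL;
+ *
+ * 	if (tp) {
+ * 		// tp->snd_cwnd, tp->srtt_us etc. are readable here
+ * 	}
+ */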
+
+/*
+ * bpf_skb_ecn_set_ce
+ *
+ * 	Set ECN (Explicit Congestion Notification) field of IP header
+ * 	to **CE** (Congestion Encountered) if current value is **ECT**
+ * 	(ECN Capable Transport). Otherwise, do nothing. Works with IPv6
+ * 	and IPv4.
+ *
+ * Returns
+ * 	1 if the **CE** flag is set (either by the current helper call
+ * 	or because it was already present), 0 if it is not set.
+ */
+static long (*bpf_skb_ecn_set_ce)(struct __sk_buff *skb) = (void *) 97;
+
+/*
+ * bpf_get_listener_sock
+ *
+ * 	Return a **struct bpf_sock** pointer in **TCP_LISTEN** state.
+ * 	**bpf_sk_release**\ () is unnecessary and not allowed.
+ *
+ * Returns
+ * 	A **struct bpf_sock** pointer on success, or **NULL** in
+ * 	case of failure.
+ */
+static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) 98;
+
+/*
+ * bpf_skc_lookup_tcp
+ *
+ * 	Look for TCP socket matching *tuple*, optionally in a child
+ * 	network namespace *netns*. The return value must be checked,
+ * 	and if non-**NULL**, released via **bpf_sk_release**\ ().
+ *
+ * 	This function is identical to **bpf_sk_lookup_tcp**\ (), except
+ * 	that it also returns timewait or request sockets. Use
+ * 	**bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the
+ * 	full structure.
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	**CONFIG_NET** configuration option.
+ *
+ * Returns
+ * 	Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ * 	For sockets with reuseport option, the **struct bpf_sock**
+ * 	result is from *reuse*\ **->socks**\ [] using the hash of the
+ * 	tuple.
+ */
+static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 99;
+
+/*
+ * bpf_tcp_check_syncookie
+ *
+ * 	Check whether *iph* and *th* contain a valid SYN cookie ACK for
+ * 	the listening socket in *sk*.
+ *
+ * 	*iph* points to the start of the IPv4 or IPv6 header, while
+ * 	*iph_len* contains **sizeof**\ (**struct iphdr**) or
+ * 	**sizeof**\ (**struct ip6hdr**).
+ *
+ * 	*th* points to the start of the TCP header, while *th_len*
+ * 	contains **sizeof**\ (**struct tcphdr**).
+ *
+ * Returns
+ * 	0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
+ * 	error otherwise.
+ */
+static long (*bpf_tcp_check_syncookie)(struct bpf_sock *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 100;
+
+/*
+ * bpf_sysctl_get_name
+ *
+ * 	Get the name of a sysctl in /proc/sys/ and copy it into the
+ * 	buffer *buf* of size *buf_len* provided by the program.
+ *
+ * 	The buffer is always NUL terminated, unless it's zero-sized.
+ *
+ * 	If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is
+ * 	copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name
+ * 	only (e.g. "tcp_mem").
+ *
+ * Returns
+ * 	Number of characters copied (not including the trailing NUL).
+ *
+ * 	**-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ * 	the truncated name in this case).
+ */
+static long (*bpf_sysctl_get_name)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len, __u64 flags) = (void *) 101;
+
+/*
+ * bpf_sysctl_get_current_value
+ *
+ * 	Get the current value of a sysctl as it is presented in /proc/sys
+ * 	(incl. newline, etc), and copy it as a string into the buffer
+ * 	*buf* of size *buf_len* provided by the program.
+ *
+ * 	The whole value is copied, no matter at what file position user
+ * 	space issued e.g. sys_read.
+ *
+ * 	The buffer is always NUL terminated, unless it's zero-sized.
+ *
+ * Returns
+ * 	Number of characters copied (not including the trailing NUL).
+ *
+ * 	**-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ * 	the truncated value in this case).
+ *
+ * 	**-EINVAL** if current value was unavailable, e.g. because
+ * 	sysctl is uninitialized and read returns -EIO for it.
+ */
+static long (*bpf_sysctl_get_current_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 102;
+
+/*
+ * bpf_sysctl_get_new_value
+ *
+ * 	Get the new value being written by user space to a sysctl
+ * 	(before the actual write happens) and copy it as a string into
+ * 	the buffer *buf* of size *buf_len* provided by the program.
+ *
+ * 	User space may write the new value at a file position > 0.
+ *
+ * 	The buffer is always NUL terminated, unless it's zero-sized.
+ *
+ * Returns
+ * 	Number of characters copied (not including the trailing NUL).
+ *
+ * 	**-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ * 	the truncated value in this case).
+ *
+ * 	**-EINVAL** if sysctl is being read.
+ */
+static long (*bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 103;
+
+/*
+ * bpf_sysctl_set_new_value
+ *
+ * 	Override the new value being written by user space to a sysctl
+ * 	with the value provided by the program in buffer *buf* of size
+ * 	*buf_len*.
+ *
+ * 	*buf* should contain a string in the same form as provided by
+ * 	user space on a sysctl write.
+ *
+ * 	User space may write the new value at a file position > 0. To
+ * 	override the whole sysctl value, the file position should be set
+ * 	to zero.
+ *
+ * Returns
+ * 	0 on success.
+ *
+ * 	**-E2BIG** if the *buf_len* is too big.
+ *
+ * 	**-EINVAL** if sysctl is being read.
+ */
+static long (*bpf_sysctl_set_new_value)(struct bpf_sysctl *ctx, const char *buf, unsigned long buf_len) = (void *) 104;
+
+/*
+ * bpf_strtol
+ *
+ * 	Convert the initial part of the string from buffer *buf* of
+ * 	size *buf_len* to a long integer according to the given base
+ * 	and save the result in *res*.
+ *
+ * 	The string may begin with an arbitrary amount of white space
+ * 	(as determined by **isspace**\ (3)) followed by a single
+ * 	optional '**-**' sign.
+ *
+ * 	The five least significant bits of *flags* encode the base;
+ * 	other bits are currently unused.
+ *
+ * 	Base must be either 8, 10, 16 or 0 to detect it automatically
+ * 	similar to user space **strtol**\ (3).
+ *
+ * Returns
+ * 	Number of characters consumed on success. Must be positive but
+ * 	no more than *buf_len*.
+ *
+ * 	**-EINVAL** if no valid digits were found or unsupported base
+ * 	was provided.
+ *
+ * 	**-ERANGE** if resulting value was out of range.
+ */
+static long (*bpf_strtol)(const char *buf, unsigned long buf_len, __u64 flags, long *res) = (void *) 105;
+
+/*
+ * bpf_strtoul
+ *
+ * 	Convert the initial part of the string from buffer *buf* of
+ * 	size *buf_len* to an unsigned long integer according to the
+ * 	given base and save the result in *res*.
+ *
+ * 	The string may begin with an arbitrary amount of white space
+ * 	(as determined by **isspace**\ (3)).
+ *
+ * 	The five least significant bits of *flags* encode the base;
+ * 	other bits are currently unused.
+ *
+ * 	Base must be either 8, 10, 16 or 0 to detect it automatically
+ * 	similar to user space **strtoul**\ (3).
+ *
+ * Returns
+ * 	Number of characters consumed on success. Must be positive but
+ * 	no more than *buf_len*.
+ *
+ * 	**-EINVAL** if no valid digits were found or unsupported base
+ * 	was provided.
+ *
+ * 	**-ERANGE** if resulting value was out of range.
+ */
+static long (*bpf_strtoul)(const char *buf, unsigned long buf_len, __u64 flags, unsigned long *res) = (void *) 106;
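+
+/*
+ * A minimal usage sketch (illustrative only): a BPF_PROG_TYPE_CGROUP_SYSCTL
+ * program combining bpf_sysctl_get_new_value() and bpf_strtoul() to cap
+ * writes to some sysctl at 1024. A base of 0 auto-detects the base:
+ *
+ * 	SEC("cgroup/sysctl")
+ * 	int sysctl_guard(struct bpf_sysctl *ctx)
+ * 	{
+ * 		unsigned long val;
+ * 		char buf[16];
+ * 		long n;
+ *
+ * 		n = bpf_sysctl_get_new_value(ctx, buf, sizeof(buf));
+ * 		if (n <= 0)
+ * 			return 1; // a read, or nothing to parse: allow
+ * 		if (bpf_strtoul(buf, n, 0, &val) < 0)
+ * 			return 0; // reject values we cannot parse
+ * 		return val <= 1024;
+ * 	}
+ */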
+
+/*
+ * bpf_sk_storage_get
+ *
+ * 	Get a bpf-local-storage from a *sk*.
+ *
+ * 	Logically, it can be thought of as getting the value from
+ * 	a *map* with *sk* as the **key**. From this
+ * 	perspective, the usage is not much different from
+ * 	**bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except that this
+ * 	helper enforces that the key must be a full socket and that the
+ * 	map must be of type **BPF_MAP_TYPE_SK_STORAGE**.
+ *
+ * 	Underneath, the value is stored locally at *sk* instead of
+ * 	the *map*.  The *map* is used as the bpf-local-storage
+ * 	"type". The bpf-local-storage "type" (i.e. the *map*) is
+ * 	searched against all bpf-local-storages residing at *sk*.
+ *
+ * 	An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be
+ * 	used such that a new bpf-local-storage will be
+ * 	created if one does not exist.  *value* can be used
+ * 	together with **BPF_SK_STORAGE_GET_F_CREATE** to specify
+ * 	the initial value of a bpf-local-storage.  If *value* is
+ * 	**NULL**, the new bpf-local-storage will be zero initialized.
+ *
+ * Returns
+ * 	A bpf-local-storage pointer is returned on success.
+ *
+ * 	**NULL** if not found or there was an error in adding
+ * 	a new bpf-local-storage.
+ */
+static void *(*bpf_sk_storage_get)(void *map, struct bpf_sock *sk, void *value, __u64 flags) = (void *) 107;
+
+/*
+ * bpf_sk_storage_delete
+ *
+ * 	Delete a bpf-local-storage from a *sk*.
+ *
+ * Returns
+ * 	0 on success.
+ *
+ * 	**-ENOENT** if the bpf-local-storage cannot be found.
+ */
+static long (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) = (void *) 108;
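+
+/*
+ * A minimal usage sketch (illustrative only; assumes a
+ * BPF_MAP_TYPE_SK_STORAGE map named sk_stats with a __u64 value declared
+ * elsewhere): per-socket byte accounting with get-or-create semantics.
+ *
+ * 	__u64 *bytes = bpf_sk_storage_get(&sk_stats, sk, NULL,
+ * 					  BPF_SK_STORAGE_GET_F_CREATE);
+ * 	if (bytes)
+ * 		*bytes += skb->len;
+ */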
+
+/*
+ * bpf_send_signal
+ *
+ * 	Send signal *sig* to the process of the current task.
+ * 	The signal may be delivered to any of this process's threads.
+ *
+ * Returns
+ * 	0 on success or successfully queued.
+ *
+ * 	**-EBUSY** if the work queue under NMI is full.
+ *
+ * 	**-EINVAL** if *sig* is invalid.
+ *
+ * 	**-EPERM** if no permission to send the *sig*.
+ *
+ * 	**-EAGAIN** if the bpf program can try again.
+ */
+static long (*bpf_send_signal)(__u32 sig) = (void *) 109;
+
+/*
+ * bpf_tcp_gen_syncookie
+ *
+ * 	Try to issue a SYN cookie for the packet with corresponding
+ * 	IP/TCP headers, *iph* and *th*, on the listening socket in *sk*.
+ *
+ * 	*iph* points to the start of the IPv4 or IPv6 header, while
+ * 	*iph_len* contains **sizeof**\ (**struct iphdr**) or
+ * 	**sizeof**\ (**struct ip6hdr**).
+ *
+ * 	*th* points to the start of the TCP header, while *th_len*
+ * 	contains the length of the TCP header.
+ *
+ * Returns
+ * 	On success, the lower 32 bits hold the generated SYN cookie,
+ * 	followed by 16 bits which hold the MSS value for that cookie;
+ * 	the top 16 bits are unused.
+ *
+ * 	On failure, the returned value is one of the following:
+ *
+ * 	**-EINVAL** SYN cookie cannot be issued due to error
+ *
+ * 	**-ENOENT** SYN cookie should not be issued (no SYN flood)
+ *
+ * 	**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
+ *
+ * 	**-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ */
+static __s64 (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 110;
+
+/*
+ * bpf_skb_output
+ *
+ * 	Write raw *data* blob into a special BPF perf event held by
+ * 	*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 	event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 	as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 	**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 	The *flags* are used to indicate the index in *map* for which
+ * 	the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 	Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 	to indicate that the index of the current CPU core should be
+ * 	used.
+ *
+ * 	The value to write, of *size*, is passed through the eBPF stack
+ * 	and pointed to by *data*.
+ *
+ * 	*ctx* is a pointer to in-kernel struct sk_buff.
+ *
+ * 	This helper is similar to **bpf_perf_event_output**\ () but
+ * 	restricted to raw_tracepoint bpf programs.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 111;
+
+/*
+ * bpf_probe_read_user
+ *
+ * 	Safely attempt to read *size* bytes from user space address
+ * 	*unsafe_ptr* and store the data in *dst*.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_probe_read_user)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 112;
+
+/*
+ * bpf_probe_read_kernel
+ *
+ * 	Safely attempt to read *size* bytes from kernel space address
+ * 	*unsafe_ptr* and store the data in *dst*.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_probe_read_kernel)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 113;
+
+/*
+ * bpf_probe_read_user_str
+ *
+ * 	Copy a NUL terminated string from an unsafe user address
+ * 	*unsafe_ptr* to *dst*. The *size* should include the
+ * 	terminating NUL byte. In case the string length is smaller than
+ * 	*size*, the target is not padded with further NUL bytes. If the
+ * 	string length is larger than *size*, just *size*-1 bytes are
+ * 	copied and the last byte is set to NUL.
+ *
+ * 	On success, the length of the copied string is returned. This
+ * 	makes this helper useful in tracing programs for reading
+ * 	strings, and more importantly to get their length at runtime. See
+ * 	the following snippet:
+ *
+ * 	::
+ *
+ * 		SEC("kprobe/sys_open")
+ * 		void bpf_sys_open(struct pt_regs *ctx)
+ * 		{
+ * 		        char buf[PATHLEN]; // PATHLEN is defined to 256
+ * 		        int res = bpf_probe_read_user_str(buf, sizeof(buf),
+ * 			                                  ctx->di);
+ *
+ * 			// Consume buf, for example push it to
+ * 			// userspace via bpf_perf_event_output(); we
+ * 			// can use res (the string length) as event
+ * 			// size, after checking its boundaries.
+ * 		}
+ *
+ * 	In comparison, using the **bpf_probe_read_user**\ () helper here
+ * 	instead to read the string would require estimating the length
+ * 	at compile time, and would often result in copying more memory
+ * 	than necessary.
+ *
+ * 	Another useful use case is parsing individual process
+ * 	arguments or individual environment variables while navigating
+ * 	*current*\ **->mm->arg_start** and *current*\
+ * 	**->mm->env_start**: using this helper and the return value,
+ * 	one can quickly iterate at the right offset of the memory area.
+ *
+ * Returns
+ * 	On success, the strictly positive length of the string,
+ * 	including the trailing NUL character. On error, a negative
+ * 	value.
+ */
+static long (*bpf_probe_read_user_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 114;
+
+/*
+ * bpf_probe_read_kernel_str
+ *
+ * 	Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
+ * 	to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply.
+ *
+ * Returns
+ * 	On success, the strictly positive length of the string, including
+ * 	the trailing NUL character. On error, a negative value.
+ */
+static long (*bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 115;
+
+/*
+ * bpf_tcp_send_ack
+ *
+ * 	Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**.
+ * 	*rcv_nxt* is the ack_seq to be sent out.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_tcp_send_ack)(void *tp, __u32 rcv_nxt) = (void *) 116;
+
+/*
+ * bpf_send_signal_thread
+ *
+ * 	Send signal *sig* to the thread corresponding to the current task.
+ *
+ * Returns
+ * 	0 on success or successfully queued.
+ *
+ * 	**-EBUSY** if the work queue under NMI is full.
+ *
+ * 	**-EINVAL** if *sig* is invalid.
+ *
+ * 	**-EPERM** if no permission to send the *sig*.
+ *
+ * 	**-EAGAIN** if the bpf program can try again.
+ */
+static long (*bpf_send_signal_thread)(__u32 sig) = (void *) 117;
+
+/*
+ * bpf_jiffies64
+ *
+ * 	Obtain the 64-bit jiffies counter.
+ *
+ * Returns
+ * 	The 64-bit jiffies counter.
+ */
+static __u64 (*bpf_jiffies64)(void) = (void *) 118;
+
+/*
+ * bpf_read_branch_records
+ *
+ * 	For an eBPF program attached to a perf event, retrieve the
+ * 	branch records (**struct perf_branch_entry**) associated with
+ * 	*ctx* and store them in the buffer pointed to by *buf*, up to
+ * 	*size* bytes.
+ *
+ * Returns
+ * 	On success, number of bytes written to *buf*. On error, a
+ * 	negative value.
+ *
+ * 	The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
+ * 	instead return the number of bytes required to store all the
+ * 	branch entries. If this flag is set, *buf* may be NULL.
+ *
+ * 	**-EINVAL** if arguments are invalid or **size** is not a
+ * 	multiple of **sizeof**\ (**struct perf_branch_entry**\ ).
+ *
+ * 	**-ENOENT** if architecture does not support branch records.
+ */
+static long (*bpf_read_branch_records)(struct bpf_perf_event_data *ctx, void *buf, __u32 size, __u64 flags) = (void *) 119;
+
+/*
+ * bpf_get_ns_current_pid_tgid
+ *
+ * 	On success, the values for *pid* and *tgid* as seen from the
+ * 	current *namespace* are returned in *nsdata*.
+ *
+ * Returns
+ * 	0 on success, or one of the following in case of failure:
+ *
+ * 	**-EINVAL** if the dev and inum supplied don't match the dev_t and
+ * 	inode number of the nsfs of the current task, or if the dev
+ * 	conversion to dev_t lost high bits.
+ *
+ * 	**-ENOENT** if the pidns does not exist for the current task.
+ */
+static long (*bpf_get_ns_current_pid_tgid)(__u64 dev, __u64 ino, struct bpf_pidns_info *nsdata, __u32 size) = (void *) 120;
+
+/*
+ * bpf_xdp_output
+ *
+ * 	Write raw *data* blob into a special BPF perf event held by
+ * 	*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 	event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 	as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 	**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 	The *flags* are used to indicate the index in *map* for which
+ * 	the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 	Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 	to indicate that the index of the current CPU core should be
+ * 	used.
+ *
+ * 	The value to write, of *size*, is passed through the eBPF stack
+ * 	and pointed to by *data*.
+ *
+ * 	*ctx* is a pointer to in-kernel struct xdp_buff.
+ *
+ * 	This helper is similar to **bpf_perf_event_output**\ () but
+ * 	restricted to raw_tracepoint bpf programs.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_xdp_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 121;
+
+/*
+ * bpf_get_netns_cookie
+ *
+ * 	Retrieve the cookie (generated by the kernel) of the network
+ * 	namespace the input *ctx* is associated with. The network
+ * 	namespace cookie remains stable for its lifetime and provides
+ * 	a global identifier that can be assumed unique. If *ctx* is
+ * 	NULL, then the helper returns the cookie for the initial
+ * 	network namespace. The cookie itself is very similar to that
+ * 	of **bpf_get_socket_cookie**\ () helper, but for network
+ * 	namespaces instead of sockets.
+ *
+ * Returns
+ * 	An 8-byte long opaque number.
+ */
+static __u64 (*bpf_get_netns_cookie)(void *ctx) = (void *) 122;
+
+/*
+ * bpf_get_current_ancestor_cgroup_id
+ *
+ * 	Return the id of the cgroup v2 ancestor, at *ancestor_level*, of
+ * 	the cgroup associated with the current task. The root cgroup
+ * 	is at *ancestor_level* zero and each step down the hierarchy
+ * 	increments the level. If *ancestor_level* == level of cgroup
+ * 	associated with the current task, then the return value will be
+ * 	the same as that of **bpf_get_current_cgroup_id**\ ().
+ *
+ * 	The helper is useful to implement policies based on cgroups
+ * 	that are higher in the hierarchy than the immediate cgroup
+ * 	associated with the current task.
+ *
+ * 	The format of the returned id and the helper limitations are
+ * 	the same as in **bpf_get_current_cgroup_id**\ ().
+ *
+ * Returns
+ * 	The id is returned or 0 in case the id could not be retrieved.
+ */
+static __u64 (*bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void *) 123;
+
+/*
+ * bpf_sk_assign
+ *
+ * 	Helper is overloaded depending on BPF program type. This
+ * 	description applies to **BPF_PROG_TYPE_SCHED_CLS** and
+ * 	**BPF_PROG_TYPE_SCHED_ACT** programs.
+ *
+ * 	Assign the *sk* to the *skb*. When combined with appropriate
+ * 	routing configuration to receive the packet towards the socket,
+ * 	this will cause *skb* to be delivered to the specified socket.
+ * 	Subsequent redirection of *skb* via  **bpf_redirect**\ (),
+ * 	**bpf_clone_redirect**\ () or other methods outside of BPF may
+ * 	interfere with successful delivery to the socket.
+ *
+ * 	This operation is only valid from the TC ingress path.
+ *
+ * 	The *flags* argument must be zero.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure:
+ *
+ * 	**-EINVAL** if specified *flags* are not supported.
+ *
+ * 	**-ENOENT** if the socket is unavailable for assignment.
+ *
+ * 	**-ENETUNREACH** if the socket is unreachable (wrong netns).
+ *
+ * 	**-EOPNOTSUPP** if the operation is not supported, for example
+ * 	a call from outside of TC ingress.
+ *
+ * 	**-ESOCKTNOSUPPORT** if the socket type is not supported
+ * 	(reuseport).
+ */
+static long (*bpf_sk_assign)(void *ctx, struct bpf_sock *sk, __u64 flags) = (void *) 124;
+
+/*
+ * bpf_ktime_get_boot_ns
+ *
+ * 	Return the time elapsed since system boot, in nanoseconds.
+ * 	Does include the time the system was suspended.
+ * 	See: **clock_gettime**\ (**CLOCK_BOOTTIME**)
+ *
+ * Returns
+ * 	Current *ktime*.
+ */
+static __u64 (*bpf_ktime_get_boot_ns)(void) = (void *) 125;
+
+/*
+ * bpf_seq_printf
+ *
+ * 	**bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
+ * 	out the format string.
+ * 	The *m* represents the seq_file. The *fmt* and *fmt_size* are for
+ * 	the format string itself. The *data* and *data_len* are format string
+ * 	arguments. *data* is a **u64** array in which the corresponding
+ * 	format string values are stored. For strings and pointers where pointees
+ * 	are accessed, only the pointer values are stored in the *data* array.
+ * 	The *data_len* is the size of *data* in bytes.
+ *
+ * 	The formats **%s** and **%p{i,I}{4,6}** require reading kernel memory.
+ * 	Reading kernel memory may fail due to either invalid address or
+ * 	valid address but requiring a major memory fault. If reading kernel memory
+ * 	fails, the string for **%s** will be an empty string, and the ip
+ * 	address for **%p{i,I}{4,6}** will be 0. Not returning error to
+ * 	bpf program is consistent with what **bpf_trace_printk**\ () does for now.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure:
+ *
+ * 	**-EBUSY** if per-CPU memory copy buffer is busy, can try again
+ * 	by returning 1 from bpf program.
+ *
+ * 	**-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported.
+ *
+ * 	**-E2BIG** if *fmt* contains too many format specifiers.
+ *
+ * 	**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ */
+static long (*bpf_seq_printf)(struct seq_file *m, const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 126;
+
+/*
+ * bpf_seq_write
+ *
+ * 	**bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data.
+ * 	The *m* represents the seq_file. The *data* and *len* represent
+ * 	the data to write, in bytes.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure:
+ *
+ * 	**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ */
+static long (*bpf_seq_write)(struct seq_file *m, const void *data, __u32 len) = (void *) 127;
+
+/*
+ * bpf_sk_cgroup_id
+ *
+ * 	Return the cgroup v2 id of the socket *sk*.
+ *
+ * 	*sk* must be a non-**NULL** pointer to a full socket, e.g. one
+ * 	returned from **bpf_sk_lookup_xxx**\ (),
+ * 	**bpf_sk_fullsock**\ (), etc. The format of the returned id is
+ * 	the same as in **bpf_skb_cgroup_id**\ ().
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	the **CONFIG_SOCK_CGROUP_DATA** configuration option.
+ *
+ * Returns
+ * 	The id is returned or 0 in case the id could not be retrieved.
+ */
+static __u64 (*bpf_sk_cgroup_id)(struct bpf_sock *sk) = (void *) 128;
+
+/*
+ * bpf_sk_ancestor_cgroup_id
+ *
+ * 	Return the id of the cgroup v2 ancestor, at *ancestor_level*, of
+ * 	the cgroup associated with the *sk*. The root cgroup is at
+ * 	*ancestor_level* zero and each step down the hierarchy
+ * 	increments the level. If *ancestor_level* == level of cgroup
+ * 	associated with *sk*, then the return value will be the same
+ * 	as that of **bpf_sk_cgroup_id**\ ().
+ *
+ * 	The helper is useful to implement policies based on cgroups
+ * 	that are higher in the hierarchy than the immediate cgroup
+ * 	associated with *sk*.
+ *
+ * 	The format of the returned id and the helper limitations are
+ * 	the same as in **bpf_sk_cgroup_id**\ ().
+ *
+ * Returns
+ * 	The id is returned or 0 in case the id could not be retrieved.
+ */
+static __u64 (*bpf_sk_ancestor_cgroup_id)(struct bpf_sock *sk, int ancestor_level) = (void *) 129;
+
+/*
+ * bpf_ringbuf_output
+ *
+ * 	Copy *size* bytes from *data* into a ring buffer *ringbuf*.
+ * 	If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
+ * 	of new data availability is sent.
+ * 	If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
+ * 	of new data availability is sent unconditionally.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_ringbuf_output)(void *ringbuf, void *data, __u64 size, __u64 flags) = (void *) 130;
+
+/*
+ * bpf_ringbuf_reserve
+ *
+ * 	Reserve *size* bytes of payload in a ring buffer *ringbuf*.
+ *
+ * Returns
+ * 	A valid pointer with *size* bytes of memory available; NULL
+ * 	otherwise.
+ */
+static void *(*bpf_ringbuf_reserve)(void *ringbuf, __u64 size, __u64 flags) = (void *) 131;
+
+/*
+ * bpf_ringbuf_submit
+ *
+ * 	Submit a reserved ring buffer sample, pointed to by *data*.
+ * 	If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
+ * 	of new data availability is sent.
+ * 	If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
+ * 	of new data availability is sent unconditionally.
+ *
+ * Returns
+ * 	Nothing. Always succeeds.
+ */
+static void (*bpf_ringbuf_submit)(void *data, __u64 flags) = (void *) 132;
+
+/*
+ * bpf_ringbuf_discard
+ *
+ * 	Discard a reserved ring buffer sample, pointed to by *data*.
+ * 	If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
+ * 	of new data availability is sent.
+ * 	If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
+ * 	of new data availability is sent unconditionally.
+ *
+ * Returns
+ * 	Nothing. Always succeeds.
+ */
+static void (*bpf_ringbuf_discard)(void *data, __u64 flags) = (void *) 133;
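+
+/*
+ * A minimal usage sketch (illustrative only; assumes a BPF_MAP_TYPE_RINGBUF
+ * map named rb, a struct event, and a hypothetical filter_out() predicate):
+ * reserve/submit avoids the extra copy that bpf_ringbuf_output() makes.
+ *
+ * 	struct event *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+ * 	if (!e)
+ * 		return 0; // ring buffer full
+ * 	e->pid = bpf_get_current_pid_tgid() >> 32;
+ * 	if (filter_out(e))
+ * 		bpf_ringbuf_discard(e, 0); // drop the reservation
+ * 	else
+ * 		bpf_ringbuf_submit(e, 0);  // publish to user space
+ */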
+
+/*
+ * bpf_ringbuf_query
+ *
+ * 	Query various characteristics of the provided ring buffer. What
+ * 	exactly is queried is determined by *flags*:
+ *
+ * 	* **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed.
+ * 	* **BPF_RB_RING_SIZE**: The size of ring buffer.
+ * 	* **BPF_RB_CONS_POS**: Consumer position (can wrap around).
+ * 	* **BPF_RB_PROD_POS**: Producer(s) position (can wrap around).
+ *
+ * 	The data returned is just a momentary snapshot of the actual
+ * 	values and could be inaccurate, so this facility should be used
+ * 	to power heuristics and for reporting, not to make 100% correct
+ * 	calculations.
+ *
+ * Returns
+ * 	Requested value, or 0, if *flags* are not recognized.
+ */
+static __u64 (*bpf_ringbuf_query)(void *ringbuf, __u64 flags) = (void *) 134;
+
+/*
+ * bpf_csum_level
+ *
+ * 	Change the skb's checksum level by one layer up or down, or
+ * 	reset it entirely to none in order to have the stack perform
+ * 	checksum validation. The level is applicable to the following
+ * 	protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of
+ * 	| ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP |
+ * 	through **bpf_skb_adjust_room**\ () helper with passing in
+ * 	**BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call
+ * 	to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since
+ * 	the UDP header is removed. Similarly, an encap of the latter
+ * 	into the former could be accompanied by a helper call to
+ * 	**bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the
+ * 	skb is still intended to be processed in higher layers of the
+ * 	stack instead of just egressing at tc.
+ *
+ * 	There are three supported level settings, plus a query, at this time:
+ *
+ * 	* **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs
+ * 	  with CHECKSUM_UNNECESSARY.
+ * 	* **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs
+ * 	  with CHECKSUM_UNNECESSARY.
+ * 	* **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and
+ * 	  sets CHECKSUM_NONE to force checksum validation by the stack.
+ * 	* **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current
+ * 	  skb->csum_level.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure. In the
+ * 	case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
+ * 	is returned or the error code -EACCES in case the skb is not
+ * 	subject to CHECKSUM_UNNECESSARY.
+ */
+static long (*bpf_csum_level)(struct __sk_buff *skb, __u64 level) = (void *) 135;
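+
+/*
+ * A minimal usage sketch (illustrative only): after a decap performed with
+ * bpf_skb_adjust_room() and the BPF_F_ADJ_ROOM_NO_CSUM_RESET flag, step
+ * the checksum level down by one layer, as described above:
+ *
+ * 	bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC);
+ */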
+
+/*
+ * bpf_skc_to_tcp6_sock
+ *
+ * 	Dynamically cast a *sk* pointer to a *tcp6_sock* pointer.
+ *
+ * Returns
+ * 	*sk* if casting is valid, or NULL otherwise.
+ */
+static struct tcp6_sock *(*bpf_skc_to_tcp6_sock)(void *sk) = (void *) 136;
+
+/*
+ * bpf_skc_to_tcp_sock
+ *
+ * 	Dynamically cast a *sk* pointer to a *tcp_sock* pointer.
+ *
+ * Returns
+ * 	*sk* if casting is valid, or NULL otherwise.
+ */
+static struct tcp_sock *(*bpf_skc_to_tcp_sock)(void *sk) = (void *) 137;
+
+/*
+ * bpf_skc_to_tcp_timewait_sock
+ *
+ * 	Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer.
+ *
+ * Returns
+ * 	*sk* if casting is valid, or NULL otherwise.
+ */
+static struct tcp_timewait_sock *(*bpf_skc_to_tcp_timewait_sock)(void *sk) = (void *) 138;
+
+/*
+ * bpf_skc_to_tcp_request_sock
+ *
+ * 	Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer.
+ *
+ * Returns
+ * 	*sk* if casting is valid, or NULL otherwise.
+ */
+static struct tcp_request_sock *(*bpf_skc_to_tcp_request_sock)(void *sk) = (void *) 139;
+
+/*
+ * bpf_skc_to_udp6_sock
+ *
+ * 	Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
+ *
+ * Returns
+ * 	*sk* if casting is valid, or NULL otherwise.
+ */
+static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140;
+
+/*
+ * bpf_get_task_stack
+ *
+ * 	Return a user or a kernel stack in a bpf program-provided
+ * 	buffer. To achieve this, the helper needs *task*, which is a
+ * 	valid pointer to a struct task_struct. To store the stacktrace,
+ * 	the bpf program provides *buf* with a nonnegative *size*.
+ *
+ * 	The last argument, *flags*, holds the number of stack frames to
+ * 	skip (from 0 to 255), masked with
+ * 	**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 	the following flags:
+ *
+ * 	**BPF_F_USER_STACK**
+ * 		Collect a user space stack instead of a kernel stack.
+ * 	**BPF_F_USER_BUILD_ID**
+ * 		Collect buildid+offset instead of ips for user stack,
+ * 		only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * 	**bpf_get_task_stack**\ () can collect up to
+ * 	**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ * 	to a sufficiently large buffer size. Note that
+ * 	this limit can be controlled with the **sysctl** program, and
+ * 	that it should be manually increased in order to profile long
+ * 	user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 	::
+ *
+ * 		# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * Returns
+ * 	A non-negative value equal to or less than *size* on success,
+ * 	or a negative error in case of failure.
+ */
+static long (*bpf_get_task_stack)(struct task_struct *task, void *buf, __u32 size, __u64 flags) = (void *) 141;
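+
+/*
+ * A minimal usage sketch (illustrative only; *task* is assumed to be a
+ * valid struct task_struct pointer, e.g. from a task iterator program):
+ *
+ * 	__u64 ips[64];
+ * 	long n = bpf_get_task_stack(task, ips, sizeof(ips), 0);
+ *
+ * 	if (n > 0) {
+ * 		// ips[0 .. n/8 - 1] hold the stack's instruction pointers
+ * 	}
+ */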
+
+

+ 80 - 0
vendor/github.com/cilium/ebpf/examples/headers/bpf_helpers.h

@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __BPF_HELPERS__
+#define __BPF_HELPERS__
+
+/*
+ * Note that bpf programs need to include either
+ * vmlinux.h (auto-generated from BTF) or linux/types.h
+ * in advance since bpf_helper_defs.h uses such types
+ * as __u64.
+ */
+#include "bpf_helper_defs.h"
+
+#define __uint(name, val) int (*name)[val]
+#define __type(name, val) typeof(val) *name
+#define __array(name, val) typeof(val) *name[]
+
+/* Helper macro to print out debug messages */
+#define bpf_printk(fmt, ...)				\
+({							\
+	char ____fmt[] = fmt;				\
+	bpf_trace_printk(____fmt, sizeof(____fmt),	\
+			 ##__VA_ARGS__);		\
+})
+
+/*
+ * Helper macro to place programs, maps, license in
+ * different sections in elf_bpf file. Section names
+ * are interpreted by elf_bpf loader
+ */
+#define SEC(NAME) __attribute__((section(NAME), used))
+
+#ifndef __always_inline
+#define __always_inline __attribute__((always_inline))
+#endif
+#ifndef __weak
+#define __weak __attribute__((weak))
+#endif
+
+/*
+ * Helper macro to manipulate data structures
+ */
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER)  __builtin_offsetof(TYPE, MEMBER)
+#endif
+#ifndef container_of
+#define container_of(ptr, type, member)				\
+	({							\
+		void *__mptr = (void *)(ptr);			\
+		((type *)(__mptr - offsetof(type, member)));	\
+	})
+#endif
+
+/*
+ * Helper structure used by eBPF C program
+ * to describe BPF map attributes to libbpf loader
+ */
+struct bpf_map_def {
+	unsigned int type;
+	unsigned int key_size;
+	unsigned int value_size;
+	unsigned int max_entries;
+	unsigned int map_flags;
+};
+
+enum libbpf_pin_type {
+	LIBBPF_PIN_NONE,
+	/* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */
+	LIBBPF_PIN_BY_NAME,
+};
+
+enum libbpf_tristate {
+	TRI_NO = 0,
+	TRI_YES = 1,
+	TRI_MODULE = 2,
+};
+
+#define __kconfig __attribute__((section(".kconfig")))
+#define __ksym __attribute__((section(".ksyms")))
+
+#endif

+ 107 - 0
vendor/github.com/cilium/ebpf/examples/headers/common.h

@@ -0,0 +1,107 @@
+// This is a compact version of `vmlinux.h` to be used in the examples that use C code.
+
+#ifndef __VMLINUX_H__
+#define __VMLINUX_H__
+
+typedef unsigned char __u8;
+typedef short int __s16;
+typedef short unsigned int __u16;
+typedef int __s32;
+typedef unsigned int __u32;
+typedef long long int __s64;
+typedef long long unsigned int __u64;
+typedef __u8 u8;
+typedef __s16 s16;
+typedef __u16 u16;
+typedef __s32 s32;
+typedef __u32 u32;
+typedef __s64 s64;
+typedef __u64 u64;
+typedef __u16 __le16;
+typedef __u16 __be16;
+typedef __u32 __be32;
+typedef __u64 __be64;
+typedef __u32 __wsum;
+
+enum bpf_map_type {
+	BPF_MAP_TYPE_UNSPEC = 0,
+	BPF_MAP_TYPE_HASH = 1,
+	BPF_MAP_TYPE_ARRAY = 2,
+	BPF_MAP_TYPE_PROG_ARRAY = 3,
+	BPF_MAP_TYPE_PERF_EVENT_ARRAY = 4,
+	BPF_MAP_TYPE_PERCPU_HASH = 5,
+	BPF_MAP_TYPE_PERCPU_ARRAY = 6,
+	BPF_MAP_TYPE_STACK_TRACE = 7,
+	BPF_MAP_TYPE_CGROUP_ARRAY = 8,
+	BPF_MAP_TYPE_LRU_HASH = 9,
+	BPF_MAP_TYPE_LRU_PERCPU_HASH = 10,
+	BPF_MAP_TYPE_LPM_TRIE = 11,
+	BPF_MAP_TYPE_ARRAY_OF_MAPS = 12,
+	BPF_MAP_TYPE_HASH_OF_MAPS = 13,
+	BPF_MAP_TYPE_DEVMAP = 14,
+	BPF_MAP_TYPE_SOCKMAP = 15,
+	BPF_MAP_TYPE_CPUMAP = 16,
+	BPF_MAP_TYPE_XSKMAP = 17,
+	BPF_MAP_TYPE_SOCKHASH = 18,
+	BPF_MAP_TYPE_CGROUP_STORAGE = 19,
+	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY = 20,
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE = 21,
+	BPF_MAP_TYPE_QUEUE = 22,
+	BPF_MAP_TYPE_STACK = 23,
+	BPF_MAP_TYPE_SK_STORAGE = 24,
+	BPF_MAP_TYPE_DEVMAP_HASH = 25,
+	BPF_MAP_TYPE_STRUCT_OPS = 26,
+	BPF_MAP_TYPE_RINGBUF = 27,
+	BPF_MAP_TYPE_INODE_STORAGE = 28,
+};
+
+enum {
+	BPF_ANY = 0,
+	BPF_NOEXIST = 1,
+	BPF_EXIST = 2,
+	BPF_F_LOCK = 4,
+};
+
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
+#define BPF_F_INDEX_MASK 0xffffffffULL
+#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK
+
+#define PT_REGS_RC(x) ((x)->rax)
+struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+	unsigned long r15;
+	unsigned long r14;
+	unsigned long r13;
+	unsigned long r12;
+	unsigned long rbp;
+	unsigned long rbx;
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+	unsigned long r11;
+	unsigned long r10;
+	unsigned long r9;
+	unsigned long r8;
+	unsigned long rax;
+	unsigned long rcx;
+	unsigned long rdx;
+	unsigned long rsi;
+	unsigned long rdi;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+	unsigned long orig_rax;
+/* Return frame for iretq */
+	unsigned long rip;
+	unsigned long cs;
+	unsigned long eflags;
+	unsigned long rsp;
+	unsigned long ss;
+/* top of stack page */
+};
+
+#endif /* __VMLINUX_H__ */

+ 26 - 0
vendor/github.com/cilium/ebpf/examples/kprobe/bpf/kprobe_example.c

@@ -0,0 +1,26 @@
+#include "common.h"
+#include "bpf_helpers.h"
+
+char __license[] SEC("license") = "Dual MIT/GPL";
+
+struct bpf_map_def SEC("maps") kprobe_map = {
+    .type = BPF_MAP_TYPE_ARRAY,
+    .key_size = sizeof(u32),
+    .value_size = sizeof(u64),
+    .max_entries = 1,
+};
+
+SEC("kprobe/__x64_sys_execve")
+int kprobe_execve() {
+    u32 key = 0;
+    u64 initval = 1, *valp;
+
+    valp = bpf_map_lookup_elem(&kprobe_map, &key);
+    if (!valp) {
+        bpf_map_update_elem(&kprobe_map, &key, &initval, BPF_ANY);
+        return 0;
+    }
+    __sync_fetch_and_add(valp, 1);
+
+    return 0;
+}

+ 25 - 0
vendor/github.com/cilium/ebpf/examples/uprobe/bpf/uprobe_example.c

@@ -0,0 +1,25 @@
+#include "common.h"
+#include "bpf_helpers.h"
+
+char __license[] SEC("license") = "Dual MIT/GPL";
+
+struct event_t {
+	u32 pid;
+	char str[80];
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+} events SEC(".maps");
+
+SEC("uprobe/bash_readline")
+int uprobe_bash_readline(struct pt_regs *ctx) {
+	struct event_t event;
+
+	event.pid = bpf_get_current_pid_tgid();
+	bpf_probe_read(&event.str, sizeof(event.str), (void *)PT_REGS_RC(ctx));
+
+	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event));
+
+	return 0;
+}

+ 6 - 2
vendor/github.com/cilium/ebpf/go.mod

@@ -1,5 +1,9 @@
 module github.com/cilium/ebpf
 
-go 1.13
+go 1.15
 
-require golang.org/x/sys v0.0.0-20200124204421-9fbb57f87de9
+require (
+	github.com/frankban/quicktest v1.11.3
+	github.com/google/go-cmp v0.5.4
+	golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c
+)

+ 239 - 0
vendor/github.com/cilium/ebpf/info.go

@@ -0,0 +1,239 @@
+package ebpf
+
+import (
+	"bufio"
+	"encoding/hex"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/cilium/ebpf/internal"
+)
+
+// MapInfo describes a map.
+type MapInfo struct {
+	Type       MapType
+	id         MapID
+	KeySize    uint32
+	ValueSize  uint32
+	MaxEntries uint32
+	Flags      uint32
+	// Name as supplied by user space at load time.
+	Name string
+}
+
+func newMapInfoFromFd(fd *internal.FD) (*MapInfo, error) {
+	info, err := bpfGetMapInfoByFD(fd)
+	if errors.Is(err, syscall.EINVAL) {
+		return newMapInfoFromProc(fd)
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	return &MapInfo{
+		MapType(info.map_type),
+		MapID(info.id),
+		info.key_size,
+		info.value_size,
+		info.max_entries,
+		info.map_flags,
+		// name is available from 4.15.
+		internal.CString(info.name[:]),
+	}, nil
+}
+
+func newMapInfoFromProc(fd *internal.FD) (*MapInfo, error) {
+	var mi MapInfo
+	err := scanFdInfo(fd, map[string]interface{}{
+		"map_type":    &mi.Type,
+		"key_size":    &mi.KeySize,
+		"value_size":  &mi.ValueSize,
+		"max_entries": &mi.MaxEntries,
+		"map_flags":   &mi.Flags,
+	})
+	if err != nil {
+		return nil, err
+	}
+	return &mi, nil
+}
+
+// ID returns the map ID.
+//
+// Available from 4.13.
+//
+// The bool return value indicates whether this optional field is available.
+func (mi *MapInfo) ID() (MapID, bool) {
+	return mi.id, mi.id > 0
+}
+
+// programStats holds statistics of a program.
+type programStats struct {
+	// Total accumulated runtime of the program in ns.
+	runtime time.Duration
+	// Total number of times the program was called.
+	runCount uint64
+}
+
+// ProgramInfo describes a program.
+type ProgramInfo struct {
+	Type ProgramType
+	id   ProgramID
+	// Truncated hash of the BPF bytecode.
+	Tag string
+	// Name as supplied by user space at load time.
+	Name string
+
+	stats *programStats
+}
+
+func newProgramInfoFromFd(fd *internal.FD) (*ProgramInfo, error) {
+	info, err := bpfGetProgInfoByFD(fd)
+	if errors.Is(err, syscall.EINVAL) {
+		return newProgramInfoFromProc(fd)
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	return &ProgramInfo{
+		Type: ProgramType(info.prog_type),
+		id:   ProgramID(info.id),
+		// tag is available if the kernel supports BPF_PROG_GET_INFO_BY_FD.
+		Tag: hex.EncodeToString(info.tag[:]),
+		// name is available from 4.15.
+		Name: internal.CString(info.name[:]),
+		stats: &programStats{
+			runtime:  time.Duration(info.run_time_ns),
+			runCount: info.run_cnt,
+		},
+	}, nil
+}
+
+func newProgramInfoFromProc(fd *internal.FD) (*ProgramInfo, error) {
+	var info ProgramInfo
+	err := scanFdInfo(fd, map[string]interface{}{
+		"prog_type": &info.Type,
+		"prog_tag":  &info.Tag,
+	})
+	if errors.Is(err, errMissingFields) {
+		return nil, &internal.UnsupportedFeatureError{
+			Name:           "reading program info from /proc/self/fdinfo",
+			MinimumVersion: internal.Version{4, 10, 0},
+		}
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	return &info, nil
+}
+
+// ID returns the program ID.
+//
+// Available from 4.13.
+//
+// The bool return value indicates whether this optional field is available.
+func (pi *ProgramInfo) ID() (ProgramID, bool) {
+	return pi.id, pi.id > 0
+}
+
+// RunCount returns the total number of times the program was called.
+//
+// Can return 0 if the collection of statistics is not enabled. See EnableStats().
+// The bool return value indicates whether this optional field is available.
+func (pi *ProgramInfo) RunCount() (uint64, bool) {
+	if pi.stats != nil {
+		return pi.stats.runCount, true
+	}
+	return 0, false
+}
+
+// Runtime returns the total accumulated runtime of the program.
+//
+// Can return 0 if the collection of statistics is not enabled. See EnableStats().
+// The bool return value indicates whether this optional field is available.
+func (pi *ProgramInfo) Runtime() (time.Duration, bool) {
+	if pi.stats != nil {
+		return pi.stats.runtime, true
+	}
+	return time.Duration(0), false
+}
+
+func scanFdInfo(fd *internal.FD, fields map[string]interface{}) error {
+	raw, err := fd.Value()
+	if err != nil {
+		return err
+	}
+
+	fh, err := os.Open(fmt.Sprintf("/proc/self/fdinfo/%d", raw))
+	if err != nil {
+		return err
+	}
+	defer fh.Close()
+
+	if err := scanFdInfoReader(fh, fields); err != nil {
+		return fmt.Errorf("%s: %w", fh.Name(), err)
+	}
+	return nil
+}
+
+var errMissingFields = errors.New("missing fields")
+
+func scanFdInfoReader(r io.Reader, fields map[string]interface{}) error {
+	var (
+		scanner = bufio.NewScanner(r)
+		scanned int
+	)
+
+	for scanner.Scan() {
+		parts := strings.SplitN(scanner.Text(), "\t", 2)
+		if len(parts) != 2 {
+			continue
+		}
+
+		name := strings.TrimSuffix(parts[0], ":")
+		field, ok := fields[string(name)]
+		if !ok {
+			continue
+		}
+
+		if n, err := fmt.Sscanln(parts[1], field); err != nil || n != 1 {
+			return fmt.Errorf("can't parse field %s: %v", name, err)
+		}
+
+		scanned++
+	}
+
+	if err := scanner.Err(); err != nil {
+		return err
+	}
+
+	if scanned != len(fields) {
+		return errMissingFields
+	}
+
+	return nil
+}
+
+// EnableStats starts the measuring of the runtime
+// and run counts of eBPF programs.
+//
+// Collecting statistics can have an impact on performance.
+//
+// Requires at least 5.8.
+func EnableStats(which uint32) (io.Closer, error) {
+	attr := internal.BPFEnableStatsAttr{
+		StatsType: which,
+	}
+
+	fd, err := internal.BPFEnableStats(&attr)
+	if err != nil {
+		return nil, err
+	}
+	return fd, nil
+}

+ 175 - 100
vendor/github.com/cilium/ebpf/internal/btf/btf.go

@@ -29,12 +29,14 @@ var (
 
 // Spec represents decoded BTF.
 type Spec struct {
-	rawTypes  []rawType
-	strings   stringTable
-	types     map[string][]Type
-	funcInfos map[string]extInfo
-	lineInfos map[string]extInfo
-	byteOrder binary.ByteOrder
+	rawTypes   []rawType
+	strings    stringTable
+	types      []Type
+	namedTypes map[string][]namedType
+	funcInfos  map[string]extInfo
+	lineInfos  map[string]extInfo
+	coreRelos  map[string]bpfCoreRelos
+	byteOrder  binary.ByteOrder
 }
 
 type btfHeader struct {
@@ -53,35 +55,15 @@ type btfHeader struct {
 //
 // Returns a nil Spec and no error if no BTF was present.
 func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) {
-	file, err := elf.NewFile(rd)
+	file, err := internal.NewSafeELFFile(rd)
 	if err != nil {
 		return nil, err
 	}
 	defer file.Close()
 
-	var (
-		btfSection    *elf.Section
-		btfExtSection *elf.Section
-		sectionSizes  = make(map[string]uint32)
-	)
-
-	for _, sec := range file.Sections {
-		switch sec.Name {
-		case ".BTF":
-			btfSection = sec
-		case ".BTF.ext":
-			btfExtSection = sec
-		default:
-			if sec.Type != elf.SHT_PROGBITS && sec.Type != elf.SHT_NOBITS {
-				break
-			}
-
-			if sec.Size > math.MaxUint32 {
-				return nil, fmt.Errorf("section %s exceeds maximum size", sec.Name)
-			}
-
-			sectionSizes[sec.Name] = uint32(sec.Size)
-		}
+	btfSection, btfExtSection, sectionSizes, err := findBtfSections(file)
+	if err != nil {
+		return nil, err
 	}
 
 	if btfSection == nil {
@@ -100,6 +82,10 @@ func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) {
 			continue
 		}
 
+		if int(symbol.Section) >= len(file.Sections) {
+			return nil, fmt.Errorf("symbol %s: invalid section %d", symbol.Name, symbol.Section)
+		}
+
 		secName := file.Sections[symbol.Section].Name
 		if _, ok := sectionSizes[secName]; !ok {
 			continue
 			continue
@@ -121,7 +107,7 @@ func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) {
 		return spec, nil
 		return spec, nil
 	}
 	}
 
 
-	spec.funcInfos, spec.lineInfos, err = parseExtInfos(btfExtSection.Open(), file.ByteOrder, spec.strings)
+	spec.funcInfos, spec.lineInfos, spec.coreRelos, err = parseExtInfos(btfExtSection.Open(), file.ByteOrder, spec.strings)
 	if err != nil {
 	if err != nil {
 		return nil, fmt.Errorf("can't read ext info: %w", err)
 		return nil, fmt.Errorf("can't read ext info: %w", err)
 	}
 	}
@@ -129,6 +115,51 @@ func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) {
 	return spec, nil
 	return spec, nil
 }
 }
 
 
+func findBtfSections(file *internal.SafeELFFile) (*elf.Section, *elf.Section, map[string]uint32, error) {
+	var (
+		btfSection    *elf.Section
+		btfExtSection *elf.Section
+		sectionSizes  = make(map[string]uint32)
+	)
+
+	for _, sec := range file.Sections {
+		switch sec.Name {
+		case ".BTF":
+			btfSection = sec
+		case ".BTF.ext":
+			btfExtSection = sec
+		default:
+			if sec.Type != elf.SHT_PROGBITS && sec.Type != elf.SHT_NOBITS {
+				break
+			}
+
+			if sec.Size > math.MaxUint32 {
+				return nil, nil, nil, fmt.Errorf("section %s exceeds maximum size", sec.Name)
+			}
+
+			sectionSizes[sec.Name] = uint32(sec.Size)
+		}
+	}
+	return btfSection, btfExtSection, sectionSizes, nil
+}
+
+func loadSpecFromVmlinux(rd io.ReaderAt) (*Spec, error) {
+	file, err := internal.NewSafeELFFile(rd)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	btfSection, _, _, err := findBtfSections(file)
+	if err != nil {
+		return nil, fmt.Errorf(".BTF ELF section: %s", err)
+	}
+	if btfSection == nil {
+		return nil, fmt.Errorf("unable to find .BTF ELF section")
+	}
+	return loadNakedSpec(btfSection.Open(), file.ByteOrder, nil, nil)
+}
+
 func loadNakedSpec(btf io.ReadSeeker, bo binary.ByteOrder, sectionSizes map[string]uint32, variableOffsets map[variable]uint32) (*Spec, error) {
 func loadNakedSpec(btf io.ReadSeeker, bo binary.ByteOrder, sectionSizes map[string]uint32, variableOffsets map[variable]uint32) (*Spec, error) {
 	rawTypes, rawStrings, err := parseBTF(btf, bo)
 	rawTypes, rawStrings, err := parseBTF(btf, bo)
 	if err != nil {
 	if err != nil {
@@ -140,16 +171,17 @@ func loadNakedSpec(btf io.ReadSeeker, bo binary.ByteOrder, sectionSizes map[stri
 		return nil, err
 		return nil, err
 	}
 	}
 
 
-	types, err := inflateRawTypes(rawTypes, rawStrings)
+	types, typesByName, err := inflateRawTypes(rawTypes, rawStrings)
 	if err != nil {
 	if err != nil {
 		return nil, err
 		return nil, err
 	}
 	}
 
 
 	return &Spec{
 	return &Spec{
-		rawTypes:  rawTypes,
-		types:     types,
-		strings:   rawStrings,
-		byteOrder: bo,
+		rawTypes:   rawTypes,
+		namedTypes: typesByName,
+		types:      types,
+		strings:    rawStrings,
+		byteOrder:  bo,
 	}, nil
 	}, nil
 }
 }
 
 
@@ -176,16 +208,43 @@ func LoadKernelSpec() (*Spec, error) {
 }
 }
 
 
 func loadKernelSpec() (*Spec, error) {
 func loadKernelSpec() (*Spec, error) {
+	release, err := unix.KernelRelease()
+	if err != nil {
+		return nil, fmt.Errorf("can't read kernel release number: %w", err)
+	}
+
 	fh, err := os.Open("/sys/kernel/btf/vmlinux")
 	fh, err := os.Open("/sys/kernel/btf/vmlinux")
-	if os.IsNotExist(err) {
-		return nil, fmt.Errorf("can't open kernel BTF at /sys/kernel/btf/vmlinux: %w", ErrNotFound)
+	if err == nil {
+		defer fh.Close()
+
+		return loadNakedSpec(fh, internal.NativeEndian, nil, nil)
 	}
 	}
-	if err != nil {
-		return nil, fmt.Errorf("can't read kernel BTF: %s", err)
+
+	// use same list of locations as libbpf
+	// https://github.com/libbpf/libbpf/blob/9a3a42608dbe3731256a5682a125ac1e23bced8f/src/btf.c#L3114-L3122
+	locations := []string{
+		"/boot/vmlinux-%s",
+		"/lib/modules/%s/vmlinux-%[1]s",
+		"/lib/modules/%s/build/vmlinux",
+		"/usr/lib/modules/%s/kernel/vmlinux",
+		"/usr/lib/debug/boot/vmlinux-%s",
+		"/usr/lib/debug/boot/vmlinux-%s.debug",
+		"/usr/lib/debug/lib/modules/%s/vmlinux",
 	}
 	}
-	defer fh.Close()
 
 
-	return loadNakedSpec(fh, internal.NativeEndian, nil, nil)
+	for _, loc := range locations {
+		path := fmt.Sprintf(loc, release)
+
+		fh, err := os.Open(path)
+		if err != nil {
+			continue
+		}
+		defer fh.Close()
+
+		return loadSpecFromVmlinux(fh)
+	}
+
+	return nil, fmt.Errorf("no BTF for kernel version %s: %w", release, internal.ErrNotSupported)
 }
 }
 
 
 func parseBTF(btf io.ReadSeeker, bo binary.ByteOrder) ([]rawType, stringTable, error) {
@@ -259,10 +318,14 @@ func fixupDatasec(rawTypes []rawType, rawStrings stringTable, sectionSizes map[s
 			return err
 		}
 
-		if name == ".kconfig" || name == ".ksym" {
+		if name == ".kconfig" || name == ".ksyms" {
 			return fmt.Errorf("reference to %s: %w", name, ErrNotSupported)
 		}
 
+		if rawTypes[i].SizeType != 0 {
+			continue
+		}
+
 		size, ok := sectionSizes[name]
 		if !ok {
 			return fmt.Errorf("data section %s: missing size", name)
@@ -369,54 +432,19 @@ func (s *Spec) Program(name string, length uint64) (*Program, error) {
 		return nil, errors.New("length mustn't be zero")
 	}
 
-	if s.funcInfos == nil && s.lineInfos == nil {
+	if s.funcInfos == nil && s.lineInfos == nil && s.coreRelos == nil {
 		return nil, fmt.Errorf("BTF for section %s: %w", name, ErrNoExtendedInfo)
 	}
 
 	funcInfos, funcOK := s.funcInfos[name]
 	lineInfos, lineOK := s.lineInfos[name]
+	coreRelos, coreOK := s.coreRelos[name]
 
-	if !funcOK && !lineOK {
+	if !funcOK && !lineOK && !coreOK {
 		return nil, fmt.Errorf("no extended BTF info for section %s", name)
 	}
 
-	return &Program{s, length, funcInfos, lineInfos}, nil
-}
-
-// Map finds the BTF for a map.
-//
-// Returns an error if there is no BTF for the given name.
-func (s *Spec) Map(name string) (*Map, []Member, error) {
-	var mapVar Var
-	if err := s.FindType(name, &mapVar); err != nil {
-		return nil, nil, err
-	}
-
-	mapStruct, ok := mapVar.Type.(*Struct)
-	if !ok {
-		return nil, nil, fmt.Errorf("expected struct, have %s", mapVar.Type)
-	}
-
-	var key, value Type
-	for _, member := range mapStruct.Members {
-		switch member.Name {
-		case "key":
-			key = member.Type
-
-		case "value":
-			value = member.Type
-		}
-	}
-
-	if key == nil {
-		key = (*Void)(nil)
-	}
-
-	if value == nil {
-		value = (*Void)(nil)
-	}
-
-	return &Map{s, key, value}, mapStruct.Members, nil
+	return &Program{s, length, funcInfos, lineInfos, coreRelos}, nil
 }
 
 // Datasec returns the BTF required to create maps which represent data sections.
@@ -426,7 +454,8 @@ func (s *Spec) Datasec(name string) (*Map, error) {
 		return nil, fmt.Errorf("data section %s: can't get BTF: %w", name, err)
 	}
 
-	return &Map{s, &Void{}, &datasec}, nil
+	m := NewMap(s, &Void{}, &datasec)
+	return &m, nil
 }
 
 // FindType searches for a type with a specific name.
@@ -441,11 +470,16 @@ func (s *Spec) FindType(name string, typ Type) error {
 		candidate Type
 	)
 
-	for _, typ := range s.types[name] {
+	for _, typ := range s.namedTypes[essentialName(name)] {
 		if reflect.TypeOf(typ) != wanted {
 			continue
 		}
 
+		// Match against the full name, not just the essential one.
+		if typ.name() != name {
+			continue
+		}
+
 		if candidate != nil {
 			return fmt.Errorf("type %s: multiple candidates for %T", name, typ)
 		}
@@ -532,6 +566,23 @@ type Map struct {
 	key, value Type
 }
 
+// NewMap returns a new Map containing the given values.
+// The key and value arguments are initialized to Void if nil values are given.
+func NewMap(spec *Spec, key Type, value Type) Map {
+	if key == nil {
+		key = &Void{}
+	}
+	if value == nil {
+		value = &Void{}
+	}
+
+	return Map{
+		spec:  spec,
+		key:   key,
+		value: value,
+	}
+}
+
 // MapSpec should be a method on Map, but is a free function
 // to hide it from users of the ebpf package.
 func MapSpec(m *Map) *Spec {
@@ -555,6 +606,7 @@ type Program struct {
 	spec                 *Spec
 	length               uint64
 	funcInfos, lineInfos extInfo
+	coreRelos            bpfCoreRelos
 }
 
 // ProgramSpec returns the Spec needed for loading function and line infos into the kernel.
@@ -580,9 +632,10 @@ func ProgramAppend(s, other *Program) error {
 		return fmt.Errorf("line infos: %w", err)
 	}
 
-	s.length += other.length
 	s.funcInfos = funcInfos
 	s.lineInfos = lineInfos
+	s.coreRelos = s.coreRelos.append(other.coreRelos, s.length)
+	s.length += other.length
 	return nil
 }
 
@@ -612,6 +665,19 @@ func ProgramLineInfos(s *Program) (recordSize uint32, bytes []byte, err error) {
 	return s.lineInfos.recordSize, bytes, nil
 }
 
+// ProgramRelocations returns the CO-RE relocations required to adjust the
+// program to the target.
+//
+// This is a free function instead of a method to hide it from users
+// of package ebpf.
+func ProgramRelocations(s *Program, target *Spec) (map[uint64]Relocation, error) {
+	if len(s.coreRelos) == 0 {
+		return nil, nil
+	}
+
+	return coreRelocate(s.spec, target, s.coreRelos)
+}
+
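A minimal usage sketch for ProgramRelocations (the variable names are invented for illustration; per coreRelocate, a nil target falls back to the running kernel's Spec):

	// progBTF is a *btf.Program obtained from Spec.Program.
	relos, err := btf.ProgramRelocations(progBTF, nil)
	if err != nil {
		// relocating against the kernel BTF failed
	}
	_ = relos // map of instruction offset -> Relocation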
 type bpfLoadBTFAttr struct {
 	btf         internal.Pointer
 	logBuf      internal.Pointer
@@ -621,9 +687,7 @@ type bpfLoadBTFAttr struct {
 }
 
 func bpfLoadBTF(attr *bpfLoadBTFAttr) (*internal.FD, error) {
-	const _BTFLoad = 18
-
-	fd, err := internal.BPF(_BTFLoad, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
+	fd, err := internal.BPF(internal.BPF_BTF_LOAD, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
 	if err != nil {
 		return nil, err
 	}
@@ -653,7 +717,7 @@ func marshalBTF(types interface{}, strings []byte, bo binary.ByteOrder) []byte {
 	return buf.Bytes()
 }
 
-var haveBTF = internal.FeatureTest("BTF", "5.1", func() (bool, error) {
+var haveBTF = internal.FeatureTest("BTF", "5.1", func() error {
 	var (
 		types struct {
 			Integer btfType
@@ -677,15 +741,24 @@ var haveBTF = internal.FeatureTest("BTF", "5.1", func() (bool, error) {
 		btf:     internal.NewSlicePointer(btf),
 		btfSize: uint32(len(btf)),
 	})
-	if err == nil {
-		fd.Close()
+	if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) {
+		// Treat both EINVAL and EPERM as not supported: loading the program
+		// might still succeed without BTF.
+		return internal.ErrNotSupported
+	}
+	if err != nil {
+		return err
 	}
-	// Check for EINVAL specifically, rather than err != nil since we
-	// otherwise misdetect due to insufficient permissions.
-	return !errors.Is(err, unix.EINVAL), nil
+
+	fd.Close()
+	return nil
 })
 
-var haveFuncLinkage = internal.FeatureTest("BTF func linkage", "5.6", func() (bool, error) {
+var haveFuncLinkage = internal.FeatureTest("BTF func linkage", "5.6", func() error {
+	if err := haveBTF(); err != nil {
+		return err
+	}
+
 	var (
 		types struct {
 			FuncProto btfType
@@ -706,11 +779,13 @@ var haveFuncLinkage = internal.FeatureTest("BTF func linkage", "5.6", func() (bo
 		btf:     internal.NewSlicePointer(btf),
 		btfSize: uint32(len(btf)),
 	})
-	if err == nil {
-		fd.Close()
+	if errors.Is(err, unix.EINVAL) {
+		return internal.ErrNotSupported
+	}
+	if err != nil {
+		return err
 	}
 
-	// Check for EINVAL specifically, rather than err != nil since we
-	// otherwise misdetect due to insufficient permissions.
-	return !errors.Is(err, unix.EINVAL), nil
+	fd.Close()
+	return nil
 })
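To illustrate the vmlinux fallback in loadKernelSpec above: when /sys/kernel/btf/vmlinux is absent, each location pattern is expanded with the kernel release string before probing. A self-contained sketch (the release value is hypothetical):

	package main

	import "fmt"

	func main() {
		release := "5.10.0-8-amd64" // as returned by unix.KernelRelease()
		// %[1]s re-uses the first argument, so one release string can
		// appear twice in a single pattern.
		fmt.Println(fmt.Sprintf("/boot/vmlinux-%s", release))
		fmt.Println(fmt.Sprintf("/lib/modules/%s/vmlinux-%[1]s", release))
		// Output:
		// /boot/vmlinux-5.10.0-8-amd64
		// /lib/modules/5.10.0-8-amd64/vmlinux-5.10.0-8-amd64
	}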

+ 17 - 5
vendor/github.com/cilium/ebpf/internal/btf/btf_types.go

@@ -31,19 +31,23 @@ const (
 	kindDatasec
 )
 
+// btfFuncLinkage describes BTF function linkage metadata.
 type btfFuncLinkage uint8
 
+// Equivalent of enum btf_func_linkage.
 const (
 	linkageStatic btfFuncLinkage = iota
 	linkageGlobal
-	linkageExtern
+	// linkageExtern // Currently unused in libbpf.
 )
 
 const (
-	btfTypeKindShift = 24
-	btfTypeKindLen   = 4
-	btfTypeVlenShift = 0
-	btfTypeVlenMask  = 16
+	btfTypeKindShift     = 24
+	btfTypeKindLen       = 4
+	btfTypeVlenShift     = 0
+	btfTypeVlenMask      = 16
+	btfTypeKindFlagShift = 31
+	btfTypeKindFlagMask  = 1
 )
 
 // btfType is equivalent to struct btf_type in Documentation/bpf/btf.rst.
@@ -136,6 +140,10 @@ func (bt *btfType) SetVlen(vlen int) {
 	bt.setInfo(uint32(vlen), btfTypeVlenMask, btfTypeVlenShift)
 }
 
+func (bt *btfType) KindFlag() bool {
+	return bt.info(btfTypeKindFlagMask, btfTypeKindFlagShift) == 1
+}
+
 func (bt *btfType) Linkage() btfFuncLinkage {
 	return btfFuncLinkage(bt.info(btfTypeVlenMask, btfTypeVlenShift))
 }
@@ -257,3 +265,7 @@ func readTypes(r io.Reader, bo binary.ByteOrder) ([]rawType, error) {
 		types = append(types, rawType{header, data})
 	}
 }
+
+func intEncoding(raw uint32) (IntEncoding, uint32, byte) {
+	return IntEncoding((raw & 0x0f000000) >> 24), (raw & 0x00ff0000) >> 16, byte(raw & 0x000000ff)
+}
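intEncoding unpacks the BTF_KIND_INT payload: the encoding sits in bits 24-27, the starting bit offset in bits 16-23, and the width in bits in the low byte. A hand-checked example (the raw value is made up):

	// 0x01000020: encoding = 1 (Signed), offset = 0, bits = 32,
	// i.e. a plain signed 32-bit integer.
	encoding, offset, bits := intEncoding(0x01000020)
	_, _, _ = encoding, offset, bits // Signed, 0, 32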

+ 388 - 0
vendor/github.com/cilium/ebpf/internal/btf/core.go

@@ -0,0 +1,388 @@
+package btf
+
+import (
+	"errors"
+	"fmt"
+	"reflect"
+	"strconv"
+	"strings"
+)
+
+// Code in this file is derived from libbpf, which is available under a BSD
+// 2-Clause license.
+
+// Relocation describes a CO-RE relocation.
+type Relocation struct {
+	Current uint32
+	New     uint32
+}
+
+func (r Relocation) equal(other Relocation) bool {
+	return r.Current == other.Current && r.New == other.New
+}
+
+// coreReloKind is the type of CO-RE relocation
+type coreReloKind uint32
+
+const (
+	reloFieldByteOffset coreReloKind = iota /* field byte offset */
+	reloFieldByteSize                       /* field size in bytes */
+	reloFieldExists                         /* field existence in target kernel */
+	reloFieldSigned                         /* field signedness (0 - unsigned, 1 - signed) */
+	reloFieldLShiftU64                      /* bitfield-specific left bitshift */
+	reloFieldRShiftU64                      /* bitfield-specific right bitshift */
+	reloTypeIDLocal                         /* type ID in local BPF object */
+	reloTypeIDTarget                        /* type ID in target kernel */
+	reloTypeExists                          /* type existence in target kernel */
+	reloTypeSize                            /* type size in bytes */
+	reloEnumvalExists                       /* enum value existence in target kernel */
+	reloEnumvalValue                        /* enum value integer value */
+)
+
+func (k coreReloKind) String() string {
+	switch k {
+	case reloFieldByteOffset:
+		return "byte_off"
+	case reloFieldByteSize:
+		return "byte_sz"
+	case reloFieldExists:
+		return "field_exists"
+	case reloFieldSigned:
+		return "signed"
+	case reloFieldLShiftU64:
+		return "lshift_u64"
+	case reloFieldRShiftU64:
+		return "rshift_u64"
+	case reloTypeIDLocal:
+		return "local_type_id"
+	case reloTypeIDTarget:
+		return "target_type_id"
+	case reloTypeExists:
+		return "type_exists"
+	case reloTypeSize:
+		return "type_size"
+	case reloEnumvalExists:
+		return "enumval_exists"
+	case reloEnumvalValue:
+		return "enumval_value"
+	default:
+		return "unknown"
+	}
+}
+
+func coreRelocate(local, target *Spec, coreRelos bpfCoreRelos) (map[uint64]Relocation, error) {
+	if target == nil {
+		var err error
+		target, err = loadKernelSpec()
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	if local.byteOrder != target.byteOrder {
+		return nil, fmt.Errorf("can't relocate %s against %s", local.byteOrder, target.byteOrder)
+	}
+
+	relocations := make(map[uint64]Relocation, len(coreRelos))
+	for _, relo := range coreRelos {
+		accessorStr, err := local.strings.Lookup(relo.AccessStrOff)
+		if err != nil {
+			return nil, err
+		}
+
+		accessor, err := parseCoreAccessor(accessorStr)
+		if err != nil {
+			return nil, fmt.Errorf("accessor %q: %s", accessorStr, err)
+		}
+
+		if int(relo.TypeID) >= len(local.types) {
+			return nil, fmt.Errorf("invalid type id %d", relo.TypeID)
+		}
+
+		typ := local.types[relo.TypeID]
+
+		if relo.ReloKind == reloTypeIDLocal {
+			relocations[uint64(relo.InsnOff)] = Relocation{
+				uint32(typ.ID()),
+				uint32(typ.ID()),
+			}
+			continue
+		}
+
+		named, ok := typ.(namedType)
+		if !ok || named.name() == "" {
+			return nil, fmt.Errorf("relocate anonymous type %s: %w", typ.String(), ErrNotSupported)
+		}
+
+		name := essentialName(named.name())
+		res, err := coreCalculateRelocation(typ, target.namedTypes[name], relo.ReloKind, accessor)
+		if err != nil {
+			return nil, fmt.Errorf("relocate %s: %w", name, err)
+		}
+
+		relocations[uint64(relo.InsnOff)] = res
+	}
+
+	return relocations, nil
+}
+
+var errAmbiguousRelocation = errors.New("ambiguous relocation")
+
+func coreCalculateRelocation(local Type, targets []namedType, kind coreReloKind, localAccessor coreAccessor) (Relocation, error) {
+	var relos []Relocation
+	var matches []Type
+	for _, target := range targets {
+		switch kind {
+		case reloTypeIDTarget:
+			if localAccessor[0] != 0 {
+				return Relocation{}, fmt.Errorf("%s: unexpected non-zero accessor", kind)
+			}
+
+			if compat, err := coreAreTypesCompatible(local, target); err != nil {
+				return Relocation{}, fmt.Errorf("%s: %s", kind, err)
+			} else if !compat {
+				continue
+			}
+
+			relos = append(relos, Relocation{uint32(target.ID()), uint32(target.ID())})
+
+		default:
+			return Relocation{}, fmt.Errorf("relocation %s: %w", kind, ErrNotSupported)
+		}
+		matches = append(matches, target)
+	}
+
+	if len(relos) == 0 {
+		// TODO: Add switch for existence checks like reloEnumvalExists here.
+
+		// TODO: This might have to be poisoned.
+		return Relocation{}, fmt.Errorf("no relocation found, tried %v", targets)
+	}
+
+	relo := relos[0]
+	for _, altRelo := range relos[1:] {
+		if !altRelo.equal(relo) {
+			return Relocation{}, fmt.Errorf("multiple types %v match: %w", matches, errAmbiguousRelocation)
+		}
+	}
+
+	return relo, nil
+}
+
+/* coreAccessor contains a path through a struct. It contains at least one index.
+ *
+ * The interpretation depends on the kind of the relocation. The following is
+ * taken from struct bpf_core_relo in libbpf_internal.h:
+ *
+ * - for field-based relocations, string encodes an accessed field using
+ *   a sequence of field and array indices, separated by colon (:). It's
+ *   conceptually very close to LLVM's getelementptr ([0]) instruction's
+ *   arguments for identifying offset to a field.
+ * - for type-based relocations, strings is expected to be just "0";
+ * - for enum value-based relocations, string contains an index of enum
+ *   value within its enum type;
+ *
+ * Example to provide a better feel.
+ *
+ *   struct sample {
+ *       int a;
+ *       struct {
+ *           int b[10];
+ *       };
+ *   };
+ *
+ *   struct sample s = ...;
+ *   int x = &s->a;     // encoded as "0:0" (a is field #0)
+ *   int y = &s->b[5];  // encoded as "0:1:0:5" (anon struct is field #1,
+ *                      // b is field #0 inside anon struct, accessing elem #5)
+ *   int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array)
+ */
+type coreAccessor []int
+
+func parseCoreAccessor(accessor string) (coreAccessor, error) {
+	if accessor == "" {
+		return nil, fmt.Errorf("empty accessor")
+	}
+
+	var result coreAccessor
+	parts := strings.Split(accessor, ":")
+	for _, part := range parts {
+		// 31 bits to avoid overflowing int on 32 bit platforms.
+		index, err := strconv.ParseUint(part, 10, 31)
+		if err != nil {
+			return nil, fmt.Errorf("accessor index %q: %s", part, err)
+		}
+
+		result = append(result, int(index))
+	}
+
+	return result, nil
+}
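For illustration, the accessor grammar described above parses as follows (values hand-computed):

	parseCoreAccessor("0:1:0:5") // coreAccessor{0, 1, 0, 5}, the &s->b[5] case
	parseCoreAccessor("0")       // coreAccessor{0}, as used by type-based relocations
	parseCoreAccessor("")        // error: empty accessor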
+
+/* The comment below is from bpf_core_types_are_compat in libbpf.c:
+ *
+ * Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follow slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ *   - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
+ *     kind should match for local and target types (i.e., STRUCT is not
+ *     compatible with UNION);
+ *   - for ENUMs, the size is ignored;
+ *   - for INT, size and signedness are ignored;
+ *   - for ARRAY, dimensionality is ignored, element types are checked for
+ *     compatibility recursively;
+ *   - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ *   - TYPEDEFs/PTRs are compatible if types they pointing to are compatible;
+ *   - FUNC_PROTOs are compatible if they have compatible signature: same
+ *     number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
+func coreAreTypesCompatible(localType Type, targetType Type) (bool, error) {
+	var (
+		localTs, targetTs typeDeque
+		l, t              = &localType, &targetType
+		depth             = 0
+	)
+
+	for ; l != nil && t != nil; l, t = localTs.shift(), targetTs.shift() {
+		if depth >= maxTypeDepth {
+			return false, errors.New("types are nested too deep")
+		}
+
+		localType = skipQualifierAndTypedef(*l)
+		targetType = skipQualifierAndTypedef(*t)
+
+		if reflect.TypeOf(localType) != reflect.TypeOf(targetType) {
+			return false, nil
+		}
+
+		switch lv := (localType).(type) {
+		case *Void, *Struct, *Union, *Enum, *Fwd:
+			// Nothing to do here
+
+		case *Int:
+			tv := targetType.(*Int)
+			if lv.isBitfield() || tv.isBitfield() {
+				return false, nil
+			}
+
+		case *Pointer, *Array:
+			depth++
+			localType.walk(&localTs)
+			targetType.walk(&targetTs)
+
+		case *FuncProto:
+			tv := targetType.(*FuncProto)
+			if len(lv.Params) != len(tv.Params) {
+				return false, nil
+			}
+
+			depth++
+			localType.walk(&localTs)
+			targetType.walk(&targetTs)
+
+		default:
+			return false, fmt.Errorf("unsupported type %T", localType)
+		}
+	}
+
+	if l != nil {
+		return false, fmt.Errorf("dangling local type %T", *l)
+	}
+
+	if t != nil {
+		return false, fmt.Errorf("dangling target type %T", *t)
+	}
+
+	return true, nil
+}
+
+/* The comment below is from bpf_core_fields_are_compat in libbpf.c:
+ *
+ * Check two types for compatibility for the purpose of field access
+ * relocation. const/volatile/restrict and typedefs are skipped to ensure we
+ * are relocating semantically compatible entities:
+ *   - any two STRUCTs/UNIONs are compatible and can be mixed;
+ *   - any two FWDs are compatible, if their names match (modulo flavor suffix);
+ *   - any two PTRs are always compatible;
+ *   - for ENUMs, names should be the same (ignoring flavor suffix) or at
+ *     least one of enums should be anonymous;
+ *   - for ENUMs, check sizes, names are ignored;
+ *   - for INT, size and signedness are ignored;
+ *   - for ARRAY, dimensionality is ignored, element types are checked for
+ *     compatibility recursively;
+ *   - everything else shouldn't be ever a target of relocation.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
+func coreAreMembersCompatible(localType Type, targetType Type) (bool, error) {
+	doNamesMatch := func(a, b string) bool {
+		if a == "" || b == "" {
+			// allow anonymous and named type to match
+			return true
+		}
+
+		return essentialName(a) == essentialName(b)
+	}
+
+	for depth := 0; depth <= maxTypeDepth; depth++ {
+		localType = skipQualifierAndTypedef(localType)
+		targetType = skipQualifierAndTypedef(targetType)
+
+		_, lok := localType.(composite)
+		_, tok := targetType.(composite)
+		if lok && tok {
+			return true, nil
+		}
+
+		if reflect.TypeOf(localType) != reflect.TypeOf(targetType) {
+			return false, nil
+		}
+
+		switch lv := localType.(type) {
+		case *Pointer:
+			return true, nil
+
+		case *Enum:
+			tv := targetType.(*Enum)
+			return doNamesMatch(lv.name(), tv.name()), nil
+
+		case *Fwd:
+			tv := targetType.(*Fwd)
+			return doNamesMatch(lv.name(), tv.name()), nil
+
+		case *Int:
+			tv := targetType.(*Int)
+			return !lv.isBitfield() && !tv.isBitfield(), nil
+
+		case *Array:
+			tv := targetType.(*Array)
+
+			localType = lv.Type
+			targetType = tv.Type
+
+		default:
+			return false, fmt.Errorf("unsupported type %T", localType)
+		}
+	}
+
+	return false, errors.New("types are nested too deep")
+}
+
+func skipQualifierAndTypedef(typ Type) Type {
+	result := typ
+	for depth := 0; depth <= maxTypeDepth; depth++ {
+		switch v := (result).(type) {
+		case qualifier:
+			result = v.qualify()
+		case *Typedef:
+			result = v.Type
+		default:
+			return result
+		}
+	}
+	return typ
+}
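A minimal sketch of what skipQualifierAndTypedef does, built from the Type implementations in types.go (the IDs and names here are invented for the example):

	u64 := &Int{TypeID: 1, Name: "u64", Size: 8}
	wrapped := &Typedef{TypeID: 4, Name: "cv_u64",
		Type: &Const{TypeID: 3, Type: &Volatile{TypeID: 2, Type: u64}}}

	// Typedef, Const and Volatile are all peeled off, yielding the *Int.
	_ = skipQualifierAndTypedef(wrapped) // returns u64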

+ 126 - 27
vendor/github.com/cilium/ebpf/internal/btf/ext_info.go

@@ -1,6 +1,7 @@
 package btf
 
 import (
+	"bufio"
 	"bytes"
 	"encoding/binary"
 	"errors"
@@ -24,55 +25,82 @@ type btfExtHeader struct {
 	LineInfoLen uint32
 }
 
-func parseExtInfos(r io.ReadSeeker, bo binary.ByteOrder, strings stringTable) (funcInfo, lineInfo map[string]extInfo, err error) {
+type btfExtCoreHeader struct {
+	CoreReloOff uint32
+	CoreReloLen uint32
+}
+
+func parseExtInfos(r io.ReadSeeker, bo binary.ByteOrder, strings stringTable) (funcInfo, lineInfo map[string]extInfo, coreRelos map[string]bpfCoreRelos, err error) {
 	var header btfExtHeader
+	var coreHeader btfExtCoreHeader
 	if err := binary.Read(r, bo, &header); err != nil {
-		return nil, nil, fmt.Errorf("can't read header: %v", err)
+		return nil, nil, nil, fmt.Errorf("can't read header: %v", err)
 	}
 
 	if header.Magic != btfMagic {
-		return nil, nil, fmt.Errorf("incorrect magic value %v", header.Magic)
+		return nil, nil, nil, fmt.Errorf("incorrect magic value %v", header.Magic)
 	}
 
 	if header.Version != 1 {
-		return nil, nil, fmt.Errorf("unexpected version %v", header.Version)
+		return nil, nil, nil, fmt.Errorf("unexpected version %v", header.Version)
 	}
 
 	if header.Flags != 0 {
-		return nil, nil, fmt.Errorf("unsupported flags %v", header.Flags)
+		return nil, nil, nil, fmt.Errorf("unsupported flags %v", header.Flags)
 	}
 
 	remainder := int64(header.HdrLen) - int64(binary.Size(&header))
 	if remainder < 0 {
-		return nil, nil, errors.New("header is too short")
+		return nil, nil, nil, errors.New("header is too short")
+	}
+
+	coreHdrSize := int64(binary.Size(&coreHeader))
+	if remainder >= coreHdrSize {
+		if err := binary.Read(r, bo, &coreHeader); err != nil {
+			return nil, nil, nil, fmt.Errorf("can't read CO-RE relocation header: %v", err)
+		}
+		remainder -= coreHdrSize
 	}
 
 	// Of course, the .BTF.ext header has different semantics than the
 	// .BTF ext header. We need to ignore non-null values.
 	_, err = io.CopyN(ioutil.Discard, r, remainder)
 	if err != nil {
-		return nil, nil, fmt.Errorf("header padding: %v", err)
+		return nil, nil, nil, fmt.Errorf("header padding: %v", err)
 	}
 
 	if _, err := r.Seek(int64(header.HdrLen+header.FuncInfoOff), io.SeekStart); err != nil {
-		return nil, nil, fmt.Errorf("can't seek to function info section: %v", err)
+		return nil, nil, nil, fmt.Errorf("can't seek to function info section: %v", err)
 	}
 
-	funcInfo, err = parseExtInfo(io.LimitReader(r, int64(header.FuncInfoLen)), bo, strings)
+	buf := bufio.NewReader(io.LimitReader(r, int64(header.FuncInfoLen)))
+	funcInfo, err = parseExtInfo(buf, bo, strings)
 	if err != nil {
-		return nil, nil, fmt.Errorf("function info: %w", err)
+		return nil, nil, nil, fmt.Errorf("function info: %w", err)
 	}
 
 	if _, err := r.Seek(int64(header.HdrLen+header.LineInfoOff), io.SeekStart); err != nil {
-		return nil, nil, fmt.Errorf("can't seek to line info section: %v", err)
+		return nil, nil, nil, fmt.Errorf("can't seek to line info section: %v", err)
 	}
 
-	lineInfo, err = parseExtInfo(io.LimitReader(r, int64(header.LineInfoLen)), bo, strings)
+	buf = bufio.NewReader(io.LimitReader(r, int64(header.LineInfoLen)))
+	lineInfo, err = parseExtInfo(buf, bo, strings)
 	if err != nil {
-		return nil, nil, fmt.Errorf("line info: %w", err)
+		return nil, nil, nil, fmt.Errorf("line info: %w", err)
+	}
+
+	if coreHeader.CoreReloOff > 0 && coreHeader.CoreReloLen > 0 {
+		if _, err := r.Seek(int64(header.HdrLen+coreHeader.CoreReloOff), io.SeekStart); err != nil {
+			return nil, nil, nil, fmt.Errorf("can't seek to CO-RE relocation section: %v", err)
+		}
+
+		coreRelos, err = parseExtInfoRelos(io.LimitReader(r, int64(coreHeader.CoreReloLen)), bo, strings)
+		if err != nil {
+			return nil, nil, nil, fmt.Errorf("CO-RE relocation info: %w", err)
+		}
 	}
 
-	return funcInfo, lineInfo, nil
+	return funcInfo, lineInfo, coreRelos, nil
 }
 
 type btfExtInfoSec struct {
@@ -127,6 +155,8 @@ func (ei extInfo) MarshalBinary() ([]byte, error) {
 }
 
 func parseExtInfo(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[string]extInfo, error) {
+	const maxRecordSize = 256
+
 	var recordSize uint32
 	if err := binary.Read(r, bo, &recordSize); err != nil {
 		return nil, fmt.Errorf("can't read record size: %v", err)
@@ -136,23 +166,15 @@ func parseExtInfo(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[st
 		// Need at least insnOff
 		return nil, errors.New("record size too short")
 	}
+	if recordSize > maxRecordSize {
+		return nil, fmt.Errorf("record size %v exceeds %v", recordSize, maxRecordSize)
+	}
 
 	result := make(map[string]extInfo)
 	for {
-		var infoHeader btfExtInfoSec
-		if err := binary.Read(r, bo, &infoHeader); err == io.EOF {
+		secName, infoHeader, err := parseExtInfoHeader(r, bo, strings)
+		if errors.Is(err, io.EOF) {
 			return result, nil
-		} else if err != nil {
-			return nil, fmt.Errorf("can't read ext info header: %v", err)
-		}
-
-		secName, err := strings.Lookup(infoHeader.SecNameOff)
-		if err != nil {
-			return nil, fmt.Errorf("can't get section name: %w", err)
-		}
-
-		if infoHeader.NumInfo == 0 {
-			return nil, fmt.Errorf("section %s has invalid number of records", secName)
 		}
 
 		var records []extInfoRecord
@@ -180,3 +202,80 @@ func parseExtInfo(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[st
 		}
 	}
 }
+
+// bpfCoreRelo matches `struct bpf_core_relo` from the kernel
+type bpfCoreRelo struct {
+	InsnOff      uint32
+	TypeID       TypeID
+	AccessStrOff uint32
+	ReloKind     coreReloKind
+}
+
+type bpfCoreRelos []bpfCoreRelo
+
+// append concatenates two slices of bpfCoreRelos. The InsnOff of the
+// appended relocations is adjusted by offset.
+func (r bpfCoreRelos) append(other bpfCoreRelos, offset uint64) bpfCoreRelos {
+	result := make([]bpfCoreRelo, 0, len(r)+len(other))
+	result = append(result, r...)
+	for _, relo := range other {
+		relo.InsnOff += uint32(offset)
+		result = append(result, relo)
+	}
+	return result
+}
+
+var extInfoReloSize = binary.Size(bpfCoreRelo{})
+
+func parseExtInfoRelos(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[string]bpfCoreRelos, error) {
+	var recordSize uint32
+	if err := binary.Read(r, bo, &recordSize); err != nil {
+		return nil, fmt.Errorf("read record size: %v", err)
+	}
+
+	if recordSize != uint32(extInfoReloSize) {
+		return nil, fmt.Errorf("expected record size %d, got %d", extInfoReloSize, recordSize)
+	}
+
+	result := make(map[string]bpfCoreRelos)
+	for {
+		secName, infoHeader, err := parseExtInfoHeader(r, bo, strings)
+		if errors.Is(err, io.EOF) {
+			return result, nil
+		}
+
+		var relos []bpfCoreRelo
+		for i := uint32(0); i < infoHeader.NumInfo; i++ {
+			var relo bpfCoreRelo
+			if err := binary.Read(r, bo, &relo); err != nil {
+				return nil, fmt.Errorf("section %v: read record: %v", secName, err)
+			}
+
+			if relo.InsnOff%asm.InstructionSize != 0 {
+				return nil, fmt.Errorf("section %v: offset %v is not aligned with instruction size", secName, relo.InsnOff)
+			}
+
+			relos = append(relos, relo)
+		}
+
+		result[secName] = relos
+	}
+}
+
+func parseExtInfoHeader(r io.Reader, bo binary.ByteOrder, strings stringTable) (string, *btfExtInfoSec, error) {
+	var infoHeader btfExtInfoSec
+	if err := binary.Read(r, bo, &infoHeader); err != nil {
+		return "", nil, fmt.Errorf("read ext info header: %w", err)
+	}
+
+	secName, err := strings.Lookup(infoHeader.SecNameOff)
+	if err != nil {
+		return "", nil, fmt.Errorf("get section name: %w", err)
+	}
+
+	if infoHeader.NumInfo == 0 {
+		return "", nil, fmt.Errorf("section %s has zero records", secName)
+	}
+
+	return secName, &infoHeader, nil
+}
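A short example of the offset adjustment performed by bpfCoreRelos.append (the offsets are invented): when a second program's relocations are appended after the first program's length, their instruction offsets shift accordingly.

	a := bpfCoreRelos{{InsnOff: 0}, {InsnOff: 8}}
	b := bpfCoreRelos{{InsnOff: 0}}
	merged := a.append(b, 16)
	// merged now carries InsnOff values 0, 8 and 16.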

+ 49 - 0
vendor/github.com/cilium/ebpf/internal/btf/fuzz.go

@@ -0,0 +1,49 @@
+// +build gofuzz
+
+// Use with https://github.com/dvyukov/go-fuzz
+
+package btf
+
+import (
+	"bytes"
+	"encoding/binary"
+
+	"github.com/cilium/ebpf/internal"
+)
+
+func FuzzSpec(data []byte) int {
+	if len(data) < binary.Size(btfHeader{}) {
+		return -1
+	}
+
+	spec, err := loadNakedSpec(bytes.NewReader(data), internal.NativeEndian, nil, nil)
+	if err != nil {
+		if spec != nil {
+			panic("spec is not nil")
+		}
+		return 0
+	}
+	if spec == nil {
+		panic("spec is nil")
+	}
+	return 1
+}
+
+func FuzzExtInfo(data []byte) int {
+	if len(data) < binary.Size(btfExtHeader{}) {
+		return -1
+	}
+
+	table := stringTable("\x00foo\x00barfoo\x00")
+	info, err := parseExtInfo(bytes.NewReader(data), internal.NativeEndian, table)
+	if err != nil {
+		if info != nil {
+			panic("info is not nil")
+		}
+		return 0
+	}
+	if info == nil {
+		panic("info is nil")
+	}
+	return 1
+}
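Assuming the dvyukov/go-fuzz toolchain referenced in the header above, typical usage would be to run go-fuzz-build in this package and then something like go-fuzz -bin=./btf-fuzz.zip -func=FuzzSpec; the -1/0/1 return values steer corpus selection as documented by go-fuzz.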

+ 360 - 76
vendor/github.com/cilium/ebpf/internal/btf/types.go

@@ -4,6 +4,7 @@ import (
 	"errors"
 	"errors"
 	"fmt"
 	"fmt"
 	"math"
 	"math"
+	"strings"
 )
 )
 
 
 const maxTypeDepth = 32
 const maxTypeDepth = 32
@@ -20,10 +21,22 @@ func (tid TypeID) ID() TypeID {
 type Type interface {
 type Type interface {
 	ID() TypeID
 	ID() TypeID
 
 
+	String() string
+
 	// Make a copy of the type, without copying Type members.
 	// Make a copy of the type, without copying Type members.
 	copy() Type
 	copy() Type
 
 
-	walk(*copyStack)
+	// Enumerate all nested Types. Repeated calls must visit nested
+	// types in the same order.
+	walk(*typeDeque)
+}
+
+// namedType is a type with a name.
+//
+// Most named types simply embed Name.
+type namedType interface {
+	Type
+	name() string
 }
 }
 
 
 // Name identifies a type.
 // Name identifies a type.
@@ -39,9 +52,18 @@ func (n Name) name() string {
 type Void struct{}
 type Void struct{}
 
 
 func (v *Void) ID() TypeID      { return 0 }
 func (v *Void) ID() TypeID      { return 0 }
+func (v *Void) String() string  { return "void#0" }
 func (v *Void) size() uint32    { return 0 }
 func (v *Void) size() uint32    { return 0 }
 func (v *Void) copy() Type      { return (*Void)(nil) }
 func (v *Void) copy() Type      { return (*Void)(nil) }
-func (v *Void) walk(*copyStack) {}
+func (v *Void) walk(*typeDeque) {}
+
+type IntEncoding byte
+
+const (
+	Signed IntEncoding = 1 << iota
+	Char
+	Bool
+)
 
 
 // Int is an integer of a given length.
 // Int is an integer of a given length.
 type Int struct {
 type Int struct {
@@ -49,24 +71,64 @@ type Int struct {
 	Name
 	Name
 
 
 	// The size of the integer in bytes.
 	// The size of the integer in bytes.
-	Size uint32
+	Size     uint32
+	Encoding IntEncoding
+	// Offset is the starting bit offset. Currently always 0.
+	// See https://www.kernel.org/doc/html/latest/bpf/btf.html#btf-kind-int
+	Offset uint32
+	Bits   byte
+}
+
+var _ namedType = (*Int)(nil)
+
+func (i *Int) String() string {
+	var s strings.Builder
+
+	switch {
+	case i.Encoding&Char != 0:
+		s.WriteString("char")
+	case i.Encoding&Bool != 0:
+		s.WriteString("bool")
+	default:
+		if i.Encoding&Signed == 0 {
+			s.WriteRune('u')
+		}
+		s.WriteString("int")
+		fmt.Fprintf(&s, "%d", i.Size*8)
+	}
+
+	fmt.Fprintf(&s, "#%d", i.TypeID)
+
+	if i.Bits > 0 {
+		fmt.Fprintf(&s, "[bits=%d]", i.Bits)
+	}
+
+	return s.String()
 }
 }
 
 
 func (i *Int) size() uint32    { return i.Size }
 func (i *Int) size() uint32    { return i.Size }
-func (i *Int) walk(*copyStack) {}
+func (i *Int) walk(*typeDeque) {}
 func (i *Int) copy() Type {
 func (i *Int) copy() Type {
 	cpy := *i
 	cpy := *i
 	return &cpy
 	return &cpy
 }
 }
 
 
+func (i *Int) isBitfield() bool {
+	return i.Offset > 0
+}
+
 // Pointer is a pointer to another type.
 // Pointer is a pointer to another type.
 type Pointer struct {
 type Pointer struct {
 	TypeID
 	TypeID
 	Target Type
 	Target Type
 }
 }
 
 
-func (p *Pointer) size() uint32       { return 8 }
-func (p *Pointer) walk(cs *copyStack) { cs.push(&p.Target) }
+func (p *Pointer) String() string {
+	return fmt.Sprintf("pointer#%d[target=#%d]", p.TypeID, p.Target.ID())
+}
+
+func (p *Pointer) size() uint32        { return 8 }
+func (p *Pointer) walk(tdq *typeDeque) { tdq.push(&p.Target) }
 func (p *Pointer) copy() Type {
 func (p *Pointer) copy() Type {
 	cpy := *p
 	cpy := *p
 	return &cpy
 	return &cpy
@@ -79,7 +141,11 @@ type Array struct {
 	Nelems uint32
 	Nelems uint32
 }
 }
 
 
-func (arr *Array) walk(cs *copyStack) { cs.push(&arr.Type) }
+func (arr *Array) String() string {
+	return fmt.Sprintf("array#%d[type=#%d n=%d]", arr.TypeID, arr.Type.ID(), arr.Nelems)
+}
+
+func (arr *Array) walk(tdq *typeDeque) { tdq.push(&arr.Type) }
 func (arr *Array) copy() Type {
 func (arr *Array) copy() Type {
 	cpy := *arr
 	cpy := *arr
 	return &cpy
 	return &cpy
@@ -94,11 +160,15 @@ type Struct struct {
 	Members []Member
 	Members []Member
 }
 }
 
 
+func (s *Struct) String() string {
+	return fmt.Sprintf("struct#%d[%q]", s.TypeID, s.Name)
+}
+
 func (s *Struct) size() uint32 { return s.Size }
 func (s *Struct) size() uint32 { return s.Size }
 
 
-func (s *Struct) walk(cs *copyStack) {
+func (s *Struct) walk(tdq *typeDeque) {
 	for i := range s.Members {
 	for i := range s.Members {
-		cs.push(&s.Members[i].Type)
+		tdq.push(&s.Members[i].Type)
 	}
 	}
 }
 }
 
 
@@ -109,6 +179,10 @@ func (s *Struct) copy() Type {
 	return &cpy
 	return &cpy
 }
 }
 
 
+func (s *Struct) members() []Member {
+	return s.Members
+}
+
 // Union is a compound type where members occupy the same memory.
 // Union is a compound type where members occupy the same memory.
 type Union struct {
 type Union struct {
 	TypeID
 	TypeID
@@ -118,11 +192,15 @@ type Union struct {
 	Members []Member
 	Members []Member
 }
 }
 
 
+func (u *Union) String() string {
+	return fmt.Sprintf("union#%d[%q]", u.TypeID, u.Name)
+}
+
 func (u *Union) size() uint32 { return u.Size }
 func (u *Union) size() uint32 { return u.Size }
 
 
-func (u *Union) walk(cs *copyStack) {
+func (u *Union) walk(tdq *typeDeque) {
 	for i := range u.Members {
 	for i := range u.Members {
-		cs.push(&u.Members[i].Type)
+		tdq.push(&u.Members[i].Type)
 	}
 	}
 }
 }
 
 
@@ -133,35 +211,90 @@ func (u *Union) copy() Type {
 	return &cpy
 	return &cpy
 }
 }
 
 
+func (u *Union) members() []Member {
+	return u.Members
+}
+
+type composite interface {
+	members() []Member
+}
+
+var (
+	_ composite = (*Struct)(nil)
+	_ composite = (*Union)(nil)
+)
+
 // Member is part of a Struct or Union.
 // Member is part of a Struct or Union.
 //
 //
 // It is not a valid Type.
 // It is not a valid Type.
 type Member struct {
 type Member struct {
 	Name
 	Name
-	Type   Type
-	Offset uint32
+	Type Type
+	// Offset is the bit offset of this member
+	Offset       uint32
+	BitfieldSize uint32
 }
 }
 
 
 // Enum lists possible values.
 // Enum lists possible values.
 type Enum struct {
 type Enum struct {
 	TypeID
 	TypeID
 	Name
 	Name
+	Values []EnumValue
+}
+
+func (e *Enum) String() string {
+	return fmt.Sprintf("enum#%d[%q]", e.TypeID, e.Name)
+}
+
+// EnumValue is part of an Enum
+//
+// Is is not a valid Type
+type EnumValue struct {
+	Name
+	Value int32
 }
 }
 
 
 func (e *Enum) size() uint32    { return 4 }
 func (e *Enum) size() uint32    { return 4 }
-func (e *Enum) walk(*copyStack) {}
+func (e *Enum) walk(*typeDeque) {}
 func (e *Enum) copy() Type {
 func (e *Enum) copy() Type {
 	cpy := *e
 	cpy := *e
+	cpy.Values = make([]EnumValue, len(e.Values))
+	copy(cpy.Values, e.Values)
 	return &cpy
 	return &cpy
 }
 }
 
 
+// FwdKind is the type of forward declaration.
+type FwdKind int
+
+// Valid types of forward declaration.
+const (
+	FwdStruct FwdKind = iota
+	FwdUnion
+)
+
+func (fk FwdKind) String() string {
+	switch fk {
+	case FwdStruct:
+		return "struct"
+	case FwdUnion:
+		return "union"
+	default:
+		return fmt.Sprintf("%T(%d)", fk, int(fk))
+	}
+}
+
 // Fwd is a forward declaration of a Type.
 // Fwd is a forward declaration of a Type.
 type Fwd struct {
 type Fwd struct {
 	TypeID
 	TypeID
 	Name
 	Name
+	Kind FwdKind
+}
+
+func (f *Fwd) String() string {
+	return fmt.Sprintf("fwd#%d[%s %q]", f.TypeID, f.Kind, f.Name)
 }
 }
 
 
-func (f *Fwd) walk(*copyStack) {}
+func (f *Fwd) walk(*typeDeque) {}
 func (f *Fwd) copy() Type {
 func (f *Fwd) copy() Type {
 	cpy := *f
 	cpy := *f
 	return &cpy
 	return &cpy
@@ -174,43 +307,62 @@ type Typedef struct {
 	Type Type
 	Type Type
 }
 }
 
 
-func (td *Typedef) walk(cs *copyStack) { cs.push(&td.Type) }
+func (td *Typedef) String() string {
+	return fmt.Sprintf("typedef#%d[%q #%d]", td.TypeID, td.Name, td.Type.ID())
+}
+
+func (td *Typedef) walk(tdq *typeDeque) { tdq.push(&td.Type) }
 func (td *Typedef) copy() Type {
 func (td *Typedef) copy() Type {
 	cpy := *td
 	cpy := *td
 	return &cpy
 	return &cpy
 }
 }
 
 
-// Volatile is a modifier.
+// Volatile is a qualifier.
 type Volatile struct {
 type Volatile struct {
 	TypeID
 	TypeID
 	Type Type
 	Type Type
 }
 }
 
 
-func (v *Volatile) walk(cs *copyStack) { cs.push(&v.Type) }
+func (v *Volatile) String() string {
+	return fmt.Sprintf("volatile#%d[#%d]", v.TypeID, v.Type.ID())
+}
+
+func (v *Volatile) qualify() Type       { return v.Type }
+func (v *Volatile) walk(tdq *typeDeque) { tdq.push(&v.Type) }
 func (v *Volatile) copy() Type {
 func (v *Volatile) copy() Type {
 	cpy := *v
 	cpy := *v
 	return &cpy
 	return &cpy
 }
 }
 
 
-// Const is a modifier.
+// Const is a qualifier.
 type Const struct {
 type Const struct {
 	TypeID
 	TypeID
 	Type Type
 	Type Type
 }
 }
 
 
-func (c *Const) walk(cs *copyStack) { cs.push(&c.Type) }
+func (c *Const) String() string {
+	return fmt.Sprintf("const#%d[#%d]", c.TypeID, c.Type.ID())
+}
+
+func (c *Const) qualify() Type       { return c.Type }
+func (c *Const) walk(tdq *typeDeque) { tdq.push(&c.Type) }
 func (c *Const) copy() Type {
 func (c *Const) copy() Type {
 	cpy := *c
 	cpy := *c
 	return &cpy
 	return &cpy
 }
 }
 
 
-// Restrict is a modifier.
+// Restrict is a qualifier.
 type Restrict struct {
 type Restrict struct {
 	TypeID
 	TypeID
 	Type Type
 	Type Type
 }
 }
 
 
-func (r *Restrict) walk(cs *copyStack) { cs.push(&r.Type) }
+func (r *Restrict) String() string {
+	return fmt.Sprintf("restrict#%d[#%d]", r.TypeID, r.Type.ID())
+}
+
+func (r *Restrict) qualify() Type       { return r.Type }
+func (r *Restrict) walk(tdq *typeDeque) { tdq.push(&r.Type) }
 func (r *Restrict) copy() Type {
 func (r *Restrict) copy() Type {
 	cpy := *r
 	cpy := *r
 	return &cpy
 	return &cpy
@@ -223,7 +375,11 @@ type Func struct {
 	Type Type
 	Type Type
 }
 }
 
 
-func (f *Func) walk(cs *copyStack) { cs.push(&f.Type) }
+func (f *Func) String() string {
+	return fmt.Sprintf("func#%d[%q proto=#%d]", f.TypeID, f.Name, f.Type.ID())
+}
+
+func (f *Func) walk(tdq *typeDeque) { tdq.push(&f.Type) }
 func (f *Func) copy() Type {
 func (f *Func) copy() Type {
 	cpy := *f
 	cpy := *f
 	return &cpy
 	return &cpy
@@ -233,15 +389,38 @@ func (f *Func) copy() Type {
 type FuncProto struct {
 type FuncProto struct {
 	TypeID
 	TypeID
 	Return Type
 	Return Type
-	// Parameters not supported yet
+	Params []FuncParam
+}
+
+func (fp *FuncProto) String() string {
+	var s strings.Builder
+	fmt.Fprintf(&s, "proto#%d[", fp.TypeID)
+	for _, param := range fp.Params {
+		fmt.Fprintf(&s, "%q=#%d, ", param.Name, param.Type.ID())
+	}
+	fmt.Fprintf(&s, "return=#%d]", fp.Return.ID())
+	return s.String()
+}
+
+func (fp *FuncProto) walk(tdq *typeDeque) {
+	tdq.push(&fp.Return)
+	for i := range fp.Params {
+		tdq.push(&fp.Params[i].Type)
+	}
 }
 }
 
 
-func (fp *FuncProto) walk(cs *copyStack) { cs.push(&fp.Return) }
 func (fp *FuncProto) copy() Type {
 func (fp *FuncProto) copy() Type {
 	cpy := *fp
 	cpy := *fp
+	cpy.Params = make([]FuncParam, len(fp.Params))
+	copy(cpy.Params, fp.Params)
 	return &cpy
 	return &cpy
 }
 }
 
 
+type FuncParam struct {
+	Name
+	Type Type
+}
+
 // Var is a global variable.
 // Var is a global variable.
 type Var struct {
 type Var struct {
 	TypeID
 	TypeID
@@ -249,7 +428,12 @@ type Var struct {
 	Type Type
 	Type Type
 }
 }
 
 
-func (v *Var) walk(cs *copyStack) { cs.push(&v.Type) }
+func (v *Var) String() string {
+	// TODO: Linkage
+	return fmt.Sprintf("var#%d[%q]", v.TypeID, v.Name)
+}
+
+func (v *Var) walk(tdq *typeDeque) { tdq.push(&v.Type) }
 func (v *Var) copy() Type {
 func (v *Var) copy() Type {
 	cpy := *v
 	cpy := *v
 	return &cpy
 	return &cpy
@@ -263,11 +447,15 @@ type Datasec struct {
 	Vars []VarSecinfo
 	Vars []VarSecinfo
 }
 }
 
 
+func (ds *Datasec) String() string {
+	return fmt.Sprintf("section#%d[%q]", ds.TypeID, ds.Name)
+}
+
 func (ds *Datasec) size() uint32 { return ds.Size }
 func (ds *Datasec) size() uint32 { return ds.Size }
 
 
-func (ds *Datasec) walk(cs *copyStack) {
+func (ds *Datasec) walk(tdq *typeDeque) {
 	for i := range ds.Vars {
 	for i := range ds.Vars {
-		cs.push(&ds.Vars[i].Type)
+		tdq.push(&ds.Vars[i].Type)
 	}
 	}
 }
 }
 
 
@@ -279,6 +467,8 @@ func (ds *Datasec) copy() Type {
 }
 }
 
 
 // VarSecinfo describes variable in a Datasec
 // VarSecinfo describes variable in a Datasec
+//
+// It is not a valid Type.
 type VarSecinfo struct {
 type VarSecinfo struct {
 	Type   Type
 	Type   Type
 	Offset uint32
 	Offset uint32
@@ -298,6 +488,16 @@ var (
 	_ sizer = (*Datasec)(nil)
 	_ sizer = (*Datasec)(nil)
 )
 )
 
 
+type qualifier interface {
+	qualify() Type
+}
+
+var (
+	_ qualifier = (*Const)(nil)
+	_ qualifier = (*Restrict)(nil)
+	_ qualifier = (*Volatile)(nil)
+)
+
 // Sizeof returns the size of a type in bytes.
 // Sizeof returns the size of a type in bytes.
 //
 //
 // Returns an error if the size can't be computed.
 // Returns an error if the size can't be computed.
@@ -326,14 +526,9 @@ func Sizeof(typ Type) (int, error) {
 		case *Typedef:
 		case *Typedef:
 			typ = v.Type
 			typ = v.Type
 			continue
 			continue
-		case *Volatile:
-			typ = v.Type
-			continue
-		case *Const:
-			typ = v.Type
-			continue
-		case *Restrict:
-			typ = v.Type
+
+		case qualifier:
+			typ = v.qualify()
 			continue
 			continue
 
 
 		default:
 		default:
@@ -361,7 +556,7 @@ func Sizeof(typ Type) (int, error) {
 func copyType(typ Type) Type {
 func copyType(typ Type) Type {
 	var (
 	var (
 		copies = make(map[Type]Type)
 		copies = make(map[Type]Type)
-		work   copyStack
+		work   typeDeque
 	)
 	)
 
 
 	for t := &typ; t != nil; t = work.pop() {
 	for t := &typ; t != nil; t = work.pop() {
@@ -382,40 +577,83 @@ func copyType(typ Type) Type {
 	return typ
 	return typ
 }
 }
 
 
-// copyStack keeps track of pointers to types which still
-// need to be copied.
-type copyStack []*Type
+// typeDeque keeps track of pointers to types which still
+// need to be visited.
+type typeDeque struct {
+	types       []*Type
+	read, write uint64
+	mask        uint64
+}
 
 
 // push adds a type to the stack.
 // push adds a type to the stack.
-func (cs *copyStack) push(t *Type) {
-	*cs = append(*cs, t)
+func (dq *typeDeque) push(t *Type) {
+	if dq.write-dq.read < uint64(len(dq.types)) {
+		dq.types[dq.write&dq.mask] = t
+		dq.write++
+		return
+	}
+
+	new := len(dq.types) * 2
+	if new == 0 {
+		new = 8
+	}
+
+	types := make([]*Type, new)
+	pivot := dq.read & dq.mask
+	n := copy(types, dq.types[pivot:])
+	n += copy(types[n:], dq.types[:pivot])
+	types[n] = t
+
+	dq.types = types
+	dq.mask = uint64(new) - 1
+	dq.read, dq.write = 0, uint64(n+1)
 }
 }
 
 
-// pop returns the topmost Type, or nil.
-func (cs *copyStack) pop() *Type {
-	n := len(*cs)
-	if n == 0 {
+// shift returns the first element or null.
+func (dq *typeDeque) shift() *Type {
+	if dq.read == dq.write {
 		return nil
 		return nil
 	}
 	}
 
 
-	t := (*cs)[n-1]
-	*cs = (*cs)[:n-1]
+	index := dq.read & dq.mask
+	t := dq.types[index]
+	dq.types[index] = nil
+	dq.read++
 	return t
 	return t
 }
 }
 
 
-type namer interface {
-	name() string
+// pop returns the last element or null.
+func (dq *typeDeque) pop() *Type {
+	if dq.read == dq.write {
+		return nil
+	}
+
+	dq.write--
+	index := dq.write & dq.mask
+	t := dq.types[index]
+	dq.types[index] = nil
+	return t
 }
 }
 
 
-var _ namer = Name("")
+// all returns all elements.
+//
+// The deque is empty after calling this method.
+func (dq *typeDeque) all() []*Type {
+	length := dq.write - dq.read
+	types := make([]*Type, 0, length)
+	for t := dq.shift(); t != nil; t = dq.shift() {
+		types = append(types, t)
+	}
+	return types
+}
 
 
 // inflateRawTypes takes a list of raw btf types linked via type IDs, and turns
 // inflateRawTypes takes a list of raw btf types linked via type IDs, and turns
 // it into a graph of Types connected via pointers.
 // it into a graph of Types connected via pointers.
 //
 //
-// Returns a map of named types (so, where NameOff is non-zero). Since BTF ignores
-// compilation units, multiple types may share the same name. A Type may form a
-// cyclic graph by pointing at itself.
-func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map[string][]Type, err error) {
+// Returns a map of named types (so, where NameOff is non-zero) and a slice of types
+// indexed by TypeID. Since BTF ignores compilation units, multiple types may share
+// the same name. A Type may form a cyclic graph by pointing at itself.
+func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (types []Type, namedTypes map[string][]namedType, err error) {
 	type fixupDef struct {
 	type fixupDef struct {
 		id           TypeID
 		id           TypeID
 		expectedKind btfKind
 		expectedKind btfKind
@@ -427,7 +665,7 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 		fixups = append(fixups, fixupDef{id, expectedKind, typ})
 		fixups = append(fixups, fixupDef{id, expectedKind, typ})
 	}
 	}
 
 
-	convertMembers := func(raw []btfMember) ([]Member, error) {
+	convertMembers := func(raw []btfMember, kindFlag bool) ([]Member, error) {
 		// NB: The fixup below relies on pre-allocating this array to
 		// NB: The fixup below relies on pre-allocating this array to
 		// work, since otherwise append might re-allocate members.
 		// work, since otherwise append might re-allocate members.
 		members := make([]Member, 0, len(raw))
 		members := make([]Member, 0, len(raw))
@@ -436,10 +674,15 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 			if err != nil {
 			if err != nil {
 				return nil, fmt.Errorf("can't get name for member %d: %w", i, err)
 				return nil, fmt.Errorf("can't get name for member %d: %w", i, err)
 			}
 			}
-			members = append(members, Member{
+			m := Member{
 				Name:   name,
 				Name:   name,
 				Offset: btfMember.Offset,
 				Offset: btfMember.Offset,
-			})
+			}
+			if kindFlag {
+				m.BitfieldSize = btfMember.Offset >> 24
+				m.Offset &= 0xffffff
+			}
+			members = append(members, m)
 		}
 		}
 		for i := range members {
 		for i := range members {
 			fixup(raw[i].Type, kindUnknown, &members[i].Type)
 			fixup(raw[i].Type, kindUnknown, &members[i].Type)
@@ -447,9 +690,9 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 		return members, nil
 		return members, nil
 	}
 	}
 
 
-	types := make([]Type, 0, len(rawTypes))
+	types = make([]Type, 0, len(rawTypes))
 	types = append(types, (*Void)(nil))
 	types = append(types, (*Void)(nil))
-	namedTypes = make(map[string][]Type)
+	namedTypes = make(map[string][]namedType)
 
 
 	for i, raw := range rawTypes {
 	for i, raw := range rawTypes {
 		var (
 		var (
@@ -461,12 +704,13 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 
 
 		name, err := rawStrings.LookupName(raw.NameOff)
 		name, err := rawStrings.LookupName(raw.NameOff)
 		if err != nil {
 		if err != nil {
-			return nil, fmt.Errorf("can't get name for type id %d: %w", id, err)
+			return nil, nil, fmt.Errorf("get name for type id %d: %w", id, err)
 		}
 		}
 
 
 		switch raw.Kind() {
 		switch raw.Kind() {
 		case kindInt:
 		case kindInt:
-			typ = &Int{id, name, raw.Size()}
+			encoding, offset, bits := intEncoding(*raw.data.(*uint32))
+			typ = &Int{id, name, raw.Size(), encoding, offset, bits}
 
 
 		case kindPointer:
 		case kindPointer:
 			ptr := &Pointer{id, nil}
 			ptr := &Pointer{id, nil}
@@ -483,24 +727,40 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 			typ = arr
 			typ = arr
 
 
 		case kindStruct:
 		case kindStruct:
-			members, err := convertMembers(raw.data.([]btfMember))
+			members, err := convertMembers(raw.data.([]btfMember), raw.KindFlag())
 			if err != nil {
 			if err != nil {
-				return nil, fmt.Errorf("struct %s (id %d): %w", name, id, err)
+				return nil, nil, fmt.Errorf("struct %s (id %d): %w", name, id, err)
 			}
 			}
 			typ = &Struct{id, name, raw.Size(), members}
 			typ = &Struct{id, name, raw.Size(), members}
 
 
 		case kindUnion:
-			members, err := convertMembers(raw.data.([]btfMember))
+			members, err := convertMembers(raw.data.([]btfMember), raw.KindFlag())
 			if err != nil {
-				return nil, fmt.Errorf("union %s (id %d): %w", name, id, err)
+				return nil, nil, fmt.Errorf("union %s (id %d): %w", name, id, err)
 			}
 			typ = &Union{id, name, raw.Size(), members}
 
 		case kindEnum:
-			typ = &Enum{id, name}
+			rawvals := raw.data.([]btfEnum)
+			vals := make([]EnumValue, 0, len(rawvals))
+			for i, btfVal := range rawvals {
+				name, err := rawStrings.LookupName(btfVal.NameOff)
+				if err != nil {
+					return nil, nil, fmt.Errorf("get name for enum value %d: %s", i, err)
+				}
+				vals = append(vals, EnumValue{
+					Name:  name,
+					Value: btfVal.Val,
+				})
+			}
+			typ = &Enum{id, name, vals}
 
 		case kindForward:
-			typ = &Fwd{id, name}
+			if raw.KindFlag() {
+				typ = &Fwd{id, name, FwdUnion}
+			} else {
+				typ = &Fwd{id, name, FwdStruct}
+			}
 
 		case kindTypedef:
 			typedef := &Typedef{id, name, nil}
@@ -528,7 +788,22 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 			typ = fn
 
 		case kindFuncProto:
-			fp := &FuncProto{id, nil}
+			rawparams := raw.data.([]btfParam)
+			params := make([]FuncParam, 0, len(rawparams))
+			for i, param := range rawparams {
+				name, err := rawStrings.LookupName(param.NameOff)
+				if err != nil {
+					return nil, nil, fmt.Errorf("get name for func proto parameter %d: %s", i, err)
+				}
+				params = append(params, FuncParam{
+					Name: name,
+				})
+			}
+			for i := range params {
+				fixup(rawparams[i].Type, kindUnknown, &params[i].Type)
+			}
+
+			fp := &FuncProto{id, nil, params}
 			fixup(raw.Type(), kindUnknown, &fp.Return)
 			typ = fp
 
@@ -552,14 +827,14 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 			typ = &Datasec{id, name, raw.SizeType, vars}
 
 		default:
-			return nil, fmt.Errorf("type id %d: unknown kind: %v", id, raw.Kind())
+			return nil, nil, fmt.Errorf("type id %d: unknown kind: %v", id, raw.Kind())
 		}
 
 		types = append(types, typ)
 
-		if namer, ok := typ.(namer); ok {
-			if name := namer.name(); name != "" {
-				namedTypes[name] = append(namedTypes[name], typ)
+		if named, ok := typ.(namedType); ok {
+			if name := essentialName(named.name()); name != "" {
+				namedTypes[name] = append(namedTypes[name], named)
 			}
 		}
 	}
@@ -567,7 +842,7 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 	for _, fixup := range fixups {
 		i := int(fixup.id)
 		if i >= len(types) {
-			return nil, fmt.Errorf("reference to invalid type id: %d", fixup.id)
+			return nil, nil, fmt.Errorf("reference to invalid type id: %d", fixup.id)
 		}
 
 		// Default void (id 0) to unknown
@@ -577,11 +852,20 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 		}
 
 		if expected := fixup.expectedKind; expected != kindUnknown && rawKind != expected {
-			return nil, fmt.Errorf("expected type id %d to have kind %s, found %s", fixup.id, expected, rawKind)
+			return nil, nil, fmt.Errorf("expected type id %d to have kind %s, found %s", fixup.id, expected, rawKind)
 		}
 
 		*fixup.typ = types[i]
 	}
 
-	return namedTypes, nil
+	return types, namedTypes, nil
+}
+
+// essentialName returns name without a ___ suffix.
+func essentialName(name string) string {
+	lastIdx := strings.LastIndex(name, "___")
+	if lastIdx > 0 {
+		return name[:lastIdx]
+	}
+	return name
 }
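
The ___ suffix stripped by essentialName comes from CO-RE type "flavors": a local definition such as task_struct___v1 is meant to resolve against the kernel's task_struct. A minimal standalone sketch of that normalization, with illustrative type names only:

package main

import (
	"fmt"
	"strings"
)

// essentialName mirrors the helper added above: strip everything from
// the last "___" so flavored types share a lookup key with the base type.
func essentialName(name string) string {
	if idx := strings.LastIndex(name, "___"); idx > 0 {
		return name[:idx]
	}
	return name
}

func main() {
	fmt.Println(essentialName("task_struct___v1")) // task_struct
	fmt.Println(essentialName("___anon"))          // unchanged: index 0 is not > 0
}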

+ 52 - 0
vendor/github.com/cilium/ebpf/internal/elf.go

@@ -0,0 +1,52 @@
+package internal
+
+import (
+	"debug/elf"
+	"fmt"
+	"io"
+)
+
+type SafeELFFile struct {
+	*elf.File
+}
+
+// NewSafeELFFile reads an ELF safely.
+//
+// Any panic during parsing is turned into an error. This is necessary since
+// there are a bunch of unfixed bugs in debug/elf.
+//
+// https://github.com/golang/go/issues?q=is%3Aissue+is%3Aopen+debug%2Felf+in%3Atitle
+func NewSafeELFFile(r io.ReaderAt) (safe *SafeELFFile, err error) {
+	defer func() {
+		r := recover()
+		if r == nil {
+			return
+		}
+
+		safe = nil
+		err = fmt.Errorf("reading ELF file panicked: %s", r)
+	}()
+
+	file, err := elf.NewFile(r)
+	if err != nil {
+		return nil, err
+	}
+
+	return &SafeELFFile{file}, nil
+}
+
+// Symbols is the safe version of elf.File.Symbols.
+func (se *SafeELFFile) Symbols() (syms []elf.Symbol, err error) {
+	defer func() {
+		r := recover()
+		if r == nil {
+			return
+		}
+
+		syms = nil
+		err = fmt.Errorf("reading ELF symbols panicked: %s", r)
+	}()
+
+	syms, err = se.File.Symbols()
+	return
+}
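
The recover-into-error idiom used by SafeELFFile is worth seeing outside the diff. A self-contained sketch of the same pattern around debug/elf; the input path comes from the command line and nothing here is specific to this library:

package main

import (
	"debug/elf"
	"fmt"
	"os"
)

// safeSymbols converts any panic inside debug/elf into an error,
// the same defense SafeELFFile.Symbols applies above.
func safeSymbols(f *elf.File) (syms []elf.Symbol, err error) {
	defer func() {
		if r := recover(); r != nil {
			syms, err = nil, fmt.Errorf("reading ELF symbols panicked: %s", r)
		}
	}()
	return f.Symbols()
}

func main() {
	f, err := elf.Open(os.Args[1])
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer f.Close()

	syms, err := safeSymbols(f)
	fmt.Println(len(syms), err)
}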

+ 30 - 52
vendor/github.com/cilium/ebpf/internal/feature.go

@@ -20,6 +20,9 @@ type UnsupportedFeatureError struct {
 }
 
 func (ufe *UnsupportedFeatureError) Error() string {
+	if ufe.MinimumVersion.Unspecified() {
+		return fmt.Sprintf("%s not supported", ufe.Name)
+	}
 	return fmt.Sprintf("%s not supported (requires >= %s)", ufe.Name, ufe.MinimumVersion)
 }
 
@@ -29,7 +32,7 @@ func (ufe *UnsupportedFeatureError) Is(target error) bool {
 }
 
 type featureTest struct {
-	sync.Mutex
+	sync.RWMutex
 	successful bool
 	result     error
 }
@@ -39,10 +42,10 @@ type featureTest struct {
 //
 // The return values have the following semantics:
 //
+//   err == ErrNotSupported: the feature is not available
+//   err == nil: the feature is available
 //   err != nil: the test couldn't be executed
-//   err == nil && available: the feature is available
-//   err == nil && !available: the feature isn't available
-type FeatureTestFn func() (available bool, err error)
+type FeatureTestFn func() error
 
 // FeatureTest wraps a function so that it is run at most once.
 //
@@ -58,65 +61,40 @@ func FeatureTest(name, version string, fn FeatureTestFn) func() error {
 
 	ft := new(featureTest)
 	return func() error {
+		ft.RLock()
+		if ft.successful {
+			defer ft.RUnlock()
+			return ft.result
+		}
+		ft.RUnlock()
 		ft.Lock()
 		defer ft.Unlock()
-
+		// check one more time on the off
+		// chance that two go routines
+		// were able to call into the write
+		// lock
 		if ft.successful {
 			return ft.result
 		}
-
-		available, err := fn()
-		if errors.Is(err, ErrNotSupported) {
-			// The feature test aborted because a dependent feature
-			// is missing, which we should cache.
-			available = false
-		} else if err != nil {
-			// We couldn't execute the feature test to a point
-			// where it could make a determination.
-			// Don't cache the result, just return it.
-			return fmt.Errorf("can't detect support for %s: %w", name, err)
-		}
-
-		ft.successful = true
-		if !available {
+		err := fn()
+		switch {
+		case errors.Is(err, ErrNotSupported):
 			ft.result = &UnsupportedFeatureError{
 				MinimumVersion: v,
 				Name:           name,
 			}
-		}
-		return ft.result
-	}
-}
+			fallthrough
 
-// A Version in the form Major.Minor.Patch.
-type Version [3]uint16
+		case err == nil:
+			ft.successful = true
 
-// NewVersion creates a version from a string like "Major.Minor.Patch".
-//
-// Patch is optional.
-func NewVersion(ver string) (Version, error) {
-	var major, minor, patch uint16
-	n, _ := fmt.Sscanf(ver, "%d.%d.%d", &major, &minor, &patch)
-	if n < 2 {
-		return Version{}, fmt.Errorf("invalid version: %s", ver)
-	}
-	return Version{major, minor, patch}, nil
-}
-
-func (v Version) String() string {
-	if v[2] == 0 {
-		return fmt.Sprintf("v%d.%d", v[0], v[1])
-	}
-	return fmt.Sprintf("v%d.%d.%d", v[0], v[1], v[2])
-}
-
-// Less returns true if the version is less than another version.
-func (v Version) Less(other Version) bool {
-	for i, a := range v {
-		if a == other[i] {
-			continue
+		default:
+			// We couldn't execute the feature test to a point
+			// where it could make a determination.
+			// Don't cache the result, just return it.
+			return fmt.Errorf("detect support for %s: %w", name, err)
 		}
-		return a < other[i]
+
+		return ft.result
 	}
-	return false
 }
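
The rewritten FeatureTest uses double-checked locking: a cheap read lock serves the cached common case, and the write lock re-checks before running the probe so it executes at most once. A standalone sketch of the pattern, independent of any BPF detail:

package main

import (
	"errors"
	"fmt"
	"sync"
)

var errNotSupported = errors.New("not supported")

// cachedProbe mirrors the RWMutex idiom above: only definitive
// answers (nil or errNotSupported) are cached.
func cachedProbe(probe func() error) func() error {
	var (
		mu   sync.RWMutex
		done bool
		res  error
	)
	return func() error {
		mu.RLock()
		if done {
			defer mu.RUnlock()
			return res
		}
		mu.RUnlock()

		mu.Lock()
		defer mu.Unlock()
		if done { // another goroutine may have won the race
			return res
		}
		res = probe()
		if res == nil || errors.Is(res, errNotSupported) {
			done = true
		}
		return res
	}
}

func main() {
	calls := 0
	have := cachedProbe(func() error { calls++; return nil })
	_ = have()
	_ = have()
	fmt.Println(calls) // 1
}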

+ 44 - 0
vendor/github.com/cilium/ebpf/internal/pinning.go

@@ -0,0 +1,44 @@
+package internal
+
+import (
+	"errors"
+	"fmt"
+	"os"
+
+	"github.com/cilium/ebpf/internal/unix"
+)
+
+func Pin(currentPath, newPath string, fd *FD) error {
+	if newPath == "" {
+		return errors.New("given pinning path cannot be empty")
+	}
+	if currentPath == newPath {
+		return nil
+	}
+	if currentPath == "" {
+		return BPFObjPin(newPath, fd)
+	}
+	var err error
+	// Renameat2 is used instead of os.Rename to disallow the new path replacing
+	// an existing path.
+	if err = unix.Renameat2(unix.AT_FDCWD, currentPath, unix.AT_FDCWD, newPath, unix.RENAME_NOREPLACE); err == nil {
+		// Object is now moved to the new pinning path.
+		return nil
+	}
+	if !os.IsNotExist(err) {
+		return fmt.Errorf("unable to move pinned object to new path %v: %w", newPath, err)
+	}
+	// Internal state not in sync with the file system so let's fix it.
+	return BPFObjPin(newPath, fd)
+}
+
+func Unpin(pinnedPath string) error {
+	if pinnedPath == "" {
+		return nil
+	}
+	err := os.Remove(pinnedPath)
+	if err == nil || os.IsNotExist(err) {
+		return nil
+	}
+	return err
+}
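
Pin relies on RENAME_NOREPLACE so re-pinning never silently clobbers an existing pin. A minimal, Linux-only sketch of that move-without-overwrite idiom via golang.org/x/sys/unix; the bpffs paths are hypothetical:

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// moveNoReplace fails with EEXIST instead of replacing newPath,
// unlike plain os.Rename.
func moveNoReplace(oldPath, newPath string) error {
	return unix.Renameat2(unix.AT_FDCWD, oldPath, unix.AT_FDCWD, newPath, unix.RENAME_NOREPLACE)
}

func main() {
	err := moveNoReplace("/sys/fs/bpf/old_pin", "/sys/fs/bpf/new_pin")
	fmt.Println(err) // EEXIST if new_pin already exists
}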

+ 10 - 5
vendor/github.com/cilium/ebpf/internal/ptr.go

@@ -1,6 +1,10 @@
 package internal
 
-import "unsafe"
+import (
+	"unsafe"
+
+	"github.com/cilium/ebpf/internal/unix"
+)
 
 // NewPointer creates a 64-bit pointer from an unsafe Pointer.
 func NewPointer(ptr unsafe.Pointer) Pointer {
@@ -22,9 +26,10 @@ func NewStringPointer(str string) Pointer {
 		return Pointer{}
 	}
 
-	// The kernel expects strings to be zero terminated
-	buf := make([]byte, len(str)+1)
-	copy(buf, str)
+	p, err := unix.BytePtrFromString(str)
+	if err != nil {
+		return Pointer{}
+	}
 
-	return Pointer{ptr: unsafe.Pointer(&buf[0])}
+	return Pointer{ptr: unsafe.Pointer(p)}
 }

+ 43 - 2
vendor/github.com/cilium/ebpf/internal/syscall.go

@@ -91,6 +91,19 @@ func BPFProgDetach(attr *BPFProgDetachAttr) error {
 	return err
 }
 
+type BPFEnableStatsAttr struct {
+	StatsType uint32
+}
+
+func BPFEnableStats(attr *BPFEnableStatsAttr) (*FD, error) {
+	ptr, err := BPF(BPF_ENABLE_STATS, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
+	if err != nil {
+		return nil, fmt.Errorf("enable stats: %w", err)
+	}
+	return NewFD(uint32(ptr)), nil
+
+}
+
 type bpfObjAttr struct {
 	fileName  Pointer
 	fd        uint32
@@ -127,9 +140,10 @@ func BPFObjPin(fileName string, fd *FD) error {
 }
 
 // BPFObjGet wraps BPF_OBJ_GET.
-func BPFObjGet(fileName string) (*FD, error) {
+func BPFObjGet(fileName string, flags uint32) (*FD, error) {
 	attr := bpfObjAttr{
-		fileName: NewStringPointer(fileName),
+		fileName:  NewStringPointer(fileName),
+		fileFlags: flags,
 	}
 	ptr, err := BPF(BPF_OBJ_GET, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
 	if err != nil {
@@ -137,3 +151,30 @@ func BPFObjGet(fileName string) (*FD, error) {
 	}
 	return NewFD(uint32(ptr)), nil
 }
+
+type bpfObjGetInfoByFDAttr struct {
+	fd      uint32
+	infoLen uint32
+	info    Pointer
+}
+
+// BPFObjGetInfoByFD wraps BPF_OBJ_GET_INFO_BY_FD.
+//
+// Available from 4.13.
+func BPFObjGetInfoByFD(fd *FD, info unsafe.Pointer, size uintptr) error {
+	value, err := fd.Value()
+	if err != nil {
+		return err
+	}
+
+	attr := bpfObjGetInfoByFDAttr{
+		fd:      value,
+		infoLen: uint32(size),
+		info:    NewPointer(info),
+	}
+	_, err = BPF(BPF_OBJ_GET_INFO_BY_FD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
+	if err != nil {
+		return fmt.Errorf("fd %v: %w", fd, err)
+	}
+	return nil
+}
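
All of these wrappers funnel into the bpf(2) syscall: a command number plus a pointer to a command-specific attribute struct and its size. A raw, Linux-only sketch of BPF_OBJ_GET, assuming command number 7 from the kernel's enum bpf_cmd and a hypothetical pin path:

package main

import (
	"fmt"
	"unsafe"

	"golang.org/x/sys/unix"
)

const bpfObjGetCmd = 7 // BPF_OBJ_GET in the kernel's enum bpf_cmd

// objGetAttr mirrors the OBJ_GET member of union bpf_attr:
// a 64-bit path pointer followed by bpf_fd and file_flags.
type objGetAttr struct {
	pathname  uint64
	bpfFD     uint32
	fileFlags uint32
}

func objGet(path string) (uintptr, error) {
	p, err := unix.BytePtrFromString(path)
	if err != nil {
		return 0, err
	}
	attr := objGetAttr{pathname: uint64(uintptr(unsafe.Pointer(p)))}
	fd, _, errno := unix.Syscall(unix.SYS_BPF, bpfObjGetCmd,
		uintptr(unsafe.Pointer(&attr)), unsafe.Sizeof(attr))
	if errno != 0 {
		return 0, errno
	}
	return fd, nil
}

func main() {
	fd, err := objGet("/sys/fs/bpf/my_map") // hypothetical pin path
	fmt.Println(fd, err)
}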

+ 61 - 10
vendor/github.com/cilium/ebpf/internal/unix/types_linux.go

@@ -3,24 +3,34 @@
 package unix
 
 import (
+	"bytes"
 	"syscall"
 
 	linux "golang.org/x/sys/unix"
 )
 
 const (
-	ENOENT                   = linux.ENOENT
-	EEXIST                   = linux.EEXIST
-	EAGAIN                   = linux.EAGAIN
-	ENOSPC                   = linux.ENOSPC
-	EINVAL                   = linux.EINVAL
-	EPOLLIN                  = linux.EPOLLIN
-	EINTR                    = linux.EINTR
-	EPERM                    = linux.EPERM
-	ESRCH                    = linux.ESRCH
-	ENODEV                   = linux.ENODEV
+	ENOENT  = linux.ENOENT
+	EEXIST  = linux.EEXIST
+	EAGAIN  = linux.EAGAIN
+	ENOSPC  = linux.ENOSPC
+	EINVAL  = linux.EINVAL
+	EPOLLIN = linux.EPOLLIN
+	EINTR   = linux.EINTR
+	EPERM   = linux.EPERM
+	ESRCH   = linux.ESRCH
+	ENODEV  = linux.ENODEV
+	// ENOTSUPP is not the same as ENOTSUP or EOPNOTSUP
+	ENOTSUPP = syscall.Errno(0x20c)
+
+	EBADF                    = linux.EBADF
+	BPF_F_NO_PREALLOC        = linux.BPF_F_NO_PREALLOC
+	BPF_F_NUMA_NODE          = linux.BPF_F_NUMA_NODE
+	BPF_F_RDONLY             = linux.BPF_F_RDONLY
+	BPF_F_WRONLY             = linux.BPF_F_WRONLY
 	BPF_F_RDONLY_PROG        = linux.BPF_F_RDONLY_PROG
 	BPF_F_WRONLY_PROG        = linux.BPF_F_WRONLY_PROG
+	BPF_F_SLEEPABLE          = linux.BPF_F_SLEEPABLE
 	BPF_OBJ_NAME_LEN         = linux.BPF_OBJ_NAME_LEN
 	BPF_TAG_SIZE             = linux.BPF_TAG_SIZE
 	SYS_BPF                  = linux.SYS_BPF
@@ -33,12 +43,21 @@ const (
 	PROT_WRITE               = linux.PROT_WRITE
 	MAP_SHARED               = linux.MAP_SHARED
 	PERF_TYPE_SOFTWARE       = linux.PERF_TYPE_SOFTWARE
+	PERF_TYPE_TRACEPOINT     = linux.PERF_TYPE_TRACEPOINT
 	PERF_COUNT_SW_BPF_OUTPUT = linux.PERF_COUNT_SW_BPF_OUTPUT
+	PERF_EVENT_IOC_DISABLE   = linux.PERF_EVENT_IOC_DISABLE
+	PERF_EVENT_IOC_ENABLE    = linux.PERF_EVENT_IOC_ENABLE
+	PERF_EVENT_IOC_SET_BPF   = linux.PERF_EVENT_IOC_SET_BPF
 	PerfBitWatermark         = linux.PerfBitWatermark
 	PERF_SAMPLE_RAW          = linux.PERF_SAMPLE_RAW
 	PERF_FLAG_FD_CLOEXEC     = linux.PERF_FLAG_FD_CLOEXEC
 	RLIM_INFINITY            = linux.RLIM_INFINITY
 	RLIMIT_MEMLOCK           = linux.RLIMIT_MEMLOCK
+	BPF_STATS_RUN_TIME       = linux.BPF_STATS_RUN_TIME
+	PERF_RECORD_LOST         = linux.PERF_RECORD_LOST
+	PERF_RECORD_SAMPLE       = linux.PERF_RECORD_SAMPLE
+	AT_FDCWD                 = linux.AT_FDCWD
+	RENAME_NOREPLACE         = linux.RENAME_NOREPLACE
 )
 
 // Statfs_t is a wrapper
@@ -62,6 +81,11 @@ func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
 	return linux.FcntlInt(fd, cmd, arg)
 }
 
+// IoctlSetInt is a wrapper
+func IoctlSetInt(fd int, req uint, value int) error {
+	return linux.IoctlSetInt(fd, req, value)
+}
+
 // Statfs is a wrapper
 func Statfs(path string, buf *Statfs_t) (err error) {
 	return linux.Statfs(path, buf)
@@ -148,3 +172,30 @@ func Gettid() int {
 func Tgkill(tgid int, tid int, sig syscall.Signal) (err error) {
 	return linux.Tgkill(tgid, tid, sig)
 }
 }
+
+// BytePtrFromString is a wrapper
+func BytePtrFromString(s string) (*byte, error) {
+	return linux.BytePtrFromString(s)
+}
+
+// ByteSliceToString is a wrapper
+func ByteSliceToString(s []byte) string {
+	return linux.ByteSliceToString(s)
+}
+
+// Renameat2 is a wrapper
+func Renameat2(olddirfd int, oldpath string, newdirfd int, newpath string, flags uint) error {
+	return linux.Renameat2(olddirfd, oldpath, newdirfd, newpath, flags)
+}
+
+func KernelRelease() (string, error) {
+	var uname Utsname
+	err := Uname(&uname)
+	if err != nil {
+		return "", err
+	}
+
+	end := bytes.IndexByte(uname.Release[:], 0)
+	release := string(uname.Release[:end])
+	return release, nil
+}

+ 52 - 9
vendor/github.com/cilium/ebpf/internal/unix/types_other.go

@@ -11,17 +11,26 @@ import (
 var errNonLinux = fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH)
 
 const (
-	ENOENT                   = syscall.ENOENT
-	EEXIST                   = syscall.EEXIST
-	EAGAIN                   = syscall.EAGAIN
-	ENOSPC                   = syscall.ENOSPC
-	EINVAL                   = syscall.EINVAL
-	EINTR                    = syscall.EINTR
-	EPERM                    = syscall.EPERM
-	ESRCH                    = syscall.ESRCH
-	ENODEV                   = syscall.ENODEV
+	ENOENT = syscall.ENOENT
+	EEXIST = syscall.EEXIST
+	EAGAIN = syscall.EAGAIN
+	ENOSPC = syscall.ENOSPC
+	EINVAL = syscall.EINVAL
+	EINTR  = syscall.EINTR
+	EPERM  = syscall.EPERM
+	ESRCH  = syscall.ESRCH
+	ENODEV = syscall.ENODEV
+	EBADF  = syscall.Errno(0)
+	// ENOTSUPP is not the same as ENOTSUP or EOPNOTSUP
+	ENOTSUPP = syscall.Errno(0x20c)
+
+	BPF_F_NO_PREALLOC        = 0
+	BPF_F_NUMA_NODE          = 0
+	BPF_F_RDONLY             = 0
+	BPF_F_WRONLY             = 0
 	BPF_F_RDONLY_PROG        = 0
 	BPF_F_WRONLY_PROG        = 0
+	BPF_F_SLEEPABLE          = 0
 	BPF_OBJ_NAME_LEN         = 0x10
 	BPF_TAG_SIZE             = 0x8
 	SYS_BPF                  = 321
@@ -35,12 +44,21 @@ const (
 	PROT_WRITE               = 0x2
 	MAP_SHARED               = 0x1
 	PERF_TYPE_SOFTWARE       = 0x1
+	PERF_TYPE_TRACEPOINT     = 0
 	PERF_COUNT_SW_BPF_OUTPUT = 0xa
+	PERF_EVENT_IOC_DISABLE   = 0
+	PERF_EVENT_IOC_ENABLE    = 0
+	PERF_EVENT_IOC_SET_BPF   = 0
 	PerfBitWatermark         = 0x4000
 	PERF_SAMPLE_RAW          = 0x400
 	PERF_FLAG_FD_CLOEXEC     = 0x8
 	RLIM_INFINITY            = 0x7fffffffffffffff
 	RLIMIT_MEMLOCK           = 8
+	BPF_STATS_RUN_TIME       = 0
+	PERF_RECORD_LOST         = 2
+	PERF_RECORD_SAMPLE       = 9
+	AT_FDCWD                 = -0x2
+	RENAME_NOREPLACE         = 0x1
 )
 
 // Statfs_t is a wrapper
@@ -80,6 +98,11 @@ func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
 	return -1, errNonLinux
 }
 
+// IoctlSetInt is a wrapper
+func IoctlSetInt(fd int, req uint, value int) error {
+	return errNonLinux
+}
+
 // Statfs is a wrapper
 func Statfs(path string, buf *Statfs_t) error {
 	return errNonLinux
@@ -194,6 +217,7 @@ func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int
 // Utsname is a wrapper
 type Utsname struct {
 	Release [65]byte
+	Version [65]byte
 }
 
 // Uname is a wrapper
@@ -215,3 +239,22 @@ func Gettid() int {
 func Tgkill(tgid int, tid int, sig syscall.Signal) (err error) {
 	return errNonLinux
 }
 }
+
+// BytePtrFromString is a wrapper
+func BytePtrFromString(s string) (*byte, error) {
+	return nil, errNonLinux
+}
+
+// ByteSliceToString is a wrapper
+func ByteSliceToString(s []byte) string {
+	return ""
+}
+
+// Renameat2 is a wrapper
+func Renameat2(olddirfd int, oldpath string, newdirfd int, newpath string, flags uint) error {
+	return errNonLinux
+}
+
+func KernelRelease() (string, error) {
+	return "", errNonLinux
+}

+ 163 - 0
vendor/github.com/cilium/ebpf/internal/version.go

@@ -0,0 +1,163 @@
+package internal
+
+import (
+	"fmt"
+	"io/ioutil"
+	"regexp"
+	"sync"
+
+	"github.com/cilium/ebpf/internal/unix"
+)
+
+const (
+	// Version constant used in ELF binaries indicating that the loader needs to
+	// substitute the eBPF program's version with the value of the kernel's
+	// KERNEL_VERSION compile-time macro. Used for compatibility with BCC, gobpf
+	// and RedSift.
+	MagicKernelVersion = 0xFFFFFFFE
+)
+
+var (
+	// Match between one and three decimals separated by dots, with the last
+	// segment (patch level) being optional on some kernels.
+	// The x.y.z string must appear at the start of a string or right after
+	// whitespace to prevent sequences like 'x.y.z-a.b.c' from matching 'a.b.c'.
+	rgxKernelVersion = regexp.MustCompile(`(?:\A|\s)\d{1,3}\.\d{1,3}(?:\.\d{1,3})?`)
+
+	kernelVersion = struct {
+		once    sync.Once
+		version Version
+		err     error
+	}{}
+)
+
+// A Version in the form Major.Minor.Patch.
+type Version [3]uint16
+
+// NewVersion creates a version from a string like "Major.Minor.Patch".
+//
+// Patch is optional.
+func NewVersion(ver string) (Version, error) {
+	var major, minor, patch uint16
+	n, _ := fmt.Sscanf(ver, "%d.%d.%d", &major, &minor, &patch)
+	if n < 2 {
+		return Version{}, fmt.Errorf("invalid version: %s", ver)
+	}
+	return Version{major, minor, patch}, nil
+}
+
+func (v Version) String() string {
+	if v[2] == 0 {
+		return fmt.Sprintf("v%d.%d", v[0], v[1])
+	}
+	return fmt.Sprintf("v%d.%d.%d", v[0], v[1], v[2])
+}
+
+// Less returns true if the version is less than another version.
+func (v Version) Less(other Version) bool {
+	for i, a := range v {
+		if a == other[i] {
+			continue
+		}
+		return a < other[i]
+	}
+	return false
+}
+
+// Unspecified returns true if the version is all zero.
+func (v Version) Unspecified() bool {
+	return v[0] == 0 && v[1] == 0 && v[2] == 0
+}
+
+// Kernel implements the kernel's KERNEL_VERSION macro from linux/version.h.
+// It represents the kernel version and patch level as a single value.
+func (v Version) Kernel() uint32 {
+
+	// Kernels 4.4 and 4.9 have their SUBLEVEL clamped to 255 to avoid
+	// overflowing into PATCHLEVEL.
+	// See kernel commit 9b82f13e7ef3 ("kbuild: clamp SUBLEVEL to 255").
+	s := v[2]
+	if s > 255 {
+		s = 255
+	}
+
+	// Truncate members to uint8 to prevent them from spilling over into
+	// each other when overflowing 8 bits.
+	return uint32(uint8(v[0]))<<16 | uint32(uint8(v[1]))<<8 | uint32(uint8(s))
+}
+
+// KernelVersion returns the version of the currently running kernel.
+func KernelVersion() (Version, error) {
+	kernelVersion.once.Do(func() {
+		kernelVersion.version, kernelVersion.err = detectKernelVersion()
+	})
+
+	if kernelVersion.err != nil {
+		return Version{}, kernelVersion.err
+	}
+	return kernelVersion.version, nil
+}
+
+// detectKernelVersion returns the version of the running kernel. It scans the
+// following sources in order: /proc/version_signature, uname -v, uname -r.
+// In each of those locations, the last-appearing x.y(.z) value is selected
+// for parsing. The first location that yields a usable version number is
+// returned.
+func detectKernelVersion() (Version, error) {
+
+	// Try reading /proc/version_signature for Ubuntu compatibility.
+	// Example format: Ubuntu 4.15.0-91.92-generic 4.15.18
+	// This method exists in the kernel itself, see d18acd15c
+	// ("perf tools: Fix kernel version error in ubuntu").
+	if pvs, err := ioutil.ReadFile("/proc/version_signature"); err == nil {
+		// If /proc/version_signature exists, failing to parse it is an error.
+		// It only exists on Ubuntu, where the real patch level is not obtainable
+		// through any other method.
+		v, err := findKernelVersion(string(pvs))
+		if err != nil {
+			return Version{}, err
+		}
+		return v, nil
+	}
+
+	var uname unix.Utsname
+	if err := unix.Uname(&uname); err != nil {
+		return Version{}, fmt.Errorf("calling uname: %w", err)
+	}
+
+	// Debian puts the version including the patch level in uname.Version.
+	// It is not an error if there's no version number in uname.Version,
+	// as most distributions don't use it. Parsing can continue on uname.Release.
+	// Example format: #1 SMP Debian 4.19.37-5+deb10u2 (2019-08-08)
+	if v, err := findKernelVersion(unix.ByteSliceToString(uname.Version[:])); err == nil {
+		return v, nil
+	}
+
+	// Most other distributions have the full kernel version including patch
+	// level in uname.Release.
+	// Example format: 4.19.0-5-amd64, 5.5.10-arch1-1
+	v, err := findKernelVersion(unix.ByteSliceToString(uname.Release[:]))
+	if err != nil {
+		return Version{}, err
+	}
+
+	return v, nil
+}
+
+// findKernelVersion matches s against rgxKernelVersion and parses the result
+// into a Version. If s contains multiple matches, the last entry is selected.
+func findKernelVersion(s string) (Version, error) {
+	m := rgxKernelVersion.FindAllString(s, -1)
+	if m == nil {
+		return Version{}, fmt.Errorf("no kernel version in string: %s", s)
+	}
+	// Pick the last match of the string in case there are multiple.
+	s = m[len(m)-1]
+
+	v, err := NewVersion(s)
+	if err != nil {
+		return Version{}, fmt.Errorf("parsing version string %s: %w", s, err)
+	}
+
+	return v, nil
+}
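
Version.Kernel packs the three components the same way the kernel's KERNEL_VERSION macro does: (major << 16) | (minor << 8) | patch, with the patch level clamped to 255. A tiny self-contained check of that arithmetic:

package main

import "fmt"

// kernelVersion reimplements the packing above for illustration.
func kernelVersion(major, minor, patch uint16) uint32 {
	if patch > 255 {
		patch = 255 // SUBLEVEL clamp, as in kernel commit 9b82f13e7ef3
	}
	return uint32(uint8(major))<<16 | uint32(uint8(minor))<<8 | uint32(uint8(patch))
}

func main() {
	fmt.Printf("%#x\n", kernelVersion(5, 4, 0))   // 0x50400
	fmt.Printf("%#x\n", kernelVersion(4, 9, 300)) // 0x409ff, sublevel clamped
}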

+ 47 - 0
vendor/github.com/cilium/ebpf/linker.go

@@ -84,3 +84,50 @@ func needSection(insns, section asm.Instructions) (bool, error) {
 	// None of the functions in the section are called.
 	return false, nil
 }
+
+func fixupJumpsAndCalls(insns asm.Instructions) error {
+	symbolOffsets := make(map[string]asm.RawInstructionOffset)
+	iter := insns.Iterate()
+	for iter.Next() {
+		ins := iter.Ins
+
+		if ins.Symbol == "" {
+			continue
+		}
+
+		if _, ok := symbolOffsets[ins.Symbol]; ok {
+			return fmt.Errorf("duplicate symbol %s", ins.Symbol)
+		}
+
+		symbolOffsets[ins.Symbol] = iter.Offset
+	}
+
+	iter = insns.Iterate()
+	for iter.Next() {
+		i := iter.Index
+		offset := iter.Offset
+		ins := iter.Ins
+
+		switch {
+		case ins.IsFunctionCall() && ins.Constant == -1:
+			// Rewrite bpf to bpf call
+			callOffset, ok := symbolOffsets[ins.Reference]
+			if !ok {
+				return fmt.Errorf("instruction %d: reference to missing symbol %q", i, ins.Reference)
+			}
+
+			ins.Constant = int64(callOffset - offset - 1)
+
+		case ins.OpCode.Class() == asm.JumpClass && ins.Offset == -1:
+			// Rewrite jump to label
+			jumpOffset, ok := symbolOffsets[ins.Reference]
+			if !ok {
+				return fmt.Errorf("instruction %d: reference to missing symbol %q", i, ins.Reference)
+			}
+
+			ins.Offset = int16(jumpOffset - offset - 1)
+		}
+	}
+
+	return nil
+}
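
The -1 in fixupJumpsAndCalls reflects how BPF encodes branch targets: the offset is relative to the instruction after the branch. A small sanity check of that arithmetic, detached from any real instruction stream:

package main

import "fmt"

// relativeOffset computes the encoded offset for a branch at raw
// offset current targeting raw offset target.
func relativeOffset(target, current uint64) int64 {
	return int64(target) - int64(current) - 1
}

func main() {
	// A call at offset 3 targeting offset 10 encodes +6:
	// the next PC is 4, and 4 + 6 = 10.
	fmt.Println(relativeOffset(10, 3)) // 6
}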

File diff suppressed because it is too large
+ 558 - 151
vendor/github.com/cilium/ebpf/map.go


+ 23 - 8
vendor/github.com/cilium/ebpf/marshalers.go

@@ -13,14 +13,12 @@ import (
 	"github.com/cilium/ebpf/internal"
 )
 
+// marshalPtr converts an arbitrary value into a pointer suitable
+// to be passed to the kernel.
+//
+// As an optimization, it returns the original value if it is an
+// unsafe.Pointer.
 func marshalPtr(data interface{}, length int) (internal.Pointer, error) {
-	if data == nil {
-		if length == 0 {
-			return internal.NewPointer(nil), nil
-		}
-		return internal.Pointer{}, errors.New("can't use nil as key of map")
-	}
-
 	if ptr, ok := data.(unsafe.Pointer); ok {
 		return internal.NewPointer(ptr), nil
 	}
@@ -33,6 +31,13 @@ func marshalPtr(data interface{}, length int) (internal.Pointer, error) {
 	return internal.NewSlicePointer(buf), nil
 }
 
+// marshalBytes converts an arbitrary value into a byte buffer.
+//
+// Prefer using Map.marshalKey and Map.marshalValue if possible, since
+// those have special cases that allow more types to be encoded.
+//
+// Returns an error if the given value isn't representable in exactly
+// length bytes.
 func marshalBytes(data interface{}, length int) (buf []byte, err error) {
 	switch value := data.(type) {
 	case encoding.BinaryMarshaler:
@@ -43,6 +48,8 @@ func marshalBytes(data interface{}, length int) (buf []byte, err error) {
 		buf = value
 	case unsafe.Pointer:
 		err = errors.New("can't marshal from unsafe.Pointer")
+	case Map, *Map, Program, *Program:
+		err = fmt.Errorf("can't marshal %T", value)
 	default:
 		var wr bytes.Buffer
 		err = binary.Write(&wr, internal.NativeEndian, value)
@@ -70,10 +77,16 @@ func makeBuffer(dst interface{}, length int) (internal.Pointer, []byte) {
 	return internal.NewSlicePointer(buf), buf
 }
 
+// unmarshalBytes converts a byte buffer into an arbitrary value.
+//
+// Prefer using Map.unmarshalKey and Map.unmarshalValue if possible, since
+// those have special cases that allow more types to be encoded.
 func unmarshalBytes(data interface{}, buf []byte) error {
 	switch value := data.(type) {
 	case unsafe.Pointer:
-		sh := &reflect.SliceHeader{
+		// This could be solved in Go 1.17 by unsafe.Slice instead. (https://github.com/golang/go/issues/19367)
+		// We could opt for removing unsafe.Pointer support in the lib as well.
+		sh := &reflect.SliceHeader{ //nolint:govet
 			Data: uintptr(value),
 			Len:  len(buf),
 			Cap:  len(buf),
@@ -83,6 +96,8 @@ func unmarshalBytes(data interface{}, buf []byte) error {
 		copy(dst, buf)
 		runtime.KeepAlive(value)
 		return nil
+	case Map, *Map, Program, *Program:
+		return fmt.Errorf("can't unmarshal into %T", value)
 	case encoding.BinaryUnmarshaler:
 		return value.UnmarshalBinary(buf)
 	case *string:
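
The default branches of marshalBytes and unmarshalBytes lean on encoding/binary for fixed-size values, and the result has to match the map's key or value size exactly. A sketch of that contract; binary.LittleEndian stands in here for the library's NativeEndian:

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// marshal encodes a fixed-size value and enforces the exact-length
// requirement that BPF map keys and values carry.
func marshal(v interface{}, length int) ([]byte, error) {
	var buf bytes.Buffer
	if err := binary.Write(&buf, binary.LittleEndian, v); err != nil {
		return nil, err
	}
	if buf.Len() != length {
		return nil, fmt.Errorf("got %d bytes, want %d", buf.Len(), length)
	}
	return buf.Bytes(), nil
}

func main() {
	b, err := marshal(uint32(42), 4) // a 4-byte map key
	fmt.Println(b, err)
}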

+ 240 - 147
vendor/github.com/cilium/ebpf/prog.go

@@ -6,6 +6,7 @@ import (
 	"errors"
 	"fmt"
 	"math"
+	"path/filepath"
 	"strings"
 	"time"
 
@@ -18,7 +19,7 @@ import (
 // ErrNotSupported is returned whenever the kernel doesn't support a feature.
 var ErrNotSupported = internal.ErrNotSupported
 
-// ProgramID represents the unique ID of an eBPF program
+// ProgramID represents the unique ID of an eBPF program.
 type ProgramID uint32
 
 const (
@@ -42,7 +43,7 @@ type ProgramOptions struct {
 	LogSize int
 }
 
-// ProgramSpec defines a Program
+// ProgramSpec defines a Program.
 type ProgramSpec struct {
 	// Name is passed to the kernel as a debug aid. Must only contain
 	// alpha numeric and '_' characters.
@@ -54,16 +55,19 @@ type ProgramSpec struct {
 	// depends on Type and AttachType.
 	AttachTo     string
 	Instructions asm.Instructions
-
+	// Flags is passed to the kernel and specifies additional program
+	// load attributes.
+	Flags uint32
 	// License of the program. Some helpers are only available if
 	// the license is deemed compatible with the GPL.
 	//
 	// See https://www.kernel.org/doc/html/latest/process/license-rules.html#id1
 	License string
 
-	// Version used by tracing programs.
+	// Version used by Kprobe programs.
 	//
-	// Deprecated: superseded by BTF.
+	// Deprecated on kernels 5.0 and later. Leave empty to let the library
+	// detect this value automatically.
 	KernelVersion uint32
 
 	// The BTF associated with this program. Changing Instructions
@@ -87,6 +91,13 @@ func (ps *ProgramSpec) Copy() *ProgramSpec {
 	return &cpy
 }
 
+// Tag calculates the kernel tag for a series of instructions.
+//
+// Use asm.Instructions.Tag if you need to calculate for non-native endianness.
+func (ps *ProgramSpec) Tag() (string, error) {
+	return ps.Instructions.Tag(internal.NativeEndian)
+}
+
 // Program represents BPF program loaded into the kernel.
 //
 // It is not safe to close a Program which is used by other goroutines.
@@ -97,8 +108,8 @@ type Program struct {
 
 	fd         *internal.FD
 	name       string
-	abi        ProgramABI
-	attachType AttachType
+	pinnedPath string
+	typ        ProgramType
 }
 
 // NewProgram creates a new Program.
@@ -114,24 +125,112 @@ func NewProgram(spec *ProgramSpec) (*Program, error) {
 // Loading a program for the first time will perform
 // feature detection by loading small, temporary programs.
 func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) {
-	if spec.BTF == nil {
-		return newProgramWithBTF(spec, nil, opts)
+	btfs := make(btfHandleCache)
+	defer btfs.close()
+
+	return newProgramWithOptions(spec, opts, btfs)
+}
+
+func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, btfs btfHandleCache) (*Program, error) {
+	if len(spec.Instructions) == 0 {
+		return nil, errors.New("Instructions cannot be empty")
+	}
+
+	if len(spec.License) == 0 {
+		return nil, errors.New("License cannot be empty")
 	}
 
-	handle, err := btf.NewHandle(btf.ProgramSpec(spec.BTF))
-	if err != nil && !errors.Is(err, btf.ErrNotSupported) {
-		return nil, fmt.Errorf("can't load BTF: %w", err)
+	if spec.ByteOrder != nil && spec.ByteOrder != internal.NativeEndian {
+		return nil, fmt.Errorf("can't load %s program on %s", spec.ByteOrder, internal.NativeEndian)
 	}
 
-	return newProgramWithBTF(spec, handle, opts)
-}
+	// Kernels before 5.0 (6c4fc209fcf9 "bpf: remove useless version check for prog load")
+	// require the version field to be set to the value of the KERNEL_VERSION
+	// macro for kprobe-type programs.
+	// Overwrite Kprobe program version if set to zero or the magic version constant.
+	kv := spec.KernelVersion
+	if spec.Type == Kprobe && (kv == 0 || kv == internal.MagicKernelVersion) {
+		v, err := internal.KernelVersion()
+		if err != nil {
+			return nil, fmt.Errorf("detecting kernel version: %w", err)
+		}
+		kv = v.Kernel()
+	}
+
+	insns := make(asm.Instructions, len(spec.Instructions))
+	copy(insns, spec.Instructions)
+
+	if err := fixupJumpsAndCalls(insns); err != nil {
+		return nil, err
+	}
 
-func newProgramWithBTF(spec *ProgramSpec, btf *btf.Handle, opts ProgramOptions) (*Program, error) {
-	attr, err := convertProgramSpec(spec, btf)
+	buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize))
+	err := insns.Marshal(buf, internal.NativeEndian)
 	if err != nil {
 		return nil, err
 	}
 
+	bytecode := buf.Bytes()
+	insCount := uint32(len(bytecode) / asm.InstructionSize)
+	attr := &bpfProgLoadAttr{
+		progType:           spec.Type,
+		progFlags:          spec.Flags,
+		expectedAttachType: spec.AttachType,
+		insCount:           insCount,
+		instructions:       internal.NewSlicePointer(bytecode),
+		license:            internal.NewStringPointer(spec.License),
+		kernelVersion:      kv,
+	}
+
+	if haveObjName() == nil {
+		attr.progName = newBPFObjName(spec.Name)
+	}
+
+	var btfDisabled bool
+	if spec.BTF != nil {
+		if relos, err := btf.ProgramRelocations(spec.BTF, nil); err != nil {
+			return nil, fmt.Errorf("CO-RE relocations: %s", err)
+		} else if len(relos) > 0 {
+			return nil, fmt.Errorf("applying CO-RE relocations: %w", ErrNotSupported)
+		}
+
+		handle, err := btfs.load(btf.ProgramSpec(spec.BTF))
+		btfDisabled = errors.Is(err, btf.ErrNotSupported)
+		if err != nil && !btfDisabled {
+			return nil, fmt.Errorf("load BTF: %w", err)
+		}
+
+		if handle != nil {
+			attr.progBTFFd = uint32(handle.FD())
+
+			recSize, bytes, err := btf.ProgramLineInfos(spec.BTF)
+			if err != nil {
+				return nil, fmt.Errorf("get BTF line infos: %w", err)
+			}
+			attr.lineInfoRecSize = recSize
+			attr.lineInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize))
+			attr.lineInfo = internal.NewSlicePointer(bytes)
+
+			recSize, bytes, err = btf.ProgramFuncInfos(spec.BTF)
+			if err != nil {
+				return nil, fmt.Errorf("get BTF function infos: %w", err)
+			}
+			attr.funcInfoRecSize = recSize
+			attr.funcInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize))
+			attr.funcInfo = internal.NewSlicePointer(bytes)
+		}
+	}
+
+	if spec.AttachTo != "" {
+		target, err := resolveBTFType(spec.AttachTo, spec.Type, spec.AttachType)
+		if err != nil {
+			return nil, err
+		}
+		if target != nil {
+			attr.attachBTFID = target.ID()
+		}
+	}
+
 	logSize := DefaultVerifierLogSize
 	if opts.LogSize > 0 {
 		logSize = opts.LogSize
 
 
 	fd, err := bpfProgLoad(attr)
 	fd, err := bpfProgLoad(attr)
 	if err == nil {
 	if err == nil {
-		prog := newProgram(fd, spec.Name, &ProgramABI{spec.Type})
-		prog.VerifierLog = internal.CString(logBuf)
-		return prog, nil
+		return &Program{internal.CString(logBuf), fd, spec.Name, "", spec.Type}, nil
 	}
 	}
 
 
 	logErr := err
 	logErr := err
@@ -163,115 +260,71 @@ func newProgramWithBTF(spec *ProgramSpec, btf *btf.Handle, opts ProgramOptions)
 		_, logErr = bpfProgLoad(attr)
 		_, logErr = bpfProgLoad(attr)
 	}
 	}
 
 
+	if errors.Is(logErr, unix.EPERM) && logBuf[0] == 0 {
+		// EPERM due to RLIMIT_MEMLOCK happens before the verifier, so we can
+		// check that the log is empty to reduce false positives.
+		return nil, fmt.Errorf("load program: RLIMIT_MEMLOCK may be too low: %w", logErr)
+	}
+
 	err = internal.ErrorWithLog(err, logBuf, logErr)
 	err = internal.ErrorWithLog(err, logBuf, logErr)
-	return nil, fmt.Errorf("can't load program: %w", err)
+	if btfDisabled {
+		return nil, fmt.Errorf("load program without BTF: %w", err)
+	}
+	return nil, fmt.Errorf("load program: %w", err)
 }
 }
 
 
 // NewProgramFromFD creates a program from a raw fd.
 // NewProgramFromFD creates a program from a raw fd.
 //
 //
 // You should not use fd after calling this function.
 // You should not use fd after calling this function.
 //
 //
-// Requires at least Linux 4.11.
+// Requires at least Linux 4.10.
 func NewProgramFromFD(fd int) (*Program, error) {
 func NewProgramFromFD(fd int) (*Program, error) {
 	if fd < 0 {
 	if fd < 0 {
 		return nil, errors.New("invalid fd")
 		return nil, errors.New("invalid fd")
 	}
 	}
-	bpfFd := internal.NewFD(uint32(fd))
 
 
-	name, abi, err := newProgramABIFromFd(bpfFd)
-	if err != nil {
-		bpfFd.Forget()
-		return nil, err
-	}
-
-	return newProgram(bpfFd, name, abi), nil
-}
-
-func newProgram(fd *internal.FD, name string, abi *ProgramABI) *Program {
-	return &Program{
-		name: name,
-		fd:   fd,
-		abi:  *abi,
-	}
+	return newProgramFromFD(internal.NewFD(uint32(fd)))
 }
 }
 
 
-func convertProgramSpec(spec *ProgramSpec, handle *btf.Handle) (*bpfProgLoadAttr, error) {
-	if len(spec.Instructions) == 0 {
-		return nil, errors.New("Instructions cannot be empty")
-	}
-
-	if len(spec.License) == 0 {
-		return nil, errors.New("License cannot be empty")
-	}
-
-	if spec.ByteOrder != nil && spec.ByteOrder != internal.NativeEndian {
-		return nil, fmt.Errorf("can't load %s program on %s", spec.ByteOrder, internal.NativeEndian)
-	}
-
-	buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize))
-	err := spec.Instructions.Marshal(buf, internal.NativeEndian)
+// NewProgramFromID returns the program for a given id.
+//
+// Returns ErrNotExist, if there is no eBPF program with the given id.
+func NewProgramFromID(id ProgramID) (*Program, error) {
+	fd, err := bpfObjGetFDByID(internal.BPF_PROG_GET_FD_BY_ID, uint32(id))
 	if err != nil {
 	if err != nil {
-		return nil, err
-	}
-
-	bytecode := buf.Bytes()
-	insCount := uint32(len(bytecode) / asm.InstructionSize)
-	attr := &bpfProgLoadAttr{
-		progType:           spec.Type,
-		expectedAttachType: spec.AttachType,
-		insCount:           insCount,
-		instructions:       internal.NewSlicePointer(bytecode),
-		license:            internal.NewStringPointer(spec.License),
-		kernelVersion:      spec.KernelVersion,
-	}
-
-	if haveObjName() == nil {
-		attr.progName = newBPFObjName(spec.Name)
+		return nil, fmt.Errorf("get program by id: %w", err)
 	}
 	}
 
 
-	if handle != nil && spec.BTF != nil {
-		attr.progBTFFd = uint32(handle.FD())
-
-		recSize, bytes, err := btf.ProgramLineInfos(spec.BTF)
-		if err != nil {
-			return nil, fmt.Errorf("can't get BTF line infos: %w", err)
-		}
-		attr.lineInfoRecSize = recSize
-		attr.lineInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize))
-		attr.lineInfo = internal.NewSlicePointer(bytes)
-
-		recSize, bytes, err = btf.ProgramFuncInfos(spec.BTF)
-		if err != nil {
-			return nil, fmt.Errorf("can't get BTF function infos: %w", err)
-		}
-		attr.funcInfoRecSize = recSize
-		attr.funcInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize))
-		attr.funcInfo = internal.NewSlicePointer(bytes)
-	}
+	return newProgramFromFD(fd)
+}
 
 
-	if spec.AttachTo != "" {
-		target, err := resolveBTFType(spec.AttachTo, spec.Type, spec.AttachType)
-		if err != nil {
-			return nil, err
-		}
-		if target != nil {
-			attr.attachBTFID = target.ID()
-		}
+func newProgramFromFD(fd *internal.FD) (*Program, error) {
+	info, err := newProgramInfoFromFd(fd)
+	if err != nil {
+		fd.Close()
+		return nil, fmt.Errorf("discover program type: %w", err)
 	}
 	}
 
 
-	return attr, nil
+	return &Program{"", fd, "", "", info.Type}, nil
 }
 }
 
 
 func (p *Program) String() string {
 func (p *Program) String() string {
 	if p.name != "" {
 	if p.name != "" {
-		return fmt.Sprintf("%s(%s)#%v", p.abi.Type, p.name, p.fd)
+		return fmt.Sprintf("%s(%s)#%v", p.typ, p.name, p.fd)
 	}
 	}
-	return fmt.Sprintf("%s#%v", p.abi.Type, p.fd)
+	return fmt.Sprintf("%s(%v)", p.typ, p.fd)
 }
 }
 
 
-// ABI gets the ABI of the Program
-func (p *Program) ABI() ProgramABI {
-	return p.abi
+// Type returns the underlying type of the program.
+func (p *Program) Type() ProgramType {
+	return p.typ
+}
+
+// Info returns metadata about the program.
+//
+// Requires at least 4.10.
+func (p *Program) Info() (*ProgramInfo, error) {
+	return newProgramInfoFromFd(p.fd)
 }
 }
 
 
 // FD gets the file descriptor of the Program.
 // FD gets the file descriptor of the Program.
@@ -303,19 +356,42 @@ func (p *Program) Clone() (*Program, error) {
 		return nil, fmt.Errorf("can't clone program: %w", err)
 		return nil, fmt.Errorf("can't clone program: %w", err)
 	}
 	}
 
 
-	return newProgram(dup, p.name, &p.abi), nil
+	return &Program{p.VerifierLog, dup, p.name, "", p.typ}, nil
 }
 }
 
 
-// Pin persists the Program past the lifetime of the process that created it
+// Pin persists the Program on the BPF virtual file system past the lifetime of
+// the process that created it
 //
 //
-// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional
+// Calling Pin on a previously pinned program will overwrite the path, except when
+// the new path already exists. Re-pinning across filesystems is not supported.
+//
+// This requires bpffs to be mounted above fileName. See https://docs.cilium.io/en/k8s-doc/admin/#admin-mount-bpffs
 func (p *Program) Pin(fileName string) error {
 func (p *Program) Pin(fileName string) error {
-	if err := internal.BPFObjPin(fileName, p.fd); err != nil {
-		return fmt.Errorf("can't pin program: %w", err)
+	if err := internal.Pin(p.pinnedPath, fileName, p.fd); err != nil {
+		return err
 	}
 	}
+	p.pinnedPath = fileName
 	return nil
 	return nil
 }
 }
 
 
+// Unpin removes the persisted state for the Program from the BPF virtual filesystem.
+//
+// Failed calls to Unpin will not alter the state returned by IsPinned.
+//
+// Unpinning an unpinned Program returns nil.
+func (p *Program) Unpin() error {
+	if err := internal.Unpin(p.pinnedPath); err != nil {
+		return err
+	}
+	p.pinnedPath = ""
+	return nil
+}
+
+// IsPinned returns true if the Program has a non-empty pinned path.
+func (p *Program) IsPinned() bool {
+	return p.pinnedPath != ""
+}
+
 // Close unloads the program from the kernel.
 // Close unloads the program from the kernel.
 func (p *Program) Close() error {
 func (p *Program) Close() error {
 	if p == nil {
 	if p == nil {
@@ -359,7 +435,7 @@ func (p *Program) Benchmark(in []byte, repeat int, reset func()) (uint32, time.D
 	return ret, total, nil
 }
 
-var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() (bool, error) {
+var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() error {
 	prog, err := NewProgram(&ProgramSpec{
 		Type: SocketFilter,
 		Instructions: asm.Instructions{
@@ -370,7 +446,7 @@ var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() (
 	})
 	if err != nil {
 		// This may be because we lack sufficient permissions, etc.
-		return false, err
+		return err
 	}
 	defer prog.Close()
 
@@ -383,10 +459,16 @@ var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() (
 	}
 
 	err = bpfProgTestRun(&attr)
-
-	// Check for EINVAL specifically, rather than err != nil since we
-	// otherwise misdetect due to insufficient permissions.
-	return !errors.Is(err, unix.EINVAL), nil
+	if errors.Is(err, unix.EINVAL) {
+		// Check for EINVAL specifically, rather than err != nil since we
+		// otherwise misdetect due to insufficient permissions.
+		return internal.ErrNotSupported
+	}
+	if errors.Is(err, unix.EINTR) {
+		// We know that PROG_TEST_RUN is supported if we get EINTR.
+		return nil
+	}
+	return err
 })
 
 func (p *Program) testRun(in []byte, repeat int, reset func()) (uint32, []byte, time.Duration, error) {
@@ -465,8 +547,11 @@ func unmarshalProgram(buf []byte) (*Program, error) {
 	return NewProgramFromID(ProgramID(id))
 }
 
-// MarshalBinary implements BinaryMarshaler.
-func (p *Program) MarshalBinary() ([]byte, error) {
+func marshalProgram(p *Program, length int) ([]byte, error) {
+	if length != 4 {
+		return nil, fmt.Errorf("can't marshal program to %d bytes", length)
+	}
+
 	value, err := p.fd.Value()
 	if err != nil {
 		return nil, err
@@ -529,28 +614,28 @@ func (p *Program) Detach(fd int, typ AttachType, flags AttachFlags) error {
 // LoadPinnedProgram loads a Program from a BPF file.
 //
 // Requires at least Linux 4.11.
-func LoadPinnedProgram(fileName string) (*Program, error) {
-	fd, err := internal.BPFObjGet(fileName)
+func LoadPinnedProgram(fileName string, opts *LoadPinOptions) (*Program, error) {
+	fd, err := internal.BPFObjGet(fileName, opts.Marshal())
 	if err != nil {
 		return nil, err
 	}
 
-	name, abi, err := newProgramABIFromFd(fd)
+	info, err := newProgramInfoFromFd(fd)
 	if err != nil {
 		_ = fd.Close()
-		return nil, fmt.Errorf("can't get ABI for %s: %w", fileName, err)
+		return nil, fmt.Errorf("info for %s: %w", fileName, err)
 	}
 
-	return newProgram(fd, name, abi), nil
+	return &Program{"", fd, filepath.Base(fileName), fileName, info.Type}, nil
 }
 
-// SanitizeName replaces all invalid characters in name.
-//
-// Use this to automatically generate valid names for maps and
-// programs at run time.
+// SanitizeName replaces all invalid characters in name with replacement.
+// Passing a negative value for replacement will delete characters instead
+// of replacing them. Use this to automatically generate valid names for maps
+// and programs at runtime.
 //
-// Passing a negative value for replacement will delete characters
-// instead of replacing them.
+// The set of allowed characters depends on the running kernel version.
+// Dots are only allowed as of kernel 5.2.
 func SanitizeName(name string, replacement rune) string {
 	return strings.Map(func(char rune) rune {
 		if invalidBPFObjNameChar(char) {
@@ -568,25 +653,9 @@ func ProgramGetNextID(startID ProgramID) (ProgramID, error) {
 	return ProgramID(id), err
 }
 
-// NewProgramFromID returns the program for a given id.
-//
-// Returns ErrNotExist, if there is no eBPF program with the given id.
-func NewProgramFromID(id ProgramID) (*Program, error) {
-	fd, err := bpfObjGetFDByID(internal.BPF_PROG_GET_FD_BY_ID, uint32(id))
-	if err != nil {
-		return nil, err
-	}
-
-	name, abi, err := newProgramABIFromFd(fd)
-	if err != nil {
-		_ = fd.Close()
-		return nil, err
-	}
-
-	return newProgram(fd, name, abi), nil
-}
-
 // ID returns the systemwide unique ID of the program.
+//
+// Deprecated: use ProgramInfo.ID() instead.
 func (p *Program) ID() (ProgramID, error) {
 	info, err := bpfGetProgInfoByFD(p.fd)
 	if err != nil {
@@ -595,12 +664,16 @@ func (p *Program) ID() (ProgramID, error) {
 	return ProgramID(info.id), nil
 }
 
-func resolveBTFType(name string, progType ProgramType, attachType AttachType) (btf.Type, error) {
+func findKernelType(name string, typ btf.Type) error {
 	kernel, err := btf.LoadKernelSpec()
 	if err != nil {
-		return nil, fmt.Errorf("can't resolve BTF type %s: %w", name, err)
+		return fmt.Errorf("can't load kernel spec: %w", err)
 	}
 
+	return kernel.FindType(name, typ)
+}
+
+func resolveBTFType(name string, progType ProgramType, attachType AttachType) (btf.Type, error) {
 	type match struct {
 		p ProgramType
 		a AttachType
@@ -608,10 +681,30 @@ func resolveBTFType(name string, progType ProgramType, attachType AttachType) (b
 
 	target := match{progType, attachType}
 	switch target {
+	case match{LSM, AttachLSMMac}:
+		var target btf.Func
+		err := findKernelType("bpf_lsm_"+name, &target)
+		if errors.Is(err, btf.ErrNotFound) {
+			return nil, &internal.UnsupportedFeatureError{
+				Name: name + " LSM hook",
+			}
+		}
+		if err != nil {
+			return nil, fmt.Errorf("resolve BTF for LSM hook %s: %w", name, err)
+		}
+
+		return &target, nil
+
	case match{Tracing, AttachTraceIter}:
 		var target btf.Func
-		if err := kernel.FindType("bpf_iter_"+name, &target); err != nil {
-			return nil, fmt.Errorf("can't resolve BTF for iterator %s: %w", name, err)
+		err := findKernelType("bpf_iter_"+name, &target)
+		if errors.Is(err, btf.ErrNotFound) {
+			return nil, &internal.UnsupportedFeatureError{
+				Name: name + " iterator",
+			}
+		}
+		if err != nil {
+			return nil, fmt.Errorf("resolve BTF for iterator %s: %w", name, err)
 		}
 
 		return &target, nil
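
The updated SanitizeName doc is easy to exercise, since the function is part of this package's public API. A usage sketch; note the dot survives only on kernels 5.2 and later, so the output is kernel-dependent:

package main

import (
	"fmt"

	"github.com/cilium/ebpf"
)

func main() {
	// '-' is never a valid BPF object name character and is replaced.
	fmt.Println(ebpf.SanitizeName("my-prog.v2", '_'))
	// Prints "my_prog.v2" on kernels >= 5.2, "my_prog_v2" otherwise.
}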

+ 0 - 25
vendor/github.com/cilium/ebpf/readme.md

@@ -1,25 +0,0 @@
-eBPF
--------
-[![](https://godoc.org/github.com/cilium/ebpf?status.svg)](https://godoc.org/github.com/cilium/ebpf)
-
-eBPF is a pure Go library that provides utilities for loading, compiling, and debugging eBPF programs. It has minimal external dependencies and is intended to be used in long running processes.
-
-[ebpf/asm](https://godoc.org/github.com/cilium/ebpf/asm) contains a basic assembler.
-
-The library is maintained by [Cloudflare](https://www.cloudflare.com) and [Cilium](https://www.cilium.io). Feel free to [join](https://cilium.herokuapp.com/) the [libbpf-go](https://cilium.slack.com/messages/libbpf-go) channel on Slack.
-
-## Current status
-
-The package is production ready, but **the API is explicitly unstable
-right now**. Expect to update your code if you want to follow along.
-
-## Requirements
-
-* A version of Go that is [supported by upstream](https://golang.org/doc/devel/release.html#policy)
-* Linux 4.9, 4.19 or 5.4 (versions in-between should work, but are not tested)
-
-## Useful resources
-
-* [Cilium eBPF documentation](https://cilium.readthedocs.io/en/latest/bpf/#bpf-guide) (recommended)
-* [Linux documentation on BPF](http://elixir.free-electrons.com/linux/latest/source/Documentation/networking/filter.txt)
-* [eBPF features by Linux version](https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md)

+ 142 - 89
vendor/github.com/cilium/ebpf/syscalls.go

@@ -3,7 +3,6 @@ package ebpf
 import (
 	"errors"
 	"fmt"
-	"os"
 	"unsafe"
 
 	"github.com/cilium/ebpf/internal"
@@ -12,9 +11,7 @@ import (
 )
 
 // Generic errors returned by BPF syscalls.
-var (
-	ErrNotExist = errors.New("requested object does not exist")
-)
+var ErrNotExist = errors.New("requested object does not exist")
 
 // bpfObjName is a null-terminated string made up of
 // 'A-Za-z0-9_' characters.
@@ -27,18 +24,20 @@ func newBPFObjName(name string) bpfObjName {
 	return result
 }
 
+// invalidBPFObjNameChar returns true if char may not appear in
+// a BPF object name.
 func invalidBPFObjNameChar(char rune) bool {
 	dotAllowed := objNameAllowsDot() == nil
 
 	switch {
 	case char >= 'A' && char <= 'Z':
-		fallthrough
+		return false
 	case char >= 'a' && char <= 'z':
-		fallthrough
+		return false
 	case char >= '0' && char <= '9':
-		fallthrough
+		return false
 	case dotAllowed && char == '.':
-		fallthrough
+		return false
 	case char == '_':
 		return false
 	default:
@@ -69,14 +68,32 @@ type bpfMapOpAttr struct {
 	flags   uint64
 }
 
+type bpfBatchMapOpAttr struct {
+	inBatch   internal.Pointer
+	outBatch  internal.Pointer
+	keys      internal.Pointer
+	values    internal.Pointer
+	count     uint32
+	mapFd     uint32
+	elemFlags uint64
+	flags     uint64
+}
+
 type bpfMapInfo struct {
-	mapType    uint32
-	id         uint32
-	keySize    uint32
-	valueSize  uint32
-	maxEntries uint32
-	flags      uint32
-	mapName    bpfObjName // since 4.15 ad5b177bd73f
+	map_type                  uint32 // since 4.12 1e2709769086
+	id                        uint32
+	key_size                  uint32
+	value_size                uint32
+	max_entries               uint32
+	map_flags                 uint32
+	name                      bpfObjName // since 4.15 ad5b177bd73f
+	ifindex                   uint32     // since 4.16 52775b33bb50
+	btf_vmlinux_value_type_id uint32     // since 5.6  85d33df357b6
+	netns_dev                 uint64     // since 4.16 52775b33bb50
+	netns_ino                 uint64
+	btf_id                    uint32 // since 4.18 78958fca7ead
+	btf_key_type_id           uint32 // since 4.18 9b2cf328b2ec
+	btf_value_type_id         uint32
 }
 
 type bpfProgLoadAttr struct {
@@ -104,18 +121,40 @@ type bpfProgLoadAttr struct {
 }
 }
 
 
 type bpfProgInfo struct {
 type bpfProgInfo struct {
-	progType     uint32
-	id           uint32
-	tag          [unix.BPF_TAG_SIZE]byte
-	jitedLen     uint32
-	xlatedLen    uint32
-	jited        internal.Pointer
-	xlated       internal.Pointer
-	loadTime     uint64 // since 4.15 cb4d2b3f03d8
-	createdByUID uint32
-	nrMapIDs     uint32
-	mapIds       internal.Pointer
-	name         bpfObjName
+	prog_type                uint32
+	id                       uint32
+	tag                      [unix.BPF_TAG_SIZE]byte
+	jited_prog_len           uint32
+	xlated_prog_len          uint32
+	jited_prog_insns         internal.Pointer
+	xlated_prog_insns        internal.Pointer
+	load_time                uint64 // since 4.15 cb4d2b3f03d8
+	created_by_uid           uint32
+	nr_map_ids               uint32
+	map_ids                  internal.Pointer
+	name                     bpfObjName // since 4.15 067cae47771c
+	ifindex                  uint32
+	gpl_compatible           uint32
+	netns_dev                uint64
+	netns_ino                uint64
+	nr_jited_ksyms           uint32
+	nr_jited_func_lens       uint32
+	jited_ksyms              internal.Pointer
+	jited_func_lens          internal.Pointer
+	btf_id                   uint32
+	func_info_rec_size       uint32
+	func_info                internal.Pointer
+	nr_func_info             uint32
+	nr_line_info             uint32
+	line_info                internal.Pointer
+	jited_line_info          internal.Pointer
+	nr_jited_line_info       uint32
+	line_info_rec_size       uint32
+	jited_line_info_rec_size uint32
+	nr_prog_tags             uint32
+	prog_tags                internal.Pointer
+	run_time_ns              uint64
+	run_cnt                  uint64
 }

 type bpfProgTestRunAttr struct {
@@ -129,12 +168,6 @@ type bpfProgTestRunAttr struct {
 	duration    uint32
 }

-type bpfObjGetInfoByFDAttr struct {
-	fd      uint32
-	infoLen uint32
-	info    internal.Pointer // May be either bpfMapInfo or bpfProgInfo
-}
-
 type bpfGetFDByIDAttr struct {
 	id   uint32
 	next uint32
@@ -174,10 +207,6 @@ func bpfProgTestRun(attr *bpfProgTestRunAttr) error {

 func bpfMapCreate(attr *bpfMapCreateAttr) (*internal.FD, error) {
 	fd, err := internal.BPF(internal.BPF_MAP_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
-	if errors.Is(err, os.ErrPermission) {
-		return nil, errors.New("permission denied or insufficient rlimit to lock memory for map")
-	}
-
 	if err != nil {
 		return nil, err
 	}
@@ -185,35 +214,25 @@ func bpfMapCreate(attr *bpfMapCreateAttr) (*internal.FD, error) {
 	return internal.NewFD(uint32(fd)), nil
 }

-var haveNestedMaps = internal.FeatureTest("nested maps", "4.12", func() (bool, error) {
-	inner, err := bpfMapCreate(&bpfMapCreateAttr{
-		mapType:    Array,
-		keySize:    4,
-		valueSize:  4,
-		maxEntries: 1,
-	})
-	if err != nil {
-		return false, err
-	}
-	defer inner.Close()
-
-	innerFd, _ := inner.Value()
-	nested, err := bpfMapCreate(&bpfMapCreateAttr{
+var haveNestedMaps = internal.FeatureTest("nested maps", "4.12", func() error {
+	_, err := bpfMapCreate(&bpfMapCreateAttr{
 		mapType:    ArrayOfMaps,
 		keySize:    4,
 		valueSize:  4,
 		maxEntries: 1,
-		innerMapFd: innerFd,
+		// Invalid file descriptor.
+		innerMapFd: ^uint32(0),
 	})
-	if err != nil {
-		return false, nil
+	if errors.Is(err, unix.EINVAL) {
+		return internal.ErrNotSupported
 	}
-
-	_ = nested.Close()
-	return true, nil
+	if errors.Is(err, unix.EBADF) {
+		return nil
+	}
+	return err
 })

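Reviewer note: the rewritten probe never builds a real inner map. It hands the kernel a deliberately invalid inner-map fd (`^uint32(0)`) and classifies the errno: EINVAL means the attribute is unknown (no nested-map support), EBADF means the kernel actually tried to use the fd, so the feature exists. A self-contained sketch of that classification (names are illustrative, not part of the library):

```go
package main

import (
	"errors"
	"fmt"

	"golang.org/x/sys/unix"
)

// classifyProbe mirrors the EINVAL/EBADF decision made by haveNestedMaps.
func classifyProbe(err error) (supported bool, probeErr error) {
	switch {
	case errors.Is(err, unix.EINVAL):
		return false, nil // attribute rejected outright: not supported
	case errors.Is(err, unix.EBADF):
		return true, nil // kernel inspected the fd: supported
	default:
		return false, err // anything else is a real failure
	}
}

func main() {
	ok, _ := classifyProbe(unix.EBADF)
	fmt.Println(ok) // true
}
```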
-var haveMapMutabilityModifiers = internal.FeatureTest("read- and write-only maps", "5.2", func() (bool, error) {
+var haveMapMutabilityModifiers = internal.FeatureTest("read- and write-only maps", "5.2", func() error {
 	// This checks BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG. Since
 	// BPF_MAP_FREEZE appeared in 5.2 as well we don't do a separate check.
 	m, err := bpfMapCreate(&bpfMapCreateAttr{
@@ -224,10 +243,10 @@ var haveMapMutabilityModifiers = internal.FeatureTest("read- and write-only maps
 		flags:      unix.BPF_F_RDONLY_PROG,
 	})
 	if err != nil {
-		return false, nil
+		return internal.ErrNotSupported
 	}
 	_ = m.Close()
-	return true, nil
+	return nil
 })

 func bpfMapLookupElem(m *internal.FD, key, valueOut internal.Pointer) error {
@@ -313,6 +332,29 @@ func objGetNextID(cmd internal.BPFCmd, start uint32) (uint32, error) {
 	return attr.nextID, wrapObjError(err)
 }

+func bpfMapBatch(cmd internal.BPFCmd, m *internal.FD, inBatch, outBatch, keys, values internal.Pointer, count uint32, opts *BatchOptions) (uint32, error) {
+	fd, err := m.Value()
+	if err != nil {
+		return 0, err
+	}
+
+	attr := bpfBatchMapOpAttr{
+		inBatch:  inBatch,
+		outBatch: outBatch,
+		keys:     keys,
+		values:   values,
+		count:    count,
+		mapFd:    fd,
+	}
+	if opts != nil {
+		attr.elemFlags = opts.ElemFlags
+		attr.flags = opts.Flags
+	}
+	_, err = internal.BPF(cmd, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
+	// Always return the count even on an error, as operations like update may be partially fulfilled.
+	return attr.count, wrapMapError(err)
+}
+
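For orientation, this internal wrapper backs the public batch methods added in this release. A hedged usage sketch, assuming a 5.6+ kernel and the v0.5-era `Map.BatchUpdate` signature:

```go
package main

import (
	"log"

	"github.com/cilium/ebpf"
)

func main() {
	m, err := ebpf.NewMap(&ebpf.MapSpec{
		Type:       ebpf.Hash,
		KeySize:    4,
		ValueSize:  4,
		MaxEntries: 2,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer m.Close()

	// BatchUpdate funnels into bpfMapBatch with BPF_MAP_UPDATE_BATCH.
	n, err := m.BatchUpdate([]uint32{1, 2}, []uint32{3, 4}, nil)
	if err != nil {
		log.Fatal(err) // wraps ErrNotSupported on pre-5.6 kernels
	}
	log.Printf("updated %d elements", n)
}
```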
 func wrapObjError(err error) error {
 	if err == nil {
 		return nil
@@ -337,7 +379,11 @@ func wrapMapError(err error) error {
 		return ErrKeyExist
 	}

-	return errors.New(err.Error())
+	if errors.Is(err, unix.ENOTSUPP) {
+		return ErrNotSupported
+	}
+
+	return err
 }

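Returning the original error (with known errnos mapped to sentinels) is what makes `errors.Is` usable downstream. A caller-side sketch, assuming the exported `ErrKeyNotExist` sentinel:

```go
package example

import (
	"errors"
	"fmt"

	"github.com/cilium/ebpf"
)

func lookup(m *ebpf.Map, key uint32) {
	var value uint32
	err := m.Lookup(key, &value)
	switch {
	case err == nil:
		fmt.Println("value:", value)
	case errors.Is(err, ebpf.ErrKeyNotExist):
		fmt.Println("key absent") // no error-string matching needed
	default:
		fmt.Println("lookup failed:", err)
	}
}
```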
 func bpfMapFreeze(m *internal.FD) error {
@@ -353,28 +399,9 @@ func bpfMapFreeze(m *internal.FD) error {
 	return err
 }

-func bpfGetObjectInfoByFD(fd *internal.FD, info unsafe.Pointer, size uintptr) error {
-	value, err := fd.Value()
-	if err != nil {
-		return err
-	}
-
-	// available from 4.13
-	attr := bpfObjGetInfoByFDAttr{
-		fd:      value,
-		infoLen: uint32(size),
-		info:    internal.NewPointer(info),
-	}
-	_, err = internal.BPF(internal.BPF_OBJ_GET_INFO_BY_FD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
-	if err != nil {
-		return fmt.Errorf("fd %d: %w", fd, err)
-	}
-	return nil
-}
-
 func bpfGetProgInfoByFD(fd *internal.FD) (*bpfProgInfo, error) {
 	var info bpfProgInfo
-	if err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)); err != nil {
+	if err := internal.BPFObjGetInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)); err != nil {
 		return nil, fmt.Errorf("can't get program info: %w", err)
 	}
 	return &info, nil
@@ -382,14 +409,14 @@ func bpfGetProgInfoByFD(fd *internal.FD) (*bpfProgInfo, error) {

 func bpfGetMapInfoByFD(fd *internal.FD) (*bpfMapInfo, error) {
 	var info bpfMapInfo
-	err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info))
+	err := internal.BPFObjGetInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info))
 	if err != nil {
 		return nil, fmt.Errorf("can't get map info: %w", err)
 	}
 	return &info, nil
 }

-var haveObjName = internal.FeatureTest("object names", "4.15", func() (bool, error) {
+var haveObjName = internal.FeatureTest("object names", "4.15", func() error {
 	attr := bpfMapCreateAttr{
 		mapType:    Array,
 		keySize:    4,
@@ -400,16 +427,16 @@ var haveObjName = internal.FeatureTest("object names", "4.15", func() (bool, err

 	fd, err := bpfMapCreate(&attr)
 	if err != nil {
-		return false, nil
+		return internal.ErrNotSupported
 	}

 	_ = fd.Close()
-	return true, nil
+	return nil
 })

-var objNameAllowsDot = internal.FeatureTest("dot in object names", "5.2", func() (bool, error) {
+var objNameAllowsDot = internal.FeatureTest("dot in object names", "5.2", func() error {
 	if err := haveObjName(); err != nil {
-		return false, err
+		return err
 	}

 	attr := bpfMapCreateAttr{
@@ -422,11 +449,37 @@ var objNameAllowsDot = internal.FeatureTest("dot in object names", "5.2", func()

 	fd, err := bpfMapCreate(&attr)
 	if err != nil {
-		return false, nil
+		return internal.ErrNotSupported
 	}

 	_ = fd.Close()
-	return true, nil
+	return nil
+})
+
+var haveBatchAPI = internal.FeatureTest("map batch api", "5.6", func() error {
+	var maxEntries uint32 = 2
+	attr := bpfMapCreateAttr{
+		mapType:    Hash,
+		keySize:    4,
+		valueSize:  4,
+		maxEntries: maxEntries,
+	}
+
+	fd, err := bpfMapCreate(&attr)
+	if err != nil {
+		return internal.ErrNotSupported
+	}
+	defer fd.Close()
+	keys := []uint32{1, 2}
+	values := []uint32{3, 4}
+	kp, _ := marshalPtr(keys, 8)
+	vp, _ := marshalPtr(values, 8)
+	nilPtr := internal.NewPointer(nil)
+	_, err = bpfMapBatch(internal.BPF_MAP_UPDATE_BATCH, fd, nilPtr, nilPtr, kp, vp, maxEntries, nil)
+	if err != nil {
+		return internal.ErrNotSupported
+	}
+	return nil
 })

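All of these probes lean on `internal.FeatureTest` memoizing its result, so each feature is probed at most once per process. A rough sketch of such a wrapper (illustrative only; the real one also decorates the error with the minimum kernel version):

```go
package example

import "sync"

// featureTest returns a func that runs probe once and caches the outcome.
func featureTest(name, version string, probe func() error) func() error {
	var (
		once   sync.Once
		result error
	)
	return func() error {
		once.Do(func() { result = probe() })
		return result
	}
}
```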
 func bpfObjGetFDByID(cmd internal.BPFCmd, id uint32) (*internal.FD, error) {

+ 81 - 33
vendor/github.com/cilium/ebpf/types.go

@@ -1,6 +1,10 @@
 package ebpf

-//go:generate stringer -output types_string.go -type=MapType,ProgramType,AttachType
+import (
+	"github.com/cilium/ebpf/internal/unix"
+)
+
+//go:generate stringer -output types_string.go -type=MapType,ProgramType,AttachType,PinType

 // MapType indicates the type map structure
 // that will be initialized in the kernel.
@@ -85,10 +89,19 @@ const (

 // hasPerCPUValue returns true if the Map stores a value per CPU.
 func (mt MapType) hasPerCPUValue() bool {
-	if mt == PerCPUHash || mt == PerCPUArray || mt == LRUCPUHash {
-		return true
-	}
-	return false
+	return mt == PerCPUHash || mt == PerCPUArray || mt == LRUCPUHash
+}
+
+// canStoreMap returns true if the map type accepts a map fd
+// for update and returns a map id for lookup.
+func (mt MapType) canStoreMap() bool {
+	return mt == ArrayOfMaps || mt == HashOfMaps
+}
+
+// canStoreProgram returns true if the map type accepts a program fd
+// for update and returns a program id for lookup.
+func (mt MapType) canStoreProgram() bool {
+	return mt == ProgramArray
 }

 // ProgramType of the eBPF program
@@ -96,60 +109,37 @@ type ProgramType uint32

 // eBPF program types
 const (
-	// Unrecognized program type
 	UnspecifiedProgram ProgramType = iota
-	// SocketFilter socket or seccomp filter
 	SocketFilter
-	// Kprobe program
 	Kprobe
-	// SchedCLS traffic control shaper
 	SchedCLS
-	// SchedACT routing control shaper
 	SchedACT
-	// TracePoint program
 	TracePoint
-	// XDP program
 	XDP
-	// PerfEvent program
 	PerfEvent
-	// CGroupSKB program
 	CGroupSKB
-	// CGroupSock program
 	CGroupSock
-	// LWTIn program
 	LWTIn
-	// LWTOut program
 	LWTOut
-	// LWTXmit program
 	LWTXmit
-	// SockOps program
 	SockOps
-	// SkSKB program
 	SkSKB
-	// CGroupDevice program
 	CGroupDevice
-	// SkMsg program
 	SkMsg
-	// RawTracepoint program
 	RawTracepoint
-	// CGroupSockAddr program
 	CGroupSockAddr
-	// LWTSeg6Local program
 	LWTSeg6Local
-	// LircMode2 program
 	LircMode2
-	// SkReuseport program
 	SkReuseport
-	// FlowDissector program
 	FlowDissector
-	// CGroupSysctl program
 	CGroupSysctl
-	// RawTracepointWritable program
 	RawTracepointWritable
-	// CGroupSockopt program
 	CGroupSockopt
-	// Tracing program
 	Tracing
+	StructOps
+	Extension
+	LSM
+	SkLookup
 )

 // AttachType of the eBPF program, needed to differentiate allowed context accesses in
@@ -157,7 +147,7 @@ const (
 // Will cause invalid argument (EINVAL) at program load time if set incorrectly.
 type AttachType uint32

-// AttachNone is an alias for AttachCGroupInetIngress for readability reasons
+// AttachNone is an alias for AttachCGroupInetIngress for readability reasons.
 const AttachNone AttachType = 0

 const (
@@ -190,7 +180,65 @@ const (
 	AttachModifyReturn
 	AttachLSMMac
 	AttachTraceIter
+	AttachCgroupInet4GetPeername
+	AttachCgroupInet6GetPeername
+	AttachCgroupInet4GetSockname
+	AttachCgroupInet6GetSockname
+	AttachXDPDevMap
+	AttachCgroupInetSockRelease
+	AttachXDPCPUMap
+	AttachSkLookup
+	AttachXDP
 )

 // AttachFlags of the eBPF program used in BPF_PROG_ATTACH command
 type AttachFlags uint32
+
+// PinType determines whether a map is pinned into a BPFFS.
+type PinType int
+
+// Valid pin types.
+//
+// Mirrors enum libbpf_pin_type.
+const (
+	PinNone PinType = iota
+	// Pin an object by using its name as the filename.
+	PinByName
+)
+
+// LoadPinOptions control how a pinned object is loaded.
+type LoadPinOptions struct {
+	// Request a read-only or write-only object. The default is a read-write
+	// object. Only one of the flags may be set.
+	ReadOnly  bool
+	WriteOnly bool
+
+	// Raw flags for the syscall. Other fields of this struct take precedence.
+	Flags uint32
+}
+
+// Marshal returns a value suitable for BPF_OBJ_GET syscall file_flags parameter.
+func (lpo *LoadPinOptions) Marshal() uint32 {
+	if lpo == nil {
+		return 0
+	}
+
+	flags := lpo.Flags
+	if lpo.ReadOnly {
+		flags |= unix.BPF_F_RDONLY
+	}
+	if lpo.WriteOnly {
+		flags |= unix.BPF_F_WRONLY
+	}
+	return flags
+}
+
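A short usage sketch for these options (the pin path is hypothetical, and the `LoadPinnedMap(path, opts)` signature is assumed from this v0.5-era API):

```go
package main

import (
	"log"

	"github.com/cilium/ebpf"
)

func main() {
	// Marshal folds ReadOnly into BPF_F_RDONLY for the BPF_OBJ_GET syscall.
	m, err := ebpf.LoadPinnedMap("/sys/fs/bpf/my_map", &ebpf.LoadPinOptions{
		ReadOnly: true,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer m.Close()
}
```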
+// BatchOptions holds options for batch map operations.
+//
+// Mirrors libbpf struct bpf_map_batch_opts.
+// Currently BPF_F_LOCK is the only supported
+// flag (for ElemFlags).
+type BatchOptions struct {
+	ElemFlags uint64
+	Flags     uint64
+}

+ 36 - 5
vendor/github.com/cilium/ebpf/types_string.go

@@ -1,4 +1,4 @@
-// Code generated by "stringer -output types_string.go -type=MapType,ProgramType,AttachType"; DO NOT EDIT.
+// Code generated by "stringer -output types_string.go -type=MapType,ProgramType,AttachType,PinType"; DO NOT EDIT.

 package ebpf

@@ -77,11 +77,15 @@ func _() {
 	_ = x[RawTracepointWritable-24]
 	_ = x[CGroupSockopt-25]
 	_ = x[Tracing-26]
+	_ = x[StructOps-27]
+	_ = x[Extension-28]
+	_ = x[LSM-29]
+	_ = x[SkLookup-30]
 }

-const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockoptTracing"
+const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockoptTracingStructOpsExtensionLSMSkLookup"
-var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 188, 199, 212, 224, 245, 258, 265}
+var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 188, 199, 212, 224, 245, 258, 265, 274, 283, 286, 294}
 func (i ProgramType) String() string {
 	if i >= ProgramType(len(_ProgramType_index)-1) {
@@ -123,11 +127,20 @@ func _() {
 	_ = x[AttachModifyReturn-26]
 	_ = x[AttachLSMMac-27]
 	_ = x[AttachTraceIter-28]
+	_ = x[AttachCgroupInet4GetPeername-29]
+	_ = x[AttachCgroupInet6GetPeername-30]
+	_ = x[AttachCgroupInet4GetSockname-31]
+	_ = x[AttachCgroupInet6GetSockname-32]
+	_ = x[AttachXDPDevMap-33]
+	_ = x[AttachCgroupInetSockRelease-34]
+	_ = x[AttachXDPCPUMap-35]
+	_ = x[AttachSkLookup-36]
+	_ = x[AttachXDP-37]
 }

-const _AttachType_name = "AttachNoneAttachCGroupInetEgressAttachCGroupInetSockCreateAttachCGroupSockOpsAttachSkSKBStreamParserAttachSkSKBStreamVerdictAttachCGroupDeviceAttachSkMsgVerdictAttachCGroupInet4BindAttachCGroupInet6BindAttachCGroupInet4ConnectAttachCGroupInet6ConnectAttachCGroupInet4PostBindAttachCGroupInet6PostBindAttachCGroupUDP4SendmsgAttachCGroupUDP6SendmsgAttachLircMode2AttachFlowDissectorAttachCGroupSysctlAttachCGroupUDP4RecvmsgAttachCGroupUDP6RecvmsgAttachCGroupGetsockoptAttachCGroupSetsockoptAttachTraceRawTpAttachTraceFEntryAttachTraceFExitAttachModifyReturnAttachLSMMacAttachTraceIter"
+const _AttachType_name = "AttachNoneAttachCGroupInetEgressAttachCGroupInetSockCreateAttachCGroupSockOpsAttachSkSKBStreamParserAttachSkSKBStreamVerdictAttachCGroupDeviceAttachSkMsgVerdictAttachCGroupInet4BindAttachCGroupInet6BindAttachCGroupInet4ConnectAttachCGroupInet6ConnectAttachCGroupInet4PostBindAttachCGroupInet6PostBindAttachCGroupUDP4SendmsgAttachCGroupUDP6SendmsgAttachLircMode2AttachFlowDissectorAttachCGroupSysctlAttachCGroupUDP4RecvmsgAttachCGroupUDP6RecvmsgAttachCGroupGetsockoptAttachCGroupSetsockoptAttachTraceRawTpAttachTraceFEntryAttachTraceFExitAttachModifyReturnAttachLSMMacAttachTraceIterAttachCgroupInet4GetPeernameAttachCgroupInet6GetPeernameAttachCgroupInet4GetSocknameAttachCgroupInet6GetSocknameAttachXDPDevMapAttachCgroupInetSockReleaseAttachXDPCPUMapAttachSkLookupAttachXDP"
-var _AttachType_index = [...]uint16{0, 10, 32, 58, 77, 100, 124, 142, 160, 181, 202, 226, 250, 275, 300, 323, 346, 361, 380, 398, 421, 444, 466, 488, 504, 521, 537, 555, 567, 582}
+var _AttachType_index = [...]uint16{0, 10, 32, 58, 77, 100, 124, 142, 160, 181, 202, 226, 250, 275, 300, 323, 346, 361, 380, 398, 421, 444, 466, 488, 504, 521, 537, 555, 567, 582, 610, 638, 666, 694, 709, 736, 751, 765, 774}
 func (i AttachType) String() string {
 	if i >= AttachType(len(_AttachType_index)-1) {
@@ -135,3 +148,21 @@ func (i AttachType) String() string {
 	}
 	return _AttachType_name[_AttachType_index[i]:_AttachType_index[i+1]]
 }
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[PinNone-0]
+	_ = x[PinByName-1]
+}
+
+const _PinType_name = "PinNonePinByName"
+
+var _PinType_index = [...]uint8{0, 7, 16}
+
+func (i PinType) String() string {
+	if i < 0 || i >= PinType(len(_PinType_index)-1) {
+		return "PinType(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+	return _PinType_name[_PinType_index[i]:_PinType_index[i+1]]
+}
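The generated tables resolve a name by slicing one packed string with cumulative offsets; a tiny standalone illustration of the mechanism:

```go
package main

import "fmt"

func main() {
	const pinTypeName = "PinNonePinByName"
	index := [...]uint8{0, 7, 16}
	i := 1                                        // PinByName
	fmt.Println(pinTypeName[index[i]:index[i+1]]) // "PinByName"
}
```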

+ 25 - 13
vendor/github.com/opencontainers/runc/README.md

@@ -1,9 +1,10 @@
 # runc

-[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc)
 [![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
 [![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
 [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
+[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
+[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)

 ## Introduction

@@ -17,10 +18,6 @@ This means that `runc` 1.0.0 should implement the 1.0 version of the specificati

 You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.

-Currently, the following features are not considered to be production-ready:
-
-* [Support for cgroup v2](./docs/cgroup-v2.md)
-
 ## Security

 The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
@@ -64,19 +61,20 @@ sudo make install
 with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`).

 To change build tags from the default, set the `BUILDTAGS` variable for make,
-e.g.
+e.g. to disable seccomp:

 ```bash
-make BUILDTAGS='seccomp apparmor'
+make BUILDTAGS=""
 ```

 | Build Tag | Feature                            | Enabled by default | Dependency |
 |-----------|------------------------------------|--------------------|------------|
 | seccomp   | Syscall filtering                  | yes                | libseccomp |
-| selinux   | selinux process and mount labeling | yes                | <none>     |
-| apparmor  | apparmor profile support           | yes                | <none>     |
-| nokmem    | disable kernel memory accounting   | no                 | <none>     |
+The following build tags were used earlier, but are now obsoleted:
+ - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
+ - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
+ - **selinux**  (since runc v1.0.0-rc93 the feature is always enabled)

 ### Running the test suite

@@ -128,6 +126,14 @@ make verify-dependencies

 ## Using runc

+Please note that runc is a low level tool not designed with an end user
+in mind. It is mostly employed by other higher level container software.
+
+Therefore, unless there is some specific use case that prevents the use
+of tools like Docker or Podman, it is not recommended to use runc directly.
+
+If you still want to use runc, here's how.
+
 ### Creating an OCI Bundle

 In order to use runc you must have your container in the format of an OCI bundle.
@@ -169,7 +175,9 @@ If you used the unmodified `runc spec` template this should give you a `sh` sess

 The second way to start a container is using the specs lifecycle operations.
 This gives you more power over how the container is created and managed while it is running.
-This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here.
+This will also launch the container in the background so you will have to edit
+the `config.json` to remove the `terminal` setting for the simple examples
+below (see more details about [runc terminal handling](docs/terminals.md)).
 Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`.


@@ -292,8 +300,12 @@ PIDFile=/run/mycontainerid.pid
 WantedBy=multi-user.target
 ```

-#### cgroup v2
-See [`./docs/cgroup-v2.md`](./docs/cgroup-v2.md).
+## More documentation
+
+* [cgroup v2](./docs/cgroup-v2.md)
+* [Checkpoint and restore](./docs/checkpoint-restore.md)
+* [systemd cgroup driver](./docs/systemd.md)
+* [Terminals and standard IO](./docs/terminals.md)

 ## License


+ 16 - 14
vendor/github.com/opencontainers/runc/go.mod

@@ -1,26 +1,28 @@
 module github.com/opencontainers/runc

-go 1.14
+go 1.13

 require (
-	github.com/checkpoint-restore/go-criu/v4 v4.1.0
-	github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775
-	github.com/containerd/console v1.0.0
-	github.com/coreos/go-systemd/v22 v22.1.0
+	github.com/checkpoint-restore/go-criu/v5 v5.0.0
+	github.com/cilium/ebpf v0.5.0
+	github.com/containerd/console v1.0.2
+	github.com/coreos/go-systemd/v22 v22.3.1
 	github.com/cyphar/filepath-securejoin v0.2.2
 	github.com/docker/go-units v0.4.0
-	github.com/godbus/dbus/v5 v5.0.3
-	github.com/golang/protobuf v1.4.2
-	github.com/moby/sys/mountinfo v0.1.3
-	github.com/mrunalp/fileutils v0.0.0-20200520151820-abd8a0e76976
-	github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6
-	github.com/opencontainers/selinux v1.6.0
+	github.com/godbus/dbus/v5 v5.0.4
+	github.com/moby/sys/mountinfo v0.4.1
+	github.com/mrunalp/fileutils v0.5.0
+	github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
+	github.com/opencontainers/selinux v1.8.0
 	github.com/pkg/errors v0.9.1
 	github.com/seccomp/libseccomp-golang v0.9.1
-	github.com/sirupsen/logrus v1.6.0
-	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+	github.com/sirupsen/logrus v1.7.0
+	github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
 	// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
 	github.com/urfave/cli v1.22.1
 	github.com/vishvananda/netlink v1.1.0
-	golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1
+	github.com/willf/bitset v1.1.11
+	golang.org/x/net v0.0.0-20201224014010-6772e930b67b
+	golang.org/x/sys v0.0.0-20210426230700-d19ff857e887
+	google.golang.org/protobuf v1.25.0
 )

+ 87 - 83
vendor/github.com/opencontainers/runc/libcontainer/README.md

@@ -57,90 +57,94 @@ struct describing how the container is to be created. A sample would look simila

 ```go
 defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+var devices []*configs.DeviceRule
+for _, device := range specconv.AllowedDevices {
+	devices = append(devices, &device.Rule)
+}
 config := &configs.Config{
 	Rootfs: "/your/path/to/rootfs",
 	Capabilities: &configs.Capabilities{
-                Bounding: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Effective: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Inheritable: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Permitted: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Ambient: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-        },
+		Bounding: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Effective: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Inheritable: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Permitted: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Ambient: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+	},
 	Namespaces: configs.Namespaces([]configs.Namespace{
 		{Type: configs.NEWNS},
 		{Type: configs.NEWUTS},
@@ -155,7 +159,7 @@ config := &configs.Config{
 		Parent: "system",
 		Resources: &configs.Resources{
 			MemorySwappiness: nil,
-			Devices:          specconv.AllowedDevices,
+			Devices:          devices,
 		},
 	},
 	MaskPaths: []string{
@@ -313,7 +317,7 @@ state, err := container.State()
 #### Checkpoint & Restore

 libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
-This let's you save the state of a process running inside a container to disk, and then restore
+This lets you save the state of a process running inside a container to disk, and then restore
 that state into a new process, on the same machine or on another machine.

 `criu` version 1.5.2 or higher is required to use checkpoint and restore.

+ 23 - 13
vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go

@@ -7,37 +7,44 @@ import (
 )

 type Manager interface {
-	// Applies cgroup configuration to the process with the specified pid
+	// Apply creates a cgroup, if not yet created, and adds a process
+	// with the specified pid into that cgroup.  A special value of -1
+	// can be used to merely create a cgroup.
 	Apply(pid int) error

-	// Returns the PIDs inside the cgroup set
+	// GetPids returns the PIDs of all processes inside the cgroup.
 	GetPids() ([]int, error)

-	// Returns the PIDs inside the cgroup set & all sub-cgroups
+	// GetAllPids returns the PIDs of all processes inside the cgroup
+	// and all its sub-cgroups.
 	GetAllPids() ([]int, error)

-	// Returns statistics for the cgroup set
+	// GetStats returns cgroups statistics.
 	GetStats() (*Stats, error)

-	// Toggles the freezer cgroup according with specified state
+	// Freeze sets the freezer cgroup to the specified state.
 	Freeze(state configs.FreezerState) error

-	// Destroys the cgroup set
+	// Destroy removes the cgroup.
 	Destroy() error

 	// Path returns a cgroup path to the specified controller/subsystem.
 	// For cgroupv2, the argument is unused and can be empty.
 	Path(string) string

-	// Sets the cgroup as configured.
-	Set(container *configs.Config) error
+	// Set sets cgroup resources parameters/limits. If the argument is nil,
+	// the resources specified during Manager creation (or the previous call
+	// to Set) are used.
+	Set(r *configs.Resources) error

-	// GetPaths returns cgroup path(s) to save in a state file in order to restore later.
+	// GetPaths returns cgroup path(s) to save in a state file in order to
+	// restore later.
 	//
-	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
-	// to the cgroup for this subsystem.
+	// For cgroup v1, a key is cgroup subsystem name, and the value is the
+	// path to the cgroup for this subsystem.
 	//
-	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
+	// For cgroup v2 unified hierarchy, a key is "", and the value is the
+	// unified path.
 	GetPaths() map[string]string

 	// GetCgroups returns the cgroup data as configured.
@@ -46,6 +53,9 @@ type Manager interface {
 	// GetFreezerState retrieves the current FreezerState of the cgroup.
 	GetFreezerState() (configs.FreezerState, error)

-	// Whether the cgroup path exists or not
+	// Exists returns whether the cgroup path exists or not.
 	Exists() bool
+
+	// OOMKillCount reports OOM kill count for the cgroup.
+	OOMKillCount() (uint64, error)
 }
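A hedged sketch of driving this interface (constructing a concrete Manager is elided, since it differs between the cgroup v1 and v2 backends):

```go
package example

import (
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/configs"
)

func limitAndInspect(mgr cgroups.Manager, pid int, r *configs.Resources) error {
	// Create the cgroup if needed and move pid into it.
	if err := mgr.Apply(pid); err != nil {
		return err
	}
	// Apply limits via the new Set(r *configs.Resources) signature.
	if err := mgr.Set(r); err != nil {
		return err
	}
	_, err := mgr.GetStats() // OOM kills are also exposed via OOMKillCount()
	return err
}
```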

+ 51 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go

@@ -0,0 +1,51 @@
+// +build linux
+
+package fscommon
+
+import (
+	"bytes"
+	"os"
+
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+// WriteFile writes data to a cgroup file in dir.
+// It is supposed to be used for cgroup files only.
+func WriteFile(dir, file, data string) error {
+	fd, err := OpenFile(dir, file, unix.O_WRONLY)
+	if err != nil {
+		return err
+	}
+	defer fd.Close()
+	if err := retryingWriteFile(fd, data); err != nil {
+		return errors.Wrapf(err, "failed to write %q", data)
+	}
+	return nil
+}
+
+// ReadFile reads data from a cgroup file in dir.
+// It is supposed to be used for cgroup files only.
+func ReadFile(dir, file string) (string, error) {
+	fd, err := OpenFile(dir, file, unix.O_RDONLY)
+	if err != nil {
+		return "", err
+	}
+	defer fd.Close()
+	var buf bytes.Buffer
+
+	_, err = buf.ReadFrom(fd)
+	return buf.String(), err
+}
+
+func retryingWriteFile(fd *os.File, data string) error {
+	for {
+		_, err := fd.Write([]byte(data))
+		if errors.Is(err, unix.EINTR) {
+			logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
+			continue
+		}
+		return err
+	}
+}
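Typical call sites look like this sketch (the freezer file is just an example):

```go
package example

import "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"

// freeze is roughly: echo FROZEN > $dir/freezer.state (cgroup v1).
func freeze(dir string) error {
	return fscommon.WriteFile(dir, "freezer.state", "FROZEN")
}

func freezerState(dir string) (string, error) {
	return fscommon.ReadFile(dir, "freezer.state")
}
```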

+ 120 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go

@@ -0,0 +1,120 @@
+package fscommon
+
+import (
+	"os"
+	"strings"
+	"sync"
+
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+const (
+	cgroupfsDir    = "/sys/fs/cgroup"
+	cgroupfsPrefix = cgroupfsDir + "/"
+)
+
+var (
+	// TestMode is set to true by unit tests that need "fake" cgroupfs.
+	TestMode bool
+
+	cgroupFd     int = -1
+	prepOnce     sync.Once
+	prepErr      error
+	resolveFlags uint64
+)
+
+func prepareOpenat2() error {
+	prepOnce.Do(func() {
+		fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
+			Flags: unix.O_DIRECTORY | unix.O_PATH})
+		if err != nil {
+			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
+			if err != unix.ENOSYS {
+				logrus.Warnf("falling back to securejoin: %s", prepErr)
+			} else {
+				logrus.Debug("openat2 not available, falling back to securejoin")
+			}
+			return
+		}
+		var st unix.Statfs_t
+		if err = unix.Fstatfs(fd, &st); err != nil {
+			prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
+			logrus.Warnf("falling back to securejoin: %s", prepErr)
+			return
+		}
+
+		cgroupFd = fd
+
+		resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
+		if st.Type == unix.CGROUP2_SUPER_MAGIC {
+			// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
+			resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
+		}
+
+	})
+
+	return prepErr
+}
+
+// OpenFile opens a cgroup file in a given dir with given flags.
+// It is supposed to be used for cgroup files only.
+func OpenFile(dir, file string, flags int) (*os.File, error) {
+	if dir == "" {
+		return nil, errors.Errorf("no directory specified for %s", file)
+	}
+	mode := os.FileMode(0)
+	if TestMode && flags&os.O_WRONLY != 0 {
+		// "emulate" cgroup fs for unit tests
+		flags |= os.O_TRUNC | os.O_CREATE
+		mode = 0o600
+	}
+	if prepareOpenat2() != nil {
+		return openFallback(dir, file, flags, mode)
+	}
+	reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
+	if len(reldir) == len(dir) { // non-standard path, old system?
+		return openFallback(dir, file, flags, mode)
+	}
+
+	relname := reldir + "/" + file
+	fd, err := unix.Openat2(cgroupFd, relname,
+		&unix.OpenHow{
+			Resolve: resolveFlags,
+			Flags:   uint64(flags) | unix.O_CLOEXEC,
+			Mode:    uint64(mode),
+		})
+	if err != nil {
+		return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err}
+	}
+
+	return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
+}
+
+var errNotCgroupfs = errors.New("not a cgroup file")
+
+// openFallback is used when openat2(2) is not available. It checks the opened
+// file is on cgroupfs, returning an error otherwise.
+func openFallback(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
+	path := dir + "/" + file
+	fd, err := os.OpenFile(path, flags, mode)
+	if err != nil {
+		return nil, err
+	}
+	if TestMode {
+		return fd, nil
+	}
+	// Check this is a cgroupfs file.
+	var st unix.Statfs_t
+	if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
+		_ = fd.Close()
+		return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
+	}
+	if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
+		_ = fd.Close()
+		return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
+	}
+
+	return fd, nil
+}

+ 122 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go

@@ -0,0 +1,122 @@
+// +build linux
+
+package fscommon
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"strconv"
+	"strings"
+)
+
+var (
+	ErrNotValidFormat = errors.New("line is not a valid key value format")
+)
+
+// ParseUint converts a string to an uint64 integer.
+// Negative values are returned at zero as, due to kernel bugs,
+// some of the memory cgroup stats can be negative.
+func ParseUint(s string, base, bitSize int) (uint64, error) {
+	value, err := strconv.ParseUint(s, base, bitSize)
+	if err != nil {
+		intValue, intErr := strconv.ParseInt(s, base, bitSize)
+		// 1. Handle negative values greater than MinInt64 (and)
+		// 2. Handle negative values lesser than MinInt64
+		if intErr == nil && intValue < 0 {
+			return 0, nil
+		} else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
+			return 0, nil
+		}
+
+		return value, err
+	}
+
+	return value, nil
+}
+
+// ParseKeyValue parses a space-separated "name value" kind of cgroup
+// parameter and returns its key as a string, and its value as uint64
+// (ParseUint is used to convert the value). For example,
+// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
+func ParseKeyValue(t string) (string, uint64, error) {
+	parts := strings.SplitN(t, " ", 3)
+	if len(parts) != 2 {
+		return "", 0, fmt.Errorf("line %q is not in key value format", t)
+	}
+
+	value, err := ParseUint(parts[1], 10, 64)
+	if err != nil {
+		return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
+	}
+
+	return parts[0], value, nil
+}
+
+// GetValueByKey reads key-value pairs from the specified cgroup file,
+// and returns the value of the specified key. ParseUint is used for value
+// conversion.
+func GetValueByKey(path, file, key string) (uint64, error) {
+	content, err := ReadFile(path, file)
+	if err != nil {
+		return 0, err
+	}
+
+	lines := strings.Split(string(content), "\n")
+	for _, line := range lines {
+		arr := strings.Split(line, " ")
+		if len(arr) == 2 && arr[0] == key {
+			return ParseUint(arr[1], 10, 64)
+		}
+	}
+
+	return 0, nil
+}
+
+// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.
+// If the value read is "max", math.MaxUint64 is returned.
+func GetCgroupParamUint(path, file string) (uint64, error) {
+	contents, err := GetCgroupParamString(path, file)
+	if err != nil {
+		return 0, err
+	}
+	contents = strings.TrimSpace(contents)
+	if contents == "max" {
+		return math.MaxUint64, nil
+	}
+
+	res, err := ParseUint(contents, 10, 64)
+	if err != nil {
+		return res, fmt.Errorf("unable to parse file %q", path+"/"+file)
+	}
+	return res, nil
+}
+
+// GetCgroupParamInt reads a single int64 value from the specified cgroup file.
+// If the value read is "max", math.MaxInt64 is returned.
+func GetCgroupParamInt(path, file string) (int64, error) {
+	contents, err := ReadFile(path, file)
+	if err != nil {
+		return 0, err
+	}
+	contents = strings.TrimSpace(contents)
+	if contents == "max" {
+		return math.MaxInt64, nil
+	}
+
+	res, err := strconv.ParseInt(contents, 10, 64)
+	if err != nil {
+		return res, fmt.Errorf("unable to parse %q as an int from Cgroup file %q", contents, path+"/"+file)
+	}
+	return res, nil
+}
+
+// GetCgroupParamString reads a string from the specified cgroup file.
+func GetCgroupParamString(path, file string) (string, error) {
+	contents, err := ReadFile(path, file)
+	if err != nil {
+		return "", err
+	}
+
+	return strings.TrimSpace(contents), nil
+}
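A few worked examples for these parsers (values are illustrative):

```go
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)

func main() {
	k, v, _ := fscommon.ParseKeyValue("io_service_bytes 1234")
	fmt.Println(k, v) // io_service_bytes 1234

	n, _ := fscommon.ParseUint("-1", 10, 64)
	fmt.Println(n) // 0: negative values clamp to zero (kernel-bug workaround)
}
```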

+ 28 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go

@@ -39,6 +39,33 @@ type CpuStats struct {
 	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
 }

+type CPUSetStats struct {
+	// List of the physical numbers of the CPUs on which processes
+	// in that cpuset are allowed to execute
+	CPUs []uint16 `json:"cpus,omitempty"`
+	// cpu_exclusive flag
+	CPUExclusive uint64 `json:"cpu_exclusive"`
+	// List of memory nodes on which processes in that cpuset
+	// are allowed to allocate memory
+	Mems []uint16 `json:"mems,omitempty"`
+	// mem_hardwall flag
+	MemHardwall uint64 `json:"mem_hardwall"`
+	// mem_exclusive flag
+	MemExclusive uint64 `json:"mem_exclusive"`
+	// memory_migrate flag
+	MemoryMigrate uint64 `json:"memory_migrate"`
+	// memory_spread page flag
+	MemorySpreadPage uint64 `json:"memory_spread_page"`
+	// memory_spread slab flag
+	MemorySpreadSlab uint64 `json:"memory_spread_slab"`
+	// memory_pressure
+	MemoryPressure uint64 `json:"memory_pressure"`
+	// sched_load balance flag
+	SchedLoadBalance uint64 `json:"sched_load_balance"`
+	// sched_relax_domain_level
+	SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
+}
+
 type MemoryData struct {
 	Usage    uint64 `json:"usage,omitempty"`
 	MaxUsage uint64 `json:"max_usage,omitempty"`
@@ -121,6 +148,7 @@ type HugetlbStats struct {

 type Stats struct {
 	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
+	CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
 	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
 	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
 	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`

+ 115 - 42
vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go

@@ -15,7 +15,9 @@ import (
 	"sync"
 	"sync"
 	"time"
 	"time"
 
 
-	units "github.com/docker/go-units"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/userns"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 	"golang.org/x/sys/unix"
 )
 )
 
 
@@ -29,19 +31,19 @@ var (
 	isUnified     bool
 )

-// HugePageSizeUnitList is a list of the units used by the linux kernel when
-// naming the HugePage control files.
-// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
-// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed,
-// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
-var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
-
 // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
 func IsCgroup2UnifiedMode() bool {
 	isUnifiedOnce.Do(func() {
 		var st unix.Statfs_t
-		if err := unix.Statfs(unifiedMountpoint, &st); err != nil {
-			panic("cannot statfs cgroup root")
+		err := unix.Statfs(unifiedMountpoint, &st)
+		if err != nil {
+			if os.IsNotExist(err) && userns.RunningInUserNS() {
+				// ignore the "not found" error if running in userns
+				logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
+				isUnified = false
+				return
+			}
+			panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
 		}
 		isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
 	})
@@ -86,11 +88,11 @@ func GetAllSubsystems() ([]string, error) {
 		// - freezer: implemented in kernel 5.2
 		// We assume these are always available, as it is hard to detect availability.
 		pseudo := []string{"devices", "freezer"}
-		data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
+		data, err := fscommon.ReadFile("/sys/fs/cgroup", "cgroup.controllers")
 		if err != nil {
 			return nil, err
 		}
-		subsystems := append(pseudo, strings.Fields(string(data))...)
+		subsystems := append(pseudo, strings.Fields(data)...)
 		return subsystems, nil
 	}
 	f, err := os.Open("/proc/cgroups")
@@ -207,20 +209,66 @@ func EnterPid(cgroupPaths map[string]string, pid int) error {
 	return nil
 }

+func rmdir(path string) error {
+	err := unix.Rmdir(path)
+	if err == nil || err == unix.ENOENT {
+		return nil
+	}
+	return &os.PathError{Op: "rmdir", Path: path, Err: err}
+}
+
+// RemovePath aims to remove cgroup path. It does so recursively,
+// by removing any subdirectories (sub-cgroups) first.
+func RemovePath(path string) error {
+	// try the fast path first
+	if err := rmdir(path); err == nil {
+		return nil
+	}
+
+	infos, err := ioutil.ReadDir(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			err = nil
+		}
+		return err
+	}
+	for _, info := range infos {
+		if info.IsDir() {
+			// We should remove subcgroups dir first
+			if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
+				break
+			}
+		}
+	}
+	if err == nil {
+		err = rmdir(path)
+	}
+	return err
+}
+
 // RemovePaths iterates over the provided paths removing them.
 // We trying to remove all paths five times with increasing delay between tries.
 // If after all there are not removed cgroups - appropriate error will be
 // returned.
 func RemovePaths(paths map[string]string) (err error) {
+	const retries = 5
 	delay := 10 * time.Millisecond
-	for i := 0; i < 5; i++ {
+	for i := 0; i < retries; i++ {
 		if i != 0 {
 			time.Sleep(delay)
 			delay *= 2
 		}
 		for s, p := range paths {
-			os.RemoveAll(p)
-			// TODO: here probably should be logging
+			if err := RemovePath(p); err != nil {
+				// do not log intermediate iterations
+				switch i {
+				case 0:
+					logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
+				case retries - 1:
+					logrus.WithError(err).Error("Failed to remove cgroup")
+				}
+
+			}
 			_, err := os.Stat(p)
 			// We need this strange way of checking cgroups existence because
 			// RemoveAll almost always returns error, even on already removed
@@ -230,6 +278,8 @@ func RemovePaths(paths map[string]string) (err error) {
 			}
 		}
 		if len(paths) == 0 {
+			//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
+			paths = make(map[string]string)
 			return nil
 		}
 	}
@@ -237,27 +287,50 @@ func RemovePaths(paths map[string]string) (err error) {
 }

 func GetHugePageSize() ([]string, error) {
-	files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
+	dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
 	if err != nil {
-		return []string{}, err
+		return nil, err
 	}
-	var fileNames []string
-	for _, st := range files {
-		fileNames = append(fileNames, st.Name())
+	files, err := dir.Readdirnames(0)
+	dir.Close()
+	if err != nil {
+		return nil, err
 	}
-	return getHugePageSizeFromFilenames(fileNames)
+
+	return getHugePageSizeFromFilenames(files)
 }

 func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
-	var pageSizes []string
-	for _, fileName := range fileNames {
-		nameArray := strings.Split(fileName, "-")
-		pageSize, err := units.RAMInBytes(nameArray[1])
+	pageSizes := make([]string, 0, len(fileNames))
+
+	for _, file := range fileNames {
+		// example: hugepages-1048576kB
+		val := strings.TrimPrefix(file, "hugepages-")
+		if len(val) == len(file) {
+			// unexpected file name: no prefix found
+			continue
+		}
+		// The suffix is always "kB" (as of Linux 5.9)
+		eLen := len(val) - 2
+		val = strings.TrimSuffix(val, "kB")
+		if len(val) != eLen {
+			logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file)
+			continue
+		}
+		size, err := strconv.Atoi(val)
 		if err != nil {
-			return []string{}, err
+			return nil, err
 		}
-		sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
-		pageSizes = append(pageSizes, sizeString)
+		// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
+		// but in our case the size is in KB already.
+		if size >= (1 << 20) {
+			val = strconv.Itoa(size>>20) + "GB"
+		} else if size >= (1 << 10) {
+			val = strconv.Itoa(size>>10) + "MB"
+		} else {
+			val += "KB"
+		}
+		pageSizes = append(pageSizes, val)
 	}

 	return pageSizes, nil
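Worked example of the formatting above: "hugepages-2048kB" trims to 2048, and since 1<<10 <= 2048 < 1<<20 it is emitted as 2048>>10 = "2MB"; "hugepages-1048576kB" yields 1048576>>20 = "1GB".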
@@ -303,14 +376,14 @@ func WriteCgroupProc(dir string, pid int) error {
 		return nil
 	}

-	cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
+	file, err := fscommon.OpenFile(dir, CgroupProcesses, os.O_WRONLY)
 	if err != nil {
 		return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
 	}
-	defer cgroupProcessesFile.Close()
+	defer file.Close()

 	for i := 0; i < 5; i++ {
-		_, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
+		_, err = file.WriteString(strconv.Itoa(pid))
 		if err == nil {
 			return nil
 		}
@@ -327,17 +400,6 @@ func WriteCgroupProc(dir string, pid int) error {
 	return err
 }

-// Since the OCI spec is designed for cgroup v1, in some cases
-// there is need to convert from the cgroup v1 configuration to cgroup v2
-// the formula for BlkIOWeight is y = (1 + (x - 10) * 9999 / 990)
-// convert linearly from [10-1000] to [1-10000]
-func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 {
-	if blkIoWeight == 0 {
-		return 0
-	}
-	return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
-}
-
 // Since the OCI spec is designed for cgroup v1, in some cases
 // Since the OCI spec is designed for cgroup v1, in some cases
 // there is need to convert from the cgroup v1 configuration to cgroup v2
 // there is need to convert from the cgroup v1 configuration to cgroup v2
 // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
 // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
@@ -377,3 +439,14 @@ func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
 
 	return memorySwap - memory, nil
 }
+
+// Since the OCI spec is designed for cgroup v1, in some cases
+// there is need to convert from the cgroup v1 configuration to cgroup v2
+// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
+// convert linearly from [10-1000] to [1-10000]
+func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
+	if blkIoWeight == 0 {
+		return 0
+	}
+	return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
+}

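Editor's note: the renamed ConvertBlkIOToIOWeightValue above maps the cgroup v1 blkio weight range [10, 1000] linearly onto the cgroup v2 io.weight range [1, 10000]. A quick standalone check of the formula (sketch, not runc code):

package main

import "fmt"

// convert reproduces y = 1 + (x - 10) * 9999 / 990 from the diff above.
func convert(x uint16) uint64 {
	if x == 0 {
		return 0
	}
	return uint64(1 + (uint64(x)-10)*9999/990)
}

func main() {
	fmt.Println(convert(10))   // 1     (v1 minimum -> v2 minimum)
	fmt.Println(convert(500))  // 4950  (midpoint, integer division)
	fmt.Println(convert(1000)) // 10000 (v1 maximum -> v2 maximum)
}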
+ 41 - 59
vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go

@@ -1,16 +1,16 @@
 package cgroups
 
 import (
-	"bufio"
 	"errors"
 	"fmt"
-	"io"
 	"os"
 	"path/filepath"
 	"strings"
+	"sync"
 	"syscall"
 
 	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/moby/sys/mountinfo"
 	"golang.org/x/sys/unix"
 )
 
@@ -23,7 +23,12 @@ const (
 )
 
 var (
-	errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
+	errUnified     = errors.New("not implemented for cgroup v2 unified hierarchy")
+	ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
+
+	readMountinfoOnce sync.Once
+	readMountinfoErr  error
+	cgroupMountinfo   []*mountinfo.Info
 )
 
 type NotFoundError struct {
@@ -90,6 +95,21 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
 	return path
 }
 
+// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
+// with fstype of "cgroup") for the current running process.
+//
+// The results are cached (to avoid re-reading mountinfo which is relatively
+// expensive), so it is assumed that cgroup mounts are not being changed.
+func readCgroupMountinfo() ([]*mountinfo.Info, error) {
+	readMountinfoOnce.Do(func() {
+		cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
+			mountinfo.FSTypeFilter("cgroup"),
+		)
+	})
+
+	return cgroupMountinfo, readMountinfoErr
+}
+
 // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
 func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
 	if IsCgroup2UnifiedMode() {
@@ -110,56 +130,28 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string,
 		return "", "", errUnified
 	}
 
-	// Avoid parsing mountinfo by checking if subsystem is valid/available.
-	if !isSubsystemAvailable(subsystem) {
-		return "", "", NewNotFoundError(subsystem)
-	}
-
-	f, err := os.Open("/proc/self/mountinfo")
+	mi, err := readCgroupMountinfo()
 	if err != nil {
 		return "", "", err
 	}
-	defer f.Close()
 
-	return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
+	return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
 }
 
-func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
-	scanner := bufio.NewScanner(reader)
-	for scanner.Scan() {
-		txt := scanner.Text()
-		fields := strings.Fields(txt)
-		if len(fields) < 9 {
-			continue
-		}
-		if strings.HasPrefix(fields[4], cgroupPath) {
-			for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
+	for _, mi := range mounts {
+		if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
+			for _, opt := range strings.Split(mi.VFSOptions, ",") {
 				if opt == subsystem {
-					return fields[4], fields[3], nil
+					return mi.Mountpoint, mi.Root, nil
 				}
 			}
 		}
 	}
-	if err := scanner.Err(); err != nil {
-		return "", "", err
-	}
 
 	return "", "", NewNotFoundError(subsystem)
 }
 
-func isSubsystemAvailable(subsystem string) bool {
-	if IsCgroup2UnifiedMode() {
-		panic("don't call isSubsystemAvailable from cgroupv2 code")
-	}
-
-	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
-	if err != nil {
-		return false
-	}
-	_, avail := cgroups[subsystem]
-	return avail
-}
-
 func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
 	if len(m.Subsystems) == 0 {
 		return "", fmt.Errorf("no subsystem for mount")
@@ -168,25 +160,15 @@ func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
 	return getControllerPath(m.Subsystems[0], cgroups)
 }
 
-func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
+func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
 	res := make([]Mount, 0, len(ss))
-	scanner := bufio.NewScanner(mi)
 	numFound := 0
-	for scanner.Scan() && numFound < len(ss) {
-		txt := scanner.Text()
-		sepIdx := strings.Index(txt, " - ")
-		if sepIdx == -1 {
-			return nil, fmt.Errorf("invalid mountinfo format")
-		}
-		if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
-			continue
-		}
-		fields := strings.Split(txt, " ")
+	for _, mi := range mounts {
 		m := Mount{
-			Mountpoint: fields[4],
-			Root:       fields[3],
+			Mountpoint: mi.Mountpoint,
+			Root:       mi.Root,
 		}
-		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+		for _, opt := range strings.Split(mi.VFSOptions, ",") {
 			seen, known := ss[opt]
 			if !known || (!all && seen) {
 				continue
@@ -199,19 +181,18 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
 		if len(m.Subsystems) > 0 || all {
 			res = append(res, m)
 		}
-	}
-	if err := scanner.Err(); err != nil {
-		return nil, err
+		if !all && numFound >= len(ss) {
+			break
+		}
 	}
 	return res, nil
 }
 
 func getCgroupMountsV1(all bool) ([]Mount, error) {
-	f, err := os.Open("/proc/self/mountinfo")
+	mi, err := readCgroupMountinfo()
 	if err != nil {
 		return nil, err
 	}
-	defer f.Close()
 
 	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
@@ -222,7 +203,8 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
 	for s := range allSubsystems {
 		allMap[s] = false
 	}
-	return getCgroupMountsHelper(allMap, f, all)
+
+	return getCgroupMountsHelper(allMap, mi, all)
 }
 
 // GetOwnCgroup returns the relative path to the cgroup docker is running in.

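Editor's note: readCgroupMountinfo above caches the (relatively expensive) mountinfo parse behind a sync.Once, assuming cgroup mounts do not change at runtime. A standalone sketch of the same pattern using the real github.com/moby/sys/mountinfo API (not part of the vendored change):

package main

import (
	"fmt"
	"sync"

	"github.com/moby/sys/mountinfo"
)

var (
	once      sync.Once
	mounts    []*mountinfo.Info
	mountsErr error
)

// cgroupMounts parses /proc/self/mountinfo once and reuses the result.
func cgroupMounts() ([]*mountinfo.Info, error) {
	once.Do(func() {
		mounts, mountsErr = mountinfo.GetMounts(mountinfo.FSTypeFilter("cgroup"))
	})
	return mounts, mountsErr
}

func main() {
	mi, err := cgroupMounts()
	if err != nil {
		panic(err)
	}
	for _, m := range mi {
		// VFSOptions carries the per-mount subsystem list, e.g. "rw,cpu,cpuacct".
		fmt.Println(m.Mountpoint, m.Root, m.VFSOptions)
	}
}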
+ 5 - 7
vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go

@@ -2,6 +2,7 @@ package configs
 
 import (
 	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	"github.com/opencontainers/runc/libcontainer/devices"
 )
 
 type FreezerState string
@@ -42,7 +43,7 @@ type Cgroup struct {
 
 type Resources struct {
 	// Devices is the set of access rules for devices in the container.
-	Devices []*DeviceRule `json:"devices"`
+	Devices []*devices.Rule `json:"devices"`
 
 	// Memory limit (in bytes)
 	Memory int64 `json:"memory"`
@@ -53,12 +54,6 @@ type Resources struct {
 	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
 	MemorySwap int64 `json:"memory_swap"`
 
-	// Kernel memory limit (in bytes)
-	KernelMemory int64 `json:"kernel_memory"`
-
-	// Kernel memory limit for TCP use (in bytes)
-	KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
-
 	// CPU shares (relative weight vs. other containers)
 	CpuShares uint64 `json:"cpu_shares"`
 
@@ -127,6 +122,9 @@ type Resources struct {
 	// CpuWeight sets a proportional bandwidth limit.
 	CpuWeight uint64 `json:"cpu_weight"`
 
+	// Unified is cgroupv2-only key-value map.
+	Unified map[string]string `json:"unified"`
+
 	// SkipDevices allows to skip configuring device permissions.
 	// Used by e.g. kubelet while creating a parent cgroup (kubepods)
 	// common for many containers.

+ 15 - 10
vendor/github.com/opencontainers/runc/libcontainer/configs/config.go

@@ -7,6 +7,7 @@ import (
 	"os/exec"
 	"time"
 
+	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
@@ -30,9 +31,10 @@ type IDMap struct {
 // for syscalls. Additional architectures can be added by specifying them in
 // Architectures.
 type Seccomp struct {
-	DefaultAction Action     `json:"default_action"`
-	Architectures []string   `json:"architectures"`
-	Syscalls      []*Syscall `json:"syscalls"`
+	DefaultAction   Action     `json:"default_action"`
+	Architectures   []string   `json:"architectures"`
+	Syscalls        []*Syscall `json:"syscalls"`
+	DefaultErrnoRet *uint      `json:"default_errno_ret"`
 }
 
 // Action is taken upon rule match in Seccomp
@@ -92,6 +94,9 @@ type Config struct {
 	// Path to a directory containing the container's root filesystem.
 	Rootfs string `json:"rootfs"`
 
+	// Umask is the umask to use inside of the container.
+	Umask *uint32 `json:"umask"`
+
 	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
 	// bind mounts are writtable.
 	Readonlyfs bool `json:"readonlyfs"`
@@ -104,7 +109,7 @@ type Config struct {
 	Mounts []*Mount `json:"mounts"`
 
 	// The device nodes that should be automatically created within the container upon container start.  Note, make sure that the node is marked as allowed in the cgroup as well!
-	Devices []*Device `json:"devices"`
+	Devices []*devices.Device `json:"devices"`
 
 	MountLabel string `json:"mount_label"`
 
@@ -218,25 +223,25 @@ const (
 	// the runtime environment has been created but before the pivot_root has been executed.
 	// CreateRuntime is called immediately after the deprecated Prestart hook.
 	// CreateRuntime commands are called in the Runtime Namespace.
-	CreateRuntime = "createRuntime"
+	CreateRuntime HookName = "createRuntime"
 
 	// CreateContainer commands MUST be called as part of the create operation after
 	// the runtime environment has been created but before the pivot_root has been executed.
 	// CreateContainer commands are called in the Container namespace.
-	CreateContainer = "createContainer"
+	CreateContainer HookName = "createContainer"
 
 	// StartContainer commands MUST be called as part of the start operation and before
 	// the container process is started.
 	// StartContainer commands are called in the Container namespace.
-	StartContainer = "startContainer"
+	StartContainer HookName = "startContainer"
 
 	// Poststart commands are executed after the container init process starts.
 	// Poststart commands are called in the Runtime Namespace.
-	Poststart = "poststart"
+	Poststart HookName = "poststart"
 
 	// Poststop commands are executed after the container init process exits.
 	// Poststop commands are called in the Runtime Namespace.
-	Poststop = "poststop"
+	Poststop HookName = "poststop"
 )
 
 type Capabilities struct {
@@ -383,7 +388,7 @@ func (c Command) Run(s *specs.State) error {
 		return err
 	case <-timerCh:
 		cmd.Process.Kill()
-		cmd.Wait()
+		<-errC
 		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
 	}
 }

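Editor's note: the change above avoids calling cmd.Wait() twice on the same process — the goroutine started by Command.Run already calls Wait and sends the result on errC, so after Kill the timeout path just drains that channel. A standalone sketch of the pattern (hypothetical helper, not runc code):

package main

import (
	"fmt"
	"os/exec"
	"time"
)

func runWithTimeout(cmd *exec.Cmd, timeout time.Duration) error {
	if err := cmd.Start(); err != nil {
		return err
	}
	errC := make(chan error, 1)
	go func() {
		errC <- cmd.Wait() // the only Wait call for this process
	}()
	select {
	case err := <-errC:
		return err
	case <-time.After(timeout):
		cmd.Process.Kill()
		<-errC // reap via the existing goroutine; a second cmd.Wait() here would race
		return fmt.Errorf("command ran past timeout of %.1fs", timeout.Seconds())
	}
}

func main() {
	fmt.Println(runWithTimeout(exec.Command("sleep", "10"), 100*time.Millisecond))
}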
+ 9 - 0
vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go

@@ -0,0 +1,9 @@
+// +build gofuzz
+
+package configs
+
+func FuzzUnmarshalJSON(data []byte) int {
+	hooks := Hooks{}
+	_ = hooks.UnmarshalJSON(data)
+	return 1
+}

+ 0 - 16
vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go

@@ -1,16 +0,0 @@
-// +build !windows
-
-package configs
-
-import (
-	"errors"
-
-	"golang.org/x/sys/unix"
-)
-
-func (d *DeviceRule) Mkdev() (uint64, error) {
-	if d.Major == Wildcard || d.Minor == Wildcard {
-		return 0, errors.New("cannot mkdev() device with wildcards")
-	}
-	return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
-}

+ 0 - 5
vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go

@@ -1,5 +0,0 @@
-package configs
-
-func (d *DeviceRule) Mkdev() (uint64, error) {
-	return 0, nil
-}

+ 17 - 0
vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go

@@ -0,0 +1,17 @@
+package configs
+
+import "github.com/opencontainers/runc/libcontainer/devices"
+
+type (
+	// Deprecated: use libcontainer/devices.Device
+	Device = devices.Device
+
+	// Deprecated: use libcontainer/devices.Rule
+	DeviceRule = devices.Rule
+
+	// Deprecated: use libcontainer/devices.Type
+	DeviceType = devices.Type
+
+	// Deprecated: use libcontainer/devices.Permissions
+	DevicePermissions = devices.Permissions
+)

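Editor's note: the new configs/devices.go above uses Go type aliases so existing callers of the configs package keep compiling against the types that moved to libcontainer/devices. A sketch of what that buys consumers (assuming the runc module is on the import path):

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	var r configs.DeviceRule // deprecated alias...
	var s devices.Rule       // ...for the same underlying type
	r = s                    // assignable: aliases denote the identical type
	fmt.Printf("%T\n", r)    // prints devices.Rule
}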
+ 1 - 1
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go

@@ -56,7 +56,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
 	if nsFile == "" {
 		return false
 	}
-	_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
+	_, err := os.Stat("/proc/self/ns/" + nsFile)
 	// a namespace is supported if it exists and we have permissions to read it
 	supported = err == nil
 	supportedNamespaces[ns] = supported

+ 33 - 29
vendor/github.com/opencontainers/runc/libcontainer/configs/device.go → vendor/github.com/opencontainers/runc/libcontainer/devices/device.go

@@ -1,4 +1,4 @@
-package configs
+package devices
 
 import (
 	"fmt"
@@ -11,7 +11,7 @@ const (
 )
 
 type Device struct {
-	DeviceRule
+	Rule
 
 	// Path to the device.
 	Path string `json:"path"`
@@ -26,10 +26,10 @@ type Device struct {
 	Gid uint32 `json:"gid"`
 }
 
-// DevicePermissions is a cgroupv1-style string to represent device access. It
+// Permissions is a cgroupv1-style string to represent device access. It
 // has to be a string for backward compatibility reasons, hence why it has
 // methods to do set operations.
-type DevicePermissions string
+type Permissions string
 
 const (
 	deviceRead uint = (1 << iota)
@@ -37,7 +37,7 @@ const (
 	deviceMknod
 )
 
-func (p DevicePermissions) toSet() uint {
+func (p Permissions) toSet() uint {
 	var set uint
 	for _, perm := range p {
 		switch perm {
@@ -52,7 +52,7 @@ func (p DevicePermissions) toSet() uint {
 	return set
 }
 
-func fromSet(set uint) DevicePermissions {
+func fromSet(set uint) Permissions {
 	var perm string
 	if set&deviceRead == deviceRead {
 		perm += "r"
@@ -63,53 +63,53 @@ func fromSet(set uint) DevicePermissions {
 	if set&deviceMknod == deviceMknod {
 		perm += "m"
 	}
-	return DevicePermissions(perm)
+	return Permissions(perm)
 }
 
-// Union returns the union of the two sets of DevicePermissions.
-func (p DevicePermissions) Union(o DevicePermissions) DevicePermissions {
+// Union returns the union of the two sets of Permissions.
+func (p Permissions) Union(o Permissions) Permissions {
 	lhs := p.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs | rhs)
 }
 
-// Difference returns the set difference of the two sets of DevicePermissions.
+// Difference returns the set difference of the two sets of Permissions.
 // In set notation, A.Difference(B) gives you A\B.
-func (p DevicePermissions) Difference(o DevicePermissions) DevicePermissions {
+func (p Permissions) Difference(o Permissions) Permissions {
 	lhs := p.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs &^ rhs)
 }
 
-// Intersection computes the intersection of the two sets of DevicePermissions.
-func (p DevicePermissions) Intersection(o DevicePermissions) DevicePermissions {
+// Intersection computes the intersection of the two sets of Permissions.
+func (p Permissions) Intersection(o Permissions) Permissions {
 	lhs := p.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs & rhs)
 }
 
-// IsEmpty returns whether the set of permissions in a DevicePermissions is
+// IsEmpty returns whether the set of permissions in a Permissions is
 // empty.
-func (p DevicePermissions) IsEmpty() bool {
-	return p == DevicePermissions("")
+func (p Permissions) IsEmpty() bool {
+	return p == Permissions("")
 }
 
 // IsValid returns whether the set of permissions is a subset of valid
 // permissions (namely, {r,w,m}).
-func (p DevicePermissions) IsValid() bool {
+func (p Permissions) IsValid() bool {
 	return p == fromSet(p.toSet())
 }
 
-type DeviceType rune
+type Type rune
 
 const (
-	WildcardDevice DeviceType = 'a'
-	BlockDevice    DeviceType = 'b'
-	CharDevice     DeviceType = 'c' // or 'u'
-	FifoDevice     DeviceType = 'p'
+	WildcardDevice Type = 'a'
+	BlockDevice    Type = 'b'
+	CharDevice     Type = 'c' // or 'u'
+	FifoDevice     Type = 'p'
 )
 
-func (t DeviceType) IsValid() bool {
+func (t Type) IsValid() bool {
 	switch t {
 	case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
 		return true
@@ -118,7 +118,7 @@ func (t DeviceType) IsValid() bool {
 	}
 }
 
-func (t DeviceType) CanMknod() bool {
+func (t Type) CanMknod() bool {
 	switch t {
 	case BlockDevice, CharDevice, FifoDevice:
 		return true
@@ -127,7 +127,7 @@ func (t DeviceType) CanMknod() bool {
 	}
 }
 
-func (t DeviceType) CanCgroup() bool {
+func (t Type) CanCgroup() bool {
 	switch t {
 	case WildcardDevice, BlockDevice, CharDevice:
 		return true
@@ -136,10 +136,10 @@ func (t DeviceType) CanCgroup() bool {
 	}
 }
 
-type DeviceRule struct {
+type Rule struct {
 	// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
 	// acts as a wildcard and all fields other than Allow are ignored.
-	Type DeviceType `json:"type"`
+	Type Type `json:"type"`
 
 	// Major is the device's major number.
 	Major int64 `json:"major"`
@@ -149,13 +149,13 @@ type DeviceRule struct {
 
 	// Permissions is the set of permissions that this rule applies to (in the
 	// cgroupv1 format -- any combination of "rwm").
-	Permissions DevicePermissions `json:"permissions"`
+	Permissions Permissions `json:"permissions"`
 
 	// Allow specifies whether this rule is allowed.
 	Allow bool `json:"allow"`
 }
 
-func (d *DeviceRule) CgroupString() string {
+func (d *Rule) CgroupString() string {
 	var (
 		major = strconv.FormatInt(d.Major, 10)
 		minor = strconv.FormatInt(d.Minor, 10)
@@ -168,3 +168,7 @@ func (d *DeviceRule) CgroupString() string {
 	}
 	return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
 }
+
+func (d *Rule) Mkdev() (uint64, error) {
+	return mkDev(d)
+}

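Editor's note: the Permissions type above models the cgroup v1 "rwm" string as a small bit set. A worked example of its set operations (sketch; assumes the devices package as vendored here):

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	rw := devices.Permissions("rw")
	all := devices.Permissions("rwm")

	fmt.Println(rw.Union(all))        // "rwm"
	fmt.Println(all.Difference(rw))   // "m"  (rwm \ rw)
	fmt.Println(all.Intersection(rw)) // "rw"
	fmt.Println(rw.IsValid())         // true  ("rw" is a subset of {r,w,m})
	fmt.Println(devices.Permissions("rx").IsValid()) // false ('x' is not a device permission)
}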
+ 22 - 14
vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go → vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go

@@ -1,3 +1,5 @@
+// +build !windows
+
 package devices
 
 import (
@@ -6,7 +8,6 @@ import (
 	"os"
 	"path/filepath"
 
-	"github.com/opencontainers/runc/libcontainer/configs"
 	"golang.org/x/sys/unix"
 )
 
@@ -21,9 +22,16 @@ var (
 	ioutilReadDir = ioutil.ReadDir
 )
 
+func mkDev(d *Rule) (uint64, error) {
+	if d.Major == Wildcard || d.Minor == Wildcard {
+		return 0, errors.New("cannot mkdev() device with wildcards")
+	}
+	return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
+}
+
 // Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the
 // information about a linux device and return that information as a Device struct.
-func DeviceFromPath(path, permissions string) (*configs.Device, error) {
+func DeviceFromPath(path, permissions string) (*Device, error) {
 	var stat unix.Stat_t
 	err := unixLstat(path, &stat)
 	if err != nil {
@@ -31,7 +39,7 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
 	}
 
 	var (
-		devType   configs.DeviceType
+		devType   Type
 		mode      = stat.Mode
 		devNumber = uint64(stat.Rdev)
 		major     = unix.Major(devNumber)
@@ -39,41 +47,41 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
 	)
 	switch mode & unix.S_IFMT {
 	case unix.S_IFBLK:
-		devType = configs.BlockDevice
+		devType = BlockDevice
 	case unix.S_IFCHR:
-		devType = configs.CharDevice
+		devType = CharDevice
 	case unix.S_IFIFO:
-		devType = configs.FifoDevice
+		devType = FifoDevice
 	default:
 		return nil, ErrNotADevice
 	}
-	return &configs.Device{
-		DeviceRule: configs.DeviceRule{
+	return &Device{
+		Rule: Rule{
 			Type:        devType,
 			Major:       int64(major),
 			Minor:       int64(minor),
-			Permissions: configs.DevicePermissions(permissions),
+			Permissions: Permissions(permissions),
 		},
 		Path:     path,
-		FileMode: os.FileMode(mode),
+		FileMode: os.FileMode(mode &^ unix.S_IFMT),
 		Uid:      stat.Uid,
 		Gid:      stat.Gid,
 	}, nil
 }
 
 // HostDevices returns all devices that can be found under /dev directory.
-func HostDevices() ([]*configs.Device, error) {
+func HostDevices() ([]*Device, error) {
 	return GetDevices("/dev")
 }
 
 // GetDevices recursively traverses a directory specified by path
 // and returns all devices found there.
-func GetDevices(path string) ([]*configs.Device, error) {
+func GetDevices(path string) ([]*Device, error) {
 	files, err := ioutilReadDir(path)
 	if err != nil {
 		return nil, err
 	}
-	var out []*configs.Device
+	var out []*Device
 	for _, f := range files {
 		switch {
 		case f.IsDir():
@@ -104,7 +112,7 @@ func GetDevices(path string) ([]*configs.Device, error) {
 			}
 			return nil, err
 		}
-		if device.Type == configs.FifoDevice {
+		if device.Type == FifoDevice {
 			continue
 		}
 		out = append(out, device)

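Editor's note: the FileMode change above masks off unix.S_IFMT because the kernel's stat mode packs the file type into high bits whose positions mean something different to os.FileMode — converting the raw mode would set bogus flags. A quick illustration (sketch, not runc code):

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	mode := uint32(unix.S_IFCHR | 0o666) // a character device, rw-rw-rw-

	raw := os.FileMode(mode)                   // wrong: S_IFCHR lands on an undefined os.FileMode bit
	masked := os.FileMode(mode &^ unix.S_IFMT) // permission (and suid/sgid/sticky) bits only

	fmt.Printf("raw:    %#o\n", raw)    // 020666
	fmt.Printf("masked: %#o\n", masked) // 0666
}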
+ 51 - 29
vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c

@@ -59,14 +59,38 @@
 #include <sys/syscall.h>
 
 /* Use our own wrapper for memfd_create. */
-#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
-#  define SYS_memfd_create __NR_memfd_create
+#ifndef SYS_memfd_create
+#  ifdef __NR_memfd_create
+#    define SYS_memfd_create __NR_memfd_create
+#  else
+/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
+#    warning "libc is outdated -- using hard-coded SYS_memfd_create"
+#    if defined(__x86_64__)
+#      define SYS_memfd_create 319
+#    elif defined(__i386__)
+#      define SYS_memfd_create 356
+#    elif defined(__ia64__)
+#      define SYS_memfd_create 1340
+#    elif defined(__arm__)
+#      define SYS_memfd_create 385
+#    elif defined(__aarch64__)
+#      define SYS_memfd_create 279
+#    elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
+#      define SYS_memfd_create 360
+#    elif defined(__s390__) || defined(__s390x__)
+#      define SYS_memfd_create 350
+#    else
+#      warning "unknown architecture -- cannot hard-code SYS_memfd_create"
+#    endif
+#  endif
 #endif
+
 /* memfd_create(2) flags -- copied from <linux/memfd.h>. */
 #ifndef MFD_CLOEXEC
 #  define MFD_CLOEXEC       0x0001U
 #  define MFD_ALLOW_SEALING 0x0002U
 #endif
+
 int memfd_create(const char *name, unsigned int flags)
 {
 #ifdef SYS_memfd_create
@@ -77,7 +101,6 @@ int memfd_create(const char *name, unsigned int flags)
 #endif
 }
 
-
 /* This comes directly from <linux/fcntl.h>. */
 #ifndef F_LINUX_SPECIFIC_BASE
 #  define F_LINUX_SPECIFIC_BASE 1024
@@ -103,7 +126,7 @@ static void *must_realloc(void *ptr, size_t size)
 	void *old = ptr;
 	do {
 		ptr = realloc(old, size);
-	} while(!ptr);
+	} while (!ptr);
 	return ptr;
 }
 
@@ -115,10 +138,10 @@ static void *must_realloc(void *ptr, size_t size)
 static int is_self_cloned(void)
 {
 	int fd, ret, is_cloned = 0;
-	struct stat statbuf = {};
-	struct statfs fsbuf = {};
+	struct stat statbuf = { };
+	struct statfs fsbuf = { };
 
-	fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
+	fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
 	if (fd < 0) {
 		fprintf(stderr, "you have no read access to runc binary file\n");
 		return -ENOTRECOVERABLE;
@@ -274,7 +297,7 @@ enum {
 static int make_execfd(int *fdtype)
 {
 	int fd = -1;
-	char template[PATH_MAX] = {0};
+	char template[PATH_MAX] = { 0 };
 	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
 
 	if (!prefix || *prefix != '/')
@@ -303,7 +326,7 @@ static int make_execfd(int *fdtype)
 	*fdtype = EFD_FILE;
 	fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
 	if (fd >= 0) {
-		struct stat statbuf = {};
+		struct stat statbuf = { };
 		bool working_otmpfile = false;
 
 		/*
@@ -348,27 +371,27 @@ static int seal_execfd(int *fd, int fdtype)
 	switch (fdtype) {
 	case EFD_MEMFD:
 		return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
-	case EFD_FILE: {
-		/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
-		int newfd;
-		char fdpath[PATH_MAX] = {0};
+	case EFD_FILE:{
+			/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
+			int newfd;
+			char fdpath[PATH_MAX] = { 0 };
 
-		if (fchmod(*fd, 0100) < 0)
-			return -1;
+			if (fchmod(*fd, 0100) < 0)
+				return -1;
 
-		if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
-			return -1;
+			if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
+				return -1;
 
-		newfd = open(fdpath, O_PATH | O_CLOEXEC);
-		if (newfd < 0)
-			return -1;
+			newfd = open(fdpath, O_PATH | O_CLOEXEC);
+			if (newfd < 0)
+				return -1;
 
-		close(*fd);
-		*fd = newfd;
-		return 0;
-	}
+			close(*fd);
+			*fd = newfd;
+			return 0;
+		}
 	default:
-	   break;
+		break;
 	}
 	return -1;
 }
@@ -376,7 +399,7 @@ static int seal_execfd(int *fd, int fdtype)
 static int try_bindfd(void)
 {
 	int fd, ret = -1;
-	char template[PATH_MAX] = {0};
+	char template[PATH_MAX] = { 0 };
 	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
 
 	if (!prefix || *prefix != '/')
@@ -404,7 +427,6 @@ static int try_bindfd(void)
 	if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
 		goto out_umount;
 
-
 	/* Get read-only handle that we're sure can't be made read-write. */
 	ret = open(template, O_PATH | O_CLOEXEC);
 
@@ -448,7 +470,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
 			if (n < 0)
 				return -1;
 			nwritten += n;
-		} while(nwritten < nread);
+		} while (nwritten < nread);
 
 		total += nwritten;
 	}
@@ -459,7 +481,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
 static int clone_binary(void)
 {
 	int binfd, execfd;
-	struct stat statbuf = {};
+	struct stat statbuf = { };
 	size_t sent = 0;
 	int fdtype = EFD_NONE;
 

+ 142 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c

@@ -0,0 +1,142 @@
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef ESCAPE_TEST
+#  include <assert.h>
+#  define test_assert(arg) assert(arg)
+#else
+#  define test_assert(arg)
+#endif
+
+#define DEL '\x7f'
+
+/*
+ * Poor man version of itoa with base=16 and input number from 0 to 15,
+ * represented by a char. Converts it to a single hex digit ('0' to 'f').
+ */
+static char hex(char i)
+{
+	test_assert(i >= 0 && i < 16);
+
+	if (i >= 0 && i < 10) {
+		return '0' + i;
+	}
+	if (i >= 10 && i < 16) {
+		return 'a' + i - 10;
+	}
+	return '?';
+}
+
+/*
+ * Given the character, tells how many _extra_ characters are needed
+ * to JSON-escape it. If 0 is returned, the character does not need to
+ * be escaped.
+ */
+static int need_escape(char c)
+{
+	switch (c) {
+	case '\\':
+	case '"':
+	case '\b':
+	case '\n':
+	case '\r':
+	case '\t':
+	case '\f':
+		return 1;
+	case DEL:		// -> \u007f
+		return 5;
+	default:
+		if (c > 0 && c < ' ') {
+			// ASCII decimal 01 to 31 -> \u00xx
+			return 5;
+		}
+		return 0;
+	}
+}
+
+/*
+ * Escape the string so it can be used as a JSON string (per RFC4627,
+ * section 2.5 minimal requirements, plus the DEL (0x7f) character).
+ *
+ * It is expected that the argument is a string allocated via malloc.
+ * In case no escaping is needed, the original string is returned as is;
+ * otherwise, the original string is free'd, and the newly allocated
+ * escaped string is returned. Thus, in any case, the value returned
+ * need to be free'd by the caller.
+ */
+char *escape_json_string(char *s)
+{
+	int i, j, len;
+	char *c, *out;
+
+	/*
+	 * First, check if escaping is at all needed -- if not, we can avoid
+	 * malloc and return the argument as is.  While at it, count how much
+	 * extra space is required.
+	 *
+	 * XXX: the counting code must be in sync with the escaping code
+	 * (checked by test_assert()s below).
+	 */
+	for (i = j = 0; s[i] != '\0'; i++) {
+		j += need_escape(s[i]);
+	}
+	if (j == 0) {
+		// nothing to escape
+		return s;
+	}
+
+	len = i + j + 1;
+	out = malloc(len);
+	if (!out) {
+		free(s);
+		// As malloc failed, strdup can fail, too, so in the worst case
+		// scenario NULL will be returned from here.
+		return strdup("escape_json_string: out of memory");
+	}
+	for (c = s, j = 0; *c != '\0'; c++) {
+		switch (*c) {
+		case '"':
+		case '\\':
+			test_assert(need_escape(*c) == 1);
+			out[j++] = '\\';
+			out[j++] = *c;
+			continue;
+		}
+		if ((*c < 0 || *c >= ' ') && (*c != DEL)) {
+			// no escape needed
+			test_assert(need_escape(*c) == 0);
+			out[j++] = *c;
+			continue;
+		}
+		out[j++] = '\\';
+		switch (*c) {
+		case '\b':
+			out[j++] = 'b';
+			break;
+		case '\n':
+			out[j++] = 'n';
+			break;
+		case '\r':
+			out[j++] = 'r';
+			break;
+		case '\t':
+			out[j++] = 't';
+			break;
+		case '\f':
+			out[j++] = 'f';
+			break;
+		default:
+			test_assert(need_escape(*c) == 5);
+			out[j++] = 'u';
+			out[j++] = '0';
+			out[j++] = '0';
+			out[j++] = hex(*c >> 4);
+			out[j++] = hex(*c & 0x0f);
+		}
+	}
+	test_assert(j + 1 == len);
+	out[j] = '\0';
+
+	free(s);
+	return out;
+}

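Editor's note: the new escape.c above implements the minimal JSON string escaping of RFC 4627 §2.5 (escape '"' and '\', turn control characters below 0x20 into \u00xx or a short form like \n) plus the DEL (0x7f) character. A Go sketch of the same rule, for illustration only (the vendored implementation is the C file above):

package main

import "fmt"

func escapeJSONString(s string) string {
	out := make([]byte, 0, len(s))
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c == '"' || c == '\\':
			out = append(out, '\\', c)
		case c == '\b':
			out = append(out, '\\', 'b')
		case c == '\n':
			out = append(out, '\\', 'n')
		case c == '\r':
			out = append(out, '\\', 'r')
		case c == '\t':
			out = append(out, '\\', 't')
		case c == '\f':
			out = append(out, '\\', 'f')
		case c < 0x20 || c == 0x7f: // other control chars, plus DEL
			out = append(out, fmt.Sprintf("\\u%04x", c)...)
		default:
			out = append(out, c)
		}
	}
	return string(out)
}

func main() {
	fmt.Println(escapeJSONString("tab\there \x7f quote\""))
	// tab\there \u007f quote\"
}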
+ 222 - 139
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c

@@ -29,6 +29,8 @@
 /* Get all of the CLONE_NEW* flags. */
 #include "namespace.h"
 
+extern char *escape_json_string(char *str);
+
 /* Synchronisation values. */
 enum sync_t {
 	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
@@ -36,7 +38,7 @@ enum sync_t {
 	SYNC_RECVPID_PLS = 0x42,	/* Tell parent we're sending the PID. */
 	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
 	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
-	SYNC_CHILD_READY = 0x45,	/* The child or grandchild is ready to return. */
+	SYNC_CHILD_FINISH = 0x45,	/* The child or grandchild has finished. */
 };
 
 /*
@@ -45,10 +47,14 @@ enum sync_t {
  */
 #define CREATECGROUPNS 0x80
 
+#define STAGE_SETUP  -1
 /* longjmp() arguments. */
-#define JUMP_PARENT 0x00
-#define JUMP_CHILD  0xA0
-#define JUMP_INIT   0xA1
+#define STAGE_PARENT  0
+#define STAGE_CHILD   1
+#define STAGE_INIT    2
+
+/* Stores the current stage of nsexec. */
+int current_stage = STAGE_SETUP;
 
 /* Assume the stack grows down, so arguments should be above it. */
 struct clone_t {
@@ -56,7 +62,7 @@ struct clone_t {
 	 * Reserve some space for clone() to locate arguments
 	 * and retcode in this place
 	 */
-	char stack[4096] __attribute__ ((aligned(16)));
+	char stack[4096] __attribute__((aligned(16)));
 	char stack_ptr[0];
 
 	/* There's two children. This is used to execute the different code. */
@@ -102,31 +108,31 @@ static int logfd = -1;
  * List of netlink message types sent to us as part of bootstrapping the init.
  * These constants are defined in libcontainer/message_linux.go.
  */
-#define INIT_MSG			62000
+#define INIT_MSG		62000
 #define CLONE_FLAGS_ATTR	27281
 #define NS_PATHS_ATTR		27282
-#define UIDMAP_ATTR			27283
-#define GIDMAP_ATTR			27284
+#define UIDMAP_ATTR		27283
+#define GIDMAP_ATTR		27284
 #define SETGROUP_ATTR		27285
 #define OOM_SCORE_ADJ_ATTR	27286
 #define ROOTLESS_EUID_ATTR	27287
-#define UIDMAPPATH_ATTR	    27288
-#define GIDMAPPATH_ATTR	    27289
+#define UIDMAPPATH_ATTR		27288
+#define GIDMAPPATH_ATTR		27289
 
 /*
  * Use the raw syscall for versions of glibc which don't include a function for
  * it, namely (glibc 2.12).
  */
 #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
-#	define _GNU_SOURCE
-#	include "syscall.h"
-#	if !defined(SYS_setns) && defined(__NR_setns)
-#		define SYS_setns __NR_setns
-#	endif
-
-#ifndef SYS_setns
-#	error "setns(2) syscall not supported by glibc version"
-#endif
+#  define _GNU_SOURCE
+#  include "syscall.h"
+#  if !defined(SYS_setns) && defined(__NR_setns)
+#    define SYS_setns __NR_setns
+#  endif
+
+#  ifndef SYS_setns
+#    error "setns(2) syscall not supported by glibc version"
+#  endif
 
 int setns(int fd, int nstype)
 {
@@ -134,33 +140,43 @@ int setns(int fd, int nstype)
 }
 #endif
 
-static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...)
+static void write_log(const char *level, const char *format, ...)
 {
-	char message[1024] = {};
-
+	char *message = NULL, *stage = NULL;
 	va_list args;
+	int ret;
 
 	if (logfd < 0 || level == NULL)
-		return;
+		goto out;
 
 	va_start(args, format);
-	if (vsnprintf(message, sizeof(message), format, args) < 0)
-		goto done;
-
-	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
-done:
+	ret = vasprintf(&message, format, args);
 	va_end(args);
-}
+	if (ret < 0)
+		goto out;
 
-#define write_log(level, fmt, ...) \
-	write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__)
+	message = escape_json_string(message);
+
+	if (current_stage == STAGE_SETUP)
+		stage = strdup("nsexec");
+	else
+		ret = asprintf(&stage, "nsexec-%d", current_stage);
+	if (ret < 0)
+		goto out;
+
+	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message);
+
+out:
+	free(message);
+	free(stage);
+}
 
 /* XXX: This is ugly. */
 static int syncfd = -1;
 
 #define bail(fmt, ...)                                       \
 	do {                                                       \
-		write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \
+		write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \
 		exit(1);                                                 \
 	} while(0)
 
@@ -187,7 +203,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
 		goto out;
 	}
 
- out:
+out:
 	close(fd);
 	return ret;
 }
@@ -297,9 +313,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
 	if (map == NULL || map_len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
 		if (errno != EPERM)
 			bail("failed to update /proc/%d/uid_map", pid);
+		write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
 		if (try_mapping_tool(path, pid, map, map_len))
 			bail("failed to use newuid map on %d", pid);
 	}
@@ -310,9 +328,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
 	if (map == NULL || map_len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
 		if (errno != EPERM)
 			bail("failed to update /proc/%d/gid_map", pid);
+		write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
 		if (try_mapping_tool(path, pid, map, map_len))
 			bail("failed to use newgid map on %d", pid);
 	}
@@ -323,19 +343,20 @@ static void update_oom_score_adj(char *data, size_t len)
 	if (data == NULL || len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
 	if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
 		bail("failed to update /proc/self/oom_score_adj");
 }
 
 /* A dummy function that just jumps to the given jumpval. */
-static int child_func(void *arg) __attribute__ ((noinline));
+static int child_func(void *arg) __attribute__((noinline));
 static int child_func(void *arg)
 {
 	struct clone_t *ca = (struct clone_t *)arg;
 	longjmp(*ca->env, ca->jmpval);
 }
 
-static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
 static int clone_parent(jmp_buf *env, int jmpval)
 {
 	struct clone_t ca = {
@@ -507,7 +528,6 @@ void join_namespaces(char *nslist)
 	char *namespace = strtok_r(nslist, ",", &saveptr);
 	struct namespace_t {
 		int fd;
-		int ns;
 		char type[PATH_MAX];
 		char path[PATH_MAX];
 	} *namespaces = NULL;
@@ -542,7 +562,7 @@ void join_namespaces(char *nslist)
 			bail("failed to open %s", path);
 
 		ns->fd = fd;
-		ns->ns = nsflag(namespace);
+		strncpy(ns->type, namespace, PATH_MAX - 1);
 		strncpy(ns->path, path, PATH_MAX - 1);
 		ns->path[PATH_MAX - 1] = '\0';
 	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
@@ -555,12 +575,14 @@ void join_namespaces(char *nslist)
 	 */
 
 	for (i = 0; i < num; i++) {
-		struct namespace_t ns = namespaces[i];
+		struct namespace_t *ns = &namespaces[i];
+		int flag = nsflag(ns->type);
 
-		if (setns(ns.fd, ns.ns) < 0)
-			bail("failed to setns to %s", ns.path);
+		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
+		if (setns(ns->fd, flag) < 0)
+			bail("failed to setns into %s namespace", ns->type);
 
-		close(ns.fd);
+		close(ns->fd);
 	}
 
 	free(namespaces);
@@ -569,6 +591,14 @@ void join_namespaces(char *nslist)
 /* Defined in cloned_binary.c. */
 extern int ensure_cloned_binary(void);
 
+static inline int sane_kill(pid_t pid, int signum)
+{
+	if (pid > 0)
+		return kill(pid, signum);
+	else
+		return 0;
+}
+
 void nsexec(void)
 {
 	int pipenum;
@@ -598,7 +628,14 @@ void nsexec(void)
 	if (ensure_cloned_binary() < 0)
 		bail("could not ensure we are a cloned binary");
 
-	write_log(DEBUG, "nsexec started");
+	/*
+	 * Inform the parent we're past initial setup.
+	 * For the other side of this, see initWaiter.
+	 */
+	if (write(pipenum, "", 1) != 1)
+		bail("could not inform the parent we are past initial setup");
+
+	write_log(DEBUG, "=> nsexec container setup");
 
 	/* Parse all of the netlink configuration. */
 	nl_parse(pipenum, &config);
@@ -622,6 +659,7 @@ void nsexec(void)
 	 * containers), which is the recommendation from the kernel folks.
 	 */
 	if (config.namespaces) {
+		write_log(DEBUG, "set process as non-dumpable");
 		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
 			bail("failed to set process as non-dumpable");
 	}
@@ -686,45 +724,49 @@ void nsexec(void)
 	 * -- Aleksa "what has my life come to?" Sarai
 	 */
 
-	switch (setjmp(env)) {
+	current_stage = setjmp(env);
+	switch (current_stage) {
 		/*
 		 * Stage 0: We're in the parent. Our job is just to create a new child
-		 *          (stage 1: JUMP_CHILD) process and write its uid_map and
+		 *          (stage 1: STAGE_CHILD) process and write its uid_map and
 		 *          gid_map. That process will go on to create a new process, then
 		 *          it will send us its PID which we will send to the bootstrap
 		 *          process.
 		 */
-	case JUMP_PARENT:{
+	case STAGE_PARENT:{
 			int len;
-			pid_t child, first_child = -1;
-			bool ready = false;
+			pid_t stage1_pid = -1, stage2_pid = -1;
+			bool stage1_complete, stage2_complete;
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-0");
 
 			/* Start the process of getting a container. */
-			child = clone_parent(&env, JUMP_CHILD);
-			if (child < 0)
-				bail("unable to fork: child_func");
+			write_log(DEBUG, "spawn stage-1");
+			stage1_pid = clone_parent(&env, STAGE_CHILD);
+			if (stage1_pid < 0)
+				bail("unable to spawn stage-1");
 
-			/*
-			 * State machine for synchronisation with the children.
-			 *
-			 * Father only return when both child and grandchild are
-			 * ready, so we can receive all possible error codes
-			 * generated by children.
-			 */
 			syncfd = sync_child_pipe[1];
 			close(sync_child_pipe[0]);
 
-			while (!ready) {
+			/*
+			 * State machine for synchronisation with the children. We only
+			 * return once both the child and grandchild are ready.
+			 */
+			write_log(DEBUG, "-> stage-1 synchronisation loop");
+			stage1_complete = false;
+			while (!stage1_complete) {
 				enum sync_t s;
 
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with child: next state");
+					bail("failed to sync with stage-1: next state");
 
 				switch (s) {
 				case SYNC_USERMAP_PLS:
+					write_log(DEBUG, "stage-1 requested userns mappings");
+
 					/*
 					 * Enable setgroups(2) if we've been asked to. But we also
 					 * have to explicitly disable setgroups(2) if we're
@@ -735,70 +777,78 @@ void nsexec(void)
 					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
 					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
 					 * newuidmap/newgidmap shall be used.
 					 * newuidmap/newgidmap shall be used.
 					 */
 					 */
-
 					if (config.is_rootless_euid && !config.is_setgroup)
 					if (config.is_rootless_euid && !config.is_setgroup)
-						update_setgroups(child, SETGROUPS_DENY);
+						update_setgroups(stage1_pid, SETGROUPS_DENY);
 
 
 					/* Set up mappings. */
 					/* Set up mappings. */
-					update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
-					update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
+					update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
+					update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
 
 
 					s = SYNC_USERMAP_ACK;
 					s = SYNC_USERMAP_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						kill(child, SIGKILL);
-						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
 					}
 					}
 					break;
 					break;
-				case SYNC_RECVPID_PLS:{
-						first_child = child;
-
-						/* Get the init_func pid. */
-						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
-							kill(first_child, SIGKILL);
-							bail("failed to sync with child: read(childpid)");
-						}
-
-						/* Send ACK. */
-						s = SYNC_RECVPID_ACK;
-						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-							kill(first_child, SIGKILL);
-							kill(child, SIGKILL);
-							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
-						}
-
-						/* Send the init_func pid back to our parent.
-						 *
-						 * Send the init_func pid and the pid of the first child back to our parent.
-						 * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
-						 * It becomes the responsibility of our parent to reap the first child.
-						 */
-						len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
-						if (len < 0) {
-							kill(child, SIGKILL);
-							bail("unable to generate JSON for child pid");
-						}
+				case SYNC_RECVPID_PLS:
+					write_log(DEBUG, "stage-1 requested pid to be forwarded");
+
+					/* Get the stage-2 pid. */
+					if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: read(stage2_pid)");
+					}
+
+					/* Send ACK. */
+					s = SYNC_RECVPID_ACK;
+					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
+					}
+
+					/*
+					 * Send both the stage-1 and stage-2 pids back to runc.
+					 * runc needs the stage-2 to continue process management,
+					 * but because stage-1 was spawned with CLONE_PARENT we
+					 * cannot reap it within stage-0 and thus we need to ask
+					 * runc to reap the zombie for us.
+					 */
+					write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
+						  stage1_pid, stage2_pid);
+					len =
+					    dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
+						    stage2_pid);
+					if (len < 0) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with runc: write(pid-JSON)");
 					}
 					break;
-				case SYNC_CHILD_READY:
-					ready = true;
+				case SYNC_CHILD_FINISH:
+					write_log(DEBUG, "stage-1 complete");
+					stage1_complete = true;
 					break;
 				default:
 					bail("unexpected sync value: %u", s);
 				}
 			}
+			write_log(DEBUG, "<- stage-1 synchronisation loop");
 
 			/* Now sync with grandchild. */
-
 			syncfd = sync_grandchild_pipe[1];
 			close(sync_grandchild_pipe[0]);
-
-			ready = false;
-			while (!ready) {
+			write_log(DEBUG, "-> stage-2 synchronisation loop");
+			stage2_complete = false;
+			while (!stage2_complete) {
 				enum sync_t s;
 
+				write_log(DEBUG, "signalling stage-2 to run");
 				s = SYNC_GRANDCHILD;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-					kill(child, SIGKILL);
+					sane_kill(stage2_pid, SIGKILL);
 					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
 				}
 
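
For context on the stage-0 loop above, here is a minimal Go sketch (names and structure are assumptions for illustration, not the actual runc code) of the parent side of this handshake: it decodes the {"stage1_pid":N,"stage2_pid":M} line that stage-0 writes to the init pipe, and reaps the stage-1 zombie that CLONE_PARENT re-parents onto runc itself.

package stage0

import (
	"encoding/json"
	"io"

	"golang.org/x/sys/unix"
)

// nsexecPids mirrors the JSON written by stage-0 (field names from the diff).
type nsexecPids struct {
	Stage1 int `json:"stage1_pid"`
	Stage2 int `json:"stage2_pid"`
}

// readPidsAndReap is a hypothetical helper: it reads the pid JSON and reaps
// stage-1, returning the stage-2 pid that becomes the container init.
func readPidsAndReap(initPipe io.Reader) (int, error) {
	var pids nsexecPids
	if err := json.NewDecoder(initPipe).Decode(&pids); err != nil {
		return 0, err
	}
	// Reap the stage-1 zombie; stage-2 lives on as the container init.
	var ws unix.WaitStatus
	if _, err := unix.Wait4(pids.Stage1, &ws, 0, nil); err != nil {
		return 0, err
	}
	return pids.Stage2, nil
}
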
@@ -806,27 +856,31 @@ void nsexec(void)
 					bail("failed to sync with child: next state");
 					bail("failed to sync with child: next state");
 
 
 				switch (s) {
 				switch (s) {
-				case SYNC_CHILD_READY:
-					ready = true;
+				case SYNC_CHILD_FINISH:
+					write_log(DEBUG, "stage-2 complete");
+					stage2_complete = true;
 					break;
 					break;
 				default:
 				default:
 					bail("unexpected sync value: %u", s);
 					bail("unexpected sync value: %u", s);
 				}
 				}
 			}
 			}
+			write_log(DEBUG, "<- stage-2 synchronisation loop");
+			write_log(DEBUG, "<~ nsexec stage-0");
 			exit(0);
 			exit(0);
 		}
 		}
+		break;
 
 
 		/*
 		/*
 		 * Stage 1: We're in the first child process. Our job is to join any
 		 * Stage 1: We're in the first child process. Our job is to join any
-		 *          provided namespaces in the netlink payload and unshare all
-		 *          of the requested namespaces. If we've been asked to
-		 *          CLONE_NEWUSER, we will ask our parent (stage 0) to set up
-		 *          our user mappings for us. Then, we create a new child
-		 *          (stage 2: JUMP_INIT) for PID namespace. We then send the
-		 *          child's PID to our parent (stage 0).
+		 *          provided namespaces in the netlink payload and unshare all of
+		 *          the requested namespaces. If we've been asked to CLONE_NEWUSER,
+		 *          we will ask our parent (stage 0) to set up our user mappings
+		 *          for us. Then, we create a new child (stage 2: STAGE_INIT) for
+		 *          PID namespace. We then send the child's PID to our parent
+		 *          (stage 0).
 		 */
 		 */
-	case JUMP_CHILD:{
-			pid_t child;
+	case STAGE_CHILD:{
+			pid_t stage2_pid = -1;
 			enum sync_t s;
 			enum sync_t s;
 
 
 			/* We're in a child and thus need to tell the parent if we die. */
 			/* We're in a child and thus need to tell the parent if we die. */
@@ -835,11 +889,12 @@ void nsexec(void)
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-1");
 
 			/*
 			 * We need to setns first. We cannot do this earlier (in stage 0)
 			 * because of the fact that we forked to get here (the PID of
-			 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
+			 * [stage 2: STAGE_INIT]) would be meaningless). We could send it
 			 * using cmsg(3) but that's just annoying.
 			 */
 			if (config.namespaces)
@@ -865,40 +920,50 @@ void nsexec(void)
 			 * problem.
 			 */
 			if (config.cloneflags & CLONE_NEWUSER) {
+				write_log(DEBUG, "unshare user namespace");
 				if (unshare(CLONE_NEWUSER) < 0)
 					bail("failed to unshare user namespace");
 				config.cloneflags &= ~CLONE_NEWUSER;
 
 				/*
-				 * We don't have the privileges to do any mapping here (see the
-				 * clone_parent rant). So signal our parent to hook us up.
+				 * We need to set ourselves as dumpable temporarily so that the
+				 * parent process can write to our procfs files.
 				 */
-
-				/* Switching is only necessary if we joined namespaces. */
 				if (config.namespaces) {
+					write_log(DEBUG, "temporarily set process as dumpable");
 					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
-						bail("failed to set process as dumpable");
+						bail("failed to temporarily set process as dumpable");
 				}
+
+				/*
+				 * We don't have the privileges to do any mapping here (see the
+				 * clone_parent rant). So signal stage-0 to do the mapping for
+				 * us.
+				 */
+				write_log(DEBUG, "request stage-0 to map user namespace");
 				s = SYNC_USERMAP_PLS;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
 
 				/* ... wait for mapping ... */
-
+				write_log(DEBUG, "waiting for stage-0 to complete the user namespace mapping");
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
 				if (s != SYNC_USERMAP_ACK)
 					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
-				/* Switching is only necessary if we joined namespaces. */
+
+				/* Revert temporary re-dumpable setting. */
 				if (config.namespaces) {
+					write_log(DEBUG, "re-set process as non-dumpable");
 					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
-						bail("failed to set process as dumpable");
+						bail("failed to re-set process as non-dumpable");
 				}
 
 				/* Become root in the namespace proper. */
 				if (setresuid(0, 0, 0) < 0)
 					bail("failed to become root in user namespace");
 			}
+
 			/*
 			 * Unshare all of the namespaces. Now, it should be noted that this
 			 * ordering might break in the future (especially with rootless
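
The SYNC_USERMAP_PLS/ACK exchange above exists because stage-1 cannot write its own mappings; stage-0 does it through the child's procfs while the child is temporarily dumpable. A rough Go equivalent of what update_setgroups/update_uidmap/update_gidmap amount to — a sketch assuming single-string map contents, not runc's actual helpers:

package usermap

import (
	"fmt"
	"os"
)

// writeUserNSMappings writes the child's userns files. "deny" must go to
// setgroups before gid_map is written, mirroring update_setgroups above.
func writeUserNSMappings(pid int, uidMap, gidMap string, denySetgroups bool) error {
	if denySetgroups {
		if err := os.WriteFile(fmt.Sprintf("/proc/%d/setgroups", pid), []byte("deny"), 0); err != nil {
			return err
		}
	}
	if err := os.WriteFile(fmt.Sprintf("/proc/%d/uid_map", pid), []byte(uidMap), 0); err != nil {
		return err
	}
	return os.WriteFile(fmt.Sprintf("/proc/%d/gid_map", pid), []byte(gidMap), 0)
}
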
@@ -909,8 +974,9 @@ void nsexec(void)
 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
 			 * was broken, so we'll just do it the long way anyway.
 			 */
+			write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
 			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
-				bail("failed to unshare namespaces");
+				bail("failed to unshare remaining namespaces (except cgroupns)");
 
 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
@@ -921,41 +987,45 @@ void nsexec(void)
 			 * which would break many applications and libraries, so we must fork
 			 * to actually enter the new PID namespace.
 			 */
-			child = clone_parent(&env, JUMP_INIT);
-			if (child < 0)
-				bail("unable to fork: init_func");
+			write_log(DEBUG, "spawn stage-2");
+			stage2_pid = clone_parent(&env, STAGE_INIT);
+			if (stage2_pid < 0)
+				bail("unable to spawn stage-2");
 
 			/* Send the child to our parent, which knows what it's doing. */
+			write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
 			s = SYNC_RECVPID_PLS;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
 			}
-			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
-				kill(child, SIGKILL);
-				bail("failed to sync with parent: write(childpid)");
+			if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
+				sane_kill(stage2_pid, SIGKILL);
+				bail("failed to sync with parent: write(stage2_pid)");
 			}
 
 			/* ... wait for parent to get the pid ... */
-
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
 			}
 			if (s != SYNC_RECVPID_ACK) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
 			}
 
-			s = SYNC_CHILD_READY;
+			write_log(DEBUG, "signal completion to stage-0");
+			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
-				bail("failed to sync with parent: write(SYNC_CHILD_READY)");
+				sane_kill(stage2_pid, SIGKILL);
+				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
 			}
 
-			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
+			/* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
+			write_log(DEBUG, "<~ nsexec stage-1");
 			exit(0);
 		}
+		break;
 
 		/*
 		 * Stage 2: We're the final child process, and the only process that will
@@ -963,7 +1033,7 @@ void nsexec(void)
 		 *          final cleanup steps and then return to the Go runtime to allow
 		 *          init_linux.go to run.
 		 */
-	case JUMP_INIT:{
+	case STAGE_INIT:{
 			/*
 			 * We're inside the child now, having jumped from the
 			 * start_child() code after forking in the parent.
@@ -978,6 +1048,7 @@ void nsexec(void)
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-2");
 
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
@@ -998,21 +1069,30 @@ void nsexec(void)
 					bail("setgroups failed");
 					bail("setgroups failed");
 			}
 			}
 
 
-			/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
+			/*
+			 * Wait until our topmost parent has finished cgroup setup in
+			 * p.manager.Apply().
+			 *
+			 * TODO(cyphar): Check if this code is actually needed because we
+			 *               should be in the cgroup even from stage-0, so
+			 *               waiting until now might not make sense.
+			 */
 			if (config.cloneflags & CLONE_NEWCGROUP) {
 			if (config.cloneflags & CLONE_NEWCGROUP) {
 				uint8_t value;
 				uint8_t value;
 				if (read(pipenum, &value, sizeof(value)) != sizeof(value))
 				if (read(pipenum, &value, sizeof(value)) != sizeof(value))
 					bail("read synchronisation value failed");
 					bail("read synchronisation value failed");
 				if (value == CREATECGROUPNS) {
 				if (value == CREATECGROUPNS) {
+					write_log(DEBUG, "unshare cgroup namespace");
 					if (unshare(CLONE_NEWCGROUP) < 0)
 					if (unshare(CLONE_NEWCGROUP) < 0)
 						bail("failed to unshare cgroup namespace");
 						bail("failed to unshare cgroup namespace");
 				} else
 				} else
 					bail("received unknown synchronisation value");
 					bail("received unknown synchronisation value");
 			}
 			}
 
 
-			s = SYNC_CHILD_READY;
+			write_log(DEBUG, "signal completion to stage-0");
+			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
-				bail("failed to sync with patent: write(SYNC_CHILD_READY)");
+				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
 
 			/* Close sync pipes. */
 			close(sync_grandchild_pipe[0]);
@@ -1021,10 +1101,13 @@ void nsexec(void)
 			nl_free(&config);
 
 			/* Finish executing, let the Go runtime take over. */
+			write_log(DEBUG, "<= nsexec container setup");
+			write_log(DEBUG, "booting up go runtime ...");
 			return;
 		}
+		break;
 	default:
-		bail("unexpected jump value");
+		bail("unknown stage '%d' for jump value", current_stage);
 	}
 
 	/* Should never be reached. */

+ 1 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c

@@ -0,0 +1 @@
+../escape.c

+ 53 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go

@@ -0,0 +1,53 @@
+package escapetest
+
+// This file is part of escape_json_string unit test.
+// It is in a separate package so cgo can be used together
+// with go test.
+
+// #include <stdlib.h>
+// extern char *escape_json_string(char *str);
+// #cgo CFLAGS: -DESCAPE_TEST=1
+import "C"
+
+import (
+	"testing"
+	"unsafe"
+)
+
+func testEscapeJsonString(t *testing.T, input, want string) {
+	in := C.CString(input)
+	out := C.escape_json_string(in)
+	got := C.GoString(out)
+	C.free(unsafe.Pointer(out))
+	t.Logf("input: %q, output: %q", input, got)
+	if got != want {
+		t.Errorf("Failed on input: %q, want %q, got %q", input, want, got)
+	}
+}
+
+func testEscapeJson(t *testing.T) {
+	testCases := []struct {
+		input, output string
+	}{
+		{"", ""},
+		{"abcdef", "abcdef"},
+		{`\\\\\\`, `\\\\\\\\\\\\`},
+		{`with"quote`, `with\"quote`},
+		{"\n\r\b\t\f\\", `\n\r\b\t\f\\`},
+		{"\007", "\\u0007"},
+		{"\017 \020 \037", "\\u000f \\u0010 \\u001f"},
+		{"\033", "\\u001b"},
+		{`<->`, `<->`},
+		{"\176\177\200", "~\\u007f\200"},
+		{"\000", ""},
+		{"a\x7fxc", "a\\u007fxc"},
+		{"a\033xc", "a\\u001bxc"},
+		{"a\nxc", "a\\nxc"},
+		{"a\\xc", "a\\\\xc"},
+		{"Barney B\303\244r", "Barney B\303\244r"},
+	}
+
+	for _, tc := range testCases {
+		testEscapeJsonString(t, tc.input, tc.output)
+	}
+}

+ 0 - 41
vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go

@@ -1,41 +0,0 @@
-package user
-
-import (
-	"errors"
-)
-
-var (
-	// The current operating system does not provide the required data for user lookups.
-	ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
-	// No matching entries found in file.
-	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
-	ErrNoGroupEntries  = errors.New("no matching entries in group file")
-)
-
-// LookupUser looks up a user by their username in /etc/passwd. If the user
-// cannot be found (or there is no /etc/passwd file on the filesystem), then
-// LookupUser returns an error.
-func LookupUser(username string) (User, error) {
-	return lookupUser(username)
-}
-
-// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
-// be found (or there is no /etc/passwd file on the filesystem), then LookupId
-// returns an error.
-func LookupUid(uid int) (User, error) {
-	return lookupUid(uid)
-}
-
-// LookupGroup looks up a group by its name in /etc/group. If the group cannot
-// be found (or there is no /etc/group file on the filesystem), then LookupGroup
-// returns an error.
-func LookupGroup(groupname string) (Group, error) {
-	return lookupGroup(groupname)
-}
-
-// LookupGid looks up a group by its group id in /etc/group. If the group cannot
-// be found (or there is no /etc/group file on the filesystem), then LookupGid
-// returns an error.
-func LookupGid(gid int) (Group, error) {
-	return lookupGid(gid)
-}

+ 16 - 4
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go

@@ -16,13 +16,19 @@ const (
 	unixGroupPath  = "/etc/group"
 )
 
-func lookupUser(username string) (User, error) {
+// LookupUser looks up a user by their username in /etc/passwd. If the user
+// cannot be found (or there is no /etc/passwd file on the filesystem), then
+// LookupUser returns an error.
+func LookupUser(username string) (User, error) {
 	return lookupUserFunc(func(u User) bool {
 		return u.Name == username
 	})
 }
 
-func lookupUid(uid int) (User, error) {
+// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
+// be found (or there is no /etc/passwd file on the filesystem), then LookupId
+// returns an error.
+func LookupUid(uid int) (User, error) {
 	return lookupUserFunc(func(u User) bool {
 		return u.Uid == uid
 	})
@@ -51,13 +57,19 @@ func lookupUserFunc(filter func(u User) bool) (User, error) {
 	return users[0], nil
 }
 
-func lookupGroup(groupname string) (Group, error) {
+// LookupGroup looks up a group by its name in /etc/group. If the group cannot
+// be found (or there is no /etc/group file on the filesystem), then LookupGroup
+// returns an error.
+func LookupGroup(groupname string) (Group, error) {
 	return lookupGroupFunc(func(g Group) bool {
 		return g.Name == groupname
 	})
 }
 
-func lookupGid(gid int) (Group, error) {
+// LookupGid looks up a group by its group id in /etc/group. If the group cannot
+// be found (or there is no /etc/group file on the filesystem), then LookupGid
+// returns an error.
+func LookupGid(gid int) (Group, error) {
 	return lookupGroupFunc(func(g Group) bool {
 		return g.Gid == gid
 	})
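
With lookup_unix.go now exporting these functions directly (the wrapper file removed above is gone), callers use them unchanged; a small usage sketch:

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/user"
)

func main() {
	// Looks up "root" in /etc/passwd; errors if the file or entry is missing.
	u, err := user.LookupUser("root")
	if err != nil {
		fmt.Println("lookup failed:", err)
		return
	}
	fmt.Printf("root -> uid=%d gid=%d home=%s\n", u.Uid, u.Gid, u.Home)
}
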

+ 0 - 40
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go

@@ -1,40 +0,0 @@
-// +build windows
-
-package user
-
-import (
-	"fmt"
-	"os/user"
-)
-
-func lookupUser(username string) (User, error) {
-	u, err := user.Lookup(username)
-	if err != nil {
-		return User{}, err
-	}
-	return userFromOS(u)
-}
-
-func lookupUid(uid int) (User, error) {
-	u, err := user.LookupId(fmt.Sprintf("%d", uid))
-	if err != nil {
-		return User{}, err
-	}
-	return userFromOS(u)
-}
-
-func lookupGroup(groupname string) (Group, error) {
-	g, err := user.LookupGroup(groupname)
-	if err != nil {
-		return Group{}, err
-	}
-	return groupFromOS(g)
-}
-
-func lookupGid(gid int) (Group, error) {
-	g, err := user.LookupGroupId(fmt.Sprintf("%d", gid))
-	if err != nil {
-		return Group{}, err
-	}
-	return groupFromOS(g)
-}

+ 10 - 42
vendor/github.com/opencontainers/runc/libcontainer/user/user.go

@@ -2,10 +2,10 @@ package user
 
 import (
 	"bufio"
+	"errors"
 	"fmt"
 	"io"
 	"os"
-	"os/user"
 	"strconv"
 	"strings"
 )
@@ -16,6 +16,13 @@ const (
 )
 
 var (
+	// The current operating system does not provide the required data for user lookups.
+	ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
+
+	// No matching entries found in file.
+	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
+	ErrNoGroupEntries  = errors.New("no matching entries in group file")
+
 	ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
 )
 
@@ -29,28 +36,6 @@ type User struct {
 	Shell string
 }
 
-// userFromOS converts an os/user.(*User) to local User
-//
-// (This does not include Pass, Shell or Gecos)
-func userFromOS(u *user.User) (User, error) {
-	newUser := User{
-		Name: u.Username,
-		Home: u.HomeDir,
-	}
-	id, err := strconv.Atoi(u.Uid)
-	if err != nil {
-		return newUser, err
-	}
-	newUser.Uid = id
-
-	id, err = strconv.Atoi(u.Gid)
-	if err != nil {
-		return newUser, err
-	}
-	newUser.Gid = id
-	return newUser, nil
-}
-
 type Group struct {
 	Name string
 	Pass string
@@ -58,23 +43,6 @@ type Group struct {
 	List []string
 }
 
-// groupFromOS converts an os/user.(*Group) to local Group
-//
-// (This does not include Pass or List)
-func groupFromOS(g *user.Group) (Group, error) {
-	newGroup := Group{
-		Name: g.Name,
-	}
-
-	id, err := strconv.Atoi(g.Gid)
-	if err != nil {
-		return newGroup, err
-	}
-	newGroup.Gid = id
-
-	return newGroup, nil
-}
-
 // SubID represents an entry in /etc/sub{u,g}id
 type SubID struct {
 	Name  string
@@ -466,7 +434,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
 		// we asked for a group but didn't find it. let's check to see
 		// if we wanted a numeric group
 		if !found {
-			gid, err := strconv.Atoi(ag)
+			gid, err := strconv.ParseInt(ag, 10, 64)
 			if err != nil {
 				return nil, fmt.Errorf("Unable to find group %s", ag)
 			}
@@ -474,7 +442,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
 			if gid < minId || gid > maxId {
 				return nil, ErrRange
 			}
-			gidMap[gid] = struct{}{}
+			gidMap[int(gid)] = struct{}{}
 		}
 	}
 	gids := []int{}
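
The Atoi→ParseInt change above means an out-of-range numeric group now reaches the explicit minId/maxId check (returning ErrRange) instead of failing inside Atoi on 32-bit platforms with the misleading "Unable to find group" error. A standalone illustration; the bounds here are an assumption meant to mirror the package's minId/maxId:

package main

import (
	"fmt"
	"strconv"
)

func main() {
	const minId, maxId = 0, 1<<31 - 1 // assumed package bounds
	ag := "4294967295"                // too large for a 32-bit int
	gid, err := strconv.ParseInt(ag, 10, 64)
	if err != nil {
		fmt.Println("not numeric:", err)
		return
	}
	if gid < minId || gid > maxId {
		fmt.Println("rejected by range check:", gid) // this branch fires
		return
	}
	fmt.Println("accepted gid:", int(gid))
}
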

+ 42 - 0
vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go

@@ -0,0 +1,42 @@
+// +build gofuzz
+
+package user
+
+import (
+	"io"
+	"strings"
+)
+
+func IsDivisbleBy(n int, divisibleby int) bool {
+	return (n % divisibleby) == 0
+}
+
+func FuzzUser(data []byte) int {
+	if len(data) == 0 {
+		return -1
+	}
+	if !IsDivisbleBy(len(data), 5) {
+		return -1
+	}
+
+	var divided [][]byte
+
+	chunkSize := len(data) / 5
+
+	for i := 0; i < len(data); i += chunkSize {
+		end := i + chunkSize
+
+		divided = append(divided, data[i:end])
+	}
+
+	_, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil)
+
+	var passwd, group io.Reader
+
+	group = strings.NewReader(string(divided[1]))
+	_, _ = GetAdditionalGroups([]string{string(divided[2])}, group)
+
+	passwd = strings.NewReader(string(divided[3]))
+	_, _ = GetExecUser(string(divided[4]), nil, passwd, group)
+	return 1
+}

+ 5 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go

@@ -0,0 +1,5 @@
+package userns
+
+// RunningInUserNS detects whether we are currently running in a user namespace.
+// Originally copied from github.com/lxc/lxd/shared/util.go
+var RunningInUserNS = runningInUserNS

+ 15 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go

@@ -0,0 +1,15 @@
+// +build gofuzz
+
+package userns
+
+import (
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/user"
+)
+
+func FuzzUIDMap(data []byte) int {
+	uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
+	_ = uidMapInUserNS(uidmap)
+	return 1
+}

+ 37 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go

@@ -0,0 +1,37 @@
+package userns
+
+import (
+	"sync"
+
+	"github.com/opencontainers/runc/libcontainer/user"
+)
+
+var (
+	inUserNS bool
+	nsOnce   sync.Once
+)
+
+// runningInUserNS detects whether we are currently running in a user namespace.
+// Originally copied from github.com/lxc/lxd/shared/util.go
+func runningInUserNS() bool {
+	nsOnce.Do(func() {
+		uidmap, err := user.CurrentProcessUIDMap()
+		if err != nil {
+			// This kernel-provided file only exists if user namespaces are supported
+			return
+		}
+		inUserNS = uidMapInUserNS(uidmap)
+	})
+	return inUserNS
+}
+
+func uidMapInUserNS(uidmap []user.IDMap) bool {
+	/*
+	 * We assume we are in the initial user namespace if we have a full
+	 * range - 4294967295 uids starting at uid 0.
+	 */
+	if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
+		return false
+	}
+	return true
+}
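
Assumed usage of the new userns package: callers check RunningInUserNS once (the result is cached via sync.Once, as shown above) before deciding how to configure namespaced resources.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/userns"
)

func main() {
	if userns.RunningInUserNS() {
		fmt.Println("inside a user namespace; ids are remapped")
	} else {
		fmt.Println("initial user namespace (full uid range mapped at 0)")
	}
}
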

+ 17 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go

@@ -0,0 +1,17 @@
+// +build !linux
+
+package userns
+
+import "github.com/opencontainers/runc/libcontainer/user"
+
+// runningInUserNS is a stub for non-Linux systems
+// Always returns false
+func runningInUserNS() bool {
+	return false
+}
+
+// uidMapInUserNS is a stub for non-Linux systems
+// Always returns false
+func uidMapInUserNS(uidmap []user.IDMap) bool {
+	return false
+}

+ 1 - 1
vendor/github.com/opencontainers/runtime-spec/README.md

@@ -135,7 +135,7 @@ Read more on [How to Write a Git Commit Message][how-to-git-commit] or the Discu
 8. When possible, one keyword to scope the change in the subject (i.e. "README: ...", "runtime: ...")
 
 
-[charter]: https://www.opencontainers.org/about/governance
+[charter]: https://github.com/opencontainers/tob/blob/master/CHARTER.md
 [code-of-conduct]: https://github.com/opencontainers/org/blob/master/CODE_OF_CONDUCT.md
 [dev-list]: https://groups.google.com/a/opencontainers.org/forum/#!forum/dev
 [how-to-git-commit]: http://chris.beams.io/posts/git-commit

+ 16 - 7
vendor/github.com/opencontainers/runtime-spec/specs-go/config.go

@@ -60,7 +60,7 @@ type Process struct {
 	SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
 }
 
-// LinuxCapabilities specifies the whitelist of capabilities that are kept for a process.
+// LinuxCapabilities specifies the list of allowed capabilities that are kept for a process.
 // http://man7.org/linux/man-pages/man7/capabilities.7.html
 type LinuxCapabilities struct {
 	// Bounding is the set of capabilities checked by the kernel.
@@ -354,7 +354,7 @@ type LinuxRdma struct {
 
 // LinuxResources has container runtime resource constraints
 type LinuxResources struct {
-	// Devices configures the device whitelist.
+	// Devices configures the device allowlist.
 	Devices []LinuxDeviceCgroup `json:"devices,omitempty"`
 	// Memory restriction configuration
 	Memory *LinuxMemory `json:"memory,omitempty"`
@@ -372,6 +372,8 @@ type LinuxResources struct {
 	// Limits are a set of key value pairs that define RDMA resource limits,
 	// where the key is device name and value is resource limits.
 	Rdma map[string]LinuxRdma `json:"rdma,omitempty"`
+	// Unified resources.
+	Unified map[string]string `json:"unified,omitempty"`
 }
 
 // LinuxDevice represents the mknod information for a Linux special device file
@@ -392,7 +394,8 @@ type LinuxDevice struct {
 	GID *uint32 `json:"gid,omitempty"`
 }
 
-// LinuxDeviceCgroup represents a device rule for the whitelist controller
+// LinuxDeviceCgroup represents a device rule for the devices specified to
+// the device controller
 type LinuxDeviceCgroup struct {
 	// Allow or deny
 	Allow bool `json:"allow"`
@@ -595,10 +598,13 @@ type VMImage struct {
 
 // LinuxSeccomp represents syscall restrictions
 type LinuxSeccomp struct {
-	DefaultAction LinuxSeccompAction `json:"defaultAction"`
-	Architectures []Arch             `json:"architectures,omitempty"`
-	Flags         []LinuxSeccompFlag `json:"flags,omitempty"`
-	Syscalls      []LinuxSyscall     `json:"syscalls,omitempty"`
+	DefaultAction    LinuxSeccompAction `json:"defaultAction"`
+	DefaultErrnoRet  *uint              `json:"defaultErrnoRet,omitempty"`
+	Architectures    []Arch             `json:"architectures,omitempty"`
+	Flags            []LinuxSeccompFlag `json:"flags,omitempty"`
+	ListenerPath     string             `json:"listenerPath,omitempty"`
+	ListenerMetadata string             `json:"listenerMetadata,omitempty"`
+	Syscalls         []LinuxSyscall     `json:"syscalls,omitempty"`
 }
 
 // Arch used for additional architectures
@@ -628,6 +634,7 @@ const (
 	ArchS390X       Arch = "SCMP_ARCH_S390X"
 	ArchPARISC      Arch = "SCMP_ARCH_PARISC"
 	ArchPARISC64    Arch = "SCMP_ARCH_PARISC64"
+	ArchRISCV64     Arch = "SCMP_ARCH_RISCV64"
 )
 
 // LinuxSeccompAction taken upon Seccomp rule match
@@ -637,11 +644,13 @@ type LinuxSeccompAction string
 const (
 	ActKill        LinuxSeccompAction = "SCMP_ACT_KILL"
 	ActKillProcess LinuxSeccompAction = "SCMP_ACT_KILL_PROCESS"
+	ActKillThread  LinuxSeccompAction = "SCMP_ACT_KILL_THREAD"
 	ActTrap        LinuxSeccompAction = "SCMP_ACT_TRAP"
 	ActErrno       LinuxSeccompAction = "SCMP_ACT_ERRNO"
 	ActTrace       LinuxSeccompAction = "SCMP_ACT_TRACE"
 	ActAllow       LinuxSeccompAction = "SCMP_ACT_ALLOW"
 	ActLog         LinuxSeccompAction = "SCMP_ACT_LOG"
+	ActNotify      LinuxSeccompAction = "SCMP_ACT_NOTIFY"
 )
 
 // LinuxSeccompOperator used to match syscall arguments in Seccomp
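
The new seccomp fields combine as follows; a hedged example (the socket path and errno value are made up for illustration) of a profile that returns EPERM by default and forwards mount(2) to a userspace agent via SCMP_ACT_NOTIFY:

package main

import (
	"encoding/json"
	"fmt"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

func exampleSeccomp() *specs.LinuxSeccomp {
	eperm := uint(1) // EPERM, used as the new DefaultErrnoRet
	return &specs.LinuxSeccomp{
		DefaultAction:   specs.ActErrno,
		DefaultErrnoRet: &eperm,
		ListenerPath:    "/run/seccomp-agent.sock", // hypothetical agent socket
		Syscalls: []specs.LinuxSyscall{{
			Names:  []string{"mount"},
			Action: specs.ActNotify, // delivered to the listener, not handled in-kernel
		}},
	}
}

func main() {
	b, _ := json.Marshal(exampleSeccomp())
	fmt.Println(string(b))
}
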

+ 25 - 4
vendor/github.com/opencontainers/runtime-spec/specs-go/state.go

@@ -5,17 +5,17 @@ type ContainerState string
 
 const (
 	// StateCreating indicates that the container is being created
-	StateCreating ContainerState  = "creating"
+	StateCreating ContainerState = "creating"
 
 	// StateCreated indicates that the runtime has finished the create operation
-	StateCreated ContainerState  = "created"
+	StateCreated ContainerState = "created"
 
 	// StateRunning indicates that the container process has executed the
 	// user-specified program but has not exited
-	StateRunning ContainerState  = "running"
+	StateRunning ContainerState = "running"
 
 	// StateStopped indicates that the container process has exited
-	StateStopped ContainerState  = "stopped"
+	StateStopped ContainerState = "stopped"
 )
 
 // State holds information about the runtime state of the container.
@@ -33,3 +33,24 @@ type State struct {
 	// Annotations are key values associated with the container.
 	Annotations map[string]string `json:"annotations,omitempty"`
 }
+
+const (
+	// SeccompFdName is the name of the seccomp notify file descriptor.
+	SeccompFdName string = "seccompFd"
+)
+
+// ContainerProcessState holds information about the state of a container process.
+type ContainerProcessState struct {
+	// Version is the version of the specification that is supported.
+	Version string `json:"ociVersion"`
+	// Fds is a string array containing the names of the file descriptors passed.
+	// The index of the name in this array corresponds to index of the file
+	// descriptor in the `SCM_RIGHTS` array.
+	Fds []string `json:"fds"`
+	// Pid is the process ID as seen by the runtime.
+	Pid int `json:"pid"`
+	// Opaque metadata.
+	Metadata string `json:"metadata,omitempty"`
+	// State of the container.
+	State State `json:"state"`
+}
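
A sketch of how the new ContainerProcessState and SeccompFdName are meant to be used (this is not code from the diff; it assumes a connected *net.UnixConn and a seccomp notify fd): the JSON payload travels alongside the descriptor in an SCM_RIGHTS message, and Fds[0] names the first descriptor.

package seccompfd

import (
	"encoding/json"
	"net"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
)

// sendSeccompFd forwards the seccomp notify fd plus the process state.
func sendSeccompFd(conn *net.UnixConn, seccompFd int, state specs.State) error {
	payload, err := json.Marshal(specs.ContainerProcessState{
		Version: specs.Version,
		Fds:     []string{specs.SeccompFdName}, // index 0 matches the fd below
		Pid:     state.Pid,
		State:   state,
	})
	if err != nil {
		return err
	}
	oob := unix.UnixRights(seccompFd) // encode the fd as ancillary data
	_, _, err = conn.WriteMsgUnix(payload, oob, nil)
	return err
}
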

+ 2 - 0
vendor/golang.org/x/net/README.md

@@ -1,5 +1,7 @@
 # Go Networking
 
+[![Go Reference](https://pkg.go.dev/badge/golang.org/x/net.svg)](https://pkg.go.dev/golang.org/x/net)
+
 This repository holds supplementary Go networking libraries.
 
 ## Download/Install

+ 3 - 3
vendor/golang.org/x/net/go.mod

@@ -3,7 +3,7 @@ module golang.org/x/net
 go 1.11
 
 require (
-	golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9
-	golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd
-	golang.org/x/text v0.3.0
+	golang.org/x/sys v0.0.0-20201119102817-f84b799fce68
+	golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1
+	golang.org/x/text v0.3.3
 )

+ 8 - 4
vendor/golang.org/x/net/http2/server.go

@@ -1694,6 +1694,7 @@ func (sc *serverConn) processData(f *DataFrame) error {
 		if len(data) > 0 {
 			wrote, err := st.body.Write(data)
 			if err != nil {
+				sc.sendWindowUpdate(nil, int(f.Length)-wrote)
 				return streamError(id, ErrCodeStreamClosed)
 			}
 			if wrote != len(data) {
@@ -2020,7 +2021,11 @@ func (sc *serverConn) newWriterAndRequest(st *stream, f *MetaHeadersFrame) (*res
 	}
 	if bodyOpen {
 		if vv, ok := rp.header["Content-Length"]; ok {
-			req.ContentLength, _ = strconv.ParseInt(vv[0], 10, 64)
+			if cl, err := strconv.ParseUint(vv[0], 10, 63); err == nil {
+				req.ContentLength = int64(cl)
+			} else {
+				req.ContentLength = 0
+			}
 		} else {
 			req.ContentLength = -1
 		}
@@ -2403,9 +2408,8 @@ func (rws *responseWriterState) writeChunk(p []byte) (n int, err error) {
 		var ctype, clen string
 		if clen = rws.snapHeader.Get("Content-Length"); clen != "" {
 			rws.snapHeader.Del("Content-Length")
-			clen64, err := strconv.ParseInt(clen, 10, 64)
-			if err == nil && clen64 >= 0 {
-				rws.sentContentLen = clen64
+			if cl, err := strconv.ParseUint(clen, 10, 63); err == nil {
+				rws.sentContentLen = int64(cl)
 			} else {
 				clen = ""
 			}
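
The ParseUint(v, 10, 63) pattern adopted in all three Content-Length sites above rejects a leading sign and anything that cannot fit in an int64, so a negative or overflowing header value can never turn into a bogus positive length. A quick demonstration:

package main

import (
	"fmt"
	"strconv"
)

func main() {
	for _, v := range []string{"42", "-1", "9223372036854775808"} {
		if cl, err := strconv.ParseUint(v, 10, 63); err == nil {
			fmt.Printf("%q -> ContentLength %d\n", v, int64(cl))
		} else {
			fmt.Printf("%q rejected: %v\n", v, err) // "-1" and 2^63 both land here
		}
	}
}
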

+ 34 - 6
vendor/golang.org/x/net/http2/transport.go

@@ -154,12 +154,21 @@ func (t *Transport) pingTimeout() time.Duration {
 
 // ConfigureTransport configures a net/http HTTP/1 Transport to use HTTP/2.
 // It returns an error if t1 has already been HTTP/2-enabled.
+//
+// Use ConfigureTransports instead to configure the HTTP/2 Transport.
 func ConfigureTransport(t1 *http.Transport) error {
-	_, err := configureTransport(t1)
+	_, err := ConfigureTransports(t1)
 	return err
 }
 
-func configureTransport(t1 *http.Transport) (*Transport, error) {
+// ConfigureTransports configures a net/http HTTP/1 Transport to use HTTP/2.
+// It returns a new HTTP/2 Transport for further configuration.
+// It returns an error if t1 has already been HTTP/2-enabled.
+func ConfigureTransports(t1 *http.Transport) (*Transport, error) {
+	return configureTransports(t1)
+}
+
+func configureTransports(t1 *http.Transport) (*Transport, error) {
 	connPool := new(clientConnPool)
 	t2 := &Transport{
 		ConnPool: noDialClientConnPool{connPool},
@@ -689,6 +698,7 @@ func (t *Transport) newClientConn(c net.Conn, singleUse bool) (*ClientConn, erro
 	cc.inflow.add(transportDefaultConnFlow + initialWindowSize)
 	cc.bw.Flush()
 	if cc.werr != nil {
+		cc.Close()
 		return nil, cc.werr
 	}
 
@@ -1080,6 +1090,15 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 	bodyWriter := cc.t.getBodyWriterState(cs, body)
 	cs.on100 = bodyWriter.on100
 
+	defer func() {
+		cc.wmu.Lock()
+		werr := cc.werr
+		cc.wmu.Unlock()
+		if werr != nil {
+			cc.Close()
+		}
+	}()
+
 	cc.wmu.Lock()
 	endStream := !hasBody && !hasTrailers
 	werr := cc.writeHeaders(cs.ID, endStream, int(cc.maxFrameSize), hdrs)
@@ -1129,6 +1148,9 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 			// we can keep it.
 			bodyWriter.cancel()
 			cs.abortRequestBodyWrite(errStopReqBodyWrite)
+			if hasBody && !bodyWritten {
+				<-bodyWriter.resc
+			}
 		}
 		if re.err != nil {
 			cc.forgetStreamID(cs.ID)
@@ -1149,6 +1171,7 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 			} else {
 				bodyWriter.cancel()
 				cs.abortRequestBodyWrite(errStopReqBodyWriteAndCancel)
+				<-bodyWriter.resc
 			}
 			cc.forgetStreamID(cs.ID)
 			return nil, cs.getStartedWrite(), errTimeout
@@ -1158,6 +1181,7 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 			} else {
 				bodyWriter.cancel()
 				cs.abortRequestBodyWrite(errStopReqBodyWriteAndCancel)
+				<-bodyWriter.resc
 			}
 			cc.forgetStreamID(cs.ID)
 			return nil, cs.getStartedWrite(), ctx.Err()
@@ -1167,6 +1191,7 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 			} else {
 				bodyWriter.cancel()
 				cs.abortRequestBodyWrite(errStopReqBodyWriteAndCancel)
+				<-bodyWriter.resc
 			}
 			cc.forgetStreamID(cs.ID)
 			return nil, cs.getStartedWrite(), errRequestCanceled
@@ -1176,6 +1201,7 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 			// forgetStreamID.
 			return nil, cs.getStartedWrite(), cs.resetErr
 		case err := <-bodyWriter.resc:
+			bodyWritten = true
 			// Prefer the read loop's response, if available. Issue 16102.
 			select {
 			case re := <-readLoopResCh:
@@ -1186,7 +1212,6 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 				cc.forgetStreamID(cs.ID)
 				return nil, cs.getStartedWrite(), err
 			}
-			bodyWritten = true
 			if d := cc.responseHeaderTimeout(); d != 0 {
 				timer := time.NewTimer(d)
 				defer timer.Stop()
@@ -2006,8 +2031,8 @@ func (rl *clientConnReadLoop) handleResponse(cs *clientStream, f *MetaHeadersFra
 	if !streamEnded || isHead {
 		res.ContentLength = -1
 		if clens := res.Header["Content-Length"]; len(clens) == 1 {
-			if clen64, err := strconv.ParseInt(clens[0], 10, 64); err == nil {
-				res.ContentLength = clen64
+			if cl, err := strconv.ParseUint(clens[0], 10, 63); err == nil {
+				res.ContentLength = int64(cl)
 			} else {
 				// TODO: care? unlike http/1, it won't mess up our framing, so it's
 				// more safe smuggling-wise to ignore.
@@ -2525,6 +2550,7 @@ func strSliceContains(ss []string, s string) bool {
 
 type erringRoundTripper struct{ err error }
 
+func (rt erringRoundTripper) RoundTripErr() error                             { return rt.err }
 func (rt erringRoundTripper) RoundTrip(*http.Request) (*http.Response, error) { return nil, rt.err }
 
 // gzipReader wraps a response body so it can lazily
@@ -2606,7 +2632,9 @@ func (t *Transport) getBodyWriterState(cs *clientStream, body io.Reader) (s body
 
 func (s bodyWriterState) cancel() {
 	if s.timer != nil {
-		s.timer.Stop()
+		if s.timer.Stop() {
+			s.resc <- nil
+		}
 	}
 }
 
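
Assumed usage of the newly exported ConfigureTransports: unlike ConfigureTransport, it hands back the *http2.Transport so HTTP/2-specific knobs can be set after the HTTP/1 transport is upgraded (ReadIdleTimeout is one such field in this version of the package).

package main

import (
	"net/http"
	"time"

	"golang.org/x/net/http2"
)

func newClient() (*http.Client, error) {
	t1 := &http.Transport{}
	t2, err := http2.ConfigureTransports(t1) // returns the HTTP/2 transport for tuning
	if err != nil {
		return nil, err
	}
	t2.ReadIdleTimeout = 30 * time.Second // health-check idle HTTP/2 connections
	return &http.Client{Transport: t1}, nil
}

func main() {
	if _, err := newClient(); err != nil {
		panic(err)
	}
}
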

+ 1 - 1
vendor/golang.org/x/net/idna/tables12.00.go → vendor/golang.org/x/net/idna/tables12.0.0.go

@@ -1,6 +1,6 @@
 // Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
 
-// +build go1.14
+// +build go1.14,!go1.16
 
 package idna
 

File diff suppressed because it is too large
+ 2394 - 0
vendor/golang.org/x/net/idna/tables13.0.0.go


+ 1 - 1
vendor/golang.org/x/net/internal/socket/cmsghdr.go

@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
 
 package socket
 

+ 13 - 3
vendor/golang.org/x/net/internal/socket/cmsghdr_stub.go

@@ -2,13 +2,23 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris
+// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!zos
 
 package socket
 
-type cmsghdr struct{}
+func controlHeaderLen() int {
+	return 0
+}
+
+func controlMessageLen(dataLen int) int {
+	return 0
+}
 
-const sizeofCmsghdr = 0
+func controlMessageSpace(dataLen int) int {
+	return 0
+}
+
+type cmsghdr struct{}
 
 func (h *cmsghdr) len() int { return 0 }
 func (h *cmsghdr) lvl() int { return 0 }

+ 21 - 0
vendor/golang.org/x/net/internal/socket/cmsghdr_unix.go

@@ -0,0 +1,21 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
+
+package socket
+
+import "golang.org/x/sys/unix"
+
+func controlHeaderLen() int {
+	return unix.CmsgLen(0)
+}
+
+func controlMessageLen(dataLen int) int {
+	return unix.CmsgLen(dataLen)
+}
+
+func controlMessageSpace(dataLen int) int {
+	return unix.CmsgSpace(dataLen)
+}
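
The new file above defers to the kernel's own cmsg layout via golang.org/x/sys/unix rather than the hand-rolled roundup arithmetic removed later in this diff; a quick check of what those helpers return for a 4-byte payload:

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Space for one 4-byte payload in a control message, including the
	// cmsghdr and the kernel's alignment padding.
	fmt.Println("CmsgLen(4):  ", unix.CmsgLen(4))
	fmt.Println("CmsgSpace(4):", unix.CmsgSpace(4))
}
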

+ 25 - 0
vendor/golang.org/x/net/internal/socket/cmsghdr_zos_s390x.go

@@ -0,0 +1,25 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package socket
+
+import "syscall"
+
+func (h *cmsghdr) set(l, lvl, typ int) {
+	h.Len = int32(l)
+	h.Level = int32(lvl)
+	h.Type = int32(typ)
+}
+
+func controlHeaderLen() int {
+	return syscall.CmsgLen(0)
+}
+
+func controlMessageLen(dataLen int) int {
+	return syscall.CmsgLen(dataLen)
+}
+
+func controlMessageSpace(dataLen int) int {
+	return syscall.CmsgSpace(dataLen)
+}

+ 1 - 1
vendor/golang.org/x/net/internal/socket/error_unix.go

@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
 
 package socket
 

+ 1 - 1
vendor/golang.org/x/net/internal/socket/iovec_64bit.go

@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build arm64 amd64 ppc64 ppc64le mips64 mips64le riscv64 s390x
-// +build aix darwin dragonfly freebsd linux netbsd openbsd
+// +build aix darwin dragonfly freebsd linux netbsd openbsd zos
 
 package socket
 

+ 1 - 1
vendor/golang.org/x/net/internal/socket/iovec_stub.go

@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris
+// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!zos
 
 package socket
 

+ 1 - 1
vendor/golang.org/x/net/internal/socket/msghdr_stub.go

@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris
+// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!zos
 
 package socket
 

+ 36 - 0
vendor/golang.org/x/net/internal/socket/msghdr_zos_s390x.go

@@ -0,0 +1,36 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build s390x
+// +build zos
+
+package socket
+
+import "unsafe"
+
+func (h *msghdr) pack(vs []iovec, bs [][]byte, oob []byte, sa []byte) {
+	for i := range vs {
+		vs[i].set(bs[i])
+	}
+	if len(vs) > 0 {
+		h.Iov = &vs[0]
+		h.Iovlen = int32(len(vs))
+	}
+	if len(oob) > 0 {
+		h.Control = (*byte)(unsafe.Pointer(&oob[0]))
+		h.Controllen = uint32(len(oob))
+	}
+	if sa != nil {
+		h.Name = (*byte)(unsafe.Pointer(&sa[0]))
+		h.Namelen = uint32(len(sa))
+	}
+}
+
+func (h *msghdr) controllen() int {
+	return int(h.Controllen)
+}
+
+func (h *msghdr) flags() int {
+	return int(h.Flags)
+}

+ 4 - 3
vendor/golang.org/x/net/internal/socket/rawconn_msg.go

@@ -2,12 +2,13 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris windows zos
 
 package socket
 
 import (
 	"os"
+	"runtime"
 	"syscall"
 )
 
@@ -24,7 +25,7 @@ func (c *Conn) recvMsg(m *Message, flags int) error {
 	var n int
 	fn := func(s uintptr) bool {
 		n, operr = recvmsg(s, &h, flags)
-		if operr == syscall.EAGAIN {
+		if operr == syscall.EAGAIN || (runtime.GOOS == "zos" && operr == syscall.EWOULDBLOCK) {
 			return false
 		}
 		return true
@@ -61,7 +62,7 @@ func (c *Conn) sendMsg(m *Message, flags int) error {
 	var n int
 	fn := func(s uintptr) bool {
 		n, operr = sendmsg(s, &h, flags)
-		if operr == syscall.EAGAIN {
+		if operr == syscall.EAGAIN || (runtime.GOOS == "zos" && operr == syscall.EWOULDBLOCK) {
 			return false
 		}
 		return true

+ 1 - 1
vendor/golang.org/x/net/internal/socket/rawconn_nomsg.go

@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!windows
+// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!windows,!zos
 
 package socket
 

+ 1 - 9
vendor/golang.org/x/net/internal/socket/socket.go

@@ -90,17 +90,9 @@ func (o *Option) SetInt(c *Conn, v int) error {
 	return o.set(c, b)
 }
 
-func controlHeaderLen() int {
-	return roundup(sizeofCmsghdr)
-}
-
-func controlMessageLen(dataLen int) int {
-	return roundup(sizeofCmsghdr) + dataLen
-}
-
 // ControlMessageSpace returns the whole length of control message.
 func ControlMessageSpace(dataLen int) int {
-	return roundup(sizeofCmsghdr) + roundup(dataLen)
+	return controlMessageSpace(dataLen)
 }
 
 // A ControlMessage represents the head message in a stream of control

+ 2 - 12
vendor/golang.org/x/net/internal/socket/sys.go

@@ -9,13 +9,8 @@ import (
 	"unsafe"
 	"unsafe"
 )
 )
 
 
-var (
-	// NativeEndian is the machine native endian implementation of
-	// ByteOrder.
-	NativeEndian binary.ByteOrder
-
-	kernelAlign int
-)
+// NativeEndian is the machine native endian implementation of ByteOrder.
+var NativeEndian binary.ByteOrder
 
 
 func init() {
 func init() {
 	i := uint32(1)
 	i := uint32(1)
@@ -25,9 +20,4 @@ func init() {
 	} else {
 		NativeEndian = binary.BigEndian
 	}
-	kernelAlign = probeProtocolStack()
-}
-
-func roundup(l int) int {
-	return (l + kernelAlign - 1) &^ (kernelAlign - 1)
 }
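
The init() retained above probes byte order exactly as before; shown standalone for clarity:

package main

import (
	"encoding/binary"
	"fmt"
	"unsafe"
)

func main() {
	i := uint32(1)
	b := (*[4]byte)(unsafe.Pointer(&i))
	var native binary.ByteOrder = binary.BigEndian
	if b[0] == 1 { // low byte first in memory => little-endian
		native = binary.LittleEndian
	}
	fmt.Println("native byte order:", native)
}
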

+ 0 - 23
vendor/golang.org/x/net/internal/socket/sys_bsdvar.go

@@ -1,23 +0,0 @@
-// Copyright 2017 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build aix freebsd netbsd openbsd
-
-package socket
-
-import (
-	"runtime"
-	"unsafe"
-)
-
-func probeProtocolStack() int {
-	if (runtime.GOOS == "netbsd" || runtime.GOOS == "openbsd") && runtime.GOARCH == "arm" {
-		return 8
-	}
-	if runtime.GOOS == "aix" {
-		return 1
-	}
-	var p uintptr
-	return int(unsafe.Sizeof(p))
-}

Some files were not shown because too many files changed in this diff