
Merge pull request #42143 from thaJeztah/check_libcontainer

vendor: github.com/opencontainers/runc v1.0.0-rc95
Akihiro Suda, 4 years ago
commit 33c332ad19
100 changed files with 11377 additions and 2052 deletions
  1. oci/devices_linux.go (+3 -4)
  2. oci/devices_linux_test.go (+2 -2)
  3. oci/devices_unsupported.go (+0 -20)
  4. vendor.conf (+5 -5)
  5. vendor/github.com/cilium/ebpf/README.md (+62 -0)
  6. vendor/github.com/cilium/ebpf/abi.go (+0 -206)
  7. vendor/github.com/cilium/ebpf/asm/func.go (+1 -1)
  8. vendor/github.com/cilium/ebpf/asm/instruction.go (+84 -64)
  9. vendor/github.com/cilium/ebpf/asm/opcode.go (+3 -3)
  10. vendor/github.com/cilium/ebpf/collection.go (+346 -51)
  11. vendor/github.com/cilium/ebpf/doc.go (+1 -2)
  12. vendor/github.com/cilium/ebpf/elf_reader.go (+456 -265)
  13. vendor/github.com/cilium/ebpf/elf_reader_fuzz.go (+21 -0)
  14. vendor/github.com/cilium/ebpf/examples/README.md (+6 -0)
  15. vendor/github.com/cilium/ebpf/examples/go.mod (+9 -0)
  16. vendor/github.com/cilium/ebpf/examples/headers/bpf_helper_defs.h (+3265 -0)
  17. vendor/github.com/cilium/ebpf/examples/headers/bpf_helpers.h (+80 -0)
  18. vendor/github.com/cilium/ebpf/examples/headers/common.h (+107 -0)
  19. vendor/github.com/cilium/ebpf/examples/kprobe/bpf/kprobe_example.c (+26 -0)
  20. vendor/github.com/cilium/ebpf/examples/uprobe/bpf/uprobe_example.c (+25 -0)
  21. vendor/github.com/cilium/ebpf/go.mod (+6 -2)
  22. vendor/github.com/cilium/ebpf/info.go (+239 -0)
  23. vendor/github.com/cilium/ebpf/internal/btf/btf.go (+175 -100)
  24. vendor/github.com/cilium/ebpf/internal/btf/btf_types.go (+17 -5)
  25. vendor/github.com/cilium/ebpf/internal/btf/core.go (+388 -0)
  26. vendor/github.com/cilium/ebpf/internal/btf/ext_info.go (+126 -27)
  27. vendor/github.com/cilium/ebpf/internal/btf/fuzz.go (+49 -0)
  28. vendor/github.com/cilium/ebpf/internal/btf/types.go (+360 -76)
  29. vendor/github.com/cilium/ebpf/internal/elf.go (+52 -0)
  30. vendor/github.com/cilium/ebpf/internal/feature.go (+30 -52)
  31. vendor/github.com/cilium/ebpf/internal/pinning.go (+44 -0)
  32. vendor/github.com/cilium/ebpf/internal/ptr.go (+10 -5)
  33. vendor/github.com/cilium/ebpf/internal/syscall.go (+43 -2)
  34. vendor/github.com/cilium/ebpf/internal/unix/types_linux.go (+61 -10)
  35. vendor/github.com/cilium/ebpf/internal/unix/types_other.go (+52 -9)
  36. vendor/github.com/cilium/ebpf/internal/version.go (+163 -0)
  37. vendor/github.com/cilium/ebpf/linker.go (+47 -0)
  38. vendor/github.com/cilium/ebpf/map.go (+558 -151)
  39. vendor/github.com/cilium/ebpf/marshalers.go (+23 -8)
  40. vendor/github.com/cilium/ebpf/prog.go (+240 -147)
  41. vendor/github.com/cilium/ebpf/readme.md (+0 -25)
  42. vendor/github.com/cilium/ebpf/syscalls.go (+142 -89)
  43. vendor/github.com/cilium/ebpf/types.go (+81 -33)
  44. vendor/github.com/cilium/ebpf/types_string.go (+36 -5)
  45. vendor/github.com/opencontainers/runc/README.md (+25 -13)
  46. vendor/github.com/opencontainers/runc/go.mod (+16 -14)
  47. vendor/github.com/opencontainers/runc/libcontainer/README.md (+87 -83)
  48. vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go (+23 -13)
  49. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go (+51 -0)
  50. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go (+120 -0)
  51. vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go (+122 -0)
  52. vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go (+28 -0)
  53. vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go (+115 -42)
  54. vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go (+41 -59)
  55. vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go (+5 -7)
  56. vendor/github.com/opencontainers/runc/libcontainer/configs/config.go (+15 -10)
  57. vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go (+9 -0)
  58. vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go (+0 -16)
  59. vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go (+0 -5)
  60. vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go (+17 -0)
  61. vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go (+1 -1)
  62. vendor/github.com/opencontainers/runc/libcontainer/devices/device.go (+33 -29)
  63. vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go (+22 -14)
  64. vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c (+51 -29)
  65. vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c (+142 -0)
  66. vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c (+222 -139)
  67. vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c (+1 -0)
  68. vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go (+53 -0)
  69. vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go (+0 -41)
  70. vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go (+16 -4)
  71. vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go (+0 -40)
  72. vendor/github.com/opencontainers/runc/libcontainer/user/user.go (+10 -42)
  73. vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go (+42 -0)
  74. vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go (+5 -0)
  75. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go (+15 -0)
  76. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go (+37 -0)
  77. vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go (+17 -0)
  78. vendor/github.com/opencontainers/runtime-spec/README.md (+1 -1)
  79. vendor/github.com/opencontainers/runtime-spec/specs-go/config.go (+16 -7)
  80. vendor/github.com/opencontainers/runtime-spec/specs-go/state.go (+25 -4)
  81. vendor/golang.org/x/net/README.md (+2 -0)
  82. vendor/golang.org/x/net/go.mod (+3 -3)
  83. vendor/golang.org/x/net/http2/server.go (+8 -4)
  84. vendor/golang.org/x/net/http2/transport.go (+34 -6)
  85. vendor/golang.org/x/net/idna/tables12.0.0.go (+1 -1)
  86. vendor/golang.org/x/net/idna/tables13.0.0.go (+2394 -0)
  87. vendor/golang.org/x/net/internal/socket/cmsghdr.go (+1 -1)
  88. vendor/golang.org/x/net/internal/socket/cmsghdr_stub.go (+13 -3)
  89. vendor/golang.org/x/net/internal/socket/cmsghdr_unix.go (+21 -0)
  90. vendor/golang.org/x/net/internal/socket/cmsghdr_zos_s390x.go (+25 -0)
  91. vendor/golang.org/x/net/internal/socket/error_unix.go (+1 -1)
  92. vendor/golang.org/x/net/internal/socket/iovec_64bit.go (+1 -1)
  93. vendor/golang.org/x/net/internal/socket/iovec_stub.go (+1 -1)
  94. vendor/golang.org/x/net/internal/socket/msghdr_stub.go (+1 -1)
  95. vendor/golang.org/x/net/internal/socket/msghdr_zos_s390x.go (+36 -0)
  96. vendor/golang.org/x/net/internal/socket/rawconn_msg.go (+4 -3)
  97. vendor/golang.org/x/net/internal/socket/rawconn_nomsg.go (+1 -1)
  98. vendor/golang.org/x/net/internal/socket/socket.go (+1 -9)
  99. vendor/golang.org/x/net/internal/socket/sys.go (+2 -12)
  100. vendor/golang.org/x/net/internal/socket/sys_bsdvar.go (+0 -23)

+ 3 - 4
oci/devices_linux.go

@@ -6,14 +6,13 @@ import (
 	"path/filepath"
 	"strings"
 
-	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/devices"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 )
 
-// Device transforms a libcontainer configs.Device to a specs.LinuxDevice object.
-func Device(d *configs.Device) specs.LinuxDevice {
+// Device transforms a libcontainer devices.Device to a specs.LinuxDevice object.
+func Device(d *devices.Device) specs.LinuxDevice {
 	return specs.LinuxDevice{
 		Type:     string(d.Type),
 		Path:     d.Path,
@@ -25,7 +24,7 @@ func Device(d *configs.Device) specs.LinuxDevice {
 	}
 }
 
-func deviceCgroup(d *configs.Device) specs.LinuxDeviceCgroup {
+func deviceCgroup(d *devices.Device) specs.LinuxDeviceCgroup {
 	return specs.LinuxDeviceCgroup{
 		Allow:  true,
 		Type:   string(d.Type),

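With this change, callers pass a `*devices.Device` from runc's `devices` package instead of the old `*configs.Device`. A minimal sketch of the updated call site, assuming the vendored packages above are importable; the path and mode are placeholder values:

```go
package main

import (
	"fmt"

	"github.com/docker/docker/oci"
	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	// Placeholder device; only Path and FileMode are set, mirroring
	// the minimal construction used in devices_linux_test.go below.
	d := &devices.Device{
		Path:     "/dev/null",
		FileMode: 0666,
	}

	// oci.Device now takes *devices.Device instead of *configs.Device.
	spec := oci.Device(d)
	fmt.Printf("%s: mode %v\n", spec.Path, *spec.FileMode)
}
```
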
+ 2 - 2
oci/devices_linux_test.go

@@ -4,7 +4,7 @@ import (
 	"os"
 	"testing"
 
-	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/devices"
 	"golang.org/x/sys/unix"
 	"gotest.tools/v3/assert"
 )
@@ -24,7 +24,7 @@ func TestDeviceMode(t *testing.T) {
 	for _, tc := range tests {
 		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
-			d := Device(&configs.Device{FileMode: tc.in})
+			d := Device(&devices.Device{FileMode: tc.in})
 			assert.Equal(t, *d.FileMode, tc.out)
 		})
 	}

+ 0 - 20
oci/devices_unsupported.go

@@ -1,20 +0,0 @@
-// +build !linux
-
-package oci // import "github.com/docker/docker/oci"
-
-import (
-	"errors"
-
-	"github.com/opencontainers/runc/libcontainer/configs"
-	specs "github.com/opencontainers/runtime-spec/specs-go"
-)
-
-// Device transforms a libcontainer configs.Device to a specs.Device object.
-// Not implemented
-func Device(d *configs.Device) specs.LinuxDevice { return specs.LinuxDevice{} }
-
-// DevicesFromPath computes a list of devices and device permissions from paths (pathOnHost and pathInContainer) and cgroup permissions.
-// Not implemented
-func DevicesFromPath(pathOnHost, pathInContainer, cgroupPermissions string) (devs []specs.LinuxDevice, devPermissions []specs.LinuxDeviceCgroup, err error) {
-	return nil, nil, errors.New("oci/devices: unsupported platform")
-}

+ 5 - 5
vendor.conf

@@ -19,8 +19,8 @@ github.com/moby/sys                                 b0f1fd7235275d01bd35cc4421e8
 github.com/creack/pty                               2a38352e8b4d7ab6c336eef107e42a55e72e7fbc # v1.1.11
 github.com/sirupsen/logrus                          6699a89a232f3db797f2e280639854bbc4b89725 # v1.7.0
 github.com/tchap/go-patricia                        a7f0089c6f496e8e70402f61733606daa326cac5 # v2.3.0
-golang.org/x/net                                    ab34263943818b32f575efc978a3d24e80b04bd7
-golang.org/x/sys                                    b64e53b001e413bd5067f36d4e439eded3827374
+golang.org/x/net                                    6772e930b67bb09bf22262c7378e7d2f67cf59d1
+golang.org/x/sys                                    d19ff857e887eacb631721f188c7d365c2331456
 github.com/docker/go-units                          519db1ee28dcc9fd2474ae59fca29a810482bfb1 # v0.4.0
 github.com/docker/go-connections                    7395e3f8aa162843a74ed6d48e79627d9792ac55 # v0.4.0
 golang.org/x/text                                   23ae387dee1f90d29a23c0e87ee0b46038fbed0e # v0.3.3
@@ -92,8 +92,8 @@ google.golang.org/grpc                              f495f5b15ae7ccda3b38c53a1bfc
 # the containerd project first, and update both after that is merged.
 # This commit does not need to match RUNC_COMMIT as it is used for helper
 # packages but should be newer or equal.
-github.com/opencontainers/runc                      ff819c7e9184c13b7c2607fe6c30ae19403a7aff # v1.0.0-rc92
-github.com/opencontainers/runtime-spec              4d89ac9fbff6c455f46a5bb59c6b1bb7184a5e43 # v1.0.3-0.20200728170252-4d89ac9fbff6
+github.com/opencontainers/runc                      b9ee9c6314599f1b4a7f497e1f1f856fe433d3b7 # v1.0.0-rc95
+github.com/opencontainers/runtime-spec              1c3f411f041711bbeecf35ff7e93461ea6789220 # v1.0.3-0.20210326190908-1c3f411f0417
 github.com/opencontainers/image-spec                d60099175f88c47cd379c4738d158884749ed235 # v1.0.1
 github.com/cyphar/filepath-securejoin               a261ee33d7a517f054effbf451841abaafe3e0fd # v0.2.2
 
@@ -141,7 +141,7 @@ github.com/containerd/go-runc                       16b287bc67d069a60fa48db15f33
 github.com/containerd/typeurl                       cd3ce7159eae562a4f60ceff37dada11a939d247 # v1.0.1
 github.com/containerd/ttrpc                         bfba540dc45464586c106b1f31c8547933c1eb41 # v1.0.2
 github.com/gogo/googleapis                          01e0f9cca9b92166042241267ee2a5cdf5cff46c # v1.3.2
-github.com/cilium/ebpf                              1c8d4c9ef7759622653a1d319284a44652333b28
+github.com/cilium/ebpf                              ef54c303d1fff1e80a9bf20f00a378fde5419d61 # v0.5.0
 github.com/klauspost/compress                       a3b7545c88eea469c2246bee0e6c130525d56190 # v1.11.13
 github.com/pelletier/go-toml                        65ca8064882c8c308e5c804c5d5443d409e0738c # v1.8.1
 

+ 62 - 0
vendor/github.com/cilium/ebpf/README.md

@@ -0,0 +1,62 @@
+# eBPF
+
+[![PkgGoDev](https://pkg.go.dev/badge/github.com/cilium/ebpf)](https://pkg.go.dev/github.com/cilium/ebpf)
+
+eBPF is a pure Go library that provides utilities for loading, compiling, and
+debugging eBPF programs. It has minimal external dependencies and is intended to
+be used in long running processes.
+
+* [asm](https://pkg.go.dev/github.com/cilium/ebpf/asm) contains a basic
+  assembler
+* [link](https://pkg.go.dev/github.com/cilium/ebpf/link) allows attaching eBPF
+  to various hooks
+* [perf](https://pkg.go.dev/github.com/cilium/ebpf/perf) allows reading from a
+  `PERF_EVENT_ARRAY`
+* [cmd/bpf2go](https://pkg.go.dev/github.com/cilium/ebpf/cmd/bpf2go) allows
+  compiling and embedding eBPF programs in Go code
+
+The library is maintained by [Cloudflare](https://www.cloudflare.com) and
+[Cilium](https://www.cilium.io). Feel free to
+[join](https://cilium.herokuapp.com/) the
+[#libbpf-go](https://cilium.slack.com/messages/libbpf-go) channel on Slack.
+
+## Current status
+
+The package is production ready, but **the API is explicitly unstable right
+now**. Expect to update your code if you want to follow along.
+
+## Getting Started
+
+A small collection of Go and eBPF programs that serve as examples for building
+your own tools can be found under [examples/](examples/).
+
+Contributions are highly encouraged, as they highlight certain use cases of
+eBPF and the library, and help shape the future of the project.
+
+## Requirements
+
+* A version of Go that is [supported by
+  upstream](https://golang.org/doc/devel/release.html#policy)
+* Linux 4.9, 4.19 or 5.4 (versions in-between should work, but are not tested)
+
+## Useful resources
+
+* [eBPF.io](https://ebpf.io) (recommended)
+* [Cilium eBPF documentation](https://docs.cilium.io/en/latest/bpf/#bpf-guide)
+  (recommended)
+* [Linux documentation on
+  BPF](https://www.kernel.org/doc/html/latest/networking/filter.html)
+* [eBPF features by Linux
+  version](https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md)
+
+## Regenerating Testdata
+
+Run `make` in the root of this repository to rebuild testdata in all
+subpackages. This requires Docker, as it relies on a standardized build
+environment to keep the build output stable.
+
+The toolchain image build files are kept in [testdata/docker/](testdata/docker/).
+
+## License
+
+MIT

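For orientation, the smallest use of the library that this README implies looks roughly like the sketch below, assuming an eBPF object compiled ahead of time with clang; `prog.o` is a placeholder path:

```go
package main

import (
	"log"

	"github.com/cilium/ebpf"
)

func main() {
	// Parse the ELF object into a CollectionSpec...
	spec, err := ebpf.LoadCollectionSpec("prog.o")
	if err != nil {
		log.Fatal(err)
	}

	// ...then load the maps and programs it describes into the kernel.
	coll, err := ebpf.NewCollection(spec)
	if err != nil {
		log.Fatal(err)
	}
	defer coll.Close()
}
```
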
+ 0 - 206
vendor/github.com/cilium/ebpf/abi.go

@@ -1,206 +0,0 @@
-package ebpf
-
-import (
-	"bufio"
-	"bytes"
-	"errors"
-	"fmt"
-	"io"
-	"os"
-	"syscall"
-
-	"github.com/cilium/ebpf/internal"
-)
-
-// MapABI are the attributes of a Map which are available across all supported kernels.
-type MapABI struct {
-	Type       MapType
-	KeySize    uint32
-	ValueSize  uint32
-	MaxEntries uint32
-	Flags      uint32
-}
-
-func newMapABIFromSpec(spec *MapSpec) *MapABI {
-	return &MapABI{
-		spec.Type,
-		spec.KeySize,
-		spec.ValueSize,
-		spec.MaxEntries,
-		spec.Flags,
-	}
-}
-
-func newMapABIFromFd(fd *internal.FD) (string, *MapABI, error) {
-	info, err := bpfGetMapInfoByFD(fd)
-	if err != nil {
-		if errors.Is(err, syscall.EINVAL) {
-			abi, err := newMapABIFromProc(fd)
-			return "", abi, err
-		}
-		return "", nil, err
-	}
-
-	return "", &MapABI{
-		MapType(info.mapType),
-		info.keySize,
-		info.valueSize,
-		info.maxEntries,
-		info.flags,
-	}, nil
-}
-
-func newMapABIFromProc(fd *internal.FD) (*MapABI, error) {
-	var abi MapABI
-	err := scanFdInfo(fd, map[string]interface{}{
-		"map_type":    &abi.Type,
-		"key_size":    &abi.KeySize,
-		"value_size":  &abi.ValueSize,
-		"max_entries": &abi.MaxEntries,
-		"map_flags":   &abi.Flags,
-	})
-	if err != nil {
-		return nil, err
-	}
-	return &abi, nil
-}
-
-// Equal returns true if two ABIs have the same values.
-func (abi *MapABI) Equal(other *MapABI) bool {
-	switch {
-	case abi.Type != other.Type:
-		return false
-	case abi.KeySize != other.KeySize:
-		return false
-	case abi.ValueSize != other.ValueSize:
-		return false
-	case abi.MaxEntries != other.MaxEntries:
-		return false
-	case abi.Flags != other.Flags:
-		return false
-	default:
-		return true
-	}
-}
-
-// ProgramABI are the attributes of a Program which are available across all supported kernels.
-type ProgramABI struct {
-	Type ProgramType
-}
-
-func newProgramABIFromSpec(spec *ProgramSpec) *ProgramABI {
-	return &ProgramABI{
-		spec.Type,
-	}
-}
-
-func newProgramABIFromFd(fd *internal.FD) (string, *ProgramABI, error) {
-	info, err := bpfGetProgInfoByFD(fd)
-	if err != nil {
-		if errors.Is(err, syscall.EINVAL) {
-			return newProgramABIFromProc(fd)
-		}
-
-		return "", nil, err
-	}
-
-	var name string
-	if bpfName := internal.CString(info.name[:]); bpfName != "" {
-		name = bpfName
-	} else {
-		name = internal.CString(info.tag[:])
-	}
-
-	return name, &ProgramABI{
-		Type: ProgramType(info.progType),
-	}, nil
-}
-
-func newProgramABIFromProc(fd *internal.FD) (string, *ProgramABI, error) {
-	var (
-		abi  ProgramABI
-		name string
-	)
-
-	err := scanFdInfo(fd, map[string]interface{}{
-		"prog_type": &abi.Type,
-		"prog_tag":  &name,
-	})
-	if errors.Is(err, errMissingFields) {
-		return "", nil, &internal.UnsupportedFeatureError{
-			Name:           "reading ABI from /proc/self/fdinfo",
-			MinimumVersion: internal.Version{4, 11, 0},
-		}
-	}
-	if err != nil {
-		return "", nil, err
-	}
-
-	return name, &abi, nil
-}
-
-func scanFdInfo(fd *internal.FD, fields map[string]interface{}) error {
-	raw, err := fd.Value()
-	if err != nil {
-		return err
-	}
-
-	fh, err := os.Open(fmt.Sprintf("/proc/self/fdinfo/%d", raw))
-	if err != nil {
-		return err
-	}
-	defer fh.Close()
-
-	if err := scanFdInfoReader(fh, fields); err != nil {
-		return fmt.Errorf("%s: %w", fh.Name(), err)
-	}
-	return nil
-}
-
-var errMissingFields = errors.New("missing fields")
-
-func scanFdInfoReader(r io.Reader, fields map[string]interface{}) error {
-	var (
-		scanner = bufio.NewScanner(r)
-		scanned int
-	)
-
-	for scanner.Scan() {
-		parts := bytes.SplitN(scanner.Bytes(), []byte("\t"), 2)
-		if len(parts) != 2 {
-			continue
-		}
-
-		name := bytes.TrimSuffix(parts[0], []byte(":"))
-		field, ok := fields[string(name)]
-		if !ok {
-			continue
-		}
-
-		if n, err := fmt.Fscanln(bytes.NewReader(parts[1]), field); err != nil || n != 1 {
-			return fmt.Errorf("can't parse field %s: %v", name, err)
-		}
-
-		scanned++
-	}
-
-	if err := scanner.Err(); err != nil {
-		return err
-	}
-
-	if scanned != len(fields) {
-		return errMissingFields
-	}
-
-	return nil
-}
-
-// Equal returns true if two ABIs have the same values.
-func (abi *ProgramABI) Equal(other *ProgramABI) bool {
-	switch {
-	case abi.Type != other.Type:
-		return false
-	default:
-		return true
-	}
-}

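The removed `MapABI`/`ProgramABI` types were superseded upstream by the `Info` API in the new info.go vendored in this same bump. A hedged sketch of the replacement, assuming v0.5.0's `Map.Info`:

```go
package main

import (
	"fmt"
	"log"

	"github.com/cilium/ebpf"
)

// describe prints the attributes MapABI used to expose,
// now read via Map.Info from info.go.
func describe(m *ebpf.Map) {
	info, err := m.Info()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("type=%s key=%d value=%d max=%d\n",
		info.Type, info.KeySize, info.ValueSize, info.MaxEntries)
}
```
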
+ 1 - 1
vendor/github.com/cilium/ebpf/asm/func.go

@@ -7,7 +7,7 @@ type BuiltinFunc int32
 
 // eBPF built-in functions
 //
-// You can renegerate this list using the following gawk script:
+// You can regenerate this list using the following gawk script:
 //
 //    /FN\(.+\),/ {
 //      match($1, /\((.+)\)/, r)

+ 84 - 64
vendor/github.com/cilium/ebpf/asm/instruction.go

@@ -1,17 +1,29 @@
 package asm
 
 import (
+	"crypto/sha1"
 	"encoding/binary"
+	"encoding/hex"
 	"errors"
 	"fmt"
 	"io"
 	"math"
 	"strings"
+
+	"github.com/cilium/ebpf/internal/unix"
 )
 
 // InstructionSize is the size of a BPF instruction in bytes
 const InstructionSize = 8
 
+// RawInstructionOffset is an offset in units of raw BPF instructions.
+type RawInstructionOffset uint64
+
+// Bytes returns the offset of an instruction in bytes.
+func (rio RawInstructionOffset) Bytes() uint64 {
+	return uint64(rio) * InstructionSize
+}
+
 // Instruction is a single eBPF instruction.
 type Instruction struct {
 	OpCode    OpCode
@@ -151,10 +163,20 @@ func (ins *Instruction) mapOffset() uint32 {
 	return uint32(uint64(ins.Constant) >> 32)
 }
 
+// isLoadFromMap returns true if the instruction loads from a map.
+//
+// This covers both loading the map pointer and direct map value loads.
 func (ins *Instruction) isLoadFromMap() bool {
 	return ins.OpCode == LoadImmOp(DWord) && (ins.Src == PseudoMapFD || ins.Src == PseudoMapValue)
 }
 
+// IsFunctionCall returns true if the instruction calls another BPF function.
+//
+// This is not the same thing as a BPF helper call.
+func (ins *Instruction) IsFunctionCall() bool {
+	return ins.OpCode.JumpOp() == Call && ins.Src == PseudoCall
+}
+
 // Format implements fmt.Formatter.
 func (ins Instruction) Format(f fmt.State, c rune) {
 	if c != 'v' {
@@ -310,34 +332,12 @@ func (insns Instructions) ReferenceOffsets() map[string][]int {
 	return offsets
 }
 
-func (insns Instructions) marshalledOffsets() (map[string]int, error) {
-	symbols := make(map[string]int)
-
-	marshalledPos := 0
-	for _, ins := range insns {
-		currentPos := marshalledPos
-		marshalledPos += ins.OpCode.marshalledInstructions()
-
-		if ins.Symbol == "" {
-			continue
-		}
-
-		if _, ok := symbols[ins.Symbol]; ok {
-			return nil, fmt.Errorf("duplicate symbol %s", ins.Symbol)
-		}
-
-		symbols[ins.Symbol] = currentPos
-	}
-
-	return symbols, nil
-}
-
 // Format implements fmt.Formatter.
 //
 // You can control indentation of symbols by
 // specifying a width. Setting a precision controls the indentation of
 // instructions.
-// The default character is a tab, which can be overriden by specifying
+// The default character is a tab, which can be overridden by specifying
 // the ' ' space flag.
 func (insns Instructions) Format(f fmt.State, c rune) {
 	if c != 's' && c != 'v' {
@@ -370,63 +370,83 @@ func (insns Instructions) Format(f fmt.State, c rune) {
 		symIndent = strings.Repeat(" ", symPadding)
 	}
 
-	// Figure out how many digits we need to represent the highest
-	// offset.
-	highestOffset := 0
-	for _, ins := range insns {
-		highestOffset += ins.OpCode.marshalledInstructions()
-	}
+	// Guess how many digits we need at most, by assuming that all instructions
+	// are double wide.
+	highestOffset := len(insns) * 2
 	offsetWidth := int(math.Ceil(math.Log10(float64(highestOffset))))
 
-	offset := 0
-	for _, ins := range insns {
-		if ins.Symbol != "" {
-			fmt.Fprintf(f, "%s%s:\n", symIndent, ins.Symbol)
+	iter := insns.Iterate()
+	for iter.Next() {
+		if iter.Ins.Symbol != "" {
+			fmt.Fprintf(f, "%s%s:\n", symIndent, iter.Ins.Symbol)
 		}
-		fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, offset, ins)
-		offset += ins.OpCode.marshalledInstructions()
+		fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, iter.Offset, iter.Ins)
 	}
-
-	return
 }
 
 // Marshal encodes a BPF program into the kernel format.
 func (insns Instructions) Marshal(w io.Writer, bo binary.ByteOrder) error {
-	absoluteOffsets, err := insns.marshalledOffsets()
-	if err != nil {
-		return err
+	for i, ins := range insns {
+		_, err := ins.Marshal(w, bo)
+		if err != nil {
+			return fmt.Errorf("instruction %d: %w", i, err)
+		}
 	}
+	return nil
+}
 
-	num := 0
+// Tag calculates the kernel tag for a series of instructions.
+//
+// It mirrors bpf_prog_calc_tag in the kernel and so can be compared
+// to ProgramInfo.Tag to figure out whether a loaded program matches
+// certain instructions.
+func (insns Instructions) Tag(bo binary.ByteOrder) (string, error) {
+	h := sha1.New()
 	for i, ins := range insns {
-		switch {
-		case ins.OpCode.JumpOp() == Call && ins.Src == PseudoCall && ins.Constant == -1:
-			// Rewrite bpf to bpf call
-			offset, ok := absoluteOffsets[ins.Reference]
-			if !ok {
-				return fmt.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference)
-			}
-
-			ins.Constant = int64(offset - num - 1)
-
-		case ins.OpCode.Class() == JumpClass && ins.Offset == -1:
-			// Rewrite jump to label
-			offset, ok := absoluteOffsets[ins.Reference]
-			if !ok {
-				return fmt.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference)
-			}
-
-			ins.Offset = int16(offset - num - 1)
+		if ins.isLoadFromMap() {
+			ins.Constant = 0
 		}
-
-		n, err := ins.Marshal(w, bo)
+		_, err := ins.Marshal(h, bo)
 		if err != nil {
-			return fmt.Errorf("instruction %d: %w", i, err)
+			return "", fmt.Errorf("instruction %d: %w", i, err)
 		}
+	}
+	return hex.EncodeToString(h.Sum(nil)[:unix.BPF_TAG_SIZE]), nil
+}
+
+// Iterate allows iterating a BPF program while keeping track of
+// various offsets.
+//
+// Modifying the instruction slice will lead to undefined behaviour.
+func (insns Instructions) Iterate() *InstructionIterator {
+	return &InstructionIterator{insns: insns}
+}
+
+// InstructionIterator iterates over a BPF program.
+type InstructionIterator struct {
+	insns Instructions
+	// The instruction in question.
+	Ins *Instruction
+	// The index of the instruction in the original instruction slice.
+	Index int
+	// The offset of the instruction in raw BPF instructions. This accounts
+	// for double-wide instructions.
+	Offset RawInstructionOffset
+}
 
-		num += int(n / InstructionSize)
+// Next returns true as long as there are any instructions remaining.
+func (iter *InstructionIterator) Next() bool {
+	if len(iter.insns) == 0 {
+		return false
 	}
-	return nil
+
+	if iter.Ins != nil {
+		iter.Index++
+		iter.Offset += RawInstructionOffset(iter.Ins.OpCode.rawInstructions())
+	}
+	iter.Ins = &iter.insns[0]
+	iter.insns = iter.insns[1:]
+	return true
 }
 
 type bpfInstruction struct {

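The new iterator replaces the manual offset bookkeeping that `marshalledOffsets` did, and `Tag` makes the kernel's program tag computable before load. A sketch using both on a hand-assembled, purely illustrative two-instruction program:

```go
package main

import (
	"encoding/binary"
	"fmt"
	"log"

	"github.com/cilium/ebpf/asm"
)

func main() {
	// r0 = 0; exit. A trivial, illustrative program.
	insns := asm.Instructions{
		asm.Mov.Imm(asm.R0, 0),
		asm.Return(),
	}

	// Iterate tracks raw-instruction offsets, accounting for
	// double-wide (DWord load) instructions automatically.
	iter := insns.Iterate()
	for iter.Next() {
		fmt.Printf("%2d: %v\n", iter.Offset, iter.Ins)
	}

	// Tag mirrors the kernel's bpf_prog_calc_tag and can be compared
	// against ProgramInfo.Tag for a loaded program.
	tag, err := insns.Tag(binary.LittleEndian)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("tag:", tag)
}
```
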
+ 3 - 3
vendor/github.com/cilium/ebpf/asm/opcode.go

@@ -66,10 +66,10 @@ type OpCode uint8
 // InvalidOpCode is returned by setters on OpCode
 const InvalidOpCode OpCode = 0xff
 
-// marshalledInstructions returns the number of BPF instructions required
+// rawInstructions returns the number of BPF instructions required
 // to encode this opcode.
-func (op OpCode) marshalledInstructions() int {
-	if op == LoadImmOp(DWord) {
+func (op OpCode) rawInstructions() int {
+	if op.isDWordLoad() {
 		return 2
 	}
 	return 1

+ 346 - 51
vendor/github.com/cilium/ebpf/collection.go

@@ -4,6 +4,8 @@ import (
 	"errors"
 	"fmt"
 	"math"
+	"reflect"
+	"strings"
 
 	"github.com/cilium/ebpf/asm"
 	"github.com/cilium/ebpf/internal"
@@ -11,7 +13,10 @@ import (
 )
 
 // CollectionOptions control loading a collection into the kernel.
+//
+// Maps and Programs are passed to NewMapWithOptions and NewProgramsWithOptions.
 type CollectionOptions struct {
+	Maps     MapOptions
 	Programs ProgramOptions
 }
 
@@ -126,6 +131,106 @@ func (cs *CollectionSpec) RewriteConstants(consts map[string]interface{}) error
 	return nil
 }
 
+// Assign the contents of a CollectionSpec to a struct.
+//
+// This function is a short-cut to manually checking the presence
+// of maps and programs in a collection spec. Consider using bpf2go if this
+// sounds useful.
+//
+// The argument to must be a pointer to a struct. A field of the
+// struct is updated with values from Programs or Maps if it
+// has an `ebpf` tag and its type is *ProgramSpec or *MapSpec.
+// The tag gives the name of the program or map as found in
+// the CollectionSpec.
+//
+//    struct {
+//        Foo     *ebpf.ProgramSpec `ebpf:"xdp_foo"`
+//        Bar     *ebpf.MapSpec     `ebpf:"bar_map"`
+//        Ignored int
+//    }
+//
+// Returns an error if any of the fields can't be found, or
+// if the same map or program is assigned multiple times.
+func (cs *CollectionSpec) Assign(to interface{}) error {
+	valueOf := func(typ reflect.Type, name string) (reflect.Value, error) {
+		switch typ {
+		case reflect.TypeOf((*ProgramSpec)(nil)):
+			p := cs.Programs[name]
+			if p == nil {
+				return reflect.Value{}, fmt.Errorf("missing program %q", name)
+			}
+			return reflect.ValueOf(p), nil
+		case reflect.TypeOf((*MapSpec)(nil)):
+			m := cs.Maps[name]
+			if m == nil {
+				return reflect.Value{}, fmt.Errorf("missing map %q", name)
+			}
+			return reflect.ValueOf(m), nil
+		default:
+			return reflect.Value{}, fmt.Errorf("unsupported type %s", typ)
+		}
+	}
+
+	return assignValues(to, valueOf)
+}
+
+// LoadAndAssign maps and programs into the kernel and assign them to a struct.
+//
+// This function is a short-cut to manually checking the presence
+// of maps and programs in a collection spec. Consider using bpf2go if this
+// sounds useful.
+//
+// The argument to must be a pointer to a struct. A field of the
+// struct is updated with values from Programs or Maps if it
+// has an `ebpf` tag and its type is *Program or *Map.
+// The tag gives the name of the program or map as found in
+// the CollectionSpec.
+//
+//    struct {
+//        Foo     *ebpf.Program `ebpf:"xdp_foo"`
+//        Bar     *ebpf.Map     `ebpf:"bar_map"`
+//        Ignored int
+//    }
+//
+// opts may be nil.
+//
+// Returns an error if any of the fields can't be found, or
+// if the same map or program is assigned multiple times.
+func (cs *CollectionSpec) LoadAndAssign(to interface{}, opts *CollectionOptions) error {
+	if opts == nil {
+		opts = &CollectionOptions{}
+	}
+
+	loadMap, loadProgram, done, cleanup := lazyLoadCollection(cs, opts)
+	defer cleanup()
+
+	valueOf := func(typ reflect.Type, name string) (reflect.Value, error) {
+		switch typ {
+		case reflect.TypeOf((*Program)(nil)):
+			p, err := loadProgram(name)
+			if err != nil {
+				return reflect.Value{}, err
+			}
+			return reflect.ValueOf(p), nil
+		case reflect.TypeOf((*Map)(nil)):
+			m, err := loadMap(name)
+			if err != nil {
+				return reflect.Value{}, err
+			}
+			return reflect.ValueOf(m), nil
+		default:
+			return reflect.Value{}, fmt.Errorf("unsupported type %s", typ)
+		}
+	}
+
+	if err := assignValues(to, valueOf); err != nil {
+		return err
+	}
+
+	done()
+	return nil
+}
+
 // Collection is a collection of Programs and Maps associated
 // with their symbols
 type Collection struct {
@@ -134,28 +239,75 @@ type Collection struct {
 }
 
 // NewCollection creates a Collection from a specification.
-//
-// Only maps referenced by at least one of the programs are initialized.
 func NewCollection(spec *CollectionSpec) (*Collection, error) {
 	return NewCollectionWithOptions(spec, CollectionOptions{})
 }
 
 // NewCollectionWithOptions creates a Collection from a specification.
-//
-// Only maps referenced by at least one of the programs are initialized.
-func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (coll *Collection, err error) {
+func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (*Collection, error) {
+	loadMap, loadProgram, done, cleanup := lazyLoadCollection(spec, &opts)
+	defer cleanup()
+
+	for mapName := range spec.Maps {
+		_, err := loadMap(mapName)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	for progName := range spec.Programs {
+		_, err := loadProgram(progName)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	maps, progs := done()
+	return &Collection{
+		progs,
+		maps,
+	}, nil
+}
+
+type btfHandleCache map[*btf.Spec]*btf.Handle
+
+func (btfs btfHandleCache) load(spec *btf.Spec) (*btf.Handle, error) {
+	if btfs[spec] != nil {
+		return btfs[spec], nil
+	}
+
+	handle, err := btf.NewHandle(spec)
+	if err != nil {
+		return nil, err
+	}
+
+	btfs[spec] = handle
+	return handle, nil
+}
+
+func (btfs btfHandleCache) close() {
+	for _, handle := range btfs {
+		handle.Close()
+	}
+}
+
+func lazyLoadCollection(coll *CollectionSpec, opts *CollectionOptions) (
+	loadMap func(string) (*Map, error),
+	loadProgram func(string) (*Program, error),
+	done func() (map[string]*Map, map[string]*Program),
+	cleanup func(),
+) {
 	var (
-		maps  = make(map[string]*Map)
-		progs = make(map[string]*Program)
-		btfs  = make(map[*btf.Spec]*btf.Handle)
+		maps             = make(map[string]*Map)
+		progs            = make(map[string]*Program)
+		btfs             = make(btfHandleCache)
+		skipMapsAndProgs = false
 	)
 
-	defer func() {
-		for _, btf := range btfs {
-			btf.Close()
-		}
+	cleanup = func() {
+		btfs.close()
 
-		if err == nil {
+		if skipMapsAndProgs {
 			return
 		}
 
@@ -166,40 +318,43 @@ func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (col
 		for _, p := range progs {
 			p.Close()
 		}
-	}()
+	}
 
-	loadBTF := func(spec *btf.Spec) (*btf.Handle, error) {
-		if btfs[spec] != nil {
-			return btfs[spec], nil
-		}
+	done = func() (map[string]*Map, map[string]*Program) {
+		skipMapsAndProgs = true
+		return maps, progs
+	}
 
-		handle, err := btf.NewHandle(spec)
-		if err != nil {
-			return nil, err
+	loadMap = func(mapName string) (*Map, error) {
+		if m := maps[mapName]; m != nil {
+			return m, nil
 		}
 
-		btfs[spec] = handle
-		return handle, nil
-	}
-
-	for mapName, mapSpec := range spec.Maps {
-		var handle *btf.Handle
-		if mapSpec.BTF != nil {
-			handle, err = loadBTF(btf.MapSpec(mapSpec.BTF))
-			if err != nil && !errors.Is(err, btf.ErrNotSupported) {
-				return nil, err
-			}
+		mapSpec := coll.Maps[mapName]
+		if mapSpec == nil {
+			return nil, fmt.Errorf("missing map %s", mapName)
 		}
 
-		m, err := newMapWithBTF(mapSpec, handle)
+		m, err := newMapWithOptions(mapSpec, opts.Maps, btfs)
 		if err != nil {
 			return nil, fmt.Errorf("map %s: %w", mapName, err)
 		}
+
 		maps[mapName] = m
+		return m, nil
 	}
 
-	for progName, origProgSpec := range spec.Programs {
-		progSpec := origProgSpec.Copy()
+	loadProgram = func(progName string) (*Program, error) {
+		if prog := progs[progName]; prog != nil {
+			return prog, nil
+		}
+
+		progSpec := coll.Programs[progName]
+		if progSpec == nil {
+			return nil, fmt.Errorf("unknown program %s", progName)
+		}
+
+		progSpec = progSpec.Copy()
 
 		// Rewrite any reference to a valid map.
 		for i := range progSpec.Instructions {
@@ -215,9 +370,9 @@ func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (col
 				continue
 			}
 
-			m := maps[ins.Reference]
-			if m == nil {
-				return nil, fmt.Errorf("program %s: missing map %s", progName, ins.Reference)
+			m, err := loadMap(ins.Reference)
+			if err != nil {
+				return nil, fmt.Errorf("program %s: %s", progName, err)
 			}
 
 			fd := m.FD()
@@ -229,25 +384,16 @@ func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (col
 			}
 		}
 
-		var handle *btf.Handle
-		if progSpec.BTF != nil {
-			handle, err = loadBTF(btf.ProgramSpec(progSpec.BTF))
-			if err != nil && !errors.Is(err, btf.ErrNotSupported) {
-				return nil, err
-			}
-		}
-
-		prog, err := newProgramWithBTF(progSpec, handle, opts.Programs)
+		prog, err := newProgramWithOptions(progSpec, opts.Programs, btfs)
 		if err != nil {
 			return nil, fmt.Errorf("program %s: %w", progName, err)
 		}
+
 		progs[progName] = prog
+		return prog, nil
 	}
 
-	return &Collection{
-		progs,
-		maps,
-	}, nil
+	return
 }
 
 // LoadCollection parses an object file and converts it to a collection.
@@ -292,3 +438,152 @@ func (coll *Collection) DetachProgram(name string) *Program {
 	delete(coll.Programs, name)
 	return p
 }
+
+// Assign the contents of a collection to a struct.
+//
+// Deprecated: use CollectionSpec.Assign instead. It provides the same
+// functionality but creates only the maps and programs requested.
+func (coll *Collection) Assign(to interface{}) error {
+	assignedMaps := make(map[string]struct{})
+	assignedPrograms := make(map[string]struct{})
+	valueOf := func(typ reflect.Type, name string) (reflect.Value, error) {
+		switch typ {
+		case reflect.TypeOf((*Program)(nil)):
+			p := coll.Programs[name]
+			if p == nil {
+				return reflect.Value{}, fmt.Errorf("missing program %q", name)
+			}
+			assignedPrograms[name] = struct{}{}
+			return reflect.ValueOf(p), nil
+		case reflect.TypeOf((*Map)(nil)):
+			m := coll.Maps[name]
+			if m == nil {
+				return reflect.Value{}, fmt.Errorf("missing map %q", name)
+			}
+			assignedMaps[name] = struct{}{}
+			return reflect.ValueOf(m), nil
+		default:
+			return reflect.Value{}, fmt.Errorf("unsupported type %s", typ)
+		}
+	}
+
+	if err := assignValues(to, valueOf); err != nil {
+		return err
+	}
+
+	for name := range assignedPrograms {
+		coll.DetachProgram(name)
+	}
+
+	for name := range assignedMaps {
+		coll.DetachMap(name)
+	}
+
+	return nil
+}
+
+func assignValues(to interface{}, valueOf func(reflect.Type, string) (reflect.Value, error)) error {
+	type structField struct {
+		reflect.StructField
+		value reflect.Value
+	}
+
+	var (
+		fields        []structField
+		visitedTypes  = make(map[reflect.Type]bool)
+		flattenStruct func(reflect.Value) error
+	)
+
+	flattenStruct = func(structVal reflect.Value) error {
+		structType := structVal.Type()
+		if structType.Kind() != reflect.Struct {
+			return fmt.Errorf("%s is not a struct", structType)
+		}
+
+		if visitedTypes[structType] {
+			return fmt.Errorf("recursion on type %s", structType)
+		}
+
+		for i := 0; i < structType.NumField(); i++ {
+			field := structField{structType.Field(i), structVal.Field(i)}
+
+			name := field.Tag.Get("ebpf")
+			if name != "" {
+				fields = append(fields, field)
+				continue
+			}
+
+			var err error
+			switch field.Type.Kind() {
+			case reflect.Ptr:
+				if field.Type.Elem().Kind() != reflect.Struct {
+					continue
+				}
+
+				if field.value.IsNil() {
+					return fmt.Errorf("nil pointer to %s", structType)
+				}
+
+				err = flattenStruct(field.value.Elem())
+
+			case reflect.Struct:
+				err = flattenStruct(field.value)
+
+			default:
+				continue
+			}
+
+			if err != nil {
+				return fmt.Errorf("field %s: %s", field.Name, err)
+			}
+		}
+
+		return nil
+	}
+
+	toValue := reflect.ValueOf(to)
+	if toValue.Type().Kind() != reflect.Ptr {
+		return fmt.Errorf("%T is not a pointer to struct", to)
+	}
+
+	if toValue.IsNil() {
+		return fmt.Errorf("nil pointer to %T", to)
+	}
+
+	if err := flattenStruct(toValue.Elem()); err != nil {
+		return err
+	}
+
+	type elem struct {
+		// Either *Map or *Program
+		typ  reflect.Type
+		name string
+	}
+
+	assignedTo := make(map[elem]string)
+	for _, field := range fields {
+		name := field.Tag.Get("ebpf")
+		if strings.Contains(name, ",") {
+			return fmt.Errorf("field %s: ebpf tag contains a comma", field.Name)
+		}
+
+		e := elem{field.Type, name}
+		if assignedField := assignedTo[e]; assignedField != "" {
+			return fmt.Errorf("field %s: %q was already assigned to %s", field.Name, name, assignedField)
+		}
+
+		value, err := valueOf(field.Type, name)
+		if err != nil {
+			return fmt.Errorf("field %s: %w", field.Name, err)
+		}
+
+		if !field.value.CanSet() {
+			return fmt.Errorf("field %s: can't set value", field.Name)
+		}
+
+		field.value.Set(value)
+		assignedTo[e] = field.Name
+	}
+
+	return nil
+}

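`CollectionSpec.Assign` and `LoadAndAssign` turn the `ebpf` struct tags into the lookup loop you would otherwise write by hand. A minimal sketch of the pattern, reusing the placeholder names from the doc comments above and a placeholder object path:

```go
package main

import (
	"log"

	"github.com/cilium/ebpf"
)

// objs uses the tag names from the LoadAndAssign doc comment;
// "xdp_foo" and "bar_map" must exist in the loaded object file.
type objs struct {
	Foo *ebpf.Program `ebpf:"xdp_foo"`
	Bar *ebpf.Map     `ebpf:"bar_map"`
}

func main() {
	spec, err := ebpf.LoadCollectionSpec("prog.o") // placeholder path
	if err != nil {
		log.Fatal(err)
	}

	var o objs
	if err := spec.LoadAndAssign(&o, nil); err != nil { // opts may be nil
		log.Fatal(err)
	}
	defer o.Foo.Close()
	defer o.Bar.Close()
}
```

Only the map and program named in the tags are created in the kernel, which is the advantage over loading the whole collection and assigning from it afterwards.
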
+ 1 - 2
vendor/github.com/cilium/ebpf/doc.go

@@ -12,6 +12,5 @@
 // eBPF code should be compiled ahead of time using clang, and shipped with
 // your application as any other resource.
 //
-// This package doesn't include code required to attach eBPF to Linux
-// subsystems, since this varies per subsystem.
+// Use the link subpackage to attach a loaded program to a hook in the kernel.
 package ebpf

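A hedged sketch of what the updated doc comment points at, assuming the v0.5.0-era two-argument `link.Kprobe` signature (the link subpackage itself is not vendored by this PR) and a placeholder kernel symbol:

```go
package main

import (
	"log"

	"github.com/cilium/ebpf"
	"github.com/cilium/ebpf/link"
)

func attach(prog *ebpf.Program) {
	// "sys_execve" is a placeholder symbol; the two-argument signature
	// is the one shipped around v0.5.0 and may differ in later releases.
	kp, err := link.Kprobe("sys_execve", prog)
	if err != nil {
		log.Fatal(err)
	}
	defer kp.Close()
}
```
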
+ 456 - 265
vendor/github.com/cilium/ebpf/elf_reader.go

@@ -1,6 +1,7 @@
 package ebpf
 
 import (
+	"bufio"
 	"bytes"
 	"debug/elf"
 	"encoding/binary"
@@ -17,12 +18,14 @@ import (
 	"github.com/cilium/ebpf/internal/unix"
 )
 
+// elfCode is a convenience to reduce the amount of arguments that have to
+// be passed around explicitly. You should treat its contents as immutable.
 type elfCode struct {
-	*elf.File
-	symbols           []elf.Symbol
-	symbolsPerSection map[elf.SectionIndex]map[uint64]elf.Symbol
-	license           string
-	version           uint32
+	*internal.SafeELFFile
+	sections map[elf.SectionIndex]*elfSection
+	license  string
+	version  uint32
+	btf      *btf.Spec
 }
 
 // LoadCollectionSpec parses an ELF file into a CollectionSpec.
@@ -42,63 +45,52 @@ func LoadCollectionSpec(file string) (*CollectionSpec, error) {
 
 // LoadCollectionSpecFromReader parses an ELF file into a CollectionSpec.
 func LoadCollectionSpecFromReader(rd io.ReaderAt) (*CollectionSpec, error) {
-	f, err := elf.NewFile(rd)
+	f, err := internal.NewSafeELFFile(rd)
 	if err != nil {
 		return nil, err
 	}
 	defer f.Close()
 
-	symbols, err := f.Symbols()
-	if err != nil {
-		return nil, fmt.Errorf("load symbols: %v", err)
-	}
-
-	ec := &elfCode{f, symbols, symbolsPerSection(symbols), "", 0}
-
 	var (
 		licenseSection *elf.Section
 		versionSection *elf.Section
-		btfMaps        = make(map[elf.SectionIndex]*elf.Section)
-		progSections   = make(map[elf.SectionIndex]*elf.Section)
+		sections       = make(map[elf.SectionIndex]*elfSection)
 		relSections    = make(map[elf.SectionIndex]*elf.Section)
-		mapSections    = make(map[elf.SectionIndex]*elf.Section)
-		dataSections   = make(map[elf.SectionIndex]*elf.Section)
 	)
 
-	for i, sec := range ec.Sections {
+	// This is the target of relocations generated by inline assembly.
+	sections[elf.SHN_UNDEF] = newElfSection(new(elf.Section), undefSection)
+
+	// Collect all the sections we're interested in. This includes relocations
+	// which we parse later.
+	for i, sec := range f.Sections {
+		idx := elf.SectionIndex(i)
+
 		switch {
 		case strings.HasPrefix(sec.Name, "license"):
 			licenseSection = sec
 		case strings.HasPrefix(sec.Name, "version"):
 			versionSection = sec
 		case strings.HasPrefix(sec.Name, "maps"):
-			mapSections[elf.SectionIndex(i)] = sec
+			sections[idx] = newElfSection(sec, mapSection)
 		case sec.Name == ".maps":
-			btfMaps[elf.SectionIndex(i)] = sec
-		case sec.Name == ".bss" || sec.Name == ".rodata" || sec.Name == ".data":
-			dataSections[elf.SectionIndex(i)] = sec
+			sections[idx] = newElfSection(sec, btfMapSection)
+		case sec.Name == ".bss" || sec.Name == ".data" || strings.HasPrefix(sec.Name, ".rodata"):
+			sections[idx] = newElfSection(sec, dataSection)
 		case sec.Type == elf.SHT_REL:
-			if int(sec.Info) >= len(ec.Sections) {
-				return nil, fmt.Errorf("found relocation section %v for missing section %v", i, sec.Info)
-			}
-
 			// Store relocations under the section index of the target
-			idx := elf.SectionIndex(sec.Info)
-			if relSections[idx] != nil {
-				return nil, fmt.Errorf("section %d has multiple relocation sections", sec.Info)
-			}
-			relSections[idx] = sec
+			relSections[elf.SectionIndex(sec.Info)] = sec
 		case sec.Type == elf.SHT_PROGBITS && (sec.Flags&elf.SHF_EXECINSTR) != 0 && sec.Size > 0:
-			progSections[elf.SectionIndex(i)] = sec
+			sections[idx] = newElfSection(sec, programSection)
 		}
 	}
 
-	ec.license, err = loadLicense(licenseSection)
+	license, err := loadLicense(licenseSection)
 	if err != nil {
 		return nil, fmt.Errorf("load license: %w", err)
 	}
 
-	ec.version, err = loadVersion(versionSection, ec.ByteOrder)
+	version, err := loadVersion(versionSection, f.ByteOrder)
 	if err != nil {
 		return nil, fmt.Errorf("load version: %w", err)
 	}
@@ -108,37 +100,90 @@ func LoadCollectionSpecFromReader(rd io.ReaderAt) (*CollectionSpec, error) {
 		return nil, fmt.Errorf("load BTF: %w", err)
 	}
 
-	relocations, referencedSections, err := ec.loadRelocations(relSections)
+	// Assign symbols to all the sections we're interested in.
+	symbols, err := f.Symbols()
 	if err != nil {
-		return nil, fmt.Errorf("load relocations: %w", err)
+		return nil, fmt.Errorf("load symbols: %v", err)
 	}
 
-	maps := make(map[string]*MapSpec)
-	if err := ec.loadMaps(maps, mapSections); err != nil {
-		return nil, fmt.Errorf("load maps: %w", err)
-	}
+	for _, symbol := range symbols {
+		idx := symbol.Section
+		symType := elf.ST_TYPE(symbol.Info)
 
-	if len(btfMaps) > 0 {
-		if err := ec.loadBTFMaps(maps, btfMaps, btfSpec); err != nil {
-			return nil, fmt.Errorf("load BTF maps: %w", err)
+		section := sections[idx]
+		if section == nil {
+			continue
 		}
+
+		// Older versions of LLVM don't tag symbols correctly, so keep
+		// all NOTYPE ones.
+		keep := symType == elf.STT_NOTYPE
+		switch section.kind {
+		case mapSection, btfMapSection, dataSection:
+			keep = keep || symType == elf.STT_OBJECT
+		case programSection:
+			keep = keep || symType == elf.STT_FUNC
+		}
+		if !keep || symbol.Name == "" {
+			continue
+		}
+
+		section.symbols[symbol.Value] = symbol
 	}
 
-	if len(dataSections) > 0 {
-		for idx := range dataSections {
-			if !referencedSections[idx] {
-				// Prune data sections which are not referenced by any
-				// instructions.
-				delete(dataSections, idx)
-			}
+	ec := &elfCode{
+		SafeELFFile: f,
+		sections:    sections,
+		license:     license,
+		version:     version,
+		btf:         btfSpec,
+	}
+
+	// Go through relocation sections, and parse the ones for sections we're
+	// interested in. Make sure that relocations point at valid sections.
+	for idx, relSection := range relSections {
+		section := sections[idx]
+		if section == nil {
+			continue
 		}
 
-		if err := ec.loadDataSections(maps, dataSections, btfSpec); err != nil {
-			return nil, fmt.Errorf("load data sections: %w", err)
+		rels, err := ec.loadRelocations(relSection, symbols)
+		if err != nil {
+			return nil, fmt.Errorf("relocation for section %q: %w", section.Name, err)
+		}
+
+		for _, rel := range rels {
+			target := sections[rel.Section]
+			if target == nil {
+				return nil, fmt.Errorf("section %q: reference to %q in section %s: %w", section.Name, rel.Name, rel.Section, ErrNotSupported)
+			}
+
+			if target.Flags&elf.SHF_STRINGS > 0 {
+				return nil, fmt.Errorf("section %q: string %q is not stack allocated: %w", section.Name, rel.Name, ErrNotSupported)
+			}
+
+			target.references++
 		}
+
+		section.relocations = rels
+	}
+
+	// Collect all the various ways to define maps.
+	maps := make(map[string]*MapSpec)
+	if err := ec.loadMaps(maps); err != nil {
+		return nil, fmt.Errorf("load maps: %w", err)
 	}
 
-	progs, err := ec.loadPrograms(progSections, relocations, btfSpec)
+	if err := ec.loadBTFMaps(maps); err != nil {
+		return nil, fmt.Errorf("load BTF maps: %w", err)
+	}
+
+	if err := ec.loadDataSections(maps); err != nil {
+		return nil, fmt.Errorf("load data sections: %w", err)
+	}
+
+	// Finally, collect programs and link them.
+	progs, err := ec.loadPrograms()
 	if err != nil {
 		return nil, fmt.Errorf("load programs: %w", err)
 	}
@@ -170,33 +215,69 @@ func loadVersion(sec *elf.Section, bo binary.ByteOrder) (uint32, error) {
 	return version, nil
 }
 
-func (ec *elfCode) loadPrograms(progSections map[elf.SectionIndex]*elf.Section, relocations map[elf.SectionIndex]map[uint64]elf.Symbol, btfSpec *btf.Spec) (map[string]*ProgramSpec, error) {
+type elfSectionKind int
+
+const (
+	undefSection elfSectionKind = iota
+	mapSection
+	btfMapSection
+	programSection
+	dataSection
+)
+
+type elfSection struct {
+	*elf.Section
+	kind elfSectionKind
+	// Offset from the start of the section to a symbol
+	symbols map[uint64]elf.Symbol
+	// Offset from the start of the section to a relocation, which points at
+	// a symbol in another section.
+	relocations map[uint64]elf.Symbol
+	// The number of relocations pointing at this section.
+	references int
+}
+
+func newElfSection(section *elf.Section, kind elfSectionKind) *elfSection {
+	return &elfSection{
+		section,
+		kind,
+		make(map[uint64]elf.Symbol),
+		make(map[uint64]elf.Symbol),
+		0,
+	}
+}
+
+func (ec *elfCode) loadPrograms() (map[string]*ProgramSpec, error) {
 	var (
 		progs []*ProgramSpec
 		libs  []*ProgramSpec
 	)
 
-	for idx, sec := range progSections {
-		syms := ec.symbolsPerSection[idx]
-		if len(syms) == 0 {
+	for _, sec := range ec.sections {
+		if sec.kind != programSection {
+			continue
+		}
+
+		if len(sec.symbols) == 0 {
 			return nil, fmt.Errorf("section %v: missing symbols", sec.Name)
 		}
 
-		funcSym, ok := syms[0]
+		funcSym, ok := sec.symbols[0]
 		if !ok {
 			return nil, fmt.Errorf("section %v: no label at start", sec.Name)
 		}
 
-		insns, length, err := ec.loadInstructions(sec, syms, relocations[idx])
+		insns, length, err := ec.loadInstructions(sec)
 		if err != nil {
-			return nil, fmt.Errorf("program %s: can't unmarshal instructions: %w", funcSym.Name, err)
+			return nil, fmt.Errorf("program %s: %w", funcSym.Name, err)
 		}
 
-		progType, attachType, attachTo := getProgType(sec.Name)
+		progType, attachType, progFlags, attachTo := getProgType(sec.Name)
 
 		spec := &ProgramSpec{
 			Name:          funcSym.Name,
 			Type:          progType,
+			Flags:         progFlags,
 			AttachType:    attachType,
 			AttachTo:      attachTo,
 			License:       ec.license,
@@ -205,8 +286,8 @@ func (ec *elfCode) loadPrograms(progSections map[elf.SectionIndex]*elf.Section,
 			ByteOrder:     ec.ByteOrder,
 		}
 
-		if btfSpec != nil {
-			spec.BTF, err = btfSpec.Program(sec.Name, length)
+		if ec.btf != nil {
+			spec.BTF, err = ec.btf.Program(sec.Name, length)
 			if err != nil && !errors.Is(err, btf.ErrNoExtendedInfo) {
 				return nil, fmt.Errorf("program %s: %w", funcSym.Name, err)
 			}
@@ -234,9 +315,9 @@ func (ec *elfCode) loadPrograms(progSections map[elf.SectionIndex]*elf.Section,
 	return res, nil
 }
 
-func (ec *elfCode) loadInstructions(section *elf.Section, symbols, relocations map[uint64]elf.Symbol) (asm.Instructions, uint64, error) {
+func (ec *elfCode) loadInstructions(section *elfSection) (asm.Instructions, uint64, error) {
 	var (
-		r      = section.Open()
+		r      = bufio.NewReader(section.Open())
 		insns  asm.Instructions
 		offset uint64
 	)
@@ -250,11 +331,11 @@ func (ec *elfCode) loadInstructions(section *elf.Section, symbols, relocations m
 			return nil, 0, fmt.Errorf("offset %d: %w", offset, err)
 		}
 
-		ins.Symbol = symbols[offset].Name
+		ins.Symbol = section.symbols[offset].Name
 
-		if rel, ok := relocations[offset]; ok {
+		if rel, ok := section.relocations[offset]; ok {
 			if err = ec.relocateInstruction(&ins, rel); err != nil {
-				return nil, 0, fmt.Errorf("offset %d: can't relocate instruction: %w", offset, err)
+				return nil, 0, fmt.Errorf("offset %d: relocate instruction: %w", offset, err)
 			}
 		}
 
@@ -270,69 +351,66 @@ func (ec *elfCode) relocateInstruction(ins *asm.Instruction, rel elf.Symbol) err
 		name = rel.Name
 	)
 
-	if typ == elf.STT_SECTION {
-		// Symbols with section type do not have a name set. Get it
-		// from the section itself.
-		idx := int(rel.Section)
-		if idx > len(ec.Sections) {
-			return errors.New("out-of-bounds section index")
+	target := ec.sections[rel.Section]
+
+	switch target.kind {
+	case mapSection, btfMapSection:
+		if bind != elf.STB_GLOBAL {
+			return fmt.Errorf("possible erroneous static qualifier on map definition: found reference to %q", name)
 		}
 
-		name = ec.Sections[idx].Name
-	}
+		if typ != elf.STT_OBJECT && typ != elf.STT_NOTYPE {
+			// STT_NOTYPE is generated on clang < 8 which doesn't tag
+			// relocations appropriately.
+			return fmt.Errorf("map load: incorrect relocation type %v", typ)
+		}
 
-outer:
-	switch {
-	case ins.OpCode == asm.LoadImmOp(asm.DWord):
-		// There are two distinct types of a load from a map:
-		// a direct one, where the value is extracted without
-		// a call to map_lookup_elem in eBPF, and an indirect one
-		// that goes via the helper. They are distinguished by
-		// different relocations.
+		ins.Src = asm.PseudoMapFD
+
+		// Mark the instruction as needing an update when creating the
+		// collection.
+		if err := ins.RewriteMapPtr(-1); err != nil {
+			return err
+		}
+
+	case dataSection:
 		switch typ {
 		case elf.STT_SECTION:
-			// This is a direct load since the referenced symbol is a
-			// section. Weirdly, the offset of the real symbol in the
-			// section is encoded in the instruction stream.
 			if bind != elf.STB_LOCAL {
 				return fmt.Errorf("direct load: %s: unsupported relocation %s", name, bind)
 			}
 
-			// For some reason, clang encodes the offset of the symbol its
-			// section in the first basic BPF instruction, while the kernel
-			// expects it in the second one.
-			ins.Constant <<= 32
-			ins.Src = asm.PseudoMapValue
-
-		case elf.STT_NOTYPE:
-			if bind == elf.STB_GLOBAL && rel.Section == elf.SHN_UNDEF {
-				// This is a relocation generated by inline assembly.
-				// We can't do more than assigning ins.Reference.
-				break outer
-			}
-
-			// This is an ELF generated on clang < 8, which doesn't tag
-			// relocations appropriately.
-			fallthrough
-
 		case elf.STT_OBJECT:
 			if bind != elf.STB_GLOBAL {
-				return fmt.Errorf("load: %s: unsupported binding: %s", name, bind)
+				return fmt.Errorf("direct load: %s: unsupported relocation %s", name, bind)
 			}
 
-			ins.Src = asm.PseudoMapFD
-
 		default:
-			return fmt.Errorf("load: %s: unsupported relocation: %s", name, typ)
+			return fmt.Errorf("incorrect relocation type %v for direct map load", typ)
 		}
 
+		// We rely on using the name of the data section as the reference. It
+		// would be nicer to keep the real name in case of an STT_OBJECT, but
+		// it's not clear how to encode that into Instruction.
+		name = target.Name
+
+		// For some reason, clang encodes the offset of the symbol its
+		// section in the first basic BPF instruction, while the kernel
+		// expects it in the second one.
+		ins.Constant <<= 32
+		ins.Src = asm.PseudoMapValue
+
 		// Mark the instruction as needing an update when creating the
 		// collection.
 		if err := ins.RewriteMapPtr(-1); err != nil {
 			return err
 		}
 
-	case ins.OpCode.JumpOp() == asm.Call:
+	case programSection:
+		if ins.OpCode.JumpOp() != asm.Call {
+			return fmt.Errorf("not a call instruction: %s", ins)
+		}
+
 		if ins.Src != asm.PseudoCall {
 			return fmt.Errorf("call: %s: incorrect source register", name)
 		}
@@ -357,7 +435,7 @@ outer:
 				return fmt.Errorf("call: %s: invalid offset %d", name, offset)
 			}
 
-			sym, ok := ec.symbolsPerSection[rel.Section][uint64(offset)]
+			sym, ok := target.symbols[uint64(offset)]
 			if !ok {
 				return fmt.Errorf("call: %s: no symbol at offset %d", name, offset)
 			}
@@ -369,31 +447,46 @@ outer:
 			return fmt.Errorf("call: %s: invalid symbol type %s", name, typ)
 		}
 
+	case undefSection:
+		if bind != elf.STB_GLOBAL {
+			return fmt.Errorf("asm relocation: %s: unsupported binding: %s", name, bind)
+		}
+
+		if typ != elf.STT_NOTYPE {
+			return fmt.Errorf("asm relocation: %s: unsupported type %s", name, typ)
+		}
+
+		// There is nothing to do here but set ins.Reference.
+
 	default:
-		return fmt.Errorf("relocation for unsupported instruction: %s", ins.OpCode)
+		return fmt.Errorf("relocation to %q: %w", target.Name, ErrNotSupported)
 	}
 
 	ins.Reference = name
 	return nil
 }
 
-func (ec *elfCode) loadMaps(maps map[string]*MapSpec, mapSections map[elf.SectionIndex]*elf.Section) error {
-	for idx, sec := range mapSections {
-		syms := ec.symbolsPerSection[idx]
-		if len(syms) == 0 {
+func (ec *elfCode) loadMaps(maps map[string]*MapSpec) error {
+	for _, sec := range ec.sections {
+		if sec.kind != mapSection {
+			continue
+		}
+
+		nSym := len(sec.symbols)
+		if nSym == 0 {
 			return fmt.Errorf("section %v: no symbols", sec.Name)
 		}
 
-		if sec.Size%uint64(len(syms)) != 0 {
+		if sec.Size%uint64(nSym) != 0 {
 			return fmt.Errorf("section %v: map descriptors are not of equal size", sec.Name)
 		}
 
 		var (
-			r    = sec.Open()
-			size = sec.Size / uint64(len(syms))
+			r    = bufio.NewReader(sec.Open())
+			size = sec.Size / uint64(nSym)
 		)
-		for i, offset := 0, uint64(0); i < len(syms); i, offset = i+1, offset+size {
-			mapSym, ok := syms[offset]
+		for i, offset := 0, uint64(0); i < nSym; i, offset = i+1, offset+size {
+			mapSym, ok := sec.symbols[offset]
 			if !ok {
 				return fmt.Errorf("section %s: missing symbol for map at offset %d", sec.Name, offset)
 			}
@@ -431,24 +524,43 @@ func (ec *elfCode) loadMaps(maps map[string]*MapSpec, mapSections map[elf.Sectio
 	return nil
 }
 
-func (ec *elfCode) loadBTFMaps(maps map[string]*MapSpec, mapSections map[elf.SectionIndex]*elf.Section, spec *btf.Spec) error {
-	if spec == nil {
-		return fmt.Errorf("missing BTF")
-	}
+func (ec *elfCode) loadBTFMaps(maps map[string]*MapSpec) error {
+	for _, sec := range ec.sections {
+		if sec.kind != btfMapSection {
+			continue
+		}
 
-	for idx, sec := range mapSections {
-		syms := ec.symbolsPerSection[idx]
-		if len(syms) == 0 {
-			return fmt.Errorf("section %v: no symbols", sec.Name)
+		if ec.btf == nil {
+			return fmt.Errorf("missing BTF")
+		}
+
+		_, err := io.Copy(internal.DiscardZeroes{}, bufio.NewReader(sec.Open()))
+		if err != nil {
+			return fmt.Errorf("section %v: initializing BTF map definitions: %w", sec.Name, internal.ErrNotSupported)
+		}
+
+		var ds btf.Datasec
+		if err := ec.btf.FindType(sec.Name, &ds); err != nil {
+			return fmt.Errorf("cannot find section '%s' in BTF: %w", sec.Name, err)
 		}
 
-		for _, sym := range syms {
-			name := sym.Name
+		for _, vs := range ds.Vars {
+			v, ok := vs.Type.(*btf.Var)
+			if !ok {
+				return fmt.Errorf("section %v: unexpected type %s", sec.Name, vs.Type)
+			}
+			name := string(v.Name)
+
 			if maps[name] != nil {
-				return fmt.Errorf("section %v: map %v already exists", sec.Name, sym)
+				return fmt.Errorf("section %v: map %s already exists", sec.Name, name)
+			}
+
+			mapStruct, ok := v.Type.(*btf.Struct)
+			if !ok {
+				return fmt.Errorf("expected struct, got %s", v.Type)
 			}
 
-			mapSpec, err := mapSpecFromBTF(spec, name)
+			mapSpec, err := mapSpecFromBTF(name, mapStruct, false, ec.btf)
 			if err != nil {
 				return fmt.Errorf("map %v: %w", name, err)
 			}
@@ -460,30 +572,21 @@ func (ec *elfCode) loadBTFMaps(maps map[string]*MapSpec, mapSections map[elf.Sec
 	return nil
 }
 
-func mapSpecFromBTF(spec *btf.Spec, name string) (*MapSpec, error) {
-	btfMap, btfMapMembers, err := spec.Map(name)
-	if err != nil {
-		return nil, fmt.Errorf("can't get BTF: %w", err)
-	}
-
-	keyType := btf.MapKey(btfMap)
-	size, err := btf.Sizeof(keyType)
-	if err != nil {
-		return nil, fmt.Errorf("can't get size of BTF key: %w", err)
-	}
-	keySize := uint32(size)
-
-	valueType := btf.MapValue(btfMap)
-	size, err = btf.Sizeof(valueType)
-	if err != nil {
-		return nil, fmt.Errorf("can't get size of BTF value: %w", err)
-	}
-	valueSize := uint32(size)
+// mapSpecFromBTF produces a MapSpec based on a btf.Struct def representing
+// a BTF map definition. The name and spec arguments will be copied to the
+// resulting MapSpec, and inner must be true on any recursive invocations.
+func mapSpecFromBTF(name string, def *btf.Struct, inner bool, spec *btf.Spec) (*MapSpec, error) {
 
 	var (
+		key, value                 btf.Type
+		keySize, valueSize         uint32
 		mapType, flags, maxEntries uint32
+		pinType                    PinType
+		innerMapSpec               *MapSpec
+		err                        error
 	)
-	for _, member := range btfMapMembers {
+
+	for i, member := range def.Members {
 		switch member.Name {
 		case "type":
 			mapType, err = uintFromBTF(member.Type)
@@ -503,8 +606,48 @@ func mapSpecFromBTF(spec *btf.Spec, name string) (*MapSpec, error) {
 				return nil, fmt.Errorf("can't get BTF map max entries: %w", err)
 			}
 
+		case "key":
+			if keySize != 0 {
+				return nil, errors.New("both key and key_size given")
+			}
+
+			pk, ok := member.Type.(*btf.Pointer)
+			if !ok {
+				return nil, fmt.Errorf("key type is not a pointer: %T", member.Type)
+			}
+
+			key = pk.Target
+
+			size, err := btf.Sizeof(pk.Target)
+			if err != nil {
+				return nil, fmt.Errorf("can't get size of BTF key: %w", err)
+			}
+
+			keySize = uint32(size)
+
+		case "value":
+			if valueSize != 0 {
+				return nil, errors.New("both value and value_size given")
+			}
+
+			vk, ok := member.Type.(*btf.Pointer)
+			if !ok {
+				return nil, fmt.Errorf("value type is not a pointer: %T", member.Type)
+			}
+
+			value = vk.Target
+
+			size, err := btf.Sizeof(vk.Target)
+			if err != nil {
+				return nil, fmt.Errorf("can't get size of BTF value: %w", err)
+			}
+
+			valueSize = uint32(size)
+
 		case "key_size":
-			if _, isVoid := keyType.(*btf.Void); !isVoid {
+			// Key needs to be nil and keySize needs to be 0 for key_size to be
+			// considered a valid member.
+			if key != nil || keySize != 0 {
 				return nil, errors.New("both key and key_size given")
 			}
 
@@ -514,7 +657,9 @@ func mapSpecFromBTF(spec *btf.Spec, name string) (*MapSpec, error) {
 			}
 
 		case "value_size":
-			if _, isVoid := valueType.(*btf.Void); !isVoid {
+			// Value needs to be nil and valueSize needs to be 0 for value_size to be
+			// considered a valid member.
+			if value != nil || valueSize != 0 {
 				return nil, errors.New("both value and value_size given")
 			}
 
@@ -524,28 +669,79 @@ func mapSpecFromBTF(spec *btf.Spec, name string) (*MapSpec, error) {
 			}
 
 		case "pinning":
+			if inner {
+				return nil, errors.New("inner maps can't be pinned")
+			}
+
 			pinning, err := uintFromBTF(member.Type)
 			if err != nil {
 				return nil, fmt.Errorf("can't get pinning: %w", err)
 			}
 
-			if pinning != 0 {
-				return nil, fmt.Errorf("'pinning' attribute not supported: %w", ErrNotSupported)
+			pinType = PinType(pinning)
+
+		case "values":
+			// The 'values' field in BTF map definitions is used for declaring map
+			// value types that are references to other BPF objects, like other maps
+			// or programs. It is always expected to be an array of pointers.
+			if i != len(def.Members)-1 {
+				return nil, errors.New("'values' must be the last member in a BTF map definition")
+			}
+
+			if valueSize != 0 && valueSize != 4 {
+				return nil, errors.New("value_size must be 0 or 4")
+			}
+			valueSize = 4
+
+			valueType, err := resolveBTFArrayMacro(member.Type)
+			if err != nil {
+				return nil, fmt.Errorf("can't resolve type of member 'values': %w", err)
+			}
+
+			switch t := valueType.(type) {
+			case *btf.Struct:
+				// The values member pointing to an array of structs means we're expecting
+				// a map-in-map declaration.
+				if MapType(mapType) != ArrayOfMaps && MapType(mapType) != HashOfMaps {
+					return nil, errors.New("outer map needs to be an array or a hash of maps")
+				}
+				if inner {
+					return nil, fmt.Errorf("nested inner maps are not supported")
+				}
+
+				// This inner map spec is used as a map template, but it needs to be
+				// created as a traditional map before it can be used to do so.
+				// libbpf names the inner map template '<outer_name>.inner', but we
+				// opted for _inner to simplify validation logic (dots in map
+				// names are only supported on kernels 5.2 and up).
+				// Pass the BTF spec from the parent object, since both parent and
+				// child must be created from the same BTF blob (on kernels that support BTF).
+				innerMapSpec, err = mapSpecFromBTF(name+"_inner", t, true, spec)
+				if err != nil {
+					return nil, fmt.Errorf("can't parse BTF map definition of inner map: %w", err)
+				}
+
+			default:
+				return nil, fmt.Errorf("unsupported value type %q in 'values' field", t)
 			}
 
-		case "key", "value":
 		default:
 			return nil, fmt.Errorf("unrecognized field %s in BTF map definition", member.Name)
 		}
 	}
 
+	bm := btf.NewMap(spec, key, value)
+
 	return &MapSpec{
+		Name:       SanitizeName(name, -1),
 		Type:       MapType(mapType),
 		KeySize:    keySize,
 		ValueSize:  valueSize,
 		MaxEntries: maxEntries,
 		Flags:      flags,
-		BTF:        btfMap,
+		BTF:        &bm,
+		Pinning:    pinType,
+		InnerMap:   innerMapSpec,
 	}, nil
 }
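For reference, the BTF map definitions parsed above are written on the C side roughly as follows; a minimal sketch, assuming libbpf-style __uint/__type/__array macros and placement in the .maps section (all names are illustrative):

	struct inner_map {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, __u32);
		__type(value, __u64);
	};

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
		__uint(max_entries, 8);
		__type(key, __u32);
		__array(values, struct inner_map); /* must be the last member */
	} outer_map SEC(".maps");

The 'values' member triggers the map-in-map path above, which is why it must come last and forces a 4-byte value size.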
 
@@ -565,13 +761,40 @@ func uintFromBTF(typ btf.Type) (uint32, error) {
 	return arr.Nelems, nil
 }
 
-func (ec *elfCode) loadDataSections(maps map[string]*MapSpec, dataSections map[elf.SectionIndex]*elf.Section, spec *btf.Spec) error {
-	if spec == nil {
-		return errors.New("data sections require BTF, make sure all consts are marked as static")
+// resolveBTFArrayMacro resolves the __array macro, which declares an array
+// of pointers to a given type. This function returns the target Type of
+// the pointers in the array.
+func resolveBTFArrayMacro(typ btf.Type) (btf.Type, error) {
+	arr, ok := typ.(*btf.Array)
+	if !ok {
+		return nil, fmt.Errorf("not an array: %v", typ)
+	}
+
+	ptr, ok := arr.Type.(*btf.Pointer)
+	if !ok {
+		return nil, fmt.Errorf("not an array of pointers: %v", typ)
 	}
 
-	for _, sec := range dataSections {
-		btfMap, err := spec.Datasec(sec.Name)
+	return ptr.Target, nil
+}
+
+func (ec *elfCode) loadDataSections(maps map[string]*MapSpec) error {
+	for _, sec := range ec.sections {
+		if sec.kind != dataSection {
+			continue
+		}
+
+		if sec.references == 0 {
+			// Prune data sections which are not referenced by any
+			// instructions.
+			continue
+		}
+
+		if ec.btf == nil {
+			return errors.New("data sections require BTF, make sure all consts are marked as static")
+		}
+
+		btfMap, err := ec.btf.Datasec(sec.Name)
 		if err != nil {
 			return err
 		}
@@ -609,54 +832,61 @@ func (ec *elfCode) loadDataSections(maps map[string]*MapSpec, dataSections map[e
 	return nil
 }
 
-func getProgType(sectionName string) (ProgramType, AttachType, string) {
+func getProgType(sectionName string) (ProgramType, AttachType, uint32, string) {
 	types := map[string]struct {
 		progType   ProgramType
 		attachType AttachType
+		progFlags  uint32
 	}{
 		// From https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/libbpf.c
-		"socket":                {SocketFilter, AttachNone},
-		"seccomp":               {SocketFilter, AttachNone},
-		"kprobe/":               {Kprobe, AttachNone},
-		"uprobe/":               {Kprobe, AttachNone},
-		"kretprobe/":            {Kprobe, AttachNone},
-		"uretprobe/":            {Kprobe, AttachNone},
-		"tracepoint/":           {TracePoint, AttachNone},
-		"raw_tracepoint/":       {RawTracepoint, AttachNone},
-		"xdp":                   {XDP, AttachNone},
-		"perf_event":            {PerfEvent, AttachNone},
-		"lwt_in":                {LWTIn, AttachNone},
-		"lwt_out":               {LWTOut, AttachNone},
-		"lwt_xmit":              {LWTXmit, AttachNone},
-		"lwt_seg6local":         {LWTSeg6Local, AttachNone},
-		"sockops":               {SockOps, AttachCGroupSockOps},
-		"sk_skb/stream_parser":  {SkSKB, AttachSkSKBStreamParser},
-		"sk_skb/stream_verdict": {SkSKB, AttachSkSKBStreamParser},
-		"sk_msg":                {SkMsg, AttachSkSKBStreamVerdict},
-		"lirc_mode2":            {LircMode2, AttachLircMode2},
-		"flow_dissector":        {FlowDissector, AttachFlowDissector},
-		"iter/":                 {Tracing, AttachTraceIter},
-
-		"cgroup_skb/ingress": {CGroupSKB, AttachCGroupInetIngress},
-		"cgroup_skb/egress":  {CGroupSKB, AttachCGroupInetEgress},
-		"cgroup/dev":         {CGroupDevice, AttachCGroupDevice},
-		"cgroup/skb":         {CGroupSKB, AttachNone},
-		"cgroup/sock":        {CGroupSock, AttachCGroupInetSockCreate},
-		"cgroup/post_bind4":  {CGroupSock, AttachCGroupInet4PostBind},
-		"cgroup/post_bind6":  {CGroupSock, AttachCGroupInet6PostBind},
-		"cgroup/bind4":       {CGroupSockAddr, AttachCGroupInet4Bind},
-		"cgroup/bind6":       {CGroupSockAddr, AttachCGroupInet6Bind},
-		"cgroup/connect4":    {CGroupSockAddr, AttachCGroupInet4Connect},
-		"cgroup/connect6":    {CGroupSockAddr, AttachCGroupInet6Connect},
-		"cgroup/sendmsg4":    {CGroupSockAddr, AttachCGroupUDP4Sendmsg},
-		"cgroup/sendmsg6":    {CGroupSockAddr, AttachCGroupUDP6Sendmsg},
-		"cgroup/recvmsg4":    {CGroupSockAddr, AttachCGroupUDP4Recvmsg},
-		"cgroup/recvmsg6":    {CGroupSockAddr, AttachCGroupUDP6Recvmsg},
-		"cgroup/sysctl":      {CGroupSysctl, AttachCGroupSysctl},
-		"cgroup/getsockopt":  {CGroupSockopt, AttachCGroupGetsockopt},
-		"cgroup/setsockopt":  {CGroupSockopt, AttachCGroupSetsockopt},
-		"classifier":         {SchedCLS, AttachNone},
-		"action":             {SchedACT, AttachNone},
+		"socket":                {SocketFilter, AttachNone, 0},
+		"seccomp":               {SocketFilter, AttachNone, 0},
+		"kprobe/":               {Kprobe, AttachNone, 0},
+		"uprobe/":               {Kprobe, AttachNone, 0},
+		"kretprobe/":            {Kprobe, AttachNone, 0},
+		"uretprobe/":            {Kprobe, AttachNone, 0},
+		"tracepoint/":           {TracePoint, AttachNone, 0},
+		"raw_tracepoint/":       {RawTracepoint, AttachNone, 0},
+		"xdp":                   {XDP, AttachNone, 0},
+		"perf_event":            {PerfEvent, AttachNone, 0},
+		"lwt_in":                {LWTIn, AttachNone, 0},
+		"lwt_out":               {LWTOut, AttachNone, 0},
+		"lwt_xmit":              {LWTXmit, AttachNone, 0},
+		"lwt_seg6local":         {LWTSeg6Local, AttachNone, 0},
+		"sockops":               {SockOps, AttachCGroupSockOps, 0},
+		"sk_skb/stream_parser":  {SkSKB, AttachSkSKBStreamParser, 0},
+		"sk_skb/stream_verdict": {SkSKB, AttachSkSKBStreamParser, 0},
+		"sk_msg":                {SkMsg, AttachSkSKBStreamVerdict, 0},
+		"lirc_mode2":            {LircMode2, AttachLircMode2, 0},
+		"flow_dissector":        {FlowDissector, AttachFlowDissector, 0},
+		"iter/":                 {Tracing, AttachTraceIter, 0},
+		"fentry.s/":             {Tracing, AttachTraceFEntry, unix.BPF_F_SLEEPABLE},
+		"fmod_ret.s/":           {Tracing, AttachModifyReturn, unix.BPF_F_SLEEPABLE},
+		"fexit.s/":              {Tracing, AttachTraceFExit, unix.BPF_F_SLEEPABLE},
+		"sk_lookup/":            {SkLookup, AttachSkLookup, 0},
+		"lsm/":                  {LSM, AttachLSMMac, 0},
+		"lsm.s/":                {LSM, AttachLSMMac, unix.BPF_F_SLEEPABLE},
+
+		"cgroup_skb/ingress": {CGroupSKB, AttachCGroupInetIngress, 0},
+		"cgroup_skb/egress":  {CGroupSKB, AttachCGroupInetEgress, 0},
+		"cgroup/dev":         {CGroupDevice, AttachCGroupDevice, 0},
+		"cgroup/skb":         {CGroupSKB, AttachNone, 0},
+		"cgroup/sock":        {CGroupSock, AttachCGroupInetSockCreate, 0},
+		"cgroup/post_bind4":  {CGroupSock, AttachCGroupInet4PostBind, 0},
+		"cgroup/post_bind6":  {CGroupSock, AttachCGroupInet6PostBind, 0},
+		"cgroup/bind4":       {CGroupSockAddr, AttachCGroupInet4Bind, 0},
+		"cgroup/bind6":       {CGroupSockAddr, AttachCGroupInet6Bind, 0},
+		"cgroup/connect4":    {CGroupSockAddr, AttachCGroupInet4Connect, 0},
+		"cgroup/connect6":    {CGroupSockAddr, AttachCGroupInet6Connect, 0},
+		"cgroup/sendmsg4":    {CGroupSockAddr, AttachCGroupUDP4Sendmsg, 0},
+		"cgroup/sendmsg6":    {CGroupSockAddr, AttachCGroupUDP6Sendmsg, 0},
+		"cgroup/recvmsg4":    {CGroupSockAddr, AttachCGroupUDP4Recvmsg, 0},
+		"cgroup/recvmsg6":    {CGroupSockAddr, AttachCGroupUDP6Recvmsg, 0},
+		"cgroup/sysctl":      {CGroupSysctl, AttachCGroupSysctl, 0},
+		"cgroup/getsockopt":  {CGroupSockopt, AttachCGroupGetsockopt, 0},
+		"cgroup/setsockopt":  {CGroupSockopt, AttachCGroupSetsockopt, 0},
+		"classifier":         {SchedCLS, AttachNone, 0},
+		"action":             {SchedACT, AttachNone, 0},
 	}
 
 	for prefix, t := range types {
@@ -665,78 +895,39 @@ func getProgType(sectionName string) (ProgramType, AttachType, string) {
 		}
 
 		if !strings.HasSuffix(prefix, "/") {
-			return t.progType, t.attachType, ""
+			return t.progType, t.attachType, t.progFlags, ""
 		}
 
-		return t.progType, t.attachType, sectionName[len(prefix):]
+		return t.progType, t.attachType, t.progFlags, sectionName[len(prefix):]
 	}
 
-	return UnspecifiedProgram, AttachNone, ""
+	return UnspecifiedProgram, AttachNone, 0, ""
 }
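On the C side, the prefix lookup above matches the ELF section selected with the SEC() macro; a minimal sketch, assuming a libbpf-style SEC() macro (the program name is illustrative):

	SEC("cgroup/connect4") /* -> CGroupSockAddr, AttachCGroupInet4Connect */
	int allow_connect4(struct bpf_sock_addr *ctx)
	{
		return 1; /* permit the connect() */
	}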
 
-func (ec *elfCode) loadRelocations(sections map[elf.SectionIndex]*elf.Section) (map[elf.SectionIndex]map[uint64]elf.Symbol, map[elf.SectionIndex]bool, error) {
-	result := make(map[elf.SectionIndex]map[uint64]elf.Symbol)
-	targets := make(map[elf.SectionIndex]bool)
-	for idx, sec := range sections {
-		rels := make(map[uint64]elf.Symbol)
-
-		if sec.Entsize < 16 {
-			return nil, nil, fmt.Errorf("section %s: relocations are less than 16 bytes", sec.Name)
-		}
-
-		r := sec.Open()
-		for off := uint64(0); off < sec.Size; off += sec.Entsize {
-			ent := io.LimitReader(r, int64(sec.Entsize))
+func (ec *elfCode) loadRelocations(sec *elf.Section, symbols []elf.Symbol) (map[uint64]elf.Symbol, error) {
+	rels := make(map[uint64]elf.Symbol)
 
-			var rel elf.Rel64
-			if binary.Read(ent, ec.ByteOrder, &rel) != nil {
-				return nil, nil, fmt.Errorf("can't parse relocation at offset %v", off)
-			}
-
-			symNo := int(elf.R_SYM64(rel.Info) - 1)
-			if symNo >= len(ec.symbols) {
-				return nil, nil, fmt.Errorf("relocation at offset %d: symbol %v doesnt exist", off, symNo)
-			}
-
-			symbol := ec.symbols[symNo]
-			targets[symbol.Section] = true
-			rels[rel.Off] = ec.symbols[symNo]
-		}
-
-		result[idx] = rels
+	if sec.Entsize < 16 {
+		return nil, fmt.Errorf("section %s: relocations are less than 16 bytes", sec.Name)
 	}
-	return result, targets, nil
-}
 
-func symbolsPerSection(symbols []elf.Symbol) map[elf.SectionIndex]map[uint64]elf.Symbol {
-	result := make(map[elf.SectionIndex]map[uint64]elf.Symbol)
-	for _, sym := range symbols {
-		switch elf.ST_TYPE(sym.Info) {
-		case elf.STT_NOTYPE:
-			// Older versions of LLVM doesn't tag
-			// symbols correctly.
-			break
-		case elf.STT_OBJECT:
-			break
-		case elf.STT_FUNC:
-			break
-		default:
-			continue
-		}
+	r := bufio.NewReader(sec.Open())
+	for off := uint64(0); off < sec.Size; off += sec.Entsize {
+		ent := io.LimitReader(r, int64(sec.Entsize))
 
-		if sym.Section == elf.SHN_UNDEF || sym.Section >= elf.SHN_LORESERVE {
-			continue
+		var rel elf.Rel64
+		if binary.Read(ent, ec.ByteOrder, &rel) != nil {
+			return nil, fmt.Errorf("can't parse relocation at offset %v", off)
 		}
 
-		if sym.Name == "" {
-			continue
+		symNo := int(elf.R_SYM64(rel.Info) - 1)
+		if symNo >= len(symbols) {
+			return nil, fmt.Errorf("offset %d: symbol %d doesn't exist", off, symNo)
 		}
 
-		idx := sym.Section
-		if _, ok := result[idx]; !ok {
-			result[idx] = make(map[uint64]elf.Symbol)
-		}
-		result[idx][sym.Value] = sym
+		symbol := symbols[symNo]
+		rels[rel.Off] = symbol
 	}
-	return result
+
+	return rels, nil
 }

+ 21 - 0
vendor/github.com/cilium/ebpf/elf_reader_fuzz.go

@@ -0,0 +1,21 @@
+// +build gofuzz
+
+// Use with https://github.com/dvyukov/go-fuzz
+
+package ebpf
+
+import "bytes"
+
+func FuzzLoadCollectionSpec(data []byte) int {
+	spec, err := LoadCollectionSpecFromReader(bytes.NewReader(data))
+	if err != nil {
+		if spec != nil {
+			panic("spec is not nil")
+		}
+		return 0
+	}
+	if spec == nil {
+		panic("spec is nil")
+	}
+	return 1
+}

+ 6 - 0
vendor/github.com/cilium/ebpf/examples/README.md

@@ -0,0 +1,6 @@
+# eBPF Examples
+
+- [kprobe](kprobe/) - Attach a program to the entry or exit of an arbitrary kernel symbol (function).
+- [uprobe](uprobe/) - Like a kprobe, but for symbols in userspace binaries (e.g. `bash`).
+- [tracepoint](tracepoint/) - Attach a program to predetermined kernel tracepoints.
+- Add your use case(s) here!

+ 9 - 0
vendor/github.com/cilium/ebpf/examples/go.mod

@@ -0,0 +1,9 @@
+module github.com/cilium/ebpf/examples
+
+go 1.15
+
+require (
+	github.com/cilium/ebpf v0.4.1-0.20210401155455-cb5b8b6084b4 // indirect
+	github.com/elastic/go-perf v0.0.0-20191212140718-9c656876f595
+	golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c
+)

+ 3265 - 0
vendor/github.com/cilium/ebpf/examples/headers/bpf_helper_defs.h

@@ -0,0 +1,3265 @@
+/* This is auto-generated file. See bpf_helpers_doc.py for details. */
+
+/* Forward declarations of BPF structs */
+struct bpf_fib_lookup;
+struct bpf_sk_lookup;
+struct bpf_perf_event_data;
+struct bpf_perf_event_value;
+struct bpf_pidns_info;
+struct bpf_sock;
+struct bpf_sock_addr;
+struct bpf_sock_ops;
+struct bpf_sock_tuple;
+struct bpf_spin_lock;
+struct bpf_sysctl;
+struct bpf_tcp_sock;
+struct bpf_tunnel_key;
+struct bpf_xfrm_state;
+struct pt_regs;
+struct sk_reuseport_md;
+struct sockaddr;
+struct tcphdr;
+struct seq_file;
+struct tcp6_sock;
+struct tcp_sock;
+struct tcp_timewait_sock;
+struct tcp_request_sock;
+struct udp6_sock;
+struct task_struct;
+struct __sk_buff;
+struct sk_msg_md;
+struct xdp_md;
+
+/*
+ * bpf_map_lookup_elem
+ *
+ * 	Perform a lookup in *map* for an entry associated to *key*.
+ *
+ * Returns
+ * 	Map value associated to *key*, or **NULL** if no entry was
+ * 	found.
+ */
+static void *(*bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1;
+
+/*
+ * bpf_map_update_elem
+ *
+ * 	Add or update the value of the entry associated to *key* in
+ * 	*map* with *value*. *flags* is one of:
+ *
+ * 	**BPF_NOEXIST**
+ * 		The entry for *key* must not exist in the map.
+ * 	**BPF_EXIST**
+ * 		The entry for *key* must already exist in the map.
+ * 	**BPF_ANY**
+ * 		No condition on the existence of the entry for *key*.
+ *
+ * 	Flag value **BPF_NOEXIST** cannot be used for maps of types
+ * 	**BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all
+ * 	elements always exist); the helper would return an error.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_map_update_elem)(void *map, const void *key, const void *value, __u64 flags) = (void *) 2;
+
+/*
+ * bpf_map_delete_elem
+ *
+ * 	Delete entry with *key* from *map*.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_map_delete_elem)(void *map, const void *key) = (void *) 3;
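Taken together, the three helpers above cover the common read-modify-write pattern; a minimal sketch, assuming a map named counters is defined elsewhere in the same object:

	__u32 key = 0;
	__u64 one = 1, *val;

	/* "counters" is an assumed map defined elsewhere in this object */
	val = bpf_map_lookup_elem(&counters, &key);
	if (val)
		__sync_fetch_and_add(val, 1); /* bump the existing entry in place */
	else
		bpf_map_update_elem(&counters, &key, &one, BPF_ANY);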
+
+/*
+ * bpf_probe_read
+ *
+ * 	For tracing programs, safely attempt to read *size* bytes from
+ * 	kernel space address *unsafe_ptr* and store the data in *dst*.
+ *
+ * 	Generally, use **bpf_probe_read_user**\ () or
+ * 	**bpf_probe_read_kernel**\ () instead.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_probe_read)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 4;
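A sketch of typical tracing usage; unsafe_ptr stands in for some kernel address obtained from the probe context:

	__u64 word = 0;
	/* unsafe_ptr: assumed kernel address taken from the probe context */
	if (bpf_probe_read(&word, sizeof(word), unsafe_ptr) < 0)
		return 0; /* address not readable */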
+
+/*
+ * bpf_ktime_get_ns
+ *
+ * 	Return the time elapsed since system boot, in nanoseconds.
+ * 	Does not include time the system was suspended.
+ * 	See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
+ *
+ * Returns
+ * 	Current *ktime*.
+ */
+static __u64 (*bpf_ktime_get_ns)(void) = (void *) 5;
+
+/*
+ * bpf_trace_printk
+ *
+ * 	This helper is a "printk()-like" facility for debugging. It
+ * 	prints a message defined by format *fmt* (of size *fmt_size*)
+ * 	to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 	available. It can take up to three additional **u64**
+ * 	arguments (as for all eBPF helpers, the total number of arguments is
+ * 	limited to five).
+ *
+ * 	Each time the helper is called, it appends a line to the trace.
+ * 	Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
+ * 	open; use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
+ * 	The format of the trace is customizable, and the exact output
+ * 	one will get depends on the options set in
+ * 	*\/sys/kernel/debug/tracing/trace_options* (see also the
+ * 	*README* file under the same directory). However, it usually
+ * 	defaults to something like:
+ *
+ * 	::
+ *
+ * 		telnet-470   [001] .N.. 419421.045894: 0x00000001: <formatted msg>
+ *
+ * 	In the above:
+ *
+ * 		* ``telnet`` is the name of the current task.
+ * 		* ``470`` is the PID of the current task.
+ * 		* ``001`` is the CPU number on which the task is
+ * 		  running.
+ * 		* In ``.N..``, each character refers to a set of
+ * 		  options (whether irqs are enabled, scheduling
+ * 		  options, whether hard/softirqs are running, level of
+ * 		  preempt_disabled respectively). **N** means that
+ * 		  **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
+ * 		  are set.
+ * 		* ``419421.045894`` is a timestamp.
+ * 		* ``0x00000001`` is a fake value used by BPF for the
+ * 		  instruction pointer register.
+ * 		* ``<formatted msg>`` is the message formatted with
+ * 		  *fmt*.
+ *
+ * 	The conversion specifiers supported by *fmt* are similar, but
+ * 	more limited than for printk(). They are **%d**, **%i**,
+ * 	**%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
+ * 	**%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
+ * 	of field, padding with zeroes, etc.) is available, and the
+ * 	helper will return **-EINVAL** (but print nothing) if it
+ * 	encounters an unknown specifier.
+ *
+ * 	Also, note that **bpf_trace_printk**\ () is slow, and should
+ * 	only be used for debugging purposes. For this reason, a notice
+ * 	block (spanning several lines) is printed to kernel logs and
+ * 	states that the helper should not be used "for production use"
+ * 	the first time this helper is used (or more precisely, when
+ * 	**trace_printk**\ () buffers are allocated). For passing values
+ * 	to user space, perf events should be preferred.
+ *
+ * Returns
+ * 	The number of bytes written to the buffer, or a negative error
+ * 	in case of failure.
+ */
+static long (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) = (void *) 6;
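Since the format string must live on the BPF stack, the usual pattern looks like this (a sketch):

	char fmt[] = "pid=%d\n";
	bpf_trace_printk(fmt, sizeof(fmt), bpf_get_current_pid_tgid() >> 32);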
+
+/*
+ * bpf_get_prandom_u32
+ *
+ * 	Get a pseudo-random number.
+ *
+ * 	From a security point of view, this helper uses its own
+ * 	pseudo-random internal state, and cannot be used to infer the
+ * 	seed of other random functions in the kernel. However, it is
+ * 	essential to note that the generator used by the helper is not
+ * 	cryptographically secure.
+ *
+ * Returns
+ * 	A random 32-bit unsigned value.
+ */
+static __u32 (*bpf_get_prandom_u32)(void) = (void *) 7;
+
+/*
+ * bpf_get_smp_processor_id
+ *
+ * 	Get the SMP (symmetric multiprocessing) processor id. Note that
+ * 	all programs run with preemption disabled, which means that the
+ * 	SMP processor id is stable during all the execution of the
+ * 	program.
+ *
+ * Returns
+ * 	The SMP id of the processor running the program.
+ */
+static __u32 (*bpf_get_smp_processor_id)(void) = (void *) 8;
+
+/*
+ * bpf_skb_store_bytes
+ *
+ * 	Store *len* bytes from address *from* into the packet
+ * 	associated to *skb*, at *offset*. *flags* are a combination of
+ * 	**BPF_F_RECOMPUTE_CSUM** (automatically recompute the
+ * 	checksum for the packet after storing the bytes) and
+ * 	**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ * 	**->swhash** and *skb*\ **->l4hash** to 0).
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len, __u64 flags) = (void *) 9;
+
+/*
+ * bpf_l3_csum_replace
+ *
+ * 	Recompute the layer 3 (e.g. IP) checksum for the packet
+ * 	associated to *skb*. Computation is incremental, so the helper
+ * 	must know the former value of the header field that was
+ * 	modified (*from*), the new value of this field (*to*), and the
+ * 	number of bytes (2 or 4) for this field, stored in *size*.
+ * 	Alternatively, it is possible to store the difference between
+ * 	the previous and the new values of the header field in *to*, by
+ * 	setting *from* and *size* to 0. For both methods, *offset*
+ * 	indicates the location of the IP checksum within the packet.
+ *
+ * 	This helper works in combination with **bpf_csum_diff**\ (),
+ * 	which does not update the checksum in-place, but offers more
+ * 	flexibility and can handle sizes larger than 2 or 4 for the
+ * 	checksum to update.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_l3_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 size) = (void *) 10;
+
+/*
+ * bpf_l4_csum_replace
+ *
+ * 	Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
+ * 	packet associated to *skb*. Computation is incremental, so the
+ * 	helper must know the former value of the header field that was
+ * 	modified (*from*), the new value of this field (*to*), and the
+ * 	number of bytes (2 or 4) for this field, stored on the lowest
+ * 	four bits of *flags*. Alternatively, it is possible to store
+ * 	the difference between the previous and the new values of the
+ * 	header field in *to*, by setting *from* and the four lowest
+ * 	bits of *flags* to 0. For both methods, *offset* indicates the
+ * 	location of the IP checksum within the packet. In addition to
+ * 	the size of the field, *flags* can be added (bitwise OR) actual
+ * 	flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left
+ * 	untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
+ * 	for updates resulting in a null checksum the value is set to
+ * 	**CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
+ * 	the checksum is to be computed against a pseudo-header.
+ *
+ * 	This helper works in combination with **bpf_csum_diff**\ (),
+ * 	which does not update the checksum in-place, but offers more
+ * 	flexibility and can handle sizes larger than 2 or 4 for the
+ * 	checksum to update.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_l4_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 flags) = (void *) 11;
+
+/*
+ * bpf_tail_call
+ *
+ * 	This special helper is used to trigger a "tail call", or in
+ * 	other words, to jump into another eBPF program. The same stack
+ * 	frame is used (but values on stack and in registers for the
+ * 	caller are not accessible to the callee). This mechanism allows
+ * 	for program chaining, either for raising the maximum number of
+ * 	available eBPF instructions, or to execute given programs in
+ * 	conditional blocks. For security reasons, there is an upper
+ * 	limit to the number of successive tail calls that can be
+ * 	performed.
+ *
+ * 	Upon call of this helper, the program attempts to jump into a
+ * 	program referenced at index *index* in *prog_array_map*, a
+ * 	special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
+ * 	*ctx*, a pointer to the context.
+ *
+ * 	If the call succeeds, the kernel immediately runs the first
+ * 	instruction of the new program. This is not a function call,
+ * 	and it never returns to the previous program. If the call
+ * 	fails, then the helper has no effect, and the caller continues
+ * 	to run its subsequent instructions. A call can fail if the
+ * 	destination program for the jump does not exist (i.e. *index* is
+ * 	greater than or equal to the number of entries in *prog_array_map*), or
+ * 	if the maximum number of tail calls has been reached for this
+ * 	chain of programs. This limit is defined in the kernel by the
+ * 	macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
+ * 	which is currently set to 32.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) = (void *) 12;
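Usage sketch, assuming a **BPF_MAP_TYPE_PROG_ARRAY** map named jmp_table defined in the same object:

	/* jmp_table: assumed BPF_MAP_TYPE_PROG_ARRAY map in this object */
	bpf_tail_call(ctx, &jmp_table, 1);
	/* only reached if the tail call failed: empty slot, bad index,
	   or the tail-call limit was hit */
	return 0;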
+
+/*
+ * bpf_clone_redirect
+ *
+ * 	Clone and redirect the packet associated to *skb* to another
+ * 	net device of index *ifindex*. Both ingress and egress
+ * 	interfaces can be used for redirection. The **BPF_F_INGRESS**
+ * 	value in *flags* is used to make the distinction (ingress path
+ * 	is selected if the flag is present, egress path otherwise).
+ * 	This is the only flag supported for now.
+ *
+ * 	In comparison with **bpf_redirect**\ () helper,
+ * 	**bpf_clone_redirect**\ () has the associated cost of
+ * 	duplicating the packet buffer, but this can be executed out of
+ * 	the eBPF program. Conversely, **bpf_redirect**\ () is more
+ * 	efficient, but it is handled through an action code where the
+ * 	redirection happens only after the eBPF program has returned.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_clone_redirect)(struct __sk_buff *skb, __u32 ifindex, __u64 flags) = (void *) 13;
+
+/*
+ * bpf_get_current_pid_tgid
+ *
+ *
+ * Returns
+ * 	A 64-bit integer containing the current tgid and pid, and
+ * 	created as such:
+ * 	*current_task*\ **->tgid << 32 \|**
+ * 	*current_task*\ **->pid**.
+ */
+static __u64 (*bpf_get_current_pid_tgid)(void) = (void *) 14;
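Splitting the two halves back out is a single shift and a truncation (a sketch):

	__u64 id   = bpf_get_current_pid_tgid();
	__u32 tgid = id >> 32;  /* the "PID" as seen by userspace (process) */
	__u32 pid  = (__u32)id; /* the thread id */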
+
+/*
+ * bpf_get_current_uid_gid
+ *
+ *
+ * Returns
+ * 	A 64-bit integer containing the current GID and UID, and
+ * 	created as such: *current_gid* **<< 32 \|** *current_uid*.
+ */
+static __u64 (*bpf_get_current_uid_gid)(void) = (void *) 15;
+
+/*
+ * bpf_get_current_comm
+ *
+ * 	Copy the **comm** attribute of the current task into *buf* of
+ * 	*size_of_buf*. The **comm** attribute contains the name of
+ * 	the executable (excluding the path) for the current task. The
+ * 	*size_of_buf* must be strictly positive. On success, the
+ * 	helper makes sure that the *buf* is NUL-terminated. On failure,
+ * 	it is filled with zeroes.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_get_current_comm)(void *buf, __u32 size_of_buf) = (void *) 16;
+
+/*
+ * bpf_get_cgroup_classid
+ *
+ * 	Retrieve the classid for the current task, i.e. for the net_cls
+ * 	cgroup to which *skb* belongs.
+ *
+ * 	This helper can be used on TC egress path, but not on ingress.
+ *
+ * 	The net_cls cgroup provides an interface to tag network packets
+ * 	based on a user-provided identifier for all traffic coming from
+ * 	the tasks belonging to the related cgroup. See also the related
+ * 	kernel documentation, available from the Linux sources in file
+ * 	*Documentation/admin-guide/cgroup-v1/net_cls.rst*.
+ *
+ * 	The Linux kernel has two versions for cgroups: there are
+ * 	cgroups v1 and cgroups v2. Both are available to users, who can
+ * 	use a mixture of them, but note that the net_cls cgroup is for
+ * 	cgroup v1 only. This makes it incompatible with BPF programs
+ * 	run on cgroups, which is a cgroup-v2-only feature (a socket can
+ * 	only hold data for one version of cgroups at a time).
+ *
+ * 	This helper is only available if the kernel was compiled with
+ * 	the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
+ * 	"**y**" or to "**m**".
+ *
+ * Returns
+ * 	The classid, or 0 for the default unconfigured classid.
+ */
+static __u32 (*bpf_get_cgroup_classid)(struct __sk_buff *skb) = (void *) 17;
+
+/*
+ * bpf_skb_vlan_push
+ *
+ * 	Push a *vlan_tci* (VLAN tag control information) of protocol
+ * 	*vlan_proto* to the packet associated to *skb*, then update
+ * 	the checksum. Note that if *vlan_proto* is different from
+ * 	**ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ * 	be **ETH_P_8021Q**.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_vlan_push)(struct __sk_buff *skb, __be16 vlan_proto, __u16 vlan_tci) = (void *) 18;
+
+/*
+ * bpf_skb_vlan_pop
+ *
+ * 	Pop a VLAN header from the packet associated to *skb*.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_vlan_pop)(struct __sk_buff *skb) = (void *) 19;
+
+/*
+ * bpf_skb_get_tunnel_key
+ *
+ * 	Get tunnel metadata. This helper takes a pointer *key* to an
+ * 	empty **struct bpf_tunnel_key** of **size**, that will be
+ * 	filled with tunnel metadata for the packet associated to *skb*.
+ * 	The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
+ * 	indicates that the tunnel is based on IPv6 protocol instead of
+ * 	IPv4.
+ *
+ * 	The **struct bpf_tunnel_key** is an object that generalizes the
+ * 	principal parameters used by various tunneling protocols into a
+ * 	single struct. This way, it can be used to easily make a
+ * 	decision based on the contents of the encapsulation header,
+ * 	"summarized" in this struct. In particular, it holds the IP
+ * 	address of the remote end (IPv4 or IPv6, depending on the case)
+ * 	in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
+ * 	this struct exposes the *key*\ **->tunnel_id**, which is
+ * 	generally mapped to a VNI (Virtual Network Identifier), making
+ * 	it programmable together with the **bpf_skb_set_tunnel_key**\
+ * 	() helper.
+ *
+ * 	Let's imagine that the following code is part of a program
+ * 	attached to the TC ingress interface, on one end of a GRE
+ * 	tunnel, and is supposed to filter out all messages coming from
+ * 	remote ends with IPv4 address other than 10.0.0.1:
+ *
+ * 	::
+ *
+ * 		int ret;
+ * 		struct bpf_tunnel_key key = {};
+ * 		
+ * 		ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ * 		if (ret < 0)
+ * 			return TC_ACT_SHOT;	// drop packet
+ * 		
+ * 		if (key.remote_ipv4 != 0x0a000001)
+ * 			return TC_ACT_SHOT;	// drop packet
+ * 		
+ * 		return TC_ACT_OK;		// accept packet
+ *
+ * 	This interface can also be used with all encapsulation devices
+ * 	that can operate in "collect metadata" mode: instead of having
+ * 	one network device per specific configuration, the "collect
+ * 	metadata" mode only requires a single device where the
+ * 	configuration can be extracted from this helper.
+ *
+ * 	This can be used together with various tunnels such as VXLan,
+ * 	Geneve, GRE or IP in IP (IPIP).
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_get_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 20;
+
+/*
+ * bpf_skb_set_tunnel_key
+ *
+ * 	Populate tunnel metadata for the packet associated to *skb*. The
+ * 	tunnel metadata is set to the contents of *key*, of *size*. The
+ * 	*flags* can be set to a combination of the following values:
+ *
+ * 	**BPF_F_TUNINFO_IPV6**
+ * 		Indicate that the tunnel is based on IPv6 protocol
+ * 		instead of IPv4.
+ * 	**BPF_F_ZERO_CSUM_TX**
+ * 		For IPv4 packets, add a flag to tunnel metadata
+ * 		indicating that checksum computation should be skipped
+ * 		and checksum set to zeroes.
+ * 	**BPF_F_DONT_FRAGMENT**
+ * 		Add a flag to tunnel metadata indicating that the
+ * 		packet should not be fragmented.
+ * 	**BPF_F_SEQ_NUMBER**
+ * 		Add a flag to tunnel metadata indicating that a
+ * 		sequence number should be added to tunnel header before
+ * 		sending the packet. This flag was added for GRE
+ * 		encapsulation, but might be used with other protocols
+ * 		as well in the future.
+ *
+ * 	Here is a typical usage on the transmit path:
+ *
+ * 	::
+ *
+ * 		struct bpf_tunnel_key key;
+ * 		     populate key ...
+ * 		bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ * 		bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
+ *
+ * 	See also the description of the **bpf_skb_get_tunnel_key**\ ()
+ * 	helper for additional information.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_set_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 21;
+
+/*
+ * bpf_perf_event_read
+ *
+ * 	Read the value of a perf event counter. This helper relies on a
+ * 	*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
+ * 	the perf event counter is selected when *map* is updated with
+ * 	perf event file descriptors. The *map* is an array whose size
+ * 	is the number of available CPUs, and each cell contains a value
+ * 	relative to one CPU. The value to retrieve is indicated by
+ * 	*flags*, that contains the index of the CPU to look up, masked
+ * 	with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 	**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 	current CPU should be retrieved.
+ *
+ * 	Note that before Linux 4.13, only hardware perf event can be
+ * 	retrieved.
+ *
+ * 	Also, be aware that the newer helper
+ * 	**bpf_perf_event_read_value**\ () is recommended over
+ * 	**bpf_perf_event_read**\ () in general. The latter has some ABI
+ * 	quirks where error and counter value are used as a return code
+ * 	(which is wrong to do since ranges may overlap). This issue is
+ * 	fixed with **bpf_perf_event_read_value**\ (), which at the same
+ * 	time provides more features over the **bpf_perf_event_read**\
+ * 	() interface. Please refer to the description of
+ * 	**bpf_perf_event_read_value**\ () for details.
+ *
+ * Returns
+ * 	The value of the perf event counter read from the map, or a
+ * 	negative error code in case of failure.
+ */
+static __u64 (*bpf_perf_event_read)(void *map, __u64 flags) = (void *) 22;
+
+/*
+ * bpf_redirect
+ *
+ * 	Redirect the packet to another net device of index *ifindex*.
+ * 	This helper is somewhat similar to **bpf_clone_redirect**\
+ * 	(), except that the packet is not cloned, which provides
+ * 	increased performance.
+ *
+ * 	Except for XDP, both ingress and egress interfaces can be used
+ * 	for redirection. The **BPF_F_INGRESS** value in *flags* is used
+ * 	to make the distinction (ingress path is selected if the flag
+ * 	is present, egress path otherwise). Currently, XDP only
+ * 	supports redirection to the egress interface, and accepts no
+ * 	flag at all.
+ *
+ * 	The same effect can also be attained with the more generic
+ * 	**bpf_redirect_map**\ (), which uses a BPF map to store the
+ * 	redirect target instead of providing it directly to the helper.
+ *
+ * Returns
+ * 	For XDP, the helper returns **XDP_REDIRECT** on success or
+ * 	**XDP_ABORTED** on error. For other program types, the values
+ * 	are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ * 	error.
+ */
+static long (*bpf_redirect)(__u32 ifindex, __u64 flags) = (void *) 23;
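In an XDP program, the return value of the helper is itself the program's verdict (a sketch; ifindex is illustrative):

	return bpf_redirect(ifindex, 0); /* ifindex: assumed target device index */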
+
+/*
+ * bpf_get_route_realm
+ *
+ * 	Retrieve the realm of the route, that is to say the
+ * 	**tclassid** field of the destination for the *skb*. The
+ * 	identifier retrieved is a user-provided tag, similar to the
+ * 	one used with the net_cls cgroup (see description for
+ * 	**bpf_get_cgroup_classid**\ () helper), but here this tag is
+ * 	held by a route (a destination entry), not by a task.
+ *
+ * 	Retrieving this identifier works with the clsact TC egress hook
+ * 	(see also **tc-bpf(8)**), or alternatively on conventional
+ * 	classful egress qdiscs, but not on TC ingress path. In case of
+ * 	clsact TC egress hook, this has the advantage that, internally,
+ * 	the destination entry has not been dropped yet in the transmit
+ * 	path. Therefore, the destination entry does not need to be
+ * 	artificially held via **netif_keep_dst**\ () for a classful
+ * 	qdisc until the *skb* is freed.
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	**CONFIG_IP_ROUTE_CLASSID** configuration option.
+ *
+ * Returns
+ * 	The realm of the route for the packet associated to *skb*, or 0
+ * 	if none was found.
+ */
+static __u32 (*bpf_get_route_realm)(struct __sk_buff *skb) = (void *) 24;
+
+/*
+ * bpf_perf_event_output
+ *
+ * 	Write raw *data* blob into a special BPF perf event held by
+ * 	*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 	event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 	as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 	**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 	The *flags* are used to indicate the index in *map* for which
+ * 	the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 	Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 	to indicate that the index of the current CPU core should be
+ * 	used.
+ *
+ * 	The value to write, of *size*, is passed through eBPF stack and
+ * 	pointed by *data*.
+ *
+ * 	The context of the program *ctx* needs also be passed to the
+ * 	helper.
+ *
+ * 	In user space, a program wishing to read the values needs to
+ * 	call **perf_event_open**\ () on the perf event (either for
+ * 	one or for all CPUs) and to store the file descriptor into the
+ * 	*map*. This must be done before the eBPF program can send data
+ * 	into it. An example is available in file
+ * 	*samples/bpf/trace_output_user.c* in the Linux kernel source
+ * 	tree (the eBPF program counterpart is in
+ * 	*samples/bpf/trace_output_kern.c*).
+ *
+ * 	**bpf_perf_event_output**\ () achieves better performance
+ * 	than **bpf_trace_printk**\ () for sharing data with user
+ * 	space, and is much better suited to streaming data from eBPF
+ * 	programs.
+ *
+ * 	Note that this helper is not restricted to tracing use cases
+ * 	and can be used with programs attached to TC or XDP as well,
+ * 	where it allows for passing data to user space listeners. Data
+ * 	can be:
+ *
+ * 	* Only custom structs,
+ * 	* Only the packet payload, or
+ * 	* A combination of both.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_perf_event_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 25;
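A minimal sketch of pushing one sample, assuming a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map named events and an illustrative event layout:

	/* events: assumed BPF_MAP_TYPE_PERF_EVENT_ARRAY map; layout illustrative */
	struct event { __u32 pid; } ev = {
		.pid = bpf_get_current_pid_tgid() >> 32,
	};
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &ev, sizeof(ev));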
+
+/*
+ * bpf_skb_load_bytes
+ *
+ * 	This helper was provided as an easy way to load data from a
+ * 	packet. It can be used to load *len* bytes from *offset* from
+ * 	the packet associated to *skb*, into the buffer pointed by
+ * 	*to*.
+ *
+ * 	Since Linux 4.7, usage of this helper has mostly been replaced
+ * 	by "direct packet access", enabling packet data to be
+ * 	manipulated with *skb*\ **->data** and *skb*\ **->data_end**
+ * 	pointing respectively to the first byte of packet data and to
+ * 	the byte after the last byte of packet data. However, it
+ * 	remains useful if one wishes to read large quantities of data
+ * 	at once from a packet into the eBPF stack.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_load_bytes)(const void *skb, __u32 offset, void *to, __u32 len) = (void *) 26;
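It remains convenient for bulk copies into the stack (a sketch):

	__u8 eth[14]; /* Ethernet header */
	if (bpf_skb_load_bytes(skb, 0, eth, sizeof(eth)) < 0)
		return 0; /* out of bounds or non-linear data */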
+
+/*
+ * bpf_get_stackid
+ *
+ * 	Walk a user or a kernel stack and return its id. To achieve
+ * 	this, the helper needs *ctx*, which is a pointer to the context
+ * 	on which the tracing program is executed, and a pointer to a
+ * 	*map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ * 	The last argument, *flags*, holds the number of stack frames to
+ * 	skip (from 0 to 255), masked with
+ * 	**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 	a combination of the following flags:
+ *
+ * 	**BPF_F_USER_STACK**
+ * 		Collect a user space stack instead of a kernel stack.
+ * 	**BPF_F_FAST_STACK_CMP**
+ * 		Compare stacks by hash only.
+ * 	**BPF_F_REUSE_STACKID**
+ * 		If two different stacks hash into the same *stackid*,
+ * 		discard the old one.
+ *
+ * 	The stack id retrieved is a 32 bit long integer handle which
+ * 	can be further combined with other data (including other stack
+ * 	ids) and used as a key into maps. This can be useful for
+ * 	generating a variety of graphs (such as flame graphs or off-cpu
+ * 	graphs).
+ *
+ * 	For walking a stack, this helper is an improvement over
+ * 	**bpf_probe_read**\ (), which can be used with unrolled loops
+ * 	but is not efficient and consumes a lot of eBPF instructions.
+ * 	Instead, **bpf_get_stackid**\ () can collect up to
+ * 	**PERF_MAX_STACK_DEPTH** kernel and user frames. Note that
+ * 	this limit can be controlled with the **sysctl** program, and
+ * 	that it should be manually increased in order to profile long
+ * 	user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 	::
+ *
+ * 		# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * Returns
+ * 	The positive or null stack id on success, or a negative error
+ * 	in case of failure.
+ */
+static long (*bpf_get_stackid)(void *ctx, void *map, __u64 flags) = (void *) 27;
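Usage sketch, assuming a **BPF_MAP_TYPE_STACK_TRACE** map named stacks in the same object:

	/* stacks: assumed BPF_MAP_TYPE_STACK_TRACE map in this object */
	long stack_id = bpf_get_stackid(ctx, &stacks, BPF_F_USER_STACK);
	if (stack_id < 0)
		return 0;
	/* stack_id can now be used as (part of) a key into other maps */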
+
+/*
+ * bpf_csum_diff
+ *
+ * 	Compute a checksum difference, from the raw buffer pointed by
+ * 	*from*, of length *from_size* (that must be a multiple of 4),
+ * 	towards the raw buffer pointed by *to*, of size *to_size*
+ * 	(same remark). An optional *seed* can be added to the value
+ * 	(this can be cascaded, the seed may come from a previous call
+ * 	to the helper).
+ *
+ * 	This is flexible enough to be used in several ways:
+ *
+ * 	* With *from_size* == 0, *to_size* > 0 and *seed* set to
+ * 	  checksum, it can be used when pushing new data.
+ * 	* With *from_size* > 0, *to_size* == 0 and *seed* set to
+ * 	  checksum, it can be used when removing data from a packet.
+ * 	* With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ * 	  can be used to compute a diff. Note that *from_size* and
+ * 	  *to_size* do not need to be equal.
+ *
+ * 	This helper can be used in combination with
+ * 	**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
+ * 	which one can feed in the difference computed with
+ * 	**bpf_csum_diff**\ ().
+ *
+ * Returns
+ * 	The checksum result, or a negative error code in case of
+ * 	failure.
+ */
+static __s64 (*bpf_csum_diff)(__be32 *from, __u32 from_size, __be32 *to, __u32 to_size, __wsum seed) = (void *) 28;
+
+/*
+ * bpf_skb_get_tunnel_opt
+ *
+ * 	Retrieve tunnel options metadata for the packet associated to
+ * 	*skb*, and store the raw tunnel option data to the buffer *opt*
+ * 	of *size*.
+ *
+ * 	This helper can be used with encapsulation devices that can
+ * 	operate in "collect metadata" mode (please refer to the related
+ * 	note in the description of **bpf_skb_get_tunnel_key**\ () for
+ * 	more details). A particular example where this can be used is
+ * 	in combination with the Geneve encapsulation protocol, where it
+ * 	allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper)
+ * 	and retrieving arbitrary TLVs (Type-Length-Value headers) from
+ * 	the eBPF program. This allows for full customization of these
+ * 	headers.
+ *
+ * Returns
+ * 	The size of the option data retrieved.
+ */
+static long (*bpf_skb_get_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 29;
+
+/*
+ * bpf_skb_set_tunnel_opt
+ *
+ * 	Set tunnel options metadata for the packet associated to *skb*
+ * 	to the option data contained in the raw buffer *opt* of *size*.
+ *
+ * 	See also the description of the **bpf_skb_get_tunnel_opt**\ ()
+ * 	helper for additional information.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_set_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 30;
+
+/*
+ * bpf_skb_change_proto
+ *
+ * 	Change the protocol of the *skb* to *proto*. Currently
+ * 	supported are transitions from IPv4 to IPv6, and from IPv6 to
+ * 	IPv4. The helper takes care of the groundwork for the
+ * 	transition, including resizing the socket buffer. The eBPF
+ * 	program is expected to fill the new headers, if any, via
+ * 	**skb_store_bytes**\ () and to recompute the checksums with
+ * 	**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
+ * 	(). The main case for this helper is to perform NAT64
+ * 	operations out of an eBPF program.
+ *
+ * 	Internally, the GSO type is marked as dodgy so that headers are
+ * 	checked and segments are recalculated by the GSO/GRO engine.
+ * 	The size for GSO target is adapted as well.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_change_proto)(struct __sk_buff *skb, __be16 proto, __u64 flags) = (void *) 31;
+
+/*
+ * bpf_skb_change_type
+ *
+ * 	Change the packet type for the packet associated to *skb*. This
+ * 	comes down to setting *skb*\ **->pkt_type** to *type*, except
+ * 	the eBPF program does not have a write access to *skb*\
+ * 	**->pkt_type** beside this helper. Using a helper here allows
+ * 	for graceful handling of errors.
+ *
+ * 	The major use case is to change incoming *skb*s to
+ * 	**PACKET_HOST** in a programmatic way instead of having to
+ * 	recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
+ * 	example.
+ *
+ * 	Note that *type* only allows certain values. At this time, they
+ * 	are:
+ *
+ * 	**PACKET_HOST**
+ * 		Packet is for us.
+ * 	**PACKET_BROADCAST**
+ * 		Send packet to all.
+ * 	**PACKET_MULTICAST**
+ * 		Send packet to group.
+ * 	**PACKET_OTHERHOST**
+ * 		Send packet to someone else.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_change_type)(struct __sk_buff *skb, __u32 type) = (void *) 32;
+
+/*
+ * bpf_skb_under_cgroup
+ *
+ * 	Check whether *skb* is a descendant of the cgroup2 held by
+ * 	*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ *
+ * Returns
+ * 	The return value depends on the result of the test, and can be:
+ *
+ * 	* 0, if the *skb* failed the cgroup2 descendant test.
+ * 	* 1, if the *skb* succeeded the cgroup2 descendant test.
+ * 	* A negative error code, if an error occurred.
+ */
+static long (*bpf_skb_under_cgroup)(struct __sk_buff *skb, void *map, __u32 index) = (void *) 33;
+
+/*
+ * bpf_get_hash_recalc
+ *
+ * 	Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ * 	not set, in particular if the hash was cleared due to mangling,
+ * 	recompute this hash. Later accesses to the hash can be done
+ * 	directly with *skb*\ **->hash**.
+ *
+ * 	Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * 	prototype with **bpf_skb_change_proto**\ (), or calling
+ * 	**bpf_skb_store_bytes**\ () with the
+ * 	**BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * 	the hash and to trigger a new computation for the next call to
+ * 	**bpf_get_hash_recalc**\ ().
+ *
+ * Returns
+ * 	The 32-bit hash.
+ */
+static __u32 (*bpf_get_hash_recalc)(struct __sk_buff *skb) = (void *) 34;
+
+/*
+ * bpf_get_current_task
+ *
+ *
+ * Returns
+ * 	A pointer to the current task struct.
+ */
+static __u64 (*bpf_get_current_task)(void) = (void *) 35;
+
+/*
+ * bpf_probe_write_user
+ *
+ * 	Attempt in a safe way to write *len* bytes from the buffer
+ * 	*src* to *dst* in memory. It only works for threads that are in
+ * 	user context, and *dst* must be a valid user space address.
+ *
+ * 	This helper should not be used to implement any kind of
+ * 	security mechanism because of TOC-TOU attacks, but rather to
+ * 	debug, divert, and manipulate execution of semi-cooperative
+ * 	processes.
+ *
+ * 	Keep in mind that this feature is meant for experiments, and it
+ * 	has a risk of crashing the system and running programs.
+ * 	Therefore, when an eBPF program using this helper is attached,
+ * 	a warning including PID and process name is printed to kernel
+ * 	logs.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_probe_write_user)(void *dst, const void *src, __u32 len) = (void *) 36;
+
+/*
+ * bpf_current_task_under_cgroup
+ *
+ * 	Check whether the probe is being run in the context of a given
+ * 	subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * 	*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ *
+ * Returns
+ * 	The return value depends on the result of the test, and can be:
+ *
+ * 	* 1, if the current task belongs to the cgroup2.
+ * 	* 0, if the current task does not belong to the cgroup2.
+ * 	* A negative error code, if an error occurred.
+ */
+static long (*bpf_current_task_under_cgroup)(void *map, __u32 index) = (void *) 37;
+
+/*
+ * bpf_skb_change_tail
+ *
+ * 	Resize (trim or grow) the packet associated to *skb* to the
+ * 	new *len*. The *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	The basic idea is that the helper performs the needed work to
+ * 	change the size of the packet, then the eBPF program rewrites
+ * 	the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * 	**bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ ()
+ * 	and others. This helper is a slow path utility intended for
+ * 	replies with control messages. And because it is targeted for
+ * 	slow path, the helper itself can afford to be slow: it
+ * 	implicitly linearizes, unclones and drops offloads from the
+ * 	*skb*.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_change_tail)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 38;
+
+/*
+ * bpf_skb_pull_data
+ *
+ * 	Pull in non-linear data in case the *skb* is non-linear and not
+ * 	all of *len* are part of the linear section. Make *len* bytes
+ * 	from *skb* readable and writable. If a zero value is passed for
+ * 	*len*, then the whole length of the *skb* is pulled.
+ *
+ * 	This helper is only needed for reading and writing with direct
+ * 	packet access.
+ *
+ * 	For direct packet access, testing that offsets to access
+ * 	are within packet boundaries (test on *skb*\ **->data_end**) is
+ * 	susceptible to fail if offsets are invalid, or if the requested
+ * 	data is in non-linear parts of the *skb*. On failure the
+ * 	program can just bail out, or in the case of a non-linear
+ * 	buffer, use a helper to make the data available. The
+ * 	**bpf_skb_load_bytes**\ () helper is a first solution to access
+ * 	the data. Another one consists in using **bpf_skb_pull_data**\ ()
+ * 	to pull in the non-linear parts once, then retesting and
+ * 	eventually accessing the data.
+ *
+ * 	At the same time, this also makes sure the *skb* is uncloned,
+ * 	which is a necessary condition for direct write. As this needs
+ * 	to be an invariant for the write part only, the verifier
+ * 	detects writes and adds a prologue that is calling
+ * 	**bpf_skb_pull_data**\ () to effectively unclone the *skb* from
+ * 	the very beginning in case it is indeed cloned.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_pull_data)(struct __sk_buff *skb, __u32 len) = (void *) 39;
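+
+/*
+ * Editorial example (a sketch, not part of the generated helper list):
+ * the "pull, then retest" pattern described above. The 64-byte window
+ * and the function name are illustrative; assumes <linux/bpf.h> is
+ * included first for the full struct __sk_buff definition.
+ */
+static inline int example_pull_and_read(struct __sk_buff *skb)
+{
+	if (bpf_skb_pull_data(skb, 64) < 0)
+		return 0; /* could not make 64 bytes linear */
+
+	/* Re-derive and retest the pointers: the pull may have moved the buffer. */
+	void *data = (void *)(long)skb->data;
+	void *data_end = (void *)(long)skb->data_end;
+	if (data + 64 > data_end)
+		return 0;
+
+	return *(__u8 *)data; /* the first byte is now safely readable */
+}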
+
+/*
+ * bpf_csum_update
+ *
+ * 	Add the checksum *csum* into *skb*\ **->csum** in case the
+ * 	driver has supplied a checksum for the entire packet into that
+ * 	field. Return an error otherwise. This helper is intended to be
+ * 	used in combination with **bpf_csum_diff**\ (), in particular
+ * 	when the checksum needs to be updated after data has been
+ * 	written into the packet through direct packet access.
+ *
+ * Returns
+ * 	The checksum on success, or a negative error code in case of
+ * 	failure.
+ */
+static __s64 (*bpf_csum_update)(struct __sk_buff *skb, __wsum csum) = (void *) 40;
+
+/*
+ * bpf_set_hash_invalid
+ *
+ * 	Invalidate the current *skb*\ **->hash**. It can be used after
+ * 	mangling on headers through direct packet access, in order to
+ * 	indicate that the hash is outdated and to trigger a
+ * 	recalculation the next time the kernel tries to access this
+ * 	hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ *
+ */
+static void (*bpf_set_hash_invalid)(struct __sk_buff *skb) = (void *) 41;
+
+/*
+ * bpf_get_numa_node_id
+ *
+ * 	Return the id of the current NUMA node. The primary use case
+ * 	for this helper is the selection of sockets for the local NUMA
+ * 	node, when the program is attached to sockets using the
+ * 	**SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
+ * 	but the helper is also available to other eBPF program types,
+ * 	similarly to **bpf_get_smp_processor_id**\ ().
+ *
+ * Returns
+ * 	The id of current NUMA node.
+ */
+static long (*bpf_get_numa_node_id)(void) = (void *) 42;
+
+/*
+ * bpf_skb_change_head
+ *
+ * 	Grows headroom of packet associated to *skb* and adjusts the
+ * 	offset of the MAC header accordingly, adding *len* bytes of
+ * 	space. It automatically extends and reallocates memory as
+ * 	required.
+ *
+ * 	This helper can be used on a layer 3 *skb* to push a MAC header
+ * 	for redirection into a layer 2 device.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_change_head)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 43;
+
+/*
+ * bpf_xdp_adjust_head
+ *
+ * 	Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ * 	it is possible to use a negative value for *delta*. This helper
+ * 	can be used to prepare the packet for pushing or popping
+ * 	headers.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) 44;
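+
+/*
+ * Editorial example (sketch): grow headroom by 8 bytes to push a
+ * hypothetical header in front of an XDP packet, then retest the
+ * pointers. Assumes <linux/bpf.h> is included first for struct xdp_md;
+ * the XDP_* return codes are spelled out as literals.
+ */
+static inline int example_push_bytes(struct xdp_md *ctx)
+{
+	if (bpf_xdp_adjust_head(ctx, -8))
+		return 0; /* XDP_ABORTED */
+
+	void *data = (void *)(long)ctx->data;
+	void *data_end = (void *)(long)ctx->data_end;
+	if (data + 8 > data_end)
+		return 0; /* XDP_ABORTED */
+
+	__builtin_memset(data, 0, 8); /* the new header would be built here */
+	return 2; /* XDP_PASS */
+}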
+
+/*
+ * bpf_probe_read_str
+ *
+ * 	Copy a NUL terminated string from an unsafe kernel address
+ * 	*unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
+ * 	more details.
+ *
+ * 	Generally, use **bpf_probe_read_user_str**\ () or
+ * 	**bpf_probe_read_kernel_str**\ () instead.
+ *
+ * Returns
+ * 	On success, the strictly positive length of the string,
+ * 	including the trailing NUL character. On error, a negative
+ * 	value.
+ */
+static long (*bpf_probe_read_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 45;
+
+/*
+ * bpf_get_socket_cookie
+ *
+ * 	If the **struct sk_buff** pointed by *skb* has a known socket,
+ * 	retrieve the cookie (generated by the kernel) of this socket.
+ * 	If no cookie has been set yet, generate a new cookie. Once
+ * 	generated, the socket cookie remains stable for the life of the
+ * 	socket. This helper can be useful for monitoring per socket
+ * 	networking traffic statistics as it provides a global socket
+ * 	identifier that can be assumed unique.
+ *
+ * Returns
+ * 	An 8-byte long non-decreasing number on success, or 0 if the
+ * 	socket field is missing inside *skb*.
+ */
+static __u64 (*bpf_get_socket_cookie)(void *ctx) = (void *) 46;
+
+/*
+ * bpf_get_socket_uid
+ *
+ *
+ * Returns
+ * 	The owner UID of the socket associated to *skb*. If the socket
+ * 	is **NULL**, or if it is not a full socket (i.e. if it is a
+ * 	time-wait or a request socket instead), **overflowuid** value
+ * 	is returned (note that **overflowuid** might also be the actual
+ * 	UID value for the socket).
+ */
+static __u32 (*bpf_get_socket_uid)(struct __sk_buff *skb) = (void *) 47;
+
+/*
+ * bpf_set_hash
+ *
+ * 	Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * 	to value *hash*.
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48;
+
+/*
+ * bpf_setsockopt
+ *
+ * 	Emulate a call to **setsockopt()** on the socket associated to
+ * 	*bpf_socket*, which must be a full socket. The *level* at
+ * 	which the option resides and the name *optname* of the option
+ * 	must be specified, see **setsockopt(2)** for more information.
+ * 	The option value of length *optlen* is pointed by *optval*.
+ *
+ * 	*bpf_socket* should be one of the following:
+ *
+ * 	* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * 	* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * 	  and **BPF_CGROUP_INET6_CONNECT**.
+ *
+ * 	This helper actually implements a subset of **setsockopt()**.
+ * 	It supports the following *level*\ s:
+ *
+ * 	* **SOL_SOCKET**, which supports the following *optname*\ s:
+ * 	  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ * 	  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**,
+ * 	  **SO_BINDTODEVICE**, **SO_KEEPALIVE**.
+ * 	* **IPPROTO_TCP**, which supports the following *optname*\ s:
+ * 	  **TCP_CONGESTION**, **TCP_BPF_IW**,
+ * 	  **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
+ * 	  **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
+ * 	  **TCP_SYNCNT**, **TCP_USER_TIMEOUT**.
+ * 	* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 	* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 49;
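+
+/*
+ * Editorial example (sketch): switch a connection to the "reno"
+ * congestion control algorithm from a BPF_PROG_TYPE_SOCK_OPS program.
+ * The numeric constants stand in for IPPROTO_TCP (6) and
+ * TCP_CONGESTION (13) from the UAPI headers, to keep the sketch
+ * self-contained.
+ */
+static inline void example_set_cong(struct bpf_sock_ops *skops)
+{
+	char cc[] = "reno";
+
+	bpf_setsockopt(skops, 6 /* IPPROTO_TCP */, 13 /* TCP_CONGESTION */,
+		       cc, sizeof(cc));
+}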
+
+/*
+ * bpf_skb_adjust_room
+ *
+ * 	Grow or shrink the room for data in the packet associated to
+ * 	*skb* by *len_diff*, and according to the selected *mode*.
+ *
+ * 	By default, the helper will reset any offloaded checksum
+ * 	indicator of the skb to CHECKSUM_NONE. This can be avoided
+ * 	by the following flag:
+ *
+ * 	* **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded
+ * 	  checksum data of the skb to CHECKSUM_NONE.
+ *
+ * 	There are two supported modes at this time:
+ *
+ * 	* **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
+ * 	  (room space is added or removed below the layer 2 header).
+ *
+ * 	* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ * 	  (room space is added or removed below the layer 3 header).
+ *
+ * 	The following flags are supported at this time:
+ *
+ * 	* **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
+ * 	  Adjusting mss in this way is not allowed for datagrams.
+ *
+ * 	* **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**,
+ * 	  **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**:
+ * 	  Any new space is reserved to hold a tunnel header.
+ * 	  Configure skb offsets and other fields accordingly.
+ *
+ * 	* **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**,
+ * 	  **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**:
+ * 	  Use with ENCAP_L3 flags to further specify the tunnel type.
+ *
+ * 	* **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*):
+ * 	  Use with ENCAP_L3/L4 flags to further specify the tunnel
+ * 	  type; *len* is the length of the inner MAC header.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_adjust_room)(struct __sk_buff *skb, __s32 len_diff, __u32 mode, __u64 flags) = (void *) 50;
+
+/*
+ * bpf_redirect_map
+ *
+ * 	Redirect the packet to the endpoint referenced by *map* at
+ * 	index *key*. Depending on its type, this *map* can contain
+ * 	references to net devices (for forwarding packets through other
+ * 	ports), or to CPUs (for redirecting XDP frames to another CPU;
+ * 	but this is only implemented for native XDP (with driver
+ * 	support) as of this writing).
+ *
+ * 	The lower two bits of *flags* are used as the return code if
+ * 	the map lookup fails. This is so that the return value can be
+ * 	one of the XDP program return codes up to **XDP_TX**, as chosen
+ * 	by the caller. Any higher bits in the *flags* argument must be
+ * 	unset.
+ *
+ * 	See also **bpf_redirect**\ (), which only supports redirecting
+ * 	to an ifindex, but doesn't require a map to do so.
+ *
+ * Returns
+ * 	**XDP_REDIRECT** on success, or the value of the two lower bits
+ * 	of the *flags* argument on error.
+ */
+static long (*bpf_redirect_map)(void *map, __u32 key, __u64 flags) = (void *) 51;
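+
+/*
+ * Editorial example (sketch): redirect an XDP frame through a device
+ * map. `tx_ports` stands for a hypothetical BPF_MAP_TYPE_DEVMAP
+ * defined elsewhere; the lower two bits of *flags* select XDP_PASS (2)
+ * as the fallback if the lookup at the given index fails.
+ */
+static inline int example_redirect(struct xdp_md *ctx, void *tx_ports)
+{
+	(void)ctx; /* the frame being processed is redirected implicitly */
+	return bpf_redirect_map(tx_ports, 0, 2 /* fall back to XDP_PASS */);
+}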
+
+/*
+ * bpf_sk_redirect_map
+ *
+ * 	Redirect the packet to the socket referenced by *map* (of type
+ * 	**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 	egress interfaces can be used for redirection. The
+ * 	**BPF_F_INGRESS** value in *flags* is used to make the
+ * 	distinction (ingress path is selected if the flag is present,
+ * 	egress path otherwise). This is the only flag supported for now.
+ *
+ * Returns
+ * 	**SK_PASS** on success, or **SK_DROP** on error.
+ */
+static long (*bpf_sk_redirect_map)(struct __sk_buff *skb, void *map, __u32 key, __u64 flags) = (void *) 52;
+
+/*
+ * bpf_sock_map_update
+ *
+ * 	Add an entry to, or update a *map* referencing sockets. The
+ * 	*skops* is used as a new value for the entry associated to
+ * 	*key*. *flags* is one of:
+ *
+ * 	**BPF_NOEXIST**
+ * 		The entry for *key* must not exist in the map.
+ * 	**BPF_EXIST**
+ * 		The entry for *key* must already exist in the map.
+ * 	**BPF_ANY**
+ * 		No condition on the existence of the entry for *key*.
+ *
+ * 	If the *map* has eBPF programs (parser and verdict), those will
+ * 	be inherited by the socket being added. If the socket is
+ * 	already attached to eBPF programs, this results in an error.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_sock_map_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 53;
+
+/*
+ * bpf_xdp_adjust_meta
+ *
+ * 	Adjust the address pointed by *xdp_md*\ **->data_meta** by
+ * 	*delta* (which can be positive or negative). Note that this
+ * 	operation modifies the address stored in *xdp_md*\ **->data**,
+ * 	so the latter must be loaded only after the helper has been
+ * 	called.
+ *
+ * 	The use of *xdp_md*\ **->data_meta** is optional and programs
+ * 	are not required to use it. The rationale is that when the
+ * 	packet is processed with XDP (e.g. as DoS filter), it is
+ * 	possible to push further meta data along with it before passing
+ * 	to the stack, and to give the guarantee that an ingress eBPF
+ * 	program attached as a TC classifier on the same device can pick
+ * 	this up for further post-processing. Since TC works with socket
+ * 	buffers, it remains possible to set from XDP the **mark** or
+ * 	**priority** pointers, or other pointers for the socket buffer.
+ * 	Having this scratch space generic and programmable allows for
+ * 	more flexibility as the user is free to store whatever meta
+ * 	data they need.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 54;
+
+/*
+ * bpf_perf_event_read_value
+ *
+ * 	Read the value of a perf event counter, and store it into *buf*
+ * 	of size *buf_size*. This helper relies on a *map* of type
+ * 	**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
+ * 	counter is selected when *map* is updated with perf event file
+ * 	descriptors. The *map* is an array whose size is the number of
+ * 	available CPUs, and each cell contains a value relative to one
+ * 	CPU. The value to retrieve is indicated by *flags*, that
+ * 	contains the index of the CPU to look up, masked with
+ * 	**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 	**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 	current CPU should be retrieved.
+ *
+ * 	This helper behaves in a way close to
+ * 	**bpf_perf_event_read**\ () helper, save that instead of
+ * 	just returning the value observed, it fills the *buf*
+ * 	structure. This allows for additional data to be retrieved: in
+ * 	particular, the enabled and running times (in *buf*\
+ * 	**->enabled** and *buf*\ **->running**, respectively) are
+ * 	copied. In general, **bpf_perf_event_read_value**\ () is
+ * 	recommended over **bpf_perf_event_read**\ (), which has some
+ * 	ABI issues and provides fewer functionalities.
+ *
+ * 	These values are interesting, because hardware PMU (Performance
+ * 	Monitoring Unit) counters are limited resources. When there are
+ * 	more PMU-based perf events opened than available counters, the
+ * 	kernel will multiplex these events so each event gets a certain
+ * 	percentage (but not all) of the PMU time. When such
+ * 	multiplexing happens, the number of samples or the counter
+ * 	value will not match what would be observed without
+ * 	multiplexing, which makes comparison between different runs
+ * 	difficult.
+ * 	Typically, the counter value should be normalized before
+ * 	comparing to other experiments. The usual normalization is done
+ * 	as follows.
+ *
+ * 	::
+ *
+ * 		normalized_counter = counter * t_enabled / t_running
+ *
+ * 	Where t_enabled is the time the event was enabled and
+ * 	t_running is the time the event was running since the last
+ * 	normalization. The enabled and running times are accumulated
+ * 	since the perf event was opened. To compute the scaling factor
+ * 	between two invocations of an eBPF program, users can use the
+ * 	CPU id as the key (which is typical for the perf array usage
+ * 	model) to remember the previous value and do the calculation
+ * 	inside the eBPF program.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_perf_event_read_value)(void *map, __u64 flags, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 55;
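+
+/*
+ * Editorial example (sketch): read the current CPU's counter and apply
+ * the normalization shown above. `events` stands for a hypothetical
+ * BPF_MAP_TYPE_PERF_EVENT_ARRAY defined elsewhere; 0xffffffffULL is
+ * BPF_F_CURRENT_CPU. Assumes <linux/bpf.h> is included first for the
+ * full struct bpf_perf_event_value definition.
+ */
+static inline __u64 example_read_scaled(void *events)
+{
+	struct bpf_perf_event_value v = {};
+
+	if (bpf_perf_event_read_value(events, 0xffffffffULL /* BPF_F_CURRENT_CPU */,
+				      &v, sizeof(v)) < 0)
+		return 0;
+	if (v.running == 0)
+		return 0; /* never scheduled: avoid dividing by zero */
+	return v.counter * v.enabled / v.running;
+}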
+
+/*
+ * bpf_perf_prog_read_value
+ *
+ * 	For an eBPF program attached to a perf event, retrieve the
+ * 	value of the event counter associated to *ctx* and store it in
+ * 	the structure pointed by *buf* and of size *buf_size*. Enabled
+ * 	and running times are also stored in the structure (see
+ * 	description of helper **bpf_perf_event_read_value**\ () for
+ * 	more details).
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 56;
+
+/*
+ * bpf_getsockopt
+ *
+ * 	Emulate a call to **getsockopt()** on the socket associated to
+ * 	*bpf_socket*, which must be a full socket. The *level* at
+ * 	which the option resides and the name *optname* of the option
+ * 	must be specified, see **getsockopt(2)** for more information.
+ * 	The retrieved value is stored in the structure pointed by
+ * 	*opval* and of length *optlen*.
+ *
+ * 	*bpf_socket* should be one of the following:
+ *
+ * 	* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * 	* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * 	  and **BPF_CGROUP_INET6_CONNECT**.
+ *
+ * 	This helper actually implements a subset of **getsockopt()**.
+ * 	It supports the following *level*\ s:
+ *
+ * 	* **IPPROTO_TCP**, which supports *optname*
+ * 	  **TCP_CONGESTION**.
+ * 	* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 	* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_getsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 57;
+
+/*
+ * bpf_override_return
+ *
+ * 	Used for error injection, this helper uses kprobes to override
+ * 	the return value of the probed function, and to set it to *rc*.
+ * 	The first argument is the context *regs* on which the kprobe
+ * 	works.
+ *
+ * 	This helper works by setting the PC (program counter)
+ * 	to an override function which is run in place of the original
+ * 	probed function. This means the probed function is not run at
+ * 	all. The replacement function just returns with the required
+ * 	value.
+ *
+ * 	This helper has security implications, and thus is subject to
+ * 	restrictions. It is only available if the kernel was compiled
+ * 	with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
+ * 	option, and in this case it only works on functions tagged with
+ * 	**ALLOW_ERROR_INJECTION** in the kernel code.
+ *
+ * 	Also, the helper is only available for the architectures having
+ * 	the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
+ * 	x86 architecture is the only one to support this feature.
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_override_return)(struct pt_regs *regs, __u64 rc) = (void *) 58;
+
+/*
+ * bpf_sock_ops_cb_flags_set
+ *
+ * 	Attempt to set the value of the **bpf_sock_ops_cb_flags** field
+ * 	for the full TCP socket associated to *bpf_sock_ops* to
+ * 	*argval*.
+ *
+ * 	The primary use of this field is to determine if there should
+ * 	be calls to eBPF programs of type
+ * 	**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
+ * 	code. A program of the same type can change its value, per
+ * 	connection and as necessary, when the connection is
+ * 	established. This field is directly accessible for reading, but
+ * 	this helper must be used for updates in order to return an
+ * 	error if an eBPF program tries to set a callback that is not
+ * 	supported in the current kernel.
+ *
+ * 	*argval* is a flag array which can combine these flags:
+ *
+ * 	* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
+ * 	* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
+ * 	* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ * 	* **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
+ *
+ * 	Therefore, this function can be used to clear a callback flag by
+ * 	setting the appropriate bit to zero. For example, to disable
+ * 	the RTO callback:
+ *
+ * 	**bpf_sock_ops_cb_flags_set(bpf_sock,**
+ * 		**bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)**
+ *
+ * 	Here are some examples of where one could call such eBPF
+ * 	program:
+ *
+ * 	* When RTO fires.
+ * 	* When a packet is retransmitted.
+ * 	* When the connection terminates.
+ * 	* When a packet is sent.
+ * 	* When a packet is received.
+ *
+ * Returns
+ * 	Code **-EINVAL** if the socket is not a full TCP socket;
+ * 	otherwise, a positive number containing the bits that could not
+ * 	be set is returned (which comes down to 0 if all bits were set
+ * 	as required).
+ */
+static long (*bpf_sock_ops_cb_flags_set)(struct bpf_sock_ops *bpf_sock, int argval) = (void *) 59;
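+
+/*
+ * Editorial example (sketch): enable the RTO and state-change
+ * callbacks for a connection, preserving any flags already set. The
+ * literals mirror BPF_SOCK_OPS_RTO_CB_FLAG (1) and
+ * BPF_SOCK_OPS_STATE_CB_FLAG (4) from the UAPI headers; <linux/bpf.h>
+ * must be included first for the full struct bpf_sock_ops definition.
+ */
+static inline void example_enable_cbs(struct bpf_sock_ops *skops)
+{
+	bpf_sock_ops_cb_flags_set(skops, skops->bpf_sock_ops_cb_flags
+					 | 1 /* RTO */ | 4 /* state change */);
+}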
+
+/*
+ * bpf_msg_redirect_map
+ *
+ * 	This helper is used in programs implementing policies at the
+ * 	socket level. If the message *msg* is allowed to pass (i.e. if
+ * 	the verdict eBPF program returns **SK_PASS**), redirect it to
+ * 	the socket referenced by *map* (of type
+ * 	**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 	egress interfaces can be used for redirection. The
+ * 	**BPF_F_INGRESS** value in *flags* is used to make the
+ * 	distinction (ingress path is selected if the flag is present,
+ * 	egress path otherwise). This is the only flag supported for now.
+ *
+ * Returns
+ * 	**SK_PASS** on success, or **SK_DROP** on error.
+ */
+static long (*bpf_msg_redirect_map)(struct sk_msg_md *msg, void *map, __u32 key, __u64 flags) = (void *) 60;
+
+/*
+ * bpf_msg_apply_bytes
+ *
+ * 	For socket policies, apply the verdict of the eBPF program to
+ * 	the next *bytes* (number of bytes) of message *msg*.
+ *
+ * 	For example, this helper can be used in the following cases:
+ *
+ * 	* A single **sendmsg**\ () or **sendfile**\ () system call
+ * 	  contains multiple logical messages that the eBPF program is
+ * 	  supposed to read and for which it should apply a verdict.
+ * 	* An eBPF program only cares to read the first *bytes* of a
+ * 	  *msg*. If the message has a large payload, then setting up
+ * 	  and calling the eBPF program repeatedly for all bytes, even
+ * 	  though the verdict is already known, would create unnecessary
+ * 	  overhead.
+ *
+ * 	When called from within an eBPF program, the helper sets a
+ * 	counter internal to the BPF infrastructure, that is used to
+ * 	apply the last verdict to the next *bytes*. If *bytes* is
+ * 	smaller than the current data being processed from a
+ * 	**sendmsg**\ () or **sendfile**\ () system call, the first
+ * 	*bytes* will be sent and the eBPF program will be re-run with
+ * 	the pointer for start of data pointing to byte number *bytes*
+ * 	**+ 1**. If *bytes* is larger than the current data being
+ * 	processed, then the eBPF verdict will be applied to multiple
+ * 	**sendmsg**\ () or **sendfile**\ () calls until *bytes* are
+ * 	consumed.
+ *
+ * 	Note that if a socket closes with the internal counter holding
+ * 	a non-zero value, this is not a problem because data is not
+ * 	being buffered for *bytes* and is sent as it is received.
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_msg_apply_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 61;
+
+/*
+ * bpf_msg_cork_bytes
+ *
+ * 	For socket policies, prevent the execution of the verdict eBPF
+ * 	program for message *msg* until *bytes* (byte number) have been
+ * 	accumulated.
+ *
+ * 	This can be used when one needs a specific number of bytes
+ * 	before a verdict can be assigned, even if the data spans
+ * 	multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
+ * 	case would be a user calling **sendmsg**\ () repeatedly with
+ * 	1-byte long message segments. Obviously, this is bad for
+ * 	performance, but it is still valid. If the eBPF program needs
+ * 	*bytes* bytes to validate a header, this helper can be used to
+ * 	prevent the eBPF program from being called again until *bytes*
+ * 	have been accumulated.
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_msg_cork_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 62;
+
+/*
+ * bpf_msg_pull_data
+ *
+ * 	For socket policies, pull in non-linear data from user space
+ * 	for *msg* and set pointers *msg*\ **->data** and *msg*\
+ * 	**->data_end** to *start* and *end* bytes offsets into *msg*,
+ * 	respectively.
+ *
+ * 	If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * 	*msg* it can only parse data that the (**data**, **data_end**)
+ * 	pointers have already consumed. For **sendmsg**\ () hooks this
+ * 	is likely the first scatterlist element. But for calls relying
+ * 	on the **sendpage** handler (e.g. **sendfile**\ ()) this will
+ * 	be the range (**0**, **0**) because the data is shared with
+ * 	user space and by default the objective is to avoid allowing
+ * 	user space to modify data while (or after) eBPF verdict is
+ * 	being decided. This helper can be used to pull in data and to
+ * 	set the start and end pointer to given values. Data will be
+ * 	copied if necessary (i.e. if data was not linear and if start
+ * 	and end pointers do not point to the same chunk).
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_msg_pull_data)(struct sk_msg_md *msg, __u32 start, __u32 end, __u64 flags) = (void *) 63;
+
+/*
+ * bpf_bind
+ *
+ * 	Bind the socket associated to *ctx* to the address pointed by
+ * 	*addr*, of length *addr_len*. This allows for making outgoing
+ * 	connections from the desired IP address, which can be useful,
+ * 	for example, when all processes inside a cgroup should use a
+ * 	single IP address on a host that has multiple IPs configured.
+ *
+ * 	This helper works for IPv4 and IPv6, TCP and UDP sockets. The
+ * 	domain (*addr*\ **->sa_family**) must be **AF_INET** (or
+ * 	**AF_INET6**). It's advised to pass zero port (**sin_port**
+ * 	or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like
+ * 	behavior and lets the kernel efficiently pick an unused port
+ * 	as long as the 4-tuple is unique. Passing a non-zero port
+ * 	might lead to degraded performance.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_bind)(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) = (void *) 64;
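+
+/*
+ * Editorial example (sketch): pin outgoing IPv4 connections to one
+ * source address from a BPF_CGROUP_INET4_CONNECT program. struct
+ * sockaddr_in comes from the UAPI headers; 198.51.100.7 is a
+ * documentation address (RFC 5737), and the byte swap assumes a
+ * little-endian build target.
+ */
+static inline int example_bind_src(struct bpf_sock_addr *ctx)
+{
+	struct sockaddr_in sa = {};
+
+	sa.sin_family = 2; /* AF_INET */
+	sa.sin_port = 0;   /* zero port: the kernel picks an unused one */
+	sa.sin_addr.s_addr = __builtin_bswap32(0xc6336407); /* 198.51.100.7 */
+
+	return bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa));
+}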
+
+/*
+ * bpf_xdp_adjust_tail
+ *
+ * 	Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
+ * 	possible to both shrink and grow the packet tail: shrinking
+ * 	is done by passing a negative integer for *delta*.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_xdp_adjust_tail)(struct xdp_md *xdp_md, int delta) = (void *) 65;
+
+/*
+ * bpf_skb_get_xfrm_state
+ *
+ * 	Retrieve the XFRM state (IP transform framework, see also
+ * 	**ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
+ *
+ * 	The retrieved value is stored in the **struct bpf_xfrm_state**
+ * 	pointed by *xfrm_state* and of length *size*.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	**CONFIG_XFRM** configuration option.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_get_xfrm_state)(struct __sk_buff *skb, __u32 index, struct bpf_xfrm_state *xfrm_state, __u32 size, __u64 flags) = (void *) 66;
+
+/*
+ * bpf_get_stack
+ *
+ * 	Return a user or a kernel stack in bpf program provided buffer.
+ * 	To achieve this, the helper needs *ctx*, which is a pointer
+ * 	to the context on which the tracing program is executed.
+ * 	To store the stacktrace, the bpf program provides *buf* with
+ * 	a nonnegative *size*.
+ *
+ * 	The last argument, *flags*, holds the number of stack frames to
+ * 	skip (from 0 to 255), masked with
+ * 	**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 	the following flags:
+ *
+ * 	**BPF_F_USER_STACK**
+ * 		Collect a user space stack instead of a kernel stack.
+ * 	**BPF_F_USER_BUILD_ID**
+ * 		Collect buildid+offset instead of ips for user stack,
+ * 		only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * 	**bpf_get_stack**\ () can collect up to
+ * 	**PERF_MAX_STACK_DEPTH** kernel and user frames, provided
+ * 	the buffer size is sufficiently large. Note that
+ * 	this limit can be controlled with the **sysctl** program, and
+ * 	that it should be manually increased in order to profile long
+ * 	user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 	::
+ *
+ * 		# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * Returns
+ * 	A non-negative value equal to or less than *size* on success,
+ * 	or a negative error in case of failure.
+ */
+static long (*bpf_get_stack)(void *ctx, void *buf, __u32 size, __u64 flags) = (void *) 67;
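+
+/*
+ * Editorial example (sketch): capture up to 32 user-space frames from
+ * a tracing program's context. (1ULL << 8) is BPF_F_USER_STACK in the
+ * UAPI headers; a real program would copy `ips` to a map or perf
+ * buffer instead of discarding it.
+ */
+static inline long example_user_stack(void *ctx)
+{
+	__u64 ips[32];
+
+	/* Returns the number of bytes written; divide by 8 for frame count. */
+	return bpf_get_stack(ctx, ips, sizeof(ips),
+			     (1ULL << 8) /* BPF_F_USER_STACK */);
+}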
+
+/*
+ * bpf_skb_load_bytes_relative
+ *
+ * 	This helper is similar to **bpf_skb_load_bytes**\ () in that
+ * 	it provides an easy way to load *len* bytes from *offset*
+ * 	from the packet associated to *skb*, into the buffer pointed
+ * 	by *to*. The difference to **bpf_skb_load_bytes**\ () is that
+ * 	a fifth argument *start_header* exists in order to select a
+ * 	base offset to start from. *start_header* can be one of:
+ *
+ * 	**BPF_HDR_START_MAC**
+ * 		Base offset to load data from is *skb*'s mac header.
+ * 	**BPF_HDR_START_NET**
+ * 		Base offset to load data from is *skb*'s network header.
+ *
+ * 	In general, "direct packet access" is the preferred method to
+ * 	access packet data; however, this helper is particularly useful
+ * 	in socket filters where *skb*\ **->data** does not always point
+ * 	to the start of the mac header and where "direct packet access"
+ * 	is not available.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void *to, __u32 len, __u32 start_header) = (void *) 68;
+
+/*
+ * bpf_fib_lookup
+ *
+ * 	Do FIB lookup in kernel tables using parameters in *params*.
+ * 	If lookup is successful and result shows packet is to be
+ * 	forwarded, the neighbor tables are searched for the nexthop.
+ * 	If successful (i.e., FIB lookup shows forwarding and nexthop
+ * 	is resolved), the nexthop address is returned in ipv4_dst
+ * 	or ipv6_dst based on family, smac is set to mac address of
+ * 	egress device, dmac is set to nexthop mac address, rt_metric
+ * 	is set to metric from route (IPv4/IPv6 only), and ifindex
+ * 	is set to the device index of the nexthop from the FIB lookup.
+ *
+ * 	*plen* argument is the size of the passed in struct.
+ * 	*flags* argument can be a combination of one or more of the
+ * 	following values:
+ *
+ * 	**BPF_FIB_LOOKUP_DIRECT**
+ * 		Do a direct table lookup vs full lookup using FIB
+ * 		rules.
+ * 	**BPF_FIB_LOOKUP_OUTPUT**
+ * 		Perform lookup from an egress perspective (default is
+ * 		ingress).
+ *
+ * 	*ctx* is either **struct xdp_md** for XDP programs or
+ * 	**struct sk_buff** for tc cls_act programs.
+ *
+ * Returns
+ * 	* < 0 if any input argument is invalid
+ * 	*   0 on success (packet is forwarded, nexthop neighbor exists)
+ * 	* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
+ * 	  packet is not forwarded or needs assist from full stack
+ */
+static long (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, int plen, __u32 flags) = (void *) 69;
+
+/*
+ * bpf_sock_hash_update
+ *
+ * 	Add an entry to, or update a sockhash *map* referencing sockets.
+ * 	The *skops* is used as a new value for the entry associated to
+ * 	*key*. *flags* is one of:
+ *
+ * 	**BPF_NOEXIST**
+ * 		The entry for *key* must not exist in the map.
+ * 	**BPF_EXIST**
+ * 		The entry for *key* must already exist in the map.
+ * 	**BPF_ANY**
+ * 		No condition on the existence of the entry for *key*.
+ *
+ * 	If the *map* has eBPF programs (parser and verdict), those will
+ * 	be inherited by the socket being added. If the socket is
+ * 	already attached to eBPF programs, this results in an error.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_sock_hash_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 70;
+
+/*
+ * bpf_msg_redirect_hash
+ *
+ * 	This helper is used in programs implementing policies at the
+ * 	socket level. If the message *msg* is allowed to pass (i.e. if
+ * 	the verdict eBPF program returns **SK_PASS**), redirect it to
+ * 	the socket referenced by *map* (of type
+ * 	**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ * 	egress interfaces can be used for redirection. The
+ * 	**BPF_F_INGRESS** value in *flags* is used to make the
+ * 	distinction (ingress path is selected if the flag is present,
+ * 	egress path otherwise). This is the only flag supported for now.
+ *
+ * Returns
+ * 	**SK_PASS** on success, or **SK_DROP** on error.
+ */
+static long (*bpf_msg_redirect_hash)(struct sk_msg_md *msg, void *map, void *key, __u64 flags) = (void *) 71;
+
+/*
+ * bpf_sk_redirect_hash
+ *
+ * 	This helper is used in programs implementing policies at the
+ * 	skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ * 	if the verdict eBPF program returns **SK_PASS**), redirect it
+ * 	to the socket referenced by *map* (of type
+ * 	**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ * 	egress interfaces can be used for redirection. The
+ * 	**BPF_F_INGRESS** value in *flags* is used to make the
+ * 	distinction (ingress path is selected if the flag is present,
+ * 	egress path otherwise). This is the only flag supported for now.
+ *
+ * Returns
+ * 	**SK_PASS** on success, or **SK_DROP** on error.
+ */
+static long (*bpf_sk_redirect_hash)(struct __sk_buff *skb, void *map, void *key, __u64 flags) = (void *) 72;
+
+/*
+ * bpf_lwt_push_encap
+ *
+ * 	Encapsulate the packet associated to *skb* within a Layer 3
+ * 	protocol header. This header is provided in the buffer at
+ * 	address *hdr*, with *len* its size in bytes. *type* indicates
+ * 	the protocol of the header and can be one of:
+ *
+ * 	**BPF_LWT_ENCAP_SEG6**
+ * 		IPv6 encapsulation with Segment Routing Header
+ * 		(**struct ipv6_sr_hdr**). *hdr* only contains the SRH,
+ * 		the IPv6 header is computed by the kernel.
+ * 	**BPF_LWT_ENCAP_SEG6_INLINE**
+ * 		Only works if *skb* contains an IPv6 packet. Insert a
+ * 		Segment Routing Header (**struct ipv6_sr_hdr**) inside
+ * 		the IPv6 header.
+ * 	**BPF_LWT_ENCAP_IP**
+ * 		IP encapsulation (GRE/GUE/IPIP/etc). The outer header
+ * 		must be IPv4 or IPv6, followed by zero or more
+ * 		additional headers, up to **LWT_BPF_MAX_HEADROOM**
+ * 		total bytes in all prepended headers. Please note that
+ * 		if **skb_is_gso**\ (*skb*) is true, no more than two
+ * 		headers can be prepended, and the inner header, if
+ * 		present, should be either GRE or UDP/GUE.
+ *
+ * 	**BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs
+ * 	of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can
+ * 	be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and
+ * 	**BPF_PROG_TYPE_LWT_XMIT**.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_lwt_push_encap)(struct __sk_buff *skb, __u32 type, void *hdr, __u32 len) = (void *) 73;
+
+/*
+ * bpf_lwt_seg6_store_bytes
+ *
+ * 	Store *len* bytes from address *from* into the packet
+ * 	associated to *skb*, at *offset*. Only the flags, tag and TLVs
+ * 	inside the outermost IPv6 Segment Routing Header can be
+ * 	modified through this helper.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_lwt_seg6_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len) = (void *) 74;
+
+/*
+ * bpf_lwt_seg6_adjust_srh
+ *
+ * 	Adjust the size allocated to TLVs in the outermost IPv6
+ * 	Segment Routing Header contained in the packet associated to
+ * 	*skb*, at position *offset* by *delta* bytes. Only offsets
+ * 	after the segments are accepted. *delta* can be positive
+ * 	(growing) as well as negative (shrinking).
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_lwt_seg6_adjust_srh)(struct __sk_buff *skb, __u32 offset, __s32 delta) = (void *) 75;
+
+/*
+ * bpf_lwt_seg6_action
+ *
+ * 	Apply an IPv6 Segment Routing action of type *action* to the
+ * 	packet associated to *skb*. Each action takes a parameter
+ * 	contained at address *param*, and of length *param_len* bytes.
+ * 	*action* can be one of:
+ *
+ * 	**SEG6_LOCAL_ACTION_END_X**
+ * 		End.X action: Endpoint with Layer-3 cross-connect.
+ * 		Type of *param*: **struct in6_addr**.
+ * 	**SEG6_LOCAL_ACTION_END_T**
+ * 		End.T action: Endpoint with specific IPv6 table lookup.
+ * 		Type of *param*: **int**.
+ * 	**SEG6_LOCAL_ACTION_END_B6**
+ * 		End.B6 action: Endpoint bound to an SRv6 policy.
+ * 		Type of *param*: **struct ipv6_sr_hdr**.
+ * 	**SEG6_LOCAL_ACTION_END_B6_ENCAP**
+ * 		End.B6.Encap action: Endpoint bound to an SRv6
+ * 		encapsulation policy.
+ * 		Type of *param*: **struct ipv6_sr_hdr**.
+ *
+ * 	A call to this helper is susceptible to change the underlying
+ * 	packet buffer. Therefore, at load time, all checks on pointers
+ * 	previously done by the verifier are invalidated and must be
+ * 	performed again, if the helper is used in combination with
+ * 	direct packet access.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_lwt_seg6_action)(struct __sk_buff *skb, __u32 action, void *param, __u32 param_len) = (void *) 76;
+
+/*
+ * bpf_rc_repeat
+ *
+ * 	This helper is used in programs implementing IR decoding, to
+ * 	report a successfully decoded repeat key message. This delays
+ * 	the generation of a key up event for a previously generated
+ * 	key down event.
+ *
+ * 	Some IR protocols like NEC have a special IR message for
+ * 	repeating last button, for when a button is held down.
+ *
+ * 	The *ctx* should point to the lirc sample as passed into
+ * 	the program.
+ *
+ * 	This helper is only available if the kernel was compiled with
+ * 	the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ * 	"**y**".
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_rc_repeat)(void *ctx) = (void *) 77;
+
+/*
+ * bpf_rc_keydown
+ *
+ * 	This helper is used in programs implementing IR decoding, to
+ * 	report a successfully decoded key press with *scancode*,
+ * 	*toggle* value in the given *protocol*. The scancode will be
+ * 	translated to a keycode using the rc keymap, and reported as
+ * 	an input key down event. After a period a key up event is
+ * 	generated. This period can be extended by calling either
+ * 	**bpf_rc_keydown**\ () again with the same values, or calling
+ * 	**bpf_rc_repeat**\ ().
+ *
+ * 	Some protocols include a toggle bit, in case the button was
+ * 	released and pressed again between consecutive scancodes.
+ *
+ * 	The *ctx* should point to the lirc sample as passed into
+ * 	the program.
+ *
+ * 	The *protocol* is the decoded protocol number (see
+ * 	**enum rc_proto** for some predefined values).
+ *
+ * 	This helper is only available if the kernel was compiled with
+ * 	the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ * 	"**y**".
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_rc_keydown)(void *ctx, __u32 protocol, __u64 scancode, __u32 toggle) = (void *) 78;
+
+/*
+ * bpf_skb_cgroup_id
+ *
+ * 	Return the cgroup v2 id of the socket associated with the *skb*.
+ * 	This is roughly similar to the **bpf_get_cgroup_classid**\ ()
+ * 	helper for cgroup v1, by providing a tag (or identifier) that
+ * 	can be matched on or used for map lookups, e.g. to implement
+ * 	policy. The cgroup v2 id of a given path in the hierarchy is
+ * 	exposed in user space through the f_handle API in order to get
+ * 	to the same 64-bit id.
+ *
+ * 	This helper can be used on TC egress path, but not on ingress,
+ * 	and is available only if the kernel was compiled with the
+ * 	**CONFIG_SOCK_CGROUP_DATA** configuration option.
+ *
+ * Returns
+ * 	The id is returned or 0 in case the id could not be retrieved.
+ */
+static __u64 (*bpf_skb_cgroup_id)(struct __sk_buff *skb) = (void *) 79;
+
+/*
+ * bpf_get_current_cgroup_id
+ *
+ *
+ * Returns
+ * 	A 64-bit integer containing the current cgroup id based
+ * 	on the cgroup within which the current task is running.
+ */
+static __u64 (*bpf_get_current_cgroup_id)(void) = (void *) 80;
+
+/*
+ * bpf_get_local_storage
+ *
+ * 	Get the pointer to the local storage area.
+ * 	The type and the size of the local storage is defined
+ * 	by the *map* argument.
+ * 	The *flags* meaning is specific for each map type,
+ * 	and has to be 0 for cgroup local storage.
+ *
+ * 	Depending on the BPF program type, a local storage area
+ * 	can be shared between multiple instances of the BPF program,
+ * 	running simultaneously.
+ *
+ * 	Users must take care of synchronization themselves, for
+ * 	example by using the **BPF_STX_XADD** instruction to alter
+ * 	the shared data.
+ *
+ * Returns
+ * 	A pointer to the local storage area.
+ */
+static void *(*bpf_get_local_storage)(void *map, __u64 flags) = (void *) 81;
+
+/*
+ * bpf_sk_select_reuseport
+ *
+ * 	Select a **SO_REUSEPORT** socket from a
+ * 	**BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*.
+ * 	It checks that the selected socket matches the incoming
+ * 	request in the socket buffer.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_sk_select_reuseport)(struct sk_reuseport_md *reuse, void *map, void *key, __u64 flags) = (void *) 82;
+
+/*
+ * bpf_skb_ancestor_cgroup_id
+ *
+ * 	Return the id of the cgroup v2 that is an ancestor of the
+ * 	cgroup associated with the *skb*, at the *ancestor_level*. The
+ * 	root cgroup is at *ancestor_level* zero and each step down the
+ * 	hierarchy increments the level. If *ancestor_level* equals the
+ * 	level of the cgroup associated with *skb*, then the return
+ * 	value will be the same as that of **bpf_skb_cgroup_id**\ ().
+ *
+ * 	The helper is useful to implement policies based on cgroups
+ * 	that are higher in the hierarchy than the immediate cgroup
+ * 	associated with *skb*.
+ *
+ * 	The format of the returned id and the helper limitations are
+ * 	the same as in **bpf_skb_cgroup_id**\ ().
+ *
+ * Returns
+ * 	The id is returned or 0 in case the id could not be retrieved.
+ */
+static __u64 (*bpf_skb_ancestor_cgroup_id)(struct __sk_buff *skb, int ancestor_level) = (void *) 83;
+
+/*
+ * bpf_sk_lookup_tcp
+ *
+ * 	Look for TCP socket matching *tuple*, optionally in a child
+ * 	network namespace *netns*. The return value must be checked,
+ * 	and if non-**NULL**, released via **bpf_sk_release**\ ().
+ *
+ * 	The *ctx* should point to the context of the program, such as
+ * 	the skb or socket (depending on the hook in use). This is used
+ * 	to determine the base network namespace for the lookup.
+ *
+ * 	*tuple_size* must be one of:
+ *
+ * 	**sizeof**\ (*tuple*\ **->ipv4**)
+ * 		Look for an IPv4 socket.
+ * 	**sizeof**\ (*tuple*\ **->ipv6**)
+ * 		Look for an IPv6 socket.
+ *
+ * 	If the *netns* is a negative signed 32-bit integer, then the
+ * 	socket lookup table in the netns associated with the *ctx*
+ * 	will be used. For the TC hooks, this is the netns of the device
+ * 	in the skb. For socket hooks, this is the netns of the socket.
+ * 	If *netns* is any other signed 32-bit value greater than or
+ * 	equal to zero then it specifies the ID of the netns relative to
+ * 	the netns associated with the *ctx*. *netns* values beyond the
+ * 	range of 32-bit integers are reserved for future use.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	**CONFIG_NET** configuration option.
+ *
+ * Returns
+ * 	Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ * 	For sockets with reuseport option, the **struct bpf_sock**
+ * 	result is from *reuse*\ **->socks**\ [] using the hash of the
+ * 	tuple.
+ */
+static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 84;
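+
+/*
+ * Editorial example (sketch): the mandatory lookup/release pairing.
+ * The tuple would normally be filled from packet data; only the
+ * calling convention is shown. -1 as *netns* selects the netns of the
+ * *skb* (BPF_F_CURRENT_NETNS in the UAPI headers); <linux/bpf.h> must
+ * be included first for the full struct bpf_sock_tuple definition.
+ */
+static inline int example_sk_exists(struct __sk_buff *skb,
+				    struct bpf_sock_tuple *tuple)
+{
+	struct bpf_sock *sk;
+
+	sk = bpf_sk_lookup_tcp(skb, tuple, sizeof(tuple->ipv4), -1, 0);
+	if (!sk)
+		return 0;
+	bpf_sk_release(sk); /* every non-NULL result must be released */
+	return 1;
+}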
+
+/*
+ * bpf_sk_lookup_udp
+ *
+ * 	Look for UDP socket matching *tuple*, optionally in a child
+ * 	network namespace *netns*. The return value must be checked,
+ * 	and if non-**NULL**, released via **bpf_sk_release**\ ().
+ *
+ * 	The *ctx* should point to the context of the program, such as
+ * 	the skb or socket (depending on the hook in use). This is used
+ * 	to determine the base network namespace for the lookup.
+ *
+ * 	*tuple_size* must be one of:
+ *
+ * 	**sizeof**\ (*tuple*\ **->ipv4**)
+ * 		Look for an IPv4 socket.
+ * 	**sizeof**\ (*tuple*\ **->ipv6**)
+ * 		Look for an IPv6 socket.
+ *
+ * 	If the *netns* is a negative signed 32-bit integer, then the
+ * 	socket lookup table in the netns associated with the *ctx*
+ * 	will be used. For the TC hooks, this is the netns of the device
+ * 	in the skb. For socket hooks, this is the netns of the socket.
+ * 	If *netns* is any other signed 32-bit value greater than or
+ * 	equal to zero then it specifies the ID of the netns relative to
+ * 	the netns associated with the *ctx*. *netns* values beyond the
+ * 	range of 32-bit integers are reserved for future use.
+ *
+ * 	All values for *flags* are reserved for future usage, and must
+ * 	be left at zero.
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	**CONFIG_NET** configuration option.
+ *
+ * Returns
+ * 	Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ * 	For sockets with reuseport option, the **struct bpf_sock**
+ * 	result is from *reuse*\ **->socks**\ [] using the hash of the
+ * 	tuple.
+ */
+static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 85;
+
+/*
+ * bpf_sk_release
+ *
+ * 	Release the reference held by *sock*. *sock* must be a
+ * 	non-**NULL** pointer that was returned from
+ * 	**bpf_sk_lookup_xxx**\ ().
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_sk_release)(struct bpf_sock *sock) = (void *) 86;
+
+/*
+ * bpf_map_push_elem
+ *
+ * 	Push an element *value* into *map*. *flags* is one of:
+ *
+ * 	**BPF_EXIST**
+ * 		If the queue/stack is full, the oldest element is
+ * 		removed to make room for the new one.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_map_push_elem)(void *map, const void *value, __u64 flags) = (void *) 87;
+
+/*
+ * bpf_map_pop_elem
+ *
+ * 	Pop an element from *map*.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_map_pop_elem)(void *map, void *value) = (void *) 88;
+
+/*
+ * bpf_map_peek_elem
+ *
+ * 	Get an element from *map* without removing it.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_map_peek_elem)(void *map, void *value) = (void *) 89;
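+
+/*
+ * Editorial example (sketch): producer/consumer use of a hypothetical
+ * BPF_MAP_TYPE_QUEUE named `jobs` holding __u32 values, defined
+ * elsewhere. 2 is BPF_EXIST, which evicts the oldest entry when the
+ * queue is full.
+ */
+static inline void example_queue_ops(void *jobs)
+{
+	__u32 in = 42, out = 0;
+
+	bpf_map_push_elem(jobs, &in, 2 /* BPF_EXIST: evict oldest if full */);
+	if (bpf_map_peek_elem(jobs, &out) == 0) {
+		/* `out` holds the head of the queue; nothing was removed */
+	}
+	bpf_map_pop_elem(jobs, &out); /* removes the head and stores it in `out` */
+}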
+
+/*
+ * bpf_msg_push_data
+ *
+ * 	For socket policies, insert *len* bytes into *msg* at offset
+ * 	*start*.
+ *
+ * 	If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * 	*msg* it may want to insert metadata or options into the *msg*.
+ * 	This can later be read and used by any of the lower layer BPF
+ * 	hooks.
+ *
+ * 	This helper may fail under memory pressure (if an allocation
+ * 	fails); in that case the BPF program will get an appropriate
+ * 	error and will need to handle it.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_msg_push_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 90;
+
+/*
+ * bpf_msg_pop_data
+ *
+ * 	Will remove *len* bytes from a *msg* starting at byte *start*.
+ * 	This may result in **ENOMEM** errors under certain situations if
+ * 	an allocation and copy are required due to a full ring buffer.
+ * 	However, the helper will try to avoid doing the allocation
+ * 	if possible. Other errors can occur if input parameters are
+ * 	invalid, either because the *start* byte is not a valid part of
+ * 	the *msg* payload or because the *len* value is too large.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_msg_pop_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 91;
+
+/*
+ * bpf_rc_pointer_rel
+ *
+ * 	This helper is used in programs implementing IR decoding, to
+ * 	report a successfully decoded pointer movement.
+ *
+ * 	The *ctx* should point to the lirc sample as passed into
+ * 	the program.
+ *
+ * 	This helper is only available if the kernel was compiled with
+ * 	the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ * 	"**y**".
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_rc_pointer_rel)(void *ctx, __s32 rel_x, __s32 rel_y) = (void *) 92;
+
+/*
+ * bpf_spin_lock
+ *
+ * 	Acquire a spinlock represented by the pointer *lock*, which is
+ * 	stored as part of a value of a map. Taking the lock allows to
+ * 	safely update the rest of the fields in that value. The
+ * 	spinlock can (and must) later be released with a call to
+ * 	**bpf_spin_unlock**\ (\ *lock*\ ).
+ *
+ * 	Spinlocks in BPF programs come with a number of restrictions
+ * 	and constraints:
+ *
+ * 	* **bpf_spin_lock** objects are only allowed inside maps of
+ * 	  types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this
+ * 	  list could be extended in the future).
+ * 	* BTF description of the map is mandatory.
+ * 	* The BPF program can take ONE lock at a time, since taking two
+ * 	  or more could cause deadlocks.
+ * 	* Only one **struct bpf_spin_lock** is allowed per map element.
+ * 	* When the lock is taken, calls (either BPF to BPF or helpers)
+ * 	  are not allowed.
+ * 	* The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not
+ * 	  allowed inside a spinlock-ed region.
+ * 	* The BPF program MUST call **bpf_spin_unlock**\ () to release
+ * 	  the lock, on all execution paths, before it returns.
+ * 	* The BPF program can access **struct bpf_spin_lock** only via
+ * 	  the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ ()
+ * 	  helpers. Loading or storing data into the **struct
+ * 	  bpf_spin_lock** *lock*\ **;** field of a map is not allowed.
+ * 	* To use the **bpf_spin_lock**\ () helper, the BTF description
+ * 	  of the map value must be a struct and have **struct
+ * 	  bpf_spin_lock** *anyname*\ **;** field at the top level.
+ * 	  Nested lock inside another struct is not allowed.
+ * 	* The **struct bpf_spin_lock** *lock* field in a map value must
+ * 	  be aligned on a multiple of 4 bytes in that value.
+ * 	* Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy
+ * 	  the **bpf_spin_lock** field to user space.
+ * 	* Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from
+ * 	  a BPF program, do not update the **bpf_spin_lock** field.
+ * 	* **bpf_spin_lock** cannot be on the stack or inside a
+ * 	  networking packet (it can only be inside a map value).
+ * 	* **bpf_spin_lock** is available to root only.
+ * 	* Tracing programs and socket filter programs cannot use
+ * 	  **bpf_spin_lock**\ () due to insufficient preemption checks
+ * 	  (but this may change in the future).
+ * 	* **bpf_spin_lock** is not allowed in inner maps of map-in-map.
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) 93;
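+
+/*
+ * Editorial example (sketch): a map value embedding a lock at the top
+ * level, as the rules above require, with an update performed under
+ * it. `val` would come from bpf_map_lookup_elem(); assumes
+ * <linux/bpf.h> is included first for the full struct bpf_spin_lock
+ * definition, and that the map's value type is described by BTF.
+ */
+struct example_counter {
+	struct bpf_spin_lock lock;
+	__u64 hits;
+};
+
+static inline void example_locked_inc(struct example_counter *val)
+{
+	bpf_spin_lock(&val->lock);
+	val->hits++; /* other value fields are safe to modify here */
+	bpf_spin_unlock(&val->lock); /* must be reached on every path */
+}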
+
+/*
+ * bpf_spin_unlock
+ *
+ * 	Release the *lock* previously locked by a call to
+ * 	**bpf_spin_lock**\ (\ *lock*\ ).
+ *
+ * Returns
+ * 	0
+ */
+static long (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) 94;
+
+/*
+ * bpf_sk_fullsock
+ *
+ * 	This helper gets a **struct bpf_sock** pointer such
+ * 	that all the fields in this **bpf_sock** can be accessed.
+ *
+ * Returns
+ * 	A **struct bpf_sock** pointer on success, or **NULL** in
+ * 	case of failure.
+ */
+static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) 95;
+
+/*
+ * bpf_tcp_sock
+ *
+ * 	This helper gets a **struct bpf_tcp_sock** pointer from a
+ * 	**struct bpf_sock** pointer.
+ *
+ * Returns
+ * 	A **struct bpf_tcp_sock** pointer on success, or **NULL** in
+ * 	case of failure.
+ */
+static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96;
+
+/*
+ * bpf_skb_ecn_set_ce
+ *
+ * 	Set ECN (Explicit Congestion Notification) field of IP header
+ * 	to **CE** (Congestion Encountered) if current value is **ECT**
+ * 	(ECN Capable Transport). Otherwise, do nothing. Works with IPv6
+ * 	and IPv4.
+ *
+ * Returns
+ * 	1 if the **CE** flag is set (either by the current helper call
+ * 	or because it was already present), 0 if it is not set.
+ */
+static long (*bpf_skb_ecn_set_ce)(struct __sk_buff *skb) = (void *) 97;
+
+/*
+ * bpf_get_listener_sock
+ *
+ * 	Return a **struct bpf_sock** pointer in **TCP_LISTEN** state.
+ * 	**bpf_sk_release**\ () is unnecessary and not allowed.
+ *
+ * Returns
+ * 	A **struct bpf_sock** pointer on success, or **NULL** in
+ * 	case of failure.
+ */
+static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) 98;
+
+/*
+ * bpf_skc_lookup_tcp
+ *
+ * 	Look for TCP socket matching *tuple*, optionally in a child
+ * 	network namespace *netns*. The return value must be checked,
+ * 	and if non-**NULL**, released via **bpf_sk_release**\ ().
+ *
+ * 	This function is identical to **bpf_sk_lookup_tcp**\ (), except
+ * 	that it also returns timewait or request sockets. Use
+ * 	**bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the
+ * 	full structure.
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	**CONFIG_NET** configuration option.
+ *
+ * Returns
+ * 	Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ * 	For sockets with reuseport option, the **struct bpf_sock**
+ * 	result is from *reuse*\ **->socks**\ [] using the hash of the
+ * 	tuple.
+ */
+static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 99;
+
+/*
+ * bpf_tcp_check_syncookie
+ *
+ * 	Check whether *iph* and *th* contain a valid SYN cookie ACK for
+ * 	the listening socket in *sk*.
+ *
+ * 	*iph* points to the start of the IPv4 or IPv6 header, while
+ * 	*iph_len* contains **sizeof**\ (**struct iphdr**) or
+ * 	**sizeof**\ (**struct ip6hdr**).
+ *
+ * 	*th* points to the start of the TCP header, while *th_len*
+ * 	contains **sizeof**\ (**struct tcphdr**).
+ *
+ * Returns
+ * 	0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
+ * 	error otherwise.
+ */
+static long (*bpf_tcp_check_syncookie)(struct bpf_sock *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 100;
+
+/*
+ * bpf_sysctl_get_name
+ *
+ * 	Get the name of a sysctl in /proc/sys/ and copy it into the
+ * 	buffer *buf* of size *buf_len* provided by the program.
+ *
+ * 	The buffer is always NUL terminated, unless it's zero-sized.
+ *
+ * 	If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is
+ * 	copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name
+ * 	only (e.g. "tcp_mem").
+ *
+ * Returns
+ * 	Number of characters copied (not including the trailing NUL).
+ *
+ * 	**-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ * 	truncated name in this case).
+ */
+static long (*bpf_sysctl_get_name)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len, __u64 flags) = (void *) 101;
+
+/*
+ * bpf_sysctl_get_current_value
+ *
+ * 	Get the current value of a sysctl as it is presented in /proc/sys
+ * 	(incl. newline, etc), and copy it as a string into the buffer
+ * 	*buf* of size *buf_len* provided by the program.
+ *
+ * 	The whole value is copied, no matter what file position user
+ * 	space issued e.g. sys_read at.
+ *
+ * 	The buffer is always NUL terminated, unless it's zero-sized.
+ *
+ * Returns
+ * 	Number of characters copied (not including the trailing NUL).
+ *
+ * 	**-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ * 	the truncated value in this case).
+ *
+ * 	**-EINVAL** if current value was unavailable, e.g. because
+ * 	sysctl is uninitialized and read returns -EIO for it.
+ */
+static long (*bpf_sysctl_get_current_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 102;
+
+/*
+ * bpf_sysctl_get_new_value
+ *
+ * 	Get the new value being written by user space to a sysctl (before
+ * 	the actual write happens) and copy it as a string into the
+ * 	buffer *buf* of size *buf_len* provided by the program.
+ *
+ * 	User space may write the new value at a file position > 0.
+ *
+ * 	The buffer is always NUL terminated, unless it's zero-sized.
+ *
+ * Returns
+ * 	Number of characters copied (not including the trailing NUL).
+ *
+ * 	**-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ * 	the truncated value in this case).
+ *
+ * 	**-EINVAL** if sysctl is being read.
+ */
+static long (*bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 103;
+
+/*
+ * bpf_sysctl_set_new_value
+ *
+ * 	Override the new value being written by user space to a sysctl
+ * 	with the value provided by the program in buffer *buf* of size
+ * 	*buf_len*.
+ *
+ * 	*buf* should contain a string in same form as provided by user
+ * 	space on sysctl write.
+ *
+ * 	User space may write the new value at a file position > 0. To
+ * 	override the whole sysctl value, the file position should be set
+ * 	to zero.
+ *
+ * Returns
+ * 	0 on success.
+ *
+ * 	**-E2BIG** if the *buf_len* is too big.
+ *
+ * 	**-EINVAL** if sysctl is being read.
+ */
+static long (*bpf_sysctl_set_new_value)(struct bpf_sysctl *ctx, const char *buf, unsigned long buf_len) = (void *) 104;
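+
+/*
+ * Example (editor's sketch, not part of the vendored header): skeleton
+ * of a BPF_PROG_TYPE_CGROUP_SYSCTL program using the sysctl helpers
+ * above. The program name and buffer size are hypothetical; returning
+ * 1 allows the access.
+ *
+ * 	SEC("cgroup/sysctl")
+ * 	int sysctl_filter(struct bpf_sysctl *ctx)
+ * 	{
+ * 		char name[32];
+ *
+ * 		if (bpf_sysctl_get_name(ctx, name, sizeof(name),
+ * 					BPF_F_SYSCTL_BASE_NAME) < 0)
+ * 			return 0;
+ * 		... inspect name, optionally bpf_sysctl_set_new_value() ...
+ * 		return 1;
+ * 	}
+ */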
+
+/*
+ * bpf_strtol
+ *
+ * 	Convert the initial part of the string from buffer *buf* of
+ * 	size *buf_len* to a long integer according to the given base
+ * 	and save the result in *res*.
+ *
+ * 	The string may begin with an arbitrary amount of white space
+ * 	(as determined by **isspace**\ (3)) followed by a single
+ * 	optional '**-**' sign.
+ *
+ * 	The five least significant bits of *flags* encode the base;
+ * 	other bits are currently unused.
+ *
+ * 	The base must be either 8, 10, 16, or 0 to detect it
+ * 	automatically, similar to user space **strtol**\ (3).
+ *
+ * Returns
+ * 	Number of characters consumed on success. Must be positive but
+ * 	no more than *buf_len*.
+ *
+ * 	**-EINVAL** if no valid digits were found or unsupported base
+ * 	was provided.
+ *
+ * 	**-ERANGE** if resulting value was out of range.
+ */
+static long (*bpf_strtol)(const char *buf, unsigned long buf_len, __u64 flags, long *res) = (void *) 105;
+
+/*
+ * bpf_strtoul
+ *
+ * 	Convert the initial part of the string from buffer *buf* of
+ * 	size *buf_len* to an unsigned long integer according to the
+ * 	given base and save the result in *res*.
+ *
+ * 	The string may begin with an arbitrary amount of white space
+ * 	(as determined by **isspace**\ (3)).
+ *
+ * 	The five least significant bits of *flags* encode the base;
+ * 	other bits are currently unused.
+ *
+ * 	The base must be either 8, 10, 16, or 0 to detect it
+ * 	automatically, similar to user space **strtoul**\ (3).
+ *
+ * Returns
+ * 	Number of characters consumed on success. Must be positive but
+ * 	no more than *buf_len*.
+ *
+ * 	**-EINVAL** if no valid digits were found or unsupported base
+ * 	was provided.
+ *
+ * 	**-ERANGE** if resulting value was out of range.
+ */
+static long (*bpf_strtoul)(const char *buf, unsigned long buf_len, __u64 flags, unsigned long *res) = (void *) 106;
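+
+/*
+ * Example (editor's sketch, not part of the vendored header): parsing
+ * a value written to a sysctl with bpf_strtol(). Base 0 auto-detects
+ * octal, decimal or hex; buffer and variable names are hypothetical.
+ *
+ * 	char buf[16] = {};
+ * 	long val;
+ *
+ * 	if (bpf_sysctl_get_new_value(ctx, buf, sizeof(buf)) > 0 &&
+ * 	    bpf_strtol(buf, sizeof(buf), 0, &val) > 0) {
+ * 		... val holds the parsed integer ...
+ * 	}
+ */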
+
+/*
+ * bpf_sk_storage_get
+ *
+ * 	Get a bpf-local-storage from a *sk*.
+ *
+ * 	Logically, it could be thought of as getting the value from
+ * 	a *map* with *sk* as the **key**. From this perspective,
+ * 	the usage is not much different from
+ * 	**bpf_map_lookup_elem**\ (*map*, **&**\ *sk*), except that this
+ * 	helper enforces that the key must be a full socket and the map
+ * 	must be of type **BPF_MAP_TYPE_SK_STORAGE**.
+ *
+ * 	Underneath, the value is stored locally at *sk* instead of
+ * 	the *map*.  The *map* is used as the bpf-local-storage
+ * 	"type". The bpf-local-storage "type" (i.e. the *map*) is
+ * 	searched against all bpf-local-storages residing at *sk*.
+ *
+ * 	An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be
+ * 	used such that a new bpf-local-storage will be
+ * 	created if one does not exist.  *value* can be used
+ * 	together with **BPF_SK_STORAGE_GET_F_CREATE** to specify
+ * 	the initial value of a bpf-local-storage.  If *value* is
+ * 	**NULL**, the new bpf-local-storage will be zero initialized.
+ *
+ * Returns
+ * 	A bpf-local-storage pointer is returned on success.
+ *
+ * 	**NULL** if not found or there was an error in adding
+ * 	a new bpf-local-storage.
+ */
+static void *(*bpf_sk_storage_get)(void *map, struct bpf_sock *sk, void *value, __u64 flags) = (void *) 107;
+
+/*
+ * bpf_sk_storage_delete
+ *
+ * 	Delete a bpf-local-storage from a *sk*.
+ *
+ * Returns
+ * 	0 on success.
+ *
+ * 	**-ENOENT** if the bpf-local-storage cannot be found.
+ */
+static long (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) = (void *) 108;
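+
+/*
+ * Example (editor's sketch, not part of the vendored header): a
+ * per-socket counter in a BPF_MAP_TYPE_SK_STORAGE map. The map name
+ * ("sk_counters") is hypothetical; a NULL *value* together with
+ * BPF_SK_STORAGE_GET_F_CREATE zero-initializes new storage.
+ *
+ * 	__u64 *cnt = bpf_sk_storage_get(&sk_counters, sk, NULL,
+ * 					BPF_SK_STORAGE_GET_F_CREATE);
+ * 	if (cnt)
+ * 		(*cnt)++;
+ */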
+
+/*
+ * bpf_send_signal
+ *
+ * 	Send signal *sig* to the process of the current task.
+ * 	The signal may be delivered to any of this process's threads.
+ *
+ * Returns
+ * 	0 on success or successfully queued.
+ *
+ * 	**-EBUSY** if the work queue under NMI is full.
+ *
+ * 	**-EINVAL** if *sig* is invalid.
+ *
+ * 	**-EPERM** if no permission to send the *sig*.
+ *
+ * 	**-EAGAIN** if the bpf program can try again.
+ */
+static long (*bpf_send_signal)(__u32 sig) = (void *) 109;
+
+/*
+ * bpf_tcp_gen_syncookie
+ *
+ * 	Try to issue a SYN cookie for the packet with corresponding
+ * 	IP/TCP headers, *iph* and *th*, on the listening socket in *sk*.
+ *
+ * 	*iph* points to the start of the IPv4 or IPv6 header, while
+ * 	*iph_len* contains **sizeof**\ (**struct iphdr**) or
+ * 	**sizeof**\ (**struct ip6hdr**).
+ *
+ * 	*th* points to the start of the TCP header, while *th_len*
+ * 	contains the length of the TCP header.
+ *
+ * Returns
+ * 	On success, the lower 32 bits hold the generated SYN cookie,
+ * 	followed by 16 bits which hold the MSS value for that cookie,
+ * 	and the top 16 bits are unused.
+ *
+ * 	On failure, the returned value is one of the following:
+ *
+ * 	**-EINVAL** SYN cookie cannot be issued due to error
+ *
+ * 	**-ENOENT** SYN cookie should not be issued (no SYN flood)
+ *
+ * 	**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
+ *
+ * 	**-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ */
+static __s64 (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 110;
+
+/*
+ * bpf_skb_output
+ *
+ * 	Write raw *data* blob into a special BPF perf event held by
+ * 	*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 	event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 	as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 	**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 	The *flags* are used to indicate the index in *map* for which
+ * 	the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 	Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 	to indicate that the index of the current CPU core should be
+ * 	used.
+ *
+ * 	The value to write, of *size*, is passed through the eBPF stack
+ * 	and pointed to by *data*.
+ *
+ * 	*ctx* is a pointer to the in-kernel **struct sk_buff**.
+ *
+ * 	This helper is similar to **bpf_perf_event_output**\ () but
+ * 	restricted to raw_tracepoint bpf programs.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_skb_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 111;
+
+/*
+ * bpf_probe_read_user
+ *
+ * 	Safely attempt to read *size* bytes from user space address
+ * 	*unsafe_ptr* and store the data in *dst*.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_probe_read_user)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 112;
+
+/*
+ * bpf_probe_read_kernel
+ *
+ * 	Safely attempt to read *size* bytes from kernel space address
+ * 	*unsafe_ptr* and store the data in *dst*.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_probe_read_kernel)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 113;
+
+/*
+ * bpf_probe_read_user_str
+ *
+ * 	Copy a NUL terminated string from an unsafe user address
+ * 	*unsafe_ptr* to *dst*. The *size* should include the
+ * 	terminating NUL byte. In case the string length is smaller than
+ * 	*size*, the target is not padded with further NUL bytes. If the
+ * 	string length is larger than *size*, just *size*-1 bytes are
+ * 	copied and the last byte is set to NUL.
+ *
+ * 	On success, the length of the copied string is returned. This
+ * 	makes this helper useful in tracing programs for reading
+ * 	strings, and more importantly to get their length at runtime. See
+ * 	the following snippet:
+ *
+ * 	::
+ *
+ * 		SEC("kprobe/sys_open")
+ * 		void bpf_sys_open(struct pt_regs *ctx)
+ * 		{
+ * 		        char buf[PATHLEN]; // PATHLEN is defined to 256
+ * 		        int res = bpf_probe_read_user_str(buf, sizeof(buf),
+ * 			                                  ctx->di);
+ *
+ * 			// Consume buf, for example push it to
+ * 			// userspace via bpf_perf_event_output(); we
+ * 			// can use res (the string length) as event
+ * 			// size, after checking its boundaries.
+ * 		}
+ *
+ * 	In comparison, using the **bpf_probe_read_user**\ () helper here
+ * 	instead to read the string would require estimating the length
+ * 	at compile time, and would often result in copying more memory
+ * 	than necessary.
+ *
+ * 	Another useful use case is parsing individual process
+ * 	arguments or individual environment variables by navigating
+ * 	*current*\ **->mm->arg_start** and *current*\
+ * 	**->mm->env_start**: using this helper and the return value,
+ * 	one can quickly iterate at the right offset of the memory area.
+ *
+ * Returns
+ * 	On success, the strictly positive length of the string,
+ * 	including the trailing NUL character. On error, a negative
+ * 	value.
+ */
+static long (*bpf_probe_read_user_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 114;
+
+/*
+ * bpf_probe_read_kernel_str
+ *
+ * 	Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
+ * 	to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply.
+ *
+ * Returns
+ * 	On success, the strictly positive length of the string, including
+ * 	the trailing NUL character. On error, a negative value.
+ */
+static long (*bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 115;
+
+/*
+ * bpf_tcp_send_ack
+ *
+ * 	Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**.
+ * 	*rcv_nxt* is the ack_seq to be sent out.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_tcp_send_ack)(void *tp, __u32 rcv_nxt) = (void *) 116;
+
+/*
+ * bpf_send_signal_thread
+ *
+ * 	Send signal *sig* to the thread corresponding to the current task.
+ *
+ * Returns
+ * 	0 on success or successfully queued.
+ *
+ * 	**-EBUSY** if the work queue under NMI is full.
+ *
+ * 	**-EINVAL** if *sig* is invalid.
+ *
+ * 	**-EPERM** if no permission to send the *sig*.
+ *
+ * 	**-EAGAIN** if the bpf program can try again.
+ */
+static long (*bpf_send_signal_thread)(__u32 sig) = (void *) 117;
+
+/*
+ * bpf_jiffies64
+ *
+ * 	Obtain the 64-bit jiffies.
+ *
+ * Returns
+ * 	The 64-bit jiffies.
+ */
+static __u64 (*bpf_jiffies64)(void) = (void *) 118;
+
+/*
+ * bpf_read_branch_records
+ *
+ * 	For an eBPF program attached to a perf event, retrieve the
+ * 	branch records (**struct perf_branch_entry**) associated to *ctx*
+ * 	and store it in the buffer pointed by *buf* up to size
+ * 	*size* bytes.
+ *
+ * Returns
+ * 	On success, number of bytes written to *buf*. On error, a
+ * 	negative value.
+ *
+ * 	The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
+ * 	instead return the number of bytes required to store all the
+ * 	branch entries. If this flag is set, *buf* may be NULL.
+ *
+ * 	**-EINVAL** if arguments invalid or **size** not a multiple
+ * 	of **sizeof**\ (**struct perf_branch_entry**\ ).
+ *
+ * 	**-ENOENT** if architecture does not support branch records.
+ */
+static long (*bpf_read_branch_records)(struct bpf_perf_event_data *ctx, void *buf, __u32 size, __u64 flags) = (void *) 119;
+
+/*
+ * bpf_get_ns_current_pid_tgid
+ *
+ * 	On success, the values for *pid* and *tgid* as seen from the
+ * 	current *namespace* are returned in *nsdata*.
+ *
+ * Returns
+ * 	0 on success, or one of the following in case of failure:
+ *
+ * 	**-EINVAL** if the *dev* and *inum* supplied don't match the dev_t
+ * 	and inode number with the nsfs of the current task, or if the *dev*
+ * 	conversion to dev_t lost high bits.
+ *
+ * 	**-ENOENT** if the pidns does not exist for the current task.
+ */
+static long (*bpf_get_ns_current_pid_tgid)(__u64 dev, __u64 ino, struct bpf_pidns_info *nsdata, __u32 size) = (void *) 120;
+
+/*
+ * bpf_xdp_output
+ *
+ * 	Write raw *data* blob into a special BPF perf event held by
+ * 	*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 	event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 	as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 	**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 	The *flags* are used to indicate the index in *map* for which
+ * 	the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 	Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 	to indicate that the index of the current CPU core should be
+ * 	used.
+ *
+ * 	The value to write, of *size*, is passed through the eBPF stack
+ * 	and pointed to by *data*.
+ *
+ * 	*ctx* is a pointer to the in-kernel **struct xdp_buff**.
+ *
+ * 	This helper is similar to **bpf_perf_event_output**\ () but
+ * 	restricted to raw_tracepoint bpf programs.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_xdp_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 121;
+
+/*
+ * bpf_get_netns_cookie
+ *
+ * 	Retrieve the cookie (generated by the kernel) of the network
+ * 	namespace the input *ctx* is associated with. The network
+ * 	namespace cookie remains stable for its lifetime and provides
+ * 	a global identifier that can be assumed unique. If *ctx* is
+ * 	NULL, then the helper returns the cookie for the initial
+ * 	network namespace. The cookie itself is very similar to that
+ * 	of **bpf_get_socket_cookie**\ () helper, but for network
+ * 	namespaces instead of sockets.
+ *
+ * Returns
+ * 	An 8-byte long opaque number.
+ */
+static __u64 (*bpf_get_netns_cookie)(void *ctx) = (void *) 122;
+
+/*
+ * bpf_get_current_ancestor_cgroup_id
+ *
+ * 	Return the id of the cgroup v2 that is an ancestor of the cgroup
+ * 	associated with the current task at the *ancestor_level*. The root
+ * 	cgroup is at *ancestor_level* zero and each step down the hierarchy
+ * 	increments the level. If *ancestor_level* == level of the cgroup
+ * 	associated with the current task, then the return value will be
+ * 	the same as that of **bpf_get_current_cgroup_id**\ ().
+ *
+ * 	The helper is useful to implement policies based on cgroups
+ * 	that are higher in the hierarchy than the immediate cgroup
+ * 	associated with the current task.
+ *
+ * 	The format of the returned id and the helper limitations are the
+ * 	same as in **bpf_get_current_cgroup_id**\ ().
+ *
+ * Returns
+ * 	The id is returned or 0 in case the id could not be retrieved.
+ */
+static __u64 (*bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void *) 123;
+
+/*
+ * bpf_sk_assign
+ *
+ * 	The helper is overloaded depending on the BPF program type. This
+ * 	description applies to **BPF_PROG_TYPE_SCHED_CLS** and
+ * 	**BPF_PROG_TYPE_SCHED_ACT** programs.
+ *
+ * 	Assign the *sk* to the *skb*. When combined with appropriate
+ * 	routing configuration to receive the packet towards the socket,
+ * 	this will cause *skb* to be delivered to the specified socket.
+ * 	Subsequent redirection of *skb* via **bpf_redirect**\ (),
+ * 	**bpf_clone_redirect**\ () or other methods outside of BPF may
+ * 	interfere with successful delivery to the socket.
+ *
+ * 	This operation is only valid from the TC ingress path.
+ *
+ * 	The *flags* argument must be zero.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure:
+ *
+ * 	**-EINVAL** if specified *flags* are not supported.
+ *
+ * 	**-ENOENT** if the socket is unavailable for assignment.
+ *
+ * 	**-ENETUNREACH** if the socket is unreachable (wrong netns).
+ *
+ * 	**-EOPNOTSUPP** if the operation is not supported, for example
+ * 	a call from outside of TC ingress.
+ *
+ * 	**-ESOCKTNOSUPPORT** if the socket type is not supported
+ * 	(reuseport).
+ */
+static long (*bpf_sk_assign)(void *ctx, struct bpf_sock *sk, __u64 flags) = (void *) 124;
+
+/*
+ * bpf_ktime_get_boot_ns
+ *
+ * 	Return the time elapsed since system boot, in nanoseconds.
+ * 	Does include the time the system was suspended.
+ * 	See: **clock_gettime**\ (**CLOCK_BOOTTIME**)
+ *
+ * Returns
+ * 	Current *ktime*.
+ */
+static __u64 (*bpf_ktime_get_boot_ns)(void) = (void *) 125;
+
+/*
+ * bpf_seq_printf
+ *
+ * 	**bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
+ * 	out the format string.
+ * 	The *m* represents the seq_file. The *fmt* and *fmt_size* are for
+ * 	the format string itself. The *data* and *data_len* are format string
+ * 	arguments. *data* is a **u64** array and the corresponding format string
+ * 	values are stored in the array. For strings and pointers whose pointees
+ * 	are accessed, only the pointer values are stored in the *data* array.
+ * 	The *data_len* is the size of *data* in bytes.
+ *
+ * 	The formats **%s** and **%p{i,I}{4,6}** require reading kernel memory.
+ * 	Reading kernel memory may fail due to either an invalid address or
+ * 	a valid address that requires a major memory fault. If reading kernel
+ * 	memory fails, the string for **%s** will be an empty string, and the ip
+ * 	address for **%p{i,I}{4,6}** will be 0. Not returning an error to the
+ * 	bpf program is consistent with what **bpf_trace_printk**\ () does for now.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure:
+ *
+ * 	**-EBUSY** if the per-CPU memory copy buffer is busy; the bpf
+ * 	program can try again by returning 1.
+ *
+ * 	**-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported.
+ *
+ * 	**-E2BIG** if *fmt* contains too many format specifiers.
+ *
+ * 	**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ */
+static long (*bpf_seq_printf)(struct seq_file *m, const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 126;
+
+/*
+ * bpf_seq_write
+ *
+ * 	**bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data.
+ * 	The *m* represents the seq_file. The *data* and *len* represent the
+ * 	data to write in bytes.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure:
+ *
+ * 	**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ */
+static long (*bpf_seq_write)(struct seq_file *m, const void *data, __u32 len) = (void *) 127;
+
+/*
+ * bpf_sk_cgroup_id
+ *
+ * 	Return the cgroup v2 id of the socket *sk*.
+ *
+ * 	*sk* must be a non-**NULL** pointer to a full socket, e.g. one
+ * 	returned from **bpf_sk_lookup_xxx**\ (),
+ * 	**bpf_sk_fullsock**\ (), etc. The format of the returned id is
+ * 	the same as in **bpf_skb_cgroup_id**\ ().
+ *
+ * 	This helper is available only if the kernel was compiled with
+ * 	the **CONFIG_SOCK_CGROUP_DATA** configuration option.
+ *
+ * Returns
+ * 	The id is returned or 0 in case the id could not be retrieved.
+ */
+static __u64 (*bpf_sk_cgroup_id)(struct bpf_sock *sk) = (void *) 128;
+
+/*
+ * bpf_sk_ancestor_cgroup_id
+ *
+ * 	Return the id of the cgroup v2 that is an ancestor of the cgroup
+ * 	associated with *sk* at the *ancestor_level*. The root cgroup is
+ * 	at *ancestor_level* zero and each step down the hierarchy
+ * 	increments the level. If *ancestor_level* == level of the cgroup
+ * 	associated with *sk*, then the return value will be the same as
+ * 	that of **bpf_sk_cgroup_id**\ ().
+ *
+ * 	The helper is useful to implement policies based on cgroups
+ * 	that are higher in the hierarchy than the immediate cgroup
+ * 	associated with *sk*.
+ *
+ * 	The format of the returned id and the helper limitations are the
+ * 	same as in **bpf_sk_cgroup_id**\ ().
+ *
+ * Returns
+ * 	The id is returned or 0 in case the id could not be retrieved.
+ */
+static __u64 (*bpf_sk_ancestor_cgroup_id)(struct bpf_sock *sk, int ancestor_level) = (void *) 129;
+
+/*
+ * bpf_ringbuf_output
+ *
+ * 	Copy *size* bytes from *data* into a ring buffer *ringbuf*.
+ * 	If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
+ * 	of new data availability is sent.
+ * 	If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
+ * 	of new data availability is sent unconditionally.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure.
+ */
+static long (*bpf_ringbuf_output)(void *ringbuf, void *data, __u64 size, __u64 flags) = (void *) 130;
+
+/*
+ * bpf_ringbuf_reserve
+ *
+ * 	Reserve *size* bytes of payload in a ring buffer *ringbuf*.
+ *
+ * Returns
+ * 	A valid pointer with *size* bytes of memory available; NULL
+ * 	otherwise.
+ */
+static void *(*bpf_ringbuf_reserve)(void *ringbuf, __u64 size, __u64 flags) = (void *) 131;
+
+/*
+ * bpf_ringbuf_submit
+ *
+ * 	Submit reserved ring buffer sample, pointed to by *data*.
+ * 	If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
+ * 	of new data availability is sent.
+ * 	If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
+ * 	of new data availability is sent unconditionally.
+ *
+ * Returns
+ * 	Nothing. Always succeeds.
+ */
+static void (*bpf_ringbuf_submit)(void *data, __u64 flags) = (void *) 132;
+
+/*
+ * bpf_ringbuf_discard
+ *
+ * 	Discard reserved ring buffer sample, pointed to by *data*.
+ * 	If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
+ * 	of new data availability is sent.
+ * 	If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
+ * 	of new data availability is sent unconditionally.
+ *
+ * Returns
+ * 	Nothing. Always succeeds.
+ */
+static void (*bpf_ringbuf_discard)(void *data, __u64 flags) = (void *) 133;
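+
+/*
+ * Example (editor's sketch, not part of the vendored header): the
+ * usual reserve/submit/discard pattern for a BPF_MAP_TYPE_RINGBUF
+ * map. The map ("events"), event struct and filter condition are
+ * hypothetical.
+ *
+ * 	struct event *e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);
+ * 	if (!e)
+ * 		return 0;
+ * 	e->pid = bpf_get_current_pid_tgid() >> 32;
+ * 	if (uninteresting)
+ * 		bpf_ringbuf_discard(e, 0);
+ * 	else
+ * 		bpf_ringbuf_submit(e, 0);
+ */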
+
+/*
+ * bpf_ringbuf_query
+ *
+ * 	Query various characteristics of the provided ring buffer. What
+ * 	exactly is queried is determined by *flags*:
+ *
+ * 	* **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed.
+ * 	* **BPF_RB_RING_SIZE**: The size of ring buffer.
+ * 	* **BPF_RB_CONS_POS**: Consumer position (can wrap around).
+ * 	* **BPF_RB_PROD_POS**: Producer(s) position (can wrap around).
+ *
+ * 	The data returned is just a momentary snapshot of the actual
+ * 	values and could be inaccurate, so this facility should be used
+ * 	to power heuristics and for reporting, not to make 100% correct
+ * 	calculations.
+ *
+ * Returns
+ * 	Requested value, or 0, if *flags* are not recognized.
+ */
+static __u64 (*bpf_ringbuf_query)(void *ringbuf, __u64 flags) = (void *) 134;
+
+/*
+ * bpf_csum_level
+ *
+ * 	Change the skb's checksum level by one layer up or down, or
+ * 	reset it entirely to none in order to have the stack perform
+ * 	checksum validation. The level is applicable to the following
+ * 	protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of
+ * 	| ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP |
+ * 	through **bpf_skb_adjust_room**\ () helper with passing in
+ * 	**BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call
+ * 	to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since
+ * 	the UDP header is removed. Similarly, an encap of the latter
+ * 	into the former could be accompanied by a helper call to
+ * 	**bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the
+ * 	skb is still intended to be processed in higher layers of the
+ * 	stack instead of just egressing at tc.
+ *
+ * 	There are three supported level settings at this time:
+ *
+ * 	* **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs
+ * 	  with CHECKSUM_UNNECESSARY.
+ * 	* **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs
+ * 	  with CHECKSUM_UNNECESSARY.
+ * 	* **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and
+ * 	  sets CHECKSUM_NONE to force checksum validation by the stack.
+ * 	* **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current
+ * 	  skb->csum_level.
+ *
+ * Returns
+ * 	0 on success, or a negative error in case of failure. In the
+ * 	case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
+ * 	is returned or the error code -EACCES in case the skb is not
+ * 	subject to CHECKSUM_UNNECESSARY.
+ */
+static long (*bpf_csum_level)(struct __sk_buff *skb, __u64 level) = (void *) 135;
+
+/*
+ * bpf_skc_to_tcp6_sock
+ *
+ * 	Dynamically cast a *sk* pointer to a *tcp6_sock* pointer.
+ *
+ * Returns
+ * 	*sk* if casting is valid, or NULL otherwise.
+ */
+static struct tcp6_sock *(*bpf_skc_to_tcp6_sock)(void *sk) = (void *) 136;
+
+/*
+ * bpf_skc_to_tcp_sock
+ *
+ * 	Dynamically cast a *sk* pointer to a *tcp_sock* pointer.
+ *
+ * Returns
+ * 	*sk* if casting is valid, or NULL otherwise.
+ */
+static struct tcp_sock *(*bpf_skc_to_tcp_sock)(void *sk) = (void *) 137;
+
+/*
+ * bpf_skc_to_tcp_timewait_sock
+ *
+ * 	Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer.
+ *
+ * Returns
+ * 	*sk* if casting is valid, or NULL otherwise.
+ */
+static struct tcp_timewait_sock *(*bpf_skc_to_tcp_timewait_sock)(void *sk) = (void *) 138;
+
+/*
+ * bpf_skc_to_tcp_request_sock
+ *
+ * 	Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer.
+ *
+ * Returns
+ * 	*sk* if casting is valid, or NULL otherwise.
+ */
+static struct tcp_request_sock *(*bpf_skc_to_tcp_request_sock)(void *sk) = (void *) 139;
+
+/*
+ * bpf_skc_to_udp6_sock
+ *
+ * 	Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
+ *
+ * Returns
+ * 	*sk* if casting is valid, or NULL otherwise.
+ */
+static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140;
+
+/*
+ * bpf_get_task_stack
+ *
+ * 	Return a user or a kernel stack in a bpf program provided buffer.
+ * 	To achieve this, the helper needs *task*, which is a valid
+ * 	pointer to a **struct task_struct**. To store the stacktrace, the
+ * 	bpf program provides *buf* with a non-negative *size*.
+ *
+ * 	The last argument, *flags*, holds the number of stack frames to
+ * 	skip (from 0 to 255), masked with
+ * 	**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 	the following flags:
+ *
+ * 	**BPF_F_USER_STACK**
+ * 		Collect a user space stack instead of a kernel stack.
+ * 	**BPF_F_USER_BUILD_ID**
+ * 		Collect buildid+offset instead of ips for user stack,
+ * 		only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * 	**bpf_get_task_stack**\ () can collect up to
+ * 	**PERF_MAX_STACK_DEPTH** kernel and user frames, subject
+ * 	to a sufficiently large buffer size. Note that
+ * 	this limit can be controlled with the **sysctl** program, and
+ * 	that it should be manually increased in order to profile long
+ * 	user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 	::
+ *
+ * 		# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * Returns
+ * 	A non-negative value equal to or less than *size* on success,
+ * 	or a negative error in case of failure.
+ */
+static long (*bpf_get_task_stack)(struct task_struct *task, void *buf, __u32 size, __u64 flags) = (void *) 141;
+
+

+ 80 - 0
vendor/github.com/cilium/ebpf/examples/headers/bpf_helpers.h

@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __BPF_HELPERS__
+#define __BPF_HELPERS__
+
+/*
+ * Note that bpf programs need to include either
+ * vmlinux.h (auto-generated from BTF) or linux/types.h
+ * in advance since bpf_helper_defs.h uses such types
+ * as __u64.
+ */
+#include "bpf_helper_defs.h"
+
+#define __uint(name, val) int (*name)[val]
+#define __type(name, val) typeof(val) *name
+#define __array(name, val) typeof(val) *name[]
+
+/* Helper macro to print out debug messages */
+#define bpf_printk(fmt, ...)				\
+({							\
+	char ____fmt[] = fmt;				\
+	bpf_trace_printk(____fmt, sizeof(____fmt),	\
+			 ##__VA_ARGS__);		\
+})
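+
+/*
+ * Example (editor's note): bpf_printk("pid %d\n", pid); the output
+ * appears in /sys/kernel/debug/tracing/trace_pipe and is intended for
+ * debugging only.
+ */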
+
+/*
+ * Helper macro to place programs, maps, and license in
+ * different sections of the elf_bpf file. Section names
+ * are interpreted by the elf_bpf loader.
+ */
+#define SEC(NAME) __attribute__((section(NAME), used))
+
+#ifndef __always_inline
+#define __always_inline __attribute__((always_inline))
+#endif
+#ifndef __weak
+#define __weak __attribute__((weak))
+#endif
+
+/*
+ * Helper macro to manipulate data structures
+ */
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER)  __builtin_offsetof(TYPE, MEMBER)
+#endif
+#ifndef container_of
+#define container_of(ptr, type, member)				\
+	({							\
+		void *__mptr = (void *)(ptr);			\
+		((type *)(__mptr - offsetof(type, member)));	\
+	})
+#endif
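+
+/*
+ * Example (editor's note, illustrative names): recover the enclosing
+ * struct from a pointer to one of its members.
+ *
+ * 	struct event_t *e = container_of(str_ptr, struct event_t, str);
+ */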
+
+/*
+ * Helper structure used by an eBPF C program
+ * to describe BPF map attributes to the libbpf loader
+ */
+struct bpf_map_def {
+	unsigned int type;
+	unsigned int key_size;
+	unsigned int value_size;
+	unsigned int max_entries;
+	unsigned int map_flags;
+};
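+
+/*
+ * Example (editor's note): a one-element array map declared with this
+ * struct, mirroring the kprobe example shipped in this vendor tree;
+ * the map name is illustrative.
+ *
+ * 	struct bpf_map_def SEC("maps") counters = {
+ * 		.type        = BPF_MAP_TYPE_ARRAY,
+ * 		.key_size    = sizeof(__u32),
+ * 		.value_size  = sizeof(__u64),
+ * 		.max_entries = 1,
+ * 	};
+ */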
+
+enum libbpf_pin_type {
+	LIBBPF_PIN_NONE,
+	/* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */
+	LIBBPF_PIN_BY_NAME,
+};
+
+enum libbpf_tristate {
+	TRI_NO = 0,
+	TRI_YES = 1,
+	TRI_MODULE = 2,
+};
+
+#define __kconfig __attribute__((section(".kconfig")))
+#define __ksym __attribute__((section(".ksyms")))
+
+#endif

+ 107 - 0
vendor/github.com/cilium/ebpf/examples/headers/common.h

@@ -0,0 +1,107 @@
+// This is a compact version of `vmlinux.h` to be used in the examples using C code.
+
+#ifndef __VMLINUX_H__
+#define __VMLINUX_H__
+
+typedef unsigned char __u8;
+typedef short int __s16;
+typedef short unsigned int __u16;
+typedef int __s32;
+typedef unsigned int __u32;
+typedef long long int __s64;
+typedef long long unsigned int __u64;
+typedef __u8 u8;
+typedef __s16 s16;
+typedef __u16 u16;
+typedef __s32 s32;
+typedef __u32 u32;
+typedef __s64 s64;
+typedef __u64 u64;
+typedef __u16 __le16;
+typedef __u16 __be16;
+typedef __u32 __be32;
+typedef __u64 __be64;
+typedef __u32 __wsum;
+
+enum bpf_map_type {
+	BPF_MAP_TYPE_UNSPEC = 0,
+	BPF_MAP_TYPE_HASH = 1,
+	BPF_MAP_TYPE_ARRAY = 2,
+	BPF_MAP_TYPE_PROG_ARRAY = 3,
+	BPF_MAP_TYPE_PERF_EVENT_ARRAY = 4,
+	BPF_MAP_TYPE_PERCPU_HASH = 5,
+	BPF_MAP_TYPE_PERCPU_ARRAY = 6,
+	BPF_MAP_TYPE_STACK_TRACE = 7,
+	BPF_MAP_TYPE_CGROUP_ARRAY = 8,
+	BPF_MAP_TYPE_LRU_HASH = 9,
+	BPF_MAP_TYPE_LRU_PERCPU_HASH = 10,
+	BPF_MAP_TYPE_LPM_TRIE = 11,
+	BPF_MAP_TYPE_ARRAY_OF_MAPS = 12,
+	BPF_MAP_TYPE_HASH_OF_MAPS = 13,
+	BPF_MAP_TYPE_DEVMAP = 14,
+	BPF_MAP_TYPE_SOCKMAP = 15,
+	BPF_MAP_TYPE_CPUMAP = 16,
+	BPF_MAP_TYPE_XSKMAP = 17,
+	BPF_MAP_TYPE_SOCKHASH = 18,
+	BPF_MAP_TYPE_CGROUP_STORAGE = 19,
+	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY = 20,
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE = 21,
+	BPF_MAP_TYPE_QUEUE = 22,
+	BPF_MAP_TYPE_STACK = 23,
+	BPF_MAP_TYPE_SK_STORAGE = 24,
+	BPF_MAP_TYPE_DEVMAP_HASH = 25,
+	BPF_MAP_TYPE_STRUCT_OPS = 26,
+	BPF_MAP_TYPE_RINGBUF = 27,
+	BPF_MAP_TYPE_INODE_STORAGE = 28,
+};
+
+enum {
+	BPF_ANY = 0,
+	BPF_NOEXIST = 1,
+	BPF_EXIST = 2,
+	BPF_F_LOCK = 4,
+};
+
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
+#define BPF_F_INDEX_MASK 0xffffffffULL
+#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK
+
+#define PT_REGS_RC(x) ((x)->rax)
+struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+	unsigned long r15;
+	unsigned long r14;
+	unsigned long r13;
+	unsigned long r12;
+	unsigned long rbp;
+	unsigned long rbx;
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+	unsigned long r11;
+	unsigned long r10;
+	unsigned long r9;
+	unsigned long r8;
+	unsigned long rax;
+	unsigned long rcx;
+	unsigned long rdx;
+	unsigned long rsi;
+	unsigned long rdi;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+	unsigned long orig_rax;
+/* Return frame for iretq */
+	unsigned long rip;
+	unsigned long cs;
+	unsigned long eflags;
+	unsigned long rsp;
+	unsigned long ss;
+/* top of stack page */
+};
+
+#endif /* __VMLINUX_H__ */

+ 26 - 0
vendor/github.com/cilium/ebpf/examples/kprobe/bpf/kprobe_example.c

@@ -0,0 +1,26 @@
+#include "common.h"
+#include "bpf_helpers.h"
+
+char __license[] SEC("license") = "Dual MIT/GPL";
+
+struct bpf_map_def SEC("maps") kprobe_map = {
+    .type = BPF_MAP_TYPE_ARRAY,
+    .key_size = sizeof(u32),
+    .value_size = sizeof(u64),
+    .max_entries = 1,
+};
+
+SEC("kprobe/__x64_sys_execve")
+int kprobe_execve() {
+    u32 key = 0;
+    u64 initval = 1, *valp;
+
+    valp = bpf_map_lookup_elem(&kprobe_map, &key);
+    if (!valp) {
+        bpf_map_update_elem(&kprobe_map, &key, &initval, BPF_ANY);
+        return 0;
+    }
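+    /* The counter exists: bump it atomically, since the probe may
+     * fire concurrently on multiple CPUs. */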
+    __sync_fetch_and_add(valp, 1);
+
+    return 0;
+}

+ 25 - 0
vendor/github.com/cilium/ebpf/examples/uprobe/bpf/uprobe_example.c

@@ -0,0 +1,25 @@
+#include "common.h"
+#include "bpf_helpers.h"
+
+char __license[] SEC("license") = "Dual MIT/GPL";
+
+struct event_t {
+	u32 pid;
+	char str[80];
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+} events SEC(".maps");
+
+SEC("uprobe/bash_readline")
+int uprobe_bash_readline(struct pt_regs *ctx) {
+	struct event_t event;
+
+	event.pid = bpf_get_current_pid_tgid();
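+	/* PT_REGS_RC(ctx) is rax on x86-64 (see common.h); with a
+	 * return probe on bash's readline() it holds the line read. */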
+	bpf_probe_read(&event.str, sizeof(event.str), (void *)PT_REGS_RC(ctx));
+
+	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event));
+
+	return 0;
+}

+ 6 - 2
vendor/github.com/cilium/ebpf/go.mod

@@ -1,5 +1,9 @@
 module github.com/cilium/ebpf
 
-go 1.13
+go 1.15
 
-require golang.org/x/sys v0.0.0-20200124204421-9fbb57f87de9
+require (
+	github.com/frankban/quicktest v1.11.3
+	github.com/google/go-cmp v0.5.4
+	golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c
+)

+ 239 - 0
vendor/github.com/cilium/ebpf/info.go

@@ -0,0 +1,239 @@
+package ebpf
+
+import (
+	"bufio"
+	"encoding/hex"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/cilium/ebpf/internal"
+)
+
+// MapInfo describes a map.
+type MapInfo struct {
+	Type       MapType
+	id         MapID
+	KeySize    uint32
+	ValueSize  uint32
+	MaxEntries uint32
+	Flags      uint32
+	// Name as supplied by user space at load time.
+	Name string
+}
+
+func newMapInfoFromFd(fd *internal.FD) (*MapInfo, error) {
+	info, err := bpfGetMapInfoByFD(fd)
+	if errors.Is(err, syscall.EINVAL) {
+		return newMapInfoFromProc(fd)
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	return &MapInfo{
+		MapType(info.map_type),
+		MapID(info.id),
+		info.key_size,
+		info.value_size,
+		info.max_entries,
+		info.map_flags,
+		// name is available from 4.15.
+		internal.CString(info.name[:]),
+	}, nil
+}
+
+func newMapInfoFromProc(fd *internal.FD) (*MapInfo, error) {
+	var mi MapInfo
+	err := scanFdInfo(fd, map[string]interface{}{
+		"map_type":    &mi.Type,
+		"key_size":    &mi.KeySize,
+		"value_size":  &mi.ValueSize,
+		"max_entries": &mi.MaxEntries,
+		"map_flags":   &mi.Flags,
+	})
+	if err != nil {
+		return nil, err
+	}
+	return &mi, nil
+}
+
+// ID returns the map ID.
+//
+// Available from 4.13.
+//
+// The bool return value indicates whether this optional field is available.
+func (mi *MapInfo) ID() (MapID, bool) {
+	return mi.id, mi.id > 0
+}
+
+// programStats holds statistics of a program.
+type programStats struct {
+	// Total accumulated runtime of the program in ns.
+	runtime time.Duration
+	// Total number of times the program was called.
+	runCount uint64
+}
+
+// ProgramInfo describes a program.
+type ProgramInfo struct {
+	Type ProgramType
+	id   ProgramID
+	// Truncated hash of the BPF bytecode.
+	Tag string
+	// Name as supplied by user space at load time.
+	Name string
+
+	stats *programStats
+}
+
+func newProgramInfoFromFd(fd *internal.FD) (*ProgramInfo, error) {
+	info, err := bpfGetProgInfoByFD(fd)
+	if errors.Is(err, syscall.EINVAL) {
+		return newProgramInfoFromProc(fd)
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	return &ProgramInfo{
+		Type: ProgramType(info.prog_type),
+		id:   ProgramID(info.id),
+		// tag is available if the kernel supports BPF_PROG_GET_INFO_BY_FD.
+		Tag: hex.EncodeToString(info.tag[:]),
+		// name is available from 4.15.
+		Name: internal.CString(info.name[:]),
+		stats: &programStats{
+			runtime:  time.Duration(info.run_time_ns),
+			runCount: info.run_cnt,
+		},
+	}, nil
+}
+
+func newProgramInfoFromProc(fd *internal.FD) (*ProgramInfo, error) {
+	var info ProgramInfo
+	err := scanFdInfo(fd, map[string]interface{}{
+		"prog_type": &info.Type,
+		"prog_tag":  &info.Tag,
+	})
+	if errors.Is(err, errMissingFields) {
+		return nil, &internal.UnsupportedFeatureError{
+			Name:           "reading program info from /proc/self/fdinfo",
+			MinimumVersion: internal.Version{4, 10, 0},
+		}
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	return &info, nil
+}
+
+// ID returns the program ID.
+//
+// Available from 4.13.
+//
+// The bool return value indicates whether this optional field is available.
+func (pi *ProgramInfo) ID() (ProgramID, bool) {
+	return pi.id, pi.id > 0
+}
+
+// RunCount returns the total number of times the program was called.
+//
+// Can return 0 if the collection of statistics is not enabled. See EnableStats().
+// The bool return value indicates whether this optional field is available.
+func (pi *ProgramInfo) RunCount() (uint64, bool) {
+	if pi.stats != nil {
+		return pi.stats.runCount, true
+	}
+	return 0, false
+}
+
+// Runtime returns the total accumulated runtime of the program.
+//
+// Can return 0 if the collection of statistics is not enabled. See EnableStats().
+// The bool return value indicates whether this optional field is available.
+func (pi *ProgramInfo) Runtime() (time.Duration, bool) {
+	if pi.stats != nil {
+		return pi.stats.runtime, true
+	}
+	return time.Duration(0), false
+}
+
+func scanFdInfo(fd *internal.FD, fields map[string]interface{}) error {
+	raw, err := fd.Value()
+	if err != nil {
+		return err
+	}
+
+	fh, err := os.Open(fmt.Sprintf("/proc/self/fdinfo/%d", raw))
+	if err != nil {
+		return err
+	}
+	defer fh.Close()
+
+	if err := scanFdInfoReader(fh, fields); err != nil {
+		return fmt.Errorf("%s: %w", fh.Name(), err)
+	}
+	return nil
+}
+
+var errMissingFields = errors.New("missing fields")
+
+func scanFdInfoReader(r io.Reader, fields map[string]interface{}) error {
+	var (
+		scanner = bufio.NewScanner(r)
+		scanned int
+	)
+
+	for scanner.Scan() {
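+		// fdinfo lines have the form "key:\tvalue".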
+		parts := strings.SplitN(scanner.Text(), "\t", 2)
+		if len(parts) != 2 {
+			continue
+		}
+
+		name := strings.TrimSuffix(parts[0], ":")
+		field, ok := fields[string(name)]
+		if !ok {
+			continue
+		}
+
+		if n, err := fmt.Sscanln(parts[1], field); err != nil || n != 1 {
+			return fmt.Errorf("can't parse field %s: %v", name, err)
+		}
+
+		scanned++
+	}
+
+	if err := scanner.Err(); err != nil {
+		return err
+	}
+
+	if scanned != len(fields) {
+		return errMissingFields
+	}
+
+	return nil
+}
+
+// EnableStats starts the measuring of the runtime
+// and run counts of eBPF programs.
+//
+// Collecting statistics can have an impact on performance.
+//
+// Requires at least 5.8.
+func EnableStats(which uint32) (io.Closer, error) {
+	attr := internal.BPFEnableStatsAttr{
+		StatsType: which,
+	}
+
+	fd, err := internal.BPFEnableStats(&attr)
+	if err != nil {
+		return nil, err
+	}
+	return fd, nil
+}

+ 175 - 100
vendor/github.com/cilium/ebpf/internal/btf/btf.go

@@ -29,12 +29,14 @@ var (
 
 // Spec represents decoded BTF.
 type Spec struct {
-	rawTypes  []rawType
-	strings   stringTable
-	types     map[string][]Type
-	funcInfos map[string]extInfo
-	lineInfos map[string]extInfo
-	byteOrder binary.ByteOrder
+	rawTypes   []rawType
+	strings    stringTable
+	types      []Type
+	namedTypes map[string][]namedType
+	funcInfos  map[string]extInfo
+	lineInfos  map[string]extInfo
+	coreRelos  map[string]bpfCoreRelos
+	byteOrder  binary.ByteOrder
 }
 
 type btfHeader struct {
@@ -53,35 +55,15 @@ type btfHeader struct {
 //
 // Returns a nil Spec and no error if no BTF was present.
 func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) {
-	file, err := elf.NewFile(rd)
+	file, err := internal.NewSafeELFFile(rd)
 	if err != nil {
 		return nil, err
 	}
 	defer file.Close()
 
-	var (
-		btfSection    *elf.Section
-		btfExtSection *elf.Section
-		sectionSizes  = make(map[string]uint32)
-	)
-
-	for _, sec := range file.Sections {
-		switch sec.Name {
-		case ".BTF":
-			btfSection = sec
-		case ".BTF.ext":
-			btfExtSection = sec
-		default:
-			if sec.Type != elf.SHT_PROGBITS && sec.Type != elf.SHT_NOBITS {
-				break
-			}
-
-			if sec.Size > math.MaxUint32 {
-				return nil, fmt.Errorf("section %s exceeds maximum size", sec.Name)
-			}
-
-			sectionSizes[sec.Name] = uint32(sec.Size)
-		}
+	btfSection, btfExtSection, sectionSizes, err := findBtfSections(file)
+	if err != nil {
+		return nil, err
 	}
 
 	if btfSection == nil {
@@ -100,6 +82,10 @@ func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) {
 			continue
 		}
 
+		if int(symbol.Section) >= len(file.Sections) {
+			return nil, fmt.Errorf("symbol %s: invalid section %d", symbol.Name, symbol.Section)
+		}
+
 		secName := file.Sections[symbol.Section].Name
 		if _, ok := sectionSizes[secName]; !ok {
 			continue
@@ -121,7 +107,7 @@ func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) {
 		return spec, nil
 	}
 
-	spec.funcInfos, spec.lineInfos, err = parseExtInfos(btfExtSection.Open(), file.ByteOrder, spec.strings)
+	spec.funcInfos, spec.lineInfos, spec.coreRelos, err = parseExtInfos(btfExtSection.Open(), file.ByteOrder, spec.strings)
 	if err != nil {
 		return nil, fmt.Errorf("can't read ext info: %w", err)
 	}
@@ -129,6 +115,51 @@ func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) {
 	return spec, nil
 }
 
+func findBtfSections(file *internal.SafeELFFile) (*elf.Section, *elf.Section, map[string]uint32, error) {
+	var (
+		btfSection    *elf.Section
+		btfExtSection *elf.Section
+		sectionSizes  = make(map[string]uint32)
+	)
+
+	for _, sec := range file.Sections {
+		switch sec.Name {
+		case ".BTF":
+			btfSection = sec
+		case ".BTF.ext":
+			btfExtSection = sec
+		default:
+			if sec.Type != elf.SHT_PROGBITS && sec.Type != elf.SHT_NOBITS {
+				break
+			}
+
+			if sec.Size > math.MaxUint32 {
+				return nil, nil, nil, fmt.Errorf("section %s exceeds maximum size", sec.Name)
+			}
+
+			sectionSizes[sec.Name] = uint32(sec.Size)
+		}
+	}
+	return btfSection, btfExtSection, sectionSizes, nil
+}
+
+func loadSpecFromVmlinux(rd io.ReaderAt) (*Spec, error) {
+	file, err := internal.NewSafeELFFile(rd)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	btfSection, _, _, err := findBtfSections(file)
+	if err != nil {
+		return nil, fmt.Errorf(".BTF ELF section: %s", err)
+	}
+	if btfSection == nil {
+		return nil, fmt.Errorf("unable to find .BTF ELF section")
+	}
+	return loadNakedSpec(btfSection.Open(), file.ByteOrder, nil, nil)
+}
+
 func loadNakedSpec(btf io.ReadSeeker, bo binary.ByteOrder, sectionSizes map[string]uint32, variableOffsets map[variable]uint32) (*Spec, error) {
 	rawTypes, rawStrings, err := parseBTF(btf, bo)
 	if err != nil {
@@ -140,16 +171,17 @@ func loadNakedSpec(btf io.ReadSeeker, bo binary.ByteOrder, sectionSizes map[stri
 		return nil, err
 	}
 
-	types, err := inflateRawTypes(rawTypes, rawStrings)
+	types, typesByName, err := inflateRawTypes(rawTypes, rawStrings)
 	if err != nil {
 		return nil, err
 	}
 
 	return &Spec{
-		rawTypes:  rawTypes,
-		types:     types,
-		strings:   rawStrings,
-		byteOrder: bo,
+		rawTypes:   rawTypes,
+		namedTypes: typesByName,
+		types:      types,
+		strings:    rawStrings,
+		byteOrder:  bo,
 	}, nil
 }
 
@@ -176,16 +208,43 @@ func LoadKernelSpec() (*Spec, error) {
 }
 
 func loadKernelSpec() (*Spec, error) {
+	release, err := unix.KernelRelease()
+	if err != nil {
+		return nil, fmt.Errorf("can't read kernel release number: %w", err)
+	}
+
 	fh, err := os.Open("/sys/kernel/btf/vmlinux")
-	if os.IsNotExist(err) {
-		return nil, fmt.Errorf("can't open kernel BTF at /sys/kernel/btf/vmlinux: %w", ErrNotFound)
+	if err == nil {
+		defer fh.Close()
+
+		return loadNakedSpec(fh, internal.NativeEndian, nil, nil)
 	}
-	if err != nil {
-		return nil, fmt.Errorf("can't read kernel BTF: %s", err)
+
+	// use same list of locations as libbpf
+	// https://github.com/libbpf/libbpf/blob/9a3a42608dbe3731256a5682a125ac1e23bced8f/src/btf.c#L3114-L3122
+	locations := []string{
+		"/boot/vmlinux-%s",
+		"/lib/modules/%s/vmlinux-%[1]s",
+		"/lib/modules/%s/build/vmlinux",
+		"/usr/lib/modules/%s/kernel/vmlinux",
+		"/usr/lib/debug/boot/vmlinux-%s",
+		"/usr/lib/debug/boot/vmlinux-%s.debug",
+		"/usr/lib/debug/lib/modules/%s/vmlinux",
 	}
-	defer fh.Close()
 
-	return loadNakedSpec(fh, internal.NativeEndian, nil, nil)
+	for _, loc := range locations {
+		path := fmt.Sprintf(loc, release)
+
+		fh, err := os.Open(path)
+		if err != nil {
+			continue
+		}
+		defer fh.Close()
+
+		return loadSpecFromVmlinux(fh)
+	}
+
+	return nil, fmt.Errorf("no BTF for kernel version %s: %w", release, internal.ErrNotSupported)
 }
 
 func parseBTF(btf io.ReadSeeker, bo binary.ByteOrder) ([]rawType, stringTable, error) {
@@ -259,10 +318,14 @@ func fixupDatasec(rawTypes []rawType, rawStrings stringTable, sectionSizes map[s
 			return err
 		}
 
-		if name == ".kconfig" || name == ".ksym" {
+		if name == ".kconfig" || name == ".ksyms" {
 			return fmt.Errorf("reference to %s: %w", name, ErrNotSupported)
 		}
 
+		if rawTypes[i].SizeType != 0 {
+			continue
+		}
+
 		size, ok := sectionSizes[name]
 		if !ok {
 			return fmt.Errorf("data section %s: missing size", name)
@@ -369,54 +432,19 @@ func (s *Spec) Program(name string, length uint64) (*Program, error) {
 		return nil, errors.New("length musn't be zero")
 	}
 
-	if s.funcInfos == nil && s.lineInfos == nil {
+	if s.funcInfos == nil && s.lineInfos == nil && s.coreRelos == nil {
 		return nil, fmt.Errorf("BTF for section %s: %w", name, ErrNoExtendedInfo)
 	}
 
 	funcInfos, funcOK := s.funcInfos[name]
 	lineInfos, lineOK := s.lineInfos[name]
+	coreRelos, coreOK := s.coreRelos[name]
 
-	if !funcOK && !lineOK {
+	if !funcOK && !lineOK && !coreOK {
 		return nil, fmt.Errorf("no extended BTF info for section %s", name)
 	}
 
-	return &Program{s, length, funcInfos, lineInfos}, nil
-}
-
-// Map finds the BTF for a map.
-//
-// Returns an error if there is no BTF for the given name.
-func (s *Spec) Map(name string) (*Map, []Member, error) {
-	var mapVar Var
-	if err := s.FindType(name, &mapVar); err != nil {
-		return nil, nil, err
-	}
-
-	mapStruct, ok := mapVar.Type.(*Struct)
-	if !ok {
-		return nil, nil, fmt.Errorf("expected struct, have %s", mapVar.Type)
-	}
-
-	var key, value Type
-	for _, member := range mapStruct.Members {
-		switch member.Name {
-		case "key":
-			key = member.Type
-
-		case "value":
-			value = member.Type
-		}
-	}
-
-	if key == nil {
-		key = (*Void)(nil)
-	}
-
-	if value == nil {
-		value = (*Void)(nil)
-	}
-
-	return &Map{s, key, value}, mapStruct.Members, nil
+	return &Program{s, length, funcInfos, lineInfos, coreRelos}, nil
 }
 
 // Datasec returns the BTF required to create maps which represent data sections.
@@ -426,7 +454,8 @@ func (s *Spec) Datasec(name string) (*Map, error) {
 		return nil, fmt.Errorf("data section %s: can't get BTF: %w", name, err)
 	}
 
-	return &Map{s, &Void{}, &datasec}, nil
+	m := NewMap(s, &Void{}, &datasec)
+	return &m, nil
 }
 
 // FindType searches for a type with a specific name.
@@ -441,11 +470,16 @@ func (s *Spec) FindType(name string, typ Type) error {
 		candidate Type
 	)
 
-	for _, typ := range s.types[name] {
+	for _, typ := range s.namedTypes[essentialName(name)] {
 		if reflect.TypeOf(typ) != wanted {
 			continue
 		}
 
+		// Match against the full name, not just the essential one.
+		if typ.name() != name {
+			continue
+		}
+
 		if candidate != nil {
 			return fmt.Errorf("type %s: multiple candidates for %T", name, typ)
 		}
@@ -532,6 +566,23 @@ type Map struct {
 	key, value Type
 }
 
+// NewMap returns a new Map containing the given values.
+// The key and value arguments are initialized to Void if nil values are given.
+func NewMap(spec *Spec, key Type, value Type) Map {
+	if key == nil {
+		key = &Void{}
+	}
+	if value == nil {
+		value = &Void{}
+	}
+
+	return Map{
+		spec:  spec,
+		key:   key,
+		value: value,
+	}
+}
+
 // MapSpec should be a method on Map, but is a free function
 // to hide it from users of the ebpf package.
 func MapSpec(m *Map) *Spec {
@@ -555,6 +606,7 @@ type Program struct {
 	spec                 *Spec
 	length               uint64
 	funcInfos, lineInfos extInfo
+	coreRelos            bpfCoreRelos
 }
 
 // ProgramSpec returns the Spec needed for loading function and line infos into the kernel.
@@ -580,9 +632,10 @@ func ProgramAppend(s, other *Program) error {
 		return fmt.Errorf("line infos: %w", err)
 	}
 
-	s.length += other.length
 	s.funcInfos = funcInfos
 	s.lineInfos = lineInfos
+	s.coreRelos = s.coreRelos.append(other.coreRelos, s.length)
+	s.length += other.length
 	return nil
 }
 
@@ -612,6 +665,19 @@ func ProgramLineInfos(s *Program) (recordSize uint32, bytes []byte, err error) {
 	return s.lineInfos.recordSize, bytes, nil
 }
 
+// ProgramRelocations returns the CO-RE relocations required to adjust the
+// program to the target.
+//
+// This is a free function instead of a method to hide it from users
+// of package ebpf.
+func ProgramRelocations(s *Program, target *Spec) (map[uint64]Relocation, error) {
+	if len(s.coreRelos) == 0 {
+		return nil, nil
+	}
+
+	return coreRelocate(s.spec, target, s.coreRelos)
+}
+
 type bpfLoadBTFAttr struct {
 	btf         internal.Pointer
 	logBuf      internal.Pointer
@@ -621,9 +687,7 @@ type bpfLoadBTFAttr struct {
 }
 
 func bpfLoadBTF(attr *bpfLoadBTFAttr) (*internal.FD, error) {
-	const _BTFLoad = 18
-
-	fd, err := internal.BPF(_BTFLoad, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
+	fd, err := internal.BPF(internal.BPF_BTF_LOAD, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
 	if err != nil {
 		return nil, err
 	}
@@ -653,7 +717,7 @@ func marshalBTF(types interface{}, strings []byte, bo binary.ByteOrder) []byte {
 	return buf.Bytes()
 }
 
-var haveBTF = internal.FeatureTest("BTF", "5.1", func() (bool, error) {
+var haveBTF = internal.FeatureTest("BTF", "5.1", func() error {
 	var (
 		types struct {
 			Integer btfType
@@ -677,15 +741,24 @@ var haveBTF = internal.FeatureTest("BTF", "5.1", func() (bool, error) {
 		btf:     internal.NewSlicePointer(btf),
 		btfSize: uint32(len(btf)),
 	})
-	if err == nil {
-		fd.Close()
+	if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) {
+		// Treat both EINVAL and EPERM as not supported: loading the program
+		// might still succeed without BTF.
+		return internal.ErrNotSupported
+	}
+	if err != nil {
+		return err
 	}
-	// Check for EINVAL specifically, rather than err != nil since we
-	// otherwise misdetect due to insufficient permissions.
-	return !errors.Is(err, unix.EINVAL), nil
+
+	fd.Close()
+	return nil
 })
 
-var haveFuncLinkage = internal.FeatureTest("BTF func linkage", "5.6", func() (bool, error) {
+var haveFuncLinkage = internal.FeatureTest("BTF func linkage", "5.6", func() error {
+	if err := haveBTF(); err != nil {
+		return err
+	}
+
 	var (
 		types struct {
 			FuncProto btfType
@@ -706,11 +779,13 @@ var haveFuncLinkage = internal.FeatureTest("BTF func linkage", "5.6", func() (bo
 		btf:     internal.NewSlicePointer(btf),
 		btfSize: uint32(len(btf)),
 	})
-	if err == nil {
-		fd.Close()
+	if errors.Is(err, unix.EINVAL) {
+		return internal.ErrNotSupported
+	}
+	if err != nil {
+		return err
 	}
 
-	// Check for EINVAL specifically, rather than err != nil since we
-	// otherwise misdetect due to insufficient permissions.
-	return !errors.Is(err, unix.EINVAL), nil
+	fd.Close()
+	return nil
 })

+ 17 - 5
vendor/github.com/cilium/ebpf/internal/btf/btf_types.go

@@ -31,19 +31,23 @@ const (
 	kindDatasec
 )
 
+// btfFuncLinkage describes BTF function linkage metadata.
 type btfFuncLinkage uint8
 
+// Equivalent of enum btf_func_linkage.
 const (
 	linkageStatic btfFuncLinkage = iota
 	linkageGlobal
-	linkageExtern
+	// linkageExtern // Currently unused in libbpf.
 )
 
 const (
-	btfTypeKindShift = 24
-	btfTypeKindLen   = 4
-	btfTypeVlenShift = 0
-	btfTypeVlenMask  = 16
+	btfTypeKindShift     = 24
+	btfTypeKindLen       = 4
+	btfTypeVlenShift     = 0
+	btfTypeVlenMask      = 16
+	btfTypeKindFlagShift = 31
+	btfTypeKindFlagMask  = 1
 )
 
 // btfType is equivalent to struct btf_type in Documentation/bpf/btf.rst.
@@ -136,6 +140,10 @@ func (bt *btfType) SetVlen(vlen int) {
 	bt.setInfo(uint32(vlen), btfTypeVlenMask, btfTypeVlenShift)
 }
 
+func (bt *btfType) KindFlag() bool {
+	return bt.info(btfTypeKindFlagMask, btfTypeKindFlagShift) == 1
+}
+
 func (bt *btfType) Linkage() btfFuncLinkage {
 	return btfFuncLinkage(bt.info(btfTypeVlenMask, btfTypeVlenShift))
 }
@@ -257,3 +265,7 @@ func readTypes(r io.Reader, bo binary.ByteOrder) ([]rawType, error) {
 		types = append(types, rawType{header, data})
 	}
 }
+
+func intEncoding(raw uint32) (IntEncoding, uint32, byte) {
+	return IntEncoding((raw & 0x0f000000) >> 24), (raw & 0x00ff0000) >> 16, byte(raw & 0x000000ff)
+}

+ 388 - 0
vendor/github.com/cilium/ebpf/internal/btf/core.go

@@ -0,0 +1,388 @@
+package btf
+
+import (
+	"errors"
+	"fmt"
+	"reflect"
+	"strconv"
+	"strings"
+)
+
+// Code in this file is derived from libbpf, which is available under a BSD
+// 2-Clause license.
+
+// Relocation describes a CO-RE relocation.
+type Relocation struct {
+	Current uint32
+	New     uint32
+}
+
+func (r Relocation) equal(other Relocation) bool {
+	return r.Current == other.Current && r.New == other.New
+}
+
+// coreReloKind is the type of CO-RE relocation
+type coreReloKind uint32
+
+const (
+	reloFieldByteOffset coreReloKind = iota /* field byte offset */
+	reloFieldByteSize                       /* field size in bytes */
+	reloFieldExists                         /* field existence in target kernel */
+	reloFieldSigned                         /* field signedness (0 - unsigned, 1 - signed) */
+	reloFieldLShiftU64                      /* bitfield-specific left bitshift */
+	reloFieldRShiftU64                      /* bitfield-specific right bitshift */
+	reloTypeIDLocal                         /* type ID in local BPF object */
+	reloTypeIDTarget                        /* type ID in target kernel */
+	reloTypeExists                          /* type existence in target kernel */
+	reloTypeSize                            /* type size in bytes */
+	reloEnumvalExists                       /* enum value existence in target kernel */
+	reloEnumvalValue                        /* enum value integer value */
+)
+
+func (k coreReloKind) String() string {
+	switch k {
+	case reloFieldByteOffset:
+		return "byte_off"
+	case reloFieldByteSize:
+		return "byte_sz"
+	case reloFieldExists:
+		return "field_exists"
+	case reloFieldSigned:
+		return "signed"
+	case reloFieldLShiftU64:
+		return "lshift_u64"
+	case reloFieldRShiftU64:
+		return "rshift_u64"
+	case reloTypeIDLocal:
+		return "local_type_id"
+	case reloTypeIDTarget:
+		return "target_type_id"
+	case reloTypeExists:
+		return "type_exists"
+	case reloTypeSize:
+		return "type_size"
+	case reloEnumvalExists:
+		return "enumval_exists"
+	case reloEnumvalValue:
+		return "enumval_value"
+	default:
+		return "unknown"
+	}
+}
+
+func coreRelocate(local, target *Spec, coreRelos bpfCoreRelos) (map[uint64]Relocation, error) {
+	if target == nil {
+		var err error
+		target, err = loadKernelSpec()
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	if local.byteOrder != target.byteOrder {
+		return nil, fmt.Errorf("can't relocate %s against %s", local.byteOrder, target.byteOrder)
+	}
+
+	relocations := make(map[uint64]Relocation, len(coreRelos))
+	for _, relo := range coreRelos {
+		accessorStr, err := local.strings.Lookup(relo.AccessStrOff)
+		if err != nil {
+			return nil, err
+		}
+
+		accessor, err := parseCoreAccessor(accessorStr)
+		if err != nil {
+			return nil, fmt.Errorf("accessor %q: %s", accessorStr, err)
+		}
+
+		if int(relo.TypeID) >= len(local.types) {
+			return nil, fmt.Errorf("invalid type id %d", relo.TypeID)
+		}
+
+		typ := local.types[relo.TypeID]
+
+		if relo.ReloKind == reloTypeIDLocal {
+			relocations[uint64(relo.InsnOff)] = Relocation{
+				uint32(typ.ID()),
+				uint32(typ.ID()),
+			}
+			continue
+		}
+
+		named, ok := typ.(namedType)
+		if !ok || named.name() == "" {
+			return nil, fmt.Errorf("relocate anonymous type %s: %w", typ.String(), ErrNotSupported)
+		}
+
+		name := essentialName(named.name())
+		res, err := coreCalculateRelocation(typ, target.namedTypes[name], relo.ReloKind, accessor)
+		if err != nil {
+			return nil, fmt.Errorf("relocate %s: %w", name, err)
+		}
+
+		relocations[uint64(relo.InsnOff)] = res
+	}
+
+	return relocations, nil
+}
+
+var errAmbiguousRelocation = errors.New("ambiguous relocation")
+
+func coreCalculateRelocation(local Type, targets []namedType, kind coreReloKind, localAccessor coreAccessor) (Relocation, error) {
+	var relos []Relocation
+	var matches []Type
+	for _, target := range targets {
+		switch kind {
+		case reloTypeIDTarget:
+			if localAccessor[0] != 0 {
+				return Relocation{}, fmt.Errorf("%s: unexpected non-zero accessor", kind)
+			}
+
+			if compat, err := coreAreTypesCompatible(local, target); err != nil {
+				return Relocation{}, fmt.Errorf("%s: %s", kind, err)
+			} else if !compat {
+				continue
+			}
+
+			relos = append(relos, Relocation{uint32(target.ID()), uint32(target.ID())})
+
+		default:
+			return Relocation{}, fmt.Errorf("relocation %s: %w", kind, ErrNotSupported)
+		}
+		matches = append(matches, target)
+	}
+
+	if len(relos) == 0 {
+		// TODO: Add switch for existence checks like reloEnumvalExists here.
+
+		// TODO: This might have to be poisoned.
+		return Relocation{}, fmt.Errorf("no relocation found, tried %v", targets)
+	}
+
+	relo := relos[0]
+	for _, altRelo := range relos[1:] {
+		if !altRelo.equal(relo) {
+			return Relocation{}, fmt.Errorf("multiple types %v match: %w", matches, errAmbiguousRelocation)
+		}
+	}
+
+	return relo, nil
+}
+
+/* coreAccessor contains a path through a struct. It contains at least one index.
+ *
+ * The interpretation depends on the kind of the relocation. The following is
+ * taken from struct bpf_core_relo in libbpf_internal.h:
+ *
+ * - for field-based relocations, string encodes an accessed field using
+ *   a sequence of field and array indices, separated by colon (:). It's
+ *   conceptually very close to LLVM's getelementptr ([0]) instruction's
+ *   arguments for identifying offset to a field.
+ * - for type-based relocations, strings is expected to be just "0";
+ * - for enum value-based relocations, string contains an index of enum
+ *   value within its enum type;
+ *
+ * Example to provide a better feel.
+ *
+ *   struct sample {
+ *       int a;
+ *       struct {
+ *           int b[10];
+ *       };
+ *   };
+ *
+ *   struct sample s = ...;
+ *   int x = &s->a;     // encoded as "0:0" (a is field #0)
+ *   int y = &s->b[5];  // encoded as "0:1:0:5" (anon struct is field #1,
+ *                      // b is field #0 inside anon struct, accessing elem #5)
+ *   int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array)
+ */
+type coreAccessor []int
+
+func parseCoreAccessor(accessor string) (coreAccessor, error) {
+	if accessor == "" {
+		return nil, fmt.Errorf("empty accessor")
+	}
+
+	var result coreAccessor
+	parts := strings.Split(accessor, ":")
+	for _, part := range parts {
+		// 31 bits to avoid overflowing int on 32 bit platforms.
+		index, err := strconv.ParseUint(part, 10, 31)
+		if err != nil {
+			return nil, fmt.Errorf("accessor index %q: %s", part, err)
+		}
+
+		result = append(result, int(index))
+	}
+
+	return result, nil
+}
+
+/* The comment below is from bpf_core_types_are_compat in libbpf.c:
+ *
+ * Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follow slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ *   - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
+ *     kind should match for local and target types (i.e., STRUCT is not
+ *     compatible with UNION);
+ *   - for ENUMs, the size is ignored;
+ *   - for INT, size and signedness are ignored;
+ *   - for ARRAY, dimensionality is ignored, element types are checked for
+ *     compatibility recursively;
+ *   - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ *   - TYPEDEFs/PTRs are compatible if types they pointing to are compatible;
+ *   - FUNC_PROTOs are compatible if they have compatible signature: same
+ *     number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
+func coreAreTypesCompatible(localType Type, targetType Type) (bool, error) {
+	var (
+		localTs, targetTs typeDeque
+		l, t              = &localType, &targetType
+		depth             = 0
+	)
+
+	for ; l != nil && t != nil; l, t = localTs.shift(), targetTs.shift() {
+		if depth >= maxTypeDepth {
+			return false, errors.New("types are nested too deep")
+		}
+
+		localType = skipQualifierAndTypedef(*l)
+		targetType = skipQualifierAndTypedef(*t)
+
+		if reflect.TypeOf(localType) != reflect.TypeOf(targetType) {
+			return false, nil
+		}
+
+		switch lv := (localType).(type) {
+		case *Void, *Struct, *Union, *Enum, *Fwd:
+			// Nothing to do here
+
+		case *Int:
+			tv := targetType.(*Int)
+			if lv.isBitfield() || tv.isBitfield() {
+				return false, nil
+			}
+
+		case *Pointer, *Array:
+			depth++
+			localType.walk(&localTs)
+			targetType.walk(&targetTs)
+
+		case *FuncProto:
+			tv := targetType.(*FuncProto)
+			if len(lv.Params) != len(tv.Params) {
+				return false, nil
+			}
+
+			depth++
+			localType.walk(&localTs)
+			targetType.walk(&targetTs)
+
+		default:
+			return false, fmt.Errorf("unsupported type %T", localType)
+		}
+	}
+
+	if l != nil {
+		return false, fmt.Errorf("dangling local type %T", *l)
+	}
+
+	if t != nil {
+		return false, fmt.Errorf("dangling target type %T", *t)
+	}
+
+	return true, nil
+}
+
+/* The comment below is from bpf_core_fields_are_compat in libbpf.c:
+ *
+ * Check two types for compatibility for the purpose of field access
+ * relocation. const/volatile/restrict and typedefs are skipped to ensure we
+ * are relocating semantically compatible entities:
+ *   - any two STRUCTs/UNIONs are compatible and can be mixed;
+ *   - any two FWDs are compatible, if their names match (modulo flavor suffix);
+ *   - any two PTRs are always compatible;
+ *   - for ENUMs, names should be the same (ignoring flavor suffix) or at
+ *     least one of enums should be anonymous;
+ *   - for ENUMs, check sizes, names are ignored;
+ *   - for INT, size and signedness are ignored;
+ *   - for ARRAY, dimensionality is ignored, element types are checked for
+ *     compatibility recursively;
+ *   - everything else shouldn't be ever a target of relocation.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
+func coreAreMembersCompatible(localType Type, targetType Type) (bool, error) {
+	doNamesMatch := func(a, b string) bool {
+		if a == "" || b == "" {
+			// Allow an anonymous type to match a named one.
+			return true
+		}
+
+		return essentialName(a) == essentialName(b)
+	}
+
+	for depth := 0; depth <= maxTypeDepth; depth++ {
+		localType = skipQualifierAndTypedef(localType)
+		targetType = skipQualifierAndTypedef(targetType)
+
+		_, lok := localType.(composite)
+		_, tok := targetType.(composite)
+		if lok && tok {
+			return true, nil
+		}
+
+		if reflect.TypeOf(localType) != reflect.TypeOf(targetType) {
+			return false, nil
+		}
+
+		switch lv := localType.(type) {
+		case *Pointer:
+			return true, nil
+
+		case *Enum:
+			tv := targetType.(*Enum)
+			return doNamesMatch(lv.name(), tv.name()), nil
+
+		case *Fwd:
+			tv := targetType.(*Fwd)
+			return doNamesMatch(lv.name(), tv.name()), nil
+
+		case *Int:
+			tv := targetType.(*Int)
+			return !lv.isBitfield() && !tv.isBitfield(), nil
+
+		case *Array:
+			tv := targetType.(*Array)
+
+			localType = lv.Type
+			targetType = tv.Type
+
+		default:
+			return false, fmt.Errorf("unsupported type %T", localType)
+		}
+	}
+
+	return false, errors.New("types are nested too deep")
+}
+
+func skipQualifierAndTypedef(typ Type) Type {
+	result := typ
+	for depth := 0; depth <= maxTypeDepth; depth++ {
+		switch v := (result).(type) {
+		case qualifier:
+			result = v.qualify()
+		case *Typedef:
+			result = v.Type
+		default:
+			return result
+		}
+	}
+	return typ
+}
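
To make the accessor format concrete, here is a rough standalone sketch of
parseCoreAccessor (same colon-separated encoding, simplified error handling),
applied to the "0:1:0:5" example from the doc comment above:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseAccessor mirrors parseCoreAccessor: a colon-separated list of
// non-negative field/array indices.
func parseAccessor(s string) ([]int, error) {
	var out []int
	for _, part := range strings.Split(s, ":") {
		// 31 bits so the result always fits in int, even on 32-bit platforms.
		idx, err := strconv.ParseUint(part, 10, 31)
		if err != nil {
			return nil, fmt.Errorf("index %q: %w", part, err)
		}
		out = append(out, int(idx))
	}
	return out, nil
}

func main() {
	acc, _ := parseAccessor("0:1:0:5")
	fmt.Println(acc) // [0 1 0 5]
}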

+ 126 - 27
vendor/github.com/cilium/ebpf/internal/btf/ext_info.go

@@ -1,6 +1,7 @@
 package btf
 
 import (
+	"bufio"
 	"bytes"
 	"encoding/binary"
 	"errors"
@@ -24,55 +25,82 @@ type btfExtHeader struct {
 	LineInfoLen uint32
 }
 
-func parseExtInfos(r io.ReadSeeker, bo binary.ByteOrder, strings stringTable) (funcInfo, lineInfo map[string]extInfo, err error) {
+type btfExtCoreHeader struct {
+	CoreReloOff uint32
+	CoreReloLen uint32
+}
+
+func parseExtInfos(r io.ReadSeeker, bo binary.ByteOrder, strings stringTable) (funcInfo, lineInfo map[string]extInfo, coreRelos map[string]bpfCoreRelos, err error) {
 	var header btfExtHeader
+	var coreHeader btfExtCoreHeader
 	if err := binary.Read(r, bo, &header); err != nil {
-		return nil, nil, fmt.Errorf("can't read header: %v", err)
+		return nil, nil, nil, fmt.Errorf("can't read header: %v", err)
 	}
 
 	if header.Magic != btfMagic {
-		return nil, nil, fmt.Errorf("incorrect magic value %v", header.Magic)
+		return nil, nil, nil, fmt.Errorf("incorrect magic value %v", header.Magic)
 	}
 
 	if header.Version != 1 {
-		return nil, nil, fmt.Errorf("unexpected version %v", header.Version)
+		return nil, nil, nil, fmt.Errorf("unexpected version %v", header.Version)
 	}
 
 	if header.Flags != 0 {
-		return nil, nil, fmt.Errorf("unsupported flags %v", header.Flags)
+		return nil, nil, nil, fmt.Errorf("unsupported flags %v", header.Flags)
 	}
 
 	remainder := int64(header.HdrLen) - int64(binary.Size(&header))
 	if remainder < 0 {
-		return nil, nil, errors.New("header is too short")
+		return nil, nil, nil, errors.New("header is too short")
+	}
+
+	coreHdrSize := int64(binary.Size(&coreHeader))
+	if remainder >= coreHdrSize {
+		if err := binary.Read(r, bo, &coreHeader); err != nil {
+			return nil, nil, nil, fmt.Errorf("can't read CO-RE relocation header: %v", err)
+		}
+		remainder -= coreHdrSize
 	}
 
 	// Of course, the .BTF.ext header has different semantics than the
 	// .BTF ext header. We need to ignore non-null values.
 	_, err = io.CopyN(ioutil.Discard, r, remainder)
 	if err != nil {
-		return nil, nil, fmt.Errorf("header padding: %v", err)
+		return nil, nil, nil, fmt.Errorf("header padding: %v", err)
 	}
 
 	if _, err := r.Seek(int64(header.HdrLen+header.FuncInfoOff), io.SeekStart); err != nil {
-		return nil, nil, fmt.Errorf("can't seek to function info section: %v", err)
+		return nil, nil, nil, fmt.Errorf("can't seek to function info section: %v", err)
 	}
 
-	funcInfo, err = parseExtInfo(io.LimitReader(r, int64(header.FuncInfoLen)), bo, strings)
+	buf := bufio.NewReader(io.LimitReader(r, int64(header.FuncInfoLen)))
+	funcInfo, err = parseExtInfo(buf, bo, strings)
 	if err != nil {
-		return nil, nil, fmt.Errorf("function info: %w", err)
+		return nil, nil, nil, fmt.Errorf("function info: %w", err)
 	}
 
 	if _, err := r.Seek(int64(header.HdrLen+header.LineInfoOff), io.SeekStart); err != nil {
-		return nil, nil, fmt.Errorf("can't seek to line info section: %v", err)
+		return nil, nil, nil, fmt.Errorf("can't seek to line info section: %v", err)
 	}
 
-	lineInfo, err = parseExtInfo(io.LimitReader(r, int64(header.LineInfoLen)), bo, strings)
+	buf = bufio.NewReader(io.LimitReader(r, int64(header.LineInfoLen)))
+	lineInfo, err = parseExtInfo(buf, bo, strings)
 	if err != nil {
-		return nil, nil, fmt.Errorf("line info: %w", err)
+		return nil, nil, nil, fmt.Errorf("line info: %w", err)
+	}
+
+	if coreHeader.CoreReloOff > 0 && coreHeader.CoreReloLen > 0 {
+		if _, err := r.Seek(int64(header.HdrLen+coreHeader.CoreReloOff), io.SeekStart); err != nil {
+			return nil, nil, nil, fmt.Errorf("can't seek to CO-RE relocation section: %v", err)
+		}
+
+		coreRelos, err = parseExtInfoRelos(io.LimitReader(r, int64(coreHeader.CoreReloLen)), bo, strings)
+		if err != nil {
+			return nil, nil, nil, fmt.Errorf("CO-RE relocation info: %w", err)
+		}
 	}
 
-	return funcInfo, lineInfo, nil
+	return funcInfo, lineInfo, coreRelos, nil
 }
 
 type btfExtInfoSec struct {
@@ -127,6 +155,8 @@ func (ei extInfo) MarshalBinary() ([]byte, error) {
 }
 
 func parseExtInfo(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[string]extInfo, error) {
+	const maxRecordSize = 256
+
 	var recordSize uint32
 	if err := binary.Read(r, bo, &recordSize); err != nil {
 		return nil, fmt.Errorf("can't read record size: %v", err)
@@ -136,23 +166,15 @@ func parseExtInfo(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[st
 		// Need at least insnOff
 		return nil, errors.New("record size too short")
 	}
+	if recordSize > maxRecordSize {
+		return nil, fmt.Errorf("record size %v exceeds %v", recordSize, maxRecordSize)
+	}
 
 	result := make(map[string]extInfo)
 	for {
-		var infoHeader btfExtInfoSec
-		if err := binary.Read(r, bo, &infoHeader); err == io.EOF {
+		secName, infoHeader, err := parseExtInfoHeader(r, bo, strings)
+		if errors.Is(err, io.EOF) {
 			return result, nil
-		} else if err != nil {
-			return nil, fmt.Errorf("can't read ext info header: %v", err)
-		}
-
-		secName, err := strings.Lookup(infoHeader.SecNameOff)
-		if err != nil {
-			return nil, fmt.Errorf("can't get section name: %w", err)
-		}
-
-		if infoHeader.NumInfo == 0 {
-			return nil, fmt.Errorf("section %s has invalid number of records", secName)
 		}
+		if err != nil {
+			return nil, err
+		}
 
 		var records []extInfoRecord
@@ -180,3 +202,80 @@ func parseExtInfo(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[st
 		}
 	}
 }
+
+// bpfCoreRelo matches `struct bpf_core_relo` from the kernel
+type bpfCoreRelo struct {
+	InsnOff      uint32
+	TypeID       TypeID
+	AccessStrOff uint32
+	ReloKind     coreReloKind
+}
+
+type bpfCoreRelos []bpfCoreRelo
+
+// append concatenates two slices of bpfCoreRelo, adjusting the InsnOff of
+// each relocation in other by offset.
+func (r bpfCoreRelos) append(other bpfCoreRelos, offset uint64) bpfCoreRelos {
+	result := make([]bpfCoreRelo, 0, len(r)+len(other))
+	result = append(result, r...)
+	for _, relo := range other {
+		relo.InsnOff += uint32(offset)
+		result = append(result, relo)
+	}
+	return result
+}
+
+var extInfoReloSize = binary.Size(bpfCoreRelo{})
+
+func parseExtInfoRelos(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[string]bpfCoreRelos, error) {
+	var recordSize uint32
+	if err := binary.Read(r, bo, &recordSize); err != nil {
+		return nil, fmt.Errorf("read record size: %v", err)
+	}
+
+	if recordSize != uint32(extInfoReloSize) {
+		return nil, fmt.Errorf("expected record size %d, got %d", extInfoReloSize, recordSize)
+	}
+
+	result := make(map[string]bpfCoreRelos)
+	for {
+		secName, infoHeader, err := parseExtInfoHeader(r, bo, strings)
+		if errors.Is(err, io.EOF) {
+			return result, nil
+		}
+		if err != nil {
+			return nil, err
+		}
+
+		var relos []bpfCoreRelo
+		for i := uint32(0); i < infoHeader.NumInfo; i++ {
+			var relo bpfCoreRelo
+			if err := binary.Read(r, bo, &relo); err != nil {
+				return nil, fmt.Errorf("section %v: read record: %v", secName, err)
+			}
+
+			if relo.InsnOff%asm.InstructionSize != 0 {
+				return nil, fmt.Errorf("section %v: offset %v is not aligned with instruction size", secName, relo.InsnOff)
+			}
+
+			relos = append(relos, relo)
+		}
+
+		result[secName] = relos
+	}
+}
+
+func parseExtInfoHeader(r io.Reader, bo binary.ByteOrder, strings stringTable) (string, *btfExtInfoSec, error) {
+	var infoHeader btfExtInfoSec
+	if err := binary.Read(r, bo, &infoHeader); err != nil {
+		return "", nil, fmt.Errorf("read ext info header: %w", err)
+	}
+
+	secName, err := strings.Lookup(infoHeader.SecNameOff)
+	if err != nil {
+		return "", nil, fmt.Errorf("get section name: %w", err)
+	}
+
+	if infoHeader.NumInfo == 0 {
+		return "", nil, fmt.Errorf("section %s has zero records", secName)
+	}
+
+	return secName, &infoHeader, nil
+}
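
The offset adjustment in bpfCoreRelos.append matters when several ELF
sections are laid out into one program: relocations from the appended section
must point at their new instruction offsets. A toy illustration, with the
types reduced to a plain struct:

package main

import "fmt"

type relo struct{ InsnOff uint32 }

// appendRelos mirrors bpfCoreRelos.append: relocations from b are shifted by
// offset, the length in instructions of the code that now precedes them.
func appendRelos(a, b []relo, offset uint64) []relo {
	out := make([]relo, 0, len(a)+len(b))
	out = append(out, a...)
	for _, r := range b {
		r.InsnOff += uint32(offset)
		out = append(out, r)
	}
	return out
}

func main() {
	a := []relo{{0}, {8}}
	b := []relo{{0}, {16}}
	fmt.Println(appendRelos(a, b, 10)) // [{0} {8} {10} {26}]
}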

+ 49 - 0
vendor/github.com/cilium/ebpf/internal/btf/fuzz.go

@@ -0,0 +1,49 @@
+// +build gofuzz
+
+// Use with https://github.com/dvyukov/go-fuzz
+
+package btf
+
+import (
+	"bytes"
+	"encoding/binary"
+
+	"github.com/cilium/ebpf/internal"
+)
+
+func FuzzSpec(data []byte) int {
+	if len(data) < binary.Size(btfHeader{}) {
+		return -1
+	}
+
+	spec, err := loadNakedSpec(bytes.NewReader(data), internal.NativeEndian, nil, nil)
+	if err != nil {
+		if spec != nil {
+			panic("spec is not nil")
+		}
+		return 0
+	}
+	if spec == nil {
+		panic("spec is nil")
+	}
+	return 1
+}
+
+func FuzzExtInfo(data []byte) int {
+	if len(data) < binary.Size(btfExtHeader{}) {
+		return -1
+	}
+
+	table := stringTable("\x00foo\x00barfoo\x00")
+	info, err := parseExtInfo(bytes.NewReader(data), internal.NativeEndian, table)
+	if err != nil {
+		if info != nil {
+			panic("info is not nil")
+		}
+		return 0
+	}
+	if info == nil {
+		panic("info is nil")
+	}
+	return 1
+}

+ 360 - 76
vendor/github.com/cilium/ebpf/internal/btf/types.go

@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"math"
+	"strings"
 )
 
 const maxTypeDepth = 32
@@ -20,10 +21,22 @@ func (tid TypeID) ID() TypeID {
 type Type interface {
 	ID() TypeID
 
+	String() string
+
 	// Make a copy of the type, without copying Type members.
 	copy() Type
 
-	walk(*copyStack)
+	// Enumerate all nested Types. Repeated calls must visit nested
+	// types in the same order.
+	walk(*typeDeque)
+}
+
+// namedType is a type with a name.
+//
+// Most named types simply embed Name.
+type namedType interface {
+	Type
+	name() string
 }
 
 // Name identifies a type.
@@ -39,9 +52,18 @@ func (n Name) name() string {
 type Void struct{}
 
 func (v *Void) ID() TypeID      { return 0 }
+func (v *Void) String() string  { return "void#0" }
 func (v *Void) size() uint32    { return 0 }
 func (v *Void) copy() Type      { return (*Void)(nil) }
-func (v *Void) walk(*copyStack) {}
+func (v *Void) walk(*typeDeque) {}
+
+type IntEncoding byte
+
+const (
+	Signed IntEncoding = 1 << iota
+	Char
+	Bool
+)
 
 // Int is an integer of a given length.
 type Int struct {
@@ -49,24 +71,64 @@ type Int struct {
 	Name
 
 	// The size of the integer in bytes.
-	Size uint32
+	Size     uint32
+	Encoding IntEncoding
+	// Offset is the starting bit offset. Currently always 0.
+	// See https://www.kernel.org/doc/html/latest/bpf/btf.html#btf-kind-int
+	Offset uint32
+	Bits   byte
+}
+
+var _ namedType = (*Int)(nil)
+
+func (i *Int) String() string {
+	var s strings.Builder
+
+	switch {
+	case i.Encoding&Char != 0:
+		s.WriteString("char")
+	case i.Encoding&Bool != 0:
+		s.WriteString("bool")
+	default:
+		if i.Encoding&Signed == 0 {
+			s.WriteRune('u')
+		}
+		s.WriteString("int")
+		fmt.Fprintf(&s, "%d", i.Size*8)
+	}
+
+	fmt.Fprintf(&s, "#%d", i.TypeID)
+
+	if i.Bits > 0 {
+		fmt.Fprintf(&s, "[bits=%d]", i.Bits)
+	}
+
+	return s.String()
 }
 
 func (i *Int) size() uint32    { return i.Size }
-func (i *Int) walk(*copyStack) {}
+func (i *Int) walk(*typeDeque) {}
 func (i *Int) copy() Type {
 	cpy := *i
 	return &cpy
 }
 
+func (i *Int) isBitfield() bool {
+	return i.Offset > 0
+}
+
 // Pointer is a pointer to another type.
 type Pointer struct {
 	TypeID
 	Target Type
 }
 
-func (p *Pointer) size() uint32       { return 8 }
-func (p *Pointer) walk(cs *copyStack) { cs.push(&p.Target) }
+func (p *Pointer) String() string {
+	return fmt.Sprintf("pointer#%d[target=#%d]", p.TypeID, p.Target.ID())
+}
+
+func (p *Pointer) size() uint32        { return 8 }
+func (p *Pointer) walk(tdq *typeDeque) { tdq.push(&p.Target) }
 func (p *Pointer) copy() Type {
 	cpy := *p
 	return &cpy
@@ -79,7 +141,11 @@ type Array struct {
 	Nelems uint32
 }
 
-func (arr *Array) walk(cs *copyStack) { cs.push(&arr.Type) }
+func (arr *Array) String() string {
+	return fmt.Sprintf("array#%d[type=#%d n=%d]", arr.TypeID, arr.Type.ID(), arr.Nelems)
+}
+
+func (arr *Array) walk(tdq *typeDeque) { tdq.push(&arr.Type) }
 func (arr *Array) copy() Type {
 	cpy := *arr
 	return &cpy
@@ -94,11 +160,15 @@ type Struct struct {
 	Members []Member
 }
 
+func (s *Struct) String() string {
+	return fmt.Sprintf("struct#%d[%q]", s.TypeID, s.Name)
+}
+
 func (s *Struct) size() uint32 { return s.Size }
 
-func (s *Struct) walk(cs *copyStack) {
+func (s *Struct) walk(tdq *typeDeque) {
 	for i := range s.Members {
-		cs.push(&s.Members[i].Type)
+		tdq.push(&s.Members[i].Type)
 	}
 }
 
@@ -109,6 +179,10 @@ func (s *Struct) copy() Type {
 	return &cpy
 }
 
+func (s *Struct) members() []Member {
+	return s.Members
+}
+
 // Union is a compound type where members occupy the same memory.
 type Union struct {
 	TypeID
@@ -118,11 +192,15 @@ type Union struct {
 	Members []Member
 }
 
+func (u *Union) String() string {
+	return fmt.Sprintf("union#%d[%q]", u.TypeID, u.Name)
+}
+
 func (u *Union) size() uint32 { return u.Size }
 
-func (u *Union) walk(cs *copyStack) {
+func (u *Union) walk(tdq *typeDeque) {
 	for i := range u.Members {
-		cs.push(&u.Members[i].Type)
+		tdq.push(&u.Members[i].Type)
 	}
 }
 
@@ -133,35 +211,90 @@ func (u *Union) copy() Type {
 	return &cpy
 }
 
+func (u *Union) members() []Member {
+	return u.Members
+}
+
+type composite interface {
+	members() []Member
+}
+
+var (
+	_ composite = (*Struct)(nil)
+	_ composite = (*Union)(nil)
+)
+
 // Member is part of a Struct or Union.
 //
 // It is not a valid Type.
 type Member struct {
 	Name
-	Type   Type
-	Offset uint32
+	Type Type
+	// Offset is the bit offset of this member.
+	Offset       uint32
+	BitfieldSize uint32
 }
 
 // Enum lists possible values.
 type Enum struct {
 	TypeID
 	Name
+	Values []EnumValue
+}
+
+func (e *Enum) String() string {
+	return fmt.Sprintf("enum#%d[%q]", e.TypeID, e.Name)
+}
+
+// EnumValue is part of an Enum.
+//
+// It is not a valid Type.
+type EnumValue struct {
+	Name
+	Value int32
 }
 
 func (e *Enum) size() uint32    { return 4 }
-func (e *Enum) walk(*copyStack) {}
+func (e *Enum) walk(*typeDeque) {}
 func (e *Enum) copy() Type {
 	cpy := *e
+	cpy.Values = make([]EnumValue, len(e.Values))
+	copy(cpy.Values, e.Values)
 	return &cpy
 }
 
+// FwdKind is the type of forward declaration.
+type FwdKind int
+
+// Valid types of forward declaration.
+const (
+	FwdStruct FwdKind = iota
+	FwdUnion
+)
+
+func (fk FwdKind) String() string {
+	switch fk {
+	case FwdStruct:
+		return "struct"
+	case FwdUnion:
+		return "union"
+	default:
+		return fmt.Sprintf("%T(%d)", fk, int(fk))
+	}
+}
+
 // Fwd is a forward declaration of a Type.
 type Fwd struct {
 	TypeID
 	Name
+	Kind FwdKind
+}
+
+func (f *Fwd) String() string {
+	return fmt.Sprintf("fwd#%d[%s %q]", f.TypeID, f.Kind, f.Name)
 }
 
-func (f *Fwd) walk(*copyStack) {}
+func (f *Fwd) walk(*typeDeque) {}
 func (f *Fwd) copy() Type {
 	cpy := *f
 	return &cpy
@@ -174,43 +307,62 @@ type Typedef struct {
 	Type Type
 }
 
-func (td *Typedef) walk(cs *copyStack) { cs.push(&td.Type) }
+func (td *Typedef) String() string {
+	return fmt.Sprintf("typedef#%d[%q #%d]", td.TypeID, td.Name, td.Type.ID())
+}
+
+func (td *Typedef) walk(tdq *typeDeque) { tdq.push(&td.Type) }
 func (td *Typedef) copy() Type {
 	cpy := *td
 	return &cpy
 }
 
-// Volatile is a modifier.
+// Volatile is a qualifier.
 type Volatile struct {
 	TypeID
 	Type Type
 }
 
-func (v *Volatile) walk(cs *copyStack) { cs.push(&v.Type) }
+func (v *Volatile) String() string {
+	return fmt.Sprintf("volatile#%d[#%d]", v.TypeID, v.Type.ID())
+}
+
+func (v *Volatile) qualify() Type       { return v.Type }
+func (v *Volatile) walk(tdq *typeDeque) { tdq.push(&v.Type) }
 func (v *Volatile) copy() Type {
 	cpy := *v
 	return &cpy
 }
 
-// Const is a modifier.
+// Const is a qualifier.
 type Const struct {
 	TypeID
 	Type Type
 }
 
-func (c *Const) walk(cs *copyStack) { cs.push(&c.Type) }
+func (c *Const) String() string {
+	return fmt.Sprintf("const#%d[#%d]", c.TypeID, c.Type.ID())
+}
+
+func (c *Const) qualify() Type       { return c.Type }
+func (c *Const) walk(tdq *typeDeque) { tdq.push(&c.Type) }
 func (c *Const) copy() Type {
 	cpy := *c
 	return &cpy
 }
 
-// Restrict is a modifier.
+// Restrict is a qualifier.
 type Restrict struct {
 	TypeID
 	Type Type
 }
 
-func (r *Restrict) walk(cs *copyStack) { cs.push(&r.Type) }
+func (r *Restrict) String() string {
+	return fmt.Sprintf("restrict#%d[#%d]", r.TypeID, r.Type.ID())
+}
+
+func (r *Restrict) qualify() Type       { return r.Type }
+func (r *Restrict) walk(tdq *typeDeque) { tdq.push(&r.Type) }
 func (r *Restrict) copy() Type {
 	cpy := *r
 	return &cpy
@@ -223,7 +375,11 @@ type Func struct {
 	Type Type
 }
 
-func (f *Func) walk(cs *copyStack) { cs.push(&f.Type) }
+func (f *Func) String() string {
+	return fmt.Sprintf("func#%d[%q proto=#%d]", f.TypeID, f.Name, f.Type.ID())
+}
+
+func (f *Func) walk(tdq *typeDeque) { tdq.push(&f.Type) }
 func (f *Func) copy() Type {
 	cpy := *f
 	return &cpy
@@ -233,15 +389,38 @@ func (f *Func) copy() Type {
 type FuncProto struct {
 	TypeID
 	Return Type
-	// Parameters not supported yet
+	Params []FuncParam
+}
+
+func (fp *FuncProto) String() string {
+	var s strings.Builder
+	fmt.Fprintf(&s, "proto#%d[", fp.TypeID)
+	for _, param := range fp.Params {
+		fmt.Fprintf(&s, "%q=#%d, ", param.Name, param.Type.ID())
+	}
+	fmt.Fprintf(&s, "return=#%d]", fp.Return.ID())
+	return s.String()
+}
+
+func (fp *FuncProto) walk(tdq *typeDeque) {
+	tdq.push(&fp.Return)
+	for i := range fp.Params {
+		tdq.push(&fp.Params[i].Type)
+	}
 }
 
-func (fp *FuncProto) walk(cs *copyStack) { cs.push(&fp.Return) }
 func (fp *FuncProto) copy() Type {
 	cpy := *fp
+	cpy.Params = make([]FuncParam, len(fp.Params))
+	copy(cpy.Params, fp.Params)
 	return &cpy
 }
 
+type FuncParam struct {
+	Name
+	Type Type
+}
+
 // Var is a global variable.
 type Var struct {
 	TypeID
@@ -249,7 +428,12 @@ type Var struct {
 	Type Type
 }
 
-func (v *Var) walk(cs *copyStack) { cs.push(&v.Type) }
+func (v *Var) String() string {
+	// TODO: Linkage
+	return fmt.Sprintf("var#%d[%q]", v.TypeID, v.Name)
+}
+
+func (v *Var) walk(tdq *typeDeque) { tdq.push(&v.Type) }
 func (v *Var) copy() Type {
 	cpy := *v
 	return &cpy
@@ -263,11 +447,15 @@ type Datasec struct {
 	Vars []VarSecinfo
 }
 
+func (ds *Datasec) String() string {
+	return fmt.Sprintf("section#%d[%q]", ds.TypeID, ds.Name)
+}
+
 func (ds *Datasec) size() uint32 { return ds.Size }
 
-func (ds *Datasec) walk(cs *copyStack) {
+func (ds *Datasec) walk(tdq *typeDeque) {
 	for i := range ds.Vars {
-		cs.push(&ds.Vars[i].Type)
+		tdq.push(&ds.Vars[i].Type)
 	}
 }
 
@@ -279,6 +467,8 @@ func (ds *Datasec) copy() Type {
 }
 
 // VarSecinfo describes a variable in a Datasec.
+//
+// It is not a valid Type.
 type VarSecinfo struct {
 	Type   Type
 	Offset uint32
@@ -298,6 +488,16 @@ var (
 	_ sizer = (*Datasec)(nil)
 )
 
+type qualifier interface {
+	qualify() Type
+}
+
+var (
+	_ qualifier = (*Const)(nil)
+	_ qualifier = (*Restrict)(nil)
+	_ qualifier = (*Volatile)(nil)
+)
+
 // Sizeof returns the size of a type in bytes.
 //
 // Returns an error if the size can't be computed.
@@ -326,14 +526,9 @@ func Sizeof(typ Type) (int, error) {
 		case *Typedef:
 			typ = v.Type
 			continue
-		case *Volatile:
-			typ = v.Type
-			continue
-		case *Const:
-			typ = v.Type
-			continue
-		case *Restrict:
-			typ = v.Type
+
+		case qualifier:
+			typ = v.qualify()
 			continue
 
 		default:
@@ -361,7 +556,7 @@ func Sizeof(typ Type) (int, error) {
 func copyType(typ Type) Type {
 	var (
 		copies = make(map[Type]Type)
-		work   copyStack
+		work   typeDeque
 	)
 
 	for t := &typ; t != nil; t = work.pop() {
@@ -382,40 +577,83 @@ func copyType(typ Type) Type {
 	return typ
 }
 
-// copyStack keeps track of pointers to types which still
-// need to be copied.
-type copyStack []*Type
+// typeDeque keeps track of pointers to types which still
+// need to be visited.
+type typeDeque struct {
+	types       []*Type
+	read, write uint64
+	mask        uint64
+}
 
 // push adds a type to the stack.
-func (cs *copyStack) push(t *Type) {
-	*cs = append(*cs, t)
+func (dq *typeDeque) push(t *Type) {
+	if dq.write-dq.read < uint64(len(dq.types)) {
+		dq.types[dq.write&dq.mask] = t
+		dq.write++
+		return
+	}
+
+	new := len(dq.types) * 2
+	if new == 0 {
+		new = 8
+	}
+
+	types := make([]*Type, new)
+	pivot := dq.read & dq.mask
+	n := copy(types, dq.types[pivot:])
+	n += copy(types[n:], dq.types[:pivot])
+	types[n] = t
+
+	dq.types = types
+	dq.mask = uint64(new) - 1
+	dq.read, dq.write = 0, uint64(n+1)
 }
 
-// pop returns the topmost Type, or nil.
-func (cs *copyStack) pop() *Type {
-	n := len(*cs)
-	if n == 0 {
+// shift returns the first element, or nil if the deque is empty.
+func (dq *typeDeque) shift() *Type {
+	if dq.read == dq.write {
 		return nil
 	}
 
-	t := (*cs)[n-1]
-	*cs = (*cs)[:n-1]
+	index := dq.read & dq.mask
+	t := dq.types[index]
+	dq.types[index] = nil
+	dq.read++
 	return t
 }
 
-type namer interface {
-	name() string
+// pop returns the last element, or nil if the deque is empty.
+func (dq *typeDeque) pop() *Type {
+	if dq.read == dq.write {
+		return nil
+	}
+
+	dq.write--
+	index := dq.write & dq.mask
+	t := dq.types[index]
+	dq.types[index] = nil
+	return t
 }
 
-var _ namer = Name("")
+// all returns all elements.
+//
+// The deque is empty after calling this method.
+func (dq *typeDeque) all() []*Type {
+	length := dq.write - dq.read
+	types := make([]*Type, 0, length)
+	for t := dq.shift(); t != nil; t = dq.shift() {
+		types = append(types, t)
+	}
+	return types
+}
 
 // inflateRawTypes takes a list of raw btf types linked via type IDs, and turns
 // it into a graph of Types connected via pointers.
 //
-// Returns a map of named types (so, where NameOff is non-zero). Since BTF ignores
-// compilation units, multiple types may share the same name. A Type may form a
-// cyclic graph by pointing at itself.
-func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map[string][]Type, err error) {
+// Returns a map of named types (so, where NameOff is non-zero) and a slice of types
+// indexed by TypeID. Since BTF ignores compilation units, multiple types may share
+// the same name. A Type may form a cyclic graph by pointing at itself.
+func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (types []Type, namedTypes map[string][]namedType, err error) {
 	type fixupDef struct {
 		id           TypeID
 		expectedKind btfKind
@@ -427,7 +665,7 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 		fixups = append(fixups, fixupDef{id, expectedKind, typ})
 	}
 
-	convertMembers := func(raw []btfMember) ([]Member, error) {
+	convertMembers := func(raw []btfMember, kindFlag bool) ([]Member, error) {
 		// NB: The fixup below relies on pre-allocating this array to
 		// work, since otherwise append might re-allocate members.
 		members := make([]Member, 0, len(raw))
@@ -436,10 +674,15 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 			if err != nil {
 				return nil, fmt.Errorf("can't get name for member %d: %w", i, err)
 			}
-			members = append(members, Member{
+			m := Member{
 				Name:   name,
 				Offset: btfMember.Offset,
-			})
+			}
+			if kindFlag {
+				m.BitfieldSize = btfMember.Offset >> 24
+				m.Offset &= 0xffffff
+			}
+			members = append(members, m)
 		}
 		for i := range members {
 			fixup(raw[i].Type, kindUnknown, &members[i].Type)
@@ -447,9 +690,9 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 		return members, nil
 	}
 
-	types := make([]Type, 0, len(rawTypes))
+	types = make([]Type, 0, len(rawTypes))
 	types = append(types, (*Void)(nil))
-	namedTypes = make(map[string][]Type)
+	namedTypes = make(map[string][]namedType)
 
 	for i, raw := range rawTypes {
 		var (
@@ -461,12 +704,13 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 
 		name, err := rawStrings.LookupName(raw.NameOff)
 		if err != nil {
-			return nil, fmt.Errorf("can't get name for type id %d: %w", id, err)
+			return nil, nil, fmt.Errorf("get name for type id %d: %w", id, err)
 		}
 
 		switch raw.Kind() {
 		case kindInt:
-			typ = &Int{id, name, raw.Size()}
+			encoding, offset, bits := intEncoding(*raw.data.(*uint32))
+			typ = &Int{id, name, raw.Size(), encoding, offset, bits}
 
 		case kindPointer:
 			ptr := &Pointer{id, nil}
@@ -483,24 +727,40 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 			typ = arr
 
 		case kindStruct:
-			members, err := convertMembers(raw.data.([]btfMember))
+			members, err := convertMembers(raw.data.([]btfMember), raw.KindFlag())
 			if err != nil {
-				return nil, fmt.Errorf("struct %s (id %d): %w", name, id, err)
+				return nil, nil, fmt.Errorf("struct %s (id %d): %w", name, id, err)
 			}
 			typ = &Struct{id, name, raw.Size(), members}
 
 		case kindUnion:
-			members, err := convertMembers(raw.data.([]btfMember))
+			members, err := convertMembers(raw.data.([]btfMember), raw.KindFlag())
 			if err != nil {
-				return nil, fmt.Errorf("union %s (id %d): %w", name, id, err)
+				return nil, nil, fmt.Errorf("union %s (id %d): %w", name, id, err)
 			}
 			typ = &Union{id, name, raw.Size(), members}
 
 		case kindEnum:
-			typ = &Enum{id, name}
+			rawvals := raw.data.([]btfEnum)
+			vals := make([]EnumValue, 0, len(rawvals))
+			for i, btfVal := range rawvals {
+				name, err := rawStrings.LookupName(btfVal.NameOff)
+				if err != nil {
+					return nil, nil, fmt.Errorf("get name for enum value %d: %s", i, err)
+				}
+				vals = append(vals, EnumValue{
+					Name:  name,
+					Value: btfVal.Val,
+				})
+			}
+			typ = &Enum{id, name, vals}
 
 		case kindForward:
-			typ = &Fwd{id, name}
+			if raw.KindFlag() {
+				typ = &Fwd{id, name, FwdUnion}
+			} else {
+				typ = &Fwd{id, name, FwdStruct}
+			}
 
 		case kindTypedef:
 			typedef := &Typedef{id, name, nil}
@@ -528,7 +788,22 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 			typ = fn
 
 		case kindFuncProto:
-			fp := &FuncProto{id, nil}
+			rawparams := raw.data.([]btfParam)
+			params := make([]FuncParam, 0, len(rawparams))
+			for i, param := range rawparams {
+				name, err := rawStrings.LookupName(param.NameOff)
+				if err != nil {
+					return nil, nil, fmt.Errorf("get name for func proto parameter %d: %s", i, err)
+				}
+				params = append(params, FuncParam{
+					Name: name,
+				})
+			}
+			for i := range params {
+				fixup(rawparams[i].Type, kindUnknown, &params[i].Type)
+			}
+
+			fp := &FuncProto{id, nil, params}
 			fixup(raw.Type(), kindUnknown, &fp.Return)
 			typ = fp
 
@@ -552,14 +827,14 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 			typ = &Datasec{id, name, raw.SizeType, vars}
 
 		default:
-			return nil, fmt.Errorf("type id %d: unknown kind: %v", id, raw.Kind())
+			return nil, nil, fmt.Errorf("type id %d: unknown kind: %v", id, raw.Kind())
 		}
 
 		types = append(types, typ)
 
-		if namer, ok := typ.(namer); ok {
-			if name := namer.name(); name != "" {
-				namedTypes[name] = append(namedTypes[name], typ)
+		if named, ok := typ.(namedType); ok {
+			if name := essentialName(named.name()); name != "" {
+				namedTypes[name] = append(namedTypes[name], named)
 			}
 		}
 	}
@@ -567,7 +842,7 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 	for _, fixup := range fixups {
 		i := int(fixup.id)
 		if i >= len(types) {
-			return nil, fmt.Errorf("reference to invalid type id: %d", fixup.id)
+			return nil, nil, fmt.Errorf("reference to invalid type id: %d", fixup.id)
 		}
 
 		// Default void (id 0) to unknown
@@ -577,11 +852,20 @@ func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map
 		}
 
 		if expected := fixup.expectedKind; expected != kindUnknown && rawKind != expected {
-			return nil, fmt.Errorf("expected type id %d to have kind %s, found %s", fixup.id, expected, rawKind)
+			return nil, nil, fmt.Errorf("expected type id %d to have kind %s, found %s", fixup.id, expected, rawKind)
 		}
 
 		*fixup.typ = types[i]
 	}
 
-	return namedTypes, nil
+	return types, namedTypes, nil
+}
+
+// essentialName returns name without a ___ suffix.
+func essentialName(name string) string {
+	lastIdx := strings.LastIndex(name, "___")
+	if lastIdx > 0 {
+		return name[:lastIdx]
+	}
+	return name
 }
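
typeDeque is a power-of-two ring buffer: push appends, shift consumes in FIFO
order (used by the breadth-first type walks in the CO-RE code), and pop
consumes in LIFO order (used by copyType). A hedged toy equivalent over ints,
with the same resize logic:

package main

import "fmt"

type intDeque struct {
	vals        []int
	read, write uint64
	mask        uint64
}

func (dq *intDeque) push(v int) {
	if dq.write-dq.read < uint64(len(dq.vals)) {
		dq.vals[dq.write&dq.mask] = v
		dq.write++
		return
	}

	// Full (or empty): grow to the next power of two and unwrap the ring so
	// the oldest element lands at index 0.
	n := len(dq.vals) * 2
	if n == 0 {
		n = 8
	}
	vals := make([]int, n)
	pivot := dq.read & dq.mask
	m := copy(vals, dq.vals[pivot:])
	m += copy(vals[m:], dq.vals[:pivot])
	vals[m] = v

	dq.vals = vals
	dq.mask = uint64(n) - 1
	dq.read, dq.write = 0, uint64(m+1)
}

func (dq *intDeque) shift() (int, bool) {
	if dq.read == dq.write {
		return 0, false
	}
	v := dq.vals[dq.read&dq.mask]
	dq.read++
	return v, true
}

func main() {
	var dq intDeque
	dq.push(1)
	dq.push(2)
	dq.push(3)
	for v, ok := dq.shift(); ok; v, ok = dq.shift() {
		fmt.Print(v, " ") // 1 2 3: FIFO order
	}
	fmt.Println()
}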

+ 52 - 0
vendor/github.com/cilium/ebpf/internal/elf.go

@@ -0,0 +1,52 @@
+package internal
+
+import (
+	"debug/elf"
+	"fmt"
+	"io"
+)
+
+type SafeELFFile struct {
+	*elf.File
+}
+
+// NewSafeELFFile reads an ELF safely.
+//
+// Any panic during parsing is turned into an error. This is necessary since
+// there are a bunch of unfixed bugs in debug/elf.
+//
+// https://github.com/golang/go/issues?q=is%3Aissue+is%3Aopen+debug%2Felf+in%3Atitle
+func NewSafeELFFile(r io.ReaderAt) (safe *SafeELFFile, err error) {
+	defer func() {
+		r := recover()
+		if r == nil {
+			return
+		}
+
+		safe = nil
+		err = fmt.Errorf("reading ELF file panicked: %s", r)
+	}()
+
+	file, err := elf.NewFile(r)
+	if err != nil {
+		return nil, err
+	}
+
+	return &SafeELFFile{file}, nil
+}
+
+// Symbols is the safe version of elf.File.Symbols.
+func (se *SafeELFFile) Symbols() (syms []elf.Symbol, err error) {
+	defer func() {
+		r := recover()
+		if r == nil {
+			return
+		}
+
+		syms = nil
+		err = fmt.Errorf("reading ELF symbols panicked: %s", r)
+	}()
+
+	syms, err = se.File.Symbols()
+	return
+}
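
The recover-into-named-return pattern used by SafeELFFile generalizes to any
call that may panic. A minimal sketch (safeCall is a made-up name, not part
of the package):

package main

import "fmt"

// safeCall converts a panic in fn into an error, the same way
// NewSafeELFFile and Symbols do for debug/elf.
func safeCall(fn func()) (err error) {
	defer func() {
		if r := recover(); r != nil {
			err = fmt.Errorf("call panicked: %s", r)
		}
	}()
	fn()
	return nil
}

func main() {
	fmt.Println(safeCall(func() { panic("boom") })) // call panicked: boom
}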

+ 30 - 52
vendor/github.com/cilium/ebpf/internal/feature.go

@@ -20,6 +20,9 @@ type UnsupportedFeatureError struct {
 }
 
 func (ufe *UnsupportedFeatureError) Error() string {
+	if ufe.MinimumVersion.Unspecified() {
+		return fmt.Sprintf("%s not supported", ufe.Name)
+	}
 	return fmt.Sprintf("%s not supported (requires >= %s)", ufe.Name, ufe.MinimumVersion)
 }
 
@@ -29,7 +32,7 @@ func (ufe *UnsupportedFeatureError) Is(target error) bool {
 }
 
 type featureTest struct {
-	sync.Mutex
+	sync.RWMutex
 	successful bool
 	result     error
 }
@@ -39,10 +42,10 @@ type featureTest struct {
 //
 // The return values have the following semantics:
 //
+//   err == ErrNotSupported: the feature is not available
+//   err == nil: the feature is available
 //   err != nil: the test couldn't be executed
-//   err == nil && available: the feature is available
-//   err == nil && !available: the feature isn't available
-type FeatureTestFn func() (available bool, err error)
+type FeatureTestFn func() error
 
 // FeatureTest wraps a function so that it is run at most once.
 //
@@ -58,65 +61,40 @@ func FeatureTest(name, version string, fn FeatureTestFn) func() error {
 
 	ft := new(featureTest)
 	return func() error {
+		ft.RLock()
+		if ft.successful {
+			defer ft.RUnlock()
+			return ft.result
+		}
+		ft.RUnlock()
 		ft.Lock()
 		defer ft.Unlock()
-
+		// Check one more time in case another goroutine won the race
+		// for the write lock and already ran the test.
 		if ft.successful {
 			return ft.result
 		}
-
-		available, err := fn()
-		if errors.Is(err, ErrNotSupported) {
-			// The feature test aborted because a dependent feature
-			// is missing, which we should cache.
-			available = false
-		} else if err != nil {
-			// We couldn't execute the feature test to a point
-			// where it could make a determination.
-			// Don't cache the result, just return it.
-			return fmt.Errorf("can't detect support for %s: %w", name, err)
-		}
-
-		ft.successful = true
-		if !available {
+		err := fn()
+		switch {
+		case errors.Is(err, ErrNotSupported):
 			ft.result = &UnsupportedFeatureError{
 				MinimumVersion: v,
 				Name:           name,
 			}
-		}
-		return ft.result
-	}
-}
+			fallthrough
 
-// A Version in the form Major.Minor.Patch.
-type Version [3]uint16
+		case err == nil:
+			ft.successful = true
 
-// NewVersion creates a version from a string like "Major.Minor.Patch".
-//
-// Patch is optional.
-func NewVersion(ver string) (Version, error) {
-	var major, minor, patch uint16
-	n, _ := fmt.Sscanf(ver, "%d.%d.%d", &major, &minor, &patch)
-	if n < 2 {
-		return Version{}, fmt.Errorf("invalid version: %s", ver)
-	}
-	return Version{major, minor, patch}, nil
-}
-
-func (v Version) String() string {
-	if v[2] == 0 {
-		return fmt.Sprintf("v%d.%d", v[0], v[1])
-	}
-	return fmt.Sprintf("v%d.%d.%d", v[0], v[1], v[2])
-}
-
-// Less returns true if the version is less than another version.
-func (v Version) Less(other Version) bool {
-	for i, a := range v {
-		if a == other[i] {
-			continue
+		default:
+			// We couldn't execute the feature test to a point
+			// where it could make a determination.
+			// Don't cache the result, just return it.
+			return fmt.Errorf("detect support for %s: %w", name, err)
 		}
-		return a < other[i]
+
+		return ft.result
 	}
-	return false
 }
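
The rewritten FeatureTest is a double-checked read/write lock: the fast path
takes only a read lock, and the slow path re-checks after upgrading, since
several goroutines may have raced past the read check. A stripped-down sketch
of the same pattern (unlike the real code, this version caches every
outcome):

package main

import (
	"fmt"
	"sync"
)

type cachedResult struct {
	mu   sync.RWMutex
	done bool
	val  error
}

func (c *cachedResult) get(compute func() error) error {
	c.mu.RLock()
	if c.done {
		defer c.mu.RUnlock()
		return c.val
	}
	c.mu.RUnlock()

	c.mu.Lock()
	defer c.mu.Unlock()
	// Re-check: another goroutine may have won the race for the write lock
	// and already stored a result.
	if c.done {
		return c.val
	}
	c.val = compute()
	c.done = true
	return c.val
}

func main() {
	var c cachedResult
	calls := 0
	f := func() error { calls++; return nil }
	c.get(f)
	c.get(f)
	fmt.Println(calls) // 1: compute ran exactly once
}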

+ 44 - 0
vendor/github.com/cilium/ebpf/internal/pinning.go

@@ -0,0 +1,44 @@
+package internal
+
+import (
+	"errors"
+	"fmt"
+	"os"
+
+	"github.com/cilium/ebpf/internal/unix"
+)
+
+func Pin(currentPath, newPath string, fd *FD) error {
+	if newPath == "" {
+		return errors.New("given pinning path cannot be empty")
+	}
+	if currentPath == newPath {
+		return nil
+	}
+	if currentPath == "" {
+		return BPFObjPin(newPath, fd)
+	}
+	var err error
+	// Renameat2 is used instead of os.Rename to disallow the new path replacing
+	// an existing path.
+	if err = unix.Renameat2(unix.AT_FDCWD, currentPath, unix.AT_FDCWD, newPath, unix.RENAME_NOREPLACE); err == nil {
+		// Object is now moved to the new pinning path.
+		return nil
+	}
+	if !os.IsNotExist(err) {
+		return fmt.Errorf("unable to move pinned object to new path %v: %w", newPath, err)
+	}
+	// Internal state not in sync with the file system so let's fix it.
+	return BPFObjPin(newPath, fd)
+}
+
+func Unpin(pinnedPath string) error {
+	if pinnedPath == "" {
+		return nil
+	}
+	err := os.Remove(pinnedPath)
+	if err == nil || os.IsNotExist(err) {
+		return nil
+	}
+	return err
+}
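
Pin relies on RENAME_NOREPLACE so that moving a pin can never clobber an
existing one; os.Rename would silently replace it. A Linux-only sketch of the
difference (the bpffs paths are illustrative):

package main

import (
	"errors"
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	err := unix.Renameat2(unix.AT_FDCWD, "/sys/fs/bpf/old",
		unix.AT_FDCWD, "/sys/fs/bpf/new", unix.RENAME_NOREPLACE)
	if errors.Is(err, unix.EEXIST) {
		// Unlike os.Rename, the kernel refused to replace the destination.
		fmt.Println("destination already pinned, not replacing it")
	}
}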

+ 10 - 5
vendor/github.com/cilium/ebpf/internal/ptr.go

@@ -1,6 +1,10 @@
 package internal
 
-import "unsafe"
+import (
+	"unsafe"
+
+	"github.com/cilium/ebpf/internal/unix"
+)
 
 // NewPointer creates a 64-bit pointer from an unsafe Pointer.
 func NewPointer(ptr unsafe.Pointer) Pointer {
@@ -22,9 +26,10 @@ func NewStringPointer(str string) Pointer {
 		return Pointer{}
 	}
 
-	// The kernel expects strings to be zero terminated
-	buf := make([]byte, len(str)+1)
-	copy(buf, str)
+	p, err := unix.BytePtrFromString(str)
+	if err != nil {
+		return Pointer{}
+	}
 
-	return Pointer{ptr: unsafe.Pointer(&buf[0])}
+	return Pointer{ptr: unsafe.Pointer(p)}
 }

+ 43 - 2
vendor/github.com/cilium/ebpf/internal/syscall.go

@@ -91,6 +91,19 @@ func BPFProgDetach(attr *BPFProgDetachAttr) error {
 	return err
 }
 
+type BPFEnableStatsAttr struct {
+	StatsType uint32
+}
+
+func BPFEnableStats(attr *BPFEnableStatsAttr) (*FD, error) {
+	ptr, err := BPF(BPF_ENABLE_STATS, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
+	if err != nil {
+		return nil, fmt.Errorf("enable stats: %w", err)
+	}
+	return NewFD(uint32(ptr)), nil
+}
+
 type bpfObjAttr struct {
 	fileName  Pointer
 	fd        uint32
@@ -127,9 +140,10 @@ func BPFObjPin(fileName string, fd *FD) error {
 }
 
 // BPFObjGet wraps BPF_OBJ_GET.
-func BPFObjGet(fileName string) (*FD, error) {
+func BPFObjGet(fileName string, flags uint32) (*FD, error) {
 	attr := bpfObjAttr{
-		fileName: NewStringPointer(fileName),
+		fileName:  NewStringPointer(fileName),
+		fileFlags: flags,
 	}
 	ptr, err := BPF(BPF_OBJ_GET, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
 	if err != nil {
@@ -137,3 +151,30 @@ func BPFObjGet(fileName string) (*FD, error) {
 	}
 	return NewFD(uint32(ptr)), nil
 }
+
+type bpfObjGetInfoByFDAttr struct {
+	fd      uint32
+	infoLen uint32
+	info    Pointer
+}
+
+// BPFObjGetInfoByFD wraps BPF_OBJ_GET_INFO_BY_FD.
+//
+// Available from 4.13.
+func BPFObjGetInfoByFD(fd *FD, info unsafe.Pointer, size uintptr) error {
+	value, err := fd.Value()
+	if err != nil {
+		return err
+	}
+
+	attr := bpfObjGetInfoByFDAttr{
+		fd:      value,
+		infoLen: uint32(size),
+		info:    NewPointer(info),
+	}
+	_, err = BPF(BPF_OBJ_GET_INFO_BY_FD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
+	if err != nil {
+		return fmt.Errorf("fd %v: %w", fd, err)
+	}
+	return nil
+}
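
All of these wrappers share one calling convention: a fixed-layout attribute
struct whose address and size are handed to bpf(2) verbatim, so field order,
widths and padding must match the corresponding bpf_attr variant. A hedged
sketch with a made-up attribute:

package main

import (
	"fmt"
	"unsafe"
)

// infoAttr mimics the shape of bpfObjGetInfoByFDAttr above. The kernel reads
// exactly unsafe.Sizeof(attr) bytes starting at &attr.
type infoAttr struct {
	fd      uint32
	infoLen uint32
	info    uint64 // user pointer carried as a plain 64-bit value
}

func main() {
	attr := infoAttr{fd: 3, infoLen: 64}
	fmt.Println(unsafe.Sizeof(attr)) // 16, on 32- and 64-bit platforms alike
}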

+ 61 - 10
vendor/github.com/cilium/ebpf/internal/unix/types_linux.go

@@ -3,24 +3,34 @@
 package unix
 
 import (
+	"bytes"
 	"syscall"
 
 	linux "golang.org/x/sys/unix"
 )
 
 const (
-	ENOENT                   = linux.ENOENT
-	EEXIST                   = linux.EEXIST
-	EAGAIN                   = linux.EAGAIN
-	ENOSPC                   = linux.ENOSPC
-	EINVAL                   = linux.EINVAL
-	EPOLLIN                  = linux.EPOLLIN
-	EINTR                    = linux.EINTR
-	EPERM                    = linux.EPERM
-	ESRCH                    = linux.ESRCH
-	ENODEV                   = linux.ENODEV
+	ENOENT  = linux.ENOENT
+	EEXIST  = linux.EEXIST
+	EAGAIN  = linux.EAGAIN
+	ENOSPC  = linux.ENOSPC
+	EINVAL  = linux.EINVAL
+	EPOLLIN = linux.EPOLLIN
+	EINTR   = linux.EINTR
+	EPERM   = linux.EPERM
+	ESRCH   = linux.ESRCH
+	ENODEV  = linux.ENODEV
+	// ENOTSUPP is not the same as ENOTSUP or EOPNOTSUPP.
+	ENOTSUPP = syscall.Errno(0x20c)
+
+	EBADF                    = linux.EBADF
+	BPF_F_NO_PREALLOC        = linux.BPF_F_NO_PREALLOC
+	BPF_F_NUMA_NODE          = linux.BPF_F_NUMA_NODE
+	BPF_F_RDONLY             = linux.BPF_F_RDONLY
+	BPF_F_WRONLY             = linux.BPF_F_WRONLY
 	BPF_F_RDONLY_PROG        = linux.BPF_F_RDONLY_PROG
 	BPF_F_WRONLY_PROG        = linux.BPF_F_WRONLY_PROG
+	BPF_F_SLEEPABLE          = linux.BPF_F_SLEEPABLE
 	BPF_OBJ_NAME_LEN         = linux.BPF_OBJ_NAME_LEN
 	BPF_TAG_SIZE             = linux.BPF_TAG_SIZE
 	SYS_BPF                  = linux.SYS_BPF
@@ -33,12 +43,21 @@ const (
 	PROT_WRITE               = linux.PROT_WRITE
 	MAP_SHARED               = linux.MAP_SHARED
 	PERF_TYPE_SOFTWARE       = linux.PERF_TYPE_SOFTWARE
+	PERF_TYPE_TRACEPOINT     = linux.PERF_TYPE_TRACEPOINT
 	PERF_COUNT_SW_BPF_OUTPUT = linux.PERF_COUNT_SW_BPF_OUTPUT
+	PERF_EVENT_IOC_DISABLE   = linux.PERF_EVENT_IOC_DISABLE
+	PERF_EVENT_IOC_ENABLE    = linux.PERF_EVENT_IOC_ENABLE
+	PERF_EVENT_IOC_SET_BPF   = linux.PERF_EVENT_IOC_SET_BPF
 	PerfBitWatermark         = linux.PerfBitWatermark
 	PERF_SAMPLE_RAW          = linux.PERF_SAMPLE_RAW
 	PERF_FLAG_FD_CLOEXEC     = linux.PERF_FLAG_FD_CLOEXEC
 	RLIM_INFINITY            = linux.RLIM_INFINITY
 	RLIMIT_MEMLOCK           = linux.RLIMIT_MEMLOCK
+	BPF_STATS_RUN_TIME       = linux.BPF_STATS_RUN_TIME
+	PERF_RECORD_LOST         = linux.PERF_RECORD_LOST
+	PERF_RECORD_SAMPLE       = linux.PERF_RECORD_SAMPLE
+	AT_FDCWD                 = linux.AT_FDCWD
+	RENAME_NOREPLACE         = linux.RENAME_NOREPLACE
 )
 
 // Statfs_t is a wrapper
@@ -62,6 +81,11 @@ func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
 	return linux.FcntlInt(fd, cmd, arg)
 }
 
+// IoctlSetInt is a wrapper
+func IoctlSetInt(fd int, req uint, value int) error {
+	return linux.IoctlSetInt(fd, req, value)
+}
+
 // Statfs is a wrapper
 func Statfs(path string, buf *Statfs_t) (err error) {
 	return linux.Statfs(path, buf)
@@ -148,3 +172,30 @@ func Gettid() int {
 func Tgkill(tgid int, tid int, sig syscall.Signal) (err error) {
 	return linux.Tgkill(tgid, tid, sig)
 }
+
+// BytePtrFromString is a wrapper
+func BytePtrFromString(s string) (*byte, error) {
+	return linux.BytePtrFromString(s)
+}
+
+// ByteSliceToString is a wrapper
+func ByteSliceToString(s []byte) string {
+	return linux.ByteSliceToString(s)
+}
+
+// Renameat2 is a wrapper
+func Renameat2(olddirfd int, oldpath string, newdirfd int, newpath string, flags uint) error {
+	return linux.Renameat2(olddirfd, oldpath, newdirfd, newpath, flags)
+}
+
+func KernelRelease() (string, error) {
+	var uname Utsname
+	err := Uname(&uname)
+	if err != nil {
+		return "", err
+	}
+
+	end := bytes.IndexByte(uname.Release[:], 0)
+	release := string(uname.Release[:end])
+	return release, nil
+}
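
KernelRelease has to trim the release string itself because Utsname fields
are fixed-size, NUL-padded byte arrays rather than Go strings. For example:

package main

import (
	"bytes"
	"fmt"
)

func main() {
	// Utsname.Release is a [65]byte, NUL-padded by the kernel.
	var release [65]byte
	copy(release[:], "5.10.0-8-amd64")

	end := bytes.IndexByte(release[:], 0) // first NUL terminates the string
	fmt.Println(string(release[:end]))    // 5.10.0-8-amd64
}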

+ 52 - 9
vendor/github.com/cilium/ebpf/internal/unix/types_other.go

@@ -11,17 +11,26 @@ import (
 var errNonLinux = fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH)
 
 const (
-	ENOENT                   = syscall.ENOENT
-	EEXIST                   = syscall.EEXIST
-	EAGAIN                   = syscall.EAGAIN
-	ENOSPC                   = syscall.ENOSPC
-	EINVAL                   = syscall.EINVAL
-	EINTR                    = syscall.EINTR
-	EPERM                    = syscall.EPERM
-	ESRCH                    = syscall.ESRCH
-	ENODEV                   = syscall.ENODEV
+	ENOENT = syscall.ENOENT
+	EEXIST = syscall.EEXIST
+	EAGAIN = syscall.EAGAIN
+	ENOSPC = syscall.ENOSPC
+	EINVAL = syscall.EINVAL
+	EINTR  = syscall.EINTR
+	EPERM  = syscall.EPERM
+	ESRCH  = syscall.ESRCH
+	ENODEV = syscall.ENODEV
+	EBADF  = syscall.Errno(0)
+	// ENOTSUPP is not the same as ENOTSUP or EOPNOTSUPP.
+	ENOTSUPP = syscall.Errno(0x20c)
+
+	BPF_F_NO_PREALLOC        = 0
+	BPF_F_NUMA_NODE          = 0
+	BPF_F_RDONLY             = 0
+	BPF_F_WRONLY             = 0
 	BPF_F_RDONLY_PROG        = 0
 	BPF_F_WRONLY_PROG        = 0
+	BPF_F_SLEEPABLE          = 0
 	BPF_OBJ_NAME_LEN         = 0x10
 	BPF_TAG_SIZE             = 0x8
 	SYS_BPF                  = 321
@@ -35,12 +44,21 @@ const (
 	PROT_WRITE               = 0x2
 	MAP_SHARED               = 0x1
 	PERF_TYPE_SOFTWARE       = 0x1
+	PERF_TYPE_TRACEPOINT     = 0
 	PERF_COUNT_SW_BPF_OUTPUT = 0xa
+	PERF_EVENT_IOC_DISABLE   = 0
+	PERF_EVENT_IOC_ENABLE    = 0
+	PERF_EVENT_IOC_SET_BPF   = 0
 	PerfBitWatermark         = 0x4000
 	PERF_SAMPLE_RAW          = 0x400
 	PERF_FLAG_FD_CLOEXEC     = 0x8
 	RLIM_INFINITY            = 0x7fffffffffffffff
 	RLIMIT_MEMLOCK           = 8
+	BPF_STATS_RUN_TIME       = 0
+	PERF_RECORD_LOST         = 2
+	PERF_RECORD_SAMPLE       = 9
+	AT_FDCWD                 = -0x2
+	RENAME_NOREPLACE         = 0x1
 )
 
 // Statfs_t is a wrapper
@@ -80,6 +98,11 @@ func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
 	return -1, errNonLinux
 }
 
+// IoctlSetInt is a wrapper
+func IoctlSetInt(fd int, req uint, value int) error {
+	return errNonLinux
+}
+
 // Statfs is a wrapper
 func Statfs(path string, buf *Statfs_t) error {
 	return errNonLinux
@@ -194,6 +217,7 @@ func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int
 // Utsname is a wrapper
 type Utsname struct {
 	Release [65]byte
+	Version [65]byte
 }
 
 // Uname is a wrapper
@@ -215,3 +239,22 @@ func Gettid() int {
 func Tgkill(tgid int, tid int, sig syscall.Signal) (err error) {
 	return errNonLinux
 }
+
+// BytePtrFromString is a wrapper
+func BytePtrFromString(s string) (*byte, error) {
+	return nil, errNonLinux
+}
+
+// ByteSliceToString is a wrapper
+func ByteSliceToString(s []byte) string {
+	return ""
+}
+
+// Renameat2 is a wrapper
+func Renameat2(olddirfd int, oldpath string, newdirfd int, newpath string, flags uint) error {
+	return errNonLinux
+}
+
+func KernelRelease() (string, error) {
+	return "", errNonLinux
+}

+ 163 - 0
vendor/github.com/cilium/ebpf/internal/version.go

@@ -0,0 +1,163 @@
+package internal
+
+import (
+	"fmt"
+	"io/ioutil"
+	"regexp"
+	"sync"
+
+	"github.com/cilium/ebpf/internal/unix"
+)
+
+const (
+	// Version constant used in ELF binaries indicating that the loader needs to
+	// substitute the eBPF program's version with the value of the kernel's
+	// KERNEL_VERSION compile-time macro. Used for compatibility with BCC, gobpf
+	// and RedSift.
+	MagicKernelVersion = 0xFFFFFFFE
+)
+
+var (
+	// Match between one and three decimals separated by dots, with the last
+	// segment (patch level) being optional on some kernels.
+	// The x.y.z string must appear at the start of a string or right after
+	// whitespace to prevent sequences like 'x.y.z-a.b.c' from matching 'a.b.c'.
+	rgxKernelVersion = regexp.MustCompile(`(?:\A|\s)\d{1,3}\.\d{1,3}(?:\.\d{1,3})?`)
+
+	kernelVersion = struct {
+		once    sync.Once
+		version Version
+		err     error
+	}{}
+)
+
+// A Version in the form Major.Minor.Patch.
+type Version [3]uint16
+
+// NewVersion creates a version from a string like "Major.Minor.Patch".
+//
+// Patch is optional.
+func NewVersion(ver string) (Version, error) {
+	var major, minor, patch uint16
+	n, _ := fmt.Sscanf(ver, "%d.%d.%d", &major, &minor, &patch)
+	if n < 2 {
+		return Version{}, fmt.Errorf("invalid version: %s", ver)
+	}
+	return Version{major, minor, patch}, nil
+}
+
+func (v Version) String() string {
+	if v[2] == 0 {
+		return fmt.Sprintf("v%d.%d", v[0], v[1])
+	}
+	return fmt.Sprintf("v%d.%d.%d", v[0], v[1], v[2])
+}
+
+// Less returns true if the version is less than another version.
+func (v Version) Less(other Version) bool {
+	for i, a := range v {
+		if a == other[i] {
+			continue
+		}
+		return a < other[i]
+	}
+	return false
+}
+
+// Unspecified returns true if the version is all zero.
+func (v Version) Unspecified() bool {
+	return v[0] == 0 && v[1] == 0 && v[2] == 0
+}
+
+// Kernel implements the kernel's KERNEL_VERSION macro from linux/version.h.
+// It represents the kernel version and patch level as a single value.
+func (v Version) Kernel() uint32 {
+
+	// Kernels 4.4 and 4.9 have their SUBLEVEL clamped to 255 to avoid
+	// overflowing into PATCHLEVEL.
+	// See kernel commit 9b82f13e7ef3 ("kbuild: clamp SUBLEVEL to 255").
+	s := v[2]
+	if s > 255 {
+		s = 255
+	}
+
+	// Truncate members to uint8 to prevent them from spilling over into
+	// each other when overflowing 8 bits.
+	return uint32(uint8(v[0]))<<16 | uint32(uint8(v[1]))<<8 | uint32(uint8(s))
+}
+
+// KernelVersion returns the version of the currently running kernel.
+func KernelVersion() (Version, error) {
+	kernelVersion.once.Do(func() {
+		kernelVersion.version, kernelVersion.err = detectKernelVersion()
+	})
+
+	if kernelVersion.err != nil {
+		return Version{}, kernelVersion.err
+	}
+	return kernelVersion.version, nil
+}
+
+// detectKernelVersion returns the version of the running kernel. It scans the
+// following sources in order: /proc/version_signature, uname -v, uname -r.
+// In each of those locations, the last-appearing x.y(.z) value is selected
+// for parsing. The first location that yields a usable version number is
+// returned.
+func detectKernelVersion() (Version, error) {
+
+	// Try reading /proc/version_signature for Ubuntu compatibility.
+	// Example format: Ubuntu 4.15.0-91.92-generic 4.15.18
+	// This method exists in the kernel itself, see d18acd15c
+	// ("perf tools: Fix kernel version error in ubuntu").
+	if pvs, err := ioutil.ReadFile("/proc/version_signature"); err == nil {
+		// If /proc/version_signature exists, failing to parse it is an error.
+		// It only exists on Ubuntu, where the real patch level is not obtainable
+		// through any other method.
+		v, err := findKernelVersion(string(pvs))
+		if err != nil {
+			return Version{}, err
+		}
+		return v, nil
+	}
+
+	var uname unix.Utsname
+	if err := unix.Uname(&uname); err != nil {
+		return Version{}, fmt.Errorf("calling uname: %w", err)
+	}
+
+	// Debian puts the version including the patch level in uname.Version.
+	// It is not an error if there's no version number in uname.Version,
+	// as most distributions don't use it. Parsing can continue on uname.Release.
+	// Example format: #1 SMP Debian 4.19.37-5+deb10u2 (2019-08-08)
+	if v, err := findKernelVersion(unix.ByteSliceToString(uname.Version[:])); err == nil {
+		return v, nil
+	}
+
+	// Most other distributions have the full kernel version including patch
+	// level in uname.Release.
+	// Example format: 4.19.0-5-amd64, 5.5.10-arch1-1
+	v, err := findKernelVersion(unix.ByteSliceToString(uname.Release[:]))
+	if err != nil {
+		return Version{}, err
+	}
+
+	return v, nil
+}
+
+// findKernelVersion matches s against rgxKernelVersion and parses the result
+// into a Version. If s contains multiple matches, the last entry is selected.
+func findKernelVersion(s string) (Version, error) {
+	m := rgxKernelVersion.FindAllString(s, -1)
+	if m == nil {
+		return Version{}, fmt.Errorf("no kernel version in string: %s", s)
+	}
+	// Pick the last match of the string in case there are multiple.
+	s = m[len(m)-1]
+
+	v, err := NewVersion(s)
+	if err != nil {
+		return Version{}, fmt.Errorf("parsing version string %s: %w", s, err)
+	}
+
+	return v, nil
+}
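
As a quick sanity check of the parsing and KERNEL_VERSION arithmetic above, a sketch; note that this `internal` package is not importable from outside cilium/ebpf, so the calls are shown as if they were local to it:

```go
v, err := NewVersion("5.4.14")
if err != nil {
	// handle parse failure
}

// KERNEL_VERSION(5, 4, 14) == 5<<16 | 4<<8 | 14 == 328718.
fmt.Println(v.Kernel()) // 328718

// Comparison is element-wise, major version first.
fmt.Println(v.Less(Version{5, 6, 0})) // true

// findKernelVersion picks the last match, so an Ubuntu signature like
// "Ubuntu 4.15.0-91.92-generic 4.15.18" parses as 4.15.18, not 4.15.0.
```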

+ 47 - 0
vendor/github.com/cilium/ebpf/linker.go

@@ -84,3 +84,50 @@ func needSection(insns, section asm.Instructions) (bool, error) {
 	// None of the functions in the section are called.
 	return false, nil
 }
+
+func fixupJumpsAndCalls(insns asm.Instructions) error {
+	symbolOffsets := make(map[string]asm.RawInstructionOffset)
+	iter := insns.Iterate()
+	for iter.Next() {
+		ins := iter.Ins
+
+		if ins.Symbol == "" {
+			continue
+		}
+
+		if _, ok := symbolOffsets[ins.Symbol]; ok {
+			return fmt.Errorf("duplicate symbol %s", ins.Symbol)
+		}
+
+		symbolOffsets[ins.Symbol] = iter.Offset
+	}
+
+	iter = insns.Iterate()
+	for iter.Next() {
+		i := iter.Index
+		offset := iter.Offset
+		ins := iter.Ins
+
+		switch {
+		case ins.IsFunctionCall() && ins.Constant == -1:
+			// Rewrite a BPF-to-BPF call.
+			callOffset, ok := symbolOffsets[ins.Reference]
+			if !ok {
+				return fmt.Errorf("instruction %d: reference to missing symbol %q", i, ins.Reference)
+			}
+
+			ins.Constant = int64(callOffset - offset - 1)
+
+		case ins.OpCode.Class() == asm.JumpClass && ins.Offset == -1:
+			// Rewrite a jump to a label.
+			jumpOffset, ok := symbolOffsets[ins.Reference]
+			if !ok {
+				return fmt.Errorf("instruction %d: reference to missing symbol %q", i, ins.Reference)
+			}
+
+			ins.Offset = int16(jumpOffset - offset - 1)
+		}
+	}
+
+	return nil
+}
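
A small illustration of what this pass resolves. This is a sketch only: fixupJumpsAndCalls is unexported and normally runs during program loading, and the asm helpers are assumed to behave as in the public asm package:

```go
insns := asm.Instructions{
	asm.Mov.Imm(asm.R0, 0),
	// Encoded with Offset == -1 and Reference == "exit" until fixed up.
	asm.JEq.Imm(asm.R0, 1, "exit"),
	asm.Mov.Imm(asm.R0, 1),
	// The jump target carries the symbol.
	asm.Return().Sym("exit"),
}

// After fixupJumpsAndCalls, the JEq at raw offset 1 has Offset == 1,
// i.e. target offset (3) - own offset (1) - 1.
```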

File diff suppressed because it is too large
+ 558 - 151
vendor/github.com/cilium/ebpf/map.go


+ 23 - 8
vendor/github.com/cilium/ebpf/marshalers.go

@@ -13,14 +13,12 @@ import (
 	"github.com/cilium/ebpf/internal"
 )
 
+// marshalPtr converts an arbitrary value into a pointer suitable
+// to be passed to the kernel.
+//
+// As an optimization, it returns the original value if it is an
+// unsafe.Pointer.
 func marshalPtr(data interface{}, length int) (internal.Pointer, error) {
-	if data == nil {
-		if length == 0 {
-			return internal.NewPointer(nil), nil
-		}
-		return internal.Pointer{}, errors.New("can't use nil as key of map")
-	}
-
 	if ptr, ok := data.(unsafe.Pointer); ok {
 		return internal.NewPointer(ptr), nil
 	}
@@ -33,6 +31,13 @@ func marshalPtr(data interface{}, length int) (internal.Pointer, error) {
 	return internal.NewSlicePointer(buf), nil
 }
 
+// marshalBytes converts an arbitrary value into a byte buffer.
+//
+// Prefer using Map.marshalKey and Map.marshalValue if possible, since
+// those have special cases that allow more types to be encoded.
+//
+// Returns an error if the given value isn't representable in exactly
+// length bytes.
 func marshalBytes(data interface{}, length int) (buf []byte, err error) {
 	switch value := data.(type) {
 	case encoding.BinaryMarshaler:
@@ -43,6 +48,8 @@ func marshalBytes(data interface{}, length int) (buf []byte, err error) {
 		buf = value
 	case unsafe.Pointer:
 		err = errors.New("can't marshal from unsafe.Pointer")
+	case Map, *Map, Program, *Program:
+		err = fmt.Errorf("can't marshal %T", value)
 	default:
 		var wr bytes.Buffer
 		err = binary.Write(&wr, internal.NativeEndian, value)
@@ -70,10 +77,16 @@ func makeBuffer(dst interface{}, length int) (internal.Pointer, []byte) {
 	return internal.NewSlicePointer(buf), buf
 }
 
+// unmarshalBytes converts a byte buffer into an arbitrary value.
+//
+// Prefer using Map.unmarshalKey and Map.unmarshalValue if possible, since
+// those have special cases that allow more types to be encoded.
 func unmarshalBytes(data interface{}, buf []byte) error {
 	switch value := data.(type) {
 	case unsafe.Pointer:
-		sh := &reflect.SliceHeader{
+		// This could be solved in Go 1.17 by unsafe.Slice instead. (https://github.com/golang/go/issues/19367)
+		// We could opt for removing unsafe.Pointer support in the lib as well.
+		sh := &reflect.SliceHeader{ //nolint:govet
 			Data: uintptr(value),
 			Len:  len(buf),
 			Cap:  len(buf),
@@ -83,6 +96,8 @@ func unmarshalBytes(data interface{}, buf []byte) error {
 		copy(dst, buf)
 		runtime.KeepAlive(value)
 		return nil
+	case Map, *Map, Program, *Program:
+		return fmt.Errorf("can't unmarshal into %T", value)
 	case encoding.BinaryUnmarshaler:
 		return value.UnmarshalBinary(buf)
 	case *string:
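
(The hunk above ends inside the type switch.) As the new comment notes, the reflect.SliceHeader construction in the unsafe.Pointer case can be replaced on Go 1.17+ by unsafe.Slice. A minimal sketch of the equivalent case body, assuming `value` is the unsafe.Pointer from that case:

```go
// Equivalent to the SliceHeader construction, but vet-clean (Go 1.17+).
dst := unsafe.Slice((*byte)(value), len(buf))
copy(dst, buf)
runtime.KeepAlive(value)
return nil
```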

+ 240 - 147
vendor/github.com/cilium/ebpf/prog.go

@@ -6,6 +6,7 @@ import (
 	"errors"
 	"fmt"
 	"math"
+	"path/filepath"
 	"strings"
 	"time"
 
@@ -18,7 +19,7 @@ import (
 // ErrNotSupported is returned whenever the kernel doesn't support a feature.
 var ErrNotSupported = internal.ErrNotSupported
 
-// ProgramID represents the unique ID of an eBPF program
+// ProgramID represents the unique ID of an eBPF program.
 type ProgramID uint32
 
 const (
@@ -42,7 +43,7 @@ type ProgramOptions struct {
 	LogSize int
 }
 
-// ProgramSpec defines a Program
+// ProgramSpec defines a Program.
 type ProgramSpec struct {
 	// Name is passed to the kernel as a debug aid. Must only contain
 	// alpha numeric and '_' characters.
@@ -54,16 +55,19 @@ type ProgramSpec struct {
 	// depends on Type and AttachType.
 	AttachTo     string
 	Instructions asm.Instructions
-
+	// Flags is passed to the kernel and specifies additional program
+	// load attributes.
+	Flags uint32
 	// License of the program. Some helpers are only available if
 	// the license is deemed compatible with the GPL.
 	//
 	// See https://www.kernel.org/doc/html/latest/process/license-rules.html#id1
 	License string
 
-	// Version used by tracing programs.
+	// Version used by Kprobe programs.
 	//
-	// Deprecated: superseded by BTF.
+	// Deprecated on kernels 5.0 and later. Leave empty to let the library
+	// detect this value automatically.
 	KernelVersion uint32
 
 	// The BTF associated with this program. Changing Instructions
@@ -87,6 +91,13 @@ func (ps *ProgramSpec) Copy() *ProgramSpec {
 	return &cpy
 }
 
+// Tag calculates the kernel tag for a series of instructions.
+//
+// Use asm.Instructions.Tag if you need to calculate for non-native endianness.
+func (ps *ProgramSpec) Tag() (string, error) {
+	return ps.Instructions.Tag(internal.NativeEndian)
+}
+
 // Program represents BPF program loaded into the kernel.
 //
 // It is not safe to close a Program which is used by other goroutines.
@@ -97,8 +108,8 @@ type Program struct {
 
 	fd         *internal.FD
 	name       string
-	abi        ProgramABI
-	attachType AttachType
+	pinnedPath string
+	typ        ProgramType
 }
 
 // NewProgram creates a new Program.
@@ -114,24 +125,112 @@ func NewProgram(spec *ProgramSpec) (*Program, error) {
 // Loading a program for the first time will perform
 // feature detection by loading small, temporary programs.
 func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) {
-	if spec.BTF == nil {
-		return newProgramWithBTF(spec, nil, opts)
+	btfs := make(btfHandleCache)
+	defer btfs.close()
+
+	return newProgramWithOptions(spec, opts, btfs)
+}
+
+func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, btfs btfHandleCache) (*Program, error) {
+	if len(spec.Instructions) == 0 {
+		return nil, errors.New("Instructions cannot be empty")
+	}
+
+	if len(spec.License) == 0 {
+		return nil, errors.New("License cannot be empty")
 	}
 
-	handle, err := btf.NewHandle(btf.ProgramSpec(spec.BTF))
-	if err != nil && !errors.Is(err, btf.ErrNotSupported) {
-		return nil, fmt.Errorf("can't load BTF: %w", err)
+	if spec.ByteOrder != nil && spec.ByteOrder != internal.NativeEndian {
+		return nil, fmt.Errorf("can't load %s program on %s", spec.ByteOrder, internal.NativeEndian)
 	}
 
-	return newProgramWithBTF(spec, handle, opts)
-}
+	// Kernels before 5.0 (6c4fc209fcf9 "bpf: remove useless version check for prog load")
+	// require the version field to be set to the value of the KERNEL_VERSION
+	// macro for kprobe-type programs.
+	// Overwrite Kprobe program version if set to zero or the magic version constant.
+	kv := spec.KernelVersion
+	if spec.Type == Kprobe && (kv == 0 || kv == internal.MagicKernelVersion) {
+		v, err := internal.KernelVersion()
+		if err != nil {
+			return nil, fmt.Errorf("detecting kernel version: %w", err)
+		}
+		kv = v.Kernel()
+	}
+
+	insns := make(asm.Instructions, len(spec.Instructions))
+	copy(insns, spec.Instructions)
+
+	if err := fixupJumpsAndCalls(insns); err != nil {
+		return nil, err
+	}
 
-func newProgramWithBTF(spec *ProgramSpec, btf *btf.Handle, opts ProgramOptions) (*Program, error) {
-	attr, err := convertProgramSpec(spec, btf)
+	buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize))
+	err := insns.Marshal(buf, internal.NativeEndian)
 	if err != nil {
 		return nil, err
 	}
 
+	bytecode := buf.Bytes()
+	insCount := uint32(len(bytecode) / asm.InstructionSize)
+	attr := &bpfProgLoadAttr{
+		progType:           spec.Type,
+		progFlags:          spec.Flags,
+		expectedAttachType: spec.AttachType,
+		insCount:           insCount,
+		instructions:       internal.NewSlicePointer(bytecode),
+		license:            internal.NewStringPointer(spec.License),
+		kernelVersion:      kv,
+	}
+
+	if haveObjName() == nil {
+		attr.progName = newBPFObjName(spec.Name)
+	}
+
+	var btfDisabled bool
+	if spec.BTF != nil {
+		if relos, err := btf.ProgramRelocations(spec.BTF, nil); err != nil {
+			return nil, fmt.Errorf("CO-RE relocations: %s", err)
+		} else if len(relos) > 0 {
+			return nil, fmt.Errorf("applying CO-RE relocations: %w", ErrNotSupported)
+		}
+
+		handle, err := btfs.load(btf.ProgramSpec(spec.BTF))
+		btfDisabled = errors.Is(err, btf.ErrNotSupported)
+		if err != nil && !btfDisabled {
+			return nil, fmt.Errorf("load BTF: %w", err)
+		}
+
+		if handle != nil {
+			attr.progBTFFd = uint32(handle.FD())
+
+			recSize, bytes, err := btf.ProgramLineInfos(spec.BTF)
+			if err != nil {
+				return nil, fmt.Errorf("get BTF line infos: %w", err)
+			}
+			attr.lineInfoRecSize = recSize
+			attr.lineInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize))
+			attr.lineInfo = internal.NewSlicePointer(bytes)
+
+			recSize, bytes, err = btf.ProgramFuncInfos(spec.BTF)
+			if err != nil {
+				return nil, fmt.Errorf("get BTF function infos: %w", err)
+			}
+			attr.funcInfoRecSize = recSize
+			attr.funcInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize))
+			attr.funcInfo = internal.NewSlicePointer(bytes)
+		}
+	}
+
+	if spec.AttachTo != "" {
+		target, err := resolveBTFType(spec.AttachTo, spec.Type, spec.AttachType)
+		if err != nil {
+			return nil, err
+		}
+		if target != nil {
+			attr.attachBTFID = target.ID()
+		}
+	}
+
 	logSize := DefaultVerifierLogSize
 	if opts.LogSize > 0 {
 		logSize = opts.LogSize
@@ -147,9 +246,7 @@ func newProgramWithBTF(spec *ProgramSpec, btf *btf.Handle, opts ProgramOptions)
 
 	fd, err := bpfProgLoad(attr)
 	if err == nil {
-		prog := newProgram(fd, spec.Name, &ProgramABI{spec.Type})
-		prog.VerifierLog = internal.CString(logBuf)
-		return prog, nil
+		return &Program{internal.CString(logBuf), fd, spec.Name, "", spec.Type}, nil
 	}
 
 	logErr := err
@@ -163,115 +260,71 @@ func newProgramWithBTF(spec *ProgramSpec, btf *btf.Handle, opts ProgramOptions)
 		_, logErr = bpfProgLoad(attr)
 	}
 
+	if errors.Is(logErr, unix.EPERM) && logBuf[0] == 0 {
+		// EPERM due to RLIMIT_MEMLOCK happens before the verifier, so we can
+		// check that the log is empty to reduce false positives.
+		return nil, fmt.Errorf("load program: RLIMIT_MEMLOCK may be too low: %w", logErr)
+	}
+
 	err = internal.ErrorWithLog(err, logBuf, logErr)
-	return nil, fmt.Errorf("can't load program: %w", err)
+	if btfDisabled {
+		return nil, fmt.Errorf("load program without BTF: %w", err)
+	}
+	return nil, fmt.Errorf("load program: %w", err)
 }
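
For callers, the practical consequence of the version handling above is that Kprobe specs can leave KernelVersion at zero. A minimal sketch using only the public API shown in this file; the program body and license are illustrative:

```go
spec := &ebpf.ProgramSpec{
	Type:    ebpf.Kprobe,
	License: "GPL",
	// KernelVersion left at zero: on pre-5.0 kernels the library
	// substitutes the running kernel's KERNEL_VERSION value.
	Instructions: asm.Instructions{
		asm.Mov.Imm(asm.R0, 0),
		asm.Return(),
	},
}

prog, err := ebpf.NewProgram(spec)
if err != nil {
	// handle load / verifier error
}
defer prog.Close()
```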
 
 // NewProgramFromFD creates a program from a raw fd.
 //
 // You should not use fd after calling this function.
 //
-// Requires at least Linux 4.11.
+// Requires at least Linux 4.10.
 func NewProgramFromFD(fd int) (*Program, error) {
 	if fd < 0 {
 		return nil, errors.New("invalid fd")
 	}
-	bpfFd := internal.NewFD(uint32(fd))
 
-	name, abi, err := newProgramABIFromFd(bpfFd)
-	if err != nil {
-		bpfFd.Forget()
-		return nil, err
-	}
-
-	return newProgram(bpfFd, name, abi), nil
-}
-
-func newProgram(fd *internal.FD, name string, abi *ProgramABI) *Program {
-	return &Program{
-		name: name,
-		fd:   fd,
-		abi:  *abi,
-	}
+	return newProgramFromFD(internal.NewFD(uint32(fd)))
 }
 
-func convertProgramSpec(spec *ProgramSpec, handle *btf.Handle) (*bpfProgLoadAttr, error) {
-	if len(spec.Instructions) == 0 {
-		return nil, errors.New("Instructions cannot be empty")
-	}
-
-	if len(spec.License) == 0 {
-		return nil, errors.New("License cannot be empty")
-	}
-
-	if spec.ByteOrder != nil && spec.ByteOrder != internal.NativeEndian {
-		return nil, fmt.Errorf("can't load %s program on %s", spec.ByteOrder, internal.NativeEndian)
-	}
-
-	buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize))
-	err := spec.Instructions.Marshal(buf, internal.NativeEndian)
+// NewProgramFromID returns the program for a given id.
+//
+// Returns ErrNotExist, if there is no eBPF program with the given id.
+func NewProgramFromID(id ProgramID) (*Program, error) {
+	fd, err := bpfObjGetFDByID(internal.BPF_PROG_GET_FD_BY_ID, uint32(id))
 	if err != nil {
-		return nil, err
-	}
-
-	bytecode := buf.Bytes()
-	insCount := uint32(len(bytecode) / asm.InstructionSize)
-	attr := &bpfProgLoadAttr{
-		progType:           spec.Type,
-		expectedAttachType: spec.AttachType,
-		insCount:           insCount,
-		instructions:       internal.NewSlicePointer(bytecode),
-		license:            internal.NewStringPointer(spec.License),
-		kernelVersion:      spec.KernelVersion,
-	}
-
-	if haveObjName() == nil {
-		attr.progName = newBPFObjName(spec.Name)
+		return nil, fmt.Errorf("get program by id: %w", err)
 	}
 
-	if handle != nil && spec.BTF != nil {
-		attr.progBTFFd = uint32(handle.FD())
-
-		recSize, bytes, err := btf.ProgramLineInfos(spec.BTF)
-		if err != nil {
-			return nil, fmt.Errorf("can't get BTF line infos: %w", err)
-		}
-		attr.lineInfoRecSize = recSize
-		attr.lineInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize))
-		attr.lineInfo = internal.NewSlicePointer(bytes)
-
-		recSize, bytes, err = btf.ProgramFuncInfos(spec.BTF)
-		if err != nil {
-			return nil, fmt.Errorf("can't get BTF function infos: %w", err)
-		}
-		attr.funcInfoRecSize = recSize
-		attr.funcInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize))
-		attr.funcInfo = internal.NewSlicePointer(bytes)
-	}
+	return newProgramFromFD(fd)
+}
 
-	if spec.AttachTo != "" {
-		target, err := resolveBTFType(spec.AttachTo, spec.Type, spec.AttachType)
-		if err != nil {
-			return nil, err
-		}
-		if target != nil {
-			attr.attachBTFID = target.ID()
-		}
+func newProgramFromFD(fd *internal.FD) (*Program, error) {
+	info, err := newProgramInfoFromFd(fd)
+	if err != nil {
+		fd.Close()
+		return nil, fmt.Errorf("discover program type: %w", err)
 	}
 
-	return attr, nil
+	return &Program{"", fd, "", "", info.Type}, nil
 }
 
 func (p *Program) String() string {
 	if p.name != "" {
-		return fmt.Sprintf("%s(%s)#%v", p.abi.Type, p.name, p.fd)
+		return fmt.Sprintf("%s(%s)#%v", p.typ, p.name, p.fd)
 	}
-	return fmt.Sprintf("%s#%v", p.abi.Type, p.fd)
+	return fmt.Sprintf("%s(%v)", p.typ, p.fd)
 }
 
-// ABI gets the ABI of the Program
-func (p *Program) ABI() ProgramABI {
-	return p.abi
+// Type returns the underlying type of the program.
+func (p *Program) Type() ProgramType {
+	return p.typ
+}
+
+// Info returns metadata about the program.
+//
+// Requires at least 4.10.
+func (p *Program) Info() (*ProgramInfo, error) {
+	return newProgramInfoFromFd(p.fd)
 }
 
 // FD gets the file descriptor of the Program.
@@ -303,19 +356,42 @@ func (p *Program) Clone() (*Program, error) {
 		return nil, fmt.Errorf("can't clone program: %w", err)
 	}
 
-	return newProgram(dup, p.name, &p.abi), nil
+	return &Program{p.VerifierLog, dup, p.name, "", p.typ}, nil
 }
 
-// Pin persists the Program past the lifetime of the process that created it
+// Pin persists the Program on the BPF virtual file system past the lifetime of
+// the process that created it
 //
-// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional
+// Calling Pin on a previously pinned program will overwrite the path, except when
+// the new path already exists. Re-pinning across filesystems is not supported.
+//
+// This requires bpffs to be mounted above fileName. See https://docs.cilium.io/en/k8s-doc/admin/#admin-mount-bpffs
 func (p *Program) Pin(fileName string) error {
-	if err := internal.BPFObjPin(fileName, p.fd); err != nil {
-		return fmt.Errorf("can't pin program: %w", err)
+	if err := internal.Pin(p.pinnedPath, fileName, p.fd); err != nil {
+		return err
 	}
+	p.pinnedPath = fileName
 	return nil
 }
 
+// Unpin removes the persisted state for the Program from the BPF virtual filesystem.
+//
+// Failed calls to Unpin will not alter the state returned by IsPinned.
+//
+// Unpinning an unpinned Program returns nil.
+func (p *Program) Unpin() error {
+	if err := internal.Unpin(p.pinnedPath); err != nil {
+		return err
+	}
+	p.pinnedPath = ""
+	return nil
+}
+
+// IsPinned returns true if the Program has a non-empty pinned path.
+func (p *Program) IsPinned() bool {
+	return p.pinnedPath != ""
+}
+
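
A short usage sketch for the pinning lifecycle above; the bpffs mount point and pin path are illustrative:

```go
const path = "/sys/fs/bpf/my_prog" // hypothetical pin location

if err := prog.Pin(path); err != nil {
	// handle pin error
}
fmt.Println(prog.IsPinned()) // true

// Later, possibly from another process:
pinned, err := ebpf.LoadPinnedProgram(path, nil)
if err != nil {
	// handle error
}
defer pinned.Close()

// Remove the persisted state when done.
if err := prog.Unpin(); err != nil {
	// handle unpin error
}
```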
 // Close unloads the program from the kernel.
 func (p *Program) Close() error {
 	if p == nil {
@@ -359,7 +435,7 @@ func (p *Program) Benchmark(in []byte, repeat int, reset func()) (uint32, time.D
 	return ret, total, nil
 }
 
-var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() (bool, error) {
+var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() error {
 	prog, err := NewProgram(&ProgramSpec{
 		Type: SocketFilter,
 		Instructions: asm.Instructions{
@@ -370,7 +446,7 @@ var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() (
 	})
 	if err != nil {
 		// This may be because we lack sufficient permissions, etc.
-		return false, err
+		return err
 	}
 	defer prog.Close()
 
@@ -383,10 +459,16 @@ var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() (
 	}
 
 	err = bpfProgTestRun(&attr)
-
-	// Check for EINVAL specifically, rather than err != nil since we
-	// otherwise misdetect due to insufficient permissions.
-	return !errors.Is(err, unix.EINVAL), nil
+	if errors.Is(err, unix.EINVAL) {
+		// Check for EINVAL specifically, rather than err != nil since we
+		// otherwise misdetect due to insufficient permissions.
+		return internal.ErrNotSupported
+	}
+	if errors.Is(err, unix.EINTR) {
+		// We know that PROG_TEST_RUN is supported if we get EINTR.
+		return nil
+	}
+	return err
 })
 
 func (p *Program) testRun(in []byte, repeat int, reset func()) (uint32, []byte, time.Duration, error) {
@@ -465,8 +547,11 @@ func unmarshalProgram(buf []byte) (*Program, error) {
 	return NewProgramFromID(ProgramID(id))
 }
 
-// MarshalBinary implements BinaryMarshaler.
-func (p *Program) MarshalBinary() ([]byte, error) {
+func marshalProgram(p *Program, length int) ([]byte, error) {
+	if length != 4 {
+		return nil, fmt.Errorf("can't marshal program to %d bytes", length)
+	}
+
 	value, err := p.fd.Value()
 	if err != nil {
 		return nil, err
@@ -529,28 +614,28 @@ func (p *Program) Detach(fd int, typ AttachType, flags AttachFlags) error {
 // LoadPinnedProgram loads a Program from a BPF file.
 //
 // Requires at least Linux 4.11.
-func LoadPinnedProgram(fileName string) (*Program, error) {
-	fd, err := internal.BPFObjGet(fileName)
+func LoadPinnedProgram(fileName string, opts *LoadPinOptions) (*Program, error) {
+	fd, err := internal.BPFObjGet(fileName, opts.Marshal())
 	if err != nil {
 		return nil, err
 	}
 
-	name, abi, err := newProgramABIFromFd(fd)
+	info, err := newProgramInfoFromFd(fd)
 	if err != nil {
 		_ = fd.Close()
-		return nil, fmt.Errorf("can't get ABI for %s: %w", fileName, err)
+		return nil, fmt.Errorf("info for %s: %w", fileName, err)
 	}
 
-	return newProgram(fd, name, abi), nil
+	return &Program{"", fd, filepath.Base(fileName), fileName, info.Type}, nil
 }
 
-// SanitizeName replaces all invalid characters in name.
-//
-// Use this to automatically generate valid names for maps and
-// programs at run time.
+// SanitizeName replaces all invalid characters in name with replacement.
+// Passing a negative value for replacement will delete characters instead
+// of replacing them. Use this to automatically generate valid names for maps
+// and programs at runtime.
 //
-// Passing a negative value for replacement will delete characters
-// instead of replacing them.
+// The set of allowed characters depends on the running kernel version.
+// Dots are only allowed as of kernel 5.2.
 func SanitizeName(name string, replacement rune) string {
 	return strings.Map(func(char rune) rune {
 		if invalidBPFObjNameChar(char) {
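
(The hunk above ends inside SanitizeName.) A usage sketch for the expanded doc comment; the pre-5.2 output is assumed, since dot handling depends on the running kernel:

```go
// On a kernel older than 5.2, '.' is invalid and is replaced as well:
fmt.Println(ebpf.SanitizeName("my-prog.v1", '_')) // "my_prog_v1"

// A negative replacement deletes invalid characters instead:
fmt.Println(ebpf.SanitizeName("my-prog", -1)) // "myprog"
```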
@@ -568,25 +653,9 @@ func ProgramGetNextID(startID ProgramID) (ProgramID, error) {
 	return ProgramID(id), err
 }
 
-// NewProgramFromID returns the program for a given id.
-//
-// Returns ErrNotExist, if there is no eBPF program with the given id.
-func NewProgramFromID(id ProgramID) (*Program, error) {
-	fd, err := bpfObjGetFDByID(internal.BPF_PROG_GET_FD_BY_ID, uint32(id))
-	if err != nil {
-		return nil, err
-	}
-
-	name, abi, err := newProgramABIFromFd(fd)
-	if err != nil {
-		_ = fd.Close()
-		return nil, err
-	}
-
-	return newProgram(fd, name, abi), nil
-}
-
 // ID returns the systemwide unique ID of the program.
+//
+// Deprecated: use ProgramInfo.ID() instead.
 func (p *Program) ID() (ProgramID, error) {
 	info, err := bpfGetProgInfoByFD(p.fd)
 	if err != nil {
@@ -595,12 +664,16 @@ func (p *Program) ID() (ProgramID, error) {
 	return ProgramID(info.id), nil
 }
 
-func resolveBTFType(name string, progType ProgramType, attachType AttachType) (btf.Type, error) {
+func findKernelType(name string, typ btf.Type) error {
 	kernel, err := btf.LoadKernelSpec()
 	if err != nil {
-		return nil, fmt.Errorf("can't resolve BTF type %s: %w", name, err)
+		return fmt.Errorf("can't load kernel spec: %w", err)
 	}
 
+	return kernel.FindType(name, typ)
+}
+
+func resolveBTFType(name string, progType ProgramType, attachType AttachType) (btf.Type, error) {
 	type match struct {
 		p ProgramType
 		a AttachType
@@ -608,10 +681,30 @@ func resolveBTFType(name string, progType ProgramType, attachType AttachType) (b
 
 	target := match{progType, attachType}
 	switch target {
+	case match{LSM, AttachLSMMac}:
+		var target btf.Func
+		err := findKernelType("bpf_lsm_"+name, &target)
+		if errors.Is(err, btf.ErrNotFound) {
+			return nil, &internal.UnsupportedFeatureError{
+				Name: name + " LSM hook",
+			}
+		}
+		if err != nil {
+			return nil, fmt.Errorf("resolve BTF for LSM hook %s: %w", name, err)
+		}
+
+		return &target, nil
+
 	case match{Tracing, AttachTraceIter}:
 		var target btf.Func
-		if err := kernel.FindType("bpf_iter_"+name, &target); err != nil {
-			return nil, fmt.Errorf("can't resolve BTF for iterator %s: %w", name, err)
+		err := findKernelType("bpf_iter_"+name, &target)
+		if errors.Is(err, btf.ErrNotFound) {
+			return nil, &internal.UnsupportedFeatureError{
+				Name: name + " iterator",
+			}
+		}
+		if err != nil {
+			return nil, fmt.Errorf("resolve BTF for iterator %s: %w", name, err)
 		}
 
 		return &target, nil

+ 0 - 25
vendor/github.com/cilium/ebpf/readme.md

@@ -1,25 +0,0 @@
-eBPF
--------
-[![](https://godoc.org/github.com/cilium/ebpf?status.svg)](https://godoc.org/github.com/cilium/ebpf)
-
-eBPF is a pure Go library that provides utilities for loading, compiling, and debugging eBPF programs. It has minimal external dependencies and is intended to be used in long running processes.
-
-[ebpf/asm](https://godoc.org/github.com/cilium/ebpf/asm) contains a basic assembler.
-
-The library is maintained by [Cloudflare](https://www.cloudflare.com) and [Cilium](https://www.cilium.io). Feel free to [join](https://cilium.herokuapp.com/) the [libbpf-go](https://cilium.slack.com/messages/libbpf-go) channel on Slack.
-
-## Current status
-
-The package is production ready, but **the API is explicitly unstable
-right now**. Expect to update your code if you want to follow along.
-
-## Requirements
-
-* A version of Go that is [supported by upstream](https://golang.org/doc/devel/release.html#policy)
-* Linux 4.9, 4.19 or 5.4 (versions in-between should work, but are not tested)
-
-## Useful resources
-
-* [Cilium eBPF documentation](https://cilium.readthedocs.io/en/latest/bpf/#bpf-guide) (recommended)
-* [Linux documentation on BPF](http://elixir.free-electrons.com/linux/latest/source/Documentation/networking/filter.txt)
-* [eBPF features by Linux version](https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md)

+ 142 - 89
vendor/github.com/cilium/ebpf/syscalls.go

@@ -3,7 +3,6 @@ package ebpf
 import (
 	"errors"
 	"fmt"
-	"os"
 	"unsafe"
 
 	"github.com/cilium/ebpf/internal"
@@ -12,9 +11,7 @@ import (
 )
 
 // Generic errors returned by BPF syscalls.
-var (
-	ErrNotExist = errors.New("requested object does not exist")
-)
+var ErrNotExist = errors.New("requested object does not exist")
 
 // bpfObjName is a null-terminated string made up of
 // 'A-Za-z0-9_' characters.
@@ -27,18 +24,20 @@ func newBPFObjName(name string) bpfObjName {
 	return result
 }
 
+// invalidBPFObjNameChar returns true if char may not appear in
+// a BPF object name.
 func invalidBPFObjNameChar(char rune) bool {
 	dotAllowed := objNameAllowsDot() == nil
 
 	switch {
 	case char >= 'A' && char <= 'Z':
-		fallthrough
+		return false
 	case char >= 'a' && char <= 'z':
-		fallthrough
+		return false
 	case char >= '0' && char <= '9':
-		fallthrough
+		return false
 	case dotAllowed && char == '.':
-		fallthrough
+		return false
 	case char == '_':
 		return false
 	default:
@@ -69,14 +68,32 @@ type bpfMapOpAttr struct {
 	flags   uint64
 }
 
+type bpfBatchMapOpAttr struct {
+	inBatch   internal.Pointer
+	outBatch  internal.Pointer
+	keys      internal.Pointer
+	values    internal.Pointer
+	count     uint32
+	mapFd     uint32
+	elemFlags uint64
+	flags     uint64
+}
+
 type bpfMapInfo struct {
-	mapType    uint32
-	id         uint32
-	keySize    uint32
-	valueSize  uint32
-	maxEntries uint32
-	flags      uint32
-	mapName    bpfObjName // since 4.15 ad5b177bd73f
+	map_type                  uint32 // since 4.12 1e2709769086
+	id                        uint32
+	key_size                  uint32
+	value_size                uint32
+	max_entries               uint32
+	map_flags                 uint32
+	name                      bpfObjName // since 4.15 ad5b177bd73f
+	ifindex                   uint32     // since 4.16 52775b33bb50
+	btf_vmlinux_value_type_id uint32     // since 5.6  85d33df357b6
+	netns_dev                 uint64     // since 4.16 52775b33bb50
+	netns_ino                 uint64
+	btf_id                    uint32 // since 4.18 78958fca7ead
+	btf_key_type_id           uint32 // since 4.18 9b2cf328b2ec
+	btf_value_type_id         uint32
 }
 
 type bpfProgLoadAttr struct {
@@ -104,18 +121,40 @@ type bpfProgLoadAttr struct {
 }
 
 type bpfProgInfo struct {
-	progType     uint32
-	id           uint32
-	tag          [unix.BPF_TAG_SIZE]byte
-	jitedLen     uint32
-	xlatedLen    uint32
-	jited        internal.Pointer
-	xlated       internal.Pointer
-	loadTime     uint64 // since 4.15 cb4d2b3f03d8
-	createdByUID uint32
-	nrMapIDs     uint32
-	mapIds       internal.Pointer
-	name         bpfObjName
+	prog_type                uint32
+	id                       uint32
+	tag                      [unix.BPF_TAG_SIZE]byte
+	jited_prog_len           uint32
+	xlated_prog_len          uint32
+	jited_prog_insns         internal.Pointer
+	xlated_prog_insns        internal.Pointer
+	load_time                uint64 // since 4.15 cb4d2b3f03d8
+	created_by_uid           uint32
+	nr_map_ids               uint32
+	map_ids                  internal.Pointer
+	name                     bpfObjName // since 4.15 067cae47771c
+	ifindex                  uint32
+	gpl_compatible           uint32
+	netns_dev                uint64
+	netns_ino                uint64
+	nr_jited_ksyms           uint32
+	nr_jited_func_lens       uint32
+	jited_ksyms              internal.Pointer
+	jited_func_lens          internal.Pointer
+	btf_id                   uint32
+	func_info_rec_size       uint32
+	func_info                internal.Pointer
+	nr_func_info             uint32
+	nr_line_info             uint32
+	line_info                internal.Pointer
+	jited_line_info          internal.Pointer
+	nr_jited_line_info       uint32
+	line_info_rec_size       uint32
+	jited_line_info_rec_size uint32
+	nr_prog_tags             uint32
+	prog_tags                internal.Pointer
+	run_time_ns              uint64
+	run_cnt                  uint64
 }
 
 type bpfProgTestRunAttr struct {
@@ -129,12 +168,6 @@ type bpfProgTestRunAttr struct {
 	duration    uint32
 }
 
-type bpfObjGetInfoByFDAttr struct {
-	fd      uint32
-	infoLen uint32
-	info    internal.Pointer // May be either bpfMapInfo or bpfProgInfo
-}
-
 type bpfGetFDByIDAttr struct {
 	id   uint32
 	next uint32
@@ -174,10 +207,6 @@ func bpfProgTestRun(attr *bpfProgTestRunAttr) error {
 
 func bpfMapCreate(attr *bpfMapCreateAttr) (*internal.FD, error) {
 	fd, err := internal.BPF(internal.BPF_MAP_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr))
-	if errors.Is(err, os.ErrPermission) {
-		return nil, errors.New("permission denied or insufficient rlimit to lock memory for map")
-	}
-
 	if err != nil {
 		return nil, err
 	}
@@ -185,35 +214,25 @@ func bpfMapCreate(attr *bpfMapCreateAttr) (*internal.FD, error) {
 	return internal.NewFD(uint32(fd)), nil
 }
 
-var haveNestedMaps = internal.FeatureTest("nested maps", "4.12", func() (bool, error) {
-	inner, err := bpfMapCreate(&bpfMapCreateAttr{
-		mapType:    Array,
-		keySize:    4,
-		valueSize:  4,
-		maxEntries: 1,
-	})
-	if err != nil {
-		return false, err
-	}
-	defer inner.Close()
-
-	innerFd, _ := inner.Value()
-	nested, err := bpfMapCreate(&bpfMapCreateAttr{
+var haveNestedMaps = internal.FeatureTest("nested maps", "4.12", func() error {
+	_, err := bpfMapCreate(&bpfMapCreateAttr{
 		mapType:    ArrayOfMaps,
 		keySize:    4,
 		valueSize:  4,
 		maxEntries: 1,
-		innerMapFd: innerFd,
+		// Invalid file descriptor.
+		innerMapFd: ^uint32(0),
 	})
-	if err != nil {
-		return false, nil
+	if errors.Is(err, unix.EINVAL) {
+		return internal.ErrNotSupported
 	}
-
-	_ = nested.Close()
-	return true, nil
+	if errors.Is(err, unix.EBADF) {
+		return nil
+	}
+	return err
 })
 
-var haveMapMutabilityModifiers = internal.FeatureTest("read- and write-only maps", "5.2", func() (bool, error) {
+var haveMapMutabilityModifiers = internal.FeatureTest("read- and write-only maps", "5.2", func() error {
 	// This checks BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG. Since
 	// BPF_MAP_FREEZE appeared in 5.2 as well we don't do a separate check.
 	m, err := bpfMapCreate(&bpfMapCreateAttr{
@@ -224,10 +243,10 @@ var haveMapMutabilityModifiers = internal.FeatureTest("read- and write-only maps
 		flags:      unix.BPF_F_RDONLY_PROG,
 	})
 	if err != nil {
-		return false, nil
+		return internal.ErrNotSupported
 	}
 	_ = m.Close()
-	return true, nil
+	return nil
 })
 
 func bpfMapLookupElem(m *internal.FD, key, valueOut internal.Pointer) error {
@@ -313,6 +332,29 @@ func objGetNextID(cmd internal.BPFCmd, start uint32) (uint32, error) {
 	return attr.nextID, wrapObjError(err)
 }
 
+func bpfMapBatch(cmd internal.BPFCmd, m *internal.FD, inBatch, outBatch, keys, values internal.Pointer, count uint32, opts *BatchOptions) (uint32, error) {
+	fd, err := m.Value()
+	if err != nil {
+		return 0, err
+	}
+
+	attr := bpfBatchMapOpAttr{
+		inBatch:  inBatch,
+		outBatch: outBatch,
+		keys:     keys,
+		values:   values,
+		count:    count,
+		mapFd:    fd,
+	}
+	if opts != nil {
+		attr.elemFlags = opts.ElemFlags
+		attr.flags = opts.Flags
+	}
+	_, err = internal.BPF(cmd, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
+	// Always return count, even on error, since operations like update
+	// may be partially fulfilled.
+	return attr.count, wrapMapError(err)
+}
+
 func wrapObjError(err error) error {
 	if err == nil {
 		return nil
@@ -337,7 +379,11 @@ func wrapMapError(err error) error {
 		return ErrKeyExist
 	}
 
-	return errors.New(err.Error())
+	if errors.Is(err, unix.ENOTSUPP) {
+		return ErrNotSupported
+	}
+
+	return err
 }
 
 func bpfMapFreeze(m *internal.FD) error {
@@ -353,28 +399,9 @@ func bpfMapFreeze(m *internal.FD) error {
 	return err
 }
 
-func bpfGetObjectInfoByFD(fd *internal.FD, info unsafe.Pointer, size uintptr) error {
-	value, err := fd.Value()
-	if err != nil {
-		return err
-	}
-
-	// available from 4.13
-	attr := bpfObjGetInfoByFDAttr{
-		fd:      value,
-		infoLen: uint32(size),
-		info:    internal.NewPointer(info),
-	}
-	_, err = internal.BPF(internal.BPF_OBJ_GET_INFO_BY_FD, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
-	if err != nil {
-		return fmt.Errorf("fd %d: %w", fd, err)
-	}
-	return nil
-}
-
 func bpfGetProgInfoByFD(fd *internal.FD) (*bpfProgInfo, error) {
 	var info bpfProgInfo
-	if err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)); err != nil {
+	if err := internal.BPFObjGetInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)); err != nil {
 		return nil, fmt.Errorf("can't get program info: %w", err)
 	}
 	return &info, nil
@@ -382,14 +409,14 @@ func bpfGetProgInfoByFD(fd *internal.FD) (*bpfProgInfo, error) {
 
 func bpfGetMapInfoByFD(fd *internal.FD) (*bpfMapInfo, error) {
 	var info bpfMapInfo
-	err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info))
+	err := internal.BPFObjGetInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info))
 	if err != nil {
 		return nil, fmt.Errorf("can't get map info: %w", err)
 	}
 	return &info, nil
 }
 
-var haveObjName = internal.FeatureTest("object names", "4.15", func() (bool, error) {
+var haveObjName = internal.FeatureTest("object names", "4.15", func() error {
 	attr := bpfMapCreateAttr{
 		mapType:    Array,
 		keySize:    4,
@@ -400,16 +427,16 @@ var haveObjName = internal.FeatureTest("object names", "4.15", func() (bool, err
 
 	fd, err := bpfMapCreate(&attr)
 	if err != nil {
-		return false, nil
+		return internal.ErrNotSupported
 	}
 
 	_ = fd.Close()
-	return true, nil
+	return nil
 })
 
-var objNameAllowsDot = internal.FeatureTest("dot in object names", "5.2", func() (bool, error) {
+var objNameAllowsDot = internal.FeatureTest("dot in object names", "5.2", func() error {
 	if err := haveObjName(); err != nil {
-		return false, err
+		return err
 	}
 
 	attr := bpfMapCreateAttr{
@@ -422,11 +449,37 @@ var objNameAllowsDot = internal.FeatureTest("dot in object names", "5.2", func()
 
 	fd, err := bpfMapCreate(&attr)
 	if err != nil {
-		return false, nil
+		return internal.ErrNotSupported
 	}
 
 	_ = fd.Close()
-	return true, nil
+	return nil
+})
+
+var haveBatchAPI = internal.FeatureTest("map batch api", "5.6", func() error {
+	var maxEntries uint32 = 2
+	attr := bpfMapCreateAttr{
+		mapType:    Hash,
+		keySize:    4,
+		valueSize:  4,
+		maxEntries: maxEntries,
+	}
+
+	fd, err := bpfMapCreate(&attr)
+	if err != nil {
+		return internal.ErrNotSupported
+	}
+	defer fd.Close()
+	keys := []uint32{1, 2}
+	values := []uint32{3, 4}
+	kp, _ := marshalPtr(keys, 8)
+	vp, _ := marshalPtr(values, 8)
+	nilPtr := internal.NewPointer(nil)
+	_, err = bpfMapBatch(internal.BPF_MAP_UPDATE_BATCH, fd, nilPtr, nilPtr, kp, vp, maxEntries, nil)
+	if err != nil {
+		return internal.ErrNotSupported
+	}
+	return nil
 })
 
 func bpfObjGetFDByID(cmd internal.BPFCmd, id uint32) (*internal.FD, error) {

+ 81 - 33
vendor/github.com/cilium/ebpf/types.go

@@ -1,6 +1,10 @@
 package ebpf
 
-//go:generate stringer -output types_string.go -type=MapType,ProgramType,AttachType
+import (
+	"github.com/cilium/ebpf/internal/unix"
+)
+
+//go:generate stringer -output types_string.go -type=MapType,ProgramType,AttachType,PinType
 
 // MapType indicates the type map structure
 // that will be initialized in the kernel.
@@ -85,10 +89,19 @@ const (
 
 // hasPerCPUValue returns true if the Map stores a value per CPU.
 func (mt MapType) hasPerCPUValue() bool {
-	if mt == PerCPUHash || mt == PerCPUArray || mt == LRUCPUHash {
-		return true
-	}
-	return false
+	return mt == PerCPUHash || mt == PerCPUArray || mt == LRUCPUHash
+}
+
+// canStoreMap returns true if the map type accepts a map fd
+// for update and returns a map id for lookup.
+func (mt MapType) canStoreMap() bool {
+	return mt == ArrayOfMaps || mt == HashOfMaps
+}
+
+// canStoreProgram returns true if the map type accepts a program fd
+// for update and returns a program id for lookup.
+func (mt MapType) canStoreProgram() bool {
+	return mt == ProgramArray
 }
 
 // ProgramType of the eBPF program
@@ -96,60 +109,37 @@ type ProgramType uint32
 
 // eBPF program types
 const (
-	// Unrecognized program type
 	UnspecifiedProgram ProgramType = iota
-	// SocketFilter socket or seccomp filter
 	SocketFilter
-	// Kprobe program
 	Kprobe
-	// SchedCLS traffic control shaper
 	SchedCLS
-	// SchedACT routing control shaper
 	SchedACT
-	// TracePoint program
 	TracePoint
-	// XDP program
 	XDP
-	// PerfEvent program
 	PerfEvent
-	// CGroupSKB program
 	CGroupSKB
-	// CGroupSock program
 	CGroupSock
-	// LWTIn program
 	LWTIn
-	// LWTOut program
 	LWTOut
-	// LWTXmit program
 	LWTXmit
-	// SockOps program
 	SockOps
-	// SkSKB program
 	SkSKB
-	// CGroupDevice program
 	CGroupDevice
-	// SkMsg program
 	SkMsg
-	// RawTracepoint program
 	RawTracepoint
-	// CGroupSockAddr program
 	CGroupSockAddr
-	// LWTSeg6Local program
 	LWTSeg6Local
-	// LircMode2 program
 	LircMode2
-	// SkReuseport program
 	SkReuseport
-	// FlowDissector program
 	FlowDissector
-	// CGroupSysctl program
 	CGroupSysctl
-	// RawTracepointWritable program
 	RawTracepointWritable
-	// CGroupSockopt program
 	CGroupSockopt
-	// Tracing program
 	Tracing
+	StructOps
+	Extension
+	LSM
+	SkLookup
 )
 
 // AttachType of the eBPF program, needed to differentiate allowed context accesses in
@@ -157,7 +147,7 @@ const (
 // Will cause invalid argument (EINVAL) at program load time if set incorrectly.
 type AttachType uint32
 
-// AttachNone is an alias for AttachCGroupInetIngress for readability reasons
+// AttachNone is an alias for AttachCGroupInetIngress for readability reasons.
 const AttachNone AttachType = 0
 
 const (
@@ -190,7 +180,65 @@ const (
 	AttachModifyReturn
 	AttachLSMMac
 	AttachTraceIter
+	AttachCgroupInet4GetPeername
+	AttachCgroupInet6GetPeername
+	AttachCgroupInet4GetSockname
+	AttachCgroupInet6GetSockname
+	AttachXDPDevMap
+	AttachCgroupInetSockRelease
+	AttachXDPCPUMap
+	AttachSkLookup
+	AttachXDP
 )
 
 // AttachFlags of the eBPF program used in BPF_PROG_ATTACH command
 type AttachFlags uint32
+
+// PinType determines whether a map is pinned into a BPFFS.
+type PinType int
+
+// Valid pin types.
+//
+// Mirrors enum libbpf_pin_type.
+const (
+	PinNone PinType = iota
+	// Pin an object by using its name as the filename.
+	PinByName
+)
+
+// LoadPinOptions control how a pinned object is loaded.
+type LoadPinOptions struct {
+	// Request a read-only or write-only object. The default is a read-write
+	// object. Only one of the flags may be set.
+	ReadOnly  bool
+	WriteOnly bool
+
+	// Raw flags for the syscall. Other fields of this struct take precedence.
+	Flags uint32
+}
+
+// Marshal returns a value suitable for BPF_OBJ_GET syscall file_flags parameter.
+func (lpo *LoadPinOptions) Marshal() uint32 {
+	if lpo == nil {
+		return 0
+	}
+
+	flags := lpo.Flags
+	if lpo.ReadOnly {
+		flags |= unix.BPF_F_RDONLY
+	}
+	if lpo.WriteOnly {
+		flags |= unix.BPF_F_WRONLY
+	}
+	return flags
+}
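
A short sketch of how these options reach BPF_OBJ_GET; the pin path is illustrative, and LoadPinnedProgram is the two-argument variant introduced in prog.go above:

```go
opts := &ebpf.LoadPinOptions{ReadOnly: true}
// Marshal folds ReadOnly into the syscall's file_flags (BPF_F_RDONLY).
prog, err := ebpf.LoadPinnedProgram("/sys/fs/bpf/my_prog", opts)
if err != nil {
	// handle error
}
defer prog.Close()
```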
+
+// BatchOptions control batch map operations.
+//
+// Mirrors libbpf struct bpf_map_batch_opts.
+// Currently BPF_F_LOCK is the only supported
+// flag (for ElemFlags).
+type BatchOptions struct {
+	ElemFlags uint64
+	Flags     uint64
+}

+ 36 - 5
vendor/github.com/cilium/ebpf/types_string.go

@@ -1,4 +1,4 @@
-// Code generated by "stringer -output types_string.go -type=MapType,ProgramType,AttachType"; DO NOT EDIT.
+// Code generated by "stringer -output types_string.go -type=MapType,ProgramType,AttachType,PinType"; DO NOT EDIT.
 
 package ebpf
 
@@ -77,11 +77,15 @@ func _() {
 	_ = x[RawTracepointWritable-24]
 	_ = x[CGroupSockopt-25]
 	_ = x[Tracing-26]
+	_ = x[StructOps-27]
+	_ = x[Extension-28]
+	_ = x[LSM-29]
+	_ = x[SkLookup-30]
 }
 
-const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockoptTracing"
+const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockoptTracingStructOpsExtensionLSMSkLookup"
 
-var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 188, 199, 212, 224, 245, 258, 265}
+var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 188, 199, 212, 224, 245, 258, 265, 274, 283, 286, 294}
 
 func (i ProgramType) String() string {
 	if i >= ProgramType(len(_ProgramType_index)-1) {
@@ -123,11 +127,20 @@ func _() {
 	_ = x[AttachModifyReturn-26]
 	_ = x[AttachLSMMac-27]
 	_ = x[AttachTraceIter-28]
+	_ = x[AttachCgroupInet4GetPeername-29]
+	_ = x[AttachCgroupInet6GetPeername-30]
+	_ = x[AttachCgroupInet4GetSockname-31]
+	_ = x[AttachCgroupInet6GetSockname-32]
+	_ = x[AttachXDPDevMap-33]
+	_ = x[AttachCgroupInetSockRelease-34]
+	_ = x[AttachXDPCPUMap-35]
+	_ = x[AttachSkLookup-36]
+	_ = x[AttachXDP-37]
 }
 
-const _AttachType_name = "AttachNoneAttachCGroupInetEgressAttachCGroupInetSockCreateAttachCGroupSockOpsAttachSkSKBStreamParserAttachSkSKBStreamVerdictAttachCGroupDeviceAttachSkMsgVerdictAttachCGroupInet4BindAttachCGroupInet6BindAttachCGroupInet4ConnectAttachCGroupInet6ConnectAttachCGroupInet4PostBindAttachCGroupInet6PostBindAttachCGroupUDP4SendmsgAttachCGroupUDP6SendmsgAttachLircMode2AttachFlowDissectorAttachCGroupSysctlAttachCGroupUDP4RecvmsgAttachCGroupUDP6RecvmsgAttachCGroupGetsockoptAttachCGroupSetsockoptAttachTraceRawTpAttachTraceFEntryAttachTraceFExitAttachModifyReturnAttachLSMMacAttachTraceIter"
+const _AttachType_name = "AttachNoneAttachCGroupInetEgressAttachCGroupInetSockCreateAttachCGroupSockOpsAttachSkSKBStreamParserAttachSkSKBStreamVerdictAttachCGroupDeviceAttachSkMsgVerdictAttachCGroupInet4BindAttachCGroupInet6BindAttachCGroupInet4ConnectAttachCGroupInet6ConnectAttachCGroupInet4PostBindAttachCGroupInet6PostBindAttachCGroupUDP4SendmsgAttachCGroupUDP6SendmsgAttachLircMode2AttachFlowDissectorAttachCGroupSysctlAttachCGroupUDP4RecvmsgAttachCGroupUDP6RecvmsgAttachCGroupGetsockoptAttachCGroupSetsockoptAttachTraceRawTpAttachTraceFEntryAttachTraceFExitAttachModifyReturnAttachLSMMacAttachTraceIterAttachCgroupInet4GetPeernameAttachCgroupInet6GetPeernameAttachCgroupInet4GetSocknameAttachCgroupInet6GetSocknameAttachXDPDevMapAttachCgroupInetSockReleaseAttachXDPCPUMapAttachSkLookupAttachXDP"
 
-var _AttachType_index = [...]uint16{0, 10, 32, 58, 77, 100, 124, 142, 160, 181, 202, 226, 250, 275, 300, 323, 346, 361, 380, 398, 421, 444, 466, 488, 504, 521, 537, 555, 567, 582}
+var _AttachType_index = [...]uint16{0, 10, 32, 58, 77, 100, 124, 142, 160, 181, 202, 226, 250, 275, 300, 323, 346, 361, 380, 398, 421, 444, 466, 488, 504, 521, 537, 555, 567, 582, 610, 638, 666, 694, 709, 736, 751, 765, 774}
 
 func (i AttachType) String() string {
 	if i >= AttachType(len(_AttachType_index)-1) {
@@ -135,3 +148,21 @@ func (i AttachType) String() string {
 	}
 	return _AttachType_name[_AttachType_index[i]:_AttachType_index[i+1]]
 }
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[PinNone-0]
+	_ = x[PinByName-1]
+}
+
+const _PinType_name = "PinNonePinByName"
+
+var _PinType_index = [...]uint8{0, 7, 16}
+
+func (i PinType) String() string {
+	if i < 0 || i >= PinType(len(_PinType_index)-1) {
+		return "PinType(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+	return _PinType_name[_PinType_index[i]:_PinType_index[i+1]]
+}

+ 25 - 13
vendor/github.com/opencontainers/runc/README.md

@@ -1,9 +1,10 @@
 # runc
 
-[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc)
 [![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
 [![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
 [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
+[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
+[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)
 
 ## Introduction
 
@@ -17,10 +18,6 @@ This means that `runc` 1.0.0 should implement the 1.0 version of the specificati
 
 You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
 
-Currently, the following features are not considered to be production-ready:
-
-* [Support for cgroup v2](./docs/cgroup-v2.md)
-
 ## Security
 
 The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
@@ -64,19 +61,20 @@ sudo make install
 with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`).
 
 To change build tags from the default, set the `BUILDTAGS` variable for make,
-e.g.
+e.g. to disable seccomp:
 
 ```bash
-make BUILDTAGS='seccomp apparmor'
+make BUILDTAGS=""
 ```
 
 | Build Tag | Feature                            | Enabled by default | Dependency |
 |-----------|------------------------------------|--------------------|------------|
 | seccomp   | Syscall filtering                  | yes                | libseccomp |
-| selinux   | selinux process and mount labeling | yes                | <none>     |
-| apparmor  | apparmor profile support           | yes                | <none>     |
-| nokmem    | disable kernel memory accounting   | no                 | <none>     |
 
+The following build tags were used earlier, but are now obsolete:
+ - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
+ - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
+ - **selinux**  (since runc v1.0.0-rc93 the feature is always enabled)
 
 ### Running the test suite
 
@@ -128,6 +126,14 @@ make verify-dependencies
 
 ## Using runc
 
+Please note that runc is a low-level tool, not designed with an end user
+in mind. It is mostly employed by other higher-level container software.
+
+Therefore, unless there is some specific use case that prevents the use
+of tools like Docker or Podman, it is not recommended to use runc directly.
+
+If you still want to use runc, here's how.
+
 ### Creating an OCI Bundle
 
 In order to use runc you must have your container in the format of an OCI bundle.
@@ -169,7 +175,9 @@ If you used the unmodified `runc spec` template this should give you a `sh` sess
 
 The second way to start a container is using the specs lifecycle operations.
 This gives you more power over how the container is created and managed while it is running.
-This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here.
+This will also launch the container in the background so you will have to edit
+the `config.json` to remove the `terminal` setting for the simple examples
+below (see more details about [runc terminal handling](docs/terminals.md)).
 Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`.
 
 
@@ -292,8 +300,12 @@ PIDFile=/run/mycontainerid.pid
 WantedBy=multi-user.target
 ```
 
-#### cgroup v2
-See [`./docs/cgroup-v2.md`](./docs/cgroup-v2.md).
+## More documentation
+
+* [cgroup v2](./docs/cgroup-v2.md)
+* [Checkpoint and restore](./docs/checkpoint-restore.md)
+* [systemd cgroup driver](./docs/systemd.md)
+* [Terminals and standard IO](./docs/terminals.md)
 
 ## License
 

+ 16 - 14
vendor/github.com/opencontainers/runc/go.mod

@@ -1,26 +1,28 @@
 module github.com/opencontainers/runc
 
-go 1.14
+go 1.13
 
 require (
-	github.com/checkpoint-restore/go-criu/v4 v4.1.0
-	github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775
-	github.com/containerd/console v1.0.0
-	github.com/coreos/go-systemd/v22 v22.1.0
+	github.com/checkpoint-restore/go-criu/v5 v5.0.0
+	github.com/cilium/ebpf v0.5.0
+	github.com/containerd/console v1.0.2
+	github.com/coreos/go-systemd/v22 v22.3.1
 	github.com/cyphar/filepath-securejoin v0.2.2
 	github.com/docker/go-units v0.4.0
-	github.com/godbus/dbus/v5 v5.0.3
-	github.com/golang/protobuf v1.4.2
-	github.com/moby/sys/mountinfo v0.1.3
-	github.com/mrunalp/fileutils v0.0.0-20200520151820-abd8a0e76976
-	github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6
-	github.com/opencontainers/selinux v1.6.0
+	github.com/godbus/dbus/v5 v5.0.4
+	github.com/moby/sys/mountinfo v0.4.1
+	github.com/mrunalp/fileutils v0.5.0
+	github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
+	github.com/opencontainers/selinux v1.8.0
 	github.com/pkg/errors v0.9.1
 	github.com/seccomp/libseccomp-golang v0.9.1
-	github.com/sirupsen/logrus v1.6.0
-	github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+	github.com/sirupsen/logrus v1.7.0
+	github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
 	// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
 	github.com/urfave/cli v1.22.1
 	github.com/vishvananda/netlink v1.1.0
-	golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1
+	github.com/willf/bitset v1.1.11
+	golang.org/x/net v0.0.0-20201224014010-6772e930b67b
+	golang.org/x/sys v0.0.0-20210426230700-d19ff857e887
+	google.golang.org/protobuf v1.25.0
 )

+ 87 - 83
vendor/github.com/opencontainers/runc/libcontainer/README.md

@@ -57,90 +57,94 @@ struct describing how the container is to be created. A sample would look simila
 
 ```go
 defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+var devices []*configs.DeviceRule
+for _, device := range specconv.AllowedDevices {
+	devices = append(devices, &device.Rule)
+}
 config := &configs.Config{
 	Rootfs: "/your/path/to/rootfs",
 	Capabilities: &configs.Capabilities{
-                Bounding: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Effective: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Inheritable: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Permitted: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Ambient: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-        },
+		Bounding: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Effective: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Inheritable: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Permitted: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Ambient: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+	},
 	Namespaces: configs.Namespaces([]configs.Namespace{
 		{Type: configs.NEWNS},
 		{Type: configs.NEWUTS},
@@ -155,7 +159,7 @@ config := &configs.Config{
 		Parent: "system",
 		Resources: &configs.Resources{
 			MemorySwappiness: nil,
-			Devices:          specconv.AllowedDevices,
+			Devices:          devices,
 		},
 	},
 	MaskPaths: []string{
@@ -313,7 +317,7 @@ state, err := container.State()
 #### Checkpoint & Restore
 
 libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
-This let's you save the state of a process running inside a container to disk, and then restore
+This lets you save the state of a process running inside a container to disk, and then restore
 that state into a new process, on the same machine or on another machine.
 
 `criu` version 1.5.2 or higher is required to use checkpoint and restore.

+ 23 - 13
vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go

@@ -7,37 +7,44 @@ import (
 )
 
 type Manager interface {
-	// Applies cgroup configuration to the process with the specified pid
+	// Apply creates a cgroup, if not yet created, and adds a process
+	// with the specified pid into that cgroup.  A special value of -1
+	// can be used to merely create a cgroup.
 	Apply(pid int) error
 
-	// Returns the PIDs inside the cgroup set
+	// GetPids returns the PIDs of all processes inside the cgroup.
 	GetPids() ([]int, error)
 
-	// Returns the PIDs inside the cgroup set & all sub-cgroups
+	// GetAllPids returns the PIDs of all processes inside the cgroup
+	// and all its sub-cgroups.
 	GetAllPids() ([]int, error)
 
-	// Returns statistics for the cgroup set
+	// GetStats returns cgroup statistics.
 	GetStats() (*Stats, error)
 
-	// Toggles the freezer cgroup according with specified state
+	// Freeze sets the freezer cgroup to the specified state.
 	Freeze(state configs.FreezerState) error
 
-	// Destroys the cgroup set
+	// Destroy removes the cgroup.
 	Destroy() error
 
 	// Path returns a cgroup path to the specified controller/subsystem.
 	// For cgroupv2, the argument is unused and can be empty.
 	Path(string) string
 
-	// Sets the cgroup as configured.
-	Set(container *configs.Config) error
+	// Set sets cgroup resource parameters/limits. If the argument is nil,
+	// the resources specified during Manager creation (or the previous call
+	// to Set) are used.
+	Set(r *configs.Resources) error
 
-	// GetPaths returns cgroup path(s) to save in a state file in order to restore later.
+	// GetPaths returns cgroup path(s) to save in a state file in order to
+	// restore later.
 	//
-	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
-	// to the cgroup for this subsystem.
+	// For cgroup v1, a key is cgroup subsystem name, and the value is the
+	// path to the cgroup for this subsystem.
 	//
-	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
+	// For cgroup v2 unified hierarchy, a key is "", and the value is the
+	// unified path.
 	GetPaths() map[string]string
 
 	// GetCgroups returns the cgroup data as configured.
@@ -46,6 +53,9 @@ type Manager interface {
 	// GetFreezerState retrieves the current FreezerState of the cgroup.
 	GetFreezerState() (configs.FreezerState, error)
 
-	// Whether the cgroup path exists or not
+	// Exists returns whether the cgroup path exists or not.
 	Exists() bool
+
+	// OOMKillCount reports OOM kill count for the cgroup.
+	OOMKillCount() (uint64, error)
 }

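As an aside for readers of this interface: a minimal sketch of how a Manager is typically driven after this change (note that Set now takes *configs.Resources, not *configs.Config). The setupCgroup helper and its error handling are illustrative only, written against the interface rather than any concrete manager implementation:

    package cgroupdemo

    import (
        "fmt"

        "github.com/opencontainers/runc/libcontainer/cgroups"
        "github.com/opencontainers/runc/libcontainer/configs"
    )

    // setupCgroup walks an already-constructed Manager through the
    // lifecycle implied by the interface: create/join, set limits,
    // read stats, tear down.
    func setupCgroup(m cgroups.Manager, pid int, r *configs.Resources) error {
        if err := m.Apply(pid); err != nil { // create cgroup (if needed) and add pid
            return err
        }
        if err := m.Set(r); err != nil { // nil would reuse creation-time resources
            return err
        }
        stats, err := m.GetStats()
        if err != nil {
            return err
        }
        fmt.Printf("memory usage: %d bytes\n", stats.MemoryStats.Usage.Usage)
        return m.Destroy()
    }
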
+ 51 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go

@@ -0,0 +1,51 @@
+// +build linux
+
+package fscommon
+
+import (
+	"bytes"
+	"os"
+
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+// WriteFile writes data to a cgroup file in dir.
+// It is supposed to be used for cgroup files only.
+func WriteFile(dir, file, data string) error {
+	fd, err := OpenFile(dir, file, unix.O_WRONLY)
+	if err != nil {
+		return err
+	}
+	defer fd.Close()
+	if err := retryingWriteFile(fd, data); err != nil {
+		return errors.Wrapf(err, "failed to write %q", data)
+	}
+	return nil
+}
+
+// ReadFile reads data from a cgroup file in dir.
+// It is supposed to be used for cgroup files only.
+func ReadFile(dir, file string) (string, error) {
+	fd, err := OpenFile(dir, file, unix.O_RDONLY)
+	if err != nil {
+		return "", err
+	}
+	defer fd.Close()
+	var buf bytes.Buffer
+
+	_, err = buf.ReadFrom(fd)
+	return buf.String(), err
+}
+
+func retryingWriteFile(fd *os.File, data string) error {
+	for {
+		_, err := fd.Write([]byte(data))
+		if errors.Is(err, unix.EINTR) {
+			logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
+			continue
+		}
+		return err
+	}
+}

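A short, hypothetical usage sketch for these helpers (the cgroup directory below is made up). The retryingWriteFile loop exists because writes to cgroup files can legitimately fail with EINTR, for example when writing to cgroup.procs of a frozen cgroup:

    package fscommondemo

    import (
        "fmt"

        "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
    )

    func example() error {
        const dir = "/sys/fs/cgroup/memory/mycontainer" // hypothetical cgroup dir
        // Writes go through OpenFile, so the target is verified to be on cgroupfs.
        if err := fscommon.WriteFile(dir, "memory.limit_in_bytes", "134217728"); err != nil {
            return err
        }
        val, err := fscommon.ReadFile(dir, "memory.limit_in_bytes")
        if err != nil {
            return err
        }
        fmt.Println("limit:", val)
        return nil
    }
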
+ 120 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go

@@ -0,0 +1,120 @@
+package fscommon
+
+import (
+	"os"
+	"strings"
+	"sync"
+
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+const (
+	cgroupfsDir    = "/sys/fs/cgroup"
+	cgroupfsPrefix = cgroupfsDir + "/"
+)
+
+var (
+	// TestMode is set to true by unit tests that need "fake" cgroupfs.
+	TestMode bool
+
+	cgroupFd     int = -1
+	prepOnce     sync.Once
+	prepErr      error
+	resolveFlags uint64
+)
+
+func prepareOpenat2() error {
+	prepOnce.Do(func() {
+		fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
+			Flags: unix.O_DIRECTORY | unix.O_PATH})
+		if err != nil {
+			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
+			if err != unix.ENOSYS {
+				logrus.Warnf("falling back to securejoin: %s", prepErr)
+			} else {
+				logrus.Debug("openat2 not available, falling back to securejoin")
+			}
+			return
+		}
+		var st unix.Statfs_t
+		if err = unix.Fstatfs(fd, &st); err != nil {
+			prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
+			logrus.Warnf("falling back to securejoin: %s", prepErr)
+			return
+		}
+
+		cgroupFd = fd
+
+		resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
+		if st.Type == unix.CGROUP2_SUPER_MAGIC {
+			// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
+			resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
+		}
+
+	})
+
+	return prepErr
+}
+
+// OpenFile opens a cgroup file in a given dir with given flags.
+// It is supposed to be used for cgroup files only.
+func OpenFile(dir, file string, flags int) (*os.File, error) {
+	if dir == "" {
+		return nil, errors.Errorf("no directory specified for %s", file)
+	}
+	mode := os.FileMode(0)
+	if TestMode && flags&os.O_WRONLY != 0 {
+		// "emulate" cgroup fs for unit tests
+		flags |= os.O_TRUNC | os.O_CREATE
+		mode = 0o600
+	}
+	if prepareOpenat2() != nil {
+		return openFallback(dir, file, flags, mode)
+	}
+	reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
+	if len(reldir) == len(dir) { // non-standard path, old system?
+		return openFallback(dir, file, flags, mode)
+	}
+
+	relname := reldir + "/" + file
+	fd, err := unix.Openat2(cgroupFd, relname,
+		&unix.OpenHow{
+			Resolve: resolveFlags,
+			Flags:   uint64(flags) | unix.O_CLOEXEC,
+			Mode:    uint64(mode),
+		})
+	if err != nil {
+		return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err}
+	}
+
+	return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
+}
+
+var errNotCgroupfs = errors.New("not a cgroup file")
+
+// openFallback is used when openat2(2) is not available. It checks that the
+// opened file is on cgroupfs, returning an error otherwise.
+func openFallback(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
+	path := dir + "/" + file
+	fd, err := os.OpenFile(path, flags, mode)
+	if err != nil {
+		return nil, err
+	}
+	if TestMode {
+		return fd, nil
+	}
+	// Check this is a cgroupfs file.
+	var st unix.Statfs_t
+	if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
+		_ = fd.Close()
+		return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
+	}
+	if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
+		_ = fd.Close()
+		return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
+	}
+
+	return fd, nil
+}

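The design choice here is defense in depth: with openat2(2) (Linux 5.6+), RESOLVE_BENEATH guarantees the resolved path cannot escape /sys/fs/cgroup via symlinks or "..", while the fallback verifies after the fact that the opened file really lives on cgroupfs. Calling code does not need to care which path was taken; a hypothetical caller:

    package fscommondemo

    import (
        "os"

        "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
    )

    // readProcs opens cgroup.procs via the openat2-based helper; on kernels
    // without openat2(2), OpenFile transparently falls back to a plain open
    // followed by an fstatfs check that the file is on cgroupfs.
    func readProcs(dir string) (*os.File, error) {
        return fscommon.OpenFile(dir, "cgroup.procs", os.O_RDONLY)
    }
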
+ 122 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go

@@ -0,0 +1,122 @@
+// +build linux
+
+package fscommon
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"strconv"
+	"strings"
+)
+
+var (
+	ErrNotValidFormat = errors.New("line is not a valid key value format")
+)
+
+// ParseUint converts a string to a uint64 integer.
+// Negative values are returned as zero because, due to kernel bugs,
+// some of the memory cgroup stats can be negative.
+func ParseUint(s string, base, bitSize int) (uint64, error) {
+	value, err := strconv.ParseUint(s, base, bitSize)
+	if err != nil {
+		intValue, intErr := strconv.ParseInt(s, base, bitSize)
+		// 1. Handle negative values no smaller than MinInt64, and
+		// 2. Handle negative values smaller than MinInt64.
+		if intErr == nil && intValue < 0 {
+			return 0, nil
+		} else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
+			return 0, nil
+		}
+
+		return value, err
+	}
+
+	return value, nil
+}
+
+// ParseKeyValue parses a space-separated "name value" kind of cgroup
+// parameter and returns its key as a string, and its value as uint64
+// (ParseUint is used to convert the value). For example,
+// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
+func ParseKeyValue(t string) (string, uint64, error) {
+	parts := strings.SplitN(t, " ", 3)
+	if len(parts) != 2 {
+		return "", 0, fmt.Errorf("line %q is not in key value format", t)
+	}
+
+	value, err := ParseUint(parts[1], 10, 64)
+	if err != nil {
+		return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
+	}
+
+	return parts[0], value, nil
+}
+
+// GetValueByKey reads key-value pairs from the specified cgroup file,
+// and returns the value of the specified key. ParseUint is used for value
+// conversion.
+func GetValueByKey(path, file, key string) (uint64, error) {
+	content, err := ReadFile(path, file)
+	if err != nil {
+		return 0, err
+	}
+
+	lines := strings.Split(string(content), "\n")
+	for _, line := range lines {
+		arr := strings.Split(line, " ")
+		if len(arr) == 2 && arr[0] == key {
+			return ParseUint(arr[1], 10, 64)
+		}
+	}
+
+	return 0, nil
+}
+
+// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.
+// If the value read is "max", math.MaxUint64 is returned.
+func GetCgroupParamUint(path, file string) (uint64, error) {
+	contents, err := GetCgroupParamString(path, file)
+	if err != nil {
+		return 0, err
+	}
+	contents = strings.TrimSpace(contents)
+	if contents == "max" {
+		return math.MaxUint64, nil
+	}
+
+	res, err := ParseUint(contents, 10, 64)
+	if err != nil {
+		return res, fmt.Errorf("unable to parse file %q", path+"/"+file)
+	}
+	return res, nil
+}
+
+// GetCgroupParamInt reads a single int64 value from the specified cgroup file.
+// If the value read is "max", math.MaxInt64 is returned.
+func GetCgroupParamInt(path, file string) (int64, error) {
+	contents, err := ReadFile(path, file)
+	if err != nil {
+		return 0, err
+	}
+	contents = strings.TrimSpace(contents)
+	if contents == "max" {
+		return math.MaxInt64, nil
+	}
+
+	res, err := strconv.ParseInt(contents, 10, 64)
+	if err != nil {
+		return res, fmt.Errorf("unable to parse %q as an int from cgroup file %q", contents, path+"/"+file)
+	}
+	return res, nil
+}
+
+// GetCgroupParamString reads a string from the specified cgroup file.
+func GetCgroupParamString(path, file string) (string, error) {
+	contents, err := ReadFile(path, file)
+	if err != nil {
+		return "", err
+	}
+
+	return strings.TrimSpace(contents), nil
+}

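Two concrete datapoints for the parsers above, assuming behavior exactly as documented (the clamping of negatives is deliberate, to paper over kernel stats bugs):

    package fscommondemo

    import (
        "fmt"

        "github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
    )

    func parseExamples() {
        // Negative values are clamped to zero rather than rejected.
        v, _ := fscommon.ParseUint("-1", 10, 64)
        fmt.Println(v) // 0

        // "name value" lines split into a string key and a uint64 value.
        key, val, _ := fscommon.ParseKeyValue("io_service_bytes 1234")
        fmt.Println(key, val) // io_service_bytes 1234
    }
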
+ 28 - 0
vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go

@@ -39,6 +39,33 @@ type CpuStats struct {
 	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
 }
 
+type CPUSetStats struct {
+	// List of the physical numbers of the CPUs on which processes
+	// in that cpuset are allowed to execute
+	CPUs []uint16 `json:"cpus,omitempty"`
+	// cpu_exclusive flag
+	CPUExclusive uint64 `json:"cpu_exclusive"`
+	// List of memory nodes on which processes in that cpuset
+	// are allowed to allocate memory
+	Mems []uint16 `json:"mems,omitempty"`
+	// mem_hardwall flag
+	MemHardwall uint64 `json:"mem_hardwall"`
+	// mem_exclusive flag
+	MemExclusive uint64 `json:"mem_exclusive"`
+	// memory_migrate flag
+	MemoryMigrate uint64 `json:"memory_migrate"`
+	// memory_spread page flag
+	MemorySpreadPage uint64 `json:"memory_spread_page"`
+	// memory_spread slab flag
+	MemorySpreadSlab uint64 `json:"memory_spread_slab"`
+	// memory_pressure
+	MemoryPressure uint64 `json:"memory_pressure"`
+	// sched_load balance flag
+	SchedLoadBalance uint64 `json:"sched_load_balance"`
+	// sched_relax_domain_level
+	SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
+}
+
 type MemoryData struct {
 	Usage    uint64 `json:"usage,omitempty"`
 	MaxUsage uint64 `json:"max_usage,omitempty"`
@@ -121,6 +148,7 @@ type HugetlbStats struct {
 
 type Stats struct {
 	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
+	CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
 	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
 	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
 	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`

+ 115 - 42
vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go

@@ -15,7 +15,9 @@ import (
 	"sync"
 	"time"
 
-	units "github.com/docker/go-units"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/userns"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -29,19 +31,19 @@ var (
 	isUnified     bool
 )
 
-// HugePageSizeUnitList is a list of the units used by the linux kernel when
-// naming the HugePage control files.
-// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
-// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed,
-// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
-var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
-
 // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
 func IsCgroup2UnifiedMode() bool {
 	isUnifiedOnce.Do(func() {
 		var st unix.Statfs_t
-		if err := unix.Statfs(unifiedMountpoint, &st); err != nil {
-			panic("cannot statfs cgroup root")
+		err := unix.Statfs(unifiedMountpoint, &st)
+		if err != nil {
+			if os.IsNotExist(err) && userns.RunningInUserNS() {
+				// ignore the "not found" error if running in userns
+				logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
+				isUnified = false
+				return
+			}
+			panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
 		}
 		isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
 	})
@@ -86,11 +88,11 @@ func GetAllSubsystems() ([]string, error) {
 		// - freezer: implemented in kernel 5.2
 		// We assume these are always available, as it is hard to detect availability.
 		pseudo := []string{"devices", "freezer"}
-		data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
+		data, err := fscommon.ReadFile("/sys/fs/cgroup", "cgroup.controllers")
 		if err != nil {
 			return nil, err
 		}
-		subsystems := append(pseudo, strings.Fields(string(data))...)
+		subsystems := append(pseudo, strings.Fields(data)...)
 		return subsystems, nil
 	}
 	f, err := os.Open("/proc/cgroups")
@@ -207,20 +209,66 @@ func EnterPid(cgroupPaths map[string]string, pid int) error {
 	return nil
 }
 
+func rmdir(path string) error {
+	err := unix.Rmdir(path)
+	if err == nil || err == unix.ENOENT {
+		return nil
+	}
+	return &os.PathError{Op: "rmdir", Path: path, Err: err}
+}
+
+// RemovePath aims to remove the cgroup path. It does so recursively,
+// by removing any subdirectories (sub-cgroups) first.
+func RemovePath(path string) error {
+	// try the fast path first
+	if err := rmdir(path); err == nil {
+		return nil
+	}
+
+	infos, err := ioutil.ReadDir(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			err = nil
+		}
+		return err
+	}
+	for _, info := range infos {
+		if info.IsDir() {
+			// We should remove subcgroups dir first
+			if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
+				break
+			}
+		}
+	}
+	if err == nil {
+		err = rmdir(path)
+	}
+	return err
+}
+
 // RemovePaths iterates over the provided paths removing them.
 // We try to remove all paths five times, with an increasing delay between tries.
 // If some cgroups are still not removed after all retries, an appropriate
 // error is returned.
 func RemovePaths(paths map[string]string) (err error) {
+	const retries = 5
 	delay := 10 * time.Millisecond
-	for i := 0; i < 5; i++ {
+	for i := 0; i < retries; i++ {
 		if i != 0 {
 			time.Sleep(delay)
 			delay *= 2
 		}
 		for s, p := range paths {
-			os.RemoveAll(p)
-			// TODO: here probably should be logging
+			if err := RemovePath(p); err != nil {
+				// do not log intermediate iterations
+				switch i {
+				case 0:
+					logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
+				case retries - 1:
+					logrus.WithError(err).Error("Failed to remove cgroup")
+				}
+
+			}
 			_, err := os.Stat(p)
 			// We need this strange way of checking cgroups existence because
 			// RemoveAll almost always returns error, even on already removed
@@ -230,6 +278,8 @@ func RemovePaths(paths map[string]string) (err error) {
 			}
 		}
 		if len(paths) == 0 {
+			//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
+			paths = make(map[string]string)
 			return nil
 		}
 	}
@@ -237,27 +287,50 @@ func RemovePaths(paths map[string]string) (err error) {
 }
 
 func GetHugePageSize() ([]string, error) {
-	files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
+	dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
 	if err != nil {
-		return []string{}, err
+		return nil, err
 	}
-	var fileNames []string
-	for _, st := range files {
-		fileNames = append(fileNames, st.Name())
+	files, err := dir.Readdirnames(0)
+	dir.Close()
+	if err != nil {
+		return nil, err
 	}
-	return getHugePageSizeFromFilenames(fileNames)
+
+	return getHugePageSizeFromFilenames(files)
 }
 
 func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
-	var pageSizes []string
-	for _, fileName := range fileNames {
-		nameArray := strings.Split(fileName, "-")
-		pageSize, err := units.RAMInBytes(nameArray[1])
+	pageSizes := make([]string, 0, len(fileNames))
+
+	for _, file := range fileNames {
+		// example: hugepages-1048576kB
+		val := strings.TrimPrefix(file, "hugepages-")
+		if len(val) == len(file) {
+			// unexpected file name: no prefix found
+			continue
+		}
+		// The suffix is always "kB" (as of Linux 5.9)
+		eLen := len(val) - 2
+		val = strings.TrimSuffix(val, "kB")
+		if len(val) != eLen {
+			logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file)
+			continue
+		}
+		size, err := strconv.Atoi(val)
 		if err != nil {
-			return []string{}, err
+			return nil, err
 		}
-		sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
-		pageSizes = append(pageSizes, sizeString)
+		// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
+		// but in our case the size is in KB already.
+		if size >= (1 << 20) {
+			val = strconv.Itoa(size>>20) + "GB"
+		} else if size >= (1 << 10) {
+			val = strconv.Itoa(size>>10) + "MB"
+		} else {
+			val += "KB"
+		}
+		pageSizes = append(pageSizes, val)
 	}
 
 	return pageSizes, nil
@@ -303,14 +376,14 @@ func WriteCgroupProc(dir string, pid int) error {
 		return nil
 	}
 
-	cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
+	file, err := fscommon.OpenFile(dir, CgroupProcesses, os.O_WRONLY)
 	if err != nil {
 		return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
 	}
-	defer cgroupProcessesFile.Close()
+	defer file.Close()
 
 	for i := 0; i < 5; i++ {
-		_, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
+		_, err = file.WriteString(strconv.Itoa(pid))
 		if err == nil {
 			return nil
 		}
@@ -327,17 +400,6 @@ func WriteCgroupProc(dir string, pid int) error {
 	return err
 }
 
-// Since the OCI spec is designed for cgroup v1, in some cases
-// there is need to convert from the cgroup v1 configuration to cgroup v2
-// the formula for BlkIOWeight is y = (1 + (x - 10) * 9999 / 990)
-// convert linearly from [10-1000] to [1-10000]
-func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 {
-	if blkIoWeight == 0 {
-		return 0
-	}
-	return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
-}
-
 // Since the OCI spec is designed for cgroup v1, in some cases
 // there is need to convert from the cgroup v1 configuration to cgroup v2
 // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
@@ -377,3 +439,14 @@ func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
 
 	return memorySwap - memory, nil
 }
+
+// Since the OCI spec is designed for cgroup v1, in some cases
+// there is need to convert from the cgroup v1 configuration to cgroup v2
+// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
+// convert linearly from [10-1000] to [1-10000]
+func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
+	if blkIoWeight == 0 {
+		return 0
+	}
+	return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
+}

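Both conversions at the end of this file are easy to sanity-check by hand: a directory name such as hugepages-1048576kB parses to 1048576 KB, i.e. "1GB", and the blkio weight formula maps the v1 range [10, 1000] linearly onto the v2 io.weight range [1, 10000]. A tiny, illustrative check of the exported function (expected outputs in comments):

    package cgroupsdemo

    import (
        "fmt"

        "github.com/opencontainers/runc/libcontainer/cgroups"
    )

    func weights() {
        fmt.Println(cgroups.ConvertBlkIOToIOWeightValue(10))   // 1     (v1 minimum -> v2 minimum)
        fmt.Println(cgroups.ConvertBlkIOToIOWeightValue(500))  // 4950  (1 + 490*9999/990)
        fmt.Println(cgroups.ConvertBlkIOToIOWeightValue(1000)) // 10000 (v1 maximum -> v2 maximum)
    }
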
+ 41 - 59
vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go

@@ -1,16 +1,16 @@
 package cgroups
 
 import (
-	"bufio"
 	"errors"
 	"fmt"
-	"io"
 	"os"
 	"path/filepath"
 	"strings"
+	"sync"
 	"syscall"
 
 	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/moby/sys/mountinfo"
 	"golang.org/x/sys/unix"
 )
 
@@ -23,7 +23,12 @@ const (
 )
 
 var (
-	errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
+	errUnified     = errors.New("not implemented for cgroup v2 unified hierarchy")
+	ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
+
+	readMountinfoOnce sync.Once
+	readMountinfoErr  error
+	cgroupMountinfo   []*mountinfo.Info
 )
 
 type NotFoundError struct {
@@ -90,6 +95,21 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
 	return path
 }
 
+// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
+// with fstype of "cgroup") for the currently running process.
+//
+// The results are cached (to avoid re-reading mountinfo which is relatively
+// expensive), so it is assumed that cgroup mounts are not being changed.
+func readCgroupMountinfo() ([]*mountinfo.Info, error) {
+	readMountinfoOnce.Do(func() {
+		cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
+			mountinfo.FSTypeFilter("cgroup"),
+		)
+	})
+
+	return cgroupMountinfo, readMountinfoErr
+}
+
 // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
 func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
 	if IsCgroup2UnifiedMode() {
@@ -110,56 +130,28 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string,
 		return "", "", errUnified
 	}
 
-	// Avoid parsing mountinfo by checking if subsystem is valid/available.
-	if !isSubsystemAvailable(subsystem) {
-		return "", "", NewNotFoundError(subsystem)
-	}
-
-	f, err := os.Open("/proc/self/mountinfo")
+	mi, err := readCgroupMountinfo()
 	if err != nil {
 		return "", "", err
 	}
-	defer f.Close()
 
-	return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
+	return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
 }
 
-func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
-	scanner := bufio.NewScanner(reader)
-	for scanner.Scan() {
-		txt := scanner.Text()
-		fields := strings.Fields(txt)
-		if len(fields) < 9 {
-			continue
-		}
-		if strings.HasPrefix(fields[4], cgroupPath) {
-			for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
+	for _, mi := range mounts {
+		if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
+			for _, opt := range strings.Split(mi.VFSOptions, ",") {
 				if opt == subsystem {
-					return fields[4], fields[3], nil
+					return mi.Mountpoint, mi.Root, nil
 				}
 			}
 		}
 	}
-	if err := scanner.Err(); err != nil {
-		return "", "", err
-	}
 
 	return "", "", NewNotFoundError(subsystem)
 }
 
-func isSubsystemAvailable(subsystem string) bool {
-	if IsCgroup2UnifiedMode() {
-		panic("don't call isSubsystemAvailable from cgroupv2 code")
-	}
-
-	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
-	if err != nil {
-		return false
-	}
-	_, avail := cgroups[subsystem]
-	return avail
-}
-
 func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
 	if len(m.Subsystems) == 0 {
 		return "", fmt.Errorf("no subsystem for mount")
@@ -168,25 +160,15 @@ func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
 	return getControllerPath(m.Subsystems[0], cgroups)
 }
 
-func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
+func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
 	res := make([]Mount, 0, len(ss))
-	scanner := bufio.NewScanner(mi)
 	numFound := 0
-	for scanner.Scan() && numFound < len(ss) {
-		txt := scanner.Text()
-		sepIdx := strings.Index(txt, " - ")
-		if sepIdx == -1 {
-			return nil, fmt.Errorf("invalid mountinfo format")
-		}
-		if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
-			continue
-		}
-		fields := strings.Split(txt, " ")
+	for _, mi := range mounts {
 		m := Mount{
-			Mountpoint: fields[4],
-			Root:       fields[3],
+			Mountpoint: mi.Mountpoint,
+			Root:       mi.Root,
 		}
-		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+		for _, opt := range strings.Split(mi.VFSOptions, ",") {
 			seen, known := ss[opt]
 			if !known || (!all && seen) {
 				continue
@@ -199,19 +181,18 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
 		if len(m.Subsystems) > 0 || all {
 			res = append(res, m)
 		}
-	}
-	if err := scanner.Err(); err != nil {
-		return nil, err
+		if !all && numFound >= len(ss) {
+			break
+		}
 	}
 	return res, nil
 }
 
 func getCgroupMountsV1(all bool) ([]Mount, error) {
-	f, err := os.Open("/proc/self/mountinfo")
+	mi, err := readCgroupMountinfo()
 	if err != nil {
 		return nil, err
 	}
-	defer f.Close()
 
 	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
@@ -222,7 +203,8 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
 	for s := range allSubsystems {
 		allMap[s] = false
 	}
-	return getCgroupMountsHelper(allMap, f, all)
+
+	return getCgroupMountsHelper(allMap, mi, all)
 }
 
 // GetOwnCgroup returns the relative path to the cgroup docker is running in.

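The caching above works because github.com/moby/sys/mountinfo applies the filter while parsing /proc/self/mountinfo, so a single pass yields exactly the cgroup v1 mounts and nothing else. The same API used standalone (illustrative only):

    package main

    import (
        "fmt"

        "github.com/moby/sys/mountinfo"
    )

    func main() {
        // Same filter as readCgroupMountinfo: only fstype "cgroup" (v1) mounts.
        mounts, err := mountinfo.GetMounts(mountinfo.FSTypeFilter("cgroup"))
        if err != nil {
            panic(err)
        }
        for _, m := range mounts {
            fmt.Printf("%s (root %s, opts %s)\n", m.Mountpoint, m.Root, m.VFSOptions)
        }
    }
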
+ 5 - 7
vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go

@@ -2,6 +2,7 @@ package configs
 
 import (
 	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	"github.com/opencontainers/runc/libcontainer/devices"
 )
 
 type FreezerState string
@@ -42,7 +43,7 @@ type Cgroup struct {
 
 type Resources struct {
 	// Devices is the set of access rules for devices in the container.
-	Devices []*DeviceRule `json:"devices"`
+	Devices []*devices.Rule `json:"devices"`
 
 	// Memory limit (in bytes)
 	Memory int64 `json:"memory"`
@@ -53,12 +54,6 @@ type Resources struct {
 	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
 	MemorySwap int64 `json:"memory_swap"`
 
-	// Kernel memory limit (in bytes)
-	KernelMemory int64 `json:"kernel_memory"`
-
-	// Kernel memory limit for TCP use (in bytes)
-	KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
-
 	// CPU shares (relative weight vs. other containers)
 	CpuShares uint64 `json:"cpu_shares"`
 
@@ -127,6 +122,9 @@ type Resources struct {
 	// CpuWeight sets a proportional bandwidth limit.
 	CpuWeight uint64 `json:"cpu_weight"`
 
+	// Unified is a cgroupv2-only key-value map.
+	Unified map[string]string `json:"unified"`
+
 	// SkipDevices allows skipping the configuration of device permissions.
 	// Used by e.g. kubelet while creating a parent cgroup (kubepods)
 	// common for many containers.

+ 15 - 10
vendor/github.com/opencontainers/runc/libcontainer/configs/config.go

@@ -7,6 +7,7 @@ import (
 	"os/exec"
 	"time"
 
+	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
@@ -30,9 +31,10 @@ type IDMap struct {
 // for syscalls. Additional architectures can be added by specifying them in
 // Architectures.
 type Seccomp struct {
-	DefaultAction Action     `json:"default_action"`
-	Architectures []string   `json:"architectures"`
-	Syscalls      []*Syscall `json:"syscalls"`
+	DefaultAction   Action     `json:"default_action"`
+	Architectures   []string   `json:"architectures"`
+	Syscalls        []*Syscall `json:"syscalls"`
+	DefaultErrnoRet *uint      `json:"default_errno_ret"`
 }
 
 // Action is taken upon rule match in Seccomp
@@ -92,6 +94,9 @@ type Config struct {
 	// Path to a directory containing the container's root filesystem.
 	Rootfs string `json:"rootfs"`
 
+	// Umask is the umask to use inside of the container.
+	Umask *uint32 `json:"umask"`
+
 	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
 	// bind mounts are writable.
 	Readonlyfs bool `json:"readonlyfs"`
@@ -104,7 +109,7 @@ type Config struct {
 	Mounts []*Mount `json:"mounts"`
 
 	// The device nodes that should be automatically created within the container upon container start.  Note, make sure that the node is marked as allowed in the cgroup as well!
-	Devices []*Device `json:"devices"`
+	Devices []*devices.Device `json:"devices"`
 
 	MountLabel string `json:"mount_label"`
 
@@ -218,25 +223,25 @@ const (
 	// the runtime environment has been created but before the pivot_root has been executed.
 	// CreateRuntime is called immediately after the deprecated Prestart hook.
 	// CreateRuntime commands are called in the Runtime Namespace.
-	CreateRuntime = "createRuntime"
+	CreateRuntime HookName = "createRuntime"
 
 	// CreateContainer commands MUST be called as part of the create operation after
 	// the runtime environment has been created but before the pivot_root has been executed.
 	// CreateContainer commands are called in the Container namespace.
-	CreateContainer = "createContainer"
+	CreateContainer HookName = "createContainer"
 
 	// StartContainer commands MUST be called as part of the start operation and before
 	// the container process is started.
 	// StartContainer commands are called in the Container namespace.
-	StartContainer = "startContainer"
+	StartContainer HookName = "startContainer"
 
 	// Poststart commands are executed after the container init process starts.
 	// Poststart commands are called in the Runtime Namespace.
-	Poststart = "poststart"
+	Poststart HookName = "poststart"
 
 	// Poststop commands are executed after the container init process exits.
 	// Poststop commands are called in the Runtime Namespace.
-	Poststop = "poststop"
+	Poststop HookName = "poststop"
 )
 
 type Capabilities struct {
@@ -383,7 +388,7 @@ func (c Command) Run(s *specs.State) error {
 		return err
 	case <-timerCh:
 		cmd.Process.Kill()
-		cmd.Wait()
+		<-errC
 		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
 	}
 }

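The `<-errC` change in Command.Run above fixes a subtle os/exec pitfall: Wait must be called exactly once per command, and a Wait call was already in flight on another goroutine, so the timeout path has to drain that goroutine's result instead of calling Wait a second time. The pattern, reduced to a self-contained sketch (names are mine, not runc's):

    package main

    import (
        "fmt"
        "os/exec"
        "time"
    )

    // runWithTimeout mirrors the hook-command pattern: one goroutine owns
    // cmd.Wait, and the timeout path kills the process and then receives
    // that goroutine's result rather than calling Wait again.
    func runWithTimeout(cmd *exec.Cmd, timeout time.Duration) error {
        if err := cmd.Start(); err != nil {
            return err
        }
        errC := make(chan error, 1)
        go func() { errC <- cmd.Wait() }()
        select {
        case err := <-errC:
            return err
        case <-time.After(timeout):
            cmd.Process.Kill()
            <-errC // reap the process; Wait was already in flight
            return fmt.Errorf("command ran past timeout of %s", timeout)
        }
    }

    func main() {
        err := runWithTimeout(exec.Command("sleep", "10"), time.Second)
        fmt.Println(err)
    }
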
+ 9 - 0
vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go

@@ -0,0 +1,9 @@
+// +build gofuzz
+
+package configs
+
+func FuzzUnmarshalJSON(data []byte) int {
+	hooks := Hooks{}
+	_ = hooks.UnmarshalJSON(data)
+	return 1
+}

+ 0 - 16
vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go

@@ -1,16 +0,0 @@
-// +build !windows
-
-package configs
-
-import (
-	"errors"
-
-	"golang.org/x/sys/unix"
-)
-
-func (d *DeviceRule) Mkdev() (uint64, error) {
-	if d.Major == Wildcard || d.Minor == Wildcard {
-		return 0, errors.New("cannot mkdev() device with wildcards")
-	}
-	return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
-}

+ 0 - 5
vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go

@@ -1,5 +0,0 @@
-package configs
-
-func (d *DeviceRule) Mkdev() (uint64, error) {
-	return 0, nil
-}

+ 17 - 0
vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go

@@ -0,0 +1,17 @@
+package configs
+
+import "github.com/opencontainers/runc/libcontainer/devices"
+
+type (
+	// Deprecated: use libcontainer/devices.Device
+	Device = devices.Device
+
+	// Deprecated: use libcontainer/devices.Rule
+	DeviceRule = devices.Rule
+
+	// Deprecated: use libcontainer/devices.Type
+	DeviceType = devices.Type
+
+	// Deprecated: use libcontainer/devices.Permissions
+	DevicePermissions = devices.Permissions
+)

+ 1 - 1
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go

@@ -56,7 +56,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
 	if nsFile == "" {
 		return false
 	}
-	_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
+	_, err := os.Stat("/proc/self/ns/" + nsFile)
 	// a namespace is supported if it exists and we have permissions to read it
 	supported = err == nil
 	supportedNamespaces[ns] = supported

+ 33 - 29
vendor/github.com/opencontainers/runc/libcontainer/configs/device.go → vendor/github.com/opencontainers/runc/libcontainer/devices/device.go

@@ -1,4 +1,4 @@
-package configs
+package devices
 
 import (
 	"fmt"
@@ -11,7 +11,7 @@ const (
 )
 
 type Device struct {
-	DeviceRule
+	Rule
 
 	// Path to the device.
 	Path string `json:"path"`
@@ -26,10 +26,10 @@ type Device struct {
 	Gid uint32 `json:"gid"`
 }
 
-// DevicePermissions is a cgroupv1-style string to represent device access. It
+// Permissions is a cgroupv1-style string to represent device access. It
 // has to be a string for backward compatibility reasons, hence why it has
 // methods to do set operations.
-type DevicePermissions string
+type Permissions string
 
 const (
 	deviceRead uint = (1 << iota)
@@ -37,7 +37,7 @@ const (
 	deviceMknod
 )
 
-func (p DevicePermissions) toSet() uint {
+func (p Permissions) toSet() uint {
 	var set uint
 	for _, perm := range p {
 		switch perm {
@@ -52,7 +52,7 @@ func (p DevicePermissions) toSet() uint {
 	return set
 }
 
-func fromSet(set uint) DevicePermissions {
+func fromSet(set uint) Permissions {
 	var perm string
 	if set&deviceRead == deviceRead {
 		perm += "r"
@@ -63,53 +63,53 @@ func fromSet(set uint) DevicePermissions {
 	if set&deviceMknod == deviceMknod {
 		perm += "m"
 	}
-	return DevicePermissions(perm)
+	return Permissions(perm)
 }
 
-// Union returns the union of the two sets of DevicePermissions.
-func (p DevicePermissions) Union(o DevicePermissions) DevicePermissions {
+// Union returns the union of the two sets of Permissions.
+func (p Permissions) Union(o Permissions) Permissions {
 	lhs := p.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs | rhs)
 }
 
-// Difference returns the set difference of the two sets of DevicePermissions.
+// Difference returns the set difference of the two sets of Permissions.
 // In set notation, A.Difference(B) gives you A\B.
-func (p DevicePermissions) Difference(o DevicePermissions) DevicePermissions {
+func (p Permissions) Difference(o Permissions) Permissions {
 	lhs := p.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs &^ rhs)
 }
 
-// Intersection computes the intersection of the two sets of DevicePermissions.
-func (p DevicePermissions) Intersection(o DevicePermissions) DevicePermissions {
+// Intersection computes the intersection of the two sets of Permissions.
+func (p Permissions) Intersection(o Permissions) Permissions {
 	lhs := p.toSet()
 	rhs := o.toSet()
 	return fromSet(lhs & rhs)
 }
 
-// IsEmpty returns whether the set of permissions in a DevicePermissions is
+// IsEmpty returns whether the set of permissions in a Permissions is
 // empty.
-func (p DevicePermissions) IsEmpty() bool {
-	return p == DevicePermissions("")
+func (p Permissions) IsEmpty() bool {
+	return p == Permissions("")
 }
 
 // IsValid returns whether the set of permissions is a subset of valid
 // permissions (namely, {r,w,m}).
-func (p DevicePermissions) IsValid() bool {
+func (p Permissions) IsValid() bool {
 	return p == fromSet(p.toSet())
 }
 
-type DeviceType rune
+type Type rune
 
 const (
-	WildcardDevice DeviceType = 'a'
-	BlockDevice    DeviceType = 'b'
-	CharDevice     DeviceType = 'c' // or 'u'
-	FifoDevice     DeviceType = 'p'
+	WildcardDevice Type = 'a'
+	BlockDevice    Type = 'b'
+	CharDevice     Type = 'c' // or 'u'
+	FifoDevice     Type = 'p'
 )
 
-func (t DeviceType) IsValid() bool {
+func (t Type) IsValid() bool {
 	switch t {
 	case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
 		return true
@@ -118,7 +118,7 @@ func (t DeviceType) IsValid() bool {
 	}
 }
 
-func (t DeviceType) CanMknod() bool {
+func (t Type) CanMknod() bool {
 	switch t {
 	case BlockDevice, CharDevice, FifoDevice:
 		return true
@@ -127,7 +127,7 @@ func (t DeviceType) CanMknod() bool {
 	}
 }
 
-func (t DeviceType) CanCgroup() bool {
+func (t Type) CanCgroup() bool {
 	switch t {
 	case WildcardDevice, BlockDevice, CharDevice:
 		return true
@@ -136,10 +136,10 @@ func (t DeviceType) CanCgroup() bool {
 	}
 }
 
-type DeviceRule struct {
+type Rule struct {
 	// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
 	// acts as a wildcard and all fields other than Allow are ignored.
-	Type DeviceType `json:"type"`
+	Type Type `json:"type"`
 
 	// Major is the device's major number.
 	Major int64 `json:"major"`
@@ -149,13 +149,13 @@ type DeviceRule struct {
 
 	// Permissions is the set of permissions that this rule applies to (in the
 	// cgroupv1 format -- any combination of "rwm").
-	Permissions DevicePermissions `json:"permissions"`
+	Permissions Permissions `json:"permissions"`
 
 	// Allow specifies whether this rule is allowed.
 	Allow bool `json:"allow"`
 }
 
-func (d *DeviceRule) CgroupString() string {
+func (d *Rule) CgroupString() string {
 	var (
 		major = strconv.FormatInt(d.Major, 10)
 		minor = strconv.FormatInt(d.Minor, 10)
@@ -168,3 +168,7 @@ func (d *DeviceRule) CgroupString() string {
 	}
 	return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
 }
+
+func (d *Rule) Mkdev() (uint64, error) {
+	return mkDev(d)
+}

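The rename from DevicePermissions to Permissions keeps the cgroupv1 "rwm" string encoding; the set operations are easiest to see with concrete values (outputs noted in comments):

    package main

    import (
        "fmt"

        "github.com/opencontainers/runc/libcontainer/devices"
    )

    func main() {
        a := devices.Permissions("rw")
        b := devices.Permissions("wm")
        fmt.Println(a.Union(b))        // rwm
        fmt.Println(a.Intersection(b)) // w
        fmt.Println(a.Difference(b))   // r
        fmt.Println(devices.Permissions("rwx").IsValid()) // false ('x' is not valid)
    }
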
+ 22 - 14
vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go → vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go

@@ -1,3 +1,5 @@
+// +build !windows
+
 package devices
 
 import (
@@ -6,7 +8,6 @@ import (
 	"os"
 	"path/filepath"
 
-	"github.com/opencontainers/runc/libcontainer/configs"
 	"golang.org/x/sys/unix"
 )
 
@@ -21,9 +22,16 @@ var (
 	ioutilReadDir = ioutil.ReadDir
 )
 
+func mkDev(d *Rule) (uint64, error) {
+	if d.Major == Wildcard || d.Minor == Wildcard {
+		return 0, errors.New("cannot mkdev() device with wildcards")
+	}
+	return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
+}
+
 // Given the path to a device and its cgroup_permissions (which cannot be easily queried), look up the
 // information about a linux device and return that information as a Device struct.
-func DeviceFromPath(path, permissions string) (*configs.Device, error) {
+func DeviceFromPath(path, permissions string) (*Device, error) {
 	var stat unix.Stat_t
 	err := unixLstat(path, &stat)
 	if err != nil {
@@ -31,7 +39,7 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
 	}
 
 	var (
-		devType   configs.DeviceType
+		devType   Type
 		mode      = stat.Mode
 		devNumber = uint64(stat.Rdev)
 		major     = unix.Major(devNumber)
@@ -39,41 +47,41 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
 	)
 	switch mode & unix.S_IFMT {
 	case unix.S_IFBLK:
-		devType = configs.BlockDevice
+		devType = BlockDevice
 	case unix.S_IFCHR:
-		devType = configs.CharDevice
+		devType = CharDevice
 	case unix.S_IFIFO:
-		devType = configs.FifoDevice
+		devType = FifoDevice
 	default:
 		return nil, ErrNotADevice
 	}
-	return &configs.Device{
-		DeviceRule: configs.DeviceRule{
+	return &Device{
+		Rule: Rule{
 			Type:        devType,
 			Major:       int64(major),
 			Minor:       int64(minor),
-			Permissions: configs.DevicePermissions(permissions),
+			Permissions: Permissions(permissions),
 		},
 		Path:     path,
-		FileMode: os.FileMode(mode),
+		FileMode: os.FileMode(mode &^ unix.S_IFMT),
 		Uid:      stat.Uid,
 		Gid:      stat.Gid,
 	}, nil
 }
 
 // HostDevices returns all devices that can be found under /dev directory.
-func HostDevices() ([]*configs.Device, error) {
+func HostDevices() ([]*Device, error) {
 	return GetDevices("/dev")
 }
 
 // GetDevices recursively traverses a directory specified by path
 // and returns all devices found there.
-func GetDevices(path string) ([]*configs.Device, error) {
+func GetDevices(path string) ([]*Device, error) {
 	files, err := ioutilReadDir(path)
 	if err != nil {
 		return nil, err
 	}
-	var out []*configs.Device
+	var out []*Device
 	for _, f := range files {
 		switch {
 		case f.IsDir():
@@ -104,7 +112,7 @@ func GetDevices(path string) ([]*configs.Device, error) {
 			}
 			return nil, err
 		}
-		if device.Type == configs.FifoDevice {
+		if device.Type == FifoDevice {
 			continue
 		}
 		out = append(out, device)

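The one behavioral change hidden in this move is `FileMode: os.FileMode(mode &^ unix.S_IFMT)`: unix.Stat_t.Mode packs the file-type bits (S_IFCHR, S_IFBLK, ...) together with the permission bits, and the type is already carried separately in the Rule, so the raw type bits are now masked off. A small, Linux-only sketch of the distinction:

    package main

    import (
        "fmt"
        "os"

        "golang.org/x/sys/unix"
    )

    func main() {
        var st unix.Stat_t
        if err := unix.Lstat("/dev/null", &st); err != nil {
            panic(err)
        }
        // Raw mode mixes type and permissions, e.g. S_IFCHR|0666.
        fmt.Printf("raw: %#o\n", st.Mode)
        // What DeviceFromPath now stores: permission bits only.
        fmt.Printf("perm: %v\n", os.FileMode(st.Mode&^unix.S_IFMT))
        // The type is carried separately (c/b/p) in the Rule.
        fmt.Println("is char device:", st.Mode&unix.S_IFMT == unix.S_IFCHR)
    }
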
+ 51 - 29
vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c

@@ -59,14 +59,38 @@
 #include <sys/syscall.h>
 
 /* Use our own wrapper for memfd_create. */
-#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
-#  define SYS_memfd_create __NR_memfd_create
+#ifndef SYS_memfd_create
+#  ifdef __NR_memfd_create
+#    define SYS_memfd_create __NR_memfd_create
+#  else
+/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
+#    warning "libc is outdated -- using hard-coded SYS_memfd_create"
+#    if defined(__x86_64__)
+#      define SYS_memfd_create 319
+#    elif defined(__i386__)
+#      define SYS_memfd_create 356
+#    elif defined(__ia64__)
+#      define SYS_memfd_create 1340
+#    elif defined(__arm__)
+#      define SYS_memfd_create 385
+#    elif defined(__aarch64__)
+#      define SYS_memfd_create 279
+#    elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
+#      define SYS_memfd_create 360
+#    elif defined(__s390__) || defined(__s390x__)
+#      define SYS_memfd_create 350
+#    else
+#      warning "unknown architecture -- cannot hard-code SYS_memfd_create"
+#    endif
+#  endif
 #endif
+
 /* memfd_create(2) flags -- copied from <linux/memfd.h>. */
 #ifndef MFD_CLOEXEC
 #  define MFD_CLOEXEC       0x0001U
 #  define MFD_ALLOW_SEALING 0x0002U
 #endif
+
 int memfd_create(const char *name, unsigned int flags)
 {
 #ifdef SYS_memfd_create
@@ -77,7 +101,6 @@ int memfd_create(const char *name, unsigned int flags)
 #endif
 }
 
-
 /* This comes directly from <linux/fcntl.h>. */
 #ifndef F_LINUX_SPECIFIC_BASE
 #  define F_LINUX_SPECIFIC_BASE 1024
@@ -103,7 +126,7 @@ static void *must_realloc(void *ptr, size_t size)
 	void *old = ptr;
 	do {
 		ptr = realloc(old, size);
-	} while(!ptr);
+	} while (!ptr);
 	return ptr;
 }
 
@@ -115,10 +138,10 @@ static void *must_realloc(void *ptr, size_t size)
 static int is_self_cloned(void)
 {
 	int fd, ret, is_cloned = 0;
-	struct stat statbuf = {};
-	struct statfs fsbuf = {};
+	struct stat statbuf = { };
+	struct statfs fsbuf = { };
 
-	fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
+	fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
 	if (fd < 0) {
 		fprintf(stderr, "you have no read access to runc binary file\n");
 		return -ENOTRECOVERABLE;
@@ -274,7 +297,7 @@ enum {
 static int make_execfd(int *fdtype)
 {
 	int fd = -1;
-	char template[PATH_MAX] = {0};
+	char template[PATH_MAX] = { 0 };
 	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
 
 	if (!prefix || *prefix != '/')
@@ -303,7 +326,7 @@ static int make_execfd(int *fdtype)
 	*fdtype = EFD_FILE;
 	fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
 	if (fd >= 0) {
-		struct stat statbuf = {};
+		struct stat statbuf = { };
 		bool working_otmpfile = false;
 
 		/*
@@ -348,27 +371,27 @@ static int seal_execfd(int *fd, int fdtype)
 	switch (fdtype) {
 	case EFD_MEMFD:
 		return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
-	case EFD_FILE: {
-		/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
-		int newfd;
-		char fdpath[PATH_MAX] = {0};
+	case EFD_FILE:{
+			/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
+			int newfd;
+			char fdpath[PATH_MAX] = { 0 };
 
-		if (fchmod(*fd, 0100) < 0)
-			return -1;
+			if (fchmod(*fd, 0100) < 0)
+				return -1;
 
-		if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
-			return -1;
+			if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
+				return -1;
 
-		newfd = open(fdpath, O_PATH | O_CLOEXEC);
-		if (newfd < 0)
-			return -1;
+			newfd = open(fdpath, O_PATH | O_CLOEXEC);
+			if (newfd < 0)
+				return -1;
 
-		close(*fd);
-		*fd = newfd;
-		return 0;
-	}
+			close(*fd);
+			*fd = newfd;
+			return 0;
+		}
 	default:
-	   break;
+		break;
 	}
 	return -1;
 }
@@ -376,7 +399,7 @@ static int seal_execfd(int *fd, int fdtype)
 static int try_bindfd(void)
 {
 	int fd, ret = -1;
-	char template[PATH_MAX] = {0};
+	char template[PATH_MAX] = { 0 };
 	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
 
 	if (!prefix || *prefix != '/')
@@ -404,7 +427,6 @@ static int try_bindfd(void)
 	if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
 		goto out_umount;
 
-
 	/* Get read-only handle that we're sure can't be made read-write. */
 	ret = open(template, O_PATH | O_CLOEXEC);
 
@@ -448,7 +470,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
 			if (n < 0)
 				return -1;
 			nwritten += n;
-		} while(nwritten < nread);
+		} while (nwritten < nread);
 
 		total += nwritten;
 	}
@@ -459,7 +481,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
 static int clone_binary(void)
 {
 	int binfd, execfd;
-	struct stat statbuf = {};
+	struct stat statbuf = { };
 	size_t sent = 0;
 	int fdtype = EFD_NONE;
 

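For readers more comfortable in Go than C: the memfd-create-and-seal primitives that cloned_binary.c wraps by hand are also exposed by golang.org/x/sys/unix. This is only an illustration of the syscalls involved, not how runc itself clones its binary:

    package main

    import (
        "fmt"

        "golang.org/x/sys/unix"
    )

    func main() {
        // Create an anonymous, sealable memory-backed file (Linux 3.17+).
        fd, err := unix.MemfdCreate("clone-demo", unix.MFD_CLOEXEC|unix.MFD_ALLOW_SEALING)
        if err != nil {
            panic(err)
        }
        // ... copy the binary contents into fd here ...

        // Seal it so the contents can no longer be modified or resized.
        seals := unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE
        if _, err := unix.FcntlInt(uintptr(fd), unix.F_ADD_SEALS, seals); err != nil {
            panic(err)
        }
        fmt.Println("sealed memfd:", fd)
    }
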
+ 142 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c

@@ -0,0 +1,142 @@
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef ESCAPE_TEST
+#  include <assert.h>
+#  define test_assert(arg) assert(arg)
+#else
+#  define test_assert(arg)
+#endif
+
+#define DEL '\x7f'
+
+/*
+ * Poor man's version of itoa with base=16 and an input number from 0 to 15,
+ * represented by a char. Converts it to a single hex digit ('0' to 'f').
+ */
+static char hex(char i)
+{
+	test_assert(i >= 0 && i < 16);
+
+	if (i >= 0 && i < 10) {
+		return '0' + i;
+	}
+	if (i >= 10 && i < 16) {
+		return 'a' + i - 10;
+	}
+	return '?';
+}
+
+/*
+ * Given the character, tells how many _extra_ characters are needed
+ * to JSON-escape it. If 0 is returned, the character does not need to
+ * be escaped.
+ */
+static int need_escape(char c)
+{
+	switch (c) {
+	case '\\':
+	case '"':
+	case '\b':
+	case '\n':
+	case '\r':
+	case '\t':
+	case '\f':
+		return 1;
+	case DEL:		// -> \u007f
+		return 5;
+	default:
+		if (c > 0 && c < ' ') {
+			// ASCII decimal 01 to 31 -> \u00xx
+			return 5;
+		}
+		return 0;
+	}
+}
+
+/*
+ * Escape the string so it can be used as a JSON string (per RFC4627,
+ * section 2.5 minimal requirements, plus the DEL (0x7f) character).
+ *
+ * It is expected that the argument is a string allocated via malloc.
+ * In case no escaping is needed, the original string is returned as is;
+ * otherwise, the original string is free'd, and the newly allocated
+ * escaped string is returned. Thus, in any case, the value returned
+ * needs to be free'd by the caller.
+ */
+char *escape_json_string(char *s)
+{
+	int i, j, len;
+	char *c, *out;
+
+	/*
+	 * First, check if escaping is at all needed -- if not, we can avoid
+	 * malloc and return the argument as is.  While at it, count how much
+	 * extra space is required.
+	 *
+	 * XXX: the counting code must be in sync with the escaping code
+	 * (checked by test_assert()s below).
+	 */
+	for (i = j = 0; s[i] != '\0'; i++) {
+		j += need_escape(s[i]);
+	}
+	if (j == 0) {
+		// nothing to escape
+		return s;
+	}
+
+	len = i + j + 1;
+	out = malloc(len);
+	if (!out) {
+		free(s);
+		// As malloc failed, strdup can fail, too, so in the worst case
+		// scenario NULL will be returned from here.
+		return strdup("escape_json_string: out of memory");
+	}
+	for (c = s, j = 0; *c != '\0'; c++) {
+		switch (*c) {
+		case '"':
+		case '\\':
+			test_assert(need_escape(*c) == 1);
+			out[j++] = '\\';
+			out[j++] = *c;
+			continue;
+		}
+		if ((*c < 0 || *c >= ' ') && (*c != DEL)) {
+			// no escape needed
+			test_assert(need_escape(*c) == 0);
+			out[j++] = *c;
+			continue;
+		}
+		out[j++] = '\\';
+		switch (*c) {
+		case '\b':
+			out[j++] = 'b';
+			break;
+		case '\n':
+			out[j++] = 'n';
+			break;
+		case '\r':
+			out[j++] = 'r';
+			break;
+		case '\t':
+			out[j++] = 't';
+			break;
+		case '\f':
+			out[j++] = 'f';
+			break;
+		default:
+			test_assert(need_escape(*c) == 5);
+			out[j++] = 'u';
+			out[j++] = '0';
+			out[j++] = '0';
+			out[j++] = hex(*c >> 4);
+			out[j++] = hex(*c & 0x0f);
+		}
+	}
+	test_assert(j + 1 == len);
+	out[j] = '\0';
+
+	free(s);
+	return out;
+}

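As a cross-check of the escaping table implemented by need_escape and escape_json_string (RFC 4627's minimal set, plus control characters and DEL), the same rules fit in a few lines of Go; this sketch is mine, not part of the vendored code:

    package main

    import (
        "fmt"
        "strings"
    )

    func escapeJSONString(s string) string {
        var b strings.Builder
        for i := 0; i < len(s); i++ {
            c := s[i]
            switch c {
            case '"', '\\':
                b.WriteByte('\\')
                b.WriteByte(c)
            case '\b':
                b.WriteString(`\b`)
            case '\n':
                b.WriteString(`\n`)
            case '\r':
                b.WriteString(`\r`)
            case '\t':
                b.WriteString(`\t`)
            case '\f':
                b.WriteString(`\f`)
            default:
                if (c > 0 && c < ' ') || c == 0x7f { // control chars and DEL
                    fmt.Fprintf(&b, `\u%04x`, c)
                } else {
                    b.WriteByte(c)
                }
            }
        }
        return b.String()
    }

    func main() {
        fmt.Println(escapeJSONString("a\"b\\c\nd\x7f")) // a\"b\\c\nd\u007f
    }
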
+ 222 - 139
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c

@@ -29,6 +29,8 @@
 /* Get all of the CLONE_NEW* flags. */
 #include "namespace.h"
 
+extern char *escape_json_string(char *str);
+
 /* Synchronisation values. */
 enum sync_t {
 	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
@@ -36,7 +38,7 @@ enum sync_t {
 	SYNC_RECVPID_PLS = 0x42,	/* Tell parent we're sending the PID. */
 	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
 	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
-	SYNC_CHILD_READY = 0x45,	/* The child or grandchild is ready to return. */
+	SYNC_CHILD_FINISH = 0x45,	/* The child or grandchild has finished. */
 };
 
 /*
@@ -45,10 +47,14 @@ enum sync_t {
  */
 #define CREATECGROUPNS 0x80
 
+#define STAGE_SETUP  -1
 /* longjmp() arguments. */
-#define JUMP_PARENT 0x00
-#define JUMP_CHILD  0xA0
-#define JUMP_INIT   0xA1
+#define STAGE_PARENT  0
+#define STAGE_CHILD   1
+#define STAGE_INIT    2
+
+/* Stores the current stage of nsexec. */
+int current_stage = STAGE_SETUP;
 
 /* Assume the stack grows down, so arguments should be above it. */
 struct clone_t {
@@ -56,7 +62,7 @@ struct clone_t {
 	 * Reserve some space for clone() to locate arguments
 	 * and retcode in this place
 	 */
-	char stack[4096] __attribute__ ((aligned(16)));
+	char stack[4096] __attribute__((aligned(16)));
 	char stack_ptr[0];
 
 	/* There are two children. This is used to execute the different code. */
@@ -102,31 +108,31 @@ static int logfd = -1;
  * List of netlink message types sent to us as part of bootstrapping the init.
  * These constants are defined in libcontainer/message_linux.go.
  */
-#define INIT_MSG			62000
+#define INIT_MSG		62000
 #define CLONE_FLAGS_ATTR	27281
 #define NS_PATHS_ATTR		27282
-#define UIDMAP_ATTR			27283
-#define GIDMAP_ATTR			27284
+#define UIDMAP_ATTR		27283
+#define GIDMAP_ATTR		27284
 #define SETGROUP_ATTR		27285
 #define OOM_SCORE_ADJ_ATTR	27286
 #define ROOTLESS_EUID_ATTR	27287
-#define UIDMAPPATH_ATTR	    27288
-#define GIDMAPPATH_ATTR	    27289
+#define UIDMAPPATH_ATTR		27288
+#define GIDMAPPATH_ATTR		27289
 
 /*
  * Use the raw syscall for versions of glibc which don't include a function for
  * it, namely (glibc 2.12).
  */
 #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
-#	define _GNU_SOURCE
-#	include "syscall.h"
-#	if !defined(SYS_setns) && defined(__NR_setns)
-#		define SYS_setns __NR_setns
-#	endif
-
-#ifndef SYS_setns
-#	error "setns(2) syscall not supported by glibc version"
-#endif
+#  define _GNU_SOURCE
+#  include "syscall.h"
+#  if !defined(SYS_setns) && defined(__NR_setns)
+#    define SYS_setns __NR_setns
+#  endif
+
+#  ifndef SYS_setns
+#    error "setns(2) syscall not supported by glibc version"
+#  endif
 
 int setns(int fd, int nstype)
 {
@@ -134,33 +140,43 @@ int setns(int fd, int nstype)
 }
 #endif
 
-static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...)
+static void write_log(const char *level, const char *format, ...)
 {
-	char message[1024] = {};
-
+	char *message = NULL, *stage = NULL;
 	va_list args;
+	int ret;
 
 	if (logfd < 0 || level == NULL)
-		return;
+		goto out;
 
 	va_start(args, format);
-	if (vsnprintf(message, sizeof(message), format, args) < 0)
-		goto done;
-
-	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
-done:
+	ret = vasprintf(&message, format, args);
 	va_end(args);
-}
+	if (ret < 0)
+		goto out;
 
-#define write_log(level, fmt, ...) \
-	write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__)
+	message = escape_json_string(message);
+
+	if (current_stage == STAGE_SETUP)
+		stage = strdup("nsexec");
+	else
+		ret = asprintf(&stage, "nsexec-%d", current_stage);
+	if (ret < 0)
+		goto out;
+
+	dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message);
+
+out:
+	free(message);
+	free(stage);
+}
 
 /* XXX: This is ugly. */
 static int syncfd = -1;
 
 #define bail(fmt, ...)                                       \
 	do {                                                       \
-		write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \
+		write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \
 		exit(1);                                                 \
 	} while(0)
 
@@ -187,7 +203,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
 		goto out;
 	}
 
- out:
+out:
 	close(fd);
 	return ret;
 }
@@ -297,9 +313,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
 	if (map == NULL || map_len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
 		if (errno != EPERM)
 			bail("failed to update /proc/%d/uid_map", pid);
+		write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
 		if (try_mapping_tool(path, pid, map, map_len))
 			bail("failed to use newuid map on %d", pid);
 	}
@@ -310,9 +328,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
 	if (map == NULL || map_len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
 		if (errno != EPERM)
 			bail("failed to update /proc/%d/gid_map", pid);
+		write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
 		if (try_mapping_tool(path, pid, map, map_len))
 			bail("failed to use newgid map on %d", pid);
 	}
@@ -323,19 +343,20 @@ static void update_oom_score_adj(char *data, size_t len)
 	if (data == NULL || len <= 0)
 		return;
 
+	write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
 	if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
 		bail("failed to update /proc/self/oom_score_adj");
 }
 
 /* A dummy function that just jumps to the given jumpval. */
-static int child_func(void *arg) __attribute__ ((noinline));
+static int child_func(void *arg) __attribute__((noinline));
 static int child_func(void *arg)
 {
 	struct clone_t *ca = (struct clone_t *)arg;
 	longjmp(*ca->env, ca->jmpval);
 }
 
-static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
 static int clone_parent(jmp_buf *env, int jmpval)
 {
 	struct clone_t ca = {
@@ -507,7 +528,6 @@ void join_namespaces(char *nslist)
 	char *namespace = strtok_r(nslist, ",", &saveptr);
 	struct namespace_t {
 		int fd;
-		int ns;
 		char type[PATH_MAX];
 		char path[PATH_MAX];
 	} *namespaces = NULL;
@@ -542,7 +562,7 @@ void join_namespaces(char *nslist)
 			bail("failed to open %s", path);
 
 		ns->fd = fd;
-		ns->ns = nsflag(namespace);
+		strncpy(ns->type, namespace, PATH_MAX - 1);
 		strncpy(ns->path, path, PATH_MAX - 1);
 		ns->path[PATH_MAX - 1] = '\0';
 	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
@@ -555,12 +575,14 @@ void join_namespaces(char *nslist)
 	 */
 
 	for (i = 0; i < num; i++) {
-		struct namespace_t ns = namespaces[i];
+		struct namespace_t *ns = &namespaces[i];
+		int flag = nsflag(ns->type);
 
-		if (setns(ns.fd, ns.ns) < 0)
-			bail("failed to setns to %s", ns.path);
+		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
+		if (setns(ns->fd, flag) < 0)
+			bail("failed to setns into %s namespace", ns->type);
 
-		close(ns.fd);
+		close(ns->fd);
 	}
 
 	free(namespaces);
@@ -569,6 +591,14 @@ void join_namespaces(char *nslist)
 /* Defined in cloned_binary.c. */
 extern int ensure_cloned_binary(void);
 
+static inline int sane_kill(pid_t pid, int signum)
+{
+	if (pid > 0)
+		return kill(pid, signum);
+	else
+		return 0;
+}
+
 void nsexec(void)
 {
 	int pipenum;
@@ -598,7 +628,14 @@ void nsexec(void)
 	if (ensure_cloned_binary() < 0)
 		bail("could not ensure we are a cloned binary");
 
-	write_log(DEBUG, "nsexec started");
+	/*
+	 * Inform the parent we're past initial setup.
+	 * For the other side of this, see initWaiter.
+	 */
+	if (write(pipenum, "", 1) != 1)
+		bail("could not inform the parent we are past initial setup");
+
+	write_log(DEBUG, "=> nsexec container setup");
 
 	/* Parse all of the netlink configuration. */
 	nl_parse(pipenum, &config);
@@ -622,6 +659,7 @@ void nsexec(void)
 	 * containers), which is the recommendation from the kernel folks.
 	 */
 	if (config.namespaces) {
+		write_log(DEBUG, "set process as non-dumpable");
 		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
 			bail("failed to set process as non-dumpable");
 	}
@@ -686,45 +724,49 @@ void nsexec(void)
 	 * -- Aleksa "what has my life come to?" Sarai
 	 */
 
-	switch (setjmp(env)) {
+	current_stage = setjmp(env);
+	switch (current_stage) {
 		/*
 		 * Stage 0: We're in the parent. Our job is just to create a new child
-		 *          (stage 1: JUMP_CHILD) process and write its uid_map and
+		 *          (stage 1: STAGE_CHILD) process and write its uid_map and
 		 *          gid_map. That process will go on to create a new process, then
 		 *          it will send us its PID which we will send to the bootstrap
 		 *          process.
 		 */
-	case JUMP_PARENT:{
+	case STAGE_PARENT:{
 			int len;
-			pid_t child, first_child = -1;
-			bool ready = false;
+			pid_t stage1_pid = -1, stage2_pid = -1;
+			bool stage1_complete, stage2_complete;
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-0");
 
 			/* Start the process of getting a container. */
-			child = clone_parent(&env, JUMP_CHILD);
-			if (child < 0)
-				bail("unable to fork: child_func");
+			write_log(DEBUG, "spawn stage-1");
+			stage1_pid = clone_parent(&env, STAGE_CHILD);
+			if (stage1_pid < 0)
+				bail("unable to spawn stage-1");
 
-			/*
-			 * State machine for synchronisation with the children.
-			 *
-			 * Father only return when both child and grandchild are
-			 * ready, so we can receive all possible error codes
-			 * generated by children.
-			 */
 			syncfd = sync_child_pipe[1];
 			close(sync_child_pipe[0]);
 
-			while (!ready) {
+			/*
+			 * State machine for synchronisation with the children. We only
+			 * return once both the child and grandchild are ready.
+			 */
+			write_log(DEBUG, "-> stage-1 synchronisation loop");
+			stage1_complete = false;
+			while (!stage1_complete) {
 				enum sync_t s;
 
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with child: next state");
+					bail("failed to sync with stage-1: next state");
 
 				switch (s) {
 				case SYNC_USERMAP_PLS:
+					write_log(DEBUG, "stage-1 requested userns mappings");
+
 					/*
 					 * Enable setgroups(2) if we've been asked to. But we also
 					 * have to explicitly disable setgroups(2) if we're
@@ -735,70 +777,78 @@ void nsexec(void)
 					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
 					 * newuidmap/newgidmap shall be used.
 					 */
-
 					if (config.is_rootless_euid && !config.is_setgroup)
-						update_setgroups(child, SETGROUPS_DENY);
+						update_setgroups(stage1_pid, SETGROUPS_DENY);
 
 					/* Set up mappings. */
-					update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
-					update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
+					update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
+					update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
 
 					s = SYNC_USERMAP_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						kill(child, SIGKILL);
-						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
 					}
 					break;
-				case SYNC_RECVPID_PLS:{
-						first_child = child;
-
-						/* Get the init_func pid. */
-						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
-							kill(first_child, SIGKILL);
-							bail("failed to sync with child: read(childpid)");
-						}
-
-						/* Send ACK. */
-						s = SYNC_RECVPID_ACK;
-						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-							kill(first_child, SIGKILL);
-							kill(child, SIGKILL);
-							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
-						}
-
-						/* Send the init_func pid back to our parent.
-						 *
-						 * Send the init_func pid and the pid of the first child back to our parent.
-						 * We need to send both back because we can't reap the first child we created (CLONE_PARENT).
-						 * It becomes the responsibility of our parent to reap the first child.
-						 */
-						len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
-						if (len < 0) {
-							kill(child, SIGKILL);
-							bail("unable to generate JSON for child pid");
-						}
+				case SYNC_RECVPID_PLS:
+					write_log(DEBUG, "stage-1 requested pid to be forwarded");
+
+					/* Get the stage-2 pid. */
+					if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: read(stage2_pid)");
+					}
+
+					/* Send ACK. */
+					s = SYNC_RECVPID_ACK;
+					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
+					}
+
+					/*
+					 * Send both the stage-1 and stage-2 pids back to runc.
+					 * runc needs the stage-2 to continue process management,
+					 * but because stage-1 was spawned with CLONE_PARENT we
+					 * cannot reap it within stage-0 and thus we need to ask
+					 * runc to reap the zombie for us.
+					 */
+					write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
+						  stage1_pid, stage2_pid);
+				len = dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n",
+					      stage1_pid, stage2_pid);
+					if (len < 0) {
+						sane_kill(stage1_pid, SIGKILL);
+						sane_kill(stage2_pid, SIGKILL);
+						bail("failed to sync with runc: write(pid-JSON)");
 					}
 					break;
-				case SYNC_CHILD_READY:
-					ready = true;
+				case SYNC_CHILD_FINISH:
+					write_log(DEBUG, "stage-1 complete");
+					stage1_complete = true;
 					break;
 				default:
 					bail("unexpected sync value: %u", s);
 				}
 			}
+			write_log(DEBUG, "<- stage-1 synchronisation loop");
 
 			/* Now sync with grandchild. */
-
 			syncfd = sync_grandchild_pipe[1];
 			close(sync_grandchild_pipe[0]);
-
-			ready = false;
-			while (!ready) {
+			write_log(DEBUG, "-> stage-2 synchronisation loop");
+			stage2_complete = false;
+			while (!stage2_complete) {
 				enum sync_t s;
 
+				write_log(DEBUG, "signalling stage-2 to run");
 				s = SYNC_GRANDCHILD;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-					kill(child, SIGKILL);
+					sane_kill(stage2_pid, SIGKILL);
 					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
 				}
 
@@ -806,27 +856,31 @@ void nsexec(void)
 					bail("failed to sync with child: next state");
 
 				switch (s) {
-				case SYNC_CHILD_READY:
-					ready = true;
+				case SYNC_CHILD_FINISH:
+					write_log(DEBUG, "stage-2 complete");
+					stage2_complete = true;
 					break;
 				default:
 					bail("unexpected sync value: %u", s);
 				}
 			}
+			write_log(DEBUG, "<- stage-2 synchronisation loop");
+			write_log(DEBUG, "<~ nsexec stage-0");
 			exit(0);
 		}
+		break;
 
 		/*
 		 * Stage 1: We're in the first child process. Our job is to join any
-		 *          provided namespaces in the netlink payload and unshare all
-		 *          of the requested namespaces. If we've been asked to
-		 *          CLONE_NEWUSER, we will ask our parent (stage 0) to set up
-		 *          our user mappings for us. Then, we create a new child
-		 *          (stage 2: JUMP_INIT) for PID namespace. We then send the
-		 *          child's PID to our parent (stage 0).
+		 *          provided namespaces in the netlink payload and unshare all of
+		 *          the requested namespaces. If we've been asked to CLONE_NEWUSER,
+		 *          we will ask our parent (stage 0) to set up our user mappings
+		 *          for us. Then, we create a new child (stage 2: STAGE_INIT) for
+		 *          PID namespace. We then send the child's PID to our parent
+		 *          (stage 0).
 		 */
-	case JUMP_CHILD:{
-			pid_t child;
+	case STAGE_CHILD:{
+			pid_t stage2_pid = -1;
 			enum sync_t s;
 
 			/* We're in a child and thus need to tell the parent if we die. */
@@ -835,11 +889,12 @@ void nsexec(void)
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-1");
 
 			/*
 			 * We need to setns first. We cannot do this earlier (in stage 0)
 			 * because of the fact that we forked to get here (the PID of
-			 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
+			 * [stage 2: STAGE_INIT]) would be meaningless). We could send it
 			 * using cmsg(3) but that's just annoying.
 			 */
 			if (config.namespaces)
@@ -865,40 +920,50 @@ void nsexec(void)
 			 * problem.
 			 */
 			if (config.cloneflags & CLONE_NEWUSER) {
+				write_log(DEBUG, "unshare user namespace");
 				if (unshare(CLONE_NEWUSER) < 0)
 					bail("failed to unshare user namespace");
 				config.cloneflags &= ~CLONE_NEWUSER;
 
 				/*
-				 * We don't have the privileges to do any mapping here (see the
-				 * clone_parent rant). So signal our parent to hook us up.
+				 * We need to set ourselves as dumpable temporarily so that the
+				 * parent process can write to our procfs files.
 				 */
-
-				/* Switching is only necessary if we joined namespaces. */
 				if (config.namespaces) {
+					write_log(DEBUG, "temporarily set process as dumpable");
 					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
-						bail("failed to set process as dumpable");
+						bail("failed to temporarily set process as dumpable");
 				}
+
+				/*
+				 * We don't have the privileges to do any mapping here (see the
+				 * clone_parent rant). So signal stage-0 to do the mapping for
+				 * us.
+				 */
+				write_log(DEBUG, "request stage-0 to map user namespace");
 				s = SYNC_USERMAP_PLS;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
 
 				/* ... wait for mapping ... */
-
+				write_log(DEBUG, "waiting for stage-0 to map user namespace");
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
 				if (s != SYNC_USERMAP_ACK)
 					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
-				/* Switching is only necessary if we joined namespaces. */
+
+				/* Revert temporary re-dumpable setting. */
 				if (config.namespaces) {
+					write_log(DEBUG, "re-set process as non-dumpable");
 					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
-						bail("failed to set process as dumpable");
+						bail("failed to re-set process as non-dumpable");
 				}
 
 				/* Become root in the namespace proper. */
 				if (setresuid(0, 0, 0) < 0)
 					bail("failed to become root in user namespace");
 			}
+
 			/*
 			 * Unshare all of the namespaces. Now, it should be noted that this
 			 * ordering might break in the future (especially with rootless
@@ -909,8 +974,9 @@ void nsexec(void)
 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
 			 * was broken, so we'll just do it the long way anyway.
 			 */
+			write_log(DEBUG, "unshare remaining namespaces (except cgroupns)");
 			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
-				bail("failed to unshare namespaces");
+				bail("failed to unshare remaining namespaces (except cgroupns)");
 
 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
@@ -921,41 +987,45 @@ void nsexec(void)
 			 * which would break many applications and libraries, so we must fork
 			 * to actually enter the new PID namespace.
 			 */
-			child = clone_parent(&env, JUMP_INIT);
-			if (child < 0)
-				bail("unable to fork: init_func");
+			write_log(DEBUG, "spawn stage-2");
+			stage2_pid = clone_parent(&env, STAGE_INIT);
+			if (stage2_pid < 0)
+				bail("unable to spawn stage-2");
 
 			/* Send the child to our parent, which knows what it's doing. */
+			write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
 			s = SYNC_RECVPID_PLS;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
 			}
-			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
-				kill(child, SIGKILL);
-				bail("failed to sync with parent: write(childpid)");
+			if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
+				sane_kill(stage2_pid, SIGKILL);
+				bail("failed to sync with parent: write(stage2_pid)");
 			}
 
 			/* ... wait for parent to get the pid ... */
-
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
 			}
 			if (s != SYNC_RECVPID_ACK) {
-				kill(child, SIGKILL);
+				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
 			}
 
-			s = SYNC_CHILD_READY;
+			write_log(DEBUG, "signal completion to stage-0");
+			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-				kill(child, SIGKILL);
-				bail("failed to sync with parent: write(SYNC_CHILD_READY)");
+				sane_kill(stage2_pid, SIGKILL);
+				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
 			}
 
-			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
+			/* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
+			write_log(DEBUG, "<~ nsexec stage-1");
 			exit(0);
 		}
+		break;
 
 		/*
 		 * Stage 2: We're the final child process, and the only process that will
@@ -963,7 +1033,7 @@ void nsexec(void)
 		 *          final cleanup steps and then return to the Go runtime to allow
 		 *          init_linux.go to run.
 		 */
-	case JUMP_INIT:{
+	case STAGE_INIT:{
 			/*
 			 * We're inside the child now, having jumped from the
 			 * start_child() code after forking in the parent.
@@ -978,6 +1048,7 @@ void nsexec(void)
 
 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
+			write_log(DEBUG, "~> nsexec stage-2");
 
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
@@ -998,21 +1069,30 @@ void nsexec(void)
 					bail("setgroups failed");
 			}
 
-			/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
+			/*
+			 * Wait until our topmost parent has finished cgroup setup in
+			 * p.manager.Apply().
+			 *
+			 * TODO(cyphar): Check if this code is actually needed because we
+			 *               should be in the cgroup even from stage-0, so
+			 *               waiting until now might not make sense.
+			 */
 			if (config.cloneflags & CLONE_NEWCGROUP) {
 				uint8_t value;
 				if (read(pipenum, &value, sizeof(value)) != sizeof(value))
 					bail("read synchronisation value failed");
 				if (value == CREATECGROUPNS) {
+					write_log(DEBUG, "unshare cgroup namespace");
 					if (unshare(CLONE_NEWCGROUP) < 0)
 						bail("failed to unshare cgroup namespace");
 				} else
 					bail("received unknown synchronisation value");
 			}
 
-			s = SYNC_CHILD_READY;
+			write_log(DEBUG, "signal completion to stage-0");
+			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
-				bail("failed to sync with patent: write(SYNC_CHILD_READY)");
+				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
 
 			/* Close sync pipes. */
 			close(sync_grandchild_pipe[0]);
@@ -1021,10 +1101,13 @@ void nsexec(void)
 			nl_free(&config);
 
 			/* Finish executing, let the Go runtime take over. */
+			write_log(DEBUG, "<= nsexec container setup");
+			write_log(DEBUG, "booting up go runtime ...");
 			return;
 		}
+		break;
 	default:
-		bail("unexpected jump value");
+		bail("unknown stage '%d' for jump value", current_stage);
 	}
 
 	/* Should never be reached. */
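
For readers following the handshake above: stage-0 now reports both pids to runc as a single JSON object, {"stage1_pid":%d,"stage2_pid":%d}, written to the bootstrap pipe. A minimal Go sketch of the consuming side, assuming a json.Decoder over that pipe; the struct and function names are illustrative, not runc's actual implementation.

package main

import (
	"encoding/json"
	"fmt"
	"os"
)

// stagePids mirrors the JSON object written by stage-0 above.
type stagePids struct {
	Stage1 int `json:"stage1_pid"` // CLONE_PARENT zombie that runc must reap
	Stage2 int `json:"stage2_pid"` // the container's init process
}

func main() {
	// In runc the decoder would read from the bootstrap pipe; stdin
	// stands in for it here.
	var p stagePids
	if err := json.NewDecoder(os.Stdin).Decode(&p); err != nil {
		fmt.Fprintln(os.Stderr, "decode nsexec pid payload:", err)
		os.Exit(1)
	}
	fmt.Printf("stage1=%d stage2=%d\n", p.Stage1, p.Stage2)
}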

+ 1 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c

@@ -0,0 +1 @@
+../escape.c

+ 53 - 0
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go

@@ -0,0 +1,53 @@
+package escapetest
+
+// This file is part of the escape_json_string unit test.
+// It is in a separate package so cgo can be used together
+// with go test.
+
+// #include <stdlib.h>
+// extern char *escape_json_string(char *str);
+// #cgo CFLAGS: -DESCAPE_TEST=1
+import "C"
+
+import (
+	"testing"
+	"unsafe"
+)
+
+func testEscapeJsonString(t *testing.T, input, want string) {
+	in := C.CString(input)
+	out := C.escape_json_string(in)
+	got := C.GoString(out)
+	C.free(unsafe.Pointer(out))
+	t.Logf("input: %q, output: %q", input, got)
+	if got != want {
+		t.Errorf("Failed on input: %q, want %q, got %q", input, want, got)
+	}
+}
+
+func testEscapeJson(t *testing.T) {
+	testCases := []struct {
+		input, output string
+	}{
+		{"", ""},
+		{"abcdef", "abcdef"},
+		{`\\\\\\`, `\\\\\\\\\\\\`},
+		{`with"quote`, `with\"quote`},
+		{"\n\r\b\t\f\\", `\n\r\b\t\f\\`},
+		{"\007", "\\u0007"},
+		{"\017 \020 \037", "\\u000f \\u0010 \\u001f"},
+		{"\033", "\\u001b"},
+		{`<->`, `<->`},
+		{"\176\177\200", "~\\u007f\200"},
+		{"\000", ""},
+		{"a\x7fxc", "a\\u007fxc"},
+		{"a\033xc", "a\\u001bxc"},
+		{"a\nxc", "a\\nxc"},
+		{"a\\xc", "a\\\\xc"},
+		{"Barney B\303\244r", "Barney B\303\244r"},
+	}
+
+	for _, tc := range testCases {
+		testEscapeJsonString(t, tc.input, tc.output)
+	}
+}
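
The table above fixes the escaping rules: quote, backslash and the common C escapes get their two-character forms, remaining control bytes and DEL become \u00xx, a NUL terminates the (C) string, and bytes at or above 0x80 pass through untouched. A Go sketch of those same rules, mirroring the test expectations rather than the vendored C implementation:

package main

import (
	"fmt"
	"strings"
)

// escapeJSONString is a sketch of the behaviour the test table encodes,
// not the vendored C code.
func escapeJSONString(s string) string {
	var b strings.Builder
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c == 0 {
			break // a C string ends at NUL, hence {"\000", ""}
		}
		switch c {
		case '"':
			b.WriteString(`\"`)
		case '\\':
			b.WriteString(`\\`)
		case '\b':
			b.WriteString(`\b`)
		case '\f':
			b.WriteString(`\f`)
		case '\n':
			b.WriteString(`\n`)
		case '\r':
			b.WriteString(`\r`)
		case '\t':
			b.WriteString(`\t`)
		default:
			if c < 0x20 || c == 0x7f {
				// remaining control bytes and DEL -> \u00xx
				fmt.Fprintf(&b, `\u%04x`, c)
			} else {
				b.WriteByte(c) // incl. bytes >= 0x80, passed through
			}
		}
	}
	return b.String()
}

func main() {
	fmt.Println(escapeJSONString("a\x1bxc"))    // a\u001bxc
	fmt.Println(escapeJSONString(`with"quote`)) // with\"quote
}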

+ 0 - 41
vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go

@@ -1,41 +0,0 @@
-package user
-
-import (
-	"errors"
-)
-
-var (
-	// The current operating system does not provide the required data for user lookups.
-	ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
-	// No matching entries found in file.
-	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
-	ErrNoGroupEntries  = errors.New("no matching entries in group file")
-)
-
-// LookupUser looks up a user by their username in /etc/passwd. If the user
-// cannot be found (or there is no /etc/passwd file on the filesystem), then
-// LookupUser returns an error.
-func LookupUser(username string) (User, error) {
-	return lookupUser(username)
-}
-
-// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
-// be found (or there is no /etc/passwd file on the filesystem), then LookupId
-// returns an error.
-func LookupUid(uid int) (User, error) {
-	return lookupUid(uid)
-}
-
-// LookupGroup looks up a group by its name in /etc/group. If the group cannot
-// be found (or there is no /etc/group file on the filesystem), then LookupGroup
-// returns an error.
-func LookupGroup(groupname string) (Group, error) {
-	return lookupGroup(groupname)
-}
-
-// LookupGid looks up a group by its group id in /etc/group. If the group cannot
-// be found (or there is no /etc/group file on the filesystem), then LookupGid
-// returns an error.
-func LookupGid(gid int) (Group, error) {
-	return lookupGid(gid)
-}

+ 16 - 4
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go

@@ -16,13 +16,19 @@ const (
 	unixGroupPath  = "/etc/group"
 )
 
-func lookupUser(username string) (User, error) {
+// LookupUser looks up a user by their username in /etc/passwd. If the user
+// cannot be found (or there is no /etc/passwd file on the filesystem), then
+// LookupUser returns an error.
+func LookupUser(username string) (User, error) {
 	return lookupUserFunc(func(u User) bool {
 		return u.Name == username
 	})
 }
 
-func lookupUid(uid int) (User, error) {
+// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
+// be found (or there is no /etc/passwd file on the filesystem), then LookupId
+// returns an error.
+func LookupUid(uid int) (User, error) {
 	return lookupUserFunc(func(u User) bool {
 		return u.Uid == uid
 	})
@@ -51,13 +57,19 @@ func lookupUserFunc(filter func(u User) bool) (User, error) {
 	return users[0], nil
 }
 
-func lookupGroup(groupname string) (Group, error) {
+// LookupGroup looks up a group by its name in /etc/group. If the group cannot
+// be found (or there is no /etc/group file on the filesystem), then LookupGroup
+// returns an error.
+func LookupGroup(groupname string) (Group, error) {
 	return lookupGroupFunc(func(g Group) bool {
 		return g.Name == groupname
 	})
 }
 
-func lookupGid(gid int) (Group, error) {
+// LookupGid looks up a group by its group id in /etc/group. If the group cannot
+// be found (or there is no /etc/group file on the filesystem), then LookupGid
+// returns an error.
+func LookupGid(gid int) (Group, error) {
 	return lookupGroupFunc(func(g Group) bool {
 		return g.Gid == gid
 	})
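
With the lookup helpers now exported directly from lookup_unix.go (and the Windows variant above removed), the package is used as before. A small usage sketch, assuming a Unix host with the usual /etc/passwd and /etc/group files:

package main

import (
	"fmt"
	"log"

	"github.com/opencontainers/runc/libcontainer/user"
)

func main() {
	// Lookups read the standard passwd/group files and fail with
	// user.ErrNoPasswdEntries / user.ErrNoGroupEntries on no match.
	u, err := user.LookupUser("root")
	if err != nil {
		log.Fatal(err)
	}
	g, err := user.LookupGid(u.Gid)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s: uid=%d gid=%d primary group %q\n", u.Name, u.Uid, u.Gid, g.Name)
}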

+ 0 - 40
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go

@@ -1,40 +0,0 @@
-// +build windows
-
-package user
-
-import (
-	"fmt"
-	"os/user"
-)
-
-func lookupUser(username string) (User, error) {
-	u, err := user.Lookup(username)
-	if err != nil {
-		return User{}, err
-	}
-	return userFromOS(u)
-}
-
-func lookupUid(uid int) (User, error) {
-	u, err := user.LookupId(fmt.Sprintf("%d", uid))
-	if err != nil {
-		return User{}, err
-	}
-	return userFromOS(u)
-}
-
-func lookupGroup(groupname string) (Group, error) {
-	g, err := user.LookupGroup(groupname)
-	if err != nil {
-		return Group{}, err
-	}
-	return groupFromOS(g)
-}
-
-func lookupGid(gid int) (Group, error) {
-	g, err := user.LookupGroupId(fmt.Sprintf("%d", gid))
-	if err != nil {
-		return Group{}, err
-	}
-	return groupFromOS(g)
-}

+ 10 - 42
vendor/github.com/opencontainers/runc/libcontainer/user/user.go

@@ -2,10 +2,10 @@ package user
 
 import (
 	"bufio"
+	"errors"
 	"fmt"
 	"io"
 	"os"
-	"os/user"
 	"strconv"
 	"strings"
 )
@@ -16,6 +16,13 @@ const (
 )
 
 var (
+	// The current operating system does not provide the required data for user lookups.
+	ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
+
+	// No matching entries found in file.
+	ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
+	ErrNoGroupEntries  = errors.New("no matching entries in group file")
+
 	ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
 )
 
@@ -29,28 +36,6 @@ type User struct {
 	Shell string
 }
 
-// userFromOS converts an os/user.(*User) to local User
-//
-// (This does not include Pass, Shell or Gecos)
-func userFromOS(u *user.User) (User, error) {
-	newUser := User{
-		Name: u.Username,
-		Home: u.HomeDir,
-	}
-	id, err := strconv.Atoi(u.Uid)
-	if err != nil {
-		return newUser, err
-	}
-	newUser.Uid = id
-
-	id, err = strconv.Atoi(u.Gid)
-	if err != nil {
-		return newUser, err
-	}
-	newUser.Gid = id
-	return newUser, nil
-}
-
 type Group struct {
 	Name string
 	Pass string
@@ -58,23 +43,6 @@ type Group struct {
 	List []string
 }
 
-// groupFromOS converts an os/user.(*Group) to local Group
-//
-// (This does not include Pass or List)
-func groupFromOS(g *user.Group) (Group, error) {
-	newGroup := Group{
-		Name: g.Name,
-	}
-
-	id, err := strconv.Atoi(g.Gid)
-	if err != nil {
-		return newGroup, err
-	}
-	newGroup.Gid = id
-
-	return newGroup, nil
-}
-
 // SubID represents an entry in /etc/sub{u,g}id
 type SubID struct {
 	Name  string
@@ -466,7 +434,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
 		// we asked for a group but didn't find it. let's check to see
 		// if we wanted a numeric group
 		if !found {
-			gid, err := strconv.Atoi(ag)
+			gid, err := strconv.ParseInt(ag, 10, 64)
 			if err != nil {
 				return nil, fmt.Errorf("Unable to find group %s", ag)
 			}
@@ -474,7 +442,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
 			if gid < minId || gid > maxId {
 				return nil, ErrRange
 			}
-			gidMap[gid] = struct{}{}
+			gidMap[int(gid)] = struct{}{}
 		}
 	}
 	gids := []int{}
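
Replacing strconv.Atoi with ParseInt at an explicit 64-bit size keeps an oversized numeric group from wrapping on 32-bit platforms before the minId/maxId range check runs. A usage sketch of GetAdditionalGroups against an inlined group file; names and ids are illustrative:

package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/opencontainers/runc/libcontainer/user"
)

func main() {
	groupFile := strings.NewReader("audio:x:63:alice\nvideo:x:39:alice\n")
	// Names are resolved against the reader; a bare number such as "1001"
	// is accepted as-is once it passes the range check shown above.
	gids, err := user.GetAdditionalGroups([]string{"audio", "1001"}, groupFile)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(gids) // [63 1001] (order not guaranteed)
}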

+ 42 - 0
vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go

@@ -0,0 +1,42 @@
+// +build gofuzz
+
+package user
+
+import (
+	"io"
+	"strings"
+)
+
+func IsDivisibleBy(n int, divisibleby int) bool {
+	return (n % divisibleby) == 0
+}
+
+func FuzzUser(data []byte) int {
+	if len(data) == 0 {
+		return -1
+	}
+	if !IsDivisibleBy(len(data), 5) {
+		return -1
+	}
+
+	var divided [][]byte
+
+	chunkSize := len(data) / 5
+
+	for i := 0; i < len(data); i += chunkSize {
+		end := i + chunkSize
+
+		divided = append(divided, data[i:end])
+	}
+
+	_, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil)
+
+	var passwd, group io.Reader
+
+	group = strings.NewReader(string(divided[1]))
+	_, _ = GetAdditionalGroups([]string{string(divided[2])}, group)
+
+	passwd = strings.NewReader(string(divided[3]))
+	_, _ = GetExecUser(string(divided[4]), nil, passwd, group)
+	return 1
+}

+ 5 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go

@@ -0,0 +1,5 @@
+package userns
+
+// RunningInUserNS detects whether we are currently running in a user namespace.
+// Originally copied from github.com/lxc/lxd/shared/util.go
+var RunningInUserNS = runningInUserNS

+ 15 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go

@@ -0,0 +1,15 @@
+// +build gofuzz
+
+package userns
+
+import (
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/user"
+)
+
+func FuzzUIDMap(data []byte) int {
+	uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
+	_ = uidMapInUserNS(uidmap)
+	return 1
+}

+ 37 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go

@@ -0,0 +1,37 @@
+package userns
+
+import (
+	"sync"
+
+	"github.com/opencontainers/runc/libcontainer/user"
+)
+
+var (
+	inUserNS bool
+	nsOnce   sync.Once
+)
+
+// runningInUserNS detects whether we are currently running in a user namespace.
+// Originally copied from github.com/lxc/lxd/shared/util.go
+func runningInUserNS() bool {
+	nsOnce.Do(func() {
+		uidmap, err := user.CurrentProcessUIDMap()
+		if err != nil {
+			// This kernel-provided file only exists if user namespaces are supported
+			return
+		}
+		inUserNS = uidMapInUserNS(uidmap)
+	})
+	return inUserNS
+}
+
+func uidMapInUserNS(uidmap []user.IDMap) bool {
+	/*
+	 * We assume we are in the initial user namespace if we have a full
+	 * range - 4294967295 uids starting at uid 0.
+	 */
+	if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
+		return false
+	}
+	return true
+}
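
A usage sketch of the new userns package. RunningInUserNS is an exported variable of function type, and the sync.Once above caches the answer for the life of the process:

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/userns"
)

func main() {
	// The answer is computed once and then reused on every call.
	if userns.RunningInUserNS() {
		fmt.Println("running inside a user namespace")
	} else {
		fmt.Println("running in the initial user namespace")
	}
}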

+ 17 - 0
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go

@@ -0,0 +1,17 @@
+// +build !linux
+
+package userns
+
+import "github.com/opencontainers/runc/libcontainer/user"
+
+// runningInUserNS is a stub for non-Linux systems
+// Always returns false
+func runningInUserNS() bool {
+	return false
+}
+
+// uidMapInUserNS is a stub for non-Linux systems
+// Always returns false
+func uidMapInUserNS(uidmap []user.IDMap) bool {
+	return false
+}

+ 1 - 1
vendor/github.com/opencontainers/runtime-spec/README.md

@@ -135,7 +135,7 @@ Read more on [How to Write a Git Commit Message][how-to-git-commit] or the Discu
 8. When possible, one keyword to scope the change in the subject (i.e. "README: ...", "runtime: ...")
 
 
-[charter]: https://www.opencontainers.org/about/governance
+[charter]: https://github.com/opencontainers/tob/blob/master/CHARTER.md
 [code-of-conduct]: https://github.com/opencontainers/org/blob/master/CODE_OF_CONDUCT.md
 [dev-list]: https://groups.google.com/a/opencontainers.org/forum/#!forum/dev
 [how-to-git-commit]: http://chris.beams.io/posts/git-commit

+ 16 - 7
vendor/github.com/opencontainers/runtime-spec/specs-go/config.go

@@ -60,7 +60,7 @@ type Process struct {
 	SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
 }
 
-// LinuxCapabilities specifies the whitelist of capabilities that are kept for a process.
+// LinuxCapabilities specifies the list of allowed capabilities that are kept for a process.
 // http://man7.org/linux/man-pages/man7/capabilities.7.html
 type LinuxCapabilities struct {
 	// Bounding is the set of capabilities checked by the kernel.
@@ -354,7 +354,7 @@ type LinuxRdma struct {
 
 // LinuxResources has container runtime resource constraints
 type LinuxResources struct {
-	// Devices configures the device whitelist.
+	// Devices configures the device allowlist.
 	Devices []LinuxDeviceCgroup `json:"devices,omitempty"`
 	// Memory restriction configuration
 	Memory *LinuxMemory `json:"memory,omitempty"`
@@ -372,6 +372,8 @@ type LinuxResources struct {
 	// Limits are a set of key value pairs that define RDMA resource limits,
 	// where the key is device name and value is resource limits.
 	Rdma map[string]LinuxRdma `json:"rdma,omitempty"`
+	// Unified resources.
+	Unified map[string]string `json:"unified,omitempty"`
 }
 
 // LinuxDevice represents the mknod information for a Linux special device file
@@ -392,7 +394,8 @@ type LinuxDevice struct {
 	GID *uint32 `json:"gid,omitempty"`
 }
 
-// LinuxDeviceCgroup represents a device rule for the whitelist controller
+// LinuxDeviceCgroup represents a device rule for the devices specified to
+// the device controller
 type LinuxDeviceCgroup struct {
 	// Allow or deny
 	Allow bool `json:"allow"`
@@ -595,10 +598,13 @@ type VMImage struct {
 
 // LinuxSeccomp represents syscall restrictions
 type LinuxSeccomp struct {
-	DefaultAction LinuxSeccompAction `json:"defaultAction"`
-	Architectures []Arch             `json:"architectures,omitempty"`
-	Flags         []LinuxSeccompFlag `json:"flags,omitempty"`
-	Syscalls      []LinuxSyscall     `json:"syscalls,omitempty"`
+	DefaultAction    LinuxSeccompAction `json:"defaultAction"`
+	DefaultErrnoRet  *uint              `json:"defaultErrnoRet,omitempty"`
+	Architectures    []Arch             `json:"architectures,omitempty"`
+	Flags            []LinuxSeccompFlag `json:"flags,omitempty"`
+	ListenerPath     string             `json:"listenerPath,omitempty"`
+	ListenerMetadata string             `json:"listenerMetadata,omitempty"`
+	Syscalls         []LinuxSyscall     `json:"syscalls,omitempty"`
 }
 
 // Arch used for additional architectures
@@ -628,6 +634,7 @@ const (
 	ArchS390X       Arch = "SCMP_ARCH_S390X"
 	ArchPARISC      Arch = "SCMP_ARCH_PARISC"
 	ArchPARISC64    Arch = "SCMP_ARCH_PARISC64"
+	ArchRISCV64     Arch = "SCMP_ARCH_RISCV64"
 )
 
 // LinuxSeccompAction taken upon Seccomp rule match
@@ -637,11 +644,13 @@ type LinuxSeccompAction string
 const (
 	ActKill        LinuxSeccompAction = "SCMP_ACT_KILL"
 	ActKillProcess LinuxSeccompAction = "SCMP_ACT_KILL_PROCESS"
+	ActKillThread  LinuxSeccompAction = "SCMP_ACT_KILL_THREAD"
 	ActTrap        LinuxSeccompAction = "SCMP_ACT_TRAP"
 	ActErrno       LinuxSeccompAction = "SCMP_ACT_ERRNO"
 	ActTrace       LinuxSeccompAction = "SCMP_ACT_TRACE"
 	ActAllow       LinuxSeccompAction = "SCMP_ACT_ALLOW"
 	ActLog         LinuxSeccompAction = "SCMP_ACT_LOG"
+	ActNotify      LinuxSeccompAction = "SCMP_ACT_NOTIFY"
 )
 
 // LinuxSeccompOperator used to match syscall arguments in Seccomp
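
The new seccomp fields track runc's seccomp-notify support: DefaultErrnoRet selects the errno returned by the default action, while SCMP_ACT_NOTIFY together with ListenerPath hands matching syscalls to a user-space agent. A sketch of a profile using them; the socket path and syscall choice are illustrative:

package main

import (
	"encoding/json"
	"fmt"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

func main() {
	eperm := uint(1) // EPERM on Linux
	prof := specs.LinuxSeccomp{
		DefaultAction:   specs.ActErrno,
		DefaultErrnoRet: &eperm, // errno returned by the default action
		Architectures:   []specs.Arch{specs.ArchX86_64},
		// Syscalls matching an SCMP_ACT_NOTIFY rule are forwarded to
		// the agent listening on this socket.
		ListenerPath: "/run/seccomp-agent.sock",
		Syscalls: []specs.LinuxSyscall{{
			Names:  []string{"mount"},
			Action: specs.ActNotify,
		}},
	}
	out, _ := json.MarshalIndent(prof, "", "  ")
	fmt.Println(string(out))
}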

+ 25 - 4
vendor/github.com/opencontainers/runtime-spec/specs-go/state.go

@@ -5,17 +5,17 @@ type ContainerState string
 
 const (
 	// StateCreating indicates that the container is being created
-	StateCreating ContainerState  = "creating"
+	StateCreating ContainerState = "creating"
 
 	// StateCreated indicates that the runtime has finished the create operation
-	StateCreated ContainerState  = "created"
+	StateCreated ContainerState = "created"
 
 	// StateRunning indicates that the container process has executed the
 	// user-specified program but has not exited
-	StateRunning ContainerState  = "running"
+	StateRunning ContainerState = "running"
 
 	// StateStopped indicates that the container process has exited
-	StateStopped ContainerState  = "stopped"
+	StateStopped ContainerState = "stopped"
 )
 
 // State holds information about the runtime state of the container.
@@ -33,3 +33,24 @@ type State struct {
 	// Annotations are key values associated with the container.
 	Annotations map[string]string `json:"annotations,omitempty"`
 }
+
+const (
+	// SeccompFdName is the name of the seccomp notify file descriptor.
+	SeccompFdName string = "seccompFd"
+)
+
+// ContainerProcessState holds information about the state of a container process.
+type ContainerProcessState struct {
+	// Version is the version of the specification that is supported.
+	Version string `json:"ociVersion"`
+	// Fds is a string array containing the names of the file descriptors passed.
+	// The index of the name in this array corresponds to index of the file
+	// descriptor in the `SCM_RIGHTS` array.
+	Fds []string `json:"fds"`
+	// Pid is the process ID as seen by the runtime.
+	Pid int `json:"pid"`
+	// Opaque metadata.
+	Metadata string `json:"metadata,omitempty"`
+	// State of the container.
+	State State `json:"state"`
+}
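
ContainerProcessState is the JSON payload a runtime sends to such a listener, alongside the seccomp file descriptor carried in the SCM_RIGHTS array. A sketch of building that payload, with the socket transport omitted and all identifiers illustrative:

package main

import (
	"encoding/json"
	"fmt"
	"os"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

func main() {
	st := specs.ContainerProcessState{
		Version: specs.Version,
		// One entry per fd in the SCM_RIGHTS array, in the same order.
		Fds:      []string{specs.SeccompFdName},
		Pid:      os.Getpid(),
		Metadata: "example-metadata",
		State: specs.State{
			Version: specs.Version,
			ID:      "demo-container",
			Status:  specs.StateRunning,
			Bundle:  "/run/bundle/demo",
		},
	}
	payload, _ := json.Marshal(st)
	fmt.Println(string(payload))
}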

+ 2 - 0
vendor/golang.org/x/net/README.md

@@ -1,5 +1,7 @@
 # Go Networking
 
+[![Go Reference](https://pkg.go.dev/badge/golang.org/x/net.svg)](https://pkg.go.dev/golang.org/x/net)
+
 This repository holds supplementary Go networking libraries.
 
 ## Download/Install

+ 3 - 3
vendor/golang.org/x/net/go.mod

@@ -3,7 +3,7 @@ module golang.org/x/net
 go 1.11
 
 require (
-	golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9
-	golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd
-	golang.org/x/text v0.3.0
+	golang.org/x/sys v0.0.0-20201119102817-f84b799fce68
+	golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1
+	golang.org/x/text v0.3.3
 )

+ 8 - 4
vendor/golang.org/x/net/http2/server.go

@@ -1694,6 +1694,7 @@ func (sc *serverConn) processData(f *DataFrame) error {
 		if len(data) > 0 {
 			wrote, err := st.body.Write(data)
 			if err != nil {
+				sc.sendWindowUpdate(nil, int(f.Length)-wrote)
 				return streamError(id, ErrCodeStreamClosed)
 			}
 			if wrote != len(data) {
@@ -2020,7 +2021,11 @@ func (sc *serverConn) newWriterAndRequest(st *stream, f *MetaHeadersFrame) (*res
 	}
 	if bodyOpen {
 		if vv, ok := rp.header["Content-Length"]; ok {
-			req.ContentLength, _ = strconv.ParseInt(vv[0], 10, 64)
+			if cl, err := strconv.ParseUint(vv[0], 10, 63); err == nil {
+				req.ContentLength = int64(cl)
+			} else {
+				req.ContentLength = 0
+			}
 		} else {
 			req.ContentLength = -1
 		}
@@ -2403,9 +2408,8 @@ func (rws *responseWriterState) writeChunk(p []byte) (n int, err error) {
 		var ctype, clen string
 		if clen = rws.snapHeader.Get("Content-Length"); clen != "" {
 			rws.snapHeader.Del("Content-Length")
-			clen64, err := strconv.ParseInt(clen, 10, 64)
-			if err == nil && clen64 >= 0 {
-				rws.sentContentLen = clen64
+			if cl, err := strconv.ParseUint(clen, 10, 63); err == nil {
+				rws.sentContentLen = int64(cl)
 			} else {
 				clen = ""
 			}
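
strconv.ParseUint with a bit size of 63, used here and again in transport.go below, accepts exactly the non-negative int64 range: a minus sign is a syntax error and anything above 2^63-1 is out of range, so the subsequent int64 conversion can never overflow. A quick illustration:

package main

import (
	"fmt"
	"strconv"
)

func main() {
	for _, s := range []string{"42", "-1", "9223372036854775807", "9223372036854775808"} {
		cl, err := strconv.ParseUint(s, 10, 63)
		// Only the first and third parse; the negative value and 2^63
		// are rejected, so int64(cl) is always safe.
		fmt.Printf("%-20s -> %d, %v\n", s, int64(cl), err)
	}
}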

+ 34 - 6
vendor/golang.org/x/net/http2/transport.go

@@ -154,12 +154,21 @@ func (t *Transport) pingTimeout() time.Duration {
 
 // ConfigureTransport configures a net/http HTTP/1 Transport to use HTTP/2.
 // It returns an error if t1 has already been HTTP/2-enabled.
+//
+// Use ConfigureTransports instead to configure the HTTP/2 Transport.
 func ConfigureTransport(t1 *http.Transport) error {
-	_, err := configureTransport(t1)
+	_, err := ConfigureTransports(t1)
 	return err
 }
 
-func configureTransport(t1 *http.Transport) (*Transport, error) {
+// ConfigureTransports configures a net/http HTTP/1 Transport to use HTTP/2.
+// It returns a new HTTP/2 Transport for further configuration.
+// It returns an error if t1 has already been HTTP/2-enabled.
+func ConfigureTransports(t1 *http.Transport) (*Transport, error) {
+	return configureTransports(t1)
+}
+
+func configureTransports(t1 *http.Transport) (*Transport, error) {
 	connPool := new(clientConnPool)
 	t2 := &Transport{
 		ConnPool: noDialClientConnPool{connPool},
@@ -689,6 +698,7 @@ func (t *Transport) newClientConn(c net.Conn, singleUse bool) (*ClientConn, erro
 	cc.inflow.add(transportDefaultConnFlow + initialWindowSize)
 	cc.bw.Flush()
 	if cc.werr != nil {
+		cc.Close()
 		return nil, cc.werr
 	}
 
@@ -1080,6 +1090,15 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 	bodyWriter := cc.t.getBodyWriterState(cs, body)
 	cs.on100 = bodyWriter.on100
 
+	defer func() {
+		cc.wmu.Lock()
+		werr := cc.werr
+		cc.wmu.Unlock()
+		if werr != nil {
+			cc.Close()
+		}
+	}()
+
 	cc.wmu.Lock()
 	endStream := !hasBody && !hasTrailers
 	werr := cc.writeHeaders(cs.ID, endStream, int(cc.maxFrameSize), hdrs)
@@ -1129,6 +1148,9 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 			// we can keep it.
 			bodyWriter.cancel()
 			cs.abortRequestBodyWrite(errStopReqBodyWrite)
+			if hasBody && !bodyWritten {
+				<-bodyWriter.resc
+			}
 		}
 		if re.err != nil {
 			cc.forgetStreamID(cs.ID)
@@ -1149,6 +1171,7 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 			} else {
 				bodyWriter.cancel()
 				cs.abortRequestBodyWrite(errStopReqBodyWriteAndCancel)
+				<-bodyWriter.resc
 			}
 			cc.forgetStreamID(cs.ID)
 			return nil, cs.getStartedWrite(), errTimeout
@@ -1158,6 +1181,7 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 			} else {
 				bodyWriter.cancel()
 				cs.abortRequestBodyWrite(errStopReqBodyWriteAndCancel)
+				<-bodyWriter.resc
 			}
 			cc.forgetStreamID(cs.ID)
 			return nil, cs.getStartedWrite(), ctx.Err()
@@ -1167,6 +1191,7 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 			} else {
 				bodyWriter.cancel()
 				cs.abortRequestBodyWrite(errStopReqBodyWriteAndCancel)
+				<-bodyWriter.resc
 			}
 			cc.forgetStreamID(cs.ID)
 			return nil, cs.getStartedWrite(), errRequestCanceled
@@ -1176,6 +1201,7 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 			// forgetStreamID.
 			return nil, cs.getStartedWrite(), cs.resetErr
 		case err := <-bodyWriter.resc:
+			bodyWritten = true
 			// Prefer the read loop's response, if available. Issue 16102.
 			select {
 			case re := <-readLoopResCh:
@@ -1186,7 +1212,6 @@ func (cc *ClientConn) roundTrip(req *http.Request) (res *http.Response, gotErrAf
 				cc.forgetStreamID(cs.ID)
 				return nil, cs.getStartedWrite(), err
 			}
-			bodyWritten = true
 			if d := cc.responseHeaderTimeout(); d != 0 {
 				timer := time.NewTimer(d)
 				defer timer.Stop()
@@ -2006,8 +2031,8 @@ func (rl *clientConnReadLoop) handleResponse(cs *clientStream, f *MetaHeadersFra
 	if !streamEnded || isHead {
 		res.ContentLength = -1
 		if clens := res.Header["Content-Length"]; len(clens) == 1 {
-			if clen64, err := strconv.ParseInt(clens[0], 10, 64); err == nil {
-				res.ContentLength = clen64
+			if cl, err := strconv.ParseUint(clens[0], 10, 63); err == nil {
+				res.ContentLength = int64(cl)
 			} else {
 				// TODO: care? unlike http/1, it won't mess up our framing, so it's
 				// more safe smuggling-wise to ignore.
@@ -2525,6 +2550,7 @@ func strSliceContains(ss []string, s string) bool {
 
 type erringRoundTripper struct{ err error }
 
+func (rt erringRoundTripper) RoundTripErr() error                             { return rt.err }
 func (rt erringRoundTripper) RoundTrip(*http.Request) (*http.Response, error) { return nil, rt.err }
 
 // gzipReader wraps a response body so it can lazily
@@ -2606,7 +2632,9 @@ func (t *Transport) getBodyWriterState(cs *clientStream, body io.Reader) (s body
 
 func (s bodyWriterState) cancel() {
 	if s.timer != nil {
-		s.timer.Stop()
+		if s.timer.Stop() {
+			s.resc <- nil
+		}
 	}
 }
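
ConfigureTransports (plural) returns the underlying *http2.Transport so HTTP/2-specific settings can be tuned after wiring it to an HTTP/1 transport. A usage sketch; the ReadIdleTimeout value is illustrative:

package main

import (
	"log"
	"net/http"
	"time"

	"golang.org/x/net/http2"
)

func main() {
	t1 := &http.Transport{}
	t2, err := http2.ConfigureTransports(t1)
	if err != nil {
		log.Fatal(err)
	}
	// HTTP/2-specific knobs live on the returned *http2.Transport;
	// ReadIdleTimeout enables health-check pings on idle connections.
	t2.ReadIdleTimeout = 30 * time.Second

	client := &http.Client{Transport: t1}
	resp, err := client.Get("https://example.com/")
	if err != nil {
		log.Fatal(err)
	}
	resp.Body.Close()
}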
 

+ 1 - 1
vendor/golang.org/x/net/idna/tables12.00.go → vendor/golang.org/x/net/idna/tables12.0.0.go

@@ -1,6 +1,6 @@
 // Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
 
-// +build go1.14
+// +build go1.14,!go1.16
 
 package idna
 

File diff suppressed because it is too large
+ 2394 - 0
vendor/golang.org/x/net/idna/tables13.0.0.go


+ 1 - 1
vendor/golang.org/x/net/internal/socket/cmsghdr.go

@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
 
 package socket
 

+ 13 - 3
vendor/golang.org/x/net/internal/socket/cmsghdr_stub.go

@@ -2,13 +2,23 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris
+// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!zos
 
 package socket
 
-type cmsghdr struct{}
+func controlHeaderLen() int {
+	return 0
+}
+
+func controlMessageLen(dataLen int) int {
+	return 0
+}
 
-const sizeofCmsghdr = 0
+func controlMessageSpace(dataLen int) int {
+	return 0
+}
+
+type cmsghdr struct{}
 
 func (h *cmsghdr) len() int { return 0 }
 func (h *cmsghdr) lvl() int { return 0 }

+ 21 - 0
vendor/golang.org/x/net/internal/socket/cmsghdr_unix.go

@@ -0,0 +1,21 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
+
+package socket
+
+import "golang.org/x/sys/unix"
+
+func controlHeaderLen() int {
+	return unix.CmsgLen(0)
+}
+
+func controlMessageLen(dataLen int) int {
+	return unix.CmsgLen(dataLen)
+}
+
+func controlMessageSpace(dataLen int) int {
+	return unix.CmsgSpace(dataLen)
+}
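
The stub's constant-based arithmetic gives way to the kernel-accurate helpers from golang.org/x/sys/unix. A quick Unix-only sketch of the distinction: CmsgLen is header plus payload, while CmsgSpace also rounds the payload up to the kernel's cmsg alignment and is what an out-of-band buffer should be sized with:

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	for _, n := range []int{0, 1, 4, 13} {
		fmt.Printf("data=%2d len=%2d space=%2d\n",
			n, unix.CmsgLen(n), unix.CmsgSpace(n))
	}
}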

+ 25 - 0
vendor/golang.org/x/net/internal/socket/cmsghdr_zos_s390x.go

@@ -0,0 +1,25 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package socket
+
+import "syscall"
+
+func (h *cmsghdr) set(l, lvl, typ int) {
+	h.Len = int32(l)
+	h.Level = int32(lvl)
+	h.Type = int32(typ)
+}
+
+func controlHeaderLen() int {
+	return syscall.CmsgLen(0)
+}
+
+func controlMessageLen(dataLen int) int {
+	return syscall.CmsgLen(dataLen)
+}
+
+func controlMessageSpace(dataLen int) int {
+	return syscall.CmsgSpace(dataLen)
+}

+ 1 - 1
vendor/golang.org/x/net/internal/socket/error_unix.go

@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos
 
 package socket
 

+ 1 - 1
vendor/golang.org/x/net/internal/socket/iovec_64bit.go

@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build arm64 amd64 ppc64 ppc64le mips64 mips64le riscv64 s390x
-// +build aix darwin dragonfly freebsd linux netbsd openbsd
+// +build aix darwin dragonfly freebsd linux netbsd openbsd zos
 
 package socket
 

+ 1 - 1
vendor/golang.org/x/net/internal/socket/iovec_stub.go

@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris
+// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!zos
 
 package socket
 

+ 1 - 1
vendor/golang.org/x/net/internal/socket/msghdr_stub.go

@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris
+// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!zos
 
 package socket
 

+ 36 - 0
vendor/golang.org/x/net/internal/socket/msghdr_zos_s390x.go

@@ -0,0 +1,36 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build s390x
+// +build zos
+
+package socket
+
+import "unsafe"
+
+func (h *msghdr) pack(vs []iovec, bs [][]byte, oob []byte, sa []byte) {
+	for i := range vs {
+		vs[i].set(bs[i])
+	}
+	if len(vs) > 0 {
+		h.Iov = &vs[0]
+		h.Iovlen = int32(len(vs))
+	}
+	if len(oob) > 0 {
+		h.Control = (*byte)(unsafe.Pointer(&oob[0]))
+		h.Controllen = uint32(len(oob))
+	}
+	if sa != nil {
+		h.Name = (*byte)(unsafe.Pointer(&sa[0]))
+		h.Namelen = uint32(len(sa))
+	}
+}
+
+func (h *msghdr) controllen() int {
+	return int(h.Controllen)
+}
+
+func (h *msghdr) flags() int {
+	return int(h.Flags)
+}

+ 4 - 3
vendor/golang.org/x/net/internal/socket/rawconn_msg.go

@@ -2,12 +2,13 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris windows zos
 
 package socket
 
 import (
 	"os"
+	"runtime"
 	"syscall"
 )
 
@@ -24,7 +25,7 @@ func (c *Conn) recvMsg(m *Message, flags int) error {
 	var n int
 	fn := func(s uintptr) bool {
 		n, operr = recvmsg(s, &h, flags)
-		if operr == syscall.EAGAIN {
+		if operr == syscall.EAGAIN || (runtime.GOOS == "zos" && operr == syscall.EWOULDBLOCK) {
 			return false
 		}
 		return true
@@ -61,7 +62,7 @@ func (c *Conn) sendMsg(m *Message, flags int) error {
 	var n int
 	fn := func(s uintptr) bool {
 		n, operr = sendmsg(s, &h, flags)
-		if operr == syscall.EAGAIN {
+		if operr == syscall.EAGAIN || (runtime.GOOS == "zos" && operr == syscall.EWOULDBLOCK) {
 			return false
 		}
 		return true

+ 1 - 1
vendor/golang.org/x/net/internal/socket/rawconn_nomsg.go

@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!windows
+// +build !aix,!darwin,!dragonfly,!freebsd,!linux,!netbsd,!openbsd,!solaris,!windows,!zos
 
 package socket
 

+ 1 - 9
vendor/golang.org/x/net/internal/socket/socket.go

@@ -90,17 +90,9 @@ func (o *Option) SetInt(c *Conn, v int) error {
 	return o.set(c, b)
 }
 
-func controlHeaderLen() int {
-	return roundup(sizeofCmsghdr)
-}
-
-func controlMessageLen(dataLen int) int {
-	return roundup(sizeofCmsghdr) + dataLen
-}
-
 // ControlMessageSpace returns the whole length of control message.
 func ControlMessageSpace(dataLen int) int {
-	return roundup(sizeofCmsghdr) + roundup(dataLen)
+	return controlMessageSpace(dataLen)
 }
 
 // A ControlMessage represents the head message in a stream of control

+ 2 - 12
vendor/golang.org/x/net/internal/socket/sys.go

@@ -9,13 +9,8 @@ import (
 	"unsafe"
 )
 
-var (
-	// NativeEndian is the machine native endian implementation of
-	// ByteOrder.
-	NativeEndian binary.ByteOrder
-
-	kernelAlign int
-)
+// NativeEndian is the machine native endian implementation of ByteOrder.
+var NativeEndian binary.ByteOrder
 
 func init() {
 	i := uint32(1)
@@ -25,9 +20,4 @@ func init() {
 	} else {
 		NativeEndian = binary.BigEndian
 	}
-	kernelAlign = probeProtocolStack()
-}
-
-func roundup(l int) int {
-	return (l + kernelAlign - 1) &^ (kernelAlign - 1)
 }

+ 0 - 23
vendor/golang.org/x/net/internal/socket/sys_bsdvar.go

@@ -1,23 +0,0 @@
-// Copyright 2017 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build aix freebsd netbsd openbsd
-
-package socket
-
-import (
-	"runtime"
-	"unsafe"
-)
-
-func probeProtocolStack() int {
-	if (runtime.GOOS == "netbsd" || runtime.GOOS == "openbsd") && runtime.GOARCH == "arm" {
-		return 8
-	}
-	if runtime.GOOS == "aix" {
-		return 1
-	}
-	var p uintptr
-	return int(unsafe.Sizeof(p))
-}

Some files were not shown because too many files changed in this diff