123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394 |
- package link
- import (
- "bytes"
- "errors"
- "fmt"
- "os"
- "path/filepath"
- "runtime"
- "strconv"
- "strings"
- "unsafe"
- "github.com/cilium/ebpf"
- "github.com/cilium/ebpf/asm"
- "github.com/cilium/ebpf/internal"
- "github.com/cilium/ebpf/internal/sys"
- "github.com/cilium/ebpf/internal/unix"
- )
- // Getting the terminology right is usually the hardest part. For posterity and
- // for staying sane during implementation:
- //
- // - trace event: Representation of a kernel runtime hook. Filesystem entries
- // under <tracefs>/events. Can be tracepoints (static), kprobes or uprobes.
- // Can be instantiated into perf events (see below).
- // - tracepoint: A predetermined hook point in the kernel. Exposed as trace
- // events in (sub)directories under <tracefs>/events. Cannot be closed or
- // removed, they are static.
- // - k(ret)probe: Ephemeral trace events based on entry or exit points of
- // exported kernel symbols. kprobe-based (tracefs) trace events can be
- // created system-wide by writing to the <tracefs>/kprobe_events file, or
- // they can be scoped to the current process by creating PMU perf events.
- // - u(ret)probe: Ephemeral trace events based on user-provided ELF binaries
- // and offsets. uprobe-based (tracefs) trace events can be
- // created system-wide by writing to the <tracefs>/uprobe_events file, or
- // they can be scoped to the current process by creating PMU perf events.
- // - perf event: An object instantiated based on an existing trace event or
- // kernel symbol. Referred to by fd in userspace.
- // Exactly one eBPF program can be attached to a perf event. Multiple perf
- // events can be created from a single trace event. Closing a perf event
- // stops any further invocations of the attached eBPF program.
// tracefsPath is the directory the trace event files are read from. It is
// used as the base path when looking up trace event IDs.
var tracefsPath = "/sys/kernel/debug/tracing"

// errInvalidInput is wrapped into errors returned for malformed caller input,
// e.g. a file path that attempts to escape its base directory.
var errInvalidInput = errors.New("invalid input")

// perfAllThreads is the sentinel value -1.
// NOTE(review): presumably passed as the pid argument when opening perf events
// to select all threads; confirm against the callers.
const perfAllThreads = -1
// perfEventType identifies the kind of trace event a perf event was created
// from. It determines which program types may be attached to the perf event.
type perfEventType uint8

const (
	// tracepointEvent is a perf event based on a static kernel tracepoint.
	tracepointEvent perfEventType = iota
	// kprobeEvent is a perf event based on a kprobe.
	kprobeEvent
	// kretprobeEvent is a perf event based on a kretprobe.
	kretprobeEvent
	// uprobeEvent is a perf event based on a uprobe.
	uprobeEvent
	// uretprobeEvent is a perf event based on a uretprobe.
	uretprobeEvent
)
- // A perfEvent represents a perf event kernel object. Exactly one eBPF program
- // can be attached to it. It is created based on a tracefs trace event or a
- // Performance Monitoring Unit (PMU).
- type perfEvent struct {
- // The event type determines the types of programs that can be attached.
- typ perfEventType
- // Group and name of the tracepoint/kprobe/uprobe.
- group string
- name string
- // PMU event ID read from sysfs. Valid IDs are non-zero.
- pmuID uint64
- // ID of the trace event read from tracefs. Valid IDs are non-zero.
- tracefsID uint64
- // User provided arbitrary value.
- cookie uint64
- // This is the perf event FD.
- fd *sys.FD
- }
- func (pe *perfEvent) Close() error {
- if err := pe.fd.Close(); err != nil {
- return fmt.Errorf("closing perf event fd: %w", err)
- }
- switch pe.typ {
- case kprobeEvent, kretprobeEvent:
- // Clean up kprobe tracefs entry.
- if pe.tracefsID != 0 {
- return closeTraceFSProbeEvent(kprobeType, pe.group, pe.name)
- }
- case uprobeEvent, uretprobeEvent:
- // Clean up uprobe tracefs entry.
- if pe.tracefsID != 0 {
- return closeTraceFSProbeEvent(uprobeType, pe.group, pe.name)
- }
- case tracepointEvent:
- // Tracepoint trace events don't hold any extra resources.
- return nil
- }
- return nil
- }
- // perfEventLink represents a bpf perf link.
- type perfEventLink struct {
- RawLink
- pe *perfEvent
- }
- func (pl *perfEventLink) isLink() {}
- // Pinning requires the underlying perf event FD to stay open.
- //
- // | PerfEvent FD | BpfLink FD | Works |
- // |--------------|------------|-------|
- // | Open | Open | Yes |
- // | Closed | Open | No |
- // | Open | Closed | No (Pin() -> EINVAL) |
- // | Closed | Closed | No (Pin() -> EINVAL) |
- //
- // There is currently no pretty way to recover the perf event FD
- // when loading a pinned link, so leave as not supported for now.
- func (pl *perfEventLink) Pin(string) error {
- return fmt.Errorf("perf event link pin: %w", ErrNotSupported)
- }
- func (pl *perfEventLink) Unpin() error {
- return fmt.Errorf("perf event link unpin: %w", ErrNotSupported)
- }
- func (pl *perfEventLink) Close() error {
- if err := pl.pe.Close(); err != nil {
- return fmt.Errorf("perf event link close: %w", err)
- }
- return pl.fd.Close()
- }
- func (pl *perfEventLink) Update(prog *ebpf.Program) error {
- return fmt.Errorf("perf event link update: %w", ErrNotSupported)
- }
- // perfEventIoctl implements Link and handles the perf event lifecycle
- // via ioctl().
- type perfEventIoctl struct {
- *perfEvent
- }
- func (pi *perfEventIoctl) isLink() {}
- // Since 4.15 (e87c6bc3852b "bpf: permit multiple bpf attachments for a single perf event"),
- // calling PERF_EVENT_IOC_SET_BPF appends the given program to a prog_array
- // owned by the perf event, which means multiple programs can be attached
- // simultaneously.
- //
- // Before 4.15, calling PERF_EVENT_IOC_SET_BPF more than once on a perf event
- // returns EEXIST.
- //
- // Detaching a program from a perf event is currently not possible, so a
- // program replacement mechanism cannot be implemented for perf events.
- func (pi *perfEventIoctl) Update(prog *ebpf.Program) error {
- return fmt.Errorf("perf event ioctl update: %w", ErrNotSupported)
- }
- func (pi *perfEventIoctl) Pin(string) error {
- return fmt.Errorf("perf event ioctl pin: %w", ErrNotSupported)
- }
- func (pi *perfEventIoctl) Unpin() error {
- return fmt.Errorf("perf event ioctl unpin: %w", ErrNotSupported)
- }
- func (pi *perfEventIoctl) Info() (*Info, error) {
- return nil, fmt.Errorf("perf event ioctl info: %w", ErrNotSupported)
- }
- // attach the given eBPF prog to the perf event stored in pe.
- // pe must contain a valid perf event fd.
- // prog's type must match the program type stored in pe.
- func attachPerfEvent(pe *perfEvent, prog *ebpf.Program) (Link, error) {
- if prog == nil {
- return nil, errors.New("cannot attach a nil program")
- }
- if prog.FD() < 0 {
- return nil, fmt.Errorf("invalid program: %w", sys.ErrClosedFd)
- }
- switch pe.typ {
- case kprobeEvent, kretprobeEvent, uprobeEvent, uretprobeEvent:
- if t := prog.Type(); t != ebpf.Kprobe {
- return nil, fmt.Errorf("invalid program type (expected %s): %s", ebpf.Kprobe, t)
- }
- case tracepointEvent:
- if t := prog.Type(); t != ebpf.TracePoint {
- return nil, fmt.Errorf("invalid program type (expected %s): %s", ebpf.TracePoint, t)
- }
- default:
- return nil, fmt.Errorf("unknown perf event type: %d", pe.typ)
- }
- if err := haveBPFLinkPerfEvent(); err == nil {
- return attachPerfEventLink(pe, prog)
- }
- return attachPerfEventIoctl(pe, prog)
- }
- func attachPerfEventIoctl(pe *perfEvent, prog *ebpf.Program) (*perfEventIoctl, error) {
- if pe.cookie != 0 {
- return nil, fmt.Errorf("cookies are not supported: %w", ErrNotSupported)
- }
- // Assign the eBPF program to the perf event.
- err := unix.IoctlSetInt(pe.fd.Int(), unix.PERF_EVENT_IOC_SET_BPF, prog.FD())
- if err != nil {
- return nil, fmt.Errorf("setting perf event bpf program: %w", err)
- }
- // PERF_EVENT_IOC_ENABLE and _DISABLE ignore their given values.
- if err := unix.IoctlSetInt(pe.fd.Int(), unix.PERF_EVENT_IOC_ENABLE, 0); err != nil {
- return nil, fmt.Errorf("enable perf event: %s", err)
- }
- pi := &perfEventIoctl{pe}
- // Close the perf event when its reference is lost to avoid leaking system resources.
- runtime.SetFinalizer(pi, (*perfEventIoctl).Close)
- return pi, nil
- }
- // Use the bpf api to attach the perf event (BPF_LINK_TYPE_PERF_EVENT, 5.15+).
- //
- // https://github.com/torvalds/linux/commit/b89fbfbb854c9afc3047e8273cc3a694650b802e
- func attachPerfEventLink(pe *perfEvent, prog *ebpf.Program) (*perfEventLink, error) {
- fd, err := sys.LinkCreatePerfEvent(&sys.LinkCreatePerfEventAttr{
- ProgFd: uint32(prog.FD()),
- TargetFd: pe.fd.Uint(),
- AttachType: sys.BPF_PERF_EVENT,
- BpfCookie: pe.cookie,
- })
- if err != nil {
- return nil, fmt.Errorf("cannot create bpf perf link: %v", err)
- }
- pl := &perfEventLink{RawLink{fd: fd}, pe}
- // Close the perf event when its reference is lost to avoid leaking system resources.
- runtime.SetFinalizer(pl, (*perfEventLink).Close)
- return pl, nil
- }
- // unsafeStringPtr returns an unsafe.Pointer to a NUL-terminated copy of str.
- func unsafeStringPtr(str string) (unsafe.Pointer, error) {
- p, err := unix.BytePtrFromString(str)
- if err != nil {
- return nil, err
- }
- return unsafe.Pointer(p), nil
- }
- // getTraceEventID reads a trace event's ID from tracefs given its group and name.
- // The kernel requires group and name to be alphanumeric or underscore.
- //
- // name automatically has its invalid symbols converted to underscores so the caller
- // can pass a raw symbol name, e.g. a kernel symbol containing dots.
- func getTraceEventID(group, name string) (uint64, error) {
- name = sanitizeSymbol(name)
- tid, err := uint64FromFile(tracefsPath, "events", group, name, "id")
- if errors.Is(err, os.ErrNotExist) {
- return 0, fmt.Errorf("trace event %s/%s: %w", group, name, os.ErrNotExist)
- }
- if err != nil {
- return 0, fmt.Errorf("reading trace event ID of %s/%s: %w", group, name, err)
- }
- return tid, nil
- }
- // getPMUEventType reads a Performance Monitoring Unit's type (numeric identifier)
- // from /sys/bus/event_source/devices/<pmu>/type.
- //
- // Returns ErrNotSupported if the pmu type is not supported.
- func getPMUEventType(typ probeType) (uint64, error) {
- et, err := uint64FromFile("/sys/bus/event_source/devices", typ.String(), "type")
- if errors.Is(err, os.ErrNotExist) {
- return 0, fmt.Errorf("pmu type %s: %w", typ, ErrNotSupported)
- }
- if err != nil {
- return 0, fmt.Errorf("reading pmu type %s: %w", typ, err)
- }
- return et, nil
- }
- // openTracepointPerfEvent opens a tracepoint-type perf event. System-wide
- // [k,u]probes created by writing to <tracefs>/[k,u]probe_events are tracepoints
- // behind the scenes, and can be attached to using these perf events.
- func openTracepointPerfEvent(tid uint64, pid int) (*sys.FD, error) {
- attr := unix.PerfEventAttr{
- Type: unix.PERF_TYPE_TRACEPOINT,
- Config: tid,
- Sample_type: unix.PERF_SAMPLE_RAW,
- Sample: 1,
- Wakeup: 1,
- }
- fd, err := unix.PerfEventOpen(&attr, pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC)
- if err != nil {
- return nil, fmt.Errorf("opening tracepoint perf event: %w", err)
- }
- return sys.NewFD(fd)
- }
- // uint64FromFile reads a uint64 from a file. All elements of path are sanitized
- // and joined onto base. Returns error if base no longer prefixes the path after
- // joining all components.
- func uint64FromFile(base string, path ...string) (uint64, error) {
- l := filepath.Join(path...)
- p := filepath.Join(base, l)
- if !strings.HasPrefix(p, base) {
- return 0, fmt.Errorf("path '%s' attempts to escape base path '%s': %w", l, base, errInvalidInput)
- }
- data, err := os.ReadFile(p)
- if err != nil {
- return 0, fmt.Errorf("reading file %s: %w", p, err)
- }
- et := bytes.TrimSpace(data)
- return strconv.ParseUint(string(et), 10, 64)
- }
- // Probe BPF perf link.
- //
- // https://elixir.bootlin.com/linux/v5.16.8/source/kernel/bpf/syscall.c#L4307
- // https://github.com/torvalds/linux/commit/b89fbfbb854c9afc3047e8273cc3a694650b802e
- var haveBPFLinkPerfEvent = internal.FeatureTest("bpf_link_perf_event", "5.15", func() error {
- prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
- Name: "probe_bpf_perf_link",
- Type: ebpf.Kprobe,
- Instructions: asm.Instructions{
- asm.Mov.Imm(asm.R0, 0),
- asm.Return(),
- },
- License: "MIT",
- })
- if err != nil {
- return err
- }
- defer prog.Close()
- _, err = sys.LinkCreatePerfEvent(&sys.LinkCreatePerfEventAttr{
- ProgFd: uint32(prog.FD()),
- AttachType: sys.BPF_PERF_EVENT,
- })
- if errors.Is(err, unix.EINVAL) {
- return internal.ErrNotSupported
- }
- if errors.Is(err, unix.EBADF) {
- return nil
- }
- return err
- })
// isValidTraceID reports whether s matches "^[a-zA-Z_][0-9a-zA-Z_]*$"
// without using a regular expression.
//
// Trace event groups, names and kernel symbols must adhere to this set
// of characters: s must be non-empty, must not start with a digit, and may
// only consist of ASCII letters, digits and underscores.
func isValidTraceID(s string) bool {
	if s == "" {
		return false
	}

	for i := 0; i < len(s); i++ {
		c := s[i]

		isLetter := ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
		if isLetter || c == '_' {
			continue
		}

		// Digits are allowed everywhere except in the first position.
		isDigit := '0' <= c && c <= '9'
		if isDigit && i > 0 {
			continue
		}

		return false
	}

	return true
}
|