perf_event.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394
  1. package link
  2. import (
  3. "bytes"
  4. "errors"
  5. "fmt"
  6. "os"
  7. "path/filepath"
  8. "runtime"
  9. "strconv"
  10. "strings"
  11. "unsafe"
  12. "github.com/cilium/ebpf"
  13. "github.com/cilium/ebpf/asm"
  14. "github.com/cilium/ebpf/internal"
  15. "github.com/cilium/ebpf/internal/sys"
  16. "github.com/cilium/ebpf/internal/unix"
  17. )
  18. // Getting the terminology right is usually the hardest part. For posterity and
  19. // for staying sane during implementation:
  20. //
  21. // - trace event: Representation of a kernel runtime hook. Filesystem entries
  22. // under <tracefs>/events. Can be tracepoints (static), kprobes or uprobes.
  23. // Can be instantiated into perf events (see below).
  24. // - tracepoint: A predetermined hook point in the kernel. Exposed as trace
  25. // events in (sub)directories under <tracefs>/events. Cannot be closed or
  26. // removed, they are static.
  27. // - k(ret)probe: Ephemeral trace events based on entry or exit points of
  28. // exported kernel symbols. kprobe-based (tracefs) trace events can be
  29. // created system-wide by writing to the <tracefs>/kprobe_events file, or
  30. // they can be scoped to the current process by creating PMU perf events.
  31. // - u(ret)probe: Ephemeral trace events based on user-provided ELF binaries
  32. // and offsets. uprobe-based (tracefs) trace events can be
  33. // created system-wide by writing to the <tracefs>/uprobe_events file, or
  34. // they can be scoped to the current process by creating PMU perf events.
  35. // - perf event: An object instantiated based on an existing trace event or
  36. // kernel symbol. Referred to by fd in userspace.
  37. // Exactly one eBPF program can be attached to a perf event. Multiple perf
  38. // events can be created from a single trace event. Closing a perf event
  39. // stops any further invocations of the attached eBPF program.
  // tracefsPath is the directory trace event IDs are read from. It is the
  // base path handed to uint64FromFile when resolving <tracefs>/events entries.
  var tracefsPath = "/sys/kernel/debug/tracing"

  // errInvalidInput is wrapped by errors that reject caller-supplied input,
  // such as a path that tries to escape its base directory.
  var errInvalidInput = errors.New("invalid input")
  // perfAllThreads is the sentinel value -1.
  // NOTE(review): presumably passed as the pid when opening a perf event that
  // is not scoped to a single process; confirm against the callers.
  const perfAllThreads = -1
  // perfEventType identifies the kind of trace event a perfEvent was created
  // from. It decides which program type may be attached and which tracefs
  // entry, if any, is removed when the event is closed.
  type perfEventType uint8

  // The perf event kinds known to this package, numbered from zero.
  const (
  	// tracepointEvent is a static tracepoint.
  	tracepointEvent perfEventType = iota
  	// kprobeEvent is a kprobe.
  	kprobeEvent
  	// kretprobeEvent is a kretprobe.
  	kretprobeEvent
  	// uprobeEvent is a uprobe.
  	uprobeEvent
  	// uretprobeEvent is a uretprobe.
  	uretprobeEvent
  )
  55. // A perfEvent represents a perf event kernel object. Exactly one eBPF program
  56. // can be attached to it. It is created based on a tracefs trace event or a
  57. // Performance Monitoring Unit (PMU).
  58. type perfEvent struct {
  59. // The event type determines the types of programs that can be attached.
  60. typ perfEventType
  61. // Group and name of the tracepoint/kprobe/uprobe.
  62. group string
  63. name string
  64. // PMU event ID read from sysfs. Valid IDs are non-zero.
  65. pmuID uint64
  66. // ID of the trace event read from tracefs. Valid IDs are non-zero.
  67. tracefsID uint64
  68. // User provided arbitrary value.
  69. cookie uint64
  70. // This is the perf event FD.
  71. fd *sys.FD
  72. }
  73. func (pe *perfEvent) Close() error {
  74. if err := pe.fd.Close(); err != nil {
  75. return fmt.Errorf("closing perf event fd: %w", err)
  76. }
  77. switch pe.typ {
  78. case kprobeEvent, kretprobeEvent:
  79. // Clean up kprobe tracefs entry.
  80. if pe.tracefsID != 0 {
  81. return closeTraceFSProbeEvent(kprobeType, pe.group, pe.name)
  82. }
  83. case uprobeEvent, uretprobeEvent:
  84. // Clean up uprobe tracefs entry.
  85. if pe.tracefsID != 0 {
  86. return closeTraceFSProbeEvent(uprobeType, pe.group, pe.name)
  87. }
  88. case tracepointEvent:
  89. // Tracepoint trace events don't hold any extra resources.
  90. return nil
  91. }
  92. return nil
  93. }
  94. // perfEventLink represents a bpf perf link.
  95. type perfEventLink struct {
  96. RawLink
  97. pe *perfEvent
  98. }
  99. func (pl *perfEventLink) isLink() {}
  100. // Pinning requires the underlying perf event FD to stay open.
  101. //
  102. // | PerfEvent FD | BpfLink FD | Works |
  103. // |--------------|------------|-------|
  104. // | Open | Open | Yes |
  105. // | Closed | Open | No |
  106. // | Open | Closed | No (Pin() -> EINVAL) |
  107. // | Closed | Closed | No (Pin() -> EINVAL) |
  108. //
  109. // There is currently no pretty way to recover the perf event FD
  110. // when loading a pinned link, so leave as not supported for now.
  111. func (pl *perfEventLink) Pin(string) error {
  112. return fmt.Errorf("perf event link pin: %w", ErrNotSupported)
  113. }
  114. func (pl *perfEventLink) Unpin() error {
  115. return fmt.Errorf("perf event link unpin: %w", ErrNotSupported)
  116. }
  117. func (pl *perfEventLink) Close() error {
  118. if err := pl.pe.Close(); err != nil {
  119. return fmt.Errorf("perf event link close: %w", err)
  120. }
  121. return pl.fd.Close()
  122. }
  123. func (pl *perfEventLink) Update(prog *ebpf.Program) error {
  124. return fmt.Errorf("perf event link update: %w", ErrNotSupported)
  125. }
  126. // perfEventIoctl implements Link and handles the perf event lifecycle
  127. // via ioctl().
  128. type perfEventIoctl struct {
  129. *perfEvent
  130. }
  131. func (pi *perfEventIoctl) isLink() {}
  132. // Since 4.15 (e87c6bc3852b "bpf: permit multiple bpf attachments for a single perf event"),
  133. // calling PERF_EVENT_IOC_SET_BPF appends the given program to a prog_array
  134. // owned by the perf event, which means multiple programs can be attached
  135. // simultaneously.
  136. //
  137. // Before 4.15, calling PERF_EVENT_IOC_SET_BPF more than once on a perf event
  138. // returns EEXIST.
  139. //
  140. // Detaching a program from a perf event is currently not possible, so a
  141. // program replacement mechanism cannot be implemented for perf events.
  142. func (pi *perfEventIoctl) Update(prog *ebpf.Program) error {
  143. return fmt.Errorf("perf event ioctl update: %w", ErrNotSupported)
  144. }
  145. func (pi *perfEventIoctl) Pin(string) error {
  146. return fmt.Errorf("perf event ioctl pin: %w", ErrNotSupported)
  147. }
  148. func (pi *perfEventIoctl) Unpin() error {
  149. return fmt.Errorf("perf event ioctl unpin: %w", ErrNotSupported)
  150. }
  151. func (pi *perfEventIoctl) Info() (*Info, error) {
  152. return nil, fmt.Errorf("perf event ioctl info: %w", ErrNotSupported)
  153. }
  154. // attach the given eBPF prog to the perf event stored in pe.
  155. // pe must contain a valid perf event fd.
  156. // prog's type must match the program type stored in pe.
  157. func attachPerfEvent(pe *perfEvent, prog *ebpf.Program) (Link, error) {
  158. if prog == nil {
  159. return nil, errors.New("cannot attach a nil program")
  160. }
  161. if prog.FD() < 0 {
  162. return nil, fmt.Errorf("invalid program: %w", sys.ErrClosedFd)
  163. }
  164. switch pe.typ {
  165. case kprobeEvent, kretprobeEvent, uprobeEvent, uretprobeEvent:
  166. if t := prog.Type(); t != ebpf.Kprobe {
  167. return nil, fmt.Errorf("invalid program type (expected %s): %s", ebpf.Kprobe, t)
  168. }
  169. case tracepointEvent:
  170. if t := prog.Type(); t != ebpf.TracePoint {
  171. return nil, fmt.Errorf("invalid program type (expected %s): %s", ebpf.TracePoint, t)
  172. }
  173. default:
  174. return nil, fmt.Errorf("unknown perf event type: %d", pe.typ)
  175. }
  176. if err := haveBPFLinkPerfEvent(); err == nil {
  177. return attachPerfEventLink(pe, prog)
  178. }
  179. return attachPerfEventIoctl(pe, prog)
  180. }
  181. func attachPerfEventIoctl(pe *perfEvent, prog *ebpf.Program) (*perfEventIoctl, error) {
  182. if pe.cookie != 0 {
  183. return nil, fmt.Errorf("cookies are not supported: %w", ErrNotSupported)
  184. }
  185. // Assign the eBPF program to the perf event.
  186. err := unix.IoctlSetInt(pe.fd.Int(), unix.PERF_EVENT_IOC_SET_BPF, prog.FD())
  187. if err != nil {
  188. return nil, fmt.Errorf("setting perf event bpf program: %w", err)
  189. }
  190. // PERF_EVENT_IOC_ENABLE and _DISABLE ignore their given values.
  191. if err := unix.IoctlSetInt(pe.fd.Int(), unix.PERF_EVENT_IOC_ENABLE, 0); err != nil {
  192. return nil, fmt.Errorf("enable perf event: %s", err)
  193. }
  194. pi := &perfEventIoctl{pe}
  195. // Close the perf event when its reference is lost to avoid leaking system resources.
  196. runtime.SetFinalizer(pi, (*perfEventIoctl).Close)
  197. return pi, nil
  198. }
  199. // Use the bpf api to attach the perf event (BPF_LINK_TYPE_PERF_EVENT, 5.15+).
  200. //
  201. // https://github.com/torvalds/linux/commit/b89fbfbb854c9afc3047e8273cc3a694650b802e
  202. func attachPerfEventLink(pe *perfEvent, prog *ebpf.Program) (*perfEventLink, error) {
  203. fd, err := sys.LinkCreatePerfEvent(&sys.LinkCreatePerfEventAttr{
  204. ProgFd: uint32(prog.FD()),
  205. TargetFd: pe.fd.Uint(),
  206. AttachType: sys.BPF_PERF_EVENT,
  207. BpfCookie: pe.cookie,
  208. })
  209. if err != nil {
  210. return nil, fmt.Errorf("cannot create bpf perf link: %v", err)
  211. }
  212. pl := &perfEventLink{RawLink{fd: fd}, pe}
  213. // Close the perf event when its reference is lost to avoid leaking system resources.
  214. runtime.SetFinalizer(pl, (*perfEventLink).Close)
  215. return pl, nil
  216. }
  217. // unsafeStringPtr returns an unsafe.Pointer to a NUL-terminated copy of str.
  218. func unsafeStringPtr(str string) (unsafe.Pointer, error) {
  219. p, err := unix.BytePtrFromString(str)
  220. if err != nil {
  221. return nil, err
  222. }
  223. return unsafe.Pointer(p), nil
  224. }
  225. // getTraceEventID reads a trace event's ID from tracefs given its group and name.
  226. // The kernel requires group and name to be alphanumeric or underscore.
  227. //
  228. // name automatically has its invalid symbols converted to underscores so the caller
  229. // can pass a raw symbol name, e.g. a kernel symbol containing dots.
  230. func getTraceEventID(group, name string) (uint64, error) {
  231. name = sanitizeSymbol(name)
  232. tid, err := uint64FromFile(tracefsPath, "events", group, name, "id")
  233. if errors.Is(err, os.ErrNotExist) {
  234. return 0, fmt.Errorf("trace event %s/%s: %w", group, name, os.ErrNotExist)
  235. }
  236. if err != nil {
  237. return 0, fmt.Errorf("reading trace event ID of %s/%s: %w", group, name, err)
  238. }
  239. return tid, nil
  240. }
  241. // getPMUEventType reads a Performance Monitoring Unit's type (numeric identifier)
  242. // from /sys/bus/event_source/devices/<pmu>/type.
  243. //
  244. // Returns ErrNotSupported if the pmu type is not supported.
  245. func getPMUEventType(typ probeType) (uint64, error) {
  246. et, err := uint64FromFile("/sys/bus/event_source/devices", typ.String(), "type")
  247. if errors.Is(err, os.ErrNotExist) {
  248. return 0, fmt.Errorf("pmu type %s: %w", typ, ErrNotSupported)
  249. }
  250. if err != nil {
  251. return 0, fmt.Errorf("reading pmu type %s: %w", typ, err)
  252. }
  253. return et, nil
  254. }
  255. // openTracepointPerfEvent opens a tracepoint-type perf event. System-wide
  256. // [k,u]probes created by writing to <tracefs>/[k,u]probe_events are tracepoints
  257. // behind the scenes, and can be attached to using these perf events.
  258. func openTracepointPerfEvent(tid uint64, pid int) (*sys.FD, error) {
  259. attr := unix.PerfEventAttr{
  260. Type: unix.PERF_TYPE_TRACEPOINT,
  261. Config: tid,
  262. Sample_type: unix.PERF_SAMPLE_RAW,
  263. Sample: 1,
  264. Wakeup: 1,
  265. }
  266. fd, err := unix.PerfEventOpen(&attr, pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC)
  267. if err != nil {
  268. return nil, fmt.Errorf("opening tracepoint perf event: %w", err)
  269. }
  270. return sys.NewFD(fd)
  271. }
  272. // uint64FromFile reads a uint64 from a file. All elements of path are sanitized
  273. // and joined onto base. Returns error if base no longer prefixes the path after
  274. // joining all components.
  275. func uint64FromFile(base string, path ...string) (uint64, error) {
  276. l := filepath.Join(path...)
  277. p := filepath.Join(base, l)
  278. if !strings.HasPrefix(p, base) {
  279. return 0, fmt.Errorf("path '%s' attempts to escape base path '%s': %w", l, base, errInvalidInput)
  280. }
  281. data, err := os.ReadFile(p)
  282. if err != nil {
  283. return 0, fmt.Errorf("reading file %s: %w", p, err)
  284. }
  285. et := bytes.TrimSpace(data)
  286. return strconv.ParseUint(string(et), 10, 64)
  287. }
  288. // Probe BPF perf link.
  289. //
  290. // https://elixir.bootlin.com/linux/v5.16.8/source/kernel/bpf/syscall.c#L4307
  291. // https://github.com/torvalds/linux/commit/b89fbfbb854c9afc3047e8273cc3a694650b802e
  292. var haveBPFLinkPerfEvent = internal.FeatureTest("bpf_link_perf_event", "5.15", func() error {
  293. prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
  294. Name: "probe_bpf_perf_link",
  295. Type: ebpf.Kprobe,
  296. Instructions: asm.Instructions{
  297. asm.Mov.Imm(asm.R0, 0),
  298. asm.Return(),
  299. },
  300. License: "MIT",
  301. })
  302. if err != nil {
  303. return err
  304. }
  305. defer prog.Close()
  306. _, err = sys.LinkCreatePerfEvent(&sys.LinkCreatePerfEventAttr{
  307. ProgFd: uint32(prog.FD()),
  308. AttachType: sys.BPF_PERF_EVENT,
  309. })
  310. if errors.Is(err, unix.EINVAL) {
  311. return internal.ErrNotSupported
  312. }
  313. if errors.Is(err, unix.EBADF) {
  314. return nil
  315. }
  316. return err
  317. })
  // isValidTraceID reports whether s matches "^[a-zA-Z_][0-9a-zA-Z_]*$"
  // without using a regular expression.
  //
  // Trace event groups, names and kernel symbols must satisfy this: s must be
  // non-empty, must not start with a digit, and may only contain ASCII
  // letters, digits and underscores.
  func isValidTraceID(s string) bool {
  	if s == "" {
  		return false
  	}

  	for i := 0; i < len(s); i++ {
  		c := s[i]

  		letterOrUnderscore := ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_'
  		digit := '0' <= c && c <= '9'

  		// Digits are allowed everywhere except in the first position.
  		if letterOrUnderscore || (digit && i > 0) {
  			continue
  		}
  		return false
  	}
  	return true
  }