kprobe.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568
  1. package link
  2. import (
  3. "bytes"
  4. "crypto/rand"
  5. "errors"
  6. "fmt"
  7. "os"
  8. "path/filepath"
  9. "runtime"
  10. "strings"
  11. "sync"
  12. "syscall"
  13. "unsafe"
  14. "github.com/cilium/ebpf"
  15. "github.com/cilium/ebpf/internal/sys"
  16. "github.com/cilium/ebpf/internal/unix"
  17. )
  18. var (
  19. kprobeEventsPath = filepath.Join(tracefsPath, "kprobe_events")
  20. kprobeRetprobeBit = struct {
  21. once sync.Once
  22. value uint64
  23. err error
  24. }{}
  25. )
// probeType selects between the two probe flavours handled in this file:
// kernel probes (kprobeType) and userspace probes (uprobeType).
type probeType uint8
  27. type probeArgs struct {
  28. symbol, group, path string
  29. offset, refCtrOffset, cookie uint64
  30. pid int
  31. ret bool
  32. }
// KprobeOptions defines additional parameters that will be used
// when loading Kprobes.
//
// Kprobe and Kretprobe accept a nil *KprobeOptions, in which case Cookie and
// Offset are both treated as zero.
type KprobeOptions struct {
	// Cookie is an arbitrary value that can be fetched from an eBPF program
	// via `bpf_get_attach_cookie()`.
	//
	// Needs kernel 5.15+.
	Cookie uint64
	// Offset is the offset of the kprobe relative to the traced symbol.
	// Can be used to insert kprobes at arbitrary offsets in kernel functions,
	// e.g. in places where functions have been inlined.
	Offset uint64
}
const (
	// kprobeType identifies kernel probes. Being the first iota value, it is
	// also the zero value of probeType.
	kprobeType probeType = iota
	// uprobeType identifies userspace probes.
	uprobeType
)
  50. func (pt probeType) String() string {
  51. if pt == kprobeType {
  52. return "kprobe"
  53. }
  54. return "uprobe"
  55. }
  56. func (pt probeType) EventsPath() string {
  57. if pt == kprobeType {
  58. return kprobeEventsPath
  59. }
  60. return uprobeEventsPath
  61. }
  62. func (pt probeType) PerfEventType(ret bool) perfEventType {
  63. if pt == kprobeType {
  64. if ret {
  65. return kretprobeEvent
  66. }
  67. return kprobeEvent
  68. }
  69. if ret {
  70. return uretprobeEvent
  71. }
  72. return uprobeEvent
  73. }
  74. func (pt probeType) RetprobeBit() (uint64, error) {
  75. if pt == kprobeType {
  76. return kretprobeBit()
  77. }
  78. return uretprobeBit()
  79. }
  80. // Kprobe attaches the given eBPF program to a perf event that fires when the
  81. // given kernel symbol starts executing. See /proc/kallsyms for available
  82. // symbols. For example, printk():
  83. //
  84. // kp, err := Kprobe("printk", prog, nil)
  85. //
  86. // Losing the reference to the resulting Link (kp) will close the Kprobe
  87. // and prevent further execution of prog. The Link must be Closed during
  88. // program shutdown to avoid leaking system resources.
  89. func Kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) {
  90. k, err := kprobe(symbol, prog, opts, false)
  91. if err != nil {
  92. return nil, err
  93. }
  94. lnk, err := attachPerfEvent(k, prog)
  95. if err != nil {
  96. k.Close()
  97. return nil, err
  98. }
  99. return lnk, nil
  100. }
  101. // Kretprobe attaches the given eBPF program to a perf event that fires right
  102. // before the given kernel symbol exits, with the function stack left intact.
  103. // See /proc/kallsyms for available symbols. For example, printk():
  104. //
  105. // kp, err := Kretprobe("printk", prog, nil)
  106. //
  107. // Losing the reference to the resulting Link (kp) will close the Kretprobe
  108. // and prevent further execution of prog. The Link must be Closed during
  109. // program shutdown to avoid leaking system resources.
  110. func Kretprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) {
  111. k, err := kprobe(symbol, prog, opts, true)
  112. if err != nil {
  113. return nil, err
  114. }
  115. lnk, err := attachPerfEvent(k, prog)
  116. if err != nil {
  117. k.Close()
  118. return nil, err
  119. }
  120. return lnk, nil
  121. }
  122. // isValidKprobeSymbol implements the equivalent of a regex match
  123. // against "^[a-zA-Z_][0-9a-zA-Z_.]*$".
  124. func isValidKprobeSymbol(s string) bool {
  125. if len(s) < 1 {
  126. return false
  127. }
  128. for i, c := range []byte(s) {
  129. switch {
  130. case c >= 'a' && c <= 'z':
  131. case c >= 'A' && c <= 'Z':
  132. case c == '_':
  133. case i > 0 && c >= '0' && c <= '9':
  134. // Allow `.` in symbol name. GCC-compiled kernel may change symbol name
  135. // to have a `.isra.$n` suffix, like `udp_send_skb.isra.52`.
  136. // See: https://gcc.gnu.org/gcc-10/changes.html
  137. case i > 0 && c == '.':
  138. default:
  139. return false
  140. }
  141. }
  142. return true
  143. }
  144. // kprobe opens a perf event on the given symbol and attaches prog to it.
  145. // If ret is true, create a kretprobe.
  146. func kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions, ret bool) (*perfEvent, error) {
  147. if symbol == "" {
  148. return nil, fmt.Errorf("symbol name cannot be empty: %w", errInvalidInput)
  149. }
  150. if prog == nil {
  151. return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput)
  152. }
  153. if !isValidKprobeSymbol(symbol) {
  154. return nil, fmt.Errorf("symbol '%s' must be a valid symbol in /proc/kallsyms: %w", symbol, errInvalidInput)
  155. }
  156. if prog.Type() != ebpf.Kprobe {
  157. return nil, fmt.Errorf("eBPF program type %s is not a Kprobe: %w", prog.Type(), errInvalidInput)
  158. }
  159. args := probeArgs{
  160. pid: perfAllThreads,
  161. symbol: symbol,
  162. ret: ret,
  163. }
  164. if opts != nil {
  165. args.cookie = opts.Cookie
  166. args.offset = opts.Offset
  167. }
  168. // Use kprobe PMU if the kernel has it available.
  169. tp, err := pmuKprobe(args)
  170. if errors.Is(err, os.ErrNotExist) {
  171. args.symbol = platformPrefix(symbol)
  172. tp, err = pmuKprobe(args)
  173. }
  174. if err == nil {
  175. return tp, nil
  176. }
  177. if err != nil && !errors.Is(err, ErrNotSupported) {
  178. return nil, fmt.Errorf("creating perf_kprobe PMU: %w", err)
  179. }
  180. // Use tracefs if kprobe PMU is missing.
  181. args.symbol = symbol
  182. tp, err = tracefsKprobe(args)
  183. if errors.Is(err, os.ErrNotExist) {
  184. args.symbol = platformPrefix(symbol)
  185. tp, err = tracefsKprobe(args)
  186. }
  187. if err != nil {
  188. return nil, fmt.Errorf("creating trace event '%s' in tracefs: %w", symbol, err)
  189. }
  190. return tp, nil
  191. }
// pmuKprobe opens a perf event based on the kprobe PMU by calling pmuProbe
// with kprobeType.
// Returns os.ErrNotExist if the given symbol does not exist in the kernel.
func pmuKprobe(args probeArgs) (*perfEvent, error) {
	return pmuProbe(kprobeType, args)
}
  197. // pmuProbe opens a perf event based on a Performance Monitoring Unit.
  198. //
  199. // Requires at least a 4.17 kernel.
  200. // e12f03d7031a "perf/core: Implement the 'perf_kprobe' PMU"
  201. // 33ea4b24277b "perf/core: Implement the 'perf_uprobe' PMU"
  202. //
  203. // Returns ErrNotSupported if the kernel doesn't support perf_[k,u]probe PMU
  204. func pmuProbe(typ probeType, args probeArgs) (*perfEvent, error) {
  205. // Getting the PMU type will fail if the kernel doesn't support
  206. // the perf_[k,u]probe PMU.
  207. et, err := getPMUEventType(typ)
  208. if err != nil {
  209. return nil, err
  210. }
  211. var config uint64
  212. if args.ret {
  213. bit, err := typ.RetprobeBit()
  214. if err != nil {
  215. return nil, err
  216. }
  217. config |= 1 << bit
  218. }
  219. var (
  220. attr unix.PerfEventAttr
  221. sp unsafe.Pointer
  222. )
  223. switch typ {
  224. case kprobeType:
  225. // Create a pointer to a NUL-terminated string for the kernel.
  226. sp, err = unsafeStringPtr(args.symbol)
  227. if err != nil {
  228. return nil, err
  229. }
  230. attr = unix.PerfEventAttr{
  231. // The minimum size required for PMU kprobes is PERF_ATTR_SIZE_VER1,
  232. // since it added the config2 (Ext2) field. Use Ext2 as probe_offset.
  233. Size: unix.PERF_ATTR_SIZE_VER1,
  234. Type: uint32(et), // PMU event type read from sysfs
  235. Ext1: uint64(uintptr(sp)), // Kernel symbol to trace
  236. Ext2: args.offset, // Kernel symbol offset
  237. Config: config, // Retprobe flag
  238. }
  239. case uprobeType:
  240. sp, err = unsafeStringPtr(args.path)
  241. if err != nil {
  242. return nil, err
  243. }
  244. if args.refCtrOffset != 0 {
  245. config |= args.refCtrOffset << uprobeRefCtrOffsetShift
  246. }
  247. attr = unix.PerfEventAttr{
  248. // The minimum size required for PMU uprobes is PERF_ATTR_SIZE_VER1,
  249. // since it added the config2 (Ext2) field. The Size field controls the
  250. // size of the internal buffer the kernel allocates for reading the
  251. // perf_event_attr argument from userspace.
  252. Size: unix.PERF_ATTR_SIZE_VER1,
  253. Type: uint32(et), // PMU event type read from sysfs
  254. Ext1: uint64(uintptr(sp)), // Uprobe path
  255. Ext2: args.offset, // Uprobe offset
  256. Config: config, // RefCtrOffset, Retprobe flag
  257. }
  258. }
  259. rawFd, err := unix.PerfEventOpen(&attr, args.pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC)
  260. // On some old kernels, kprobe PMU doesn't allow `.` in symbol names and
  261. // return -EINVAL. Return ErrNotSupported to allow falling back to tracefs.
  262. // https://github.com/torvalds/linux/blob/94710cac0ef4/kernel/trace/trace_kprobe.c#L340-L343
  263. if errors.Is(err, unix.EINVAL) && strings.Contains(args.symbol, ".") {
  264. return nil, fmt.Errorf("symbol '%s+%#x': older kernels don't accept dots: %w", args.symbol, args.offset, ErrNotSupported)
  265. }
  266. // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL
  267. // when trying to create a kretprobe for a missing symbol. Make sure ENOENT
  268. // is returned to the caller.
  269. if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) {
  270. return nil, fmt.Errorf("symbol '%s+%#x' not found: %w", args.symbol, args.offset, os.ErrNotExist)
  271. }
  272. // Since commit ab105a4fb894, -EILSEQ is returned when a kprobe sym+offset is resolved
  273. // to an invalid insn boundary.
  274. if errors.Is(err, syscall.EILSEQ) {
  275. return nil, fmt.Errorf("symbol '%s+%#x' not found (bad insn boundary): %w", args.symbol, args.offset, os.ErrNotExist)
  276. }
  277. // Since at least commit cb9a19fe4aa51, ENOTSUPP is returned
  278. // when attempting to set a uprobe on a trap instruction.
  279. if errors.Is(err, unix.ENOTSUPP) {
  280. return nil, fmt.Errorf("failed setting uprobe on offset %#x (possible trap insn): %w", args.offset, err)
  281. }
  282. if err != nil {
  283. return nil, fmt.Errorf("opening perf event: %w", err)
  284. }
  285. // Ensure the string pointer is not collected before PerfEventOpen returns.
  286. runtime.KeepAlive(sp)
  287. fd, err := sys.NewFD(rawFd)
  288. if err != nil {
  289. return nil, err
  290. }
  291. // Kernel has perf_[k,u]probe PMU available, initialize perf event.
  292. return &perfEvent{
  293. typ: typ.PerfEventType(args.ret),
  294. name: args.symbol,
  295. pmuID: et,
  296. cookie: args.cookie,
  297. fd: fd,
  298. }, nil
  299. }
// tracefsKprobe creates a Kprobe tracefs entry and opens a perf event on it
// by calling tracefsProbe with kprobeType.
func tracefsKprobe(args probeArgs) (*perfEvent, error) {
	return tracefsProbe(kprobeType, args)
}
  304. // tracefsProbe creates a trace event by writing an entry to <tracefs>/[k,u]probe_events.
  305. // A new trace event group name is generated on every call to support creating
  306. // multiple trace events for the same kernel or userspace symbol.
  307. // Path and offset are only set in the case of uprobe(s) and are used to set
  308. // the executable/library path on the filesystem and the offset where the probe is inserted.
  309. // A perf event is then opened on the newly-created trace event and returned to the caller.
  310. func tracefsProbe(typ probeType, args probeArgs) (_ *perfEvent, err error) {
  311. // Generate a random string for each trace event we attempt to create.
  312. // This value is used as the 'group' token in tracefs to allow creating
  313. // multiple kprobe trace events with the same name.
  314. group, err := randomGroup("ebpf")
  315. if err != nil {
  316. return nil, fmt.Errorf("randomizing group name: %w", err)
  317. }
  318. args.group = group
  319. // Before attempting to create a trace event through tracefs,
  320. // check if an event with the same group and name already exists.
  321. // Kernels 4.x and earlier don't return os.ErrExist on writing a duplicate
  322. // entry, so we need to rely on reads for detecting uniqueness.
  323. _, err = getTraceEventID(group, args.symbol)
  324. if err == nil {
  325. return nil, fmt.Errorf("trace event already exists: %s/%s", group, args.symbol)
  326. }
  327. if err != nil && !errors.Is(err, os.ErrNotExist) {
  328. return nil, fmt.Errorf("checking trace event %s/%s: %w", group, args.symbol, err)
  329. }
  330. // Create the [k,u]probe trace event using tracefs.
  331. if err := createTraceFSProbeEvent(typ, args); err != nil {
  332. return nil, fmt.Errorf("creating probe entry on tracefs: %w", err)
  333. }
  334. defer func() {
  335. if err != nil {
  336. // Make sure we clean up the created tracefs event when we return error.
  337. // If a livepatch handler is already active on the symbol, the write to
  338. // tracefs will succeed, a trace event will show up, but creating the
  339. // perf event will fail with EBUSY.
  340. _ = closeTraceFSProbeEvent(typ, args.group, args.symbol)
  341. }
  342. }()
  343. // Get the newly-created trace event's id.
  344. tid, err := getTraceEventID(group, args.symbol)
  345. if err != nil {
  346. return nil, fmt.Errorf("getting trace event id: %w", err)
  347. }
  348. // Kprobes are ephemeral tracepoints and share the same perf event type.
  349. fd, err := openTracepointPerfEvent(tid, args.pid)
  350. if err != nil {
  351. return nil, err
  352. }
  353. return &perfEvent{
  354. typ: typ.PerfEventType(args.ret),
  355. group: group,
  356. name: args.symbol,
  357. tracefsID: tid,
  358. cookie: args.cookie,
  359. fd: fd,
  360. }, nil
  361. }
  362. // createTraceFSProbeEvent creates a new ephemeral trace event by writing to
  363. // <tracefs>/[k,u]probe_events. Returns os.ErrNotExist if symbol is not a valid
  364. // kernel symbol, or if it is not traceable with kprobes. Returns os.ErrExist
  365. // if a probe with the same group and symbol already exists.
  366. func createTraceFSProbeEvent(typ probeType, args probeArgs) error {
  367. // Open the kprobe_events file in tracefs.
  368. f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666)
  369. if err != nil {
  370. return fmt.Errorf("error opening '%s': %w", typ.EventsPath(), err)
  371. }
  372. defer f.Close()
  373. var pe, token string
  374. switch typ {
  375. case kprobeType:
  376. // The kprobe_events syntax is as follows (see Documentation/trace/kprobetrace.txt):
  377. // p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe
  378. // r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe
  379. // -:[GRP/]EVENT : Clear a probe
  380. //
  381. // Some examples:
  382. // r:ebpf_1234/r_my_kretprobe nf_conntrack_destroy
  383. // p:ebpf_5678/p_my_kprobe __x64_sys_execve
  384. //
  385. // Leaving the kretprobe's MAXACTIVE set to 0 (or absent) will make the
  386. // kernel default to NR_CPUS. This is desired in most eBPF cases since
  387. // subsampling or rate limiting logic can be more accurately implemented in
  388. // the eBPF program itself.
  389. // See Documentation/kprobes.txt for more details.
  390. token = kprobeToken(args)
  391. pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(args.ret), args.group, sanitizeSymbol(args.symbol), token)
  392. case uprobeType:
  393. // The uprobe_events syntax is as follows:
  394. // p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a probe
  395. // r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return probe
  396. // -:[GRP/]EVENT : Clear a probe
  397. //
  398. // Some examples:
  399. // r:ebpf_1234/readline /bin/bash:0x12345
  400. // p:ebpf_5678/main_mySymbol /bin/mybin:0x12345(0x123)
  401. //
  402. // See Documentation/trace/uprobetracer.txt for more details.
  403. token = uprobeToken(args)
  404. pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(args.ret), args.group, args.symbol, token)
  405. }
  406. _, err = f.WriteString(pe)
  407. // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL
  408. // when trying to create a kretprobe for a missing symbol. Make sure ENOENT
  409. // is returned to the caller.
  410. // EINVAL is also returned on pre-5.2 kernels when the `SYM[+offs]` token
  411. // is resolved to an invalid insn boundary.
  412. if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) {
  413. return fmt.Errorf("token %s: %w", token, os.ErrNotExist)
  414. }
  415. // Since commit ab105a4fb894, -EILSEQ is returned when a kprobe sym+offset is resolved
  416. // to an invalid insn boundary.
  417. if errors.Is(err, syscall.EILSEQ) {
  418. return fmt.Errorf("token %s: bad insn boundary: %w", token, os.ErrNotExist)
  419. }
  420. // ERANGE is returned when the `SYM[+offs]` token is too big and cannot
  421. // be resolved.
  422. if errors.Is(err, syscall.ERANGE) {
  423. return fmt.Errorf("token %s: offset too big: %w", token, os.ErrNotExist)
  424. }
  425. if err != nil {
  426. return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err)
  427. }
  428. return nil
  429. }
  430. // closeTraceFSProbeEvent removes the [k,u]probe with the given type, group and symbol
  431. // from <tracefs>/[k,u]probe_events.
  432. func closeTraceFSProbeEvent(typ probeType, group, symbol string) error {
  433. f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666)
  434. if err != nil {
  435. return fmt.Errorf("error opening %s: %w", typ.EventsPath(), err)
  436. }
  437. defer f.Close()
  438. // See [k,u]probe_events syntax above. The probe type does not need to be specified
  439. // for removals.
  440. pe := fmt.Sprintf("-:%s/%s", group, sanitizeSymbol(symbol))
  441. if _, err = f.WriteString(pe); err != nil {
  442. return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err)
  443. }
  444. return nil
  445. }
  446. // randomGroup generates a pseudorandom string for use as a tracefs group name.
  447. // Returns an error when the output string would exceed 63 characters (kernel
  448. // limitation), when rand.Read() fails or when prefix contains characters not
  449. // allowed by isValidTraceID.
  450. func randomGroup(prefix string) (string, error) {
  451. if !isValidTraceID(prefix) {
  452. return "", fmt.Errorf("prefix '%s' must be alphanumeric or underscore: %w", prefix, errInvalidInput)
  453. }
  454. b := make([]byte, 8)
  455. if _, err := rand.Read(b); err != nil {
  456. return "", fmt.Errorf("reading random bytes: %w", err)
  457. }
  458. group := fmt.Sprintf("%s_%x", prefix, b)
  459. if len(group) > 63 {
  460. return "", fmt.Errorf("group name '%s' cannot be longer than 63 characters: %w", group, errInvalidInput)
  461. }
  462. return group, nil
  463. }
  464. func probePrefix(ret bool) string {
  465. if ret {
  466. return "r"
  467. }
  468. return "p"
  469. }
  470. // determineRetprobeBit reads a Performance Monitoring Unit's retprobe bit
  471. // from /sys/bus/event_source/devices/<pmu>/format/retprobe.
  472. func determineRetprobeBit(typ probeType) (uint64, error) {
  473. p := filepath.Join("/sys/bus/event_source/devices/", typ.String(), "/format/retprobe")
  474. data, err := os.ReadFile(p)
  475. if err != nil {
  476. return 0, err
  477. }
  478. var rp uint64
  479. n, err := fmt.Sscanf(string(bytes.TrimSpace(data)), "config:%d", &rp)
  480. if err != nil {
  481. return 0, fmt.Errorf("parse retprobe bit: %w", err)
  482. }
  483. if n != 1 {
  484. return 0, fmt.Errorf("parse retprobe bit: expected 1 item, got %d", n)
  485. }
  486. return rp, nil
  487. }
  488. func kretprobeBit() (uint64, error) {
  489. kprobeRetprobeBit.once.Do(func() {
  490. kprobeRetprobeBit.value, kprobeRetprobeBit.err = determineRetprobeBit(kprobeType)
  491. })
  492. return kprobeRetprobeBit.value, kprobeRetprobeBit.err
  493. }
  494. // kprobeToken creates the SYM[+offs] token for the tracefs api.
  495. func kprobeToken(args probeArgs) string {
  496. po := args.symbol
  497. if args.offset != 0 {
  498. po += fmt.Sprintf("+%#x", args.offset)
  499. }
  500. return po
  501. }