summaryrefslogtreecommitdiff
path: root/vendor/github.com/cilium/ebpf/link/kprobe.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/cilium/ebpf/link/kprobe.go')
-rw-r--r--vendor/github.com/cilium/ebpf/link/kprobe.go568
1 files changed, 568 insertions, 0 deletions
diff --git a/vendor/github.com/cilium/ebpf/link/kprobe.go b/vendor/github.com/cilium/ebpf/link/kprobe.go
new file mode 100644
index 000000000..fdf622a0c
--- /dev/null
+++ b/vendor/github.com/cilium/ebpf/link/kprobe.go
@@ -0,0 +1,568 @@
+package link
+
+import (
+ "bytes"
+ "crypto/rand"
+ "errors"
+ "fmt"
+ "os"
+ "path/filepath"
+ "runtime"
+ "strings"
+ "sync"
+ "syscall"
+ "unsafe"
+
+ "github.com/cilium/ebpf"
+ "github.com/cilium/ebpf/internal/sys"
+ "github.com/cilium/ebpf/internal/unix"
+)
+
+var (
+ kprobeEventsPath = filepath.Join(tracefsPath, "kprobe_events")
+
+ kprobeRetprobeBit = struct {
+ once sync.Once
+ value uint64
+ err error
+ }{}
+)
+
+type probeType uint8
+
+type probeArgs struct {
+ symbol, group, path string
+ offset, refCtrOffset, cookie uint64
+ pid int
+ ret bool
+}
+
+// KprobeOptions defines additional parameters that will be used
+// when loading Kprobes.
+type KprobeOptions struct {
+ // Arbitrary value that can be fetched from an eBPF program
+ // via `bpf_get_attach_cookie()`.
+ //
+ // Needs kernel 5.15+.
+ Cookie uint64
+ // Offset of the kprobe relative to the traced symbol.
+ // Can be used to insert kprobes at arbitrary offsets in kernel functions,
+ // e.g. in places where functions have been inlined.
+ Offset uint64
+}
+
+const (
+ kprobeType probeType = iota
+ uprobeType
+)
+
+func (pt probeType) String() string {
+ if pt == kprobeType {
+ return "kprobe"
+ }
+ return "uprobe"
+}
+
+func (pt probeType) EventsPath() string {
+ if pt == kprobeType {
+ return kprobeEventsPath
+ }
+ return uprobeEventsPath
+}
+
+func (pt probeType) PerfEventType(ret bool) perfEventType {
+ if pt == kprobeType {
+ if ret {
+ return kretprobeEvent
+ }
+ return kprobeEvent
+ }
+ if ret {
+ return uretprobeEvent
+ }
+ return uprobeEvent
+}
+
+func (pt probeType) RetprobeBit() (uint64, error) {
+ if pt == kprobeType {
+ return kretprobeBit()
+ }
+ return uretprobeBit()
+}
+
+// Kprobe attaches the given eBPF program to a perf event that fires when the
+// given kernel symbol starts executing. See /proc/kallsyms for available
+// symbols. For example, printk():
+//
+// kp, err := Kprobe("printk", prog, nil)
+//
+// Losing the reference to the resulting Link (kp) will close the Kprobe
+// and prevent further execution of prog. The Link must be Closed during
+// program shutdown to avoid leaking system resources.
+func Kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) {
+ k, err := kprobe(symbol, prog, opts, false)
+ if err != nil {
+ return nil, err
+ }
+
+ lnk, err := attachPerfEvent(k, prog)
+ if err != nil {
+ k.Close()
+ return nil, err
+ }
+
+ return lnk, nil
+}
+
+// Kretprobe attaches the given eBPF program to a perf event that fires right
+// before the given kernel symbol exits, with the function stack left intact.
+// See /proc/kallsyms for available symbols. For example, printk():
+//
+// kp, err := Kretprobe("printk", prog, nil)
+//
+// Losing the reference to the resulting Link (kp) will close the Kretprobe
+// and prevent further execution of prog. The Link must be Closed during
+// program shutdown to avoid leaking system resources.
+func Kretprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) {
+ k, err := kprobe(symbol, prog, opts, true)
+ if err != nil {
+ return nil, err
+ }
+
+ lnk, err := attachPerfEvent(k, prog)
+ if err != nil {
+ k.Close()
+ return nil, err
+ }
+
+ return lnk, nil
+}
+
+// isValidKprobeSymbol implements the equivalent of a regex match
+// against "^[a-zA-Z_][0-9a-zA-Z_.]*$".
+func isValidKprobeSymbol(s string) bool {
+ if len(s) < 1 {
+ return false
+ }
+
+ for i, c := range []byte(s) {
+ switch {
+ case c >= 'a' && c <= 'z':
+ case c >= 'A' && c <= 'Z':
+ case c == '_':
+ case i > 0 && c >= '0' && c <= '9':
+
+ // Allow `.` in symbol name. GCC-compiled kernel may change symbol name
+ // to have a `.isra.$n` suffix, like `udp_send_skb.isra.52`.
+ // See: https://gcc.gnu.org/gcc-10/changes.html
+ case i > 0 && c == '.':
+
+ default:
+ return false
+ }
+ }
+
+ return true
+}
+
+// kprobe opens a perf event on the given symbol and attaches prog to it.
+// If ret is true, create a kretprobe.
+func kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions, ret bool) (*perfEvent, error) {
+ if symbol == "" {
+ return nil, fmt.Errorf("symbol name cannot be empty: %w", errInvalidInput)
+ }
+ if prog == nil {
+ return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput)
+ }
+ if !isValidKprobeSymbol(symbol) {
+ return nil, fmt.Errorf("symbol '%s' must be a valid symbol in /proc/kallsyms: %w", symbol, errInvalidInput)
+ }
+ if prog.Type() != ebpf.Kprobe {
+ return nil, fmt.Errorf("eBPF program type %s is not a Kprobe: %w", prog.Type(), errInvalidInput)
+ }
+
+ args := probeArgs{
+ pid: perfAllThreads,
+ symbol: symbol,
+ ret: ret,
+ }
+
+ if opts != nil {
+ args.cookie = opts.Cookie
+ args.offset = opts.Offset
+ }
+
+ // Use kprobe PMU if the kernel has it available.
+ tp, err := pmuKprobe(args)
+ if errors.Is(err, os.ErrNotExist) {
+ args.symbol = platformPrefix(symbol)
+ tp, err = pmuKprobe(args)
+ }
+ if err == nil {
+ return tp, nil
+ }
+ if err != nil && !errors.Is(err, ErrNotSupported) {
+ return nil, fmt.Errorf("creating perf_kprobe PMU: %w", err)
+ }
+
+ // Use tracefs if kprobe PMU is missing.
+ args.symbol = symbol
+ tp, err = tracefsKprobe(args)
+ if errors.Is(err, os.ErrNotExist) {
+ args.symbol = platformPrefix(symbol)
+ tp, err = tracefsKprobe(args)
+ }
+ if err != nil {
+ return nil, fmt.Errorf("creating trace event '%s' in tracefs: %w", symbol, err)
+ }
+
+ return tp, nil
+}
+
+// pmuKprobe opens a perf event based on the kprobe PMU.
+// Returns os.ErrNotExist if the given symbol does not exist in the kernel.
+func pmuKprobe(args probeArgs) (*perfEvent, error) {
+ return pmuProbe(kprobeType, args)
+}
+
+// pmuProbe opens a perf event based on a Performance Monitoring Unit.
+//
+// Requires at least a 4.17 kernel.
+// e12f03d7031a "perf/core: Implement the 'perf_kprobe' PMU"
+// 33ea4b24277b "perf/core: Implement the 'perf_uprobe' PMU"
+//
+// Returns ErrNotSupported if the kernel doesn't support perf_[k,u]probe PMU
+func pmuProbe(typ probeType, args probeArgs) (*perfEvent, error) {
+ // Getting the PMU type will fail if the kernel doesn't support
+ // the perf_[k,u]probe PMU.
+ et, err := getPMUEventType(typ)
+ if err != nil {
+ return nil, err
+ }
+
+ var config uint64
+ if args.ret {
+ bit, err := typ.RetprobeBit()
+ if err != nil {
+ return nil, err
+ }
+ config |= 1 << bit
+ }
+
+ var (
+ attr unix.PerfEventAttr
+ sp unsafe.Pointer
+ )
+ switch typ {
+ case kprobeType:
+ // Create a pointer to a NUL-terminated string for the kernel.
+ sp, err = unsafeStringPtr(args.symbol)
+ if err != nil {
+ return nil, err
+ }
+
+ attr = unix.PerfEventAttr{
+ // The minimum size required for PMU kprobes is PERF_ATTR_SIZE_VER1,
+ // since it added the config2 (Ext2) field. Use Ext2 as probe_offset.
+ Size: unix.PERF_ATTR_SIZE_VER1,
+ Type: uint32(et), // PMU event type read from sysfs
+ Ext1: uint64(uintptr(sp)), // Kernel symbol to trace
+ Ext2: args.offset, // Kernel symbol offset
+ Config: config, // Retprobe flag
+ }
+ case uprobeType:
+ sp, err = unsafeStringPtr(args.path)
+ if err != nil {
+ return nil, err
+ }
+
+ if args.refCtrOffset != 0 {
+ config |= args.refCtrOffset << uprobeRefCtrOffsetShift
+ }
+
+ attr = unix.PerfEventAttr{
+ // The minimum size required for PMU uprobes is PERF_ATTR_SIZE_VER1,
+ // since it added the config2 (Ext2) field. The Size field controls the
+ // size of the internal buffer the kernel allocates for reading the
+ // perf_event_attr argument from userspace.
+ Size: unix.PERF_ATTR_SIZE_VER1,
+ Type: uint32(et), // PMU event type read from sysfs
+ Ext1: uint64(uintptr(sp)), // Uprobe path
+ Ext2: args.offset, // Uprobe offset
+ Config: config, // RefCtrOffset, Retprobe flag
+ }
+ }
+
+ rawFd, err := unix.PerfEventOpen(&attr, args.pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC)
+
+ // On some old kernels, kprobe PMU doesn't allow `.` in symbol names and
+ // return -EINVAL. Return ErrNotSupported to allow falling back to tracefs.
+ // https://github.com/torvalds/linux/blob/94710cac0ef4/kernel/trace/trace_kprobe.c#L340-L343
+ if errors.Is(err, unix.EINVAL) && strings.Contains(args.symbol, ".") {
+ return nil, fmt.Errorf("symbol '%s+%#x': older kernels don't accept dots: %w", args.symbol, args.offset, ErrNotSupported)
+ }
+ // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL
+ // when trying to create a kretprobe for a missing symbol. Make sure ENOENT
+ // is returned to the caller.
+ if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) {
+ return nil, fmt.Errorf("symbol '%s+%#x' not found: %w", args.symbol, args.offset, os.ErrNotExist)
+ }
+ // Since commit ab105a4fb894, -EILSEQ is returned when a kprobe sym+offset is resolved
+ // to an invalid insn boundary.
+ if errors.Is(err, syscall.EILSEQ) {
+ return nil, fmt.Errorf("symbol '%s+%#x' not found (bad insn boundary): %w", args.symbol, args.offset, os.ErrNotExist)
+ }
+ // Since at least commit cb9a19fe4aa51, ENOTSUPP is returned
+ // when attempting to set a uprobe on a trap instruction.
+ if errors.Is(err, unix.ENOTSUPP) {
+ return nil, fmt.Errorf("failed setting uprobe on offset %#x (possible trap insn): %w", args.offset, err)
+ }
+ if err != nil {
+ return nil, fmt.Errorf("opening perf event: %w", err)
+ }
+
+ // Ensure the string pointer is not collected before PerfEventOpen returns.
+ runtime.KeepAlive(sp)
+
+ fd, err := sys.NewFD(rawFd)
+ if err != nil {
+ return nil, err
+ }
+
+ // Kernel has perf_[k,u]probe PMU available, initialize perf event.
+ return &perfEvent{
+ typ: typ.PerfEventType(args.ret),
+ name: args.symbol,
+ pmuID: et,
+ cookie: args.cookie,
+ fd: fd,
+ }, nil
+}
+
+// tracefsKprobe creates a Kprobe tracefs entry.
+func tracefsKprobe(args probeArgs) (*perfEvent, error) {
+ return tracefsProbe(kprobeType, args)
+}
+
+// tracefsProbe creates a trace event by writing an entry to <tracefs>/[k,u]probe_events.
+// A new trace event group name is generated on every call to support creating
+// multiple trace events for the same kernel or userspace symbol.
+// Path and offset are only set in the case of uprobe(s) and are used to set
+// the executable/library path on the filesystem and the offset where the probe is inserted.
+// A perf event is then opened on the newly-created trace event and returned to the caller.
+func tracefsProbe(typ probeType, args probeArgs) (_ *perfEvent, err error) {
+ // Generate a random string for each trace event we attempt to create.
+ // This value is used as the 'group' token in tracefs to allow creating
+ // multiple kprobe trace events with the same name.
+ group, err := randomGroup("ebpf")
+ if err != nil {
+ return nil, fmt.Errorf("randomizing group name: %w", err)
+ }
+ args.group = group
+
+ // Before attempting to create a trace event through tracefs,
+ // check if an event with the same group and name already exists.
+ // Kernels 4.x and earlier don't return os.ErrExist on writing a duplicate
+ // entry, so we need to rely on reads for detecting uniqueness.
+ _, err = getTraceEventID(group, args.symbol)
+ if err == nil {
+ return nil, fmt.Errorf("trace event already exists: %s/%s", group, args.symbol)
+ }
+ if err != nil && !errors.Is(err, os.ErrNotExist) {
+ return nil, fmt.Errorf("checking trace event %s/%s: %w", group, args.symbol, err)
+ }
+
+ // Create the [k,u]probe trace event using tracefs.
+ if err := createTraceFSProbeEvent(typ, args); err != nil {
+ return nil, fmt.Errorf("creating probe entry on tracefs: %w", err)
+ }
+ defer func() {
+ if err != nil {
+ // Make sure we clean up the created tracefs event when we return error.
+ // If a livepatch handler is already active on the symbol, the write to
+ // tracefs will succeed, a trace event will show up, but creating the
+ // perf event will fail with EBUSY.
+ _ = closeTraceFSProbeEvent(typ, args.group, args.symbol)
+ }
+ }()
+
+ // Get the newly-created trace event's id.
+ tid, err := getTraceEventID(group, args.symbol)
+ if err != nil {
+ return nil, fmt.Errorf("getting trace event id: %w", err)
+ }
+
+ // Kprobes are ephemeral tracepoints and share the same perf event type.
+ fd, err := openTracepointPerfEvent(tid, args.pid)
+ if err != nil {
+ return nil, err
+ }
+
+ return &perfEvent{
+ typ: typ.PerfEventType(args.ret),
+ group: group,
+ name: args.symbol,
+ tracefsID: tid,
+ cookie: args.cookie,
+ fd: fd,
+ }, nil
+}
+
+// createTraceFSProbeEvent creates a new ephemeral trace event by writing to
+// <tracefs>/[k,u]probe_events. Returns os.ErrNotExist if symbol is not a valid
+// kernel symbol, or if it is not traceable with kprobes. Returns os.ErrExist
+// if a probe with the same group and symbol already exists.
+func createTraceFSProbeEvent(typ probeType, args probeArgs) error {
+ // Open the kprobe_events file in tracefs.
+ f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666)
+ if err != nil {
+ return fmt.Errorf("error opening '%s': %w", typ.EventsPath(), err)
+ }
+ defer f.Close()
+
+ var pe, token string
+ switch typ {
+ case kprobeType:
+ // The kprobe_events syntax is as follows (see Documentation/trace/kprobetrace.txt):
+ // p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe
+ // r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe
+ // -:[GRP/]EVENT : Clear a probe
+ //
+ // Some examples:
+ // r:ebpf_1234/r_my_kretprobe nf_conntrack_destroy
+ // p:ebpf_5678/p_my_kprobe __x64_sys_execve
+ //
+ // Leaving the kretprobe's MAXACTIVE set to 0 (or absent) will make the
+ // kernel default to NR_CPUS. This is desired in most eBPF cases since
+ // subsampling or rate limiting logic can be more accurately implemented in
+ // the eBPF program itself.
+ // See Documentation/kprobes.txt for more details.
+ token = kprobeToken(args)
+ pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(args.ret), args.group, sanitizeSymbol(args.symbol), token)
+ case uprobeType:
+ // The uprobe_events syntax is as follows:
+ // p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a probe
+ // r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return probe
+ // -:[GRP/]EVENT : Clear a probe
+ //
+ // Some examples:
+ // r:ebpf_1234/readline /bin/bash:0x12345
+ // p:ebpf_5678/main_mySymbol /bin/mybin:0x12345(0x123)
+ //
+ // See Documentation/trace/uprobetracer.txt for more details.
+ token = uprobeToken(args)
+ pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(args.ret), args.group, args.symbol, token)
+ }
+ _, err = f.WriteString(pe)
+ // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL
+ // when trying to create a kretprobe for a missing symbol. Make sure ENOENT
+ // is returned to the caller.
+ // EINVAL is also returned on pre-5.2 kernels when the `SYM[+offs]` token
+ // is resolved to an invalid insn boundary.
+ if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) {
+ return fmt.Errorf("token %s: %w", token, os.ErrNotExist)
+ }
+ // Since commit ab105a4fb894, -EILSEQ is returned when a kprobe sym+offset is resolved
+ // to an invalid insn boundary.
+ if errors.Is(err, syscall.EILSEQ) {
+ return fmt.Errorf("token %s: bad insn boundary: %w", token, os.ErrNotExist)
+ }
+ // ERANGE is returned when the `SYM[+offs]` token is too big and cannot
+ // be resolved.
+ if errors.Is(err, syscall.ERANGE) {
+ return fmt.Errorf("token %s: offset too big: %w", token, os.ErrNotExist)
+ }
+ if err != nil {
+ return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err)
+ }
+
+ return nil
+}
+
+// closeTraceFSProbeEvent removes the [k,u]probe with the given type, group and symbol
+// from <tracefs>/[k,u]probe_events.
+func closeTraceFSProbeEvent(typ probeType, group, symbol string) error {
+ f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666)
+ if err != nil {
+ return fmt.Errorf("error opening %s: %w", typ.EventsPath(), err)
+ }
+ defer f.Close()
+
+ // See [k,u]probe_events syntax above. The probe type does not need to be specified
+ // for removals.
+ pe := fmt.Sprintf("-:%s/%s", group, sanitizeSymbol(symbol))
+ if _, err = f.WriteString(pe); err != nil {
+ return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err)
+ }
+
+ return nil
+}
+
+// randomGroup generates a pseudorandom string for use as a tracefs group name.
+// Returns an error when the output string would exceed 63 characters (kernel
+// limitation), when rand.Read() fails or when prefix contains characters not
+// allowed by isValidTraceID.
+func randomGroup(prefix string) (string, error) {
+ if !isValidTraceID(prefix) {
+ return "", fmt.Errorf("prefix '%s' must be alphanumeric or underscore: %w", prefix, errInvalidInput)
+ }
+
+ b := make([]byte, 8)
+ if _, err := rand.Read(b); err != nil {
+ return "", fmt.Errorf("reading random bytes: %w", err)
+ }
+
+ group := fmt.Sprintf("%s_%x", prefix, b)
+ if len(group) > 63 {
+ return "", fmt.Errorf("group name '%s' cannot be longer than 63 characters: %w", group, errInvalidInput)
+ }
+
+ return group, nil
+}
+
+func probePrefix(ret bool) string {
+ if ret {
+ return "r"
+ }
+ return "p"
+}
+
+// determineRetprobeBit reads a Performance Monitoring Unit's retprobe bit
+// from /sys/bus/event_source/devices/<pmu>/format/retprobe.
+func determineRetprobeBit(typ probeType) (uint64, error) {
+ p := filepath.Join("/sys/bus/event_source/devices/", typ.String(), "/format/retprobe")
+
+ data, err := os.ReadFile(p)
+ if err != nil {
+ return 0, err
+ }
+
+ var rp uint64
+ n, err := fmt.Sscanf(string(bytes.TrimSpace(data)), "config:%d", &rp)
+ if err != nil {
+ return 0, fmt.Errorf("parse retprobe bit: %w", err)
+ }
+ if n != 1 {
+ return 0, fmt.Errorf("parse retprobe bit: expected 1 item, got %d", n)
+ }
+
+ return rp, nil
+}
+
+func kretprobeBit() (uint64, error) {
+ kprobeRetprobeBit.once.Do(func() {
+ kprobeRetprobeBit.value, kprobeRetprobeBit.err = determineRetprobeBit(kprobeType)
+ })
+ return kprobeRetprobeBit.value, kprobeRetprobeBit.err
+}
+
+// kprobeToken creates the SYM[+offs] token for the tracefs api.
+func kprobeToken(args probeArgs) string {
+ po := args.symbol
+
+ if args.offset != 0 {
+ po += fmt.Sprintf("+%#x", args.offset)
+ }
+
+ return po
+}