diff options
| -rw-r--r-- | include/linux/bpf.h | 50 | ||||
| -rw-r--r-- | include/linux/trace_events.h | 6 | ||||
| -rw-r--r-- | include/trace/bpf_probe.h | 2 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 5 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 13 | ||||
| -rw-r--r-- | kernel/events/core.c | 9 | ||||
| -rw-r--r-- | kernel/trace/bpf_trace.c | 48 | ||||
| -rw-r--r-- | kernel/trace/trace_syscalls.c | 110 | ||||
| -rw-r--r-- | net/bpf/test_run.c | 65 | ||||
| -rw-r--r-- | tools/lib/bpf/libbpf.c | 88 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c | 142 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c | 112 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c | 18 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/verifier/sleepable.c | 17 |
14 files changed, 578 insertions, 107 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3cb6b9e70080..d3aea3931b85 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3079,6 +3079,56 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip); +static __always_inline u32 +bpf_prog_run_array_sleepable(const struct bpf_prog_array *array, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + struct bpf_prog *prog; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; + u32 ret = 1; + + if (unlikely(!array)) + return ret; + + migrate_disable(); + + run_ctx.is_uprobe = false; + + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + item = &array->items[0]; + while ((prog = READ_ONCE(item->prog))) { + /* Skip dummy_bpf_prog placeholder (len == 0) */ + if (unlikely(!prog->len)) { + item++; + continue; + } + + if (unlikely(!bpf_prog_get_recursion_context(prog))) { + bpf_prog_inc_misses_counter(prog); + bpf_prog_put_recursion_context(prog); + item++; + continue; + } + + run_ctx.bpf_cookie = item->bpf_cookie; + + if (!prog->sleepable) { + guard(rcu)(); + ret &= run_prog(prog, ctx); + } else { + ret &= run_prog(prog, ctx); + } + + bpf_prog_put_recursion_context(prog); + item++; + } + bpf_reset_run_ctx(old_run_ctx); + migrate_enable(); + return ret; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 40a43a4c7caf..d49338c44014 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -770,6 +770,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx); int 
perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); @@ -792,6 +793,11 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c return 1; } +static inline unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx) +{ + return 1; +} + static inline int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 9391d54d3f12..d1de8f9aa07f 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -58,9 +58,7 @@ static notrace void \ __bpf_trace_##call(void *__data, proto) \ { \ might_fault(); \ - preempt_disable_notrace(); \ CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \ - preempt_enable_notrace(); \ } #undef DECLARE_EVENT_SYSCALL_CLASS diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3c0214ca934..3b1f0ba02f61 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4281,6 +4281,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, if (!btp) return -ENOENT; + if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { + bpf_put_raw_tracepoint(btp); + return -EINVAL; + } + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 185210b73385..5b4806fdb648 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19267,6 +19267,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, btp = bpf_get_raw_tracepoint(tname); if (!btp) return -EINVAL; + if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { + bpf_log(log, "Sleepable program cannot attach to non-faultable tracepoint %s\n", + tname); + bpf_put_raw_tracepoint(btp); + return -EINVAL; + } fname = kallsyms_lookup((unsigned 
long)btp->bpf_func, NULL, NULL, NULL, trace_symbol); bpf_put_raw_tracepoint(btp); @@ -19483,6 +19489,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: case BPF_TRACE_FSESSION: + case BPF_TRACE_RAW_TP: return true; default: return false; @@ -19490,7 +19497,9 @@ static bool can_be_sleepable(struct bpf_prog *prog) } return prog->type == BPF_PROG_TYPE_LSM || prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || - prog->type == BPF_PROG_TYPE_STRUCT_OPS; + prog->type == BPF_PROG_TYPE_STRUCT_OPS || + prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT || + prog->type == BPF_PROG_TYPE_TRACEPOINT; } static int check_attach_btf_id(struct bpf_verifier_env *env) @@ -19512,7 +19521,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->sleepable && !can_be_sleepable(prog)) { - verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); + verbose(env, "Program of this type cannot be sleepable\n"); return -EINVAL; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 6d1f8bad7e1c..0f9cacfa7cb8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11643,6 +11643,15 @@ static int __perf_event_set_bpf_prog(struct perf_event *event, /* only uprobe programs are allowed to be sleepable */ return -EINVAL; + if (prog->type == BPF_PROG_TYPE_TRACEPOINT && prog->sleepable) { + /* + * Sleepable tracepoint programs can only attach to faultable + * tracepoints. Currently only syscall tracepoints are faultable. + */ + if (!is_syscall_tp) + return -EINVAL; + } + /* Kprobe override only works for kprobes, not uprobes. 
*/ if (prog->kprobe_override && !is_kprobe) return -EINVAL; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e916f0ccbed9..a822c589c9bd 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -152,6 +152,34 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) return ret; } +/** + * trace_call_bpf_faultable - invoke BPF program in faultable context + * @call: tracepoint event + * @ctx: opaque context pointer + * + * Variant of trace_call_bpf() for faultable tracepoints (syscall + * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace + * for lifetime protection and bpf_prog_run_array_sleepable() for per-program + * RCU flavor selection, following the uprobe pattern. + * + * Per-program recursion protection is provided by + * bpf_prog_run_array_sleepable(). Global bpf_prog_active is not + * needed because syscall tracepoints cannot self-recurse. + * + * Must be called from a faultable/preemptible context. + */ +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx) +{ + struct bpf_prog_array *prog_array; + + might_fault(); + guard(rcu_tasks_trace)(); + + prog_array = rcu_dereference_check(call->prog_array, + rcu_read_lock_trace_held()); + return bpf_prog_run_array_sleepable(prog_array, ctx, bpf_prog_run); +} + #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { @@ -2072,11 +2100,19 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) static __always_inline void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { + struct srcu_ctr __percpu *scp = NULL; struct bpf_prog *prog = link->link.prog; + bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; - rcu_read_lock_dont_migrate(); + if (sleepable) { + scp = rcu_read_lock_tasks_trace(); + migrate_disable(); + } else { + rcu_read_lock_dont_migrate(); + } + if 
(unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; @@ -2085,12 +2121,18 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) run_ctx.bpf_cookie = link->cookie; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - (void) bpf_prog_run(prog, args); + (void)bpf_prog_run(prog, args); bpf_reset_run_ctx(old_run_ctx); out: bpf_prog_put_recursion_context(prog); - rcu_read_unlock_migrate(); + + if (sleepable) { + migrate_enable(); + rcu_read_unlock_tasks_trace(scp); + } else { + rcu_read_unlock_migrate(); + } } #define UNPACK(...) __VA_ARGS__ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8ad72e17d8eb..e98ee7e1e66f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1371,33 +1371,33 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, +static int perf_call_bpf_enter(struct trace_event_call *call, struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) + int syscall_nr, unsigned long *args) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long args[SYSCALL_DEFINE_MAXARGS]; } __aligned(8) param; + struct pt_regs regs = {}; int i; BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. 
&param) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)&param = regs; - param.syscall_nr = rec->nr; + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(&regs); + *(struct pt_regs **)&param = &regs; + param.syscall_nr = syscall_nr; for (i = 0; i < sys_data->nb_args; i++) - param.args[i] = rec->args[i]; - return trace_call_bpf(call, &param); + param.args[i] = args[i]; + return trace_call_bpf_faultable(call, &param); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; - struct pt_regs *fake_regs; struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; @@ -1410,12 +1410,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size = 0; int uargs = 0; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1429,6 +1424,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, args); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, sys_data, + syscall_nr, args)) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled. 
+ */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->enter_event->perf_events); + if (hlist_empty(head)) + return; + /* Check if this syscall event faults in user space memory */ mayfault = sys_data->user_mask != 0; @@ -1438,17 +1453,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; } - head = this_cpu_ptr(sys_data->enter_event->perf_events); - valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); - if (!valid_prog_array && hlist_empty(head)) - return; - /* get the size after alignment with the u32 buffer size field */ size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; @@ -1458,13 +1468,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (mayfault) syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); - if ((valid_prog_array && - !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, head, NULL); @@ -1514,40 +1517,35 @@ static void perf_sysenter_disable(struct trace_event_call *call) syscall_fault_buffer_disable(); } -static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, - struct syscall_trace_exit *rec) +static int perf_call_bpf_exit(struct trace_event_call *call, + int syscall_nr, long ret_val) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long ret; } __aligned(8) param; - - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. 
&param) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)&param = regs; - param.syscall_nr = rec->nr; - param.ret = rec->ret; - return trace_call_bpf(call, &param); + struct pt_regs regs = {}; + + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(&regs); + *(struct pt_regs **)&param = &regs; + param.syscall_nr = syscall_nr; + param.ret = ret_val; + return trace_call_bpf_faultable(call, &param); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; - struct pt_regs *fake_regs; struct hlist_head *head; bool valid_prog_array; int syscall_nr; int rctx; int size; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1559,29 +1557,37 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - head = this_cpu_ptr(sys_data->exit_event->perf_events); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); - if (!valid_prog_array && hlist_empty(head)) + if (valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, syscall_nr, + syscall_get_return_value(current, regs))) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled. 
+ */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->exit_event->perf_events); + if (hlist_empty(head)) return; /* We can probably do that at build time */ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((valid_prog_array && - !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1, regs, head, NULL); } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2bc04feadfab..c9aea7052ba7 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -748,14 +748,35 @@ static void __bpf_prog_test_run_raw_tp(void *data) { struct bpf_raw_tp_test_run_info *info = data; + struct srcu_ctr __percpu *scp = NULL; struct bpf_trace_run_ctx run_ctx = {}; struct bpf_run_ctx *old_run_ctx; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - rcu_read_lock(); + if (info->prog->sleepable) { + scp = rcu_read_lock_tasks_trace(); + migrate_disable(); + } else { + rcu_read_lock(); + } + + if (unlikely(!bpf_prog_get_recursion_context(info->prog))) { + bpf_prog_inc_misses_counter(info->prog); + goto out; + } + info->retval = bpf_prog_run(info->prog, info->ctx); - rcu_read_unlock(); + +out: + bpf_prog_put_recursion_context(info->prog); + + if (info->prog->sleepable) { + migrate_enable(); + rcu_read_unlock_tasks_trace(scp); + } else { + rcu_read_unlock(); + } bpf_reset_run_ctx(old_run_ctx); } @@ -783,6 +804,13 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) return -EINVAL; + /* + * Sleepable programs cannot run with preemption disabled or in + * hardirq context (smp_call_function_single), reject the flag. 
+ */ + if (prog->sleepable && (kattr->test.flags & BPF_F_TEST_RUN_ON_CPU)) + return -EINVAL; + if (ctx_size_in) { info.ctx = memdup_user(ctx_in, ctx_size_in); if (IS_ERR(info.ctx)) @@ -791,24 +819,31 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, info.ctx = NULL; } + info.retval = 0; info.prog = prog; - current_cpu = get_cpu(); - if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 || - cpu == current_cpu) { + if (prog->sleepable) { __bpf_prog_test_run_raw_tp(&info); - } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { - /* smp_call_function_single() also checks cpu_online() - * after csd_lock(). However, since cpu is from user - * space, let's do an extra quick check to filter out - * invalid value before smp_call_function_single(). - */ - err = -ENXIO; } else { - err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp, - &info, 1); + current_cpu = get_cpu(); + if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 || + cpu == current_cpu) { + __bpf_prog_test_run_raw_tp(&info); + } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { + /* + * smp_call_function_single() also checks cpu_online() + * after csd_lock(). However, since cpu is from user + * space, let's do an extra quick check to filter out + * invalid value before smp_call_function_single(). 
+ */ + err = -ENXIO; + } else { + err = smp_call_function_single(cpu, + __bpf_prog_test_run_raw_tp, + &info, 1); + } + put_cpu(); } - put_cpu(); if (!err && copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32))) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 83aae7a39d36..ab2071fdd3e8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10018,11 +10018,16 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("netkit/peer", SCHED_CLS, BPF_NETKIT_PEER, SEC_NONE), SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp), SEC_DEF("tp+", TRACEPOINT, 0, SEC_NONE, attach_tp), + SEC_DEF("tracepoint.s+", TRACEPOINT, 0, SEC_SLEEPABLE, attach_tp), + SEC_DEF("tp.s+", TRACEPOINT, 0, SEC_SLEEPABLE, attach_tp), SEC_DEF("raw_tracepoint+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), SEC_DEF("raw_tp+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tracepoint.s+", RAW_TRACEPOINT, 0, SEC_SLEEPABLE, attach_raw_tp), + SEC_DEF("raw_tp.s+", RAW_TRACEPOINT, 0, SEC_SLEEPABLE, attach_raw_tp), SEC_DEF("raw_tracepoint.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), SEC_DEF("raw_tp.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("tp_btf.s+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("fentry+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF, attach_trace), SEC_DEF("fmod_ret+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF, attach_trace), SEC_DEF("fexit+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF, attach_trace), @@ -13152,25 +13157,61 @@ struct bpf_link *bpf_program__attach_tracepoint(const struct bpf_program *prog, return bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL); } +/* + * Match section name against a prefix array. Returns pointer past + * "prefix/" on match, empty string for bare sections (exact prefix + * match), or NULL if no prefix matches. 
+ */ +static const char *sec_name_match_prefix(const char *sec_name, + const char *const *prefixes, + size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + size_t pfx_len; + + if (!str_has_pfx(sec_name, prefixes[i])) + continue; + + pfx_len = strlen(prefixes[i]); + if (sec_name[pfx_len] == '\0') + return sec_name + pfx_len; + + if (sec_name[pfx_len] != '/' || sec_name[pfx_len + 1] == '\0') + continue; + + return sec_name + pfx_len + 1; + } + return NULL; +} + static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) { + static const char *const prefixes[] = { + "tp.s", + "tp", + "tracepoint.s", + "tracepoint", + }; char *sec_name, *tp_cat, *tp_name; + const char *match; *link = NULL; - /* no auto-attach for SEC("tp") or SEC("tracepoint") */ - if (strcmp(prog->sec_name, "tp") == 0 || strcmp(prog->sec_name, "tracepoint") == 0) + match = sec_name_match_prefix(prog->sec_name, prefixes, ARRAY_SIZE(prefixes)); + if (!match) { + pr_warn("prog '%s': invalid section name '%s'\n", prog->name, prog->sec_name); + return -EINVAL; + } + if (!match[0]) /* bare section name no autoattach */ return 0; sec_name = strdup(prog->sec_name); if (!sec_name) return -ENOMEM; - /* extract "tp/<category>/<name>" or "tracepoint/<category>/<name>" */ - if (str_has_pfx(prog->sec_name, "tp/")) - tp_cat = sec_name + sizeof("tp/") - 1; - else - tp_cat = sec_name + sizeof("tracepoint/") - 1; + tp_cat = sec_name + (match - prog->sec_name); tp_name = strchr(tp_cat, '/'); if (!tp_name) { free(sec_name); @@ -13234,37 +13275,22 @@ static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf "raw_tracepoint", "raw_tp.w", "raw_tracepoint.w", + "raw_tp.s", + "raw_tracepoint.s", }; - size_t i; - const char *tp_name = NULL; + const char *match; *link = NULL; - for (i = 0; i < ARRAY_SIZE(prefixes); i++) { - size_t pfx_len; - - if (!str_has_pfx(prog->sec_name, prefixes[i])) - continue; - - pfx_len = strlen(prefixes[i]); - /* no auto-attach case of, e.g., 
SEC("raw_tp") */ - if (prog->sec_name[pfx_len] == '\0') - return 0; - - if (prog->sec_name[pfx_len] != '/') - continue; - - tp_name = prog->sec_name + pfx_len + 1; - break; - } - - if (!tp_name) { - pr_warn("prog '%s': invalid section name '%s'\n", - prog->name, prog->sec_name); + match = sec_name_match_prefix(prog->sec_name, prefixes, ARRAY_SIZE(prefixes)); + if (!match) { + pr_warn("prog '%s': invalid section name '%s'\n", prog->name, prog->sec_name); return -EINVAL; } + if (!match[0]) + return 0; - *link = bpf_program__attach_raw_tracepoint(prog, tp_name); + *link = bpf_program__attach_raw_tracepoint(prog, match); return libbpf_get_error(*link); } diff --git a/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c b/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c new file mode 100644 index 000000000000..19500b785ee3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <test_progs.h> +#include <unistd.h> +#include "test_sleepable_tracepoints.skel.h" +#include "test_sleepable_tracepoints_fail.skel.h" + +static void run_test(struct test_sleepable_tracepoints *skel) +{ + char buf[PATH_MAX] = "/"; + + skel->bss->target_pid = getpid(); + skel->bss->prog_triggered = 0; + skel->bss->err = 0; + skel->bss->copied_byte = 0; + + syscall(__NR_getcwd, buf, sizeof(buf)); + + ASSERT_EQ(skel->bss->prog_triggered, 1, "prog_triggered"); + ASSERT_EQ(skel->bss->err, 0, "err"); + ASSERT_EQ(skel->bss->copied_byte, '/', "copied_byte"); +} + +static void run_auto_attach_test(struct bpf_program *prog, + struct test_sleepable_tracepoints *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "prog_attach")) + return; + + run_test(skel); + bpf_link__destroy(link); +} + +static void test_attach_only(struct bpf_program *prog) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (ASSERT_OK_PTR(link, "attach")) + bpf_link__destroy(link); +} + +static void test_attach_reject(struct bpf_program *prog) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (!ASSERT_ERR_PTR(link, "attach_should_fail")) + bpf_link__destroy(link); +} + +static void test_raw_tp_bare(struct test_sleepable_tracepoints *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach_raw_tracepoint(skel->progs.handle_raw_tp_bare, + "sys_enter"); + if (ASSERT_OK_PTR(link, "attach")) + bpf_link__destroy(link); +} + +static void test_tp_bare(struct test_sleepable_tracepoints *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach_tracepoint(skel->progs.handle_tp_bare, + "syscalls", "sys_enter_getcwd"); + if (ASSERT_OK_PTR(link, "attach")) + bpf_link__destroy(link); +} + +static void test_test_run(struct test_sleepable_tracepoints *skel) +{ + __u64 args[2] = {0x1234ULL, 0x5678ULL}; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .ctx_in = args, + .ctx_size_in = sizeof(args), + ); + int fd, err; 
+ + fd = bpf_program__fd(skel->progs.handle_test_run); + err = bpf_prog_test_run_opts(fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, args[0] + args[1], "test_run_retval"); +} + +static void test_test_run_on_cpu_reject(struct test_sleepable_tracepoints *skel) +{ + __u64 args[2] = {}; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .ctx_in = args, + .ctx_size_in = sizeof(args), + .flags = BPF_F_TEST_RUN_ON_CPU, + ); + int fd, err; + + fd = bpf_program__fd(skel->progs.handle_test_run); + err = bpf_prog_test_run_opts(fd, &topts); + ASSERT_ERR(err, "test_run_on_cpu_reject"); +} + +void test_sleepable_tracepoints(void) +{ + struct test_sleepable_tracepoints *skel; + + skel = test_sleepable_tracepoints__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (test__start_subtest("tp_btf")) + run_auto_attach_test(skel->progs.handle_sys_enter_tp_btf, skel); + if (test__start_subtest("raw_tp")) + run_auto_attach_test(skel->progs.handle_sys_enter_raw_tp, skel); + if (test__start_subtest("tracepoint")) + run_auto_attach_test(skel->progs.handle_sys_enter_tp, skel); + if (test__start_subtest("sys_exit")) + run_auto_attach_test(skel->progs.handle_sys_exit_tp, skel); + if (test__start_subtest("tracepoint_alias")) + test_attach_only(skel->progs.handle_sys_enter_tp_alias); + if (test__start_subtest("raw_tracepoint_alias")) + test_attach_only(skel->progs.handle_sys_enter_raw_tp_alias); + if (test__start_subtest("raw_tp_bare")) + test_raw_tp_bare(skel); + if (test__start_subtest("tp_bare")) + test_tp_bare(skel); + if (test__start_subtest("test_run")) + test_test_run(skel); + if (test__start_subtest("test_run_on_cpu_reject")) + test_test_run_on_cpu_reject(skel); + if (test__start_subtest("raw_tp_non_faultable")) + test_attach_reject(skel->progs.handle_raw_tp_non_faultable); + if (test__start_subtest("tp_non_syscall")) + test_attach_reject(skel->progs.handle_tp_non_syscall); + if (test__start_subtest("tp_btf_non_faultable_reject")) + 
RUN_TESTS(test_sleepable_tracepoints_fail); + + test_sleepable_tracepoints__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c new file mode 100644 index 000000000000..254f7fd895d9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <asm/unistd.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +int target_pid; +int prog_triggered; +long err; +char copied_byte; + +static int copy_getcwd_arg(char *ubuf) +{ + err = bpf_copy_from_user(&copied_byte, sizeof(copied_byte), ubuf); + if (err) + return err; + + prog_triggered = 1; + return 0; +} + +SEC("tp_btf.s/sys_enter") +int BPF_PROG(handle_sys_enter_tp_btf, struct pt_regs *regs, long id) +{ + if ((bpf_get_current_pid_tgid() >> 32) != target_pid || + id != __NR_getcwd) + return 0; + + return copy_getcwd_arg((void *)PT_REGS_PARM1_SYSCALL(regs)); +} + +SEC("raw_tp.s/sys_enter") +int BPF_PROG(handle_sys_enter_raw_tp, struct pt_regs *regs, long id) +{ + if ((bpf_get_current_pid_tgid() >> 32) != target_pid || + id != __NR_getcwd) + return 0; + + return copy_getcwd_arg((void *)PT_REGS_PARM1_CORE_SYSCALL(regs)); +} + +SEC("tp.s/syscalls/sys_enter_getcwd") +int handle_sys_enter_tp(struct syscall_trace_enter *args) +{ + if ((bpf_get_current_pid_tgid() >> 32) != target_pid) + return 0; + + return copy_getcwd_arg((void *)args->args[0]); +} + +SEC("tp.s/syscalls/sys_exit_getcwd") +int handle_sys_exit_tp(struct syscall_trace_exit *args) +{ + struct pt_regs *regs; + + if ((bpf_get_current_pid_tgid() >> 32) != target_pid) + return 0; + + regs = (struct pt_regs *)bpf_task_pt_regs(bpf_get_current_task_btf()); + return copy_getcwd_arg((void 
*)PT_REGS_PARM1_CORE_SYSCALL(regs)); +} + +SEC("raw_tp.s") +int BPF_PROG(handle_raw_tp_bare, struct pt_regs *regs, long id) +{ + return 0; +} + +SEC("tp.s") +int handle_tp_bare(void *ctx) +{ + return 0; +} + +SEC("tracepoint.s/syscalls/sys_enter_getcwd") +int handle_sys_enter_tp_alias(struct syscall_trace_enter *args) +{ + return 0; +} + +SEC("raw_tracepoint.s/sys_enter") +int BPF_PROG(handle_sys_enter_raw_tp_alias, struct pt_regs *regs, long id) +{ + return 0; +} + +SEC("raw_tp.s/sys_enter") +int BPF_PROG(handle_test_run, struct pt_regs *regs, long id) +{ + if ((__u64)regs == 0x1234ULL && (__u64)id == 0x5678ULL) + return (__u64)regs + (__u64)id; + + return 0; +} + +SEC("raw_tp.s/sched_switch") +int BPF_PROG(handle_raw_tp_non_faultable, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + return 0; +} + +SEC("tp.s/sched/sched_switch") +int handle_tp_non_syscall(void *ctx) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c new file mode 100644 index 000000000000..1a0748a9520b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +/* Sleepable program on a non-faultable tracepoint should fail to load */ +SEC("tp_btf.s/sched_switch") +__failure __msg("Sleepable program cannot attach to non-faultable tracepoint") +int BPF_PROG(handle_sched_switch, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/verifier/sleepable.c b/tools/testing/selftests/bpf/verifier/sleepable.c index c2b7f5ebf168..6dabc5522945 100644 --- a/tools/testing/selftests/bpf/verifier/sleepable.c +++ b/tools/testing/selftests/bpf/verifier/sleepable.c @@ -76,7 +76,20 @@ .runs = -1, }, { - "sleepable raw tracepoint reject", + "sleepable raw tracepoint accept", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACING, + .expected_attach_type = BPF_TRACE_RAW_TP, + .kfunc = "sys_enter", + .result = ACCEPT, + .flags = BPF_F_SLEEPABLE, + .runs = -1, +}, +{ + "sleepable raw tracepoint reject non-faultable", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), @@ -85,7 +98,7 @@ .expected_attach_type = BPF_TRACE_RAW_TP, .kfunc = "sched_switch", .result = REJECT, - .errstr = "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable", + .errstr = "Sleepable program cannot attach to non-faultable tracepoint", .flags = BPF_F_SLEEPABLE, .runs = -1, }, |
