summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- include/linux/bpf.h | 50
-rw-r--r-- include/linux/trace_events.h | 6
-rw-r--r-- include/trace/bpf_probe.h | 2
-rw-r--r-- kernel/bpf/syscall.c | 5
-rw-r--r-- kernel/bpf/verifier.c | 13
-rw-r--r-- kernel/events/core.c | 9
-rw-r--r-- kernel/trace/bpf_trace.c | 48
-rw-r--r-- kernel/trace/trace_syscalls.c | 110
-rw-r--r-- net/bpf/test_run.c | 65
-rw-r--r-- tools/lib/bpf/libbpf.c | 88
-rw-r--r-- tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c | 142
-rw-r--r-- tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c | 112
-rw-r--r-- tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c | 18
-rw-r--r-- tools/testing/selftests/bpf/verifier/sleepable.c | 17
14 files changed, 578 insertions, 107 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3cb6b9e70080..d3aea3931b85 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3079,6 +3079,56 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip);
+static __always_inline u32
+bpf_prog_run_array_sleepable(const struct bpf_prog_array *array,
+ const void *ctx, bpf_prog_run_fn run_prog)
+{
+ const struct bpf_prog_array_item *item;
+ struct bpf_prog *prog;
+ struct bpf_run_ctx *old_run_ctx;
+ struct bpf_trace_run_ctx run_ctx;
+ u32 ret = 1;
+ /*
+  * Purpose: walk @array up to its first NULL prog entry and call
+  * @run_prog(prog, @ctx) for each runnable program. The return value is
+  * the bitwise AND of all program results, starting from 1; a NULL
+  * @array therefore returns 1 without running anything.
+  */
+ if (unlikely(!array))
+ return ret;
+
+ migrate_disable();
+
+ run_ctx.is_uprobe = false;
+ /*
+  * A single run context is installed for the whole walk and restored
+  * after the loop; only run_ctx.bpf_cookie is refreshed per item.
+  */
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ item = &array->items[0];
+ while ((prog = READ_ONCE(item->prog))) {
+ /* Skip dummy_bpf_prog placeholder (len == 0) */
+ if (unlikely(!prog->len)) {
+ item++;
+ continue;
+ }
+ /*
+  * When bpf_prog_get_recursion_context() reports failure, count a
+  * miss and skip the program. The put call is issued on this path
+  * as well as after a completed run.
+  */
+ if (unlikely(!bpf_prog_get_recursion_context(prog))) {
+ bpf_prog_inc_misses_counter(prog);
+ bpf_prog_put_recursion_context(prog);
+ item++;
+ continue;
+ }
+
+ run_ctx.bpf_cookie = item->bpf_cookie;
+ /*
+  * Only non-sleepable programs run under guard(rcu)(), which is
+  * declared inside the if-branch. Sleepable programs are called
+  * without it, so whatever protection they rely on has to be held
+  * by the caller (trace_call_bpf_faultable() in this patch takes
+  * guard(rcu_tasks_trace) around the call).
+  */
+ if (!prog->sleepable) {
+ guard(rcu)();
+ ret &= run_prog(prog, ctx);
+ } else {
+ ret &= run_prog(prog, ctx);
+ }
+
+ bpf_prog_put_recursion_context(prog);
+ item++;
+ }
+ bpf_reset_run_ctx(old_run_ctx);
+ migrate_enable();
+ return ret;
+}
+
#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
{
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 40a43a4c7caf..d49338c44014 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -770,6 +770,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
#ifdef CONFIG_BPF_EVENTS
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
+unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx);
int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
void perf_event_detach_bpf_prog(struct perf_event *event);
int perf_event_query_prog_array(struct perf_event *event, void __user *info);
@@ -792,6 +793,11 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c
return 1;
}
+static inline unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
+{
+ return 1; /* stub: runs nothing, always 1 (same constant as the trace_call_bpf() stub) */
+}
+
static inline int
perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie)
{
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index 9391d54d3f12..d1de8f9aa07f 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -58,9 +58,7 @@ static notrace void \
__bpf_trace_##call(void *__data, proto) \
{ \
might_fault(); \
- preempt_disable_notrace(); \
CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \
- preempt_enable_notrace(); \
}
#undef DECLARE_EVENT_SYSCALL_CLASS
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a3c0214ca934..3b1f0ba02f61 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4281,6 +4281,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
if (!btp)
return -ENOENT;
+ if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) {
+ bpf_put_raw_tracepoint(btp);
+ return -EINVAL;
+ }
+
link = kzalloc_obj(*link, GFP_USER);
if (!link) {
err = -ENOMEM;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 185210b73385..5b4806fdb648 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19267,6 +19267,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
btp = bpf_get_raw_tracepoint(tname);
if (!btp)
return -EINVAL;
+ if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) {
+ bpf_log(log, "Sleepable program cannot attach to non-faultable tracepoint %s\n",
+ tname);
+ bpf_put_raw_tracepoint(btp);
+ return -EINVAL;
+ }
fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL,
trace_symbol);
bpf_put_raw_tracepoint(btp);
@@ -19483,6 +19489,7 @@ static bool can_be_sleepable(struct bpf_prog *prog)
case BPF_MODIFY_RETURN:
case BPF_TRACE_ITER:
case BPF_TRACE_FSESSION:
+ case BPF_TRACE_RAW_TP:
return true;
default:
return false;
@@ -19490,7 +19497,9 @@ static bool can_be_sleepable(struct bpf_prog *prog)
}
return prog->type == BPF_PROG_TYPE_LSM ||
prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
- prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+ prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
+ prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT ||
+ prog->type == BPF_PROG_TYPE_TRACEPOINT;
}
static int check_attach_btf_id(struct bpf_verifier_env *env)
@@ -19512,7 +19521,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
}
if (prog->sleepable && !can_be_sleepable(prog)) {
- verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
+ verbose(env, "Program of this type cannot be sleepable\n");
return -EINVAL;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6d1f8bad7e1c..0f9cacfa7cb8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11643,6 +11643,15 @@ static int __perf_event_set_bpf_prog(struct perf_event *event,
/* only uprobe programs are allowed to be sleepable */
return -EINVAL;
+ if (prog->type == BPF_PROG_TYPE_TRACEPOINT && prog->sleepable) {
+ /*
+ * Sleepable tracepoint programs can only attach to faultable
+ * tracepoints. Currently only syscall tracepoints are faultable.
+ */
+ if (!is_syscall_tp)
+ return -EINVAL;
+ }
+
/* Kprobe override only works for kprobes, not uprobes. */
if (prog->kprobe_override && !is_kprobe)
return -EINVAL;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e916f0ccbed9..a822c589c9bd 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -152,6 +152,34 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
return ret;
}
+/**
+ * trace_call_bpf_faultable - invoke BPF program in faultable context
+ * @call: tracepoint event
+ * @ctx: opaque context pointer
+ *
+ * Variant of trace_call_bpf() for faultable tracepoints (syscall
+ * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace
+ * for lifetime protection and bpf_prog_run_array_sleepable() for per-program
+ * RCU flavor selection, following the uprobe pattern.
+ *
+ * Per-program recursion protection is provided by
+ * bpf_prog_run_array_sleepable(). Global bpf_prog_active is not
+ * needed because syscall tracepoints cannot self-recurse.
+ *
+ * Must be called from a faultable/preemptible context.
+ *
+ * Return: the value of bpf_prog_run_array_sleepable() for @call->prog_array,
+ * i.e. the AND of the attached programs' results, or 1 when the array
+ * pointer is NULL.
+ */
+unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
+{
+ struct bpf_prog_array *prog_array;
+
+ might_fault();
+ guard(rcu_tasks_trace)();
+ /*
+  * The dereference is checked against rcu_read_lock_trace_held(); the
+  * guard above is presumably what satisfies that check, and it stays
+  * in place until the function returns -- confirm against guard().
+  */
+ prog_array = rcu_dereference_check(call->prog_array,
+ rcu_read_lock_trace_held());
+ return bpf_prog_run_array_sleepable(prog_array, ctx, bpf_prog_run);
+}
+
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
@@ -2072,11 +2100,19 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
static __always_inline
void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
{
+ struct srcu_ctr __percpu *scp = NULL;
struct bpf_prog *prog = link->link.prog;
+ bool sleepable = prog->sleepable;
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
- rcu_read_lock_dont_migrate();
+ if (sleepable) {
+ scp = rcu_read_lock_tasks_trace();
+ migrate_disable();
+ } else {
+ rcu_read_lock_dont_migrate();
+ }
+
if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
goto out;
@@ -2085,12 +2121,18 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
run_ctx.bpf_cookie = link->cookie;
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
- (void) bpf_prog_run(prog, args);
+ (void)bpf_prog_run(prog, args);
bpf_reset_run_ctx(old_run_ctx);
out:
bpf_prog_put_recursion_context(prog);
- rcu_read_unlock_migrate();
+
+ if (sleepable) {
+ migrate_enable();
+ rcu_read_unlock_tasks_trace(scp);
+ } else {
+ rcu_read_unlock_migrate();
+ }
}
#define UNPACK(...) __VA_ARGS__
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8ad72e17d8eb..e98ee7e1e66f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1371,33 +1371,33 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
-static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+static int perf_call_bpf_enter(struct trace_event_call *call,
struct syscall_metadata *sys_data,
- struct syscall_trace_enter *rec)
+ int syscall_nr, unsigned long *args)
{
struct syscall_tp_t {
struct trace_entry ent;
int syscall_nr;
unsigned long args[SYSCALL_DEFINE_MAXARGS];
} __aligned(8) param;
+ struct pt_regs regs = {};
int i;
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
- /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
- perf_fetch_caller_regs(regs);
- *(struct pt_regs **)&param = regs;
- param.syscall_nr = rec->nr;
+ /* bpf prog requires 'regs' to be the first member in the ctx */
+ perf_fetch_caller_regs(&regs);
+ *(struct pt_regs **)&param = &regs;
+ param.syscall_nr = syscall_nr;
for (i = 0; i < sys_data->nb_args; i++)
- param.args[i] = rec->args[i];
- return trace_call_bpf(call, &param);
+ param.args[i] = args[i];
+ return trace_call_bpf_faultable(call, &param);
}
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
- struct pt_regs *fake_regs;
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
@@ -1410,12 +1410,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int size = 0;
int uargs = 0;
- /*
- * Syscall probe called with preemption enabled, but the ring
- * buffer and per-cpu data require preemption to be disabled.
- */
might_fault();
- guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1429,6 +1424,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
syscall_get_arguments(current, regs, args);
+ /*
+ * Run BPF program in faultable context before per-cpu buffer
+ * allocation, allowing sleepable BPF programs to execute.
+ */
+ valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+ if (valid_prog_array &&
+ !perf_call_bpf_enter(sys_data->enter_event, sys_data,
+ syscall_nr, args))
+ return;
+
+ /*
+ * Per-cpu ring buffer and perf event list operations require
+ * preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
+ head = this_cpu_ptr(sys_data->enter_event->perf_events);
+ if (hlist_empty(head))
+ return;
+
/* Check if this syscall event faults in user space memory */
mayfault = sys_data->user_mask != 0;
@@ -1438,17 +1453,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
}
- head = this_cpu_ptr(sys_data->enter_event->perf_events);
- valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
- if (!valid_prog_array && hlist_empty(head))
- return;
-
/* get the size after alignment with the u32 buffer size field */
size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
+ rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
@@ -1458,13 +1468,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (mayfault)
syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);
- if ((valid_prog_array &&
- !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
- hlist_empty(head)) {
- perf_swevent_put_recursion_context(rctx);
- return;
- }
-
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
head, NULL);
@@ -1514,40 +1517,35 @@ static void perf_sysenter_disable(struct trace_event_call *call)
syscall_fault_buffer_disable();
}
-static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
- struct syscall_trace_exit *rec)
+static int perf_call_bpf_exit(struct trace_event_call *call,
+ int syscall_nr, long ret_val)
{
struct syscall_tp_t {
struct trace_entry ent;
int syscall_nr;
unsigned long ret;
} __aligned(8) param;
-
- /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
- perf_fetch_caller_regs(regs);
- *(struct pt_regs **)&param = regs;
- param.syscall_nr = rec->nr;
- param.ret = rec->ret;
- return trace_call_bpf(call, &param);
+ struct pt_regs regs = {};
+
+ /* bpf prog requires 'regs' to be the first member in the ctx */
+ perf_fetch_caller_regs(&regs);
+ *(struct pt_regs **)&param = &regs;
+ param.syscall_nr = syscall_nr;
+ param.ret = ret_val;
+ return trace_call_bpf_faultable(call, &param);
}
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
- struct pt_regs *fake_regs;
struct hlist_head *head;
bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
- /*
- * Syscall probe called with preemption enabled, but the ring
- * buffer and per-cpu data require preemption to be disabled.
- */
might_fault();
- guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1559,29 +1557,37 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
- head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ /*
+ * Run BPF program in faultable context before per-cpu buffer
+ * allocation, allowing sleepable BPF programs to execute.
+ */
valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
- if (!valid_prog_array && hlist_empty(head))
+ if (valid_prog_array &&
+ !perf_call_bpf_exit(sys_data->exit_event, syscall_nr,
+ syscall_get_return_value(current, regs)))
+ return;
+
+ /*
+ * Per-cpu ring buffer and perf event list operations require
+ * preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
+ head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ if (hlist_empty(head))
return;
/* We can probably do that at build time */
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
+ rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- if ((valid_prog_array &&
- !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
- hlist_empty(head)) {
- perf_swevent_put_recursion_context(rctx);
- return;
- }
-
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
1, regs, head, NULL);
}
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 2bc04feadfab..c9aea7052ba7 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -748,14 +748,35 @@ static void
__bpf_prog_test_run_raw_tp(void *data)
{
struct bpf_raw_tp_test_run_info *info = data;
+ struct srcu_ctr __percpu *scp = NULL;
struct bpf_trace_run_ctx run_ctx = {};
struct bpf_run_ctx *old_run_ctx;
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
- rcu_read_lock();
+ if (info->prog->sleepable) {
+ scp = rcu_read_lock_tasks_trace();
+ migrate_disable();
+ } else {
+ rcu_read_lock();
+ }
+
+ if (unlikely(!bpf_prog_get_recursion_context(info->prog))) {
+ bpf_prog_inc_misses_counter(info->prog);
+ goto out;
+ }
+
info->retval = bpf_prog_run(info->prog, info->ctx);
- rcu_read_unlock();
+
+out:
+ bpf_prog_put_recursion_context(info->prog);
+
+ if (info->prog->sleepable) {
+ migrate_enable();
+ rcu_read_unlock_tasks_trace(scp);
+ } else {
+ rcu_read_unlock();
+ }
bpf_reset_run_ctx(old_run_ctx);
}
@@ -783,6 +804,13 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0)
return -EINVAL;
+ /*
+ * Sleepable programs cannot run with preemption disabled or in
+ * hardirq context (smp_call_function_single), reject the flag.
+ */
+ if (prog->sleepable && (kattr->test.flags & BPF_F_TEST_RUN_ON_CPU))
+ return -EINVAL;
+
if (ctx_size_in) {
info.ctx = memdup_user(ctx_in, ctx_size_in);
if (IS_ERR(info.ctx))
@@ -791,24 +819,31 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
info.ctx = NULL;
}
+ info.retval = 0;
info.prog = prog;
- current_cpu = get_cpu();
- if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 ||
- cpu == current_cpu) {
+ if (prog->sleepable) {
__bpf_prog_test_run_raw_tp(&info);
- } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
- /* smp_call_function_single() also checks cpu_online()
- * after csd_lock(). However, since cpu is from user
- * space, let's do an extra quick check to filter out
- * invalid value before smp_call_function_single().
- */
- err = -ENXIO;
} else {
- err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp,
- &info, 1);
+ current_cpu = get_cpu();
+ if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 ||
+ cpu == current_cpu) {
+ __bpf_prog_test_run_raw_tp(&info);
+ } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
+ /*
+ * smp_call_function_single() also checks cpu_online()
+ * after csd_lock(). However, since cpu is from user
+ * space, let's do an extra quick check to filter out
+ * invalid value before smp_call_function_single().
+ */
+ err = -ENXIO;
+ } else {
+ err = smp_call_function_single(cpu,
+ __bpf_prog_test_run_raw_tp,
+ &info, 1);
+ }
+ put_cpu();
}
- put_cpu();
if (!err &&
copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32)))
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 83aae7a39d36..ab2071fdd3e8 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -10018,11 +10018,16 @@ static const struct bpf_sec_def section_defs[] = {
SEC_DEF("netkit/peer", SCHED_CLS, BPF_NETKIT_PEER, SEC_NONE),
SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp),
SEC_DEF("tp+", TRACEPOINT, 0, SEC_NONE, attach_tp),
+ SEC_DEF("tracepoint.s+", TRACEPOINT, 0, SEC_SLEEPABLE, attach_tp),
+ SEC_DEF("tp.s+", TRACEPOINT, 0, SEC_SLEEPABLE, attach_tp),
SEC_DEF("raw_tracepoint+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp),
SEC_DEF("raw_tp+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp),
+ SEC_DEF("raw_tracepoint.s+", RAW_TRACEPOINT, 0, SEC_SLEEPABLE, attach_raw_tp),
+ SEC_DEF("raw_tp.s+", RAW_TRACEPOINT, 0, SEC_SLEEPABLE, attach_raw_tp),
SEC_DEF("raw_tracepoint.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp),
SEC_DEF("raw_tp.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp),
SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace),
+ SEC_DEF("tp_btf.s+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
SEC_DEF("fentry+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF, attach_trace),
SEC_DEF("fmod_ret+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF, attach_trace),
SEC_DEF("fexit+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF, attach_trace),
@@ -13152,25 +13157,61 @@ struct bpf_link *bpf_program__attach_tracepoint(const struct bpf_program *prog,
return bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL);
}
+/*
+ * Match section name against a prefix array. Returns pointer past
+ * "prefix/" on match, empty string for bare sections (exact prefix
+ * match), or NULL if no prefix matches.
+ *
+ * @sec_name: section name to examine
+ * @prefixes: candidate prefixes, tried in array order
+ * @n: number of entries in @prefixes
+ *
+ * A prefix followed by anything other than end-of-string or '/', or by a
+ * '/' with nothing after it, does not count as a match; the search then
+ * moves on to the next prefix. Non-NULL results point into @sec_name
+ * itself (the "empty string" is @sec_name's terminating NUL).
+ */
+static const char *sec_name_match_prefix(const char *sec_name,
+ const char *const *prefixes,
+ size_t n)
+{
+ size_t i;
+
+ for (i = 0; i < n; i++) {
+ size_t pfx_len;
+
+ if (!str_has_pfx(sec_name, prefixes[i]))
+ continue;
+ /* Bare section: the name ends right after the prefix. */
+ pfx_len = strlen(prefixes[i]);
+ if (sec_name[pfx_len] == '\0')
+ return sec_name + pfx_len;
+ /*
+  * Reject e.g. "tp.s/..." against prefix "tp" (next char is not
+  * '/') and a trailing "prefix/" with an empty remainder.
+  */
+ if (sec_name[pfx_len] != '/' || sec_name[pfx_len + 1] == '\0')
+ continue;
+ /* Skip the '/' so the result starts at the remainder. */
+ return sec_name + pfx_len + 1;
+ }
+ return NULL;
+}
+
static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
+ static const char *const prefixes[] = {
+ "tp.s",
+ "tp",
+ "tracepoint.s",
+ "tracepoint",
+ };
char *sec_name, *tp_cat, *tp_name;
+ const char *match;
*link = NULL;
- /* no auto-attach for SEC("tp") or SEC("tracepoint") */
- if (strcmp(prog->sec_name, "tp") == 0 || strcmp(prog->sec_name, "tracepoint") == 0)
+ match = sec_name_match_prefix(prog->sec_name, prefixes, ARRAY_SIZE(prefixes));
+ if (!match) {
+ pr_warn("prog '%s': invalid section name '%s'\n", prog->name, prog->sec_name);
+ return -EINVAL;
+ }
+ if (!match[0]) /* bare section name no autoattach */
return 0;
sec_name = strdup(prog->sec_name);
if (!sec_name)
return -ENOMEM;
- /* extract "tp/<category>/<name>" or "tracepoint/<category>/<name>" */
- if (str_has_pfx(prog->sec_name, "tp/"))
- tp_cat = sec_name + sizeof("tp/") - 1;
- else
- tp_cat = sec_name + sizeof("tracepoint/") - 1;
+ tp_cat = sec_name + (match - prog->sec_name);
tp_name = strchr(tp_cat, '/');
if (!tp_name) {
free(sec_name);
@@ -13234,37 +13275,22 @@ static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf
"raw_tracepoint",
"raw_tp.w",
"raw_tracepoint.w",
+ "raw_tp.s",
+ "raw_tracepoint.s",
};
- size_t i;
- const char *tp_name = NULL;
+ const char *match;
*link = NULL;
- for (i = 0; i < ARRAY_SIZE(prefixes); i++) {
- size_t pfx_len;
-
- if (!str_has_pfx(prog->sec_name, prefixes[i]))
- continue;
-
- pfx_len = strlen(prefixes[i]);
- /* no auto-attach case of, e.g., SEC("raw_tp") */
- if (prog->sec_name[pfx_len] == '\0')
- return 0;
-
- if (prog->sec_name[pfx_len] != '/')
- continue;
-
- tp_name = prog->sec_name + pfx_len + 1;
- break;
- }
-
- if (!tp_name) {
- pr_warn("prog '%s': invalid section name '%s'\n",
- prog->name, prog->sec_name);
+ match = sec_name_match_prefix(prog->sec_name, prefixes, ARRAY_SIZE(prefixes));
+ if (!match) {
+ pr_warn("prog '%s': invalid section name '%s'\n", prog->name, prog->sec_name);
return -EINVAL;
}
+ if (!match[0])
+ return 0;
- *link = bpf_program__attach_raw_tracepoint(prog, tp_name);
+ *link = bpf_program__attach_raw_tracepoint(prog, match);
return libbpf_get_error(*link);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c b/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c
new file mode 100644
index 000000000000..19500b785ee3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include <unistd.h>
+#include "test_sleepable_tracepoints.skel.h"
+#include "test_sleepable_tracepoints_fail.skel.h"
+/*
+ * Reset the BPF-side globals in skel->bss, issue one getcwd syscall, and
+ * check that the currently attached program ran once, reported no error
+ * and copied the byte '/'. buf starts out as "/", so its first byte is
+ * '/' when the syscall is entered.
+ */
+static void run_test(struct test_sleepable_tracepoints *skel)
+{
+ char buf[PATH_MAX] = "/";
+
+ skel->bss->target_pid = getpid();
+ skel->bss->prog_triggered = 0;
+ skel->bss->err = 0;
+ skel->bss->copied_byte = 0;
+ /* getcwd is invoked through syscall(); its return value is not checked. */
+ syscall(__NR_getcwd, buf, sizeof(buf));
+
+ ASSERT_EQ(skel->bss->prog_triggered, 1, "prog_triggered");
+ ASSERT_EQ(skel->bss->err, 0, "err");
+ ASSERT_EQ(skel->bss->copied_byte, '/', "copied_byte");
+}
+/*
+ * Attach @prog with bpf_program__attach(), run the getcwd check in
+ * run_test() while it is attached, then destroy the link. Returns early,
+ * without running the check, if the attach assertion fails.
+ */
+static void run_auto_attach_test(struct bpf_program *prog,
+ struct test_sleepable_tracepoints *skel)
+{
+ struct bpf_link *link;
+
+ link = bpf_program__attach(prog);
+ if (!ASSERT_OK_PTR(link, "prog_attach"))
+ return;
+
+ run_test(skel);
+ bpf_link__destroy(link);
+}
+/*
+ * Check only that bpf_program__attach() succeeds for @prog; the link is
+ * destroyed right away and the program is never triggered here.
+ */
+static void test_attach_only(struct bpf_program *prog)
+{
+ struct bpf_link *link;
+
+ link = bpf_program__attach(prog);
+ if (ASSERT_OK_PTR(link, "attach"))
+ bpf_link__destroy(link);
+}
+/*
+ * Expect bpf_program__attach() to fail for @prog. The link is destroyed
+ * only when the assertion does not hold, i.e. when the attach
+ * unexpectedly produced a usable link.
+ */
+static void test_attach_reject(struct bpf_program *prog)
+{
+ struct bpf_link *link;
+
+ link = bpf_program__attach(prog);
+ if (!ASSERT_ERR_PTR(link, "attach_should_fail"))
+ bpf_link__destroy(link);
+}
+/*
+ * Attach handle_raw_tp_bare, whose section name carries no tracepoint,
+ * by naming "sys_enter" explicitly, and check that the attach succeeds.
+ */
+static void test_raw_tp_bare(struct test_sleepable_tracepoints *skel)
+{
+ struct bpf_link *link;
+
+ link = bpf_program__attach_raw_tracepoint(skel->progs.handle_raw_tp_bare,
+ "sys_enter");
+ if (ASSERT_OK_PTR(link, "attach"))
+ bpf_link__destroy(link);
+}
+/*
+ * Attach handle_tp_bare, whose section name carries no category/name,
+ * by passing "syscalls" and "sys_enter_getcwd" explicitly, and check
+ * that the attach succeeds.
+ */
+static void test_tp_bare(struct test_sleepable_tracepoints *skel)
+{
+ struct bpf_link *link;
+
+ link = bpf_program__attach_tracepoint(skel->progs.handle_tp_bare,
+ "syscalls", "sys_enter_getcwd");
+ if (ASSERT_OK_PTR(link, "attach"))
+ bpf_link__destroy(link);
+}
+/*
+ * Run handle_test_run through bpf_prog_test_run_opts() with two u64
+ * context words and expect the reported retval to be their sum.
+ */
+static void test_test_run(struct test_sleepable_tracepoints *skel)
+{
+ __u64 args[2] = {0x1234ULL, 0x5678ULL};
+ LIBBPF_OPTS(bpf_test_run_opts, topts,
+ .ctx_in = args,
+ .ctx_size_in = sizeof(args),
+ );
+ int fd, err;
+
+ fd = bpf_program__fd(skel->progs.handle_test_run);
+ err = bpf_prog_test_run_opts(fd, &topts);
+ ASSERT_OK(err, "test_run");
+ ASSERT_EQ(topts.retval, args[0] + args[1], "test_run_retval");
+}
+/*
+ * Same test-run call as test_test_run(), but with BPF_F_TEST_RUN_ON_CPU
+ * set; the call is expected to return an error for this sleepable
+ * program.
+ */
+static void test_test_run_on_cpu_reject(struct test_sleepable_tracepoints *skel)
+{
+ __u64 args[2] = {};
+ LIBBPF_OPTS(bpf_test_run_opts, topts,
+ .ctx_in = args,
+ .ctx_size_in = sizeof(args),
+ .flags = BPF_F_TEST_RUN_ON_CPU,
+ );
+ int fd, err;
+
+ fd = bpf_program__fd(skel->progs.handle_test_run);
+ err = bpf_prog_test_run_opts(fd, &topts);
+ ASSERT_ERR(err, "test_run_on_cpu_reject");
+}
+/*
+ * Test entry point: open and load the skeleton once, run the selected
+ * subtests against it, then destroy it. Nothing runs if loading fails.
+ */
+void test_sleepable_tracepoints(void)
+{
+ struct test_sleepable_tracepoints *skel;
+
+ skel = test_sleepable_tracepoints__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open_and_load"))
+ return;
+ /* Each helper below runs only if test__start_subtest() accepts its name. */
+ if (test__start_subtest("tp_btf"))
+ run_auto_attach_test(skel->progs.handle_sys_enter_tp_btf, skel);
+ if (test__start_subtest("raw_tp"))
+ run_auto_attach_test(skel->progs.handle_sys_enter_raw_tp, skel);
+ if (test__start_subtest("tracepoint"))
+ run_auto_attach_test(skel->progs.handle_sys_enter_tp, skel);
+ if (test__start_subtest("sys_exit"))
+ run_auto_attach_test(skel->progs.handle_sys_exit_tp, skel);
+ if (test__start_subtest("tracepoint_alias"))
+ test_attach_only(skel->progs.handle_sys_enter_tp_alias);
+ if (test__start_subtest("raw_tracepoint_alias"))
+ test_attach_only(skel->progs.handle_sys_enter_raw_tp_alias);
+ if (test__start_subtest("raw_tp_bare"))
+ test_raw_tp_bare(skel);
+ if (test__start_subtest("tp_bare"))
+ test_tp_bare(skel);
+ if (test__start_subtest("test_run"))
+ test_test_run(skel);
+ if (test__start_subtest("test_run_on_cpu_reject"))
+ test_test_run_on_cpu_reject(skel);
+ if (test__start_subtest("raw_tp_non_faultable"))
+ test_attach_reject(skel->progs.handle_raw_tp_non_faultable);
+ if (test__start_subtest("tp_non_syscall"))
+ test_attach_reject(skel->progs.handle_tp_non_syscall);
+ if (test__start_subtest("tp_btf_non_faultable_reject"))
+ RUN_TESTS(test_sleepable_tracepoints_fail);
+
+ test_sleepable_tracepoints__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c
new file mode 100644
index 000000000000..254f7fd895d9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <asm/unistd.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+/* Globals; the userspace test reads and writes them through skel->bss. */
+int target_pid; /* value the handlers compare the current tgid against */
+int prog_triggered; /* set to 1 by copy_getcwd_arg() after a successful copy */
+long err; /* result of the last bpf_copy_from_user() call */
+char copied_byte; /* destination of the one-byte user copy */
+/*
+ * Copy one byte from user pointer @ubuf into copied_byte using
+ * bpf_copy_from_user() and record the result in err. On failure the
+ * error is returned and prog_triggered is left untouched; on success
+ * prog_triggered is set to 1 and 0 is returned.
+ */
+static int copy_getcwd_arg(char *ubuf)
+{
+ err = bpf_copy_from_user(&copied_byte, sizeof(copied_byte), ubuf);
+ if (err)
+ return err;
+
+ prog_triggered = 1;
+ return 0;
+}
+/*
+ * tp_btf.s program on sys_enter. Returns 0 without side effects unless
+ * the upper 32 bits of bpf_get_current_pid_tgid() equal target_pid and
+ * @id is __NR_getcwd; when both match, the first syscall argument is
+ * handed to copy_getcwd_arg().
+ */
+SEC("tp_btf.s/sys_enter")
+int BPF_PROG(handle_sys_enter_tp_btf, struct pt_regs *regs, long id)
+{
+ if ((bpf_get_current_pid_tgid() >> 32) != target_pid ||
+ id != __NR_getcwd)
+ return 0;
+
+ return copy_getcwd_arg((void *)PT_REGS_PARM1_SYSCALL(regs));
+}
+/*
+ * raw_tp.s program on sys_enter. Applies the same target_pid and
+ * __NR_getcwd filter as the tp_btf variant, but fetches the first
+ * syscall argument with PT_REGS_PARM1_CORE_SYSCALL().
+ */
+SEC("raw_tp.s/sys_enter")
+int BPF_PROG(handle_sys_enter_raw_tp, struct pt_regs *regs, long id)
+{
+ if ((bpf_get_current_pid_tgid() >> 32) != target_pid ||
+ id != __NR_getcwd)
+ return 0;
+
+ return copy_getcwd_arg((void *)PT_REGS_PARM1_CORE_SYSCALL(regs));
+}
+/*
+ * tp.s program on syscalls/sys_enter_getcwd. Filters on target_pid only
+ * (the tracepoint is already specific to getcwd) and takes the user
+ * buffer pointer from args->args[0].
+ */
+SEC("tp.s/syscalls/sys_enter_getcwd")
+int handle_sys_enter_tp(struct syscall_trace_enter *args)
+{
+ if ((bpf_get_current_pid_tgid() >> 32) != target_pid)
+ return 0;
+
+ return copy_getcwd_arg((void *)args->args[0]);
+}
+/*
+ * tp.s program on syscalls/sys_exit_getcwd. Filters on target_pid; the
+ * getcwd buffer pointer is read from the current task's pt_regs
+ * (bpf_task_pt_regs()) rather than from @args.
+ */
+SEC("tp.s/syscalls/sys_exit_getcwd")
+int handle_sys_exit_tp(struct syscall_trace_exit *args)
+{
+ struct pt_regs *regs;
+
+ if ((bpf_get_current_pid_tgid() >> 32) != target_pid)
+ return 0;
+
+ regs = (struct pt_regs *)bpf_task_pt_regs(bpf_get_current_task_btf());
+ return copy_getcwd_arg((void *)PT_REGS_PARM1_CORE_SYSCALL(regs));
+}
+/*
+ * No-op program in the bare "raw_tp.s" section (no tracepoint in the
+ * section name); the userspace test attaches it to sys_enter explicitly.
+ */
+SEC("raw_tp.s")
+int BPF_PROG(handle_raw_tp_bare, struct pt_regs *regs, long id)
+{
+ return 0;
+}
+/*
+ * No-op program in the bare "tp.s" section; the userspace test attaches
+ * it to syscalls/sys_enter_getcwd explicitly.
+ */
+SEC("tp.s")
+int handle_tp_bare(void *ctx)
+{
+ return 0;
+}
+/*
+ * No-op program using the long "tracepoint.s/" section spelling; the
+ * userspace test only checks that it attaches.
+ */
+SEC("tracepoint.s/syscalls/sys_enter_getcwd")
+int handle_sys_enter_tp_alias(struct syscall_trace_enter *args)
+{
+ return 0;
+}
+/*
+ * No-op program using the long "raw_tracepoint.s/" section spelling; the
+ * userspace test only checks that it attaches.
+ */
+SEC("raw_tracepoint.s/sys_enter")
+int BPF_PROG(handle_sys_enter_raw_tp_alias, struct pt_regs *regs, long id)
+{
+ return 0;
+}
+/*
+ * Program driven through the test-run interface by the userspace test.
+ * Returns regs + id when the two context words are exactly 0x1234 and
+ * 0x5678, and 0 for any other input.
+ */
+SEC("raw_tp.s/sys_enter")
+int BPF_PROG(handle_test_run, struct pt_regs *regs, long id)
+{
+ if ((__u64)regs == 0x1234ULL && (__u64)id == 0x5678ULL)
+ return (__u64)regs + (__u64)id;
+
+ return 0;
+}
+/*
+ * No-op raw_tp.s program on sched_switch; the userspace test expects
+ * attaching it to fail.
+ */
+SEC("raw_tp.s/sched_switch")
+int BPF_PROG(handle_raw_tp_non_faultable, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
+{
+ return 0;
+}
+/*
+ * No-op tp.s program on sched/sched_switch, a non-syscall tracepoint;
+ * the userspace test expects attaching it to fail.
+ */
+SEC("tp.s/sched/sched_switch")
+int handle_tp_non_syscall(void *ctx)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c
new file mode 100644
index 000000000000..1a0748a9520b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+/*
+ * Sleepable program on a non-faultable tracepoint should fail to load.
+ * The __failure/__msg annotations below state the expected outcome; the
+ * quoted text is the start of the message bpf_check_attach_target() logs
+ * in this patch (presumably matched against the verifier log -- confirm).
+ */
+SEC("tp_btf.s/sched_switch")
+__failure __msg("Sleepable program cannot attach to non-faultable tracepoint")
+int BPF_PROG(handle_sched_switch, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/verifier/sleepable.c b/tools/testing/selftests/bpf/verifier/sleepable.c
index c2b7f5ebf168..6dabc5522945 100644
--- a/tools/testing/selftests/bpf/verifier/sleepable.c
+++ b/tools/testing/selftests/bpf/verifier/sleepable.c
@@ -76,7 +76,20 @@
.runs = -1,
},
{
- "sleepable raw tracepoint reject",
+ "sleepable raw tracepoint accept",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACING,
+ .expected_attach_type = BPF_TRACE_RAW_TP,
+ .kfunc = "sys_enter",
+ .result = ACCEPT,
+ .flags = BPF_F_SLEEPABLE,
+ .runs = -1,
+},
+{
+ "sleepable raw tracepoint reject non-faultable",
.insns = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
@@ -85,7 +98,7 @@
.expected_attach_type = BPF_TRACE_RAW_TP,
.kfunc = "sched_switch",
.result = REJECT,
- .errstr = "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable",
+ .errstr = "Sleepable program cannot attach to non-faultable tracepoint",
.flags = BPF_F_SLEEPABLE,
.runs = -1,
},