bpf: implement sleepable uprobes by chaining gps

uprobes work by raising a trap, setting a task flag from within the interrupt handler, and processing the actual work for the uprobe on the way back to userspace. As a result, uprobe handlers already execute in a might_fault/_sleep context. The primary obstacle to sleepable bpf uprobe programs is therefore on the bpf side. Namely, the bpf_prog_array attached to the uprobe is protected by normal rcu. In order for uprobe bpf programs to become sleepable, it has to be protected by the tasks_trace rcu flavor instead (and kfree() called after a corresponding grace period). Therefore, the free path for bpf_prog_array now chains a tasks_trace and normal grace periods one after the other. Users who iterate under tasks_trace read section would be safe, as would users who iterate under normal read sections (from non-sleepable locations). The downside is that the tasks_trace latency affects all perf_event-attached bpf programs (and not just uprobe ones). This is deemed safe given the possible attach rates for kprobe/uprobe/tp programs. Separately, non-sleepable programs need access to dynamically sized rcu-protected maps, so bpf_run_prog_array_sleepables now conditionally takes an rcu read section, in addition to the overarching tasks_trace section. Signed-off-by: Delyan Kratunov <delyank@fb.com> Link: https://lore.kernel.org/r/ce844d62a2fd0443b08c5ab02e95bc7149f9aeb1.1655248076.git.delyank@fb.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
author: Delyan Kratunov <delyank@fb.com> 2022-06-15 02:10:46 +0300
committer: Alexei Starovoitov <ast@kernel.org> 2022-06-17 05:27:29 +0300
commit: 8c7dcb84e3b744b2b70baa7a44a9b1881c33a9c9 (patch)
tree: 05179f5cbd02bd3b8672d6842d14bc6f7ed5d708 /include/linux/bpf.h
parent: d687f621c518d791b5fffde8add3112d869b0b1b (diff)
download: linux-8c7dcb84e3b744b2b70baa7a44a9b1881c33a9c9.tar.xz
1 files changed, 52 insertions, 0 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 69106ae46464..f3e88afdaffe 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -26,6 +26,7 @@
 #include <linux/stddef.h>
 #include <linux/bpfptr.h>
 #include <linux/btf.h>
+#include <linux/rcupdate_trace.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -1372,6 +1373,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array;
 
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array *progs);
+/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
 int bpf_prog_array_length(struct bpf_prog_array *progs);
 bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
 int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
@@ -1463,6 +1466,55 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
 	return ret;
 }
 
+/* Notes on RCU design for bpf_prog_arrays containing sleepable programs:
+ *
+ * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array
+ * overall. As a result, we must use the bpf_prog_array_free_sleepable
+ * in order to use the tasks_trace rcu grace period.
+ *
+ * When a non-sleepable program is inside the array, we take the rcu read
+ * section and disable preemption for that program alone, so it can access
+ * rcu-protected dynamically sized maps.
+ */
+static __always_inline u32
+bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
+			     const void *ctx, bpf_prog_run_fn run_prog)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_trace_run_ctx run_ctx;
+	u32 ret = 1;
+
+	might_fault();
+
+	rcu_read_lock_trace();
+	migrate_disable();
+
+	array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
+	if (unlikely(!array))
+		goto out;
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		if (!prog->aux->sleepable)
+			rcu_read_lock();
+
+		run_ctx.bpf_cookie = item->bpf_cookie;
+		ret &= run_prog(prog, ctx);
+		item++;
+
+		if (!prog->aux->sleepable)
+			rcu_read_unlock();
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+out:
+	migrate_enable();
+	rcu_read_unlock_trace();
+	return ret;
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 extern struct mutex bpf_stats_enabled_mutex;
author	Delyan Kratunov <delyank@fb.com>	2022-06-15 02:10:46 +0300
committer	Alexei Starovoitov <ast@kernel.org>	2022-06-17 05:27:29 +0300
commit	8c7dcb84e3b744b2b70baa7a44a9b1881c33a9c9 (patch)
tree	05179f5cbd02bd3b8672d6842d14bc6f7ed5d708 /include/linux/bpf.h
parent	d687f621c518d791b5fffde8add3112d869b0b1b (diff)
download	linux-8c7dcb84e3b744b2b70baa7a44a9b1881c33a9c9.tar.xz