summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorAndrii Nakryiko <andrii@kernel.org>2024-10-24 07:41:59 +0300
committerPeter Zijlstra <peterz@infradead.org>2024-10-31 00:42:19 +0300
commitdd1a7567784e2b1f80258be04f57bcfa82c997eb (patch)
tree140090ec21ec27be44d53ea9b9983a6723c228cf /include/linux
parent2bf8e5aceff899f5117f14c73e869a61c44d8a69 (diff)
downloadlinux-dd1a7567784e2b1f80258be04f57bcfa82c997eb.tar.xz
uprobes: SRCU-protect uretprobe lifetime (with timeout)
Avoid taking refcount on uprobe in prepare_uretprobe(), instead take uretprobe-specific SRCU lock and keep it active as kernel transfers control back to user space. Given we can't rely on user space returning from traced function within reasonable time period, we need to make sure not to keep SRCU lock active for too long, though. To that effect, we employ a timer callback which is meant to terminate SRCU lock region after predefined timeout (currently set to 100ms), and instead transfer underlying struct uprobe's lifetime protection to refcounting. This fallback to less scalable refcounting after 100ms is a fine tradeoff from uretprobe's scalability and performance perspective, because uretprobing *long running* user functions inherently doesn't run into scalability issues (there is just not enough frequency to cause noticeable issues with either performance or scalability). The overall trick is in ensuring synchronization between current thread and timer's callback fired on some other thread. To cope with that with minimal logic complications, we add hprobe wrapper which is used to contain all the synchronization related issues behind a small number of basic helpers: hprobe_expire() for "downgrading" uprobe from SRCU-protected state to refcounted state, and a hprobe_consume() and hprobe_finalize() pair of single-use consuming helpers. Other than that, whatever current thread's logic is there stays the same, as timer thread cannot modify return_instance state (or add new/remove old return_instances). It only takes care of SRCU unlock and uprobe refcounting, which is hidden from the higher-level uretprobe handling logic. We use atomic xchg() in hprobe_consume(), which is called from performance critical handle_uretprobe_chain() function run in the current context. When uncontended, this xchg() doesn't seem to hurt performance as there are no other competing CPUs fighting for the same cache line. We also mark struct return_instance as ____cacheline_aligned to ensure no false sharing can happen. Another technical moment. We need to make sure that the list of return instances can be safely traversed under RCU from timer callback, so we delay return_instance freeing with kfree_rcu() and make sure that list modifications use RCU-aware operations. Also, given SRCU lock survives transition from kernel to user space and back we need to use lower-level __srcu_read_lock() and __srcu_read_unlock() to avoid lockdep complaining. Just to give an impression of a kind of performance improvements this change brings, below are benchmarking results with and without these SRCU changes, assuming other uprobe optimizations (mainly RCU Tasks Trace for entry uprobes, lockless RB-tree lookup, and lockless VMA to uprobe lookup) are left intact: WITHOUT SRCU for uretprobes =========================== uretprobe-nop ( 1 cpus): 2.197 ± 0.002M/s ( 2.197M/s/cpu) uretprobe-nop ( 2 cpus): 3.325 ± 0.001M/s ( 1.662M/s/cpu) uretprobe-nop ( 3 cpus): 4.129 ± 0.002M/s ( 1.376M/s/cpu) uretprobe-nop ( 4 cpus): 6.180 ± 0.003M/s ( 1.545M/s/cpu) uretprobe-nop ( 8 cpus): 7.323 ± 0.005M/s ( 0.915M/s/cpu) uretprobe-nop (16 cpus): 6.943 ± 0.005M/s ( 0.434M/s/cpu) uretprobe-nop (32 cpus): 5.931 ± 0.014M/s ( 0.185M/s/cpu) uretprobe-nop (64 cpus): 5.145 ± 0.003M/s ( 0.080M/s/cpu) uretprobe-nop (80 cpus): 4.925 ± 0.005M/s ( 0.062M/s/cpu) WITH SRCU for uretprobes ======================== uretprobe-nop ( 1 cpus): 1.968 ± 0.001M/s ( 1.968M/s/cpu) uretprobe-nop ( 2 cpus): 3.739 ± 0.003M/s ( 1.869M/s/cpu) uretprobe-nop ( 3 cpus): 5.616 ± 0.003M/s ( 1.872M/s/cpu) uretprobe-nop ( 4 cpus): 7.286 ± 0.002M/s ( 1.822M/s/cpu) uretprobe-nop ( 8 cpus): 13.657 ± 0.007M/s ( 1.707M/s/cpu) uretprobe-nop (32 cpus): 45.305 ± 0.066M/s ( 1.416M/s/cpu) uretprobe-nop (64 cpus): 42.390 ± 0.922M/s ( 0.662M/s/cpu) uretprobe-nop (80 cpus): 47.554 ± 2.411M/s ( 0.594M/s/cpu) Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/r/20241024044159.3156646-3-andrii@kernel.org
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/uprobes.h54
1 files changed, 52 insertions, 2 deletions
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index dbaf04189548..7a051b5d2edd 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -15,6 +15,7 @@
#include <linux/rbtree.h>
#include <linux/types.h>
#include <linux/wait.h>
+#include <linux/timer.h>
struct uprobe;
struct vm_area_struct;
@@ -67,6 +68,53 @@ enum uprobe_task_state {
UTASK_SSTEP_TRAPPED,
};
+/* The state of hybrid-lifetime uprobe inside struct return_instance */
+enum hprobe_state {
+ HPROBE_LEASED, /* uretprobes_srcu-protected uprobe */
+ HPROBE_STABLE, /* refcounted uprobe */
+ HPROBE_GONE, /* NULL uprobe, SRCU expired, refcount failed */
+ HPROBE_CONSUMED, /* uprobe "consumed" by uretprobe handler */
+};
+
+/*
+ * Hybrid lifetime uprobe. Represents a uprobe instance that could be either
+ * SRCU protected (with SRCU protection eventually potentially timing out),
+ * refcounted using uprobe->ref, or there could be no valid uprobe (NULL).
+ *
+ * hprobe's internal state is setup such that background timer thread can
+ * atomically "downgrade" temporarily RCU-protected uprobe into refcounted one
+ * (or no uprobe, if refcounting failed).
+ *
+ * *stable* pointer always point to the uprobe (or could be NULL if there is
+ * was no valid underlying uprobe to begin with).
+ *
+ * *leased* pointer is the key to achieving race-free atomic lifetime state
+ * transition and can have three possible states:
+ * - either the same non-NULL value as *stable*, in which case uprobe is
+ * SRCU-protected;
+ * - NULL, in which case uprobe (if there is any) is refcounted;
+ * - special __UPROBE_DEAD value, which represents an uprobe that was SRCU
+ * protected initially, but SRCU period timed out and we attempted to
+ * convert it to refcounted, but refcount_inc_not_zero() failed, because
+ * uprobe effectively went away (the last consumer unsubscribed). In this
+ * case it's important to know that *stable* pointer (which still has
+ * non-NULL uprobe pointer) shouldn't be used, because lifetime of
+ * underlying uprobe is not guaranteed anymore. __UPROBE_DEAD is just an
+ * internal marker and is handled transparently by hprobe_fetch() helper.
+ *
+ * When uprobe is SRCU-protected, we also record srcu_idx value, necessary for
+ * SRCU unlocking.
+ *
+ * See hprobe_expire() and hprobe_fetch() for details of race-free uprobe
+ * state transitioning details. It all hinges on atomic xchg() over *leaded*
+ * pointer. *stable* pointer, once initially set, is not modified concurrently.
+ */
+struct hprobe {
+ enum hprobe_state state;
+ int srcu_idx;
+ struct uprobe *uprobe;
+};
+
/*
* uprobe_task: Metadata of a task while it singlesteps.
*/
@@ -86,6 +134,7 @@ struct uprobe_task {
};
struct uprobe *active_uprobe;
+ struct timer_list ri_timer;
unsigned long xol_vaddr;
struct arch_uprobe *auprobe;
@@ -100,7 +149,7 @@ struct return_consumer {
};
struct return_instance {
- struct uprobe *uprobe;
+ struct hprobe hprobe;
unsigned long func;
unsigned long stack; /* stack pointer */
unsigned long orig_ret_vaddr; /* original return address */
@@ -108,9 +157,10 @@ struct return_instance {
int consumers_cnt;
struct return_instance *next; /* keep as stack */
+ struct rcu_head rcu;
struct return_consumer consumers[] __counted_by(consumers_cnt);
-};
+} ____cacheline_aligned;
enum rp_check {
RP_CHECK_CALL,