349 files changed, 27848 insertions, 4969 deletions
diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst
index 3b60583f5db2..3f05f17990ad 100644
--- a/Documentation/bpf/btf.rst
+++ b/Documentation/bpf/btf.rst
@@ -97,10 +97,8 @@ Each type contains the following common data::
     struct btf_type {
         __u32 name_off;
         /* "info" bits arrangement
-         * bits  0-15: vlen (e.g. # of struct's members)
-         * bits 16-23: unused
-         * bits 24-28: kind (e.g. int, ptr, array...etc)
-         * bits 29-30: unused
+         * bits  0-23: vlen (e.g. # of struct's members)
+         * bits 24-30: kind (e.g. int, ptr, array...etc)
          * bit     31: kind_flag, currently used by
          *             struct, union, enum, fwd, enum64,
          *             decl_tag and type_tag
diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 75e6c078e0e7..4c814ff6061e 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -207,8 +207,26 @@ Here, the buffer may be NULL. If the buffer is not NULL, it must be at least
 buffer__szk bytes in size. The kfunc is responsible for checking if the buffer
 is NULL before using it.
 
-2.3.5 __str Annotation
-----------------------------
+2.3.5 __nonown_allowed Annotation
+---------------------------------
+
+This annotation is used to indicate that the parameter may be a non-owning reference.
+
+An example is given below::
+
+        __bpf_kfunc int bpf_list_add(..., struct bpf_list_node
+                                     *prev__nonown_allowed, ...)
+        {
+                ...
+        }
+
+For the ``prev__nonown_allowed`` parameter (resolved as ``KF_ARG_PTR_TO_LIST_NODE``),
+the ``__nonown_allowed`` suffix keeps the usual owning-pointer rules and also
+permits a non-owning reference with no ref_obj_id (e.g. the return value of
+``bpf_list_front()`` / ``bpf_list_back()``).
+
+2.3.6 __str Annotation
+----------------------
 This annotation is used to indicate that the argument is a constant string.
 
 An example is given below::
@@ -462,6 +480,20 @@ In order to accommodate such requirements, the verifier will enforce strict
 PTR_TO_BTF_ID type matching if two types have the exact same name, with one
 being suffixed with ``___init``.
 
+2.8 Accessing arena memory through kfunc arguments
+--------------------------------------------------
+
+A read or write at any address inside an arena does not oops the kernel.
+Unallocated arena pages are lazily backed by a scratch page and the
+access is reported through the program's BPF stream as an error. Only
+the BPF program's correctness is affected; the kernel itself remains
+intact.
+
+The arena is followed by a ``GUARD_SZ / 2`` (32 KiB) guard region that is
+also covered by this recovery. A kfunc handed an arena pointer may therefore
+access up to ``GUARD_SZ / 2`` bytes past that pointer without bounds-checking
+against the arena. Larger accesses must verify the range explicitly.
+
 .. _BPF_kfunc_lifecycle_expectations:
 
 3. kfunc lifecycle expectations
diff --git a/Documentation/bpf/map_lru_hash_update.dot b/Documentation/bpf/map_lru_hash_update.dot
index ab10058f5b79..412bc8b3b57e 100644
--- a/Documentation/bpf/map_lru_hash_update.dot
+++ b/Documentation/bpf/map_lru_hash_update.dot
@@ -21,10 +21,18 @@ digraph {
   // names that initiate the corresponding logic in kernel/bpf/bpf_lru_list.c.
   // Number suffixes and errno suffixes handle subsections of the corresponding
   // logic in the function as of the writing of this dot.
+  //
+  // All LRU locks are rqspinlock_t. Every acquire can fail (AA self-deadlock
+  // or contention timeout). Unless an edge comment below says otherwise, the
+  // helper then returns NULL and the caller propagates -ENOMEM through the
+  // rqspinlock_failed node; failure edges are drawn as dashed arrows.
+
+  rqspinlock_failed [shape=rectangle,
+    label="Any LRU rqspinlock\nacquire fails\n(AA or timeout)"]
 
   // cf. __local_list_pop_free() / bpf_percpu_lru_pop_free()
   local_freelist_check [shape=diamond,fillcolor=1,
-    label="Local freelist\nnode available?"];
+    label="Local freelist\nnode available?\n(lockless free_llist)"];
   use_local_node [shape=rectangle,
     label="Use node owned\nby this CPU"]
 
@@ -82,6 +90,15 @@ digraph {
     // fn__local_list_pop_pending()
   }
 
+  // Post-steal: re-acquire local loc_l->lock to insert the stolen node into
+  // the local pending list. If the acquire fails, the stolen node is published
+  // to the lockless local free_llist so the next pop on this CPU picks it up
+  // instead of orphaning it.
+  post_steal_lock [shape=diamond,fillcolor=1,
+    label="Acquire local\nloc_l->lock\nto add pending"]
+  post_steal_to_free_llist [shape=rectangle,
+    label="Publish stolen node to\nlocal free_llist (lockless)"]
+
   fn_bpf_lru_list_pop_free_to_local2 [shape=rectangle,
     label="Use node that was\nnot recently referenced"]
   local_freelist_check4 [shape=rectangle,
@@ -97,10 +114,19 @@ digraph {
   fn_htab_lru_map_update_elem_ENOENT [shape=oval,label="return -ENOENT"]
 
   begin -> local_freelist_check
+  // The initial per-CPU lock (loc_l->lock for common, l->lock for percpu) is
+  // acquired before the local freelist check; rqspinlock failure here exits
+  // directly to -ENOMEM (no recovery needed: nothing was removed yet).
+  local_freelist_check -> rqspinlock_failed [style=dashed,
+    xlabel="acquire fails"]
   local_freelist_check -> use_local_node [xlabel="Y"]
   local_freelist_check -> common_lru_check [xlabel="N"]
   common_lru_check -> fn_bpf_lru_list_pop_free_to_local [xlabel="Y"]
   common_lru_check -> fn___bpf_lru_list_shrink_inactive [xlabel="N"]
+  // Global lru_list lock acquire failure in pop_free_to_local: skip refill,
+  // fall through to the steal path. Not ENOMEM by itself.
+  fn_bpf_lru_list_pop_free_to_local -> common_lru_check2 [style=dashed,
+    xlabel="global lru_lock\nacquire fails"]
   fn_bpf_lru_list_pop_free_to_local -> fn___bpf_lru_node_move_to_free
   fn___bpf_lru_node_move_to_free ->
     fn_bpf_lru_list_pop_free_to_local2 [xlabel="Y"]
@@ -120,13 +146,27 @@ digraph {
   local_freelist_check6 -> local_freelist_check7
   local_freelist_check7 -> fn_htab_lru_map_update_elem
 
-  fn_htab_lru_map_update_elem -> fn_htab_lru_map_update_elem3 [xlabel = "Y"]
+  // Steal-loop victim lock failure is silent: treat as "no node found here"
+  // and continue to next CPU; same edge as the existing "N" path.
+  local_freelist_check5 -> fn_htab_lru_map_update_elem2 [style=dashed,
+    xlabel="victim's lock\nfails: skip"]
+  // After a successful steal, re-acquire the local loc_l->lock. On failure
+  // the stolen node is published to free_llist (recovered, not orphaned)
+  // and the update returns -ENOMEM.
+  fn_htab_lru_map_update_elem -> post_steal_lock [xlabel = "Y"]
+  post_steal_lock -> fn_htab_lru_map_update_elem3 [xlabel = "OK"]
+  post_steal_lock -> post_steal_to_free_llist [style=dashed,
+    xlabel="loc_l->lock\nacquire fails"]
+  post_steal_to_free_llist -> fn_htab_lru_map_update_elem_ENOMEM
   fn_htab_lru_map_update_elem -> fn_htab_lru_map_update_elem2  [xlabel = "N"]
   fn_htab_lru_map_update_elem2 ->
     fn_htab_lru_map_update_elem_ENOMEM [xlabel = "Y"]
   fn_htab_lru_map_update_elem2 -> local_freelist_check5 [xlabel = "N"]
   fn_htab_lru_map_update_elem3 -> fn_htab_lru_map_update_elem4
 
+  // Shared rqspinlock-failure terminal collapses to the same -ENOMEM exit.
+  rqspinlock_failed -> fn_htab_lru_map_update_elem_ENOMEM
+
   use_local_node -> fn_htab_lru_map_update_elem4
   fn_bpf_lru_list_pop_free_to_local2 -> fn_htab_lru_map_update_elem4
   local_freelist_check4 -> fn_htab_lru_map_update_elem4
diff --git a/MAINTAINERS b/MAINTAINERS
index 12bd898fcc4b..a7fd31320b38 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4797,13 +4797,6 @@ S:	Supported
 F:	arch/x86/net/
 X:	arch/x86/net/bpf_jit_comp32.c
 
-BPF [BTF]
-M:	Martin KaFai Lau <martin.lau@linux.dev>
-L:	bpf@vger.kernel.org
-S:	Maintained
-F:	include/linux/btf*
-F:	kernel/bpf/btf.c
-
 BPF [CORE]
 M:	Alexei Starovoitov <ast@kernel.org>
 M:	Daniel Borkmann <daniel@iogearbox.net>
@@ -4840,12 +4833,13 @@ BPF [GENERAL] (Safe Dynamic Programs and Tools)
 M:	Alexei Starovoitov <ast@kernel.org>
 M:	Daniel Borkmann <daniel@iogearbox.net>
 M:	Andrii Nakryiko <andrii@kernel.org>
-R:	Martin KaFai Lau <martin.lau@linux.dev>
 M:	Eduard Zingerman <eddyz87@gmail.com>
 M:	Kumar Kartikeya Dwivedi <memxor@gmail.com>
+R:	Martin KaFai Lau <martin.lau@linux.dev>
 R:	Song Liu <song@kernel.org>
 R:	Yonghong Song <yonghong.song@linux.dev>
 R:	Jiri Olsa <jolsa@kernel.org>
+R:	Emil Tsalapatis <emil@etsalapatis.com>
 L:	bpf@vger.kernel.org
 S:	Supported
 W:	https://bpf.io/
@@ -4855,7 +4849,9 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
 F:	Documentation/bpf/
 F:	Documentation/networking/filter.rst
 F:	Documentation/userspace-api/ebpf/
+F:	arch/*/include/asm/rqspinlock.h
 F:	arch/*/net/*
+F:	include/asm-generic/rqspinlock.h
 F:	include/linux/bpf*
 F:	include/linux/btf*
 F:	include/linux/buildid.h
@@ -4867,17 +4863,15 @@ F:	include/uapi/linux/filter.h
 F:	kernel/bpf/
 F:	kernel/trace/bpf_trace.c
 F:	lib/buildid.c
-F:	arch/*/include/asm/rqspinlock.h
-F:	include/asm-generic/rqspinlock.h
 F:	lib/test_bpf.c
 F:	net/bpf/
 F:	net/core/filter.c
 F:	net/sched/act_bpf.c
 F:	net/sched/cls_bpf.c
 F:	samples/bpf/
+F:	scripts/Makefile.btf
 F:	scripts/bpf_doc.py
 F:	scripts/gen-btf.sh
-F:	scripts/Makefile.btf
 F:	scripts/pahole-version.sh
 F:	tools/bpf/
 F:	tools/lib/bpf/
@@ -4892,6 +4886,7 @@ F:	kernel/bpf/*iter.c
 BPF [L7 FRAMEWORK] (sockmap)
 M:	John Fastabend <john.fastabend@gmail.com>
 M:	Jakub Sitnicki <jakub@cloudflare.com>
+M:	Jiayuan Chen <jiayuan.chen@linux.dev>
 L:	netdev@vger.kernel.org
 L:	bpf@vger.kernel.org
 S:	Maintained
@@ -4935,18 +4930,11 @@ F:	tools/testing/selftests/bpf/prog_tests/tc_netkit.c
 F:	tools/testing/selftests/drivers/net/hw/nk_qlease.py
 F:	tools/testing/selftests/net/nk_qlease.py
 
-BPF [NETWORKING] (struct_ops, reuseport)
-M:	Martin KaFai Lau <martin.lau@linux.dev>
-L:	bpf@vger.kernel.org
-L:	netdev@vger.kernel.org
-S:	Maintained
-F:	kernel/bpf/bpf_struct*
-
 BPF [NETWORKING] (tcx & tc BPF, sock_addr)
-M:	Martin KaFai Lau <martin.lau@linux.dev>
 M:	Daniel Borkmann <daniel@iogearbox.net>
 R:	John Fastabend <john.fastabend@gmail.com>
 R:	Stanislav Fomichev <sdf@fomichev.me>
+R:	Martin KaFai Lau <martin.lau@linux.dev>
 L:	bpf@vger.kernel.org
 L:	netdev@vger.kernel.org
 S:	Maintained
@@ -4981,14 +4969,6 @@ L:	bpf@vger.kernel.org
 S:	Maintained
 F:	tools/testing/selftests/bpf/
 
-BPF [STORAGE & CGROUPS]
-M:	Martin KaFai Lau <martin.lau@linux.dev>
-L:	bpf@vger.kernel.org
-S:	Maintained
-F:	kernel/bpf/*storage.c
-F:	kernel/bpf/bpf_lru*
-F:	kernel/bpf/cgroup.c
-
 BPF [TOOLING] (bpftool)
 M:	Quentin Monnet <qmo@kernel.org>
 L:	bpf@vger.kernel.org
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index c9e4e00a9af2..27689c62bd25 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1830,6 +1830,34 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	return __ptep_get_and_clear(mm, addr, ptep);
 }
 
+static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
+{
+	pteval_t old = 0;	/* expected value: only install over a PTE that is 0 */
+
+	if (!try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte)))
+		return false;	/* PTE did not hold 0; new_pte was not stored */
+
+	/*
+	 * The store must be complete by the time this returns, but the caller
+	 * may be in lazy MMU mode, where __set_pte_complete() would defer the
+	 * barriers. Issue them directly.
+	 */
+	emit_pte_barriers();
+	return true;
+}
+#define ptep_try_set ptep_try_set
+
+/*
+ * arm64 mandates break-before-make: the TLB entry for a cleared kernel PTE must
+ * be invalidated before a different page is installed in its place. The TLBI
+ * broadcast is an instruction, not an IPI, so this is safe with IRQs disabled.
+ */
+static inline void flush_tlb_before_set(unsigned long addr)
+{
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);	/* one page starting at addr */
+}
+#define flush_tlb_before_set flush_tlb_before_set
+
 #define test_and_clear_young_ptes test_and_clear_young_ptes
 static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma,
 		unsigned long addr, pte_t *ptep, unsigned int nr)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 739800835920..85e23388f9bb 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -9,6 +9,7 @@
 
 #include <linux/acpi.h>
 #include <linux/bitfield.h>
+#include <linux/bpf_defs.h>
 #include <linux/extable.h>
 #include <linux/kfence.h>
 #include <linux/signal.h>
@@ -436,9 +437,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
 	} else if (is_pkvm_stage2_abort(esr)) {
 		msg = "access to hypervisor-protected memory";
 	} else {
-		if (esr_fsc_is_translation_fault(esr) &&
-		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
-			return;
+		if (esr_fsc_is_translation_fault(esr)) {
+			if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
+				return;
+			if (bpf_arena_handle_page_fault(addr, esr & ESR_ELx_WNR, regs->pc))
+				return;
+		}
 
 		msg = "paging request";
 	}
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 0816c40fc7af..f6bcc0e1a950 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -47,7 +47,7 @@
 /* Map BPF registers to A64 registers */
 static const int bpf2a64[] = {
 	/* return value from in-kernel function, and exit value from eBPF */
-	[BPF_REG_0] = A64_R(7),
+	[BPF_REG_0] = A64_R(8),
 	/* arguments from eBPF program to in-kernel function */
 	[BPF_REG_1] = A64_R(0),
 	[BPF_REG_2] = A64_R(1),
@@ -86,6 +86,7 @@ struct jit_ctx {
 	__le32 *image;
 	__le32 *ro_image;
 	u32 stack_size;
+	u16 stack_arg_size;
 	u64 user_vm_start;
 	u64 arena_vm_start;
 	bool fp_used;
@@ -533,13 +534,20 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
 	 *                        |     |
 	 *                        +-----+ <= (BPF_FP - prog->aux->stack_depth)
 	 *                        |RSVD | padding
-	 * current A64_SP =>      +-----+ <= (BPF_FP - ctx->stack_size)
+	 *                        +-----+ <= (BPF_FP - ctx->stack_size)
+	 *                        |     |
+	 *                        | ... | outgoing stack args (9+, if any)
+	 *                        |     |
+	 * current A64_SP =>      +-----+
 	 *                        |     |
 	 *                        | ... | Function call stack
 	 *                        |     |
 	 *                        +-----+
 	 *                          low
 	 *
+	 * BPF stack args 6-8 live in x5-x7; outgoing args 9+ are stored at [SP],
+	 * [SP + 8], ... and incoming args 9+ are read from [A64_FP + 16],
+	 * [A64_FP + 24], ... (above the FP/LR pair pushed in the callee prologue).
 	 */
 
 	emit_kcfi(is_main_prog ? cfi_bpf_hash : cfi_bpf_subprog_hash, ctx);
@@ -613,6 +621,9 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
 	if (ctx->stack_size && !ctx->priv_sp_used)
 		emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
 
+	if (ctx->stack_arg_size)
+		emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_arg_size), ctx);
+
 	if (ctx->arena_vm_start)
 		emit_a64_mov_i64(arena_vm_base, ctx->arena_vm_start, ctx);
 
@@ -673,6 +684,9 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx)
 	/* Update tail_call_cnt if the slot is populated. */
 	emit(A64_STR64I(tcc, ptr, 0), ctx);
 
+	if (ctx->stack_arg_size)
+		emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_arg_size), ctx);
+
 	/* restore SP */
 	if (ctx->stack_size && !ctx->priv_sp_used)
 		emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
@@ -1034,6 +1048,9 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic)
 	const u8 r0 = bpf2a64[BPF_REG_0];
 	const u8 ptr = bpf2a64[TCCNT_PTR];
 
+	if (ctx->stack_arg_size)
+		emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_arg_size), ctx);
+
 	/* We're done with BPF stack */
 	if (ctx->stack_size && !ctx->priv_sp_used)
 		emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
@@ -1048,7 +1065,7 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic)
 	/* Restore FP/LR registers */
 	emit(A64_POP(A64_FP, A64_LR, A64_SP), ctx);
 
-	/* Move the return value from bpf:r0 (aka x7) to x0 */
+	/* Move the return value from bpf:r0 (aka x8) to x0 */
 	emit(A64_MOV(1, A64_R(0), r0), ctx);
 
 	/* Authenticate lr */
@@ -1191,6 +1208,42 @@ static int add_exception_handler(const struct bpf_insn *insn,
 	return 0;
 }
 
+static const u8 stack_arg_reg[] = { A64_R(5), A64_R(6), A64_R(7) };	/* slots 0-2 -> x5-x7 */
+
+#define NR_STACK_ARG_REGS	ARRAY_SIZE(stack_arg_reg)	/* slots held in registers */
+
+static void emit_stack_arg_load(u8 dst, s16 bpf_off, struct jit_ctx *ctx)
+{
+	int idx = bpf_off / sizeof(u64) - 1;	/* slot index: bpf_off 8 -> 0, 16 -> 1, ... */
+
+	if (idx < NR_STACK_ARG_REGS)
+		emit(A64_MOV(1, dst, stack_arg_reg[idx]), ctx);
+	else	/* slots beyond the register set are read from A64_FP + 16 upwards */
+		emit(A64_LDR64I(dst, A64_FP, (idx - NR_STACK_ARG_REGS) * sizeof(u64) + 16), ctx);
+}
+
+static void emit_stack_arg_store(u8 src_a64, s16 bpf_off, struct jit_ctx *ctx)
+{
+	int idx = -bpf_off / sizeof(u64) - 1;	/* slot index: bpf_off -8 -> 0, -16 -> 1, ... */
+
+	if (idx < NR_STACK_ARG_REGS)
+		emit(A64_MOV(1, stack_arg_reg[idx], src_a64), ctx);
+	else	/* slots beyond the register set are written at A64_SP + 0, + 8, ... */
+		emit(A64_STR64I(src_a64, A64_SP, (idx - NR_STACK_ARG_REGS) * sizeof(u64)), ctx);
+}
+
+static void emit_stack_arg_store_imm(s32 imm, s16 bpf_off, const u8 tmp, struct jit_ctx *ctx)
+{
+	int idx = -bpf_off / sizeof(u64) - 1;	/* slot index: bpf_off -8 -> 0, -16 -> 1, ... */
+
+	if (idx < NR_STACK_ARG_REGS) {
+		emit_a64_mov_i(1, stack_arg_reg[idx], imm, ctx);
+	} else {
+		emit_a64_mov_i(1, tmp, imm, ctx);	/* build imm in tmp, then store it below */
+		emit(A64_STR64I(tmp, A64_SP, (idx - NR_STACK_ARG_REGS) * sizeof(u64)), ctx);
+	}
+}
+
 /* JITs an eBPF instruction.
  * Returns:
  * 0  - successfully JITed an 8-byte eBPF instruction.
@@ -1646,6 +1699,11 @@ emit_cond_jmp:
 	case BPF_LDX | BPF_MEM | BPF_H:
 	case BPF_LDX | BPF_MEM | BPF_B:
 	case BPF_LDX | BPF_MEM | BPF_DW:
+		if (insn->src_reg == BPF_REG_PARAMS) {
+			emit_stack_arg_load(dst, off, ctx);
+			break;
+		}
+		fallthrough;
 	case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
 	case BPF_LDX | BPF_PROBE_MEM | BPF_W:
 	case BPF_LDX | BPF_PROBE_MEM | BPF_H:
@@ -1672,6 +1730,8 @@ emit_cond_jmp:
 		if (src == fp) {
 			src_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
 			off_adj = off + ctx->stack_size;
+			if (!ctx->priv_sp_used)
+				off_adj += ctx->stack_arg_size;
 		} else {
 			src_adj = src;
 			off_adj = off;
@@ -1752,6 +1812,11 @@ emit_cond_jmp:
 	case BPF_ST | BPF_MEM | BPF_H:
 	case BPF_ST | BPF_MEM | BPF_B:
 	case BPF_ST | BPF_MEM | BPF_DW:
+		if (insn->dst_reg == BPF_REG_PARAMS) {
+			emit_stack_arg_store_imm(imm, off, tmp, ctx);
+			break;
+		}
+		fallthrough;
 	case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
 	case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
 	case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
@@ -1763,6 +1828,8 @@ emit_cond_jmp:
 		if (dst == fp) {
 			dst_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
 			off_adj = off + ctx->stack_size;
+			if (!ctx->priv_sp_used)
+				off_adj += ctx->stack_arg_size;
 		} else {
 			dst_adj = dst;
 			off_adj = off;
@@ -1814,6 +1881,11 @@ emit_cond_jmp:
 	case BPF_STX | BPF_MEM | BPF_H:
 	case BPF_STX | BPF_MEM | BPF_B:
 	case BPF_STX | BPF_MEM | BPF_DW:
+		if (insn->dst_reg == BPF_REG_PARAMS) {
+			emit_stack_arg_store(src, off, ctx);
+			break;
+		}
+		fallthrough;
 	case BPF_STX | BPF_PROBE_MEM32 | BPF_B:
 	case BPF_STX | BPF_PROBE_MEM32 | BPF_H:
 	case BPF_STX | BPF_PROBE_MEM32 | BPF_W:
@@ -1825,6 +1897,8 @@ emit_cond_jmp:
 		if (dst == fp) {
 			dst_adj = ctx->priv_sp_used ? priv_sp : A64_SP;
 			off_adj = off + ctx->stack_size;
+			if (!ctx->priv_sp_used)
+				off_adj += ctx->stack_arg_size;
 		} else {
 			dst_adj = dst;
 			off_adj = off;
@@ -2018,6 +2092,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
 	u8 *ro_image_ptr;
 	int body_idx;
 	int exentry_idx;
+	int out_cnt;
 
 	if (!prog->jit_requested)
 		return prog;
@@ -2065,6 +2140,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
 	ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
 	ctx.arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena);
 
+	out_cnt = bpf_out_stack_arg_cnt(env, prog);
+	if (out_cnt) {
+		int nr_on_stack = out_cnt - NR_STACK_ARG_REGS;
+
+		if (nr_on_stack > 0)
+			ctx.stack_arg_size = round_up(nr_on_stack * sizeof(u64), 16);
+	}
+
 	if (priv_stack_ptr)
 		ctx.priv_sp_used = true;
 
@@ -2229,6 +2312,11 @@ bool bpf_jit_supports_kfunc_call(void)
 	return true;
 }
 
+bool bpf_jit_supports_stack_args(void)
+{
+	return true;	/* this JIT emits code for BPF_REG_PARAMS loads and stores */
+}
+
 void *bpf_arch_text_copy(void *dst, void *src, size_t len)
 {
 	if (!aarch64_insn_copy(dst, src, len))
@@ -2247,24 +2335,24 @@ bool bpf_jit_supports_subprog_tailcalls(void)
 	return true;
 }
 
-static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
+static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_node *node,
 			    int bargs_off, int retval_off, int run_ctx_off,
 			    bool save_ret)
 {
 	__le32 *branch;
 	u64 enter_prog;
 	u64 exit_prog;
-	struct bpf_prog *p = l->link.prog;
+	struct bpf_prog *p = node->link->prog;
 	int cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
 
 	enter_prog = (u64)bpf_trampoline_enter(p);
 	exit_prog = (u64)bpf_trampoline_exit(p);
 
-	if (l->cookie == 0) {
+	if (node->cookie == 0) {
 		/* if cookie is zero, one instruction is enough to store it */
 		emit(A64_STR64I(A64_ZR, A64_SP, run_ctx_off + cookie_off), ctx);
 	} else {
-		emit_a64_mov_i64(A64_R(10), l->cookie, ctx);
+		emit_a64_mov_i64(A64_R(10), node->cookie, ctx);
 		emit(A64_STR64I(A64_R(10), A64_SP, run_ctx_off + cookie_off),
 		     ctx);
 	}
@@ -2314,7 +2402,7 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
 	emit_call(exit_prog, ctx);
 }
 
-static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
+static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_nodes *tn,
 			       int bargs_off, int retval_off, int run_ctx_off,
 			       __le32 **branches)
 {
@@ -2324,8 +2412,8 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
 	 * Set this to 0 to avoid confusing the program.
 	 */
 	emit(A64_STR64I(A64_ZR, A64_SP, retval_off), ctx);
-	for (i = 0; i < tl->nr_links; i++) {
-		invoke_bpf_prog(ctx, tl->links[i], bargs_off, retval_off,
+	for (i = 0; i < tn->nr_nodes; i++) {
+		invoke_bpf_prog(ctx, tn->nodes[i], bargs_off, retval_off,
 				run_ctx_off, true);
 		/* if (*(u64 *)(sp + retval_off) !=  0)
 		 *	goto do_fexit;
@@ -2456,10 +2544,10 @@ static void restore_args(struct jit_ctx *ctx, int bargs_off, int nregs)
 	}
 }
 
-static bool is_struct_ops_tramp(const struct bpf_tramp_links *fentry_links)
+static bool is_struct_ops_tramp(const struct bpf_tramp_nodes *fentry_nodes)
 {
-	return fentry_links->nr_links == 1 &&
-		fentry_links->links[0]->link.type == BPF_LINK_TYPE_STRUCT_OPS;
+	return fentry_nodes->nr_nodes == 1 &&
+		fentry_nodes->nodes[0]->link->type == BPF_LINK_TYPE_STRUCT_OPS;
 }
 
 static void store_func_meta(struct jit_ctx *ctx, u64 func_meta, int func_meta_off)
@@ -2480,7 +2568,7 @@ static void store_func_meta(struct jit_ctx *ctx, u64 func_meta, int func_meta_of
  *
  */
 static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
-			      struct bpf_tramp_links *tlinks, void *func_addr,
+			      struct bpf_tramp_nodes *tnodes, void *func_addr,
 			      const struct btf_func_model *m,
 			      const struct arg_aux *a,
 			      u32 flags)
@@ -2496,14 +2584,14 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	int run_ctx_off;
 	int oargs_off;
 	int nfuncargs;
-	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
-	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
-	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
+	struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY];
+	struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT];
+	struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN];
 	bool save_ret;
 	__le32 **branches = NULL;
 	bool is_struct_ops = is_struct_ops_tramp(fentry);
 	int cookie_off, cookie_cnt, cookie_bargs_off;
-	int fsession_cnt = bpf_fsession_cnt(tlinks);
+	int fsession_cnt = bpf_fsession_cnt(tnodes);
 	u64 func_meta;
 
 	/* trampoline stack layout:
@@ -2549,7 +2637,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 
 	cookie_off = stack_size;
 	/* room for session cookies */
-	cookie_cnt = bpf_fsession_cookie_cnt(tlinks);
+	cookie_cnt = bpf_fsession_cookie_cnt(tnodes);
 	stack_size += cookie_cnt * 8;
 
 	ip_off = stack_size;
@@ -2646,20 +2734,20 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	}
 
 	cookie_bargs_off = (bargs_off - cookie_off) / 8;
-	for (i = 0; i < fentry->nr_links; i++) {
-		if (bpf_prog_calls_session_cookie(fentry->links[i])) {
+	for (i = 0; i < fentry->nr_nodes; i++) {
+		if (bpf_prog_calls_session_cookie(fentry->nodes[i])) {
 			u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT);
 
 			store_func_meta(ctx, meta, func_meta_off);
 			cookie_bargs_off--;
 		}
-		invoke_bpf_prog(ctx, fentry->links[i], bargs_off,
+		invoke_bpf_prog(ctx, fentry->nodes[i], bargs_off,
 				retval_off, run_ctx_off,
 				flags & BPF_TRAMP_F_RET_FENTRY_RET);
 	}
 
-	if (fmod_ret->nr_links) {
-		branches = kcalloc(fmod_ret->nr_links, sizeof(__le32 *),
+	if (fmod_ret->nr_nodes) {
+		branches = kcalloc(fmod_ret->nr_nodes, sizeof(__le32 *),
 				   GFP_KERNEL);
 		if (!branches)
 			return -ENOMEM;
@@ -2683,7 +2771,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	}
 
 	/* update the branches saved in invoke_bpf_mod_ret with cbnz */
-	for (i = 0; i < fmod_ret->nr_links && ctx->image != NULL; i++) {
+	for (i = 0; i < fmod_ret->nr_nodes && ctx->image != NULL; i++) {
 		int offset = &ctx->image[ctx->idx] - branches[i];
 		*branches[i] = cpu_to_le32(A64_CBNZ(1, A64_R(10), offset));
 	}
@@ -2694,14 +2782,14 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 		store_func_meta(ctx, func_meta, func_meta_off);
 
 	cookie_bargs_off = (bargs_off - cookie_off) / 8;
-	for (i = 0; i < fexit->nr_links; i++) {
-		if (bpf_prog_calls_session_cookie(fexit->links[i])) {
+	for (i = 0; i < fexit->nr_nodes; i++) {
+		if (bpf_prog_calls_session_cookie(fexit->nodes[i])) {
 			u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT);
 
 			store_func_meta(ctx, meta, func_meta_off);
 			cookie_bargs_off--;
 		}
-		invoke_bpf_prog(ctx, fexit->links[i], bargs_off, retval_off,
+		invoke_bpf_prog(ctx, fexit->nodes[i], bargs_off, retval_off,
 				run_ctx_off, false);
 	}
 
@@ -2759,7 +2847,7 @@ bool bpf_jit_supports_fsession(void)
 }
 
 int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
-			     struct bpf_tramp_links *tlinks, void *func_addr)
+			     struct bpf_tramp_nodes *tnodes, void *func_addr)
 {
 	struct jit_ctx ctx = {
 		.image = NULL,
@@ -2773,7 +2861,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 	if (ret < 0)
 		return ret;
 
-	ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, m, &aaux, flags);
+	ret = prepare_trampoline(&ctx, &im, tnodes, func_addr, m, &aaux, flags);
 	if (ret < 0)
 		return ret;
 
@@ -2797,7 +2885,7 @@ int arch_protect_bpf_trampoline(void *image, unsigned int size)
 
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
 				void *ro_image_end, const struct btf_func_model *m,
-				u32 flags, struct bpf_tramp_links *tlinks,
+				u32 flags, struct bpf_tramp_nodes *tnodes,
 				void *func_addr)
 {
 	u32 size = ro_image_end - ro_image;
@@ -2824,7 +2912,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
 	ret = calc_arg_aux(m, &aaux);
 	if (ret)
 		goto out;
-	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, m, &aaux, flags);
+	ret = prepare_trampoline(&ctx, im, tnodes, func_addr, m, &aaux, flags);
 
 	if (ret > 0 && validate_code(&ctx) < 0) {
 		ret = -EINVAL;
diff --git a/arch/arm64/net/bpf_timed_may_goto.S b/arch/arm64/net/bpf_timed_may_goto.S
index 894cfcd7b241..a9a802711a7f 100644
--- a/arch/arm64/net/bpf_timed_may_goto.S
+++ b/arch/arm64/net/bpf_timed_may_goto.S
@@ -8,8 +8,8 @@ SYM_FUNC_START(arch_bpf_timed_may_goto)
 	stp     x29, x30, [sp, #-64]!
 	mov     x29, sp
 
-	/* Save BPF registers R0 - R5 (x7, x0-x4)*/
-	stp	x7, x0, [sp, #16]
+	/* Save BPF registers R0 - R5 (x8, x0-x4)*/
+	stp	x8, x0, [sp, #16]
 	stp	x1, x2, [sp, #32]
 	stp	x3, x4, [sp, #48]
 
@@ -28,8 +28,8 @@ SYM_FUNC_START(arch_bpf_timed_may_goto)
 	/* BPF_REG_AX(x9) will be stored into count, so move return value to it. */
 	mov	x9, x0
 
-	/* Restore BPF registers R0 - R5 (x7, x0-x4) */
-	ldp	x7, x0, [sp, #16]
+	/* Restore BPF registers R0 - R5 (x8, x0-x4) */
+	ldp	x8, x0, [sp, #16]
 	ldp	x1, x2, [sp, #32]
 	ldp	x3, x4, [sp, #48]
 
diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c
index 24913dc7f4e8..058ffbbaad85 100644
--- a/arch/loongarch/net/bpf_jit.c
+++ b/arch/loongarch/net/bpf_jit.c
@@ -1674,17 +1674,17 @@ static void restore_stk_args(struct jit_ctx *ctx, int nr_stk_args, int args_off,
 	}
 }
 
-static int invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
+static int invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_node *n,
 			   int args_off, int retval_off, int run_ctx_off, bool save_ret)
 {
 	int ret;
 	u32 *branch;
-	struct bpf_prog *p = l->link.prog;
+	struct bpf_prog *p = n->link->prog;
 	int cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
 
-	if (l->cookie)
+	if (n->cookie)
 		emit_store_stack_imm64(ctx, LOONGARCH_GPR_T1,
-				      -run_ctx_off + cookie_off, l->cookie);
+				      -run_ctx_off + cookie_off, n->cookie);
 	else
 		emit_insn(ctx, std, LOONGARCH_GPR_ZERO, LOONGARCH_GPR_FP, -run_ctx_off + cookie_off);
 
@@ -1737,22 +1737,22 @@ static int invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l,
 	return ret;
 }
 
-static int invoke_bpf(struct jit_ctx *ctx, struct bpf_tramp_links *tl,
+static int invoke_bpf(struct jit_ctx *ctx, struct bpf_tramp_nodes *tn,
 		      int args_off, int retval_off, int run_ctx_off,
 		      int func_meta_off, bool save_ret, u64 func_meta, int cookie_off)
 {
 	int i, cur_cookie = (cookie_off - args_off) / 8;
 
-	for (i = 0; i < tl->nr_links; i++) {
+	for (i = 0; i < tn->nr_nodes; i++) {
 		int err;
 
-		if (bpf_prog_calls_session_cookie(tl->links[i])) {
+		if (bpf_prog_calls_session_cookie(tn->nodes[i])) {
 			u64 meta = func_meta | ((u64)cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT);
 
 			emit_store_stack_imm64(ctx, LOONGARCH_GPR_T1, -func_meta_off, meta);
 			cur_cookie--;
 		}
-		err = invoke_bpf_prog(ctx, tl->links[i], args_off, retval_off, run_ctx_off, save_ret);
+		err = invoke_bpf_prog(ctx, tn->nodes[i], args_off, retval_off, run_ctx_off, save_ret);
 		if (err)
 			return err;
 	}
@@ -1807,7 +1807,7 @@ static void sign_extend(struct jit_ctx *ctx, int rd, int rj, u8 size, bool sign)
 }
 
 static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
-					 const struct btf_func_model *m, struct bpf_tramp_links *tlinks,
+					 const struct btf_func_model *m, struct bpf_tramp_nodes *tnodes,
 					 void *func_addr, u32 flags)
 {
 	int i, ret, save_ret;
@@ -1817,9 +1817,9 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i
 	unsigned long long func_meta;
 	bool is_struct_ops = flags & BPF_TRAMP_F_INDIRECT;
 	void *orig_call = func_addr;
-	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
-	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
-	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
+	struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY];
+	struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT];
+	struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN];
 	u32 **branches = NULL;
 
 	/*
@@ -1898,7 +1898,7 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i
 		ip_off = stack_size;
 	}
 
-	cookie_cnt = bpf_fsession_cookie_cnt(tlinks);
+	cookie_cnt = bpf_fsession_cookie_cnt(tnodes);
 
 	/* Room for session cookies */
 	stack_size += cookie_cnt * 8;
@@ -1969,7 +1969,7 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i
 
 	store_args(ctx, nr_arg_slots, args_off);
 
-	if (bpf_fsession_cnt(tlinks)) {
+	if (bpf_fsession_cnt(tnodes)) {
 		/* clear all session cookies' value */
 		for (i = 0; i < cookie_cnt; i++)
 			emit_insn(ctx, std, LOONGARCH_GPR_ZERO, LOONGARCH_GPR_FP, -cookie_off + 8 * i);
@@ -1994,20 +1994,20 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i
 			return ret;
 	}
 
-	if (fentry->nr_links) {
+	if (fentry->nr_nodes) {
 		ret = invoke_bpf(ctx, fentry, args_off, retval_off, run_ctx_off, func_meta_off,
 				 flags & BPF_TRAMP_F_RET_FENTRY_RET, func_meta, cookie_off);
 		if (ret)
 			return ret;
 	}
-	if (fmod_ret->nr_links) {
-		branches  = kcalloc(fmod_ret->nr_links, sizeof(u32 *), GFP_KERNEL);
+	if (fmod_ret->nr_nodes) {
+		branches  = kcalloc(fmod_ret->nr_nodes, sizeof(u32 *), GFP_KERNEL);
 		if (!branches)
 			return -ENOMEM;
 
 		emit_insn(ctx, std, LOONGARCH_GPR_ZERO, LOONGARCH_GPR_FP, -retval_off);
-		for (i = 0; i < fmod_ret->nr_links; i++) {
-			ret = invoke_bpf_prog(ctx, fmod_ret->links[i],
+		for (i = 0; i < fmod_ret->nr_nodes; i++) {
+			ret = invoke_bpf_prog(ctx, fmod_ret->nodes[i],
 					      args_off, retval_off, run_ctx_off, true);
 			if (ret)
 				goto out;
@@ -2035,17 +2035,17 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i
 			emit_insn(ctx, nop);
 	}
 
-	for (i = 0; ctx->image && i < fmod_ret->nr_links; i++) {
+	for (i = 0; ctx->image && i < fmod_ret->nr_nodes; i++) {
 		int offset = (void *)(&ctx->image[ctx->idx]) - (void *)branches[i];
 		*branches[i] = larch_insn_gen_bne(LOONGARCH_GPR_T1, LOONGARCH_GPR_ZERO, offset);
 	}
 
 	/* Set "is_return" flag for fsession */
 	func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT);
-	if (bpf_fsession_cnt(tlinks))
+	if (bpf_fsession_cnt(tnodes))
 		emit_store_stack_imm64(ctx, LOONGARCH_GPR_T1, -func_meta_off, func_meta);
 
-	if (fexit->nr_links) {
+	if (fexit->nr_nodes) {
 		ret = invoke_bpf(ctx, fexit, args_off, retval_off, run_ctx_off,
 				 func_meta_off, false, func_meta, cookie_off);
 		if (ret)
@@ -2115,7 +2115,7 @@ out:
 
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
 				void *ro_image_end, const struct btf_func_model *m,
-				u32 flags, struct bpf_tramp_links *tlinks, void *func_addr)
+				u32 flags, struct bpf_tramp_nodes *tnodes, void *func_addr)
 {
 	int ret, size;
 	void *image, *tmp;
@@ -2131,7 +2131,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
 	ctx.idx = 0;
 
 	jit_fill_hole(image, (unsigned int)(ro_image_end - ro_image));
-	ret = __arch_prepare_bpf_trampoline(&ctx, im, m, tlinks, func_addr, flags);
+	ret = __arch_prepare_bpf_trampoline(&ctx, im, m, tnodes, func_addr, flags);
 	if (ret < 0)
 		goto out;
 
@@ -2152,7 +2152,7 @@ out:
 }
 
 int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
-			     struct bpf_tramp_links *tlinks, void *func_addr)
+			     struct bpf_tramp_nodes *tnodes, void *func_addr)
 {
 	int ret;
 	struct jit_ctx ctx;
@@ -2161,7 +2161,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 	ctx.image = NULL;
 	ctx.idx = 0;
 
-	ret = __arch_prepare_bpf_trampoline(&ctx, &im, m, tlinks, func_addr, flags);
+	ret = __arch_prepare_bpf_trampoline(&ctx, &im, m, tnodes, func_addr, flags);
 
 	return ret < 0 ? ret : ret * LOONGARCH_INSN_SIZE;
 }
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 53ab97ad6074..6351a187ca61 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -597,22 +597,22 @@ int arch_protect_bpf_trampoline(void *image, unsigned int size)
 }
 
 static int invoke_bpf_prog(u32 *image, u32 *ro_image, struct codegen_context *ctx,
-			   struct bpf_tramp_link *l, int regs_off, int retval_off,
+			   struct bpf_tramp_node *n, int regs_off, int retval_off,
 			   int run_ctx_off, bool save_ret)
 {
-	struct bpf_prog *p = l->link.prog;
+	struct bpf_prog *p = n->link->prog;
 	ppc_inst_t branch_insn;
 	u32 jmp_idx;
 	int ret = 0;
 
 	/* Save cookie */
 	if (IS_ENABLED(CONFIG_PPC64)) {
-		PPC_LI64(_R3, l->cookie);
+		PPC_LI64(_R3, n->cookie);
 		EMIT(PPC_RAW_STD(_R3, _R1, run_ctx_off + offsetof(struct bpf_tramp_run_ctx,
 				 bpf_cookie)));
 	} else {
-		PPC_LI32(_R3, l->cookie >> 32);
-		PPC_LI32(_R4, l->cookie);
+		PPC_LI32(_R3, n->cookie >> 32);
+		PPC_LI32(_R4, n->cookie);
 		EMIT(PPC_RAW_STW(_R3, _R1,
 				 run_ctx_off + offsetof(struct bpf_tramp_run_ctx, bpf_cookie)));
 		EMIT(PPC_RAW_STW(_R4, _R1,
@@ -679,7 +679,7 @@ static int invoke_bpf_prog(u32 *image, u32 *ro_image, struct codegen_context *ct
 }
 
 static int invoke_bpf_mod_ret(u32 *image, u32 *ro_image, struct codegen_context *ctx,
-			      struct bpf_tramp_links *tl, int regs_off, int retval_off,
+			      struct bpf_tramp_nodes *tn, int regs_off, int retval_off,
 			      int run_ctx_off, u32 *branches)
 {
 	int i;
@@ -690,8 +690,8 @@ static int invoke_bpf_mod_ret(u32 *image, u32 *ro_image, struct codegen_context
 	 */
 	EMIT(PPC_RAW_LI(_R3, 0));
 	EMIT(PPC_RAW_STL(_R3, _R1, retval_off));
-	for (i = 0; i < tl->nr_links; i++) {
-		if (invoke_bpf_prog(image, ro_image, ctx, tl->links[i], regs_off, retval_off,
+	for (i = 0; i < tn->nr_nodes; i++) {
+		if (invoke_bpf_prog(image, ro_image, ctx, tn->nodes[i], regs_off, retval_off,
 				    run_ctx_off, true))
 			return -EINVAL;
 
@@ -807,18 +807,18 @@ static void bpf_trampoline_restore_args_stack(u32 *image, struct codegen_context
 static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image,
 					 void *rw_image_end, void *ro_image,
 					 const struct btf_func_model *m, u32 flags,
-					 struct bpf_tramp_links *tlinks,
+					 struct bpf_tramp_nodes *tnodes,
 					 void *func_addr)
 {
 	int regs_off, func_meta_off, ip_off, run_ctx_off, retval_off;
 	int nvr_off, alt_lr_off, r4_off = 0;
-	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
-	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
-	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
+	struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN];
+	struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY];
+	struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT];
 	int i, ret, nr_regs, retaddr_off, bpf_frame_size = 0;
 	struct codegen_context codegen_ctx, *ctx;
 	int cookie_off, cookie_cnt, cookie_ctx_off;
-	int fsession_cnt = bpf_fsession_cnt(tlinks);
+	int fsession_cnt = bpf_fsession_cnt(tnodes);
 	u64 func_meta;
 	u32 *image = (u32 *)rw_image;
 	ppc_inst_t branch_insn;
@@ -893,7 +893,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 
 	/* room for session cookies */
 	cookie_off = bpf_frame_size;
-	cookie_cnt = bpf_fsession_cookie_cnt(tlinks);
+	cookie_cnt = bpf_fsession_cookie_cnt(tnodes);
 	bpf_frame_size += cookie_cnt * 8;
 
 	/* Room for IP address argument */
@@ -1030,21 +1030,21 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 
 	cookie_ctx_off = (regs_off - cookie_off) / 8;
 
-	for (i = 0; i < fentry->nr_links; i++) {
-		if (bpf_prog_calls_session_cookie(fentry->links[i])) {
+	for (i = 0; i < fentry->nr_nodes; i++) {
+		if (bpf_prog_calls_session_cookie(fentry->nodes[i])) {
 			u64 meta = func_meta | (cookie_ctx_off << BPF_TRAMP_COOKIE_INDEX_SHIFT);
 
 			store_func_meta(image, ctx, meta, func_meta_off);
 			cookie_ctx_off--;
 		}
 
-		if (invoke_bpf_prog(image, ro_image, ctx, fentry->links[i], regs_off, retval_off,
+		if (invoke_bpf_prog(image, ro_image, ctx, fentry->nodes[i], regs_off, retval_off,
 				    run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET))
 			return -EINVAL;
 	}
 
-	if (fmod_ret->nr_links) {
-		branches = kcalloc(fmod_ret->nr_links, sizeof(u32), GFP_KERNEL);
+	if (fmod_ret->nr_nodes) {
+		branches = kcalloc(fmod_ret->nr_nodes, sizeof(u32), GFP_KERNEL);
 		if (!branches)
 			return -ENOMEM;
 
@@ -1093,7 +1093,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 	}
 
 	/* Update branches saved in invoke_bpf_mod_ret with address of do_fexit */
-	for (i = 0; i < fmod_ret->nr_links && image; i++) {
+	for (i = 0; i < fmod_ret->nr_nodes && image; i++) {
 		if (create_cond_branch(&branch_insn, &image[branches[i]],
 				       (unsigned long)&image[ctx->idx], COND_NE << 16)) {
 			ret = -EINVAL;
@@ -1110,15 +1110,15 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 
 	cookie_ctx_off = (regs_off - cookie_off) / 8;
 
-	for (i = 0; i < fexit->nr_links; i++) {
-		if (bpf_prog_calls_session_cookie(fexit->links[i])) {
+	for (i = 0; i < fexit->nr_nodes; i++) {
+		if (bpf_prog_calls_session_cookie(fexit->nodes[i])) {
 			u64 meta = func_meta | (cookie_ctx_off << BPF_TRAMP_COOKIE_INDEX_SHIFT);
 
 			store_func_meta(image, ctx, meta, func_meta_off);
 			cookie_ctx_off--;
 		}
 
-		if (invoke_bpf_prog(image, ro_image, ctx, fexit->links[i], regs_off, retval_off,
+		if (invoke_bpf_prog(image, ro_image, ctx, fexit->nodes[i], regs_off, retval_off,
 				    run_ctx_off, false)) {
 			ret = -EINVAL;
 			goto cleanup;
@@ -1185,18 +1185,18 @@ cleanup:
 }
 
 int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
-			     struct bpf_tramp_links *tlinks, void *func_addr)
+			     struct bpf_tramp_nodes *tnodes, void *func_addr)
 {
 	struct bpf_tramp_image im;
 	int ret;
 
-	ret = __arch_prepare_bpf_trampoline(&im, NULL, NULL, NULL, m, flags, tlinks, func_addr);
+	ret = __arch_prepare_bpf_trampoline(&im, NULL, NULL, NULL, m, flags, tnodes, func_addr);
 	return ret;
 }
 
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 				const struct btf_func_model *m, u32 flags,
-				struct bpf_tramp_links *tlinks,
+				struct bpf_tramp_nodes *tnodes,
 				void *func_addr)
 {
 	u32 size = image_end - image;
@@ -1212,7 +1212,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 		return -ENOMEM;
 
 	ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m,
-					    flags, tlinks, func_addr);
+					    flags, tnodes, func_addr);
 	if (ret < 0)
 		goto out;
 
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 2f1109dbf105..c03c1de16b79 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -934,15 +934,15 @@ static void emit_store_stack_imm64(u8 reg, int stack_off, u64 imm64,
 	emit_sd(RV_REG_FP, stack_off, reg, ctx);
 }
 
-static int invoke_bpf_prog(struct bpf_tramp_link *l, int args_off, int retval_off,
+static int invoke_bpf_prog(struct bpf_tramp_node *node, int args_off, int retval_off,
 			   int run_ctx_off, bool save_ret, struct rv_jit_context *ctx)
 {
 	int ret, branch_off;
-	struct bpf_prog *p = l->link.prog;
+	struct bpf_prog *p = node->link->prog;
 	int cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
 
-	if (l->cookie)
-		emit_store_stack_imm64(RV_REG_T1, -run_ctx_off + cookie_off, l->cookie, ctx);
+	if (node->cookie)
+		emit_store_stack_imm64(RV_REG_T1, -run_ctx_off + cookie_off, node->cookie, ctx);
 	else
 		emit_sd(RV_REG_FP, -run_ctx_off + cookie_off, RV_REG_ZERO, ctx);
 
@@ -996,22 +996,22 @@ static int invoke_bpf_prog(struct bpf_tramp_link *l, int args_off, int retval_of
 	return ret;
 }
 
-static int invoke_bpf(struct bpf_tramp_links *tl, int args_off, int retval_off,
+static int invoke_bpf(struct bpf_tramp_nodes *tn, int args_off, int retval_off,
 		      int run_ctx_off, int func_meta_off, bool save_ret, u64 func_meta,
 		      int cookie_off, struct rv_jit_context *ctx)
 {
 	int i, cur_cookie = (cookie_off - args_off) / 8;
 
-	for (i = 0; i < tl->nr_links; i++) {
+	for (i = 0; i < tn->nr_nodes; i++) {
 		int err;
 
-		if (bpf_prog_calls_session_cookie(tl->links[i])) {
+		if (bpf_prog_calls_session_cookie(tn->nodes[i])) {
 			u64 meta = func_meta | ((u64)cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT);
 
 			emit_store_stack_imm64(RV_REG_T1, -func_meta_off, meta, ctx);
 			cur_cookie--;
 		}
-		err = invoke_bpf_prog(tl->links[i], args_off, retval_off, run_ctx_off,
+		err = invoke_bpf_prog(tn->nodes[i], args_off, retval_off, run_ctx_off,
 				      save_ret, ctx);
 		if (err)
 			return err;
@@ -1021,7 +1021,7 @@ static int invoke_bpf(struct bpf_tramp_links *tl, int args_off, int retval_off,
 
 static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 					 const struct btf_func_model *m,
-					 struct bpf_tramp_links *tlinks,
+					 struct bpf_tramp_nodes *tnodes,
 					 void *func_addr, u32 flags,
 					 struct rv_jit_context *ctx)
 {
@@ -1030,9 +1030,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 	int stack_size = 0, nr_arg_slots = 0;
 	int retval_off, args_off, func_meta_off, ip_off, run_ctx_off, sreg_off, stk_arg_off;
 	int cookie_off, cookie_cnt;
-	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
-	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
-	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
+	struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY];
+	struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT];
+	struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN];
 	bool is_struct_ops = flags & BPF_TRAMP_F_INDIRECT;
 	void *orig_call = func_addr;
 	bool save_ret;
@@ -1115,7 +1115,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 		ip_off = stack_size;
 	}
 
-	cookie_cnt = bpf_fsession_cookie_cnt(tlinks);
+	cookie_cnt = bpf_fsession_cookie_cnt(tnodes);
 	/* room for session cookies */
 	stack_size += cookie_cnt * 8;
 	cookie_off = stack_size;
@@ -1172,7 +1172,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 
 	store_args(nr_arg_slots, args_off, ctx);
 
-	if (bpf_fsession_cnt(tlinks)) {
+	if (bpf_fsession_cnt(tnodes)) {
 		/* clear all session cookies' value */
 		for (i = 0; i < cookie_cnt; i++)
 			emit_sd(RV_REG_FP, -cookie_off + 8 * i, RV_REG_ZERO, ctx);
@@ -1187,22 +1187,22 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 			return ret;
 	}
 
-	if (fentry->nr_links) {
+	if (fentry->nr_nodes) {
 		ret = invoke_bpf(fentry, args_off, retval_off, run_ctx_off, func_meta_off,
 				 flags & BPF_TRAMP_F_RET_FENTRY_RET, func_meta, cookie_off, ctx);
 		if (ret)
 			return ret;
 	}
 
-	if (fmod_ret->nr_links) {
-		branches_off = kzalloc_objs(int, fmod_ret->nr_links);
+	if (fmod_ret->nr_nodes) {
+		branches_off = kzalloc_objs(int, fmod_ret->nr_nodes);
 		if (!branches_off)
 			return -ENOMEM;
 
 		/* cleanup to avoid garbage return value confusion */
 		emit_sd(RV_REG_FP, -retval_off, RV_REG_ZERO, ctx);
-		for (i = 0; i < fmod_ret->nr_links; i++) {
-			ret = invoke_bpf_prog(fmod_ret->links[i], args_off, retval_off,
+		for (i = 0; i < fmod_ret->nr_nodes; i++) {
+			ret = invoke_bpf_prog(fmod_ret->nodes[i], args_off, retval_off,
 					      run_ctx_off, true, ctx);
 			if (ret)
 				goto out;
@@ -1230,7 +1230,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 	}
 
 	/* update branches saved in invoke_bpf_mod_ret with bnez */
-	for (i = 0; ctx->insns && i < fmod_ret->nr_links; i++) {
+	for (i = 0; ctx->insns && i < fmod_ret->nr_nodes; i++) {
 		offset = ninsns_rvoff(ctx->ninsns - branches_off[i]);
 		insn = rv_bne(RV_REG_T1, RV_REG_ZERO, offset >> 1);
 		*(u32 *)(ctx->insns + branches_off[i]) = insn;
@@ -1238,10 +1238,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 
 	/* set "is_return" flag for fsession */
 	func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT);
-	if (bpf_fsession_cnt(tlinks))
+	if (bpf_fsession_cnt(tnodes))
 		emit_store_stack_imm64(RV_REG_T1, -func_meta_off, func_meta, ctx);
 
-	if (fexit->nr_links) {
+	if (fexit->nr_nodes) {
 		ret = invoke_bpf(fexit, args_off, retval_off, run_ctx_off, func_meta_off,
 				 false, func_meta, cookie_off, ctx);
 		if (ret)
@@ -1305,7 +1305,7 @@ out:
 }
 
 int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
-			     struct bpf_tramp_links *tlinks, void *func_addr)
+			     struct bpf_tramp_nodes *tnodes, void *func_addr)
 {
 	struct bpf_tramp_image im;
 	struct rv_jit_context ctx;
@@ -1314,7 +1314,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 	ctx.ninsns = 0;
 	ctx.insns = NULL;
 	ctx.ro_insns = NULL;
-	ret = __arch_prepare_bpf_trampoline(&im, m, tlinks, func_addr, flags, &ctx);
+	ret = __arch_prepare_bpf_trampoline(&im, m, tnodes, func_addr, flags, &ctx);
 
 	return ret < 0 ? ret : ninsns_rvoff(ctx.ninsns);
 }
@@ -1331,7 +1331,7 @@ void arch_free_bpf_trampoline(void *image, unsigned int size)
 
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
 				void *ro_image_end, const struct btf_func_model *m,
-				u32 flags, struct bpf_tramp_links *tlinks,
+				u32 flags, struct bpf_tramp_nodes *tnodes,
 				void *func_addr)
 {
 	int ret;
@@ -1346,7 +1346,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
 	ctx.ninsns = 0;
 	ctx.insns = image;
 	ctx.ro_insns = ro_image;
-	ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx);
+	ret = __arch_prepare_bpf_trampoline(im, m, tnodes, func_addr, flags, &ctx);
 	if (ret < 0)
 		goto out;
 
@@ -1808,6 +1808,13 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 			break;
 		}
 
+		/* Implement helper call to bpf_get_current_task/_btf() inline */
+		if (insn->src_reg == 0 && (insn->imm == BPF_FUNC_get_current_task ||
+					   insn->imm == BPF_FUNC_get_current_task_btf)) {
+			emit_mv(bpf_to_rv_reg(BPF_REG_0, ctx), RV_REG_TP, ctx);
+			break;
+		}
+
 		mark_call(ctx);
 		ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass,
 					    &addr, &fixed_addr);
@@ -2138,6 +2145,8 @@ bool bpf_jit_inlines_helper_call(s32 imm)
 {
 	switch (imm) {
 	case BPF_FUNC_get_smp_processor_id:
+	case BPF_FUNC_get_current_task:
+	case BPF_FUNC_get_current_task_btf:
 		return true;
 	default:
 		return false;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 94128fe6be23..31749c0362ca 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -27,6 +27,7 @@
 #include <asm/extable.h>
 #include <asm/dis.h>
 #include <asm/facility.h>
+#include <asm/lowcore.h>
 #include <asm/nospec-branch.h>
 #include <asm/set_memory.h>
 #include <asm/text-patching.h>
@@ -1777,6 +1778,30 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
 		int j, ret;
 		u64 func;
 
+		/* Inline bpf_get_smp_processor_id(): load lowcore cpu_nr into R0 */
+		if (insn->src_reg == 0 &&
+		    insn->imm == BPF_FUNC_get_smp_processor_id) {
+			const u32 *cpu_nr = &get_lowcore()->cpu_nr;
+
+			/* llgf %b0, cpu_nr - zero-extends; ly would keep stale high bits */
+			EMIT6_DISP_LH(0xe3000000, 0x0016, BPF_REG_0, REG_0, REG_0,
+				      (unsigned long)cpu_nr);
+			break;
+		}
+
+		/* Implement helper call to bpf_get_current_task/_btf() inline */
+		if (insn->src_reg == 0 &&
+		    (insn->imm == BPF_FUNC_get_current_task ||
+		     insn->imm == BPF_FUNC_get_current_task_btf)) {
+			const u64 *current_task =
+				&get_lowcore()->current_task;
+
+			/* lg %b0, current_task */
+			EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_0, REG_0, REG_0,
+				      (unsigned long)current_task);
+			break;
+		}
+
 		ret = bpf_jit_get_func_addr(fp, insn, extra_pass,
 					    &func, &func_addr_fixed);
 		if (ret < 0)
@@ -2512,19 +2537,19 @@ static void emit_store_stack_imm64(struct bpf_jit *jit, int tmp_reg, int stack_o
 
 static int invoke_bpf_prog(struct bpf_tramp_jit *tjit,
 			   const struct btf_func_model *m,
-			   struct bpf_tramp_link *tlink, bool save_ret)
+			   struct bpf_tramp_node *node, bool save_ret)
 {
 	struct bpf_jit *jit = &tjit->common;
 	int cookie_off = tjit->run_ctx_off +
 			 offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
-	struct bpf_prog *p = tlink->link.prog;
+	struct bpf_prog *p = node->link->prog;
 	int patch;
 
 	/*
-	 * run_ctx.cookie = tlink->cookie;
+	 * run_ctx.cookie = node->cookie;
 	 */
 
-	emit_store_stack_imm64(jit, REG_W0, cookie_off, tlink->cookie);
+	emit_store_stack_imm64(jit, REG_W0, cookie_off, node->cookie);
 
 	/*
 	 * if ((start = __bpf_prog_enter(p, &run_ctx)) == 0)
@@ -2584,20 +2609,20 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit,
 
 static int invoke_bpf(struct bpf_tramp_jit *tjit,
 		      const struct btf_func_model *m,
-		      struct bpf_tramp_links *tl, bool save_ret,
+		      struct bpf_tramp_nodes *tn, bool save_ret,
 		      u64 func_meta, int cookie_off)
 {
 	int i, cur_cookie = (tjit->bpf_args_off - cookie_off) / sizeof(u64);
 	struct bpf_jit *jit = &tjit->common;
 
-	for (i = 0; i < tl->nr_links; i++) {
-		if (bpf_prog_calls_session_cookie(tl->links[i])) {
+	for (i = 0; i < tn->nr_nodes; i++) {
+		if (bpf_prog_calls_session_cookie(tn->nodes[i])) {
 			u64 meta = func_meta | ((u64)cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT);
 
 			emit_store_stack_imm64(jit, REG_0, tjit->func_meta_off, meta);
 			cur_cookie--;
 		}
-		if (invoke_bpf_prog(tjit, m, tl->links[i], save_ret))
+		if (invoke_bpf_prog(tjit, m, tn->nodes[i], save_ret))
 			return -EINVAL;
 	}
 
@@ -2626,12 +2651,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 					 struct bpf_tramp_jit *tjit,
 					 const struct btf_func_model *m,
 					 u32 flags,
-					 struct bpf_tramp_links *tlinks,
+					 struct bpf_tramp_nodes *tnodes,
 					 void *func_addr)
 {
-	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
-	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
-	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
+	struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN];
+	struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY];
+	struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT];
 	int nr_bpf_args, nr_reg_args, nr_stack_args;
 	int cookie_cnt, cookie_off, fsession_cnt;
 	struct bpf_jit *jit = &tjit->common;
@@ -2668,8 +2693,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 			return -ENOTSUPP;
 	}
 
-	cookie_cnt = bpf_fsession_cookie_cnt(tlinks);
-	fsession_cnt = bpf_fsession_cnt(tlinks);
+	cookie_cnt = bpf_fsession_cookie_cnt(tnodes);
+	fsession_cnt = bpf_fsession_cnt(tnodes);
 
 	/*
 	 * Calculate the stack layout.
@@ -2804,7 +2829,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 		       func_meta, cookie_off))
 		return -EINVAL;
 
-	if (fmod_ret->nr_links) {
+	if (fmod_ret->nr_nodes) {
 		/*
 		 * retval = 0;
 		 */
@@ -2813,8 +2838,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 		_EMIT6(0xd707f000 | tjit->retval_off,
 		       0xf000 | tjit->retval_off);
 
-		for (i = 0; i < fmod_ret->nr_links; i++) {
-			if (invoke_bpf_prog(tjit, m, fmod_ret->links[i], true))
+		for (i = 0; i < fmod_ret->nr_nodes; i++) {
+			if (invoke_bpf_prog(tjit, m, fmod_ret->nodes[i], true))
 				return -EINVAL;
 
 			/*
@@ -2939,7 +2964,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 }
 
 int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
-			     struct bpf_tramp_links *tlinks, void *orig_call)
+			     struct bpf_tramp_nodes *tnodes, void *orig_call)
 {
 	struct bpf_tramp_image im;
 	struct bpf_tramp_jit tjit;
@@ -2948,14 +2973,14 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 	memset(&tjit, 0, sizeof(tjit));
 
 	ret = __arch_prepare_bpf_trampoline(&im, &tjit, m, flags,
-					    tlinks, orig_call);
+					    tnodes, orig_call);
 
 	return ret < 0 ? ret : tjit.common.prg;
 }
 
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 				void *image_end, const struct btf_func_model *m,
-				u32 flags, struct bpf_tramp_links *tlinks,
+				u32 flags, struct bpf_tramp_nodes *tnodes,
 				void *func_addr)
 {
 	struct bpf_tramp_jit tjit;
@@ -2964,7 +2989,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	/* Compute offsets, check whether the code fits. */
 	memset(&tjit, 0, sizeof(tjit));
 	ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags,
-					    tlinks, func_addr);
+					    tnodes, func_addr);
 
 	if (ret < 0)
 		return ret;
@@ -2978,7 +3003,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	tjit.common.prg = 0;
 	tjit.common.prg_buf = image;
 	ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags,
-					    tlinks, func_addr);
+					    tnodes, func_addr);
 
 	return ret < 0 ? ret : tjit.common.prg;
 }
@@ -3057,3 +3082,15 @@ bool bpf_jit_supports_timed_may_goto(void)
 {
 	return true;
 }
+
+/*
+ * Report whether the helper with ID @imm is expanded inline by this JIT
+ * rather than emitted as a call. That holds for bpf_get_smp_processor_id()
+ * and bpf_get_current_task/_btf(); every other ID yields false.
+ */
+bool bpf_jit_inlines_helper_call(s32 imm)
+{
+	return imm == BPF_FUNC_get_smp_processor_id ||
+	       imm == BPF_FUNC_get_current_task ||
+	       imm == BPF_FUNC_get_current_task_btf;
+}
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2187e9cfcefa..ac295ca6c92f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1284,6 +1284,18 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
 	} while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte));
 }
 
+/*
+ * Set *ptep to new_pte only if it is still all-zero. That compare is narrower than pte_none(),
+ * harmlessly so: _PAGE_DIRTY and _PAGE_ACCESSED aren't set on untouched kernel PTEs.
+ */
+static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
+{
+	pte_t expected = __pte(0);
+
+	return try_cmpxchg((long *)&ptep->pte, (long *)&expected, *(long *)&new_pte);
+}
+#define ptep_try_set ptep_try_set
+
 #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
 
 #define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 250942ef60bb..45b99c3b1442 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,6 +8,7 @@
 #include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
 #include <linux/memblock.h>		/* max_low_pfn			*/
+#include <linux/bpf_defs.h>		/* bpf_arena_handle_page_fault	*/
 #include <linux/kfence.h>		/* kfence_handle_page_fault	*/
 #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
@@ -693,10 +694,13 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 	if (IS_ENABLED(CONFIG_EFI))
 		efi_crash_gracefully_on_page_fault(address, regs);
 
-	/* Only not-present faults should be handled by KFENCE. */
-	if (!(error_code & X86_PF_PROT) &&
-	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
-		return;
+	/* Only not-present faults should be handled by KFENCE or BPF arena. */
+	if (!(error_code & X86_PF_PROT)) {
+		if (kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
+			return;
+		if (bpf_arena_handle_page_fault(address, error_code & X86_PF_WRITE, regs->ip))
+			return;
+	}
 
 oops:
 	/*
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ea9e707e8abf..054e043ffcd2 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -10,6 +10,7 @@
 #include <linux/if_vlan.h>
 #include <linux/bitfield.h>
 #include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
 #include <linux/memory.h>
 #include <linux/sort.h>
 #include <asm/extable.h>
@@ -390,6 +391,34 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
 	*pprog = prog;
 }
 
+/* add rsp, depth (nothing is emitted when depth is 0) */
+static void emit_add_rsp(u8 **pprog, u16 depth)
+{
+	u8 *prog = *pprog;
+
+	if (depth) {
+		if (!is_imm8(depth))
+			EMIT3_off32(0x48, 0x81, 0xC4, depth); /* add rsp, imm32 */
+		else
+			EMIT4(0x48, 0x83, 0xC4, depth); /* add rsp, imm8 */
+		*pprog = prog;
+	}
+}
+
+/* sub rsp, depth (nothing is emitted when depth is 0) */
+static void emit_sub_rsp(u8 **pprog, u16 depth)
+{
+	u8 *prog = *pprog;
+
+	if (depth) {
+		if (!is_imm8(depth))
+			EMIT3_off32(0x48, 0x81, 0xEC, depth); /* sub rsp, imm32 */
+		else
+			EMIT4(0x48, 0x83, 0xEC, depth); /* sub rsp, imm8 */
+		*pprog = prog;
+	}
+}
+
 static void emit_nops(u8 **pprog, int len)
 {
 	u8 *prog = *pprog;
@@ -1659,21 +1688,47 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int *
 	bool seen_exit = false;
 	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
 	void __percpu *priv_frame_ptr = NULL;
+	u16 out_stack_arg_cnt, outgoing_rsp;
 	u64 arena_vm_start, user_vm_start;
 	void __percpu *priv_stack_ptr;
 	int i, excnt = 0;
 	int ilen, proglen = 0;
 	u8 *ip, *prog = temp;
 	u32 stack_depth;
+	int callee_saved_size;
+	s32 outgoing_arg_base;
 	int err;
 
 	stack_depth = bpf_prog->aux->stack_depth;
+	out_stack_arg_cnt = bpf_out_stack_arg_cnt(env, bpf_prog);
 	priv_stack_ptr = bpf_prog->aux->priv_stack_ptr;
 	if (priv_stack_ptr) {
 		priv_frame_ptr = priv_stack_ptr + PRIV_STACK_GUARD_SZ + round_up(stack_depth, 8);
 		stack_depth = 0;
 	}
 
+	/*
+	 * Follow x86-64 calling convention for both BPF-to-BPF and
+	 * kfunc calls:
+	 *   - Arg 6 is passed in R9 register
+	 *   - Args 7+ are passed on the stack at [rsp]
+	 *
+	 * Incoming arg 6 is read from R9 (BPF r11+8 → MOV from R9).
+	 * Incoming args 7+ are read from [rbp + 16], [rbp + 24], ...
+	 * (BPF r11+16, r11+24, ... map directly with no offset change).
+	 *
+	 * tail_call_reachable is rejected by the verifier and priv_stack
+	 * is disabled by the JIT when stack args exist, so R9 is always
+	 * available.
+	 *
+	 * Stack layout (high to low):
+	 *   [rbp + 16 + ...]    incoming stack args 7+ (from caller)
+	 *   [rbp + 8]           return address
+	 *   [rbp]               saved rbp
+	 *   [rbp - prog_stack]  program stack
+	 *   [below]             callee-saved regs
+	 *   [below]             outgoing args 7+ (= rsp)
+	 */
 	arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
 	user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
 
@@ -1700,6 +1755,44 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int *
 			push_r12(&prog);
 		push_callee_regs(&prog, callee_regs_used);
 	}
+
+	/* Compute callee-saved register area size. */
+	callee_saved_size = 0;
+	if (bpf_prog->aux->exception_boundary || arena_vm_start)
+		callee_saved_size += 8; /* r12 */
+	if (bpf_prog->aux->exception_boundary) {
+		callee_saved_size += 4 * 8; /* rbx, r13, r14, r15 */
+	} else {
+		int j;
+
+		for (j = 0; j < 4; j++)
+			if (callee_regs_used[j])
+				callee_saved_size += 8;
+	}
+	/*
+	 * Base offset from rbp for translating BPF outgoing args 7+
+	 * to native offsets. BPF uses negative offsets from r11
+	 * (r11-8 for arg6, r11-16 for arg7, ...) while x86 uses
+	 * positive offsets from rsp ([rsp+0] for arg7, [rsp+8] for
+	 * arg8, ...). Arg 6 goes to R9 directly.
+	 *
+	 * The translation reverses direction:
+	 *   native_off = outgoing_arg_base - outgoing_rsp - bpf_off - 16
+	 *
+	 * Note that tail_call_reachable is guaranteed to be false when
+	 * stack args exist, so tcc pushes need not be accounted for.
+	 */
+	outgoing_arg_base = -(round_up(stack_depth, 8) + callee_saved_size);
+
+	/*
+	 * Allocate outgoing stack arg area for args 7+ only.
+	 * Arg 6 goes into r9 register, not on stack.
+	 */
+	outgoing_rsp = out_stack_arg_cnt > 1 ? (out_stack_arg_cnt - 1) * 8 : 0;
+	if (bpf_prog->aux->exception_boundary)
+		bpf_prog->aux->stack_arg_sp_adjust = outgoing_rsp;
+	emit_sub_rsp(&prog, outgoing_rsp);
+
 	if (arena_vm_start)
 		emit_mov_imm64(&prog, X86_REG_R12,
 			       arena_vm_start >> 32, (u32) arena_vm_start);
@@ -1721,7 +1814,7 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int *
 		u8 b2 = 0, b3 = 0;
 		u8 *start_of_ldx;
 		s64 jmp_offset;
-		s16 insn_off;
+		s32 insn_off;
 		u8 jmp_cond;
 		u8 *func;
 		int nops;
@@ -2134,12 +2227,27 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int *
 				EMIT1(0xC7);
 			goto st;
 		case BPF_ST | BPF_MEM | BPF_DW:
+			if (dst_reg == BPF_REG_PARAMS && insn->off == -8) {
+				/* Arg 6: store immediate in r9 register */
+				emit_mov_imm64(&prog, X86_REG_R9, imm32 >> 31, (u32)imm32);
+				break;
+			}
 			EMIT2(add_1mod(0x48, dst_reg), 0xC7);
 
-st:			if (is_imm8(insn->off))
-				EMIT2(add_1reg(0x40, dst_reg), insn->off);
+st:			insn_off = insn->off;
+			if (dst_reg == BPF_REG_PARAMS) {
+				/*
+				 * Args 7+: reverse BPF negative offsets to
+				 * x86 positive rsp offsets.
+				 * BPF off=-16 → [rsp+0], off=-24 → [rsp+8], ...
+				 */
+				insn_off = outgoing_arg_base - outgoing_rsp - insn_off - 16;
+				dst_reg = BPF_REG_FP;
+			}
+			if (is_imm8(insn_off))
+				EMIT2(add_1reg(0x40, dst_reg), insn_off);
 			else
-				EMIT1_off32(add_1reg(0x80, dst_reg), insn->off);
+				EMIT1_off32(add_1reg(0x80, dst_reg), insn_off);
 
 			EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(insn->code)));
 			break;
@@ -2149,7 +2257,17 @@ st:			if (is_imm8(insn->off))
 		case BPF_STX | BPF_MEM | BPF_H:
 		case BPF_STX | BPF_MEM | BPF_W:
 		case BPF_STX | BPF_MEM | BPF_DW:
-			emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
+			if (dst_reg == BPF_REG_PARAMS && insn->off == -8) {
+				/* Arg 6: store register value in r9 */
+				EMIT_mov(X86_REG_R9, src_reg);
+				break;
+			}
+			insn_off = insn->off;
+			if (dst_reg == BPF_REG_PARAMS) {
+				insn_off = outgoing_arg_base - outgoing_rsp - insn_off - 16;
+				dst_reg = BPF_REG_FP;
+			}
+			emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
 			break;
 
 		case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
@@ -2248,6 +2366,19 @@ populate_extable:
 		case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
 		case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
 			insn_off = insn->off;
+			if (src_reg == BPF_REG_PARAMS) {
+				if (insn_off == 8) {
+					/* Incoming arg 6: read from r9 */
+					EMIT_mov(dst_reg, X86_REG_R9);
+					break;
+				}
+				src_reg = BPF_REG_FP;
+				/*
+				 * Incoming args 7+: native_off == bpf_off
+				 * (r11+16 → [rbp+16], r11+24 → [rbp+24], ...)
+				 * No offset adjustment needed.
+				 */
+			}
 
 			if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
 			    BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
@@ -2736,6 +2867,8 @@ emit_jmp:
 				if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog))
 					return -EINVAL;
 			}
+			/* Deallocate outgoing args 7+ area. */
+			emit_add_rsp(&prog, outgoing_rsp);
 			if (bpf_prog->aux->exception_boundary) {
 				pop_callee_regs(&prog, all_callee_regs_used);
 				pop_r12(&prog);
@@ -2971,15 +3104,15 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog,
 }
 
 static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
-			   struct bpf_tramp_link *l, int stack_size,
+			   struct bpf_tramp_node *node, int stack_size,
 			   int run_ctx_off, bool save_ret,
 			   void *image, void *rw_image)
 {
 	u8 *prog = *pprog;
 	u8 *jmp_insn;
 	int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
-	struct bpf_prog *p = l->link.prog;
-	u64 cookie = l->cookie;
+	struct bpf_prog *p = node->link->prog;
+	u64 cookie = node->cookie;
 
 	/* mov rdi, cookie */
 	emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie);
@@ -3086,7 +3219,7 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
 }
 
 static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
-		      struct bpf_tramp_links *tl, int stack_size,
+		      struct bpf_tramp_nodes *tl, int stack_size,
 		      int run_ctx_off, int func_meta_off, bool save_ret,
 		      void *image, void *rw_image, u64 func_meta,
 		      int cookie_off)
@@ -3094,13 +3227,13 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
 	int i, cur_cookie = (cookie_off - stack_size) / 8;
 	u8 *prog = *pprog;
 
-	for (i = 0; i < tl->nr_links; i++) {
-		if (tl->links[i]->link.prog->call_session_cookie) {
+	for (i = 0; i < tl->nr_nodes; i++) {
+		if (tl->nodes[i]->link->prog->call_session_cookie) {
 			emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off,
 				func_meta | (cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT));
 			cur_cookie--;
 		}
-		if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size,
+		if (invoke_bpf_prog(m, &prog, tl->nodes[i], stack_size,
 				    run_ctx_off, save_ret, image, rw_image))
 			return -EINVAL;
 	}
@@ -3109,7 +3242,7 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
 }
 
 static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
-			      struct bpf_tramp_links *tl, int stack_size,
+			      struct bpf_tramp_nodes *tl, int stack_size,
 			      int run_ctx_off, u8 **branches,
 			      void *image, void *rw_image)
 {
@@ -3121,8 +3254,8 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
 	 */
 	emit_mov_imm32(&prog, false, BPF_REG_0, 0);
 	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
-	for (i = 0; i < tl->nr_links; i++) {
-		if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true,
+	for (i = 0; i < tl->nr_nodes; i++) {
+		if (invoke_bpf_prog(m, &prog, tl->nodes[i], stack_size, run_ctx_off, true,
 				    image, rw_image))
 			return -EINVAL;
 
@@ -3213,14 +3346,14 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
 static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image,
 					 void *rw_image_end, void *image,
 					 const struct btf_func_model *m, u32 flags,
-					 struct bpf_tramp_links *tlinks,
+					 struct bpf_tramp_nodes *tnodes,
 					 void *func_addr)
 {
 	int i, ret, nr_regs = m->nr_args, stack_size = 0;
 	int regs_off, func_meta_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
-	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
-	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
-	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
+	struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY];
+	struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT];
+	struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN];
 	void *orig_call = func_addr;
 	int cookie_off, cookie_cnt;
 	u8 **branches = NULL;
@@ -3292,7 +3425,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 
 	ip_off = stack_size;
 
-	cookie_cnt = bpf_fsession_cookie_cnt(tlinks);
+	cookie_cnt = bpf_fsession_cookie_cnt(tnodes);
 	/* room for session cookies */
 	stack_size += cookie_cnt * 8;
 	cookie_off = stack_size;
@@ -3385,7 +3518,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		}
 	}
 
-	if (bpf_fsession_cnt(tlinks)) {
+	if (bpf_fsession_cnt(tnodes)) {
 		/* clear all the session cookies' value */
 		for (int i = 0; i < cookie_cnt; i++)
 			emit_store_stack_imm64(&prog, BPF_REG_0, -cookie_off + 8 * i, 0);
@@ -3393,15 +3526,15 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		emit_store_stack_imm64(&prog, BPF_REG_0, -8, 0);
 	}
 
-	if (fentry->nr_links) {
+	if (fentry->nr_nodes) {
 		if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, func_meta_off,
 			       flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image,
 			       func_meta, cookie_off))
 			return -EINVAL;
 	}
 
-	if (fmod_ret->nr_links) {
-		branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
+	if (fmod_ret->nr_nodes) {
+		branches = kcalloc(fmod_ret->nr_nodes, sizeof(u8 *),
 				   GFP_KERNEL);
 		if (!branches)
 			return -ENOMEM;
@@ -3440,7 +3573,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		emit_nops(&prog, X86_PATCH_SIZE);
 	}
 
-	if (fmod_ret->nr_links) {
+	if (fmod_ret->nr_nodes) {
 		/* From Intel 64 and IA-32 Architectures Optimization
 		 * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler
 		 * Coding Rule 11: All branch targets should be 16-byte
@@ -3450,7 +3583,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		/* Update the branches saved in invoke_bpf_mod_ret with the
 		 * aligned address of do_fexit.
 		 */
-		for (i = 0; i < fmod_ret->nr_links; i++) {
+		for (i = 0; i < fmod_ret->nr_nodes; i++) {
 			emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image),
 					    image + (branches[i] - (u8 *)rw_image), X86_JNE);
 		}
@@ -3458,10 +3591,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 
 	/* set the "is_return" flag for fsession */
 	func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT);
-	if (bpf_fsession_cnt(tlinks))
+	if (bpf_fsession_cnt(tnodes))
 		emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, func_meta);
 
-	if (fexit->nr_links) {
+	if (fexit->nr_nodes) {
 		if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, func_meta_off,
 			       false, image, rw_image, func_meta, cookie_off)) {
 			ret = -EINVAL;
@@ -3535,7 +3668,7 @@ int arch_protect_bpf_trampoline(void *image, unsigned int size)
 
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 				const struct btf_func_model *m, u32 flags,
-				struct bpf_tramp_links *tlinks,
+				struct bpf_tramp_nodes *tnodes,
 				void *func_addr)
 {
 	void *rw_image, *tmp;
@@ -3550,7 +3683,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 		return -ENOMEM;
 
 	ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m,
-					    flags, tlinks, func_addr);
+					    flags, tnodes, func_addr);
 	if (ret < 0)
 		goto out;
 
@@ -3563,7 +3696,7 @@ out:
 }
 
 int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
-			     struct bpf_tramp_links *tlinks, void *func_addr)
+			     struct bpf_tramp_nodes *tnodes, void *func_addr)
 {
 	struct bpf_tramp_image im;
 	void *image;
@@ -3581,7 +3714,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 		return -ENOMEM;
 
 	ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image,
-					    m, flags, tlinks, func_addr);
+					    m, flags, tnodes, func_addr);
 	bpf_jit_free_exec(image);
 	return ret;
 }
@@ -3793,7 +3926,8 @@ skip_init_addrs:
 	for (pass = 0; pass < MAX_PASSES || image; pass++) {
 		if (!padding && pass >= PADDING_PASSES)
 			padding = true;
-		proglen = do_jit(env, prog, addrs, image, rw_image, oldproglen, &ctx, padding);
+		proglen = do_jit(env, prog, addrs, image, rw_image, oldproglen,
+				 &ctx, padding);
 		if (proglen <= 0) {
 out_image:
 			image = NULL;
@@ -3910,6 +4044,11 @@ bool bpf_jit_supports_kfunc_call(void)
 	return true;
 }
 
+bool bpf_jit_supports_stack_args(void)
+{
+	return true;
+}
+
 void *bpf_arch_text_copy(void *dst, void *src, size_t len)
 {
 	if (text_poke_copy(dst, src, len) == NULL)
diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h
index 922ed457db19..8823aceeac8b 100644
--- a/drivers/base/power/power.h
+++ b/drivers/base/power/power.h
@@ -168,3 +168,10 @@ static inline void device_pm_init(struct device *dev)
 	device_pm_sleep_init(dev);
 	pm_runtime_init(dev);
 }
+
+#ifdef CONFIG_BPF_SYSCALL
+struct bpf_ws_lock { };
+struct bpf_ws_lock *bpf_wakeup_sources_read_lock(void);
+void bpf_wakeup_sources_read_unlock(struct bpf_ws_lock *lock);
+void *bpf_wakeup_sources_get_head(void);
+#endif
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index b8e48a023bf0..80b497de2deb 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -1168,11 +1168,78 @@ static const struct file_operations wakeup_sources_stats_fops = {
 	.release = seq_release_private,
 };
 
-static int __init wakeup_sources_debugfs_init(void)
+#ifdef CONFIG_BPF_SYSCALL
+#include <linux/btf.h>
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_wakeup_sources_read_lock - Acquire the SRCU lock for wakeup sources
+ *
+ * The underlying SRCU lock returns an integer index. However, the BPF verifier
+ * requires a pointer (PTR_TO_BTF_ID) to strictly track the state of acquired
+ * resources using KF_ACQUIRE and KF_RELEASE semantics. We use an opaque
+ * structure pointer (struct bpf_ws_lock *) to satisfy the verifier while
+ * safely encoding the integer index within the pointer address itself.
+ *
+ * Return: An opaque pointer encoding the SRCU lock index + 1 (to avoid NULL).
+ */
+__bpf_kfunc struct bpf_ws_lock *bpf_wakeup_sources_read_lock(void)
+{
+	return (struct bpf_ws_lock *)(long)(wakeup_sources_read_lock() + 1);
+}
+
+/**
+ * bpf_wakeup_sources_read_unlock - Release the SRCU lock for wakeup sources
+ * @lock: The opaque pointer returned by bpf_wakeup_sources_read_lock()
+ *
+ * The BPF verifier guarantees that @lock is a valid, unreleased pointer from
+ * the acquire function. We decode the pointer back into the integer SRCU index
+ * by subtracting 1 and release the lock.
+ */
+__bpf_kfunc void bpf_wakeup_sources_read_unlock(struct bpf_ws_lock *lock)
+{
+	wakeup_sources_read_unlock((int)(long)lock - 1);
+}
+
+/**
+ * bpf_wakeup_sources_get_head - Get the head of the wakeup sources list
+ *
+ * Return: The head of the wakeup sources list.
+ */
+__bpf_kfunc void *bpf_wakeup_sources_get_head(void)
+{
+	return &wakeup_sources;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(wakeup_source_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_wakeup_sources_read_lock, KF_ACQUIRE)
+BTF_ID_FLAGS(func, bpf_wakeup_sources_read_unlock, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_wakeup_sources_get_head)
+BTF_KFUNCS_END(wakeup_source_kfunc_ids)
+
+static const struct btf_kfunc_id_set wakeup_source_kfunc_set = {
+	.set   = &wakeup_source_kfunc_ids,
+};
+
+static void __init wakeup_sources_bpf_init(void)
+{
+	if (register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &wakeup_source_kfunc_set))
+		pm_pr_dbg("Wakeup: failed to register BTF kfuncs\n");
+}
+#else
+static inline void wakeup_sources_bpf_init(void) {}
+#endif /* CONFIG_BPF_SYSCALL */
+
+static int __init wakeup_sources_init(void)
 {
 	debugfs_create_file("wakeup_sources", 0444, NULL, NULL,
 			    &wakeup_sources_stats_fops);
+	wakeup_sources_bpf_init();
+
 	return 0;
 }
 
-postcore_initcall(wakeup_sources_debugfs_init);
+postcore_initcall(wakeup_sources_init);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 70368fe7c510..1caa87da72b5 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -561,10 +561,10 @@ nfp_bpf_check_alu(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	const struct bpf_reg_state *dreg =
 		cur_regs(env) + meta->insn.dst_reg;
 
-	meta->umin_src = min(meta->umin_src, sreg->umin_value);
-	meta->umax_src = max(meta->umax_src, sreg->umax_value);
-	meta->umin_dst = min(meta->umin_dst, dreg->umin_value);
-	meta->umax_dst = max(meta->umax_dst, dreg->umax_value);
+	meta->umin_src = min(meta->umin_src, reg_umin(sreg));
+	meta->umax_src = max(meta->umax_src, reg_umax(sreg));
+	meta->umin_dst = min(meta->umin_dst, reg_umin(dreg));
+	meta->umax_dst = max(meta->umax_dst, reg_umax(dreg));
 
 	/* NFP supports u16 and u32 multiplication.
 	 *
diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c
index 11841c3d4260..768aca2dc0f0 100644
--- a/fs/bpf_fs_kfuncs.c
+++ b/fs/bpf_fs_kfuncs.c
@@ -200,7 +200,7 @@ int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
 				const struct bpf_dynptr *value_p, int flags)
 {
 
-	struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
+	const struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
 	struct inode *inode = d_inode(dentry);
 	const void *value;
 	u32 value_len;
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index 818083507885..cfe2d5e535f9 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -118,9 +118,9 @@ __bpf_kfunc_start_defs();
  *
  * Return: 0 on success, a negative value on error.
  */
-__bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *digest_p)
+__bpf_kfunc int bpf_get_fsverity_digest(struct file *file, const struct bpf_dynptr *digest_p)
 {
-	struct bpf_dynptr_kern *digest_ptr = (struct bpf_dynptr_kern *)digest_p;
+	const struct bpf_dynptr_kern *digest_ptr = (struct bpf_dynptr_kern *)digest_p;
 	const struct inode *inode = file_inode(file);
 	u32 dynptr_sz = __bpf_dynptr_size(digest_ptr);
 	struct fsverity_digest *arg;
diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h
index 151d267a496b..4d46643f46cb 100644
--- a/include/asm-generic/rqspinlock.h
+++ b/include/asm-generic/rqspinlock.h
@@ -243,12 +243,20 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock)
 	({                                        \
 		int __ret;                        \
 		local_irq_save(flags);            \
-		__ret = raw_res_spin_lock(lock);  \
-		if (__ret)                        \
+		preempt_disable();                \
+		__ret = res_spin_lock(lock);      \
+		if (__ret) {                      \
 			local_irq_restore(flags); \
+			preempt_enable();         \
+		}                                 \
 		__ret;                            \
 	})
 
-#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
+#define raw_res_spin_unlock_irqrestore(lock, flags) \
+	({                                          \
+		res_spin_unlock(lock);              \
+		local_irq_restore(flags);           \
+		preempt_enable();                   \
+	})
 
 #endif /* __ASM_GENERIC_RQSPINLOCK_H */
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index b2e79c2b41d5..4d0cc65976a1 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -421,7 +421,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr,
 			   enum bpf_prog_type ptype);
 int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int cgroup_bpf_prog_query(const union bpf_attr *attr,
-			  union bpf_attr __user *uattr);
+			  union bpf_attr __user *uattr, u32 uattr_size);
 
 const struct bpf_func_proto *
 cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
@@ -452,7 +452,8 @@ static inline int cgroup_bpf_link_attach(const union bpf_attr *attr,
 }
 
 static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
-					union bpf_attr __user *uattr)
+					union bpf_attr __user *uattr,
+					u32 uattr_size)
 {
 	return -EINVAL;
 }
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 64efc3fdb716..7719f6528445 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -6,6 +6,7 @@
 
 #include <uapi/linux/bpf.h>
 #include <uapi/linux/filter.h>
+#include <linux/bpf_defs.h>
 
 #include <crypto/sha2.h>
 #include <linux/workqueue.h>
@@ -32,6 +33,8 @@
 #include <linux/memcontrol.h>
 #include <linux/cfi.h>
 #include <linux/xattr.h>
+#include <linux/key.h>
+#include <linux/ftrace.h>
 #include <asm/rqspinlock.h>
 
 struct bpf_verifier_env;
@@ -111,7 +114,7 @@ struct bpf_map_ops {
 	long (*map_pop_elem)(struct bpf_map *map, void *value);
 	long (*map_peek_elem)(struct bpf_map *map, void *value);
 	void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu);
-	int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf);
+	int (*map_get_hash)(struct bpf_map *map);
 
 	/* funcs called by prog_array and perf_event_array map */
 	void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
@@ -296,6 +299,7 @@ struct bpf_map_owner {
 
 struct bpf_map {
 	u8 sha[SHA256_DIGEST_SIZE];
+	u32 excl;
 	const struct bpf_map_ops *ops;
 	struct bpf_map *inner_map_meta;
 #ifdef CONFIG_SECURITY
@@ -489,6 +493,35 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f
 	return rec->field_mask & type;
 }
 
+/*
+ * Report whether a BTF field of @type is on the NMI-safe allow-list below;
+ * any type that is not listed is treated as unsafe.
+ * NOTE(review): what exactly must be NMI-safe is decided by the callers.
+ */
+static inline bool btf_field_is_nmi_safe(enum btf_field_type type)
+{
+	return type == BPF_SPIN_LOCK ||
+	       type == BPF_RES_SPIN_LOCK ||
+	       type == BPF_TIMER ||
+	       type == BPF_WORKQUEUE ||
+	       type == BPF_TASK_WORK ||
+	       type == BPF_KPTR_UNREF ||
+	       type == BPF_REFCOUNT;
+}
+
+/*
+ * True if any field of @rec is not btf_field_is_nmi_safe(); NULL/ERR_PTR @rec yields false.
+ */
+static inline bool btf_record_has_nmi_unsafe_fields(const struct btf_record *rec)
+{
+	if (IS_ERR_OR_NULL(rec))
+		return false;
+	for (int i = 0; i < rec->cnt; i++)
+		if (!btf_field_is_nmi_safe(rec->fields[i].type))
+			return true;
+	return false;
+}
+
 static inline void bpf_obj_init(const struct btf_record *rec, void *obj)
 {
 	int i;
@@ -618,6 +651,8 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 		      struct bpf_spin_lock *spin_lock);
 u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
 u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map);
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog);
 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
 
 struct bpf_offload_dev;
@@ -679,6 +714,8 @@ int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags,
 void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
 					  u64 flags);
 void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt);
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
+				      u64 flags);
 #else
 static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
 							int node_id, u64 flags)
@@ -689,6 +726,12 @@ static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr
 static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 }
+
+static inline void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+						    int node_id, u64 flags)
+{
+	return NULL;
+}
 #endif
 
 extern const struct bpf_map_ops bpf_map_offload_ops;
@@ -1052,7 +1095,7 @@ struct bpf_insn_access_aux {
 		struct {
 			struct btf *btf;
 			u32 btf_id;
-			u32 ref_obj_id;
+			u32 ref_id;
 		};
 	};
 	struct bpf_verifier_log *log; /* for verbose logs */
@@ -1152,6 +1195,11 @@ struct bpf_prog_offload {
 
 /* The longest tracepoint has 12 args.
  * See include/trace/bpf_probe.h
+ *
+ * This macro is also reused as the maximum number of arguments a BPF function
+ * or a kfunc can have. Args 1-5 are passed in registers, args 6-12 via
+ * stack arg slots. The JIT may map some stack arg slots to registers based
+ * on the native calling convention (e.g., arg 6 to R9 on x86-64).
  */
 #define MAX_BPF_FUNC_ARGS 12
 
@@ -1234,9 +1282,9 @@ enum {
 #define BPF_TRAMP_COOKIE_INDEX_SHIFT	8
 #define BPF_TRAMP_IS_RETURN_SHIFT	63
 
-struct bpf_tramp_links {
-	struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS];
-	int nr_links;
+struct bpf_tramp_nodes {
+	struct bpf_tramp_node *nodes[BPF_MAX_TRAMP_LINKS];
+	int nr_nodes;
 };
 
 struct bpf_tramp_run_ctx;
@@ -1264,13 +1312,13 @@ struct bpf_tramp_run_ctx;
 struct bpf_tramp_image;
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 				const struct btf_func_model *m, u32 flags,
-				struct bpf_tramp_links *tlinks,
+				struct bpf_tramp_nodes *tnodes,
 				void *func_addr);
 void *arch_alloc_bpf_trampoline(unsigned int size);
 void arch_free_bpf_trampoline(void *image, unsigned int size);
 int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size);
 int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
-			     struct bpf_tramp_links *tlinks, void *func_addr);
+			     struct bpf_tramp_nodes *tnodes, void *func_addr);
 
 u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
@@ -1336,8 +1384,6 @@ struct bpf_trampoline {
 	/* hlist for trampoline_ip_table */
 	struct hlist_node hlist_ip;
 	struct ftrace_ops *fops;
-	/* serializes access to fields of this trampoline */
-	struct mutex mutex;
 	refcount_t refcnt;
 	u32 flags;
 	u64 key;
@@ -1358,6 +1404,11 @@ struct bpf_trampoline {
 	int progs_cnt[BPF_TRAMP_MAX];
 	/* Executable image of trampoline */
 	struct bpf_tramp_image *cur_image;
+	/* Used as temporary old image storage for multi_attach */
+	struct {
+		struct bpf_tramp_image *old_image;
+		u32 old_flags;
+	} multi_attach;
 };
 
 struct bpf_attach_target_info {
@@ -1455,11 +1506,13 @@ static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u6
 	return 0;
 }
 
+struct bpf_tracing_multi_link;
+
 #ifdef CONFIG_BPF_JIT
-int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+int bpf_trampoline_link_prog(struct bpf_tramp_node *node,
 			     struct bpf_trampoline *tr,
 			     struct bpf_prog *tgt_prog);
-int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node,
 			       struct bpf_trampoline *tr,
 			       struct bpf_prog *tgt_prog);
 struct bpf_trampoline *bpf_trampoline_get(u64 key,
@@ -1467,6 +1520,11 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key,
 void bpf_trampoline_put(struct bpf_trampoline *tr);
 int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs);
 
+int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids,
+				struct bpf_tracing_multi_link *link);
+int bpf_trampoline_multi_detach(struct bpf_prog *prog,
+				struct bpf_tracing_multi_link *link);
+
 /*
  * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn
  * indirection with a direct call to the bpf program. If the architecture does
@@ -1544,14 +1602,15 @@ void bpf_jit_uncharge_modmem(u32 size);
 bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
 bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struct bpf_prog *prog,
 				 int insn_idx);
+u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog);
 #else
-static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+static inline int bpf_trampoline_link_prog(struct bpf_tramp_node *node,
 					   struct bpf_trampoline *tr,
 					   struct bpf_prog *tgt_prog)
 {
 	return -ENOTSUPP;
 }
-static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node,
 					     struct bpf_trampoline *tr,
 					     struct bpf_prog *tgt_prog)
 {
@@ -1578,6 +1637,16 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 {
 	return false;
 }
+static inline int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids,
+					      struct bpf_tracing_multi_link *link)
+{
+	return -ENOTSUPP;
+}
+static inline int bpf_trampoline_multi_detach(struct bpf_prog *prog,
+					      struct bpf_tracing_multi_link *link)
+{
+	return -ENOTSUPP;
+}
 #endif
 
 struct bpf_func_info_aux {
@@ -1615,7 +1684,7 @@ struct bpf_ctx_arg_aux {
 	enum bpf_reg_type reg_type;
 	struct btf *btf;
 	u32 btf_id;
-	u32 ref_obj_id;
+	u32 ref_id;
 	bool refcounted;
 };
 
@@ -1657,6 +1726,19 @@ struct bpf_stream_stage {
 	int len;
 };
 
+enum bpf_sig_verdict {
+	BPF_SIG_UNSIGNED = 0,
+	BPF_SIG_VERIFIED,
+};
+
+enum bpf_sig_keyring {
+	BPF_SIG_KEYRING_NONE = 0,
+	BPF_SIG_KEYRING_BUILTIN,
+	BPF_SIG_KEYRING_SECONDARY,
+	BPF_SIG_KEYRING_PLATFORM,
+	BPF_SIG_KEYRING_USER,
+};
+
 struct bpf_prog_aux {
 	atomic64_t refcnt;
 	u32 used_map_cnt;
@@ -1699,6 +1781,11 @@ struct bpf_prog_aux {
 	bool changes_pkt_data;
 	bool might_sleep;
 	bool kprobe_write_ctx;
+	struct {
+		s32 keyring_serial;
+		u8 keyring_type;
+		u8 verdict;
+	} sig;
 	u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
 	struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
 	struct bpf_arena *arena;
@@ -1731,6 +1818,7 @@ struct bpf_prog_aux {
 	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
 	u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64);
+	u16 stack_arg_sp_adjust;
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
@@ -1874,12 +1962,17 @@ struct bpf_link_ops {
 	__poll_t (*poll)(struct file *file, struct poll_table_struct *pts);
 };
 
-struct bpf_tramp_link {
-	struct bpf_link link;
+struct bpf_tramp_node {
+	struct bpf_link *link;
 	struct hlist_node tramp_hlist;
 	u64 cookie;
 };
 
+struct bpf_tramp_link {
+	struct bpf_link link;
+	struct bpf_tramp_node node;
+};
+
 struct bpf_shim_tramp_link {
 	struct bpf_tramp_link link;
 	struct bpf_trampoline *trampoline;
@@ -1887,13 +1980,31 @@ struct bpf_shim_tramp_link {
 
 struct bpf_tracing_link {
 	struct bpf_tramp_link link;
+	struct bpf_tramp_node fexit;
 	struct bpf_trampoline *trampoline;
 	struct bpf_prog *tgt_prog;
 };
 
-struct bpf_fsession_link {
-	struct bpf_tracing_link link;
-	struct bpf_tramp_link fexit;
+struct bpf_tracing_multi_node {
+	struct bpf_tramp_node node;
+	struct bpf_trampoline *trampoline;
+	struct ftrace_func_entry entry;
+};
+
+struct bpf_tracing_multi_data {
+	struct ftrace_hash *unreg;
+	struct ftrace_hash *modify;
+	struct ftrace_hash *reg;
+	struct ftrace_func_entry *entry;
+};
+
+struct bpf_tracing_multi_link {
+	struct bpf_link link;
+	struct bpf_tracing_multi_data data;
+	u64 *cookies;
+	struct bpf_tramp_node *fexits;
+	int nodes_cnt;
+	struct bpf_tracing_multi_node nodes[] __counted_by(nodes_cnt);
 };
 
 struct bpf_raw_tp_link {
@@ -2079,6 +2190,12 @@ static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog)
 #endif
 }
 
+static inline bool is_tracing_multi(enum bpf_attach_type type)
+{
+	return type == BPF_TRACE_FENTRY_MULTI || type == BPF_TRACE_FEXIT_MULTI ||
+	       type == BPF_TRACE_FSESSION_MULTI;
+}
+
 #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
 /* This macro helps developer to register a struct_ops type and generate
  * type information correctly. Developers should use this macro to register
@@ -2099,8 +2216,8 @@ void bpf_struct_ops_put(const void *kdata);
 int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff);
 int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 				       void *value);
-int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
-				      struct bpf_tramp_link *link,
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_nodes *tnodes,
+				      struct bpf_tramp_node *node,
 				      const struct btf_func_model *model,
 				      void *stub_func,
 				      void **image, u32 *image_off,
@@ -2125,6 +2242,9 @@ int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map);
 void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog);
 void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux);
 u32 bpf_struct_ops_id(const void *kdata);
+int bpf_struct_ops_for_each_prog(const void *kdata,
+				 int (*cb)(struct bpf_prog *prog, void *data),
+				 void *data);
 
 #ifdef CONFIG_NET
 /* Define it here to avoid the use of forward declaration */
@@ -2192,31 +2312,33 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op
 
 #endif
 
-static inline int bpf_fsession_cnt(struct bpf_tramp_links *links)
+static inline int bpf_fsession_cnt(struct bpf_tramp_nodes *nodes)
 {
-	struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY];
+	const struct bpf_tramp_nodes *fentry = &nodes[BPF_TRAMP_FENTRY];
 	int cnt = 0;
 
-	for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) {
-		if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION)
+	/* Count FENTRY-slot nodes whose program attaches as fsession (single or multi). */
+	for (int i = 0; i < fentry->nr_nodes; i++) {
+		if (fentry->nodes[i]->link->prog->expected_attach_type == BPF_TRACE_FSESSION ||
+		    fentry->nodes[i]->link->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)
 			cnt++;
 	}
 
 	return cnt;
 }
 
-static inline bool bpf_prog_calls_session_cookie(struct bpf_tramp_link *link)
+static inline bool bpf_prog_calls_session_cookie(struct bpf_tramp_node *node)
 {
-	return link->link.prog->call_session_cookie;
+	return node->link->prog->call_session_cookie;
 }
 
-static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links)
+static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_nodes *nodes)
 {
-	struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY];
+	struct bpf_tramp_nodes fentries = nodes[BPF_TRAMP_FENTRY];
 	int cnt = 0;
 
-	for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) {
-		if (bpf_prog_calls_session_cookie(fentries.links[i]))
+	for (int i = 0; i < nodes[BPF_TRAMP_FENTRY].nr_nodes; i++) {
+		if (bpf_prog_calls_session_cookie(fentries.nodes[i]))
 			cnt++;
 	}
 
@@ -2598,6 +2720,7 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj);
 void bpf_obj_free_task_work(const struct btf_record *rec, void *obj);
+void bpf_obj_cancel_fields(struct bpf_map *map, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
 
@@ -2764,6 +2887,9 @@ void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
 void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
 			     const struct bpf_link_ops *ops, struct bpf_prog *prog,
 			     enum bpf_attach_type attach_type, bool sleepable);
+void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type,
+			 const struct bpf_link_ops *ops, struct bpf_prog *prog,
+			 enum bpf_attach_type attach_type, u64 cookie);
 int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer);
 int bpf_link_settle(struct bpf_link_primer *primer);
 void bpf_link_cleanup(struct bpf_link_primer *primer);
@@ -2917,7 +3043,9 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size,
 			     size_t actual_size);
 
 /* verify correctness of eBPF program */
-int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size);
+struct bpf_log_attr;
+int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr,
+	      struct bpf_log_attr *attr_log);
 
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
@@ -3088,6 +3216,56 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
 void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip);
 
+static __always_inline u32
+bpf_prog_run_array_sleepable(const struct bpf_prog_array *array,
+			     const void *ctx, bpf_prog_run_fn run_prog)
+{
+	const struct bpf_prog_array_item *item;
+	struct bpf_prog *prog;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_trace_run_ctx run_ctx;
+	u32 ret = 1;
+
+	if (unlikely(!array))
+		return ret;
+
+	migrate_disable();
+
+	run_ctx.is_uprobe = false;
+
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		/* Skip dummy_bpf_prog placeholder (len == 0) */
+		if (unlikely(!prog->len)) {
+			item++;
+			continue;
+		}
+
+		if (unlikely(!bpf_prog_get_recursion_context(prog))) {
+			bpf_prog_inc_misses_counter(prog);
+			bpf_prog_put_recursion_context(prog);
+			item++;
+			continue;
+		}
+
+		run_ctx.bpf_cookie = item->bpf_cookie;
+
+		if (!prog->sleepable) {
+			guard(rcu)();
+			ret &= run_prog(prog, ctx);
+		} else {
+			ret &= run_prog(prog, ctx);
+		}
+
+		bpf_prog_put_recursion_context(prog);
+		item++;
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+	migrate_enable();
+	return ret;
+}
+
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -3135,6 +3313,12 @@ static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_
 {
 }
 
+static inline void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type,
+				       const struct bpf_link_ops *ops, struct bpf_prog *prog,
+				       enum bpf_attach_type attach_type, u64 cookie)
+{
+}
+
 static inline int bpf_link_prime(struct bpf_link *link,
 				 struct bpf_link_primer *primer)
 {
@@ -3626,15 +3810,25 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
 #endif /* CONFIG_BPF_SYSCALL */
 #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */
 
-#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL)
+#ifdef CONFIG_KEYS
+struct bpf_key {
+	struct key *key;
+	bool has_ref;
+};
+#endif /* CONFIG_KEYS */
 
+#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL)
 struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags);
 struct bpf_key *bpf_lookup_system_key(u64 id);
 void bpf_key_put(struct bpf_key *bkey);
-int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
-			       struct bpf_dynptr *sig_p,
+int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p,
+			       const struct bpf_dynptr *sig_p,
 			       struct bpf_key *trusted_keyring);
 
+static inline s32 bpf_key_serial(const struct bpf_key *key)
+{
+	return key->has_ref ? key->key->serial : 0;
+}
 #else
 static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
 {
@@ -3650,12 +3844,17 @@ static inline void bpf_key_put(struct bpf_key *bkey)
 {
 }
 
-static inline int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
-					     struct bpf_dynptr *sig_p,
+static inline int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p,
+					     const struct bpf_dynptr *sig_p,
 					     struct bpf_key *trusted_keyring)
 {
 	return -EOPNOTSUPP;
 }
+
+static inline s32 bpf_key_serial(const struct bpf_key *key)
+{
+	return 0;
+}
 #endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */
 
 /* verifier prototypes for helper functions called from eBPF programs */
@@ -3931,15 +4130,6 @@ static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {}
 static inline void bpf_cgroup_atype_put(int cgroup_atype) {}
 #endif /* CONFIG_BPF_LSM */
 
-struct key;
-
-#ifdef CONFIG_KEYS
-struct bpf_key {
-	struct key *key;
-	bool has_ref;
-};
-#endif /* CONFIG_KEYS */
-
 static inline bool type_is_alloc(u32 type)
 {
 	return type & MEM_ALLOC;
diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h
new file mode 100644
index 000000000000..2185cd3966d4
--- /dev/null
+++ b/include/linux/bpf_defs.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Subset of bpf.h declarations, split out so files that need only these
+ * declarations can avoid bpf.h's full include cost.
+ */
+#ifndef _LINUX_BPF_DEFS_H
+#define _LINUX_BPF_DEFS_H
+
+#ifdef CONFIG_BPF_SYSCALL
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip);
+#else
+static inline bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
+					       unsigned long fault_ip)
+{
+	return false;
+}
+#endif
+
+#endif /* _LINUX_BPF_DEFS_H */
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index 643809cc78c3..143775a27a2a 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -52,6 +52,7 @@ int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
 				const struct bpf_dynptr *value_p, int flags);
 int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str);
 bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog);
+bool bpf_lsm_hook_returns_errno(u32 btf_id);
 
 #else /* !CONFIG_BPF_LSM */
 
@@ -104,6 +105,11 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
 {
 	return false;
 }
+
+static inline bool bpf_lsm_hook_returns_errno(u32 btf_id)
+{
+	return true;
+}
 #endif /* CONFIG_BPF_LSM */
 
 #endif /* _LINUX_BPF_LSM_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index b13de31e163f..e5906829aa6f 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -134,6 +134,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_RHASH, rhtab_map_ops)
 
 BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
@@ -155,3 +156,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf)
 BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi)
 BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops)
 BPF_LINK_TYPE(BPF_LINK_TYPE_UPROBE_MULTI, uprobe_multi)
+BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING_MULTI, tracing_multi)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 185b2aa43a42..39a851e690ec 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -8,6 +8,7 @@
 #include <linux/btf.h> /* for struct btf and btf_id() */
 #include <linux/filter.h> /* for MAX_BPF_STACK */
 #include <linux/tnum.h>
+#include <linux/cnum.h>
 
 /* Maximum variable offset umax_value permitted when resolving memory accesses.
  * In practice this is far bigger than any realistic pointer offset; this limit
@@ -65,7 +66,6 @@ struct bpf_reg_state {
 
 		struct { /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */
 			u32 mem_size;
-			u32 dynptr_id; /* for dynptr slices */
 		};
 
 		/* For dynptr stack slots */
@@ -120,14 +120,8 @@ struct bpf_reg_state {
 	 * These refer to the same value as var_off, not necessarily the actual
 	 * contents of the register.
 	 */
-	s64 smin_value; /* minimum possible (s64)value */
-	s64 smax_value; /* maximum possible (s64)value */
-	u64 umin_value; /* minimum possible (u64)value */
-	u64 umax_value; /* maximum possible (u64)value */
-	s32 s32_min_value; /* minimum possible (s32)value */
-	s32 s32_max_value; /* maximum possible (s32)value */
-	u32 u32_min_value; /* minimum possible (u32)value */
-	u32 u32_max_value; /* maximum possible (u32)value */
+	struct cnum64 r64; /* 64-bit range as circular number */
+	struct cnum32 r32; /* 32-bit range as circular number */
 	/* For PTR_TO_PACKET, used to find other pointers with the same variable
 	 * offset, so they can share range knowledge.
 	 * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
@@ -153,46 +147,14 @@ struct bpf_reg_state {
 #define BPF_ADD_CONST32 (1U << 30)
 #define BPF_ADD_CONST (BPF_ADD_CONST64 | BPF_ADD_CONST32)
 	u32 id;
-	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
-	 * from a pointer-cast helper, bpf_sk_fullsock() and
-	 * bpf_tcp_sock().
-	 *
-	 * Consider the following where "sk" is a reference counted
-	 * pointer returned from "sk = bpf_sk_lookup_tcp();":
-	 *
-	 * 1: sk = bpf_sk_lookup_tcp();
-	 * 2: if (!sk) { return 0; }
-	 * 3: fullsock = bpf_sk_fullsock(sk);
-	 * 4: if (!fullsock) { bpf_sk_release(sk); return 0; }
-	 * 5: tp = bpf_tcp_sock(fullsock);
-	 * 6: if (!tp) { bpf_sk_release(sk); return 0; }
-	 * 7: bpf_sk_release(sk);
-	 * 8: snd_cwnd = tp->snd_cwnd;  // verifier will complain
-	 *
-	 * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and
-	 * "tp" ptr should be invalidated also.  In order to do that,
-	 * the reg holding "fullsock" and "sk" need to remember
-	 * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id
-	 * such that the verifier can reset all regs which have
-	 * ref_obj_id matching the sk_reg->id.
-	 *
-	 * sk_reg->ref_obj_id is set to sk_reg->id at line 1.
-	 * sk_reg->id will stay as NULL-marking purpose only.
-	 * After NULL-marking is done, sk_reg->id can be reset to 0.
-	 *
-	 * After "fullsock = bpf_sk_fullsock(sk);" at line 3,
-	 * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id.
-	 *
-	 * After "tp = bpf_tcp_sock(fullsock);" at line 5,
-	 * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id
-	 * which is the same as sk_reg->ref_obj_id.
-	 *
-	 * From the verifier perspective, if sk, fullsock and tp
-	 * are not NULL, they are the same ptr with different
-	 * reg->type.  In particular, bpf_sk_release(tp) is also
-	 * allowed and has the same effect as bpf_sk_release(sk).
+	/*
+	 * Tracks the parent object this register was derived from.
+	 * Used for cascading invalidation: when the parent object is
+	 * released or invalidated, all registers with matching parent_id
+	 * are also invalidated. For example, a slice from bpf_dynptr_data()
+	 * gets parent_id set to the dynptr's id.
 	 */
-	u32 ref_obj_id;
+	u32 parent_id;
 	/* Inside the callee two registers can be both PTR_TO_STACK like
 	 * R1=fp-8 and R2=fp-8, but one of them points to this function stack
 	 * while another to the caller's stack. To differentiate them 'frameno'
@@ -209,6 +171,66 @@ struct bpf_reg_state {
 	bool precise;
 };
 
+static inline s64 reg_smin(const struct bpf_reg_state *reg)
+{
+	return cnum64_smin(reg->r64);
+}
+
+static inline s64 reg_smax(const struct bpf_reg_state *reg)
+{
+	return cnum64_smax(reg->r64);
+}
+
+static inline u64 reg_umin(const struct bpf_reg_state *reg)
+{
+	return cnum64_umin(reg->r64);
+}
+
+static inline u64 reg_umax(const struct bpf_reg_state *reg)
+{
+	return cnum64_umax(reg->r64);
+}
+
+static inline s32 reg_s32_min(const struct bpf_reg_state *reg)
+{
+	return cnum32_smin(reg->r32);
+}
+
+static inline s32 reg_s32_max(const struct bpf_reg_state *reg)
+{
+	return cnum32_smax(reg->r32);
+}
+
+static inline u32 reg_u32_min(const struct bpf_reg_state *reg)
+{
+	return cnum32_umin(reg->r32);
+}
+
+static inline u32 reg_u32_max(const struct bpf_reg_state *reg)
+{
+	return cnum32_umax(reg->r32);
+}
+
+static inline void reg_set_srange32(struct bpf_reg_state *reg, s32 smin, s32 smax)
+{
+	reg->r32 = cnum32_from_srange(smin, smax);
+}
+
+static inline void reg_set_urange32(struct bpf_reg_state *reg, u32 umin, u32 umax)
+{
+	reg->r32 = cnum32_from_urange(umin, umax);
+}
+
+static inline void reg_set_srange64(struct bpf_reg_state *reg, s64 smin, s64 smax)
+{
+	reg->r64 = cnum64_from_srange(smin, smax);
+}
+
+static inline void reg_set_urange64(struct bpf_reg_state *reg, u64 umin, u64 umax)
+{
+	reg->r64 = cnum64_from_urange(umin, umax);
+}
+
 enum bpf_stack_slot_type {
 	STACK_INVALID,    /* nothing was stored in this stack slot */
 	STACK_SPILL,      /* register spilled into stack */
@@ -309,10 +331,14 @@ struct bpf_reference_state {
 	 * is used purely to inform the user of a reference leak.
 	 */
 	int insn_idx;
-	/* Use to keep track of the source object of a lock, to ensure
-	 * it matches on unlock.
-	 */
-	void *ptr;
+	union {
+		/* For REF_TYPE_PTR */
+		int parent_id;
+		/* Use to keep track of the source object of a lock, to ensure
+		 * it matches on unlock.
+		 */
+		void *ptr;
+	};
 };
 
 struct bpf_retval_range {
@@ -347,6 +373,7 @@ struct bpf_func_state {
 	bool in_callback_fn;
 	bool in_async_callback_fn;
 	bool in_exception_callback_fn;
+	bool no_stack_arg_load;
 	/* For callback calling functions that limit number of possible
 	 * callback executions (e.g. bpf_loop) keeps track of current
 	 * simulated iteration number.
@@ -372,46 +399,49 @@ struct bpf_func_state {
 	 * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE.
 	 */
 	int allocated_stack;
+
+	u16 out_stack_arg_cnt; /* Number of outgoing on-stack argument slots */
+	struct bpf_reg_state *stack_arg_regs; /* Outgoing on-stack arguments */
 };
 
-#define MAX_CALL_FRAMES 8
+#define MAX_CALL_FRAMES 16
 
-/* instruction history flags, used in bpf_jmp_history_entry.flags field */
+/* instruction history flags, used in bpf_jmp_history_entry.flags field.
+ * Frame number and SPI are stored in dedicated fields of bpf_jmp_history_entry.
+ */
 enum {
-	/* instruction references stack slot through PTR_TO_STACK register;
-	 * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8)
-	 * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512,
-	 * 8 bytes per slot, so slot index (spi) is [0, 63])
-	 */
-	INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */
-
-	INSN_F_SPI_MASK = 0x3f, /* 6 bits */
-	INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */
+	INSN_F_STACK_ACCESS = BIT(0),
 
-	INSN_F_STACK_ACCESS = BIT(9),
+	INSN_F_DST_REG_STACK = BIT(1), /* dst_reg is PTR_TO_STACK */
+	INSN_F_SRC_REG_STACK = BIT(2), /* src_reg is PTR_TO_STACK */
 
-	INSN_F_DST_REG_STACK = BIT(10), /* dst_reg is PTR_TO_STACK */
-	INSN_F_SRC_REG_STACK = BIT(11), /* src_reg is PTR_TO_STACK */
-	/* total 12 bits are used now. */
+	INSN_F_STACK_ARG_ACCESS = BIT(3),
 };
 
-static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES);
-static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8);
-
 struct bpf_jmp_history_entry {
-	u32 idx;
 	/* insn idx can't be bigger than 1 million */
+	u32 idx : 20;
+	u32 frame : 4;	/* stack access frame number */
+	u32 spi : 6;	/* stack slot index (0..63) */
+	u32 : 2;
 	u32 prev_idx : 20;
 	/* special INSN_F_xxx flags */
-	u32 flags : 12;
-	/* additional registers that need precision tracking when this
-	 * jump is backtracked, vector of six 10-bit records
+	u32 flags : 4;
+	u32 : 8;
+	/*
+	 * additional registers that need precision tracking when this
+	 * jump is backtracked, vector of five 11-bit records
 	 */
 	u64 linked_regs;
 };
 
-/* Maximum number of register states that can exist at once */
-#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES)
+static_assert(MAX_CALL_FRAMES <= (1 << 4));
+static_assert(MAX_BPF_STACK / 8 <= (1 << 6));
+
+/* Maximum number of bpf_reg_state objects that can exist at once */
+#define MAX_STACK_ARG_SLOTS (MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS)
+#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE + \
+			  MAX_STACK_ARG_SLOTS) * MAX_CALL_FRAMES)
 struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
@@ -497,10 +527,23 @@ struct bpf_verifier_state {
 	u32 may_goto_depth;
 };
 
-#define bpf_get_spilled_reg(slot, frame, mask)				\
-	(((slot < frame->allocated_stack / BPF_REG_SIZE) &&		\
-	  ((1 << frame->stack[slot].slot_type[BPF_REG_SIZE - 1]) & (mask))) \
-	 ? &frame->stack[slot].spilled_ptr : NULL)
+static inline struct bpf_reg_state *
+bpf_get_spilled_reg(int slot, struct bpf_func_state *frame, u32 mask)
+{
+	if (slot < frame->allocated_stack / BPF_REG_SIZE &&
+	    (1 << frame->stack[slot].slot_type[BPF_REG_SIZE - 1]) & mask)
+		return &frame->stack[slot].spilled_ptr;
+	return NULL;
+}
+
+static inline struct bpf_reg_state *
+bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame)
+{
+	if (slot < frame->out_stack_arg_cnt &&
+	    frame->stack_arg_regs[slot].type != NOT_INIT)
+		return &frame->stack_arg_regs[slot];
+	return NULL;
+}
 
 /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */
 #define bpf_for_each_spilled_reg(iter, frame, reg, mask)			\
@@ -508,7 +551,13 @@ struct bpf_verifier_state {
 	     iter < frame->allocated_stack / BPF_REG_SIZE;		\
 	     iter++, reg = bpf_get_spilled_reg(iter, frame, mask))
 
-#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr)   \
+/* Iterate over 'frame', setting 'reg' to either NULL or a spilled stack arg. */
+#define bpf_for_each_spilled_stack_arg(iter, frame, reg)               \
+	for (iter = 0, reg = bpf_get_spilled_stack_arg(iter, frame);   \
+	     iter < frame->out_stack_arg_cnt;                          \
+	     iter++, reg = bpf_get_spilled_stack_arg(iter, frame))
+
+#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __stack, __mask, __expr)   \
 	({                                                               \
 		struct bpf_verifier_state *___vstate = __vst;            \
 		int ___i, ___j;                                          \
@@ -516,6 +565,7 @@ struct bpf_verifier_state {
 			struct bpf_reg_state *___regs;                   \
 			__state = ___vstate->frame[___i];                \
 			___regs = __state->regs;                         \
+			__stack = NULL;                                  \
 			for (___j = 0; ___j < MAX_BPF_REG; ___j++) {     \
 				__reg = &___regs[___j];                  \
 				(void)(__expr);                          \
@@ -523,14 +573,27 @@ struct bpf_verifier_state {
 			bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \
 				if (!__reg)                              \
 					continue;                        \
+				__stack = &__state->stack[___j];         \
 				(void)(__expr);                          \
 			}                                                \
+			__stack = NULL;                                  \
+			bpf_for_each_spilled_stack_arg(___j, __state, __reg) { \
+				if (!__reg)                              \
+					continue;                        \
+				(void)(__expr);                          \
+			}						 \
 		}                                                        \
+		(void)__stack;                                           \
 	})
 
 /* Invoke __expr over regsiters in __vst, setting __state and __reg */
-#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \
-	bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr)
+#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr)		\
+	({									\
+		struct bpf_stack_state * ___stack;                        	\
+		(void)___stack;							\
+		bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, ___stack,\
+						1 << STACK_SPILL, __expr);	\
+	})
 
 /* linked list of verifier states used to prune search */
 struct bpf_verifier_state_list {
@@ -700,6 +763,22 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 	return log && log->level;
 }
 
+struct bpf_log_attr {
+	char __user *ubuf;
+	u32 size;
+	u32 level;
+	u32 offsetof_true_size;
+	bpfptr_t uattr;
+};
+
+int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level,
+		      u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common,
+		      bpfptr_t uattr_common, u32 size_common);
+struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *attr_log,
+						  struct bpf_common_attr *common, bpfptr_t uattr,
+						  u32 size);
+int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log);
+
 #define BPF_MAX_SUBPROGS 256
 
 struct bpf_subprog_arg_info {
@@ -724,6 +803,7 @@ struct bpf_subprog_info {
 	u32 exit_idx; /* Index of one of the BPF_EXIT instructions in this subprogram */
 	u16 stack_depth; /* max. stack depth used by this function */
 	u16 stack_extra;
+	u32 insn_processed;
 	/* offsets in range [stack_depth .. fastcall_stack_off)
 	 * are used for bpf_fastcall spills and fills.
 	 */
@@ -740,12 +820,21 @@ struct bpf_subprog_info {
 	bool keep_fastcall_stack: 1;
 	bool changes_pkt_data: 1;
 	bool might_sleep: 1;
-	u8 arg_cnt:3;
+	u8 arg_cnt:4;
 
 	enum priv_stack_mode priv_stack_mode;
-	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
+	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_ARGS];
+	u16 stack_arg_cnt; /* incoming + max outgoing */
+	u16 max_out_stack_arg_cnt;
 };
 
+static inline u16 bpf_in_stack_arg_cnt(const struct bpf_subprog_info *sub)
+{
+	if (sub->arg_cnt > MAX_BPF_FUNC_REG_ARGS)
+		return sub->arg_cnt - MAX_BPF_FUNC_REG_ARGS;
+	return 0;
+}
+
 struct bpf_verifier_env;
 
 struct backtrack_state {
@@ -753,6 +842,7 @@ struct backtrack_state {
 	u32 frame;
 	u32 reg_masks[MAX_CALL_FRAMES];
 	u64 stack_masks[MAX_CALL_FRAMES];
+	u8 stack_arg_masks[MAX_CALL_FRAMES];
 };
 
 struct bpf_id_pair {
@@ -881,6 +971,8 @@ struct bpf_verifier_env {
 	u32 prev_insn_processed, insn_processed;
 	/* number of jmps, calls, exits analyzed so far */
 	u32 prev_jmps_processed, jmps_processed;
+	/* maximum combined stack depth */
+	u32 max_stack_depth;
 	/* total verification time */
 	u64 verification_time;
 	/* maximum number of verifier states kept in 'branching' instructions */
@@ -914,6 +1006,7 @@ struct bpf_verifier_env {
 	 * e.g., in reg_type_str() to generate reg_type string
 	 */
 	char tmp_str_buf[TMP_STR_BUF_LEN];
+	char tmp_arg_name[32];
 	struct bpf_insn insn_buf[INSN_BUF_SIZE];
 	struct bpf_insn epilogue_buf[INSN_BUF_SIZE];
 	struct bpf_scc_callchain callchain_buf;
@@ -1087,7 +1180,7 @@ struct list_head *bpf_explored_state(struct bpf_verifier_env *env, int idx);
 void bpf_free_verifier_state(struct bpf_verifier_state *state, bool free_self);
 void bpf_free_backedges(struct bpf_scc_visit *visit);
 int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
-			 int insn_flags, u64 linked_regs);
+			 int insn_flags, int spi, int frame, u64 linked_regs);
 void bpf_bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist);
 void bpf_mark_reg_not_init(const struct bpf_verifier_env *env,
 			   struct bpf_reg_state *reg);
@@ -1150,6 +1243,11 @@ static inline void bpf_bt_set_frame_slot(struct backtrack_state *bt, u32 frame,
 	bt->stack_masks[frame] |= 1ull << slot;
 }
 
+static inline void bt_set_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+	bt->stack_arg_masks[frame] |= 1 << slot;
+}
+
 static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg)
 {
 	return bt->reg_masks[frame] & (1 << reg);
@@ -1321,6 +1419,25 @@ struct bpf_map_desc {
 	int uid;
 };
 
+/* The last initialized dynptr; populated by process_dynptr_func() */
+struct bpf_dynptr_desc {
+	enum bpf_dynptr_type type;
+	u32 id;
+	u32 parent_id;
+};
+
+/*
+ * The last seen referenced object; updated by update_ref_obj() when a register refers to a
+ * referenced object. Used when the helper or kfunc is casting a referenced object, returning
+ * allocated memory derived from a referenced object or creating a dynptr with a referenced
+ * object as parent.
+ */
+struct ref_obj_desc {
+	u32 id;
+	u32 parent_id;
+	u8 cnt;
+};
+
 struct bpf_kfunc_call_arg_meta {
 	/* In parameters */
 	struct btf *btf;
@@ -1329,7 +1446,6 @@ struct bpf_kfunc_call_arg_meta {
 	const struct btf_type *func_proto;
 	const char *func_name;
 	/* Out parameters */
-	u32 ref_obj_id;
 	u8 release_regno;
 	bool r0_rdonly;
 	u32 ret_btf_id;
@@ -1362,15 +1478,12 @@ struct bpf_kfunc_call_arg_meta {
 		struct btf_field *field;
 	} arg_rbtree_root;
 	struct {
-		enum bpf_dynptr_type type;
-		u32 id;
-		u32 ref_obj_id;
-	} initialized_dynptr;
-	struct {
 		u8 spi;
 		u8 frameno;
 	} iter;
 	struct bpf_map_desc map;
+	struct bpf_dynptr_desc dynptr;
+	struct ref_obj_desc ref_obj;
 	u64 mem_size;
 };
 
@@ -1479,6 +1592,10 @@ int bpf_add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, u16 offset);
 int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			 struct bpf_insn *insn_buf, int insn_idx, int *cnt);
 
+/* Functions exported from verifier.c, used by trampoline.c */
+int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id,
+				  struct bpf_attach_target_info *tgt_info);
+
 /* Functions in fixups.c, called from bpf_check() */
 int bpf_remove_fastcall_spills_fills(struct bpf_verifier_env *env);
 int bpf_optimize_bpf_loop(struct bpf_verifier_env *env);
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 48108471c5b1..240401d9b25b 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -145,7 +145,8 @@ const char *btf_get_name(const struct btf *btf);
 void btf_get(struct btf *btf);
 void btf_put(struct btf *btf);
 const struct btf_header *btf_header(const struct btf *btf);
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz);
+struct bpf_log_attr;
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log);
 struct btf *btf_get_by_fd(int fd);
 int btf_get_info_by_fd(const struct btf *btf,
 		       const union bpf_attr *attr,
@@ -415,12 +416,12 @@ static inline bool btf_type_is_array(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
 }
 
-static inline u16 btf_type_vlen(const struct btf_type *t)
+static inline u32 btf_type_vlen(const struct btf_type *t)
 {
 	return BTF_INFO_VLEN(t->info);
 }
 
-static inline u16 btf_vlen(const struct btf_type *t)
+static inline u32 btf_vlen(const struct btf_type *t)
 {
 	return btf_type_vlen(t);
 }
diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index af011db39ab3..8b5a9ee92513 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -284,5 +284,6 @@ extern u32 bpf_cgroup_btf_id[];
 extern u32 bpf_local_storage_map_btf_id[];
 extern u32 btf_bpf_map_id[];
 extern u32 bpf_kmem_cache_btf_id[];
+extern u32 bpf_multi_func_btf_id[];
 
 #endif
diff --git a/include/linux/cnum.h b/include/linux/cnum.h
new file mode 100644
index 000000000000..49b7d0c7645d
--- /dev/null
+++ b/include/linux/cnum.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#ifndef _LINUX_CNUM_H
+#define _LINUX_CNUM_H
+
+#include <linux/types.h>
+
+/*
+ * cnum32: a circular number.
+ * A unified representation for signed and unsigned ranges.
+ *
+ * Assume that a 32-bit range is a circle, with 0 being in the 12 o'clock
+ * position, numbers placed sequentially in clockwise order and U32_MAX
+ * in the 11 o'clock position. Signed values map onto the same circle:
+ * S32_MAX sits at 5 o'clock, S32_MIN sits at 6 o'clock (opposite 0),
+ * negative values occupy the left half and positive values the right half.
+ *
+ * @cnum32 represents an arc on this circle drawn clockwise.
+ * @base corresponds to the first value of the range.
+ * @size corresponds to the number of integers in the range excluding @base.
+ * (The @base is excluded to avoid integer overflow when representing the full
+ *  0..U32_MAX range, which corresponds to 2^32, which can't be stored in u32).
+ *
+ * For example: {U32_MAX, 1} corresponds to signed range [-1, 0],
+ *              {S32_MAX, 1} corresponds to unsigned range [S32_MAX, S32_MIN].
+ */
+struct cnum32 {
+	u32 base;
+	u32 size;
+};
+
+#define CNUM32_UNBOUNDED ((struct cnum32){ .base = 0, .size = U32_MAX })
+#define CNUM32_EMPTY ((struct cnum32){ .base = U32_MAX, .size = U32_MAX })
+
+struct cnum32 cnum32_from_urange(u32 min, u32 max);
+struct cnum32 cnum32_from_srange(s32 min, s32 max);
+u32 cnum32_umin(struct cnum32 cnum);
+u32 cnum32_umax(struct cnum32 cnum);
+s32 cnum32_smin(struct cnum32 cnum);
+s32 cnum32_smax(struct cnum32 cnum);
+struct cnum32 cnum32_intersect(struct cnum32 a, struct cnum32 b);
+void cnum32_intersect_with(struct cnum32 *dst, struct cnum32 src);
+void cnum32_intersect_with_urange(struct cnum32 *dst, u32 min, u32 max);
+void cnum32_intersect_with_srange(struct cnum32 *dst, s32 min, s32 max);
+bool cnum32_contains(struct cnum32 cnum, u32 v);
+bool cnum32_is_const(struct cnum32 cnum);
+bool cnum32_is_empty(struct cnum32 cnum);
+struct cnum32 cnum32_add(struct cnum32 a, struct cnum32 b);
+struct cnum32 cnum32_negate(struct cnum32 a);
+bool cnum32_is_subset(struct cnum32 outer, struct cnum32 inner);
+
+/* Same as cnum32 but for 64-bit ranges */
+struct cnum64 {
+	u64 base;
+	u64 size;
+};
+
+#define CNUM64_UNBOUNDED ((struct cnum64){ .base = 0, .size = U64_MAX })
+#define CNUM64_EMPTY ((struct cnum64){ .base = U64_MAX, .size = U64_MAX })
+
+struct cnum64 cnum64_from_urange(u64 min, u64 max);
+struct cnum64 cnum64_from_srange(s64 min, s64 max);
+u64 cnum64_umin(struct cnum64 cnum);
+u64 cnum64_umax(struct cnum64 cnum);
+s64 cnum64_smin(struct cnum64 cnum);
+s64 cnum64_smax(struct cnum64 cnum);
+struct cnum64 cnum64_intersect(struct cnum64 a, struct cnum64 b);
+void cnum64_intersect_with(struct cnum64 *dst, struct cnum64 src);
+void cnum64_intersect_with_urange(struct cnum64 *dst, u64 min, u64 max);
+void cnum64_intersect_with_srange(struct cnum64 *dst, s64 min, s64 max);
+bool cnum64_contains(struct cnum64 cnum, u64 v);
+bool cnum64_is_const(struct cnum64 cnum);
+bool cnum64_is_empty(struct cnum64 cnum);
+struct cnum64 cnum64_add(struct cnum64 a, struct cnum64 b);
+struct cnum64 cnum64_negate(struct cnum64 a);
+bool cnum64_is_subset(struct cnum64 outer, struct cnum64 inner);
+
+struct cnum32 cnum32_from_cnum64(struct cnum64 cnum);
+struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b);
+
+#endif /* _LINUX_CNUM_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 88a241aac36a..67d337ede91b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -58,8 +58,9 @@ struct ctl_table_header;
 #define BPF_REG_H	BPF_REG_9	/* hlen, callee-saved */
 
 /* Kernel hidden auxiliary/helper register. */
-#define BPF_REG_AX		MAX_BPF_REG
-#define MAX_BPF_EXT_REG		(MAX_BPF_REG + 1)
+#define BPF_REG_PARAMS		MAX_BPF_REG
+#define BPF_REG_AX		(MAX_BPF_REG + 1)
+#define MAX_BPF_EXT_REG		(MAX_BPF_REG + 2)
 #define MAX_BPF_JIT_REG		MAX_BPF_EXT_REG
 
 /* unused opcode to mark special call to bpf_tail_call() helper */
@@ -748,6 +749,27 @@ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
 	return ret;
 }
 
+static inline bool is_stack_arg_ldx(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_LDX | BPF_MEM | BPF_DW) &&
+	       insn->src_reg == BPF_REG_PARAMS &&
+	       insn->off > 0 && insn->off % 8 == 0;
+}
+
+static inline bool is_stack_arg_st(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_ST | BPF_MEM | BPF_DW) &&
+	       insn->dst_reg == BPF_REG_PARAMS &&
+	       insn->off < 0 && insn->off % 8 == 0;
+}
+
+static inline bool is_stack_arg_stx(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_STX | BPF_MEM | BPF_DW) &&
+	       insn->dst_reg == BPF_REG_PARAMS &&
+	       insn->off < 0 && insn->off % 8 == 0;
+}
+
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
 struct bpf_skb_data_end {
@@ -1159,6 +1181,7 @@ bool bpf_jit_inlines_helper_call(s32 imm);
 bool bpf_jit_supports_subprog_tailcalls(void);
 bool bpf_jit_supports_percpu_insn(void);
 bool bpf_jit_supports_kfunc_call(void);
+bool bpf_jit_supports_stack_args(void);
 bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 28b30c6f1031..02bc5027523a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -415,6 +415,8 @@ struct ftrace_hash *alloc_ftrace_hash(int size_bits);
 void free_ftrace_hash(struct ftrace_hash *hash);
 struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash,
 						       unsigned long ip, unsigned long direct);
+void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry);
+void ftrace_hash_remove(struct ftrace_hash *hash);
 
 /* The hash used to know what functions callbacks trace */
 struct ftrace_ops_hash {
@@ -551,6 +553,8 @@ int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, b
 
 void ftrace_stub_direct_tramp(void);
 
+unsigned long ftrace_hash_count(struct ftrace_hash *hash);
+
 #else
 struct ftrace_ops;
 static inline unsigned long ftrace_find_rec_direct(unsigned long ip)
@@ -590,6 +594,11 @@ static inline int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace
 	return -ENODEV;
 }
 
+static inline unsigned long ftrace_hash_count(struct ftrace_hash *hash)
+{
+	return 0;
+}
+
 /*
  * This must be implemented by the architecture.
  * It is the way the ftrace direct_ops helper, when called
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index eca14547f9c1..2981e386da7b 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1070,6 +1070,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 }
 #endif
 
+#ifndef ptep_try_set
+/**
+ * ptep_try_set - atomically set an empty kernel PTE
+ * @ptep: page table entry
+ * @new_pte: value to install
+ *
+ * Atomically set *@ptep to @new_pte iff *@ptep is pte_none(). Return true on
+ * success, false if the slot was already populated or the arch has no
+ * implementation.
+ *
+ * For special kernel page tables only - never user page tables. The caller must
+ * prevent concurrent teardown of @ptep and must accept that other writers may
+ * race. Concurrent clearers must use ptep_get_and_clear() so racing accesses
+ * agree on the outcome.
+ *
+ * Architectures opt in by providing a cmpxchg-based override and defining
+ * ptep_try_set as an identity macro. The generic stub returns false, which is
+ * correct for callers that fall through to oops on failure.
+ */
+static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
+{
+	return false;
+}
+#endif
+
+#ifndef flush_tlb_before_set
+/**
+ * flush_tlb_before_set - flush a kernel PTE's TLB entries before re-setting it
+ * @addr: kernel virtual address whose PTE was just cleared
+ *
+ * Some architectures (e.g. arm64) do not allow a live page-table entry to be
+ * repointed at a different page in one step. The old entry must first be made
+ * invalid and its translation flushed from every TLB, and only then may the new
+ * entry be written.
+ *
+ * This is only for the lockless atomic kernel-PTE installers (ptep_try_set()).
+ * It must be callable with interrupts disabled.
+ */
+static inline void flush_tlb_before_set(unsigned long addr)
+{
+}
+#endif
+
 #ifndef wrprotect_ptes
 /**
  * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index ef5230cece36..79f83b6eec27 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -263,6 +263,8 @@ struct rhash_lock_head __rcu **__rht_bucket_nested(
 struct rhash_lock_head __rcu **rht_bucket_nested_insert(
 	struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);
 
+void *rhashtable_next_key(struct rhashtable *ht, const void *prev_key);
+
 #define rht_dereference(p, ht) \
 	rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))
 
@@ -1117,7 +1119,7 @@ unlocked:
 		atomic_dec(&ht->nelems);
 		if (unlikely(ht->p.automatic_shrinking &&
 			     rht_shrink_below_30(ht, tbl)))
-			schedule_work(&ht->run_work);
+			irq_work_queue(&ht->run_irq_work);
 		err = 0;
 	}
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 4fb7291f54b6..874d9067a43b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -940,7 +940,8 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 asmlinkage long sys_getrandom(char __user *buf, size_t count,
 			      unsigned int flags);
 asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
-asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size);
+asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size,
+			struct bpf_common_attr __user *attr_common, unsigned int size_common);
 asmlinkage long sys_execveat(int dfd, const char __user *filename,
 			const char __user *const __user *argv,
 			const char __user *const __user *envp, int flags);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 40a43a4c7caf..308c76b57d13 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -770,6 +770,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
 
 #ifdef CONFIG_BPF_EVENTS
 unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
+unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx);
 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
@@ -786,12 +787,18 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 			    unsigned long *missed);
 int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
 	return 1;
 }
 
+static inline unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
+{
+	return 1;
+}
+
 static inline int
 perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie)
 {
@@ -838,6 +845,11 @@ bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 {
 	return -EOPNOTSUPP;
 }
+static inline int
+bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 enum {
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index 9391d54d3f12..d1de8f9aa07f 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -58,9 +58,7 @@ static notrace void							\
 __bpf_trace_##call(void *__data, proto)					\
 {									\
 	might_fault();							\
-	preempt_disable_notrace();					\
 	CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args));	\
-	preempt_enable_notrace();					\
 }
 
 #undef DECLARE_EVENT_SYSCALL_CLASS
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 552bc5d9afbd..89b36de5fdbb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -994,6 +994,7 @@ enum bpf_cmd {
 	BPF_PROG_STREAM_READ_BY_FD,
 	BPF_PROG_ASSOC_STRUCT_OPS,
 	__MAX_BPF_CMD,
+	BPF_COMMON_ATTRS = 1 << 16, /* Indicate carrying syscall common attrs. */
 };
 
 enum bpf_map_type {
@@ -1046,6 +1047,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_CGRP_STORAGE,
 	BPF_MAP_TYPE_ARENA,
 	BPF_MAP_TYPE_INSN_ARRAY,
+	BPF_MAP_TYPE_RHASH,
 	__MAX_BPF_MAP_TYPE
 };
 
@@ -1154,6 +1156,9 @@ enum bpf_attach_type {
 	BPF_TRACE_KPROBE_SESSION,
 	BPF_TRACE_UPROBE_SESSION,
 	BPF_TRACE_FSESSION,
+	BPF_TRACE_FENTRY_MULTI,
+	BPF_TRACE_FEXIT_MULTI,
+	BPF_TRACE_FSESSION_MULTI,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1178,6 +1183,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
 	BPF_LINK_TYPE_NETKIT = 13,
 	BPF_LINK_TYPE_SOCKMAP = 14,
+	BPF_LINK_TYPE_TRACING_MULTI = 15,
 	__MAX_BPF_LINK_TYPE,
 };
 
@@ -1321,7 +1327,11 @@ enum {
  * BPF_TRACE_UPROBE_MULTI attach type to create return probe.
  */
 enum {
-	BPF_F_UPROBE_MULTI_RETURN = (1U << 0)
+	/* Get return uprobe. */
+	BPF_F_UPROBE_MULTI_RETURN     = (1U << 0),
+
+	/* Get path from provided path_fd. */
+	BPF_F_UPROBE_MULTI_PATH_FD    = (1U << 1),
 };
 
 /* link_create.netfilter.flags used in LINK_CREATE command for
@@ -1500,6 +1510,13 @@ struct bpf_stack_build_id {
 	};
 };
 
+struct bpf_common_attr {
+	__aligned_u64 log_buf;
+	__u32 log_size;
+	__u32 log_level;
+	__u32 log_true_size;
+};
+
 #define BPF_OBJ_NAME_LEN 16U
 
 enum {
@@ -1537,6 +1554,11 @@ union bpf_attr {
 		 *
 		 * BPF_MAP_TYPE_ARENA - contains the address where user space
 		 * is going to mmap() the arena. It has to be page aligned.
+		 *
+		 * BPF_MAP_TYPE_RHASH - initial table size hint
+		 * (nelem_hint). 0 = use rhashtable default. Must be
+		 * <= min(max_entries, U16_MAX). Upper 32 bits reserved,
+		 * must be zero.
 		 */
 		__u64	map_extra;
 
@@ -1846,6 +1868,7 @@ union bpf_attr {
 				__u32		cnt;
 				__u32		flags;
 				__u32		pid;
+				__u32		path_fd;
 			} uprobe_multi;
 			struct {
 				union {
@@ -1861,6 +1884,11 @@ union bpf_attr {
 				};
 				__u64		expected_revision;
 			} cgroup;
+			struct {
+				__aligned_u64	ids;
+				__aligned_u64	cookies;
+				__u32		cnt;
+			} tracing_multi;
 		};
 	} link_create;
 
@@ -6698,6 +6726,7 @@ struct bpf_prog_info {
 	__u32 verified_insns;
 	__u32 attach_btf_obj_id;
 	__u32 attach_btf_id;
+	__u32 :32;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -6719,6 +6748,7 @@ struct bpf_map_info {
 	__u64 map_extra;
 	__aligned_u64 hash;
 	__u32 hash_size;
+	__u32 :32;
 } __attribute__((aligned(8)));
 
 struct bpf_btf_info {
diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 638615ebddc2..618167cab4e6 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -33,20 +33,22 @@ struct btf_header {
 	__u32	layout_len;	/* length of layout section	*/
 };
 
-/* Max # of type identifier */
-#define BTF_MAX_TYPE	0x000fffff
-/* Max offset into the string section */
-#define BTF_MAX_NAME_OFFSET	0x00ffffff
-/* Max # of struct/union/enum members or func args */
-#define BTF_MAX_VLEN	0xffff
+enum btf_max {
+	/* Max possible kind */
+	BTF_MAX_KIND =		0x0000007f,
+	/* Max # of type identifier */
+	BTF_MAX_TYPE =		0x000fffff,
+	/* Max offset into the string section */
+	BTF_MAX_NAME_OFFSET =	0x00ffffff,
+	/* Max # of struct/union/enum members or func args */
+	BTF_MAX_VLEN =		0x00ffffff,
+};
 
 struct btf_type {
 	__u32 name_off;
 	/* "info" bits arrangement
-	 * bits  0-15: vlen (e.g. # of struct's members)
-	 * bits 16-23: unused
-	 * bits 24-28: kind (e.g. int, ptr, array...etc)
-	 * bits 29-30: unused
+	 * bits  0-23: vlen (e.g. # of struct's members)
+	 * bits 24-30: kind (e.g. int, ptr, array...etc)
 	 * bit     31: kind_flag, currently used by
 	 *             struct, union, enum, fwd, enum64,
 	 *             decl_tag and type_tag
@@ -65,8 +67,8 @@ struct btf_type {
 	};
 };
 
-#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x1f)
-#define BTF_INFO_VLEN(info)	((info) & 0xffff)
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x7f)
+#define BTF_INFO_VLEN(info)	((info) & 0xffffff)
 #define BTF_INFO_KFLAG(info)	((info) >> 31)
 
 enum {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 399007b67a92..4dc41bf5780c 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o const_fold.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o cnum.o log.o token.o liveness.o const_fold.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 49a8f7b1beef..af49c154473d 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -53,12 +53,15 @@ struct bpf_arena {
 	u64 user_vm_start;
 	u64 user_vm_end;
 	struct vm_struct *kern_vm;
+	struct page *scratch_page;
 	struct range_tree rt;
 	/* protects rt */
 	rqspinlock_t spinlock;
 	struct list_head vma_list;
 	/* protects vma_list */
 	struct mutex lock;
+	u64 zap_gen;
+	struct mutex zap_mutex;
 	struct irq_work     free_irq;
 	struct work_struct  free_work;
 	struct llist_head   free_spans;
@@ -83,6 +86,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
 	return arena ? arena->user_vm_start : 0;
 }
 
+/**
+ * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map *
+ * @map: a BPF_MAP_TYPE_ARENA map
+ *
+ * Return @map's kern_vm_start.
+ */
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map)
+{
+	return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map));
+}
+
+/**
+ * bpf_prog_arena - return the bpf_map of the arena referenced by @prog
+ * @prog: a loaded BPF program
+ *
+ * The verifier enforces at most one arena per program and stores it in
+ * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if
+ * @prog does not reference an arena.
+ */
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog)
+{
+	struct bpf_arena *arena = prog->aux->arena;
+
+	return arena ? &arena->map : NULL;
+}
+
 static long arena_map_peek_elem(struct bpf_map *map, void *value)
 {
 	return -EOPNOTSUPP;
@@ -115,26 +144,57 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
 
 struct apply_range_data {
 	struct page **pages;
+	struct page *scratch_page;
 	int i;
 };
 
+struct clear_range_data {
+	struct llist_head *free_pages;
+	struct page *scratch_page;
+};
+
 static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
 {
 	struct apply_range_data *d = data;
 	struct page *page;
+	pte_t pteval;
 
 	if (!data)
 		return 0;
-	/* sanity check */
-	if (unlikely(!pte_none(ptep_get(pte))))
-		return -EBUSY;
 
 	page = d->pages[d->i];
 	/* paranoia, similar to vmap_pages_pte_range() */
 	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
 		return -EINVAL;
 
-	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+	pteval = mk_pte(page, PAGE_KERNEL);
+#ifdef ptep_try_set
+	/*
+	 * Kernel-fault recovery may have installed the scratch page here, and
+	 * some architectures (arm64) prohibit valid->valid PTE transitions.
+	 * Install atomically into a none slot. If scratch is present, clear it
+	 * and flush_tlb_before_set() (break-before-make) before retrying.
+	 */
+	while (!ptep_try_set(pte, pteval)) {
+		pte_t old = ptep_get(pte);
+
+		if (pte_none(old))
+			continue;
+		if (WARN_ON_ONCE(pte_page(old) != d->scratch_page))
+			return -EBUSY;
+		ptep_get_and_clear(&init_mm, addr, pte);
+		flush_tlb_before_set(addr);
+	}
+#else
+	/*
+	 * Without ptep_try_set() there is no atomic installer, but such arches
+	 * also do not wire up bpf_arena_handle_page_fault(), so no scratch page
+	 * is ever installed and the slot is always none here.
+	 */
+	if (unlikely(!pte_none(ptep_get(pte))))
+		return -EBUSY;
+	set_pte_at(&init_mm, addr, pte, pteval);
+#endif
 	d->i++;
 	return 0;
 }
@@ -144,33 +204,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
 	flush_cache_vmap(start, start + size);
 }
 
-static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
 {
+	struct clear_range_data *d = data;
 	pte_t old_pte;
 	struct page *page;
 
-	/* sanity check */
-	old_pte = ptep_get(pte);
+	/*
+	 * Pairs with ptep_try_set() in the kernel-fault scratch installer.
+	 * Both sides must be atomic.
+	 */
+	old_pte = ptep_get_and_clear(&init_mm, addr, pte);
 	if (pte_none(old_pte) || !pte_present(old_pte))
-		return 0; /* nothing to do */
+		return 0;
 
 	page = pte_page(old_pte);
 	if (WARN_ON_ONCE(!page))
 		return -EINVAL;
 
-	pte_clear(&init_mm, addr, pte);
+	/*
+	 * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr
+	 * installs scratch_page in its PTE, and a later bpf_arena_free_pages()
+	 * over that range walks here. Without the skip, scratch_page would be freed.
+	 */
+	if (page == d->scratch_page)
+		return 0;
+
+	__llist_add(&page->pcp_llist, d->free_pages);
+	return 0;
+}
 
-	/* Add page to the list so it is freed later */
-	if (free_pages)
-		__llist_add(&page->pcp_llist, free_pages);
+static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	struct page *scratch_page = data;
 
+	if (!pte_none(ptep_get(pte)))
+		return 0;
+	/*
+	 * Best-effort install. An arch ptep_try_set() returns false only if
+	 * another installer (real allocation or concurrent fault) won the
+	 * cmpxchg. Their PTE is already valid, so the access retry succeeds.
+	 *
+	 * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just
+	 * cause one extra re-fault through this same path.
+	 */
+	ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL));
 	return 0;
 }
 
 static int populate_pgtable_except_pte(struct bpf_arena *arena)
 {
+	/* Populate intermediates for the recovery range (4 GiB + upper half-guard). */
 	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
-				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+				   SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL);
 }
 
 static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
@@ -221,22 +307,30 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 	init_irq_work(&arena->free_irq, arena_free_irq);
 	INIT_WORK(&arena->free_work, arena_free_worker);
 	bpf_map_init_from_attr(&arena->map, attr);
+
+	err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page);
+	if (err)
+		goto err_free_arena;
+
 	range_tree_init(&arena->rt);
 	err = range_tree_set(&arena->rt, 0, attr->max_entries);
-	if (err) {
-		bpf_map_area_free(arena);
-		goto err;
-	}
+	if (err)
+		goto err_free_scratch;
 	mutex_init(&arena->lock);
+	mutex_init(&arena->zap_mutex);
 	raw_res_spin_lock_init(&arena->spinlock);
 	err = populate_pgtable_except_pte(arena);
-	if (err) {
-		range_tree_destroy(&arena->rt);
-		bpf_map_area_free(arena);
-		goto err;
-	}
+	if (err)
+		goto err_destroy_rt;
 
 	return &arena->map;
+
+err_destroy_rt:
+	range_tree_destroy(&arena->rt);
+err_free_scratch:
+	__free_page(arena->scratch_page);
+err_free_arena:
+	bpf_map_area_free(arena);
 err:
 	free_vm_area(kern_vm);
 	return ERR_PTR(err);
@@ -244,6 +338,7 @@ err:
 
 static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
 {
+	struct bpf_arena *arena = data;
 	struct page *page;
 	pte_t pte;
 
@@ -252,6 +347,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
 		return 0;
 	page = pte_page(pte);
 	/*
+	 * Skip the scratch page. The walk is page-table-driven, not range-tree-driven,
+	 * so it can visit scratch PTEs at uaddrs the BPF program never allocated.
+	 */
+	if (page == arena->scratch_page)
+		return 0;
+	/*
 	 * We do not update pte here:
 	 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
 	 * 2. TLB flushing is batched or deferred. Even if we clear pte,
@@ -286,9 +387,10 @@ static void arena_map_free(struct bpf_map *map)
 	 * free those pages.
 	 */
 	apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
-				     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+				     SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
 	free_vm_area(arena->kern_vm);
 	range_tree_destroy(&arena->rt);
+	__free_page(arena->scratch_page);
 	bpf_map_area_free(arena);
 }
 
@@ -318,6 +420,7 @@ struct vma_list {
 	struct vm_area_struct *vma;
 	struct list_head head;
 	refcount_t mmap_count;
+	u64 zap_gen;
 };
 
 static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
@@ -330,6 +433,7 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
 	refcount_set(&vml->mmap_count, 1);
 	vma->vm_private_data = vml;
 	vml->vma = vma;
+	vml->zap_gen = 0;
 	list_add(&vml->head, &arena->vma_list);
 	return 0;
 }
@@ -384,33 +488,38 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		return VM_FAULT_RETRY;
 
 	page = vmalloc_to_page((void *)kaddr);
-	if (page)
+	if (page) {
+		if (page == arena->scratch_page)
+			/* BPF triggered scratch here; don't lazy-alloc over it */
+			goto out_sigsegv;
 		/* already have a page vmap-ed */
 		goto out;
+	}
 
 	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
 
 	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
 		/* User space requested to segfault when page is not allocated by bpf prog */
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 
 	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
 	if (ret)
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 
-	struct apply_range_data data = { .pages = &page, .i = 0 };
+	struct apply_range_data data = { .pages = &page, .i = 0,
+					 .scratch_page = arena->scratch_page };
 	/* Account into memcg of the process that created bpf_arena */
 	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 	}
 
 	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		free_pages_nolock(page, 0);
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 	}
 	flush_vmap_cache(kaddr, PAGE_SIZE);
 	bpf_map_memcg_exit(old_memcg, new_memcg);
@@ -419,8 +528,9 @@ out:
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	vmf->page = page;
 	return 0;
-out_unlock_sigsegv:
+out_sigsegv_memcg:
 	bpf_map_memcg_exit(old_memcg, new_memcg);
+out_sigsegv:
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	return VM_FAULT_SIGSEGV;
 }
@@ -587,6 +697,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 		return 0;
 	}
 	data.pages = pages;
+	data.scratch_page = arena->scratch_page;
 
 	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
 		goto out_free_pages;
@@ -668,12 +779,60 @@ out_free_pages:
  */
 static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
 {
+	unsigned long size = (unsigned long)page_cnt << PAGE_SHIFT;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
 	struct vma_list *vml;
+	unsigned long vm_start;
+	u64 my_gen;
 
-	guard(mutex)(&arena->lock);
-	/* iterate link list under lock */
-	list_for_each_entry(vml, &arena->vma_list, head)
-		zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt);
+	/*
+	 * Taking mmap_read_lock() under arena->lock would deadlock against
+	 * arena_vm_close(), which runs with mmap_write_lock held and then
+	 * acquires arena->lock. Drop arena->lock for mmap_read_lock().
+	 *
+	 * Use per-call my_gen, recorded in vml->zap_gen, to remember which
+	 * vmls this invocation has already processed across the lock drop.
+	 * Hold zap_mutex around the whole walk so concurrent zap_pages()
+	 * callers cannot overwrite each other's marks on shared vmls --
+	 * otherwise call B's mark would make call A skip a vml that A has
+	 * not yet zapped for A's uaddr range.
+	 */
+	mutex_lock(&arena->zap_mutex);
+	mutex_lock(&arena->lock);
+	my_gen = ++arena->zap_gen;
+	for (;;) {
+		mm = NULL;
+		list_for_each_entry(vml, &arena->vma_list, head) {
+			if (vml->zap_gen >= my_gen)
+				continue;
+			vml->zap_gen = my_gen;
+			if (!mmget_not_zero(vml->vma->vm_mm))
+				continue;
+			mm = vml->vma->vm_mm;
+			vm_start = vml->vma->vm_start;
+			break;
+		}
+		if (!mm)
+			break;
+		mutex_unlock(&arena->lock);
+
+		mmap_read_lock(mm);
+		/*
+		 * Re-resolve: while we waited the VMA could have been unmapped
+		 * and a different mapping installed at the same address.
+		 */
+		vma = find_vma(mm, vm_start);
+		if (vma && vma->vm_start == vm_start &&
+		    vma->vm_file && vma->vm_file->private_data == &arena->map)
+			zap_vma_range(vma, uaddr, size);
+		mmap_read_unlock(mm);
+		mmput(mm);
+
+		mutex_lock(&arena->lock);
+	}
+	mutex_unlock(&arena->lock);
+	mutex_unlock(&arena->zap_mutex);
 }
 
 static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
@@ -685,6 +844,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 	struct llist_head free_pages;
 	struct llist_node *pos, *t;
 	struct arena_free_span *s;
+	struct clear_range_data cdata;
 	unsigned long flags;
 	int ret = 0;
 
@@ -713,9 +873,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 	range_tree_set(&arena->rt, pgoff, page_cnt);
 
 	init_llist_head(&free_pages);
+	cdata.free_pages = &free_pages;
+	cdata.scratch_page = arena->scratch_page;
 	/* clear ptes and collect struct pages */
 	apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
-				     apply_range_clear_cb, &free_pages);
+				     apply_range_clear_cb, &cdata);
 
 	/* drop the lock to do the tlb flush and zap pages */
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
@@ -805,6 +967,7 @@ static void arena_free_worker(struct work_struct *work)
 	struct arena_free_span *s;
 	u64 arena_vm_start, user_vm_start;
 	struct llist_head free_pages;
+	struct clear_range_data cdata;
 	struct page *page;
 	unsigned long full_uaddr;
 	long kaddr, page_cnt, pgoff;
@@ -818,6 +981,8 @@ static void arena_free_worker(struct work_struct *work)
 	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
 
 	init_llist_head(&free_pages);
+	cdata.free_pages = &free_pages;
+	cdata.scratch_page = arena->scratch_page;
 	arena_vm_start = bpf_arena_get_kern_vm_start(arena);
 	user_vm_start = bpf_arena_get_user_vm_start(arena);
 
@@ -830,7 +995,7 @@ static void arena_free_worker(struct work_struct *work)
 
 		/* clear ptes and collect pages in free_pages llist */
 		apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
-					     apply_range_clear_cb, &free_pages);
+					     apply_range_clear_cb, &cdata);
 
 		range_tree_set(&arena->rt, pgoff, page_cnt);
 	}
@@ -893,6 +1058,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
 
 	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
 }
+
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+				      int node_id, u64 flags)
+{
+	struct bpf_map *map = p__map;
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+		return NULL;
+
+	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+}
+
 __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 	struct bpf_map *map = p__map;
@@ -945,23 +1123,12 @@ static int __init kfunc_init(void)
 }
 late_initcall(kfunc_init);
 
-void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write,
+					      unsigned long addr, unsigned long fault_ip)
 {
 	struct bpf_stream_stage ss;
-	struct bpf_prog *prog;
 	u64 user_vm_start;
 
-	/*
-	 * The RCU read lock is held to safely traverse the latch tree, but we
-	 * don't need its protection when accessing the prog, since it will not
-	 * disappear while we are handling the fault.
-	 */
-	rcu_read_lock();
-	prog = bpf_prog_ksym_find(fault_ip);
-	rcu_read_unlock();
-	if (!prog)
-		return;
-
 	/* Use main prog for stream access */
 	prog = prog->aux->main_prog_aux->prog;
 
@@ -974,3 +1141,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo
 		bpf_stream_dump_stack(ss);
 	}));
 }
+
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip)
+{
+	struct bpf_arena *arena;
+	struct bpf_prog *prog;
+	unsigned long kbase;
+	unsigned long page_addr = addr & PAGE_MASK;
+
+	prog = bpf_prog_find_from_stack();
+	if (!prog)
+		return false;
+
+	arena = prog->aux->arena;
+	/* a prog not using arena may be on stack, so arena can be NULL */
+	if (!arena)
+		return false;
+
+	kbase = bpf_arena_get_kern_vm_start(arena);
+
+	/*
+	 * Recovery covers the 4 GiB mappable band plus the upper half-guard.
+	 * Lower guard is unreachable from kfuncs; an address there indicates
+	 * a different bug class - leave it to the regular kernel oops path.
+	 */
+	if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2)
+		return false;
+
+	apply_to_page_range(&init_mm, page_addr, PAGE_SIZE,
+			    apply_range_set_scratch_cb, arena->scratch_page);
+	flush_vmap_cache(page_addr, PAGE_SIZE);
+	__bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip);
+	return true;
+}
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+	struct bpf_prog *prog;
+
+	/*
+	 * The RCU read lock is held to safely traverse the latch tree, but we
+	 * don't need its protection when accessing the prog, since it will not
+	 * disappear while we are handling the fault.
+	 */
+	rcu_read_lock();
+	prog = bpf_prog_ksym_find(fault_ip);
+	rcu_read_unlock();
+	if (!prog)
+		return;
+	__bpf_prog_report_arena_violation(prog, write, addr, fault_ip);
+}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index dfb2110ab733..248b4818178c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -175,14 +175,12 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
 	return array->value + (u64)array->elem_size * (index & array->index_mask);
 }
 
-static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size,
-			       void *hash_buf)
+static int array_map_get_hash(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 
 	sha256(array->value, (u64)array->elem_size * array->map.max_entries,
-	       hash_buf);
-	memcpy(array->map.sha, hash_buf, sizeof(array->map.sha));
+	       array->map.sha);
 	return 0;
 }
 
@@ -386,7 +384,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
 		copy_map_value(map, val, value);
-		bpf_obj_free_fields(array->map.record, val);
+		bpf_obj_cancel_fields(map, val);
 	} else {
 		val = array->value +
 			(u64)array->elem_size * (index & array->index_mask);
@@ -394,7 +392,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
 			copy_map_value_locked(map, val, value, false);
 		else
 			copy_map_value(map, val, value);
-		bpf_obj_free_fields(array->map.record, val);
+		bpf_obj_cancel_fields(map, val);
 	}
 	return 0;
 }
@@ -434,14 +432,14 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 		cpu = map_flags >> 32;
 		ptr = per_cpu_ptr(pptr, cpu);
 		copy_map_value(map, ptr, value);
-		bpf_obj_free_fields(array->map.record, ptr);
+		bpf_obj_cancel_fields(map, ptr);
 		goto unlock;
 	}
 	for_each_possible_cpu(cpu) {
 		ptr = per_cpu_ptr(pptr, cpu);
 		val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
 		copy_map_value(map, ptr, val);
-		bpf_obj_free_fields(array->map.record, ptr);
+		bpf_obj_cancel_fields(map, ptr);
 	}
 unlock:
 	rcu_read_unlock();
diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c
index 854731dc93fe..2e4ae0ef0860 100644
--- a/kernel/bpf/backtrack.c
+++ b/kernel/bpf/backtrack.c
@@ -9,7 +9,7 @@
 
 /* for any branch, call, exit record the history of jmps in the given state */
 int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
-			 int insn_flags, u64 linked_regs)
+			 int insn_flags, int spi, int frame, u64 linked_regs)
 {
 	u32 cnt = cur->jmp_history_cnt;
 	struct bpf_jmp_history_entry *p;
@@ -25,6 +25,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state
 				env, "insn history: insn_idx %d cur flags %x new flags %x",
 				env->insn_idx, env->cur_hist_ent->flags, insn_flags);
 		env->cur_hist_ent->flags |= insn_flags;
+		env->cur_hist_ent->spi = spi;
+		env->cur_hist_ent->frame = frame;
 		verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
 				"insn history: insn_idx %d linked_regs: %#llx",
 				env->insn_idx, env->cur_hist_ent->linked_regs);
@@ -43,6 +45,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state
 	p->idx = env->insn_idx;
 	p->prev_idx = env->prev_insn_idx;
 	p->flags = insn_flags;
+	p->spi = spi;
+	p->frame = frame;
 	p->linked_regs = linked_regs;
 	cur->jmp_history_cnt = cnt;
 	env->cur_hist_ent = p;
@@ -64,16 +68,6 @@ static bool is_atomic_fetch_insn(const struct bpf_insn *insn)
 	       (insn->imm & BPF_FETCH);
 }
 
-static int insn_stack_access_spi(int insn_flags)
-{
-	return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
-}
-
-static int insn_stack_access_frameno(int insn_flags)
-{
-	return insn_flags & INSN_F_FRAMENO_MASK;
-}
-
 /* Backtrack one insn at a time. If idx is not at the top of recorded
  * history then previous instruction came from straight line execution.
  * Return -ENOENT if we exhausted all instructions within given state.
@@ -135,11 +129,21 @@ static inline u32 bt_empty(struct backtrack_state *bt)
 	int i;
 
 	for (i = 0; i <= bt->frame; i++)
-		mask |= bt->reg_masks[i] | bt->stack_masks[i];
+		mask |= bt->reg_masks[i] | bt->stack_masks[i] | bt->stack_arg_masks[i];
 
 	return mask == 0;
 }
 
+static inline void bt_clear_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+	bt->stack_arg_masks[frame] &= ~(1 << slot);
+}
+
+static inline bool bt_is_frame_stack_arg_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+	return bt->stack_arg_masks[frame] & (1 << slot);
+}
+
 static inline int bt_subprog_enter(struct backtrack_state *bt)
 {
 	if (bt->frame == MAX_CALL_FRAMES - 1) {
@@ -200,6 +204,11 @@ static inline u64 bt_stack_mask(struct backtrack_state *bt)
 	return bt->stack_masks[bt->frame];
 }
 
+static inline u8 bt_stack_arg_mask(struct backtrack_state *bt)
+{
+	return bt->stack_arg_masks[bt->frame];
+}
+
 static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
 {
 	return bt->reg_masks[bt->frame] & (1 << reg);
@@ -341,6 +350,19 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 			return 0;
 		bt_clear_reg(bt, load_reg);
 
+		if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) {
+			spi = hist->spi;
+			/*
+			 * Stack arg read: callee reads from r11+off, but
+			 * the data lives in the caller's stack_arg_regs.
+			 * Set the mask in the caller frame so precision
+			 * is marked in the caller's slot at the callee
+			 * entry checkpoint.
+			 */
+			bt_set_frame_stack_arg_slot(bt, bt->frame - 1, spi);
+			return 0;
+		}
+
 		/* scalars can only be spilled into stack w/o losing precision.
 		 * Load from any other memory can be zero extended.
 		 * The desire to keep that precision is already indicated
@@ -353,8 +375,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 		 * that [fp - off] slot contains scalar that needs to be
 		 * tracked with precision
 		 */
-		spi = insn_stack_access_spi(hist->flags);
-		fr = insn_stack_access_frameno(hist->flags);
+		spi = hist->spi;
+		fr = hist->frame;
 		bpf_bt_set_frame_slot(bt, fr, spi);
 	} else if (class == BPF_STX || class == BPF_ST) {
 		if (bt_is_reg_set(bt, dreg))
@@ -363,11 +385,22 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 			 * encountered a case of pointer subtraction.
 			 */
 			return -ENOTSUPP;
+
+		if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) {
+			spi = hist->spi;
+			if (!bt_is_frame_stack_arg_slot_set(bt, bt->frame, spi))
+				return 0;
+			bt_clear_frame_stack_arg_slot(bt, bt->frame, spi);
+			if (class == BPF_STX)
+				bt_set_reg(bt, sreg);
+			return 0;
+		}
+
 		/* scalars can only be spilled into stack */
 		if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
 			return 0;
-		spi = insn_stack_access_spi(hist->flags);
-		fr = insn_stack_access_frameno(hist->flags);
+		spi = hist->spi;
+		fr = hist->frame;
 		if (!bt_is_frame_slot_set(bt, fr, spi))
 			return 0;
 		bt_clear_frame_slot(bt, fr, spi);
@@ -431,6 +464,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 						bpf_bt_set_frame_reg(bt, bt->frame - 1, i);
 					}
 				}
+				if (bt_stack_arg_mask(bt)) {
+					verifier_bug(env,
+						     "static subprog leftover stack arg slots %x",
+						     bt_stack_arg_mask(bt));
+					return -EFAULT;
+				}
 				if (bt_subprog_exit(bt))
 					return -EFAULT;
 				return 0;
@@ -901,6 +940,17 @@ int bpf_mark_chain_precision(struct bpf_verifier_env *env,
 					*changed = true;
 				}
 			}
+			for (i = 0; i < func->out_stack_arg_cnt; i++) {
+				if (!bt_is_frame_stack_arg_slot_set(bt, fr, i))
+					continue;
+				reg = &func->stack_arg_regs[i];
+				if (reg->type != SCALAR_VALUE || reg->precise) {
+					bt_clear_frame_stack_arg_slot(bt, fr, i);
+				} else {
+					reg->precise = true;
+					*changed = true;
+				}
+			}
 			if (env->log.level & BPF_LOG_LEVEL2) {
 				fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
 					     bt_frame_reg_mask(bt, fr));
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e7a2fc60523f..5ed7cb4b98c0 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -13,23 +13,8 @@
 #define PERCPU_FREE_TARGET		(4)
 #define PERCPU_NR_SCANS			PERCPU_FREE_TARGET
 
-/* Helpers to get the local list index */
-#define LOCAL_LIST_IDX(t)	((t) - BPF_LOCAL_LIST_T_OFFSET)
-#define LOCAL_FREE_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
-#define LOCAL_PENDING_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
 #define IS_LOCAL_LIST_TYPE(t)	((t) >= BPF_LOCAL_LIST_T_OFFSET)
 
-/* Local list helpers */
-static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
-{
-	return &loc_l->lists[LOCAL_FREE_LIST_IDX];
-}
-
-static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
-{
-	return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
-}
-
 /* bpf_lru_node helpers */
 static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
 {
@@ -72,6 +57,7 @@ static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
 	bpf_lru_list_count_dec(l, node->type);
 
 	node->type = tgt_free_type;
+	WRITE_ONCE(node->pending_free, 0);
 	list_move(&node->list, free_list);
 }
 
@@ -87,6 +73,9 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
 	bpf_lru_list_count_inc(l, tgt_type);
 	node->type = tgt_type;
 	bpf_lru_node_clear_ref(node);
+	/* Reset pending_free only when moving to the free list */
+	if (tgt_type == BPF_LRU_LIST_T_FREE)
+		WRITE_ONCE(node->pending_free, 0);
 	list_move(&node->list, &l->lists[tgt_type]);
 }
 
@@ -212,9 +201,11 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
 	unsigned int i = 0;
 
 	list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
-		if (bpf_lru_node_is_ref(node)) {
+		if (bpf_lru_node_is_ref(node) &&
+		    !READ_ONCE(node->pending_free)) {
 			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
-		} else if (lru->del_from_htab(lru->del_arg, node)) {
+		} else if (READ_ONCE(node->pending_free) ||
+			   lru->del_from_htab(lru->del_arg, node)) {
 			__bpf_lru_node_move_to_free(l, node, free_list,
 						    tgt_free_type);
 			if (++nshrinked == tgt_nshrink)
@@ -273,7 +264,8 @@ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
 
 	list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list,
 					 list) {
-		if (lru->del_from_htab(lru->del_arg, node)) {
+		if (READ_ONCE(node->pending_free) ||
+		    lru->del_from_htab(lru->del_arg, node)) {
 			__bpf_lru_node_move_to_free(l, node, free_list,
 						    tgt_free_type);
 			return 1;
@@ -290,8 +282,10 @@ static void __local_list_flush(struct bpf_lru_list *l,
 	struct bpf_lru_node *node, *tmp_node;
 
 	list_for_each_entry_safe_reverse(node, tmp_node,
-					 local_pending_list(loc_l), list) {
-		if (bpf_lru_node_is_ref(node))
+					 &loc_l->pending_list, list) {
+		if (READ_ONCE(node->pending_free))
+			__bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_FREE);
+		else if (bpf_lru_node_is_ref(node))
 			__bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE);
 		else
 			__bpf_lru_node_move_in(l, node,
@@ -307,9 +301,12 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
 	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
 		return;
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&l->lock, flags)) {
+		WRITE_ONCE(node->pending_free, 1);
+		return;
+	}
 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&l->lock, flags);
 }
 
 static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
@@ -318,8 +315,10 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 	struct bpf_lru_list *l = &lru->common_lru.lru_list;
 	struct bpf_lru_node *node, *tmp_node;
 	unsigned int nfree = 0;
+	LIST_HEAD(tmp_free);
 
-	raw_spin_lock(&l->lock);
+	if (raw_res_spin_lock(&l->lock))
+		return;
 
 	__local_list_flush(l, loc_l);
 
@@ -327,7 +326,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 
 	list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE],
 				 list) {
-		__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
+		__bpf_lru_node_move_to_free(l, node, &tmp_free,
 					    BPF_LRU_LOCAL_LIST_T_FREE);
 		if (++nfree == lru->target_free)
 			break;
@@ -335,10 +334,19 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 
 	if (nfree < lru->target_free)
 		__bpf_lru_list_shrink(lru, l, lru->target_free - nfree,
-				      local_free_list(loc_l),
+				      &tmp_free,
 				      BPF_LRU_LOCAL_LIST_T_FREE);
 
-	raw_spin_unlock(&l->lock);
+	raw_res_spin_unlock(&l->lock);
+
+	/*
+	 * Transfer the harvested nodes from the temporary list_head into
+	 * the lockless per-CPU free llist.
+	 */
+	list_for_each_entry_safe(node, tmp_node, &tmp_free, list) {
+		list_del(&node->list);
+		llist_add(&node->llist, &loc_l->free_llist);
+	}
 }
 
 static void __local_list_add_pending(struct bpf_lru *lru,
@@ -350,22 +358,21 @@ static void __local_list_add_pending(struct bpf_lru *lru,
 	*(u32 *)((void *)node + lru->hash_offset) = hash;
 	node->cpu = cpu;
 	node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
+	WRITE_ONCE(node->pending_free, 0);
 	bpf_lru_node_clear_ref(node);
-	list_add(&node->list, local_pending_list(loc_l));
+	list_add(&node->list, &loc_l->pending_list);
 }
 
 static struct bpf_lru_node *
 __local_list_pop_free(struct bpf_lru_locallist *loc_l)
 {
-	struct bpf_lru_node *node;
+	struct llist_node *llnode;
 
-	node = list_first_entry_or_null(local_free_list(loc_l),
-					struct bpf_lru_node,
-					list);
-	if (node)
-		list_del(&node->list);
+	llnode = llist_del_first(&loc_l->free_llist);
+	if (!llnode)
+		return NULL;
 
-	return node;
+	return container_of(llnode, struct bpf_lru_node, llist);
 }
 
 static struct bpf_lru_node *
@@ -376,10 +383,10 @@ __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
 
 ignore_ref:
 	/* Get from the tail (i.e. older element) of the pending list. */
-	list_for_each_entry_reverse(node, local_pending_list(loc_l),
-				    list) {
+	list_for_each_entry_reverse(node, &loc_l->pending_list, list) {
 		if ((!bpf_lru_node_is_ref(node) || force) &&
-		    lru->del_from_htab(lru->del_arg, node)) {
+		    (READ_ONCE(node->pending_free) ||
+		     lru->del_from_htab(lru->del_arg, node))) {
 			list_del(&node->list);
 			return node;
 		}
@@ -404,7 +411,8 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 
 	l = per_cpu_ptr(lru->percpu_lru, cpu);
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&l->lock, flags))
+		return NULL;
 
 	__bpf_lru_list_rotate(lru, l);
 
@@ -420,7 +428,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
 	}
 
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&l->lock, flags);
 
 	return node;
 }
@@ -437,7 +445,8 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 
 	loc_l = per_cpu_ptr(clru->local_list, cpu);
 
-	raw_spin_lock_irqsave(&loc_l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+		return NULL;
 
 	node = __local_list_pop_free(loc_l);
 	if (!node) {
@@ -448,17 +457,22 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 	if (node)
 		__local_list_add_pending(lru, loc_l, cpu, node, hash);
 
-	raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 
 	if (node)
 		return node;
 
-	/* No free nodes found from the local free list and
+	/*
+	 * No free nodes found from the local free list and
 	 * the global LRU list.
 	 *
 	 * Steal from the local free/pending list of the
 	 * current CPU and remote CPU in RR.  It starts
 	 * with the loc_l->next_steal CPU.
+	 *
+	 * Acquire the victim's lock before touching either list. On
+	 * acquisition failure (rqspinlock AA or timeout) skip the victim
+	 * and try the next CPU.
 	 */
 
 	first_steal = loc_l->next_steal;
@@ -466,24 +480,36 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 	do {
 		steal_loc_l = per_cpu_ptr(clru->local_list, steal);
 
-		raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
-
-		node = __local_list_pop_free(steal_loc_l);
-		if (!node)
-			node = __local_list_pop_pending(lru, steal_loc_l);
-
-		raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+		if (!raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags)) {
+			node = __local_list_pop_free(steal_loc_l);
+			if (!node)
+				node = __local_list_pop_pending(lru, steal_loc_l);
+			raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+		}
 
 		steal = cpumask_next_wrap(steal, cpu_possible_mask);
 	} while (!node && steal != first_steal);
 
 	loc_l->next_steal = steal;
 
-	if (node) {
-		raw_spin_lock_irqsave(&loc_l->lock, flags);
-		__local_list_add_pending(lru, loc_l, cpu, node, hash);
-		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+	if (!node)
+		return NULL;
+
+	if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) {
+		/*
+		 * The local pending lock can't be acquired (rqspinlock AA
+		 * or timeout). Return the stolen node to the per-CPU
+		 * free_llist instead of orphaning it; the next pop_free on
+		 * this CPU will pick it up.
+		 */
+		node->type = BPF_LRU_LOCAL_LIST_T_FREE;
+		bpf_lru_node_clear_ref(node);
+		WRITE_ONCE(node->pending_free, 0);
+		llist_add(&node->llist, &loc_l->free_llist);
+		return NULL;
 	}
+	__local_list_add_pending(lru, loc_l, cpu, node, hash);
+	raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 
 	return node;
 }
@@ -511,18 +537,24 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
 
 		loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
 
-		raw_spin_lock_irqsave(&loc_l->lock, flags);
+		if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) {
+			WRITE_ONCE(node->pending_free, 1);
+			return;
+		}
 
 		if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
-			raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+			raw_res_spin_unlock_irqrestore(&loc_l->lock,
+						       flags);
 			goto check_lru_list;
 		}
 
 		node->type = BPF_LRU_LOCAL_LIST_T_FREE;
 		bpf_lru_node_clear_ref(node);
-		list_move(&node->list, local_free_list(loc_l));
+		list_del(&node->list);
+
+		raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 
-		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+		llist_add(&node->llist, &loc_l->free_llist);
 		return;
 	}
 
@@ -538,11 +570,14 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
 
 	l = per_cpu_ptr(lru->percpu_lru, node->cpu);
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&l->lock, flags)) {
+		WRITE_ONCE(node->pending_free, 1);
+		return;
+	}
 
 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
 
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&l->lock, flags);
 }
 
 void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
@@ -565,6 +600,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
 
 		node = (struct bpf_lru_node *)(buf + node_offset);
 		node->type = BPF_LRU_LIST_T_FREE;
+		node->pending_free = 0;
 		bpf_lru_node_clear_ref(node);
 		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
 		buf += elem_size;
@@ -594,6 +630,7 @@ again:
 		node = (struct bpf_lru_node *)(buf + node_offset);
 		node->cpu = cpu;
 		node->type = BPF_LRU_LIST_T_FREE;
+		node->pending_free = 0;
 		bpf_lru_node_clear_ref(node);
 		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
 		i++;
@@ -618,14 +655,12 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
 
 static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
 {
-	int i;
-
-	for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
-		INIT_LIST_HEAD(&loc_l->lists[i]);
+	INIT_LIST_HEAD(&loc_l->pending_list);
+	init_llist_head(&loc_l->free_llist);
 
 	loc_l->next_steal = cpu;
 
-	raw_spin_lock_init(&loc_l->lock);
+	raw_res_spin_lock_init(&loc_l->lock);
 }
 
 static void bpf_lru_list_init(struct bpf_lru_list *l)
@@ -640,7 +675,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
 
 	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
 
-	raw_spin_lock_init(&l->lock);
+	raw_res_spin_lock_init(&l->lock);
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index fe2661a58ea9..8d0ee61622af 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -6,11 +6,11 @@
 
 #include <linux/cache.h>
 #include <linux/list.h>
-#include <linux/spinlock_types.h>
+#include <linux/llist.h>
+#include <asm/rqspinlock.h>
 
 #define NR_BPF_LRU_LIST_T	(3)
 #define NR_BPF_LRU_LIST_COUNT	(2)
-#define NR_BPF_LRU_LOCAL_LIST_T (2)
 #define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T
 
 enum bpf_lru_list_type {
@@ -22,10 +22,22 @@ enum bpf_lru_list_type {
 };
 
 struct bpf_lru_node {
-	struct list_head list;
+	/*
+	 * A node is in at most one list at a time. The free path on the
+	 * per-CPU locallist uses an llist, so share storage via a union.
+	 */
+	union {
+		struct list_head list;
+		struct llist_node llist;
+	};
 	u16 cpu;
 	u8 type;
 	u8 ref;
+	/*
+	 * Marks nodes whose *_push_free() lock acquire failed; reclaimed by
+	 * flush/shrink/pop_pending, which honor it instead of del_from_htab().
+	 */
+	u8 pending_free;
 };
 
 struct bpf_lru_list {
@@ -34,13 +46,14 @@ struct bpf_lru_list {
 	/* The next inactive list rotation starts from here */
 	struct list_head *next_inactive_rotation;
 
-	raw_spinlock_t lock ____cacheline_aligned_in_smp;
+	rqspinlock_t lock ____cacheline_aligned_in_smp;
 };
 
 struct bpf_lru_locallist {
-	struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
+	struct list_head pending_list;
+	struct llist_head free_llist;
 	u16 next_steal;
-	raw_spinlock_t lock;
+	rqspinlock_t lock;
 };
 
 struct bpf_common_lru {
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index c5c925f00202..564071a92d7d 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -427,6 +427,26 @@ BTF_ID(func, bpf_lsm_audit_rule_known)
 BTF_ID(func, bpf_lsm_inode_xattr_skipcap)
 BTF_SET_END(bool_lsm_hooks)
 
+/* hooks returning void */
+#define LSM_HOOK_void(DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
+#define LSM_HOOK_int(DEFAULT, NAME, ...)  /* nothing */
+#define LSM_HOOK(RET, DEFAULT, NAME, ...) LSM_HOOK_##RET(DEFAULT, NAME, __VA_ARGS__)
+BTF_SET_START(void_lsm_hooks)
+#include <linux/lsm_hook_defs.h>
+#undef LSM_HOOK
+#undef LSM_HOOK_void
+#undef LSM_HOOK_int
+BTF_SET_END(void_lsm_hooks)
+
+bool bpf_lsm_hook_returns_errno(u32 btf_id)
+{
+	if (btf_id_set_contains(&bool_lsm_hooks, btf_id))
+		return false;
+	if (btf_id_set_contains(&void_lsm_hooks, btf_id))
+		return false;
+	return true;
+}
+
 int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
 			     struct bpf_retval_range *retval_range)
 {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 521cb9d7e8c7..51b16e5f5534 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -594,8 +594,8 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
 	.dealloc = bpf_struct_ops_link_dealloc,
 };
 
-int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
-				      struct bpf_tramp_link *link,
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_nodes *tnodes,
+				      struct bpf_tramp_node *node,
 				      const struct btf_func_model *model,
 				      void *stub_func,
 				      void **_image, u32 *_image_off,
@@ -605,13 +605,13 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 	void *image = *_image;
 	int size;
 
-	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
-	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
+	tnodes[BPF_TRAMP_FENTRY].nodes[0] = node;
+	tnodes[BPF_TRAMP_FENTRY].nr_nodes = 1;
 
 	if (model->ret_size > 0)
 		flags |= BPF_TRAMP_F_RET_FENTRY_RET;
 
-	size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func);
+	size = arch_bpf_trampoline_size(model, flags, tnodes, stub_func);
 	if (size <= 0)
 		return size ? : -EFAULT;
 
@@ -628,7 +628,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 
 	size = arch_prepare_bpf_trampoline(NULL, image + image_off,
 					   image + image_off + size,
-					   model, flags, tlinks, stub_func);
+					   model, flags, tnodes, stub_func);
 	if (size <= 0) {
 		if (image != *_image)
 			bpf_struct_ops_image_free(image);
@@ -693,7 +693,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	const struct btf_type *module_type;
 	const struct btf_member *member;
 	const struct btf_type *t = st_ops_desc->type;
-	struct bpf_tramp_links *tlinks;
+	struct bpf_tramp_nodes *tnodes;
 	void *udata, *kdata;
 	int prog_fd, err;
 	u32 i, trampoline_start, image_off = 0;
@@ -720,8 +720,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	if (uvalue->common.state || refcount_read(&uvalue->common.refcnt))
 		return -EINVAL;
 
-	tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX);
-	if (!tlinks)
+	tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX);
+	if (!tnodes)
 		return -ENOMEM;
 
 	uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
@@ -817,8 +817,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 			err = -ENOMEM;
 			goto reset_unlock;
 		}
-		bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
-			      &bpf_struct_ops_link_lops, prog, prog->expected_attach_type);
+		bpf_tramp_link_init(link, BPF_LINK_TYPE_STRUCT_OPS,
+			      &bpf_struct_ops_link_lops, prog, prog->expected_attach_type, 0);
+
 		*plink++ = &link->link;
 
 		/* Poison pointer on error instead of return for backward compatibility */
@@ -832,7 +833,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		*pksym++ = ksym;
 
 		trampoline_start = image_off;
-		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+		err = bpf_struct_ops_prepare_trampoline(tnodes, &link->node,
 						&st_ops->func_models[i],
 						*(void **)(st_ops->cfi_stubs + moff),
 						&image, &image_off,
@@ -911,7 +912,7 @@ reset_unlock:
 	memset(uvalue, 0, map->value_size);
 	memset(kvalue, 0, map->value_size);
 unlock:
-	kfree(tlinks);
+	kfree(tnodes);
 	mutex_unlock(&st_map->lock);
 	if (!err)
 		bpf_struct_ops_map_add_ksyms(st_map);
@@ -1204,6 +1205,42 @@ u32 bpf_struct_ops_id(const void *kdata)
 }
 EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
 
+/**
+ * bpf_struct_ops_for_each_prog - Invoke @cb for each member prog
+ * @kdata: kernel-side struct_ops vmtable (the @kdata arg to ->reg/->update/->unreg)
+ * @cb: callback invoked once per member prog; non-zero return stops iteration
+ * @data: opaque argument passed to @cb
+ *
+ * Walks the member progs of the struct_ops map containing @kdata, skipping
+ * empty link slots. Intended for struct_ops ->reg() callbacks (and similar)
+ * that need to inspect the loaded BPF programs (for example to discover maps
+ * they reference via @prog->aux->used_maps).
+ *
+ * Return: 0 if iteration completed, otherwise the first non-zero @cb return.
+ */
+int bpf_struct_ops_for_each_prog(const void *kdata,
+				 int (*cb)(struct bpf_prog *prog, void *data),
+				 void *data)
+{
+	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
+	u32 i;
+	int ret;
+
+	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+	for (i = 0; i < st_map->funcs_cnt; i++) {
+		if (!st_map->links[i])
+			continue;
+		ret = cb(st_map->links[i]->prog, data);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog);
+
 static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a62d78581207..15ae7c43f594 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -182,7 +182,6 @@
 #define BITS_ROUNDUP_BYTES(bits) \
 	(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
 
-#define BTF_INFO_MASK 0x9f00ffff
 #define BTF_INT_MASK 0x0fffffff
 #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
 #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
@@ -289,7 +288,7 @@ enum verifier_phase {
 struct resolve_vertex {
 	const struct btf_type *t;
 	u32 type_id;
-	u16 next_member;
+	u32 next_member;
 };
 
 enum visit_state {
@@ -2031,7 +2030,7 @@ static int env_stack_push(struct btf_verifier_env *env,
 }
 
 static void env_stack_set_next_member(struct btf_verifier_env *env,
-				      u16 next_member)
+				      u32 next_member)
 {
 	env->stack[env->top_stack - 1].next_member = next_member;
 }
@@ -3293,7 +3292,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 	struct btf *btf = env->btf;
 	u32 struct_size = t->size;
 	u32 offset;
-	u16 i;
+	u32 i;
 
 	meta_needed = btf_type_vlen(t) * sizeof(*member);
 	if (meta_left < meta_needed) {
@@ -3369,7 +3368,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 {
 	const struct btf_member *member;
 	int err;
-	u16 i;
+	u32 i;
 
 	/* Before continue resolving the next_member,
 	 * ensure the last member is indeed resolved to a
@@ -3668,7 +3667,7 @@ end:
 static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
 			     u32 field_cnt, u32 repeat_cnt, u32 elem_size)
 {
-	u32 i, j;
+	u32 i, j, total_cnt, total_repeats;
 	u32 cur;
 
 	/* Ensure not repeating fields that should not be repeated. */
@@ -3686,10 +3685,9 @@ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
 		}
 	}
 
-	/* The type of struct size or variable size is u32,
-	 * so the multiplication will not overflow.
-	 */
-	if (field_cnt * (repeat_cnt + 1) > info_cnt)
+	if (check_add_overflow(repeat_cnt, 1, &total_repeats) ||
+	    check_mul_overflow(field_cnt, total_repeats, &total_cnt) ||
+	    total_cnt > (u32)info_cnt)
 		return -E2BIG;
 
 	cur = field_cnt;
@@ -4447,7 +4445,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 	const struct btf_enum *enums = btf_type_enum(t);
 	struct btf *btf = env->btf;
 	const char *fmt_str;
-	u16 i, nr_enums;
+	u32 i, nr_enums;
 	u32 meta_needed;
 
 	nr_enums = btf_type_vlen(t);
@@ -4555,7 +4553,7 @@ static s32 btf_enum64_check_meta(struct btf_verifier_env *env,
 	const struct btf_enum64 *enums = btf_type_enum64(t);
 	struct btf *btf = env->btf;
 	const char *fmt_str;
-	u16 i, nr_enums;
+	u32 i, nr_enums;
 	u32 meta_needed;
 
 	nr_enums = btf_type_vlen(t);
@@ -4683,7 +4681,7 @@ static void btf_func_proto_log(struct btf_verifier_env *env,
 			       const struct btf_type *t)
 {
 	const struct btf_param *args = (const struct btf_param *)(t + 1);
-	u16 nr_args = btf_type_vlen(t), i;
+	u32 nr_args = btf_type_vlen(t), i;
 
 	btf_verifier_log(env, "return=%u args=(", t->type);
 	if (!nr_args) {
@@ -4929,7 +4927,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env,
 {
 	const struct btf_var_secinfo *vsi;
 	struct btf *btf = env->btf;
-	u16 i;
+	u32 i;
 
 	env->resolve_mode = RESOLVE_TBD;
 	for_each_vsi_from(i, v->next_member, v->t, vsi) {
@@ -5183,7 +5181,7 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
 	const struct btf_type *ret_type;
 	const struct btf_param *args;
 	const struct btf *btf;
-	u16 nr_args, i;
+	u32 nr_args, i;
 	int err;
 
 	btf = env->btf;
@@ -5278,7 +5276,7 @@ static int btf_func_check(struct btf_verifier_env *env,
 	const struct btf_type *proto_type;
 	const struct btf_param *args;
 	const struct btf *btf;
-	u16 nr_args, i;
+	u32 nr_args, i;
 
 	btf = env->btf;
 	proto_type = btf_type_by_id(btf, t->type);
@@ -5336,12 +5334,6 @@ static s32 btf_check_meta(struct btf_verifier_env *env,
 	}
 	meta_left -= sizeof(*t);
 
-	if (t->info & ~BTF_INFO_MASK) {
-		btf_verifier_log(env, "[%u] Invalid btf_info:%x",
-				 env->log_type_id, t->info);
-		return -EINVAL;
-	}
-
 	if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
 	    BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
 		btf_verifier_log(env, "[%u] Invalid kind:%u",
@@ -5914,25 +5906,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
 	return 0;
 }
 
-static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
-{
-	u32 log_true_size;
-	int err;
-
-	err = bpf_vlog_finalize(log, &log_true_size);
-
-	if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
-	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
-				  &log_true_size, sizeof(log_true_size)))
-		err = -EFAULT;
-
-	return err;
-}
-
-static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr,
+			     struct bpf_log_attr *attr_log)
 {
 	bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
-	char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
 	struct btf_struct_metas *struct_meta_tab;
 	struct btf_verifier_env *env = NULL;
 	struct btf *btf = NULL;
@@ -5949,8 +5926,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 	/* user could have requested verbose verifier output
 	 * and supplied buffer to store the verification trace
 	 */
-	err = bpf_vlog_init(&env->log, attr->btf_log_level,
-			    log_ubuf, attr->btf_log_size);
+	err = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size);
 	if (err)
 		goto errout_free;
 
@@ -6015,7 +5991,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 		}
 	}
 
-	err = finalize_log(&env->log, uattr, uattr_size);
+	err = bpf_log_attr_finalize(attr_log, &env->log);
 	if (err)
 		goto errout_free;
 
@@ -6027,7 +6003,7 @@ errout_meta:
 	btf_free_struct_meta_tab(btf);
 errout:
 	/* overwrite err with -ENOSPC or -EFAULT */
-	ret = finalize_log(&env->log, uattr, uattr_size);
+	ret = bpf_log_attr_finalize(attr_log, &env->log);
 	if (ret)
 		err = ret;
 errout_free:
@@ -6980,7 +6956,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			info->reg_type = ctx_arg_info->reg_type;
 			info->btf = ctx_arg_info->btf ? : btf_vmlinux;
 			info->btf_id = ctx_arg_info->btf_id;
-			info->ref_obj_id = ctx_arg_info->ref_obj_id;
+			info->ref_id = ctx_arg_info->ref_id;
 			return true;
 		}
 	}
@@ -7825,6 +7801,134 @@ enum btf_arg_tag {
 	ARG_TAG_ARENA	  = BIT_ULL(5),
 };
 
+static int btf_scan_decl_tags(struct bpf_verifier_env *env,
+			      const struct btf *btf,
+			      const struct btf_type *fn_t,
+			      u32 arg_idx, bool is_global, u32 *tags)
+{
+	int id = btf_named_start_id(btf, false) - 1;
+	const char tag_key[] = "arg:";
+	static const struct {
+		const char *tag_value;
+		enum btf_arg_tag arg_tag;
+	} tag_values[] = {
+		{ "ctx", ARG_TAG_CTX },
+		{ "trusted", ARG_TAG_TRUSTED },
+		{ "untrusted", ARG_TAG_UNTRUSTED },
+		{ "nonnull", ARG_TAG_NONNULL },
+		{ "nullable", ARG_TAG_NULLABLE },
+		{ "arena", ARG_TAG_ARENA },
+	};
+
+	/*
+	 * The 'arg:<tag>' decl_tag takes precedence over the derivation
+	 * of the register type from the BTF type itself.
+	 */
+	while ((id = btf_find_next_decl_tag(btf, fn_t, arg_idx, tag_key, id)) > 0) {
+		const struct btf_type *tag_t;
+		const char *tag;
+		int i;
+		bool found;
+
+		/* disallow arg tags in static subprogs */
+		if (!is_global) {
+			bpf_log(&env->log,
+				"arg#%d type tag is not supported in static functions\n",
+				arg_idx);
+			return -EOPNOTSUPP;
+		}
+
+		tag_t = btf_type_by_id(btf, id);
+		tag = __btf_name_by_offset(btf, tag_t->name_off) + (sizeof(tag_key) - 1);
+
+		found = false;
+		for (i = 0; i < ARRAY_SIZE(tag_values); ++i) {
+			if (!strcmp(tag, tag_values[i].tag_value)) {
+				*tags |= tag_values[i].arg_tag;
+				found = true;
+				break;
+			}
+		}
+
+		if (!found) {
+			bpf_log(&env->log, "arg#%d has unsupported set of tags\n", arg_idx);
+			return -EOPNOTSUPP;
+		}
+	}
+	if (id != -ENOENT) {
+		bpf_log(&env->log, "arg#%d type tag fetching failure: %d\n", arg_idx, id);
+		return id;
+	}
+
+	return 0;
+}
+
+static int btf_scan_type_tags(struct bpf_verifier_env *env,
+			      const struct btf *btf, u32 type_id,
+			      u32 *tags)
+{
+	const struct btf_type *t;
+
+	/* Skip modifiers to get at the underlying type. */
+	t = btf_type_skip_modifiers(btf, type_id, NULL);
+
+	/*
+	 * Only pointers are scanned: for any other type we return 0 and leave
+	 * *tags untouched. Neither LLVM nor GCC support such type tags anyway.
+	 */
+	if (!t || !btf_type_is_ptr(t))
+		return 0;
+
+	/* Walk the pointee's modifier chain and collect its type tags. */
+	for (t = btf_type_by_id(btf, t->type); t && btf_type_is_modifier(t);
+		t = btf_type_by_id(btf, t->type)) {
+
+		/* Skip non-type tag modifiers. */
+		if (!btf_type_is_type_tag(t))
+			continue;
+
+		const char *tag = __btf_name_by_offset(btf, t->name_off);
+
+		if (strcmp(tag, "arena") == 0) {
+			*tags |= ARG_TAG_ARENA;
+		} else {
+			bpf_log(&env->log, "function signature member has unsupported type tag '%s'\n",
+				tag);
+			return -EOPNOTSUPP;
+		}
+	}
+
+	return 0;
+}
+
+/* Check that the return type (t->type) of function prototype 't' is supported. */
+static int btf_validate_return_type(struct bpf_verifier_env *env, struct btf *btf,
+		const struct btf_type *t, int subprog)
+{
+	u32 tags = 0;
+	int err;
+
+	err = btf_scan_type_tags(env, btf, t->type, &tags);
+	if (err)
+		return err;
+
+	t = btf_type_skip_modifiers(btf, t->type, NULL);
+
+	/*
+	 * We allow all subprogs except for the main one to return any kind of arena pointer.
+	 * General arena variables are not allowed, since it makes no sense to return by value
+	 * a variable that's on the heap in the first place.
+	 */
+	if (subprog && (tags & ARG_TAG_ARENA) && btf_type_is_ptr(t))
+		return 0;
+
+	/* Void, integer and enum return types are always accepted. */
+	if (btf_type_is_void(t) || btf_type_is_int(t) || btf_is_any_enum(t))
+		return 0;
+
+	return -EOPNOTSUPP;
+}
+
 /* Process BTF of a function to produce high-level expectation of function
  * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
  * is cached in subprog info for reuse.
@@ -7843,6 +7947,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 	struct btf *btf = prog->aux->btf;
 	const struct btf_param *args;
 	const struct btf_type *t, *ref_t, *fn_t;
+	int err;
 	u32 i, nargs, btf_id;
 	const char *tname;
 
@@ -7887,25 +7992,36 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 	}
 	args = (const struct btf_param *)(t + 1);
 	nargs = btf_type_vlen(t);
+	sub->arg_cnt = nargs;
+	if (nargs > MAX_BPF_FUNC_ARGS) {
+		bpf_log(log, "kernel supports at most %d parameters, function %s has %d\n",
+			MAX_BPF_FUNC_ARGS, tname, nargs);
+		return -EFAULT;
+	}
 	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
-		if (!is_global)
-			return -EINVAL;
-		bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
+		if (!bpf_jit_supports_stack_args()) {
+			bpf_log(log, "JIT does not support function %s() with %d args\n",
+				tname, nargs);
+			return -EFAULT;
+		}
+		sub->stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS;
+	}
+
+	if (is_global && nargs > MAX_BPF_FUNC_REG_ARGS) {
+		bpf_log(log, "global function %s has %d > %d args, stack args not supported\n",
 			tname, nargs, MAX_BPF_FUNC_REG_ARGS);
 		return -EINVAL;
 	}
-	/* check that function is void or returns int, exception cb also requires this */
-	t = btf_type_by_id(btf, t->type);
-	while (btf_type_is_modifier(t))
-		t = btf_type_by_id(btf, t->type);
-	if (!btf_type_is_void(t) && !btf_type_is_int(t) && !btf_is_any_enum(t)) {
-		if (!is_global)
-			return -EINVAL;
-		bpf_log(log,
-			"Global function %s() return value not void or scalar. "
-			"Only those are supported.\n",
-			tname);
-		return -EINVAL;
+
+	err = btf_validate_return_type(env, btf, t, subprog);
+	if (err) {
+		if (is_global) {
+			bpf_log(log,
+				"Global function %s() return value not void or scalar. "
+				"Only those are supported.\n",
+				tname);
+		}
+		return err;
 	}
 
 	/* Convert BTF function arguments into verifier types.
@@ -7913,42 +8029,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 	 */
 	for (i = 0; i < nargs; i++) {
 		u32 tags = 0;
-		int id = btf_named_start_id(btf, false) - 1;
-
-		/* 'arg:<tag>' decl_tag takes precedence over derivation of
-		 * register type from BTF type itself
-		 */
-		while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) {
-			const struct btf_type *tag_t = btf_type_by_id(btf, id);
-			const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4;
-
-			/* disallow arg tags in static subprogs */
-			if (!is_global) {
-				bpf_log(log, "arg#%d type tag is not supported in static functions\n", i);
-				return -EOPNOTSUPP;
-			}
+		err = btf_scan_decl_tags(env, btf, fn_t, i, is_global, &tags);
+		if (err)
+			return err;
 
-			if (strcmp(tag, "ctx") == 0) {
-				tags |= ARG_TAG_CTX;
-			} else if (strcmp(tag, "trusted") == 0) {
-				tags |= ARG_TAG_TRUSTED;
-			} else if (strcmp(tag, "untrusted") == 0) {
-				tags |= ARG_TAG_UNTRUSTED;
-			} else if (strcmp(tag, "nonnull") == 0) {
-				tags |= ARG_TAG_NONNULL;
-			} else if (strcmp(tag, "nullable") == 0) {
-				tags |= ARG_TAG_NULLABLE;
-			} else if (strcmp(tag, "arena") == 0) {
-				tags |= ARG_TAG_ARENA;
-			} else {
-				bpf_log(log, "arg#%d has unsupported set of tags\n", i);
-				return -EOPNOTSUPP;
-			}
-		}
-		if (id != -ENOENT) {
-			bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id);
-			return id;
-		}
+		err = btf_scan_type_tags(env, btf, args[i].type, &tags);
+		if (err)
+			return err;
 
 		t = btf_type_by_id(btf, args[i].type);
 		while (btf_type_is_modifier(t))
@@ -7973,7 +8060,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 				bpf_log(log, "arg#%d has invalid combination of tags\n", i);
 				return -EINVAL;
 			}
-			sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY;
+			sub->args[i].arg_type = ARG_PTR_TO_DYNPTR;
 			continue;
 		}
 		if (tags & ARG_TAG_TRUSTED) {
@@ -8074,7 +8161,6 @@ skip_pointer:
 		return -EINVAL;
 	}
 
-	sub->arg_cnt = nargs;
 	sub->args_cached = true;
 
 	return 0;
@@ -8196,12 +8282,12 @@ static int __btf_new_fd(struct btf *btf)
 	return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
 }
 
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log)
 {
 	struct btf *btf;
 	int ret;
 
-	btf = btf_parse(attr, uattr, uattr_size);
+	btf = btf_parse(attr, uattr, attr_log);
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
@@ -8684,6 +8770,39 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
 	return 0;
 }
 
+static int btf_check_kfunc_name(struct btf *btf, const char *func_name, u32 kind)
+{
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+	struct btf_module *btf_mod, *tmp;
+#endif
+	s32 id;
+
+	if (!btf_is_module(btf))
+		return 0;
+
+	id = btf_find_by_name_kind(bpf_get_btf_vmlinux(), func_name, kind);
+	if (id >= 0) {
+		pr_err("kfunc %s (id: %d) is already present in vmlinux.\n",
+		       func_name, id);
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+	guard(mutex)(&btf_module_mutex);
+	list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+		if (btf_mod->btf == btf)
+			continue;
+		id = btf_find_by_name_kind(btf_mod->btf, func_name, kind);
+		if (id >= 0) {
+			pr_err("kfunc %s (id: %d) is already present in module %s.\n",
+			       func_name, id, btf_mod->module->name);
+			return -EINVAL;
+		}
+	}
+#endif
+	return 0;
+}
+
 static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
 {
 	const struct btf_type *func;
@@ -8697,7 +8816,8 @@ static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
 
 	/* sanity check kfunc name */
 	func_name = btf_name_by_offset(btf, func->name_off);
-	if (!func_name || !func_name[0])
+	if (!func_name || !func_name[0] ||
+	    btf_check_kfunc_name(btf, func_name, BTF_INFO_KIND(func->info)))
 		return -EINVAL;
 
 	func = btf_type_by_id(btf, func->type);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 876f6a81a9b6..83ce66296ac1 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -55,6 +55,28 @@ void __init cgroup_bpf_lifetime_notifier_init(void)
 						&cgroup_bpf_lifetime_nb));
 }
 
+#ifdef CONFIG_BPF_LSM
+struct cgroup_lsm_atype {
+	u32 attach_btf_id;
+	int refcnt;
+	bool returns_errno;
+};
+
+static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
+
+static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype)
+{
+	if (atype >= CGROUP_LSM_START && atype <= CGROUP_LSM_END)
+		return READ_ONCE(cgroup_lsm_atype[atype - CGROUP_LSM_START].returns_errno);
+	return true;
+}
+#else
+static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype)
+{
+	return true;
+}
+#endif
+
 /* __always_inline is necessary to prevent indirect call through run_prog
  * function pointer.
  */
@@ -83,7 +105,8 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
 			*(ret_flags) |= (func_ret >> 1);
 			func_ret &= 1;
 		}
-		if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
+		if (!func_ret && cgroup_bpf_hook_returns_errno(atype) &&
+		    !IS_ERR_VALUE((long)run_ctx.retval))
 			run_ctx.retval = -EPERM;
 		item++;
 	}
@@ -156,13 +179,6 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
 }
 
 #ifdef CONFIG_BPF_LSM
-struct cgroup_lsm_atype {
-	u32 attach_btf_id;
-	int refcnt;
-};
-
-static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
-
 static enum cgroup_bpf_attach_type
 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
 {
@@ -191,10 +207,13 @@ void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
-		     cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
-
-	cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
+	if (!cgroup_lsm_atype[i].attach_btf_id) {
+		cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
+		WRITE_ONCE(cgroup_lsm_atype[i].returns_errno,
+			   bpf_lsm_hook_returns_errno(attach_btf_id));
+	} else {
+		WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
+	}
 	cgroup_lsm_atype[i].refcnt++;
 }
 
@@ -203,8 +222,10 @@ void bpf_cgroup_atype_put(int cgroup_atype)
 	int i = cgroup_atype - CGROUP_LSM_START;
 
 	cgroup_lock();
-	if (--cgroup_lsm_atype[i].refcnt <= 0)
+	if (--cgroup_lsm_atype[i].refcnt <= 0) {
+		WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, true);
 		cgroup_lsm_atype[i].attach_btf_id = 0;
+	}
 	WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
 	cgroup_unlock();
 }
@@ -1208,7 +1229,7 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 
 /* Must be called with cgroup_mutex held to avoid races. */
 static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
-			      union bpf_attr __user *uattr)
+			      union bpf_attr __user *uattr, u32 uattr_size)
 {
 	__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
 	bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
@@ -1259,7 +1280,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		return -EFAULT;
 	if (!effective_query && from_atype == to_atype)
 		revision = cgrp->bpf.revisions[from_atype];
-	if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
+	if (uattr_size >= offsetofend(union bpf_attr, query.revision) &&
+	    copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
 		return -EFAULT;
 	if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
 		/* return early if user requested only program count + flags */
@@ -1312,12 +1334,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 }
 
 static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
-			    union bpf_attr __user *uattr)
+			    union bpf_attr __user *uattr, u32 uattr_size)
 {
 	int ret;
 
 	cgroup_lock();
-	ret = __cgroup_bpf_query(cgrp, attr, uattr);
+	ret = __cgroup_bpf_query(cgrp, attr, uattr, uattr_size);
 	cgroup_unlock();
 	return ret;
 }
@@ -1520,7 +1542,7 @@ out_put_cgroup:
 }
 
 int cgroup_bpf_prog_query(const union bpf_attr *attr,
-			  union bpf_attr __user *uattr)
+			  union bpf_attr __user *uattr, u32 uattr_size)
 {
 	struct cgroup *cgrp;
 	int ret;
@@ -1529,7 +1551,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	if (IS_ERR(cgrp))
 		return PTR_ERR(cgrp);
 
-	ret = cgroup_bpf_query(cgrp, attr, uattr);
+	ret = cgroup_bpf_query(cgrp, attr, uattr, uattr_size);
 
 	cgroup_put(cgrp);
 	return ret;
@@ -1935,8 +1957,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
 	kfree(ctx.cur_val);
 
-	if (ret == 1 && ctx.new_updated) {
-		kfree(*buf);
+	if (!ret && ctx.new_updated) {
+		kvfree(*buf);
 		*buf = ctx.new_val;
 		*pcount = ctx.new_len;
 	} else {
@@ -2342,6 +2364,7 @@ BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
 		return -E2BIG;
 
 	memcpy(ctx->new_val, buf, buf_len);
+	((char *)ctx->new_val)[buf_len] = '\0';
 	ctx->new_len = buf_len;
 	ctx->new_updated = 1;
 
diff --git a/kernel/bpf/cnum.c b/kernel/bpf/cnum.c
new file mode 100644
index 000000000000..86142cb2aee5
--- /dev/null
+++ b/kernel/bpf/cnum.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bits.h>
+
+#define T 32
+#include "cnum_defs.h"
+#undef T
+
+#define T 64
+#include "cnum_defs.h"
+#undef T
+
+struct cnum32 cnum32_from_cnum64(struct cnum64 cnum)
+{
+	struct cnum32 narrowed = { .base = 0, .size = U32_MAX }; /* whole u32 range */
+
+	if (cnum64_is_empty(cnum))
+		return CNUM32_EMPTY;
+	if (cnum.size < U32_MAX)
+		narrowed = (struct cnum32){ .base = (u32)cnum.base, .size = cnum.size };
+	return narrowed;
+}
+
+/*
+ * Suppose 'a' and 'b' are laid out as follows:
+ *
+ *                                                          64-bit number axis --->
+ *
+ * N*2^32                   (N+1)*2^32                (N+2)*2^32                (N+3)*2^32
+ * ||------|---|=====|-------||----------|=====|-------||----------|=====|----|--||
+ *         |   |< b >|                   |< b >|                   |< b >|    |
+ *         |   |                                                         |    |
+ *         |<--+--------------------------- a ---------------------------+--->|
+ *             |                                                         |
+ *             |<-------------------------- t -------------------------->|
+ *
+ * In such a case it is possible to infer a tighter representation t
+ * such that ∀ v ∈ a, (u32)v ∈ b: v ∈ t.
+ */
+struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b)
+{
+	/*
+	 * To simplify reasoning, rotate the circles so that [virtual] a1 starts
+	 * at u32 boundary, b1 represents b in this new frame of reference.
+	 */
+	struct cnum32 b1 = { b.base - (u32)a.base, b.size };
+	struct cnum64 t = a;
+	u64 d, b1_max;
+
+	if (cnum64_is_empty(a) || cnum32_is_empty(b))
+		return CNUM64_EMPTY;
+
+	if (cnum32_urange_overflow(b1)) {
+		b1_max = (u32)b1.base + (u32)b1.size; /* overflow here is fine and necessary */
+		if ((u32)a.size > b1_max && (u32)a.size < b1.base) {
+			/*
+			 * N*2^32                   (N+1)*2^32
+			 * ||=====|------------|=====||=====|---------|---|=====||
+			 *  |b1 ->|            |<- b1||b1 ->|         |   |<- b1|
+			 *  |<----------------- a1 ------------------>|
+			 *  |<-------------- t ------------>|<-- d -->| (after adjustment)
+			 *                                  ^
+			 *                                b1_max
+			 */
+			d = (u32)a.size - b1_max;
+			t.size -= d;
+		} else {
+			/*
+			 * No adjustments possible in the following cases:
+			 *
+			 * ||=====|------------|=====||===|=|-------------|=|===||
+			 *  |b1 ->|            |<- b1||b1 +>|             |<+ b1|
+			 *  |<----------------- a1 ------>|                 |
+			 *  |<----------------- (or) a1 ------------------->|
+			 */
+		}
+	} else {
+		if (t.size < b1.base)
+			/*
+			 * N*2^32                   (N+1)*2^32
+			 * ||----------|--|=======|--||------>
+			 *  |<-- a1 -->|  |<- b ->|
+			 */
+			return CNUM64_EMPTY;
+		/*
+		 * N*2^32                   (N+1)*2^32
+		 * ||-------------|========|-||-----| -------|========|-||
+		 *  |             |<- b1 ->|        |        |<- b1 ->|
+		 *  |<------------+ a1 ------------>|
+		 *                |<------ t ------>| (after adjustment)
+		 */
+		t.base += b1.base;
+		t.size -= b1.base;
+		b1_max = b1.base + b1.size;
+		d = 0;
+		if ((u32)a.size < b1.base)
+			/*
+			 * N*2^32                   (N+1)*2^32
+			 * ||-------------|========|-||------|-------|========|-||
+			 *  |             |<- b1 ->|         |       |<- b1 ->|
+			 *  |<------------+-- a1 --+-------->|
+			 *                |<- t  ->|<-- d -->| (after adjustment)
+			 */
+			d = (u32)a.size + (BIT_ULL(32) - b1_max);
+		else if ((u32)a.size >= b1_max)
+			/*
+			 * N*2^32                   (N+1)*2^32
+			 * ||--|========|------------||--|========|-------|-----||
+			 *  |  |<- b1 ->|                |<- b1 ->|       |
+			 *  |<-+------------------ a1 ------------+------>|
+			 *     |<-------------- t --------------->|<- d ->| (after adjustment)
+			 */
+			d = (u32)a.size - b1_max;
+		if (t.size < d)
+			return CNUM64_EMPTY;
+		t.size -= d;
+	}
+	return t;
+}
diff --git a/kernel/bpf/cnum_defs.h b/kernel/bpf/cnum_defs.h
new file mode 100644
index 000000000000..a90e317e3578
--- /dev/null
+++ b/kernel/bpf/cnum_defs.h
@@ -0,0 +1,247 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#ifndef T
+#error "Define T (bit width: 32, 64) before including cnum_defs.h"
+#endif
+
+#include <linux/cnum.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/minmax.h>
+#include <linux/compiler_types.h>
+
+#define cnum_t   __PASTE(cnum, T)
+#define ut       __PASTE(u, T)
+#define st       __PASTE(s, T)
+#define UT_MAX   __PASTE(__PASTE(U, T), _MAX)
+#define ST_MAX   __PASTE(__PASTE(S, T), _MAX)
+#define ST_MIN   __PASTE(__PASTE(S, T), _MIN)
+#define EMPTY    __PASTE(__PASTE(CNUM, T), _EMPTY)
+#define FN(name) __PASTE(__PASTE(cnum, T), __PASTE(_, name))
+
+struct cnum_t FN(from_urange)(ut min, ut max)
+{
+	return (struct cnum_t){ .base = min, .size = (ut)max - min };
+}
+
+struct cnum_t FN(from_srange)(st min, st max)
+{
+	ut size = (ut)max - (ut)min;
+	ut base = size == UT_MAX ? 0 : (ut)min;
+
+	return (struct cnum_t){ .base = base, .size = size };
+}
+
+/* True if this cnum represents two unsigned ranges. */
+static inline bool FN(urange_overflow)(struct cnum_t cnum)
+{
+	/* Same as cnum.base + cnum.size > UT_MAX but avoids overflow */
+	return cnum.size > UT_MAX - (ut)cnum.base;
+}
+
+/*
+ * cnum{T}_umin / cnum{T}_umax query an unsigned range represented by this cnum.
+ * If cnum represents a range crossing the UT_MAX/0 boundary, the unbound range
+ * [0..UT_MAX] is returned.
+ */
+ut FN(umin)(struct cnum_t cnum)
+{
+	return FN(urange_overflow)(cnum) ? 0 : cnum.base;
+}
+EXPORT_SYMBOL_GPL(FN(umin));
+
+ut FN(umax)(struct cnum_t cnum)
+{
+	return FN(urange_overflow)(cnum) ? UT_MAX : cnum.base + cnum.size;
+}
+EXPORT_SYMBOL_GPL(FN(umax));
+
+/* True if this cnum represents two signed ranges. */
+static inline bool FN(srange_overflow)(struct cnum_t cnum)
+{
+	return FN(contains)(cnum, (ut)ST_MAX) && FN(contains)(cnum, (ut)ST_MIN);
+}
+
+/*
+ * cnum{T}_smin / cnum{T}_smax query a signed range represented by this cnum.
+ * If cnum represents a range crossing the ST_MAX/ST_MIN boundary, the unbound range
+ * [ST_MIN..ST_MAX] is returned.
+ */
+st FN(smin)(struct cnum_t cnum)
+{
+	return FN(srange_overflow)(cnum)
+	       ? ST_MIN
+	       : min((st)cnum.base, (st)(cnum.base + cnum.size));
+}
+
+st FN(smax)(struct cnum_t cnum)
+{
+	return FN(srange_overflow)(cnum)
+	       ? ST_MAX
+	       : max((st)cnum.base, (st)(cnum.base + cnum.size));
+}
+
+/*
+ * Returns a possibly empty intersection of cnums 'a' and 'b'.
+ * If 'a' and 'b' intersect in two sub-arcs, the function over-approximates
+ * and returns either 'a' or 'b', whichever is smaller.
+ */
+struct cnum_t FN(intersect)(struct cnum_t a, struct cnum_t b)
+{
+	struct cnum_t b1;
+	ut dbase;
+
+	if (FN(is_empty)(a) || FN(is_empty)(b))
+		return EMPTY;
+
+	if (a.base > b.base)
+		swap(a, b);
+
+	/*
+	 * Rotate frame of reference such that a.base is 0.
+	 * 'b1' is 'b' in this frame of reference.
+	 */
+	dbase = b.base - a.base;
+	b1 = (struct cnum_t){ dbase, b.size };
+	if (FN(urange_overflow)(b1)) {
+		if (b1.base <= a.size) {
+			/*
+			 * Rotated frame (a.base at origin):
+			 *
+			 * 0                                       UT_MAX
+			 * |--------------------------------------------|
+			 * [=== a ==========================]           |
+			 * [= b1 tail =]  [========= b1 main ==========>]
+			 *                 ^-- b1.base <= a.size
+			 *
+			 * 'a' and 'b' intersect in two disjoint arcs,
+			 * can't represent as single cnum, over-approximate
+			 * the result.
+			 */
+			return a.size <= b.size ? a : b;
+		} else {
+			/*
+			 * Rotated frame (a.base at origin):
+			 *
+			 * 0                                       UT_MAX
+			 * |--------------------------------------------|
+			 * [=== a =============]  |                     |
+			 * [= b1 tail =]          [======= b1 main ====>]
+			 *                         ^-- b1.base > a.size
+			 *
+			 * Only 'b' tail intersects 'a'.
+			 */
+			return (struct cnum_t) {
+				.base = a.base,
+				.size = min(a.size, (ut)(b1.base + b1.size)),
+			};
+		}
+	} else if (a.size >= b1.base) {
+		/*
+		 * Rotated frame (a.base at origin):
+		 *
+		 * 0                                             UT_MAX
+		 * |--------------------------------------------------|
+		 * [=== a ==================================]         |
+		 *                   [== b1 =====================]
+		 *
+		 * 0                                             UT_MAX
+		 * |--------------------------------------------------|
+		 * [=== a ==================================]         |
+		 *                   [== b1 ====]
+		 *                   ^-- b1.base <= a.size
+		 *                   |<-- a.size - dbase -->|
+		 *
+		 * 'a' and 'b' intersect as one cnum.
+		 */
+		return (struct cnum_t) {
+			.base = b.base,
+			.size = min((ut)(a.size - dbase), b.size),
+		};
+	} else {
+		return EMPTY;
+	}
+}
+
+void FN(intersect_with)(struct cnum_t *dst, struct cnum_t src)
+{
+	*dst = FN(intersect)(*dst, src);
+}
+
+void FN(intersect_with_urange)(struct cnum_t *dst, ut min, ut max)
+{
+	FN(intersect_with)(dst, FN(from_urange)(min, max));
+}
+
+void FN(intersect_with_srange)(struct cnum_t *dst, st min, st max)
+{
+	FN(intersect_with)(dst, FN(from_srange)(min, max));
+}
+
+static inline struct cnum_t FN(normalize)(struct cnum_t cnum)
+{
+	if (cnum.size == UT_MAX && cnum.base != 0 && cnum.base != (ut)ST_MAX)
+		cnum.base = 0;
+	return cnum;
+}
+
+struct cnum_t FN(add)(struct cnum_t a, struct cnum_t b)
+{
+	if (FN(is_empty)(a) || FN(is_empty)(b))
+		return EMPTY;
+	if (a.size > UT_MAX - b.size)
+		return (struct cnum_t){ 0, (ut)UT_MAX };
+	else
+		return FN(normalize)((struct cnum_t){ a.base + b.base, a.size + b.size });
+}
+
+struct cnum_t FN(negate)(struct cnum_t a)
+{
+	if (FN(is_empty)(a))
+		return EMPTY;
+	return FN(normalize)((struct cnum_t){ -((ut)a.base + a.size), a.size });
+}
+
+bool FN(is_empty)(struct cnum_t cnum)
+{
+	return cnum.base == EMPTY.base && cnum.size == EMPTY.size;
+}
+
+bool FN(contains)(struct cnum_t cnum, ut v)
+{
+	if (FN(is_empty)(cnum))
+		return false;
+	if (FN(urange_overflow)(cnum))
+		return v >= cnum.base || v <= (ut)cnum.base + cnum.size;
+	else
+		return v >= cnum.base && v <= (ut)cnum.base + cnum.size;
+}
+
+bool FN(is_const)(struct cnum_t cnum)
+{
+	return cnum.size == 0;
+}
+
+bool FN(is_subset)(struct cnum_t bigger, struct cnum_t smaller)
+{
+	if (FN(is_empty)(smaller))	/* the empty set is a subset of anything */
+		return true;
+	if (FN(is_empty)(bigger))	/* a non-empty set never fits in an empty one */
+		return false;
+	/* rotate both arcs such that 'bigger' starts at origin, hence does not overflow */
+	smaller.base -= bigger.base;
+	bigger.base = 0;
+	if (FN(urange_overflow)(smaller) && bigger.size < UT_MAX)
+		return false;
+	return smaller.base + smaller.size <= bigger.size;
+}
+
+#undef EMPTY
+#undef cnum_t
+#undef ut
+#undef st
+#undef UT_MAX
+#undef ST_MAX
+#undef ST_MIN
+#undef FN
diff --git a/kernel/bpf/const_fold.c b/kernel/bpf/const_fold.c
index db73c4740b1e..b2a19acadb91 100644
--- a/kernel/bpf/const_fold.c
+++ b/kernel/bpf/const_fold.c
@@ -58,6 +58,14 @@ static void const_reg_xfer(struct bpf_verifier_env *env, struct const_arg_info *
 	u8 opcode = BPF_OP(insn->code) | BPF_SRC(insn->code);
 	int r;
 
+	/* Stack arg stores (r11-based) are outside the tracked register set. */
+	if (is_stack_arg_st(insn) || is_stack_arg_stx(insn))
+		return;
+	if (is_stack_arg_ldx(insn)) {
+		ci_out[insn->dst_reg] = unknown;
+		return;
+	}
+
 	switch (class) {
 	case BPF_ALU:
 	case BPF_ALU64:
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 6aa2a8b24030..649cce41e13f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1299,8 +1299,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
 	u32 imm_rnd = get_random_u32();
 	s16 off;
 
-	BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
-	BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
+	BUILD_BUG_ON(BPF_REG_PARAMS + 2 != MAX_BPF_JIT_REG);
+	BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
 
 	/* Constraints on AX register:
 	 *
@@ -1582,6 +1582,16 @@ bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struc
 	insn_idx += prog->aux->subprog_start;
 	return env->insn_aux_data[insn_idx].indirect_target;
 }
+
+u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog)
+{
+	const struct bpf_subprog_info *sub;
+
+	if (!env)
+		return 0;
+	sub = &env->subprog_info[prog->aux->func_idx];
+	return sub->stack_arg_cnt - bpf_in_stack_arg_cnt(sub);
+}
 #endif /* CONFIG_BPF_JIT */
 
 /* Base function for offset calculation. Needs to go into .text section,
@@ -2471,7 +2481,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
 			cookie = aux->cgroup_storage[i] ?
 				 aux->cgroup_storage[i]->cookie : 0;
 			ret = map->owner->storage_cookie[i] == cookie ||
-			      !cookie;
+			      (!cookie && !aux->tail_call_reachable);
 		}
 		if (ret &&
 		    map->owner->attach_func_proto != aux->attach_func_proto) {
@@ -3228,6 +3238,11 @@ bool __weak bpf_jit_supports_kfunc_call(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_stack_args(void)
+{
+	return false;
+}
+
 bool __weak bpf_jit_supports_far_kfunc_call(void)
 {
 	return false;
@@ -3363,6 +3378,12 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
 }
 
 #ifdef CONFIG_BPF_SYSCALL
+__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
+					unsigned long fault_ip)
+{
+	return false;
+}
+
 static int __init bpf_global_ma_init(void)
 {
 	int ret;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index cc0a43ebab6b..dc7b859e8bbf 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -581,6 +581,10 @@ static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
 {
 	struct xdp_frame *nxdpf;
 
+	/* Frags live outside the linear frame and cannot be cloned safely. */
+	if (unlikely(xdp_frame_has_frags(xdpf)))
+		return -EOPNOTSUPP;
+
 	nxdpf = xdpf_clone(xdpf);
 	if (!nxdpf)
 		return -ENOMEM;
@@ -706,6 +710,18 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 	if (unlikely(err))
 		return err;
 
+	if (dst->xdp_prog && skb_cloned(skb)) {
+		struct sk_buff *nskb;
+
+		nskb = skb_copy(skb, GFP_ATOMIC);
+		if (!nskb)
+			return -ENOMEM;
+
+		nskb->mac_len = skb->mac_len;
+		consume_skb(skb);
+		skb = nskb;
+	}
+
 	/* Redirect has already succeeded semantically at this point, so we just
 	 * return 0 even if packet is dropped. Helper below takes care of
 	 * freeing skb.
@@ -726,6 +742,9 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
 	struct sk_buff *nskb;
 	int err;
 
+	if (unlikely(skb_is_nonlinear(skb)))
+		return -EOPNOTSUPP;
+
 	nskb = skb_clone(skb, GFP_ATOMIC);
 	if (!nskb)
 		return -ENOMEM;
diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c
index 3692adf62558..3cf2cc6e3ab6 100644
--- a/kernel/bpf/fixups.c
+++ b/kernel/bpf/fixups.c
@@ -870,7 +870,7 @@ int bpf_convert_ctx_accesses(struct bpf_verifier_env *env)
 		case PTR_TO_BTF_ID:
 		case PTR_TO_BTF_ID | PTR_UNTRUSTED:
 		/* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
-		 * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
+		 * PTR_TO_BTF_ID, and an active referenced id, but the same cannot
 		 * be said once it is marked PTR_UNTRUSTED, hence we must handle
 		 * any faults for loads into such types. BPF_WRITE is disallowed
 		 * for this case.
@@ -1265,6 +1265,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->aux->real_func_cnt = env->subprog_cnt;
 	prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
 	prog->aux->exception_boundary = func[0]->aux->exception_boundary;
+	prog->aux->stack_arg_sp_adjust = func[0]->aux->stack_arg_sp_adjust;
 	bpf_prog_jit_attempt_done(prog);
 	return 0;
 out_free:
@@ -1378,9 +1379,21 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env)
 	struct bpf_prog *prog = env->prog;
 	struct bpf_insn *insn = prog->insnsi;
 	bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
-	int i, depth;
+	int depth;
 #endif
-	int err = 0;
+	int i, err = 0;
+
+	for (i = 0; i < env->subprog_cnt; i++) {
+		struct bpf_subprog_info *subprog = &env->subprog_info[i];
+		u16 outgoing = subprog->stack_arg_cnt - bpf_in_stack_arg_cnt(subprog);
+
+		if (subprog->max_out_stack_arg_cnt > outgoing) {
+			verbose(env,
+				"func#%d writes %u stack arg slots, but calls only require %u\n",
+				i, subprog->max_out_stack_arg_cnt, outgoing);
+			return -EINVAL;
+		}
+	}
 
 	if (env->prog->jit_requested &&
 	    !bpf_prog_is_offloaded(env->prog->aux)) {
@@ -1395,6 +1408,12 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env)
 		verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
 		return -EINVAL;
 	}
+	for (i = 0; i < env->subprog_cnt; i++) {
+		if (bpf_in_stack_arg_cnt(&env->subprog_info[i])) {
+			verbose(env, "stack args are not supported in non-JITed programs\n");
+			return -EINVAL;
+		}
+	}
 	if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
 		/* When JIT fails the progs with bpf2bpf calls and tail_calls
 		 * have to be rejected, since interpreter doesn't support them yet.
@@ -2167,6 +2186,8 @@ patch_map_ops_generic:
 		    insn->imm == BPF_FUNC_get_func_ret) {
 			if (eatype == BPF_TRACE_FEXIT ||
 			    eatype == BPF_TRACE_FSESSION ||
+			    eatype == BPF_TRACE_FEXIT_MULTI ||
+			    eatype == BPF_TRACE_FSESSION_MULTI ||
 			    eatype == BPF_MODIFY_RETURN) {
 				/* Load nr_args from ctx - 8 */
 				insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3dd9b4924ae4..9f394e1aa2e8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -9,6 +9,7 @@
 #include <linux/rculist_nulls.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/random.h>
+#include <linux/rhashtable.h>
 #include <uapi/linux/btf.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/btf_ids.h>
@@ -242,6 +243,10 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab)
 
 	if (IS_ERR_OR_NULL(htab->map.record))
 		return;
+	/*
+	 * Preallocated maps do not have a bpf_mem_alloc destructor, so fully
+	 * destroy every element, including the extra elements.
+	 */
 	if (htab_has_extra_elems(htab))
 		num_entries += num_possible_cpus();
 	for (i = 0; i < num_entries; i++) {
@@ -496,28 +501,26 @@ static void htab_dtor_ctx_free(void *ctx)
 	kfree(ctx);
 }
 
-static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *))
+static int bpf_ma_set_dtor(struct bpf_map *map, struct bpf_mem_alloc *ma,
+			   void (*dtor)(void *, void *)) /* caller picks allocator and dtor */
 {
-	u32 key_size = htab->map.key_size;
-	struct bpf_mem_alloc *ma;
 	struct htab_btf_record *hrec;
 	int err;
 
 	/* No need for dtors. */
-	if (IS_ERR_OR_NULL(htab->map.record))
+	if (IS_ERR_OR_NULL(map->record))
 		return 0;
 
 	hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
 	if (!hrec)
 		return -ENOMEM;
-	hrec->key_size = key_size;
-	hrec->record = btf_record_dup(htab->map.record);
+	hrec->key_size = map->key_size;
+	hrec->record = btf_record_dup(map->record); /* separate copy handed to the dtor ctx */
 	if (IS_ERR(hrec->record)) {
 		err = PTR_ERR(hrec->record);
 		kfree(hrec);
 		return err;
 	}
-	ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma;
 	bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec);
 	return 0;
 }
@@ -534,9 +537,9 @@ static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
 	 * populated in htab_map_alloc(), so it will always appear as NULL.
 	 */
 	if (htab_is_percpu(htab))
-		return htab_set_dtor(htab, htab_pcpu_mem_dtor);
+		return bpf_ma_set_dtor(map, &htab->pcpu_ma, htab_pcpu_mem_dtor);
 	else
-		return htab_set_dtor(htab, htab_mem_dtor);
+		return bpf_ma_set_dtor(map, &htab->ma, htab_mem_dtor);
 }
 
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
@@ -834,8 +837,8 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
 	return insn - insn_buf;
 }
 
-static void check_and_free_fields(struct bpf_htab *htab,
-				  struct htab_elem *elem)
+static void check_and_cancel_fields(struct bpf_htab *htab,
+				    struct htab_elem *elem)
 {
 	if (IS_ERR_OR_NULL(htab->map.record))
 		return;
@@ -845,11 +848,11 @@ static void check_and_free_fields(struct bpf_htab *htab,
 		int cpu;
 
 		for_each_possible_cpu(cpu)
-			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+			bpf_obj_cancel_fields(&htab->map, per_cpu_ptr(pptr, cpu));
 	} else {
 		void *map_value = htab_elem_value(elem, htab->map.key_size);
 
-		bpf_obj_free_fields(htab->map.record, map_value);
+		bpf_obj_cancel_fields(&htab->map, map_value);
 	}
 }
 
@@ -884,7 +887,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 	htab_unlock_bucket(b, flags);
 
 	if (l == tgt_l)
-		check_and_free_fields(htab, l);
+		check_and_cancel_fields(htab, l);
 	return l == tgt_l;
 }
 
@@ -949,7 +952,7 @@ find_first_elem:
 
 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
-	check_and_free_fields(htab, l);
+	check_and_cancel_fields(htab, l);
 
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
@@ -1002,7 +1005,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 
 	if (htab_is_prealloc(htab)) {
 		bpf_map_dec_elem_count(&htab->map);
-		check_and_free_fields(htab, l);
+		check_and_cancel_fields(htab, l);
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		dec_elem_count(htab);
@@ -1019,7 +1022,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
 		/* copy true value_size bytes */
 		ptr = this_cpu_ptr(pptr);
 		copy_map_value(&htab->map, ptr, value);
-		bpf_obj_free_fields(htab->map.record, ptr);
+		bpf_obj_cancel_fields(&htab->map, ptr);
 	} else {
 		u32 size = round_up(htab->map.value_size, 8);
 		void *val;
@@ -1029,7 +1032,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
 			cpu = map_flags >> 32;
 			ptr = per_cpu_ptr(pptr, cpu);
 			copy_map_value(&htab->map, ptr, value);
-			bpf_obj_free_fields(htab->map.record, ptr);
+			bpf_obj_cancel_fields(&htab->map, ptr);
 			return;
 		}
 
@@ -1037,7 +1040,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
 			ptr = per_cpu_ptr(pptr, cpu);
 			val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
 			copy_map_value(&htab->map, ptr, val);
-			bpf_obj_free_fields(htab->map.record, ptr);
+			bpf_obj_cancel_fields(&htab->map, ptr);
 		}
 	}
 }
@@ -1253,11 +1256,11 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (l_old) {
 		hlist_nulls_del_rcu(&l_old->hash_node);
 
-		/* l_old has already been stashed in htab->extra_elems, free
-		 * its special fields before it is available for reuse.
+		/* l_old has already been stashed in htab->extra_elems, cancel
+		 * its reusable special fields before it is available for reuse.
 		 */
 		if (htab_is_prealloc(htab))
-			check_and_free_fields(htab, l_old);
+			check_and_cancel_fields(htab, l_old);
 	}
 	htab_unlock_bucket(b, flags);
 	if (l_old && !htab_is_prealloc(htab))
@@ -1270,7 +1273,7 @@ err:
 
 static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
 {
-	check_and_free_fields(htab, elem);
+	check_and_cancel_fields(htab, elem);
 	bpf_map_dec_elem_count(&htab->map);
 	bpf_lru_push_free(&htab->lru, &elem->lru_node);
 }
@@ -2739,3 +2742,794 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
 	BATCH_OPS(htab),
 	.map_btf_id = &htab_map_btf_ids[0],
 };
+
+struct rhtab_elem { /* one map entry: rhashtable linkage plus inline key and value */
+	struct rhash_head node;
+	/* key bytes padded to 8 (see rhtab_elem_value()), then value bytes */
+	u8 data[] __aligned(8);
+};
+
+struct bpf_rhtab {
+	struct bpf_map map; /* generic map header; container_of() anchor */
+	struct rhashtable ht; /* table holding the struct rhtab_elem entries */
+	struct bpf_mem_alloc ma; /* allocator the elements come from */
+	u32 elem_size; /* sizeof(rhtab_elem) + padded key + padded value */
+	bool freeing_internal; /* set while map_release_uref walks; blocks new inserts */
+};
+
+static const struct rhashtable_params rhtab_params = { /* used for any key size != sizeof(long) */
+	.head_offset = offsetof(struct rhtab_elem, node),
+	.key_offset  = offsetof(struct rhtab_elem, data), /* key_len is set per map in rhtab_map_alloc() */
+};
+
+static inline void *rhtab_elem_value(struct rhtab_elem *l, u32 key_size)
+{
+	return l->data + round_up(key_size, 8); /* value starts after the 8-byte-padded key */
+}
+
+/* sizeof(long)-key specialization of obj_cmpfn: returns 0 when the keys are equal */
+static __always_inline int rhtab_key_cmp_long(struct rhashtable_compare_arg *arg,
+					      const void *ptr)
+{
+	const unsigned long key1 = *(const unsigned long *)arg->key;
+	const struct rhtab_elem *key2 = ptr; /* ptr is the candidate element, not a bare key */
+
+	return key1 != *(const unsigned long *)key2->data;
+}
+
+static __always_inline u32 rhtab_hashfn_long(const void *data, u32 len, u32 seed)
+{
+	u64 k = *(const unsigned long *)data; /* len is unused: key is read as one long */
+
+	return (u32)(k ^ (k >> 32)) ^ seed; /* fold upper half into lower 32 bits, then mix seed */
+}
+
+static const struct rhashtable_params rhtab_params_long = { /* used when key_size == sizeof(long) */
+	.head_offset = offsetof(struct rhtab_elem, node),
+	.key_offset  = offsetof(struct rhtab_elem, data),
+	.key_len     = sizeof(long),
+	.hashfn      = rhtab_hashfn_long,
+	.obj_cmpfn   = rhtab_key_cmp_long,
+};
+
+static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr)
+{
+	struct rhashtable_params params; /* per-map copy of rhtab_params, filled in below */
+	struct bpf_rhtab *rhtab;
+	int err = 0;
+
+	rhtab = bpf_map_area_alloc(sizeof(*rhtab), NUMA_NO_NODE);
+	if (!rhtab)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&rhtab->map, attr);
+
+	if (rhtab->map.max_entries > 1UL << 31) { /* reject more than 2^31 entries */
+		err = -E2BIG;
+		goto free_rhtab;
+	}
+
+	rhtab->elem_size = sizeof(struct rhtab_elem) + round_up(rhtab->map.key_size, 8) +
+			   round_up(rhtab->map.value_size, 8); /* header + padded key + padded value */
+
+	params = rhtab_params;
+	params.key_len = rhtab->map.key_size;
+	params.nelem_hint = (u32)attr->map_extra; /* map_extra supplies the nelem hint */
+	params.automatic_shrinking = true;
+
+	if (rhtab->map.key_size == sizeof(long)) { /* same callbacks as rhtab_params_long */
+		params.hashfn = rhtab_hashfn_long;
+		params.obj_cmpfn = rhtab_key_cmp_long;
+	}
+
+	err = rhashtable_init(&rhtab->ht, &params);
+	if (err)
+		goto free_rhtab;
+
+	/* Set max_elems after rhashtable_init() since init zeroes the struct */
+	rhtab->ht.max_elems = rhtab->map.max_entries;
+
+	err = bpf_mem_alloc_init(&rhtab->ma, rhtab->elem_size, false);
+	if (err)
+		goto destroy_rhtab;
+
+	return &rhtab->map;
+
+destroy_rhtab: /* unwind in reverse order of setup */
+	rhashtable_destroy(&rhtab->ht);
+free_rhtab:
+	bpf_map_area_free(rhtab);
+	return ERR_PTR(err);
+}
+
+static int rhtab_map_alloc_check(union bpf_attr *attr)
+{
+	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) /* BPF_F_NO_PREALLOC is mandatory */
+		return -EINVAL;
+
+	if (attr->map_flags & BPF_F_ZERO_SEED) /* BPF_F_ZERO_SEED is rejected */
+		return -EINVAL;
+
+	if (attr->key_size > U16_MAX)
+		return -E2BIG;
+
+	if (attr->map_extra >> 32) /* upper 32 bits of map_extra must be zero */
+		return -EINVAL;
+
+	if ((u32)attr->map_extra > U16_MAX) /* the hint itself must fit in 16 bits */
+		return -E2BIG;
+
+	if ((u32)attr->map_extra > attr->max_entries) /* hint may not exceed max_entries */
+		return -EINVAL;
+
+	return htab_map_alloc_check(attr); /* remaining checks are shared with htab */
+}
+
+static void rhtab_check_and_free_fields(struct bpf_rhtab *rhtab,
+					struct rhtab_elem *elem)
+{
+	if (IS_ERR_OR_NULL(rhtab->map.record)) /* map has no btf record: nothing to free */
+		return;
+
+	bpf_obj_free_fields(rhtab->map.record,
+			    rhtab_elem_value(elem, rhtab->map.key_size)); /* operates on the value area only */
+}
+
+static void rhtab_mem_dtor(void *obj, void *ctx)
+{
+	struct htab_btf_record *hrec = ctx; /* ctx installed through bpf_ma_set_dtor() */
+	struct rhtab_elem *elem = obj;
+
+	if (IS_ERR_OR_NULL(hrec->record))
+		return;
+
+	bpf_obj_free_fields(hrec->record,
+			    rhtab_elem_value(elem, hrec->key_size)); /* key_size comes from ctx, not a map */
+}
+
+static void rhtab_free_elem(void *ptr, void *arg) /* callback passed to rhashtable_free_and_destroy() */
+{
+	struct bpf_rhtab *rhtab = arg;
+	struct rhtab_elem *elem = ptr;
+
+	bpf_map_free_internal_structs(&rhtab->map, rhtab_elem_value(elem, rhtab->map.key_size));
+	bpf_mem_cache_free_rcu(&rhtab->ma, elem);
+}
+
+static void rhtab_map_free(struct bpf_map *map)
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+
+	rhashtable_free_and_destroy(&rhtab->ht, rhtab_free_elem, rhtab); /* rhtab is the callback's arg */
+	bpf_mem_alloc_destroy(&rhtab->ma); /* allocator goes after the elements were handed back */
+	bpf_map_area_free(rhtab);
+}
+
+static void *rhtab_lookup_elem(struct bpf_map *map, void *key) /* callers treat NULL as "not found" */
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+
+	/* Hold RCU lock in case sleepable program calls via gen_lookup */
+	guard(rcu)();
+
+	if (map->key_size == sizeof(long)) /* pick the params the table was set up with */
+		return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params_long);
+
+	return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params);
+}
+
+static void *rhtab_map_lookup_elem(struct bpf_map *map, void *key) __must_hold(RCU)
+{
+	struct rhtab_elem *l;
+
+	l = rhtab_lookup_elem(map, key);
+	return l ? rhtab_elem_value(l, map->key_size) : NULL; /* value pointer, not the element */
+}
+
+static void rhtab_read_elem_value(struct bpf_map *map, void *dst, struct rhtab_elem *elem,
+				  u64 flags) /* copies elem's value into dst */
+{
+	void *src = rhtab_elem_value(elem, map->key_size);
+
+	if (flags & BPF_F_LOCK) /* caller asked for the locked copy variant */
+		copy_map_value_locked(map, dst, src, true);
+	else
+		copy_map_value(map, dst, src);
+}
+
+static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, void *copy,
+			     u64 flags) /* returns 0 or the rhashtable_remove_fast() error */
+{
+	int err;
+
+	/*
+	 * bpf_disable_instrumentation() mitigates deadlock for programs running in NMI context.
+	 * rhashtable locks bucket with local_irq_save(). Only NMI programs may reenter
+	 * rhashtable code, bpf_disable_instrumentation() disables programs running in NMI, except
+	 * raw tracepoints, which we don't have in rhashtable.
+	 */
+	bpf_disable_instrumentation();
+
+	if (rhtab->map.key_size == sizeof(long))
+		err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params_long);
+	else
+		err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params);
+
+	bpf_enable_instrumentation();
+
+	if (err) /* elem was not unlinked by this call: do not touch or free it */
+		return err;
+
+	if (copy) { /* optional: hand the old value back to the caller */
+		rhtab_read_elem_value(&rhtab->map, copy, elem, flags);
+		check_and_init_map_value(&rhtab->map, copy);
+	}
+	/* Release internal structs: kptr, bpf_timer, task_work, wq */
+	rhtab_check_and_free_fields(rhtab, elem);
+	bpf_mem_cache_free_rcu(&rhtab->ma, elem);
+	return 0;
+}
+
+
+static long rhtab_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+	struct rhtab_elem *elem;
+
+	guard(rcu)(); /* held from the lookup through the removal */
+
+	elem = rhtab_lookup_elem(map, key);
+	if (!elem)
+		return -ENOENT;
+
+	return rhtab_delete_elem(rhtab, elem, NULL, 0); /* NULL copy: old value is not returned */
+}
+
+static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags)
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+	struct rhtab_elem *elem;
+	int err;
+
+	err = bpf_map_check_op_flags(map, flags, BPF_F_LOCK); /* BPF_F_LOCK is the only flag passed as allowed */
+	if (err)
+		return err;
+
+	guard(rcu)(); /* held from the lookup through the removal */
+
+	elem = rhtab_lookup_elem(map, key);
+	if (!elem)
+		return -ENOENT;
+
+	return rhtab_delete_elem(rhtab, elem, value, flags); /* value receives a copy on success */
+}
+
+static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value,
+				      u64 map_flags)
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+	void *old_val = rhtab_elem_value(elem, map->key_size);
+
+	if (map_flags & BPF_NOEXIST) /* caller required the key to be absent */
+		return -EEXIST;
+
+	if (map_flags & BPF_F_LOCK)
+		copy_map_value_locked(map, old_val, value, false);
+	else
+		copy_map_value(map, old_val, value);
+
+	/*
+	 * Torn reads: a concurrent reader without BPF_F_LOCK may observe
+	 * the value mid-copy. Callers requiring consistent reads must use
+	 * BPF_F_LOCK, matching arraymap semantics.
+	 *
+	 * copy_map_value() skips special-field offsets, so old timers/
+	 * kptrs/etc. still sit in the slot. Release them after the copy
+	 * (via bpf_obj_free_fields()) to match arraymap's update semantics.
+	 */
+	rhtab_check_and_free_fields(rhtab, elem);
+	return 0;
+}
+
+static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags)
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+	struct rhtab_elem *elem, *tmp;
+
+	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown update mode */
+		return -EINVAL;
+
+	if ((map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))
+		return -EINVAL;
+
+	guard(rcu)();
+	elem = rhtab_lookup_elem(map, key);
+	if (elem) /* key already present: overwrite the value in place */
+		return rhtab_map_update_existing(map, elem, value, map_flags);
+
+	if (map_flags & BPF_EXIST) /* caller required the key to exist */
+		return -ENOENT;
+
+	/*
+	 * Reject new insertions while map_release_uref cleanup walks the
+	 * table. Without this, new elements could keep triggering rehash
+	 * and prevent the walk from terminating.
+	 */
+	if (READ_ONCE(rhtab->freeing_internal))
+		return -EBUSY;
+
+	/* Check max_entries limit before inserting new element */
+	if (atomic_read(&rhtab->ht.nelems) >= map->max_entries)
+		return -E2BIG;
+
+	elem = bpf_mem_cache_alloc(&rhtab->ma);
+	if (!elem)
+		return -ENOMEM;
+
+	memcpy(elem->data, key, map->key_size);
+	copy_map_value(map, rhtab_elem_value(elem, map->key_size), value);
+	check_and_init_map_value(map, rhtab_elem_value(elem, map->key_size));
+
+	/* Prevent deadlock for NMI programs attempting to take bucket lock */
+	bpf_disable_instrumentation();
+
+	if (map->key_size == sizeof(long))
+		tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params_long);
+	else
+		tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params);
+
+	bpf_enable_instrumentation();
+
+	if (tmp) { /* our elem was not inserted: tmp is an error or an existing entry */
+		bpf_mem_cache_free(&rhtab->ma, elem); /* elem never entered the table */
+		if (IS_ERR(tmp))
+			return PTR_ERR(tmp);
+
+		return rhtab_map_update_existing(map, tmp, value, map_flags);
+	}
+
+	return 0;
+}
+
+static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+	struct bpf_insn *insn = insn_buf;
+	const int ret = BPF_REG_0;
+
+	BUILD_BUG_ON(!__same_type(&rhtab_lookup_elem,
+				  (void *(*)(struct bpf_map *map, void *key)) NULL));
+	*insn++ = BPF_EMIT_CALL(rhtab_lookup_elem);
+	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); /* result is 0: jump over the add below */
+	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+				offsetof(struct rhtab_elem, data) + round_up(map->key_size, 8)); /* elem -> value */
+
+	return insn - insn_buf; /* number of instructions emitted */
+}
+
+static int rhtab_map_check_btf(struct bpf_map *map, const struct btf *btf,
+			       const struct btf_type *key_type,
+			       const struct btf_type *value_type) /* btf, key_type, value_type are unused */
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+
+	return bpf_ma_set_dtor(map, &rhtab->ma, rhtab_mem_dtor); /* installs rhtab_mem_dtor on rhtab->ma */
+}
+
+static void rhtab_map_free_internal_structs(struct bpf_map *map) /* wired as .map_release_uref */
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+	struct rhashtable_iter iter;
+	struct rhtab_elem *elem;
+
+	if (!bpf_map_has_internal_structs(map))
+		return;
+
+	/*
+	 * Block new insertions. Once observed, no new growth is triggered,
+	 * so any in-flight rehash will drain and the walker is guaranteed
+	 * to stop returning -EAGAIN. Treat -EAGAIN as "rehash in progress,
+	 * retry"; do not wait for the worker.
+	 */
+	WRITE_ONCE(rhtab->freeing_internal, true);
+
+	rhashtable_walk_enter(&rhtab->ht, &iter);
+	rhashtable_walk_start(&iter);
+
+	while ((elem = rhashtable_walk_next(&iter))) {
+		if (IS_ERR(elem)) {
+			if (PTR_ERR(elem) == -EAGAIN)
+				continue;
+			break; /* any other walker error ends the walk */
+		}
+
+		bpf_map_free_internal_structs(map, rhtab_elem_value(elem, map->key_size));
+
+		if (need_resched()) { /* Avoid stalls on large maps */
+			rhashtable_walk_stop(&iter);
+			cond_resched();
+			rhashtable_walk_start(&iter);
+		}
+	}
+
+	rhashtable_walk_stop(&iter);
+	rhashtable_walk_exit(&iter);
+	WRITE_ONCE(rhtab->freeing_internal, false); /* insertions are allowed again */
+}
+
+static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+	__must_hold_shared(RCU)
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+	struct rhtab_elem *elem;
+
+	elem = rhashtable_next_key(&rhtab->ht, key);
+
+	/* if not found, return the first key */
+	if (PTR_ERR(elem) == -ENOENT)
+		elem = rhashtable_next_key(&rhtab->ht, NULL);
+
+	if (IS_ERR(elem)) /* pass any remaining error straight to the caller */
+		return PTR_ERR(elem);
+	if (!elem) /* NULL result is reported as -ENOENT */
+		return -ENOENT;
+
+	memcpy(next_key, elem->data, map->key_size);
+	return 0;
+}
+
+static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m)
+{
+	void *value;
+
+	/* Hold RCU so the looked-up value is not freed while it is printed */
+	guard(rcu)();
+
+	value = rhtab_map_lookup_elem(map, key);
+	if (!value) /* key not present: print nothing */
+		return;
+
+	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+	seq_puts(m, ": ");
+	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
+	seq_putc(m, '\n');
+}
+
+static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+				void *callback_ctx, u64 flags)
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+	void *prev_key = NULL; /* NULL makes the first rhashtable_next_key() start from the beginning */
+	struct rhtab_elem *elem;
+	int num_elems = 0;
+	u64 ret = 0;
+
+	cant_migrate();
+
+	if (flags != 0) /* no flags are accepted */
+		return -EINVAL;
+
+	rcu_read_lock();
+	/*
+	 * Best-effort iteration: if rhashtable is concurrently resized or
+	 * elements are deleted/inserted, there may be missed or duplicate
+	 * elements visited.
+	 */
+	while ((elem = rhashtable_next_key(&rhtab->ht, prev_key))) {
+		if (IS_ERR(elem))
+			break;
+		num_elems++; /* counted before the callback runs */
+		ret = callback_fn((u64)(long)map,
+				  (u64)(long)elem->data,
+				  (u64)(long)rhtab_elem_value(elem, map->key_size),
+				  (u64)(long)callback_ctx, 0);
+		if (ret) /* non-zero return from the callback stops the walk */
+			break;
+
+		prev_key = elem->data;	/* valid while RCU held */
+	}
+	rcu_read_unlock();
+
+	return num_elems; /* number of callback invocations */
+}
+
+static u64 rhtab_map_mem_usage(const struct bpf_map *map)
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+	u64 num_entries;
+
+	/* Excludes rhashtable bucket overhead (~ nelems * sizeof(void *) at 75% load). */
+	num_entries = atomic_read(&rhtab->ht.nelems); /* current element count, not max_entries */
+	return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries;
+}
+
+static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map,
+					       const union bpf_attr *attr,
+					       union bpf_attr __user *uattr,
+					       bool do_delete) /* true: also remove every element copied out */
+{
+	struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+	void *cursor = NULL, *keys = NULL, *values = NULL, *dst_key, *dst_val;
+	struct rhtab_elem **del_elems = NULL;
+	u32 max_count, total, key_size, value_size, i;
+	bool has_next_cursor = false;
+	struct rhtab_elem *elem;
+	u64 elem_map_flags, map_flags;
+	int ret = 0;
+
+	elem_map_flags = attr->batch.elem_flags;
+	ret = bpf_map_check_op_flags(map, elem_map_flags, BPF_F_LOCK);
+	if (ret)
+		return ret;
+
+	map_flags = attr->batch.flags;
+	if (map_flags) /* no batch-level flags are accepted */
+		return -EINVAL;
+
+	max_count = attr->batch.count;
+	if (!max_count) /* nothing requested */
+		return 0;
+
+	if (put_user(0, &uattr->batch.count)) /* user-visible count stays 0 until a batch is copied out */
+		return -EFAULT;
+
+	key_size = map->key_size;
+	value_size = map->value_size;
+
+	keys = kvmalloc_array(max_count, key_size, GFP_USER | __GFP_NOWARN);
+	values = kvmalloc_array(max_count, value_size, GFP_USER | __GFP_NOWARN);
+	if (do_delete)
+		del_elems = kvmalloc_array(max_count, sizeof(void *),
+					   GFP_USER | __GFP_NOWARN);
+	cursor = kmalloc(key_size, GFP_USER | __GFP_NOWARN);
+
+	if (!keys || !values || !cursor || (do_delete && !del_elems)) { /* one check for all allocations */
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	if (ubatch && copy_from_user(cursor, ubatch, key_size)) {
+		ret = -EFAULT;
+		goto free;
+	}
+
+	dst_key = keys;
+	dst_val = values;
+	total = 0;
+
+	rcu_read_lock();
+
+	/*
+	 * Cursor stores the key of the next-to-process element (stashed by
+	 * the previous batch). Look it up directly so the element is included
+	 * here rather than skipped by next_key(). If the cursor was deleted
+	 * concurrently (or by the previous do_delete batch), return -EAGAIN
+	 * so userspace can distinguish a lost cursor from end-of-iteration
+	 * (-ENOENT) and restart from a NULL cursor.
+	 */
+	if (ubatch) {
+		elem = rhtab_lookup_elem(map, cursor);
+		if (!elem) {
+			rcu_read_unlock();
+			ret = -EAGAIN;
+			goto free;
+		}
+	} else {
+		elem = rhashtable_next_key(&rhtab->ht, NULL);
+	}
+
+	while (elem && !IS_ERR(elem) && total < max_count) {
+		memcpy(dst_key, elem->data, key_size);
+		rhtab_read_elem_value(map, dst_val, elem, elem_map_flags);
+		check_and_init_map_value(map, dst_val);
+
+		if (do_delete)
+			del_elems[total] = elem; /* removal is deferred until after the scan */
+
+		elem = rhashtable_next_key(&rhtab->ht, dst_key); /* dst_key still holds the key just copied */
+		dst_key += key_size;
+		dst_val += value_size;
+		total++;
+
+		/* Bail to userspace to avoid stalls. */
+		if (need_resched())
+			break;
+	}
+
+	if (elem && !IS_ERR(elem)) {
+		/* Stash next-to-process key as cursor for the next batch. */
+		memcpy(cursor, elem->data, key_size);
+		has_next_cursor = true;
+	}
+
+	if (do_delete) {
+		for (i = 0; i < total; i++)
+			rhtab_delete_elem(rhtab, del_elems[i], NULL, 0); /* NOTE(review): return value is ignored */
+	}
+
+	rcu_read_unlock();
+
+	if (total == 0) { /* nothing was collected; an ERR_PTR from the scan also lands here */
+		ret = -ENOENT;
+		goto free;
+	}
+
+	/* No more elements after this batch. */
+	if (!has_next_cursor)
+		ret = -ENOENT;
+
+	if (copy_to_user(ukeys, keys, (size_t)total * key_size) ||
+	    copy_to_user(uvalues, values, (size_t)total * value_size) ||
+	    put_user(total, &uattr->batch.count) ||
+	    (has_next_cursor &&
+	     copy_to_user(u64_to_user_ptr(attr->batch.out_batch),
+			  cursor, key_size))) {
+		ret = -EFAULT; /* overrides a pending -ENOENT */
+		goto free;
+	}
+
+free:
+	kfree(cursor);
+	kvfree(keys);
+	kvfree(values);
+	kvfree(del_elems);
+	return ret;
+}
+
+static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, false); /* do_delete = false */
+}
+
+static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr,
+					     union bpf_attr __user *uattr)
+{
+	return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, true); /* do_delete = true */
+}
+
+struct bpf_iter_seq_rhash_map_info { /* seq_file private data for the rhash map iterator */
+	struct bpf_map *map; /* reference taken in bpf_iter_init_rhash_map() */
+	struct bpf_rhtab *rhtab; /* container_of(map) */
+	struct rhashtable_iter iter; /* entered in init, exited in fini */
+};
+
+static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct bpf_iter_seq_rhash_map_info *info = seq->private;
+	struct rhtab_elem *elem;
+
+	rhashtable_walk_start(&info->iter); /* paired with walk_stop in bpf_rhash_map_seq_stop() */
+	/*
+	 * Re-deliver the element returned by walk_next() at the end of the
+	 * previous read() — bpf_seq_read may have stopped before show()
+	 * consumed it. Rehash rewinds the walker; retry on -EAGAIN.
+	 */
+	do {
+		elem = rhashtable_walk_peek(&info->iter);
+	} while (PTR_ERR(elem) == -EAGAIN);
+
+	if (IS_ERR(elem)) /* any other walker error is reported as NULL */
+		return NULL;
+
+	if (elem && *pos == 0)
+		++*pos;
+	return elem;
+}
+
+static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_rhash_map_info *info = seq->private;
+	struct rhtab_elem *elem;
+
+	++*pos; /* advanced unconditionally, even when no element follows */
+
+	/* Rehash rewinds the walker; retry until it stops returning -EAGAIN. */
+	do {
+		elem = rhashtable_walk_next(&info->iter);
+	} while (PTR_ERR(elem) == -EAGAIN);
+
+	if (IS_ERR(elem)) /* any other walker error is reported as NULL */
+		return NULL;
+	return elem;
+}
+
+static int __bpf_rhash_map_seq_show(struct seq_file *seq,
+				    struct rhtab_elem *elem) /* NULL elem: call made from stop() */
+{
+	struct bpf_iter_seq_rhash_map_info *info = seq->private;
+	struct bpf_iter__bpf_map_elem ctx = {};
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, elem == NULL);
+	if (prog) { /* no program returned: nothing is run and 0 is returned */
+		ctx.meta = &meta;
+		ctx.map = info->map;
+		if (elem) { /* otherwise key/value stay NULL from the zero-initialized ctx */
+			ctx.key = elem->data;
+			ctx.value = rhtab_elem_value(elem, info->map->key_size);
+		}
+		ret = bpf_iter_run_prog(prog, &ctx);
+	}
+
+	return ret;
+}
+
+static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v)
+{
+	return __bpf_rhash_map_seq_show(seq, v); /* v is the element returned by start()/next() */
+}
+
+static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	struct bpf_iter_seq_rhash_map_info *info = seq->private;
+
+	if (!v) /* no current element: make one more show call with a NULL element */
+		(void)__bpf_rhash_map_seq_show(seq, NULL);
+
+	rhashtable_walk_stop(&info->iter); /* paired with walk_start in bpf_rhash_map_seq_start() */
+}
+
+static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+	struct bpf_iter_seq_rhash_map_info *info = priv_data;
+	struct bpf_map *map = aux->map;
+
+	bpf_map_inc_with_uref(map); /* dropped in bpf_iter_fini_rhash_map() */
+	info->map = map;
+	info->rhtab = container_of(map, struct bpf_rhtab, map);
+	rhashtable_walk_enter(&info->rhtab->ht, &info->iter); /* exited in bpf_iter_fini_rhash_map() */
+	return 0;
+}
+
+static void bpf_iter_fini_rhash_map(void *priv_data)
+{
+	struct bpf_iter_seq_rhash_map_info *info = priv_data;
+
+	rhashtable_walk_exit(&info->iter); /* undoes walk_enter from bpf_iter_init_rhash_map() */
+	bpf_map_put_with_uref(info->map); /* undoes bpf_map_inc_with_uref() from init */
+}
+
+static const struct seq_operations bpf_rhash_map_seq_ops = { /* referenced by rhash_iter_seq_info */
+	.start = bpf_rhash_map_seq_start,
+	.next = bpf_rhash_map_seq_next,
+	.stop = bpf_rhash_map_seq_stop,
+	.show = bpf_rhash_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info rhash_iter_seq_info = { /* referenced by rhtab_map_ops.iter_seq_info */
+	.seq_ops = &bpf_rhash_map_seq_ops,
+	.init_seq_private = bpf_iter_init_rhash_map,
+	.fini_seq_private = bpf_iter_fini_rhash_map,
+	.seq_priv_size = sizeof(struct bpf_iter_seq_rhash_map_info),
+};
+
+BTF_ID_LIST_SINGLE(rhtab_map_btf_ids, struct, bpf_rhtab)
+const struct bpf_map_ops rhtab_map_ops = { /* operations table for the rhashtable-backed map */
+	.map_meta_equal = bpf_map_meta_equal,
+	.map_alloc_check = rhtab_map_alloc_check,
+	.map_alloc = rhtab_map_alloc,
+	.map_free = rhtab_map_free,
+	.map_get_next_key = rhtab_map_get_next_key,
+	.map_release_uref = rhtab_map_free_internal_structs,
+	.map_check_btf = rhtab_map_check_btf,
+	.map_lookup_elem = rhtab_map_lookup_elem,
+	.map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem,
+	.map_update_elem = rhtab_map_update_elem,
+	.map_delete_elem = rhtab_map_delete_elem,
+	.map_gen_lookup = rhtab_map_gen_lookup,
+	.map_seq_show_elem = rhtab_map_seq_show_elem,
+	.map_set_for_each_callback_args = map_set_for_each_callback_args,
+	.map_for_each_callback = bpf_each_rhash_elem,
+	.map_mem_usage = rhtab_map_mem_usage,
+	BATCH_OPS(rhtab), /* presumably expands to the rhtab_map_*_batch handlers - confirm against the macro */
+	.map_btf_id = &rhtab_map_btf_ids[0],
+	.iter_seq_info = &rhash_iter_seq_info,
+};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b5314c9fed3c..8e196c9b7c50 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1944,7 +1944,7 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
-	.arg3_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+	.arg3_type	= ARG_PTR_TO_DYNPTR,
 	.arg4_type	= ARG_ANYTHING,
 	.arg5_type	= ARG_ANYTHING,
 };
@@ -2001,7 +2001,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
 	.func		= bpf_dynptr_write,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+	.arg1_type	= ARG_PTR_TO_DYNPTR,
 	.arg2_type	= ARG_ANYTHING,
 	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
 	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
@@ -2044,7 +2044,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = {
 	.func		= bpf_dynptr_data,
 	.gpl_only	= false,
 	.ret_type	= RET_PTR_TO_DYNPTR_MEM_OR_NULL,
-	.arg1_type	= ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+	.arg1_type	= ARG_PTR_TO_DYNPTR,
 	.arg2_type	= ARG_ANYTHING,
 	.arg3_type	= ARG_CONST_ALLOC_SIZE_OR_ZERO,
 };
@@ -2247,10 +2247,11 @@ EXPORT_SYMBOL_GPL(bpf_base_func_proto);
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock)
 {
-	struct list_head *head = list_head, *orig_head = list_head;
+	struct list_head *head = list_head, drain, *pos, *n;
 
 	BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
 	BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
+	INIT_LIST_HEAD(&drain);
 
 	/* Do the actual list draining outside the lock to not hold the lock for
 	 * too long, and also prevent deadlocks if tracing programs end up
@@ -2261,20 +2262,30 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head,
 	__bpf_spin_lock_irqsave(spin_lock);
 	if (!head->next || list_empty(head))
 		goto unlock;
-	head = head->next;
+	list_for_each_safe(pos, n, head) {
+		struct bpf_list_node_kern *node;
+
+		node = container_of(pos, struct bpf_list_node_kern, list_head);
+		WRITE_ONCE(node->owner, BPF_PTR_POISON);
+		list_move_tail(pos, &drain);
+	}
 unlock:
-	INIT_LIST_HEAD(orig_head);
+	INIT_LIST_HEAD(head);
 	__bpf_spin_unlock_irqrestore(spin_lock);
 
-	while (head != orig_head) {
-		void *obj = head;
+	while (!list_empty(&drain)) {
+		struct bpf_list_node_kern *node;
 
-		obj -= field->graph_root.node_offset;
-		head = head->next;
+		pos = drain.next;
+		node = container_of(pos, struct bpf_list_node_kern, list_head);
+		list_del_init(pos);
+		/* Ensure __bpf_list_add() sees the node as unlinked. */
+		smp_store_release(&node->owner, NULL);
 		/* The contained type can also have resources, including a
 		 * bpf_list_head which needs to be freed.
 		 */
-		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
+		__bpf_obj_drop_impl((char *)pos - field->graph_root.node_offset,
+				    field->graph_root.value_rec, false);
 	}
 }
 
@@ -2295,6 +2306,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 		      struct bpf_spin_lock *spin_lock)
 {
 	struct rb_root_cached orig_root, *root = rb_root;
+	struct bpf_rb_node_kern *node;
 	struct rb_node *pos, *n;
 	void *obj;
 
@@ -2303,14 +2315,20 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 
 	__bpf_spin_lock_irqsave(spin_lock);
 	orig_root = *root;
+	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
+		node = rb_entry(pos, struct bpf_rb_node_kern, rb_node);
+		WRITE_ONCE(node->owner, BPF_PTR_POISON);
+	}
 	*root = RB_ROOT_CACHED;
 	__bpf_spin_unlock_irqrestore(spin_lock);
 
 	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
 		obj = pos;
 		obj -= field->graph_root.node_offset;
-
-
+		node = rb_entry(pos, struct bpf_rb_node_kern, rb_node);
+		RB_CLEAR_NODE(pos);
+		/* Ensure __bpf_rbtree_add() sees the node as unlinked. */
+		smp_store_release(&node->owner, NULL);
 		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
 	}
 }
@@ -2467,9 +2485,11 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta
 
 static int __bpf_list_add(struct bpf_list_node_kern *node,
 			  struct bpf_list_head *head,
-			  bool tail, struct btf_record *rec, u64 off)
+			  struct list_head **prev_ptr,
+			  struct btf_record *rec, u64 off)
 {
 	struct list_head *n = &node->list_head, *h = (void *)head;
+	struct list_head *prev;
 
 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
 	 * called on its fields, so init here
@@ -2477,19 +2497,31 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
 	if (unlikely(!h->next))
 		INIT_LIST_HEAD(h);
 
+	prev = *prev_ptr;
+
+	/* When prev is not the list head, it must be a node in this list. */
+	if (prev != h) {
+		struct bpf_list_node_kern *prev_kn =
+			container_of(prev, struct bpf_list_node_kern, list_head);
+
+		if (unlikely(READ_ONCE(prev_kn->owner) != head))
+			goto fail;
+	}
+
 	/* node->owner != NULL implies !list_empty(n), no need to separately
 	 * check the latter
 	 */
-	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
-		/* Only called from BPF prog, no need to migrate_disable */
-		__bpf_obj_drop_impl((void *)n - off, rec, false);
-		return -EINVAL;
-	}
+	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON))
+		goto fail;
 
-	tail ? list_add_tail(n, h) : list_add(n, h);
+	list_add(n, prev);
 	WRITE_ONCE(node->owner, head);
-
 	return 0;
+
+fail:
+	/* Only called from BPF prog, no need to migrate_disable */
+	__bpf_obj_drop_impl((void *)n - off, rec, false);
+	return -EINVAL;
 }
 
 /**
@@ -2510,8 +2542,9 @@ __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head,
 				    u64 off)
 {
 	struct bpf_list_node_kern *n = (void *)node;
+	struct list_head *h = (void *)head;
 
-	return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
+	return __bpf_list_add(n, head, &h, meta ? meta->record : NULL, off);
 }
 
 __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
@@ -2539,8 +2572,9 @@ __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head,
 				   u64 off)
 {
 	struct bpf_list_node_kern *n = (void *)node;
+	struct list_head *h = (void *)head;
 
-	return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
+	return __bpf_list_add(n, head, &h->prev, meta ? meta->record : NULL, off);
 }
 
 __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
@@ -2550,37 +2584,63 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
 	return bpf_list_push_back(head, node, meta__ign, off);
 }
 
-static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
+__bpf_kfunc int bpf_list_add(struct bpf_list_head *head, struct bpf_list_node *new,
+			     struct bpf_list_node *prev__nonown_allowed,
+			     struct btf_struct_meta *meta, u64 off)
+{
+	struct bpf_list_node_kern *anchor = (void *)prev__nonown_allowed;
+	struct list_head *after = &anchor->list_head; /* new is linked right behind this */
+
+	return __bpf_list_add((void *)new, head, &after, meta ? meta->record : NULL, off);
+}
+
+static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head,
+					    struct list_head *n)
 {
-	struct list_head *n, *h = (void *)head;
+	struct list_head *h = (void *)head;
 	struct bpf_list_node_kern *node;
 
 	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
 	 * called on its fields, so init here
 	 */
-	if (unlikely(!h->next))
+	if (unlikely(!h->next)) {
 		INIT_LIST_HEAD(h);
+		return NULL;
+	}
 	if (list_empty(h))
 		return NULL;
 
-	n = tail ? h->prev : h->next;
 	node = container_of(n, struct bpf_list_node_kern, list_head);
-	if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
+	if (unlikely(READ_ONCE(node->owner) != head))
 		return NULL;
 
 	list_del_init(n);
-	WRITE_ONCE(node->owner, NULL);
+	/* Ensure __bpf_list_add() sees the node as unlinked. */
+	smp_store_release(&node->owner, NULL);
 	return (struct bpf_list_node *)n;
 }
 
 __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
 {
-	return __bpf_list_del(head, false);
+	struct list_head *h = (void *)head;
+
+	return __bpf_list_del(head, h->next);
 }
 
 __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
 {
-	return __bpf_list_del(head, true);
+	struct list_head *h = (void *)head;
+
+	return __bpf_list_del(head, h->prev);
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_del(struct bpf_list_head *head,
+					       struct bpf_list_node *node__nonown_allowed)
+{
+	struct bpf_list_node_kern *victim = (struct bpf_list_node_kern *)node__nonown_allowed;
+
+	/* The verifier guarantees this is a list node, never the list head itself. */
+	return __bpf_list_del(head, &victim->list_head);
+}
 
 __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
@@ -2603,6 +2663,43 @@ __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head)
 	return (struct bpf_list_node *)h->prev;
 }
 
+/* Report whether the node is the first entry of @head. A node whose owner
+ * is not @head is never considered first.
+ */
+__bpf_kfunc bool bpf_list_is_first(struct bpf_list_head *head,
+				   struct bpf_list_node *node__nonown_allowed)
+{
+	struct bpf_list_node_kern *entry = (void *)node__nonown_allowed;
+	bool owned = READ_ONCE(entry->owner) == head;
+
+	return owned && list_is_first(&entry->list_head, (struct list_head *)head);
+}
+
+/* Report whether the node is the last entry of @head. A node whose owner
+ * is not @head is never considered last.
+ */
+__bpf_kfunc bool bpf_list_is_last(struct bpf_list_head *head,
+				  struct bpf_list_node *node__nonown_allowed)
+{
+	struct bpf_list_node_kern *entry = (void *)node__nonown_allowed;
+	bool owned = READ_ONCE(entry->owner) == head;
+
+	return owned && list_is_last(&entry->list_head, (struct list_head *)head);
+}
+
+__bpf_kfunc bool bpf_list_empty(struct bpf_list_head *head)
+{
+	struct list_head *list = (void *)head;
+
+	/* A list_head that the map zero-filled never went through
+	 * bpf_obj_init_field(), so it has to be initialized lazily here
+	 * before list_empty() can inspect it.
+	 */
+	if (unlikely(list->next == NULL))
+		INIT_LIST_HEAD(list);
+	return list_empty(list);
+}
+
 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 						  struct bpf_rb_node *node)
 {
@@ -2912,11 +3009,13 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
 {
 	struct task_struct *p;
 
-	rcu_read_lock();
+	guard(rcu)();
+	if (!task_active_pid_ns(current))
+		return NULL;
+
 	p = find_task_by_vpid(vpid);
 	if (p)
 		p = bpf_task_acquire(p);
-	rcu_read_unlock();
 
 	return p;
 }
@@ -3072,7 +3171,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
 	return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk);
 }
 
-__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end)
+__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr *p, u64 start, u64 end)
 {
 	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
 	u64 size;
@@ -3093,14 +3192,14 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end
 
 __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)
 {
-	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
 
 	return !ptr->data;
 }
 
 __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
 {
-	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
 
 	if (!ptr->data)
 		return false;
@@ -3110,7 +3209,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
 
 __bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p)
 {
-	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
 
 	if (!ptr->data)
 		return -EINVAL;
@@ -3122,7 +3221,7 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
 				 struct bpf_dynptr *clone__uninit)
 {
 	struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit;
-	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
 
 	if (!ptr->data) {
 		bpf_dynptr_set_null(clone);
@@ -3145,11 +3244,11 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
  * Copies data from source dynptr to destination dynptr.
  * Returns 0 on success; negative error, otherwise.
  */
-__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off,
-				struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
+__bpf_kfunc int bpf_dynptr_copy(const struct bpf_dynptr *dst_ptr, u64 dst_off,
+				const struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
 {
-	struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
-	struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
+	const struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
+	const struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
 	void *src_slice, *dst_slice;
 	char buf[256];
 	u64 off;
@@ -3200,9 +3299,9 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off,
  * at @offset with the constant byte @val.
  * Returns 0 on success; negative error, otherwise.
  */
-__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
+__bpf_kfunc int bpf_dynptr_memset(const struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
 {
-	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+	const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
 	u64 chunk_sz, write_off;
 	char buf[256];
 	void* slice;
@@ -3301,7 +3400,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
 	 * which skips compiler generated instrumentation to do the same.
 	 */
 	kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
-	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0);
+	ctx.aux->bpf_exception_cb(cookie, ctx.sp + ctx.aux->stack_arg_sp_adjust, ctx.bp, 0, 0);
 	WARN(1, "A call to BPF exception callback should never return\n");
 }
 
@@ -4214,13 +4313,13 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
  *
  * Return: 0 on success, a negative value on error.
  */
-__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
-			       struct bpf_dynptr *sig_p,
+__bpf_kfunc int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p,
+			       const struct bpf_dynptr *sig_p,
 			       struct bpf_key *trusted_keyring)
 {
 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
-	struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
-	struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
+	const struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
+	const struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
 	const void *data, *sig;
 	u32 data_len, sig_len;
 	int ret;
@@ -4718,10 +4817,15 @@ BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_list_push_front_impl)
 BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_list_push_back_impl)
+BTF_ID_FLAGS(func, bpf_list_add, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_is_first)
+BTF_ID_FLAGS(func, bpf_list_is_last)
+BTF_ID_FLAGS(func, bpf_list_empty)
 BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
@@ -4862,7 +4966,7 @@ BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_dynptr_from_file)
-BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
+BTF_ID_FLAGS(func, bpf_dynptr_file_discard, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_timer_cancel_async)
 BTF_KFUNCS_END(common_btf_ids)
 
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index c3f79b5a2f8c..7837968c0842 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -845,6 +845,10 @@ static void bpf_destroy_inode(struct inode *inode)
 	simple_xattrs_free(&opts->xa_cache, &bi->xattrs, NULL);
 }
 
+/*
+ * Called after an RCU grace period - safe to free the inode and anything
+ * that might be accessed by RCU pathwalk (inode fields, i_link).
+ */
 static void bpf_free_inode(struct inode *inode)
 {
 	if (S_ISLNK(inode->i_mode))
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index 58197d73b120..0aadfbae0acc 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -610,6 +610,21 @@ enum arg_track_state {
 /* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */
 #define MAX_ARG_SPILL_SLOTS 64
 
+/*
+ * Combined register + stack arg tracking: R0-R10 at indices 0-10,
+ * outgoing stack arg slots at indices MAX_BPF_REG..MAX_AT_TRACK_REGS-1.
+ */
+#define MAX_AT_TRACK_REGS (MAX_BPF_REG + MAX_STACK_ARG_SLOTS)
+
+/* Map a stack arg byte offset (either sign) to a 0-based slot index, or -1. */
+static int stack_arg_off_to_slot(s16 off)
+{
+	int slot = (off < 0 ? -off : off) / 8 - 1;
+
+	/* Offsets past the last slot are rejected; |off| < 8 already yields -1. */
+	return slot < MAX_STACK_ARG_SLOTS ? slot : -1;
+}
+
 static bool arg_is_visited(const struct arg_track *at)
 {
 	return at->frame != ARG_UNVISITED;
@@ -791,7 +806,9 @@ static bool arg_track_join(struct bpf_verifier_env *env, int idx, int target, in
 		return true;
 
 	verbose(env, "arg JOIN insn %d -> %d ", idx, target);
-	if (r >= 0)
+	if (r >= MAX_BPF_REG)
+		verbose(env, "sa%d: ", r - MAX_BPF_REG);
+	else if (r >= 0)
 		verbose(env, "r%d: ", r);
 	else
 		verbose(env, "fp%+d: ", r * 8);
@@ -1032,6 +1049,21 @@ static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, i
 		verbose(env, "\tr%d: ", i); verbose_arg_track(env, &at_in[i]);
 		verbose(env, " -> "); verbose_arg_track(env, &at_out[i]);
 	}
+	/* Log outgoing stack arg slot transitions at indices MAX_BPF_REG..MAX_AT_TRACK_REGS-1 */
+	for (i = 0; i < MAX_STACK_ARG_SLOTS; i++) {
+		int ai = MAX_BPF_REG + i;
+
+		if (arg_track_eq(&at_out[ai], &at_in[ai]))
+			continue;
+		if (!printed) {
+			verbose(env, "%3d: ", idx);
+			bpf_verbose_insn(env, insn);
+			bpf_vlog_reset(&env->log, env->log.end_pos - 1);
+			printed = true;
+		}
+		verbose(env, "\tsa%d: ", i); verbose_arg_track(env, &at_in[ai]);
+		verbose(env, " -> "); verbose_arg_track(env, &at_out[ai]);
+	}
 	for (i = 0; i < MAX_ARG_SPILL_SLOTS; i++) {
 		if (arg_track_eq(&at_stack_out[i], &at_stack_in[i]))
 			continue;
@@ -1062,6 +1094,7 @@ static bool can_be_local_fp(int depth, int regno, struct arg_track *at)
 static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			   int insn_idx,
 			   struct arg_track *at_out, struct arg_track *at_stack_out,
+			   const struct arg_track *at_stack_arg_entry,
 			   struct func_instance *instance,
 			   u32 *callsites)
 {
@@ -1071,9 +1104,21 @@ static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	struct arg_track *dst = &at_out[insn->dst_reg];
 	struct arg_track *src = &at_out[insn->src_reg];
 	struct arg_track none = { .frame = ARG_NONE };
-	int r;
-
-	if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) {
+	int r, slot;
+
+	/* Handle stack arg stores and loads. */
+	if (is_stack_arg_st(insn) || is_stack_arg_stx(insn)) {
+		slot = stack_arg_off_to_slot(insn->off);
+		if (slot >= 0) {
+			if (is_stack_arg_stx(insn))
+				at_out[MAX_BPF_REG + slot] = at_out[insn->src_reg];
+			else
+				at_out[MAX_BPF_REG + slot] = none;
+		}
+	} else if (is_stack_arg_ldx(insn)) {
+		slot = stack_arg_off_to_slot(insn->off);
+		at_out[insn->dst_reg] = (slot >= 0) ? at_stack_arg_entry[slot] : none;
+	} else if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) {
 		if (code == BPF_MOV) {
 			*dst = none;
 		} else if (dst->frame >= 0) {
@@ -1297,6 +1342,16 @@ static int record_load_store_access(struct bpf_verifier_env *env,
 	struct arg_track resolved, *ptr;
 	int oi;
 
+	/*
+	 * Stack arg insns use dst_reg/src_reg=BPF_REG_PARAMS(11). Since at[]
+	 * is extended to MAX_AT_TRACK_REGS, at[11] holds the arg_track for
+	 * outgoing stack arg slot 0 — not the pointer used for the memory
+	 * access. Skip so the slot's tracked value isn't confused with the
+	 * base register that record_stack_access() expects.
+	 */
+	if (is_stack_arg_stx(insn) || is_stack_arg_st(insn) || is_stack_arg_ldx(insn))
+		return 0;
+
 	switch (class) {
 	case BPF_LDX:
 		ptr = &at[insn->src_reg];
@@ -1343,6 +1398,42 @@ static int record_load_store_access(struct bpf_verifier_env *env,
 	return 0;
 }
 
+/* Record stack access for a single call argument 'at' of helper/kfunc 'insn' */
+static int record_arg_access(struct bpf_verifier_env *env,
+			     struct func_instance *instance,
+			     struct bpf_insn *insn,
+			     struct arg_track *at, int arg_idx,
+			     int insn_idx)
+{
+	int depth = instance->depth, frame = at->frame, err;
+	bool helper;
+	s64 bytes;
+
+	if (!arg_is_fp(at))
+		return 0;
+
+	helper = bpf_helper_call(insn);
+	if (!helper && !bpf_pseudo_kfunc_call(insn)) {
+		/* Neither helper nor kfunc: mark SPIS_ALL read in frames 0..depth */
+		for (int f = 0; f <= depth; f++) {
+			err = mark_stack_read(instance, f, insn_idx, SPIS_ALL);
+			if (err)
+				return err;
+		}
+		return 0;
+	}
+
+	bytes = helper ? bpf_helper_stack_access_bytes(env, insn, arg_idx, insn_idx) :
+			 bpf_kfunc_stack_access_bytes(env, insn, arg_idx, insn_idx);
+	if (bytes == 0)
+		return 0;
+	if (frame >= 0 && frame <= depth)
+		return record_stack_access(instance, at, bytes, frame, insn_idx);
+	if (frame == ARG_IMPRECISE)
+		return record_imprecise(instance, at->mask, insn_idx);
+	return 0;
+}
+
 /* Record stack access for a given 'at' state of helper/kfunc 'insn' */
 static int record_call_access(struct bpf_verifier_env *env,
 			      struct func_instance *instance,
@@ -1350,9 +1441,8 @@ static int record_call_access(struct bpf_verifier_env *env,
 			      int insn_idx)
 {
 	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
-	int depth = instance->depth;
 	struct bpf_call_summary cs;
-	int r, err = 0, num_params = 5;
+	int r, err, num_params = 5;
 
 	if (bpf_pseudo_call(insn))
 		return 0;
@@ -1360,32 +1450,15 @@ static int record_call_access(struct bpf_verifier_env *env,
 	if (bpf_get_call_summary(env, insn, &cs))
 		num_params = cs.num_params;
 
-	for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) {
-		int frame = at[r].frame;
-		s64 bytes;
-
-		if (!arg_is_fp(&at[r]))
-			continue;
-
-		if (bpf_helper_call(insn)) {
-			bytes = bpf_helper_stack_access_bytes(env, insn, r - 1, insn_idx);
-		} else if (bpf_pseudo_kfunc_call(insn)) {
-			bytes = bpf_kfunc_stack_access_bytes(env, insn, r - 1, insn_idx);
-		} else {
-			for (int f = 0; f <= depth; f++) {
-				err = mark_stack_read(instance, f, insn_idx, SPIS_ALL);
-				if (err)
-					return err;
-			}
-			return 0;
-		}
-		if (bytes == 0)
-			continue;
+	for (r = BPF_REG_1; r < BPF_REG_1 + min(num_params, MAX_BPF_FUNC_REG_ARGS); r++) {
+		err = record_arg_access(env, instance, insn, &at[r], r - 1, insn_idx);
+		if (err)
+			return err;
+	}
 
-		if (frame >= 0 && frame <= depth)
-			err = record_stack_access(instance, &at[r], bytes, frame, insn_idx);
-		else if (frame == ARG_IMPRECISE)
-			err = record_imprecise(instance, at[r].mask, insn_idx);
+	for (r = 0; r < MAX_STACK_ARG_SLOTS && r < num_params - MAX_BPF_FUNC_REG_ARGS; r++) {
+		err = record_arg_access(env, instance, insn, &at[MAX_BPF_REG + r],
+					r + MAX_BPF_FUNC_REG_ARGS, insn_idx);
 		if (err)
 			return err;
 	}
@@ -1445,7 +1518,7 @@ static int find_callback_subprog(struct bpf_verifier_env *env,
 
 /* Per-subprog intermediate state kept alive across analysis phases */
 struct subprog_at_info {
-	struct arg_track (*at_in)[MAX_BPF_REG];
+	struct arg_track (*at_in)[MAX_AT_TRACK_REGS];
 	int len;
 };
 
@@ -1479,6 +1552,9 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env,
 			for (r = 0; r < MAX_BPF_REG - 1; r++)
 				if (arg_is_fp(&info->at_in[i][r]))
 					has_extra = true;
+			for (r = 0; r < MAX_STACK_ARG_SLOTS; r++)
+				if (arg_is_fp(&info->at_in[i][MAX_BPF_REG + r]))
+					has_extra = true;
 		}
 		if (is_ldx_stx_call) {
 			for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
@@ -1503,6 +1579,12 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env,
 				verbose(env, " r%d=", r);
 				verbose_arg_track(env, &info->at_in[i][r]);
 			}
+			for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) {
+				if (!arg_is_fp(&info->at_in[i][MAX_BPF_REG + r]))
+					continue;
+				verbose(env, " sa%d=", r);
+				verbose_arg_track(env, &info->at_in[i][MAX_BPF_REG + r]);
+			}
 		}
 
 		if (is_ldx_stx_call) {
@@ -1525,7 +1607,7 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env,
  * Runs forward fixed-point with arg_track_xfer(), then records
  * memory accesses in a single linear pass over converged state.
  *
- * @callee_entry: pre-populated entry state for R1-R5
+ * @callee_entry: pre-populated entry state for R1-R5 and stack args
  *                NULL for main (subprog 0).
  * @info:         stores at_in, len for debug printing.
  */
@@ -1543,10 +1625,11 @@ static int compute_subprog_args(struct bpf_verifier_env *env,
 	int end = env->subprog_info[subprog + 1].start;
 	int po_end = env->subprog_info[subprog + 1].postorder_start;
 	int len = end - start;
-	struct arg_track (*at_in)[MAX_BPF_REG] = NULL;
-	struct arg_track at_out[MAX_BPF_REG];
+	struct arg_track (*at_in)[MAX_AT_TRACK_REGS] = NULL;
+	struct arg_track at_out[MAX_AT_TRACK_REGS];
 	struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS] = NULL;
 	struct arg_track *at_stack_out = NULL;
+	struct arg_track at_stack_arg_entry[MAX_STACK_ARG_SLOTS];
 	struct arg_track unvisited = { .frame = ARG_UNVISITED };
 	struct arg_track none = { .frame = ARG_NONE };
 	bool changed;
@@ -1565,13 +1648,13 @@ static int compute_subprog_args(struct bpf_verifier_env *env,
 		goto err_free;
 
 	for (i = 0; i < len; i++) {
-		for (r = 0; r < MAX_BPF_REG; r++)
+		for (r = 0; r < MAX_AT_TRACK_REGS; r++)
 			at_in[i][r] = unvisited;
 		for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
 			at_stack_in[i][r] = unvisited;
 	}
 
-	for (r = 0; r < MAX_BPF_REG; r++)
+	for (r = 0; r < MAX_AT_TRACK_REGS; r++)
 		at_in[0][r] = none;
 
 	/* Entry: R10 is always precisely the current frame's FP */
@@ -1587,6 +1670,10 @@ static int compute_subprog_args(struct bpf_verifier_env *env,
 	for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
 		at_stack_in[0][r] = none;
 
+	/* Entry: incoming stack args from caller, or ARG_NONE for main */
+	for (r = 0; r < MAX_STACK_ARG_SLOTS; r++)
+		at_stack_arg_entry[r] = callee_entry ? callee_entry[MAX_BPF_REG + r] : none;
+
 	if (env->log.level & BPF_LOG_LEVEL2)
 		verbose(env, "subprog#%d: analyzing (depth %d)...\n", subprog, depth);
 
@@ -1605,7 +1692,8 @@ redo:
 		memcpy(at_out, at_in[i], sizeof(at_out));
 		memcpy(at_stack_out, at_stack_in[i], MAX_ARG_SPILL_SLOTS * sizeof(*at_stack_out));
 
-		arg_track_xfer(env, insn, idx, at_out, at_stack_out, instance, callsites);
+		arg_track_xfer(env, insn, idx, at_out, at_stack_out,
+			       at_stack_arg_entry, instance, callsites);
 		arg_track_log(env, insn, idx, at_in[i], at_stack_in[i], at_out, at_stack_out);
 
 		/* Propagate to successors within this subprogram */
@@ -1619,7 +1707,7 @@ redo:
 				continue;
 			ti = target - start;
 
-			for (r = 0; r < MAX_BPF_REG; r++)
+			for (r = 0; r < MAX_AT_TRACK_REGS; r++)
 				changed |= arg_track_join(env, idx, target, r,
 							  &at_in[ti][r], at_out[r]);
 
@@ -1674,11 +1762,14 @@ err_free:
 	return err;
 }
 
-/* Return true if any of R1-R5 is derived from a frame pointer. */
+/* Return true if any of R1-R5 or stack args is derived from a frame pointer. */
 static bool has_fp_args(struct arg_track *args)
 {
 	for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
-		if (args[r].frame != ARG_NONE)
+		if (arg_is_fp(&args[r]))
+			return true;
+	for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++)
+		if (arg_is_fp(&args[MAX_BPF_REG + r]))
 			return true;
 	return false;
 }
@@ -1803,7 +1894,7 @@ static int analyze_subprog(struct bpf_verifier_env *env,
 	/* For each reachable call site in the subprog, recurse into callees */
 	for (int p = po_start; p < po_end; p++) {
 		int idx = env->cfg.insn_postorder[p];
-		struct arg_track callee_args[BPF_REG_5 + 1];
+		struct arg_track callee_args[MAX_AT_TRACK_REGS] = {};
 		struct arg_track none = { .frame = ARG_NONE };
 		struct bpf_insn *insn = &insns[idx];
 		struct func_instance *callee_instance;
@@ -1818,9 +1909,11 @@ static int analyze_subprog(struct bpf_verifier_env *env,
 			if (callee < 0)
 				continue;
 
-			/* Build entry args: R1-R5 from at_in at call site */
+			/* Build entry args: R1-R5 and stack args from at_in at call site */
 			for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
 				callee_args[r] = info[subprog].at_in[j][r];
+			for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++)
+				callee_args[MAX_BPF_REG + r] = info[subprog].at_in[j][MAX_BPF_REG + r];
 		} else if (bpf_calls_callback(env, idx)) {
 			callee = find_callback_subprog(env, insn, idx, &caller_reg, &cb_callee_reg);
 			if (callee == -2) {
@@ -1842,6 +1935,8 @@ static int analyze_subprog(struct bpf_verifier_env *env,
 
 			for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
 				callee_args[r] = none;
+			for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++)
+				callee_args[MAX_BPF_REG + r] = none;
 			callee_args[cb_callee_reg] = info[subprog].at_in[j][caller_reg];
 		} else {
 			continue;
@@ -2085,7 +2180,7 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env,
 			def = ALL_CALLER_SAVED_REGS;
 			use = def & ~BIT(BPF_REG_0);
 			if (bpf_get_call_summary(env, insn, &cs))
-				use = GENMASK(cs.num_params, 1);
+				use = GENMASK(min_t(u8, cs.num_params, MAX_BPF_FUNC_REG_ARGS), 1);
 			break;
 		default:
 			def = 0;
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 011e4ec25acd..b740fa73ee26 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -13,17 +13,17 @@
 
 #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
 
-static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+static bool bpf_verifier_log_attr_valid(u32 log_level, char __user *log_buf, u32 log_size)
 {
 	/* ubuf and len_total should both be specified (or not) together */
-	if (!!log->ubuf != !!log->len_total)
+	if (!!log_buf != !!log_size)
 		return false;
 	/* log buf without log_level is meaningless */
-	if (log->ubuf && log->level == 0)
+	if (log_buf && log_level == 0)
 		return false;
-	if (log->level & ~BPF_LOG_MASK)
+	if (log_level & ~BPF_LOG_MASK)
 		return false;
-	if (log->len_total > UINT_MAX >> 2)
+	if (log_size > UINT_MAX >> 2)
 		return false;
 	return true;
 }
@@ -36,7 +36,7 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
 	log->len_total = log_size;
 
 	/* log attributes have to be sane */
-	if (!bpf_verifier_log_attr_valid(log))
+	if (!bpf_verifier_log_attr_valid(log_level, log_buf, log_size))
 		return -EINVAL;
 
 	return 0;
@@ -571,20 +571,20 @@ static void print_scalar_ranges(struct bpf_verifier_env *env,
 		u64 val;
 		bool omit;
 	} minmaxs[] = {
-		{"smin",   reg->smin_value,         reg->smin_value == S64_MIN},
-		{"smax",   reg->smax_value,         reg->smax_value == S64_MAX},
-		{"umin",   reg->umin_value,         reg->umin_value == 0},
-		{"umax",   reg->umax_value,         reg->umax_value == U64_MAX},
+		{"smin",   reg_smin(reg),         reg_smin(reg) == S64_MIN},
+		{"smax",   reg_smax(reg),         reg_smax(reg) == S64_MAX},
+		{"umin",   reg_umin(reg),         reg_umin(reg) == 0},
+		{"umax",   reg_umax(reg),         reg_umax(reg) == U64_MAX},
 		{"smin32",
-		 is_snum_decimal((s64)reg->s32_min_value)
-			 ? (s64)reg->s32_min_value
-			 : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN},
+		 is_snum_decimal((s64)reg_s32_min(reg))
+			 ? (s64)reg_s32_min(reg)
+			 : (u32)reg_s32_min(reg), reg_s32_min(reg) == S32_MIN},
 		{"smax32",
-		 is_snum_decimal((s64)reg->s32_max_value)
-			 ? (s64)reg->s32_max_value
-			 : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX},
-		{"umin32", reg->u32_min_value,      reg->u32_min_value == 0},
-		{"umax32", reg->u32_max_value,      reg->u32_max_value == U32_MAX},
+		 is_snum_decimal((s64)reg_s32_max(reg))
+			 ? (s64)reg_s32_max(reg)
+			 : (u32)reg_s32_max(reg), reg_s32_max(reg) == S32_MAX},
+		{"umin32", reg_u32_min(reg),      reg_u32_min(reg) == 0},
+		{"umax32", reg_u32_max(reg),      reg_u32_max(reg) == U32_MAX},
 	}, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
 	bool neg1, neg2;
 
@@ -665,8 +665,8 @@ static void print_reg_state(struct bpf_verifier_env *env,
 		verbose_a("id=%d", reg->id & ~BPF_ADD_CONST);
 	if (reg->id & BPF_ADD_CONST)
 		verbose(env, "%+d", reg->delta);
-	if (reg->ref_obj_id)
-		verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+	if (reg->parent_id)
+		verbose_a("parent_id=%d", reg->parent_id);
 	if (type_is_non_owning_ref(reg->type))
 		verbose_a("%s", "non_own_ref");
 	if (type_is_map_ptr(t)) {
@@ -768,21 +768,19 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
 			verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
 			if (reg->id)
 				verbose_a("id=%d", reg->id);
-			if (reg->ref_obj_id)
-				verbose_a("ref_id=%d", reg->ref_obj_id);
-			if (reg->dynptr_id)
-				verbose_a("dynptr_id=%d", reg->dynptr_id);
+			if (reg->parent_id)
+				verbose_a("parent_id=%d", reg->parent_id);
 			verbose(env, ")");
 			break;
 		case STACK_ITER:
-			/* only main slot has ref_obj_id set; skip others */
-			if (!reg->ref_obj_id)
+			/* only main slot has id set; skip others */
+			if (!reg->id)
 				continue;
 
-			verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)",
+			verbose(env, " fp%d=iter_%s(id=%d,state=%s,depth=%u)",
 				(-i - 1) * BPF_REG_SIZE,
 				iter_type_str(reg->iter.btf, reg->iter.btf_id),
-				reg->ref_obj_id, iter_state_str(reg->iter.state),
+				reg->id, iter_state_str(reg->iter.state),
 				reg->iter.depth);
 			break;
 		case STACK_MISC:
@@ -825,3 +823,81 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st
 	}
 	print_verifier_state(env, vstate, frameno, false);
 }
+
+int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level,
+		      u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common,
+		      bpfptr_t uattr_common, u32 size_common)
+{
+	char __user *ubuf_common = u64_to_user_ptr(common->log_buf);
+	char __user *ubuf = u64_to_user_ptr(log_buf);
+
+	if (!bpf_verifier_log_attr_valid(common->log_level, ubuf_common, common->log_size) ||
+	    !bpf_verifier_log_attr_valid(log_level, ubuf, log_size))
+		return -EINVAL;
+
+	if (ubuf && ubuf_common && (ubuf != ubuf_common || log_size != common->log_size ||
+				    log_level != common->log_level))
+		return -EINVAL;
+
+	memset(log, 0, sizeof(*log));
+	log->ubuf = ubuf;
+	log->size = log_size;
+	log->level = log_level;
+	log->offsetof_true_size = offsetof_log_true_size;
+	log->uattr = uattr;
+
+	if (!ubuf && ubuf_common) {
+		log->ubuf = ubuf_common;
+		log->size = common->log_size;
+		log->level = common->log_level;
+		log->uattr = uattr_common;
+		log->offsetof_true_size = 0;
+		if (size_common >= offsetofend(struct bpf_common_attr, log_true_size))
+			log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size);
+	}
+	return 0;
+}
+
+struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *attr_log,
+						  struct bpf_common_attr *common, bpfptr_t uattr,
+						  u32 size)
+{
+	struct bpf_verifier_log *log;
+	int err;
+
+	memset(attr_log, 0, sizeof(*attr_log));
+	attr_log->uattr = uattr;
+	if (size >= offsetofend(struct bpf_common_attr, log_true_size))
+		attr_log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size);
+
+	if (!size)
+		return NULL;
+
+	log = kzalloc_obj(*log, GFP_KERNEL);
+	if (!log)
+		return ERR_PTR(-ENOMEM);
+
+	err = bpf_vlog_init(log, common->log_level, u64_to_user_ptr(common->log_buf),
+			    common->log_size);
+	if (err) {
+		kfree(log);
+		return ERR_PTR(err);
+	}
+
+	return log;
+}
+
+int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log)
+{
+	u32 log_true_size;
+	int err;
+
+	err = bpf_vlog_finalize(log, &log_true_size);
+
+	if (attr->offsetof_true_size &&
+	    copy_to_bpfptr_offset(attr->uattr, attr->offsetof_true_size, &log_true_size,
+				  sizeof(log_true_size)))
+		return -EFAULT;
+
+	return err;
+}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 0f57608b385d..4d6f25db9ba1 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -246,7 +246,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 
 	/* Start walking the trie from the root node ... */
 
-	for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
+	for (node = rcu_dereference_check(trie->root, bpf_rcu_lock_held());
 	     node;) {
 		unsigned int next_bit;
 		size_t matchlen;
@@ -280,7 +280,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 		 */
 		next_bit = extract_bit(key->data, node->prefixlen);
 		node = rcu_dereference_check(node->child[next_bit],
-					     rcu_read_lock_bh_held());
+					     bpf_rcu_lock_held());
 	}
 
 	if (!found)
@@ -359,7 +359,7 @@ static long trie_update_elem(struct bpf_map *map,
 	 */
 	slot = &trie->root;
 
-	while ((node = rcu_dereference(*slot))) {
+	while ((node = rcu_dereference_protected(*slot, 1))) {
 		matchlen = longest_prefix_match(trie, node, key);
 
 		if (node->prefixlen != matchlen ||
@@ -482,7 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
 	trim = &trie->root;
 	trim2 = trim;
 	parent = NULL;
-	while ((node = rcu_dereference(*trim))) {
+	while ((node = rcu_dereference_protected(*trim, 1))) {
 		matchlen = longest_prefix_match(trie, node, key);
 
 		if (node->prefixlen != matchlen ||
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 645bd30bc9a9..d2cbab4bdf64 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -20,7 +20,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 	/* Does not support >1 level map-in-map */
 	if (inner_map->inner_map_meta)
 		return ERR_PTR(-EINVAL);
-
+	if (inner_map->excl_prog_sha)
+		return ERR_PTR(-ENOTSUPP);
 	if (!inner_map->ops->map_meta_equal)
 		return ERR_PTR(-ENOTSUPP);
 
@@ -101,6 +102,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
 	inner_map = __bpf_map_get(f);
 	if (IS_ERR(inner_map))
 		return inner_map;
+	if (inner_map->excl_prog_sha)
+		return ERR_PTR(-ENOTSUPP);
 
 	inner_map_meta = map->inner_map_meta;
 	if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map))
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 261a03ea73d3..c19b360bad9e 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -112,6 +112,10 @@ static int bpf_iter_attach_map(struct bpf_prog *prog,
 	map = bpf_map_get_with_uref(linfo->map.map_fd);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
+	if (map->excl_prog_sha) {
+		err = -EPERM;
+		goto put_map;
+	}
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -119,7 +123,8 @@ static int bpf_iter_attach_map(struct bpf_prog *prog,
 		is_percpu = true;
 	else if (map->map_type != BPF_MAP_TYPE_HASH &&
 		 map->map_type != BPF_MAP_TYPE_LRU_HASH &&
-		 map->map_type != BPF_MAP_TYPE_ARRAY)
+		 map->map_type != BPF_MAP_TYPE_ARRAY &&
+		 map->map_type != BPF_MAP_TYPE_RHASH)
 		goto put_map;
 
 	key_acc_size = prog->aux->max_rdonly_access;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index da3d328f5c15..77ba03216c09 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,6 +9,7 @@
 #include <linux/perf_event.h>
 #include <linux/btf_ids.h>
 #include <linux/buildid.h>
+#include <linux/mmap_lock.h>
 #include "percpu_freelist.h"
 #include "mmap_unlock_work.h"
 
@@ -152,6 +153,180 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b
 			 : build_id_parse_nofault(vma, build_id, NULL);
 }
 
+static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id)
+{
+	id->status = BPF_STACK_BUILD_ID_IP;
+	memset(id->build_id, 0, BUILD_ID_SIZE_MAX);
+}
+
+static inline u64 stack_map_build_id_offset(unsigned long vm_pgoff,
+					    unsigned long vm_start, u64 ip)
+{
+	return (vm_pgoff << PAGE_SHIFT) + ip - vm_start;
+}
+
+static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id,
+						u64 offset,
+						const unsigned char *build_id)
+{
+	id->status = BPF_STACK_BUILD_ID_VALID;
+	id->offset = offset;
+	if (id->build_id != build_id)
+		memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX);
+}
+
+struct stack_map_vma_lock {
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+};
+
+/*
+ * Acquire a stable read-side reference on the VMA covering @ip.
+ *
+ * With CONFIG_PER_VMA_LOCK=y this returns a VMA with its per-VMA read
+ * lock held and mmap_lock dropped, so the caller may sleep.
+ *
+ * With CONFIG_PER_VMA_LOCK=n it returns a VMA with mmap_lock still
+ * held; the caller must snapshot any fields it needs and pin vm_file
+ * with get_file() before stack_map_unlock_vma() drops mmap_lock, as
+ * the VMA may be split, merged, or freed after that.
+ *
+ * Returns NULL on failure, in which case no lock is held.
+ */
+static struct vm_area_struct *
+stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip)
+{
+	struct mm_struct *mm = lock->mm;
+	struct vm_area_struct *vma;
+
+	/* noop under !CONFIG_PER_VMA_LOCK */
+	vma = lock_vma_under_rcu(mm, ip);
+	if (vma) {
+		lock->vma = vma;
+		return vma;
+	}
+
+	/*
+	 * Taking mmap_read_lock() is unsafe here, because the caller BPF
+	 * program might already hold it, causing a deadlock.
+	 */
+	if (!mmap_read_trylock(mm))
+		return NULL;
+
+	vma = vma_lookup(mm, ip);
+	if (!vma) {
+		mmap_read_unlock(mm);
+		return NULL;
+	}
+
+#ifdef CONFIG_PER_VMA_LOCK
+	if (!vma_start_read_locked(vma)) {
+		mmap_read_unlock(mm);
+		return NULL;
+	}
+	mmap_read_unlock(mm);
+#endif
+
+	lock->vma = vma;
+	return vma;
+}
+
+static void stack_map_unlock_vma(struct stack_map_vma_lock *lock)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+	vma_end_read(lock->vma);
+#else
+	mmap_read_unlock(lock->mm);
+#endif
+	lock->vma = NULL;
+}
+
+static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs,
+						    u32 trace_nr)
+{
+	struct mm_struct *mm = current->mm;
+	struct stack_map_vma_lock lock = { .mm = mm };
+	struct {
+		struct file *file;
+		const unsigned char *build_id;
+		unsigned long vm_start;
+		unsigned long vm_end;
+		unsigned long vm_pgoff;
+	} cache = {};
+	unsigned long vm_pgoff, vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct file *file;
+	u64 offset;
+	u64 ip;
+
+	for (u32 i = 0; i < trace_nr; i++) {
+		ip = READ_ONCE(id_offs[i].ip);
+
+		/*
+		 * Range cache fast path: if ip falls within the previously
+		 * resolved VMA range, reuse the cached build_id without
+		 * re-acquiring the VMA lock.
+		 */
+		if (cache.build_id && ip >= cache.vm_start && ip < cache.vm_end) {
+			offset = stack_map_build_id_offset(cache.vm_pgoff, cache.vm_start, ip);
+			stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id);
+			continue;
+		}
+
+		vma = stack_map_lock_vma(&lock, ip);
+		if (!vma) {
+			stack_map_build_id_set_ip(&id_offs[i]);
+			continue;
+		}
+		if (vma_is_anonymous(vma) || !vma->vm_file) {
+			stack_map_build_id_set_ip(&id_offs[i]);
+			stack_map_unlock_vma(&lock);
+			continue;
+		}
+
+		file = vma->vm_file;
+		vm_pgoff = vma->vm_pgoff;
+		vm_start = vma->vm_start;
+		vm_end = vma->vm_end;
+		offset = stack_map_build_id_offset(vm_pgoff, vm_start, ip);
+
+		/*
+		 * Same backing file as previous (e.g. different VMAs
+		 * of the same ELF binary). Reuse the cached build_id.
+		 */
+		if (file == cache.file) {
+			stack_map_unlock_vma(&lock);
+			stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id);
+			cache.vm_start = vm_start;
+			cache.vm_end = vm_end;
+			cache.vm_pgoff = vm_pgoff;
+			continue;
+		}
+
+		file = get_file(file);
+		stack_map_unlock_vma(&lock);
+
+		/* build_id_parse_file() may block on filesystem reads */
+		if (build_id_parse_file(file, id_offs[i].build_id, NULL)) {
+			stack_map_build_id_set_ip(&id_offs[i]);
+			fput(file);
+			continue;
+		}
+
+		stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id);
+		if (cache.file)
+			fput(cache.file);
+		cache.file = file;
+		cache.build_id = id_offs[i].build_id;
+		cache.vm_start = vm_start;
+		cache.vm_end = vm_end;
+		cache.vm_pgoff = vm_pgoff;
+	}
+
+	if (cache.file)
+		fput(cache.file);
+}
+
 /*
  * Expects all id_offs[i].ip values to be set to correct initial IPs.
  * They will be subsequently:
@@ -165,44 +340,50 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b
 static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u32 trace_nr, bool user, bool may_fault)
 {
-	int i;
 	struct mmap_unlock_irq_work *work = NULL;
 	bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+	bool has_user_ctx = user && current && current->mm;
 	struct vm_area_struct *vma, *prev_vma = NULL;
-	const char *prev_build_id;
+	const unsigned char *prev_build_id = NULL;
+	int i;
+
+	if (may_fault && has_user_ctx) {
+		stack_map_get_build_id_offset_sleepable(id_offs, trace_nr);
+		return;
+	}
 
 	/* If the irq_work is in use, fall back to report ips. Same
 	 * fallback is used for kernel stack (!user) on a stackmap with
 	 * build_id.
 	 */
-	if (!user || !current || !current->mm || irq_work_busy ||
-	    !mmap_read_trylock(current->mm)) {
+	if (!has_user_ctx || irq_work_busy || !mmap_read_trylock(current->mm)) {
 		/* cannot access current->mm, fall back to ips */
-		for (i = 0; i < trace_nr; i++) {
-			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
-			memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
-		}
+		for (i = 0; i < trace_nr; i++)
+			stack_map_build_id_set_ip(&id_offs[i]);
 		return;
 	}
 
 	for (i = 0; i < trace_nr; i++) {
 		u64 ip = READ_ONCE(id_offs[i].ip);
+		u64 offset;
 
-		if (range_in_vma(prev_vma, ip, ip)) {
+		if (prev_build_id && range_in_vma(prev_vma, ip, ip)) {
 			vma = prev_vma;
-			memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
-			goto build_id_valid;
+			offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip);
+			stack_map_build_id_set_valid(&id_offs[i], offset, prev_build_id);
+			continue;
 		}
 		vma = find_vma(current->mm, ip);
-		if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
+		if (!vma || vma_is_anonymous(vma) ||
+		    fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
 			/* per entry fall back to ips */
-			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
-			memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
+			stack_map_build_id_set_ip(&id_offs[i]);
+			prev_vma = vma;
+			prev_build_id = NULL;
 			continue;
 		}
-build_id_valid:
-		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start;
-		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+		offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip);
+		stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id);
 		prev_vma = vma;
 		prev_build_id = id_offs[i].build_id;
 	}
diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c
index 8478d2c6ed5b..32f346ce3ffc 100644
--- a/kernel/bpf/states.c
+++ b/kernel/bpf/states.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
 #include <linux/bpf.h>
 #include <linux/bpf_verifier.h>
+#include <linux/cnum.h>
 #include <linux/filter.h>
 
 #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
@@ -301,14 +302,8 @@ int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_s
 static bool range_within(const struct bpf_reg_state *old,
 			 const struct bpf_reg_state *cur)
 {
-	return old->umin_value <= cur->umin_value &&
-	       old->umax_value >= cur->umax_value &&
-	       old->smin_value <= cur->smin_value &&
-	       old->smax_value >= cur->smax_value &&
-	       old->u32_min_value <= cur->u32_min_value &&
-	       old->u32_max_value >= cur->u32_max_value &&
-	       old->s32_min_value <= cur->s32_min_value &&
-	       old->s32_max_value >= cur->s32_max_value;
+	return cnum64_is_subset(old->r64, cur->r64) &&
+	       cnum32_is_subset(old->r32, cur->r32);
 }
 
 /* If in the old state two registers had the same id, then they need to have
@@ -348,8 +343,12 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
 		return true;
 	}
 
-	/* We ran out of idmap slots, which should be impossible */
-	WARN_ON_ONCE(1);
+	/*
+	 * idmap slots are bounded by the number of registers and stack slots.
+	 * Referenced dynptrs acquire intermediate references that do not
+	 * live in either, so the map can be exhausted. Since this is unlikely,
+	 * fail the verification by treating the states as not equivalent.
+	 */
 	return false;
 }
 
@@ -494,7 +493,7 @@ static bool regs_exact(const struct bpf_reg_state *rold,
 {
 	return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
 	       check_ids(rold->id, rcur->id, idmap) &&
-	       check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+	       check_ids(rold->parent_id, rcur->parent_id, idmap);
 }
 
 enum exact_level {
@@ -619,7 +618,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 		       range_within(rold, rcur) &&
 		       tnum_in(rold->var_off, rcur->var_off) &&
 		       check_ids(rold->id, rcur->id, idmap) &&
-		       check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+		       check_ids(rold->parent_id, rcur->parent_id, idmap);
 	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET:
 		/* We must have at least as much range as the old ptr
@@ -799,7 +798,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			cur_reg = &cur->stack[spi].spilled_ptr;
 			if (old_reg->dynptr.type != cur_reg->dynptr.type ||
 			    old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
-			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+			    !check_ids(old_reg->id, cur_reg->id, idmap) ||
+			    !check_ids(old_reg->parent_id, cur_reg->parent_id, idmap))
 				return false;
 			break;
 		case STACK_ITER:
@@ -815,13 +815,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			    old_reg->iter.btf_id != cur_reg->iter.btf_id ||
 			    old_reg->iter.state != cur_reg->iter.state ||
 			    /* ignore {old_reg,cur_reg}->iter.depth, see above */
-			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+			    !check_ids(old_reg->id, cur_reg->id, idmap))
 				return false;
 			break;
 		case STACK_IRQ_FLAG:
 			old_reg = &old->stack[spi].spilled_ptr;
 			cur_reg = &cur->stack[spi].spilled_ptr;
-			if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) ||
+			if (!check_ids(old_reg->id, cur_reg->id, idmap) ||
 			    old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class)
 				return false;
 			break;
@@ -838,6 +838,32 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 	return true;
 }
 
+/*
+ * Compare stack arg slots between old and current states.
+ * Outgoing stack args are path-local state and must agree for pruning.
+ */
+static bool stack_arg_safe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+			   struct bpf_func_state *cur, struct bpf_idmap *idmap,
+			   enum exact_level exact)
+{
+	int i, nslots;
+
+	nslots = max(old->out_stack_arg_cnt, cur->out_stack_arg_cnt);
+	for (i = 0; i < nslots; i++) {
+		struct bpf_reg_state *old_arg, *cur_arg;
+		struct bpf_reg_state not_init = { .type = NOT_INIT };
+
+		old_arg = i < old->out_stack_arg_cnt ?
+			  &old->stack_arg_regs[i] : &not_init;
+		cur_arg = i < cur->out_stack_arg_cnt ?
+			  &cur->stack_arg_regs[i] : &not_init;
+		if (!regsafe(env, old_arg, cur_arg, idmap, exact))
+			return false;
+	}
+
+	return true;
+}
+
 static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
 		    struct bpf_idmap *idmap)
 {
@@ -868,6 +894,9 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c
 			return false;
 		switch (old->refs[i].type) {
 		case REF_TYPE_PTR:
+			if (!check_ids(old->refs[i].parent_id, cur->refs[i].parent_id, idmap))
+				return false;
+			break;
 		case REF_TYPE_IRQ:
 			break;
 		case REF_TYPE_LOCK:
@@ -920,6 +949,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 	if (old->callback_depth > cur->callback_depth)
 		return false;
 
+	if (!old->no_stack_arg_load && cur->no_stack_arg_load)
+		return false;
+
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (((1 << i) & live_regs) &&
 		    !regsafe(env, &old->regs[i], &cur->regs[i],
@@ -929,6 +961,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 	if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
 		return false;
 
+	if (!stack_arg_safe(env, old, cur, &env->idmap_scratch, exact))
+		return false;
+
 	return true;
 }
 
@@ -1376,7 +1411,7 @@ hit:
 			 */
 			err = 0;
 			if (bpf_is_jmp_point(env, env->insn_idx))
-				err = bpf_push_jmp_history(env, cur, 0, 0);
+				err = bpf_push_jmp_history(env, cur, 0, 0, 0, 0);
 			err = err ? : propagate_precision(env, &sl->state, cur, NULL);
 			if (err)
 				return err;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 630d530782fe..b44106c8ea75 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -41,6 +41,7 @@
 #include <linux/overflow.h>
 #include <linux/cookie.h>
 #include <linux/verification.h>
+#include <linux/btf_ids.h>
 
 #include <net/netfilter/nf_bpf_link.h>
 #include <net/netkit.h>
@@ -807,6 +808,11 @@ void bpf_obj_free_task_work(const struct btf_record *rec, void *obj)
 	bpf_task_work_cancel_and_free(obj + rec->task_work_off);
 }
 
+void bpf_obj_cancel_fields(struct bpf_map *map, void *obj)
+{
+	bpf_map_free_internal_structs(map, obj);
+}
+
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 {
 	const struct btf_field *fields;
@@ -1280,6 +1286,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 			case BPF_SPIN_LOCK:
 			case BPF_RES_SPIN_LOCK:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
+				    map->map_type != BPF_MAP_TYPE_RHASH &&
 				    map->map_type != BPF_MAP_TYPE_ARRAY &&
 				    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
 				    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
@@ -1294,6 +1301,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 			case BPF_WORKQUEUE:
 			case BPF_TASK_WORK:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
+				    map->map_type != BPF_MAP_TYPE_RHASH &&
 				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_ARRAY) {
 					ret = -EOPNOTSUPP;
@@ -1305,6 +1313,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 			case BPF_KPTR_PERCPU:
 			case BPF_REFCOUNT:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
+				    map->map_type != BPF_MAP_TYPE_RHASH &&
 				    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
@@ -1359,7 +1368,8 @@ free_map_tab:
 
 #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
 /* called via syscall */
-static int map_create(union bpf_attr *attr, bpfptr_t uattr)
+static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log,
+			    struct bpf_map **mapp, struct bpf_token **tokenp)
 {
 	const struct bpf_map_ops *ops;
 	struct bpf_token *token = NULL;
@@ -1367,12 +1377,13 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
 	u32 map_type = attr->map_type;
 	struct bpf_map *map;
 	bool token_flag;
-	int f_flags;
 	int err;
 
 	err = CHECK_ATTR(BPF_MAP_CREATE);
-	if (err)
+	if (err) {
+		bpf_log(log, "Invalid attr.\n");
 		return -EINVAL;
+	}
 
 	/* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
 	 * to avoid per-map type checks tripping on unknown flag
@@ -1381,31 +1392,40 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
 	attr->map_flags &= ~BPF_F_TOKEN_FD;
 
 	if (attr->btf_vmlinux_value_type_id) {
-		if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
-		    attr->btf_key_type_id || attr->btf_value_type_id)
+		if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
+			bpf_log(log, "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n");
 			return -EINVAL;
+		}
+		if (attr->btf_key_type_id || attr->btf_value_type_id) {
+			bpf_log(log, "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n");
+			return -EINVAL;
+		}
 	} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
+		bpf_log(log, "Invalid btf_value_type_id.\n");
 		return -EINVAL;
 	}
 
 	if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
 	    attr->map_type != BPF_MAP_TYPE_ARENA &&
-	    attr->map_extra != 0)
+	    attr->map_type != BPF_MAP_TYPE_RHASH &&
+	    attr->map_extra != 0) {
+		bpf_log(log, "Invalid map_extra.\n");
 		return -EINVAL;
-
-	f_flags = bpf_get_file_flag(attr->map_flags);
-	if (f_flags < 0)
-		return f_flags;
+	}
 
 	if (numa_node != NUMA_NO_NODE &&
 	    ((unsigned int)numa_node >= nr_node_ids ||
-	     !node_online(numa_node)))
+	     !node_online(numa_node))) {
+		bpf_log(log, "Invalid numa_node.\n");
 		return -EINVAL;
+	}
 
 	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
 	map_type = attr->map_type;
-	if (map_type >= ARRAY_SIZE(bpf_map_types))
+	if (map_type >= ARRAY_SIZE(bpf_map_types)) {
+		bpf_log(log, "Invalid map_type.\n");
 		return -EINVAL;
+	}
 	map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
 	ops = bpf_map_types[map_type];
 	if (!ops)
@@ -1423,8 +1443,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
 
 	if (token_flag) {
 		token = bpf_token_get_from_fd(attr->map_token_fd);
-		if (IS_ERR(token))
+		if (IS_ERR(token)) {
+			bpf_log(log, "Invalid map_token_fd.\n");
 			return PTR_ERR(token);
+		}
 
 		/* if current token doesn't grant map creation permissions,
 		 * then we can't use this token, so ignore it and rely on
@@ -1457,6 +1479,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH:
+	case BPF_MAP_TYPE_RHASH:
 	case BPF_MAP_TYPE_PERCPU_HASH:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 	case BPF_MAP_TYPE_RINGBUF:
@@ -1507,8 +1530,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
 
 	err = bpf_obj_name_cpy(map->name, attr->map_name,
 			       sizeof(attr->map_name));
-	if (err < 0)
+	if (err < 0) {
+		bpf_log(log, "Invalid map_name.\n");
 		goto free_map;
+	}
 
 	preempt_disable();
 	map->cookie = gen_cookie_next(&bpf_map_cookie);
@@ -1531,6 +1556,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
 
 		btf = btf_get_by_fd(attr->btf_fd);
 		if (IS_ERR(btf)) {
+			bpf_log(log, "Invalid btf_fd.\n");
 			err = PTR_ERR(btf);
 			goto free_map;
 		}
@@ -1558,6 +1584,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
 		bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel);
 
 		if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) {
+			bpf_log(log, "Invalid excl_prog_hash_size.\n");
 			err = -EINVAL;
 			goto free_map;
 		}
@@ -1572,11 +1599,62 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
 			err = -EFAULT;
 			goto free_map;
 		}
+
+		/* See libbpf: emit_signature_match() */
+		BUILD_BUG_ON(offsetof(struct bpf_map, excl) != SHA256_DIGEST_SIZE);
+		BUILD_BUG_ON(!__same_type(map->excl, u32));
+		BUILD_BUG_ON(offsetof(struct bpf_map, sha)  != 0);
+		BUILD_BUG_ON(!__same_type(map->sha, u8[SHA256_DIGEST_SIZE]));
+		map->excl = 1;
 	} else if (attr->excl_prog_hash_size) {
+		bpf_log(log, "Invalid excl_prog_hash_size.\n");
 		err = -EINVAL;
 		goto free_map;
 	}
 
+	*mapp = map;
+	*tokenp = token;
+	return 0;
+
+free_map:
+	bpf_map_free(map);
+put_token:
+	bpf_token_put(token);
+	return err;
+}
+
+static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common,
+		      bpfptr_t uattr_common, u32 size_common)
+{
+	struct bpf_token *token = NULL;
+	struct bpf_verifier_log *log;
+	struct bpf_log_attr attr_log;
+	struct bpf_map *map = NULL;
+	int err, ret;
+	int f_flags;
+
+	log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common);
+	if (IS_ERR(log))
+		return PTR_ERR(log);
+
+	err = map_create_alloc(attr, uattr, log, &map, &token);
+
+	/* keep the original error unless log finalization itself fails */
+	ret = bpf_log_attr_finalize(&attr_log, log);
+	if (ret)
+		err = ret;
+
+	kfree(log);
+
+	if (err)
+		goto free_map;
+
+	f_flags = bpf_get_file_flag(attr->map_flags);
+	if (f_flags < 0) {
+		err = f_flags;
+		goto free_map;
+	}
+
 	err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
 	if (err)
 		goto free_map_sec;
@@ -1605,8 +1683,8 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr)
 free_map_sec:
 	security_bpf_map_free(map);
 free_map:
-	bpf_map_free(map);
-put_token:
+	if (map)
+		bpf_map_free(map);
 	bpf_token_put(token);
 	return err;
 }
@@ -2192,6 +2270,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 		   map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 		   map->map_type == BPF_MAP_TYPE_LRU_HASH ||
 		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+		   map->map_type == BPF_MAP_TYPE_RHASH ||
 		   map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
 		if (!bpf_map_is_offloaded(map)) {
 			bpf_disable_instrumentation();
@@ -2646,7 +2725,8 @@ static int
 bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 			   enum bpf_attach_type expected_attach_type,
 			   struct btf *attach_btf, u32 btf_id,
-			   struct bpf_prog *dst_prog)
+			   struct bpf_prog *dst_prog,
+			   bool multi_func)
 {
 	if (btf_id) {
 		if (btf_id > BTF_MAX_TYPE)
@@ -2666,6 +2746,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		}
 	}
 
+	if (multi_func) {
+		if (prog_type != BPF_PROG_TYPE_TRACING)
+			return -EINVAL;
+		if (!attach_btf || btf_id)
+			return -EINVAL;
+		return 0;
+	}
+
 	if (attach_btf && (!btf_id || dst_prog))
 		return -EINVAL;
 
@@ -2798,8 +2886,22 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 	}
 }
 
+static enum bpf_sig_keyring bpf_classify_keyring(s32 keyring_id)
+{
+	switch (keyring_id) {
+	case 0:
+		return BPF_SIG_KEYRING_BUILTIN;
+	case (s32)(unsigned long)VERIFY_USE_SECONDARY_KEYRING:
+		return BPF_SIG_KEYRING_SECONDARY;
+	case (s32)(unsigned long)VERIFY_USE_PLATFORM_KEYRING:
+		return BPF_SIG_KEYRING_PLATFORM;
+	default:
+		return BPF_SIG_KEYRING_USER;
+	}
+}
+
 static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr,
-				     bool is_kernel)
+				     bool is_kernel, s32 *keyring_serial)
 {
 	bpfptr_t usig = make_bpfptr(attr->signature, is_kernel);
 	struct bpf_dynptr_kern sig_ptr, insns_ptr;
@@ -2835,7 +2937,8 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
 
 	err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr,
 					 (struct bpf_dynptr *)&sig_ptr, key);
-
+	if (!err)
+		*keyring_serial = bpf_key_serial(key);
 	bpf_key_put(key);
 	kvfree(sig);
 	return err;
@@ -2858,10 +2961,15 @@ static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog)
 	return 0;
 }
 
+extern int bpf_multi_func(void);
+int __init __used bpf_multi_func(void) { return 0; }
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_multi_func_btf_id, func, bpf_multi_func)
+
 /* last field in 'union bpf_attr' used by this command */
 #define BPF_PROG_LOAD_LAST_FIELD keyring_id
 
-static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog, *dst_prog = NULL;
@@ -2870,6 +2978,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	bool bpf_cap;
 	int err;
 	char license[128];
+	bool multi_func;
 
 	if (CHECK_ATTR(BPF_PROG_LOAD))
 		return -EINVAL;
@@ -2936,6 +3045,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
 		goto put_token;
 
+	multi_func = is_tracing_multi(attr->expected_attach_type);
+
 	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
 	 * or btf, we need to check which one it is
 	 */
@@ -2957,7 +3068,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 				goto put_token;
 			}
 		}
-	} else if (attr->attach_btf_id) {
+	} else if (attr->attach_btf_id || multi_func) {
 		/* fall back to vmlinux BTF, if BTF type ID is specified */
 		attach_btf = bpf_get_btf_vmlinux();
 		if (IS_ERR(attach_btf)) {
@@ -2973,7 +3084,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 
 	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
 				       attach_btf, attr->attach_btf_id,
-				       dst_prog)) {
+				       dst_prog, multi_func)) {
 		if (dst_prog)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
@@ -2996,7 +3107,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->expected_attach_type = attr->expected_attach_type;
 	prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
 	prog->aux->attach_btf = attach_btf;
-	prog->aux->attach_btf_id = attr->attach_btf_id;
+	prog->aux->attach_btf_id = multi_func ? bpf_multi_func_btf_id[0] : attr->attach_btf_id;
 	prog->aux->dst_prog = dst_prog;
 	prog->aux->dev_bound = !!attr->prog_ifindex;
 	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
@@ -3022,13 +3133,17 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
 	prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
-
 	if (attr->signature) {
-		err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel);
+		err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel,
+						&prog->aux->sig.keyring_serial);
 		if (err)
 			goto free_prog;
+		prog->aux->sig.keyring_type = bpf_classify_keyring(attr->keyring_id);
+		prog->aux->sig.verdict = BPF_SIG_VERIFIED;
+	} else {
+		prog->aux->sig.keyring_type = BPF_SIG_KEYRING_NONE;
+		prog->aux->sig.verdict = BPF_SIG_UNSIGNED;
 	}
-
 	prog->orig_prog = NULL;
 	prog->jited = 0;
 
@@ -3076,10 +3191,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 
 	err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel);
 	if (err)
-		goto free_prog_sec;
+		goto free_prog;
 
 	/* run eBPF verifier */
-	err = bpf_check(&prog, attr, uattr, uattr_size);
+	err = bpf_check(&prog, attr, uattr, attr_log);
 	if (err < 0)
 		goto free_used_maps;
 
@@ -3122,8 +3237,6 @@ free_used_maps:
 	__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
 	return err;
 
-free_prog_sec:
-	security_bpf_prog_free(prog);
 free_prog:
 	free_uid(prog->aux->user);
 	if (prog->aux->attach_btf)
@@ -3198,6 +3311,15 @@ void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
 	bpf_link_init_sleepable(link, type, ops, prog, attach_type, false);
 }
 
+void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type,
+			 const struct bpf_link_ops *ops, struct bpf_prog *prog,
+			 enum bpf_attach_type attach_type, u64 cookie)
+{
+	bpf_link_init(&link->link, type, ops, prog, attach_type);
+	link->node.link = &link->link;
+	link->node.cookie = cookie;
+}
+
 static void bpf_link_free_id(int id)
 {
 	if (!id)
@@ -3358,7 +3480,7 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
 			seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ?
 				   "kretprobe_multi" : "kprobe_multi");
 		else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI)
-			seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ?
+			seq_printf(m, "link_type:\t%s\n", link->flags & BPF_F_UPROBE_MULTI_RETURN ?
 				   "uretprobe_multi" : "uprobe_multi");
 		else
 			seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
@@ -3505,7 +3627,7 @@ static void bpf_tracing_link_release(struct bpf_link *link)
 	struct bpf_tracing_link *tr_link =
 		container_of(link, struct bpf_tracing_link, link.link);
 
-	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
+	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link.node,
 						tr_link->trampoline,
 						tr_link->tgt_prog));
 
@@ -3518,8 +3640,7 @@ static void bpf_tracing_link_release(struct bpf_link *link)
 
 static void bpf_tracing_link_dealloc(struct bpf_link *link)
 {
-	struct bpf_tracing_link *tr_link =
-		container_of(link, struct bpf_tracing_link, link.link);
+	struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link);
 
 	kfree(tr_link);
 }
@@ -3527,8 +3648,8 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link)
 static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
 					 struct seq_file *seq)
 {
-	struct bpf_tracing_link *tr_link =
-		container_of(link, struct bpf_tracing_link, link.link);
+	struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link);
+
 	u32 target_btf_id, target_obj_id;
 
 	bpf_trampoline_unpack_key(tr_link->trampoline->key,
@@ -3541,17 +3662,16 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
 		   link->attach_type,
 		   target_obj_id,
 		   target_btf_id,
-		   tr_link->link.cookie);
+		   tr_link->link.node.cookie);
 }
 
 static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
 					   struct bpf_link_info *info)
 {
-	struct bpf_tracing_link *tr_link =
-		container_of(link, struct bpf_tracing_link, link.link);
+	struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link);
 
 	info->tracing.attach_type = link->attach_type;
-	info->tracing.cookie = tr_link->link.cookie;
+	info->tracing.cookie = tr_link->link.node.cookie;
 	bpf_trampoline_unpack_key(tr_link->trampoline->key,
 				  &info->tracing.target_obj_id,
 				  &info->tracing.target_btf_id);
@@ -3633,29 +3753,18 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 		key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
 	}
 
-	if (prog->expected_attach_type == BPF_TRACE_FSESSION) {
-		struct bpf_fsession_link *fslink;
-
-		fslink = kzalloc_obj(*fslink, GFP_USER);
-		if (fslink) {
-			bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING,
-				      &bpf_tracing_link_lops, prog, attach_type);
-			fslink->fexit.cookie = bpf_cookie;
-			link = &fslink->link;
-		} else {
-			link = NULL;
-		}
-	} else {
-		link = kzalloc_obj(*link, GFP_USER);
-	}
+	link = kzalloc_obj(*link, GFP_USER);
 	if (!link) {
 		err = -ENOMEM;
 		goto out_put_prog;
 	}
-	bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
-		      &bpf_tracing_link_lops, prog, attach_type);
+	bpf_tramp_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+			    &bpf_tracing_link_lops, prog, attach_type, bpf_cookie);
 
-	link->link.cookie = bpf_cookie;
+	if (prog->expected_attach_type == BPF_TRACE_FSESSION) {
+		link->fexit.link = &link->link.link;
+		link->fexit.cookie = bpf_cookie;
+	}
 
 	mutex_lock(&prog->aux->dst_mutex);
 
@@ -3758,7 +3867,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 	if (err)
 		goto out_unlock;
 
-	err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog);
+	err = bpf_trampoline_link_prog(&link->link.node, tr, tgt_prog);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
 		link = NULL;
@@ -4281,6 +4390,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
 	if (!btp)
 		return -ENOENT;
 
+	if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) {
+		bpf_put_raw_tracepoint(btp);
+		return -EINVAL;
+	}
+
 	link = kzalloc_obj(*link, GFP_USER);
 	if (!link) {
 		err = -ENOMEM;
@@ -4389,6 +4503,9 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
 	case BPF_TRACE_FSESSION:
+	case BPF_TRACE_FSESSION_MULTI:
+	case BPF_TRACE_FENTRY_MULTI:
+	case BPF_TRACE_FEXIT_MULTI:
 	case BPF_MODIFY_RETURN:
 		return BPF_PROG_TYPE_TRACING;
 	case BPF_LSM_MAC:
@@ -4654,7 +4771,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 #define BPF_PROG_QUERY_LAST_FIELD query.revision
 
 static int bpf_prog_query(const union bpf_attr *attr,
-			  union bpf_attr __user *uattr)
+			  union bpf_attr __user *uattr, u32 uattr_size)
 {
 	if (!bpf_net_capable())
 		return -EPERM;
@@ -4693,7 +4810,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
 	case BPF_LSM_CGROUP:
-		return cgroup_bpf_prog_query(attr, uattr);
+		return cgroup_bpf_prog_query(attr, uattr, uattr_size);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);
 	case BPF_FLOW_DISSECTOR:
@@ -5045,10 +5162,11 @@ static int bpf_prog_get_info_by_fd(struct file *file,
 	u32 info_len = attr->info.info_len;
 	struct bpf_prog_kstats stats;
 	char __user *uinsns;
-	u32 ulen;
+	u32 ulen, len;
 	int err;
 
-	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
+	len = offsetofend(struct bpf_prog_info, attach_btf_id);
+	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len);
 	if (err)
 		return err;
 	info_len = min_t(u32, sizeof(info), info_len);
@@ -5330,10 +5448,11 @@ static int bpf_map_get_info_by_fd(struct file *file,
 {
 	struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
 	struct bpf_map_info info;
-	u32 info_len = attr->info.info_len;
+	u32 info_len = attr->info.info_len, len;
 	int err;
 
-	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
+	len = offsetofend(struct bpf_map_info, hash_size);
+	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len);
 	if (err)
 		return err;
 	info_len = min_t(u32, sizeof(info), info_len);
@@ -5371,18 +5490,16 @@ static int bpf_map_get_info_by_fd(struct file *file,
 
 		if (!map->ops->map_get_hash)
 			return -EINVAL;
-
-		if (info.hash_size != SHA256_DIGEST_SIZE)
+		if (info.hash_size != sizeof(map->sha))
 			return -EINVAL;
-
 		if (!READ_ONCE(map->frozen))
 			return -EPERM;
 
-		err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha);
+		err = map->ops->map_get_hash(map);
 		if (err != 0)
 			return err;
 
-		if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0)
+		if (copy_to_user(uhash, map->sha, sizeof(map->sha)) != 0)
 			return -EFAULT;
 	} else if (info.hash_size) {
 		return -EINVAL;
@@ -5495,7 +5612,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 
 #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
 
-static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log)
 {
 	struct bpf_token *token = NULL;
 
@@ -5522,7 +5639,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_
 
 	bpf_token_put(token);
 
-	return btf_new_fd(attr, uattr, uattr_size);
+	return btf_new_fd(attr, uattr, attr_log);
 }
 
 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd
@@ -5723,7 +5840,7 @@ err_put:
 	return err;
 }
 
-#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
+#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.path_fd
 static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 {
 	struct bpf_prog *prog;
@@ -5774,6 +5891,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 			ret = bpf_iter_link_attach(attr, uattr, prog);
 		else if (prog->expected_attach_type == BPF_LSM_CGROUP)
 			ret = cgroup_bpf_link_attach(attr, prog);
+		else if (is_tracing_multi(prog->expected_attach_type))
+			ret = bpf_tracing_multi_attach(prog, attr);
 		else
 			ret = bpf_tracing_prog_attach(prog,
 						      attr->link_create.target_fd,
@@ -6232,8 +6351,12 @@ put_prog:
 	return ret;
 }
 
-static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
+static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size,
+		     bpfptr_t uattr_common, unsigned int size_common)
 {
+	struct bpf_common_attr attr_common;
+	u32 offsetof_log_true_size = 0;
+	struct bpf_log_attr attr_log;
 	union bpf_attr attr;
 	int err;
 
@@ -6247,13 +6370,29 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 	if (copy_from_bpfptr(&attr, uattr, size) != 0)
 		return -EFAULT;
 
+	memset(&attr_common, 0, sizeof(attr_common));
+	if (cmd & BPF_COMMON_ATTRS) {
+		err = bpf_check_uarg_tail_zero(uattr_common,
+					       offsetofend(struct bpf_common_attr, log_true_size),
+					       size_common);
+		if (err)
+			return err;
+
+		cmd &= ~BPF_COMMON_ATTRS;
+		size_common = min_t(u32, size_common, sizeof(attr_common));
+		if (copy_from_bpfptr(&attr_common, uattr_common, size_common) != 0)
+			return -EFAULT;
+	} else {
+		size_common = 0;
+	}
+
 	err = security_bpf(cmd, &attr, size, uattr.is_kernel);
 	if (err < 0)
 		return err;
 
 	switch (cmd) {
 	case BPF_MAP_CREATE:
-		err = map_create(&attr, uattr);
+		err = map_create(&attr, uattr, &attr_common, uattr_common, size_common);
 		break;
 	case BPF_MAP_LOOKUP_ELEM:
 		err = map_lookup_elem(&attr);
@@ -6271,7 +6410,12 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 		err = map_freeze(&attr);
 		break;
 	case BPF_PROG_LOAD:
-		err = bpf_prog_load(&attr, uattr, size);
+		if (size >= offsetofend(union bpf_attr, log_true_size))
+			offsetof_log_true_size = offsetof(union bpf_attr, log_true_size);
+		err = bpf_log_attr_init(&attr_log, attr.log_buf, attr.log_size, attr.log_level,
+					offsetof_log_true_size, uattr, &attr_common, uattr_common,
+					size_common);
+		err = err ?: bpf_prog_load(&attr, uattr, &attr_log);
 		break;
 	case BPF_OBJ_PIN:
 		err = bpf_obj_pin(&attr);
@@ -6286,7 +6430,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 		err = bpf_prog_detach(&attr);
 		break;
 	case BPF_PROG_QUERY:
-		err = bpf_prog_query(&attr, uattr.user);
+		err = bpf_prog_query(&attr, uattr.user, size);
 		break;
 	case BPF_PROG_TEST_RUN:
 		err = bpf_prog_test_run(&attr, uattr.user);
@@ -6316,7 +6460,12 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
 	case BPF_BTF_LOAD:
-		err = bpf_btf_load(&attr, uattr, size);
+		if (size >= offsetofend(union bpf_attr, btf_log_true_size))
+			offsetof_log_true_size = offsetof(union bpf_attr, btf_log_true_size);
+		err = bpf_log_attr_init(&attr_log, attr.btf_log_buf, attr.btf_log_size,
+					attr.btf_log_level, offsetof_log_true_size, uattr,
+					&attr_common, uattr_common, size_common);
+		err = err ?: bpf_btf_load(&attr, uattr, &attr_log);
 		break;
 	case BPF_BTF_GET_FD_BY_ID:
 		err = bpf_btf_get_fd_by_id(&attr);
@@ -6382,9 +6531,10 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 	return err;
 }
 
-SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+SYSCALL_DEFINE5(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size,
+		struct bpf_common_attr __user *, uattr_common, unsigned int, size_common)
 {
-	return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
+	return __sys_bpf(cmd, USER_BPFPTR(uattr), size, USER_BPFPTR(uattr_common), size_common);
 }
 
 static bool syscall_prog_is_valid_access(int off, int size,
@@ -6414,7 +6564,7 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
 	default:
 		return -EINVAL;
 	}
-	return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
+	return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size, KERNEL_BPFPTR(NULL), 0);
 }
 
 
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index f02254a21585..1a721fc4bef5 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -30,8 +30,46 @@ static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE];
 /* serializes access to trampoline tables */
 static DEFINE_MUTEX(trampoline_mutex);
 
+/*
+ * Keep 32 trampoline locks (5 bits) in the pool so trampoline_lock_all()
+ * stays below MAX_LOCK_DEPTH.  Each pool slot has a distinct lockdep
+ * class because trampoline_lock_all() takes all pool mutexes at once;
+ * otherwise lockdep would report recursive locking on same-class mutexes.
+ */
+#define TRAMPOLINE_LOCKS_BITS 5
+#define TRAMPOLINE_LOCKS_TABLE_SIZE (1 << TRAMPOLINE_LOCKS_BITS)
+
+static struct {
+	struct mutex mutex;
+	struct lock_class_key key;
+} trampoline_locks[TRAMPOLINE_LOCKS_TABLE_SIZE];
+
+static struct mutex *select_trampoline_lock(struct bpf_trampoline *tr)
+{
+	return &trampoline_locks[hash_ptr(tr, TRAMPOLINE_LOCKS_BITS)].mutex;
+}
+
+static void trampoline_lock(struct bpf_trampoline *tr)
+{
+	mutex_lock(select_trampoline_lock(tr));
+}
+
+static void trampoline_unlock(struct bpf_trampoline *tr)
+{
+	mutex_unlock(select_trampoline_lock(tr));
+}
+
+struct bpf_trampoline_ops {
+	int (*register_fentry)(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *data);
+	int (*unregister_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *data);
+	int (*modify_fentry)(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im,
+			     bool lock_direct_mutex, void *data);
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex,
+				 const struct bpf_trampoline_ops *ops, void *data);
+static const struct bpf_trampoline_ops trampoline_ops;
 
 #ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
 static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip)
@@ -69,9 +107,9 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
 
 	if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
 		/* This is called inside register_ftrace_direct_multi(), so
-		 * tr->mutex is already locked.
+		 * trampoline's mutex is already locked.
 		 */
-		lockdep_assert_held_once(&tr->mutex);
+		lockdep_assert_held_once(select_trampoline_lock(tr));
 
 		/* Instead of updating the trampoline here, we propagate
 		 * -EAGAIN to register_ftrace_direct(). Then we can
@@ -91,7 +129,7 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
 	}
 
 	/* The normal locking order is
-	 *    tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
+	 *    select_trampoline_lock(tr) => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
 	 *
 	 * The following two commands are called from
 	 *
@@ -99,12 +137,12 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
 	 *   cleanup_direct_functions_after_ipmodify
 	 *
 	 * In both cases, direct_mutex is already locked. Use
-	 * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
-	 * (something else is making changes to this same trampoline).
+	 * mutex_trylock(select_trampoline_lock(tr)) to avoid deadlock in race condition
+	 * (something else holds the same pool lock).
 	 */
-	if (!mutex_trylock(&tr->mutex)) {
-		/* sleep 1 ms to make sure whatever holding tr->mutex makes
-		 * some progress.
+	if (!mutex_trylock(select_trampoline_lock(tr))) {
+		/* sleep 1 ms to make sure whatever is holding select_trampoline_lock(tr)
+		 * makes some progress.
 		 */
 		msleep(1);
 		return -EAGAIN;
@@ -116,20 +154,22 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
 
 		if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
 		    !(tr->flags & BPF_TRAMP_F_ORIG_STACK))
-			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */,
+						    &trampoline_ops, NULL);
 		break;
 	case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
 		tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
 
 		if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
-			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */,
+						    &trampoline_ops, NULL);
 		break;
 	default:
 		ret = -EINVAL;
 		break;
 	}
 
-	mutex_unlock(&tr->mutex);
+	trampoline_unlock(tr);
 	return ret;
 }
 #endif
@@ -142,7 +182,9 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 	switch (ptype) {
 	case BPF_PROG_TYPE_TRACING:
 		if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
-		    eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION)
+		    eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION ||
+		    eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI ||
+		    eatype == BPF_TRACE_FSESSION_MULTI)
 			return true;
 		return false;
 	case BPF_PROG_TYPE_LSM:
@@ -359,7 +401,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip)
 	head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)];
 	hlist_add_head(&tr->hlist_ip, head);
 	refcount_set(&tr->refcnt, 1);
-	mutex_init(&tr->mutex);
 	for (i = 0; i < BPF_TRAMP_MAX; i++)
 		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
 out:
@@ -386,9 +427,11 @@ static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flag
 	return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr);
 }
 
-static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
-			     void *old_addr)
+static void bpf_tramp_image_put(struct bpf_tramp_image *im);
+
+static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, void *data __maybe_unused)
 {
+	void *old_addr = tr->cur_image->image;
 	int ret;
 
 	if (tr->func.ftrace_managed)
@@ -396,13 +439,19 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
 	else
 		ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL);
 
-	return ret;
+	if (ret)
+		return ret;
+
+	bpf_tramp_image_put(tr->cur_image);
+	tr->cur_image = NULL;
+	return 0;
 }
 
-static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
-			 void *old_addr, void *new_addr,
-			 bool lock_direct_mutex)
+static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im,
+			 bool lock_direct_mutex, void *data __maybe_unused)
 {
+	void *old_addr = tr->cur_image->image;
+	void *new_addr = im->image;
 	int ret;
 
 	if (tr->func.ftrace_managed) {
@@ -411,12 +460,20 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
 		ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr,
 						   new_addr);
 	}
-	return ret;
+
+	if (ret)
+		return ret;
+
+	bpf_tramp_image_put(tr->cur_image);
+	tr->cur_image = im;
+	return 0;
 }
 
 /* first time registering */
-static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
+static int register_fentry(struct bpf_trampoline *tr, struct bpf_tramp_image *im,
+			   void *data __maybe_unused)
 {
+	void *new_addr = im->image;
 	void *ip = tr->func.addr;
 	unsigned long faddr;
 	int ret;
@@ -434,33 +491,42 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 		ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr);
 	}
 
-	return ret;
+	if (ret)
+		return ret;
+
+	tr->cur_image = im;
+	return 0;
 }
 
-static struct bpf_tramp_links *
+static const struct bpf_trampoline_ops trampoline_ops = {
+	.register_fentry   = register_fentry,
+	.unregister_fentry = unregister_fentry,
+	.modify_fentry     = modify_fentry,
+};
+
+static struct bpf_tramp_nodes *
 bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
 {
-	struct bpf_tramp_link *link;
-	struct bpf_tramp_links *tlinks;
-	struct bpf_tramp_link **links;
+	struct bpf_tramp_node *node, **nodes;
+	struct bpf_tramp_nodes *tnodes;
 	int kind;
 
 	*total = 0;
-	tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX);
-	if (!tlinks)
+	tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX);
+	if (!tnodes)
 		return ERR_PTR(-ENOMEM);
 
 	for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
-		tlinks[kind].nr_links = tr->progs_cnt[kind];
+		tnodes[kind].nr_nodes = tr->progs_cnt[kind];
 		*total += tr->progs_cnt[kind];
-		links = tlinks[kind].links;
+		nodes = tnodes[kind].nodes;
 
-		hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
-			*ip_arg |= link->link.prog->call_get_func_ip;
-			*links++ = link;
+		hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) {
+			*ip_arg |= node->link->prog->call_get_func_ip;
+			*nodes++ = node;
 		}
 	}
-	return tlinks;
+	return tnodes;
 }
 
 static void bpf_tramp_image_free(struct bpf_tramp_image *im)
@@ -604,30 +670,29 @@ out:
 	return ERR_PTR(err);
 }
 
-static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex,
+				 const struct bpf_trampoline_ops *ops, void *data)
 {
 	struct bpf_tramp_image *im;
-	struct bpf_tramp_links *tlinks;
+	struct bpf_tramp_nodes *tnodes;
 	u32 orig_flags = tr->flags;
 	bool ip_arg = false;
 	int err, total, size;
 
-	tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
-	if (IS_ERR(tlinks))
-		return PTR_ERR(tlinks);
+	tnodes = bpf_trampoline_get_progs(tr, &total, &ip_arg);
+	if (IS_ERR(tnodes))
+		return PTR_ERR(tnodes);
 
 	if (total == 0) {
-		err = unregister_fentry(tr, orig_flags, tr->cur_image->image);
-		bpf_tramp_image_put(tr->cur_image);
-		tr->cur_image = NULL;
+		err = ops->unregister_fentry(tr, orig_flags, data);
 		goto out;
 	}
 
 	/* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
 	tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
 
-	if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
-	    tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
+	if (tnodes[BPF_TRAMP_FEXIT].nr_nodes ||
+	    tnodes[BPF_TRAMP_MODIFY_RETURN].nr_nodes) {
 		/* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME
 		 * should not be set together.
 		 */
@@ -658,7 +723,7 @@ again:
 #endif
 
 	size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
-					tlinks, tr->func.addr);
+					tnodes, tr->func.addr);
 	if (size < 0) {
 		err = size;
 		goto out;
@@ -676,7 +741,7 @@ again:
 	}
 
 	err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
-					  &tr->func.model, tr->flags, tlinks,
+					  &tr->func.model, tr->flags, tnodes,
 					  tr->func.addr);
 	if (err < 0)
 		goto out_free;
@@ -685,14 +750,12 @@ again:
 	if (err)
 		goto out_free;
 
-	WARN_ON(tr->cur_image && total == 0);
 	if (tr->cur_image)
 		/* progs already running at this address */
-		err = modify_fentry(tr, orig_flags, tr->cur_image->image,
-				    im->image, lock_direct_mutex);
+		err = ops->modify_fentry(tr, orig_flags, im, lock_direct_mutex, data);
 	else
 		/* first time registering */
-		err = register_fentry(tr, im->image);
+		err = ops->register_fentry(tr, im, data);
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 	if (err == -EAGAIN) {
@@ -704,34 +767,31 @@ again:
 		goto again;
 	}
 #endif
-	if (err)
-		goto out_free;
 
-	if (tr->cur_image)
-		bpf_tramp_image_put(tr->cur_image);
-	tr->cur_image = im;
+out_free:
+	if (err)
+		bpf_tramp_image_free(im);
 out:
 	/* If any error happens, restore previous flags */
 	if (err)
 		tr->flags = orig_flags;
-	kfree(tlinks);
+	kfree(tnodes);
 	return err;
-
-out_free:
-	bpf_tramp_image_free(im);
-	goto out;
 }
 
 static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
 {
 	switch (prog->expected_attach_type) {
 	case BPF_TRACE_FENTRY:
+	case BPF_TRACE_FENTRY_MULTI:
 		return BPF_TRAMP_FENTRY;
 	case BPF_MODIFY_RETURN:
 		return BPF_TRAMP_MODIFY_RETURN;
 	case BPF_TRACE_FEXIT:
+	case BPF_TRACE_FEXIT_MULTI:
 		return BPF_TRAMP_FEXIT;
 	case BPF_TRACE_FSESSION:
+	case BPF_TRACE_FSESSION_MULTI:
 		return BPF_TRAMP_FSESSION;
 	case BPF_LSM_MAC:
 		if (!prog->aux->attach_func_proto->type)
@@ -764,39 +824,33 @@ static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog)
 	return 0;
 }
 
-static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
-				      struct bpf_trampoline *tr,
-				      struct bpf_prog *tgt_prog)
+static struct bpf_tramp_node *fsession_exit(struct bpf_tramp_node *node)
 {
-	struct bpf_fsession_link *fslink = NULL;
-	enum bpf_tramp_prog_type kind;
-	struct bpf_tramp_link *link_exiting;
-	struct hlist_head *prog_list;
-	int err = 0;
-	int cnt = 0, i;
+	if (node->link->type == BPF_LINK_TYPE_TRACING) {
+		struct bpf_tracing_link *link;
 
-	kind = bpf_attach_type_to_tramp(link->link.prog);
-	if (tr->extension_prog)
-		/* cannot attach fentry/fexit if extension prog is attached.
-		 * cannot overwrite extension prog either.
-		 */
-		return -EBUSY;
+		link = container_of(node->link, struct bpf_tracing_link, link.link);
+		return &link->fexit;
+	} else if (node->link->type == BPF_LINK_TYPE_TRACING_MULTI) {
+		struct bpf_tracing_multi_link *link;
+		struct bpf_tracing_multi_node *mnode;
 
-	for (i = 0; i < BPF_TRAMP_MAX; i++)
-		cnt += tr->progs_cnt[i];
-
-	if (kind == BPF_TRAMP_REPLACE) {
-		/* Cannot attach extension if fentry/fexit are in use. */
-		if (cnt)
-			return -EBUSY;
-		err = bpf_freplace_check_tgt_prog(tgt_prog);
-		if (err)
-			return err;
-		tr->extension_prog = link->link.prog;
-		return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP,
-					  BPF_MOD_JUMP, NULL,
-					  link->link.prog->bpf_func);
+		link = container_of(node->link, struct bpf_tracing_multi_link, link);
+		mnode = container_of(node, struct bpf_tracing_multi_node, node);
+		return &link->fexits[mnode - link->nodes];
 	}
+	return NULL;
+}
+
+static int bpf_trampoline_add_prog(struct bpf_trampoline *tr,
+				   struct bpf_tramp_node *node,
+				   int cnt)
+{
+	enum bpf_tramp_prog_type kind;
+	struct bpf_tramp_node *node_existing, *fexit;
+	struct hlist_head *prog_list;
+
+	kind = bpf_attach_type_to_tramp(node->link->prog);
 	if (kind == BPF_TRAMP_FSESSION) {
 		prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY];
 		cnt++;
@@ -805,59 +859,112 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
 	}
 	if (cnt >= BPF_MAX_TRAMP_LINKS)
 		return -E2BIG;
-	if (!hlist_unhashed(&link->tramp_hlist))
+	if (!hlist_unhashed(&node->tramp_hlist))
 		/* prog already linked */
 		return -EBUSY;
-	hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) {
-		if (link_exiting->link.prog != link->link.prog)
+	hlist_for_each_entry(node_existing, prog_list, tramp_hlist) {
+		if (node_existing->link->prog != node->link->prog)
 			continue;
 		/* prog already linked */
 		return -EBUSY;
 	}
 
-	hlist_add_head(&link->tramp_hlist, prog_list);
+	hlist_add_head(&node->tramp_hlist, prog_list);
 	if (kind == BPF_TRAMP_FSESSION) {
 		tr->progs_cnt[BPF_TRAMP_FENTRY]++;
-		fslink = container_of(link, struct bpf_fsession_link, link.link);
-		hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]);
+		fexit = fsession_exit(node);
+		if (WARN_ON_ONCE(!fexit))
+			return -EINVAL;
+		hlist_add_head(&fexit->tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]);
 		tr->progs_cnt[BPF_TRAMP_FEXIT]++;
 	} else {
 		tr->progs_cnt[kind]++;
 	}
-	err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
-	if (err) {
-		hlist_del_init(&link->tramp_hlist);
-		if (kind == BPF_TRAMP_FSESSION) {
-			tr->progs_cnt[BPF_TRAMP_FENTRY]--;
-			hlist_del_init(&fslink->fexit.tramp_hlist);
-			tr->progs_cnt[BPF_TRAMP_FEXIT]--;
-		} else {
-			tr->progs_cnt[kind]--;
-		}
+	return 0;
+}
+
+/* Drop @node, and its fsession exit node if it has one, from @tr's lists and counters. */
+static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr, struct bpf_tramp_node *node)
+{
+	enum bpf_tramp_prog_type kind = bpf_attach_type_to_tramp(node->link->prog);
+	struct bpf_tramp_node *fexit;
+
+	if (kind == BPF_TRAMP_FSESSION) {
+		fexit = fsession_exit(node);
+		/* A missing exit node must not leave the entry node linked below. */
+		if (!WARN_ON_ONCE(!fexit)) {
+			hlist_del_init(&fexit->tramp_hlist);
+			tr->progs_cnt[BPF_TRAMP_FEXIT]--;
+		}
+		kind = BPF_TRAMP_FENTRY;
+	}
+	hlist_del_init(&node->tramp_hlist);
+	tr->progs_cnt[kind]--;
+}
+
+static int __bpf_trampoline_link_prog(struct bpf_tramp_node *node,
+				      struct bpf_trampoline *tr,
+				      struct bpf_prog *tgt_prog,
+				      const struct bpf_trampoline_ops *ops,
+				      void *data)
+{
+	enum bpf_tramp_prog_type kind;
+	int err = 0;
+	int cnt = 0, i;
+
+	kind = bpf_attach_type_to_tramp(node->link->prog);
+	if (tr->extension_prog)
+		/* cannot attach fentry/fexit if extension prog is attached.
+		 * cannot overwrite extension prog either.
+		 */
+		return -EBUSY;
+
+	for (i = 0; i < BPF_TRAMP_MAX; i++)
+		cnt += tr->progs_cnt[i];
+
+	if (kind == BPF_TRAMP_REPLACE) {
+		/* Cannot attach extension if fentry/fexit are in use. */
+		if (cnt)
+			return -EBUSY;
+		err = bpf_freplace_check_tgt_prog(tgt_prog);
+		if (err)
+			return err;
+		tr->extension_prog = node->link->prog;
+		return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP,
+					  BPF_MOD_JUMP, NULL,
+					  node->link->prog->bpf_func);
+	}
+	err = bpf_trampoline_add_prog(tr, node, cnt);
+	if (err)
+		return err;
+	err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data);
+	if (err)
+		bpf_trampoline_remove_prog(tr, node);
 	return err;
 }
 
-int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+int bpf_trampoline_link_prog(struct bpf_tramp_node *node,
 			     struct bpf_trampoline *tr,
 			     struct bpf_prog *tgt_prog)
 {
 	int err;
 
-	mutex_lock(&tr->mutex);
-	err = __bpf_trampoline_link_prog(link, tr, tgt_prog);
-	mutex_unlock(&tr->mutex);
+	trampoline_lock(tr);
+	err = __bpf_trampoline_link_prog(node, tr, tgt_prog, &trampoline_ops, NULL);
+	trampoline_unlock(tr);
 	return err;
 }
 
-static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_node *node,
 					struct bpf_trampoline *tr,
-					struct bpf_prog *tgt_prog)
+					struct bpf_prog *tgt_prog,
+					const struct bpf_trampoline_ops *ops,
+					void *data)
 {
 	enum bpf_tramp_prog_type kind;
 	int err;
 
-	kind = bpf_attach_type_to_tramp(link->link.prog);
+	kind = bpf_attach_type_to_tramp(node->link->prog);
 	if (kind == BPF_TRAMP_REPLACE) {
 		WARN_ON_ONCE(!tr->extension_prog);
 		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
@@ -867,29 +974,21 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
 		guard(mutex)(&tgt_prog->aux->ext_mutex);
 		tgt_prog->aux->is_extended = false;
 		return err;
-	} else if (kind == BPF_TRAMP_FSESSION) {
-		struct bpf_fsession_link *fslink =
-			container_of(link, struct bpf_fsession_link, link.link);
-
-		hlist_del_init(&fslink->fexit.tramp_hlist);
-		tr->progs_cnt[BPF_TRAMP_FEXIT]--;
-		kind = BPF_TRAMP_FENTRY;
 	}
-	hlist_del_init(&link->tramp_hlist);
-	tr->progs_cnt[kind]--;
-	return bpf_trampoline_update(tr, true /* lock_direct_mutex */);
+	bpf_trampoline_remove_prog(tr, node);
+	return bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data);
 }
 
 /* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node,
 			       struct bpf_trampoline *tr,
 			       struct bpf_prog *tgt_prog)
 {
 	int err;
 
-	mutex_lock(&tr->mutex);
-	err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog);
-	mutex_unlock(&tr->mutex);
+	trampoline_lock(tr);
+	err = __bpf_trampoline_unlink_prog(node, tr, tgt_prog, &trampoline_ops, NULL);
+	trampoline_unlock(tr);
 	return err;
 }
 
@@ -903,7 +1002,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link)
 	if (!shim_link->trampoline)
 		return;
 
-	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL));
+	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link.node, shim_link->trampoline, NULL));
 	bpf_trampoline_put(shim_link->trampoline);
 }
 
@@ -949,8 +1048,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog
 	p->type = BPF_PROG_TYPE_LSM;
 	p->expected_attach_type = BPF_LSM_MAC;
 	bpf_prog_inc(p);
-	bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
-		      &bpf_shim_tramp_link_lops, p, attach_type);
+	bpf_tramp_link_init(&shim_link->link, BPF_LINK_TYPE_UNSPEC,
+		      &bpf_shim_tramp_link_lops, p, attach_type, 0);
 	bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
 
 	return shim_link;
@@ -959,15 +1058,15 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog
 static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr,
 						    bpf_func_t bpf_func)
 {
-	struct bpf_tramp_link *link;
+	struct bpf_tramp_node *node;
 	int kind;
 
 	for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
-		hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
-			struct bpf_prog *p = link->link.prog;
+		hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) {
+			struct bpf_prog *p = node->link->prog;
 
 			if (p->bpf_func == bpf_func)
-				return container_of(link, struct bpf_shim_tramp_link, link);
+				return container_of(node, struct bpf_shim_tramp_link, link.node);
 		}
 	}
 
@@ -999,12 +1098,12 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
 	if (!tr)
 		return  -ENOMEM;
 
-	mutex_lock(&tr->mutex);
+	trampoline_lock(tr);
 
 	shim_link = cgroup_shim_find(tr, bpf_func);
 	if (shim_link && !IS_ERR(bpf_link_inc_not_zero(&shim_link->link.link))) {
 		/* Reusing existing shim attached by the other program. */
-		mutex_unlock(&tr->mutex);
+		trampoline_unlock(tr);
 		bpf_trampoline_put(tr); /* bpf_trampoline_get above */
 		return 0;
 	}
@@ -1017,23 +1116,23 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
 		goto err;
 	}
 
-	err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL);
+	err = __bpf_trampoline_link_prog(&shim_link->link.node, tr, NULL, &trampoline_ops, NULL);
 	if (err)
 		goto err;
 
 	shim_link->trampoline = tr;
 	/* note, we're still holding tr refcnt from above */
 
-	mutex_unlock(&tr->mutex);
+	trampoline_unlock(tr);
 
 	return 0;
 err:
-	mutex_unlock(&tr->mutex);
+	trampoline_unlock(tr);
 
 	if (shim_link)
 		bpf_link_put(&shim_link->link.link);
 
-	/* have to release tr while _not_ holding its mutex */
+	/* have to release tr while _not_ holding pool mutex for trampoline */
 	bpf_trampoline_put(tr); /* bpf_trampoline_get above */
 
 	return err;
@@ -1054,9 +1153,9 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
 	if (WARN_ON_ONCE(!tr))
 		return;
 
-	mutex_lock(&tr->mutex);
+	trampoline_lock(tr);
 	shim_link = cgroup_shim_find(tr, bpf_func);
-	mutex_unlock(&tr->mutex);
+	trampoline_unlock(tr);
 
 	if (shim_link)
 		bpf_link_put(&shim_link->link.link);
@@ -1074,14 +1173,14 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key,
 	if (!tr)
 		return NULL;
 
-	mutex_lock(&tr->mutex);
+	trampoline_lock(tr);
 	if (tr->func.addr)
 		goto out;
 
 	memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel));
 	tr->func.addr = (void *)tgt_info->tgt_addr;
 out:
-	mutex_unlock(&tr->mutex);
+	trampoline_unlock(tr);
 	return tr;
 }
 
@@ -1094,7 +1193,6 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
 	mutex_lock(&trampoline_mutex);
 	if (!refcount_dec_and_test(&tr->refcnt))
 		goto out;
-	WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
 
 	for (i = 0; i < BPF_TRAMP_MAX; i++)
 		if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i])))
@@ -1333,7 +1431,7 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
 int __weak
 arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 			    const struct btf_func_model *m, u32 flags,
-			    struct bpf_tramp_links *tlinks,
+			    struct bpf_tramp_nodes *tnodes,
 			    void *func_addr)
 {
 	return -ENOTSUPP;
@@ -1367,11 +1465,288 @@ int __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
 }
 
 int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
-				    struct bpf_tramp_links *tlinks, void *func_addr)
+				    struct bpf_tramp_nodes *tnodes, void *func_addr)
 {
 	return -ENOTSUPP;
 }
 
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \
+    defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) && \
+    defined(CONFIG_BPF_SYSCALL)
+
+static void trampoline_lock_all(void)
+{	/* Take every mutex in trampoline_locks[], in ascending index order. */
+	int i;
+
+	for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++)
+		mutex_lock(&trampoline_locks[i].mutex);
+}
+
+static void trampoline_unlock_all(void)
+{	/* Release every mutex in trampoline_locks[]; counterpart of trampoline_lock_all(). */
+	int i;
+
+	for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++)
+		mutex_unlock(&trampoline_locks[i].mutex);
+}
+
+static void remove_tracing_multi_data(struct bpf_tracing_multi_data *data)
+{	/* Run ftrace_hash_remove() on all three hashes; the hash objects themselves are kept. */
+	ftrace_hash_remove(data->reg);	/* presumably unlinks the queued entries - TODO confirm */
+	ftrace_hash_remove(data->unreg);
+	ftrace_hash_remove(data->modify);
+}
+
+static void clear_tracing_multi_data(struct bpf_tracing_multi_data *data)
+{	/* Run ftrace_hash_remove() on, then free, all three hashes. */
+	remove_tracing_multi_data(data);
+
+	free_ftrace_hash(data->reg);	/* data->reg/unreg/modify are not reset to NULL here */
+	free_ftrace_hash(data->unreg);
+	free_ftrace_hash(data->modify);
+}
+
+static int init_tracing_multi_data(struct bpf_tracing_multi_data *data)
+{	/* Allocate the reg/unreg/modify hashes; returns 0, or -ENOMEM if any allocation fails. */
+	data->reg    = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+	data->unreg  = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+	data->modify = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+
+	if (!data->reg || !data->unreg || !data->modify) {
+		clear_tracing_multi_data(data);	/* NOTE(review): passes NULL hashes to ftrace_hash_remove(); confirm it is NULL-safe */
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void ftrace_hash_add(struct ftrace_hash *hash, struct ftrace_func_entry *entry,
+			    unsigned long ip, unsigned long direct)
+{	/* Fill the caller-provided entry with ip/direct and insert it into hash; no allocation here. */
+	entry->ip = ip;
+	entry->direct = direct;
+	add_ftrace_hash_entry(hash, entry);
+}
+
+static int register_fentry_multi(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *ptr)
+{	/* Queue (ip -> new image address) in data->reg via data->entry; always returns 0. */
+	unsigned long addr = (unsigned long) im->image;
+	unsigned long ip = ftrace_location(tr->ip);
+	struct bpf_tracing_multi_data *data = ptr;	/* ptr is the bpf_tracing_multi_data of the batch */
+
+	if (bpf_trampoline_use_jmp(tr->flags))
+		addr = ftrace_jmp_set(addr);
+
+	ftrace_hash_add(data->reg, data->entry, ip, addr);	/* only queued; applied later by the caller */
+	tr->cur_image = im;
+	return 0;
+}
+
+static int unregister_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, void *ptr)
+{	/* Queue (ip -> current image address) in data->unreg and clear tr->cur_image; returns 0. */
+	unsigned long addr = (unsigned long) tr->cur_image->image;	/* dereferences cur_image unchecked */
+	unsigned long ip = ftrace_location(tr->ip);
+	struct bpf_tracing_multi_data *data = ptr;
+
+	if (bpf_trampoline_use_jmp(tr->flags))	/* NOTE(review): orig_flags is unused; confirm tr->flags is intended */
+		addr = ftrace_jmp_set(addr);
+
+	ftrace_hash_add(data->unreg, data->entry, ip, addr);
+	tr->cur_image = NULL;
+	return 0;
+}
+
+static int modify_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im,
+			       bool lock_direct_mutex, void *ptr)
+{	/* Queue (ip -> new image address) in data->modify; orig_flags and lock_direct_mutex are unused. */
+	unsigned long addr = (unsigned long) im->image;
+	unsigned long ip = ftrace_location(tr->ip);
+	struct bpf_tracing_multi_data *data = ptr;
+
+	if (bpf_trampoline_use_jmp(tr->flags))
+		addr = ftrace_jmp_set(addr);
+
+	ftrace_hash_add(data->modify, data->entry, ip, addr);	/* only queued; applied later by the caller */
+	tr->cur_image = im;
+	return 0;
+}
+
+static const struct bpf_trampoline_ops trampoline_multi_ops = {
+	.register_fentry   = register_fentry_multi,	/* queues into data->reg */
+	.unregister_fentry = unregister_fentry_multi,	/* queues into data->unreg */
+	.modify_fentry     = modify_fentry_multi,	/* queues into data->modify */
+};
+
+static void bpf_trampoline_multi_attach_init(struct bpf_trampoline *tr)
+{	/* Snapshot the current image and flags so the batch can later be committed or rolled back. */
+	tr->multi_attach.old_image = tr->cur_image;
+	tr->multi_attach.old_flags = tr->flags;
+}
+
+static void bpf_trampoline_multi_attach_free(struct bpf_trampoline *tr)
+{	/* Commit path: hand the snapshotted image (if any) to bpf_tramp_image_put(), clear the snapshot. */
+	if (tr->multi_attach.old_image)
+		bpf_tramp_image_put(tr->multi_attach.old_image);
+
+	tr->multi_attach.old_image = NULL;
+	tr->multi_attach.old_flags = 0;
+}
+
+static void bpf_trampoline_multi_attach_rollback(struct bpf_trampoline *tr)
+{	/* Failure path: put the image installed by the batch and restore the snapshotted image/flags. */
+	if (tr->cur_image)
+		bpf_tramp_image_put(tr->cur_image);
+	tr->cur_image = tr->multi_attach.old_image;	/* may be NULL if nothing was attached before */
+	tr->flags = tr->multi_attach.old_flags;
+
+	tr->multi_attach.old_image = NULL;
+	tr->multi_attach.old_flags = 0;
+}
+
+#define for_each_mnode_cnt(mnode, link, cnt) /* iterates via the caller's local 'i' */ \
+	for (i = 0, (mnode) = &(link)->nodes[i]; i < (cnt); i++, (mnode) = &(link)->nodes[i])
+
+#define for_each_mnode(mnode, link) /* visits all (link)->nodes_cnt nodes */ \
+	for_each_mnode_cnt(mnode, link, (link)->nodes_cnt)
+
+int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids,
+				struct bpf_tracing_multi_link *link)
+{	/* Attach prog to link->nodes_cnt targets (BTF ids in ids[]) as one batch; 0 or -errno. */
+	struct bpf_tracing_multi_data *data = &link->data;
+	struct bpf_attach_target_info tgt_info = {};
+	struct btf *btf = prog->aux->attach_btf;
+	struct bpf_tracing_multi_node *mnode;
+	struct bpf_trampoline *tr;
+	int i, err, rollback_cnt;	/* 'i' is the implicit cursor of the for_each_mnode*() macros */
+	u64 key;
+
+	for_each_mnode(mnode, link) {	/* phase 1: resolve each id and get its trampoline */
+		rollback_cnt = i;	/* nodes [0, i) already hold a trampoline from bpf_trampoline_get() */
+
+		err = bpf_check_attach_btf_id_multi(btf, prog, ids[i], &tgt_info);
+		if (err)
+			goto rollback_put;
+
+		key = bpf_trampoline_compute_key(NULL, btf, ids[i]);
+
+		tr = bpf_trampoline_get(key, &tgt_info);
+		if (!tr) {
+			err = -ENOMEM;
+			goto rollback_put;
+		}
+
+		mnode->trampoline = tr;
+		mnode->node.link = &link->link;
+		mnode->node.cookie = link->cookies ? link->cookies[i] : 0;	/* cookies[] is optional */
+
+		if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) {
+			link->fexits[i].link = &link->link;
+			link->fexits[i].cookie = link->cookies ? link->cookies[i] : 0;
+		}
+
+		cond_resched();
+	}
+
+	err = init_tracing_multi_data(data);
+	if (err) {
+		rollback_cnt = link->nodes_cnt;	/* every node got its trampoline in the loop above */
+		goto rollback_put;
+	}
+
+	trampoline_lock_all();
+
+	for_each_mnode(mnode, link) {	/* phase 2: link each node; the multi ops only queue hash entries */
+		bpf_trampoline_multi_attach_init(mnode->trampoline);
+
+		data->entry = &mnode->entry;	/* entry the ops callback inserts for this node */
+		err = __bpf_trampoline_link_prog(&mnode->node, mnode->trampoline, NULL,
+						 &trampoline_multi_ops, data);
+		if (err) {
+			rollback_cnt = i;	/* only nodes [0, i) are unlinked and rolled back below */
+			goto rollback_unlink;	/* NOTE(review): node i was snapshotted by _init() but is not rolled back; confirm */
+		}
+	}
+
+	rollback_cnt = link->nodes_cnt;
+	if (ftrace_hash_count(data->reg)) {	/* phase 3: apply the queued changes through direct_ops */
+		err = update_ftrace_direct_add(&direct_ops, data->reg);
+		if (err)
+			goto rollback_unlink;
+	}
+
+	if (ftrace_hash_count(data->modify)) {
+		err = update_ftrace_direct_mod(&direct_ops, data->modify, true);
+		if (err) {
+			if (ftrace_hash_count(data->reg))	/* undo the additions made just above */
+				WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg));
+			goto rollback_unlink;
+		}
+	}
+
+	for_each_mnode(mnode, link)	/* success: commit by releasing the snapshotted images */
+		bpf_trampoline_multi_attach_free(mnode->trampoline);
+
+	trampoline_unlock_all();
+
+	remove_tracing_multi_data(data);	/* hashes stay allocated; multi_detach reuses and frees them */
+	return 0;
+
+rollback_unlink:
+	for_each_mnode_cnt(mnode, link, rollback_cnt) {
+		bpf_trampoline_remove_prog(mnode->trampoline, &mnode->node);
+		bpf_trampoline_multi_attach_rollback(mnode->trampoline);
+	}
+
+	trampoline_unlock_all();
+
+	clear_tracing_multi_data(data);
+	rollback_cnt = link->nodes_cnt;	/* all trampolines were obtained, so put all of them */
+
+rollback_put:
+	for_each_mnode_cnt(mnode, link, rollback_cnt)
+		bpf_trampoline_put(mnode->trampoline);
+
+	return err;
+}
+
+int bpf_trampoline_multi_detach(struct bpf_prog *prog, struct bpf_tracing_multi_link *link)
+{	/* Detach every node of link as one batch; failures only WARN, the function returns 0. */
+	struct bpf_tracing_multi_data *data = &link->data;	/* hashes are not allocated here; they must already exist */
+	struct bpf_tracing_multi_node *mnode;
+	int i;	/* implicit cursor of for_each_mnode() */
+
+	trampoline_lock_all();
+
+	for_each_mnode(mnode, link) {
+		data->entry = &mnode->entry;
+		bpf_trampoline_multi_attach_init(mnode->trampoline);
+		WARN_ON_ONCE(__bpf_trampoline_unlink_prog(&mnode->node, mnode->trampoline,
+					NULL, &trampoline_multi_ops, data));
+	}
+
+	if (ftrace_hash_count(data->unreg))	/* apply the queued removals and modifications */
+		WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->unreg));
+	if (ftrace_hash_count(data->modify))
+		WARN_ON_ONCE(update_ftrace_direct_mod(&direct_ops, data->modify, true));
+
+	for_each_mnode(mnode, link)
+		bpf_trampoline_multi_attach_free(mnode->trampoline);
+
+	trampoline_unlock_all();
+
+	for_each_mnode(mnode, link)	/* put the trampolines only after the pool mutexes are dropped */
+		bpf_trampoline_put(mnode->trampoline);
+
+	clear_tracing_multi_data(data);
+	return 0;
+}
+
+#undef for_each_mnode_cnt
+#undef for_each_mnode
+
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS &&
+	  CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS &&
+	  CONFIG_BPF_SYSCALL */
+
 static int __init init_trampolines(void)
 {
 	int i;
@@ -1380,6 +1755,8 @@ static int __init init_trampolines(void)
 		INIT_HLIST_HEAD(&trampoline_key_table[i]);
 	for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
 		INIT_HLIST_HEAD(&trampoline_ip_table[i]);
+	for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++)
+		__mutex_init(&trampoline_locks[i].mutex, "trampoline_lock", &trampoline_locks[i].key);
 	return 0;
 }
 late_initcall(init_trampolines);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7fb88e1cd7c4..2abc79dbf281 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -26,6 +26,7 @@
 #include <linux/poison.h>
 #include <linux/module.h>
 #include <linux/cpumask.h>
+#include <linux/cnum.h>
 #include <linux/bpf_mem_alloc.h>
 #include <net/xdp.h>
 #include <linux/trace_events.h>
@@ -199,14 +200,15 @@ struct bpf_verifier_stack_elem {
 
 #define BPF_PRIV_STACK_MIN_SIZE		64
 
-static int acquire_reference(struct bpf_verifier_env *env, int insn_idx);
-static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id);
-static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id);
+static int release_reference_nomark(struct bpf_verifier_state *state, int id);
+static int release_reference(struct bpf_verifier_env *env, int id);
 static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
 static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
+static bool is_tracing_prog_type(enum bpf_prog_type type);
 static int ref_set_non_owning(struct bpf_verifier_env *env,
 			      struct bpf_reg_state *reg);
-static bool is_trusted_reg(const struct bpf_reg_state *reg);
+static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg);
 static inline bool in_sleepable_context(struct bpf_verifier_env *env);
 static const char *non_sleepable_context_description(struct bpf_verifier_env *env);
 static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg);
@@ -230,8 +232,28 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
 			     (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
 }
 
+static void update_ref_obj(struct ref_obj_desc *ref_obj, struct bpf_reg_state *reg)
+{	/* Record reg's id/parent_id in ref_obj and bump cnt; a later call overwrites the ids. */
+	ref_obj->id = reg->id;
+	ref_obj->parent_id = reg->parent_id;
+	ref_obj->cnt++;	/* validate_ref_obj() rejects cnt > 1 */
+}
+
+static int validate_ref_obj(struct bpf_verifier_env *env, struct ref_obj_desc *ref_obj)
+{	/* Returns -EFAULT (reported as a verifier bug) if ref_obj->cnt > 1, otherwise 0. */
+	if (ref_obj->cnt > 1) {
+		verifier_bug(env, "function expects only one referenced object but got %d",
+			     ref_obj->cnt);	/* no trailing newline, like the other verifier_bug() callers */
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
 struct bpf_call_arg_meta {
 	struct bpf_map_desc map;
+	struct bpf_dynptr_desc dynptr;
+	struct ref_obj_desc ref_obj;
 	bool raw_mode;
 	bool pkt_access;
 	u8 release_regno;
@@ -239,8 +261,6 @@ struct bpf_call_arg_meta {
 	int access_size;
 	int mem_size;
 	u64 msize_max_value;
-	int ref_obj_id;
-	int dynptr_id;
 	int func_id;
 	struct btf *btf;
 	u32 btf_id;
@@ -261,6 +281,41 @@ struct bpf_kfunc_meta {
 
 struct btf *btf_vmlinux;
 
+typedef struct argno {
+	int argno;	/* >= 0: register number; < 0: negated argument position (see the *_argno helpers) */
+} argno_t;
+
+static argno_t argno_from_reg(u32 regno)
+{	/* Wrap a register number; the value is stored unchanged. */
+	return (argno_t){ .argno = regno };
+}
+
+static argno_t argno_from_arg(u32 arg)
+{	/* Encode argument position 'arg' as a negative argno (arg 0 yields argno 0). */
+	return (argno_t){ .argno = -(int)arg };	/* negate as int rather than as a wrapping u32 */
+}
+
+static int reg_from_argno(argno_t a)
+{	/* Register number for 'a', or -1 if 'a' is an argument beyond MAX_BPF_FUNC_REG_ARGS. */
+	if (a.argno >= 0)	/* register form: the value already is the register number */
+		return a.argno;
+	if (a.argno >= -MAX_BPF_FUNC_REG_ARGS)	/* argument position N maps to register N */
+		return -a.argno;
+	return -1;
+}
+
+static int arg_from_argno(argno_t a)
+{	/* Argument position encoded in 'a', or -1 when 'a' is non-negative (register form). */
+	if (a.argno < 0)
+		return -a.argno;
+	return -1;
+}
+
+static int arg_idx_from_argno(argno_t a)
+{	/* Argument position minus one; yields -2 when arg_from_argno() returns -1. */
+	return arg_from_argno(a) - 1;
+}
+
 static const char *btf_type_name(const struct btf *btf, u32 id)
 {
 	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
@@ -290,12 +345,12 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
 	bool unknown = true;
 
 	verbose(env, "%s the register %s has", ctx, reg_name);
-	if (reg->smin_value > S64_MIN) {
-		verbose(env, " smin=%lld", reg->smin_value);
+	if (reg_smin(reg) > S64_MIN) {
+		verbose(env, " smin=%lld", reg_smin(reg));
 		unknown = false;
 	}
-	if (reg->smax_value < S64_MAX) {
-		verbose(env, " smax=%lld", reg->smax_value);
+	if (reg_smax(reg) < S64_MAX) {
+		verbose(env, " smax=%lld", reg_smax(reg));
 		unknown = false;
 	}
 	if (unknown)
@@ -303,7 +358,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
 	verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
 }
 
-static bool reg_not_null(const struct bpf_reg_state *reg)
+static bool reg_not_null(struct bpf_verifier_env *env, const struct bpf_reg_state *reg)
 {
 	enum bpf_reg_type type;
 
@@ -317,7 +372,7 @@ static bool reg_not_null(const struct bpf_reg_state *reg)
 		type == PTR_TO_MAP_VALUE ||
 		type == PTR_TO_MAP_KEY ||
 		type == PTR_TO_SOCK_COMMON ||
-		(type == PTR_TO_BTF_ID && is_trusted_reg(reg)) ||
+		(type == PTR_TO_BTF_ID && is_trusted_reg(env, reg)) ||
 		(type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) ||
 		type == CONST_PTR_TO_MAP;
 }
@@ -434,11 +489,6 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
 		func_id == BPF_FUNC_skc_to_tcp_request_sock;
 }
 
-static bool is_dynptr_ref_function(enum bpf_func_id func_id)
-{
-	return func_id == BPF_FUNC_dynptr_data;
-}
-
 static bool is_sync_callback_calling_kfunc(u32 btf_id);
 static bool is_async_callback_calling_kfunc(u32 btf_id);
 static bool is_callback_calling_kfunc(u32 btf_id);
@@ -497,22 +547,6 @@ bool bpf_is_may_goto_insn(struct bpf_insn *insn)
 	return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
 }
 
-static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
-					const struct bpf_map *map)
-{
-	int ref_obj_uses = 0;
-
-	if (is_ptr_cast_function(func_id))
-		ref_obj_uses++;
-	if (is_acquire_function(func_id, map))
-		ref_obj_uses++;
-	if (is_dynptr_ref_function(func_id))
-		ref_obj_uses++;
-
-	return ref_obj_uses > 1;
-}
-
-
 static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
 {
        int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
@@ -609,43 +643,44 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
 	}
 }
 
-static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
+static bool dynptr_type_referenced(enum bpf_dynptr_type type)
 {
 	return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE;
 }
 
 static void __mark_dynptr_reg(struct bpf_reg_state *reg,
 			      enum bpf_dynptr_type type,
-			      bool first_slot, int dynptr_id);
+			      bool first_slot, int id, int parent_id);
 
 
 static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
 				   struct bpf_reg_state *sreg1,
 				   struct bpf_reg_state *sreg2,
-				   enum bpf_dynptr_type type)
+				   enum bpf_dynptr_type type, int parent_id)
 {
 	int id = ++env->id_gen;
 
-	__mark_dynptr_reg(sreg1, type, true, id);
-	__mark_dynptr_reg(sreg2, type, false, id);
+	__mark_dynptr_reg(sreg1, type, true, id, parent_id);
+	__mark_dynptr_reg(sreg2, type, false, id, parent_id);
 }
 
 static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
 			       struct bpf_reg_state *reg,
 			       enum bpf_dynptr_type type)
 {
-	__mark_dynptr_reg(reg, type, true, ++env->id_gen);
+	__mark_dynptr_reg(reg, type, true, ++env->id_gen, 0);
 }
 
 static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
 				        struct bpf_func_state *state, int spi);
 
 static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-				   enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
+				   enum bpf_arg_type arg_type, int insn_idx,
+				   struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr)
 {
 	struct bpf_func_state *state = bpf_func(env, reg);
+	int spi, i, err, parent_id = 0;
 	enum bpf_dynptr_type type;
-	int spi, i, err;
 
 	spi = dynptr_get_spi(env, reg);
 	if (spi < 0)
@@ -676,94 +711,69 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
 	if (type == BPF_DYNPTR_TYPE_INVALID)
 		return -EINVAL;
 
-	mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
-			       &state->stack[spi - 1].spilled_ptr, type);
+	if (dynptr->type == BPF_DYNPTR_TYPE_INVALID) { /* dynptr constructors */
+		err = validate_ref_obj(env, ref_obj);
+		if (err)
+			return err;
 
-	if (dynptr_type_refcounted(type)) {
-		/* The id is used to track proper releasing */
-		int id;
+		/* Track parent's id if the parent is a referenced object */
+		parent_id = ref_obj->id;
 
-		if (clone_ref_obj_id)
-			id = clone_ref_obj_id;
-		else
-			id = acquire_reference(env, insn_idx);
+		if (dynptr_type_referenced(type)) {
+			int id;
 
-		if (id < 0)
-			return id;
+			/*
+			 * Create an intermediate reference that tracks the referenced
+			 * object for the referenced dynptr. Freeing a referenced dynptr
+			 * through helpers/kfuncs will invalidate all clones.
+			 */
+			id = acquire_reference(env, insn_idx, parent_id);
+			if (id < 0)
+				return id;
 
-		state->stack[spi].spilled_ptr.ref_obj_id = id;
-		state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
+			parent_id = id;
+		}
+	} else { /* bpf_dynptr_clone() */
+		parent_id = dynptr->parent_id;
 	}
 
+	mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
+			       &state->stack[spi - 1].spilled_ptr, type, parent_id);
+
 	return 0;
 }
 
-static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi)
+static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_stack_state *stack)
 {
 	int i;
 
 	for (i = 0; i < BPF_REG_SIZE; i++) {
-		state->stack[spi].slot_type[i] = STACK_INVALID;
-		state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+		stack[0].slot_type[i] = STACK_INVALID;
+		stack[1].slot_type[i] = STACK_INVALID;
 	}
 
-	bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
-	bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+	bpf_mark_reg_not_init(env, &stack[0].spilled_ptr);
+	bpf_mark_reg_not_init(env, &stack[1].spilled_ptr);
 }
 
 static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_func_state *state = bpf_func(env, reg);
-	int spi, ref_obj_id, i;
+	int spi;
 
-	/*
-	 * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
-	 * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
-	 * is safe to do directly.
-	 */
-	if (reg->type == CONST_PTR_TO_DYNPTR) {
-		verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released");
-		return -EFAULT;
-	}
 	spi = dynptr_get_spi(env, reg);
 	if (spi < 0)
 		return spi;
 
-	if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
-		invalidate_dynptr(env, state, spi);
-		return 0;
-	}
-
-	ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
-
-	/* If the dynptr has a ref_obj_id, then we need to invalidate
-	 * two things:
-	 *
-	 * 1) Any dynptrs with a matching ref_obj_id (clones)
-	 * 2) Any slices derived from this dynptr.
+	/*
+	 * For referenced dynptr, release the parent ref which cascades to
+	 * all clones and derived slices. For non-referenced dynptr, only
+	 * the dynptr and slices derived from it will be invalidated.
 	 */
-
-	/* Invalidate any slices associated with this dynptr */
-	WARN_ON_ONCE(release_reference(env, ref_obj_id));
-
-	/* Invalidate any dynptr clones */
-	for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-		if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id)
-			continue;
-
-		/* it should always be the case that if the ref obj id
-		 * matches then the stack slot also belongs to a
-		 * dynptr
-		 */
-		if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
-			verifier_bug(env, "misconfigured ref_obj_id");
-			return -EFAULT;
-		}
-		if (state->stack[i].spilled_ptr.dynptr.first_slot)
-			invalidate_dynptr(env, state, i);
-	}
-
-	return 0;
+	reg = &state->stack[spi].spilled_ptr;
+	return release_reference(env, dynptr_type_referenced(reg->dynptr.type)
+				      ? reg->parent_id
+				      : reg->id);
 }
 
 static void __mark_reg_unknown(const struct bpf_verifier_env *env,
@@ -777,12 +787,29 @@ static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_
 		__mark_reg_unknown(env, reg);
 }
 
+static int dynptr_ref_cnt(struct bpf_verifier_env *env, int v_parent_id)
+{	/* Count the dynptr stack slots in env->cur_state whose parent_id equals v_parent_id. */
+	struct bpf_stack_state *stack;
+	struct bpf_func_state *state;
+	struct bpf_reg_state *reg;
+	int ref_cnt = 0;
+
+	bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, 1 << STACK_DYNPTR, ({
+		if (!stack || stack->slot_type[0] != STACK_DYNPTR)	/* skip anything that is not a dynptr stack slot */
+			continue;
+		if (!stack->spilled_ptr.dynptr.first_slot)	/* count each dynptr once, via its first slot */
+			continue;
+		if (stack->spilled_ptr.parent_id == v_parent_id)
+			ref_cnt++;
+	}));
+
+	return ref_cnt;
+}
+
 static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
 				        struct bpf_func_state *state, int spi)
 {
-	struct bpf_func_state *fstate;
-	struct bpf_reg_state *dreg;
-	int i, dynptr_id;
+	int err = 0;
 
 	/* We always ensure that STACK_DYNPTR is never set partially,
 	 * hence just checking for slot_type[0] is enough. This is
@@ -796,56 +823,25 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
 	if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
 		spi = spi + 1;
 
-	if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
-		int ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
-		int ref_cnt = 0;
-
-		/*
-		 * A referenced dynptr can be overwritten only if there is at
-		 * least one other dynptr sharing the same ref_obj_id,
-		 * ensuring the reference can still be properly released.
-		 */
-		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-			if (state->stack[i].slot_type[0] != STACK_DYNPTR)
-				continue;
-			if (!state->stack[i].spilled_ptr.dynptr.first_slot)
-				continue;
-			if (state->stack[i].spilled_ptr.ref_obj_id == ref_obj_id)
-				ref_cnt++;
-		}
-
-		if (ref_cnt <= 1) {
-			verbose(env, "cannot overwrite referenced dynptr\n");
-			return -EINVAL;
-		}
+	/*
+	 * A referenced dynptr can be overwritten only if there is at
+	 * least one other dynptr sharing the same virtual ref parent,
+	 * ensuring the reference can still be properly released.
+	 */
+	if (dynptr_type_referenced(state->stack[spi].spilled_ptr.dynptr.type) &&
+	    dynptr_ref_cnt(env, state->stack[spi].spilled_ptr.parent_id) <= 1) {
+		verbose(env, "cannot overwrite referenced dynptr\n");
+		return -EINVAL;
 	}
 
-	mark_stack_slot_scratched(env, spi);
-	mark_stack_slot_scratched(env, spi - 1);
-
-	/* Writing partially to one dynptr stack slot destroys both. */
-	for (i = 0; i < BPF_REG_SIZE; i++) {
-		state->stack[spi].slot_type[i] = STACK_INVALID;
-		state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+	/* Invalidate the dynptr and any derived slices */
+	err = release_reference(env, state->stack[spi].spilled_ptr.id);
+	if (!err) {
+		mark_stack_slot_scratched(env, spi);
+		mark_stack_slot_scratched(env, spi - 1);
 	}
 
-	dynptr_id = state->stack[spi].spilled_ptr.id;
-	/* Invalidate any slices associated with this dynptr */
-	bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
-		/* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
-		if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
-			continue;
-		if (dreg->dynptr_id == dynptr_id)
-			mark_reg_invalid(env, dreg);
-	}));
-
-	/* Do not release reference state, we are destroying dynptr on stack,
-	 * not using some helper to release it. Just reset register.
-	 */
-	bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
-	bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
-
-	return 0;
+	return err;
 }
 
 static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
@@ -945,7 +941,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
 	if (spi < 0)
 		return spi;
 
-	id = acquire_reference(env, insn_idx);
+	id = acquire_reference(env, insn_idx, 0);
 	if (id < 0)
 		return id;
 
@@ -961,7 +957,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
 			else
 				st->type |= PTR_UNTRUSTED;
 		}
-		st->ref_obj_id = i == 0 ? id : 0;
+		st->id = i == 0 ? id : 0;
 		st->iter.btf = btf;
 		st->iter.btf_id = btf_id;
 		st->iter.state = BPF_ITER_STATE_ACTIVE;
@@ -991,7 +987,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
 		struct bpf_reg_state *st = &slot->spilled_ptr;
 
 		if (i == 0)
-			WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+			WARN_ON_ONCE(release_reference(env, st->id));
 
 		bpf_mark_reg_not_init(env, st);
 
@@ -1047,10 +1043,10 @@ static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_s
 
 		if (st->type & PTR_UNTRUSTED)
 			return -EPROTO;
-		/* only main (first) slot has ref_obj_id set */
-		if (i == 0 && !st->ref_obj_id)
+		/* only main (first) slot has id set */
+		if (i == 0 && !st->id)
 			return -EINVAL;
-		if (i != 0 && st->ref_obj_id)
+		if (i != 0 && st->id)
 			return -EINVAL;
 		if (st->iter.btf != btf || st->iter.btf_id != btf_id)
 			return -EINVAL;
@@ -1089,7 +1085,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
 
 	__mark_reg_known_zero(st);
 	st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
-	st->ref_obj_id = id;
+	st->id = id;
 	st->irq.kfunc_class = kfunc_class;
 
 	for (i = 0; i < BPF_REG_SIZE; i++)
@@ -1123,7 +1119,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
 		return -EINVAL;
 	}
 
-	err = release_irq_state(env->cur_state, st->ref_obj_id);
+	err = release_irq_state(env->cur_state, st->id);
 	WARN_ON_ONCE(err && err != -EACCES);
 	if (err) {
 		int insn_idx = 0;
@@ -1187,7 +1183,7 @@ static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_r
 	slot = &state->stack[spi];
 	st = &slot->spilled_ptr;
 
-	if (!st->ref_obj_id)
+	if (!st->id)
 		return -EINVAL;
 
 	for (i = 0; i < BPF_REG_SIZE; i++)
@@ -1339,6 +1335,18 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st
 		return -ENOMEM;
 
 	dst->allocated_stack = src->allocated_stack;
+
+	/* copy stack args state */
+	n = src->out_stack_arg_cnt;
+	if (n) {
+		dst->stack_arg_regs = copy_array(dst->stack_arg_regs, src->stack_arg_regs, n,
+						 sizeof(struct bpf_reg_state),
+						 GFP_KERNEL_ACCOUNT);
+		if (!dst->stack_arg_regs)
+			return -ENOMEM;
+	}
+
+	dst->out_stack_arg_cnt = src->out_stack_arg_cnt;
 	return 0;
 }
 
@@ -1380,6 +1388,32 @@ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state
 	return 0;
 }
 
+/*
+ * Make sure @state has room for at least @cnt stack argument slots
+ * (state->stack_arg_regs / state->out_stack_arg_cnt). The array only grows
+ * here: when out_stack_arg_cnt already covers @cnt nothing is touched.
+ *
+ * Returns 0 on success, -ENOMEM when realloc_array() hands back NULL.
+ */
+static int grow_stack_arg_slots(struct bpf_verifier_env *env,
+				struct bpf_func_state *state, int cnt)
+{
+	size_t cur_cnt = state->out_stack_arg_cnt;
+	struct bpf_reg_state *slots;
+
+	if (cur_cnt < cnt) {
+		slots = realloc_array(state->stack_arg_regs, cur_cnt, cnt,
+				      sizeof(struct bpf_reg_state));
+		state->stack_arg_regs = slots;
+		if (!slots)
+			return -ENOMEM;
+
+		state->out_stack_arg_cnt = cnt;
+	}
+
+	return 0;
+}
+
 /* Acquire a pointer id from the env and update the state->refs to include
  * this new pointer reference.
  * On success, returns a valid pointer id to associate with the register
@@ -1399,7 +1424,7 @@ static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_e
 	return &state->refs[new_ofs];
 }
 
-static int acquire_reference(struct bpf_verifier_env *env, int insn_idx)
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id)
 {
 	struct bpf_reference_state *s;
 
@@ -1408,6 +1433,7 @@ static int acquire_reference(struct bpf_verifier_env *env, int insn_idx)
 		return -ENOMEM;
 	s->type = REF_TYPE_PTR;
 	s->id = ++env->id_gen;
+	s->parent_id = parent_id;
 	return s->id;
 }
 
@@ -1464,17 +1490,33 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx)
 	return;
 }
 
-static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id)
+/*
+ * Return true if @state currently holds an acquired reference of type
+ * REF_TYPE_PTR whose id equals @id; entries of any other type are ignored.
+ */
+static bool find_reference_state(struct bpf_verifier_state *state, int id)
 {
 	int i;
 
-	for (i = 0; i < state->acquired_refs; i++)
-		if (state->refs[i].id == ptr_id)
+	for (i = 0; i < state->acquired_refs; i++) {
+		if (state->refs[i].type != REF_TYPE_PTR)
+			continue;
+		if (state->refs[i].id == id)
 			return true;
+	}
 
 	return false;
 }
 
+/*
+ * Return true if @reg->id matches an acquired REF_TYPE_PTR reference in the
+ * verifier's current state (env->cur_state).
+ */
+static bool reg_is_referenced(struct bpf_verifier_env *env, const struct bpf_reg_state *reg)
+{
+	return find_reference_state(env->cur_state, reg->id);
+}
+
 static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr)
 {
 	void *prev_ptr = NULL;
@@ -1542,6 +1576,7 @@ static void free_func_state(struct bpf_func_state *state)
 {
 	if (!state)
 		return;
+	kfree(state->stack_arg_regs);
 	kfree(state->stack);
 	kfree(state);
 }
@@ -1750,6 +1785,34 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	return &elem->st;
 }
 
+/*
+ * Format a printable name for argument @argno: "R<n>" when reg_from_argno()
+ * yields a register number (>= 0), otherwise "*(R11-<off>)" where <off> is
+ * computed from arg_from_argno().
+ *
+ * The text is written to env->tmp_arg_name and a pointer to that buffer is
+ * returned, so every call on the same @env overwrites the previous result.
+ */
+static const char *reg_arg_name(struct bpf_verifier_env *env, argno_t argno)
+{
+	char *buf = env->tmp_arg_name;
+	size_t len = sizeof(env->tmp_arg_name);
+	int arg, regno = reg_from_argno(argno);
+	unsigned int off;
+
+	if (regno >= 0) {
+		snprintf(buf, len, "R%d", regno);
+		return buf;
+	}
+
+	arg = arg_from_argno(argno);
+	/* %u takes an unsigned int: convert explicitly rather than pass an int */
+	off = (arg - MAX_BPF_FUNC_REG_ARGS) * BPF_REG_SIZE;
+	snprintf(buf, len, "*(R11-%u)", off);
+
+	return buf;
+}
+
 static const int caller_saved[CALLER_SAVED_REGS] = {
 	BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
 };
@@ -1758,15 +1809,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
 static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
 {
 	reg->var_off = tnum_const(imm);
-	reg->smin_value = (s64)imm;
-	reg->smax_value = (s64)imm;
-	reg->umin_value = imm;
-	reg->umax_value = imm;
-
-	reg->s32_min_value = (s32)imm;
-	reg->s32_max_value = (s32)imm;
-	reg->u32_min_value = (u32)imm;
-	reg->u32_max_value = (u32)imm;
+	reg->r64 = cnum64_from_urange(imm, imm);
+	reg->r32 = cnum32_from_urange((u32)imm, (u32)imm);
 }
 
 /* Mark the unknown part of a register (variable offset or scalar value) as
@@ -1778,17 +1822,15 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
 	memset(((u8 *)reg) + sizeof(reg->type), 0,
 	       offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
 	reg->id = 0;
-	reg->ref_obj_id = 0;
+	reg->parent_id = 0;
 	___mark_reg_known(reg, imm);
 }
 
+/* Pin var_off's subreg and reg->r32 to (u32)imm; reg->r64 is not written here. */
 static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
 {
 	reg->var_off = tnum_const_subreg(reg->var_off, imm);
-	reg->s32_min_value = (s32)imm;
-	reg->s32_max_value = (s32)imm;
-	reg->u32_min_value = (u32)imm;
-	reg->u32_max_value = (u32)imm;
+	reg->r32 = cnum32_from_urange((u32)imm, (u32)imm);
 }
 
 /* Mark the 'variable offset' part of a register as zero.  This should be
@@ -1816,7 +1857,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
 }
 
 static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
-			      bool first_slot, int dynptr_id)
+			      bool first_slot, int id, int parent_id)
 {
 	/* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
 	 * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
@@ -1825,7 +1866,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty
 	__mark_reg_known_zero(reg);
 	reg->type = CONST_PTR_TO_DYNPTR;
 	/* Give each dynptr a unique id to uniquely associate slices to it. */
-	reg->id = dynptr_id;
+	reg->id = id;
+	reg->parent_id = parent_id;
 	reg->dynptr.type = type;
 	reg->dynptr.first_slot = first_slot;
 }
@@ -1899,34 +1941,23 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
 	       tnum_equals_const(reg->var_off, 0);
 }
 
-/* Reset the min/max bounds of a register */
-static void __mark_reg_unbounded(struct bpf_reg_state *reg)
+/* Reset only the 32-bit range (reg->r32). */
+static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
 {
-	reg->smin_value = S64_MIN;
-	reg->smax_value = S64_MAX;
-	reg->umin_value = 0;
-	reg->umax_value = U64_MAX;
-
-	reg->s32_min_value = S32_MIN;
-	reg->s32_max_value = S32_MAX;
-	reg->u32_min_value = 0;
-	reg->u32_max_value = U32_MAX;
+	reg->r32 = CNUM32_UNBOUNDED;
 }
 
+/* Reset only the 64-bit range (reg->r64). */
 static void __mark_reg64_unbounded(struct bpf_reg_state *reg)
 {
-	reg->smin_value = S64_MIN;
-	reg->smax_value = S64_MAX;
-	reg->umin_value = 0;
-	reg->umax_value = U64_MAX;
+	reg->r64 = CNUM64_UNBOUNDED;
 }
 
-static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
+/* Reset the min/max bounds of a register */
+static void __mark_reg_unbounded(struct bpf_reg_state *reg)
 {
-	reg->s32_min_value = S32_MIN;
-	reg->s32_max_value = S32_MAX;
-	reg->u32_min_value = 0;
-	reg->u32_max_value = U32_MAX;
+	__mark_reg64_unbounded(reg);
+	__mark_reg32_unbounded(reg);
 }
 
 static void reset_reg64_and_tnum(struct bpf_reg_state *reg)
@@ -1941,19 +1970,41 @@ static void reset_reg32_and_tnum(struct bpf_reg_state *reg)
 	reg->var_off = tnum_unknown;
 }
 
-static void __update_reg32_bounds(struct bpf_reg_state *reg)
+/*
+ * Range of the low 32 bits of @tnum. If bit 31 is set in the value or in the
+ * mask, the bounds are built as signed ones; otherwise as unsigned ones.
+ */
+static struct cnum32 cnum32_from_tnum(struct tnum tnum)
 {
-	struct tnum var32_off = tnum_subreg(reg->var_off);
+	tnum = tnum_subreg(tnum);
+	if ((tnum.mask & S32_MIN) || (tnum.value & S32_MIN))
+		/* min signed is max(sign bit) | min(other bits) */
+		/* max signed is min(sign bit) | max(other bits) */
+		return cnum32_from_srange(tnum.value | (tnum.mask & S32_MIN),
+					  tnum.value | (tnum.mask & S32_MAX));
+	else
+		return cnum32_from_urange(tnum.value, (tnum.value | tnum.mask));
+}
 
-	/* min signed is max(sign bit) | min(other bits) */
-	reg->s32_min_value = max_t(s32, reg->s32_min_value,
-			var32_off.value | (var32_off.mask & S32_MIN));
-	/* max signed is min(sign bit) | max(other bits) */
-	reg->s32_max_value = min_t(s32, reg->s32_max_value,
-			var32_off.value | (var32_off.mask & S32_MAX));
-	reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
-	reg->u32_max_value = min(reg->u32_max_value,
-				 (u32)(var32_off.value | var32_off.mask));
+/*
+ * 64-bit counterpart of cnum32_from_tnum(): bit 63 of the value or mask
+ * selects between signed and unsigned bounds.
+ */
+static struct cnum64 cnum64_from_tnum(struct tnum tnum)
+{
+	if ((tnum.mask & S64_MIN) || (tnum.value & S64_MIN))
+		/* min signed is max(sign bit) | min(other bits) */
+		/* max signed is min(sign bit) | max(other bits) */
+		return cnum64_from_srange(tnum.value | (tnum.mask & S64_MIN),
+					  tnum.value | (tnum.mask & S64_MAX));
+	else
+		return cnum64_from_urange(tnum.value, (tnum.value | tnum.mask));
+}
+
+/* Intersect reg->r32 with the range implied by reg->var_off. */
+static void __update_reg32_bounds(struct bpf_reg_state *reg)
+{
+	cnum32_intersect_with(&reg->r32, cnum32_from_tnum(reg->var_off));
 }
 
 static void __update_reg64_bounds(struct bpf_reg_state *reg)
@@ -1961,26 +2003,18 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg)
 	u64 tnum_next, tmax;
 	bool umin_in_tnum;
 
-	/* min signed is max(sign bit) | min(other bits) */
-	reg->smin_value = max_t(s64, reg->smin_value,
-				reg->var_off.value | (reg->var_off.mask & S64_MIN));
-	/* max signed is min(sign bit) | max(other bits) */
-	reg->smax_value = min_t(s64, reg->smax_value,
-				reg->var_off.value | (reg->var_off.mask & S64_MAX));
-	reg->umin_value = max(reg->umin_value, reg->var_off.value);
-	reg->umax_value = min(reg->umax_value,
-			      reg->var_off.value | reg->var_off.mask);
+	cnum64_intersect_with(&reg->r64, cnum64_from_tnum(reg->var_off));
 
 	/* Check if u64 and tnum overlap in a single value */
-	tnum_next = tnum_step(reg->var_off, reg->umin_value);
-	umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value;
+	tnum_next = tnum_step(reg->var_off, reg_umin(reg));
+	umin_in_tnum = (reg_umin(reg) & ~reg->var_off.mask) == reg->var_off.value;
 	tmax = reg->var_off.value | reg->var_off.mask;
-	if (umin_in_tnum && tnum_next > reg->umax_value) {
+	if (umin_in_tnum && tnum_next > reg_umax(reg)) {
 		/* The u64 range and the tnum only overlap in umin.
 		 * u64:  ---[xxxxxx]-----
 		 * tnum: --xx----------x-
 		 */
-		___mark_reg_known(reg, reg->umin_value);
+		___mark_reg_known(reg, reg_umin(reg));
 	} else if (!umin_in_tnum && tnum_next == tmax) {
 		/* The u64 range and the tnum only overlap in the maximum value
 		 * represented by the tnum, called tmax.
@@ -1988,8 +2022,8 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg)
 		 * tnum: xx-----x--------
 		 */
 		___mark_reg_known(reg, tmax);
-	} else if (!umin_in_tnum && tnum_next <= reg->umax_value &&
-		   tnum_step(reg->var_off, tnum_next) > reg->umax_value) {
+	} else if (!umin_in_tnum && tnum_next <= reg_umax(reg) &&
+		   tnum_step(reg->var_off, tnum_next) > reg_umax(reg)) {
 		/* The u64 range and the tnum only overlap in between umin
 		 * (excluded) and umax.
 		 * u64:  ---[xxxxxx]-----
@@ -2005,329 +2039,21 @@ static void __update_reg_bounds(struct bpf_reg_state *reg)
 	__update_reg64_bounds(reg);
 }
 
-/* Uses signed min/max values to inform unsigned, and vice-versa */
+/* Intersect reg->r32 with the 32-bit range derived from reg->r64. */
 static void deduce_bounds_32_from_64(struct bpf_reg_state *reg)
 {
-	/* If upper 32 bits of u64/s64 range don't change, we can use lower 32
-	 * bits to improve our u32/s32 boundaries.
-	 *
-	 * E.g., the case where we have upper 32 bits as zero ([10, 20] in
-	 * u64) is pretty trivial, it's obvious that in u32 we'll also have
-	 * [10, 20] range. But this property holds for any 64-bit range as
-	 * long as upper 32 bits in that entire range of values stay the same.
-	 *
-	 * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311]
-	 * in decimal) has the same upper 32 bits throughout all the values in
-	 * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15])
-	 * range.
-	 *
-	 * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32,
-	 * following the rules outlined below about u64/s64 correspondence
-	 * (which equally applies to u32 vs s32 correspondence). In general it
-	 * depends on actual hexadecimal values of 32-bit range. They can form
-	 * only valid u32, or only valid s32 ranges in some cases.
-	 *
-	 * So we use all these insights to derive bounds for subregisters here.
-	 */
-	if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) {
-		/* u64 to u32 casting preserves validity of low 32 bits as
-		 * a range, if upper 32 bits are the same
-		 */
-		reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value);
-		reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value);
-
-		if ((s32)reg->umin_value <= (s32)reg->umax_value) {
-			reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
-			reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
-		}
-	}
-	if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) {
-		/* low 32 bits should form a proper u32 range */
-		if ((u32)reg->smin_value <= (u32)reg->smax_value) {
-			reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value);
-			reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value);
-		}
-		/* low 32 bits should form a proper s32 range */
-		if ((s32)reg->smin_value <= (s32)reg->smax_value) {
-			reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
-			reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
-		}
-	}
-	/* Special case where upper bits form a small sequence of two
-	 * sequential numbers (in 32-bit unsigned space, so 0xffffffff to
-	 * 0x00000000 is also valid), while lower bits form a proper s32 range
-	 * going from negative numbers to positive numbers. E.g., let's say we
-	 * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]).
-	 * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff,
-	 * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits,
-	 * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]).
-	 * Note that it doesn't have to be 0xffffffff going to 0x00000000 in
-	 * upper 32 bits. As a random example, s64 range
-	 * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range
-	 * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister.
-	 */
-	if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) &&
-	    (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) {
-		reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
-		reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
-	}
-	if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) &&
-	    (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) {
-		reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
-		reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
-	}
-}
-
-static void deduce_bounds_32_from_32(struct bpf_reg_state *reg)
-{
-	/* if u32 range forms a valid s32 range (due to matching sign bit),
-	 * try to learn from that
-	 */
-	if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) {
-		reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
-		reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
-	}
-	/* If we cannot cross the sign boundary, then signed and unsigned bounds
-	 * are the same, so combine.  This works even in the negative case, e.g.
-	 * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
-	 */
-	if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
-		reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value);
-		reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value);
-	} else {
-		if (reg->u32_max_value < (u32)reg->s32_min_value) {
-			/* See __reg64_deduce_bounds() for detailed explanation.
-			 * Refine ranges in the following situation:
-			 *
-			 * 0                                                   U32_MAX
-			 * |  [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx]              |
-			 * |----------------------------|----------------------------|
-			 * |xxxxx s32 range xxxxxxxxx]                       [xxxxxxx|
-			 * 0                     S32_MAX S32_MIN                    -1
-			 */
-			reg->s32_min_value = (s32)reg->u32_min_value;
-			reg->u32_max_value = min_t(u32, reg->u32_max_value, reg->s32_max_value);
-		} else if ((u32)reg->s32_max_value < reg->u32_min_value) {
-			/*
-			 * 0                                                   U32_MAX
-			 * |              [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx]  |
-			 * |----------------------------|----------------------------|
-			 * |xxxxxxxxx]                       [xxxxxxxxxxxx s32 range |
-			 * 0                     S32_MAX S32_MIN                    -1
-			 */
-			reg->s32_max_value = (s32)reg->u32_max_value;
-			reg->u32_min_value = max_t(u32, reg->u32_min_value, reg->s32_min_value);
-		}
-	}
-}
-
-static void deduce_bounds_64_from_64(struct bpf_reg_state *reg)
-{
-	/* If u64 range forms a valid s64 range (due to matching sign bit),
-	 * try to learn from that. Let's do a bit of ASCII art to see when
-	 * this is happening. Let's take u64 range first:
-	 *
-	 * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
-	 * |-------------------------------|--------------------------------|
-	 *
-	 * Valid u64 range is formed when umin and umax are anywhere in the
-	 * range [0, U64_MAX], and umin <= umax. u64 case is simple and
-	 * straightforward. Let's see how s64 range maps onto the same range
-	 * of values, annotated below the line for comparison:
-	 *
-	 * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
-	 * |-------------------------------|--------------------------------|
-	 * 0                        S64_MAX S64_MIN                        -1
-	 *
-	 * So s64 values basically start in the middle and they are logically
-	 * contiguous to the right of it, wrapping around from -1 to 0, and
-	 * then finishing as S64_MAX (0x7fffffffffffffff) right before
-	 * S64_MIN. We can try drawing the continuity of u64 vs s64 values
-	 * more visually as mapped to sign-agnostic range of hex values.
-	 *
-	 *  u64 start                                               u64 end
-	 *  _______________________________________________________________
-	 * /                                                               \
-	 * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
-	 * |-------------------------------|--------------------------------|
-	 * 0                        S64_MAX S64_MIN                        -1
-	 *                                / \
-	 * >------------------------------   ------------------------------->
-	 * s64 continues...        s64 end   s64 start          s64 "midpoint"
-	 *
-	 * What this means is that, in general, we can't always derive
-	 * something new about u64 from any random s64 range, and vice versa.
-	 *
-	 * But we can do that in two particular cases. One is when entire
-	 * u64/s64 range is *entirely* contained within left half of the above
-	 * diagram or when it is *entirely* contained in the right half. I.e.:
-	 *
-	 * |-------------------------------|--------------------------------|
-	 *     ^                   ^            ^                 ^
-	 *     A                   B            C                 D
-	 *
-	 * [A, B] and [C, D] are contained entirely in their respective halves
-	 * and form valid contiguous ranges as both u64 and s64 values. [A, B]
-	 * will be non-negative both as u64 and s64 (and in fact it will be
-	 * identical ranges no matter the signedness). [C, D] treated as s64
-	 * will be a range of negative values, while in u64 it will be
-	 * non-negative range of values larger than 0x8000000000000000.
-	 *
-	 * Now, any other range here can't be represented in both u64 and s64
-	 * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid
-	 * contiguous u64 ranges, but they are discontinuous in s64. [B, C]
-	 * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX],
-	 * for example. Similarly, valid s64 range [D, A] (going from negative
-	 * to positive values), would be two separate [D, U64_MAX] and [0, A]
-	 * ranges as u64. Currently reg_state can't represent two segments per
-	 * numeric domain, so in such situations we can only derive maximal
-	 * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64).
-	 *
-	 * So we use these facts to derive umin/umax from smin/smax and vice
-	 * versa only if they stay within the same "half". This is equivalent
-	 * to checking sign bit: lower half will have sign bit as zero, upper
-	 * half have sign bit 1. Below in code we simplify this by just
-	 * casting umin/umax as smin/smax and checking if they form valid
-	 * range, and vice versa. Those are equivalent checks.
-	 */
-	if ((s64)reg->umin_value <= (s64)reg->umax_value) {
-		reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
-		reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
-	}
-	/* If we cannot cross the sign boundary, then signed and unsigned bounds
-	 * are the same, so combine.  This works even in the negative case, e.g.
-	 * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
-	 */
-	if ((u64)reg->smin_value <= (u64)reg->smax_value) {
-		reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value);
-		reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value);
-	} else {
-		/* If the s64 range crosses the sign boundary, then it's split
-		 * between the beginning and end of the U64 domain. In that
-		 * case, we can derive new bounds if the u64 range overlaps
-		 * with only one end of the s64 range.
-		 *
-		 * In the following example, the u64 range overlaps only with
-		 * positive portion of the s64 range.
-		 *
-		 * 0                                                   U64_MAX
-		 * |  [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx]              |
-		 * |----------------------------|----------------------------|
-		 * |xxxxx s64 range xxxxxxxxx]                       [xxxxxxx|
-		 * 0                     S64_MAX S64_MIN                    -1
-		 *
-		 * We can thus derive the following new s64 and u64 ranges.
-		 *
-		 * 0                                                   U64_MAX
-		 * |  [xxxxxx u64 range xxxxx]                               |
-		 * |----------------------------|----------------------------|
-		 * |  [xxxxxx s64 range xxxxx]                               |
-		 * 0                     S64_MAX S64_MIN                    -1
-		 *
-		 * If they overlap in two places, we can't derive anything
-		 * because reg_state can't represent two ranges per numeric
-		 * domain.
-		 *
-		 * 0                                                   U64_MAX
-		 * |  [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx]        |
-		 * |----------------------------|----------------------------|
-		 * |xxxxx s64 range xxxxxxxxx]                    [xxxxxxxxxx|
-		 * 0                     S64_MAX S64_MIN                    -1
-		 *
-		 * The first condition below corresponds to the first diagram
-		 * above.
-		 */
-		if (reg->umax_value < (u64)reg->smin_value) {
-			reg->smin_value = (s64)reg->umin_value;
-			reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value);
-		} else if ((u64)reg->smax_value < reg->umin_value) {
-			/* This second condition considers the case where the u64 range
-			 * overlaps with the negative portion of the s64 range:
-			 *
-			 * 0                                                   U64_MAX
-			 * |              [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx]  |
-			 * |----------------------------|----------------------------|
-			 * |xxxxxxxxx]                       [xxxxxxxxxxxx s64 range |
-			 * 0                     S64_MAX S64_MIN                    -1
-			 */
-			reg->smax_value = (s64)reg->umax_value;
-			reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value);
-		}
-	}
+	cnum32_intersect_with(&reg->r32, cnum32_from_cnum64(reg->r64));
 }
 
+/* Combine reg->r64 with reg->r32 through cnum64_cnum32_intersect(). */
 static void deduce_bounds_64_from_32(struct bpf_reg_state *reg)
 {
-	/* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
-	 * values on both sides of 64-bit range in hope to have tighter range.
-	 * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
-	 * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
-	 * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
-	 * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of
-	 * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
-	 * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff].
-	 * We just need to make sure that derived bounds we are intersecting
-	 * with are well-formed ranges in respective s64 or u64 domain, just
-	 * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
-	 */
-	__u64 new_umin, new_umax;
-	__s64 new_smin, new_smax;
-
-	/* u32 -> u64 tightening, it's always well-formed */
-	new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value;
-	new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value;
-	reg->umin_value = max_t(u64, reg->umin_value, new_umin);
-	reg->umax_value = min_t(u64, reg->umax_value, new_umax);
-	/* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */
-	new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value;
-	new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value;
-	reg->smin_value = max_t(s64, reg->smin_value, new_smin);
-	reg->smax_value = min_t(s64, reg->smax_value, new_smax);
-
-	/* Here we would like to handle a special case after sign extending load,
-	 * when upper bits for a 64-bit range are all 1s or all 0s.
-	 *
-	 * Upper bits are all 1s when register is in a range:
-	 *   [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff]
-	 * Upper bits are all 0s when register is in a range:
-	 *   [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff]
-	 * Together this forms are continuous range:
-	 *   [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff]
-	 *
-	 * Now, suppose that register range is in fact tighter:
-	 *   [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R)
-	 * Also suppose that it's 32-bit range is positive,
-	 * meaning that lower 32-bits of the full 64-bit register
-	 * are in the range:
-	 *   [0x0000_0000, 0x7fff_ffff] (W)
-	 *
-	 * If this happens, then any value in a range:
-	 *   [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff]
-	 * is smaller than a lowest bound of the range (R):
-	 *   0xffff_ffff_8000_0000
-	 * which means that upper bits of the full 64-bit register
-	 * can't be all 1s, when lower bits are in range (W).
-	 *
-	 * Note that:
-	 *  - 0xffff_ffff_8000_0000 == (s64)S32_MIN
-	 *  - 0x0000_0000_7fff_ffff == (s64)S32_MAX
-	 * These relations are used in the conditions below.
-	 */
-	if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) {
-		reg->smin_value = reg->s32_min_value;
-		reg->smax_value = reg->s32_max_value;
-		reg->umin_value = reg->s32_min_value;
-		reg->umax_value = reg->s32_max_value;
-		reg->var_off = tnum_intersect(reg->var_off,
-					      tnum_range(reg->smin_value, reg->smax_value));
-	}
+	reg->r64 = cnum64_cnum32_intersect(reg->r64, reg->r32);
 }
 
 static void __reg_deduce_bounds(struct bpf_reg_state *reg)
 {
-	deduce_bounds_64_from_64(reg);
 	deduce_bounds_32_from_64(reg);
-	deduce_bounds_32_from_32(reg);
 	deduce_bounds_64_from_32(reg);
 }
 
@@ -2335,11 +2059,11 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg)
 static void __reg_bound_offset(struct bpf_reg_state *reg)
 {
 	struct tnum var64_off = tnum_intersect(reg->var_off,
-					       tnum_range(reg->umin_value,
-							  reg->umax_value));
+					       tnum_range(reg_umin(reg),
+							  reg_umax(reg)));
 	struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
-					       tnum_range(reg->u32_min_value,
-							  reg->u32_max_value));
+					       tnum_range(reg_u32_min(reg),
+							  reg_u32_max(reg)));
 
 	reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
 }
@@ -2365,35 +2089,28 @@ static void reg_bounds_sync(struct bpf_reg_state *reg)
 	__update_reg_bounds(reg);
 }
 
-static bool range_bounds_violation(struct bpf_reg_state *reg)
-{
-	return (reg->umin_value > reg->umax_value || reg->smin_value > reg->smax_value ||
-		reg->u32_min_value > reg->u32_max_value ||
-		reg->s32_min_value > reg->s32_max_value);
-}
-
+/* True if var_off is a constant but r64 is not the constant range at that value. */
 static bool const_tnum_range_mismatch(struct bpf_reg_state *reg)
 {
-	u64 uval = reg->var_off.value;
-	s64 sval = (s64)uval;
-
 	if (!tnum_is_const(reg->var_off))
 		return false;
 
-	return reg->umin_value != uval || reg->umax_value != uval ||
-	       reg->smin_value != sval || reg->smax_value != sval;
+	return !cnum64_is_const(reg->r64) || reg->r64.base != reg->var_off.value;
 }
 
+/* Same check for the 32-bit subregister: constant var_off subreg vs. r32. */
 static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg)
 {
-	u32 uval32 = tnum_subreg(reg->var_off).value;
-	s32 sval32 = (s32)uval32;
-
 	if (!tnum_subreg_is_const(reg->var_off))
 		return false;
 
-	return reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
-	       reg->s32_min_value != sval32 || reg->s32_max_value != sval32;
+	return !cnum32_is_const(reg->r32) || reg->r32.base != tnum_subreg(reg->var_off).value;
+}
+
+/* True if either reg->r32 or reg->r64 is reported empty. */
+static bool range_bounds_violation(struct bpf_reg_state *reg)
+{
+	return cnum32_is_empty(reg->r32) || cnum64_is_empty(reg->r64);
 }
 
 static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
@@ -2418,12 +2132,11 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
 
 	return 0;
 out:
-	verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
-		     "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)",
-		     ctx, msg, reg->umin_value, reg->umax_value,
-		     reg->smin_value, reg->smax_value,
-		     reg->u32_min_value, reg->u32_max_value,
-		     reg->s32_min_value, reg->s32_max_value,
+	verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s r64={.base=%#llx, .size=%#llx} "
+		     "r32={.base=%#x, .size=%#x} var_off=(%#llx, %#llx)",
+		     ctx, msg,
+		     reg->r64.base, reg->r64.size,
+		     reg->r32.base, reg->r32.size,
 		     reg->var_off.value, reg->var_off.mask);
 	if (env->test_reg_invariants)
 		return -EFAULT;
@@ -2431,44 +2144,16 @@ out:
 	return 0;
 }
 
-static bool __reg32_bound_s64(s32 a)
-{
-	return a >= 0 && a <= S32_MAX;
-}
-
-static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
-{
-	reg->umin_value = reg->u32_min_value;
-	reg->umax_value = reg->u32_max_value;
-
-	/* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
-	 * be positive otherwise set to worse case bounds and refine later
-	 * from tnum.
-	 */
-	if (__reg32_bound_s64(reg->s32_min_value) &&
-	    __reg32_bound_s64(reg->s32_max_value)) {
-		reg->smin_value = reg->s32_min_value;
-		reg->smax_value = reg->s32_max_value;
-	} else {
-		reg->smin_value = 0;
-		reg->smax_value = U32_MAX;
-	}
-}
-
 /* Mark a register as having a completely unknown (scalar) value. */
 void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
 {
-	/*
-	 * Clear type, off, and union(map_ptr, range) and
-	 * padding between 'type' and union
-	 */
-	memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
+	s32 subreg_def = reg->subreg_def;
+
+	/* Clear every field; subreg_def is the only one carried over. */
+	memset(reg, 0, sizeof(*reg));
 	reg->type = SCALAR_VALUE;
-	reg->id = 0;
-	reg->ref_obj_id = 0;
 	reg->var_off = tnum_unknown;
-	reg->frameno = 0;
-	reg->precise = false;
+	reg->subreg_def = subreg_def;
 	__mark_reg_unbounded(reg);
 }
 
@@ -2496,11 +2180,12 @@ static int __mark_reg_s32_range(struct bpf_verifier_env *env,
 {
 	struct bpf_reg_state *reg = regs + regno;
 
-	reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min);
-	reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max);
-
-	reg->smin_value = max_t(s64, reg->smin_value, s32_min);
-	reg->smax_value = min_t(s64, reg->smax_value, s32_max);
+	reg_set_srange32(reg,
+			 max_t(s32, reg_s32_min(reg), s32_min),
+			 min_t(s32, reg_s32_max(reg), s32_max));
+	reg_set_srange64(reg,
+			 max_t(s64, reg_smin(reg), s32_min),
+			 min_t(s64, reg_smax(reg), s32_max));
 
 	reg_bounds_sync(reg);
 
@@ -3295,50 +2980,14 @@ out:
 	return ret;
 }
 
-static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-				    int spi, int nr_slots)
+/* Mark the @nr_slots stack slots spi, spi - 1, ... as scratched. */
+static void mark_stack_slots_scratched(struct bpf_verifier_env *env,
+				       int spi, int nr_slots)
 {
 	int i;
 
 	for (i = 0; i < nr_slots; i++)
 		mark_stack_slot_scratched(env, spi - i);
-	return 0;
-}
-
-static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
-{
-	int spi;
-
-	/* For CONST_PTR_TO_DYNPTR, it must have already been done by
-	 * check_reg_arg in check_helper_call and mark_btf_func_reg_size in
-	 * check_kfunc_call.
-	 */
-	if (reg->type == CONST_PTR_TO_DYNPTR)
-		return 0;
-	spi = dynptr_get_spi(env, reg);
-	if (spi < 0)
-		return spi;
-	/* Caller ensures dynptr is valid and initialized, which means spi is in
-	 * bounds and spi is the first dynptr slot. Simply mark stack slot as
-	 * read.
-	 */
-	return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS);
-}
-
-static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-			  int spi, int nr_slots)
-{
-	return mark_stack_slot_obj_read(env, reg, spi, nr_slots);
-}
-
-static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
-{
-	int spi;
-
-	spi = irq_flag_get_spi(env, reg);
-	if (spi < 0)
-		return spi;
-	return mark_stack_slot_obj_read(env, reg, spi, 1);
 }
 
 /* This function is supposed to be used by the following 32-bit optimization
@@ -3491,17 +3139,12 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	return __check_reg_arg(env, state->regs, regno, t);
 }
 
-static int insn_stack_access_flags(int frameno, int spi)
-{
-	return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
-}
-
 static void mark_indirect_target(struct bpf_verifier_env *env, int idx)
 {
 	env->insn_aux_data[idx].indirect_target = true;
 }
 
-#define LR_FRAMENO_BITS	3
+#define LR_FRAMENO_BITS	4
 #define LR_SPI_BITS	6
 #define LR_ENTRY_BITS	(LR_SPI_BITS + LR_FRAMENO_BITS + 1)
 #define LR_SIZE_BITS	4
@@ -3510,7 +3153,11 @@ static void mark_indirect_target(struct bpf_verifier_env *env, int idx)
 #define LR_SIZE_MASK	((1ull << LR_SIZE_BITS)    - 1)
 #define LR_SPI_OFF	LR_FRAMENO_BITS
 #define LR_IS_REG_OFF	(LR_SPI_BITS + LR_FRAMENO_BITS)
-#define LINKED_REGS_MAX	6
+#define LINKED_REGS_MAX	5
+
+static_assert(MAX_CALL_FRAMES <= (1 << LR_FRAMENO_BITS));
+static_assert(LINKED_REGS_MAX < (1 << LR_SIZE_BITS));
+static_assert(LINKED_REGS_MAX * LR_ENTRY_BITS + LR_SIZE_BITS <= 64);
 
 struct linked_reg {
 	u8 frameno;
@@ -3534,10 +3181,11 @@ static struct linked_reg *linked_regs_push(struct linked_regs *s)
 	return NULL;
 }
 
-/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track
+/*
+ * Use u64 as a vector of 5 11-bit values, use first 4-bits to track
  * number of elements currently in stack.
- * Pack one history entry for linked registers as 10 bits in the following format:
- * - 3-bits frameno
+ * Pack one history entry for linked registers as 11 bits in the following format:
+ * - 4-bits frameno
  * - 6-bits spi_or_reg
  * - 1-bit  is_reg
  */
@@ -3733,12 +3381,6 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
 		src_reg->id = ++env->id_gen;
 }
 
-/* Copy src state preserving dst->parent and dst->live fields */
-static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
-{
-	*dst = *src;
-}
-
 static void save_register_state(struct bpf_verifier_env *env,
 				struct bpf_func_state *state,
 				int spi, struct bpf_reg_state *reg,
@@ -3746,7 +3388,7 @@ static void save_register_state(struct bpf_verifier_env *env,
 {
 	int i;
 
-	copy_register_state(&state->stack[spi].spilled_ptr, reg);
+	state->stack[spi].spilled_ptr = *reg;
 
 	for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
 		state->stack[spi].slot_type[i - 1] = STACK_SPILL;
@@ -3763,7 +3405,7 @@ static bool is_bpf_st_mem(struct bpf_insn *insn)
 
 static int get_reg_width(struct bpf_reg_state *reg)
 {
-	return fls64(reg->umax_value);
+	return fls64(reg_umax(reg));
 }
 
 /* See comment for mark_fastcall_pattern_for_call() */
@@ -3816,7 +3458,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
 	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
 	struct bpf_reg_state *reg = NULL;
-	int insn_flags = insn_stack_access_flags(state->frameno, spi);
+	int insn_flags = INSN_F_STACK_ACCESS;
+	int hist_spi = spi, hist_frame = state->frameno;
 
 	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
 	 * so it's aligned access and [off, off + size) are within stack limits
@@ -3912,11 +3555,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	}
 
 	if (insn_flags)
-		return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0);
+		return bpf_push_jmp_history(env, env->cur_state, insn_flags,
+					    hist_spi, hist_frame, 0);
 	return 0;
 }
 
-/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
+/* Write the stack: 'stack[ptr_reg + off] = value_regno'. 'ptr_reg' is
  * known to contain a variable offset.
  * This function checks whether the write is permitted and conservatively
  * tracks the effects of the write, considering that each stack slot in the
@@ -3937,13 +3581,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 static int check_stack_write_var_off(struct bpf_verifier_env *env,
 				     /* func where register points to */
 				     struct bpf_func_state *state,
-				     int ptr_regno, int off, int size,
+				     struct bpf_reg_state *ptr_reg, int off, int size,
 				     int value_regno, int insn_idx)
 {
 	struct bpf_func_state *cur; /* state of the current function */
 	int min_off, max_off;
 	int i, err;
-	struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+	struct bpf_reg_state *value_reg = NULL;
 	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
 	bool writing_zero = false;
 	/* set if the fact that we're writing a zero is used to let any
@@ -3952,9 +3596,8 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
 	bool zero_used = false;
 
 	cur = env->cur_state->frame[env->cur_state->curframe];
-	ptr_reg = &cur->regs[ptr_regno];
-	min_off = ptr_reg->smin_value + off;
-	max_off = ptr_reg->smax_value + off + size;
+	min_off = reg_smin(ptr_reg) + off;
+	max_off = reg_smax(ptr_reg) + off + size;
 	if (value_regno >= 0)
 		value_reg = &cur->regs[value_regno];
 	if ((value_reg && bpf_register_is_null(value_reg)) ||
@@ -4109,7 +3752,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
 	struct bpf_reg_state *reg;
 	u8 *stype, type;
-	int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
+	int insn_flags = INSN_F_STACK_ACCESS;
+	int hist_spi = spi, hist_frame = reg_state->frameno;
 
 	stype = reg_state->stack[spi].slot_type;
 	reg = &reg_state->stack[spi].spilled_ptr;
@@ -4146,7 +3790,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 					 * with the destination register on fill.
 					 */
 					assign_scalar_id_before_mov(env, reg);
-				copy_register_state(&state->regs[dst_regno], reg);
+				state->regs[dst_regno] = *reg;
 				state->regs[dst_regno].subreg_def = subreg_def;
 
 				/* Break the relation on a narrowing fill.
@@ -4201,7 +3845,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 				 * with the destination register on fill.
 				 */
 				assign_scalar_id_before_mov(env, reg);
-			copy_register_state(&state->regs[dst_regno], reg);
+			state->regs[dst_regno] = *reg;
 			/* mark reg as written since spilled pointer state likely
 			 * has its liveness marks cleared by is_state_visited()
 			 * which resets stack/reg liveness for state transitions
@@ -4240,7 +3884,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 		insn_flags = 0; /* we are not restoring spilled register */
 	}
 	if (insn_flags)
-		return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0);
+		return bpf_push_jmp_history(env, env->cur_state, insn_flags,
+					    hist_spi, hist_frame, 0);
 	return 0;
 }
 
@@ -4249,8 +3894,8 @@ enum bpf_access_src {
 	ACCESS_HELPER = 2,  /* the access is performed by a helper */
 };
 
-static int check_stack_range_initialized(struct bpf_verifier_env *env,
-					 int regno, int off, int access_size,
+static int check_stack_range_initialized(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+					 argno_t argno, int off, int access_size,
 					 bool zero_size_allowed,
 					 enum bpf_access_type type,
 					 struct bpf_call_arg_meta *meta);
@@ -4260,37 +3905,35 @@ static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
 	return cur_regs(env) + regno;
 }
 
-/* Read the stack at 'ptr_regno + off' and put the result into the register
+/* Read the stack at 'reg + off' and put the result into the register
  * 'dst_regno'.
- * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'),
+ * 'off' includes the pointer register's fixed offset (i.e. 'reg->off'),
  * but not its variable offset.
  * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
  *
  * As opposed to check_stack_read_fixed_off, this function doesn't deal with
  * filling registers (i.e. reads of spilled register cannot be detected when
  * the offset is not fixed). We conservatively mark 'dst_regno' as containing
- * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
+ * SCALAR_VALUE. That's why we assert that 'reg' has a variable
  * offset; for a fixed offset check_stack_read_fixed_off should be used
  * instead.
  */
-static int check_stack_read_var_off(struct bpf_verifier_env *env,
-				    int ptr_regno, int off, int size, int dst_regno)
+static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+				    argno_t ptr_argno, int off, int size, int dst_regno)
 {
-	/* The state of the source register. */
-	struct bpf_reg_state *reg = reg_state(env, ptr_regno);
 	struct bpf_func_state *ptr_state = bpf_func(env, reg);
 	int err;
 	int min_off, max_off;
 
 	/* Note that we pass a NULL meta, so raw access will not be permitted.
 	 */
-	err = check_stack_range_initialized(env, ptr_regno, off, size,
+	err = check_stack_range_initialized(env, reg, ptr_argno, off, size,
 					    false, BPF_READ, NULL);
 	if (err)
 		return err;
 
-	min_off = reg->smin_value + off;
-	max_off = reg->smax_value + off;
+	min_off = reg_smin(reg) + off;
+	max_off = reg_smax(reg) + off;
 	mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
 	check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off);
 	return 0;
@@ -4306,10 +3949,9 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env,
  * can be -1, meaning that the read value is not going to a register.
  */
 static int check_stack_read(struct bpf_verifier_env *env,
-			    int ptr_regno, int off, int size,
+			    struct bpf_reg_state *reg, argno_t ptr_argno, int off, int size,
 			    int dst_regno)
 {
-	struct bpf_reg_state *reg = reg_state(env, ptr_regno);
 	struct bpf_func_state *state = bpf_func(env, reg);
 	int err;
 	/* Some accesses are only permitted with a static offset. */
@@ -4345,7 +3987,7 @@ static int check_stack_read(struct bpf_verifier_env *env,
 		 * than fixed offset ones. Note that dst_regno >= 0 on this
 		 * branch.
 		 */
-		err = check_stack_read_var_off(env, ptr_regno, off, size,
+		err = check_stack_read_var_off(env, reg, ptr_argno, off, size,
 					       dst_regno);
 	}
 	return err;
@@ -4355,17 +3997,16 @@ static int check_stack_read(struct bpf_verifier_env *env,
 /* check_stack_write dispatches to check_stack_write_fixed_off or
  * check_stack_write_var_off.
  *
- * 'ptr_regno' is the register used as a pointer into the stack.
+ * 'reg' is the register used as a pointer into the stack.
  * 'value_regno' is the register whose value we're writing to the stack. It can
  * be -1, meaning that we're not writing from a register.
  *
  * The caller must ensure that the offset falls within the maximum stack size.
  */
 static int check_stack_write(struct bpf_verifier_env *env,
-			     int ptr_regno, int off, int size,
+			     struct bpf_reg_state *reg, int off, int size,
 			     int value_regno, int insn_idx)
 {
-	struct bpf_reg_state *reg = reg_state(env, ptr_regno);
 	struct bpf_func_state *state = bpf_func(env, reg);
 	int err;
 
@@ -4378,28 +4019,135 @@ static int check_stack_write(struct bpf_verifier_env *env,
 		 * than fixed offset ones.
 		 */
 		err = check_stack_write_var_off(env, state,
-						ptr_regno, off, size,
+						reg, off, size,
 						value_regno, insn_idx);
 	}
 	return err;
 }
 
-static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
+/*
+ * Record a store to the outgoing stack arg area. 'off' is a negative offset from
+ * r11 (e.g. -8 for arg6, -16 for arg7); a NULL 'value_reg' means a BPF_ST immediate.
+ */
+static int check_stack_arg_write(struct bpf_verifier_env *env, struct bpf_func_state *state,
+				 int off, struct bpf_reg_state *value_reg)
+{
+	int max_stack_arg_regs = MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS;
+	struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno];
+	int spi = -off / BPF_REG_SIZE - 1;	/* 0-based slot; -BPF_REG_SIZE is slot 0 */
+	struct bpf_reg_state *arg;
+	int err;
+
+	if (spi >= max_stack_arg_regs) {
+		verbose(env, "stack arg write offset %d exceeds max %d stack args\n",
+			off, max_stack_arg_regs);
+		return -EINVAL;
+	}
+	/* NOTE(review): spi < 0 (off > -BPF_REG_SIZE) is not rejected here; confirm callers do. */
+	err = grow_stack_arg_slots(env, state, spi + 1);
+	if (err)
+		return err;
+
+	/* Track the max outgoing stack arg slot count. */
+	if (spi + 1 > subprog->max_out_stack_arg_cnt)
+		subprog->max_out_stack_arg_cnt = spi + 1;
+
+	if (value_reg) {
+		state->stack_arg_regs[spi] = *value_reg;
+	} else {
+		/* BPF_ST: no source register, so track the immediate as a known scalar */
+		arg = &state->stack_arg_regs[spi];
+		arg->type = SCALAR_VALUE;
+		__mark_reg_known(arg, env->prog->insnsi[env->insn_idx].imm);
+	}
+	state->no_stack_arg_load = true;	/* check_stack_arg_read() rejects loads once set */
+	return bpf_push_jmp_history(env, env->cur_state,
+				    INSN_F_STACK_ARG_ACCESS, spi, 0, 0);
+}
+
+/*
+ * Load an incoming stack arg into 'dst_regno' by copying the caller's tracked state.
+ * 'off' is a positive offset from r11 (e.g. +8 for arg6, +16 for arg7).
+ */
+static int check_stack_arg_read(struct bpf_verifier_env *env, struct bpf_func_state *state,
+				int off, int dst_regno)
+{
+	struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno];
+	struct bpf_verifier_state *vstate = env->cur_state;
+	int spi = off / BPF_REG_SIZE - 1;	/* 0-based slot; +BPF_REG_SIZE is slot 0 */
+	struct bpf_func_state *caller, *cur;
+	struct bpf_reg_state *arg;
+
+	if (state->no_stack_arg_load) {	/* set e.g. by check_stack_arg_write() */
+		verbose(env, "r11 load must be before any r11 store or call insn\n");
+		return -EINVAL;
+	}
+	/* NOTE(review): off < BPF_REG_SIZE gives spi < 0, not caught below; verify callers. */
+	if (spi + 1 > bpf_in_stack_arg_cnt(subprog)) {
+		verbose(env, "invalid read from stack arg off %d depth %d\n",
+			off, bpf_in_stack_arg_cnt(subprog) * BPF_REG_SIZE);
+		return -EACCES;
+	}
+	/* Copy the slot state from the caller's frame; assumes curframe > 0 (TODO confirm). */
+	caller = vstate->frame[vstate->curframe - 1];
+	arg = &caller->stack_arg_regs[spi];
+	cur = vstate->frame[vstate->curframe];
+	cur->regs[dst_regno] = *arg;
+	return bpf_push_jmp_history(env, env->cur_state,
+				    INSN_F_STACK_ARG_ACCESS, spi, 0, 0);
+}
+
+static int mark_stack_arg_precision(struct bpf_verifier_env *env, int arg_idx)
+{
+	struct bpf_func_state *caller = cur_func(env);
+	int spi = arg_idx - MAX_BPF_FUNC_REG_ARGS;	/* first stack arg is slot 0 */
+
+	bt_set_frame_stack_arg_slot(&env->bt, caller->frameno, spi);
+	return mark_chain_precision_batch(env, env->cur_state);
+}
+
+static int check_outgoing_stack_args(struct bpf_verifier_env *env, struct bpf_func_state *caller,
+				     int nargs)
+{
+	int i, spi;
+
+	for (i = MAX_BPF_FUNC_REG_ARGS; i < nargs; i++) {	/* stack-passed args only */
+		spi = i - MAX_BPF_FUNC_REG_ARGS;	/* slot index of 0-based arg i */
+		if (spi >= caller->out_stack_arg_cnt ||
+		    caller->stack_arg_regs[spi].type == NOT_INIT) {
+			verbose(env, "callee expects %d args, stack arg%d is not initialized\n",
+				nargs, spi + 1);
+			return -EFAULT;
+		}
+	}
+
+	return 0;
+}
+
+static struct bpf_reg_state *get_func_arg_reg(struct bpf_func_state *caller,
+					      struct bpf_reg_state *regs, int arg)
+{
+	if (arg < MAX_BPF_FUNC_REG_ARGS)
+		return &regs[arg + 1];	/* 0-based arg N is regs[N + 1] */
+	/* Remaining args come from the caller's stack arg slots, first one in slot 0. */
+	return &caller->stack_arg_regs[arg - MAX_BPF_FUNC_REG_ARGS];
+}
+
+static int check_map_access_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 				 int off, int size, enum bpf_access_type type)
 {
-	struct bpf_reg_state *reg = reg_state(env, regno);
 	struct bpf_map *map = reg->map_ptr;
 	u32 cap = bpf_map_flags_to_cap(map);
 
 	if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
 		verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n",
-			map->value_size, reg->smin_value + off, size);
+			map->value_size, reg_smin(reg) + off, size);
 		return -EACCES;
 	}
 
 	if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) {
 		verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n",
-			map->value_size, reg->smin_value + off, size);
+			map->value_size, reg_smin(reg) + off, size);
 		return -EACCES;
 	}
 
@@ -4407,17 +4155,15 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
 }
 
 /* check read/write into memory region (e.g., map value, ringbuf sample, etc) */
-static int __check_mem_access(struct bpf_verifier_env *env, int regno,
+static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
 			      int off, int size, u32 mem_size,
 			      bool zero_size_allowed)
 {
 	bool size_ok = size > 0 || (size == 0 && zero_size_allowed);
-	struct bpf_reg_state *reg;
 
 	if (off >= 0 && size_ok && (u64)off + size <= mem_size)
 		return 0;
 
-	reg = &cur_regs(env)[regno];
 	switch (reg->type) {
 	case PTR_TO_MAP_KEY:
 		verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
@@ -4430,8 +4176,8 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
 	case PTR_TO_PACKET:
 	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET_END:
-		verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
-			off, size, regno, reg->id, off, mem_size);
+		verbose(env, "invalid access to packet, off=%d size=%d, %s(id=%d,off=%d,r=%d)\n",
+			off, size, reg_arg_name(env, argno), reg->id, off, mem_size);
 		break;
 	case PTR_TO_CTX:
 		verbose(env, "invalid access to context, ctx_size=%d off=%d size=%d\n",
@@ -4447,13 +4193,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
 }
 
 /* check read/write into a memory region with possible variable offset */
-static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
+static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
 				   int off, int size, u32 mem_size,
 				   bool zero_size_allowed)
 {
-	struct bpf_verifier_state *vstate = env->cur_state;
-	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *reg = &state->regs[regno];
 	int err;
 
 	/* We may have adjusted the register pointing to memory region, so we
@@ -4466,36 +4209,36 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
 	 * index'es we need to make sure that whatever we use
 	 * will have a set floor within our range.
 	 */
-	if (reg->smin_value < 0 &&
-	    (reg->smin_value == S64_MIN ||
-	     (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) ||
-	      reg->smin_value + off < 0)) {
-		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
-			regno);
+	if (reg_smin(reg) < 0 &&
+	    (reg_smin(reg) == S64_MIN ||
+	     (off + reg_smin(reg) != (s64)(s32)(off + reg_smin(reg))) ||
+	      reg_smin(reg) + off < 0)) {
+		verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+			reg_arg_name(env, argno));
 		return -EACCES;
 	}
-	err = __check_mem_access(env, regno, reg->smin_value + off, size,
+	err = __check_mem_access(env, reg, argno, reg_smin(reg) + off, size,
 				 mem_size, zero_size_allowed);
 	if (err) {
-		verbose(env, "R%d min value is outside of the allowed memory range\n",
-			regno);
+		verbose(env, "%s min value is outside of the allowed memory range\n",
+			reg_arg_name(env, argno));
 		return err;
 	}
 
 	/* If we haven't set a max value then we need to bail since we can't be
 	 * sure we won't do bad things.
-	 * If reg->umax_value + off could overflow, treat that as unbounded too.
+	 * If reg_umax(reg) + off could overflow, treat that as unbounded too.
 	 */
-	if (reg->umax_value >= BPF_MAX_VAR_OFF) {
-		verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n",
-			regno);
+	if (reg_umax(reg) >= BPF_MAX_VAR_OFF) {
+		verbose(env, "%s unbounded memory access, make sure to bounds check any such access\n",
+			reg_arg_name(env, argno));
 		return -EACCES;
 	}
-	err = __check_mem_access(env, regno, reg->umax_value + off, size,
+	err = __check_mem_access(env, reg, argno, reg_umax(reg) + off, size,
 				 mem_size, zero_size_allowed);
 	if (err) {
-		verbose(env, "R%d max value is outside of the allowed memory range\n",
-			regno);
+		verbose(env, "%s max value is outside of the allowed memory range\n",
+			reg_arg_name(env, argno));
 		return err;
 	}
 
@@ -4503,7 +4246,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
 }
 
 static int __check_ptr_off_reg(struct bpf_verifier_env *env,
-			       const struct bpf_reg_state *reg, int regno,
+			       const struct bpf_reg_state *reg, argno_t argno,
 			       bool fixed_off_ok)
 {
 	/* Access to this pointer-typed register or passing it to a helper
@@ -4519,15 +4262,15 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
-	if (reg->smin_value < 0) {
-		verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n",
-			reg_type_str(env, reg->type), regno, reg->var_off.value);
+	if (reg_smin(reg) < 0) {
+		verbose(env, "negative offset %s ptr %s off=%lld disallowed\n",
+			reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value);
 		return -EACCES;
 	}
 
 	if (!fixed_off_ok && reg->var_off.value != 0) {
-		verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n",
-			reg_type_str(env, reg->type), regno, reg->var_off.value);
+		verbose(env, "dereference of modified %s ptr %s off=%lld disallowed\n",
+			reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value);
 		return -EACCES;
 	}
 
@@ -4537,7 +4280,7 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
 static int check_ptr_off_reg(struct bpf_verifier_env *env,
 		             const struct bpf_reg_state *reg, int regno)
 {
-	return __check_ptr_off_reg(env, reg, regno, false);
+	return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false);
 }
 
 static int map_kptr_match_type(struct bpf_verifier_env *env,
@@ -4573,9 +4316,9 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
 	 * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
 	 * normal store of unreferenced kptr, we must ensure var_off is zero.
 	 * Since ref_ptr cannot be accessed directly by BPF insns, check for
-	 * reg->ref_obj_id is not needed here.
+	 * reg->id is not needed here.
 	 */
-	if (__check_ptr_off_reg(env, reg, regno, true))
+	if (__check_ptr_off_reg(env, reg, argno_from_reg(regno), true))
 		return -EACCES;
 
 	/* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
@@ -4718,7 +4461,7 @@ static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno,
 	return 0;
 }
 
-static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
+static int check_map_kptr_access(struct bpf_verifier_env *env,
 				 int value_regno, int insn_idx,
 				 struct btf_field *kptr_field)
 {
@@ -4795,19 +4538,16 @@ static u32 map_mem_size(const struct bpf_map *map)
 }
 
 /* check read/write into a map element with possible variable offset */
-static int check_map_access(struct bpf_verifier_env *env, u32 regno,
+static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
 			    int off, int size, bool zero_size_allowed,
 			    enum bpf_access_src src)
 {
-	struct bpf_verifier_state *vstate = env->cur_state;
-	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *reg = &state->regs[regno];
 	struct bpf_map *map = reg->map_ptr;
 	u32 mem_size = map_mem_size(map);
 	struct btf_record *rec;
 	int err, i;
 
-	err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed);
+	err = check_mem_region_access(env, reg, argno, off, size, mem_size, zero_size_allowed);
 	if (err)
 		return err;
 
@@ -4822,8 +4562,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 		 * this program. To check that [x1, x2) overlaps with [y1, y2),
 		 * it is sufficient to check x1 < y2 && y1 < x2.
 		 */
-		if (reg->smin_value + off < p + field->size &&
-		    p < reg->umax_value + off + size) {
+		if (reg_smin(reg) + off < p + field->size &&
+		    p < reg_umax(reg) + off + size) {
 			switch (field->type) {
 			case BPF_KPTR_UNREF:
 			case BPF_KPTR_REF:
@@ -4903,30 +4643,29 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	}
 }
 
-static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off,
 			       int size, bool zero_size_allowed)
 {
-	struct bpf_reg_state *reg = reg_state(env, regno);
 	int err;
 
 	if (reg->range < 0) {
-		verbose(env, "R%d offset is outside of the packet\n", regno);
+		verbose(env, "%s offset is outside of the packet\n", reg_arg_name(env, argno));
 		return -EINVAL;
 	}
 
-	err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed);
+	err = check_mem_region_access(env, reg, argno, off, size, reg->range, zero_size_allowed);
 	if (err)
 		return err;
 
 	/* __check_mem_access has made sure "off + size - 1" is within u16.
-	 * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
+	 * reg_umax(reg) can't be bigger than MAX_PACKET_OFF which is 0xffff,
 	 * otherwise find_good_pkt_pointers would have refused to set range info
 	 * that __check_mem_access would have rejected this pkt access.
-	 * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
+	 * Therefore, "off + reg_umax(reg) + size - 1" won't overflow u32.
 	 */
 	env->prog->aux->max_pkt_offset =
 		max_t(u32, env->prog->aux->max_pkt_offset,
-		      off + reg->umax_value + size - 1);
+		      off + reg_umax(reg) + size - 1);
 
 	return 0;
 }
@@ -4950,8 +4689,8 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of
 		 * type of narrower access.
 		 */
 		if (base_type(info->reg_type) == PTR_TO_BTF_ID) {
-			if (info->ref_obj_id &&
-			    !find_reference_state(env->cur_state, info->ref_obj_id)) {
+			if (info->ref_id &&
+			    !find_reference_state(env->cur_state, info->ref_id)) {
 				verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n",
 					off);
 				return -EACCES;
@@ -4969,7 +4708,7 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of
 	return -EACCES;
 }
 
-static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
+static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno,
 			    int off, int access_size, enum bpf_access_type t,
 			    struct bpf_insn_access_aux *info)
 {
@@ -4979,17 +4718,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	 */
 	bool var_off_ok = is_var_ctx_off_allowed(env->prog);
 	bool fixed_off_ok = !env->ops->convert_ctx_access;
-	struct bpf_reg_state *regs = cur_regs(env);
-	struct bpf_reg_state *reg = regs + regno;
 	int err;
 
 	if (var_off_ok)
-		err = check_mem_region_access(env, regno, off, access_size, U16_MAX, false);
+		err = check_mem_region_access(env, reg, argno, off, access_size, U16_MAX, false);
 	else
-		err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
+		err = __check_ptr_off_reg(env, reg, argno, fixed_off_ok);
 	if (err)
 		return err;
-	off += reg->umax_value;
+	off += reg_umax(reg);
 
 	err = __check_ctx_access(env, insn_idx, off, access_size, t, info);
 	if (err)
@@ -4997,9 +4734,21 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	return err;
 }
 
-static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
-				  int size)
+static int check_flow_keys_access(struct bpf_verifier_env *env,
+				  struct bpf_reg_state *reg, argno_t argno,
+				  int off, int size)
 {
+	/* Only a constant offset is allowed here; fold it into off. */
+	if (!tnum_is_const(reg->var_off)) {
+		char tn_buf[48];
+
+		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+		verbose(env, "%s invalid variable offset to flow keys: off=%d, var_off=%s\n",
+			reg_arg_name(env, argno), off, tn_buf);
+		return -EACCES;
+	}
+	off += reg->var_off.value;
+
 	if (size < 0 || off < 0 ||
 	    (u64)off + size > sizeof(struct bpf_flow_keys)) {
 		verbose(env, "invalid access to flow keys off=%d size=%d\n",
@@ -5010,16 +4759,15 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
 }
 
 static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
-			     u32 regno, int off, int size,
+			     struct bpf_reg_state *reg, argno_t argno, int off, int size,
 			     enum bpf_access_type t)
 {
-	struct bpf_reg_state *reg = reg_state(env, regno);
 	struct bpf_insn_access_aux info = {};
 	bool valid;
 
-	if (reg->smin_value < 0) {
-		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
-			regno);
+	if (reg_smin(reg) < 0) {
+		verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+			reg_arg_name(env, argno));
 		return -EACCES;
 	}
 
@@ -5047,8 +4795,8 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
 		return 0;
 	}
 
-	verbose(env, "R%d invalid %s access off=%d size=%d\n",
-		regno, reg_type_str(env, reg->type), off, size);
+	verbose(env, "%s invalid %s access off=%d size=%d\n",
+		reg_arg_name(env, argno), reg_type_str(env, reg->type), off, size);
 
 	return -EACCES;
 }
@@ -5123,10 +4871,10 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
 	[CONST_PTR_TO_MAP] = btf_bpf_map_id,
 };
 
-static bool is_trusted_reg(const struct bpf_reg_state *reg)
+static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg)
 {
 	/* A referenced register is always trusted. */
-	if (reg->ref_obj_id)
+	if (reg_is_referenced(env, reg))
 		return true;
 
 	/* Types listed in the reg2btf_ids are always trusted */
@@ -5368,7 +5116,10 @@ process_func:
 	}
 
 	subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
-	if (priv_stack_supported) {
+	if (IS_ENABLED(CONFIG_X86_64) && subprog[idx].stack_arg_cnt) {
+		/* x86-64 uses R9 for both private stack frame pointer and arg6. */
+		subprog[idx].priv_stack_mode = NO_PRIV_STACK;
+	} else if (priv_stack_supported) {
 		/* Request private stack support only if the subprog stack
 		 * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to
 		 * avoid jit penalty if the stack usage is small.
@@ -5379,6 +5130,8 @@ process_func:
 	}
 
 	if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+		if (subprog_depth > env->max_stack_depth)
+			env->max_stack_depth = subprog_depth;
 		if (subprog_depth > MAX_BPF_STACK) {
 			verbose(env, "stack size of subprog %d is %d. Too large\n",
 				idx, subprog_depth);
@@ -5386,6 +5139,8 @@ process_func:
 		}
 	} else {
 		depth += subprog_depth;
+		if (depth > env->max_stack_depth)
+			env->max_stack_depth = depth;
 		if (depth > MAX_BPF_STACK) {
 			total = 0;
 			for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller)
@@ -5472,14 +5227,23 @@ continue_func:
 	 * this info will be utilized by JIT so that we will be preserving the
 	 * tail call counter throughout bpf2bpf calls combined with tailcalls
 	 */
-	if (tail_call_reachable)
+	if (tail_call_reachable) {
 		for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) {
 			if (subprog[tmp].is_exception_cb) {
 				verbose(env, "cannot tail call within exception cb\n");
 				return -EINVAL;
 			}
+			if (subprog[tmp].stack_arg_cnt) {
+				verbose(env, "tail_calls are not allowed in programs with stack args\n");
+				return -EINVAL;
+			}
 			subprog[tmp].tail_call_reachable = true;
 		}
+	} else if (!idx && subprog[0].has_tail_call && subprog[0].stack_arg_cnt) {
+		verbose(env, "tail_calls are not allowed in programs with stack args\n");
+		return -EINVAL;
+	}
+
 	if (subprog[0].tail_call_reachable)
 		env->prog->aux->tail_call_reachable = true;
 
@@ -5498,6 +5262,9 @@ continue_func:
 	frame = dinfo[idx].frame;
 	i = dinfo[idx].ret_insn;
 
+	/* reset tail_call_reachable to the parent's actual state */
+	tail_call_reachable = subprog[idx].tail_call_reachable;
+
 	goto continue_func;
 }
 
@@ -5558,12 +5325,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 static int __check_buffer_access(struct bpf_verifier_env *env,
 				 const char *buf_info,
 				 const struct bpf_reg_state *reg,
-				 int regno, int off, int size)
+				 argno_t argno, int off, int size)
 {
 	if (off < 0) {
 		verbose(env,
-			"R%d invalid %s buffer access: off=%d, size=%d\n",
-			regno, buf_info, off, size);
+			"%s invalid %s buffer access: off=%d, size=%d\n",
+			reg_arg_name(env, argno), buf_info, off, size);
 		return -EACCES;
 	}
 	if (!tnum_is_const(reg->var_off)) {
@@ -5571,8 +5338,8 @@ static int __check_buffer_access(struct bpf_verifier_env *env,
 
 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 		verbose(env,
-			"R%d invalid variable buffer offset: off=%d, var_off=%s\n",
-			regno, off, tn_buf);
+			"%s invalid variable buffer offset: off=%d, var_off=%s\n",
+			reg_arg_name(env, argno), off, tn_buf);
 		return -EACCES;
 	}
 
@@ -5581,11 +5348,11 @@ static int __check_buffer_access(struct bpf_verifier_env *env,
 
 static int check_tp_buffer_access(struct bpf_verifier_env *env,
 				  const struct bpf_reg_state *reg,
-				  int regno, int off, int size)
+				  argno_t argno, int off, int size)
 {
 	int err;
 
-	err = __check_buffer_access(env, "tracepoint", reg, regno, off, size);
+	err = __check_buffer_access(env, "tracepoint", reg, argno, off, size);
 	if (err)
 		return err;
 
@@ -5597,14 +5364,14 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env,
 
 static int check_buffer_access(struct bpf_verifier_env *env,
 			       const struct bpf_reg_state *reg,
-			       int regno, int off, int size,
+			       argno_t argno, int off, int size,
 			       bool zero_size_allowed,
 			       u32 *max_access)
 {
 	const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr";
 	int err;
 
-	err = __check_buffer_access(env, buf_info, reg, regno, off, size);
+	err = __check_buffer_access(env, buf_info, reg, argno, off, size);
 	if (err)
 		return err;
 
@@ -5617,7 +5384,7 @@ static int check_buffer_access(struct bpf_verifier_env *env,
 static void zext_32_to_64(struct bpf_reg_state *reg)
 {
 	reg->var_off = tnum_subreg(reg->var_off);
-	__reg_assign_32_into_64(reg);
+	reg_set_urange64(reg, reg_u32_min(reg), reg_u32_max(reg));
 }
 
 /* truncate register to smaller size (in bytes)
@@ -5632,15 +5399,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
 
 	/* fix arithmetic bounds */
 	mask = ((u64)1 << (size * 8)) - 1;
-	if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
-		reg->umin_value &= mask;
-		reg->umax_value &= mask;
-	} else {
-		reg->umin_value = 0;
-		reg->umax_value = mask;
-	}
-	reg->smin_value = reg->umin_value;
-	reg->smax_value = reg->umax_value;
+	if ((reg_umin(reg) & ~mask) == (reg_umax(reg) & ~mask))
+		reg_set_urange64(reg, reg_umin(reg) & mask, reg_umax(reg) & mask);
+	else
+		reg_set_urange64(reg, 0, mask);
 
 	/* If size is smaller than 32bit register the 32bit register
 	 * values are also truncated so we push 64-bit bounds into
@@ -5655,19 +5417,16 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
 static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
 {
 	if (size == 1) {
-		reg->smin_value = reg->s32_min_value = S8_MIN;
-		reg->smax_value = reg->s32_max_value = S8_MAX;
+		reg_set_srange64(reg, S8_MIN, S8_MAX);
+		reg_set_srange32(reg, S8_MIN, S8_MAX);
 	} else if (size == 2) {
-		reg->smin_value = reg->s32_min_value = S16_MIN;
-		reg->smax_value = reg->s32_max_value = S16_MAX;
+		reg_set_srange64(reg, S16_MIN, S16_MAX);
+		reg_set_srange32(reg, S16_MIN, S16_MAX);
 	} else {
 		/* size == 4 */
-		reg->smin_value = reg->s32_min_value = S32_MIN;
-		reg->smax_value = reg->s32_max_value = S32_MAX;
+		reg_set_srange64(reg, S32_MIN, S32_MAX);
+		reg_set_srange32(reg, S32_MIN, S32_MAX);
 	}
-	reg->umin_value = reg->u32_min_value = 0;
-	reg->umax_value = U64_MAX;
-	reg->u32_max_value = U32_MAX;
 	reg->var_off = tnum_unknown;
 }
 
@@ -5688,29 +5447,27 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
 			reg->var_off = tnum_const((s32)u64_cval);
 
 		u64_cval = reg->var_off.value;
-		reg->smax_value = reg->smin_value = u64_cval;
-		reg->umax_value = reg->umin_value = u64_cval;
-		reg->s32_max_value = reg->s32_min_value = u64_cval;
-		reg->u32_max_value = reg->u32_min_value = u64_cval;
+		reg->r64 = cnum64_from_urange(u64_cval, u64_cval);
+		reg->r32 = cnum32_from_urange((u32)u64_cval, (u32)u64_cval);
 		return;
 	}
 
-	top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits;
-	top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits;
+	top_smax_value = ((u64)reg_smax(reg) >> num_bits) << num_bits;
+	top_smin_value = ((u64)reg_smin(reg) >> num_bits) << num_bits;
 
 	if (top_smax_value != top_smin_value)
 		goto out;
 
 	/* find the s64_min and s64_min after sign extension */
 	if (size == 1) {
-		init_s64_max = (s8)reg->smax_value;
-		init_s64_min = (s8)reg->smin_value;
+		init_s64_max = (s8)reg_smax(reg);
+		init_s64_min = (s8)reg_smin(reg);
 	} else if (size == 2) {
-		init_s64_max = (s16)reg->smax_value;
-		init_s64_min = (s16)reg->smin_value;
+		init_s64_max = (s16)reg_smax(reg);
+		init_s64_min = (s16)reg_smin(reg);
 	} else {
-		init_s64_max = (s32)reg->smax_value;
-		init_s64_min = (s32)reg->smin_value;
+		init_s64_max = (s32)reg_smax(reg);
+		init_s64_min = (s32)reg_smin(reg);
 	}
 
 	s64_max = max(init_s64_max, init_s64_min);
@@ -5718,10 +5475,8 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
 
 	/* both of s64_max/s64_min positive or negative */
 	if ((s64_max >= 0) == (s64_min >= 0)) {
-		reg->s32_min_value = reg->smin_value = s64_min;
-		reg->s32_max_value = reg->smax_value = s64_max;
-		reg->u32_min_value = reg->umin_value = s64_min;
-		reg->u32_max_value = reg->umax_value = s64_max;
+		reg_set_srange64(reg, s64_min, s64_max);
+		reg_set_srange32(reg, s64_min, s64_max);
 		reg->var_off = tnum_range(s64_min, s64_max);
 		return;
 	}
@@ -5732,16 +5487,11 @@ out:
 
 static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
 {
-	if (size == 1) {
-		reg->s32_min_value = S8_MIN;
-		reg->s32_max_value = S8_MAX;
-	} else {
+	if (size == 1)
+		reg_set_srange32(reg, S8_MIN, S8_MAX);
+	else
 		/* size == 2 */
-		reg->s32_min_value = S16_MIN;
-		reg->s32_max_value = S16_MAX;
-	}
-	reg->u32_min_value = 0;
-	reg->u32_max_value = U32_MAX;
+		reg_set_srange32(reg, S16_MIN, S16_MAX);
 	reg->var_off = tnum_subreg(tnum_unknown);
 }
 
@@ -5759,34 +5509,30 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
 			reg->var_off = tnum_const((s16)u32_val);
 
 		u32_val = reg->var_off.value;
-		reg->s32_min_value = reg->s32_max_value = u32_val;
-		reg->u32_min_value = reg->u32_max_value = u32_val;
+		reg_set_srange32(reg, u32_val, u32_val);
 		return;
 	}
 
-	top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits;
-	top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits;
+	top_smax_value = ((u32)reg_s32_max(reg) >> num_bits) << num_bits;
+	top_smin_value = ((u32)reg_s32_min(reg) >> num_bits) << num_bits;
 
 	if (top_smax_value != top_smin_value)
 		goto out;
 
 	/* find the s32_min and s32_min after sign extension */
 	if (size == 1) {
-		init_s32_max = (s8)reg->s32_max_value;
-		init_s32_min = (s8)reg->s32_min_value;
+		init_s32_max = (s8)reg_s32_max(reg);
+		init_s32_min = (s8)reg_s32_min(reg);
 	} else {
 		/* size == 2 */
-		init_s32_max = (s16)reg->s32_max_value;
-		init_s32_min = (s16)reg->s32_min_value;
+		init_s32_max = (s16)reg_s32_max(reg);
+		init_s32_min = (s16)reg_s32_min(reg);
 	}
 	s32_max = max(init_s32_max, init_s32_min);
 	s32_min = min(init_s32_max, init_s32_min);
 
 	if ((s32_min >= 0) == (s32_max >= 0)) {
-		reg->s32_min_value = s32_min;
-		reg->s32_max_value = s32_max;
-		reg->u32_min_value = (u32)s32_min;
-		reg->u32_max_value = (u32)s32_max;
+		reg_set_srange32(reg, s32_min, s32_max);
 		reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max));
 		return;
 	}
@@ -5976,12 +5722,11 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
 }
 
 static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
-				   struct bpf_reg_state *regs,
-				   int regno, int off, int size,
+				   struct bpf_reg_state *regs, struct bpf_reg_state *reg,
+				   argno_t argno, int off, int size,
 				   enum bpf_access_type atype,
 				   int value_regno)
 {
-	struct bpf_reg_state *reg = regs + regno;
 	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
 	const char *tname = btf_name_by_offset(reg->btf, t->name_off);
 	const char *field_name = NULL;
@@ -6007,8 +5752,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 
 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 		verbose(env,
-			"R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n",
-			regno, tname, off, tn_buf);
+			"%s is ptr_%s invalid variable offset: off=%d, var_off=%s\n",
+			reg_arg_name(env, argno), tname, off, tn_buf);
 		return -EACCES;
 	}
 
@@ -6016,22 +5761,22 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 
 	if (off < 0) {
 		verbose(env,
-			"R%d is ptr_%s invalid negative access: off=%d\n",
-			regno, tname, off);
+			"%s is ptr_%s invalid negative access: off=%d\n",
+			reg_arg_name(env, argno), tname, off);
 		return -EACCES;
 	}
 
 	if (reg->type & MEM_USER) {
 		verbose(env,
-			"R%d is ptr_%s access user memory: off=%d\n",
-			regno, tname, off);
+			"%s is ptr_%s access user memory: off=%d\n",
+			reg_arg_name(env, argno), tname, off);
 		return -EACCES;
 	}
 
 	if (reg->type & MEM_PERCPU) {
 		verbose(env,
-			"R%d is ptr_%s access percpu memory: off=%d\n",
-			regno, tname, off);
+			"%s is ptr_%s access percpu memory: off=%d\n",
+			reg_arg_name(env, argno), tname, off);
 		return -EACCES;
 	}
 
@@ -6043,7 +5788,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		ret = env->ops->btf_struct_access(&env->log, reg, off, size);
 	} else {
 		/* Writes are permitted with default btf_struct_access for
-		 * program allocated objects (which always have ref_obj_id > 0),
+		 * program allocated objects (which always have id > 0),
 		 * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
 		 */
 		if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) {
@@ -6052,8 +5797,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		}
 
 		if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
-		    !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
-			verifier_bug(env, "ref_obj_id for allocated object must be non-zero");
+		    !(reg->type & MEM_RCU) && !reg_is_referenced(env, reg)) {
+			verifier_bug(env, "allocated object must have a referenced id");
 			return -EFAULT;
 		}
 
@@ -6072,7 +5817,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		 */
 		flag = PTR_UNTRUSTED;
 
-	} else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
+	} else if (is_trusted_reg(env, reg) || is_rcu_reg(reg)) {
 		/* By default any pointer obtained from walking a trusted pointer is no
 		 * longer trusted, unless the field being accessed has explicitly been
 		 * marked as inheriting its parent's state of trust (either full or RCU).
@@ -6133,12 +5878,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 }
 
 static int check_ptr_to_map_access(struct bpf_verifier_env *env,
-				   struct bpf_reg_state *regs,
-				   int regno, int off, int size,
+				   struct bpf_reg_state *regs, struct bpf_reg_state *reg,
+				   argno_t argno, int off, int size,
 				   enum bpf_access_type atype,
 				   int value_regno)
 {
-	struct bpf_reg_state *reg = regs + regno;
 	struct bpf_map *map = reg->map_ptr;
 	struct bpf_reg_state map_reg;
 	enum bpf_type_flag flag = 0;
@@ -6169,8 +5913,8 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
 	}
 
 	if (off < 0) {
-		verbose(env, "R%d is %s invalid negative access: off=%d\n",
-			regno, tname, off);
+		verbose(env, "%s is %s invalid negative access: off=%d\n",
+			reg_arg_name(env, argno), tname, off);
 		return -EACCES;
 	}
 
@@ -6227,11 +5971,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
  * 'off' includes `regno->offset`, but not its dynamic part (if any).
  */
 static int check_stack_access_within_bounds(
-		struct bpf_verifier_env *env,
-		int regno, int off, int access_size,
+		struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+		argno_t argno, int off, int access_size,
 		enum bpf_access_type type)
 {
-	struct bpf_reg_state *reg = reg_state(env, regno);
 	struct bpf_func_state *state = bpf_func(env, reg);
 	s64 min_off, max_off;
 	int err;
@@ -6246,14 +5989,14 @@ static int check_stack_access_within_bounds(
 		min_off = (s64)reg->var_off.value + off;
 		max_off = min_off + access_size;
 	} else {
-		if (reg->smax_value >= BPF_MAX_VAR_OFF ||
-		    reg->smin_value <= -BPF_MAX_VAR_OFF) {
-			verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
-				err_extra, regno);
+		if (reg_smax(reg) >= BPF_MAX_VAR_OFF ||
+		    reg_smin(reg) <= -BPF_MAX_VAR_OFF) {
+			verbose(env, "invalid unbounded variable-offset%s stack %s\n",
+				err_extra, reg_arg_name(env, argno));
 			return -EACCES;
 		}
-		min_off = reg->smin_value + off;
-		max_off = reg->smax_value + off + access_size;
+		min_off = reg_smin(reg) + off;
+		max_off = reg_smax(reg) + off + access_size;
 	}
 
 	err = check_stack_slot_within_bounds(env, min_off, state, type);
@@ -6267,14 +6010,14 @@ static int check_stack_access_within_bounds(
 
 	if (err) {
 		if (tnum_is_const(reg->var_off)) {
-			verbose(env, "invalid%s stack R%d off=%lld size=%d\n",
-				err_extra, regno, min_off, access_size);
+			verbose(env, "invalid%s stack %s off=%lld size=%d\n",
+				err_extra, reg_arg_name(env, argno), min_off, access_size);
 		} else {
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n",
-				err_extra, regno, tn_buf, off, access_size);
+			verbose(env, "invalid variable-offset%s stack %s var_off=%s off=%d size=%d\n",
+				err_extra, reg_arg_name(env, argno), tn_buf, off, access_size);
 		}
 		return err;
 	}
@@ -6319,12 +6062,11 @@ static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val)
  * if t==write && value_regno==-1, some unknown value is stored into memory
  * if t==read && value_regno==-1, don't care what we read from memory
  */
-static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
+static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno,
 			    int off, int bpf_size, enum bpf_access_type t,
 			    int value_regno, bool strict_alignment_once, bool is_ldsx)
 {
 	struct bpf_reg_state *regs = cur_regs(env);
-	struct bpf_reg_state *reg = regs + regno;
 	int size, err = 0;
 
 	size = bpf_size_to_bytes(bpf_size);
@@ -6337,11 +6079,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
 	if (reg->type == PTR_TO_MAP_KEY) {
 		if (t == BPF_WRITE) {
-			verbose(env, "write to change key R%d not allowed\n", regno);
+			verbose(env, "write to change key %s not allowed\n",
+				reg_arg_name(env, argno));
 			return -EACCES;
 		}
 
-		err = check_mem_region_access(env, regno, off, size,
+		err = check_mem_region_access(env, reg, argno, off, size,
 					      reg->map_ptr->key_size, false);
 		if (err)
 			return err;
@@ -6355,17 +6098,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			verbose(env, "R%d leaks addr into map\n", value_regno);
 			return -EACCES;
 		}
-		err = check_map_access_type(env, regno, off, size, t);
+		err = check_map_access_type(env, reg, off, size, t);
 		if (err)
 			return err;
-		err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT);
+		err = check_map_access(env, reg, argno, off, size, false, ACCESS_DIRECT);
 		if (err)
 			return err;
 		if (tnum_is_const(reg->var_off))
 			kptr_field = btf_record_find(reg->map_ptr->record,
 						     off + reg->var_off.value, BPF_KPTR | BPF_UPTR);
 		if (kptr_field) {
-			err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
+			err = check_map_kptr_access(env, value_regno, insn_idx, kptr_field);
 		} else if (t == BPF_READ && value_regno >= 0) {
 			struct bpf_map *map = reg->map_ptr;
 
@@ -6393,7 +6136,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 						     size);
 					return -EACCES;
 				}
-				copy_register_state(&regs[value_regno], reg);
+				regs[value_regno] = *reg;
 				add_scalar_to_reg(&regs[value_regno], off);
 				regs[value_regno].type = PTR_TO_INSN;
 			} else {
@@ -6405,14 +6148,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED);
 
 		if (type_may_be_null(reg->type)) {
-			verbose(env, "R%d invalid mem access '%s'\n", regno,
+			verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno),
 				reg_type_str(env, reg->type));
 			return -EACCES;
 		}
 
 		if (t == BPF_WRITE && rdonly_mem) {
-			verbose(env, "R%d cannot write into %s\n",
-				regno, reg_type_str(env, reg->type));
+			verbose(env, "%s cannot write into %s\n",
+				reg_arg_name(env, argno), reg_type_str(env, reg->type));
 			return -EACCES;
 		}
 
@@ -6427,7 +6170,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		 * instructions, hence no need to check bounds in that case.
 		 */
 		if (!rdonly_untrusted)
-			err = check_mem_region_access(env, regno, off, size,
+			err = check_mem_region_access(env, reg, argno, off, size,
 						      reg->mem_size, false);
 		if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
 			mark_reg_unknown(env, regs, value_regno);
@@ -6445,7 +6188,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			return -EACCES;
 		}
 
-		err = check_ctx_access(env, insn_idx, regno, off, size, t, &info);
+		err = check_ctx_access(env, insn_idx, reg, argno, off, size, t, &info);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
 			 * PTR_TO_PACKET[_META,_END]. In the latter
@@ -6463,8 +6206,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			} else {
 				mark_reg_known_zero(env, regs,
 						    value_regno);
-				if (type_may_be_null(info.reg_type))
-					regs[value_regno].id = ++env->id_gen;
 				/* A load of ctx field could have different
 				 * actual load size with the one encoded in the
 				 * insn. When the dst is PTR, it is for sure not
@@ -6474,23 +6215,25 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				if (base_type(info.reg_type) == PTR_TO_BTF_ID) {
 					regs[value_regno].btf = info.btf;
 					regs[value_regno].btf_id = info.btf_id;
-					regs[value_regno].ref_obj_id = info.ref_obj_id;
+					regs[value_regno].id = info.ref_id;
 				}
+				if (type_may_be_null(info.reg_type) && !regs[value_regno].id)
+					regs[value_regno].id = ++env->id_gen;
 			}
 			regs[value_regno].type = info.reg_type;
 		}
 
 	} else if (reg->type == PTR_TO_STACK) {
 		/* Basic bounds checks. */
-		err = check_stack_access_within_bounds(env, regno, off, size, t);
+		err = check_stack_access_within_bounds(env, reg, argno, off, size, t);
 		if (err)
 			return err;
 
 		if (t == BPF_READ)
-			err = check_stack_read(env, regno, off, size,
+			err = check_stack_read(env, reg, argno, off, size,
 					       value_regno);
 		else
-			err = check_stack_write(env, regno, off, size,
+			err = check_stack_write(env, reg, off, size,
 						value_regno, insn_idx);
 	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
@@ -6503,7 +6246,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				value_regno);
 			return -EACCES;
 		}
-		err = check_packet_access(env, regno, off, size, false);
+		err = check_packet_access(env, reg, argno, off, size, false);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else if (reg->type == PTR_TO_FLOW_KEYS) {
@@ -6514,28 +6257,28 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			return -EACCES;
 		}
 
-		err = check_flow_keys_access(env, off, size);
+		err = check_flow_keys_access(env, reg, argno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else if (type_is_sk_pointer(reg->type)) {
 		if (t == BPF_WRITE) {
-			verbose(env, "R%d cannot write into %s\n",
-				regno, reg_type_str(env, reg->type));
+			verbose(env, "%s cannot write into %s\n",
+				reg_arg_name(env, argno), reg_type_str(env, reg->type));
 			return -EACCES;
 		}
-		err = check_sock_access(env, insn_idx, regno, off, size, t);
+		err = check_sock_access(env, insn_idx, reg, argno, off, size, t);
 		if (!err && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else if (reg->type == PTR_TO_TP_BUFFER) {
-		err = check_tp_buffer_access(env, reg, regno, off, size);
+		err = check_tp_buffer_access(env, reg, argno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else if (base_type(reg->type) == PTR_TO_BTF_ID &&
 		   !type_may_be_null(reg->type)) {
-		err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
+		err = check_ptr_to_btf_access(env, regs, reg, argno, off, size, t,
 					      value_regno);
 	} else if (reg->type == CONST_PTR_TO_MAP) {
-		err = check_ptr_to_map_access(env, regs, regno, off, size, t,
+		err = check_ptr_to_map_access(env, regs, reg, argno, off, size, t,
 					      value_regno);
 	} else if (base_type(reg->type) == PTR_TO_BUF &&
 		   !type_may_be_null(reg->type)) {
@@ -6544,8 +6287,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
 		if (rdonly_mem) {
 			if (t == BPF_WRITE) {
-				verbose(env, "R%d cannot write into %s\n",
-					regno, reg_type_str(env, reg->type));
+				verbose(env, "%s cannot write into %s\n",
+					reg_arg_name(env, argno), reg_type_str(env, reg->type));
 				return -EACCES;
 			}
 			max_access = &env->prog->aux->max_rdonly_access;
@@ -6553,7 +6296,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			max_access = &env->prog->aux->max_rdwr_access;
 		}
 
-		err = check_buffer_access(env, reg, regno, off, size, false,
+		err = check_buffer_access(env, reg, argno, off, size, false,
 					  max_access);
 
 		if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
@@ -6562,7 +6305,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		if (t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else {
-		verbose(env, "R%d invalid mem access '%s'\n", regno,
+		verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno),
 			reg_type_str(env, reg->type));
 		return -EACCES;
 	}
@@ -6585,10 +6328,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			  bool strict_alignment_once, bool is_ldsx,
 			  bool allow_trust_mismatch, const char *ctx)
 {
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_reg_state *regs = cur_regs(env);
 	enum bpf_reg_type src_reg_type;
 	int err;
 
+	/* Handle stack arg read */
+	if (is_stack_arg_ldx(insn)) {
+		err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+		if (err)
+			return err;
+		return check_stack_arg_read(env, state, insn->off, insn->dst_reg);
+	}
+
 	/* check src operand */
 	err = check_reg_arg(env, insn->src_reg, SRC_OP);
 	if (err)
@@ -6604,7 +6357,7 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	/* Check if (src_reg + off) is readable. The state of dst_reg will be
 	 * updated by this call.
 	 */
-	err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off,
+	err = check_mem_access(env, env->insn_idx, regs + insn->src_reg, argno_from_reg(insn->src_reg), insn->off,
 			       BPF_SIZE(insn->code), BPF_READ, insn->dst_reg,
 			       strict_alignment_once, is_ldsx);
 	err = err ?: save_aux_ptr_type(env, src_reg_type,
@@ -6617,10 +6370,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn,
 static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			   bool strict_alignment_once)
 {
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_reg_state *regs = cur_regs(env);
 	enum bpf_reg_type dst_reg_type;
 	int err;
 
+	/* Handle stack arg write */
+	if (is_stack_arg_stx(insn)) {
+		err = check_reg_arg(env, insn->src_reg, SRC_OP);
+		if (err)
+			return err;
+		return check_stack_arg_write(env, state, insn->off, regs + insn->src_reg);
+	}
+
 	/* check src1 operand */
 	err = check_reg_arg(env, insn->src_reg, SRC_OP);
 	if (err)
@@ -6634,7 +6397,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	dst_reg_type = regs[insn->dst_reg].type;
 
 	/* Check if (dst_reg + off) is writeable. */
-	err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+	err = check_mem_access(env, env->insn_idx, regs + insn->dst_reg, argno_from_reg(insn->dst_reg), insn->off,
 			       BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg,
 			       strict_alignment_once, false);
 	err = err ?: save_aux_ptr_type(env, dst_reg_type, false);
@@ -6645,6 +6408,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn,
 static int check_atomic_rmw(struct bpf_verifier_env *env,
 			    struct bpf_insn *insn)
 {
+	struct bpf_reg_state *dst_reg;
 	int load_reg;
 	int err;
 
@@ -6706,13 +6470,15 @@ static int check_atomic_rmw(struct bpf_verifier_env *env,
 		load_reg = -1;
 	}
 
+	dst_reg = cur_regs(env) + insn->dst_reg;
+
 	/* Check whether we can read the memory, with second call for fetch
 	 * case to simulate the register fill.
 	 */
-	err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+	err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off,
 			       BPF_SIZE(insn->code), BPF_READ, -1, true, false);
 	if (!err && load_reg >= 0)
-		err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+		err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg),
 				       insn->off, BPF_SIZE(insn->code),
 				       BPF_READ, load_reg, true, false);
 	if (err)
@@ -6724,7 +6490,7 @@ static int check_atomic_rmw(struct bpf_verifier_env *env,
 			return err;
 	}
 	/* Check whether we can write into the same memory. */
-	err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+	err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off,
 			       BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
 	if (err)
 		return err;
@@ -6813,11 +6579,10 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn)
  * read offsets are marked as read.
  */
 static int check_stack_range_initialized(
-		struct bpf_verifier_env *env, int regno, int off,
+		struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off,
 		int access_size, bool zero_size_allowed,
 		enum bpf_access_type type, struct bpf_call_arg_meta *meta)
 {
-	struct bpf_reg_state *reg = reg_state(env, regno);
 	struct bpf_func_state *state = bpf_func(env, reg);
 	int err, min_off, max_off, i, j, slot, spi;
 	/* Some accesses can write anything into the stack, others are
@@ -6839,7 +6604,7 @@ static int check_stack_range_initialized(
 		return -EACCES;
 	}
 
-	err = check_stack_access_within_bounds(env, regno, off, access_size, type);
+	err = check_stack_access_within_bounds(env, reg, argno, off, access_size, type);
 	if (err)
 		return err;
 
@@ -6856,8 +6621,8 @@ static int check_stack_range_initialized(
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
-				regno, tn_buf);
+			verbose(env, "%s variable offset stack access prohibited for !root, var_off=%s\n",
+				reg_arg_name(env, argno), tn_buf);
 			return -EACCES;
 		}
 		/* Only initialized buffer on stack is allowed to be accessed
@@ -6869,8 +6634,8 @@ static int check_stack_range_initialized(
 		if (meta && meta->raw_mode)
 			meta = NULL;
 
-		min_off = reg->smin_value + off;
-		max_off = reg->smax_value + off;
+		min_off = reg_smin(reg) + off;
+		max_off = reg_smax(reg) + off;
 	}
 
 	if (meta && meta->raw_mode) {
@@ -6900,7 +6665,7 @@ static int check_stack_range_initialized(
 			}
 		}
 		meta->access_size = access_size;
-		meta->regno = regno;
+		meta->regno = reg_from_argno(argno);
 		return 0;
 	}
 
@@ -6940,17 +6705,17 @@ static int check_stack_range_initialized(
 		if (*stype == STACK_POISON) {
 			if (allow_poison)
 				goto mark;
-			verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n",
-				regno, min_off, i - min_off, access_size);
+			verbose(env, "reading from stack %s off %d+%d size %d, slot poisoned by dead code elimination\n",
+				reg_arg_name(env, argno), min_off, i - min_off, access_size);
 		} else if (tnum_is_const(reg->var_off)) {
-			verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
-				regno, min_off, i - min_off, access_size);
+			verbose(env, "invalid read from stack %s off %d+%d size %d\n",
+				reg_arg_name(env, argno), min_off, i - min_off, access_size);
 		} else {
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n",
-				regno, tn_buf, i - min_off, access_size);
+			verbose(env, "invalid read from stack %s var_off %s+%d size %d\n",
+				reg_arg_name(env, argno), tn_buf, i - min_off, access_size);
 		}
 		return -EACCES;
 mark:
@@ -6959,48 +6724,48 @@ mark:
 	return 0;
 }
 
-static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
 				   int access_size, enum bpf_access_type access_type,
 				   bool zero_size_allowed,
 				   struct bpf_call_arg_meta *meta)
 {
-	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	struct bpf_reg_state *regs = cur_regs(env);
 	u32 *max_access;
 
 	switch (base_type(reg->type)) {
 	case PTR_TO_PACKET:
 	case PTR_TO_PACKET_META:
-		return check_packet_access(env, regno, 0, access_size,
+		return check_packet_access(env, reg, argno, 0, access_size,
 					   zero_size_allowed);
 	case PTR_TO_MAP_KEY:
 		if (access_type == BPF_WRITE) {
-			verbose(env, "R%d cannot write into %s\n", regno,
-				reg_type_str(env, reg->type));
+			verbose(env, "%s cannot write into %s\n",
+				reg_arg_name(env, argno), reg_type_str(env, reg->type));
 			return -EACCES;
 		}
-		return check_mem_region_access(env, regno, 0, access_size,
+		return check_mem_region_access(env, reg, argno, 0, access_size,
 					       reg->map_ptr->key_size, false);
 	case PTR_TO_MAP_VALUE:
-		if (check_map_access_type(env, regno, 0, access_size, access_type))
+		if (check_map_access_type(env, reg, 0, access_size, access_type))
 			return -EACCES;
-		return check_map_access(env, regno, 0, access_size,
+		return check_map_access(env, reg, argno, 0, access_size,
 					zero_size_allowed, ACCESS_HELPER);
 	case PTR_TO_MEM:
 		if (type_is_rdonly_mem(reg->type)) {
 			if (access_type == BPF_WRITE) {
-				verbose(env, "R%d cannot write into %s\n", regno,
-					reg_type_str(env, reg->type));
+				verbose(env, "%s cannot write into %s\n",
+					reg_arg_name(env, argno), reg_type_str(env, reg->type));
 				return -EACCES;
 			}
 		}
-		return check_mem_region_access(env, regno, 0,
+		return check_mem_region_access(env, reg, argno, 0,
 					       access_size, reg->mem_size,
 					       zero_size_allowed);
 	case PTR_TO_BUF:
 		if (type_is_rdonly_mem(reg->type)) {
 			if (access_type == BPF_WRITE) {
-				verbose(env, "R%d cannot write into %s\n", regno,
-					reg_type_str(env, reg->type));
+				verbose(env, "%s cannot write into %s\n",
+					reg_arg_name(env, argno), reg_type_str(env, reg->type));
 				return -EACCES;
 			}
 
@@ -7008,26 +6773,26 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 		} else {
 			max_access = &env->prog->aux->max_rdwr_access;
 		}
-		return check_buffer_access(env, reg, regno, 0,
+		return check_buffer_access(env, reg, argno, 0,
 					   access_size, zero_size_allowed,
 					   max_access);
 	case PTR_TO_STACK:
 		return check_stack_range_initialized(
-				env,
-				regno, 0, access_size,
+				env, reg,
+				argno, 0, access_size,
 				zero_size_allowed, access_type, meta);
 	case PTR_TO_BTF_ID:
-		return check_ptr_to_btf_access(env, regs, regno, 0,
-					       access_size, BPF_READ, -1);
+		return check_ptr_to_btf_access(env, regs, reg, argno, 0,
+					       access_size, access_type, -1);
 	case PTR_TO_CTX:
 		/* Only permit reading or writing syscall context using helper calls. */
 		if (is_var_ctx_off_allowed(env->prog)) {
-			int err = check_mem_region_access(env, regno, 0, access_size, U16_MAX,
+			int err = check_mem_region_access(env, reg, argno, 0, access_size, U16_MAX,
 							  zero_size_allowed);
 			if (err)
 				return err;
-			if (env->prog->aux->max_ctx_offset < reg->umax_value + access_size)
-				env->prog->aux->max_ctx_offset = reg->umax_value + access_size;
+			if (env->prog->aux->max_ctx_offset < reg_umax(reg) + access_size)
+				env->prog->aux->max_ctx_offset = reg_umax(reg) + access_size;
 			return 0;
 		}
 		fallthrough;
@@ -7037,7 +6802,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 		    bpf_register_is_null(reg))
 			return 0;
 
-		verbose(env, "R%d type=%s ", regno,
+		verbose(env, "%s type=%s ", reg_arg_name(env, argno),
 			reg_type_str(env, reg->type));
 		verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
 		return -EACCES;
@@ -7047,12 +6812,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 /* verify arguments to helpers or kfuncs consisting of a pointer and an access
  * size.
  *
- * @regno is the register containing the access size. regno-1 is the register
- * containing the pointer.
+ * @mem_reg contains the pointer, @size_reg contains the access size.
  */
 static int check_mem_size_reg(struct bpf_verifier_env *env,
-			      struct bpf_reg_state *reg, u32 regno,
-			      enum bpf_access_type access_type,
+			      struct bpf_reg_state *mem_reg,
+			      struct bpf_reg_state *size_reg, argno_t mem_argno,
+			      argno_t size_argno, enum bpf_access_type access_type,
 			      bool zero_size_allowed,
 			      struct bpf_call_arg_meta *meta)
 {
@@ -7066,42 +6831,48 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
 	 * out. Only upper bounds can be learned because retval is an
 	 * int type and negative retvals are allowed.
 	 */
-	meta->msize_max_value = reg->umax_value;
+	meta->msize_max_value = reg_umax(size_reg);
 
 	/* The register is SCALAR_VALUE; the access check happens using
 	 * its boundaries. For unprivileged variable accesses, disable
 	 * raw mode so that the program is required to initialize all
 	 * the memory that the helper could just partially fill up.
 	 */
-	if (!tnum_is_const(reg->var_off))
+	if (!tnum_is_const(size_reg->var_off))
 		meta = NULL;
 
-	if (reg->smin_value < 0) {
-		verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
-			regno);
+	if (reg_smin(size_reg) < 0) {
+		verbose(env, "%s min value is negative, either use unsigned or 'var &= const'\n",
+			reg_arg_name(env, size_argno));
 		return -EACCES;
 	}
 
-	if (reg->umin_value == 0 && !zero_size_allowed) {
-		verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n",
-			regno, reg->umin_value, reg->umax_value);
+	if (reg_umin(size_reg) == 0 && !zero_size_allowed) {
+		verbose(env, "%s invalid zero-sized read: u64=[%lld,%lld]\n",
+			reg_arg_name(env, size_argno), reg_umin(size_reg), reg_umax(size_reg));
 		return -EACCES;
 	}
 
-	if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
-		verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
-			regno);
+	if (reg_umax(size_reg) >= BPF_MAX_VAR_SIZ) {
+		verbose(env, "%s unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+			reg_arg_name(env, size_argno));
 		return -EACCES;
 	}
-	err = check_helper_mem_access(env, regno - 1, reg->umax_value,
+	err = check_helper_mem_access(env, mem_reg, mem_argno, reg_umax(size_reg),
 				      access_type, zero_size_allowed, meta);
-	if (!err)
-		err = mark_chain_precision(env, regno);
+	if (!err) {
+		int regno = reg_from_argno(size_argno);
+
+		if (regno >= 0)
+			err = mark_chain_precision(env, regno);
+		else
+			err = mark_stack_arg_precision(env, arg_idx_from_argno(size_argno));
+	}
 	return err;
 }
 
 static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-			 u32 regno, u32 mem_size)
+			 argno_t argno, u32 mem_size)
 {
 	bool may_be_null = type_may_be_null(reg->type);
 	struct bpf_reg_state saved_reg;
@@ -7110,6 +6881,12 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
 	if (bpf_register_is_null(reg))
 		return 0;
 
+	if (mem_size > S32_MAX) {
+		verbose(env, "%s memory size %u is too large\n",
+			reg_arg_name(env, argno), mem_size);
+		return -EACCES;
+	}
+
 	/* Assuming that the register contains a value check if the memory
 	 * access is safe. Temporarily save and restore the register's state as
 	 * the conversion shouldn't be visible to a caller.
@@ -7121,8 +6898,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
 
 	int size = base_type(reg->type) == PTR_TO_STACK ? -(int)mem_size : mem_size;
 
-	err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL);
-	err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL);
+	err = check_helper_mem_access(env, reg, argno, size, BPF_READ, true, NULL);
+	err = err ?: check_helper_mem_access(env, reg, argno, size, BPF_WRITE, true, NULL);
 
 	if (may_be_null)
 		*reg = saved_reg;
@@ -7130,17 +6907,14 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
 	return err;
 }
 
-static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-				    u32 regno)
+static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *mem_reg,
+				    struct bpf_reg_state *size_reg, argno_t mem_argno, argno_t size_argno)
 {
-	struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
 	bool may_be_null = type_may_be_null(mem_reg->type);
 	struct bpf_reg_state saved_reg;
 	struct bpf_call_arg_meta meta;
 	int err;
 
-	WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
-
 	memset(&meta, 0, sizeof(meta));
 
 	if (may_be_null) {
@@ -7148,8 +6922,8 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
 		mark_ptr_not_null_reg(mem_reg);
 	}
 
-	err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta);
-	err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta);
+	err = check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_READ, true, &meta);
+	err = err ?: check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_WRITE, true, &meta);
 
 	if (may_be_null)
 		*mem_reg = saved_reg;
@@ -7185,11 +6959,10 @@ enum {
  * env->cur_state->active_locks remembers which map value element or allocated
  * object got locked and clears it after bpf_spin_unlock.
  */
-static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
+static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int flags)
 {
 	bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK;
 	const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin";
-	struct bpf_reg_state *reg = reg_state(env, regno);
 	struct bpf_verifier_state *cur = env->cur_state;
 	bool is_const = tnum_is_const(reg->var_off);
 	bool is_irq = flags & PROCESS_LOCK_IRQ;
@@ -7202,8 +6975,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
 
 	if (!is_const) {
 		verbose(env,
-			"R%d doesn't have constant offset. %s_lock has to be at the constant offset\n",
-			regno, lock_str);
+			"%s doesn't have constant offset. %s_lock has to be at the constant offset\n",
+			reg_arg_name(env, argno), lock_str);
 		return -EINVAL;
 	}
 	if (reg->type == PTR_TO_MAP_VALUE) {
@@ -7302,11 +7075,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
 }
 
 /* Check if @regno is a pointer to a specific field in a map value */
-static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
+static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
 				   enum btf_field_type field_type,
 				   struct bpf_map_desc *map_desc)
 {
-	struct bpf_reg_state *reg = reg_state(env, regno);
 	bool is_const = tnum_is_const(reg->var_off);
 	struct bpf_map *map = reg->map_ptr;
 	u64 val = reg->var_off.value;
@@ -7315,8 +7087,8 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
 
 	if (!is_const) {
 		verbose(env,
-			"R%d doesn't have constant offset. %s has to be at the constant offset\n",
-			regno, struct_name);
+			"%s doesn't have constant offset. %s has to be at the constant offset\n",
+			reg_arg_name(env, argno), struct_name);
 		return -EINVAL;
 	}
 	if (!map->btf) {
@@ -7356,26 +7128,26 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
 	return 0;
 }
 
-static int process_timer_func(struct bpf_verifier_env *env, int regno,
+static int process_timer_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
 			      struct bpf_map_desc *map)
 {
 	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
 		verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n");
 		return -EOPNOTSUPP;
 	}
-	return check_map_field_pointer(env, regno, BPF_TIMER, map);
+	return check_map_field_pointer(env, reg, argno, BPF_TIMER, map);
 }
 
-static int process_timer_helper(struct bpf_verifier_env *env, int regno,
+static int process_timer_helper(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
 				struct bpf_call_arg_meta *meta)
 {
-	return process_timer_func(env, regno, &meta->map);
+	return process_timer_func(env, reg, argno, &meta->map);
 }
 
-static int process_timer_kfunc(struct bpf_verifier_env *env, int regno,
+static int process_timer_kfunc(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
 			       struct bpf_kfunc_call_arg_meta *meta)
 {
-	return process_timer_func(env, regno, &meta->map);
+	return process_timer_func(env, reg, argno, &meta->map);
 }
 
 static int process_kptr_func(struct bpf_verifier_env *env, int regno,
@@ -7426,52 +7198,42 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
-/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
+/*
+ * Validate dynptr arguments for helper, kfunc and subprog.
+ *
+ * @dynptr is both input and output. It is populated when the argument is
+ * tagged with MEM_UNINIT (i.e., the dynptr argument that will be constructed)
+ * and consumed when the argument is expected to be an initialized dynptr.
+ * @parent_id is used to track the referenced parent object (e.g., file or skb in
+ * qdisc program) when constructing a dynptr.
+ *
+ * There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
  * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
  *
  * In both cases we deal with the first 8 bytes, but need to mark the next 8
  * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
  * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
  *
- * Mutability of bpf_dynptr is at two levels, one is at the level of struct
- * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
- * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
- * mutate the view of the dynptr and also possibly destroy it. In the latter
- * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
- * memory that dynptr points to.
- *
- * The verifier will keep track both levels of mutation (bpf_dynptr's in
- * reg->type and the memory's in reg->dynptr.type), but there is no support for
- * readonly dynptr view yet, hence only the first case is tracked and checked.
- *
- * This is consistent with how C applies the const modifier to a struct object,
- * where the pointer itself inside bpf_dynptr becomes const but not what it
- * points to.
- *
- * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
- * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
+ * Mutability of bpf_dynptr is at two levels: the dynptr and the memory the
+ * dynptr points to. At the first level, the verifier will make sure a
+ * CONST_PTR_TO_DYNPTR cannot be reinitialized or destroyed. The mutability of
+ * a dynptr's view (i.e., start and offset) is not tracked as there is no such
+ * use case. The second level is tracked using the upper bit of bpf_dynptr->size
+ * and checked dynamically during runtime.
  */
-static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
-			       enum bpf_arg_type arg_type, int clone_ref_obj_id)
+static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+			       argno_t argno, int insn_idx, enum bpf_arg_type arg_type,
+			       struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr)
 {
-	struct bpf_reg_state *reg = reg_state(env, regno);
-	int err;
+	int spi, err = 0;
 
 	if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
 		verbose(env,
-			"arg#%d expected pointer to stack or const struct bpf_dynptr\n",
-			regno - 1);
+			"%s expected pointer to stack or const struct bpf_dynptr\n",
+			reg_arg_name(env, argno));
 		return -EINVAL;
 	}
 
-	/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
-	 * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
-	 */
-	if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
-		verifier_bug(env, "misconfigured dynptr helper type flags");
-		return -EFAULT;
-	}
-
 	/*  MEM_UNINIT - Points to memory that is an appropriate candidate for
 	 *		 constructing a mutable bpf_dynptr object.
 	 *
@@ -7479,13 +7241,12 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 	 *		 pointing to a region of at least 16 bytes which doesn't
 	 *		 contain an existing bpf_dynptr.
 	 *
-	 *  MEM_RDONLY - Points to a initialized bpf_dynptr that will not be
-	 *		 mutated or destroyed. However, the memory it points to
-	 *		 may be mutated.
+	 *  OBJ_RELEASE - Points to an initialized bpf_dynptr that will be
+	 *		  destroyed.
 	 *
-	 *  None       - Points to a initialized dynptr that can be mutated and
-	 *		 destroyed, including mutation of the memory it points
-	 *		 to.
+	 *  None       - Points to an initialized dynptr that cannot be
+	 *		 reinitialized or destroyed. However, the view of the
+	 *		 dynptr and the memory it points to may be mutated.
 	 */
 	if (arg_type & MEM_UNINIT) {
 		int i;
@@ -7497,45 +7258,58 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 
 		/* we write BPF_DW bits (8 bytes) at a time */
 		for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
-			err = check_mem_access(env, insn_idx, regno,
+			err = check_mem_access(env, insn_idx, reg, argno,
 					       i, BPF_DW, BPF_WRITE, -1, false, false);
 			if (err)
 				return err;
 		}
 
-		err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id);
-	} else /* MEM_RDONLY and None case from above */ {
+		err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, ref_obj, dynptr);
+	} else /* OBJ_RELEASE and None case from above */ {
 		/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
-		if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
-			verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
+		if (reg->type == CONST_PTR_TO_DYNPTR && (arg_type & OBJ_RELEASE)) {
+			verbose(env, "CONST_PTR_TO_DYNPTR cannot be released\n");
 			return -EINVAL;
 		}
 
 		if (!is_dynptr_reg_valid_init(env, reg)) {
-			verbose(env,
-				"Expected an initialized dynptr as arg #%d\n",
-				regno - 1);
+			verbose(env, "Expected an initialized dynptr as %s\n",
+				reg_arg_name(env, argno));
 			return -EINVAL;
 		}
 
-		/* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
-		if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
+		/* Fold modifiers (in this case, OBJ_RELEASE) when checking expected type */
+		if (!is_dynptr_type_expected(env, reg, arg_type & ~OBJ_RELEASE)) {
 			verbose(env,
-				"Expected a dynptr of type %s as arg #%d\n",
-				dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1);
+				"Expected a dynptr of type %s as %s\n",
+				dynptr_type_str(arg_to_dynptr_type(arg_type)),
+				reg_arg_name(env, argno));
 			return -EINVAL;
 		}
 
-		err = mark_dynptr_read(env, reg);
-	}
-	return err;
-}
+		if (reg->type != CONST_PTR_TO_DYNPTR) {
+			struct bpf_func_state *state = bpf_func(env, reg);
 
-static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
-{
-	struct bpf_func_state *state = bpf_func(env, reg);
+			spi = dynptr_get_spi(env, reg);
+			if (spi < 0)
+				return spi;
+
+			/*
+			 * For CONST_PTR_TO_DYNPTR, reg is already scratched by check_reg_arg
+			 * in check_helper_call and mark_btf_func_reg_size in check_kfunc_call.
+			 */
+			mark_stack_slots_scratched(env, spi, BPF_DYNPTR_NR_SLOTS);
 
-	return state->stack[spi].spilled_ptr.ref_obj_id;
+			reg = &state->stack[spi].spilled_ptr;
+		}
+
+		if (dynptr) {
+			dynptr->type = reg->dynptr.type;
+			dynptr->id = reg->id;
+			dynptr->parent_id = reg->parent_id;
+		}
+	}
+	return err;
 }
 
 static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
@@ -7567,15 +7341,17 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx,
 	return btf_param_match_suffix(meta->btf, arg, "__iter");
 }
 
-static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx,
 			    struct bpf_kfunc_call_arg_meta *meta)
 {
-	struct bpf_reg_state *reg = reg_state(env, regno);
+	struct bpf_func_state *state = bpf_func(env, reg);
 	const struct btf_type *t;
+	u32 arg_idx = arg_idx_from_argno(argno);
 	int spi, err, i, nr_slots, btf_id;
 
 	if (reg->type != PTR_TO_STACK) {
-		verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1);
+		verbose(env, "%s expected pointer to an iterator on stack\n",
+			reg_arg_name(env, argno));
 		return -EINVAL;
 	}
 
@@ -7585,9 +7361,10 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 	 * to any kfunc, if arg has "__iter" suffix, we need to be a bit more
 	 * conservative here.
 	 */
-	btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1);
+	btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, arg_idx);
 	if (btf_id < 0) {
-		verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1);
+		verbose(env, "expected valid iter pointer as %s\n",
+			reg_arg_name(env, argno));
 		return -EINVAL;
 	}
 	t = btf_type_by_id(meta->btf, btf_id);
@@ -7596,13 +7373,13 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 	if (is_iter_new_kfunc(meta)) {
 		/* bpf_iter_<type>_new() expects pointer to uninit iter state */
 		if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
-			verbose(env, "expected uninitialized iter_%s as arg #%d\n",
-				iter_type_str(meta->btf, btf_id), regno - 1);
+			verbose(env, "expected uninitialized iter_%s as %s\n",
+				iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno));
 			return -EINVAL;
 		}
 
 		for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
-			err = check_mem_access(env, insn_idx, regno,
+			err = check_mem_access(env, insn_idx, reg, argno,
 					       i, BPF_DW, BPF_WRITE, -1, false, false);
 			if (err)
 				return err;
@@ -7620,8 +7397,8 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 		case 0:
 			break;
 		case -EINVAL:
-			verbose(env, "expected an initialized iter_%s as arg #%d\n",
-				iter_type_str(meta->btf, btf_id), regno - 1);
+			verbose(env, "expected an initialized iter_%s as %s\n",
+				iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno));
 			return err;
 		case -EPROTO:
 			verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
@@ -7634,14 +7411,12 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 		if (spi < 0)
 			return spi;
 
-		err = mark_iter_read(env, reg, spi, nr_slots);
-		if (err)
-			return err;
+		mark_stack_slots_scratched(env, spi, nr_slots);
 
 		/* remember meta->iter info for process_iter_next_call() */
 		meta->iter.spi = spi;
 		meta->iter.frameno = reg->frameno;
-		meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+		update_ref_obj(&meta->ref_obj, &state->stack[spi].spilled_ptr);
 
 		if (is_iter_destroy_kfunc(meta)) {
 			err = unmark_stack_slots_iter(env, reg, nr_slots);
@@ -8041,12 +7816,11 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_DYNPTR]		= &dynptr_types,
 };
 
-static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
+static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
 			  enum bpf_arg_type arg_type,
 			  const u32 *arg_btf_id,
 			  struct bpf_call_arg_meta *meta)
 {
-	struct bpf_reg_state *reg = reg_state(env, regno);
 	enum bpf_reg_type expected, type = reg->type;
 	const struct bpf_reg_types *compatible;
 	int i, j, err;
@@ -8077,7 +7851,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
 		type &= ~DYNPTR_TYPE_FLAG_MASK;
 
 	/* Local kptr types are allowed as the source argument of bpf_kptr_xchg */
-	if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) {
+	if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && reg_from_argno(argno) == BPF_REG_2) {
 		type &= ~MEM_ALLOC;
 		type &= ~MEM_PERCPU;
 	}
@@ -8091,7 +7865,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
 			goto found;
 	}
 
-	verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
+	verbose(env, "%s type=%s expected=", reg_arg_name(env, argno), reg_type_str(env, reg->type));
 	for (j = 0; j + 1 < i; j++)
 		verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
 	verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
@@ -8104,9 +7878,9 @@ found:
 	if (compatible == &mem_types) {
 		if (!(arg_type & MEM_RDONLY)) {
 			verbose(env,
-				"%s() may write into memory pointed by R%d type=%s\n",
+				"%s() may write into memory pointed by %s type=%s\n",
 				func_id_name(meta->func_id),
-				regno, reg_type_str(env, reg->type));
+				reg_arg_name(env, argno), reg_type_str(env, reg->type));
 			return -EACCES;
 		}
 		return 0;
@@ -8129,7 +7903,8 @@ found:
 
 		if (type_may_be_null(reg->type) &&
 		    (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
-			verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
+			verbose(env, "Possibly NULL pointer passed to helper %s\n",
+				reg_arg_name(env, argno));
 			return -EACCES;
 		}
 
@@ -8142,25 +7917,26 @@ found:
 		}
 
 		if (meta->func_id == BPF_FUNC_kptr_xchg) {
-			if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+			if (map_kptr_match_type(env, meta->kptr_field, reg, reg_from_argno(argno)))
 				return -EACCES;
 		} else {
 			if (arg_btf_id == BPF_PTR_POISON) {
 				verbose(env, "verifier internal error:");
-				verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
-					regno);
+				verbose(env, "%s has non-overwritten BPF_PTR_POISON type\n",
+					reg_arg_name(env, argno));
 				return -EACCES;
 			}
 
-			err = __check_ptr_off_reg(env, reg, regno, true);
+			err = __check_ptr_off_reg(env, reg, argno, true);
 			if (err)
 				return err;
 
 			if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id,
 						  reg->var_off.value, btf_vmlinux, *arg_btf_id,
 						  strict_type_match)) {
-				verbose(env, "R%d is of type %s but %s is expected\n",
-					regno, btf_type_name(reg->btf, reg->btf_id),
+				verbose(env, "%s is of type %s but %s is expected\n",
+					reg_arg_name(env, argno),
+					btf_type_name(reg->btf, reg->btf_id),
 					btf_type_name(btf_vmlinux, *arg_btf_id));
 				return -EACCES;
 			}
@@ -8177,8 +7953,11 @@ found:
 			return -EFAULT;
 		}
 		/* Check if local kptr in src arg matches kptr in dst arg */
-		if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) {
-			if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+		if (meta->func_id == BPF_FUNC_kptr_xchg) {
+			int regno = reg_from_argno(argno);
+
+			if (regno == BPF_REG_2 &&
+			    map_kptr_match_type(env, meta->kptr_field, reg, regno))
 				return -EACCES;
 		}
 		break;
@@ -8212,7 +7991,7 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
 }
 
 static int check_func_arg_reg_off(struct bpf_verifier_env *env,
-				  const struct bpf_reg_state *reg, int regno,
+				  const struct bpf_reg_state *reg, argno_t argno,
 				  enum bpf_arg_type arg_type)
 {
 	u32 type = reg->type;
@@ -8220,7 +7999,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	/* When referenced register is passed to release function, its fixed
 	 * offset must be 0.
 	 *
-	 * We will check arg_type_is_release reg has ref_obj_id when storing
+	 * We will check arg_type_is_release reg has id when storing
 	 * meta->release_regno.
 	 */
 	if (arg_type_is_release(arg_type)) {
@@ -8238,8 +8017,8 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
 		 * to give the user a better error message.
 		 */
 		if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) {
-			verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
-				regno);
+			verbose(env, "%s must have zero offset when passed to release func or trusted arg to kfunc\n",
+				reg_arg_name(env, argno));
 			return -EINVAL;
 		}
 	}
@@ -8275,7 +8054,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
 		 * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we
 		 * still need to do checks instead of returning.
 		 */
-		return __check_ptr_off_reg(env, reg, regno, true);
+		return __check_ptr_off_reg(env, reg, argno, true);
 	case PTR_TO_CTX:
 		/*
 		 * Allow fixed and variable offsets for syscall context, but
@@ -8287,78 +8066,12 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
 			return 0;
 		fallthrough;
 	default:
-		return __check_ptr_off_reg(env, reg, regno, false);
-	}
-}
-
-static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
-						const struct bpf_func_proto *fn,
-						struct bpf_reg_state *regs)
-{
-	struct bpf_reg_state *state = NULL;
-	int i;
-
-	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
-		if (arg_type_is_dynptr(fn->arg_type[i])) {
-			if (state) {
-				verbose(env, "verifier internal error: multiple dynptr args\n");
-				return NULL;
-			}
-			state = &regs[BPF_REG_1 + i];
-		}
-
-	if (!state)
-		verbose(env, "verifier internal error: no dynptr arg found\n");
-
-	return state;
-}
-
-static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
-{
-	struct bpf_func_state *state = bpf_func(env, reg);
-	int spi;
-
-	if (reg->type == CONST_PTR_TO_DYNPTR)
-		return reg->id;
-	spi = dynptr_get_spi(env, reg);
-	if (spi < 0)
-		return spi;
-	return state->stack[spi].spilled_ptr.id;
-}
-
-static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
-{
-	struct bpf_func_state *state = bpf_func(env, reg);
-	int spi;
-
-	if (reg->type == CONST_PTR_TO_DYNPTR)
-		return reg->ref_obj_id;
-	spi = dynptr_get_spi(env, reg);
-	if (spi < 0)
-		return spi;
-	return state->stack[spi].spilled_ptr.ref_obj_id;
-}
-
-static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
-					    struct bpf_reg_state *reg)
-{
-	struct bpf_func_state *state = bpf_func(env, reg);
-	int spi;
-
-	if (reg->type == CONST_PTR_TO_DYNPTR)
-		return reg->dynptr.type;
-
-	spi = bpf_get_spi(reg->var_off.value);
-	if (spi < 0) {
-		verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
-		return BPF_DYNPTR_TYPE_INVALID;
+		return __check_ptr_off_reg(env, reg, argno, false);
 	}
-
-	return state->stack[spi].spilled_ptr.dynptr.type;
 }
 
-static int check_reg_const_str(struct bpf_verifier_env *env,
-			       struct bpf_reg_state *reg, u32 regno)
+static int check_arg_const_str(struct bpf_verifier_env *env,
+			       struct bpf_reg_state *reg, argno_t argno)
 {
 	struct bpf_map *map = reg->map_ptr;
 	int err;
@@ -8370,17 +8083,18 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
 		return -EINVAL;
 
 	if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
-		verbose(env, "R%d points to insn_array map which cannot be used as const string\n", regno);
+		verbose(env, "%s points to insn_array map which cannot be used as const string\n",
+			reg_arg_name(env, argno));
 		return -EACCES;
 	}
 
 	if (!bpf_map_is_rdonly(map)) {
-		verbose(env, "R%d does not point to a readonly map'\n", regno);
+		verbose(env, "%s does not point to a readonly map'\n", reg_arg_name(env, argno));
 		return -EACCES;
 	}
 
 	if (!tnum_is_const(reg->var_off)) {
-		verbose(env, "R%d is not a constant address'\n", regno);
+		verbose(env, "%s is not a constant address'\n", reg_arg_name(env, argno));
 		return -EACCES;
 	}
 
@@ -8389,7 +8103,7 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
-	err = check_map_access(env, regno, 0,
+	err = check_map_access(env, reg, argno, 0,
 			       map->value_size - reg->var_off.value, false,
 			       ACCESS_HELPER);
 	if (err)
@@ -8471,7 +8185,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env,
 	return 0;
 }
 
-static bool can_elide_value_nullness(enum bpf_map_type type);
+static bool can_elide_value_nullness(const struct bpf_map *map);
 
 static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 			  struct bpf_call_arg_meta *meta,
@@ -8481,6 +8195,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	u32 regno = BPF_REG_1 + arg;
 	struct bpf_reg_state *reg = reg_state(env, regno);
 	enum bpf_arg_type arg_type = fn->arg_type[arg];
+	argno_t argno = argno_from_arg(arg + 1);
 	enum bpf_reg_type type = reg->type;
 	u32 *arg_btf_id = NULL;
 	u32 key_size;
@@ -8525,56 +8240,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	    base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
 		arg_btf_id = fn->arg_btf_id[arg];
 
-	err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
+	err = check_reg_type(env, reg, argno_from_reg(regno), arg_type, arg_btf_id, meta);
 	if (err)
 		return err;
 
-	err = check_func_arg_reg_off(env, reg, regno, arg_type);
+	err = check_func_arg_reg_off(env, reg, argno_from_reg(regno), arg_type);
 	if (err)
 		return err;
 
 skip_type_check:
-	if (arg_type_is_release(arg_type)) {
-		if (arg_type_is_dynptr(arg_type)) {
-			struct bpf_func_state *state = bpf_func(env, reg);
-			int spi;
-
-			/* Only dynptr created on stack can be released, thus
-			 * the get_spi and stack state checks for spilled_ptr
-			 * should only be done before process_dynptr_func for
-			 * PTR_TO_STACK.
-			 */
-			if (reg->type == PTR_TO_STACK) {
-				spi = dynptr_get_spi(env, reg);
-				if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
-					verbose(env, "arg %d is an unacquired reference\n", regno);
-					return -EINVAL;
-				}
-			} else {
-				verbose(env, "cannot release unowned const bpf_dynptr\n");
-				return -EINVAL;
-			}
-		} else if (!reg->ref_obj_id && !bpf_register_is_null(reg)) {
-			verbose(env, "R%d must be referenced when passed to release function\n",
-				regno);
-			return -EINVAL;
-		}
-		if (meta->release_regno) {
-			verifier_bug(env, "more than one release argument");
-			return -EFAULT;
-		}
-		meta->release_regno = regno;
+	if (arg_type_is_release(arg_type) && !arg_type_is_dynptr(arg_type) &&
+	    !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) {
+		verbose(env, "release helper %s expects referenced PTR_TO_BTF_ID passed to %s\n",
+			func_id_name(meta->func_id), reg_arg_name(env, argno));
+		return -EINVAL;
 	}
 
-	if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) {
-		if (meta->ref_obj_id) {
-			verbose(env, "more than one arg with ref_obj_id R%d %u %u",
-				regno, reg->ref_obj_id,
-				meta->ref_obj_id);
-			return -EACCES;
-		}
-		meta->ref_obj_id = reg->ref_obj_id;
-	}
+	if (reg_is_referenced(env, reg))
+		update_ref_obj(&meta->ref_obj, reg);
 
 	switch (base_type(arg_type)) {
 	case ARG_CONST_MAP_PTR:
@@ -8618,10 +8301,10 @@ skip_type_check:
 			return -EFAULT;
 		}
 		key_size = meta->map.ptr->key_size;
-		err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
+		err = check_helper_mem_access(env, reg, argno_from_reg(regno), key_size, BPF_READ, false, NULL);
 		if (err)
 			return err;
-		if (can_elide_value_nullness(meta->map.ptr->map_type)) {
+		if (can_elide_value_nullness(meta->map.ptr)) {
 			err = get_constant_map_key(env, reg, key_size, &meta->const_map_key);
 			if (err < 0) {
 				meta->const_map_key = -1;
@@ -8645,7 +8328,7 @@ skip_type_check:
 			return -EFAULT;
 		}
 		meta->raw_mode = arg_type & MEM_UNINIT;
-		err = check_helper_mem_access(env, regno, meta->map.ptr->value_size,
+		err = check_helper_mem_access(env, reg, argno_from_reg(regno), meta->map.ptr->value_size,
 					      arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
 					      false, meta);
 		break;
@@ -8663,11 +8346,11 @@ skip_type_check:
 			return -EACCES;
 		}
 		if (meta->func_id == BPF_FUNC_spin_lock) {
-			err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK);
+			err = process_spin_lock(env, reg, argno_from_reg(regno), PROCESS_SPIN_LOCK);
 			if (err)
 				return err;
 		} else if (meta->func_id == BPF_FUNC_spin_unlock) {
-			err = process_spin_lock(env, regno, 0);
+			err = process_spin_lock(env, reg, argno_from_reg(regno), 0);
 			if (err)
 				return err;
 		} else {
@@ -8676,7 +8359,7 @@ skip_type_check:
 		}
 		break;
 	case ARG_PTR_TO_TIMER:
-		err = process_timer_helper(env, regno, meta);
+		err = process_timer_helper(env, reg, argno_from_reg(regno), meta);
 		if (err)
 			return err;
 		break;
@@ -8689,7 +8372,7 @@ skip_type_check:
 		 */
 		meta->raw_mode = arg_type & MEM_UNINIT;
 		if (arg_type & MEM_FIXED_SIZE) {
-			err = check_helper_mem_access(env, regno, fn->arg_size[arg],
+			err = check_helper_mem_access(env, reg, argno_from_reg(regno), fn->arg_size[arg],
 						      arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
 						      false, meta);
 			if (err)
@@ -8699,19 +8382,22 @@ skip_type_check:
 		}
 		break;
 	case ARG_CONST_SIZE:
-		err = check_mem_size_reg(env, reg, regno,
+		err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1),
+					 argno_from_reg(regno),
 					 fn->arg_type[arg - 1] & MEM_WRITE ?
 					 BPF_WRITE : BPF_READ,
 					 false, meta);
 		break;
 	case ARG_CONST_SIZE_OR_ZERO:
-		err = check_mem_size_reg(env, reg, regno,
+		err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1),
+					 argno_from_reg(regno),
 					 fn->arg_type[arg - 1] & MEM_WRITE ?
 					 BPF_WRITE : BPF_READ,
 					 true, meta);
 		break;
 	case ARG_PTR_TO_DYNPTR:
-		err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
+		err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, &meta->ref_obj,
+					  &meta->dynptr);
 		if (err)
 			return err;
 		break;
@@ -8728,7 +8414,7 @@ skip_type_check:
 		break;
 	case ARG_PTR_TO_CONST_STR:
 	{
-		err = check_reg_const_str(env, reg, regno);
+		err = check_arg_const_str(env, reg, argno_from_reg(regno));
 		if (err)
 			return err;
 		break;
@@ -9130,11 +8816,29 @@ static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn)
 	return true;
 }
 
-static int check_func_proto(const struct bpf_func_proto *fn)
+static bool check_proto_release_reg(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
+		enum bpf_arg_type arg_type = fn->arg_type[i];
+
+		if (arg_type_is_release(arg_type)) {
+			if (meta->release_regno)	/* nonzero: one was already seen (needs 0 on entry) */
+				return false;	/* second release arg: invalid proto */
+			meta->release_regno = i + 1;	/* 1-based argument position */
+		}
+	}
+	/* Zero or one release arg found; meta->release_regno is its 1-based position, if any. */
+	return true;
+}
+
+static int check_func_proto(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta)
 {
 	return check_raw_mode_ok(fn) &&
 	       check_arg_pair_ok(fn) &&
 	       check_mem_arg_rw_flag_ok(fn) &&
+	       check_proto_release_reg(fn, meta) &&
 	       check_btf_id_ok(fn) ? 0 : -EINVAL;
 }
 
@@ -9181,14 +8885,14 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range
 		reg->range = AT_PKT_END;
 }
 
-static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id)
+static int release_reference_nomark(struct bpf_verifier_state *state, int id)
 {
 	int i;
 
 	for (i = 0; i < state->acquired_refs; i++) {
 		if (state->refs[i].type != REF_TYPE_PTR)
 			continue;
-		if (state->refs[i].id == ref_obj_id) {
+		if (state->refs[i].id == id) {
 			release_reference_state(state, i);
 			return 0;
 		}
@@ -9196,26 +8900,83 @@ static int release_reference_nomark(struct bpf_verifier_state *state, int ref_ob
 	return -EINVAL;
 }
 
-/* The pointer with the specified id has released its reference to kernel
- * resources. Identify all copies of the same pointer and clear the reference.
- *
- * This is the release function corresponding to acquire_reference(). Idempotent.
- */
-static int release_reference(struct bpf_verifier_env *env, int ref_obj_id)
+static int idstack_push(struct bpf_idmap *idmap, u32 id)
 {
+	int i;
+
+	if (!id)
+		return 0;	/* 0 is never stored; idstack_pop() returns 0 when empty */
+	/* Ids currently held in map[0..cnt) are not stored a second time. */
+	for (i = 0; i < idmap->cnt; i++)
+		if (idmap->map[i].old == id)
+			return 0;
+	/* Refuse to grow past BPF_ID_MAP_SIZE entries; warn once and fail with -EFAULT. */
+	if (WARN_ON_ONCE(idmap->cnt >= BPF_ID_MAP_SIZE))
+		return -EFAULT;
+
+	idmap->map[idmap->cnt++].old = id;	/* only the .old field of the slot is used */
+	return 0;
+}
+
+static int idstack_pop(struct bpf_idmap *idmap)
+{
+	if (!idmap->cnt)
+		return 0;	/* empty: 0 doubles as the "nothing left" result */
+	/* Otherwise remove and return the most recently pushed id. */
+	return idmap->map[--idmap->cnt].old;
+}
+
+/* Release id and objects derived from it iteratively in a DFS manner */
+static int release_reference(struct bpf_verifier_env *env, int id)
+{
+	u32 mask = (1 << STACK_SPILL) | (1 << STACK_DYNPTR);
 	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_idmap *idstack = &env->idmap_scratch;
+	struct bpf_stack_state *stack;
 	struct bpf_func_state *state;
 	struct bpf_reg_state *reg;
-	int err;
+	int i, err;
 
-	err = release_reference_nomark(vstate, ref_obj_id);
+	idstack->cnt = 0;
+	err = idstack_push(idstack, id);
 	if (err)
 		return err;
 
-	bpf_for_each_reg_in_vstate(vstate, state, reg, ({
-		if (reg->ref_obj_id == ref_obj_id)
-			mark_reg_invalid(env, reg);
-	}));
+	if (find_reference_state(vstate, id))
+		WARN_ON_ONCE(release_reference_nomark(vstate, id));
+
+	while ((id = idstack_pop(idstack))) {
+		/*
+		 * Child references are inaccessible after parent is released,
+		 * any child references that exist at this point are a leak.
+		 */
+		for (i = 0; i < vstate->acquired_refs; i++) {
+			if (vstate->refs[i].type != REF_TYPE_PTR)
+				continue;
+			if (vstate->refs[i].parent_id != id)
+				continue;
+			verbose(env, "Leaking reference id=%d alloc_insn=%d. Release it first.\n",
+				vstate->refs[i].id, vstate->refs[i].insn_idx);
+			return -EINVAL;
+		}
+
+		bpf_for_each_reg_in_vstate_mask(vstate, state, reg, stack, mask, ({
+			if (reg->id != id && reg->parent_id != id)
+				continue;
+
+			/* Free objects derived from the current object */
+			if (reg->parent_id == id) {
+				err = idstack_push(idstack, reg->id);
+				if (err)
+					return err;
+			}
+
+			if (!stack || stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL)
+				mark_reg_invalid(env, reg);
+			else if (stack->slot_type[BPF_REG_SIZE - 1] == STACK_DYNPTR)
+				invalidate_dynptr(env, stack);
+		}));
+	}
 
 	return 0;
 }
@@ -9231,6 +8992,42 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
 	}));
 }
 
+static void invalidate_rcu_protected_refs(struct bpf_verifier_env *env)
+{
+	struct bpf_stack_state *stack;
+	struct bpf_func_state *state;
+	struct bpf_reg_state *reg;
+	u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);	/* NOTE(review): presumably the stack slot kinds to visit - confirm against the macro */
+
+	bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, clear_mask, ({
+		if (reg->type & MEM_RCU) {	/* only registers currently tagged MEM_RCU are touched */
+			reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);	/* drop both flags ... */
+			reg->type |= PTR_UNTRUSTED;	/* ... and retag the pointer as untrusted */
+		}
+	}));
+}
+
+static int ref_convert_alloc_rcu_protected(struct bpf_verifier_env *env, u32 id)
+{
+	struct bpf_func_state *state;
+	struct bpf_reg_state *reg;
+	int err = release_reference_nomark(env->cur_state, id);	/* drop the REF_TYPE_PTR entry for @id */
+
+	if (err)
+		return err;	/* no such reference: leave every register untouched */
+
+	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+		if (reg->id != id)
+			continue;
+		if ((reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
+			reg->id = 0;	/* no longer tied to the released reference */
+			reg->type &= ~MEM_ALLOC;
+			reg->type |= MEM_RCU;	/* retagged from MEM_ALLOC to MEM_RCU */
+		}
+	}));
+	return 0;
+}
+
 static void clear_caller_saved_regs(struct bpf_verifier_env *env,
 				    struct bpf_reg_state *regs)
 {
@@ -9243,6 +9040,15 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
 	}
 }
 
+static void invalidate_outgoing_stack_args(const struct bpf_verifier_env *env,
+					   struct bpf_func_state *state)
+{
+	int i, nslots = state->out_stack_arg_cnt;	/* number of entries to reset */
+
+	for (i = 0; i < nslots; i++)	/* first out_stack_arg_cnt entries of stack_arg_regs */
+		bpf_mark_reg_not_init(env, &state->stack_arg_regs[i]);	/* entry becomes "not init" */
+}
+
 typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
 				   struct bpf_func_state *caller,
 				   struct bpf_func_state *callee,
@@ -9305,11 +9111,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 				    struct bpf_reg_state *regs)
 {
 	struct bpf_subprog_info *sub = subprog_info(env, subprog);
+	struct bpf_func_state *caller = cur_func(env);
 	struct bpf_verifier_log *log = &env->log;
+	struct ref_obj_desc ref_obj = {};
 	u32 i;
-	int ret;
+	int ret, err;
 
 	ret = btf_prepare_func_args(env, subprog);
+	if (ret) {
+		if (bpf_in_stack_arg_cnt(sub) > 0) {
+			err = check_outgoing_stack_args(env, caller, sub->arg_cnt);
+			if (err)
+				return err;
+		}
+		return ret;
+	}
+
+	ret = check_outgoing_stack_args(env, caller, sub->arg_cnt);
 	if (ret)
 		return ret;
 
@@ -9317,13 +9135,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 	 * verifier sees.
 	 */
 	for (i = 0; i < sub->arg_cnt; i++) {
-		u32 regno = i + 1;
-		struct bpf_reg_state *reg = &regs[regno];
+		argno_t argno = argno_from_arg(i + 1);
+		struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i);
 		struct bpf_subprog_arg_info *arg = &sub->args[i];
 
 		if (arg->arg_type == ARG_ANYTHING) {
 			if (reg->type != SCALAR_VALUE) {
-				bpf_log(log, "R%d is not a scalar\n", regno);
+				bpf_log(log, "%s is not a scalar\n", reg_arg_name(env, argno));
 				return -EINVAL;
 			}
 		} else if (arg->arg_type & PTR_UNTRUSTED) {
@@ -9333,24 +9151,26 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 			 * invalid memory access.
 			 */
 		} else if (arg->arg_type == ARG_PTR_TO_CTX) {
-			ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_CTX);
+			ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_CTX);
 			if (ret < 0)
 				return ret;
 			/* If function expects ctx type in BTF check that caller
 			 * is passing PTR_TO_CTX.
 			 */
 			if (reg->type != PTR_TO_CTX) {
-				bpf_log(log, "arg#%d expects pointer to ctx\n", i);
+				bpf_log(log, "%s expects pointer to ctx\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
 		} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
-			ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+			ret = check_func_arg_reg_off(env, reg, argno, ARG_DONTCARE);
 			if (ret < 0)
 				return ret;
-			if (check_mem_reg(env, reg, regno, arg->mem_size))
+			if (check_mem_reg(env, reg, argno, arg->mem_size))
 				return -EINVAL;
 			if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) {
-				bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
+				bpf_log(log, "%s is expected to be non-NULL\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
 		} else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
@@ -9362,15 +9182,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 			 * run-time debug nightmare.
 			 */
 			if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
-				bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno);
+				bpf_log(log, "%s is not a pointer to arena or scalar.\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
-		} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
-			ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR);
+		} else if (arg->arg_type == ARG_PTR_TO_DYNPTR) {
+			ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_DYNPTR);
 			if (ret)
 				return ret;
 
-			ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
+			ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, &ref_obj, NULL);
 			if (ret)
 				return ret;
 		} else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
@@ -9381,12 +9202,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 				continue;
 
 			memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
-			err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
-			err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
+			err = check_reg_type(env, reg, argno, arg->arg_type, &arg->btf_id, &meta);
+			err = err ?: check_func_arg_reg_off(env, reg, argno, arg->arg_type);
 			if (err)
 				return err;
 		} else {
-			verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type);
+			verifier_bug(env, "unrecognized %s type %d",
+				     reg_arg_name(env, argno), arg->arg_type);
 			return -EFAULT;
 		}
 	}
@@ -9505,6 +9327,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			   int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_subprog_info *caller_info;
+	u16 callee_incoming, stack_arg_cnt;
 	struct bpf_func_state *caller;
 	int err, subprog, target_insn;
 
@@ -9547,6 +9371,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		/* mark global subprog for verifying after main prog */
 		subprog_aux(env, subprog)->called = true;
 		clear_caller_saved_regs(env, caller->regs);
+		invalidate_outgoing_stack_args(env, cur_func(env));
 
 		/* All non-void global functions return a 64-bit SCALAR_VALUE. */
 		if (!subprog_returns_void(env, subprog)) {
@@ -9569,6 +9394,16 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return 0;
 	}
 
+	/*
+	 * Track caller's total stack arg count (incoming + max outgoing).
+	 * This is needed so the JIT knows how much stack arg space to allocate.
+	 */
+	caller_info = &env->subprog_info[caller->subprogno];
+	callee_incoming = bpf_in_stack_arg_cnt(&env->subprog_info[subprog]);
+	stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + callee_incoming;
+	if (stack_arg_cnt > caller_info->stack_arg_cnt)
+		caller_info->stack_arg_cnt = stack_arg_cnt;
+
 	/* for regular function entry setup new frame and continue
 	 * from that frame.
 	 */
@@ -9852,9 +9687,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
 static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
 {
 	if (range.return_32bit)
-		return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval;
+		return range.minval <= reg_s32_min(reg) && reg_s32_max(reg) <= range.maxval;
 	else
-		return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+		return range.minval <= reg_smin(reg) && reg_smax(reg) <= range.maxval;
 }
 
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
@@ -9926,6 +9761,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	 * bpf_throw, this will be done by copy_verifier_state for extra frames. */
 	free_func_state(callee);
 	state->frame[state->curframe--] = NULL;
+	invalidate_outgoing_stack_args(env, caller);
 
 	/* for callbacks widen imprecise scalars to make programs like below verify:
 	 *
@@ -9952,7 +9788,9 @@ static int do_refine_retval_range(struct bpf_verifier_env *env,
 				  int func_id,
 				  struct bpf_call_arg_meta *meta)
 {
+	struct bpf_retval_range range;
 	struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 
 	if (ret_type != RET_INTEGER)
 		return 0;
@@ -9963,21 +9801,36 @@ static int do_refine_retval_range(struct bpf_verifier_env *env,
 	case BPF_FUNC_probe_read_str:
 	case BPF_FUNC_probe_read_kernel_str:
 	case BPF_FUNC_probe_read_user_str:
-		ret_reg->smax_value = meta->msize_max_value;
-		ret_reg->s32_max_value = meta->msize_max_value;
-		ret_reg->smin_value = -MAX_ERRNO;
-		ret_reg->s32_min_value = -MAX_ERRNO;
+		reg_set_srange64(ret_reg, -MAX_ERRNO, meta->msize_max_value);
+		reg_set_srange32(ret_reg, -MAX_ERRNO, meta->msize_max_value);
 		reg_bounds_sync(ret_reg);
 		break;
 	case BPF_FUNC_get_smp_processor_id:
-		ret_reg->umax_value = nr_cpu_ids - 1;
-		ret_reg->u32_max_value = nr_cpu_ids - 1;
-		ret_reg->smax_value = nr_cpu_ids - 1;
-		ret_reg->s32_max_value = nr_cpu_ids - 1;
-		ret_reg->umin_value = 0;
-		ret_reg->u32_min_value = 0;
-		ret_reg->smin_value = 0;
-		ret_reg->s32_min_value = 0;
+		reg_set_urange64(ret_reg, 0, nr_cpu_ids - 1);
+		reg_set_urange32(ret_reg, 0, nr_cpu_ids - 1);
+		reg_bounds_sync(ret_reg);
+		break;
+	case BPF_FUNC_get_retval:
+		/*
+		 * bpf_get_retval may see arbitrary value passed by bpf_prog_run_array_cg for
+		 * CGROUP_GETSOCKOPT type.
+		 */
+		if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT &&
+		    env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT)
+			break;
+
+		if (prog_type == BPF_PROG_TYPE_LSM &&
+		    env->prog->expected_attach_type == BPF_LSM_CGROUP) {
+			if (!env->prog->aux->attach_func_proto->type)
+				break;
+			bpf_lsm_get_retval_range(env->prog, &range);
+		} else {
+			range.minval = -MAX_ERRNO;
+			range.maxval = 0;
+		}
+
+		reg_set_srange64(ret_reg, range.minval, range.maxval);
+		reg_set_srange32(ret_reg, range.minval, range.maxval);
 		reg_bounds_sync(ret_reg);
 		break;
 	}
@@ -10086,7 +9939,7 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
 		 * kernel. Type checks are performed later in check_return_code.
 		 */
 		if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit &&
-		    reg->ref_obj_id == state->refs[i].id)
+		    reg->id == state->refs[i].id)
 			continue;
 		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
 			state->refs[i].id, state->refs[i].insn_idx);
@@ -10221,13 +10074,16 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno
 				 state->callback_subprogno == subprogno);
 }
 
-/* Returns whether or not the given map type can potentially elide
+/* Returns whether or not the given map can potentially elide
  * lookup return value nullness check. This is possible if the key
  * is statically known.
  */
-static bool can_elide_value_nullness(enum bpf_map_type type)
+static bool can_elide_value_nullness(const struct bpf_map *map)
 {
-	switch (type) {
+	if (map->map_flags & BPF_F_INNER_MAP)
+		return false;
+
+	switch (map->map_type) {
 	case BPF_MAP_TYPE_ARRAY:
 	case BPF_MAP_TYPE_PERCPU_ARRAY:
 		return true;
@@ -10272,6 +10128,24 @@ static const char *non_sleepable_context_description(struct bpf_verifier_env *en
 	return "non-sleepable prog";
 }
 
+static int release_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+		       bool convert_rcu, bool release_dynptr)
+{
+	int err = -EINVAL;	/* result when none of the three release paths below applies */
+
+	if (bpf_register_is_null(reg))
+		return 0;	/* a NULL register is accepted as-is; nothing is released */
+	/* At most one path runs, tried in order: dynptr, RCU conversion, plain reference. */
+	if (release_dynptr)
+		err = unmark_stack_slots_dynptr(env, reg);
+	else if (convert_rcu)
+		err = ref_convert_alloc_rcu_protected(env, reg->id);
+	else if (reg_is_referenced(env, reg))
+		err = release_reference(env, reg->id);
+	/* NOTE(review): the -EINVAL fall-through is returned without any verbose() message here. */
+	return err;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			     int *insn_idx_p)
 {
@@ -10321,7 +10195,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	memset(&meta, 0, sizeof(meta));
 	meta.pkt_access = fn->pkt_access;
 
-	err = check_func_proto(fn);
+	err = check_func_proto(fn, &meta);
 	if (err) {
 		verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id);
 		return err;
@@ -10353,55 +10227,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	if (err)
 		return err;
 
+	regs = cur_regs(env);
+
 	/* Mark slots with STACK_MISC in case of raw mode, stack offset
 	 * is inferred from register state.
 	 */
 	for (i = 0; i < meta.access_size; i++) {
-		err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B,
+		err = check_mem_access(env, insn_idx, regs + meta.regno, argno_from_reg(meta.regno), i, BPF_B,
 				       BPF_WRITE, -1, false, false);
 		if (err)
 			return err;
 	}
 
-	regs = cur_regs(env);
-
 	if (meta.release_regno) {
-		err = -EINVAL;
-		if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
-			err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
-		} else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
-			u32 ref_obj_id = meta.ref_obj_id;
-			bool in_rcu = in_rcu_cs(env);
-			struct bpf_func_state *state;
-			struct bpf_reg_state *reg;
-
-			err = release_reference_nomark(env->cur_state, ref_obj_id);
-			if (!err) {
-				bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
-					if (reg->ref_obj_id == ref_obj_id) {
-						if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
-							reg->ref_obj_id = 0;
-							reg->type &= ~MEM_ALLOC;
-							reg->type |= MEM_RCU;
-						} else {
-							mark_reg_invalid(env, reg);
-						}
-					}
-				}));
-			}
-		} else if (meta.ref_obj_id) {
-			err = release_reference(env, meta.ref_obj_id);
-		} else if (bpf_register_is_null(&regs[meta.release_regno])) {
-			/* meta.ref_obj_id can only be 0 if register that is meant to be
-			 * released is NULL, which must be > R0.
-			 */
-			err = 0;
-		}
-		if (err) {
-			verbose(env, "func %s#%d reference has not been acquired before\n",
-				func_id_name(func_id), func_id);
+		struct bpf_reg_state *reg = &regs[meta.release_regno];
+		bool convert_rcu = (func_id == BPF_FUNC_kptr_xchg) && in_rcu_cs(env) &&
+				   (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU);
+
+		err = release_reg(env, reg, convert_rcu, !!meta.dynptr.id);
+		if (err)
 			return err;
-		}
 	}
 
 	switch (func_id) {
@@ -10442,7 +10287,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		err = mark_chain_precision(env, BPF_REG_1);
 		if (err)
 			return err;
-		if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
+		if (cur_func(env)->callback_depth < reg_umax(&regs[BPF_REG_1])) {
 			err = push_callback_call(env, insn, insn_idx, meta.subprogno,
 						 set_loop_callback_state);
 		} else {
@@ -10460,6 +10305,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		}
 		break;
 	case BPF_FUNC_set_retval:
+	{
+		struct bpf_retval_range range = {
+			.minval = -MAX_ERRNO,
+			.maxval = 0,
+			.return_32bit = true
+		};
+		struct bpf_reg_state *r1 = &regs[BPF_REG_1];
+
+		if (r1->type != SCALAR_VALUE) {
+			verbose(env, "R1 is not a scalar\n");
+			return -EINVAL;
+		}
+
+		/* CGROUP_GETSOCKOPT is allowed to return arbitrary value */
+		if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT &&
+		    env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT)
+			break;
+
 		if (prog_type == BPF_PROG_TYPE_LSM &&
 		    env->prog->expected_attach_type == BPF_LSM_CGROUP) {
 			if (!env->prog->aux->attach_func_proto->type) {
@@ -10469,54 +10332,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 				verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
 				return -EINVAL;
 			}
+			bpf_lsm_get_retval_range(env->prog, &range);
 		}
-		break;
-	case BPF_FUNC_dynptr_data:
-	{
-		struct bpf_reg_state *reg;
-		int id, ref_obj_id;
 
-		reg = get_dynptr_arg_reg(env, fn, regs);
-		if (!reg)
-			return -EFAULT;
-
-
-		if (meta.dynptr_id) {
-			verifier_bug(env, "meta.dynptr_id already set");
-			return -EFAULT;
-		}
-		if (meta.ref_obj_id) {
-			verifier_bug(env, "meta.ref_obj_id already set");
-			return -EFAULT;
-		}
-
-		id = dynptr_id(env, reg);
-		if (id < 0) {
-			verifier_bug(env, "failed to obtain dynptr id");
-			return id;
-		}
+		err = mark_chain_precision(env, BPF_REG_1);
+		if (err)
+			return err;
 
-		ref_obj_id = dynptr_ref_obj_id(env, reg);
-		if (ref_obj_id < 0) {
-			verifier_bug(env, "failed to obtain dynptr ref_obj_id");
-			return ref_obj_id;
+		if (!retval_range_within(range, r1)) {
+			verbose_invalid_scalar(env, r1, range, "At bpf_set_retval", "R1");
+			return -EINVAL;
 		}
 
-		meta.dynptr_id = id;
-		meta.ref_obj_id = ref_obj_id;
-
 		break;
 	}
 	case BPF_FUNC_dynptr_write:
 	{
-		enum bpf_dynptr_type dynptr_type;
-		struct bpf_reg_state *reg;
+		enum bpf_dynptr_type dynptr_type = meta.dynptr.type;
 
-		reg = get_dynptr_arg_reg(env, fn, regs);
-		if (!reg)
-			return -EFAULT;
-
-		dynptr_type = dynptr_get_type(env, reg);
 		if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
 			return -EFAULT;
 
@@ -10560,6 +10393,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		bpf_mark_reg_not_init(env, &regs[caller_saved[i]]);
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
 	}
+	invalidate_outgoing_stack_args(env, cur_func(env));
 
 	/* helper call returns 64-bit value. */
 	regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
@@ -10589,7 +10423,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		}
 
 		if (func_id == BPF_FUNC_map_lookup_elem &&
-		    can_elide_value_nullness(meta.map.ptr->map_type) &&
+		    can_elide_value_nullness(meta.map.ptr) &&
 		    meta.const_map_key >= 0 &&
 		    meta.const_map_key < meta.map.ptr->max_entries)
 			ret_flag &= ~PTR_MAYBE_NULL;
@@ -10701,29 +10535,45 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	if (type_may_be_null(regs[BPF_REG_0].type))
 		regs[BPF_REG_0].id = ++env->id_gen;
 
-	if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) {
-		verifier_bug(env, "func %s#%d sets ref_obj_id more than once",
-			     func_id_name(func_id), func_id);
-		return -EFAULT;
-	}
+	if (is_ptr_cast_function(func_id) &&
+	    find_reference_state(env->cur_state, meta.ref_obj.id)) {
+		struct bpf_verifier_state *branch;
+		struct bpf_reg_state *r0;
 
-	if (is_dynptr_ref_function(func_id))
-		regs[BPF_REG_0].dynptr_id = meta.dynptr_id;
+		err = validate_ref_obj(env, &meta.ref_obj);
+		if (err)
+			return err;
 
-	if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
-		/* For release_reference() */
-		regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+		/*
+		 * In order for a release of any of the original or cast pointers
+		 * to invalidate all other pointers, reuse the same reference id for
+		 * the cast result.
+		 * This reference id can't be used for nullness propagation,
+		 * as cast might return NULL for a non-NULL input.
+		 * Hence, explore the NULL case as a separate branch.
+		 */
+		branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+		if (IS_ERR(branch))
+			return PTR_ERR(branch);
+
+		r0 = &branch->frame[branch->curframe]->regs[BPF_REG_0];
+		__mark_reg_known_zero(r0);
+		r0->type = SCALAR_VALUE;
+
+		regs[BPF_REG_0].type &= ~PTR_MAYBE_NULL;
+		regs[BPF_REG_0].id = meta.ref_obj.id;
 	} else if (is_acquire_function(func_id, meta.map.ptr)) {
-		int id = acquire_reference(env, insn_idx);
+		int id = acquire_reference(env, insn_idx, 0);
 
 		if (id < 0)
 			return id;
-		/* For mark_ptr_or_null_reg() */
+
 		regs[BPF_REG_0].id = id;
-		/* For release_reference() */
-		regs[BPF_REG_0].ref_obj_id = id;
 	}
 
+	if (func_id == BPF_FUNC_dynptr_data)
+		regs[BPF_REG_0].parent_id = meta.dynptr.id;
+
 	err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
 	if (err)
 		return err;
@@ -10819,7 +10669,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
 	return meta->kfunc_flags & KF_RELEASE;
 }
 
-
 static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
 {
 	return meta->kfunc_flags & KF_DESTRUCTIVE;
@@ -10896,6 +10745,11 @@ static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param
 	return btf_param_match_suffix(btf, arg, "__nullable");
 }
 
+static bool is_kfunc_arg_nonown_allowed(const struct btf *btf, const struct btf_param *arg)
+{
+	return btf_param_match_suffix(btf, arg, "__nonown_allowed");	/* e.g. prev__nonown_allowed */
+}
+
 static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
 {
 	return btf_param_match_suffix(btf, arg, "__str");
@@ -11136,10 +10990,15 @@ enum special_kfunc_type {
 	KF_bpf_list_push_front,
 	KF_bpf_list_push_back_impl,
 	KF_bpf_list_push_back,
+	KF_bpf_list_add,
 	KF_bpf_list_pop_front,
 	KF_bpf_list_pop_back,
+	KF_bpf_list_del,
 	KF_bpf_list_front,
 	KF_bpf_list_back,
+	KF_bpf_list_is_first,
+	KF_bpf_list_is_last,
+	KF_bpf_list_empty,
 	KF_bpf_cast_to_kern_ctx,
 	KF_bpf_rdonly_cast,
 	KF_bpf_rcu_read_lock,
@@ -11204,10 +11063,15 @@ BTF_ID(func, bpf_list_push_front_impl)
 BTF_ID(func, bpf_list_push_front)
 BTF_ID(func, bpf_list_push_back_impl)
 BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_add)
 BTF_ID(func, bpf_list_pop_front)
 BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_list_del)
 BTF_ID(func, bpf_list_front)
 BTF_ID(func, bpf_list_back)
+BTF_ID(func, bpf_list_is_first)
+BTF_ID(func, bpf_list_is_last)
+BTF_ID(func, bpf_list_empty)
 BTF_ID(func, bpf_cast_to_kern_ctx)
 BTF_ID(func, bpf_rdonly_cast)
 BTF_ID(func, bpf_rcu_read_lock)
@@ -11319,7 +11183,8 @@ static bool is_bpf_list_push_kfunc(u32 func_id)
 	return func_id == special_kfunc_list[KF_bpf_list_push_front] ||
 	       func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
 	       func_id == special_kfunc_list[KF_bpf_list_push_back] ||
-	       func_id == special_kfunc_list[KF_bpf_list_push_back_impl];
+	       func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+	       func_id == special_kfunc_list[KF_bpf_list_add];
 }
 
 static bool is_bpf_rbtree_add_kfunc(u32 func_id)
@@ -11368,15 +11233,12 @@ bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta)
 }
 
 static enum kfunc_ptr_arg_type
-get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
-		       struct bpf_kfunc_call_arg_meta *meta,
+get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_func_state *caller,
+		       struct bpf_reg_state *regs, struct bpf_kfunc_call_arg_meta *meta,
 		       const struct btf_type *t, const struct btf_type *ref_t,
 		       const char *ref_tname, const struct btf_param *args,
-		       int argno, int nargs)
+		       int arg, int nargs, argno_t argno, struct bpf_reg_state *reg)
 {
-	u32 regno = argno + 1;
-	struct bpf_reg_state *regs = cur_regs(env);
-	struct bpf_reg_state *reg = &regs[regno];
 	bool arg_mem_size = false;
 
 	if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
@@ -11384,9 +11246,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	    meta->func_id == special_kfunc_list[KF_bpf_session_cookie])
 		return KF_ARG_PTR_TO_CTX;
 
-	if (argno + 1 < nargs &&
-	    (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
-	     is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
+	if (arg + 1 < nargs &&
+	    (is_kfunc_arg_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1)) ||
+	     is_kfunc_arg_const_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1))))
 		arg_mem_size = true;
 
 	/* In this function, we verify the kfunc's BTF as per the argument type,
@@ -11394,68 +11256,69 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	 * type to our caller. When a set of conditions hold in the BTF type of
 	 * arguments, we resolve it to a known kfunc_ptr_arg_type.
 	 */
-	if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+	if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), arg))
 		return KF_ARG_PTR_TO_CTX;
 
-	if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && bpf_register_is_null(reg) &&
+	if (is_kfunc_arg_nullable(meta->btf, &args[arg]) && bpf_register_is_null(reg) &&
 	    !arg_mem_size)
 		return KF_ARG_PTR_TO_NULL;
 
-	if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
+	if (is_kfunc_arg_alloc_obj(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_ALLOC_BTF_ID;
 
-	if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
+	if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
 
-	if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
+	if (is_kfunc_arg_dynptr(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_DYNPTR;
 
-	if (is_kfunc_arg_iter(meta, argno, &args[argno]))
+	if (is_kfunc_arg_iter(meta, arg, &args[arg]))
 		return KF_ARG_PTR_TO_ITER;
 
-	if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
+	if (is_kfunc_arg_list_head(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_LIST_HEAD;
 
-	if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
+	if (is_kfunc_arg_list_node(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_LIST_NODE;
 
-	if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno]))
+	if (is_kfunc_arg_rbtree_root(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_RB_ROOT;
 
-	if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
+	if (is_kfunc_arg_rbtree_node(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_RB_NODE;
 
-	if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
+	if (is_kfunc_arg_const_str(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_CONST_STR;
 
-	if (is_kfunc_arg_map(meta->btf, &args[argno]))
+	if (is_kfunc_arg_map(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_MAP;
 
-	if (is_kfunc_arg_wq(meta->btf, &args[argno]))
+	if (is_kfunc_arg_wq(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_WORKQUEUE;
 
-	if (is_kfunc_arg_timer(meta->btf, &args[argno]))
+	if (is_kfunc_arg_timer(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_TIMER;
 
-	if (is_kfunc_arg_task_work(meta->btf, &args[argno]))
+	if (is_kfunc_arg_task_work(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_TASK_WORK;
 
-	if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
+	if (is_kfunc_arg_irq_flag(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_IRQ_FLAG;
 
-	if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno]))
+	if (is_kfunc_arg_res_spin_lock(meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_RES_SPIN_LOCK;
 
 	if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
 		if (!btf_type_is_struct(ref_t)) {
-			verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
-				meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+			verbose(env, "kernel function %s %s pointer type %s %s is not supported\n",
+				meta->func_name, reg_arg_name(env, argno),
+				btf_type_str(ref_t), ref_tname);
 			return -EINVAL;
 		}
 		return KF_ARG_PTR_TO_BTF_ID;
 	}
 
-	if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
+	if (is_kfunc_arg_callback(env, meta->btf, &args[arg]))
 		return KF_ARG_PTR_TO_CALLBACK;
 
 	/* This is the catch all argument type of register types supported by
@@ -11465,8 +11328,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	 */
 	if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
 	    (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
-		verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
-			argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
+		verbose(env, "%s pointer type %s %s must point to %sscalar, or struct with scalar\n",
+			reg_arg_name(env, argno),
+			btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
 		return -EINVAL;
 	}
 	return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
@@ -11477,7 +11341,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
 					const struct btf_type *ref_t,
 					const char *ref_tname, u32 ref_id,
 					struct bpf_kfunc_call_arg_meta *meta,
-					int argno)
+					int arg, argno_t argno)
 {
 	const struct btf_type *reg_ref_t;
 	bool strict_type_match = false;
@@ -11519,7 +11383,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
 	 * btf_struct_ids_match() to walk the struct at the 0th offset, and
 	 * resolve types.
 	 */
-	if ((is_kfunc_release(meta) && reg->ref_obj_id) ||
+	if ((is_kfunc_release(meta) && reg_is_referenced(env, reg)) ||
 	    btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
 		strict_type_match = true;
 
@@ -11535,19 +11399,19 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
 	 */
 	taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname);
 	if (!taking_projection && !struct_same) {
-		verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
-			meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
+		verbose(env, "kernel function %s %s expected pointer to %s %s but %s has a pointer to %s %s\n",
+			meta->func_name, reg_arg_name(env, argno),
+			btf_type_str(ref_t), ref_tname, reg_arg_name(env, argno),
 			btf_type_str(reg_ref_t), reg_ref_tname);
 		return -EINVAL;
 	}
 	return 0;
 }
 
-static int process_irq_flag(struct bpf_verifier_env *env, int regno,
+static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno,
 			     struct bpf_kfunc_call_arg_meta *meta)
 {
-	struct bpf_reg_state *reg = reg_state(env, regno);
-	int err, kfunc_class = IRQ_NATIVE_KFUNC;
+	int err, spi, kfunc_class = IRQ_NATIVE_KFUNC;
 	bool irq_save;
 
 	if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] ||
@@ -11567,11 +11431,13 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno,
 
 	if (irq_save) {
 		if (!is_irq_flag_reg_valid_uninit(env, reg)) {
-			verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1);
+			verbose(env, "expected uninitialized irq flag as %s\n",
+				reg_arg_name(env, argno));
 			return -EINVAL;
 		}
 
-		err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false);
+		err = check_mem_access(env, env->insn_idx, reg, argno, 0, BPF_DW,
+				       BPF_WRITE, -1, false, false);
 		if (err)
 			return err;
 
@@ -11581,13 +11447,16 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno,
 	} else {
 		err = is_irq_flag_reg_valid_init(env, reg);
 		if (err) {
-			verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1);
+			verbose(env, "expected an initialized irq flag as %s\n",
+				reg_arg_name(env, argno));
 			return err;
 		}
 
-		err = mark_irq_flag_read(env, reg);
-		if (err)
-			return err;
+		spi = irq_flag_get_spi(env, reg);
+		if (spi < 0)
+			return spi;
+
+		mark_stack_slots_scratched(env, spi, 1);
 
 		err = unmark_stack_slot_irq_flag(env, reg, kfunc_class);
 		if (err)
@@ -11618,36 +11487,27 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
 	return 0;
 }
 
-static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
+/* Convert the reference tracked under @id from owning to non-owning: drop it
+ * with release_reference_nomark(), then clear the id of every register whose
+ * id equals @id and hand that register to ref_set_non_owning().
+ */
+static void ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 id)
 {
-	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_func_state *unused;
 	struct bpf_reg_state *reg;
-	int i;
-
-	if (!ref_obj_id) {
-		verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion");
-		return -EFAULT;
-	}
 
-	for (i = 0; i < state->acquired_refs; i++) {
-		if (state->refs[i].id != ref_obj_id)
-			continue;
+	/* id 0 would match every register without an id in the walk below,
+	 * so bail out on it, and equally when the release itself fails.
+	 */
+	if (WARN_ON_ONCE(!id || release_reference_nomark(env->cur_state, id)))
+		return;
 
-		/* Clear ref_obj_id here so release_reference doesn't clobber
-		 * the whole reg
-		 */
-		bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
-			if (reg->ref_obj_id == ref_obj_id) {
-				reg->ref_obj_id = 0;
-				ref_set_non_owning(env, reg);
-			}
-		}));
-		return 0;
-	}
-
-	verifier_bug(env, "ref state missing for ref_obj_id");
-	return -EFAULT;
+	bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+		if (reg->id == id) {
+			reg->id = 0;
+			ref_set_non_owning(env, reg);
+		}
+	}));
 }
 
 /* Implementation details:
@@ -11728,8 +11582,12 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)
 	return is_bpf_list_push_kfunc(btf_id) ||
 	       btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
 	       btf_id == special_kfunc_list[KF_bpf_list_pop_back] ||
+	       btf_id == special_kfunc_list[KF_bpf_list_del] ||
 	       btf_id == special_kfunc_list[KF_bpf_list_front] ||
-	       btf_id == special_kfunc_list[KF_bpf_list_back];
+	       btf_id == special_kfunc_list[KF_bpf_list_back] ||
+	       btf_id == special_kfunc_list[KF_bpf_list_is_first] ||
+	       btf_id == special_kfunc_list[KF_bpf_list_is_last] ||
+	       btf_id == special_kfunc_list[KF_bpf_list_empty];
 }
 
 static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
@@ -11850,7 +11708,10 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
 
 	switch (node_field_type) {
 	case BPF_LIST_NODE:
-		ret = is_bpf_list_push_kfunc(kfunc_btf_id);
+		ret = is_bpf_list_push_kfunc(kfunc_btf_id) ||
+		      kfunc_btf_id == special_kfunc_list[KF_bpf_list_del] ||
+		      kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_first] ||
+		      kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_last];
 		break;
 	case BPF_RB_NODE:
 		ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) ||
@@ -11872,7 +11733,7 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
 
 static int
 __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
-				   struct bpf_reg_state *reg, u32 regno,
+				   struct bpf_reg_state *reg, argno_t argno,
 				   struct bpf_kfunc_call_arg_meta *meta,
 				   enum btf_field_type head_field_type,
 				   struct btf_field **head_field)
@@ -11893,8 +11754,8 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
 	head_type_name = btf_field_type_name(head_field_type);
 	if (!tnum_is_const(reg->var_off)) {
 		verbose(env,
-			"R%d doesn't have constant offset. %s has to be at the constant offset\n",
-			regno, head_type_name);
+			"%s doesn't have constant offset. %s has to be at the constant offset\n",
+			reg_arg_name(env, argno), head_type_name);
 		return -EINVAL;
 	}
 
@@ -11922,24 +11783,34 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
 }
 
+/* Check a list-head kfunc argument by forwarding to
+ * __process_kf_arg_ptr_to_graph_root() with BPF_LIST_HEAD and
+ * &meta->arg_list_head.field. @argno names the argument in verifier log
+ * messages. Returns the helper's result unchanged.
+ */
 static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
-					   struct bpf_reg_state *reg, u32 regno,
+					   struct bpf_reg_state *reg, argno_t argno,
 					   struct bpf_kfunc_call_arg_meta *meta)
 {
-	return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD,
+	return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_LIST_HEAD,
 							  &meta->arg_list_head.field);
 }
 
+/* Check an rbtree-root kfunc argument by forwarding to
+ * __process_kf_arg_ptr_to_graph_root() with BPF_RB_ROOT and
+ * &meta->arg_rbtree_root.field. @argno names the argument in verifier log
+ * messages. Returns the helper's result unchanged.
+ */
 static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env,
-					     struct bpf_reg_state *reg, u32 regno,
+					     struct bpf_reg_state *reg, argno_t argno,
 					     struct bpf_kfunc_call_arg_meta *meta)
 {
-	return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT,
+	return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_RB_ROOT,
 							  &meta->arg_rbtree_root.field);
 }
 
 static int
 __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
-				   struct bpf_reg_state *reg, u32 regno,
+				   struct bpf_reg_state *reg, argno_t argno,
 				   struct bpf_kfunc_call_arg_meta *meta,
 				   enum btf_field_type head_field_type,
 				   enum btf_field_type node_field_type,
@@ -11961,8 +11822,8 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
 	node_type_name = btf_field_type_name(node_field_type);
 	if (!tnum_is_const(reg->var_off)) {
 		verbose(env,
-			"R%d doesn't have constant offset. %s has to be at the constant offset\n",
-			regno, node_type_name);
+			"%s doesn't have constant offset. %s has to be at the constant offset\n",
+			reg_arg_name(env, argno), node_type_name);
 		return -EINVAL;
 	}
 
@@ -12003,19 +11864,29 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
 }
 
+/* Check a list-node kfunc argument by forwarding to
+ * __process_kf_arg_ptr_to_graph_node() with BPF_LIST_HEAD, BPF_LIST_NODE and
+ * &meta->arg_list_head.field. @argno names the argument in verifier log
+ * messages. Returns the helper's result unchanged.
+ */
 static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
-					   struct bpf_reg_state *reg, u32 regno,
+					   struct bpf_reg_state *reg, argno_t argno,
 					   struct bpf_kfunc_call_arg_meta *meta)
 {
-	return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+	return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta,
 						  BPF_LIST_HEAD, BPF_LIST_NODE,
 						  &meta->arg_list_head.field);
 }
 
+/* Check an rbtree-node kfunc argument by forwarding to
+ * __process_kf_arg_ptr_to_graph_node() with BPF_RB_ROOT, BPF_RB_NODE and
+ * &meta->arg_rbtree_root.field. @argno names the argument in verifier log
+ * messages. Returns the helper's result unchanged.
+ */
 static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
-					     struct bpf_reg_state *reg, u32 regno,
+					     struct bpf_reg_state *reg, argno_t argno,
 					     struct bpf_kfunc_call_arg_meta *meta)
 {
-	return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+	return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta,
 						  BPF_RB_ROOT, BPF_RB_NODE,
 						  &meta->arg_rbtree_root.field);
 }
@@ -12046,6 +11907,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			    int insn_idx)
 {
 	const char *func_name = meta->func_name, *ref_tname;
+	struct bpf_func_state *caller = cur_func(env);
+	struct bpf_reg_state *regs = cur_regs(env);
 	const struct btf *btf = meta->btf;
 	const struct btf_param *args;
 	struct btf_record *rec;
@@ -12054,20 +11917,31 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 
 	args = (const struct btf_param *)(meta->func_proto + 1);
 	nargs = btf_type_vlen(meta->func_proto);
-	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+	if (nargs > MAX_BPF_FUNC_ARGS) {
 		verbose(env, "Function %s has %d > %d args\n", func_name, nargs,
-			MAX_BPF_FUNC_REG_ARGS);
+			MAX_BPF_FUNC_ARGS);
 		return -EINVAL;
 	}
+	if (nargs > MAX_BPF_FUNC_REG_ARGS && !bpf_jit_supports_stack_args()) {
+		verbose(env, "JIT does not support kfunc %s() with %d args\n",
+			func_name, nargs);
+		return -ENOTSUPP;
+	}
+
+	ret = check_outgoing_stack_args(env, caller, nargs);
+	if (ret)
+		return ret;
 
 	/* Check that BTF function arguments match actual types that the
 	 * verifier sees.
 	 */
 	for (i = 0; i < nargs; i++) {
-		struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
+		struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i);
 		const struct btf_type *t, *ref_t, *resolve_ret;
 		enum bpf_arg_type arg_type = ARG_DONTCARE;
-		u32 regno = i + 1, ref_id, type_size;
+		argno_t argno = argno_from_arg(i + 1);
+		int regno = reg_from_argno(argno);
+		u32 ref_id, type_size;
 		bool is_ret_buf_sz = false;
 		int kf_arg_type;
 
@@ -12077,6 +11951,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc");
 				return -EFAULT;
 			}
+			if (regno < 0) {
+				verbose(env, "%s prog->aux cannot be a stack argument\n",
+					reg_arg_name(env, argno));
+				return -EINVAL;
+			}
 			meta->arg_prog = true;
 			cur_aux(env)->arg_prog = regno;
 			continue;
@@ -12089,7 +11968,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 
 		if (btf_type_is_scalar(t)) {
 			if (reg->type != SCALAR_VALUE) {
-				verbose(env, "R%d is not a scalar\n", regno);
+				verbose(env, "%s is not a scalar\n", reg_arg_name(env, argno));
 				return -EINVAL;
 			}
 
@@ -12099,10 +11978,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 					return -EFAULT;
 				}
 				if (!tnum_is_const(reg->var_off)) {
-					verbose(env, "R%d must be a known constant\n", regno);
+					verbose(env, "%s must be a known constant\n",
+						reg_arg_name(env, argno));
 					return -EINVAL;
 				}
-				ret = mark_chain_precision(env, regno);
+				if (regno >= 0)
+					ret = mark_chain_precision(env, regno);
+				else
+					ret = mark_stack_arg_precision(env, i);
 				if (ret < 0)
 					return ret;
 				meta->arg_constant.found = true;
@@ -12121,12 +12004,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				}
 
 				if (!tnum_is_const(reg->var_off)) {
-					verbose(env, "R%d is not a const\n", regno);
+					verbose(env, "%s is not a const\n",
+						reg_arg_name(env, argno));
 					return -EINVAL;
 				}
 
 				meta->r0_size = reg->var_off.value;
-				ret = mark_chain_precision(env, regno);
+				if (regno >= 0)
+					ret = mark_chain_precision(env, regno);
+				else
+					ret = mark_stack_arg_precision(env, i);
 				if (ret)
 					return ret;
 			}
@@ -12134,32 +12021,33 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		}
 
 		if (!btf_type_is_ptr(t)) {
-			verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
+			verbose(env, "Unrecognized %s type %s\n",
+				reg_arg_name(env, argno), btf_type_str(t));
 			return -EINVAL;
 		}
 
 		if ((bpf_register_is_null(reg) || type_may_be_null(reg->type)) &&
 		    !is_kfunc_arg_nullable(meta->btf, &args[i])) {
-			verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+			verbose(env, "Possibly NULL pointer passed to trusted %s\n",
+				reg_arg_name(env, argno));
 			return -EACCES;
 		}
 
-		if (reg->ref_obj_id) {
-			if (is_kfunc_release(meta) && meta->ref_obj_id) {
-				verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u",
-					     regno, reg->ref_obj_id,
-					     meta->ref_obj_id);
-				return -EFAULT;
-			}
-			meta->ref_obj_id = reg->ref_obj_id;
-			if (is_kfunc_release(meta))
-				meta->release_regno = regno;
+		if (regno == meta->release_regno && !is_kfunc_arg_dynptr(meta->btf, &args[i]) &&
+		    !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) {
+			verbose(env, "release kfunc %s expects referenced PTR_TO_BTF_ID passed to %s\n",
+				func_name, reg_arg_name(env, argno));
+			return -EINVAL;
 		}
 
+		if (reg_is_referenced(env, reg))
+			update_ref_obj(&meta->ref_obj, reg);
+
 		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
 		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
 
-		kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
+		kf_arg_type = get_kfunc_ptr_arg_type(env, caller, regs, meta, t, ref_t, ref_tname,
+						     args, i, nargs, argno, reg);
 		if (kf_arg_type < 0)
 			return kf_arg_type;
 
@@ -12168,7 +12056,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			continue;
 		case KF_ARG_PTR_TO_MAP:
 			if (!reg->map_ptr) {
-				verbose(env, "pointer in R%d isn't map pointer\n", regno);
+				verbose(env, "pointer in %s isn't map pointer\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
 			if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 ||
@@ -12204,18 +12093,19 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			fallthrough;
 		case KF_ARG_PTR_TO_ALLOC_BTF_ID:
 		case KF_ARG_PTR_TO_BTF_ID:
-			if (!is_trusted_reg(reg)) {
+			if (!is_trusted_reg(env, reg)) {
 				if (!is_kfunc_rcu(meta)) {
-					verbose(env, "R%d must be referenced or trusted\n", regno);
+					verbose(env, "%s must be referenced or trusted\n",
+						reg_arg_name(env, argno));
 					return -EINVAL;
 				}
 				if (!is_rcu_reg(reg)) {
-					verbose(env, "R%d must be a rcu pointer\n", regno);
+					verbose(env, "%s must be a rcu pointer\n",
+						reg_arg_name(env, argno));
 					return -EINVAL;
 				}
 			}
 			fallthrough;
-		case KF_ARG_PTR_TO_DYNPTR:
 		case KF_ARG_PTR_TO_ITER:
 		case KF_ARG_PTR_TO_LIST_HEAD:
 		case KF_ARG_PTR_TO_LIST_NODE:
@@ -12232,6 +12122,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		case KF_ARG_PTR_TO_IRQ_FLAG:
 		case KF_ARG_PTR_TO_RES_SPIN_LOCK:
 			break;
+		case KF_ARG_PTR_TO_DYNPTR:
+			arg_type = ARG_PTR_TO_DYNPTR;
+			break;
 		case KF_ARG_PTR_TO_CTX:
 			arg_type = ARG_PTR_TO_CTX;
 			break;
@@ -12240,17 +12133,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			return -EFAULT;
 		}
 
-		if (is_kfunc_release(meta) && reg->ref_obj_id)
+		if (regno == meta->release_regno)
 			arg_type |= OBJ_RELEASE;
-		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+		ret = check_func_arg_reg_off(env, reg, argno, arg_type);
 		if (ret < 0)
 			return ret;
 
 		switch (kf_arg_type) {
 		case KF_ARG_PTR_TO_CTX:
 			if (reg->type != PTR_TO_CTX) {
-				verbose(env, "arg#%d expected pointer to ctx, but got %s\n",
-					i, reg_type_str(env, reg->type));
+				verbose(env, "%s expected pointer to ctx, but got %s\n",
+					reg_arg_name(env, argno), reg_type_str(env, reg->type));
 				return -EINVAL;
 			}
 
@@ -12264,19 +12157,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		case KF_ARG_PTR_TO_ALLOC_BTF_ID:
 			if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
 				if (!is_bpf_obj_drop_kfunc(meta->func_id)) {
-					verbose(env, "arg#%d expected for bpf_obj_drop()\n", i);
+					verbose(env, "%s expected for bpf_obj_drop()\n",
+						reg_arg_name(env, argno));
 					return -EINVAL;
 				}
 			} else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
 				if (!is_bpf_percpu_obj_drop_kfunc(meta->func_id)) {
-					verbose(env, "arg#%d expected for bpf_percpu_obj_drop()\n", i);
+					verbose(env, "%s expected for bpf_percpu_obj_drop()\n",
+						reg_arg_name(env, argno));
 					return -EINVAL;
 				}
 			} else {
-				verbose(env, "arg#%d expected pointer to allocated object\n", i);
+				verbose(env, "%s expected pointer to allocated object\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
-			if (!reg->ref_obj_id) {
+			if (!reg_is_referenced(env, reg)) {
 				verbose(env, "allocated object must be referenced\n");
 				return -EINVAL;
 			}
@@ -12288,10 +12184,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		case KF_ARG_PTR_TO_DYNPTR:
 		{
 			enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
-			int clone_ref_obj_id = 0;
-
-			if (reg->type == CONST_PTR_TO_DYNPTR)
-				dynptr_arg_type |= MEM_RDONLY;
 
 			if (is_kfunc_arg_uninit(btf, &args[i]))
 				dynptr_arg_type |= MEM_UNINIT;
@@ -12305,11 +12197,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
 				dynptr_arg_type |= DYNPTR_TYPE_FILE;
 			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) {
-				dynptr_arg_type |= DYNPTR_TYPE_FILE;
-				meta->release_regno = regno;
+				dynptr_arg_type |= DYNPTR_TYPE_FILE | OBJ_RELEASE;
 			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
 				   (dynptr_arg_type & MEM_UNINIT)) {
-				enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
+				enum bpf_dynptr_type parent_type = meta->dynptr.type;
 
 				if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
 					verifier_bug(env, "no dynptr type for parent of clone");
@@ -12317,29 +12208,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				}
 
 				dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type);
-				clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id;
-				if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
-					verifier_bug(env, "missing ref obj id for parent of clone");
-					return -EFAULT;
-				}
 			}
 
-			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
+			ret = process_dynptr_func(env, reg, argno, insn_idx, dynptr_arg_type,
+						  &meta->ref_obj, &meta->dynptr);
 			if (ret < 0)
 				return ret;
-
-			if (!(dynptr_arg_type & MEM_UNINIT)) {
-				int id = dynptr_id(env, reg);
-
-				if (id < 0) {
-					verifier_bug(env, "failed to obtain dynptr id");
-					return id;
-				}
-				meta->initialized_dynptr.id = id;
-				meta->initialized_dynptr.type = dynptr_get_type(env, reg);
-				meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg);
-			}
-
 			break;
 		}
 		case KF_ARG_PTR_TO_ITER:
@@ -12349,63 +12223,78 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 					return -EINVAL;
 				}
 			}
-			ret = process_iter_arg(env, regno, insn_idx, meta);
+			ret = process_iter_arg(env, reg, argno, insn_idx, meta);
 			if (ret < 0)
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_LIST_HEAD:
 			if (reg->type != PTR_TO_MAP_VALUE &&
 			    reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
-				verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+				verbose(env, "%s expected pointer to map value or allocated object\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
-			if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+			if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) &&
+			    !reg_is_referenced(env, reg)) {
 				verbose(env, "allocated object must be referenced\n");
 				return -EINVAL;
 			}
-			ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta);
+			ret = process_kf_arg_ptr_to_list_head(env, reg, argno, meta);
 			if (ret < 0)
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_RB_ROOT:
 			if (reg->type != PTR_TO_MAP_VALUE &&
 			    reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
-				verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+				verbose(env, "%s expected pointer to map value or allocated object\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
-			if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+			if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) &&
+			    !reg_is_referenced(env, reg)) {
 				verbose(env, "allocated object must be referenced\n");
 				return -EINVAL;
 			}
-			ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta);
+			ret = process_kf_arg_ptr_to_rbtree_root(env, reg, argno, meta);
 			if (ret < 0)
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_LIST_NODE:
+			if (is_kfunc_arg_nonown_allowed(btf, &args[i]) &&
+			    type_is_non_owning_ref(reg->type) && !reg_is_referenced(env, reg)) {
+				/* Allow bpf_list_front/back return value for
+				 * __nonown_allowed list-node arguments.
+				 */
+				goto check_ok;
+			}
 			if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
-				verbose(env, "arg#%d expected pointer to allocated object\n", i);
+				verbose(env, "%s expected pointer to allocated object\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
-			if (!reg->ref_obj_id) {
+			if (!reg_is_referenced(env, reg)) {
 				verbose(env, "allocated object must be referenced\n");
 				return -EINVAL;
 			}
-			ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta);
+check_ok:
+			ret = process_kf_arg_ptr_to_list_node(env, reg, argno, meta);
 			if (ret < 0)
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_RB_NODE:
 			if (is_bpf_rbtree_add_kfunc(meta->func_id)) {
 				if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
-					verbose(env, "arg#%d expected pointer to allocated object\n", i);
+					verbose(env, "%s expected pointer to allocated object\n",
+						reg_arg_name(env, argno));
 					return -EINVAL;
 				}
-				if (!reg->ref_obj_id) {
+				if (!reg_is_referenced(env, reg)) {
 					verbose(env, "allocated object must be referenced\n");
 					return -EINVAL;
 				}
 			} else {
-				if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) {
+				if (!type_is_non_owning_ref(reg->type) &&
+				    !reg_is_referenced(env, reg)) {
 					verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name);
 					return -EINVAL;
 				}
@@ -12415,7 +12304,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				}
 			}
 
-			ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
+			ret = process_kf_arg_ptr_to_rbtree_node(env, reg, argno, meta);
 			if (ret < 0)
 				return ret;
 			break;
@@ -12430,38 +12319,44 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			if ((base_type(reg->type) != PTR_TO_BTF_ID ||
 			     (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
 			    !reg2btf_ids[base_type(reg->type)]) {
-				verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
+				verbose(env, "%s is %s ", reg_arg_name(env, argno),
+					reg_type_str(env, reg->type));
 				verbose(env, "expected %s or socket\n",
 					reg_type_str(env, base_type(reg->type) |
 							  (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
 				return -EINVAL;
 			}
-			ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+			ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i, argno);
 			if (ret < 0)
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_MEM:
 			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
 			if (IS_ERR(resolve_ret)) {
-				verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
-					i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret));
+				verbose(env, "%s reference type('%s %s') size cannot be determined: %ld\n",
+					reg_arg_name(env, argno), btf_type_str(ref_t),
+					ref_tname, PTR_ERR(resolve_ret));
 				return -EINVAL;
 			}
-			ret = check_mem_reg(env, reg, regno, type_size);
+			ret = check_mem_reg(env, reg, argno, type_size);
 			if (ret < 0)
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_MEM_SIZE:
 		{
-			struct bpf_reg_state *buff_reg = &regs[regno];
+			struct bpf_reg_state *buff_reg = reg;
 			const struct btf_param *buff_arg = &args[i];
-			struct bpf_reg_state *size_reg = &regs[regno + 1];
+			struct bpf_reg_state *size_reg = get_func_arg_reg(caller, regs, i + 1);
 			const struct btf_param *size_arg = &args[i + 1];
+			argno_t next_argno = argno_from_arg(i + 2);
 
 			if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) {
-				ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
+				ret = check_kfunc_mem_size_reg(env, buff_reg, size_reg,
+							       argno, next_argno);
 				if (ret < 0) {
-					verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+					verbose(env, "%s and ", reg_arg_name(env, argno));
+					verbose(env, "%s memory, len pair leads to invalid memory access\n",
+						reg_arg_name(env, next_argno));
 					return ret;
 				}
 			}
@@ -12472,7 +12367,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 					return -EFAULT;
 				}
 				if (!tnum_is_const(size_reg->var_off)) {
-					verbose(env, "R%d must be a known constant\n", regno + 1);
+					verbose(env, "%s must be a known constant\n",
+						reg_arg_name(env, next_argno));
 					return -EINVAL;
 				}
 				meta->arg_constant.found = true;
@@ -12485,14 +12381,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		}
 		case KF_ARG_PTR_TO_CALLBACK:
 			if (reg->type != PTR_TO_FUNC) {
-				verbose(env, "arg%d expected pointer to func\n", i);
+				verbose(env, "%s expected pointer to func\n", reg_arg_name(env, argno));
 				return -EINVAL;
 			}
 			meta->subprogno = reg->subprogno;
 			break;
 		case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
 			if (!type_is_ptr_alloc_obj(reg->type)) {
-				verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
+				verbose(env, "%s is neither owning or non-owning ref\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
 			if (!type_is_non_owning_ref(reg->type))
@@ -12505,7 +12402,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			}
 
 			if (rec->refcount_off < 0) {
-				verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
+				verbose(env, "%s doesn't point to a type with bpf_refcount field\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
 
@@ -12514,46 +12412,51 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			break;
 		case KF_ARG_PTR_TO_CONST_STR:
 			if (reg->type != PTR_TO_MAP_VALUE) {
-				verbose(env, "arg#%d doesn't point to a const string\n", i);
+				verbose(env, "%s doesn't point to a const string\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
-			ret = check_reg_const_str(env, reg, regno);
+			ret = check_arg_const_str(env, reg, argno);
 			if (ret)
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_WORKQUEUE:
 			if (reg->type != PTR_TO_MAP_VALUE) {
-				verbose(env, "arg#%d doesn't point to a map value\n", i);
+				verbose(env, "%s doesn't point to a map value\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
-			ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map);
+			ret = check_map_field_pointer(env, reg, argno, BPF_WORKQUEUE, &meta->map);
 			if (ret < 0)
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_TIMER:
 			if (reg->type != PTR_TO_MAP_VALUE) {
-				verbose(env, "arg#%d doesn't point to a map value\n", i);
+				verbose(env, "%s doesn't point to a map value\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
-			ret = process_timer_kfunc(env, regno, meta);
+			ret = process_timer_kfunc(env, reg, argno, meta);
 			if (ret < 0)
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_TASK_WORK:
 			if (reg->type != PTR_TO_MAP_VALUE) {
-				verbose(env, "arg#%d doesn't point to a map value\n", i);
+				verbose(env, "%s doesn't point to a map value\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
-			ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map);
+			ret = check_map_field_pointer(env, reg, argno, BPF_TASK_WORK, &meta->map);
 			if (ret < 0)
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_IRQ_FLAG:
 			if (reg->type != PTR_TO_STACK) {
-				verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i);
+				verbose(env, "%s doesn't point to an irq flag on stack\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
-			ret = process_irq_flag(env, regno, meta);
+			ret = process_irq_flag(env, reg, argno, meta);
 			if (ret < 0)
 				return ret;
 			break;
@@ -12562,7 +12465,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			int flags = PROCESS_RES_LOCK;
 
 			if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
-				verbose(env, "arg#%d doesn't point to map value or allocated object\n", i);
+				verbose(env, "%s doesn't point to map value or allocated object\n",
+					reg_arg_name(env, argno));
 				return -EINVAL;
 			}
 
@@ -12574,7 +12478,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] ||
 			    meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
 				flags |= PROCESS_LOCK_IRQ;
-			ret = process_spin_lock(env, regno, flags);
+			ret = process_spin_lock(env, reg, argno, flags);
 			if (ret < 0)
 				return ret;
 			break;
@@ -12582,12 +12486,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		}
 	}
 
-	if (is_kfunc_release(meta) && !meta->release_regno) {
-		verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
-			func_name);
-		return -EINVAL;
-	}
-
 	return 0;
 }
 
@@ -12614,6 +12512,10 @@ int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env,
 
 	meta->kfunc_flags = *kfunc.flags;
 
+	/* Release kfuncs only support a referenced argument passed in a register */
+	if (is_kfunc_release(meta))
+		meta->release_regno = BPF_REG_1;
+
 	return 0;
 }
 
@@ -12943,7 +12845,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
 		}
 	} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
 		   meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
-		enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type);
+		enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->dynptr.type);
 
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 
@@ -12967,16 +12869,11 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca
 			}
 		}
 
-		if (!meta->initialized_dynptr.id) {
+		if (!meta->dynptr.id) {
 			verifier_bug(env, "no dynptr id");
 			return -EFAULT;
 		}
-		regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id;
-
-		/* we don't need to set BPF_REG_0's ref obj id
-		 * because packet slices are not refcounted (see
-		 * dynptr_type_refcounted)
-		 */
+		regs[BPF_REG_0].parent_id = meta->dynptr.id;
 	} else {
 		return 0;
 	}
@@ -12990,7 +12887,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			    int *insn_idx_p)
 {
 	bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable;
-	u32 i, nargs, ptr_type_id, release_ref_obj_id;
+	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	struct bpf_reg_state *regs = cur_regs(env);
 	const char *func_name, *ptr_type_name;
 	const struct btf_type *t, *ptr_type;
@@ -12998,7 +12895,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	struct bpf_insn_aux_data *insn_aux;
 	int err, insn_idx = *insn_idx_p;
 	const struct btf_param *args;
+	u32 i, nargs, ptr_type_id;
 	struct btf *desc_btf;
+	int id;
 
 	/* skip for now, but return error when we find this in fixup_kfunc_call */
 	if (!insn->imm)
@@ -13065,6 +12964,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	if (err < 0)
 		return err;
 
+	if ((is_bpf_obj_drop_kfunc(meta.func_id) ||
+	     is_bpf_percpu_obj_drop_kfunc(meta.func_id)) && (is_tracing_prog_type(prog_type) ||
+	     /* is_tracing_prog_type() for now doesn't cover non-iterator tracing progs. */
+	     (prog_type == BPF_PROG_TYPE_TRACING && env->prog->expected_attach_type != BPF_TRACE_ITER
+	      && !env->prog->sleepable))) {
+		struct btf_struct_meta *struct_meta;
+
+		struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
+		if (struct_meta && btf_record_has_nmi_unsafe_fields(struct_meta->record)) {
+			verbose(env, "%s cannot be used in tracing programs on types with NMI unsafe fields\n",
+				func_name);
+			return -EINVAL;
+		}
+	}
+
 	if (is_bpf_rbtree_add_kfunc(meta.func_id)) {
 		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
 					 set_rbtree_add_callback_state);
@@ -13109,22 +13023,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	if (rcu_lock) {
 		env->cur_state->active_rcu_locks++;
 	} else if (rcu_unlock) {
-		struct bpf_func_state *state;
-		struct bpf_reg_state *reg;
-		u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
-
 		if (env->cur_state->active_rcu_locks == 0) {
 			verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
 			return -EINVAL;
 		}
-		if (--env->cur_state->active_rcu_locks == 0) {
-			bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
-				if (reg->type & MEM_RCU) {
-					reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
-					reg->type |= PTR_UNTRUSTED;
-				}
-			}));
-		}
+		if (--env->cur_state->active_rcu_locks == 0)
+			invalidate_rcu_protected_refs(env);
 	} else if (preempt_disable) {
 		env->cur_state->active_preempt_locks++;
 	} else if (preempt_enable) {
@@ -13155,37 +13059,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	 * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
 	 */
 	if (meta.release_regno) {
-		struct bpf_reg_state *reg = &regs[meta.release_regno];
-
-		if (meta.initialized_dynptr.ref_obj_id) {
-			err = unmark_stack_slots_dynptr(env, reg);
-		} else {
-			err = release_reference(env, reg->ref_obj_id);
-			if (err)
-				verbose(env, "kfunc %s#%d reference has not been acquired before\n",
-					func_name, meta.func_id);
-		}
+		err = release_reg(env, &regs[meta.release_regno], false, !!meta.dynptr.id);
 		if (err)
 			return err;
 	}
 
 	if (is_bpf_list_push_kfunc(meta.func_id) || is_bpf_rbtree_add_kfunc(meta.func_id)) {
-		release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+		id = regs[BPF_REG_2].id;
 		insn_aux->insert_off = regs[BPF_REG_2].var_off.value;
 		insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
-		err = ref_convert_owning_non_owning(env, release_ref_obj_id);
-		if (err) {
-			verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
-				func_name, meta.func_id);
-			return err;
-		}
-
-		err = release_reference(env, release_ref_obj_id);
-		if (err) {
-			verbose(env, "kfunc %s#%d reference has not been acquired before\n",
-				func_name, meta.func_id);
-			return err;
-		}
+		ref_convert_owning_non_owning(env, id);
 	}
 
 	if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
@@ -13212,6 +13095,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		bpf_mark_reg_not_init(env, &regs[regno]);
 		regs[regno].subreg_def = DEF_NOT_SUBREG;
 	}
+	invalidate_outgoing_stack_args(env, cur_func(env));
 
 	/* Check return type */
 	t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
@@ -13269,8 +13153,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				regs[BPF_REG_0].type |= MEM_RDONLY;
 
 			/* Ensures we don't access the memory after a release_reference() */
-			if (meta.ref_obj_id)
-				regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+			if (meta.ref_obj.id) {
+				err = validate_ref_obj(env, &meta.ref_obj);
+				if (err)
+					return err;
+				regs[BPF_REG_0].parent_id = meta.ref_obj.id;
+			}
 
 			if (is_kfunc_rcu_protected(&meta))
 				regs[BPF_REG_0].type |= MEM_RCU;
@@ -13316,13 +13204,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 		mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
 		if (is_kfunc_acquire(&meta)) {
-			int id = acquire_reference(env, insn_idx);
-
+			id = acquire_reference(env, insn_idx, 0);
 			if (id < 0)
 				return id;
-			if (is_kfunc_ret_null(&meta))
-				regs[BPF_REG_0].id = id;
-			regs[BPF_REG_0].ref_obj_id = id;
+			regs[BPF_REG_0].id = id;
 		} else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) {
 			ref_set_non_owning(env, &regs[BPF_REG_0]);
 		}
@@ -13344,8 +13229,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		clear_all_pkt_pointers(env);
 
 	nargs = btf_type_vlen(meta.func_proto);
+	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+		struct bpf_func_state *caller = cur_func(env);
+		struct bpf_subprog_info *caller_info = &env->subprog_info[caller->subprogno];
+		u16 out_stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS;
+		u16 stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + out_stack_arg_cnt;
+
+		if (stack_arg_cnt > caller_info->stack_arg_cnt)
+			caller_info->stack_arg_cnt = stack_arg_cnt;
+	}
+
 	args = (const struct btf_param *)(meta.func_proto + 1);
-	for (i = 0; i < nargs; i++) {
+	for (i = 0; i < min_t(u32, nargs, MAX_BPF_FUNC_REG_ARGS); i++) {
 		u32 regno = i + 1;
 
 		t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL);
@@ -13377,7 +13272,7 @@ static bool check_reg_sane_offset_scalar(struct bpf_verifier_env *env,
 {
 	bool known = tnum_is_const(reg->var_off);
 	s64 val = reg->var_off.value;
-	s64 smin = reg->smin_value;
+	s64 smin = reg_smin(reg);
 
 	if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
 		verbose(env, "math between %s pointer and %lld is not allowed\n",
@@ -13406,7 +13301,7 @@ static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env,
 {
 	bool known = tnum_is_const(reg->var_off);
 	s64 val = reg->var_off.value;
-	s64 smin = reg->smin_value;
+	s64 smin = reg_smin(reg);
 
 	if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
 		verbose(env, "%s pointer offset %lld is not allowed\n",
@@ -13448,7 +13343,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
 		break;
 	case PTR_TO_MAP_VALUE:
 		max = ptr_reg->map_ptr->value_size;
-		ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value;
+		ptr_limit = mask_to_left ? reg_smin(ptr_reg) : reg_umax(ptr_reg);
 		break;
 	default:
 		return REASON_TYPE;
@@ -13537,7 +13432,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
 	struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux;
 	struct bpf_verifier_state *vstate = env->cur_state;
 	bool off_is_imm = tnum_is_const(off_reg->var_off);
-	bool off_is_neg = off_reg->smin_value < 0;
+	bool off_is_neg = reg_smin(off_reg) < 0;
 	bool ptr_is_dst_reg = ptr_reg == dst_reg;
 	u8 opcode = BPF_OP(insn->code);
 	u32 alu_state, alu_limit;
@@ -13556,7 +13451,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
 
 	if (!commit_window) {
 		if (!tnum_is_const(off_reg->var_off) &&
-		    (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
+		    (reg_smin(off_reg) < 0) != (reg_smax(off_reg) < 0))
 			return REASON_BOUNDS;
 
 		info->mask_to_left = (opcode == BPF_ADD &&  off_is_neg) ||
@@ -13612,7 +13507,7 @@ do_sim:
 	 */
 	if (!ptr_is_dst_reg) {
 		tmp = *dst_reg;
-		copy_register_state(dst_reg, ptr_reg);
+		*dst_reg = *ptr_reg;
 	}
 	err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx);
 	if (err < 0)
@@ -13706,7 +13601,7 @@ static int check_stack_access_for_ptr_arithmetic(
 
 static int sanitize_check_bounds(struct bpf_verifier_env *env,
 				 const struct bpf_insn *insn,
-				 const struct bpf_reg_state *dst_reg)
+				 struct bpf_reg_state *dst_reg)
 {
 	u32 dst = insn->dst_reg;
 
@@ -13723,7 +13618,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
 			return -EACCES;
 		break;
 	case PTR_TO_MAP_VALUE:
-		if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) {
+		if (check_map_access(env, dst_reg, argno_from_reg(dst), 0, 1, false, ACCESS_HELPER)) {
 			verbose(env, "R%d pointer arithmetic of map value goes out of range, "
 				"prohibited for !root\n", dst);
 			return -EACCES;
@@ -13750,10 +13645,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_reg_state *regs = state->regs, *dst_reg;
 	bool known = tnum_is_const(off_reg->var_off);
-	s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
-	    smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
-	u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
-	    umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
+	s64 smin_val = reg_smin(off_reg), smax_val = reg_smax(off_reg);
+	u64 umin_val = reg_umin(off_reg), umax_val = reg_umax(off_reg);
 	struct bpf_sanitize_info info = {};
 	u8 opcode = BPF_OP(insn->code);
 	u32 dst = insn->dst_reg;
@@ -13855,16 +13748,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		 * added into the variable offset, and we copy the fixed offset
 		 * from ptr_reg.
 		 */
-		if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) ||
-		    check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) {
-			dst_reg->smin_value = S64_MIN;
-			dst_reg->smax_value = S64_MAX;
-		}
-		if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) ||
-		    check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) {
-			dst_reg->umin_value = 0;
-			dst_reg->umax_value = U64_MAX;
-		}
+		dst_reg->r64 = cnum64_add(ptr_reg->r64, off_reg->r64);
 		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->raw = ptr_reg->raw;
 		if (reg_is_pkt_pointer(ptr_reg)) {
@@ -13896,24 +13780,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 				dst);
 			return -EACCES;
 		}
-		/* A new variable offset is created.  If the subtrahend is known
-		 * nonnegative, then any reg->range we had before is still good.
-		 */
-		if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) ||
-		    check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) {
-			/* Overflow possible, we know nothing */
-			dst_reg->smin_value = S64_MIN;
-			dst_reg->smax_value = S64_MAX;
-		}
-		if (umin_ptr < umax_val) {
-			/* Overflow possible, we know nothing */
-			dst_reg->umin_value = 0;
-			dst_reg->umax_value = U64_MAX;
-		} else {
-			/* Cannot overflow (as long as bounds are consistent) */
-			dst_reg->umin_value = umin_ptr - umax_val;
-			dst_reg->umax_value = umax_ptr - umin_val;
-		}
+		dst_reg->r64 = cnum64_add(ptr_reg->r64, cnum64_negate(off_reg->r64));
 		dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->raw = ptr_reg->raw;
 		if (reg_is_pkt_pointer(ptr_reg)) {
@@ -13970,227 +13837,123 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
 				 struct bpf_reg_state *src_reg)
 {
-	s32 *dst_smin = &dst_reg->s32_min_value;
-	s32 *dst_smax = &dst_reg->s32_max_value;
-	u32 *dst_umin = &dst_reg->u32_min_value;
-	u32 *dst_umax = &dst_reg->u32_max_value;
-	u32 umin_val = src_reg->u32_min_value;
-	u32 umax_val = src_reg->u32_max_value;
-	bool min_overflow, max_overflow;
-
-	if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) ||
-	    check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) {
-		*dst_smin = S32_MIN;
-		*dst_smax = S32_MAX;
-	}
-
-	/* If either all additions overflow or no additions overflow, then
-	 * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax =
-	 * dst_umax + src_umax. Otherwise (some additions overflow), set
-	 * the output bounds to unbounded.
-	 */
-	min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
-	max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
-
-	if (!min_overflow && max_overflow) {
-		*dst_umin = 0;
-		*dst_umax = U32_MAX;
-	}
+	dst_reg->r32 = cnum32_add(dst_reg->r32, src_reg->r32);
 }
 
 static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
 			       struct bpf_reg_state *src_reg)
 {
-	s64 *dst_smin = &dst_reg->smin_value;
-	s64 *dst_smax = &dst_reg->smax_value;
-	u64 *dst_umin = &dst_reg->umin_value;
-	u64 *dst_umax = &dst_reg->umax_value;
-	u64 umin_val = src_reg->umin_value;
-	u64 umax_val = src_reg->umax_value;
-	bool min_overflow, max_overflow;
-
-	if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) ||
-	    check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) {
-		*dst_smin = S64_MIN;
-		*dst_smax = S64_MAX;
-	}
-
-	/* If either all additions overflow or no additions overflow, then
-	 * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax =
-	 * dst_umax + src_umax. Otherwise (some additions overflow), set
-	 * the output bounds to unbounded.
-	 */
-	min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
-	max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
-
-	if (!min_overflow && max_overflow) {
-		*dst_umin = 0;
-		*dst_umax = U64_MAX;
-	}
+	dst_reg->r64 = cnum64_add(dst_reg->r64, src_reg->r64);
 }
 
 static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
 				 struct bpf_reg_state *src_reg)
 {
-	s32 *dst_smin = &dst_reg->s32_min_value;
-	s32 *dst_smax = &dst_reg->s32_max_value;
-	u32 *dst_umin = &dst_reg->u32_min_value;
-	u32 *dst_umax = &dst_reg->u32_max_value;
-	u32 umin_val = src_reg->u32_min_value;
-	u32 umax_val = src_reg->u32_max_value;
-	bool min_underflow, max_underflow;
-
-	if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) ||
-	    check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) {
-		/* Overflow possible, we know nothing */
-		*dst_smin = S32_MIN;
-		*dst_smax = S32_MAX;
-	}
-
-	/* If either all subtractions underflow or no subtractions
-	 * underflow, it is okay to set: dst_umin = dst_umin - src_umax,
-	 * dst_umax = dst_umax - src_umin. Otherwise (some subtractions
-	 * underflow), set the output bounds to unbounded.
-	 */
-	min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
-	max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
-
-	if (min_underflow && !max_underflow) {
-		*dst_umin = 0;
-		*dst_umax = U32_MAX;
-	}
+	dst_reg->r32 = cnum32_add(dst_reg->r32, cnum32_negate(src_reg->r32));
 }
 
 static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
 			       struct bpf_reg_state *src_reg)
 {
-	s64 *dst_smin = &dst_reg->smin_value;
-	s64 *dst_smax = &dst_reg->smax_value;
-	u64 *dst_umin = &dst_reg->umin_value;
-	u64 *dst_umax = &dst_reg->umax_value;
-	u64 umin_val = src_reg->umin_value;
-	u64 umax_val = src_reg->umax_value;
-	bool min_underflow, max_underflow;
-
-	if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) ||
-	    check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) {
-		/* Overflow possible, we know nothing */
-		*dst_smin = S64_MIN;
-		*dst_smax = S64_MAX;
-	}
-
-	/* If either all subtractions underflow or no subtractions
-	 * underflow, it is okay to set: dst_umin = dst_umin - src_umax,
-	 * dst_umax = dst_umax - src_umin. Otherwise (some subtractions
-	 * underflow), set the output bounds to unbounded.
-	 */
-	min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
-	max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
-
-	if (min_underflow && !max_underflow) {
-		*dst_umin = 0;
-		*dst_umax = U64_MAX;
-	}
+	dst_reg->r64 = cnum64_add(dst_reg->r64, cnum64_negate(src_reg->r64));
 }
 
 static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg,
 				 struct bpf_reg_state *src_reg)
 {
-	s32 *dst_smin = &dst_reg->s32_min_value;
-	s32 *dst_smax = &dst_reg->s32_max_value;
-	u32 *dst_umin = &dst_reg->u32_min_value;
-	u32 *dst_umax = &dst_reg->u32_max_value;
+	s32 smin = reg_s32_min(dst_reg);
+	s32 smax = reg_s32_max(dst_reg);
+	u32 umin = reg_u32_min(dst_reg);
+	u32 umax = reg_u32_max(dst_reg);
 	s32 tmp_prod[4];
 
-	if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) ||
-	    check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) {
+	if (check_mul_overflow(umax, reg_u32_max(src_reg), &umax) ||
+	    check_mul_overflow(umin, reg_u32_min(src_reg), &umin)) {
 		/* Overflow possible, we know nothing */
-		*dst_umin = 0;
-		*dst_umax = U32_MAX;
+		umin = 0;
+		umax = U32_MAX;
 	}
-	if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) ||
-	    check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) ||
-	    check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) ||
-	    check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) {
+	if (check_mul_overflow(smin, reg_s32_min(src_reg), &tmp_prod[0]) ||
+	    check_mul_overflow(smin, reg_s32_max(src_reg), &tmp_prod[1]) ||
+	    check_mul_overflow(smax, reg_s32_min(src_reg), &tmp_prod[2]) ||
+	    check_mul_overflow(smax, reg_s32_max(src_reg), &tmp_prod[3])) {
 		/* Overflow possible, we know nothing */
-		*dst_smin = S32_MIN;
-		*dst_smax = S32_MAX;
+		smin = S32_MIN;
+		smax = S32_MAX;
 	} else {
-		*dst_smin = min_array(tmp_prod, 4);
-		*dst_smax = max_array(tmp_prod, 4);
+		smin = min_array(tmp_prod, 4);
+		smax = max_array(tmp_prod, 4);
 	}
+
+	dst_reg->r32 = cnum32_intersect(cnum32_from_urange(umin, umax),
+					cnum32_from_srange(smin, smax));
 }
 
 static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
 			       struct bpf_reg_state *src_reg)
 {
-	s64 *dst_smin = &dst_reg->smin_value;
-	s64 *dst_smax = &dst_reg->smax_value;
-	u64 *dst_umin = &dst_reg->umin_value;
-	u64 *dst_umax = &dst_reg->umax_value;
+	s64 smin = reg_smin(dst_reg);
+	s64 smax = reg_smax(dst_reg);
+	u64 umin = reg_umin(dst_reg);
+	u64 umax = reg_umax(dst_reg);
 	s64 tmp_prod[4];
 
-	if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) ||
-	    check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) {
+	if (check_mul_overflow(umax, reg_umax(src_reg), &umax) ||
+	    check_mul_overflow(umin, reg_umin(src_reg), &umin)) {
 		/* Overflow possible, we know nothing */
-		*dst_umin = 0;
-		*dst_umax = U64_MAX;
+		umin = 0;
+		umax = U64_MAX;
 	}
-	if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) ||
-	    check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) ||
-	    check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) ||
-	    check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) {
+	if (check_mul_overflow(smin, reg_smin(src_reg), &tmp_prod[0]) ||
+	    check_mul_overflow(smin, reg_smax(src_reg), &tmp_prod[1]) ||
+	    check_mul_overflow(smax, reg_smin(src_reg), &tmp_prod[2]) ||
+	    check_mul_overflow(smax, reg_smax(src_reg), &tmp_prod[3])) {
 		/* Overflow possible, we know nothing */
-		*dst_smin = S64_MIN;
-		*dst_smax = S64_MAX;
+		smin = S64_MIN;
+		smax = S64_MAX;
 	} else {
-		*dst_smin = min_array(tmp_prod, 4);
-		*dst_smax = max_array(tmp_prod, 4);
+		smin = min_array(tmp_prod, 4);
+		smax = max_array(tmp_prod, 4);
 	}
+
+	dst_reg->r64 = cnum64_intersect(cnum64_from_urange(umin, umax),
+					cnum64_from_srange(smin, smax));
 }
 
 static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg,
 				  struct bpf_reg_state *src_reg)
 {
-	u32 *dst_umin = &dst_reg->u32_min_value;
-	u32 *dst_umax = &dst_reg->u32_max_value;
-	u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */
+	u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */
 
-	*dst_umin = *dst_umin / src_val;
-	*dst_umax = *dst_umax / src_val;
+	reg_set_urange32(dst_reg, reg_u32_min(dst_reg) / src_val,
+			 reg_u32_max(dst_reg) / src_val);
 
 	/* Reset other ranges/tnum to unbounded/unknown. */
-	dst_reg->s32_min_value = S32_MIN;
-	dst_reg->s32_max_value = S32_MAX;
 	reset_reg64_and_tnum(dst_reg);
 }
 
 static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg,
 				struct bpf_reg_state *src_reg)
 {
-	u64 *dst_umin = &dst_reg->umin_value;
-	u64 *dst_umax = &dst_reg->umax_value;
-	u64 src_val = src_reg->umin_value; /* non-zero, const divisor */
+	u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */
 
-	*dst_umin = div64_u64(*dst_umin, src_val);
-	*dst_umax = div64_u64(*dst_umax, src_val);
+	reg_set_urange64(dst_reg, div64_u64(reg_umin(dst_reg), src_val),
+			 div64_u64(reg_umax(dst_reg), src_val));
 
 	/* Reset other ranges/tnum to unbounded/unknown. */
-	dst_reg->smin_value = S64_MIN;
-	dst_reg->smax_value = S64_MAX;
 	reset_reg32_and_tnum(dst_reg);
 }
 
 static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg,
 				  struct bpf_reg_state *src_reg)
 {
-	s32 *dst_smin = &dst_reg->s32_min_value;
-	s32 *dst_smax = &dst_reg->s32_max_value;
-	s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */
+	s32 smin = reg_s32_min(dst_reg);
+	s32 smax = reg_s32_max(dst_reg);
+	s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */
 	s32 res1, res2;
 
 	/* BPF div specification: S32_MIN / -1 = S32_MIN */
-	if (*dst_smin == S32_MIN && src_val == -1) {
+	if (smin == S32_MIN && src_val == -1) {
 		/*
 		 * If the dividend range contains more than just S32_MIN,
 		 * we cannot precisely track the result, so it becomes unbounded.
@@ -14199,35 +13962,34 @@ static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg,
 		 *     = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX]
 		 * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN.
 		 */
-		if (*dst_smax != S32_MIN) {
-			*dst_smin = S32_MIN;
-			*dst_smax = S32_MAX;
+		if (smax != S32_MIN) {
+			smin = S32_MIN;
+			smax = S32_MAX;
 		}
 		goto reset;
 	}
 
-	res1 = *dst_smin / src_val;
-	res2 = *dst_smax / src_val;
-	*dst_smin = min(res1, res2);
-	*dst_smax = max(res1, res2);
+	res1 = smin / src_val;
+	res2 = smax / src_val;
+	smin = min(res1, res2);
+	smax = max(res1, res2);
 
 reset:
+	reg_set_srange32(dst_reg, smin, smax);
 	/* Reset other ranges/tnum to unbounded/unknown. */
-	dst_reg->u32_min_value = 0;
-	dst_reg->u32_max_value = U32_MAX;
 	reset_reg64_and_tnum(dst_reg);
 }
 
 static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg,
 				struct bpf_reg_state *src_reg)
 {
-	s64 *dst_smin = &dst_reg->smin_value;
-	s64 *dst_smax = &dst_reg->smax_value;
-	s64 src_val = src_reg->smin_value; /* non-zero, const divisor */
+	s64 smin = reg_smin(dst_reg);
+	s64 smax = reg_smax(dst_reg);
+	s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */
 	s64 res1, res2;
 
 	/* BPF div specification: S64_MIN / -1 = S64_MIN */
-	if (*dst_smin == S64_MIN && src_val == -1) {
+	if (smin == S64_MIN && src_val == -1) {
 		/*
 		 * If the dividend range contains more than just S64_MIN,
 		 * we cannot precisely track the result, so it becomes unbounded.
@@ -14236,79 +13998,66 @@ static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg,
 		 *     = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX]
 		 * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN.
 		 */
-		if (*dst_smax != S64_MIN) {
-			*dst_smin = S64_MIN;
-			*dst_smax = S64_MAX;
+		if (smax != S64_MIN) {
+			smin = S64_MIN;
+			smax = S64_MAX;
 		}
 		goto reset;
 	}
 
-	res1 = div64_s64(*dst_smin, src_val);
-	res2 = div64_s64(*dst_smax, src_val);
-	*dst_smin = min(res1, res2);
-	*dst_smax = max(res1, res2);
+	res1 = div64_s64(smin, src_val);
+	res2 = div64_s64(smax, src_val);
+	smin = min(res1, res2);
+	smax = max(res1, res2);
 
 reset:
+	reg_set_srange64(dst_reg, smin, smax);
 	/* Reset other ranges/tnum to unbounded/unknown. */
-	dst_reg->umin_value = 0;
-	dst_reg->umax_value = U64_MAX;
 	reset_reg32_and_tnum(dst_reg);
 }
 
 static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg,
 				  struct bpf_reg_state *src_reg)
 {
-	u32 *dst_umin = &dst_reg->u32_min_value;
-	u32 *dst_umax = &dst_reg->u32_max_value;
-	u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */
+	u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */
 	u32 res_max = src_val - 1;
 
 	/*
 	 * If dst_umax <= res_max, the result remains unchanged.
 	 * e.g., [2, 5] % 10 = [2, 5].
 	 */
-	if (*dst_umax <= res_max)
+	if (reg_u32_max(dst_reg) <= res_max)
 		return;
 
-	*dst_umin = 0;
-	*dst_umax = min(*dst_umax, res_max);
+	reg_set_urange32(dst_reg, 0, min(reg_u32_max(dst_reg), res_max));
 
 	/* Reset other ranges/tnum to unbounded/unknown. */
-	dst_reg->s32_min_value = S32_MIN;
-	dst_reg->s32_max_value = S32_MAX;
 	reset_reg64_and_tnum(dst_reg);
 }
 
 static void scalar_min_max_umod(struct bpf_reg_state *dst_reg,
 				struct bpf_reg_state *src_reg)
 {
-	u64 *dst_umin = &dst_reg->umin_value;
-	u64 *dst_umax = &dst_reg->umax_value;
-	u64 src_val = src_reg->umin_value; /* non-zero, const divisor */
+	u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */
 	u64 res_max = src_val - 1;
 
 	/*
 	 * If dst_umax <= res_max, the result remains unchanged.
 	 * e.g., [2, 5] % 10 = [2, 5].
 	 */
-	if (*dst_umax <= res_max)
+	if (reg_umax(dst_reg) <= res_max)
 		return;
 
-	*dst_umin = 0;
-	*dst_umax = min(*dst_umax, res_max);
+	reg_set_urange64(dst_reg, 0, min(reg_umax(dst_reg), res_max));
 
 	/* Reset other ranges/tnum to unbounded/unknown. */
-	dst_reg->smin_value = S64_MIN;
-	dst_reg->smax_value = S64_MAX;
 	reset_reg32_and_tnum(dst_reg);
 }
 
 static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg,
 				  struct bpf_reg_state *src_reg)
 {
-	s32 *dst_smin = &dst_reg->s32_min_value;
-	s32 *dst_smax = &dst_reg->s32_max_value;
-	s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */
+	s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */
 
 	/*
 	 * Safe absolute value calculation:
@@ -14328,33 +14077,26 @@ static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg,
 	 * If the dividend is already within the result range,
 	 * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5].
 	 */
-	if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs)
+	if (reg_s32_min(dst_reg) >= -res_max_abs && reg_s32_max(dst_reg) <= res_max_abs)
 		return;
 
 	/* General case: result has the same sign as the dividend. */
-	if (*dst_smin >= 0) {
-		*dst_smin = 0;
-		*dst_smax = min(*dst_smax, res_max_abs);
-	} else if (*dst_smax <= 0) {
-		*dst_smax = 0;
-		*dst_smin = max(*dst_smin, -res_max_abs);
+	if (reg_s32_min(dst_reg) >= 0) {
+		reg_set_srange32(dst_reg, 0, min(reg_s32_max(dst_reg), res_max_abs));
+	} else if (reg_s32_max(dst_reg) <= 0) {
+		reg_set_srange32(dst_reg, max(reg_s32_min(dst_reg), -res_max_abs), 0);
 	} else {
-		*dst_smin = -res_max_abs;
-		*dst_smax = res_max_abs;
+		reg_set_srange32(dst_reg, -res_max_abs, res_max_abs);
 	}
 
 	/* Reset other ranges/tnum to unbounded/unknown. */
-	dst_reg->u32_min_value = 0;
-	dst_reg->u32_max_value = U32_MAX;
 	reset_reg64_and_tnum(dst_reg);
 }
 
 static void scalar_min_max_smod(struct bpf_reg_state *dst_reg,
 				struct bpf_reg_state *src_reg)
 {
-	s64 *dst_smin = &dst_reg->smin_value;
-	s64 *dst_smax = &dst_reg->smax_value;
-	s64 src_val = src_reg->smin_value; /* non-zero, const divisor */
+	s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */
 
 	/*
 	 * Safe absolute value calculation:
@@ -14374,24 +14116,19 @@ static void scalar_min_max_smod(struct bpf_reg_state *dst_reg,
 	 * If the dividend is already within the result range,
 	 * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5].
 	 */
-	if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs)
+	if (reg_smin(dst_reg) >= -res_max_abs && reg_smax(dst_reg) <= res_max_abs)
 		return;
 
 	/* General case: result has the same sign as the dividend. */
-	if (*dst_smin >= 0) {
-		*dst_smin = 0;
-		*dst_smax = min(*dst_smax, res_max_abs);
-	} else if (*dst_smax <= 0) {
-		*dst_smax = 0;
-		*dst_smin = max(*dst_smin, -res_max_abs);
+	if (reg_smin(dst_reg) >= 0) {
+		reg_set_srange64(dst_reg, 0, min(reg_smax(dst_reg), res_max_abs));
+	} else if (reg_smax(dst_reg) <= 0) {
+		reg_set_srange64(dst_reg, max(reg_smin(dst_reg), -res_max_abs), 0);
 	} else {
-		*dst_smin = -res_max_abs;
-		*dst_smax = res_max_abs;
+		reg_set_srange64(dst_reg, -res_max_abs, res_max_abs);
 	}
 
 	/* Reset other ranges/tnum to unbounded/unknown. */
-	dst_reg->umin_value = 0;
-	dst_reg->umax_value = U64_MAX;
 	reset_reg32_and_tnum(dst_reg);
 }
 
@@ -14401,7 +14138,7 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
 	bool src_known = tnum_subreg_is_const(src_reg->var_off);
 	bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
 	struct tnum var32_off = tnum_subreg(dst_reg->var_off);
-	u32 umax_val = src_reg->u32_max_value;
+	u32 umax_val = reg_u32_max(src_reg);
 
 	if (src_known && dst_known) {
 		__mark_reg32_known(dst_reg, var32_off.value);
@@ -14411,19 +14148,9 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
 	/* We get our minimum from the var_off, since that's inherently
 	 * bitwise.  Our maximum is the minimum of the operands' maxima.
 	 */
-	dst_reg->u32_min_value = var32_off.value;
-	dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
-
-	/* Safe to set s32 bounds by casting u32 result into s32 when u32
-	 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
-	 */
-	if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
-		dst_reg->s32_min_value = dst_reg->u32_min_value;
-		dst_reg->s32_max_value = dst_reg->u32_max_value;
-	} else {
-		dst_reg->s32_min_value = S32_MIN;
-		dst_reg->s32_max_value = S32_MAX;
-	}
+	reg_set_urange32(dst_reg,
+			 var32_off.value,
+			 min(reg_u32_max(dst_reg), umax_val));
 }
 
 static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
@@ -14431,7 +14158,7 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
 {
 	bool src_known = tnum_is_const(src_reg->var_off);
 	bool dst_known = tnum_is_const(dst_reg->var_off);
-	u64 umax_val = src_reg->umax_value;
+	u64 umax_val = reg_umax(src_reg);
 
 	if (src_known && dst_known) {
 		__mark_reg_known(dst_reg, dst_reg->var_off.value);
@@ -14441,19 +14168,10 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
 	/* We get our minimum from the var_off, since that's inherently
 	 * bitwise.  Our maximum is the minimum of the operands' maxima.
 	 */
-	dst_reg->umin_value = dst_reg->var_off.value;
-	dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
+	reg_set_urange64(dst_reg,
+			 dst_reg->var_off.value,
+			 min(reg_umax(dst_reg), umax_val));
 
-	/* Safe to set s64 bounds by casting u64 result into s64 when u64
-	 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
-	 */
-	if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
-		dst_reg->smin_value = dst_reg->umin_value;
-		dst_reg->smax_value = dst_reg->umax_value;
-	} else {
-		dst_reg->smin_value = S64_MIN;
-		dst_reg->smax_value = S64_MAX;
-	}
 	/* We may learn something more from the var_off */
 	__update_reg_bounds(dst_reg);
 }
@@ -14464,7 +14182,7 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
 	bool src_known = tnum_subreg_is_const(src_reg->var_off);
 	bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
 	struct tnum var32_off = tnum_subreg(dst_reg->var_off);
-	u32 umin_val = src_reg->u32_min_value;
+	u32 umin_val = reg_u32_min(src_reg);
 
 	if (src_known && dst_known) {
 		__mark_reg32_known(dst_reg, var32_off.value);
@@ -14474,19 +14192,9 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
 	/* We get our maximum from the var_off, and our minimum is the
 	 * maximum of the operands' minima
 	 */
-	dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
-	dst_reg->u32_max_value = var32_off.value | var32_off.mask;
-
-	/* Safe to set s32 bounds by casting u32 result into s32 when u32
-	 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
-	 */
-	if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
-		dst_reg->s32_min_value = dst_reg->u32_min_value;
-		dst_reg->s32_max_value = dst_reg->u32_max_value;
-	} else {
-		dst_reg->s32_min_value = S32_MIN;
-		dst_reg->s32_max_value = S32_MAX;
-	}
+	reg_set_urange32(dst_reg,
+			 max(reg_u32_min(dst_reg), umin_val),
+			 var32_off.value | var32_off.mask);
 }
 
 static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
@@ -14494,7 +14202,7 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
 {
 	bool src_known = tnum_is_const(src_reg->var_off);
 	bool dst_known = tnum_is_const(dst_reg->var_off);
-	u64 umin_val = src_reg->umin_value;
+	u64 umin_val = reg_umin(src_reg);
 
 	if (src_known && dst_known) {
 		__mark_reg_known(dst_reg, dst_reg->var_off.value);
@@ -14504,19 +14212,10 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
 	/* We get our maximum from the var_off, and our minimum is the
 	 * maximum of the operands' minima
 	 */
-	dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
-	dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
+	reg_set_urange64(dst_reg,
+			 max(reg_umin(dst_reg), umin_val),
+			 dst_reg->var_off.value | dst_reg->var_off.mask);
 
-	/* Safe to set s64 bounds by casting u64 result into s64 when u64
-	 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
-	 */
-	if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
-		dst_reg->smin_value = dst_reg->umin_value;
-		dst_reg->smax_value = dst_reg->umax_value;
-	} else {
-		dst_reg->smin_value = S64_MIN;
-		dst_reg->smax_value = S64_MAX;
-	}
 	/* We may learn something more from the var_off */
 	__update_reg_bounds(dst_reg);
 }
@@ -14534,19 +14233,7 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
 	}
 
 	/* We get both minimum and maximum from the var32_off. */
-	dst_reg->u32_min_value = var32_off.value;
-	dst_reg->u32_max_value = var32_off.value | var32_off.mask;
-
-	/* Safe to set s32 bounds by casting u32 result into s32 when u32
-	 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
-	 */
-	if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
-		dst_reg->s32_min_value = dst_reg->u32_min_value;
-		dst_reg->s32_max_value = dst_reg->u32_max_value;
-	} else {
-		dst_reg->s32_min_value = S32_MIN;
-		dst_reg->s32_max_value = S32_MAX;
-	}
+	reg_set_urange32(dst_reg, var32_off.value, var32_off.value | var32_off.mask);
 }
 
 static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
@@ -14562,46 +14249,30 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
 	}
 
 	/* We get both minimum and maximum from the var_off. */
-	dst_reg->umin_value = dst_reg->var_off.value;
-	dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
-
-	/* Safe to set s64 bounds by casting u64 result into s64 when u64
-	 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
-	 */
-	if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
-		dst_reg->smin_value = dst_reg->umin_value;
-		dst_reg->smax_value = dst_reg->umax_value;
-	} else {
-		dst_reg->smin_value = S64_MIN;
-		dst_reg->smax_value = S64_MAX;
-	}
-
-	__update_reg_bounds(dst_reg);
+	reg_set_urange64(dst_reg,
+			 dst_reg->var_off.value,
+			 dst_reg->var_off.value | dst_reg->var_off.mask);
 }
 
 static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
 				   u64 umin_val, u64 umax_val)
 {
-	/* We lose all sign bit information (except what we can pick
-	 * up from var_off)
-	 */
-	dst_reg->s32_min_value = S32_MIN;
-	dst_reg->s32_max_value = S32_MAX;
 	/* If we might shift our top bit out, then we know nothing */
-	if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) {
-		dst_reg->u32_min_value = 0;
-		dst_reg->u32_max_value = U32_MAX;
-	} else {
-		dst_reg->u32_min_value <<= umin_val;
-		dst_reg->u32_max_value <<= umax_val;
-	}
+	if (umax_val > 31 || reg_u32_max(dst_reg) > 1ULL << (31 - umax_val))
+		reg_set_urange32(dst_reg, 0, U32_MAX);
+	else
+		/* We lose all sign bit information (except what we can pick
+		 * up from var_off)
+		 */
+		reg_set_urange32(dst_reg, reg_u32_min(dst_reg) << umin_val,
+				 reg_u32_max(dst_reg) << umax_val);
 }
 
 static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
 				 struct bpf_reg_state *src_reg)
 {
-	u32 umax_val = src_reg->u32_max_value;
-	u32 umin_val = src_reg->u32_min_value;
+	u32 umax_val = reg_u32_max(src_reg);
+	u32 umin_val = reg_u32_min(src_reg);
 	/* u32 alu operation will zext upper bits */
 	struct tnum subreg = tnum_subreg(dst_reg->var_off);
 
@@ -14618,34 +14289,34 @@ static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
 static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg,
 				   u64 umin_val, u64 umax_val)
 {
+	struct cnum64 u, s;
+
 	/* Special case <<32 because it is a common compiler pattern to sign
 	 * extend subreg by doing <<32 s>>32. smin/smax assignments are correct
 	 * because s32 bounds don't flip sign when shifting to the left by
 	 * 32bits.
 	 */
-	if (umin_val == 32 && umax_val == 32) {
-		dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32;
-		dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32;
-	} else {
-		dst_reg->smax_value = S64_MAX;
-		dst_reg->smin_value = S64_MIN;
-	}
+	if (umin_val == 32 && umax_val == 32)
+		s = cnum64_from_srange((s64)reg_s32_min(dst_reg) << 32,
+				       (s64)reg_s32_max(dst_reg) << 32);
+	else
+		s = CNUM64_UNBOUNDED;
 
 	/* If we might shift our top bit out, then we know nothing */
-	if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
-		dst_reg->umin_value = 0;
-		dst_reg->umax_value = U64_MAX;
-	} else {
-		dst_reg->umin_value <<= umin_val;
-		dst_reg->umax_value <<= umax_val;
-	}
+	if (reg_umax(dst_reg) > 1ULL << (63 - umax_val))
+		u = CNUM64_UNBOUNDED;
+	else
+		u = cnum64_from_urange(reg_umin(dst_reg) << umin_val,
+				       reg_umax(dst_reg) << umax_val);
+
+	dst_reg->r64 = cnum64_intersect(u, s);
 }
 
 static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg,
 			       struct bpf_reg_state *src_reg)
 {
-	u64 umax_val = src_reg->umax_value;
-	u64 umin_val = src_reg->umin_value;
+	u64 umax_val = reg_umax(src_reg);
+	u64 umin_val = reg_umin(src_reg);
 
 	/* scalar64 calc uses 32bit unshifted bounds so must be called first */
 	__scalar64_min_max_lsh(dst_reg, umin_val, umax_val);
@@ -14660,8 +14331,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
 				 struct bpf_reg_state *src_reg)
 {
 	struct tnum subreg = tnum_subreg(dst_reg->var_off);
-	u32 umax_val = src_reg->u32_max_value;
-	u32 umin_val = src_reg->u32_min_value;
+	u32 umax_val = reg_u32_max(src_reg);
+	u32 umin_val = reg_u32_min(src_reg);
 
 	/* BPF_RSH is an unsigned shift.  If the value in dst_reg might
 	 * be negative, then either:
@@ -14677,12 +14348,10 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
 	 * and rely on inferring new ones from the unsigned bounds and
 	 * var_off of the result.
 	 */
-	dst_reg->s32_min_value = S32_MIN;
-	dst_reg->s32_max_value = S32_MAX;
 
 	dst_reg->var_off = tnum_rshift(subreg, umin_val);
-	dst_reg->u32_min_value >>= umax_val;
-	dst_reg->u32_max_value >>= umin_val;
+	reg_set_urange32(dst_reg, reg_u32_min(dst_reg) >> umax_val,
+			 reg_u32_max(dst_reg) >> umin_val);
 
 	__mark_reg64_unbounded(dst_reg);
 	__update_reg32_bounds(dst_reg);
@@ -14691,8 +14360,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
 static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
 			       struct bpf_reg_state *src_reg)
 {
-	u64 umax_val = src_reg->umax_value;
-	u64 umin_val = src_reg->umin_value;
+	u64 umax_val = reg_umax(src_reg);
+	u64 umin_val = reg_umin(src_reg);
 
 	/* BPF_RSH is an unsigned shift.  If the value in dst_reg might
 	 * be negative, then either:
@@ -14708,11 +14377,9 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
 	 * and rely on inferring new ones from the unsigned bounds and
 	 * var_off of the result.
 	 */
-	dst_reg->smin_value = S64_MIN;
-	dst_reg->smax_value = S64_MAX;
 	dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
-	dst_reg->umin_value >>= umax_val;
-	dst_reg->umax_value >>= umin_val;
+	reg_set_urange64(dst_reg, reg_umin(dst_reg) >> umax_val,
+			 reg_umax(dst_reg) >> umin_val);
 
 	/* Its not easy to operate on alu32 bounds here because it depends
 	 * on bits being shifted in. Take easy way out and mark unbounded
@@ -14725,22 +14392,19 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
 static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg,
 				  struct bpf_reg_state *src_reg)
 {
-	u64 umin_val = src_reg->u32_min_value;
+	u64 umin_val = reg_u32_min(src_reg);
 
 	/* Upon reaching here, src_known is true and
 	 * umax_val is equal to umin_val.
+	 * Blow away the dst_reg u32 min/max bounds and rely on
+	 * dst_reg var_off to refine the result.
 	 */
-	dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val);
-	dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val);
+	reg_set_srange32(dst_reg,
+			 (u32)(((s32)reg_s32_min(dst_reg)) >> umin_val),
+			 (u32)(((s32)reg_s32_max(dst_reg)) >> umin_val));
 
 	dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32);
 
-	/* blow away the dst_reg umin_value/umax_value and rely on
-	 * dst_reg var_off to refine the result.
-	 */
-	dst_reg->u32_min_value = 0;
-	dst_reg->u32_max_value = U32_MAX;
-
 	__mark_reg64_unbounded(dst_reg);
 	__update_reg32_bounds(dst_reg);
 }
@@ -14748,22 +14412,16 @@ static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg,
 static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
 				struct bpf_reg_state *src_reg)
 {
-	u64 umin_val = src_reg->umin_value;
+	u64 umin_val = reg_umin(src_reg);
 
 	/* Upon reaching here, src_known is true and umax_val is equal
 	 * to umin_val.
 	 */
-	dst_reg->smin_value >>= umin_val;
-	dst_reg->smax_value >>= umin_val;
+	reg_set_srange64(dst_reg, reg_smin(dst_reg) >> umin_val,
+			 reg_smax(dst_reg) >> umin_val);
 
 	dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64);
 
-	/* blow away the dst_reg umin_value/umax_value and rely on
-	 * dst_reg var_off to refine the result.
-	 */
-	dst_reg->umin_value = 0;
-	dst_reg->umax_value = U64_MAX;
-
 	/* Its not easy to operate on alu32 bounds here because it depends
 	 * on bits being shifted in from upper 32-bits. Take easy way out
 	 * and mark unbounded so we can recalculate later from tnum.
@@ -14829,13 +14487,13 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
 
 	if (insn_bitness == 32) {
 		if (tnum_subreg_is_const(src_reg->var_off)
-		    && src_reg->s32_min_value == src_reg->s32_max_value
-		    && src_reg->u32_min_value == src_reg->u32_max_value)
+		    && reg_s32_min(src_reg) == reg_s32_max(src_reg)
+		    && reg_u32_min(src_reg) == reg_u32_max(src_reg))
 			src_is_const = true;
 	} else {
 		if (tnum_is_const(src_reg->var_off)
-		    && src_reg->smin_value == src_reg->smax_value
-		    && src_reg->umin_value == src_reg->umax_value)
+		    && reg_smin(src_reg) == reg_smax(src_reg)
+		    && reg_umin(src_reg) == reg_umax(src_reg))
 			src_is_const = true;
 	}
 
@@ -14865,7 +14523,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
 	case BPF_LSH:
 	case BPF_RSH:
 	case BPF_ARSH:
-		return (src_is_const && src_reg->umax_value < insn_bitness);
+		return (src_is_const && reg_umax(src_reg) < insn_bitness);
 	default:
 		return false;
 	}
@@ -14878,9 +14536,9 @@ static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *ins
 	struct bpf_reg_state *regs;
 	bool alu32;
 
-	if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0)
+	if (reg_smin(dst_reg) == -1 && reg_smax(dst_reg) == 0)
 		alu32 = false;
-	else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0)
+	else if (reg_s32_min(dst_reg) == -1 && reg_s32_max(dst_reg) == 0)
 		alu32 = true;
 	else
 		return 0;
@@ -14964,7 +14622,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		break;
 	case BPF_DIV:
 		/* BPF div specification: x / 0 = 0 */
-		if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) {
+		if ((alu32 && reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0)) {
 			___mark_reg_known(dst_reg, 0);
 			break;
 		}
@@ -14981,7 +14639,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		break;
 	case BPF_MOD:
 		/* BPF mod specification: x % 0 = x */
-		if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0))
+		if ((alu32 && reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0))
 			break;
 		if (alu32)
 			if (off == 1)
@@ -15169,7 +14827,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	 * umax_value before the ALU operation. After adjust_scalar_min_max_vals(),
 	 * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX.
 	 */
-	u64 dst_umax = dst_reg->umax_value;
+	u64 dst_umax = reg_umax(dst_reg);
 
 	err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
 	if (err)
@@ -15299,7 +14957,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 					 * copy register state to dest reg
 					 */
 					assign_scalar_id_before_mov(env, src_reg);
-					copy_register_state(dst_reg, src_reg);
+					*dst_reg = *src_reg;
 					dst_reg->subreg_def = DEF_NOT_SUBREG;
 				} else {
 					/* case: R1 = (s8, s16 s32)R2 */
@@ -15311,10 +14969,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 					} else if (src_reg->type == SCALAR_VALUE) {
 						bool no_sext;
 
-						no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+						no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1));
 						if (no_sext)
 							assign_scalar_id_before_mov(env, src_reg);
-						copy_register_state(dst_reg, src_reg);
+						*dst_reg = *src_reg;
 						if (!no_sext)
 							clear_scalar_id(dst_reg);
 						coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
@@ -15336,7 +14994,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 						if (is_src_reg_u32)
 							assign_scalar_id_before_mov(env, src_reg);
-						copy_register_state(dst_reg, src_reg);
+						*dst_reg = *src_reg;
 						/* Make sure ID is cleared if src_reg is not in u32
 						 * range otherwise dst_reg min/max could be incorrectly
 						 * propagated into src_reg by sync_linked_regs()
@@ -15346,11 +15004,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 						dst_reg->subreg_def = env->insn_idx + 1;
 					} else {
 						/* case: W1 = (s8, s16)W2 */
-						bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+						bool no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1));
 
 						if (no_sext)
 							assign_scalar_id_before_mov(env, src_reg);
-						copy_register_state(dst_reg, src_reg);
+						*dst_reg = *src_reg;
 						if (!no_sext)
 							clear_scalar_id(dst_reg);
 						dst_reg->subreg_def = env->insn_idx + 1;
@@ -15428,17 +15086,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 	struct bpf_reg_state *reg;
 	int new_range;
 
-	if (dst_reg->umax_value == 0 && range_right_open)
+	if (reg_umax(dst_reg) == 0 && range_right_open)
 		/* This doesn't give us any range */
 		return;
 
-	if (dst_reg->umax_value > MAX_PACKET_OFF)
+	if (reg_umax(dst_reg) > MAX_PACKET_OFF)
 		/* Risk of overflow.  For instance, ptr + (1<<63) may be less
 		 * than pkt_end, but that's because it's also less than pkt.
 		 */
 		return;
 
-	new_range = dst_reg->umax_value;
+	new_range = reg_umax(dst_reg);
 	if (range_right_open)
 		new_range++;
 
@@ -15487,7 +15145,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 	/* If our ids match, then we must have the same max_value.  And we
 	 * don't care about the other reg's fixed offset, since if it's too big
 	 * the range won't allow anything.
-	 * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16.
+	 * reg_umax(dst_reg) is known < MAX_PACKET_OFF, therefore it fits in a u16.
 	 */
 	bpf_for_each_reg_in_vstate(vstate, state, reg, ({
 		if (reg->type == type && reg->id == dst_reg->id)
@@ -15543,14 +15201,14 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s
 {
 	struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off;
 	struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
-	u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value;
-	u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value;
-	s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value;
-	s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value;
-	u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value;
-	u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value;
-	s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value;
-	s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value;
+	u64 umin1 = is_jmp32 ? (u64)reg_u32_min(reg1) : reg_umin(reg1);
+	u64 umax1 = is_jmp32 ? (u64)reg_u32_max(reg1) : reg_umax(reg1);
+	s64 smin1 = is_jmp32 ? (s64)reg_s32_min(reg1) : reg_smin(reg1);
+	s64 smax1 = is_jmp32 ? (s64)reg_s32_max(reg1) : reg_smax(reg1);
+	u64 umin2 = is_jmp32 ? (u64)reg_u32_min(reg2) : reg_umin(reg2);
+	u64 umax2 = is_jmp32 ? (u64)reg_u32_max(reg2) : reg_umax(reg2);
+	s64 smin2 = is_jmp32 ? (s64)reg_s32_min(reg2) : reg_smin(reg2);
+	s64 smax2 = is_jmp32 ? (s64)reg_s32_max(reg2) : reg_smax(reg2);
 
 	if (reg1 == reg2) {
 		switch (opcode) {
@@ -15595,11 +15253,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s
 			 * utilize 32-bit subrange knowledge to eliminate
 			 * branches that can't be taken a priori
 			 */
-			if (reg1->u32_min_value > reg2->u32_max_value ||
-			    reg1->u32_max_value < reg2->u32_min_value)
+			if (reg_u32_min(reg1) > reg_u32_max(reg2) ||
+			    reg_u32_max(reg1) < reg_u32_min(reg2))
 				return 0;
-			if (reg1->s32_min_value > reg2->s32_max_value ||
-			    reg1->s32_max_value < reg2->s32_min_value)
+			if (reg_s32_min(reg1) > reg_s32_max(reg2) ||
+			    reg_s32_max(reg1) < reg_s32_min(reg2))
 				return 0;
 		}
 		break;
@@ -15621,11 +15279,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s
 			 * utilize 32-bit subrange knowledge to eliminate
 			 * branches that can't be taken a priori
 			 */
-			if (reg1->u32_min_value > reg2->u32_max_value ||
-			    reg1->u32_max_value < reg2->u32_min_value)
+			if (reg_u32_min(reg1) > reg_u32_max(reg2) ||
+			    reg_u32_max(reg1) < reg_u32_min(reg2))
 				return 1;
-			if (reg1->s32_min_value > reg2->s32_max_value ||
-			    reg1->s32_max_value < reg2->s32_min_value)
+			if (reg_s32_min(reg1) > reg_s32_max(reg2) ||
+			    reg_s32_max(reg1) < reg_s32_min(reg2))
 				return 1;
 		}
 		break;
@@ -15780,7 +15438,7 @@ static int is_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *r
 		if (!is_reg_const(reg2, is_jmp32))
 			return -1;
 
-		if (!reg_not_null(reg1))
+		if (!reg_not_null(env, reg1))
 			return -1;
 
 		/* If pointer is valid tests against zero will fail so we can
@@ -15852,27 +15510,15 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
 	switch (opcode) {
 	case BPF_JEQ:
 		if (is_jmp32) {
-			reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
-			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
-			reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
-			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
-			reg2->u32_min_value = reg1->u32_min_value;
-			reg2->u32_max_value = reg1->u32_max_value;
-			reg2->s32_min_value = reg1->s32_min_value;
-			reg2->s32_max_value = reg1->s32_max_value;
+			reg1->r32 = cnum32_intersect(reg1->r32, reg2->r32);
+			reg2->r32 = reg1->r32;
 
 			t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off));
 			reg1->var_off = tnum_with_subreg(reg1->var_off, t);
 			reg2->var_off = tnum_with_subreg(reg2->var_off, t);
 		} else {
-			reg1->umin_value = max(reg1->umin_value, reg2->umin_value);
-			reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
-			reg1->smin_value = max(reg1->smin_value, reg2->smin_value);
-			reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
-			reg2->umin_value = reg1->umin_value;
-			reg2->umax_value = reg1->umax_value;
-			reg2->smin_value = reg1->smin_value;
-			reg2->smax_value = reg1->smax_value;
+			reg1->r64 = cnum64_intersect(reg1->r64, reg2->r64);
+			reg2->r64 = reg1->r64;
 
 			reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off);
 			reg2->var_off = reg1->var_off;
@@ -15889,32 +15535,11 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
 		 */
 		val = reg_const_value(reg2, is_jmp32);
 		if (is_jmp32) {
-			/* u32_min_value is not equal to 0xffffffff at this point,
-			 * because otherwise u32_max_value is 0xffffffff as well,
-			 * in such a case both reg1 and reg2 would be constants,
-			 * jump would be predicted and regs_refine_cond_op()
-			 * wouldn't be called.
-			 *
-			 * Same reasoning works for all {u,s}{min,max}{32,64} cases
-			 * below.
-			 */
-			if (reg1->u32_min_value == (u32)val)
-				reg1->u32_min_value++;
-			if (reg1->u32_max_value == (u32)val)
-				reg1->u32_max_value--;
-			if (reg1->s32_min_value == (s32)val)
-				reg1->s32_min_value++;
-			if (reg1->s32_max_value == (s32)val)
-				reg1->s32_max_value--;
+			/* Complement of the range [val, val] as cnum32. */
+			cnum32_intersect_with(&reg1->r32, (struct cnum32){ val + 1, U32_MAX - 1 });
 		} else {
-			if (reg1->umin_value == (u64)val)
-				reg1->umin_value++;
-			if (reg1->umax_value == (u64)val)
-				reg1->umax_value--;
-			if (reg1->smin_value == (s64)val)
-				reg1->smin_value++;
-			if (reg1->smax_value == (s64)val)
-				reg1->smax_value--;
+			/* Complement of the range [val, val] as cnum64. */
+			cnum64_intersect_with(&reg1->r64, (struct cnum64){ val + 1, U64_MAX - 1 });
 		}
 		break;
 	case BPF_JSET:
@@ -15961,38 +15586,38 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
 		break;
 	case BPF_JLE:
 		if (is_jmp32) {
-			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
-			reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
+			cnum32_intersect_with_urange(&reg1->r32, 0, reg_u32_max(reg2));
+			cnum32_intersect_with_urange(&reg2->r32, reg_u32_min(reg1), U32_MAX);
 		} else {
-			reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
-			reg2->umin_value = max(reg1->umin_value, reg2->umin_value);
+			cnum64_intersect_with_urange(&reg1->r64, 0, reg_umax(reg2));
+			cnum64_intersect_with_urange(&reg2->r64, reg_umin(reg1), U64_MAX);
 		}
 		break;
 	case BPF_JLT:
 		if (is_jmp32) {
-			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1);
-			reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value);
+			cnum32_intersect_with_urange(&reg1->r32, 0, reg_u32_max(reg2) - 1);
+			cnum32_intersect_with_urange(&reg2->r32, reg_u32_min(reg1) + 1, U32_MAX);
 		} else {
-			reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1);
-			reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value);
+			cnum64_intersect_with_urange(&reg1->r64, 0, reg_umax(reg2) - 1);
+			cnum64_intersect_with_urange(&reg2->r64, reg_umin(reg1) + 1, U64_MAX);
 		}
 		break;
 	case BPF_JSLE:
 		if (is_jmp32) {
-			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
-			reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+			cnum32_intersect_with_srange(&reg1->r32, S32_MIN, reg_s32_max(reg2));
+			cnum32_intersect_with_srange(&reg2->r32, reg_s32_min(reg1), S32_MAX);
 		} else {
-			reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
-			reg2->smin_value = max(reg1->smin_value, reg2->smin_value);
+			cnum64_intersect_with_srange(&reg1->r64, S64_MIN, reg_smax(reg2));
+			cnum64_intersect_with_srange(&reg2->r64, reg_smin(reg1), S64_MAX);
 		}
 		break;
 	case BPF_JSLT:
 		if (is_jmp32) {
-			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1);
-			reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value);
+			cnum32_intersect_with_srange(&reg1->r32, S32_MIN, reg_s32_max(reg2) - 1);
+			cnum32_intersect_with_srange(&reg2->r32, reg_s32_min(reg1) + 1, S32_MAX);
 		} else {
-			reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1);
-			reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
+			cnum64_intersect_with_srange(&reg1->r64, S64_MIN, reg_smax(reg2) - 1);
+			cnum64_intersect_with_srange(&reg2->r64, reg_smin(reg1) + 1, S64_MAX);
 		}
 		break;
 	default:
@@ -16030,7 +15655,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 		    WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0)))
 			return;
 		if (is_null) {
-			/* We don't need id and ref_obj_id from this point
+			/* We don't need id from this point
 			 * onwards anymore, thus we should better reset it,
 			 * so that state pruning has chances to take effect.
 			 */
@@ -16042,15 +15667,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 
 		mark_ptr_not_null_reg(reg);
 
-		if (!reg_may_point_to_spin_lock(reg)) {
-			/* For not-NULL ptr, reg->ref_obj_id will be reset
-			 * in release_reference().
-			 *
-			 * reg->id is still used by spin_lock ptr. Other
-			 * than spin_lock ptr type, reg->id can be reset.
-			 */
-			reg->id = 0;
-		}
+		/*
+		 * reg->id is preserved for object relationship tracking
+		 * and spin_lock lock state tracking
+		 */
 	}
 }
 
@@ -16062,10 +15682,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 {
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_reg_state *regs = state->regs, *reg;
-	u32 ref_obj_id = regs[regno].ref_obj_id;
 	u32 id = regs[regno].id;
 
-	if (ref_obj_id && ref_obj_id == id && is_null)
+	if (is_null && find_reference_state(vstate, id))
 		/* regs[regno] is in the " == NULL" branch.
 		 * No one could have freed the reference state before
 		 * doing the NULL check.
@@ -16263,7 +15882,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
 		    reg->delta == known_reg->delta) {
 			s32 saved_subreg_def = reg->subreg_def;
 
-			copy_register_state(reg, known_reg);
+			*reg = *known_reg;
 			reg->subreg_def = saved_subreg_def;
 		} else {
 			s32 saved_subreg_def = reg->subreg_def;
@@ -16274,7 +15893,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
 			__mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta);
 
 			/* reg = known_reg; reg += delta */
-			copy_register_state(reg, known_reg);
+			*reg = *known_reg;
 			/*
 			 * Must preserve off, id and subreg_def flag,
 			 * otherwise another sync_linked_regs() will be incorrect.
@@ -16371,16 +15990,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	}
 
 	if (insn_flags) {
-		err = bpf_push_jmp_history(env, this_branch, insn_flags, 0);
+		err = bpf_push_jmp_history(env, this_branch, insn_flags, 0, 0, 0);
 		if (err)
 			return err;
 	}
 
 	is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
-	copy_register_state(&env->false_reg1, dst_reg);
-	copy_register_state(&env->false_reg2, src_reg);
-	copy_register_state(&env->true_reg1, dst_reg);
-	copy_register_state(&env->true_reg2, src_reg);
+	env->false_reg1 = *dst_reg;
+	env->false_reg2 = *src_reg;
+	env->true_reg1 = *dst_reg;
+	env->true_reg2 = *src_reg;
 	pred = is_branch_taken(env, dst_reg, src_reg, opcode, is_jmp32);
 	if (pred >= 0) {
 		/* If we get here with a dst_reg pointer type it is because
@@ -16435,7 +16054,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
 		collect_linked_regs(env, this_branch, dst_reg->id, &linked_regs);
 	if (linked_regs.cnt > 1) {
-		err = bpf_push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+		err = bpf_push_jmp_history(env, this_branch, 0, 0, 0, linked_regs_pack(&linked_regs));
 		if (err)
 			return err;
 	}
@@ -16449,11 +16068,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	if (err)
 		return err;
 
-	copy_register_state(dst_reg, &env->false_reg1);
-	copy_register_state(src_reg, &env->false_reg2);
-	copy_register_state(&other_branch_regs[insn->dst_reg], &env->true_reg1);
+	*dst_reg = env->false_reg1;
+	*src_reg = env->false_reg2;
+	other_branch_regs[insn->dst_reg] = env->true_reg1;
 	if (BPF_SRC(insn->code) == BPF_X)
-		copy_register_state(&other_branch_regs[insn->src_reg], &env->true_reg2);
+		other_branch_regs[insn->src_reg] = env->true_reg2;
 
 	if (BPF_SRC(insn->code) == BPF_X &&
 	    src_reg->type == SCALAR_VALUE && src_reg->id &&
@@ -16788,6 +16407,9 @@ static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_
 		case BPF_TRACE_FENTRY:
 		case BPF_TRACE_FEXIT:
 		case BPF_TRACE_FSESSION:
+		case BPF_TRACE_FENTRY_MULTI:
+		case BPF_TRACE_FEXIT_MULTI:
+		case BPF_TRACE_FSESSION_MULTI:
 			*range = retval_range(0, 0);
 			break;
 		case BPF_TRACE_RAW_TP:
@@ -16904,8 +16526,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 		ret_type = btf_type_resolve_ptr(prog->aux->attach_btf,
 						prog->aux->attach_func_proto->type,
 						NULL);
-		if (ret_type && ret_type == reg_type && reg->ref_obj_id)
-			return __check_ptr_off_reg(env, reg, regno, false);
+		if (ret_type && ret_type == reg_type && reg_is_referenced(env, reg))
+			return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false);
 	}
 
 	/* eBPF calling convention is such that R0 is used
@@ -16977,6 +16599,10 @@ static int check_global_subprog_return_code(struct bpf_verifier_env *env)
 	if (err)
 		return err;
 
+	/* Pointers to arena are safe to pass between subprograms. */
+	if (is_arena_reg(env, BPF_REG_0))
+		return 0;
+
 	if (is_pointer_value(env, BPF_REG_0)) {
 		verbose(env, "R%d leaks addr as return value\n", BPF_REG_0);
 		return -EACCES;
@@ -17493,16 +17119,16 @@ static int indirect_jump_min_max_index(struct bpf_verifier_env *env,
 				       u32 *pmin_index, u32 *pmax_index)
 {
 	struct bpf_reg_state *reg = reg_state(env, regno);
-	u64 min_index = reg->umin_value;
-	u64 max_index = reg->umax_value;
+	u64 min_index = reg_umin(reg);
+	u64 max_index = reg_umax(reg);
 	const u32 size = 8;
 
 	if (min_index > (u64) U32_MAX * size) {
-		verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value);
+		verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg_umin(reg));
 		return -ERANGE;
 	}
 	if (max_index > (u64) U32_MAX * size) {
-		verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value);
+		verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg_umax(reg));
 		return -ERANGE;
 	}
 
@@ -17601,6 +17227,14 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
 		return check_store_reg(env, insn, false);
 
 	case BPF_ST: {
+		/* Handle stack arg write (store immediate) */
+		if (is_stack_arg_st(insn)) {
+			struct bpf_verifier_state *vstate = env->cur_state;
+			struct bpf_func_state *state = vstate->frame[vstate->curframe];
+
+			return check_stack_arg_write(env, state, insn->off, NULL);
+		}
+
 		enum bpf_reg_type dst_reg_type;
 
 		err = check_reg_arg(env, insn->dst_reg, SRC_OP);
@@ -17609,7 +17243,7 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
 
 		dst_reg_type = cur_regs(env)[insn->dst_reg].type;
 
-		err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+		err = check_mem_access(env, env->insn_idx, cur_regs(env) + insn->dst_reg, argno_from_reg(insn->dst_reg),
 				       insn->off, BPF_SIZE(insn->code),
 				       BPF_WRITE, -1, false, false);
 		if (err)
@@ -17635,6 +17269,8 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
 				}
 			}
 			mark_reg_scratched(env, BPF_REG_0);
+			if (bpf_in_stack_arg_cnt(&env->subprog_info[cur_func(env)->subprogno]))
+				cur_func(env)->no_stack_arg_load = true;
 			if (insn->src_reg == BPF_PSEUDO_CALL)
 				return check_func_call(env, insn, &env->insn_idx);
 			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
@@ -17732,7 +17368,7 @@ static int do_check(struct bpf_verifier_env *env)
 		}
 
 		if (bpf_is_jmp_point(env, env->insn_idx)) {
-			err = bpf_push_jmp_history(env, state, 0, 0);
+			err = bpf_push_jmp_history(env, state, 0, 0, 0, 0);
 			if (err)
 				return err;
 		}
@@ -18117,11 +17753,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 	if (prog->sleepable)
 		switch (map->map_type) {
 		case BPF_MAP_TYPE_HASH:
+		case BPF_MAP_TYPE_RHASH:
 		case BPF_MAP_TYPE_LRU_HASH:
 		case BPF_MAP_TYPE_ARRAY:
 		case BPF_MAP_TYPE_PERCPU_HASH:
 		case BPF_MAP_TYPE_PERCPU_ARRAY:
 		case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+		case BPF_MAP_TYPE_LPM_TRIE:
 		case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 		case BPF_MAP_TYPE_HASH_OF_MAPS:
 		case BPF_MAP_TYPE_RINGBUF:
@@ -18439,11 +18077,12 @@ static int check_and_resolve_insns(struct bpf_verifier_env *env)
 		return err;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (insn->dst_reg >= MAX_BPF_REG) {
+		if (insn->dst_reg >= MAX_BPF_REG &&
+		    !is_stack_arg_st(insn) && !is_stack_arg_stx(insn)) {
 			verbose(env, "R%d is invalid\n", insn->dst_reg);
 			return -EINVAL;
 		}
-		if (insn->src_reg >= MAX_BPF_REG) {
+		if (insn->src_reg >= MAX_BPF_REG && !is_stack_arg_ldx(insn)) {
 			verbose(env, "R%d is invalid\n", insn->src_reg);
 			return -EINVAL;
 		}
@@ -18750,7 +18389,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 				goto out;
 			}
 		}
-		for (i = BPF_REG_1; i <= sub->arg_cnt; i++) {
+		for (i = BPF_REG_1; i <= min_t(u32, sub->arg_cnt, MAX_BPF_FUNC_REG_ARGS); i++) {
 			arg = &sub->args[i - BPF_REG_1];
 			reg = &regs[i];
 
@@ -18760,9 +18399,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 			} else if (arg->arg_type == ARG_ANYTHING) {
 				reg->type = SCALAR_VALUE;
 				mark_reg_unknown(env, regs, i);
-			} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+			} else if (arg->arg_type == ARG_PTR_TO_DYNPTR) {
 				/* assume unspecial LOCAL dynptr type */
-				__mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen);
+				__mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen, 0);
 			} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
 				reg->type = PTR_TO_MEM;
 				reg->type |= arg->arg_type &
@@ -18788,11 +18427,17 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 				mark_reg_unknown(env, regs, i);
 			} else {
 				verifier_bug(env, "unhandled arg#%d type %d",
-					     i - BPF_REG_1, arg->arg_type);
+					     i - BPF_REG_1 + 1, arg->arg_type);
 				ret = -EFAULT;
 				goto out;
 			}
 		}
+		if (env->prog->type == BPF_PROG_TYPE_EXT && sub->arg_cnt > MAX_BPF_FUNC_REG_ARGS) {
+			verbose(env, "freplace programs with >%d args not supported yet\n",
+				MAX_BPF_FUNC_REG_ARGS);
+			ret = -EINVAL;
+			goto out;
+		}
 	} else {
 		/* if main BPF program has associated BTF info, validate that
 		 * it's matching expected signature, and otherwise mark BTF
@@ -18800,8 +18445,11 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 		 */
 		if (env->prog->aux->func_info_aux) {
 			ret = btf_prepare_func_args(env, 0);
-			if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX)
+			if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) {
 				env->prog->aux->func_info_aux[0].unreliable = true;
+				sub->arg_cnt = 1;
+				sub->stack_arg_cnt = 0;
+			}
 		}
 
 		/* 1st arg to a function */
@@ -18811,9 +18459,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 
 	/* Acquire references for struct_ops program arguments tagged with "__ref" */
 	if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
-		for (i = 0; i < aux->ctx_arg_info_size; i++)
-			aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ?
-							  acquire_reference(env, 0) : 0;
+		for (i = 0; i < aux->ctx_arg_info_size; i++) {
+			ret = aux->ctx_arg_info[i].refcounted ? acquire_reference(env, 0, 0) : 0;
+			if (ret < 0)
+				goto out;
+
+			aux->ctx_arg_info[i].ref_id = ret;
+		}
 	}
 
 	ret = do_check(env);
@@ -18849,6 +18501,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
 	struct bpf_prog_aux *aux = env->prog->aux;
 	struct bpf_func_info_aux *sub_aux;
 	int i, ret, new_cnt;
+	u32 insn_processed;
 
 	if (!aux->func_info)
 		return 0;
@@ -18863,6 +18516,8 @@ again:
 		if (!bpf_subprog_is_global(env, i))
 			continue;
 
+		insn_processed = env->insn_processed;
+
 		sub_aux = subprog_aux(env, i);
 		if (!sub_aux->called || sub_aux->verified)
 			continue;
@@ -18870,6 +18525,7 @@ again:
 		env->insn_idx = env->subprog_info[i].start;
 		WARN_ON_ONCE(env->insn_idx == 0);
 		ret = do_check_common(env, i);
+		env->subprog_info[i].insn_processed = env->insn_processed - insn_processed;
 		if (ret) {
 			return ret;
 		} else if (env->log.level & BPF_LOG_LEVEL) {
@@ -18896,10 +18552,12 @@ again:
 
 static int do_check_main(struct bpf_verifier_env *env)
 {
+	u32 insn_processed = env->insn_processed;
 	int ret;
 
 	env->insn_idx = 0;
 	ret = do_check_common(env, 0);
+	env->subprog_info[0].insn_processed = env->insn_processed - insn_processed;
 	if (!ret)
 		env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
 	return ret;
@@ -18908,19 +18566,20 @@ static int do_check_main(struct bpf_verifier_env *env)
 
 static void print_verification_stats(struct bpf_verifier_env *env)
 {
-	int i;
+	/* Skip over hidden subprogs which are not verified. */
+	int i, subprog_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
 
 	if (env->log.level & BPF_LOG_STATS) {
 		verbose(env, "verification time %lld usec\n",
 			div_u64(env->verification_time, 1000));
-		verbose(env, "stack depth ");
-		for (i = 0; i < env->subprog_cnt; i++) {
-			u32 depth = env->subprog_info[i].stack_depth;
-
-			verbose(env, "%d", depth);
-			if (i + 1 < env->subprog_cnt)
-				verbose(env, "+");
-		}
+		verbose(env, "stack depth %d", env->subprog_info[0].stack_depth);
+		for (i = 1; i < subprog_cnt; i++)
+			verbose(env, "+%d", env->subprog_info[i].stack_depth);
+		verbose(env, " max %d\n", env->max_stack_depth);
+		verbose(env, "insns processed %d", env->subprog_info[0].insn_processed);
+		for (i = 1; i < subprog_cnt; i++)
+			if (bpf_subprog_is_global(env, i))
+				verbose(env, "+%d", env->subprog_info[i].insn_processed);
 		verbose(env, "\n");
 	}
 	verbose(env, "processed %d insns (limit %d) max_states_per_insn %d "
@@ -19142,6 +18801,60 @@ static int check_attach_modify_return(unsigned long addr, const char *func_name)
 
 #endif /* CONFIG_FUNCTION_ERROR_INJECTION */
 
+static bool is_tracing_multi_id(const struct bpf_prog *prog, u32 btf_id)
+{
+	return is_tracing_multi(prog->expected_attach_type) && bpf_multi_func_btf_id[0] == btf_id;
+}
+
+static int btf_id_allow_sleepable(u32 btf_id, unsigned long addr, const struct bpf_prog *prog,
+				  const struct btf *btf)
+{
+	const struct btf_type *t;
+	const char *tname;
+
+	switch (prog->type) {
+	case BPF_PROG_TYPE_TRACING:
+		t = btf_type_by_id(btf, btf_id);
+		if (!t)
+			return -EINVAL;
+		tname = btf_name_by_offset(btf, t->name_off);
+		if (!tname)
+			return -EINVAL;
+
+		/*
+		 * Sleepable *.multi programs pass this initial check; the BTF
+		 * ids that are actually attached are checked later, during
+		 * link attachment.
+		 */
+		if (is_tracing_multi_id(prog, btf_id))
+			return 0;
+		if (!check_attach_sleepable(btf_id, addr, tname))
+			return 0;
+		/*
+		 * fentry/fexit/fmod_ret progs can also be sleepable if they are
+		 * in the fmodret id set with the KF_SLEEPABLE flag.
+		 */
+		else {
+			u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, prog);
+
+			if (flags && (*flags & KF_SLEEPABLE))
+				return 0;
+		}
+		break;
+	case BPF_PROG_TYPE_LSM:
+		/*
+		 * LSM progs check that they are attached to bpf_lsm_*() funcs.
+		 * Only some of them are sleepable.
+		 */
+		if (bpf_lsm_is_sleepable_hook(btf_id))
+			return 0;
+		break;
+	default:
+		break;
+	}
+	return -EINVAL;
+}
+
 int bpf_check_attach_target(struct bpf_verifier_log *log,
 			    const struct bpf_prog *prog,
 			    const struct bpf_prog *tgt_prog,
@@ -19264,7 +18977,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 		    prog_extension &&
 		    (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
 		     tgt_prog->expected_attach_type == BPF_TRACE_FEXIT ||
-		     tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) {
+		     tgt_prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI ||
+		     tgt_prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
+		     tgt_prog->expected_attach_type == BPF_TRACE_FSESSION ||
+		     tgt_prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) {
 			/* Program extensions can extend all program types
 			 * except fentry/fexit. The reason is the following.
 			 * The fentry/fexit programs are used for performance
@@ -19314,6 +19030,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 		btp = bpf_get_raw_tracepoint(tname);
 		if (!btp)
 			return -EINVAL;
+		if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) {
+			bpf_log(log, "Sleepable program cannot attach to non-faultable tracepoint %s\n",
+				tname);
+			bpf_put_raw_tracepoint(btp);
+			return -EINVAL;
+		}
 		fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL,
 					trace_symbol);
 		bpf_put_raw_tracepoint(btp);
@@ -19364,7 +19086,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
 	case BPF_TRACE_FSESSION:
-		if (prog->expected_attach_type == BPF_TRACE_FSESSION &&
+	case BPF_TRACE_FSESSION_MULTI:
+	case BPF_TRACE_FENTRY_MULTI:
+	case BPF_TRACE_FEXIT_MULTI:
+		if ((prog->expected_attach_type == BPF_TRACE_FSESSION ||
+		    prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) &&
 		    !bpf_jit_supports_fsession()) {
 			bpf_log(log, "JIT does not support fsession\n");
 			return -EOPNOTSUPP;
@@ -19393,7 +19119,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 		if (ret < 0)
 			return ret;
 
-		if (tgt_prog) {
+		/*
+		 * *.multi programs don't need an address during program
+		 * verification; we only take the module reference if needed.
+		 */
+		if (is_tracing_multi_id(prog, btf_id)) {
+			if (btf_is_module(btf)) {
+				mod = btf_try_get_module(btf);
+				if (!mod)
+					return -ENOENT;
+			}
+			addr = 0;
+		} else if (tgt_prog) {
 			if (subprog == 0)
 				addr = (long) tgt_prog->bpf_func;
 			else
@@ -19418,32 +19155,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 		}
 
 		if (prog->sleepable) {
-			ret = -EINVAL;
-			switch (prog->type) {
-			case BPF_PROG_TYPE_TRACING:
-				if (!check_attach_sleepable(btf_id, addr, tname))
-					ret = 0;
-				/* fentry/fexit/fmod_ret progs can also be sleepable if they are
-				 * in the fmodret id set with the KF_SLEEPABLE flag.
-				 */
-				else {
-					u32 *flags = btf_kfunc_is_modify_return(btf, btf_id,
-										prog);
-
-					if (flags && (*flags & KF_SLEEPABLE))
-						ret = 0;
-				}
-				break;
-			case BPF_PROG_TYPE_LSM:
-				/* LSM progs check that they are attached to bpf_lsm_*() funcs.
-				 * Only some of them are sleepable.
-				 */
-				if (bpf_lsm_is_sleepable_hook(btf_id))
-					ret = 0;
-				break;
-			default:
-				break;
-			}
+			ret = btf_id_allow_sleepable(btf_id, addr, prog, btf);
 			if (ret) {
 				module_put(mod);
 				bpf_log(log, "%s is not sleepable\n", tname);
@@ -19530,14 +19242,22 @@ static bool can_be_sleepable(struct bpf_prog *prog)
 		case BPF_MODIFY_RETURN:
 		case BPF_TRACE_ITER:
 		case BPF_TRACE_FSESSION:
+		case BPF_TRACE_RAW_TP:
+		case BPF_TRACE_FENTRY_MULTI:
+		case BPF_TRACE_FEXIT_MULTI:
+		case BPF_TRACE_FSESSION_MULTI:
 			return true;
 		default:
 			return false;
 		}
 	}
-	return prog->type == BPF_PROG_TYPE_LSM ||
-	       prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
-	       prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+	if (prog->type == BPF_PROG_TYPE_LSM)
+		return prog->expected_attach_type != BPF_LSM_CGROUP;
+
+	return prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
+	       prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
+	       prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT ||
+	       prog->type == BPF_PROG_TYPE_TRACEPOINT;
 }
 
 static int check_attach_btf_id(struct bpf_verifier_env *env)
@@ -19559,7 +19279,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	}
 
 	if (prog->sleepable && !can_be_sleepable(prog)) {
-		verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
+		verbose(env, "Program of this type cannot be sleepable\n");
 		return -EINVAL;
 	}
 
@@ -19612,6 +19332,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 		return -EINVAL;
 	} else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
 		   prog->expected_attach_type == BPF_TRACE_FSESSION ||
+		   prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI ||
 		   prog->expected_attach_type == BPF_MODIFY_RETURN) &&
 		   btf_id_set_contains(&noreturn_deny, btf_id)) {
 		verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n",
@@ -19619,6 +19340,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 		return -EINVAL;
 	}
 
+	/*
+	 * We don't get a trampoline for tracing_multi programs at this point;
+	 * that is done when the tracing_multi link is created.
+	 */
+	if (prog->type == BPF_PROG_TYPE_TRACING &&
+	    is_tracing_multi(prog->expected_attach_type))
+		return 0;
+
 	key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
 	tr = bpf_trampoline_get(key, &tgt_info);
 	if (!tr)
@@ -19631,6 +19360,62 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	return 0;
 }
 
+int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id,
+				  struct bpf_attach_target_info *tgt_info)
+{
+	const struct btf_type *t;
+	unsigned long addr;
+	const char *tname;
+	int err;
+
+	if (!btf_id || !btf)
+		return -EINVAL;
+
+	/* Check noreturn attachment. */
+	if ((prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
+	     prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) &&
+	     btf_id_set_contains(&noreturn_deny, btf_id))
+		return -EINVAL;
+	/* Check denied attachment. */
+	if (btf_id_set_contains(&btf_id_deny, btf_id))
+		return -EINVAL;
+
+	/* Check and get function target data. */
+	t = btf_type_by_id(btf, btf_id);
+	if (!t)
+		return -EINVAL;
+	tname = btf_name_by_offset(btf, t->name_off);
+	if (!tname)
+		return -EINVAL;
+	if (!btf_type_is_func(t))
+		return -EINVAL;
+	t = btf_type_by_id(btf, t->type);
+	if (!btf_type_is_func_proto(t))
+		return -EINVAL;
+	err = btf_distill_func_proto(NULL, btf, t, tname, &tgt_info->fmodel);
+	if (err < 0)
+		return err;
+	if (btf_is_module(btf)) {
+		/* The bpf program already holds reference to module. */
+		if (WARN_ON_ONCE(!prog->aux->mod))
+			return -EINVAL;
+		addr = find_kallsyms_symbol_value(prog->aux->mod, tname);
+	} else {
+		addr = kallsyms_lookup_name(tname);
+	}
+	if (!addr || !ftrace_location(addr))
+		return -ENOENT;
+
+	/* Check sleepable program attachment. */
+	if (prog->sleepable) {
+		err = btf_id_allow_sleepable(btf_id, addr, prog, btf);
+		if (err)
+			return err;
+	}
+	tgt_info->tgt_addr = addr;
+	return 0;
+}
+
 struct btf *bpf_get_btf_vmlinux(void)
 {
 	if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
@@ -19849,8 +19634,11 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		int struct_meta_reg = BPF_REG_3;
 		int node_offset_reg = BPF_REG_4;
 
-		/* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
-		if (is_bpf_rbtree_add_kfunc(desc->func_id)) {
+		/* list_add/rbtree_add have an extra arg (prev/less),
+		 * so the args to fix up are in different registers.
+		 */
+		if (desc->func_id == special_kfunc_list[KF_bpf_list_add] ||
+		    is_bpf_rbtree_add_kfunc(desc->func_id)) {
 			struct_meta_reg = BPF_REG_4;
 			node_offset_reg = BPF_REG_5;
 		}
@@ -19868,7 +19656,9 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
 		*cnt = 1;
 	} else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] &&
-		   env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+		   (env->prog->expected_attach_type == BPF_TRACE_FSESSION ||
+		    env->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) {
+
 		/*
 		 * inline the bpf_session_is_return() for fsession:
 		 *   bool bpf_session_is_return(void *ctx)
@@ -19881,7 +19671,8 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1);
 		*cnt = 3;
 	} else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] &&
-		   env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+		   (env->prog->expected_attach_type == BPF_TRACE_FSESSION ||
+		    env->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) {
 		/*
 		 * inline bpf_session_cookie() for fsession:
 		 *   __u64 *bpf_session_cookie(void *ctx)
@@ -19912,12 +19703,12 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	return 0;
 }
 
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr,
+	      struct bpf_log_attr *attr_log)
 {
 	u64 start_time = ktime_get_ns();
 	struct bpf_verifier_env *env;
 	int i, len, ret = -EINVAL, err;
-	u32 log_true_size;
 	bool is_priv;
 
 	BTF_TYPE_EMIT(enum bpf_features);
@@ -19964,9 +19755,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	/* user could have requested verbose verifier output
 	 * and supplied buffer to store the verification trace
 	 */
-	ret = bpf_vlog_init(&env->log, attr->log_level,
-			    (char __user *) (unsigned long) attr->log_buf,
-			    attr->log_size);
+	ret = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size);
 	if (ret)
 		goto err_unlock;
 
@@ -20128,17 +19917,10 @@ skip_full_check:
 	env->prog->aux->verified_insns = env->insn_processed;
 
 	/* preserve original error even if log finalization is successful */
-	err = bpf_vlog_finalize(&env->log, &log_true_size);
+	err = bpf_log_attr_finalize(attr_log, &env->log);
 	if (err)
 		ret = err;
 
-	if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
-	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
-				  &log_true_size, sizeof(log_true_size))) {
-		ret = -EFAULT;
-		goto err_release_maps;
-	}
-
 	if (ret)
 		goto err_release_maps;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 95d806bba654..b1e1c5f0c7ba 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11712,6 +11712,15 @@ static int __perf_event_set_bpf_prog(struct perf_event *event,
 		/* only uprobe programs are allowed to be sleepable */
 		return -EINVAL;
 
+	if (prog->type == BPF_PROG_TYPE_TRACEPOINT && prog->sleepable) {
+		/*
+		 * Sleepable tracepoint programs can only attach to faultable
+		 * tracepoints. Currently only syscall tracepoints are faultable.
+		 */
+		if (!is_syscall_tp)
+			return -EINVAL;
+	}
+
 	/* Kprobe override only works for kprobes, not uprobes. */
 	if (prog->kprobe_override && !is_kprobe)
 		return -EINVAL;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a02bd258677e..82f8feea6931 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -23,6 +23,7 @@
 #include <linux/sort.h>
 #include <linux/key.h>
 #include <linux/namei.h>
+#include <linux/file.h>
 
 #include <net/bpf_sk_storage.h>
 
@@ -42,6 +43,7 @@
 
 #define MAX_UPROBE_MULTI_CNT (1U << 20)
 #define MAX_KPROBE_MULTI_CNT (1U << 20)
+#define MAX_TRACING_MULTI_CNT (1U << 20)
 
 #ifdef CONFIG_MODULES
 struct bpf_trace_module {
@@ -152,6 +154,34 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 	return ret;
 }
 
+/**
+ * trace_call_bpf_faultable - invoke BPF program in faultable context
+ * @call: tracepoint event
+ * @ctx: opaque context pointer
+ *
+ * Variant of trace_call_bpf() for faultable tracepoints (syscall
+ * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace
+ * for lifetime protection and bpf_prog_run_array_sleepable() for per-program
+ * RCU flavor selection, following the uprobe pattern.
+ *
+ * Per-program recursion protection is provided by
+ * bpf_prog_run_array_sleepable(). Global bpf_prog_active is not
+ * needed because syscall tracepoints cannot self-recurse.
+ *
+ * Must be called from a faultable/preemptible context.
+ */
+unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
+{
+	struct bpf_prog_array *prog_array;
+
+	might_fault();
+	guard(rcu_tasks_trace)();
+
+	prog_array = rcu_dereference_check(call->prog_array,
+					   rcu_read_lock_trace_held());
+	return bpf_prog_run_array_sleepable(prog_array, ctx, bpf_prog_run);
+}
+
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
 BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
 {
@@ -1305,7 +1335,8 @@ static inline bool is_uprobe_session(const struct bpf_prog *prog)
 static inline bool is_trace_fsession(const struct bpf_prog *prog)
 {
 	return prog->type == BPF_PROG_TYPE_TRACING &&
-	       prog->expected_attach_type == BPF_TRACE_FSESSION;
+	       (prog->expected_attach_type == BPF_TRACE_FSESSION ||
+		prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI);
 }
 
 static const struct bpf_func_proto *
@@ -2072,11 +2103,19 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
 static __always_inline
 void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
 {
+	struct srcu_ctr __percpu *scp = NULL;
 	struct bpf_prog *prog = link->link.prog;
+	bool sleepable = prog->sleepable;
 	struct bpf_run_ctx *old_run_ctx;
 	struct bpf_trace_run_ctx run_ctx;
 
-	rcu_read_lock_dont_migrate();
+	if (sleepable) {
+		scp = rcu_read_lock_tasks_trace();
+		migrate_disable();
+	} else {
+		rcu_read_lock_dont_migrate();
+	}
+
 	if (unlikely(!bpf_prog_get_recursion_context(prog))) {
 		bpf_prog_inc_misses_counter(prog);
 		goto out;
@@ -2085,12 +2124,18 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
 	run_ctx.bpf_cookie = link->cookie;
 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 
-	(void) bpf_prog_run(prog, args);
+	(void)bpf_prog_run(prog, args);
 
 	bpf_reset_run_ctx(old_run_ctx);
 out:
 	bpf_prog_put_recursion_context(prog);
-	rcu_read_unlock_migrate();
+
+	if (sleepable) {
+		migrate_enable();
+		rcu_read_unlock_tasks_trace(scp);
+	} else {
+		rcu_read_unlock_migrate();
+	}
 }
 
 #define UNPACK(...)			__VA_ARGS__
@@ -3170,6 +3215,38 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
 	return run_ctx->uprobe->cookie;
 }
 
+static int bpf_uprobe_multi_get_path(const union bpf_attr *attr, struct path *path)
+{
+	void __user *upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
+	u32 path_fd = attr->link_create.uprobe_multi.path_fd;
+	u32 flags = attr->link_create.uprobe_multi.flags;
+
+	if (flags & BPF_F_UPROBE_MULTI_PATH_FD) {
+		/*
+		 * When BPF_F_UPROBE_MULTI_PATH_FD is set, the executable is
+		 * identified by path_fd, upath must be NULL.
+		 */
+		if (upath)
+			return -EINVAL;
+
+		CLASS(fd, f)(path_fd);
+		if (fd_empty(f))
+			return -EBADF;
+		*path = fd_file(f)->f_path;
+		path_get(path);
+		return 0;
+	}
+
+	/*
+	 * When BPF_F_UPROBE_MULTI_PATH_FD is not set, path_fd must be 0 and
+	 * upath is resolved as absolute or relative to the cwd (AT_FDCWD).
+	 */
+	if (!upath || path_fd)
+		return -EINVAL;
+
+	return user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, path);
+}
+
 int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 {
 	struct bpf_uprobe_multi_link *link = NULL;
@@ -3179,10 +3256,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	struct task_struct *task = NULL;
 	unsigned long __user *uoffsets;
 	u64 __user *ucookies;
-	void __user *upath;
+	unsigned long size;
 	u32 flags, cnt, i;
 	struct path path;
-	char *name;
 	pid_t pid;
 	int err;
 
@@ -3197,19 +3273,18 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 		return -EINVAL;
 
 	flags = attr->link_create.uprobe_multi.flags;
-	if (flags & ~BPF_F_UPROBE_MULTI_RETURN)
+	if (flags & ~(BPF_F_UPROBE_MULTI_RETURN | BPF_F_UPROBE_MULTI_PATH_FD))
 		return -EINVAL;
 
 	/*
-	 * path, offsets and cnt are mandatory,
+	 * offsets and cnt are mandatory,
 	 * ref_ctr_offsets and cookies are optional
 	 */
-	upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
 	uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets);
 	cnt = attr->link_create.uprobe_multi.cnt;
 	pid = attr->link_create.uprobe_multi.pid;
 
-	if (!upath || !uoffsets || !cnt || pid < 0)
+	if (!uoffsets || !cnt || pid < 0)
 		return -EINVAL;
 	if (cnt > MAX_UPROBE_MULTI_CNT)
 		return -E2BIG;
@@ -3217,14 +3292,17 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
 	ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);
 
-	name = strndup_user(upath, PATH_MAX);
-	if (IS_ERR(name)) {
-		err = PTR_ERR(name);
-		return err;
-	}
+	/*
+	 * The uoffsets, uref_ctr_offsets and ucookies arrays all have the
+	 * same value size, so one byte count is enough to check that each
+	 * address range is safe for the __get_user() calls.
+	 */
+	size = sizeof(*uoffsets) * cnt;
+	if (!access_ok(uoffsets, size) || !access_ok(uref_ctr_offsets, size) ||
+	    !access_ok(ucookies, size))
+		return -EFAULT;
 
-	err = kern_path(name, LOOKUP_FOLLOW, &path);
-	kfree(name);
+	err = bpf_uprobe_multi_get_path(attr, &path);
 	if (err)
 		return err;
 
@@ -3398,12 +3476,12 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc
  * direct calls into all the specific callback implementations
  * (copy_user_data_sleepable, copy_user_data_nofault, and so on)
  */
-static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size,
+static __always_inline int __bpf_dynptr_copy_str(const struct bpf_dynptr *dptr, u64 doff, u64 size,
 						 const void *unsafe_src,
 						 copy_fn_t str_copy_fn,
 						 struct task_struct *tsk)
 {
-	struct bpf_dynptr_kern *dst;
+	const struct bpf_dynptr_kern *dst;
 	u64 chunk_sz, off;
 	void *dst_slice;
 	int cnt, err;
@@ -3439,7 +3517,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64
 					     u64 size, const void *unsafe_src,
 					     copy_fn_t copy_fn, struct task_struct *tsk)
 {
-	struct bpf_dynptr_kern *dst;
+	const struct bpf_dynptr_kern *dst;
 	void *dst_slice;
 	char buf[256];
 	u64 off, chunk_sz;
@@ -3540,49 +3618,49 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
 	return bpf_send_signal_common(sig, type, task, value);
 }
 
-__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_user_dynptr(const struct bpf_dynptr *dptr, u64 off,
 					   u64 size, const void __user *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
 				 copy_user_data_nofault, NULL);
 }
 
-__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_kernel_dynptr(const struct bpf_dynptr *dptr, u64 off,
 					     u64 size, const void *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
 				 copy_kernel_data_nofault, NULL);
 }
 
-__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
 					       u64 size, const void __user *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
 				     copy_user_str_nofault, NULL);
 }
 
-__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
 						 u64 size, const void *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
 				     copy_kernel_str_nofault, NULL);
 }
 
-__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_dynptr(const struct bpf_dynptr *dptr, u64 off,
 					  u64 size, const void __user *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
 				 copy_user_data_sleepable, NULL);
 }
 
-__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
 					      u64 size, const void __user *unsafe_ptr__ign)
 {
 	return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
 				     copy_user_str_sleepable, NULL);
 }
 
-__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_task_dynptr(const struct bpf_dynptr *dptr, u64 off,
 					       u64 size, const void __user *unsafe_ptr__ign,
 					       struct task_struct *tsk)
 {
@@ -3590,7 +3668,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
 				 copy_user_data_sleepable, tsk);
 }
 
-__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(const struct bpf_dynptr *dptr, u64 off,
 						   u64 size, const void __user *unsafe_ptr__ign,
 						   struct task_struct *tsk)
 {
@@ -3599,3 +3677,203 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64
 }
 
 __bpf_kfunc_end_defs();
+
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \
+    defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS)
+
+static void bpf_tracing_multi_link_release(struct bpf_link *link)
+{
+	struct bpf_tracing_multi_link *multi;
+
+	multi = container_of(link, struct bpf_tracing_multi_link, link);
+	WARN_ON_ONCE(bpf_trampoline_multi_detach(link->prog, multi));
+}
+
+static void bpf_tracing_multi_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_tracing_multi_link *multi;
+
+	multi = container_of(link, struct bpf_tracing_multi_link, link);
+	kvfree(multi->fexits);
+	kvfree(multi->cookies);
+	kvfree(multi);	/* last: the two frees above read through it */
+}
+
+#ifdef CONFIG_PROC_FS
+static void bpf_tracing_multi_show_fdinfo(const struct bpf_link *link,
+					  struct seq_file *seq)
+{
+	struct bpf_tracing_multi_link *multi =
+		container_of(link, struct bpf_tracing_multi_link, link);
+	const u64 *cookies = multi->cookies;
+
+	seq_printf(seq, "attach_type:\t%u\n", multi->link.attach_type);
+	seq_printf(seq, "cnt:\t%u\n", multi->nodes_cnt);
+
+	/* Table header, then one row per node; cookie prints as 0 if none were set. */
+	seq_printf(seq, "%s\t %s\t %s\t %s\n", "obj-id", "btf-id", "cookie", "func");
+	for (int n = 0; n < multi->nodes_cnt; n++) {
+		struct bpf_tracing_multi_node *node = &multi->nodes[n];
+		u64 cookie = cookies ? cookies[n] : 0;
+		u32 obj_id, btf_id;
+
+		bpf_trampoline_unpack_key(node->trampoline->key, &obj_id, &btf_id);
+		seq_printf(seq, "%u\t %u\t %llu\t %pS\n", obj_id, btf_id, cookie,
+			   (void *) node->trampoline->ip);
+
+		cond_resched();
+	}
+}
+#endif
+
+static const struct bpf_link_ops bpf_tracing_multi_link_lops = {
+	.release = bpf_tracing_multi_link_release,	/* calls bpf_trampoline_multi_detach() */
+	.dealloc_deferred = bpf_tracing_multi_link_dealloc,	/* frees fexits, cookies, link */
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo = bpf_tracing_multi_show_fdinfo,
+#endif
+};
+
+static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_unused)
+{
+	const u32 *lhs = pa, *rhs = pb;
+
+	/* Three-way compare without subtraction, so large u32 ids cannot wrap. */
+	return *lhs < *rhs ? -1 : *lhs > *rhs;
+}
+
+static void ids_swap_r(void *a, void *b, int size __maybe_unused,
+		       const void *priv __maybe_unused)
+{
+	/* priv is the { ids, cookies } pair set up by check_dup_ids(). */
+	void * const *pair = priv;
+	u32 *ids = pair[0], *lhs = a, *rhs = b;
+	u64 *cookies = pair[1];
+
+	/*
+	 * Cookies are stored parallel to ids, so mirror the swap at the
+	 * same two indices to keep each cookie paired with its id.
+	 */
+	if (cookies)
+		swap(cookies[lhs - ids], cookies[rhs - ids]);
+
+	swap(*lhs, *rhs);
+}
+
+static int check_dup_ids(u32 *ids, u64 *cookies, u32 cnt)
+{
+	void *arrays[2] = { ids, cookies };
+	u32 pos;
+
+	/*
+	 * Sorting places equal ids next to each other, so one pass over
+	 * adjacent pairs finds any duplicate. ids_swap_r() moves cookies
+	 * (when given) together with ids; callers get both arrays sorted.
+	 */
+	sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, arrays);
+
+	for (pos = 1; pos < cnt; pos++) {
+		if (ids[pos] == ids[pos - 1])
+			return -EINVAL;
+	}
+
+	/* All ids are unique. */
+	return 0;
+}
+
+int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr)
+{
+	u64 __user *ucookies = u64_to_user_ptr(attr->link_create.tracing_multi.cookies);
+	u32 __user *uids = u64_to_user_ptr(attr->link_create.tracing_multi.ids);
+	u32 cnt = attr->link_create.tracing_multi.cnt;
+	struct bpf_tracing_multi_link *link = NULL;
+	struct bpf_tramp_node *fexits = NULL;
+	struct bpf_link_primer link_primer;
+	u64 *cookies = NULL;
+	u32 *ids;
+	int err;
+
+	/* ids and cnt are mandatory, cookies are optional */
+	if (!cnt || !uids)
+		return -EINVAL;
+	if (cnt > MAX_TRACING_MULTI_CNT)
+		return -E2BIG;
+	if (attr->link_create.flags || attr->link_create.target_fd)
+		return -EINVAL;
+
+	ids = kvmalloc_objs(*ids, cnt);
+	if (!ids)
+		return -ENOMEM;
+
+	err = -EFAULT;
+	if (copy_from_user(ids, uids, cnt * sizeof(*ids)))
+		goto error;
+
+	if (ucookies) {
+		cookies = kvmalloc_objs(*cookies, cnt);
+		err = -ENOMEM;
+		if (!cookies)
+			goto error;
+		err = -EFAULT;
+		if (copy_from_user(cookies, ucookies, cnt * sizeof(*cookies)))
+			goto error;
+	}
+
+	/* Sorts ids (and cookies along with them) and rejects duplicate ids. */
+	err = check_dup_ids(ids, cookies, cnt);
+	if (err)
+		goto error;
+
+	/* The fexits array is allocated only for BPF_TRACE_FSESSION_MULTI programs. */
+	err = -ENOMEM;
+	if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) {
+		fexits = kvmalloc_objs(*fexits, cnt);
+		if (!fexits)
+			goto error;
+	}
+
+	link = kvzalloc_flex(*link, nodes, cnt);
+	if (!link)
+		goto error;
+
+	bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING_MULTI,
+		      &bpf_tracing_multi_link_lops, prog, prog->expected_attach_type);
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err)
+		goto error;
+
+	/*
+	 * Hand cookies and fexits over to the link: failures past this
+	 * point use bpf_link_cleanup() rather than the error label, and
+	 * bpf_tracing_multi_link_dealloc() is what frees them (NOTE: that
+	 * cleanup reaches ->dealloc_deferred is assumed, not shown here).
+	 */
+	link->nodes_cnt = cnt;
+	link->cookies = cookies;
+	link->fexits = fexits;
+
+	err = bpf_trampoline_multi_attach(prog, ids, link);
+	kvfree(ids);
+	if (!err)
+		return bpf_link_settle(&link_primer);
+
+	bpf_link_cleanup(&link_primer);
+	return err;
+
+error:
+	kvfree(fexits);
+	kvfree(cookies);
+	kvfree(ids);
+	kvfree(link);
+	return err;
+}
+
+#else
+
+int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;	/* stub: required ftrace direct-call configs are off */
+}
+
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b2611de3f594..f93e34dd2328 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1198,8 +1198,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
 	return __ftrace_lookup_ip(hash, ip);
 }
 
-static void __add_hash_entry(struct ftrace_hash *hash,
-			     struct ftrace_func_entry *entry)
+void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry)
 {
 	struct hlist_head *hhd;
 	unsigned long key;
@@ -1221,7 +1220,7 @@ add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigne
 
 	entry->ip = ip;
 	entry->direct = direct;
-	__add_hash_entry(hash, entry);
+	add_ftrace_hash_entry(hash, entry);
 
 	return entry;
 }
@@ -1249,6 +1248,25 @@ remove_hash_entry(struct ftrace_hash *hash,
 	hash->count--;
 }
 
+void ftrace_hash_remove(struct ftrace_hash *hash)
+{
+	struct ftrace_func_entry *ent;
+	struct hlist_node *next;
+	int bucket, nr_buckets;
+
+	if (!hash || !hash->count)
+		return;
+
+	/* Pass every entry of every bucket to remove_hash_entry(); nothing is freed here. */
+	nr_buckets = 1 << hash->size_bits;
+	for (bucket = 0; bucket < nr_buckets; bucket++) {
+		hlist_for_each_entry_safe(ent, next, &hash->buckets[bucket], hlist)
+			remove_hash_entry(hash, ent);
+	}
+	/* remove_hash_entry() decrements count, so it must be back to zero. */
+	FTRACE_WARN_ON(hash->count);
+}
+
 static void ftrace_hash_clear(struct ftrace_hash *hash)
 {
 	struct hlist_head *hhd;
@@ -1458,7 +1476,7 @@ static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size)
 		hhd = &src->buckets[i];
 		hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
 			remove_hash_entry(src, entry);
-			__add_hash_entry(new_hash, entry);
+			add_ftrace_hash_entry(new_hash, entry);
 		}
 	}
 	return new_hash;
@@ -5341,7 +5359,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
 	map->entry.ip = ip;
 	map->data = data;
 
-	__add_hash_entry(&mapper->hash, &map->entry);
+	add_ftrace_hash_entry(&mapper->hash, &map->entry);
 
 	return 0;
 }
@@ -6288,11 +6306,16 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(modify_ftrace_direct);
 
-static unsigned long hash_count(struct ftrace_hash *hash)
+static inline unsigned long hash_count(struct ftrace_hash *hash)
 {
 	return hash ? hash->count : 0;
 }
 
+unsigned long ftrace_hash_count(struct ftrace_hash *hash)
+{
+	return hash_count(hash);	/* hash_count() yields 0 for a NULL hash */
+}
+
 /**
  * hash_add - adds two struct ftrace_hash and returns the result
  * @a: struct ftrace_hash object
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8ad72e17d8eb..e98ee7e1e66f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1371,33 +1371,33 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
-static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+static int perf_call_bpf_enter(struct trace_event_call *call,
 			       struct syscall_metadata *sys_data,
-			       struct syscall_trace_enter *rec)
+			       int syscall_nr, unsigned long *args)
 {
 	struct syscall_tp_t {
 		struct trace_entry ent;
 		int syscall_nr;
 		unsigned long args[SYSCALL_DEFINE_MAXARGS];
 	} __aligned(8) param;
+	struct pt_regs regs = {};
 	int i;
 
 	BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
 
-	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
-	perf_fetch_caller_regs(regs);
-	*(struct pt_regs **)&param = regs;
-	param.syscall_nr = rec->nr;
+	/* bpf prog requires 'regs' to be the first member in the ctx */
+	perf_fetch_caller_regs(&regs);
+	*(struct pt_regs **)&param = &regs;
+	param.syscall_nr = syscall_nr;
 	for (i = 0; i < sys_data->nb_args; i++)
-		param.args[i] = rec->args[i];
-	return trace_call_bpf(call, &param);
+		param.args[i] = args[i];
+	return trace_call_bpf_faultable(call, &param);
 }
 
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
-	struct pt_regs *fake_regs;
 	struct hlist_head *head;
 	unsigned long args[6];
 	bool valid_prog_array;
@@ -1410,12 +1410,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	int size = 0;
 	int uargs = 0;
 
-	/*
-	 * Syscall probe called with preemption enabled, but the ring
-	 * buffer and per-cpu data require preemption to be disabled.
-	 */
 	might_fault();
-	guard(preempt_notrace)();
 
 	syscall_nr = trace_get_syscall_nr(current, regs);
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1429,6 +1424,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 
 	syscall_get_arguments(current, regs, args);
 
+	/*
+	 * Run BPF program in faultable context before per-cpu buffer
+	 * allocation, allowing sleepable BPF programs to execute.
+	 */
+	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+	if (valid_prog_array &&
+	    !perf_call_bpf_enter(sys_data->enter_event, sys_data,
+				 syscall_nr, args))
+		return;
+
+	/*
+	 * Per-cpu ring buffer and perf event list operations require
+	 * preemption to be disabled.
+	 */
+	guard(preempt_notrace)();
+
+	head = this_cpu_ptr(sys_data->enter_event->perf_events);
+	if (hlist_empty(head))
+		return;
+
 	/* Check if this syscall event faults in user space memory */
 	mayfault = sys_data->user_mask != 0;
 
@@ -1438,17 +1453,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 			return;
 	}
 
-	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
-	if (!valid_prog_array && hlist_empty(head))
-		return;
-
 	/* get the size after alignment with the u32 buffer size field */
 	size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
@@ -1458,13 +1468,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	if (mayfault)
 		syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);
 
-	if ((valid_prog_array &&
-	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
-	    hlist_empty(head)) {
-		perf_swevent_put_recursion_context(rctx);
-		return;
-	}
-
 	perf_trace_buf_submit(rec, size, rctx,
 			      sys_data->enter_event->event.type, 1, regs,
 			      head, NULL);
@@ -1514,40 +1517,35 @@ static void perf_sysenter_disable(struct trace_event_call *call)
 		syscall_fault_buffer_disable();
 }
 
-static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
-			      struct syscall_trace_exit *rec)
+static int perf_call_bpf_exit(struct trace_event_call *call,
+			      int syscall_nr, long ret_val)
 {
 	struct syscall_tp_t {
 		struct trace_entry ent;
 		int syscall_nr;
 		unsigned long ret;
 	} __aligned(8) param;
-
-	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
-	perf_fetch_caller_regs(regs);
-	*(struct pt_regs **)&param = regs;
-	param.syscall_nr = rec->nr;
-	param.ret = rec->ret;
-	return trace_call_bpf(call, &param);
+	struct pt_regs regs = {};
+
+	/* bpf prog requires 'regs' to be the first member in the ctx */
+	perf_fetch_caller_regs(&regs);
+	*(struct pt_regs **)&param = &regs;
+	param.syscall_nr = syscall_nr;
+	param.ret = ret_val;
+	return trace_call_bpf_faultable(call, &param);
 }
 
 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
-	struct pt_regs *fake_regs;
 	struct hlist_head *head;
 	bool valid_prog_array;
 	int syscall_nr;
 	int rctx;
 	int size;
 
-	/*
-	 * Syscall probe called with preemption enabled, but the ring
-	 * buffer and per-cpu data require preemption to be disabled.
-	 */
 	might_fault();
-	guard(preempt_notrace)();
 
 	syscall_nr = trace_get_syscall_nr(current, regs);
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1559,29 +1557,37 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	head = this_cpu_ptr(sys_data->exit_event->perf_events);
+	/*
+	 * Run BPF program in faultable context before per-cpu buffer
+	 * allocation, allowing sleepable BPF programs to execute.
+	 */
 	valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
-	if (!valid_prog_array && hlist_empty(head))
+	if (valid_prog_array &&
+	    !perf_call_bpf_exit(sys_data->exit_event, syscall_nr,
+				syscall_get_return_value(current, regs)))
+		return;
+
+	/*
+	 * Per-cpu ring buffer and perf event list operations require
+	 * preemption to be disabled.
+	 */
+	guard(preempt_notrace)();
+
+	head = this_cpu_ptr(sys_data->exit_event->perf_events);
+	if (hlist_empty(head))
 		return;
 
 	/* We can probably do that at build time */
 	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	if ((valid_prog_array &&
-	     !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
-	    hlist_empty(head)) {
-		perf_swevent_put_recursion_context(rctx);
-		return;
-	}
-
 	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
 			      1, regs, head, NULL);
 }
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index c0ba34eadb39..40cfb38ac919 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -687,6 +687,108 @@ void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
 }
 EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
 
+/* Return the element that follows @prev_key in @tbl's walk order: the next
+ * entry in @prev_key's bucket that does not itself match @prev_key, else the
+ * first entry of the next non-empty bucket. With a NULL @prev_key the scan
+ * starts at bucket 0 and returns the first entry found. Returns NULL when
+ * @tbl is exhausted, or ERR_PTR(-ENOENT) if @prev_key is not found in @tbl.
+ */
+static struct rhash_head *__rhashtable_next_in_table(
+	struct rhashtable *ht, struct bucket_table *tbl,
+	const void *prev_key)
+{
+	struct rhashtable_compare_arg arg = { .ht = ht, .key = prev_key };
+	const struct rhashtable_params params = ht->p;
+	struct rhash_head *he;
+	unsigned int b = 0;
+	bool found = false;
+
+	if (prev_key) {
+		b = rht_key_hashfn(ht, tbl, prev_key, params);
+		rht_for_each_rcu(he, tbl, b) {
+			bool match = params.obj_cmpfn
+				     ? !params.obj_cmpfn(&arg, rht_obj(ht, he))
+				     : !rhashtable_compare(&arg, rht_obj(ht, he));
+			if (found) {
+				if (match)
+					continue;	/* skip later entries that also match prev_key */
+				return he;
+			}
+			if (match)
+				found = true;
+		}
+		if (!found)
+			return ERR_PTR(-ENOENT);
+		b++;	/* nothing left to return from prev_key's bucket */
+	}
+
+	for (; b < tbl->size; b++)
+		rht_for_each_rcu(he, tbl, b)
+			return he;	/* first entry of the first non-empty bucket */
+	return NULL;
+}
+
+/**
+ * rhashtable_next_key - return next element after a given key
+ * @ht:		hash table
+ * @prev_key:	pointer to previous key, or NULL for the first element
+ *
+ * WARNING: this walk is highly unstable. Unlike rhashtable_walk_*(),
+ * it cannot detect a concurrent resize or rehash, so a full iteration
+ * is NOT guaranteed to terminate under adversarial or sustained
+ * rehashing. Callers MUST tolerate skipped and duplicated elements and
+ * SHOULD bound their loop externally.
+ *
+ * Looks up the next element in best-effort iteration order, walking the
+ * ht->tbl chain (including any future_tbl in flight). Caller must hold RCU.
+ *
+ * Pass @prev_key == NULL to obtain the first element. To iterate, set
+ * @prev_key to the key of the previously returned element on each call,
+ * and stop when NULL is returned.
+ *
+ * Best-effort semantics:
+ *   - Across the tbl->future_tbl chain, an element being migrated may
+ *     transiently appear in both tables and be observed twice.
+ *   - Concurrent inserts may or may not be observed.
+ *   - When a table is exhausted, the scan restarts from the first bucket
+ *     of its future_tbl, if one is attached; when @prev_key is missing
+ *     from a table, the same key is looked up again in its future_tbl.
+ *   - Behavior on tables that contain duplicate keys is undefined:
+ *     duplicates may be skipped, repeated, or trap the walk in a
+ *     cycle. Callers requiring duplicate-key iteration must use
+ *     rhashtable_walk_*() instead.
+ *   - rhltable instances are not supported and return
+ *     ERR_PTR(-EOPNOTSUPP).
+ *   - If prev_key was concurrently deleted and is not present in any
+ *     in-flight table, returns ERR_PTR(-ENOENT).
+ *
+ * Return: entry of the next element, or NULL when iteration is exhausted,
+ * or ERR_PTR(-ENOENT) if prev_key is not found, or
+ * ERR_PTR(-EOPNOTSUPP) if @ht is an rhltable.
+ */
+void *rhashtable_next_key(struct rhashtable *ht, const void *prev_key)
+{
+	struct bucket_table *tbl;
+	struct rhash_head *he;
+
+	if (unlikely(ht->rhlist))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	tbl = rht_dereference_rcu(ht->tbl, ht);
+	do {
+		he = __rhashtable_next_in_table(ht, tbl, prev_key);
+		if (!IS_ERR_OR_NULL(he))
+			return rht_obj(ht, he);
+		if (!he)
+			prev_key = NULL;	/* table exhausted: scan the next one from bucket 0 */
+		/* See any new future_tbl attached during a rehash. */
+		smp_rmb();
+		tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+	} while (tbl);
+	return he; /* NULL or -ENOENT */
+}
+EXPORT_SYMBOL_GPL(rhashtable_next_key);
+
 /**
  * rhashtable_walk_enter - Initialise an iterator
  * @ht:		Table to walk over
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 5892c0f17ddc..af6f3340c034 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -560,8 +560,23 @@ static int bpf_fill_max_jmp_never_taken(struct bpf_test *self)
 }
 
 /* ALU result computation used in tests */
-static bool __bpf_alu_result(u64 *res, u64 v1, u64 v2, u8 op)
+enum { F_ALU32 = 1, F_SIGNED = 2 };
+
+static bool __bpf_alu_result(u64 *res, u64 v1, u64 v2, u8 op, u32 flags)
 {
+	bool is_signed = flags & F_SIGNED;
+
+	/* Narrow operands for ALU32 */
+	if (flags & F_ALU32) {
+		if (is_signed) {
+			v1 = (u64)(s32)v1;
+			v2 = (u64)(s32)v2;
+		} else {
+			v1 = (u32)v1;
+			v2 = (u32)v2;
+		}
+	}
+
 	*res = 0;
 	switch (op) {
 	case BPF_MOV:
@@ -599,12 +614,28 @@ static bool __bpf_alu_result(u64 *res, u64 v1, u64 v2, u8 op)
 	case BPF_DIV:
 		if (v2 == 0)
 			return false;
-		*res = div64_u64(v1, v2);
+		if (!is_signed) {
+			*res = div64_u64(v1, v2);
+		} else {
+			if ((s64)v2 == -1) /* Handled by verifier */
+				return false;
+			*res = (u64)div64_s64(v1, v2);
+		}
 		break;
 	case BPF_MOD:
 		if (v2 == 0)
 			return false;
-		div64_u64_rem(v1, v2, res);
+		if (!is_signed) {
+			div64_u64_rem(v1, v2, res);
+		} else {
+			if ((s64)v2 == -1)
+				return false;
+			/*
+			 * Avoid s64 % s64 which generates __moddi3 on
+			 * 32-bit architectures. Use div64_s64 instead.
+			 */
+			*res = (u64)((s64)v1 - div64_s64(v1, v2) * (s64)v2);
+		}
 		break;
 	}
 	return true;
@@ -612,7 +643,7 @@ static bool __bpf_alu_result(u64 *res, u64 v1, u64 v2, u8 op)
 
 /* Test an ALU shift operation for all valid shift values */
 static int __bpf_fill_alu_shift(struct bpf_test *self, u8 op,
-				u8 mode, bool alu32)
+				u8 mode, u32 flags)
 {
 	static const s64 regs[] = {
 		0x0123456789abcdefLL, /* dword > 0, word < 0 */
@@ -620,7 +651,7 @@ static int __bpf_fill_alu_shift(struct bpf_test *self, u8 op,
 		0xfedcba0198765432LL, /* dword < 0, word < 0 */
 		0x0123458967abcdefLL, /* dword > 0, word > 0 */
 	};
-	int bits = alu32 ? 32 : 64;
+	int bits = (flags & F_ALU32) ? 32 : 64;
 	int len = (2 + 7 * bits) * ARRAY_SIZE(regs) + 3;
 	struct bpf_insn *insn;
 	int imm, k;
@@ -643,7 +674,7 @@ static int __bpf_fill_alu_shift(struct bpf_test *self, u8 op,
 			/* Perform operation */
 			insn[i++] = BPF_ALU64_REG(BPF_MOV, R1, R3);
 			insn[i++] = BPF_ALU64_IMM(BPF_MOV, R2, imm);
-			if (alu32) {
+			if (flags & F_ALU32) {
 				if (mode == BPF_K)
 					insn[i++] = BPF_ALU32_IMM(op, R1, imm);
 				else
@@ -653,14 +684,14 @@ static int __bpf_fill_alu_shift(struct bpf_test *self, u8 op,
 					reg = (s32)reg;
 				else
 					reg = (u32)reg;
-				__bpf_alu_result(&val, reg, imm, op);
+				__bpf_alu_result(&val, reg, imm, op, 0);
 				val = (u32)val;
 			} else {
 				if (mode == BPF_K)
 					insn[i++] = BPF_ALU64_IMM(op, R1, imm);
 				else
 					insn[i++] = BPF_ALU64_REG(op, R1, R2);
-				__bpf_alu_result(&val, reg, imm, op);
+				__bpf_alu_result(&val, reg, imm, op, 0);
 			}
 
 			/*
@@ -688,62 +719,62 @@ static int __bpf_fill_alu_shift(struct bpf_test *self, u8 op,
 
 static int bpf_fill_alu64_lsh_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_LSH, BPF_K, false);
+	return __bpf_fill_alu_shift(self, BPF_LSH, BPF_K, 0);
 }
 
 static int bpf_fill_alu64_rsh_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_RSH, BPF_K, false);
+	return __bpf_fill_alu_shift(self, BPF_RSH, BPF_K, 0);
 }
 
 static int bpf_fill_alu64_arsh_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_K, false);
+	return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_K, 0);
 }
 
 static int bpf_fill_alu64_lsh_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_LSH, BPF_X, false);
+	return __bpf_fill_alu_shift(self, BPF_LSH, BPF_X, 0);
 }
 
 static int bpf_fill_alu64_rsh_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_RSH, BPF_X, false);
+	return __bpf_fill_alu_shift(self, BPF_RSH, BPF_X, 0);
 }
 
 static int bpf_fill_alu64_arsh_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_X, false);
+	return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_X, 0);
 }
 
 static int bpf_fill_alu32_lsh_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_LSH, BPF_K, true);
+	return __bpf_fill_alu_shift(self, BPF_LSH, BPF_K, F_ALU32);
 }
 
 static int bpf_fill_alu32_rsh_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_RSH, BPF_K, true);
+	return __bpf_fill_alu_shift(self, BPF_RSH, BPF_K, F_ALU32);
 }
 
 static int bpf_fill_alu32_arsh_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_K, true);
+	return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_K, F_ALU32);
 }
 
 static int bpf_fill_alu32_lsh_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_LSH, BPF_X, true);
+	return __bpf_fill_alu_shift(self, BPF_LSH, BPF_X, F_ALU32);
 }
 
 static int bpf_fill_alu32_rsh_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_RSH, BPF_X, true);
+	return __bpf_fill_alu_shift(self, BPF_RSH, BPF_X, F_ALU32);
 }
 
 static int bpf_fill_alu32_arsh_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_X, true);
+	return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_X, F_ALU32);
 }
 
 /*
@@ -751,9 +782,9 @@ static int bpf_fill_alu32_arsh_reg(struct bpf_test *self)
  * for the case when the source and destination are the same.
  */
 static int __bpf_fill_alu_shift_same_reg(struct bpf_test *self, u8 op,
-					 bool alu32)
+					 u32 flags)
 {
-	int bits = alu32 ? 32 : 64;
+	int bits = (flags & F_ALU32) ? 32 : 64;
 	int len = 3 + 6 * bits;
 	struct bpf_insn *insn;
 	int i = 0;
@@ -770,14 +801,14 @@ static int __bpf_fill_alu_shift_same_reg(struct bpf_test *self, u8 op,
 
 		/* Perform operation */
 		insn[i++] = BPF_ALU64_IMM(BPF_MOV, R1, val);
-		if (alu32)
+		if (flags & F_ALU32)
 			insn[i++] = BPF_ALU32_REG(op, R1, R1);
 		else
 			insn[i++] = BPF_ALU64_REG(op, R1, R1);
 
 		/* Compute the reference result */
-		__bpf_alu_result(&res, val, val, op);
-		if (alu32)
+		__bpf_alu_result(&res, val, val, op, 0);
+		if (flags & F_ALU32)
 			res = (u32)res;
 		i += __bpf_ld_imm64(&insn[i], R2, res);
 
@@ -798,32 +829,32 @@ static int __bpf_fill_alu_shift_same_reg(struct bpf_test *self, u8 op,
 
 static int bpf_fill_alu64_lsh_same_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift_same_reg(self, BPF_LSH, false);
+	return __bpf_fill_alu_shift_same_reg(self, BPF_LSH, 0);
 }
 
 static int bpf_fill_alu64_rsh_same_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift_same_reg(self, BPF_RSH, false);
+	return __bpf_fill_alu_shift_same_reg(self, BPF_RSH, 0);
 }
 
 static int bpf_fill_alu64_arsh_same_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift_same_reg(self, BPF_ARSH, false);
+	return __bpf_fill_alu_shift_same_reg(self, BPF_ARSH, 0);
 }
 
 static int bpf_fill_alu32_lsh_same_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift_same_reg(self, BPF_LSH, true);
+	return __bpf_fill_alu_shift_same_reg(self, BPF_LSH, F_ALU32);
 }
 
 static int bpf_fill_alu32_rsh_same_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift_same_reg(self, BPF_RSH, true);
+	return __bpf_fill_alu_shift_same_reg(self, BPF_RSH, F_ALU32);
 }
 
 static int bpf_fill_alu32_arsh_same_reg(struct bpf_test *self)
 {
-	return __bpf_fill_alu_shift_same_reg(self, BPF_ARSH, true);
+	return __bpf_fill_alu_shift_same_reg(self, BPF_ARSH, F_ALU32);
 }
 
 /*
@@ -936,17 +967,20 @@ static int __bpf_fill_pattern(struct bpf_test *self, void *arg,
 static int __bpf_emit_alu64_imm(struct bpf_test *self, void *arg,
 				struct bpf_insn *insns, s64 dst, s64 imm)
 {
-	int op = *(int *)arg;
+	int *a = arg;
+	int op = a[0];
+	u32 flags = a[1];
+	s16 off = (flags & F_SIGNED) ? 1 : 0;
 	int i = 0;
 	u64 res;
 
 	if (!insns)
 		return 7;
 
-	if (__bpf_alu_result(&res, dst, (s32)imm, op)) {
+	if (__bpf_alu_result(&res, dst, (s32)imm, op, flags)) {
 		i += __bpf_ld_imm64(&insns[i], R1, dst);
 		i += __bpf_ld_imm64(&insns[i], R3, res);
-		insns[i++] = BPF_ALU64_IMM(op, R1, imm);
+		insns[i++] = BPF_ALU64_IMM_OFF(op, R1, imm, off);
 		insns[i++] = BPF_JMP_REG(BPF_JEQ, R1, R3, 1);
 		insns[i++] = BPF_EXIT_INSN();
 	}
@@ -957,17 +991,20 @@ static int __bpf_emit_alu64_imm(struct bpf_test *self, void *arg,
 static int __bpf_emit_alu32_imm(struct bpf_test *self, void *arg,
 				struct bpf_insn *insns, s64 dst, s64 imm)
 {
-	int op = *(int *)arg;
+	int *a = arg;
+	int op = a[0];
+	u32 flags = a[1];
+	s16 off = (flags & F_SIGNED) ? 1 : 0;
 	int i = 0;
 	u64 res;
 
 	if (!insns)
 		return 7;
 
-	if (__bpf_alu_result(&res, (u32)dst, (u32)imm, op)) {
+	if (__bpf_alu_result(&res, dst, (s32)imm, op, flags | F_ALU32)) {
 		i += __bpf_ld_imm64(&insns[i], R1, dst);
 		i += __bpf_ld_imm64(&insns[i], R3, (u32)res);
-		insns[i++] = BPF_ALU32_IMM(op, R1, imm);
+		insns[i++] = BPF_ALU32_IMM_OFF(op, R1, imm, off);
 		insns[i++] = BPF_JMP_REG(BPF_JEQ, R1, R3, 1);
 		insns[i++] = BPF_EXIT_INSN();
 	}
@@ -985,7 +1022,7 @@ static int __bpf_emit_alu64_reg(struct bpf_test *self, void *arg,
 	if (!insns)
 		return 9;
 
-	if (__bpf_alu_result(&res, dst, src, op)) {
+	if (__bpf_alu_result(&res, dst, src, op, 0)) {
 		i += __bpf_ld_imm64(&insns[i], R1, dst);
 		i += __bpf_ld_imm64(&insns[i], R2, src);
 		i += __bpf_ld_imm64(&insns[i], R3, res);
@@ -1007,7 +1044,7 @@ static int __bpf_emit_alu32_reg(struct bpf_test *self, void *arg,
 	if (!insns)
 		return 9;
 
-	if (__bpf_alu_result(&res, (u32)dst, (u32)src, op)) {
+	if (__bpf_alu_result(&res, (u32)dst, (u32)src, op, 0)) {
 		i += __bpf_ld_imm64(&insns[i], R1, dst);
 		i += __bpf_ld_imm64(&insns[i], R2, src);
 		i += __bpf_ld_imm64(&insns[i], R3, (u32)res);
@@ -1019,16 +1056,20 @@ static int __bpf_emit_alu32_reg(struct bpf_test *self, void *arg,
 	return i;
 }
 
-static int __bpf_fill_alu64_imm(struct bpf_test *self, int op)
+static int __bpf_fill_alu64_imm(struct bpf_test *self, int op, u32 flags)
 {
-	return __bpf_fill_pattern(self, &op, 64, 32,
+	int arg[2] = {op, flags};
+
+	return __bpf_fill_pattern(self, &arg, 64, 32,
 				  PATTERN_BLOCK1, PATTERN_BLOCK2,
 				  &__bpf_emit_alu64_imm);
 }
 
-static int __bpf_fill_alu32_imm(struct bpf_test *self, int op)
+static int __bpf_fill_alu32_imm(struct bpf_test *self, int op, u32 flags)
 {
-	return __bpf_fill_pattern(self, &op, 64, 32,
+	int arg[2] = {op, flags};
+
+	return __bpf_fill_pattern(self, &arg, 64, 32,
 				  PATTERN_BLOCK1, PATTERN_BLOCK2,
 				  &__bpf_emit_alu32_imm);
 }
@@ -1050,93 +1091,115 @@ static int __bpf_fill_alu32_reg(struct bpf_test *self, int op)
 /* ALU64 immediate operations */
 static int bpf_fill_alu64_mov_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu64_imm(self, BPF_MOV);
+	return __bpf_fill_alu64_imm(self, BPF_MOV, 0);
 }
 
 static int bpf_fill_alu64_and_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu64_imm(self, BPF_AND);
+	return __bpf_fill_alu64_imm(self, BPF_AND, 0);
 }
 
 static int bpf_fill_alu64_or_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu64_imm(self, BPF_OR);
+	return __bpf_fill_alu64_imm(self, BPF_OR, 0);
 }
 
 static int bpf_fill_alu64_xor_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu64_imm(self, BPF_XOR);
+	return __bpf_fill_alu64_imm(self, BPF_XOR, 0);
 }
 
 static int bpf_fill_alu64_add_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu64_imm(self, BPF_ADD);
+	return __bpf_fill_alu64_imm(self, BPF_ADD, 0);
 }
 
 static int bpf_fill_alu64_sub_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu64_imm(self, BPF_SUB);
+	return __bpf_fill_alu64_imm(self, BPF_SUB, 0);
 }
 
 static int bpf_fill_alu64_mul_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu64_imm(self, BPF_MUL);
+	return __bpf_fill_alu64_imm(self, BPF_MUL, 0);
 }
 
 static int bpf_fill_alu64_div_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu64_imm(self, BPF_DIV);
+	return __bpf_fill_alu64_imm(self, BPF_DIV, 0);
 }
 
 static int bpf_fill_alu64_mod_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu64_imm(self, BPF_MOD);
+	return __bpf_fill_alu64_imm(self, BPF_MOD, 0);
+}
+
+/* Signed ALU64 immediate operations */
+static int bpf_fill_alu64_sdiv_imm(struct bpf_test *self)
+{
+	return __bpf_fill_alu64_imm(self, BPF_DIV, F_SIGNED);
+}
+
+static int bpf_fill_alu64_smod_imm(struct bpf_test *self)
+{
+	return __bpf_fill_alu64_imm(self, BPF_MOD, F_SIGNED);
+}
+
+/* Signed ALU32 immediate operations */
+static int bpf_fill_alu32_sdiv_imm(struct bpf_test *self)
+{
+	return __bpf_fill_alu32_imm(self, BPF_DIV, F_SIGNED);
+}
+
+static int bpf_fill_alu32_smod_imm(struct bpf_test *self)
+{
+	return __bpf_fill_alu32_imm(self, BPF_MOD, F_SIGNED);
 }
 
 /* ALU32 immediate operations */
 static int bpf_fill_alu32_mov_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu32_imm(self, BPF_MOV);
+	return __bpf_fill_alu32_imm(self, BPF_MOV, 0);
 }
 
 static int bpf_fill_alu32_and_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu32_imm(self, BPF_AND);
+	return __bpf_fill_alu32_imm(self, BPF_AND, 0);
 }
 
 static int bpf_fill_alu32_or_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu32_imm(self, BPF_OR);
+	return __bpf_fill_alu32_imm(self, BPF_OR, 0);
 }
 
 static int bpf_fill_alu32_xor_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu32_imm(self, BPF_XOR);
+	return __bpf_fill_alu32_imm(self, BPF_XOR, 0);
 }
 
 static int bpf_fill_alu32_add_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu32_imm(self, BPF_ADD);
+	return __bpf_fill_alu32_imm(self, BPF_ADD, 0);
 }
 
 static int bpf_fill_alu32_sub_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu32_imm(self, BPF_SUB);
+	return __bpf_fill_alu32_imm(self, BPF_SUB, 0);
 }
 
 static int bpf_fill_alu32_mul_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu32_imm(self, BPF_MUL);
+	return __bpf_fill_alu32_imm(self, BPF_MUL, 0);
 }
 
 static int bpf_fill_alu32_div_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu32_imm(self, BPF_DIV);
+	return __bpf_fill_alu32_imm(self, BPF_DIV, 0);
 }
 
 static int bpf_fill_alu32_mod_imm(struct bpf_test *self)
 {
-	return __bpf_fill_alu32_imm(self, BPF_MOD);
+	return __bpf_fill_alu32_imm(self, BPF_MOD, 0);
 }
 
 /* ALU64 register operations */
@@ -1235,7 +1298,8 @@ static int bpf_fill_alu32_mod_reg(struct bpf_test *self)
  * Test JITs that implement complex ALU operations as function
  * calls, and must re-arrange operands for argument passing.
  */
-static int __bpf_fill_alu_imm_regs(struct bpf_test *self, u8 op, bool alu32)
+static int __bpf_fill_alu_imm_regs(struct bpf_test *self, u8 op,
+				    u32 flags)
 {
 	int len = 2 + 10 * 10;
 	struct bpf_insn *insns;
@@ -1249,28 +1313,37 @@ static int __bpf_fill_alu_imm_regs(struct bpf_test *self, u8 op, bool alu32)
 		return -ENOMEM;
 
 	/* Operand and result values according to operation */
-	if (alu32)
-		dst = 0x76543210U;
-	else
-		dst = 0x7edcba9876543210ULL;
+	if (flags & F_SIGNED) {
+		if (flags & F_ALU32)
+			dst = -76543210;
+		else
+			dst = -7654321076543210LL;
+	} else {
+		if (flags & F_ALU32)
+			dst = 0x76543210U;
+		else
+			dst = 0x7edcba9876543210ULL;
+	}
 	imm = 0x01234567U;
 
 	if (op == BPF_LSH || op == BPF_RSH || op == BPF_ARSH)
 		imm &= 31;
 
-	__bpf_alu_result(&res, dst, imm, op);
+	__bpf_alu_result(&res, dst, imm, op, flags);
 
-	if (alu32)
+	if (flags & F_ALU32)
 		res = (u32)res;
 
 	/* Check all operand registers */
 	for (rd = R0; rd <= R9; rd++) {
+		s16 off = (flags & F_SIGNED) ? 1 : 0;	/* non-zero only for F_SIGNED tests */
+
 		i += __bpf_ld_imm64(&insns[i], rd, dst);
 
-		if (alu32)
-			insns[i++] = BPF_ALU32_IMM(op, rd, imm);
+		if (flags & F_ALU32)
+			insns[i++] = BPF_ALU32_IMM_OFF(op, rd, imm, off);
 		else
-			insns[i++] = BPF_ALU64_IMM(op, rd, imm);
+			insns[i++] = BPF_ALU64_IMM_OFF(op, rd, imm, off);
 
 		insns[i++] = BPF_JMP32_IMM(BPF_JEQ, rd, res, 2);
 		insns[i++] = BPF_MOV64_IMM(R0, __LINE__);
@@ -1295,123 +1368,145 @@ static int __bpf_fill_alu_imm_regs(struct bpf_test *self, u8 op, bool alu32)
 /* ALU64 K registers */
 static int bpf_fill_alu64_mov_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_MOV, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_MOV, 0);
 }
 
 static int bpf_fill_alu64_and_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_AND, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_AND, 0);
 }
 
 static int bpf_fill_alu64_or_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_OR, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_OR, 0);
 }
 
 static int bpf_fill_alu64_xor_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_XOR, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_XOR, 0);
 }
 
 static int bpf_fill_alu64_lsh_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_LSH, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_LSH, 0);
 }
 
 static int bpf_fill_alu64_rsh_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_RSH, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_RSH, 0);
 }
 
 static int bpf_fill_alu64_arsh_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_ARSH, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_ARSH, 0);
 }
 
 static int bpf_fill_alu64_add_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_ADD, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_ADD, 0);
 }
 
 static int bpf_fill_alu64_sub_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_SUB, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_SUB, 0);
 }
 
 static int bpf_fill_alu64_mul_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_MUL, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_MUL, 0);
 }
 
 static int bpf_fill_alu64_div_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_DIV, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_DIV, 0);
 }
 
 static int bpf_fill_alu64_mod_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_MOD, false);
+	return __bpf_fill_alu_imm_regs(self, BPF_MOD, 0);
+}
+
+/* Signed ALU64 K registers */
+static int bpf_fill_alu64_sdiv_imm_regs(struct bpf_test *self)
+{
+	return __bpf_fill_alu_imm_regs(self, BPF_DIV, F_SIGNED);
+}
+
+static int bpf_fill_alu64_smod_imm_regs(struct bpf_test *self)
+{
+	return __bpf_fill_alu_imm_regs(self, BPF_MOD, F_SIGNED);
 }
 
 /* ALU32 K registers */
 static int bpf_fill_alu32_mov_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_MOV, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_MOV, F_ALU32);
 }
 
 static int bpf_fill_alu32_and_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_AND, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_AND, F_ALU32);
 }
 
 static int bpf_fill_alu32_or_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_OR, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_OR, F_ALU32);
 }
 
 static int bpf_fill_alu32_xor_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_XOR, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_XOR, F_ALU32);
 }
 
 static int bpf_fill_alu32_lsh_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_LSH, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_LSH, F_ALU32);
 }
 
 static int bpf_fill_alu32_rsh_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_RSH, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_RSH, F_ALU32);
 }
 
 static int bpf_fill_alu32_arsh_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_ARSH, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_ARSH, F_ALU32);
 }
 
 static int bpf_fill_alu32_add_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_ADD, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_ADD, F_ALU32);
 }
 
 static int bpf_fill_alu32_sub_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_SUB, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_SUB, F_ALU32);
 }
 
 static int bpf_fill_alu32_mul_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_MUL, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_MUL, F_ALU32);
 }
 
 static int bpf_fill_alu32_div_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_DIV, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_DIV, F_ALU32);
 }
 
 static int bpf_fill_alu32_mod_imm_regs(struct bpf_test *self)
 {
-	return __bpf_fill_alu_imm_regs(self, BPF_MOD, true);
+	return __bpf_fill_alu_imm_regs(self, BPF_MOD, F_ALU32);
+}
+
+/* Signed ALU32 K registers */
+static int bpf_fill_alu32_sdiv_imm_regs(struct bpf_test *self)
+{
+	return __bpf_fill_alu_imm_regs(self, BPF_DIV, F_ALU32 | F_SIGNED);
+}
+
+static int bpf_fill_alu32_smod_imm_regs(struct bpf_test *self)
+{
+	return __bpf_fill_alu_imm_regs(self, BPF_MOD, F_ALU32 | F_SIGNED);
 }
 
 /*
@@ -1442,8 +1537,8 @@ static int __bpf_fill_alu_reg_pairs(struct bpf_test *self, u8 op, bool alu32)
 	if (op == BPF_LSH || op == BPF_RSH || op == BPF_ARSH)
 		src &= 31;
 
-	__bpf_alu_result(&res, dst, src, op);
-	__bpf_alu_result(&same, src, src, op);
+	__bpf_alu_result(&res, dst, src, op, 0);
+	__bpf_alu_result(&same, src, src, op, 0);
 
 	if (alu32) {
 		res = (u32)res;
@@ -1626,7 +1721,7 @@ static int __bpf_emit_atomic64(struct bpf_test *self, void *arg,
 		res = src;
 		break;
 	default:
-		__bpf_alu_result(&res, dst, src, BPF_OP(op));
+		__bpf_alu_result(&res, dst, src, BPF_OP(op), 0);
 	}
 
 	keep = 0x0123456789abcdefULL;
@@ -1673,7 +1768,7 @@ static int __bpf_emit_atomic32(struct bpf_test *self, void *arg,
 		res = src;
 		break;
 	default:
-		__bpf_alu_result(&res, (u32)dst, (u32)src, BPF_OP(op));
+		__bpf_alu_result(&res, (u32)dst, (u32)src, BPF_OP(op), 0);
 	}
 
 	keep = 0x0123456789abcdefULL;
@@ -1939,7 +2034,7 @@ static int __bpf_fill_atomic_reg_pairs(struct bpf_test *self, u8 width, u8 op)
 		res = mem;
 		break;
 	default:
-		__bpf_alu_result(&res, mem, upd, BPF_OP(op));
+		__bpf_alu_result(&res, mem, upd, BPF_OP(op), 0);
 	}
 
 	/* Test all operand registers */
@@ -12354,6 +12449,22 @@ static struct bpf_test tests[] = {
 		{ { 0, 1 } },
 		.fill_helper = bpf_fill_alu64_mod_imm_regs,
 	},
+	{
+		"ALU64_SDIV_K: registers",
+		{ },
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+		.fill_helper = bpf_fill_alu64_sdiv_imm_regs,
+	},
+	{
+		"ALU64_SMOD_K: registers",
+		{ },
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+		.fill_helper = bpf_fill_alu64_smod_imm_regs,
+	},
 	/* ALU32 K registers */
 	{
 		"ALU32_MOV_K: registers",
@@ -12451,6 +12562,22 @@ static struct bpf_test tests[] = {
 		{ { 0, 1 } },
 		.fill_helper = bpf_fill_alu32_mod_imm_regs,
 	},
+	{
+		"ALU32_SDIV_K: registers",
+		{ },
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+		.fill_helper = bpf_fill_alu32_sdiv_imm_regs,
+	},
+	{
+		"ALU32_SMOD_K: registers",
+		{ },
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+		.fill_helper = bpf_fill_alu32_smod_imm_regs,
+	},
 	/* ALU64 X register combinations */
 	{
 		"ALU64_MOV_X: register combinations",
@@ -12881,6 +13008,24 @@ static struct bpf_test tests[] = {
 		.fill_helper = bpf_fill_alu64_mod_imm,
 		.nr_testruns = NR_PATTERN_RUNS,
 	},
+	{
+		"ALU64_SDIV_K: all immediate value magnitudes",
+		{ },
+		INTERNAL | FLAG_NO_DATA,
+		{ },
+		{ { 0, 1 } },
+		.fill_helper = bpf_fill_alu64_sdiv_imm,
+		.nr_testruns = NR_PATTERN_RUNS,
+	},
+	{
+		"ALU64_SMOD_K: all immediate value magnitudes",
+		{ },
+		INTERNAL | FLAG_NO_DATA,
+		{ },
+		{ { 0, 1 } },
+		.fill_helper = bpf_fill_alu64_smod_imm,
+		.nr_testruns = NR_PATTERN_RUNS,
+	},
 	/* ALU32 immediate magnitudes */
 	{
 		"ALU32_MOV_K: all immediate value magnitudes",
@@ -12963,6 +13108,24 @@ static struct bpf_test tests[] = {
 		.fill_helper = bpf_fill_alu32_mod_imm,
 		.nr_testruns = NR_PATTERN_RUNS,
 	},
+	{
+		"ALU32_SDIV_K: all immediate value magnitudes",
+		{ },
+		INTERNAL | FLAG_NO_DATA,
+		{ },
+		{ { 0, 1 } },
+		.fill_helper = bpf_fill_alu32_sdiv_imm,
+		.nr_testruns = NR_PATTERN_RUNS,
+	},
+	{
+		"ALU32_SMOD_K: all immediate value magnitudes",
+		{ },
+		INTERNAL | FLAG_NO_DATA,
+		{ },
+		{ { 0, 1 } },
+		.fill_helper = bpf_fill_alu32_smod_imm,
+		.nr_testruns = NR_PATTERN_RUNS,
+	},
 	/* ALU64 register magnitudes */
 	{
 		"ALU64_MOV_X: all register value magnitudes",
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 0b33559a910b..b767a38a74f9 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -679,6 +679,78 @@ out:
 	return err;
 }
 
+/* Walk the whole table via rhashtable_next_key(), then probe a missing key. */
+static int __init test_rhashtable_next_key(void)
+{
+	struct rhashtable_params params = test_rht_params;
+	struct test_obj_val key_missing = { .id = 99999, .tid = 0 };
+	struct test_obj_val *prev_key = NULL;
+	struct rhashtable ht;
+	struct test_obj *objs, *cur;
+	int i, count = 0, err;
+	int visited_keys[8] = { 0 };
+	const int n = ARRAY_SIZE(visited_keys);
+
+	params.nelem_hint = n;
+	err = rhashtable_init(&ht, &params);
+	if (err)
+		return err;
+
+	objs = kcalloc(n, sizeof(*objs), GFP_KERNEL);
+	if (!objs) {
+		rhashtable_destroy(&ht);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < n; i++) {
+		objs[i].value.id = i;
+		err = rhashtable_insert_fast(&ht, &objs[i].node, params);
+		if (err)
+			goto out;
+	}
+
+	rcu_read_lock();
+	/* NULL prev_key: walk from the beginning, expect all n elements. */
+	while ((cur = rhashtable_next_key(&ht, prev_key))) {
+		/* An id outside [0, n) would index past visited_keys[]. */
+		if (IS_ERR(cur) || cur->value.id < 0 || cur->value.id >= n) {
+			err = -EINVAL;
+			goto unlock;
+		}
+		count++;
+		prev_key = &cur->value;
+		visited_keys[cur->value.id] = 1;
+		if (count > n)
+			break;
+	}
+
+	if (count != n) {
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	for (i = 0; i < n; i++) {
+		if (!visited_keys[i]) {
+			err = -EINVAL;
+			goto unlock;
+		}
+	}
+
+	/* Non-existing prev_key: must return ERR_PTR(-ENOENT). */
+	cur = rhashtable_next_key(&ht, &key_missing);
+	if (!IS_ERR(cur) || PTR_ERR(cur) != -ENOENT)
+		err = -EINVAL;
+
+unlock:
+	rcu_read_unlock();
+out:
+	for (i = 0; i < n; i++)
+		rhashtable_remove_fast(&ht, &objs[i].node, params);
+	kfree(objs);
+	rhashtable_destroy(&ht);
+	return err;
+}
+
 static int __init test_rht_init(void)
 {
 	unsigned int entries;
@@ -738,6 +810,9 @@ static int __init test_rht_init(void)
 
 	test_insert_duplicates_run();
 
+	pr_info("Testing rhashtable_next_key: %s\n",
+		test_rhashtable_next_key() == 0 ? "pass" : "FAIL");
+
 	if (!tcount)
 		return 0;
 
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index ae5a54c350b9..191a6b3ee254 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -132,7 +132,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops;
 	const struct btf_type *func_proto;
 	struct bpf_dummy_ops_test_args *args;
-	struct bpf_tramp_links *tlinks = NULL;
+	struct bpf_tramp_nodes *tnodes = NULL;
 	struct bpf_tramp_link *link = NULL;
 	void *image = NULL;
 	unsigned int op_idx;
@@ -158,8 +158,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	if (err)
 		goto out;
 
-	tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX);
-	if (!tlinks) {
+	tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX);
+	if (!tnodes) {
 		err = -ENOMEM;
 		goto out;
 	}
@@ -171,11 +171,11 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	}
 	/* prog doesn't take the ownership of the reference from caller */
 	bpf_prog_inc(prog);
-	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog,
-		      prog->expected_attach_type);
+	bpf_tramp_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops,
+			    prog, prog->expected_attach_type, 0);
 
 	op_idx = prog->expected_attach_type;
-	err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+	err = bpf_struct_ops_prepare_trampoline(tnodes, &link->node,
 						&st_ops->func_models[op_idx],
 						&dummy_ops_test_ret_function,
 						&image, &image_off,
@@ -198,7 +198,7 @@ out:
 	bpf_struct_ops_image_free(image);
 	if (link)
 		bpf_link_put(&link->link);
-	kfree(tlinks);
+	kfree(tnodes);
 	return err;
 }
 
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index dbf0d8eae8d8..7fdee8f52ee2 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -702,6 +702,9 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
 	case BPF_TRACE_FSESSION:
+	case BPF_TRACE_FENTRY_MULTI:
+	case BPF_TRACE_FEXIT_MULTI:
+	case BPF_TRACE_FSESSION_MULTI:
 		if (bpf_fentry_test1(1) != 2 ||
 		    bpf_fentry_test2(2, 3) != 5 ||
 		    bpf_fentry_test3(4, 5, 6) != 15 ||
@@ -747,14 +750,35 @@ static void
 __bpf_prog_test_run_raw_tp(void *data)
 {
 	struct bpf_raw_tp_test_run_info *info = data;
+	struct srcu_ctr __percpu *scp = NULL;
 	struct bpf_trace_run_ctx run_ctx = {};
 	struct bpf_run_ctx *old_run_ctx;
 
 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 
-	rcu_read_lock();
+	if (info->prog->sleepable) {
+		scp = rcu_read_lock_tasks_trace();
+		migrate_disable();
+	} else {
+		rcu_read_lock();
+	}
+
+	if (unlikely(!bpf_prog_get_recursion_context(info->prog))) {
+		bpf_prog_inc_misses_counter(info->prog);
+		goto out;
+	}
+
 	info->retval = bpf_prog_run(info->prog, info->ctx);
-	rcu_read_unlock();
+
+out:
+	bpf_prog_put_recursion_context(info->prog);
+
+	if (info->prog->sleepable) {
+		migrate_enable();
+		rcu_read_unlock_tasks_trace(scp);
+	} else {
+		rcu_read_unlock();
+	}
 
 	bpf_reset_run_ctx(old_run_ctx);
 }
@@ -782,6 +806,13 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
 	if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0)
 		return -EINVAL;
 
+	/*
+	 * Sleepable programs cannot run with preemption disabled or in
+	 * hardirq context (smp_call_function_single), reject the flag.
+	 */
+	if (prog->sleepable && (kattr->test.flags & BPF_F_TEST_RUN_ON_CPU))
+		return -EINVAL;
+
 	if (ctx_size_in) {
 		info.ctx = memdup_user(ctx_in, ctx_size_in);
 		if (IS_ERR(info.ctx))
@@ -790,24 +821,31 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
 		info.ctx = NULL;
 	}
 
+	info.retval = 0;
 	info.prog = prog;
 
-	current_cpu = get_cpu();
-	if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 ||
-	    cpu == current_cpu) {
+	if (prog->sleepable) {
 		__bpf_prog_test_run_raw_tp(&info);
-	} else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
-		/* smp_call_function_single() also checks cpu_online()
-		 * after csd_lock(). However, since cpu is from user
-		 * space, let's do an extra quick check to filter out
-		 * invalid value before smp_call_function_single().
-		 */
-		err = -ENXIO;
 	} else {
-		err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp,
-					       &info, 1);
+		current_cpu = get_cpu();
+		if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 ||
+		    cpu == current_cpu) {
+			__bpf_prog_test_run_raw_tp(&info);
+		} else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
+			/*
+			 * smp_call_function_single() also checks cpu_online()
+			 * after csd_lock(). However, since cpu is from user
+			 * space, let's do an extra quick check to filter out
+			 * invalid value before smp_call_function_single().
+			 */
+			err = -ENXIO;
+		} else {
+			err = smp_call_function_single(cpu,
+						       __bpf_prog_test_run_raw_tp,
+						       &info, 1);
+		}
+		put_cpu();
 	}
-	put_cpu();
 
 	if (!err &&
 	    copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32)))
diff --git a/net/core/filter.c b/net/core/filter.c
index 40037413dd4e..2e96b4b847ce 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2654,6 +2654,37 @@ static void sk_msg_reset_curr(struct sk_msg *msg)
 	}
 }
 
+static bool sk_msg_elem_is_copy(const struct sk_msg *msg, u32 i)
+{
+	return test_bit(i, msg->sg.copy);
+}
+
+static void sk_msg_clear_elem_copy(struct sk_msg *msg, u32 i)
+{
+	__clear_bit(i, msg->sg.copy);
+}
+
+static void sk_msg_set_elem_copy(struct sk_msg *msg, u32 i, bool sg_copy)
+{
+	__assign_bit(i, msg->sg.copy, sg_copy);
+}
+
+static void sk_msg_clear_copy_range(struct sk_msg *msg, u32 start, u32 end)
+{
+	while (start != end) {
+		sk_msg_clear_elem_copy(msg, start);
+		sk_msg_iter_var_next(start);
+	}
+}
+
+/* Copy scatterlist slot @src into slot @dst, including its copy flag. */
+static void sk_msg_sg_move(struct sk_msg *msg, u32 dst, u32 src)
+{
+	msg->sg.data[dst] = msg->sg.data[src];
+
+	sk_msg_set_elem_copy(msg, dst, sk_msg_elem_is_copy(msg, src));
+}
+
 static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
 	.func           = bpf_msg_cork_bytes,
 	.gpl_only       = false,
@@ -2692,7 +2723,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
 	 * account for the headroom.
 	 */
 	bytes_sg_total = start - offset + bytes;
-	if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
+	if (!sk_msg_elem_is_copy(msg, i) && bytes_sg_total <= len)
 		goto out;
 
 	/* At this point we need to linearize multiple scatterlist
@@ -2733,13 +2764,13 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
 		poffset += len;
 		sge->length = 0;
 		put_page(sg_page(sge));
-		__clear_bit(i, msg->sg.copy);
+		sk_msg_clear_elem_copy(msg, i);
 
 		sk_msg_iter_var_next(i);
 	} while (i != last_sge);
 
 	sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
-	__clear_bit(first_sge, msg->sg.copy);
+	sk_msg_clear_elem_copy(msg, first_sge);
 
 	/* To repair sg ring we need to shift entries. If we only
 	 * had a single entry though we can just replace it and
@@ -2749,8 +2780,14 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
 	shift = last_sge > first_sge ?
 		last_sge - first_sge - 1 :
 		NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
-	if (!shift)
+	if (!shift) {
+		sk_msg_clear_elem_copy(msg, msg->sg.end);
 		goto out;
+	}
+
+	i = first_sge;
+	sk_msg_iter_var_next(i);
+	sk_msg_clear_copy_range(msg, i, last_sge);
 
 	i = first_sge;
 	sk_msg_iter_var_next(i);
@@ -2764,18 +2801,18 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
 		if (move_from == msg->sg.end)
 			break;
 
-		msg->sg.data[i] = msg->sg.data[move_from];
-		sk_msg_sg_copy_assign(msg, i, msg, move_from);
+		sk_msg_sg_move(msg, i, move_from);
 		msg->sg.data[move_from].length = 0;
 		msg->sg.data[move_from].page_link = 0;
 		msg->sg.data[move_from].offset = 0;
-		__clear_bit(move_from, msg->sg.copy);
+		sk_msg_clear_elem_copy(msg, move_from);
 		sk_msg_iter_var_next(i);
 	} while (1);
 
 	msg->sg.end = msg->sg.end - shift > msg->sg.end ?
 		      msg->sg.end - shift + NR_MSG_FRAG_IDS :
 		      msg->sg.end - shift;
+	sk_msg_clear_elem_copy(msg, msg->sg.end);
 out:
 	sk_msg_reset_curr(msg);
 	msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
@@ -2796,9 +2833,10 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = {
 BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
 	   u32, len, u64, flags)
 {
+	bool sge_copy = false, nsge_copy = false, nnsge_copy = false;
 	struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
 	u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
-	bool sge_copy, nsge_copy, nnsge_copy, rsge_copy = false;
+	bool rsge_copy = false;
 	u8 *raw, *to, *from;
 	struct page *page;
 
@@ -2834,6 +2872,9 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
 	if (!space || (space == 1 && start != offset))
 		copy = msg->sg.data[i].length;
 
+	if (unlikely(copy + len < copy))
+		return -EINVAL;
+
 	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
 			   get_order(copy + len));
 	if (unlikely(!page))
@@ -2871,7 +2912,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
 			sk_msg_iter_var_prev(i);
 		psge = sk_msg_elem(msg, i);
 		rsge = sk_msg_elem_cpy(msg, i);
-		rsge_copy = test_bit(i, msg->sg.copy);
+		rsge_copy = sk_msg_elem_is_copy(msg, i);
 
 		psge->length = start - offset;
 		rsge.length -= psge->length;
@@ -2896,21 +2937,21 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
 
 	/* Shift one or two slots as needed */
 	sge = sk_msg_elem_cpy(msg, new);
-	sge_copy = test_bit(new, msg->sg.copy);
 	sg_unmark_end(&sge);
+	sge_copy = sk_msg_elem_is_copy(msg, new);
 
 	nsge = sk_msg_elem_cpy(msg, i);
-	nsge_copy = test_bit(i, msg->sg.copy);
+	nsge_copy = sk_msg_elem_is_copy(msg, i);
 	if (rsge.length) {
 		sk_msg_iter_var_next(i);
 		nnsge = sk_msg_elem_cpy(msg, i);
-		nnsge_copy = test_bit(i, msg->sg.copy);
+		nnsge_copy = sk_msg_elem_is_copy(msg, i);
 		sk_msg_iter_next(msg, end);
 	}
 
 	while (i != msg->sg.end) {
 		msg->sg.data[i] = sge;
-		__assign_bit(i, msg->sg.copy, sge_copy);
+		sk_msg_set_elem_copy(msg, i, sge_copy);
 		sge = nsge;
 		sge_copy = nsge_copy;
 		sk_msg_iter_var_next(i);
@@ -2918,10 +2959,10 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
 			nsge = nnsge;
 			nsge_copy = nnsge_copy;
 			nnsge = sk_msg_elem_cpy(msg, i);
-			nnsge_copy = test_bit(i, msg->sg.copy);
+			nnsge_copy = sk_msg_elem_is_copy(msg, i);
 		} else {
 			nsge = sk_msg_elem_cpy(msg, i);
-			nsge_copy = test_bit(i, msg->sg.copy);
+			nsge_copy = sk_msg_elem_is_copy(msg, i);
 		}
 	}
 
@@ -2929,14 +2970,15 @@ place_new:
 	/* Place newly allocated data buffer */
 	sk_mem_charge(msg->sk, len);
 	msg->sg.size += len;
-	__clear_bit(new, msg->sg.copy);
+	sk_msg_clear_elem_copy(msg, new);
 	sg_set_page(&msg->sg.data[new], page, len + copy, 0);
 	if (rsge.length) {
 		get_page(sg_page(&rsge));
 		sk_msg_iter_var_next(new);
 		msg->sg.data[new] = rsge;
-		__assign_bit(new, msg->sg.copy, rsge_copy);
+		sk_msg_set_elem_copy(msg, new, rsge_copy);
 	}
+	sk_msg_clear_elem_copy(msg, msg->sg.end);
 
 	sk_msg_reset_curr(msg);
 	sk_msg_compute_data_pointers(msg);
@@ -2962,12 +3004,11 @@ static void sk_msg_shift_left(struct sk_msg *msg, int i)
 	do {
 		prev = i;
 		sk_msg_iter_var_next(i);
-		msg->sg.data[prev] = msg->sg.data[i];
-		sk_msg_sg_copy_assign(msg, prev, msg, i);
+		sk_msg_sg_move(msg, prev, i);
 	} while (i != msg->sg.end);
 
 	sk_msg_iter_prev(msg, end);
-	__clear_bit(msg->sg.end, msg->sg.copy);
+	sk_msg_clear_elem_copy(msg, msg->sg.end);
 }
 
 static void sk_msg_shift_right(struct sk_msg *msg, int i)
@@ -2977,28 +3018,29 @@ static void sk_msg_shift_right(struct sk_msg *msg, int i)
 
 	sk_msg_iter_next(msg, end);
 	sge = sk_msg_elem_cpy(msg, i);
-	sge_copy = test_bit(i, msg->sg.copy);
+	sge_copy = sk_msg_elem_is_copy(msg, i);
 	sk_msg_iter_var_next(i);
 	tmp = sk_msg_elem_cpy(msg, i);
-	tmp_copy = test_bit(i, msg->sg.copy);
+	tmp_copy = sk_msg_elem_is_copy(msg, i);
 
 	while (i != msg->sg.end) {
 		msg->sg.data[i] = sge;
-		__assign_bit(i, msg->sg.copy, sge_copy);
+		sk_msg_set_elem_copy(msg, i, sge_copy);
 		sk_msg_iter_var_next(i);
 		sge = tmp;
 		sge_copy = tmp_copy;
 		tmp = sk_msg_elem_cpy(msg, i);
-		tmp_copy = test_bit(i, msg->sg.copy);
+		tmp_copy = sk_msg_elem_is_copy(msg, i);
 	}
+	sk_msg_clear_elem_copy(msg, msg->sg.end);
 }
 
 BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
 	   u32, len, u64, flags)
 {
 	u32 i = 0, l = 0, space, offset = 0;
-	u64 last = start + len;
-	int pop;
+	u64 last = (u64)start + len;
+	u32 pop;
 
 	if (unlikely(flags))
 		return -EINVAL;
@@ -3047,10 +3089,10 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
 	 */
 	if (start != offset) {
 		struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+		bool sge_copy = sk_msg_elem_is_copy(msg, i);
 		int a = start - offset;
 		int b = sge->length - pop - a;
-		u32 sge_i = i;
-		bool sge_copy = test_bit(i, msg->sg.copy);
+		u32 sge_idx = i;
 
 		sk_msg_iter_var_next(i);
 
@@ -3063,7 +3105,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
 				sg_set_page(nsge,
 					    sg_page(sge),
 					    b, sge->offset + pop + a);
-				__assign_bit(i, msg->sg.copy, sge_copy);
+				sk_msg_set_elem_copy(msg, i, sge_copy);
 			} else {
 				struct page *page, *orig;
 				u8 *to, *from;
@@ -3080,7 +3122,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
 				memcpy(to, from, a);
 				memcpy(to + a, from + a + pop, b);
 				sg_set_page(sge, page, a + b, 0);
-				__clear_bit(sge_i, msg->sg.copy);
+				sk_msg_clear_elem_copy(msg, sge_idx);
 				put_page(orig);
 			}
 			pop = 0;
@@ -5571,11 +5613,24 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
 				 KERNEL_SOCKPTR(optval), *optlen);
 }
 
+static bool sk_allows_sol_ip_sockopt(struct sock *sk)
+{
+	switch (sk->sk_family) {
+	case AF_INET:
+		return true;
+	case AF_INET6:
+		/* Allow getting/setting sockopt for possible ipv4-mapped ipv6 socket. */
+		return sk->sk_type != SOCK_RAW && !ipv6_only_sock(sk);
+	default:
+		return false;
+	}
+}
+
 static int sol_ip_sockopt(struct sock *sk, int optname,
 			  char *optval, int *optlen,
 			  bool getopt)
 {
-	if (sk->sk_family != AF_INET)
+	if (!sk_allows_sol_ip_sockopt(sk))
 		return -EINVAL;
 
 	switch (optname) {
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index f71ef82a5f3d..bf588f508b79 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -599,6 +599,7 @@ static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
 
 int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
 {
+	bool is_udp_tunnel;
 	struct iphdr *iph;
 	bool ipv4;
 	int err;
@@ -612,10 +613,16 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
 		ipv4 = true;
 		if (unlikely(len < iph->ihl * 4))
 			return -EINVAL;
+		is_udp_tunnel = iph->protocol == IPPROTO_UDP;
+		if (unlikely(is_udp_tunnel && len < iph->ihl * 4 + sizeof(struct udphdr)))
+			return -EINVAL;
 	} else if (iph->version == 6) {
 		ipv4 = false;
 		if (unlikely(len < sizeof(struct ipv6hdr)))
 			return -EINVAL;
+		is_udp_tunnel = ((struct ipv6hdr *)iph)->nexthdr == NEXTHDR_UDP;
+		if (unlikely(is_udp_tunnel && len < sizeof(struct ipv6hdr) + sizeof(struct udphdr)))
+			return -EINVAL;
 	} else {
 		return -EINVAL;
 	}
@@ -637,6 +644,11 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
 	if (ingress)
 		skb_postpush_rcsum(skb, iph, len);
 	skb_reset_network_header(skb);
+	if (is_udp_tunnel) {
+		size_t iph_sz = ipv4 ? iph->ihl * 4 : sizeof(struct ipv6hdr);
+
+		skb_set_transport_header(skb, skb_network_offset(skb) + iph_sz);
+	}
 	memcpy(skb_network_header(skb), hdr, len);
 	bpf_compute_data_pointers(skb);
 	skb_clear_hash(skb);
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 9f33b07b1481..ad57c4c9eaab 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -50,7 +50,9 @@ static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 	ret = udp_msg_has_data(sk, psock);
 	if (!ret) {
+		release_sock(sk);
 		wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		lock_sock(sk);
 		ret = udp_msg_has_data(sk, psock);
 	}
 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
@@ -79,6 +81,7 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		goto out;
 	}
 
+	lock_sock(sk);
 msg_bytes_ready:
 	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
 	if (!copied) {
@@ -90,11 +93,17 @@ msg_bytes_ready:
 		if (data) {
 			if (psock_has_data(psock))
 				goto msg_bytes_ready;
+
+			release_sock(sk);
+
 			ret = sk_udp_recvmsg(sk, msg, len, flags);
 			goto out;
 		}
 		copied = -EAGAIN;
 	}
+
+	release_sock(sk);
+
 	ret = copied;
 out:
 	sk_psock_put(sk, psock);
diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile
index fd2585af1252..9c19e81f3c27 100644
--- a/tools/bpf/Makefile
+++ b/tools/bpf/Makefile
@@ -11,6 +11,7 @@ INSTALL ?= install
 CFLAGS += -Wall -O2
 CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi \
 	  -I$(srctree)/tools/include
+CFLAGS += $(EXTRA_CFLAGS)
 
 # This will work when bpf is built in tools env. where srctree
 # isn't set and when invoked from selftests build, where srctree
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 1af3305ea2b2..5daf3de5c744 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -56,7 +56,7 @@ MAP COMMANDS
 |     | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
 |     | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage**
 |     | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena**
-|     | **insn_array** }
+|     | **insn_array** | **rhash** }
 
 DESCRIPTION
 ===========
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 0febf60e1b64..271a7dc77273 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -47,7 +47,8 @@ $(LIBBPF_INTERNAL_HDRS): $(LIBBPF_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_HDRS_
 $(LIBBPF_BOOTSTRAP): $(wildcard $(BPF_DIR)/*.[ch] $(BPF_DIR)/Makefile) | $(LIBBPF_BOOTSTRAP_OUTPUT)
 	$(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) \
 		DESTDIR=$(LIBBPF_BOOTSTRAP_DESTDIR:/=) prefix= \
-		ARCH= CROSS_COMPILE= CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" $@ install_headers
+		ARCH= CROSS_COMPILE= CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" \
+		CFLAGS="$(LIBBPF_BOOTSTRAP_CFLAGS)" EXTRA_CFLAGS= $@ install_headers
 
 $(LIBBPF_BOOTSTRAP_INTERNAL_HDRS): $(LIBBPF_BOOTSTRAP_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_BOOTSTRAP_HDRS_DIR)
 	$(call QUIET_INSTALL, $@)
@@ -81,6 +82,13 @@ CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \
 ifneq ($(BPFTOOL_VERSION),)
 CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"'
 endif
+
+# This must be done before appending EXTRA_CFLAGS to CFLAGS to avoid
+# including flags that are not applicable to the host compiler.
+HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\
+		$(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS)))
+HOST_CFLAGS += $(HOST_EXTRACFLAGS)
+
 ifneq ($(EXTRA_CFLAGS),)
 CFLAGS += $(EXTRA_CFLAGS)
 endif
@@ -88,10 +96,11 @@ ifneq ($(EXTRA_LDFLAGS),)
 LDFLAGS += $(EXTRA_LDFLAGS)
 endif
 
-HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\
-		$(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS)))
 HOST_LDFLAGS := $(LDFLAGS)
 
+# Remove warnings for libbpf bootstrap build
+LIBBPF_BOOTSTRAP_CFLAGS := $(filter-out -W -Wall -Wextra -Wformat -Wformat-signedness,$(HOST_CFLAGS))
+
 INSTALL ?= install
 RM ?= rm -f
 
@@ -106,6 +115,10 @@ ifneq ($(SKIP_CRYPTO),1)
   CRYPTO_LIBS := -lcrypto
 endif
 
+ifeq ($(MAKECMDGOALS),bootstrap)
+FEATURE_TESTS := libelf-zstd
+FEATURE_DISPLAY :=
+else
 FEATURE_TESTS := clang-bpf-co-re
 FEATURE_TESTS += llvm
 FEATURE_TESTS += libcap
@@ -122,6 +135,7 @@ FEATURE_DISPLAY += libcap
 FEATURE_DISPLAY += libbfd
 FEATURE_DISPLAY += libbfd-liberty
 FEATURE_DISPLAY += libbfd-liberty-z
+endif
 
 check_feat := 1
 NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall
diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 2e899e940034..6ef908adf3a4 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -179,8 +179,7 @@ static int dump_btf_type(const struct btf *btf, __u32 id,
 	case BTF_KIND_STRUCT:
 	case BTF_KIND_UNION: {
 		const struct btf_member *m = (const void *)(t + 1);
-		__u16 vlen = BTF_INFO_VLEN(t->info);
-		int i;
+		__u32 i, vlen = BTF_INFO_VLEN(t->info);
 
 		if (json_output) {
 			jsonw_uint_field(w, "size", t->size);
@@ -225,9 +224,8 @@ static int dump_btf_type(const struct btf *btf, __u32 id,
 	}
 	case BTF_KIND_ENUM: {
 		const struct btf_enum *v = (const void *)(t + 1);
-		__u16 vlen = BTF_INFO_VLEN(t->info);
+		__u32 i, vlen = BTF_INFO_VLEN(t->info);
 		const char *encoding;
-		int i;
 
 		encoding = btf_kflag(t) ? "SIGNED" : "UNSIGNED";
 		if (json_output) {
@@ -263,9 +261,8 @@ static int dump_btf_type(const struct btf *btf, __u32 id,
 	}
 	case BTF_KIND_ENUM64: {
 		const struct btf_enum64 *v = btf_enum64(t);
-		__u16 vlen = btf_vlen(t);
+		__u32 i, vlen = btf_vlen(t);
 		const char *encoding;
-		int i;
 
 		encoding = btf_kflag(t) ? "SIGNED" : "UNSIGNED";
 		if (json_output) {
@@ -325,8 +322,7 @@ static int dump_btf_type(const struct btf *btf, __u32 id,
 	}
 	case BTF_KIND_FUNC_PROTO: {
 		const struct btf_param *p = (const void *)(t + 1);
-		__u16 vlen = BTF_INFO_VLEN(t->info);
-		int i;
+		__u32 i, vlen = BTF_INFO_VLEN(t->info);
 
 		if (json_output) {
 			jsonw_uint_field(w, "ret_type_id", t->type);
@@ -369,8 +365,7 @@ static int dump_btf_type(const struct btf *btf, __u32 id,
 	case BTF_KIND_DATASEC: {
 		const struct btf_var_secinfo *v = (const void *)(t + 1);
 		const struct btf_type *vt;
-		__u16 vlen = BTF_INFO_VLEN(t->info);
-		int i;
+		__u32 i, vlen = BTF_INFO_VLEN(t->info);
 
 		if (json_output) {
 			jsonw_uint_field(w, "size", t->size);
@@ -675,7 +670,7 @@ static __u64 btf_name_hasher(__u64 hash, const struct btf *btf, __u32 name_off)
 static __u64 btf_type_disambig_hash(const struct btf *btf, __u32 id, bool include_members)
 {
 	const struct btf_type *t = btf__type_by_id(btf, id);
-	int i;
+	__u32 i;
 	size_t hash = 0;
 
 	hash = btf_name_hasher(hash, btf, t->name_off);
diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c
index def297e879f4..9dc8425b1789 100644
--- a/tools/bpf/bpftool/btf_dumper.c
+++ b/tools/bpf/bpftool/btf_dumper.c
@@ -150,7 +150,7 @@ static int btf_dumper_enum(const struct btf_dumper *d,
 {
 	const struct btf_enum *enums = btf_enum(t);
 	__s64 value;
-	__u16 i;
+	__u32 i;
 
 	switch (t->size) {
 	case 8:
@@ -189,7 +189,7 @@ static int btf_dumper_enum64(const struct btf_dumper *d,
 	const struct btf_enum64 *enums = btf_enum64(t);
 	__u32 val_lo32, val_hi32;
 	__u64 value;
-	__u16 i;
+	__u32 i;
 
 	value = *(__u64 *)data;
 	val_lo32 = (__u32)value;
diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index 2f9e10752e28..6ae7262ebe0c 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -1399,7 +1399,7 @@ static int do_skeleton(int argc, char **argv)
 				continue;
 
 			if (use_loader)
-				printf("t\tint %s_fd;\n", ident);
+				printf("\t\tint %s_fd;\n", ident);
 			else
 				printf("\t\tstruct bpf_link *%s;\n", ident);
 		}
@@ -2094,7 +2094,8 @@ btfgen_mark_type(struct btfgen_info *info, unsigned int type_id, bool follow_poi
 	struct btf_type *cloned_type;
 	struct btf_param *param;
 	struct btf_array *array;
-	int err, i;
+	__u32 i;
+	int err;
 
 	if (type_id == 0)
 		return 0;
@@ -2229,7 +2230,8 @@ static int btfgen_mark_type_match(struct btfgen_info *info, __u32 type_id, bool
 	const struct btf_type *btf_type;
 	struct btf *btf = info->src_btf;
 	struct btf_type *cloned_type;
-	int i, err;
+	int err;
+	__u32 i;
 
 	if (type_id == 0)
 		return 0;
@@ -2249,7 +2251,7 @@ static int btfgen_mark_type_match(struct btfgen_info *info, __u32 type_id, bool
 	case BTF_KIND_STRUCT:
 	case BTF_KIND_UNION: {
 		struct btf_member *m = btf_members(btf_type);
-		__u16 vlen = btf_vlen(btf_type);
+		__u32 vlen = btf_vlen(btf_type);
 
 		if (behind_ptr)
 			break;
@@ -2286,7 +2288,7 @@ static int btfgen_mark_type_match(struct btfgen_info *info, __u32 type_id, bool
 		break;
 	}
 	case BTF_KIND_FUNC_PROTO: {
-		__u16 vlen = btf_vlen(btf_type);
+		__u32 vlen = btf_vlen(btf_type);
 		struct btf_param *param;
 
 		/* mark ret type */
@@ -2492,8 +2494,9 @@ static struct btf *btfgen_get_btf(struct btfgen_info *info)
 {
 	struct btf *btf_new = NULL;
 	unsigned int *ids = NULL;
-	unsigned int i, n = btf__type_cnt(info->marked_btf);
+	unsigned int n = btf__type_cnt(info->marked_btf);
 	int err = 0;
+	__u32 i;
 
 	btf_new = btf__new_empty();
 	if (!btf_new) {
@@ -2523,8 +2526,7 @@ static struct btf *btfgen_get_btf(struct btfgen_info *info)
 		/* add members for struct and union */
 		if (btf_is_composite(type)) {
 			struct btf_member *cloned_m, *m;
-			unsigned short vlen;
-			int idx_src;
+			__u32 vlen, idx_src;
 
 			name = btf__str_by_offset(info->src_btf, type->name_off);
 
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 7ebf7dbcfba4..71a45d96617e 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -1478,7 +1478,7 @@ static int do_help(int argc, char **argv)
 		"                 cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
 		"                 queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n"
 		"                 task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena |\n"
-		"                 insn_array }\n"
+		"                 insn_array | rhash }\n"
 		"       " HELP_SPEC_OPTIONS " |\n"
 		"                    {-f|--bpffs} | {-n|--nomount} }\n"
 		"",
diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
index 974189da8a91..dba28755d284 100644
--- a/tools/bpf/bpftool/net.c
+++ b/tools/bpf/bpftool/net.c
@@ -603,14 +603,14 @@ static int query_flow_dissector(struct bpf_attach_info *attach_info)
 			     &attach_flags, prog_ids, &prog_cnt);
 	close(fd);
 	if (err) {
-		if (errno == EINVAL) {
+		if (err == -EINVAL) {
 			/* Older kernel's don't support querying
 			 * flow dissector programs.
 			 */
 			errno = 0;
 			return 0;
 		}
-		p_err("can't query prog: %s", strerror(errno));
+		p_err("can't query prog: %s", strerror(-err));
 		return -1;
 	}
 
diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c
index e7a405f83af6..89d59674f39b 100644
--- a/tools/build/feature/test-bpf.c
+++ b/tools/build/feature/test-bpf.c
@@ -20,6 +20,8 @@
 #  define __NR_bpf 6319
 # elif defined(__mips__) && defined(_ABI64)
 #  define __NR_bpf 5315
+# elif defined(__loongarch__)
+#  define __NR_bpf 280
 # else
 #  error __NR_bpf not defined. libbpf does not support your arch.
 # endif
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 677be9a47347..89b36de5fdbb 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -994,6 +994,7 @@ enum bpf_cmd {
 	BPF_PROG_STREAM_READ_BY_FD,
 	BPF_PROG_ASSOC_STRUCT_OPS,
 	__MAX_BPF_CMD,
+	BPF_COMMON_ATTRS = 1 << 16, /* Indicate carrying syscall common attrs. */
 };
 
 enum bpf_map_type {
@@ -1046,6 +1047,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_CGRP_STORAGE,
 	BPF_MAP_TYPE_ARENA,
 	BPF_MAP_TYPE_INSN_ARRAY,
+	BPF_MAP_TYPE_RHASH,
 	__MAX_BPF_MAP_TYPE
 };
 
@@ -1154,6 +1156,9 @@ enum bpf_attach_type {
 	BPF_TRACE_KPROBE_SESSION,
 	BPF_TRACE_UPROBE_SESSION,
 	BPF_TRACE_FSESSION,
+	BPF_TRACE_FENTRY_MULTI,
+	BPF_TRACE_FEXIT_MULTI,
+	BPF_TRACE_FSESSION_MULTI,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1178,6 +1183,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
 	BPF_LINK_TYPE_NETKIT = 13,
 	BPF_LINK_TYPE_SOCKMAP = 14,
+	BPF_LINK_TYPE_TRACING_MULTI = 15,
 	__MAX_BPF_LINK_TYPE,
 };
 
@@ -1321,7 +1327,11 @@ enum {
  * BPF_TRACE_UPROBE_MULTI attach type to create return probe.
  */
 enum {
-	BPF_F_UPROBE_MULTI_RETURN = (1U << 0)
+	/* Get return uprobe. */
+	BPF_F_UPROBE_MULTI_RETURN     = (1U << 0),
+
+	/* Get path from provided path_fd. */
+	BPF_F_UPROBE_MULTI_PATH_FD    = (1U << 1),
 };
 
 /* link_create.netfilter.flags used in LINK_CREATE command for
@@ -1500,6 +1510,13 @@ struct bpf_stack_build_id {
 	};
 };
 
+struct bpf_common_attr {
+	__aligned_u64 log_buf;
+	__u32 log_size;
+	__u32 log_level;
+	__u32 log_true_size;
+};
+
 #define BPF_OBJ_NAME_LEN 16U
 
 enum {
@@ -1537,6 +1554,11 @@ union bpf_attr {
 		 *
 		 * BPF_MAP_TYPE_ARENA - contains the address where user space
 		 * is going to mmap() the arena. It has to be page aligned.
+		 *
+		 * BPF_MAP_TYPE_RHASH - initial table size hint
+		 * (nelem_hint). 0 = use rhashtable default. Must be
+		 * <= min(max_entries, U16_MAX). Upper 32 bits reserved,
+		 * must be zero.
 		 */
 		__u64	map_extra;
 
@@ -1846,6 +1868,7 @@ union bpf_attr {
 				__u32		cnt;
 				__u32		flags;
 				__u32		pid;
+				__u32		path_fd;
 			} uprobe_multi;
 			struct {
 				union {
@@ -1861,6 +1884,11 @@ union bpf_attr {
 				};
 				__u64		expected_revision;
 			} cgroup;
+			struct {
+				__aligned_u64	ids;
+				__aligned_u64	cookies;
+				__u32		cnt;
+			} tracing_multi;
 		};
 	} link_create;
 
@@ -6698,6 +6726,7 @@ struct bpf_prog_info {
 	__u32 verified_insns;
 	__u32 attach_btf_obj_id;
 	__u32 attach_btf_id;
+	__u32 :32;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -6719,6 +6748,7 @@ struct bpf_map_info {
 	__u64 map_extra;
 	__aligned_u64 hash;
 	__u32 hash_size;
+	__u32 :32;
 } __attribute__((aligned(8)));
 
 struct bpf_btf_info {
@@ -7236,6 +7266,7 @@ enum {
 	TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
 	SK_BPF_CB_FLAGS		= 1009, /* Get or set sock ops flags in socket */
 	SK_BPF_BYPASS_PROT_MEM	= 1010, /* Get or Set sk->sk_bypass_prot_mem */
+
 };
 
 enum {
diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index 638615ebddc2..618167cab4e6 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -33,20 +33,22 @@ struct btf_header {
 	__u32	layout_len;	/* length of layout section	*/
 };
 
-/* Max # of type identifier */
-#define BTF_MAX_TYPE	0x000fffff
-/* Max offset into the string section */
-#define BTF_MAX_NAME_OFFSET	0x00ffffff
-/* Max # of struct/union/enum members or func args */
-#define BTF_MAX_VLEN	0xffff
+enum btf_max {
+	/* Max possible kind */
+	BTF_MAX_KIND =		0x0000007f,
+	/* Max # of type identifier */
+	BTF_MAX_TYPE =		0x000fffff,
+	/* Max offset into the string section */
+	BTF_MAX_NAME_OFFSET =	0x00ffffff,
+	/* Max # of struct/union/enum members or func args */
+	BTF_MAX_VLEN =		0x00ffffff,
+};
 
 struct btf_type {
 	__u32 name_off;
 	/* "info" bits arrangement
-	 * bits  0-15: vlen (e.g. # of struct's members)
-	 * bits 16-23: unused
-	 * bits 24-28: kind (e.g. int, ptr, array...etc)
-	 * bits 29-30: unused
+	 * bits  0-23: vlen (e.g. # of struct's members)
+	 * bits 24-30: kind (e.g. int, ptr, array...etc)
 	 * bit     31: kind_flag, currently used by
 	 *             struct, union, enum, fwd, enum64,
 	 *             decl_tag and type_tag
@@ -65,8 +67,8 @@ struct btf_type {
 	};
 };
 
-#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x1f)
-#define BTF_INFO_VLEN(info)	((info) & 0xffff)
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x7f)
+#define BTF_INFO_VLEN(info)	((info) & 0xffffff)
 #define BTF_INFO_KFLAG(info)	((info) >> 31)
 
 enum {
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 168140f8e646..eca584fb061e 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -49,6 +49,14 @@ man_dir_SQ = '$(subst ','\'',$(man_dir))'
 export man_dir man_dir_SQ INSTALL
 export DESTDIR DESTDIR_SQ
 
+# Defer assigning EXTRA_CFLAGS to CFLAGS until after including
+# tools/scripts/Makefile.include, as it may add flags to EXTRA_CFLAGS.
+ifdef EXTRA_CFLAGS
+  CFLAGS :=
+else
+  CFLAGS := -g -O2
+endif
+
 include $(srctree)/tools/scripts/Makefile.include
 
 # copy a bit from Linux kbuild
@@ -70,13 +78,6 @@ LIB_TARGET	= libbpf.a libbpf.so.$(LIBBPF_VERSION)
 LIB_FILE	= libbpf.a libbpf.so*
 PC_FILE		= libbpf.pc
 
-# Set compile option CFLAGS
-ifdef EXTRA_CFLAGS
-  CFLAGS := $(EXTRA_CFLAGS)
-else
-  CFLAGS := -g -O2
-endif
-
 # Append required CFLAGS
 override CFLAGS += -std=gnu89
 override CFLAGS += $(EXTRA_WARNINGS) -Wno-switch-enum
@@ -84,7 +85,7 @@ override CFLAGS += -Werror -Wall
 override CFLAGS += $(INCLUDES)
 override CFLAGS += -fvisibility=hidden
 override CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
-override CFLAGS += $(CLANG_CROSS_FLAGS)
+override CFLAGS += $(EXTRA_CFLAGS)
 
 # flags specific for shared library
 SHLIB_FLAGS := -DSHARED -fPIC
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 5846de364209..96819c082c77 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -59,6 +59,8 @@
 #  define __NR_bpf 6319
 # elif defined(__mips__) && defined(_ABI64)
 #  define __NR_bpf 5315
+# elif defined(__loongarch__)
+#  define __NR_bpf 280
 # else
 #  error __NR_bpf not defined. libbpf does not support your arch.
 # endif
@@ -69,6 +71,42 @@ static inline __u64 ptr_to_u64(const void *ptr)
 	return (__u64) (unsigned long) ptr;
 }
 
+/* Raw bpf() syscall wrapper that can also pass a struct bpf_common_attr. */
+static inline int sys_bpf_ext(enum bpf_cmd cmd, union bpf_attr *attr,
+			      unsigned int size, struct bpf_common_attr *attr_common,
+			      unsigned int size_common)
+{
+	if (attr_common)
+		cmd |= BPF_COMMON_ATTRS;
+	else
+		cmd &= ~BPF_COMMON_ATTRS;
+	return syscall(__NR_bpf, cmd, attr, size, attr_common, size_common);
+}
+
+/* sys_bpf_ext() variant whose result is passed through ensure_good_fd(). */
+static inline int sys_bpf_ext_fd(enum bpf_cmd cmd, union bpf_attr *attr,
+				 unsigned int size, struct bpf_common_attr *attr_common,
+				 unsigned int size_common)
+{
+	return ensure_good_fd(sys_bpf_ext(cmd, attr, size, attr_common, size_common));
+}
+
+/* NULL common-attr probe: 1 on EFAULT, 0 on any other error, -EINVAL on success. */
+int probe_sys_bpf_ext(void)
+{
+	const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd);
+	const size_t common_sz = sizeof(struct bpf_common_attr);
+	union bpf_attr attr;
+	int fd;
+
+	memset(&attr, 0, attr_sz);
+	fd = syscall(__NR_bpf, BPF_PROG_LOAD | BPF_COMMON_ATTRS, &attr, attr_sz, NULL, common_sz);
+	if (fd < 0)
+		return errno == EFAULT;
+	close(fd);
+	return -EINVAL;
+}
+
 static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,
 			  unsigned int size)
 {
@@ -173,6 +211,9 @@ int bpf_map_create(enum bpf_map_type map_type,
 		   const struct bpf_map_create_opts *opts)
 {
 	const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size);
+	const size_t attr_common_sz = sizeof(struct bpf_common_attr);
+	struct bpf_common_attr attr_common;
+	struct bpf_log_opts *log_opts;
 	union bpf_attr attr;
 	int fd;
 
@@ -206,7 +247,21 @@ int bpf_map_create(enum bpf_map_type map_type,
 	attr.excl_prog_hash = ptr_to_u64(OPTS_GET(opts, excl_prog_hash, NULL));
 	attr.excl_prog_hash_size = OPTS_GET(opts, excl_prog_hash_size, 0);
 
-	fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
+	log_opts = OPTS_GET(opts, log_opts, NULL);
+	if (!OPTS_VALID(log_opts, bpf_log_opts))
+		return libbpf_err(-EINVAL);
+
+	if (log_opts && feat_supported(NULL, FEAT_BPF_SYSCALL_COMMON_ATTRS)) {
+		memset(&attr_common, 0, attr_common_sz);
+		attr_common.log_buf = ptr_to_u64(OPTS_GET(log_opts, buf, NULL));
+		attr_common.log_size = OPTS_GET(log_opts, size, 0);
+		attr_common.log_level = OPTS_GET(log_opts, level, 0);
+		fd = sys_bpf_ext_fd(BPF_MAP_CREATE, &attr, attr_sz, &attr_common, attr_common_sz);
+		OPTS_SET(log_opts, true_size, attr_common.log_true_size);
+	} else {
+		fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
+		OPTS_SET(log_opts, true_size, 0);
+	}
 	return libbpf_err_errno(fd);
 }
 
@@ -787,9 +842,19 @@ int bpf_link_create(int prog_fd, int target_fd,
 		attr.link_create.uprobe_multi.ref_ctr_offsets = ptr_to_u64(OPTS_GET(opts, uprobe_multi.ref_ctr_offsets, 0));
 		attr.link_create.uprobe_multi.cookies = ptr_to_u64(OPTS_GET(opts, uprobe_multi.cookies, 0));
 		attr.link_create.uprobe_multi.pid = OPTS_GET(opts, uprobe_multi.pid, 0);
+		attr.link_create.uprobe_multi.path_fd = OPTS_GET(opts, uprobe_multi.path_fd, 0);
 		if (!OPTS_ZEROED(opts, uprobe_multi))
 			return libbpf_err(-EINVAL);
 		break;
+	case BPF_TRACE_FENTRY_MULTI:
+	case BPF_TRACE_FEXIT_MULTI:
+	case BPF_TRACE_FSESSION_MULTI:
+		attr.link_create.tracing_multi.ids = ptr_to_u64(OPTS_GET(opts, tracing_multi.ids, 0));
+		attr.link_create.tracing_multi.cookies = ptr_to_u64(OPTS_GET(opts, tracing_multi.cookies, 0));
+		attr.link_create.tracing_multi.cnt = OPTS_GET(opts, tracing_multi.cnt, 0);
+		if (!OPTS_ZEROED(opts, tracing_multi))
+			return libbpf_err(-EINVAL);
+		break;
 	case BPF_TRACE_RAW_TP:
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 2c8e88ddb674..7534a593edae 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -37,6 +37,18 @@ extern "C" {
 
 LIBBPF_API int libbpf_set_memlock_rlim(size_t memlock_bytes);
 
+struct bpf_log_opts {
+	size_t sz; /* size of this struct for forward/backward compatibility */
+
+	char *buf; /* log buffer, handed to the kernel as log_buf */
+	__u32 size; /* handed to the kernel as log_size */
+	__u32 level; /* handed to the kernel as log_level */
+	__u32 true_size; /* out parameter set by kernel */
+
+	size_t :0;
+};
+#define bpf_log_opts__last_field true_size
+
 struct bpf_map_create_opts {
 	size_t sz; /* size of this struct for forward/backward compatibility */
 
@@ -57,9 +69,12 @@ struct bpf_map_create_opts {
 
 	const void *excl_prog_hash;
 	__u32 excl_prog_hash_size;
+
+	struct bpf_log_opts *log_opts;
+
 	size_t :0;
 };
-#define bpf_map_create_opts__last_field excl_prog_hash_size
+#define bpf_map_create_opts__last_field log_opts
 
 LIBBPF_API int bpf_map_create(enum bpf_map_type map_type,
 			      const char *map_name,
@@ -429,6 +444,7 @@ struct bpf_link_create_opts {
 			const unsigned long *ref_ctr_offsets;
 			const __u64 *cookies;
 			__u32 pid;
+			__u32 path_fd;
 		} uprobe_multi;
 		struct {
 			__u64 cookie;
@@ -454,10 +470,15 @@ struct bpf_link_create_opts {
 			__u32 relative_id;
 			__u64 expected_revision;
 		} cgroup;
+		struct {
+			const __u32 *ids;
+			const __u64 *cookies;
+			__u32 cnt;
+		} tracing_multi;
 	};
 	size_t :0;
 };
-#define bpf_link_create_opts__last_field uprobe_multi.pid
+#define bpf_link_create_opts__last_field uprobe_multi.path_fd
 
 LIBBPF_API int bpf_link_create(int prog_fd, int target_fd,
 			       enum bpf_attach_type attach_type,
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index ceb57b46a878..823bce895178 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -421,7 +421,7 @@ static int btf_type_size_unknown(const struct btf *btf, const struct btf_type *t
 {
 	__u32 l_cnt = btf->hdr.layout_len / sizeof(struct btf_layout);
 	struct btf_layout *l = btf->layout;
-	__u16 vlen = btf_vlen(t);
+	__u32 vlen = btf_vlen(t);
 	__u32 kind = btf_kind(t);
 
 	/* Fall back to base BTF if needed as they share layout information */
@@ -454,7 +454,7 @@ static int btf_type_size_unknown(const struct btf *btf, const struct btf_type *t
 static int btf_type_size(const struct btf *btf, const struct btf_type *t)
 {
 	const int base_size = sizeof(struct btf_type);
-	__u16 vlen = btf_vlen(t);
+	__u32 vlen = btf_vlen(t);
 
 	switch (btf_kind(t)) {
 	case BTF_KIND_FWD:
@@ -506,7 +506,7 @@ static int btf_bswap_type_rest(struct btf_type *t)
 	struct btf_array *a;
 	struct btf_param *p;
 	struct btf_enum *e;
-	__u16 vlen = btf_vlen(t);
+	__u32 vlen = btf_vlen(t);
 	int i;
 
 	switch (btf_kind(t)) {
@@ -1007,7 +1007,7 @@ int btf__align_of(const struct btf *btf, __u32 id)
 	case BTF_KIND_STRUCT:
 	case BTF_KIND_UNION: {
 		const struct btf_member *m = btf_members(t);
-		__u16 vlen = btf_vlen(t);
+		__u32 vlen = btf_vlen(t);
 		int i, max_align = 1, align;
 
 		for (i = 0; i < vlen; i++, m++) {
@@ -2121,9 +2121,12 @@ static void *btf_add_type_mem(struct btf *btf, size_t add_sz)
 			      btf->hdr.type_len, UINT_MAX, add_sz);
 }
 
-static void btf_type_inc_vlen(struct btf_type *t)
+static int btf_type_inc_vlen(struct btf_type *t)
 {
+	if (btf_vlen(t) == BTF_MAX_VLEN)
+		return -ENOSPC;	/* vlen already at BTF_MAX_VLEN; t->info is left untouched */
 	t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, btf_kflag(t));
+	return 0;
 }
 
 static void btf_hdr_update_type_len(struct btf *btf, int new_len)
@@ -2652,6 +2655,8 @@ int btf__add_field(struct btf *btf, const char *name, int type_id,
 	t = btf_last_type(btf);
 	if (!btf_is_composite(t))
 		return libbpf_err(-EINVAL);
+	if (btf_vlen(t) == BTF_MAX_VLEN)
+		return libbpf_err(-ENOSPC);
 
 	if (validate_type_id(type_id))
 		return libbpf_err(-EINVAL);
@@ -2686,6 +2691,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id,
 
 	/* btf_add_type_mem can invalidate t pointer */
 	t = btf_last_type(btf);
+
 	/* update parent type's vlen and kflag */
 	t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, is_bitfield || btf_kflag(t));
 
@@ -2796,7 +2802,9 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value)
 
 	/* update parent type's vlen */
 	t = btf_last_type(btf);
-	btf_type_inc_vlen(t);
+	err = btf_type_inc_vlen(t);
+	if (err)
+		return libbpf_err(err);
 
 	/* if negative value, set signedness to signed */
 	if (value < 0)
@@ -2873,7 +2881,9 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value)
 
 	/* update parent type's vlen */
 	t = btf_last_type(btf);
-	btf_type_inc_vlen(t);
+	err = btf_type_inc_vlen(t);
+	if (err)
+		return libbpf_err(err);
 
 	btf_hdr_update_type_len(btf, btf->hdr.type_len + sz);
 	return 0;
@@ -3115,7 +3125,9 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id)
 
 	/* update parent type's vlen */
 	t = btf_last_type(btf);
-	btf_type_inc_vlen(t);
+	err = btf_type_inc_vlen(t);
+	if (err)
+		return libbpf_err(err);
 
 	btf_hdr_update_type_len(btf, btf->hdr.type_len + sz);
 	return 0;
@@ -3257,7 +3269,9 @@ int btf__add_datasec_var_info(struct btf *btf, int var_type_id, __u32 offset, __
 
 	/* update parent type's vlen */
 	t = btf_last_type(btf);
-	btf_type_inc_vlen(t);
+	err = btf_type_inc_vlen(t);
+	if (err)
+		return libbpf_err(err);
 
 	btf_hdr_update_type_len(btf, btf->hdr.type_len + sz);
 	return 0;
@@ -4311,7 +4325,7 @@ static long btf_hash_enum(struct btf_type *t)
 static bool btf_equal_enum_members(struct btf_type *t1, struct btf_type *t2)
 {
 	const struct btf_enum *m1, *m2;
-	__u16 vlen;
+	__u32 vlen;
 	int i;
 
 	vlen = btf_vlen(t1);
@@ -4329,7 +4343,7 @@ static bool btf_equal_enum_members(struct btf_type *t1, struct btf_type *t2)
 static bool btf_equal_enum64_members(struct btf_type *t1, struct btf_type *t2)
 {
 	const struct btf_enum64 *m1, *m2;
-	__u16 vlen;
+	__u32 vlen;
 	int i;
 
 	vlen = btf_vlen(t1);
@@ -4406,7 +4420,7 @@ static long btf_hash_struct(struct btf_type *t)
 static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2)
 {
 	const struct btf_member *m1, *m2;
-	__u16 vlen;
+	__u32 vlen;
 	int i;
 
 	if (!btf_equal_common(t1, t2))
@@ -4482,7 +4496,7 @@ static bool btf_compat_array(struct btf_type *t1, struct btf_type *t2)
 static long btf_hash_fnproto(struct btf_type *t)
 {
 	const struct btf_param *member = btf_params(t);
-	__u16 vlen = btf_vlen(t);
+	__u32 vlen = btf_vlen(t);
 	long h = btf_hash_common(t);
 	int i;
 
@@ -4504,7 +4518,7 @@ static long btf_hash_fnproto(struct btf_type *t)
 static bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2)
 {
 	const struct btf_param *m1, *m2;
-	__u16 vlen;
+	__u32 vlen;
 	int i;
 
 	if (!btf_equal_common(t1, t2))
@@ -4530,7 +4544,7 @@ static bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2)
 static bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2)
 {
 	const struct btf_param *m1, *m2;
-	__u16 vlen;
+	__u32 vlen;
 	int i;
 
 	/* skip return type ID */
@@ -4578,12 +4592,14 @@ static int btf_dedup_prep(struct btf_dedup *d)
 		case BTF_KIND_RESTRICT:
 		case BTF_KIND_PTR:
 		case BTF_KIND_FWD:
-		case BTF_KIND_TYPEDEF:
 		case BTF_KIND_FUNC:
 		case BTF_KIND_FLOAT:
 		case BTF_KIND_TYPE_TAG:
 			h = btf_hash_common(t);
 			break;
+		case BTF_KIND_TYPEDEF:
+			h = btf_hash_typedef(t);
+			break;
 		case BTF_KIND_INT:
 		case BTF_KIND_DECL_TAG:
 			h = btf_hash_int_decl_tag(t);
@@ -5077,7 +5093,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 	case BTF_KIND_STRUCT:
 	case BTF_KIND_UNION: {
 		const struct btf_member *cand_m, *canon_m;
-		__u16 vlen;
+		__u32 vlen;
 
 		if (!btf_shallow_equal_struct(cand_type, canon_type))
 			return 0;
@@ -5105,7 +5121,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 
 	case BTF_KIND_FUNC_PROTO: {
 		const struct btf_param *cand_p, *canon_p;
-		__u16 vlen;
+		__u32 vlen;
 
 		if (!btf_compat_fnproto(cand_type, canon_type))
 			return 0;
@@ -5439,7 +5455,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id)
 
 	case BTF_KIND_FUNC_PROTO: {
 		struct btf_param *param;
-		__u16 vlen;
+		__u32 vlen;
 		int i;
 
 		ref_type_id = btf_dedup_ref_type(d, t->type);
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index a1f8deca2603..1a31f2da947f 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -435,7 +435,7 @@ static inline __u16 btf_kind(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info);
 }
 
-static inline __u16 btf_vlen(const struct btf_type *t)
+static inline __u32 btf_vlen(const struct btf_type *t)
 {
 	return BTF_INFO_VLEN(t->info);
 }
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 53c6624161d7..cc1ba65bb6c5 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -316,7 +316,7 @@ static int btf_dump_mark_referenced(struct btf_dump *d)
 {
 	int i, j, n = btf__type_cnt(d->btf);
 	const struct btf_type *t;
-	__u16 vlen;
+	__u32 vlen;
 
 	for (i = d->last_id + 1; i < n; i++) {
 		t = btf__type_by_id(d->btf, i);
@@ -485,7 +485,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr)
 	 */
 	struct btf_dump_type_aux_state *tstate = &d->type_states[id];
 	const struct btf_type *t;
-	__u16 vlen;
+	__u32 vlen;
 	int err, i;
 
 	/* return true, letting typedefs know that it's ok to be emitted */
@@ -798,7 +798,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id)
 		 */
 		if (top_level_def || t->name_off == 0) {
 			const struct btf_member *m = btf_members(t);
-			__u16 vlen = btf_vlen(t);
+			__u32 vlen = btf_vlen(t);
 			int i, new_cont_id;
 
 			new_cont_id = t->name_off == 0 ? cont_id : id;
@@ -820,7 +820,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id)
 		break;
 	case BTF_KIND_FUNC_PROTO: {
 		const struct btf_param *p = btf_params(t);
-		__u16 n = btf_vlen(t);
+		__u32 n = btf_vlen(t);
 		int i;
 
 		btf_dump_emit_type(d, t->type, cont_id);
@@ -839,7 +839,7 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id,
 {
 	const struct btf_member *m;
 	int max_align = 1, align, i, bit_sz;
-	__u16 vlen;
+	__u32 vlen;
 
 	m = btf_members(t);
 	vlen = btf_vlen(t);
@@ -973,7 +973,7 @@ static void btf_dump_emit_struct_def(struct btf_dump *d,
 	bool is_struct = btf_is_struct(t);
 	bool packed, prev_bitfield = false;
 	int align, i, off = 0;
-	__u16 vlen = btf_vlen(t);
+	__u32 vlen = btf_vlen(t);
 
 	align = btf__align_of(d->btf, id);
 	packed = is_struct ? btf_is_struct_packed(d->btf, id, t) : 0;
@@ -1064,7 +1064,7 @@ static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id,
 
 static void btf_dump_emit_enum32_val(struct btf_dump *d,
 				     const struct btf_type *t,
-				     int lvl, __u16 vlen)
+				     int lvl, __u32 vlen)
 {
 	const struct btf_enum *v = btf_enum(t);
 	bool is_signed = btf_kflag(t);
@@ -1089,7 +1089,7 @@ static void btf_dump_emit_enum32_val(struct btf_dump *d,
 
 static void btf_dump_emit_enum64_val(struct btf_dump *d,
 				     const struct btf_type *t,
-				     int lvl, __u16 vlen)
+				     int lvl, __u32 vlen)
 {
 	const struct btf_enum64 *v = btf_enum64(t);
 	bool is_signed = btf_kflag(t);
@@ -1122,7 +1122,7 @@ static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id,
 				   const struct btf_type *t,
 				   int lvl)
 {
-	__u16 vlen = btf_vlen(t);
+	__u32 vlen = btf_vlen(t);
 
 	btf_dump_printf(d, "enum%s%s",
 			t->name_off ? " " : "",
@@ -1542,7 +1542,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d,
 		}
 		case BTF_KIND_FUNC_PROTO: {
 			const struct btf_param *p = btf_params(t);
-			__u16 vlen = btf_vlen(t);
+			__u32 vlen = btf_vlen(t);
 			int i;
 
 			/*
@@ -2159,7 +2159,7 @@ static int btf_dump_struct_data(struct btf_dump *d,
 				const void *data)
 {
 	const struct btf_member *m = btf_members(t);
-	__u16 n = btf_vlen(t);
+	__u32 n = btf_vlen(t);
 	int i, err = 0;
 
 	/* note that we increment depth before calling btf_dump_print() below;
@@ -2449,7 +2449,7 @@ static int btf_dump_type_data_check_zero(struct btf_dump *d,
 	case BTF_KIND_STRUCT:
 	case BTF_KIND_UNION: {
 		const struct btf_member *m = btf_members(t);
-		__u16 n = btf_vlen(t);
+		__u32 n = btf_vlen(t);
 
 		/* if any struct/union member is non-zero, the struct/union
 		 * is considered non-zero and dumped.
diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c
index 4f19a0d79b0c..b7e388f99d0b 100644
--- a/tools/lib/bpf/features.c
+++ b/tools/lib/bpf/features.c
@@ -615,6 +615,11 @@ static int probe_kern_btf_layout(int token_fd)
 						 (char *)layout, token_fd));
 }
 
+static int probe_bpf_syscall_common_attrs(int token_fd)
+{
+	return probe_sys_bpf_ext(); /* token_fd is not used by this probe */
+}
+
 typedef int (*feature_probe_fn)(int /* token_fd */);
 
 static struct kern_feature_cache feature_cache;
@@ -699,6 +704,9 @@ static struct kern_feature_desc {
 	[FEAT_BTF_LAYOUT] = {
 		"kernel supports BTF layout", probe_kern_btf_layout,
 	},
+	[FEAT_BPF_SYSCALL_COMMON_ATTRS] = {
+		"BPF syscall common attributes support", probe_bpf_syscall_common_attrs,
+	},
 };
 
 bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id)
diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c
index 9478b8f78f26..d79695f01c87 100644
--- a/tools/lib/bpf/gen_loader.c
+++ b/tools/lib/bpf/gen_loader.c
@@ -63,6 +63,7 @@ static int realloc_insn_buf(struct bpf_gen *gen, __u32 size)
 		gen->error = -ENOMEM;
 		free(gen->insn_start);
 		gen->insn_start = NULL;
+		gen->insn_cur = NULL;
 		return -ENOMEM;
 	}
 	gen->insn_start = insn_start;
@@ -86,6 +87,7 @@ static int realloc_data_buf(struct bpf_gen *gen, __u32 size)
 		gen->error = -ENOMEM;
 		free(gen->data_start);
 		gen->data_start = NULL;
+		gen->data_cur = NULL;
 		return -ENOMEM;
 	}
 	gen->data_start = data_start;
@@ -158,10 +160,16 @@ void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps
 
 static int add_data(struct bpf_gen *gen, const void *data, __u32 size)
 {
-	__u32 size8 = roundup(size, 8);
 	__u64 zero = 0;
+	__u32 size8;
 	void *prev;
 
+	if (size > INT32_MAX) {
+		gen->error = -ERANGE;
+		return 0;
+	}
+	size8 = roundup(size, 8);
+
 	if (realloc_data_buf(gen, size8))
 		return 0;
 	prev = gen->data_cur;
@@ -293,7 +301,6 @@ static void emit_check_err(struct bpf_gen *gen)
 		emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, off));
 	} else {
 		gen->error = -ERANGE;
-		emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, -1));
 	}
 }
 
@@ -398,13 +405,12 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
 			      blob_fd_array_off(gen, i));
 	emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0));
 	emit(gen, BPF_EXIT_INSN());
-	if (OPTS_GET(gen->opts, gen_hash, false))
-		compute_sha_update_offsets(gen);
-
-	pr_debug("gen: finish %s\n", errstr(gen->error));
 	if (!gen->error) {
 		struct gen_loader_opts *opts = gen->opts;
 
+		if (OPTS_GET(opts, gen_hash, false))
+			compute_sha_update_offsets(gen);
+
 		opts->insns = gen->insn_start;
 		opts->insns_sz = gen->insn_cur - gen->insn_start;
 		opts->data = gen->data_start;
@@ -419,6 +425,7 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps)
 				bpf_insn_bswap(insn++);
 		}
 	}
+	pr_debug("gen: finish %s\n", errstr(gen->error));
 	return gen->error;
 }
 
@@ -545,13 +552,22 @@ void bpf_gen__map_create(struct bpf_gen *gen,
 	default:
 		break;
 	}
-	/* conditionally update max_entries */
-	if (map_idx >= 0)
+
+	/*
+	 * Conditionally update max_entries from the host-supplied loader
+	 * ctx. This sizes the map at runtime, but for a signed loader
+	 * (gen_hash) it would let an untrusted host re-dimension the
+	 * program's maps after emit_signature_match(), outside what the
+	 * signature attests to. Keep the signer-provided max_entries
+	 * baked into the blob in that case.
+	 */
+	if (map_idx >= 0 && !OPTS_GET(gen->opts, gen_hash, false))
 		move_ctx2blob(gen, attr_field(map_create_attr, max_entries), 4,
 			      sizeof(struct bpf_loader_ctx) +
 			      sizeof(struct bpf_map_desc) * map_idx +
 			      offsetof(struct bpf_map_desc, max_entries),
 			      true /* check that max_entries != 0 */);
+
 	/* emit MAP_CREATE command */
 	emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size);
 	debug_ret(gen, "map_create %s idx %d type %d value_size %d value_btf_id %d",
@@ -585,6 +601,23 @@ static void emit_signature_match(struct bpf_gen *gen)
 	__s64 off;
 	int i;
 
+	/*
+	 * Reject if the metadata map is not exclusive. Without exclusivity
+	 * the cached map->sha[] verified above can be stale: another BPF
+	 * program with map access could have mutated the contents between
+	 * BPF_OBJ_GET_INFO_BY_FD and loader execution.
+	 */
+	emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX,
+					 0, 0, 0, 0));
+	emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, SHA256_DIGEST_LENGTH));
+	off = -(gen->insn_cur - gen->insn_start - gen->cleanup_label) / 8 - 2;
+	if (is_simm16(off)) {
+		emit(gen, BPF_MOV64_IMM(BPF_REG_7, -EINVAL));
+		emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, off));
+	} else {
+		gen->error = -ERANGE;
+	}
+
 	for (i = 0; i < SHA256_DWORD_SIZE; i++) {
 		emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX,
 						 0, 0, 0, 0));
@@ -1053,7 +1086,7 @@ void bpf_gen__prog_load(struct bpf_gen *gen,
 		 prog_idx, prog_type, insns_off, insn_cnt, license_off);
 
 	/* convert blob insns to target endianness */
-	if (gen->swapped_endian) {
+	if (gen->swapped_endian && !gen->error) {
 		struct bpf_insn *insn = gen->data_start + insns_off;
 		int i;
 
@@ -1091,7 +1124,7 @@ void bpf_gen__prog_load(struct bpf_gen *gen,
 		 sizeof(struct bpf_core_relo));
 
 	/* convert all info blobs to target endianness */
-	if (gen->swapped_endian)
+	if (gen->swapped_endian && !gen->error)
 		info_blob_bswap(gen, func_info, line_info, core_relos, load_attr);
 
 	libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name));
@@ -1169,27 +1202,36 @@ void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *pvalue,
 	value = add_data(gen, pvalue, value_size);
 	key = add_data(gen, &zero, sizeof(zero));
 
-	/* if (map_desc[map_idx].initial_value) {
+	/*
+	 * if (map_desc[map_idx].initial_value) {
 	 *    if (ctx->flags & BPF_SKEL_KERNEL)
 	 *        bpf_probe_read_kernel(value, value_size, initial_value);
 	 *    else
 	 *        bpf_copy_from_user(value, value_size, initial_value);
 	 * }
+	 *
+	 * The runtime initial_value comes from the host-supplied loader
+	 * ctx and would overwrite the blob value after emit_signature_match()
+	 * has already validated map->sha[]. For a signed loader (gen_hash)
+	 * the attested blob value must be authoritative, so skip the override
+	 * and leave the hashed value in place.
 	 */
-	emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_6,
-			      sizeof(struct bpf_loader_ctx) +
-			      sizeof(struct bpf_map_desc) * map_idx +
-			      offsetof(struct bpf_map_desc, initial_value)));
-	emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 8));
-	emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE,
-					 0, 0, 0, value));
-	emit(gen, BPF_MOV64_IMM(BPF_REG_2, value_size));
-	emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6,
-			      offsetof(struct bpf_loader_ctx, flags)));
-	emit(gen, BPF_JMP_IMM(BPF_JSET, BPF_REG_0, BPF_SKEL_KERNEL, 2));
-	emit(gen, BPF_EMIT_CALL(BPF_FUNC_copy_from_user));
-	emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 1));
-	emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel));
+	if (!OPTS_GET(gen->opts, gen_hash, false)) {
+		emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_6,
+				      sizeof(struct bpf_loader_ctx) +
+				      sizeof(struct bpf_map_desc) * map_idx +
+				      offsetof(struct bpf_map_desc, initial_value)));
+		emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 8));
+		emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE,
+						 0, 0, 0, value));
+		emit(gen, BPF_MOV64_IMM(BPF_REG_2, value_size));
+		emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6,
+				      offsetof(struct bpf_loader_ctx, flags)));
+		emit(gen, BPF_JMP_IMM(BPF_JSET, BPF_REG_0, BPF_SKEL_KERNEL, 2));
+		emit(gen, BPF_EMIT_CALL(BPF_FUNC_copy_from_user));
+		emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 1));
+		emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel));
+	}
 
 	map_update_attr = add_data(gen, &attr, attr_size);
 	pr_debug("gen: map_update_elem: idx %d, value: off %d size %d, attr: off %d size %d\n",
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 3a80a018fc7d..1368752aa13c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -136,6 +136,9 @@ static const char * const attach_type_name[] = {
 	[BPF_NETKIT_PEER]		= "netkit_peer",
 	[BPF_TRACE_KPROBE_SESSION]	= "trace_kprobe_session",
 	[BPF_TRACE_UPROBE_SESSION]	= "trace_uprobe_session",
+	[BPF_TRACE_FENTRY_MULTI]	= "trace_fentry_multi",
+	[BPF_TRACE_FEXIT_MULTI]		= "trace_fexit_multi",
+	[BPF_TRACE_FSESSION_MULTI]	= "trace_fsession_multi",
 };
 
 static const char * const link_type_name[] = {
@@ -154,6 +157,7 @@ static const char * const link_type_name[] = {
 	[BPF_LINK_TYPE_UPROBE_MULTI]		= "uprobe_multi",
 	[BPF_LINK_TYPE_NETKIT]			= "netkit",
 	[BPF_LINK_TYPE_SOCKMAP]			= "sockmap",
+	[BPF_LINK_TYPE_TRACING_MULTI]		= "tracing_multi",
 };
 
 static const char * const map_type_name[] = {
@@ -192,6 +196,7 @@ static const char * const map_type_name[] = {
 	[BPF_MAP_TYPE_CGRP_STORAGE]		= "cgrp_storage",
 	[BPF_MAP_TYPE_ARENA]			= "arena",
 	[BPF_MAP_TYPE_INSN_ARRAY]		= "insn_array",
+	[BPF_MAP_TYPE_RHASH]			= "rhash",
 };
 
 static const char * const prog_type_name[] = {
@@ -7767,6 +7772,69 @@ static int bpf_object__sanitize_prog(struct bpf_object *obj, struct bpf_program
 static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attach_name,
 				     int *btf_obj_fd, int *btf_type_id);
 
+static inline bool is_tracing_multi(enum bpf_attach_type type)
+{
+	return type == BPF_TRACE_FENTRY_MULTI || type == BPF_TRACE_FEXIT_MULTI ||
+	       type == BPF_TRACE_FSESSION_MULTI; /* i.e. any of the three *_MULTI tracing types */
+}
+
+/*
+ * Find the module BTF named by the "<module>:" prefix of @attach, which is
+ * either "function_pattern" or "<module>:function_pattern". Returns NULL if
+ * @attach has no ':', if load_module_btfs() fails, or if no entry of
+ * obj->btf_modules carries exactly that name.
+ */
+static const struct module_btf *find_attach_module(struct bpf_object *obj, const char *attach)
+{
+	const char *colon = strchr(attach, ':');
+	const struct module_btf *cur;
+	int idx, name_len;
+
+	if (!colon)
+		return NULL;
+	name_len = colon - attach;
+
+	if (load_module_btfs(obj))
+		return NULL;
+
+	for (idx = 0; idx < obj->btf_module_cnt; idx++) {
+		cur = &obj->btf_modules[idx];
+
+		/* equal over name_len chars with nothing after: exact name match */
+		if (strncmp(cur->name, attach, name_len) != 0 || cur->name[name_len] != '\0')
+			continue;
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Set *btf_obj_fd to the BTF fd of the module named in the program's section
+ * ("<prefix>/<module>:<pattern>"), or leave it 0 when no module is named.
+ * Returns -EINVAL if a module is named but find_attach_module() can't find it.
+ */
+static int tracing_multi_mod_fd(struct bpf_program *prog, int *btf_obj_fd)
+{
+	const char *slash = strchr(prog->sec_name, '/');
+	const struct module_btf *mod_btf;
+
+	/* 0: no module BTF fd, the kernel BTF is used */
+	*btf_obj_fd = 0;
+
+	/* neither an attach spec nor a "<module>:" part: nothing to resolve */
+	if (!slash || !strchr(slash + 1, ':'))
+		return 0;
+
+	mod_btf = find_attach_module(prog->obj, slash + 1);
+	if (!mod_btf)
+		return -EINVAL;
+
+	*btf_obj_fd = mod_btf->fd;
+
+	return 0;
+}
+
 /* this is called as prog->sec_def->prog_prepare_load_fn for libbpf-supported sec_defs */
 static int libbpf_prepare_prog_load(struct bpf_program *prog,
 				    struct bpf_prog_load_opts *opts, long cookie)
@@ -7830,6 +7898,18 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
 		opts->attach_btf_obj_fd = btf_obj_fd;
 		opts->attach_btf_id = btf_type_id;
 	}
+
+	if (is_tracing_multi(prog->expected_attach_type)) {
+		int err, btf_obj_fd = 0;
+
+		err = tracing_multi_mod_fd(prog, &btf_obj_fd);
+		if (err < 0)
+			return err;
+
+		prog->attach_btf_obj_fd = btf_obj_fd;
+		opts->attach_btf_obj_fd = btf_obj_fd;
+	}
+
 	return 0;
 }
 
@@ -8936,13 +9016,10 @@ static void bpf_object_unpin(struct bpf_object *obj)
 			bpf_map__unpin(&obj->maps[i], NULL);
 }
 
-static void bpf_object_post_load_cleanup(struct bpf_object *obj)
+static void bpf_object_cleanup_btf(struct bpf_object *obj)
 {
 	int i;
 
-	/* clean up fd_array */
-	zfree(&obj->fd_array);
-
 	/* clean up module BTFs */
 	for (i = 0; i < obj->btf_module_cnt; i++) {
 		close(obj->btf_modules[i].fd);
@@ -8950,6 +9027,8 @@ static void bpf_object_post_load_cleanup(struct bpf_object *obj)
 		free(obj->btf_modules[i].name);
 	}
 	obj->btf_module_cnt = 0;
+	obj->btf_module_cap = 0;
+	obj->btf_modules_loaded = false;
 	zfree(&obj->btf_modules);
 
 	/* clean up vmlinux BTF */
@@ -8957,6 +9036,15 @@ static void bpf_object_post_load_cleanup(struct bpf_object *obj)
 	obj->btf_vmlinux = NULL;
 }
 
+static void bpf_object_post_load_cleanup(struct bpf_object *obj)
+{
+	/* release the fd_array buffer */
+	zfree(&obj->fd_array);
+
+	/* module and vmlinux BTF teardown is shared via bpf_object_cleanup_btf() */
+	bpf_object_cleanup_btf(obj);
+}
+
 static int bpf_object_prepare(struct bpf_object *obj, const char *target_btf_path)
 {
 	int err;
@@ -9983,6 +10071,7 @@ static int attach_kprobe_session(const struct bpf_program *prog, long cookie, st
 static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link);
+static int attach_tracing_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 
 static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("socket",		SOCKET_FILTER, 0, SEC_NONE),
@@ -10018,11 +10107,16 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("netkit/peer",		SCHED_CLS, BPF_NETKIT_PEER, SEC_NONE),
 	SEC_DEF("tracepoint+",		TRACEPOINT, 0, SEC_NONE, attach_tp),
 	SEC_DEF("tp+",			TRACEPOINT, 0, SEC_NONE, attach_tp),
+	SEC_DEF("tracepoint.s+",	TRACEPOINT, 0, SEC_SLEEPABLE, attach_tp),
+	SEC_DEF("tp.s+",		TRACEPOINT, 0, SEC_SLEEPABLE, attach_tp),
 	SEC_DEF("raw_tracepoint+",	RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp),
 	SEC_DEF("raw_tp+",		RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp),
+	SEC_DEF("raw_tracepoint.s+",	RAW_TRACEPOINT, 0, SEC_SLEEPABLE, attach_raw_tp),
+	SEC_DEF("raw_tp.s+",		RAW_TRACEPOINT, 0, SEC_SLEEPABLE, attach_raw_tp),
 	SEC_DEF("raw_tracepoint.w+",	RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp),
 	SEC_DEF("raw_tp.w+",		RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp),
 	SEC_DEF("tp_btf+",		TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace),
+	SEC_DEF("tp_btf.s+",		TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
 	SEC_DEF("fentry+",		TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF, attach_trace),
 	SEC_DEF("fmod_ret+",		TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF, attach_trace),
 	SEC_DEF("fexit+",		TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF, attach_trace),
@@ -10031,6 +10125,12 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("fexit.s+",		TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
 	SEC_DEF("fsession+",		TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF, attach_trace),
 	SEC_DEF("fsession.s+",		TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
+	SEC_DEF("fsession.multi+",	TRACING, BPF_TRACE_FSESSION_MULTI, 0, attach_tracing_multi),
+	SEC_DEF("fsession.multi.s+",	TRACING, BPF_TRACE_FSESSION_MULTI, SEC_SLEEPABLE, attach_tracing_multi),
+	SEC_DEF("fentry.multi+",	TRACING, BPF_TRACE_FENTRY_MULTI, 0, attach_tracing_multi),
+	SEC_DEF("fexit.multi+",		TRACING, BPF_TRACE_FEXIT_MULTI, 0, attach_tracing_multi),
+	SEC_DEF("fentry.multi.s+",	TRACING, BPF_TRACE_FENTRY_MULTI, SEC_SLEEPABLE, attach_tracing_multi),
+	SEC_DEF("fexit.multi.s+",	TRACING, BPF_TRACE_FEXIT_MULTI, SEC_SLEEPABLE, attach_tracing_multi),
 	SEC_DEF("freplace+",		EXT, 0, SEC_ATTACH_BTF, attach_trace),
 	SEC_DEF("lsm+",			LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
 	SEC_DEF("lsm.s+",		LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
@@ -12280,7 +12380,7 @@ error:
 static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link)
 {
 	DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts);
-	unsigned long offset = 0;
+	long offset = 0;
 	const char *func_name;
 	char *func;
 	int n;
@@ -12302,6 +12402,13 @@ static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf
 		pr_warn("kprobe name is invalid: %s\n", func_name);
 		return -EINVAL;
 	}
+
+	if (offset < 0) {
+		free(func);
+		pr_warn("kprobe offset must be a non-negative integer: %li\n", offset);
+		return -EINVAL;
+	}
+
 	if (opts.retprobe && offset != 0) {
 		free(func);
 		pr_warn("kretprobes do not support offset specification\n");
@@ -12425,6 +12532,279 @@ static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, stru
 	return ret;
 }
 
+#define MAX_BPF_FUNC_ARGS 12
+
+/*
+ * True when @t is a typedef, a volatile/const/restrict qualifier or a type
+ * tag. btf_get_type_size() strips these by following t->type until it
+ * reaches the underlying type.
+ */
+static bool btf_type_is_modifier(const struct btf_type *t)
+{
+	__u32 kind = BTF_INFO_KIND(t->info);
+
+	return kind == BTF_KIND_TYPEDEF || kind == BTF_KIND_VOLATILE ||
+	       kind == BTF_KIND_CONST || kind == BTF_KIND_RESTRICT ||
+	       kind == BTF_KIND_TYPE_TAG;
+}
+
+#define MAX_RESOLVE_DEPTH 32
+
+static int btf_get_type_size(const struct btf *btf, __u32 type_id,
+			     const struct btf_type **ret_type)
+{
+	const struct btf_type *t;
+	int i;
+
+	*ret_type = btf__type_by_id(btf, 0); /* until resolved, report type ID 0 */
+	if (!type_id)
+		return 0; /* type ID 0 is reported with size 0 */
+	t = btf__type_by_id(btf, type_id);
+	for (i = 0; i < MAX_RESOLVE_DEPTH && t && btf_type_is_modifier(t); i++)
+		t = btf__type_by_id(btf, t->type); /* skip typedef/qualifier/type-tag */
+	if (!t || i == MAX_RESOLVE_DEPTH)
+		return -EINVAL; /* lookup failed or MAX_RESOLVE_DEPTH modifiers skipped */
+	*ret_type = t;
+	if (btf_is_ptr(t))
+		return btf__pointer_size(btf);
+	if (btf_is_int(t) || btf_is_any_enum(t) || btf_is_struct(t) || btf_is_union(t))
+		return t->size;
+	return -EINVAL; /* any other kind has no size this helper reports */
+}
+
+/*
+ * Tell whether BTF function @t is traceable: at most MAX_BPF_FUNC_ARGS fixed,
+ * non-void arguments of up to 16 bytes each and no struct/union return value.
+ */
+bool btf_type_is_traceable_func(const struct btf *btf, const struct btf_type *t)
+{
+	const struct btf_type *proto, *resolved;
+	const struct btf_param *params;
+	__u32 idx, cnt;
+	int sz;
+
+	if (!btf_is_func(t))
+		return false;
+	proto = btf__type_by_id(btf, t->type);
+	if (!proto || !btf_is_func_proto(proto))
+		return false;
+
+	cnt = btf_vlen(proto);
+	if (cnt > MAX_BPF_FUNC_ARGS)
+		return false;
+	params = (const struct btf_param *)(proto + 1);
+	/* struct/union return values are not supported */
+	sz = btf_get_type_size(btf, proto->type, &resolved);
+	if (sz < 0 || btf_is_struct(resolved) || btf_is_union(resolved))
+		return false;
+
+	for (idx = 0; idx < cnt; idx++) {
+		/* a type-0 last parameter means variable args: unsupported */
+		if (idx + 1 == cnt && params[idx].type == 0)
+			return false;
+		/* reject unsizable (< 0), void (0) and over-16-byte arguments */
+		sz = btf_get_type_size(btf, params[idx].type, &resolved);
+		if (sz <= 0 || sz > 16)
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Store in *ids the BTF IDs of all traceable FUNCs in @btf whose name matches
+ * glob @pattern and return their count. Returns -EINVAL for a NULL pattern and
+ * -ENOMEM, with *ids freed and reset to NULL, when the array can't be grown.
+ */
+static int collect_btf_func_ids_by_glob(const struct btf *btf, const char *pattern, __u32 **ids)
+{
+	__u32 type_id, nr_types = btf__type_cnt(btf);
+	size_t cap = 0, cnt = 0;
+
+	if (!pattern)
+		return -EINVAL;
+
+	for (type_id = 1; type_id < nr_types; type_id++) {
+		const struct btf_type *t = btf__type_by_id(btf, type_id);
+		const char *name;
+
+		if (btf_kind(t) != BTF_KIND_FUNC)
+			continue;
+		name = btf__name_by_offset(btf, t->name_off);
+		if (!name || !glob_match(name, pattern))
+			continue;
+		if (!btf_type_is_traceable_func(btf, t))
+			continue;
+
+		if (libbpf_ensure_mem((void **)ids, &cap, sizeof(**ids), cnt + 1)) {
+			free(*ids);
+			*ids = NULL;
+			return -ENOMEM;
+		}
+		(*ids)[cnt++] = type_id;
+	}
+
+	return cnt;
+}
+
+/*
+ * Resolve @pattern, "<glob>" or "<module>:<glob>", into an array of BTF
+ * function IDs stored in *ids. A module part selects that module's BTF;
+ * without one vmlinux BTF is searched, which is refused with -EINVAL if the
+ * program carries a module BTF fd. Returns the ID count or a negative error.
+ * Once vmlinux BTF is loaded, bpf_object_cleanup_btf() runs before returning.
+ */
+static int collect_func_ids_by_glob(const struct bpf_program *prog, const char *pattern, __u32 **ids)
+{
+	struct bpf_object *obj = prog->obj;
+	const struct module_btf *mod;
+	const char *colon;
+	int ret;
+
+	ret = bpf_object__load_vmlinux_btf(obj, true);
+	if (ret)
+		return ret;
+
+	colon = strchr(pattern, ':');
+	if (colon) {
+		/* module given: search that module's BTF */
+		mod = find_attach_module(obj, pattern);
+		if (mod)
+			ret = collect_btf_func_ids_by_glob(mod->btf, colon + 1, ids);
+		else
+			ret = -EINVAL;
+	} else if (prog->attach_btf_obj_fd) {
+		/* program was loaded for a kernel module, pattern must name it */
+		ret = -EINVAL;
+	} else {
+		ret = collect_btf_func_ids_by_glob(obj->btf_vmlinux, pattern, ids);
+	}
+
+	/* release module and vmlinux BTF before returning */
+	bpf_object_cleanup_btf(obj);
+	return ret;
+}
+
+struct bpf_link *
+bpf_program__attach_tracing_multi(const struct bpf_program *prog, const char *pattern,
+				  const struct bpf_tracing_multi_opts *opts)
+{
+	LIBBPF_OPTS(bpf_link_create_opts, lopts);
+	int prog_fd, link_fd, err, cnt;
+	__u32 *free_ids = NULL;
+	struct bpf_link *link;
+	const __u64 *cookies;
+	const __u32 *ids;
+
+	if (!OPTS_VALID(opts, bpf_tracing_multi_opts))
+		return libbpf_err_ptr(-EINVAL);
+
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0) {
+		pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n",
+			prog->name);
+		return libbpf_err_ptr(-EINVAL);
+	}
+	/* explicit targets come from opts; each defaults to 0/NULL when opts is absent */
+	cnt = OPTS_GET(opts, cnt, 0);
+	ids = OPTS_GET(opts, ids, NULL);
+	cookies = OPTS_GET(opts, cookies, NULL);
+	/* ids and cnt go together; a pattern excludes ids/cookies; one of pattern/ids is required */
+	if (!!ids != !!cnt)
+		return libbpf_err_ptr(-EINVAL);
+	if (pattern && (ids || cookies))
+		return libbpf_err_ptr(-EINVAL);
+	if (!pattern && !ids)
+		return libbpf_err_ptr(-EINVAL);
+	/* pattern mode: resolve the glob into free_ids and attach to those IDs */
+	if (pattern) {
+		cnt = collect_func_ids_by_glob(prog, pattern, &free_ids);
+		if (cnt < 0)
+			return libbpf_err_ptr(cnt);
+		if (cnt == 0)
+			return libbpf_err_ptr(-EINVAL);
+		ids = (const __u32 *) free_ids;
+	}
+
+	lopts.tracing_multi.ids = ids;
+	lopts.tracing_multi.cookies = cookies;
+	lopts.tracing_multi.cnt = cnt;
+
+	link = calloc(1, sizeof(*link));
+	if (!link) {
+		err = -ENOMEM;
+		goto error;
+	}
+	link->detach = &bpf_link__detach_fd;
+
+	link_fd = bpf_link_create(prog_fd, 0, prog->expected_attach_type, &lopts);
+	if (link_fd < 0) {
+		err = -errno;
+		pr_warn("prog '%s': failed to attach: %s\n", prog->name, errstr(err));
+		goto error;
+	}
+	link->fd = link_fd;
+	free(free_ids);
+	return link;
+
+error:
+	free(link);
+	free(free_ids);
+	return libbpf_err_ptr(err);
+}
+
+/*
+ * SEC() auto-attach handler for the fentry/fexit/fsession ".multi" sections
+ * and their ".s" variants. A bare section name requests no auto-attach: 0 is
+ * returned with *link left NULL. Otherwise the text after "<prefix>/" is
+ * parsed into the pattern for bpf_program__attach_tracing_multi().
+ */
+static int attach_tracing_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link)
+{
+	static const char *const prefixes[] = {
+		"fentry.multi",
+		"fexit.multi",
+		"fsession.multi",
+		"fentry.multi.s",
+		"fexit.multi.s",
+		"fsession.multi.s",
+	};
+	const char *sec = prog->sec_name, *spec = NULL;
+	char *pattern;
+	size_t k, len;
+
+	*link = NULL;
+
+	for (k = 0; k < ARRAY_SIZE(prefixes) && !spec; k++) {
+		if (!str_has_pfx(sec, prefixes[k]))
+			continue;
+
+		len = strlen(prefixes[k]);
+		/* bare name such as SEC("fentry.multi"): nothing to auto-attach */
+		if (sec[len] == '\0')
+			return 0;
+		/* "<prefix>/<spec>"; anything else may still match a longer prefix */
+		if (sec[len] == '/')
+			spec = sec + len + 1;
+	}
+
+	if (!spec) {
+		pr_warn("prog '%s': invalid section name '%s'\n",
+			prog->name, prog->sec_name);
+		return -EINVAL;
+	}
+
+	/* %m makes sscanf() allocate the matched pattern; it is freed below */
+	if (sscanf(spec, "%m[a-zA-Z0-9_.*?:]", &pattern) < 1) {
+		pr_warn("tracing multi pattern is invalid: %s\n", spec);
+		return -EINVAL;
+	}
+
+	*link = bpf_program__attach_tracing_multi(prog, pattern, NULL);
+	free(pattern);
+	return libbpf_get_error(*link);
+}
+
 static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe,
 					  const char *binary_path, size_t offset)
 {
@@ -13145,25 +13525,61 @@ struct bpf_link *bpf_program__attach_tracepoint(const struct bpf_program *prog,
 	return bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL);
 }
 
+/*
+ * Find the first entry of @prefixes that @sec_name either equals or extends
+ * with "/<something>". Returns the (empty) tail of @sec_name for an exact
+ * match, the text following "prefix/" otherwise, or NULL when nothing fits;
+ * a trailing "prefix/" with nothing after the slash never matches.
+ */
+static const char *sec_name_match_prefix(const char *sec_name,
+					 const char *const *prefixes,
+					 size_t n)
+{
+	const char *rest;
+	size_t idx;
+
+	for (idx = 0; idx < n; idx++) {
+		if (!str_has_pfx(sec_name, prefixes[idx]))
+			continue;
+
+		rest = sec_name + strlen(prefixes[idx]);
+		/* bare section: hand back the terminating NUL as "" */
+		if (rest[0] == '\0')
+			return rest;
+		/* need '/' plus at least one more character */
+		if (rest[0] == '/' && rest[1] != '\0')
+			return rest + 1;
+	}
+
+	return NULL;
+}
+
 static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link)
 {
+	static const char *const prefixes[] = {
+		"tp.s",
+		"tp",
+		"tracepoint.s",
+		"tracepoint",
+	};
 	char *sec_name, *tp_cat, *tp_name;
+	const char *match;
 
 	*link = NULL;
 
-	/* no auto-attach for SEC("tp") or SEC("tracepoint") */
-	if (strcmp(prog->sec_name, "tp") == 0 || strcmp(prog->sec_name, "tracepoint") == 0)
+	match = sec_name_match_prefix(prog->sec_name, prefixes, ARRAY_SIZE(prefixes));
+	if (!match) {
+		pr_warn("prog '%s': invalid section name '%s'\n", prog->name, prog->sec_name);
+		return -EINVAL;
+	}
+	if (!match[0]) /* bare section name no autoattach */
 		return 0;
 
 	sec_name = strdup(prog->sec_name);
 	if (!sec_name)
 		return -ENOMEM;
 
-	/* extract "tp/<category>/<name>" or "tracepoint/<category>/<name>" */
-	if (str_has_pfx(prog->sec_name, "tp/"))
-		tp_cat = sec_name + sizeof("tp/") - 1;
-	else
-		tp_cat = sec_name + sizeof("tracepoint/") - 1;
+	tp_cat = sec_name + (match - prog->sec_name);
 	tp_name = strchr(tp_cat, '/');
 	if (!tp_name) {
 		free(sec_name);
@@ -13227,37 +13643,22 @@ static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf
 		"raw_tracepoint",
 		"raw_tp.w",
 		"raw_tracepoint.w",
+		"raw_tp.s",
+		"raw_tracepoint.s",
 	};
-	size_t i;
-	const char *tp_name = NULL;
+	const char *match;
 
 	*link = NULL;
 
-	for (i = 0; i < ARRAY_SIZE(prefixes); i++) {
-		size_t pfx_len;
-
-		if (!str_has_pfx(prog->sec_name, prefixes[i]))
-			continue;
-
-		pfx_len = strlen(prefixes[i]);
-		/* no auto-attach case of, e.g., SEC("raw_tp") */
-		if (prog->sec_name[pfx_len] == '\0')
-			return 0;
-
-		if (prog->sec_name[pfx_len] != '/')
-			continue;
-
-		tp_name = prog->sec_name + pfx_len + 1;
-		break;
-	}
-
-	if (!tp_name) {
-		pr_warn("prog '%s': invalid section name '%s'\n",
-			prog->name, prog->sec_name);
+	match = sec_name_match_prefix(prog->sec_name, prefixes, ARRAY_SIZE(prefixes));
+	if (!match) {
+		pr_warn("prog '%s': invalid section name '%s'\n", prog->name, prog->sec_name);
 		return -EINVAL;
 	}
+	if (!match[0])
+		return 0;
 
-	*link = bpf_program__attach_raw_tracepoint(prog, tp_name);
+	*link = bpf_program__attach_raw_tracepoint(prog, match);
 	return libbpf_get_error(*link);
 }
 
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index bba4e8464396..b965ad571540 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -726,6 +726,21 @@ bpf_program__attach_ksyscall(const struct bpf_program *prog,
 			     const char *syscall_name,
 			     const struct bpf_ksyscall_opts *opts);
 
+struct bpf_tracing_multi_opts {
+	/* size of this struct, for forward/backward compatibility */
+	size_t sz;
+	const __u32 *ids;	/* IDs to attach to; must be NULL when a pattern is given */
+	const __u64 *cookies;	/* may be NULL; must be NULL when a pattern is given */
+	size_t cnt;		/* must be non-zero if and only if ids is set */
+	size_t :0;
+};
+
+#define bpf_tracing_multi_opts__last_field cnt
+
+LIBBPF_API struct bpf_link *
+bpf_program__attach_tracing_multi(const struct bpf_program *prog, const char *pattern,
+				  const struct bpf_tracing_multi_opts *opts);
+
 struct bpf_uprobe_opts {
 	/* size of this struct, for forward/backward compatibility */
 	size_t sz;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index dfed8d60af05..b731df19ae69 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -458,6 +458,7 @@ LIBBPF_1.7.0 {
 
 LIBBPF_1.8.0 {
 	global:
+		bpf_program__attach_tracing_multi;
 		bpf_program__clone;
 		btf__new_empty_opts;
 } LIBBPF_1.7.0;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 3781c45b46d3..04cd303fb5a8 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -250,6 +250,7 @@ const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, _
 const struct btf_header *btf_header(const struct btf *btf);
 void btf_set_base_btf(struct btf *btf, const struct btf *base_btf);
 int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **id_map);
+bool btf_type_is_traceable_func(const struct btf *btf, const struct btf_type *t);
 
 static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t)
 {
@@ -398,6 +399,8 @@ enum kern_feature_id {
 	FEAT_UPROBE_SYSCALL,
 	/* Kernel supports BTF layout information */
 	FEAT_BTF_LAYOUT,
+	/* Kernel supports BPF syscall common attributes */
+	FEAT_BPF_SYSCALL_COMMON_ATTRS,
 	__FEAT_CNT,
 };
 
@@ -768,4 +771,5 @@ int probe_fd(int fd);
 #define SHA256_DWORD_SIZE SHA256_DIGEST_LENGTH / sizeof(__u64)
 
 void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]);
+int probe_sys_bpf_ext(void);
 #endif /* __LIBBPF_LIBBPF_INTERNAL_H */
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index b70d9637ecf5..e40819465ddc 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -309,6 +309,9 @@ static int probe_map_create(enum bpf_map_type map_type)
 		value_size	= sizeof(__u64);
 		opts.map_flags	= BPF_F_NO_PREALLOC;
 		break;
+	case BPF_MAP_TYPE_RHASH:
+		opts.map_flags	= BPF_F_NO_PREALLOC;
+		break;
 	case BPF_MAP_TYPE_CGROUP_STORAGE:
 	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
 		key_size	= sizeof(struct bpf_cgroup_storage_key);
diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c
index 0ccc8f548cba..6ae3f2a15ad0 100644
--- a/tools/lib/bpf/relo_core.c
+++ b/tools/lib/bpf/relo_core.c
@@ -191,8 +191,8 @@ recur:
 	case BTF_KIND_FUNC_PROTO: {
 		struct btf_param *local_p = btf_params(local_type);
 		struct btf_param *targ_p = btf_params(targ_type);
-		__u16 local_vlen = btf_vlen(local_type);
-		__u16 targ_vlen = btf_vlen(targ_type);
+		__u32 local_vlen = btf_vlen(local_type);
+		__u32 targ_vlen = btf_vlen(targ_type);
 		int i, err;
 
 		if (local_vlen != targ_vlen)
@@ -1457,8 +1457,8 @@ static bool bpf_core_names_match(const struct btf *local_btf, size_t local_name_
 static int bpf_core_enums_match(const struct btf *local_btf, const struct btf_type *local_t,
 				const struct btf *targ_btf, const struct btf_type *targ_t)
 {
-	__u16 local_vlen = btf_vlen(local_t);
-	__u16 targ_vlen = btf_vlen(targ_t);
+	__u32 local_vlen = btf_vlen(local_t);
+	__u32 targ_vlen = btf_vlen(targ_t);
 	int i, j;
 
 	if (local_t->size != targ_t->size)
@@ -1498,8 +1498,8 @@ static int bpf_core_composites_match(const struct btf *local_btf, const struct b
 				     bool behind_ptr, int level)
 {
 	const struct btf_member *local_m = btf_members(local_t);
-	__u16 local_vlen = btf_vlen(local_t);
-	__u16 targ_vlen = btf_vlen(targ_t);
+	__u32 local_vlen = btf_vlen(local_t);
+	__u32 targ_vlen = btf_vlen(targ_t);
 	int i, j, err;
 
 	if (local_vlen > targ_vlen)
@@ -1674,8 +1674,8 @@ recur:
 	case BTF_KIND_FUNC_PROTO: {
 		struct btf_param *local_p = btf_params(local_t);
 		struct btf_param *targ_p = btf_params(targ_t);
-		__u16 local_vlen = btf_vlen(local_t);
-		__u16 targ_vlen = btf_vlen(targ_t);
+		__u32 local_vlen = btf_vlen(local_t);
+		__u32 targ_vlen = btf_vlen(targ_t);
 		int i, err;
 
 		if (local_k != targ_k)
diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h
index 6a8f5c7a02eb..74503d358bc8 100644
--- a/tools/lib/bpf/skel_internal.h
+++ b/tools/lib/bpf/skel_internal.h
@@ -243,7 +243,12 @@ static inline int skel_map_create(enum bpf_map_type map_type,
 	attr.excl_prog_hash = (unsigned long) excl_prog_hash;
 	attr.excl_prog_hash_size = excl_prog_hash_sz;
 
+#ifdef __KERNEL__
+	if (strscpy(attr.map_name, map_name) < 0)
+		return -EINVAL;
+#else
 	strncpy(attr.map_name, map_name, sizeof(attr.map_name));
+#endif
 	attr.key_size = key_size;
 	attr.value_size = value_size;
 	attr.max_entries = max_entries;
diff --git a/tools/lib/bpf/strset.c b/tools/lib/bpf/strset.c
index 2464bcbd04e0..ace73c6b3d62 100644
--- a/tools/lib/bpf/strset.c
+++ b/tools/lib/bpf/strset.c
@@ -107,6 +107,41 @@ static void *strset_add_str_mem(struct strset *set, size_t add_sz)
 			      set->strs_data_len, set->strs_data_max_len, add_sz);
 }
 
+static long strset_str_append(struct strset *set, const char *s)
+{
+	uintptr_t old_data = (uintptr_t)set->strs_data;
+	size_t old_data_len = set->strs_data_len;
+	uintptr_t old_s = (uintptr_t)s;
+	long len = strlen(s) + 1;
+	void *p;
+
+	/*
+	 * Stage a copy of 's' (with its NUL) past the end of set->strs_data;
+	 * returns the copied size or -ENOMEM. Hashmap keys are always offsets
+	 * within set->strs_data, so an "outside" string must be appended
+	 * before it can even be looked up. Until set->strs_data_len is
+	 * incremented the copy is just garbage to the rest of the code; if
+	 * the string turns out unique, it is already in place and only a
+	 * set->strs_data_len increment away from being usable.
+	 */
+	p = strset_add_str_mem(set, len);
+	if (!p)
+		return -ENOMEM;
+
+	/*
+	 * Growing the buffer may have moved set->strs_data. If 's' pointed
+	 * into the old buffer it is now dangling, so rebase it onto the new
+	 * buffer (same offset) before the copy.
+	 */
+	if (old_data && old_data != (uintptr_t)set->strs_data &&
+	    old_s >= old_data && old_s < old_data + old_data_len)
+		s = set->strs_data + (old_s - old_data);
+
+	memcpy(p, s, len);
+
+	return len;
+}
+
 /* Find string offset that corresponds to a given string *s*.
  * Returns:
  *   - >0 offset into string data, if string is found;
@@ -116,16 +151,12 @@ static void *strset_add_str_mem(struct strset *set, size_t add_sz)
 int strset__find_str(struct strset *set, const char *s)
 {
 	long old_off, new_off, len;
-	void *p;
 
-	/* see strset__add_str() for why we do this */
-	len = strlen(s) + 1;
-	p = strset_add_str_mem(set, len);
-	if (!p)
-		return -ENOMEM;
+	len = strset_str_append(set, s);
+	if (len < 0)
+		return len;
 
 	new_off = set->strs_data_len;
-	memcpy(p, s, len);
 
 	if (hashmap__find(set->strs_hash, new_off, &old_off))
 		return old_off;
@@ -142,24 +173,13 @@ int strset__find_str(struct strset *set, const char *s)
 int strset__add_str(struct strset *set, const char *s)
 {
 	long old_off, new_off, len;
-	void *p;
 	int err;
 
-	/* Hashmap keys are always offsets within set->strs_data, so to even
-	 * look up some string from the "outside", we need to first append it
-	 * at the end, so that it can be addressed with an offset. Luckily,
-	 * until set->strs_data_len is incremented, that string is just a piece
-	 * of garbage for the rest of the code, so no harm, no foul. On the
-	 * other hand, if the string is unique, it's already appended and
-	 * ready to be used, only a simple set->strs_data_len increment away.
-	 */
-	len = strlen(s) + 1;
-	p = strset_add_str_mem(set, len);
-	if (!p)
-		return -ENOMEM;
+	len = strset_str_append(set, s);
+	if (len < 0)
+		return len;
 
 	new_off = set->strs_data_len;
-	memcpy(p, s, len);
 
 	/* Now attempt to add the string, but only if the string with the same
 	 * contents doesn't exist already (HASHMAP_ADD strategy). If such
diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
index e3710933fd52..57fb82bb81b5 100644
--- a/tools/lib/bpf/usdt.c
+++ b/tools/lib/bpf/usdt.c
@@ -468,10 +468,10 @@ static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, siz
 
 static int parse_vma_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt)
 {
-	char path[PATH_MAX], line[PATH_MAX], mode[16];
+	char path[PATH_MAX], line[4096], mode[16];
 	size_t seg_start, seg_end, seg_off;
 	struct elf_seg *seg;
-	int tmp_pid, i, err;
+	int tmp_pid, n, i, err;
 	FILE *f;
 
 	*seg_cnt = 0;
@@ -480,8 +480,13 @@ static int parse_vma_segs(int pid, const char *lib_path, struct elf_seg **segs,
 	 * /proc/<pid>/root/<path>. They will be reported as just /<path> in
 	 * /proc/<pid>/maps.
 	 */
-	if (sscanf(lib_path, "/proc/%d/root%s", &tmp_pid, path) == 2 && pid == tmp_pid)
+	/* sscanf()'s return value excludes %n, so preset n to detect whether it was stored. */
+	n = 0;
+	if (sscanf(lib_path, "/proc/%d/root%n", &tmp_pid, &n) == 1 &&
+	    n > 0 && pid == tmp_pid && lib_path[n] == '/') {
+		libbpf_strlcpy(path, lib_path + n, sizeof(path));
 		goto proceed;
+	}
 
 	if (!realpath(lib_path, path)) {
 		pr_warn("usdt: failed to get absolute path of '%s' (err %s), using path as is...\n",
@@ -504,8 +509,11 @@ proceed:
 	 * 7f5c6f5d1000-7f5c6f5d3000 rw-p 001c7000 08:04 21238613      /usr/lib64/libc-2.17.so
 	 * 7f5c6f5d3000-7f5c6f5d8000 rw-p 00000000 00:00 0
 	 * 7f5c6f5d8000-7f5c6f5d9000 r-xp 00000000 103:01 362990598    /data/users/andriin/linux/tools/bpf/usdt/libhello_usdt.so
+	 *
+	 * Some VMA names can be longer than the local buffer. Bound the
+	 * writes, but still consume the rest of the line.
 	 */
-	while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n",
+	while (fscanf(f, "%zx-%zx %15s %zx %*s %*d%4095[^\n]%*[^\n]\n",
 		      &seg_start, &seg_end, mode, &seg_off, line) == 5) {
 		void *tmp;
 
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index bfdc5518ecc8..986a6389186b 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -21,7 +21,6 @@ test_lirc_mode2_user
 flow_dissector_load
 test_tcpnotify_user
 test_libbpf
-xdping
 test_cpp
 *.d
 *.subskel.h
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 6ef6872adbc3..b642ee489ea6 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -44,6 +44,12 @@ SKIP_LLVM	?=
 SKIP_LIBBFD	?=
 SKIP_CRYPTO	?=
 
+# When BPF_STRICT_BUILD is 1, any BPF object, skeleton, test object, or
+# benchmark compilation failure is fatal. Set to 0 to tolerate failures
+# and continue building the remaining tests.
+BPF_STRICT_BUILD ?= 1
+PERMISSIVE := $(filter 0,$(BPF_STRICT_BUILD))
+
 ifeq ($(srctree),)
 srctree := $(patsubst %/,%,$(dir $(CURDIR)))
 srctree := $(patsubst %/,%,$(dir $(srctree)))
@@ -51,19 +57,20 @@ srctree := $(patsubst %/,%,$(dir $(srctree)))
 srctree := $(patsubst %/,%,$(dir $(srctree)))
 endif
 
-CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11				\
+COMMON_CFLAGS = -g $(OPT_FLAGS) -rdynamic -std=gnu11				\
 	  -Wall -Werror -fno-omit-frame-pointer				\
 	  -Wno-unused-but-set-variable					\
 	  $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS)			\
 	  -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR)		\
-	  -I$(TOOLSINCDIR) -I$(TOOLSARCHINCDIR) -I$(APIDIR) -I$(OUTPUT)
+	  -I$(TOOLSINCDIR) -I$(TOOLSARCHINCDIR) -I$(APIDIR) -I$(OUTPUT)	\
+	  -I$(CURDIR)/libarena/include
 LDFLAGS += $(SAN_LDFLAGS)
 LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread
 
 PCAP_CFLAGS	:= $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1")
 PCAP_LIBS	:= $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null)
 LDLIBS += $(PCAP_LIBS)
-CFLAGS += $(PCAP_CFLAGS)
+CFLAGS += $(COMMON_CFLAGS) $(PCAP_CFLAGS)
 
 # Some utility functions use LLVM libraries
 jit_disasm_helpers.c-CFLAGS = $(LLVM_CFLAGS)
@@ -78,6 +85,12 @@ ifneq ($(shell $(CLANG) --target=bpf -mcpu=help 2>&1 | grep 'v4'),)
 CLANG_CPUV4 := 1
 endif
 
+# Check whether clang supports BPF address sanitizer (requires LLVM 22+)
+CLANG_HAS_ARENA_ASAN := $(shell echo 'int x;' | \
+	$(CLANG) --target=bpf -fsanitize=kernel-address \
+	-mllvm -asan-shadow-addr-space=1 \
+	-x c -c - -o /dev/null 2>/dev/null && echo 1)
+
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \
 	test_sockmap \
@@ -111,7 +124,6 @@ TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c)
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
 	test_lirc_mode2.sh \
-	test_xdping.sh \
 	test_bpftool_build.sh \
 	test_doc_build.sh \
 	test_xsk.sh \
@@ -134,7 +146,6 @@ TEST_GEN_PROGS_EXTENDED = \
 	xdp_features \
 	xdp_hw_metadata \
 	xdp_synproxy \
-	xdping \
 	xskxceiver
 
 TEST_GEN_FILES += $(TEST_KMODS) liburandom_read.so urandom_read sign-file uprobe_multi
@@ -153,12 +164,13 @@ override define CLEAN
 	$(Q)$(RM) -r $(TEST_KMODS)
 	$(Q)$(RM) -r $(EXTRA_CLEAN)
 	$(Q)$(MAKE) -C test_kmods clean
+	$(Q)$(MAKE) -C libarena clean
 	$(Q)$(MAKE) docs-clean
 endef
 
 include ../lib.mk
 
-NON_CHECK_FEAT_TARGETS := clean docs-clean
+NON_CHECK_FEAT_TARGETS := clean docs-clean emit_tests
 CHECK_FEAT := $(filter-out $(NON_CHECK_FEAT_TARGETS),$(or $(MAKECMDGOALS), "none"))
 ifneq ($(CHECK_FEAT),)
 FEATURE_USER := .selftests
@@ -182,8 +194,15 @@ ifeq ($(feature-llvm),1)
   LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets
   # both llvm-config and lib.mk add -D_GNU_SOURCE, which ends up as conflict
   LLVM_CFLAGS  += $(filter-out -D_GNU_SOURCE,$(shell $(LLVM_CONFIG) --cflags))
-  # Prefer linking statically if it's available, otherwise fallback to shared
-  ifeq ($(shell $(LLVM_CONFIG) --link-static --libs >/dev/null 2>&1 && echo static),static)
+  # Cross compilation must use dynamic linking to avoid unresolved library
+  # dependencies. For native build, prefer linking statically if it's
+  # available, otherwise fallback to shared.
+  ifneq ($(ARCH), $(HOSTARCH))
+    LLVM_LINK_STATIC :=
+  else
+    LLVM_LINK_STATIC := $(shell $(LLVM_CONFIG) --link-static --libs >/dev/null 2>&1 && echo y)
+  endif
+  ifeq ($(LLVM_LINK_STATIC),y)
     LLVM_LDLIBS  += $(shell $(LLVM_CONFIG) --link-static --libs $(LLVM_CONFIG_LIB_COMPONENTS))
     LLVM_LDLIBS  += $(filter-out -lxml2,$(shell $(LLVM_CONFIG) --link-static --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)))
     LLVM_LDLIBS  += -lstdc++
@@ -255,7 +274,7 @@ endif
 $(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c liburandom_read.map
 	$(call msg,LIB,,$@)
 	$(Q)$(CLANG) $(CLANG_TARGET_ARCH) \
-		     $(filter-out -static,$(CFLAGS) $(LDFLAGS)) \
+		     $(filter-out -static,$(COMMON_CFLAGS) $(LDFLAGS)) \
 		     $(filter %.c,$^) $(filter-out -static,$(LDLIBS)) \
 		     -Wno-unused-command-line-argument \
 		     -fuse-ld=$(LLD) -Wl,-znoseparate-code -Wl,--build-id=sha1 \
@@ -265,7 +284,7 @@ $(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c liburandom
 $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_read.so
 	$(call msg,BINARY,,$@)
 	$(Q)$(CLANG) $(CLANG_TARGET_ARCH) \
-		     $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \
+		     $(filter-out -static,$(COMMON_CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \
 		     -Wno-unused-command-line-argument \
 		     -lurandom_read $(filter-out -static,$(LDLIBS)) -L$(OUTPUT) \
 		     -fuse-ld=$(LLD) -Wl,-znoseparate-code -Wl,--build-id=sha1 \
@@ -284,13 +303,15 @@ $(OUTPUT)/sign-file: ../../../../scripts/sign-file.c
 # subst() turns the rule into a pattern matching rule
 $(addprefix test_kmods/,$(subst .ko,%ko,$(TEST_KMODS))): $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard test_kmods/Makefile test_kmods/*.[ch])
 	$(Q)$(RM) test_kmods/*.ko test_kmods/*.mod.o # force re-compilation
-	$(Q)$(MAKE) $(submake_extras) -C test_kmods	\
-		RESOLVE_BTFIDS=$(RESOLVE_BTFIDS)	\
+	$(Q)$(MAKE) $(submake_extras) -C test_kmods				\
+		$(if $(O),O=$(abspath $(O)))					\
+		$(if $(KBUILD_OUTPUT),KBUILD_OUTPUT=$(abspath $(KBUILD_OUTPUT)))\
+		RESOLVE_BTFIDS=$(RESOLVE_BTFIDS)				\
 		EXTRA_CFLAGS='' EXTRA_LDFLAGS=''
 
 $(TEST_KMOD_TARGETS): $(addprefix test_kmods/,$(TEST_KMODS))
 	$(call msg,MOD,,$@)
-	$(Q)cp test_kmods/$(@F) $@
+	$(Q)$(if $(PERMISSIVE),if [ -f test_kmods/$(@F) ]; then )cp test_kmods/$(@F) $@$(if $(PERMISSIVE),; fi)
 
 
 DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool
@@ -320,7 +341,6 @@ $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELP
 $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS)
 $(OUTPUT)/test_tag: $(TESTING_HELPERS)
 $(OUTPUT)/test_lirc_mode2_user: $(TESTING_HELPERS)
-$(OUTPUT)/xdping: $(TESTING_HELPERS)
 $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS)
 $(OUTPUT)/test_maps: $(TESTING_HELPERS)
 $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS)
@@ -446,6 +466,7 @@ endif
 CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
 BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN)	\
 	     -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR)			\
+	     -I$(CURDIR)/libarena/include				\
 	     -I$(abspath $(OUTPUT)/../usr/include)			\
 	     -std=gnu11		 					\
 	     -fno-strict-aliasing 					\
@@ -471,22 +492,26 @@ $(OUTPUT)/cgroup_getset_retval_hooks.o: cgroup_getset_retval_hooks.h
 # $4 - binary name
 define CLANG_BPF_BUILD_RULE
 	$(call msg,CLNG-BPF,$4,$2)
-	$(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v3 -o $2
+	$(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v3 -o $2 $(if $(PERMISSIVE),|| \
+		($(RM) $2; printf '  %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2))
 endef
 # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32
 define CLANG_NOALU32_BPF_BUILD_RULE
 	$(call msg,CLNG-BPF,$4,$2)
-	$(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v2 -o $2
+	$(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v2 -o $2 $(if $(PERMISSIVE),|| \
+		($(RM) $2; printf '  %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2))
 endef
 # Similar to CLANG_BPF_BUILD_RULE, but with cpu-v4
 define CLANG_CPUV4_BPF_BUILD_RULE
 	$(call msg,CLNG-BPF,$4,$2)
-	$(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v4 -o $2
+	$(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v4 -o $2 $(if $(PERMISSIVE),|| \
+		($(RM) $2; printf '  %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2))
 endef
 # Build BPF object using GCC
 define GCC_BPF_BUILD_RULE
 	$(call msg,GCC-BPF,$4,$2)
-	$(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2
+	$(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 $(if $(PERMISSIVE),|| \
+		($(RM) $2; printf '  %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2))
 endef
 
 SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c
@@ -494,7 +519,10 @@ SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c
 LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 		linked_vars.skel.h linked_maps.skel.h 			\
 		test_subskeleton.skel.h test_subskeleton_lib.skel.h	\
-		test_usdt.skel.h
+		test_usdt.skel.h tracing_multi.skel.h			\
+		tracing_multi_module.skel.h				\
+		tracing_multi_intersect.skel.h				\
+		tracing_multi_session.skel.h
 
 LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c 	\
 	core_kern.c core_kern_overflow.c test_ringbuf.c			\
@@ -520,11 +548,16 @@ test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o
 xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o
 xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o
 xdp_features.skel.h-deps := xdp_features.bpf.o
+tracing_multi.skel.h-deps := tracing_multi_attach.bpf.o tracing_multi_check.bpf.o
+tracing_multi_module.skel.h-deps := tracing_multi_attach_module.bpf.o tracing_multi_check.bpf.o
+tracing_multi_intersect.skel.h-deps := tracing_multi_intersect_attach.bpf.o tracing_multi_check.bpf.o
+tracing_multi_session.skel.h-deps := tracing_multi_session_attach.bpf.o tracing_multi_check.bpf.o
 
 LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps))
 LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS))
 
 HEADERS_FOR_BPF_OBJS := $(wildcard $(BPFDIR)/*.bpf.h)		\
+			$(wildcard $(CURDIR)/libarena/include/*.[ch])	\
 			$(addprefix $(BPFDIR)/,	bpf_core_read.h	\
 			                        bpf_endian.h	\
 						bpf_helpers.h	\
@@ -569,6 +602,12 @@ endef
 # $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc)
 define DEFINE_TEST_RUNNER_RULES
 
+# Permissive build behaviour (skip-on-failure compile, partial-link) only
+# applies to test_progs and its flavors; runners that use strong cross-object
+# references (e.g. test_maps) keep strict semantics even when permissive.
+# The check is inlined per-runner so $1 is substituted at $(call) time and
+# the result is baked into each rule's recipe.
+
 ifeq ($($(TRUNNER_OUTPUT)-dir),)
 $(TRUNNER_OUTPUT)-dir := y
 $(TRUNNER_OUTPUT):
@@ -592,47 +631,81 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o:				\
 					  $$($$<-$2-CFLAGS),$(TRUNNER_BINARY))
 
 $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT)
-	$$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@)
-	$(Q)$$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$<
-	$(Q)$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o)
-	$(Q)$$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o)
-	$(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o)
-	$(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$@
-	$(Q)$$(BPFTOOL) gen subskeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$(@:.skel.h=.subskel.h)
-	$(Q)rm -f $$(<:.o=.linked1.o) $$(<:.o=.linked2.o) $$(<:.o=.linked3.o)
+	$(Q)$(if $(PERMISSIVE),if [ ! -f $$< ]; then			\
+		$$(RM) $$@ $$(@:.skel.h=.subskel.h);			\
+		printf '  %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \
+		exit 0;							\
+	fi;)								\
+	printf '  %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY)] $$(notdir $$@)' 1>&2; \
+	$$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$< &&		\
+	$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o) && \
+	$$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o) && \
+	diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) &&		\
+	$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$@ && \
+	$$(BPFTOOL) gen subskeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$(@:.skel.h=.subskel.h) $(if $(PERMISSIVE),|| { \
+		$$(RM) $$@ $$(@:.skel.h=.subskel.h); \
+		printf '  %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \
+	}) && \
+	rm -f $$(<:.o=.linked1.o) $$(<:.o=.linked2.o) $$(<:.o=.linked3.o)
 
 $(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT)
-	$$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@)
-	$(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$<
-	$(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o)
-	$(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o)
-	$(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o)
-	$(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@
-	$(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o)
+	$(Q)$(if $(PERMISSIVE),if [ ! -f $$< ]; then			\
+		$$(RM) $$@;						\
+		printf '  %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \
+		exit 0;							\
+	fi;)								\
+	printf '  %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY)] $$(notdir $$@)' 1>&2; \
+	$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< &&		\
+	$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) && \
+	$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) && \
+	diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) &&		\
+	$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(if $(PERMISSIVE),|| { \
+		$$(RM) $$@; \
+		printf '  %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \
+	}) && \
+	rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o)
 
 $(TRUNNER_BPF_LSKELS_SIGNED): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT)
-	$$(call msg,GEN-SKEL,$(TRUNNER_BINARY) (signed),$$@)
-	$(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$<
-	$(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o)
-	$(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o)
-	$(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o)
-	$(Q)$$(BPFTOOL) gen skeleton $(LSKEL_SIGN) $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@
-	$(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o)
+	$(Q)$(if $(PERMISSIVE),if [ ! -f $$< ]; then			\
+		$$(RM) $$@;						\
+		printf '  %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \
+		exit 0;							\
+	fi;)								\
+	printf '  %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY) (signed)] $$(notdir $$@)' 1>&2; \
+	$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< &&		\
+	$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) && \
+	$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) && \
+	diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) &&		\
+	$$(BPFTOOL) gen skeleton $(LSKEL_SIGN) $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(if $(PERMISSIVE),|| { \
+		$$(RM) $$@; \
+		printf '  %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \
+	}) && \
+	rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o)
 
 $(LINKED_BPF_OBJS): %: $(TRUNNER_OUTPUT)/%
 
 # .SECONDEXPANSION here allows to correctly expand %-deps variables as prerequisites
 .SECONDEXPANSION:
 $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_OUTPUT)/%: $$$$(%-deps) $(BPFTOOL) | $(TRUNNER_OUTPUT)
-	$$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.bpf.o))
-	$(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps))
-	$(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o)
-	$(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked3.o) $$(@:.skel.h=.linked2.o)
-	$(Q)diff $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o)
-	$$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@)
-	$(Q)$$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@
-	$(Q)$$(BPFTOOL) gen subskeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$(@:.skel.h=.subskel.h)
-	$(Q)rm -f $$(@:.skel.h=.linked1.o) $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o)
+	$(Q)$(if $(PERMISSIVE),for f in $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)); do \
+		if [ ! -f $$$$f ]; then						\
+			$$(RM) $$@ $$(@:.skel.h=.subskel.h);		\
+			printf '  %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \
+			exit 0;							\
+		fi;								\
+	done;)									\
+	printf '  %-12s %s\n' 'LINK-BPF' '[$(TRUNNER_BINARY)] $$(notdir $$(@:.skel.h=.bpf.o))' 1>&2; \
+	$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) && \
+	$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) && \
+	$$(BPFTOOL) gen object $$(@:.skel.h=.linked3.o) $$(@:.skel.h=.linked2.o) && \
+	diff $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) &&	\
+	printf '  %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY)] $$(notdir $$@)' 1>&2 && \
+	$$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ && \
+	$$(BPFTOOL) gen subskeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$(@:.skel.h=.subskel.h) $(if $(PERMISSIVE),|| { \
+		$$(RM) $$@ $$(@:.skel.h=.subskel.h);			\
+		printf '  %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \
+	}) &&									\
+	rm -f $$(@:.skel.h=.linked1.o) $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o)
 
 # When the compiler generates a %.d file, only skel basenames (not
 # full paths) are specified as prerequisites for corresponding %.o
@@ -664,22 +737,25 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o:			\
 		      $(TRUNNER_TESTS_DIR)/%.c				\
 		      | $(TRUNNER_OUTPUT)/%.test.d
 	$$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@)
-	$(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F)
+	$(Q)(cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F)) $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),|| \
+		($(RM) $$@; printf '  %-12s %s\n' 'SKIP-TEST' '$$(notdir $$@)' 1>&2)))
 	$$(if $$(TEST_NEEDS_BTFIDS),						\
-		$$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@)			\
+		$(Q)if [ -f $$@ ]; then						\
+		$(if $(filter 1,$(V)),true,printf '  %-8s%s %s\n' "BTFIDS" " [$(TRUNNER_BINARY)]" "$$(notdir $$@)"); \
 		$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@;	\
-		$(RESOLVE_BTFIDS) --patch_btfids $$@.BTF_ids $$@)
+		$(RESOLVE_BTFIDS) --patch_btfids $$@.BTF_ids $$@;		\
+		fi)
 
 $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d:			\
 			    $(TRUNNER_TESTS_DIR)/%.c			\
 			    $(TRUNNER_EXTRA_HDRS)			\
+			    $$(BPFOBJ) | $(TRUNNER_OUTPUT)		\
 			    $(TRUNNER_BPF_SKELS)			\
 			    $(TRUNNER_BPF_LSKELS)			\
 			    $(TRUNNER_BPF_LSKELS_SIGNED)		\
-			    $(TRUNNER_BPF_SKELS_LINKED)			\
-			    $$(BPFOBJ) | $(TRUNNER_OUTPUT)
+			    $(TRUNNER_BPF_SKELS_LINKED)
 
-ifeq ($(filter clean docs-clean,$(MAKECMDGOALS)),)
+ifeq ($(filter clean docs-clean emit_tests,$(MAKECMDGOALS)),)
 include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d))
 endif
 
@@ -705,20 +781,21 @@ $(TRUNNER_LIB_OBJS): $(TRUNNER_OUTPUT)/%.o:$(TOOLSDIR)/lib/%.c
 $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT)
 ifneq ($2:$(OUTPUT),:$(shell pwd))
 	$$(call msg,EXT-COPY,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES))
-	$(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/
+	$(Q)rsync -aq $(if $(PERMISSIVE),--ignore-missing-args) $$^ $(TRUNNER_OUTPUT)/
 endif
 
 # some X.test.o files have runtime dependencies on Y.bpf.o files
 $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS)
 
-$(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS)			\
+$(OUTPUT)/$(TRUNNER_BINARY): $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),$$(wildcard $(TRUNNER_TEST_OBJS)),$(TRUNNER_TEST_OBJS)),$(TRUNNER_TEST_OBJS))	\
 			     $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ)		\
 			     $(TRUNNER_LIB_OBJS)			\
 			     $(TRUNNER_BPFTOOL)				\
 			     $(OUTPUT)/veristat				\
-			     | $(TRUNNER_BINARY)-extras
+			     | $(TRUNNER_BINARY)-extras			\
+			     $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),$(TRUNNER_TEST_OBJS)))
 	$$(call msg,BINARY,,$$@)
-	$(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) $$(LLVM_LDLIBS) $$(LDFLAGS) $$(LLVM_LDFLAGS) -o $$@
+	$(Q)$$(CC) $$(CFLAGS) $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),$$(filter %.a %.o,$$(wildcard $(TRUNNER_TEST_OBJS)) $$(filter-out $(TRUNNER_TEST_OBJS),$$^)),$$(filter %.a %.o,$$^)),$$(filter %.a %.o,$$^)) $$(LDLIBS) $$(LLVM_LDLIBS) $$(LDFLAGS) $$(LLVM_LDFLAGS) -o $$@
 	$(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \
 		   $(OUTPUT)/$(if $2,$2/)bpftool
 
@@ -740,6 +817,37 @@ $(VERIFY_SIG_HDR): $(VERIFICATION_CERT)
 	 echo "};"; \
 	 echo "unsigned int test_progs_verification_cert_len = $$(wc -c < $<);") > $@
 
+LIBARENA_MAKE_ARGS = \
+		BPFTOOL="$(BPFTOOL)" \
+		INCLUDE_DIR="$(INCLUDE_DIR)" \
+		LIBBPF_INCLUDE="$(HOST_INCLUDE_DIR)" \
+		BPFOBJ="$(BPFOBJ)" \
+		LDLIBS="$(LDLIBS) -lzstd" \
+		CLANG="$(CLANG)" \
+		BPF_CFLAGS="$(BPF_CFLAGS) $(CLANG_CFLAGS)" \
+		BPF_TARGET_ENDIAN="$(BPF_TARGET_ENDIAN)" \
+		Q="$(Q)"
+
+LIBARENA_BPF_DEPS := $(wildcard libarena/Makefile		\
+				 libarena/include/*		\
+				 libarena/include/libarena/*	\
+				 libarena/src/*			\
+				 libarena/selftests/*		\
+				 libarena/*.bpf.o)
+
+LIBARENA_SKEL := libarena/libarena.skel.h
+
+$(LIBARENA_SKEL): $(INCLUDE_DIR)/vmlinux.h $(BPFOBJ) $(LIBARENA_BPF_DEPS)
+	+$(MAKE) -C libarena libarena.skel.h $(LIBARENA_MAKE_ARGS)
+
+ifneq ($(CLANG_HAS_ARENA_ASAN),)
+LIBARENA_ASAN_SKEL := libarena/libarena_asan.skel.h
+CFLAGS += -DHAS_BPF_ARENA_ASAN
+
+$(LIBARENA_ASAN_SKEL): $(INCLUDE_DIR)/vmlinux.h $(BPFOBJ) $(LIBARENA_BPF_DEPS)
+	+$(MAKE) -C libarena libarena_asan.skel.h $(LIBARENA_MAKE_ARGS)
+endif
+
 # Define test_progs test runner.
 TRUNNER_TESTS_DIR := prog_tests
 TRUNNER_BPF_PROGS_DIR := progs
@@ -764,7 +872,9 @@ TRUNNER_EXTRA_SOURCES := test_progs.c		\
 			 flow_dissector_load.h	\
 			 ip_check_defrag_frags.h	\
 			 bpftool_helpers.c	\
-			 usdt_1.c usdt_2.c
+			 usdt_1.c usdt_2.c	\
+			 $(LIBARENA_SKEL)	\
+			 $(LIBARENA_ASAN_SKEL)
 TRUNNER_LIB_SOURCES := find_bit.c
 TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read				\
 		       $(OUTPUT)/liburandom_read.so			\
@@ -849,7 +959,8 @@ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ)
 # Benchmark runner
 $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ)
 	$(call msg,CC,,$@)
-	$(Q)$(CC) $(CFLAGS) -O2 -c $(filter %.c,$^) $(LDLIBS) -o $@
+	$(Q)$(CC) $(CFLAGS) -O2 -c $(filter %.c,$^) $(LDLIBS) -o $@ $(if $(PERMISSIVE),|| \
+		($(RM) $@; printf '  %-12s %s\n' 'SKIP-BENCH' '$(notdir $@)' 1>&2))
 $(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h
 $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h
 $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
@@ -866,6 +977,9 @@ $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h
 $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h
 $(OUTPUT)/bench_sockmap.o: $(OUTPUT)/bench_sockmap_prog.skel.h
 $(OUTPUT)/bench_lpm_trie_map.o: $(OUTPUT)/lpm_trie_bench.skel.h $(OUTPUT)/lpm_trie_map.skel.h
+$(OUTPUT)/bench_bpf_nop.o: $(OUTPUT)/bpf_nop_bench.skel.h bench_bpf_timing.h
+$(OUTPUT)/bench_xdp_lb.o: $(OUTPUT)/xdp_lb_bench.skel.h bench_bpf_timing.h
+$(OUTPUT)/bench_bpf_timing.o: bench_bpf_timing.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -888,11 +1002,15 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_bpf_crypto.o \
 		 $(OUTPUT)/bench_sockmap.o \
 		 $(OUTPUT)/bench_lpm_trie_map.o \
+		 $(OUTPUT)/bench_bpf_timing.o \
+		 $(OUTPUT)/bench_bpf_nop.o \
+		 $(OUTPUT)/bench_xdp_lb.o \
 		 $(OUTPUT)/usdt_1.o \
 		 $(OUTPUT)/usdt_2.o \
 		 #
 	$(call msg,BINARY,,$@)
-	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
+	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ $(if $(PERMISSIVE),|| \
+		($(RM) $@; printf '  %-12s %s\n' 'SKIP-LINK' '$(notdir $@) (some benchmarks may have been skipped)' 1>&2))
 
 # This works around GCC warning about snprintf truncating strings like:
 #
@@ -925,11 +1043,28 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)			\
 # Delete partially updated (corrupted) files on error
 .DELETE_ON_ERROR:
 
+# When permissive, tell rsync to ignore missing source arguments so that
+# partial builds do not abort installation.
+ifneq ($(PERMISSIVE),)
+override define INSTALL_SINGLE_RULE
+	$(if $(INSTALL_LIST),@mkdir -p $(INSTALL_PATH))
+	$(if $(INSTALL_LIST),rsync -a --copy-unsafe-links --ignore-missing-args $(INSTALL_LIST) $(INSTALL_PATH)/)
+endef
+endif
+
 DEFAULT_INSTALL_RULE := $(INSTALL_RULE)
 override define INSTALL_RULE
 	$(DEFAULT_INSTALL_RULE)
-	@for DIR in $(TEST_INST_SUBDIRS); do		  \
-		mkdir -p $(INSTALL_PATH)/$$DIR;   \
-		rsync -a $(OUTPUT)/$$DIR/*.bpf.o $(INSTALL_PATH)/$$DIR;\
+	@for DIR in $(TEST_INST_SUBDIRS); do				  \
+		mkdir -p $(INSTALL_PATH)/$$DIR;				  \
+		rsync -a $(if $(PERMISSIVE),--ignore-missing-args)	  \
+			$(OUTPUT)/$$DIR/*.bpf.o				  \
+			$(INSTALL_PATH)/$$DIR;				  \
 	done
 endef
+
+libarena: $(LIBARENA_SKEL)
+
+ifneq ($(CLANG_HAS_ARENA_ASAN),)
+libarena_asan: $(LIBARENA_ASAN_SKEL)
+endif
diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst
index 776fbe3cb8f9..37164322a102 100644
--- a/tools/testing/selftests/bpf/README.rst
+++ b/tools/testing/selftests/bpf/README.rst
@@ -77,7 +77,7 @@ In case of linker errors when running selftests, try using static linking:
 
 .. code-block:: console
 
-  $ LDLIBS=-static PKG_CONFIG='pkg-config --static' vmtest.sh
+  $ LDLIBS=-static EXTRA_LDFLAGS=-static PKG_CONFIG='pkg-config --static' vmtest.sh
 
 .. note:: Some distros may not support static linking.
 
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 029b3e21f438..3d9d2cd7764b 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -286,6 +286,7 @@ extern struct argp bench_trigger_batch_argp;
 extern struct argp bench_crypto_argp;
 extern struct argp bench_sockmap_argp;
 extern struct argp bench_lpm_trie_map_argp;
+extern struct argp bench_xdp_lb_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -302,6 +303,7 @@ static const struct argp_child bench_parsers[] = {
 	{ &bench_crypto_argp, 0, "bpf crypto benchmark", 0 },
 	{ &bench_sockmap_argp, 0, "bpf sockmap benchmark", 0 },
 	{ &bench_lpm_trie_map_argp, 0, "LPM trie map benchmark", 0 },
+	{ &bench_xdp_lb_argp, 0, "XDP load-balancer benchmark", 0 },
 	{},
 };
 
@@ -558,13 +560,16 @@ extern const struct bench bench_bpf_loop;
 extern const struct bench bench_strncmp_no_helper;
 extern const struct bench bench_strncmp_helper;
 extern const struct bench bench_bpf_hashmap_full_update;
+extern const struct bench bench_bpf_rhashmap_full_update;
 extern const struct bench bench_local_storage_cache_seq_get;
 extern const struct bench bench_local_storage_cache_interleaved_get;
 extern const struct bench bench_local_storage_cache_hashmap_control;
 extern const struct bench bench_local_storage_tasks_trace;
 extern const struct bench bench_bpf_hashmap_lookup;
+extern const struct bench bench_bpf_rhashmap_lookup;
 extern const struct bench bench_local_storage_create;
 extern const struct bench bench_htab_mem;
+extern const struct bench bench_rhtab_mem;
 extern const struct bench bench_crypto_encrypt;
 extern const struct bench bench_crypto_decrypt;
 extern const struct bench bench_sockmap;
@@ -575,6 +580,8 @@ extern const struct bench bench_lpm_trie_insert;
 extern const struct bench bench_lpm_trie_update;
 extern const struct bench bench_lpm_trie_delete;
 extern const struct bench bench_lpm_trie_free;
+extern const struct bench bench_bpf_nop;
+extern const struct bench bench_xdp_lb;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -636,13 +643,16 @@ static const struct bench *benchs[] = {
 	&bench_strncmp_no_helper,
 	&bench_strncmp_helper,
 	&bench_bpf_hashmap_full_update,
+	&bench_bpf_rhashmap_full_update,
 	&bench_local_storage_cache_seq_get,
 	&bench_local_storage_cache_interleaved_get,
 	&bench_local_storage_cache_hashmap_control,
 	&bench_local_storage_tasks_trace,
 	&bench_bpf_hashmap_lookup,
+	&bench_bpf_rhashmap_lookup,
 	&bench_local_storage_create,
 	&bench_htab_mem,
+	&bench_rhtab_mem,
 	&bench_crypto_encrypt,
 	&bench_crypto_decrypt,
 	&bench_sockmap,
@@ -653,6 +663,8 @@ static const struct bench *benchs[] = {
 	&bench_lpm_trie_update,
 	&bench_lpm_trie_delete,
 	&bench_lpm_trie_free,
+	&bench_bpf_nop,
+	&bench_xdp_lb,
 };
 
 static void find_benchmark(void)
@@ -741,6 +753,13 @@ static void setup_benchmark(void)
 static pthread_mutex_t bench_done_mtx = PTHREAD_MUTEX_INITIALIZER;
 static pthread_cond_t bench_done = PTHREAD_COND_INITIALIZER;
 
+void bench_force_done(void)
+{
+	pthread_mutex_lock(&bench_done_mtx);
+	pthread_cond_signal(&bench_done);
+	pthread_mutex_unlock(&bench_done_mtx);
+}
+
 static void collect_measurements(long delta_ns) {
 	int iter = state.res_cnt++;
 	struct bench_res *res = &state.results[iter];
@@ -750,11 +769,8 @@ static void collect_measurements(long delta_ns) {
 	if (bench->report_progress)
 		bench->report_progress(iter, res, delta_ns);
 
-	if (iter == env.duration_sec + env.warmup_sec) {
-		pthread_mutex_lock(&bench_done_mtx);
-		pthread_cond_signal(&bench_done);
-		pthread_mutex_unlock(&bench_done_mtx);
-	}
+	if (iter == env.duration_sec + env.warmup_sec)
+		bench_force_done();
 }
 
 int main(int argc, char **argv)
diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
index 7cf21936e7ed..89a3fc72f70e 100644
--- a/tools/testing/selftests/bpf/bench.h
+++ b/tools/testing/selftests/bpf/bench.h
@@ -70,6 +70,7 @@ extern struct env env;
 extern const struct bench *bench;
 
 void setup_libbpf(void);
+void bench_force_done(void);
 void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns);
 void hits_drops_report_final(struct bench_res res[], int res_cnt);
 void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns);
diff --git a/tools/testing/selftests/bpf/bench_bpf_timing.h b/tools/testing/selftests/bpf/bench_bpf_timing.h
new file mode 100644
index 000000000000..6ef23b6d6639
--- /dev/null
+++ b/tools/testing/selftests/bpf/bench_bpf_timing.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#ifndef __BENCH_BPF_TIMING_H__
+#define __BENCH_BPF_TIMING_H__
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include "bench.h"
+
+#ifndef BENCH_NR_SAMPLES
+#define BENCH_NR_SAMPLES	4096
+#endif
+#ifndef BENCH_NR_CPUS
+#define BENCH_NR_CPUS		256
+#endif
+
+typedef void (*bpf_bench_run_fn)(void *ctx);
+
+struct bpf_bench_timing {
+	__u64 (*samples)[BENCH_NR_SAMPLES];	/* skel->bss->timing_samples */
+	__u32 *idx;				/* skel->bss->timing_idx */
+	volatile __u32 *timing_enabled;		/* &skel->bss->timing_enabled */
+	volatile __u32 *batch_iters_bss;	/* &skel->bss->batch_iters */
+	__u32 batch_iters;
+	__u32 target_samples;
+	__u32 nr_cpus;
+	int warmup_ticks;
+	bool done;
+	bool machine_readable;
+};
+
+#define BENCH_TIMING_INIT(t, skel, iters) do {				\
+	(t)->samples = (skel)->bss->timing_samples;			\
+	(t)->idx = (skel)->bss->timing_idx;				\
+	(t)->timing_enabled = &(skel)->bss->timing_enabled;		\
+	(t)->batch_iters_bss = &(skel)->bss->batch_iters;		\
+	(t)->batch_iters = (iters);					\
+	(t)->target_samples = 200;					\
+	(t)->nr_cpus = env.nr_cpus;					\
+	(t)->warmup_ticks = 0;						\
+	(t)->done = false;						\
+	(t)->machine_readable = false;					\
+} while (0)
+
+void bpf_bench_timing_measure(struct bpf_bench_timing *t, struct bench_res *res);
+void bpf_bench_timing_report(struct bpf_bench_timing *t, const char *name, const char *desc);
+void bpf_bench_calibrate(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *ctx);
+
+#endif /* __BENCH_BPF_TIMING_H__ */
diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
index ee1dc12c5e5e..7278fa860397 100644
--- a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c
@@ -34,19 +34,29 @@ static void measure(struct bench_res *res)
 {
 }
 
-static void setup(void)
+static void hashmap_full_update_setup(enum bpf_map_type map_type)
 {
 	struct bpf_link *link;
 	int map_fd, i, max_entries;
 
 	setup_libbpf();
 
-	ctx.skel = bpf_hashmap_full_update_bench__open_and_load();
+	ctx.skel = bpf_hashmap_full_update_bench__open();
 	if (!ctx.skel) {
 		fprintf(stderr, "failed to open skeleton\n");
 		exit(1);
 	}
 
+	bpf_map__set_type(ctx.skel->maps.hash_map_bench, map_type);
+	if (map_type == BPF_MAP_TYPE_RHASH)
+		bpf_map__set_map_flags(ctx.skel->maps.hash_map_bench,
+				       BPF_F_NO_PREALLOC);
+
+	if (bpf_hashmap_full_update_bench__load(ctx.skel)) {
+		fprintf(stderr, "failed to load skeleton\n");
+		exit(1);
+	}
+
 	ctx.skel->bss->nr_loops = MAX_LOOP_NUM;
 
 	link = bpf_program__attach(ctx.skel->progs.benchmark);
@@ -62,6 +72,16 @@ static void setup(void)
 		bpf_map_update_elem(map_fd, &i, &i, BPF_ANY);
 }
 
+static void setup(void)
+{
+	hashmap_full_update_setup(BPF_MAP_TYPE_HASH);
+}
+
+static void rhash_setup(void)
+{
+	hashmap_full_update_setup(BPF_MAP_TYPE_RHASH);
+}
+
 static void hashmap_report_final(struct bench_res res[], int res_cnt)
 {
 	unsigned int nr_cpus = bpf_num_possible_cpus();
@@ -87,3 +107,13 @@ const struct bench bench_bpf_hashmap_full_update = {
 	.report_progress = NULL,
 	.report_final = hashmap_report_final,
 };
+
+const struct bench bench_bpf_rhashmap_full_update = {
+	.name = "bpf-rhashmap-full-update",
+	.validate = validate,
+	.setup = rhash_setup,
+	.producer_thread = producer,
+	.measure = measure,
+	.report_progress = NULL,
+	.report_final = hashmap_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c
index 279ff1b8b5b2..5264b7b20e39 100644
--- a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c
@@ -148,9 +148,10 @@ static inline void patch_key(u32 i, u32 *key)
 	/* the rest of key is random */
 }
 
-static void setup(void)
+static void hashmap_lookup_setup(enum bpf_map_type map_type)
 {
 	struct bpf_link *link;
+	__u32 map_flags;
 	int map_fd;
 	int ret;
 	int i;
@@ -163,10 +164,15 @@ static void setup(void)
 		exit(1);
 	}
 
+	map_flags = args.map_flags;
+	if (map_type == BPF_MAP_TYPE_RHASH)
+		map_flags |= BPF_F_NO_PREALLOC;
+
+	bpf_map__set_type(ctx.skel->maps.hash_map_bench, map_type);
 	bpf_map__set_max_entries(ctx.skel->maps.hash_map_bench, args.max_entries);
 	bpf_map__set_key_size(ctx.skel->maps.hash_map_bench, args.key_size);
 	bpf_map__set_value_size(ctx.skel->maps.hash_map_bench, 8);
-	bpf_map__set_map_flags(ctx.skel->maps.hash_map_bench, args.map_flags);
+	bpf_map__set_map_flags(ctx.skel->maps.hash_map_bench, map_flags);
 
 	ctx.skel->bss->nr_entries = args.nr_entries;
 	ctx.skel->bss->nr_loops = args.nr_loops / args.nr_entries;
@@ -197,6 +203,16 @@ static void setup(void)
 	}
 }
 
+static void setup(void)
+{
+	hashmap_lookup_setup(BPF_MAP_TYPE_HASH);
+}
+
+static void rhash_setup(void)
+{
+	hashmap_lookup_setup(BPF_MAP_TYPE_RHASH);
+}
+
 static inline double events_from_time(u64 time)
 {
 	if (time)
@@ -275,3 +291,14 @@ const struct bench bench_bpf_hashmap_lookup = {
 	.report_progress = NULL,
 	.report_final = hashmap_report_final,
 };
+
+const struct bench bench_bpf_rhashmap_lookup = {
+	.name = "bpf-rhashmap-lookup",
+	.argp = &bench_hashmap_lookup_argp,
+	.validate = validate,
+	.setup = rhash_setup,
+	.producer_thread = producer,
+	.measure = measure,
+	.report_progress = NULL,
+	.report_final = hashmap_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_nop.c b/tools/testing/selftests/bpf/benchs/bench_bpf_nop.c
new file mode 100644
index 000000000000..e2d8c2ccf384
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_nop.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include "bench.h"
+#include "bench_bpf_timing.h"
+#include "bpf_nop_bench.skel.h"
+#include "bpf_util.h"
+
+static struct ctx {
+	struct bpf_nop_bench *skel;
+	struct bpf_bench_timing timing;
+	int prog_fd;
+} ctx;
+
+static void nop_validate(void)
+{
+	if (env.consumer_cnt != 0) {
+		fprintf(stderr, "benchmark doesn't support consumers\n");
+		exit(1);
+	}
+}
+
+static void nop_run_once(void *unused __always_unused)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+
+	bpf_prog_test_run_opts(ctx.prog_fd, &topts);
+}
+
+static void nop_setup(void)
+{
+	struct bpf_nop_bench *skel;
+	int err;
+
+	setup_libbpf();
+
+	skel = bpf_nop_bench__open();
+	if (!skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	err = bpf_nop_bench__load(skel);
+	if (err) {
+		fprintf(stderr, "failed to load skeleton: %s\n", strerror(-err));
+		bpf_nop_bench__destroy(skel);
+		exit(1);
+	}
+
+	ctx.skel = skel;
+	ctx.prog_fd = bpf_program__fd(skel->progs.bench_nop);
+
+	BENCH_TIMING_INIT(&ctx.timing, skel, 0);	/* batch_iters 0: calibration picks it */
+	bpf_bench_calibrate(&ctx.timing, nop_run_once, NULL);
+
+	env.duration_sec = 600;	/* NOTE(review): presumably a cap; nop_measure() can end sooner */
+}
+
+static void *nop_producer(void *input)
+{
+	while (true)
+		nop_run_once(NULL);
+
+	return NULL;
+}
+
+static void nop_measure(struct bench_res *res)
+{
+	bpf_bench_timing_measure(&ctx.timing, res);
+}
+
+static void nop_report_final(struct bench_res res[], int res_cnt)
+{
+	bpf_bench_timing_report(&ctx.timing, "bpf-nop", NULL);
+}
+
+const struct bench bench_bpf_nop = {
+	.name		= "bpf-nop",
+	.validate	= nop_validate,
+	.setup		= nop_setup,
+	.producer_thread = nop_producer,
+	.measure	= nop_measure,
+	.report_final	= nop_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c b/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c
new file mode 100644
index 000000000000..e02ad324f7bc
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "bench_bpf_timing.h"
+#include "bpf_util.h"
+
+struct timing_stats {
+	double min, max;
+	double median, p99;
+	double mean, stddev;
+	int count;
+};
+
+/*
+ * qsort() comparator: orders doubles ascending, returning -1, 0 or 1.
+ */
+static int cmp_double(const void *a, const void *b)
+{
+	const double *lhs = a;
+	const double *rhs = b;
+
+	/* (x > y) - (x < y) folds the three-way compare into one expression */
+	return (*lhs > *rhs) - (*lhs < *rhs);
+}
+
+/* Return the element at index n * pct / 100 of @sorted, clamped to the last element. */
+static double percentile(const double *sorted, int n, double pct)
+{
+	int last = n - 1;
+	int rank = (int)(n * pct / 100.0);
+
+	return sorted[rank < last ? rank : last];
+}
+
+/* Gather non-zero per-CPU samples into @out as per-iteration values, sorted ascending. */
+static int collect_samples(struct bpf_bench_timing *t, double *out, int max_out)
+{
+	unsigned int cpu, cpus = bpf_num_possible_cpus();
+	const double per_batch = t->batch_iters;
+	int filled = 0;
+
+	if (cpus > BENCH_NR_CPUS)
+		cpus = BENCH_NR_CPUS;
+
+	for (cpu = 0; cpu < cpus; cpu++) {
+		const __u64 *row = t->samples[cpu];
+		__u32 used = t->idx[cpu], i;
+
+		if (used > BENCH_NR_SAMPLES)
+			used = BENCH_NR_SAMPLES;
+
+		for (i = 0; i < used && filled < max_out; i++) {
+			__u64 v = row[i];
+
+			if (v)
+				out[filled++] = (double)v / per_batch;
+		}
+	}
+
+	qsort(out, filled, sizeof(*out), cmp_double);
+	return filled;
+}
+
+/* Trim values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] from @sorted in place; returns kept count. */
+static int filter_outliers_iqr(double *sorted, int n)
+{
+	double q1, q3, low_fence, high_fence;
+	int first = 0, past_last = n;
+
+	/* Inputs shorter than 8 elements are returned untouched. */
+	if (n < 8)
+		return n;
+
+	q1 = sorted[n / 4];
+	q3 = sorted[3 * n / 4];
+	low_fence = q1 - 1.5 * (q3 - q1);
+	high_fence = q3 + 1.5 * (q3 - q1);
+
+	for (; first < past_last && sorted[first] < low_fence; first++)
+		;
+	for (; past_last > first && sorted[past_last - 1] > high_fence; past_last--)
+		;
+	if (first != 0)
+		memmove(sorted, &sorted[first], (past_last - first) * sizeof(*sorted));
+
+	return past_last - first;
+}
+
+/* Fill @s with count, min, max, median, p99, mean and stddev of the ascending array @sorted. */
+static void compute_stats(const double *sorted, int n, struct timing_stats *s)
+{
+	double total = 0, sq_dev = 0;
+	int i;
+
+	memset(s, 0, sizeof(*s));
+	s->count = n;
+	if (n == 0)
+		return;
+
+	s->min = sorted[0];
+	s->median = sorted[n / 2];
+	s->p99 = percentile(sorted, n, 99);
+	s->max = sorted[n - 1];
+
+	for (i = 0; i < n; i++)
+		total += sorted[i];
+	s->mean = total / n;
+
+	/* Second pass: sum of squared deviations, divided by (n - 1) below. */
+	for (i = 0; i < n; i++)
+		sq_dev += (sorted[i] - s->mean) * (sorted[i] - s->mean);
+	s->stddev = 0;
+	if (n > 1)
+		s->stddev = sqrt(sq_dev / (n - 1));
+}
+
+/* Per-call tick: enable timing once warmup ends, then finish once enough samples exist. */
+void bpf_bench_timing_measure(struct bpf_bench_timing *t, struct bench_res *res)
+{
+	/* Ticks start at 1, so warmup_sec == 0 must map to tick 1 or timing never starts. */
+	int enable_tick = env.warmup_sec > 0 ? env.warmup_sec : 1;
+	__u32 total_samples = 0;
+	unsigned int nr_cpus;
+
+	t->warmup_ticks++;
+	if (t->warmup_ticks < enable_tick)
+		return;
+	if (t->warmup_ticks == enable_tick) {
+		*t->timing_enabled = 1;
+		return;
+	}
+
+	nr_cpus = bpf_num_possible_cpus();
+	if (nr_cpus > BENCH_NR_CPUS)
+		nr_cpus = BENCH_NR_CPUS;
+
+	for (unsigned int i = 0; i < nr_cpus; i++) {
+		__u32 cnt = t->idx[i];
+
+		if (cnt > BENCH_NR_SAMPLES)
+			cnt = BENCH_NR_SAMPLES;
+		total_samples += cnt;
+	}
+
+	/* Target reached: clear the enable flag and signal the harness via bench_force_done(). */
+	if (total_samples >= (__u32)env.producer_cnt * t->target_samples && !t->done) {
+		t->done = true;
+		*t->timing_enabled = 0;
+		bench_force_done();
+	}
+}
+
+/* Print outlier-filtered per-op timing statistics for @name; @description is not used. */
+void bpf_bench_timing_report(struct bpf_bench_timing *t, const char *name, const char *description)
+{
+	const int capacity = BENCH_NR_CPUS * BENCH_NR_SAMPLES;
+	double *vals = calloc(capacity, sizeof(*vals));
+	struct timing_stats st;
+	double cv = 0.0;
+	int kept;
+
+	if (!vals) {
+		fprintf(stderr, "failed to allocate timing buffer\n");
+		return;
+	}
+
+	kept = collect_samples(t, vals, capacity);
+	if (!kept) {
+		printf("No timing samples collected.\n");
+		goto out;
+	}
+
+	kept = filter_outliers_iqr(vals, kept);
+	compute_stats(vals, kept, &st);
+	if (st.mean > 0)
+		cv = st.stddev / st.mean * 100.0;	/* stddev as a percentage of the mean */
+
+	if (!t->machine_readable)
+		printf("%s: median %.2f ns/op, stddev %.2f, p99 %.2f (%d samples)\n", name,
+		       st.median, st.stddev, st.p99, kept);
+	else
+		printf("RESULT scenario=%s samples=%d median=%.2f stddev=%.2f cv=%.2f min=%.2f "
+		       "p99=%.2f max=%.2f\n", name, kept, st.median, st.stddev, cv,
+		       st.min, st.p99, st.max);
+out:
+	free(vals);
+}
+
+#define CALIBRATE_SEED_BATCH	100
+#define CALIBRATE_MIN_BATCH	100
+#define CALIBRATE_MAX_BATCH	10000000
+#define CALIBRATE_TARGET_MS	10
+#define CALIBRATE_RUNS		5
+#define PROPORTIONALITY_TOL	0.05	/* 5% */
+
+static void reset_timing(struct bpf_bench_timing *t)
+{
+	*t->timing_enabled = 0;	/* clear the enable flag before wiping the buffers */
+	memset(t->samples, 0, BENCH_NR_CPUS * sizeof(*t->samples));
+	memset(t->idx, 0, BENCH_NR_CPUS * sizeof(*t->idx));
+}
+
+/* Call @run_fn @runs times at batch size @iters; return the median recorded sample, 0 if none. */
+static __u64 measure_elapsed(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx,
+			     __u32 iters, int runs)
+{
+	/* buf has CALIBRATE_RUNS slots, so cap collection there even if @runs is larger */
+	int cap = runs < CALIBRATE_RUNS ? runs : CALIBRATE_RUNS;
+	__u64 buf[CALIBRATE_RUNS];
+	int n = 0, i, j;
+
+	reset_timing(t);
+	*t->batch_iters_bss = iters;
+	*t->timing_enabled = 1;
+	for (i = 0; i < runs; i++)
+		run_fn(run_ctx);
+	*t->timing_enabled = 0;
+
+	for (i = 0; i < BENCH_NR_CPUS && n < cap; i++) {
+		__u32 cnt = t->idx[i];
+
+		for (j = 0; j < (int)cnt && n < cap; j++)
+			buf[n++] = t->samples[i][j];
+	}
+	if (n == 0)
+		return 0;
+
+	/* insertion sort; n never exceeds CALIBRATE_RUNS */
+	for (i = 1; i < n; i++) {
+		__u64 key = buf[i];
+
+		j = i - 1;
+		while (j >= 0 && buf[j] > key) {
+			buf[j + 1] = buf[j];
+			j--;
+		}
+		buf[j + 1] = key;
+	}
+	return buf[n / 2];
+}
+
+/*
+ * Number of operations that fit in CALIBRATE_TARGET_MS at @per_op_ns each,
+ * clamped to [CALIBRATE_MIN_BATCH, CALIBRATE_MAX_BATCH].
+ */
+static __u32 compute_batch_iters(__u64 per_op_ns)
+{
+	const __u64 budget_ns = (__u64)CALIBRATE_TARGET_MS * 1000000ULL;
+	__u32 batch;
+
+	/* A zero cost cannot be divided by; fall back to the minimum batch. */
+	if (!per_op_ns)
+		return CALIBRATE_MIN_BATCH;
+
+	batch = budget_ns / per_op_ns;
+	batch = batch < CALIBRATE_MIN_BATCH ? CALIBRATE_MIN_BATCH : batch;
+	return batch > CALIBRATE_MAX_BATCH ? CALIBRATE_MAX_BATCH : batch;
+}
+
+/* Choose t->batch_iters from a seed run, compare N vs 2N timings, then reset timing state. */
+void bpf_bench_calibrate(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx)
+{
+	__u64 seed_ns, batch_ns, double_ns;
+	double ratio, rel_err;
+
+	seed_ns = measure_elapsed(t, run_fn, run_ctx, CALIBRATE_SEED_BATCH, CALIBRATE_RUNS);
+	if (!seed_ns) {
+		fprintf(stderr, "calibration: no timing samples, using default\n");
+		t->batch_iters = 10000;
+		goto publish;
+	}
+
+	t->batch_iters = compute_batch_iters(seed_ns / CALIBRATE_SEED_BATCH);
+
+	/* The 2N run is expected to take twice as long as the N run; warn when it does not. */
+	batch_ns = measure_elapsed(t, run_fn, run_ctx, t->batch_iters, CALIBRATE_RUNS);
+	double_ns = measure_elapsed(t, run_fn, run_ctx, t->batch_iters * 2, CALIBRATE_RUNS);
+	if (!batch_ns || !double_ns)
+		goto publish;
+
+	ratio = (double)double_ns / (double)batch_ns;
+	rel_err = fabs(ratio - 2.0) / 2.0;
+	if (rel_err > PROPORTIONALITY_TOL)
+		fprintf(stderr,
+			"WARNING: proportionality check failed (2N/N ratio=%.3f, "
+			"expected=2.000, error=%.1f%%)\n  System noise may be affecting "
+			"results.\n",
+			ratio, rel_err * 100.0);
+
+publish:
+	/* measure_elapsed() leaves its own batch size in bss; store the chosen one. */
+	*t->batch_iters_bss = t->batch_iters;
+	reset_timing(t);
+}
diff --git a/tools/testing/selftests/bpf/benchs/bench_htab_mem.c b/tools/testing/selftests/bpf/benchs/bench_htab_mem.c
index 297e32390cd1..1ee217d97434 100644
--- a/tools/testing/selftests/bpf/benchs/bench_htab_mem.c
+++ b/tools/testing/selftests/bpf/benchs/bench_htab_mem.c
@@ -152,7 +152,7 @@ static const struct htab_mem_use_case *htab_mem_find_use_case_or_exit(const char
 	exit(1);
 }
 
-static void htab_mem_setup(void)
+static void htab_mem_setup_impl(enum bpf_map_type map_type)
 {
 	struct bpf_map *map;
 	const char **names;
@@ -178,10 +178,11 @@ static void htab_mem_setup(void)
 	}
 
 	map = ctx.skel->maps.htab;
+	bpf_map__set_type(map, map_type);
 	bpf_map__set_value_size(map, args.value_size);
 	/* Ensure that different CPUs can operate on different subset */
 	bpf_map__set_max_entries(map, MAX(8192, 64 * env.nr_cpus));
-	if (args.preallocated)
+	if (map_type != BPF_MAP_TYPE_RHASH && args.preallocated)
 		bpf_map__set_map_flags(map, bpf_map__map_flags(map) & ~BPF_F_NO_PREALLOC);
 
 	names = ctx.uc->progs;
@@ -220,6 +221,16 @@ cleanup:
 	exit(1);
 }
 
+static void htab_mem_setup(void)
+{
+	htab_mem_setup_impl(BPF_MAP_TYPE_HASH);
+}
+
+static void rhtab_mem_setup(void)
+{
+	htab_mem_setup_impl(BPF_MAP_TYPE_RHASH);
+}
+
 static void htab_mem_add_fn(pthread_barrier_t *notify)
 {
 	while (true) {
@@ -338,6 +349,15 @@ static void htab_mem_report_final(struct bench_res res[], int res_cnt)
 	cleanup_cgroup_environment();
 }
 
+static void rhtab_mem_validate(void)
+{
+	if (args.preallocated) {
+		fprintf(stderr, "rhash map does not support preallocation\n");
+		exit(1);
+	}
+	htab_mem_validate();
+}
+
 const struct bench bench_htab_mem = {
 	.name = "htab-mem",
 	.argp = &bench_htab_mem_argp,
@@ -348,3 +368,14 @@ const struct bench bench_htab_mem = {
 	.report_progress = htab_mem_report_progress,
 	.report_final = htab_mem_report_final,
 };
+
+const struct bench bench_rhtab_mem = {
+	.name = "rhtab-mem",
+	.argp = &bench_htab_mem_argp,
+	.validate = rhtab_mem_validate,
+	.setup = rhtab_mem_setup,
+	.producer_thread = htab_mem_producer,
+	.measure = htab_mem_measure,
+	.report_progress = htab_mem_report_progress,
+	.report_final = htab_mem_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c b/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c
new file mode 100644
index 000000000000..8e25bccbde92
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c
@@ -0,0 +1,1124 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <argp.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include "bench.h"
+#include "bench_bpf_timing.h"
+#include "xdp_lb_bench.skel.h"
+#include "xdp_lb_bench_common.h"
+#include "bpf_util.h"
+
+#define IP4(a, b, c, d) (((__u32)(a) << 24) | ((__u32)(b) << 16) | ((__u32)(c) << 8) | (__u32)(d))
+
+#define IP6(a, b, c, d)  { (__u32)(a), (__u32)(b), (__u32)(c), (__u32)(d) }
+
+#define TNL_DST		IP4(192, 168, 1, 2)
+#define REAL_INDEX	1
+#define REAL_INDEX_V6	2
+#define MAX_PKT_SIZE	256
+#define IP_MF		0x2000
+
+static const __u32 tnl_dst_v6[4] = { 0xfd000000, 0, 0, 2 };
+
+static const __u8 lb_mac[ETH_ALEN]	= {0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff};
+static const __u8 client_mac[ETH_ALEN]	= {0x11, 0x22, 0x33, 0x44, 0x55, 0x66};
+static const __u8 router_mac[ETH_ALEN]	= {0xde, 0xad, 0xbe, 0xef, 0x00, 0x01};
+
+enum scenario_id {
+	S_TCP_V4_LRU_HIT,
+	S_TCP_V4_CH,
+	S_TCP_V6_LRU_HIT,
+	S_TCP_V6_CH,
+	S_UDP_V4_LRU_HIT,
+	S_UDP_V6_LRU_HIT,
+	S_TCP_V4V6_LRU_HIT,
+	S_TCP_V4_LRU_DIVERSE,
+	S_TCP_V4_CH_DIVERSE,
+	S_TCP_V6_LRU_DIVERSE,
+	S_TCP_V6_CH_DIVERSE,
+	S_UDP_V4_LRU_DIVERSE,
+	S_TCP_V4_LRU_MISS,
+	S_UDP_V4_LRU_MISS,
+	S_TCP_V4_LRU_WARMUP,
+	S_TCP_V4_SYN,
+	S_TCP_V4_RST_MISS,
+	S_PASS_V4_NO_VIP,
+	S_PASS_V6_NO_VIP,
+	S_PASS_V4_ICMP,
+	S_PASS_NON_IP,
+	S_DROP_V4_FRAG,
+	S_DROP_V4_OPTIONS,
+	S_DROP_V6_FRAG,
+	NUM_SCENARIOS,
+};
+
+enum lru_miss_type {
+	LRU_MISS_AUTO = 0,	/* compute from scenario flags (default) */
+	LRU_MISS_NONE,		/* 0 misses (all LRU hits) */
+	LRU_MISS_ALL,		/* batch_iters+1 misses (every op misses) */
+	LRU_MISS_FIRST,		/* 1 miss (first miss, then hits) */
+};
+
+#define S_BASE_ENCAP_V4							\
+	.expected_retval = XDP_TX, .expect_encap = true,		\
+	.tunnel_dst = TNL_DST
+
+#define S_BASE_ENCAP_V6							\
+	.expected_retval = XDP_TX, .expect_encap = true,		\
+	.is_v6 = true, .encap_v6_outer = true,				\
+	.tunnel_dst_v6 = { 0xfd000000, 0, 0, 2 }
+
+#define S_BASE_ENCAP_V4V6						\
+	.expected_retval = XDP_TX, .expect_encap = true,		\
+	.encap_v6_outer = true,						\
+	.tunnel_dst_v6 = { 0xfd000000, 0, 0, 2 }
+
+struct test_scenario {
+	const char *name;
+	const char *description;
+	int         expected_retval;
+	bool        expect_encap;
+	bool        is_v6;
+	__u32       vip_addr;
+	__u32       src_addr;
+	__u32       tunnel_dst;
+	__u32       vip_addr_v6[4];
+	__u32       src_addr_v6[4];
+	__u32       tunnel_dst_v6[4];
+	__u16       dst_port;
+	__u16       src_port;
+	__u8        ip_proto;
+	__u32       vip_flags;
+	__u32       vip_num;
+	bool        prepopulate_lru;
+	bool        set_frag;
+	__u16       eth_proto;
+	bool        encap_v6_outer;
+	__u32       flow_mask;
+	bool        cold_lru;
+	bool        set_syn;
+	bool        set_rst;
+	bool        set_ip_options;
+	__u32       fixed_batch_iters;	/* 0 = auto-calibrate, >0 = use this value */
+	enum lru_miss_type lru_miss;	/* expected LRU miss pattern */
+};
+
+static const struct test_scenario scenarios[NUM_SCENARIOS] = {
+	/* Single-flow baseline */
+	[S_TCP_V4_LRU_HIT] = {
+		S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v4-lru-hit",
+		.description = "IPv4 TCP, LRU hit, IPIP encap",
+		.vip_addr    = IP4(10, 10, 1, 1), .dst_port = 80,
+		.src_addr    = IP4(10, 10, 2, 1), .src_port = 12345,
+		.prepopulate_lru = true, .lru_miss = LRU_MISS_NONE,
+	},
+	[S_TCP_V4_CH] = {
+		S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v4-ch",
+		.description = "IPv4 TCP, CH (LRU bypass), IPIP encap",
+		.vip_addr    = IP4(10, 10, 1, 2), .dst_port = 80,
+		.src_addr    = IP4(10, 10, 2, 2), .src_port = 54321,
+		.vip_flags   = F_LRU_BYPASS, .vip_num = 1,
+		.lru_miss    = LRU_MISS_ALL,
+	},
+	[S_TCP_V6_LRU_HIT] = {
+		S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v6-lru-hit",
+		.description = "IPv6 TCP, LRU hit, IP6IP6 encap",
+		.vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 80,
+		.src_addr_v6 = IP6(0xfd000200, 0, 0, 1), .src_port = 12345,
+		.vip_num     = 10,
+		.prepopulate_lru = true, .lru_miss = LRU_MISS_NONE,
+	},
+	[S_TCP_V6_CH] = {
+		S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v6-ch",
+		.description = "IPv6 TCP, CH (LRU bypass), IP6IP6 encap",
+		.vip_addr_v6 = IP6(0xfd000100, 0, 0, 2), .dst_port = 80,
+		.src_addr_v6 = IP6(0xfd000200, 0, 0, 2), .src_port = 54321,
+		.vip_flags   = F_LRU_BYPASS, .vip_num = 12,
+		.lru_miss    = LRU_MISS_ALL,
+	},
+	[S_UDP_V4_LRU_HIT] = {
+		S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP,
+		.name        = "udp-v4-lru-hit",
+		.description = "IPv4 UDP, LRU hit, IPIP encap",
+		.vip_addr    = IP4(10, 10, 1, 1), .dst_port = 443,
+		.src_addr    = IP4(10, 10, 3, 1), .src_port = 11111,
+		.vip_num     = 2,
+		.prepopulate_lru = true, .lru_miss = LRU_MISS_NONE,
+	},
+	[S_UDP_V6_LRU_HIT] = {
+		S_BASE_ENCAP_V6, .ip_proto = IPPROTO_UDP,
+		.name        = "udp-v6-lru-hit",
+		.description = "IPv6 UDP, LRU hit, IP6IP6 encap",
+		.vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 443,
+		.src_addr_v6 = IP6(0xfd000200, 0, 0, 3), .src_port = 22222,
+		.vip_num     = 14,
+		.prepopulate_lru = true, .lru_miss = LRU_MISS_NONE,
+	},
+	[S_TCP_V4V6_LRU_HIT] = {
+		S_BASE_ENCAP_V4V6, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v4v6-lru-hit",
+		.description = "IPv4 TCP, LRU hit, IPv4-in-IPv6 encap",
+		.vip_addr    = IP4(10, 10, 1, 4), .dst_port = 80,
+		.src_addr    = IP4(10, 10, 2, 4), .src_port = 12347,
+		.vip_num     = 13,
+		.prepopulate_lru = true, .lru_miss = LRU_MISS_NONE,
+	},
+
+	/* Diverse flows (4K src addrs) */
+	[S_TCP_V4_LRU_DIVERSE] = {
+		S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v4-lru-diverse",
+		.description = "IPv4 TCP, diverse flows, warm LRU",
+		.vip_addr    = IP4(10, 10, 1, 1), .dst_port = 80,
+		.src_addr    = IP4(10, 10, 2, 1), .src_port = 12345,
+		.prepopulate_lru = true, .flow_mask = 0xFFF,
+		.lru_miss    = LRU_MISS_NONE,
+	},
+	[S_TCP_V4_CH_DIVERSE] = {
+		S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v4-ch-diverse",
+		.description = "IPv4 TCP, diverse flows, CH (LRU bypass)",
+		.vip_addr    = IP4(10, 10, 1, 2), .dst_port = 80,
+		.src_addr    = IP4(10, 10, 2, 2), .src_port = 54321,
+		.vip_flags   = F_LRU_BYPASS, .vip_num = 1,
+		.flow_mask   = 0xFFF, .lru_miss = LRU_MISS_ALL,
+	},
+	[S_TCP_V6_LRU_DIVERSE] = {
+		S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v6-lru-diverse",
+		.description = "IPv6 TCP, diverse flows, warm LRU",
+		.vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 80,
+		.src_addr_v6 = IP6(0xfd000200, 0, 0, 1), .src_port = 12345,
+		.vip_num     = 10,
+		.prepopulate_lru = true, .flow_mask = 0xFFF,
+		.lru_miss    = LRU_MISS_NONE,
+	},
+	[S_TCP_V6_CH_DIVERSE] = {
+		S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v6-ch-diverse",
+		.description = "IPv6 TCP, diverse flows, CH (LRU bypass)",
+		.vip_addr_v6 = IP6(0xfd000100, 0, 0, 2), .dst_port = 80,
+		.src_addr_v6 = IP6(0xfd000200, 0, 0, 2), .src_port = 54321,
+		.vip_flags   = F_LRU_BYPASS, .vip_num = 12,
+		.flow_mask   = 0xFFF, .lru_miss = LRU_MISS_ALL,
+	},
+	[S_UDP_V4_LRU_DIVERSE] = {
+		S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP,
+		.name        = "udp-v4-lru-diverse",
+		.description = "IPv4 UDP, diverse flows, warm LRU",
+		.vip_addr    = IP4(10, 10, 1, 1), .dst_port = 443,
+		.src_addr    = IP4(10, 10, 3, 1), .src_port = 11111,
+		.vip_num     = 2,
+		.prepopulate_lru = true, .flow_mask = 0xFFF,
+		.lru_miss    = LRU_MISS_NONE,
+	},
+
+	/* LRU stress */
+	[S_TCP_V4_LRU_MISS] = {
+		S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v4-lru-miss",
+		.description = "IPv4 TCP, LRU miss (16M flow space), CH lookup",
+		.vip_addr    = IP4(10, 10, 1, 1), .dst_port = 80,
+		.src_addr    = IP4(10, 10, 2, 1), .src_port = 12345,
+		.flow_mask   = 0xFFFFFF, .cold_lru = true,
+		.lru_miss    = LRU_MISS_FIRST,
+	},
+	[S_UDP_V4_LRU_MISS] = {
+		S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP,
+		.name        = "udp-v4-lru-miss",
+		.description = "IPv4 UDP, LRU miss (16M flow space), CH lookup",
+		.vip_addr    = IP4(10, 10, 1, 1), .dst_port = 443,
+		.src_addr    = IP4(10, 10, 3, 1), .src_port = 11111,
+		.vip_num     = 2,
+		.flow_mask   = 0xFFFFFF, .cold_lru = true,
+		.lru_miss    = LRU_MISS_FIRST,
+	},
+	[S_TCP_V4_LRU_WARMUP] = {
+		S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v4-lru-warmup",
+		.description = "IPv4 TCP, 4K flows, ~50% LRU miss",
+		.vip_addr    = IP4(10, 10, 1, 1), .dst_port = 80,
+		.src_addr    = IP4(10, 10, 2, 1), .src_port = 12345,
+		.flow_mask   = 0xFFF, .cold_lru = true,
+		.fixed_batch_iters = 6500,
+		.lru_miss    = LRU_MISS_FIRST,
+	},
+
+	/* TCP flags */
+	[S_TCP_V4_SYN] = {
+		S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v4-syn",
+		.description = "IPv4 TCP SYN, skip LRU, CH + LRU insert",
+		.vip_addr    = IP4(10, 10, 1, 1), .dst_port = 80,
+		.src_addr    = IP4(10, 10, 8, 2), .src_port = 60001,
+		.set_syn     = true, .lru_miss = LRU_MISS_ALL,
+	},
+	[S_TCP_V4_RST_MISS] = {
+		S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP,
+		.name        = "tcp-v4-rst-miss",
+		.description = "IPv4 TCP RST, CH lookup, no LRU insert",
+		.vip_addr    = IP4(10, 10, 1, 1), .dst_port = 80,
+		.src_addr    = IP4(10, 10, 8, 1), .src_port = 60000,
+		.flow_mask   = 0xFFFFFF, .cold_lru = true,
+		.set_rst     = true, .lru_miss = LRU_MISS_ALL,
+	},
+
+	/* Early exits */
+	[S_PASS_V4_NO_VIP] = {
+		.name            = "pass-v4-no-vip",
+		.description     = "IPv4 TCP, unknown VIP, XDP_PASS",
+		.expected_retval = XDP_PASS,
+		.ip_proto        = IPPROTO_TCP,
+		.vip_addr        = IP4(10, 10, 9, 9), .dst_port = 80,
+		.src_addr        = IP4(10, 10, 4, 1), .src_port = 33333,
+	},
+	[S_PASS_V6_NO_VIP] = {
+		.name            = "pass-v6-no-vip",
+		.description     = "IPv6 TCP, unknown VIP, XDP_PASS",
+		.expected_retval = XDP_PASS, .is_v6 = true,
+		.ip_proto        = IPPROTO_TCP,
+		.vip_addr_v6     = IP6(0xfd009900, 0, 0, 1), .dst_port = 80,
+		.src_addr_v6     = IP6(0xfd000400, 0, 0, 1), .src_port = 33333,
+	},
+	[S_PASS_V4_ICMP] = {
+		.name            = "pass-v4-icmp",
+		.description     = "IPv4 ICMP, non-TCP/UDP protocol, XDP_PASS",
+		.expected_retval = XDP_PASS,
+		.ip_proto        = IPPROTO_ICMP,
+		.vip_addr        = IP4(10, 10, 1, 1),
+		.src_addr        = IP4(10, 10, 6, 1),
+	},
+	[S_PASS_NON_IP] = {
+		.name            = "pass-non-ip",
+		.description     = "Non-IP (ARP), earliest XDP_PASS exit",
+		.expected_retval = XDP_PASS,
+		.eth_proto       = ETH_P_ARP,
+	},
+	[S_DROP_V4_FRAG] = {
+		.name            = "drop-v4-frag",
+		.description     = "IPv4 fragmented, XDP_DROP",
+		.expected_retval = XDP_DROP, .ip_proto = IPPROTO_TCP,
+		.vip_addr        = IP4(10, 10, 1, 1), .dst_port = 80,
+		.src_addr        = IP4(10, 10, 5, 1), .src_port = 44444,
+		.set_frag        = true,
+	},
+	[S_DROP_V4_OPTIONS] = {
+		.name            = "drop-v4-options",
+		.description     = "IPv4 with IP options (ihl>5), XDP_DROP",
+		.expected_retval = XDP_DROP, .ip_proto = IPPROTO_TCP,
+		.vip_addr        = IP4(10, 10, 1, 1), .dst_port = 80,
+		.src_addr        = IP4(10, 10, 7, 1), .src_port = 55555,
+		.set_ip_options  = true,
+	},
+	[S_DROP_V6_FRAG] = {
+		.name            = "drop-v6-frag",
+		.description     = "IPv6 fragment extension header, XDP_DROP",
+		.expected_retval = XDP_DROP, .is_v6 = true,
+		.ip_proto        = IPPROTO_TCP,
+		.vip_addr_v6     = IP6(0xfd000100, 0, 0, 1), .dst_port = 80,
+		.src_addr_v6     = IP6(0xfd000500, 0, 0, 1), .src_port = 44444,
+		.set_frag        = true,
+	},
+};
+
+#define MAX_ENCAP_SIZE	(MAX_PKT_SIZE + sizeof(struct ipv6hdr))	/* input frame plus one outer IPv6 header */
+
+static __u8  pkt_buf[NUM_SCENARIOS][MAX_PKT_SIZE];	/* input frames, written by build_packet() */
+static __u32 pkt_len[NUM_SCENARIOS];
+static __u8  expected_buf[NUM_SCENARIOS][MAX_ENCAP_SIZE];	/* written by build_expected_packet() */
+static __u32 expected_len[NUM_SCENARIOS];
+
+static int lru_inner_fds[BENCH_NR_CPUS];	/* inner LRU map fds from create_per_cpu_lru_maps() */
+static int nr_inner_maps;	/* number of valid entries in lru_inner_fds */
+
+static struct ctx {
+	struct xdp_lb_bench *skel;
+	struct bpf_bench_timing timing;
+	int prog_fd;
+} ctx;
+
+static struct {
+	int   scenario;	/* index into scenarios[]; negative until --scenario is parsed */
+	bool  machine_readable;
+} args = {
+	.scenario = -1,
+};
+
+static __u16 ip_checksum(const void *hdr, int len)
+{
+	const __u16 *p = hdr;
+	__u32 csum = 0;
+	int i;
+
+	for (i = 0; i < len / 2; i++)	/* 16-bit words; an odd trailing byte is not summed */
+		csum += p[i];
+	/* fold the carries above bit 15 back into the low 16 bits */
+	while (csum >> 16)
+		csum = (csum & 0xffff) + (csum >> 16);
+	/* complement; the __u16 return type keeps the low 16 bits */
+	return ~csum;
+}
+
+static void htonl_v6(__be32 dst[4], const __u32 src[4])
+{
+	int i;
+
+	for (i = 0; i < 4; i++)	/* each of the four 32-bit words goes through htonl() */
+		dst[i] = htonl(src[i]);
+}
+
+static void build_flow_key(struct flow_key *fk, const struct test_scenario *sc)
+{
+	memset(fk, 0, sizeof(*fk));	/* start from an all-zero key */
+	if (sc->is_v6) {
+		htonl_v6(fk->srcv6, sc->src_addr_v6);
+		htonl_v6(fk->dstv6, sc->vip_addr_v6);	/* destination is the VIP address */
+	} else {
+		fk->src = htonl(sc->src_addr);
+		fk->dst = htonl(sc->vip_addr);	/* destination is the VIP address */
+	}
+	fk->proto = sc->ip_proto;
+	fk->port16[0] = htons(sc->src_port);	/* [0] = source port, [1] = destination port */
+	fk->port16[1] = htons(sc->dst_port);
+}
+
+static void build_l4(const struct test_scenario *sc, __u8 *p, __u32 *off)
+{
+	if (sc->ip_proto == IPPROTO_TCP) {
+		struct tcphdr tcp = {};
+
+		tcp.source = htons(sc->src_port);
+		tcp.dest   = htons(sc->dst_port);
+		tcp.doff   = 5;	/* data offset in 32-bit words: header only, no options */
+		tcp.syn    = sc->set_syn ? 1 : 0;
+		tcp.rst    = sc->set_rst ? 1 : 0;
+		tcp.window = htons(8192);
+		memcpy(p + *off, &tcp, sizeof(tcp));
+		*off += sizeof(tcp);	/* advance the caller's write offset past the header */
+	} else if (sc->ip_proto == IPPROTO_UDP) {
+		struct udphdr udp = {};
+
+		udp.source = htons(sc->src_port);
+		udp.dest   = htons(sc->dst_port);
+		udp.len    = htons(sizeof(udp) + 16);	/* header plus the 16-byte payload build_packet() appends */
+		memcpy(p + *off, &udp, sizeof(udp));
+		*off += sizeof(udp);
+	}	/* any other protocol: nothing is written and *off is unchanged */
+}
+
+static void build_packet(int idx)
+{
+	const struct test_scenario *sc = &scenarios[idx];
+	__u8 *p = pkt_buf[idx];
+	struct ethhdr eth = {};
+	__u16 proto;
+	__u32 off = 0;
+
+	memcpy(eth.h_dest, lb_mac, ETH_ALEN);
+	memcpy(eth.h_source, client_mac, ETH_ALEN);
+	/* a non-zero eth_proto in the scenario wins; otherwise pick by IP version */
+	if (sc->eth_proto)
+		proto = sc->eth_proto;
+	else if (sc->is_v6)
+		proto = ETH_P_IPV6;
+	else
+		proto = ETH_P_IP;
+
+	eth.h_proto = htons(proto);
+	memcpy(p, &eth, sizeof(eth));
+	off += sizeof(eth);
+	/* non-IP frame: just the Ethernet header followed by the 16-byte payload */
+	if (proto != ETH_P_IP && proto != ETH_P_IPV6) {
+		memcpy(p + off, "bench___payload!", 16);
+		off += 16;
+		pkt_len[idx] = off;
+		return;
+	}
+
+	if (sc->is_v6) {
+		struct ipv6hdr ip6h = {};
+		__u32 ip6_off = off;
+
+		ip6h.version  = 6;
+		ip6h.nexthdr  = sc->set_frag ? 44 : sc->ip_proto;	/* 44 = IPv6 Fragment header */
+		ip6h.hop_limit = 64;
+		htonl_v6((__be32 *)&ip6h.saddr, sc->src_addr_v6);
+		htonl_v6((__be32 *)&ip6h.daddr, sc->vip_addr_v6);
+		off += sizeof(ip6h);
+		/* 8 zeroed bytes stand in for the fragment header; byte 0 is its next-header */
+		if (sc->set_frag) {
+			memset(p + off, 0, 8);
+			p[off] = sc->ip_proto;
+			off += 8;
+		}
+
+		build_l4(sc, p, &off);
+
+		memcpy(p + off, "bench___payload!", 16);
+		off += 16;
+		/* payload_len needs the final off, so the header is stored last */
+		ip6h.payload_len = htons(off - ip6_off - sizeof(ip6h));
+		memcpy(p + ip6_off, &ip6h, sizeof(ip6h));
+	} else {
+		struct iphdr iph = {};
+		__u32 ip_off = off;
+
+		iph.version  = 4;
+		iph.ihl      = sc->set_ip_options ? 6 : 5;	/* 6 words when one option word follows */
+		iph.ttl      = 64;
+		iph.protocol = sc->ip_proto;
+		iph.saddr    = htonl(sc->src_addr);
+		iph.daddr    = htonl(sc->vip_addr);
+		iph.frag_off = sc->set_frag ? htons(IP_MF) : 0;
+		off += sizeof(iph);
+
+		if (sc->set_ip_options) {
+			/* NOP option padding (4 bytes = 1 word) */
+			__u32 nop = htonl(0x01010101);
+
+			memcpy(p + off, &nop, sizeof(nop));
+			off += sizeof(nop);
+		}
+
+		build_l4(sc, p, &off);
+
+		memcpy(p + off, "bench___payload!", 16);
+		off += 16;
+		/* tot_len needs the final off; the checksum covers sizeof(iph) bytes only */
+		iph.tot_len = htons(off - ip_off);
+		iph.check   = ip_checksum(&iph, sizeof(iph));
+		memcpy(p + ip_off, &iph, sizeof(iph));
+	}
+
+	pkt_len[idx] = off;
+}
+
+static void populate_vip(struct xdp_lb_bench *skel, const struct test_scenario *sc)
+{
+	struct vip_definition key = {};
+	struct vip_meta val = {};
+	int err;
+
+	if (sc->is_v6)
+		htonl_v6(key.vipv6, sc->vip_addr_v6);
+	else
+		key.vip = htonl(sc->vip_addr);
+	key.port  = htons(sc->dst_port);
+	key.proto = sc->ip_proto;
+	val.flags   = sc->vip_flags;
+	val.vip_num = sc->vip_num;
+	/* key = VIP address/port/proto, value = flags and vip_num; failure is fatal */
+	err = bpf_map_update_elem(bpf_map__fd(skel->maps.vip_map), &key, &val, BPF_ANY);
+	if (err) {
+		fprintf(stderr, "vip_map [%s]: %s\n", sc->name, strerror(errno));
+		exit(1);
+	}
+}
+
+static void create_per_cpu_lru_maps(struct xdp_lb_bench *skel)
+{
+	int outer_fd = bpf_map__fd(skel->maps.lru_mapping);
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	int i, inner_fd, err;
+	__u32 cpu;
+
+	if (nr_cpus > BENCH_NR_CPUS)	/* lru_inner_fds[] holds BENCH_NR_CPUS entries */
+		nr_cpus = BENCH_NR_CPUS;
+	/* one LRU hash per CPU, stored in lru_mapping under the CPU index */
+	for (i = 0; i < (int)nr_cpus; i++) {
+		LIBBPF_OPTS(bpf_map_create_opts, opts);
+
+		inner_fd = bpf_map_create(BPF_MAP_TYPE_LRU_HASH, "lru_inner",
+					  sizeof(struct flow_key),
+					  sizeof(struct real_pos_lru),
+					  DEFAULT_LRU_SIZE, &opts);
+		if (inner_fd < 0) {
+			fprintf(stderr, "lru_inner[%d]: %s\n", i, strerror(errno));
+			exit(1);
+		}
+		/* the outer map is updated with the inner map's fd as the value */
+		cpu = i;
+		err = bpf_map_update_elem(outer_fd, &cpu, &inner_fd, BPF_ANY);
+		if (err) {
+			fprintf(stderr, "lru_mapping[%d]: %s\n", i, strerror(errno));
+			close(inner_fd);
+			exit(1);
+		}
+
+		lru_inner_fds[i] = inner_fd;	/* kept open; populate_lru() writes through it */
+	}
+
+	nr_inner_maps = nr_cpus;
+}
+
+static __u64 ktime_get_ns(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);	/* return value is not checked */
+	return (__u64)ts.tv_sec * 1000000000ULL + ts.tv_nsec;	/* seconds + nanoseconds, in ns */
+}
+
+static void populate_lru(const struct test_scenario *sc, __u32 real_idx)
+{
+	struct real_pos_lru lru = { .pos = real_idx };	/* value stored for the scenario's flow */
+	struct flow_key fk;
+	int i, err;
+
+	if (sc->ip_proto == IPPROTO_UDP)
+		lru.atime = ktime_get_ns();	/* atime stays 0 for every other protocol */
+
+	build_flow_key(&fk, sc);
+
+	/* Insert into every per-CPU inner LRU so the entry is found
+	 * regardless of which CPU runs the BPF program.
+	 */
+	for (i = 0; i < nr_inner_maps; i++) {
+		err = bpf_map_update_elem(lru_inner_fds[i], &fk, &lru, BPF_ANY);
+		if (err) {
+			fprintf(stderr, "lru_inner[%d] [%s]: %s\n", i, sc->name,
+				strerror(errno));
+			exit(1);
+		}
+	}
+}
+
+static void populate_maps(struct xdp_lb_bench *skel)
+{
+	struct real_definition real_v4 = {};
+	struct real_definition real_v6 = {};
+	struct ctl_value cval = {};
+	__u32 key, real_idx = REAL_INDEX;
+	int ch_fd, err, i;
+
+	if (scenarios[args.scenario].expect_encap)	/* only the selected scenario's VIP is installed */
+		populate_vip(skel, &scenarios[args.scenario]);
+	/* every ch_rings slot is set to REAL_INDEX */
+	ch_fd = bpf_map__fd(skel->maps.ch_rings);
+	for (i = 0; i < CH_RINGS_SIZE; i++) {
+		__u32 k = i;
+
+		err = bpf_map_update_elem(ch_fd, &k, &real_idx, BPF_ANY);
+		if (err) {
+			fprintf(stderr, "ch_rings[%d]: %s\n", i, strerror(errno));
+			exit(1);
+		}
+	}
+	/* ctl_array[0] carries router_mac */
+	memcpy(cval.mac, router_mac, ETH_ALEN);
+	key = 0;
+	err = bpf_map_update_elem(bpf_map__fd(skel->maps.ctl_array), &key, &cval, BPF_ANY);
+	if (err) {
+		fprintf(stderr, "ctl_array: %s\n", strerror(errno));
+		exit(1);
+	}
+	/* reals[REAL_INDEX]: both dst and dstv6 are written, flags stay 0 */
+	key = REAL_INDEX;
+	real_v4.dst = htonl(TNL_DST);
+	htonl_v6(real_v4.dstv6, tnl_dst_v6);
+	err = bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &key, &real_v4, BPF_ANY);
+	if (err) {
+		fprintf(stderr, "reals[%d]: %s\n", REAL_INDEX, strerror(errno));
+		exit(1);
+	}
+	/* reals[REAL_INDEX_V6]: dstv6 only, flagged F_IPV6 */
+	key = REAL_INDEX_V6;
+	htonl_v6(real_v6.dstv6, tnl_dst_v6);
+	real_v6.flags = F_IPV6;
+	err = bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &key, &real_v6, BPF_ANY);
+	if (err) {
+		fprintf(stderr, "reals[%d]: %s\n", REAL_INDEX_V6, strerror(errno));
+		exit(1);
+	}
+
+	create_per_cpu_lru_maps(skel);	/* must run before populate_lru(), which uses its fds */
+
+	if (scenarios[args.scenario].prepopulate_lru) {
+		const struct test_scenario *sc = &scenarios[args.scenario];
+		__u32 ridx = sc->encap_v6_outer ? REAL_INDEX_V6 : REAL_INDEX;
+
+		populate_lru(sc, ridx);
+	}
+	/* vip_miss_stats[0] receives the selected scenario's VIP key */
+	if (scenarios[args.scenario].expect_encap) {
+		const struct test_scenario *sc = &scenarios[args.scenario];
+		struct vip_definition miss_vip = {};
+
+		if (sc->is_v6)
+			htonl_v6(miss_vip.vipv6, sc->vip_addr_v6);
+		else
+			miss_vip.vip = htonl(sc->vip_addr);
+		miss_vip.port = htons(sc->dst_port);
+		miss_vip.proto = sc->ip_proto;
+
+		key = 0;
+		err = bpf_map_update_elem(bpf_map__fd(skel->maps.vip_miss_stats),
+					  &key, &miss_vip, BPF_ANY);
+		if (err) {
+			fprintf(stderr, "vip_miss_stats: %s\n", strerror(errno));
+			exit(1);
+		}
+	}
+}
+
+static void build_expected_packet(int idx)
+{
+	const struct test_scenario *sc = &scenarios[idx];
+	__u8 *p = expected_buf[idx];
+	struct ethhdr eth = {};
+	const __u8 *in = pkt_buf[idx];	/* build_packet(idx) must already have run */
+	__u32 in_len = pkt_len[idx];
+	__u32 off = 0;
+	__u32 inner_len = in_len - sizeof(struct ethhdr);	/* input minus its Ethernet header */
+
+	if (sc->expected_retval == XDP_DROP) {
+		expected_len[idx] = 0;
+		return;
+	}
+	/* XDP_PASS: the expected output is a copy of the input */
+	if (sc->expected_retval == XDP_PASS) {
+		memcpy(p, in, in_len);
+		expected_len[idx] = in_len;
+		return;
+	}
+	/* otherwise: new Ethernet header (lb_mac -> router_mac), outer IP header, inner packet */
+	memcpy(eth.h_dest, router_mac, ETH_ALEN);
+	memcpy(eth.h_source, lb_mac, ETH_ALEN);
+	eth.h_proto = htons(sc->encap_v6_outer ? ETH_P_IPV6 : ETH_P_IP);
+	memcpy(p, &eth, sizeof(eth));
+	off += sizeof(eth);
+
+	if (sc->encap_v6_outer) {
+		struct ipv6hdr ip6h = {};
+		__u8 nexthdr = sc->is_v6 ? IPPROTO_IPV6 : IPPROTO_IPIP;
+
+		ip6h.version     = 6;
+		ip6h.nexthdr     = nexthdr;
+		ip6h.payload_len = htons(inner_len);
+		ip6h.hop_limit   = 64;
+		/* outer source is produced from the source port and one source-address word */
+		create_encap_ipv6_src(htons(sc->src_port),
+				      sc->is_v6 ? htonl(sc->src_addr_v6[0])
+						: htonl(sc->src_addr),
+				      (__be32 *)&ip6h.saddr);
+		htonl_v6((__be32 *)&ip6h.daddr, sc->tunnel_dst_v6);
+
+		memcpy(p + off, &ip6h, sizeof(ip6h));
+		off += sizeof(ip6h);
+	} else {
+		struct iphdr iph = {};
+
+		iph.version  = 4;
+		iph.ihl      = sizeof(iph) >> 2;	/* header length in 32-bit words */
+		iph.protocol = IPPROTO_IPIP;
+		iph.tot_len  = htons(inner_len + sizeof(iph));
+		iph.ttl      = 64;
+		iph.saddr    = create_encap_ipv4_src(htons(sc->src_port),
+						     htonl(sc->src_addr));
+		iph.daddr    = htonl(sc->tunnel_dst);
+		iph.check    = ip_checksum(&iph, sizeof(iph));	/* computed last, with check still 0 */
+
+		memcpy(p + off, &iph, sizeof(iph));
+		off += sizeof(iph);
+	}
+	/* the input packet without its Ethernet header follows the outer header unchanged */
+	memcpy(p + off, in + sizeof(struct ethhdr), inner_len);
+	off += inner_len;
+
+	expected_len[idx] = off;
+}
+
+static void print_hex_diff(const char *name, const __u8 *got, __u32 got_len, const __u8 *exp,
+			   __u32 exp_len)
+{
+	__u32 max_len = got_len > exp_len ? got_len : exp_len;
+	__u32 i, ndiffs = 0;
+
+	fprintf(stderr, "  [%s] got %u bytes, expected %u bytes\n",
+		name, got_len, exp_len);
+	/* report at most 8 offsets; a byte past the end of either buffer reads as 0 */
+	for (i = 0; i < max_len && ndiffs < 8; i++) {
+		__u8 g = i < got_len ? got[i] : 0;
+		__u8 e = i < exp_len ? exp[i] : 0;
+
+		if (g != e || i >= got_len || i >= exp_len) {	/* offsets past either end always count */
+			fprintf(stderr, "    offset 0x%03x: got 0x%02x  expected 0x%02x\n",
+				i, g, e);
+			ndiffs++;
+		}
+	}
+	/* the loop stopped at the limit with bytes left unexamined */
+	if (ndiffs >= 8 && i < max_len)
+		fprintf(stderr, "    ... (more differences)\n");
+}
+
+static void read_stat(int stats_fd, __u32 key, __u64 *v1_out, __u64 *v2_out)
+{
+	struct lb_stats values[BENCH_NR_CPUS];
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	__u64 v1 = 0, v2 = 0;
+	unsigned int i;
+
+	if (nr_cpus > BENCH_NR_CPUS)	/* never read past values[] */
+		nr_cpus = BENCH_NR_CPUS;
+	/* sum v1/v2 over the CPUs; a failed lookup leaves both totals at 0 */
+	if (bpf_map_lookup_elem(stats_fd, &key, values) == 0) {
+		for (i = 0; i < nr_cpus; i++) {
+			v1 += values[i].v1;
+			v2 += values[i].v2;
+		}
+	}
+
+	*v1_out = v1;
+	*v2_out = v2;
+}
+
+static void reset_stats(int stats_fd)
+{
+	struct lb_stats zeros[BENCH_NR_CPUS];
+	__u32 key;
+
+	memset(zeros, 0, sizeof(zeros));
+	for (key = 0; key < STATS_SIZE; key++)	/* every key gets an all-zero value array */
+		bpf_map_update_elem(stats_fd, &key, zeros, BPF_ANY);	/* result is not checked */
+}
+
+static bool validate_counters(int idx)
+{
+	const struct test_scenario *sc = &scenarios[idx];
+	int stats_fd = bpf_map__fd(ctx.skel->maps.stats);
+	__u64 xdp_tx, xdp_pass, xdp_drop, lru_pkts, lru_misses, tcp_misses;
+	__u64 expected_misses;
+	__u64 dummy;
+	/*
+	 * BENCH_BPF_LOOP runs batch_iters timed + 1 untimed iteration.
+	 * Each iteration calls process_packet -> count_action, so all
+	 * counters are incremented (batch_iters + 1) times.
+	 */
+	__u64 n = ctx.timing.batch_iters + 1;
+	bool pass = true;
+
+	read_stat(stats_fd, STATS_XDP_TX, &xdp_tx, &dummy);
+	read_stat(stats_fd, STATS_XDP_PASS, &xdp_pass, &dummy);
+	read_stat(stats_fd, STATS_XDP_DROP, &xdp_drop, &dummy);
+	read_stat(stats_fd, STATS_LRU, &lru_pkts, &lru_misses);
+	read_stat(stats_fd, STATS_LRU_MISS, &tcp_misses, &dummy);
+	/* the action counter that matches expected_retval must equal n */
+	if (sc->expected_retval == XDP_TX && xdp_tx != n) {
+		fprintf(stderr, "  [%s] COUNTER FAIL: STATS_XDP_TX=%llu, expected %llu\n", sc->name,
+			(unsigned long long)xdp_tx, (unsigned long long)n);
+		pass = false;
+	}
+	if (sc->expected_retval == XDP_PASS && xdp_pass != n) {
+		fprintf(stderr, "  [%s] COUNTER FAIL: STATS_XDP_PASS=%llu, expected %llu\n",
+			sc->name, (unsigned long long)xdp_pass, (unsigned long long)n);
+		pass = false;
+	}
+	if (sc->expected_retval == XDP_DROP && xdp_drop != n) {
+		fprintf(stderr, "  [%s] COUNTER FAIL: STATS_XDP_DROP=%llu, expected %llu\n",
+			sc->name, (unsigned long long)xdp_drop, (unsigned long long)n);
+		pass = false;
+	}
+	/* the LRU counters are only checked for scenarios that expect encapsulation */
+	if (!sc->expect_encap)
+		goto out;
+
+	if (lru_pkts != n) {
+		fprintf(stderr, "  [%s] COUNTER FAIL: STATS_LRU.v1=%llu, expected %llu\n",
+			sc->name, (unsigned long long)lru_pkts, (unsigned long long)n);
+		pass = false;
+	}
+	/* expected value of STATS_LRU.v2 (the miss count) for this scenario */
+	switch (sc->lru_miss) {
+	case LRU_MISS_NONE:
+		expected_misses = 0;
+		break;
+	case LRU_MISS_ALL:
+		expected_misses = n;
+		break;
+	case LRU_MISS_FIRST:
+		expected_misses = 1;
+		break;
+	default:
+		/* LRU_MISS_AUTO: compute from scenario flags */
+		if (sc->prepopulate_lru && !sc->set_syn)
+			expected_misses = 0;
+		else if (sc->set_syn || sc->set_rst ||
+			 (sc->vip_flags & F_LRU_BYPASS))
+			expected_misses = n;
+		else if (sc->cold_lru)
+			expected_misses = 1;
+		else
+			expected_misses = n;
+		break;
+	}
+
+	if (lru_misses != expected_misses) {
+		fprintf(stderr, "  [%s] COUNTER FAIL: LRU misses=%llu, expected %llu\n",
+			sc->name, (unsigned long long)lru_misses,
+			(unsigned long long)expected_misses);
+		pass = false;
+	}
+	/* for TCP with at least one miss, STATS_LRU_MISS.v1 must equal the miss count */
+	if (sc->ip_proto == IPPROTO_TCP && lru_misses > 0) {
+		if (tcp_misses != lru_misses) {
+			fprintf(stderr, "  [%s] COUNTER FAIL: TCP LRU misses=%llu, expected %llu\n",
+				sc->name, (unsigned long long)tcp_misses,
+				(unsigned long long)lru_misses);
+			pass = false;
+		}
+	}
+
+out:
+	reset_stats(stats_fd);	/* counters are zeroed on both the pass and the fail path */
+	return pass;
+}
+
+static const char *xdp_action_str(int action)
+{
+	switch (action) {	/* only the three actions the scenarios expect are named */
+	case XDP_DROP:	return "XDP_DROP";
+	case XDP_PASS:	return "XDP_PASS";
+	case XDP_TX:	return "XDP_TX";
+	default:	return "UNKNOWN";	/* every other value */
+	}
+}
+
+static bool validate_scenario(int idx)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	const struct test_scenario *sc = &scenarios[idx];
+	__u8 out[MAX_ENCAP_SIZE];
+	int err;
+
+	topts.data_in = pkt_buf[idx];
+	topts.data_size_in = pkt_len[idx];
+	topts.data_out = out;
+	topts.data_size_out = sizeof(out);
+	topts.repeat = 1;
+	/* one test run on the scenario's packet, with the output captured in out[] */
+	err = bpf_prog_test_run_opts(ctx.prog_fd, &topts);
+	if (err) {
+		fprintf(stderr, "  [%s] FAIL: test_run: %s\n", sc->name, strerror(errno));
+		return false;
+	}
+
+	if ((int)topts.retval != sc->expected_retval) {
+		fprintf(stderr, "  [%s] FAIL: retval %s, expected %s\n", sc->name,
+			xdp_action_str(topts.retval), xdp_action_str(sc->expected_retval));
+		return false;
+	}
+
+	/*
+	 * Compare output packet when it's deterministic.
+	 * Skip for XDP_DROP (no output) and cold_lru (source IP poisoned).
+	 */
+	if (sc->expected_retval != XDP_DROP && !sc->cold_lru) {
+		if (topts.data_size_out != expected_len[idx] ||
+		    memcmp(out, expected_buf[idx], expected_len[idx]) != 0) {
+			fprintf(stderr, "  [%s] FAIL: output packet mismatch\n", sc->name);
+			print_hex_diff(sc->name, out, topts.data_size_out, expected_buf[idx],
+				       expected_len[idx]);
+			return false;
+		}
+	}
+	/* last step: check the stats counters (validate_counters() also resets them) */
+	if (!validate_counters(idx))
+		return false;
+	return true;
+}
+
+static int find_scenario(const char *name)
+{
+	int i;
+
+	for (i = 0; i < NUM_SCENARIOS; i++) {	/* exact, case-sensitive match on .name */
+		if (strcmp(scenarios[i].name, name) == 0)
+			return i;
+	}
+	return -1;	/* no scenario has that name */
+}
+
+static void xdp_lb_validate(void)
+{
+	if (env.consumer_cnt != 0) {
+		fprintf(stderr, "benchmark doesn't support consumers\n");
+		exit(1);
+	}
+	if (bpf_num_possible_cpus() > BENCH_NR_CPUS) {	/* per-CPU arrays hold BENCH_NR_CPUS slots */
+		fprintf(stderr, "too many CPUs (%u > %d), increase BENCH_NR_CPUS\n",
+			bpf_num_possible_cpus(), BENCH_NR_CPUS);
+		exit(1);
+	}
+}
+
+static void xdp_lb_run_once(void *unused __always_unused)
+{
+	int idx = args.scenario;
+
+	LIBBPF_OPTS(bpf_test_run_opts, topts,
+		.data_in      = pkt_buf[idx],
+		.data_size_in = pkt_len[idx],
+		.repeat       = 1,
+	);
+	/* no output buffer is set and the result is ignored; validate_scenario() does the checking */
+	bpf_prog_test_run_opts(ctx.prog_fd, &topts);
+}
+
+static void xdp_lb_setup(void)
+{
+	struct xdp_lb_bench *skel;
+	int err;
+
+	if (args.scenario < 0) {
+		fprintf(stderr, "--scenario is required. Use --list-scenarios to see options.\n");
+		exit(1);
+	}
+
+	setup_libbpf();
+
+	skel = xdp_lb_bench__open();
+	if (!skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	err = xdp_lb_bench__load(skel);
+	if (err) {
+		fprintf(stderr, "failed to load skeleton: %s\n", strerror(-err));
+		xdp_lb_bench__destroy(skel);
+		exit(1);
+	}
+
+	ctx.skel    = skel;
+	ctx.prog_fd = bpf_program__fd(skel->progs.xdp_lb_bench);
+	/* input packet first: build_expected_packet() reads what build_packet() wrote */
+	build_packet(args.scenario);
+	build_expected_packet(args.scenario);
+
+	populate_maps(skel);
+
+	BENCH_TIMING_INIT(&ctx.timing, skel, 0);
+	ctx.timing.machine_readable = args.machine_readable;
+	/* a scenario with fixed_batch_iters skips the bpf_bench_calibrate() call */
+	if (scenarios[args.scenario].fixed_batch_iters) {
+		ctx.timing.batch_iters = scenarios[args.scenario].fixed_batch_iters;
+		skel->bss->batch_iters = ctx.timing.batch_iters;
+	} else {
+		bpf_bench_calibrate(&ctx.timing, xdp_lb_run_once, NULL);
+	}
+
+	env.duration_sec = 600;
+
+	/*
+	 * Enable cold_lru before validation so LRU miss counters are
+	 * correct.  Seed the LRU with one run so the original flow is
+	 * present; validation then sees exactly 1 miss (the poisoned
+	 * flow) regardless of whether calibration ran.
+	 */
+	if (scenarios[args.scenario].cold_lru) {
+		skel->bss->cold_lru = 1;
+		xdp_lb_run_once(NULL);
+	}
+	/* validation starts from zeroed counters */
+	reset_stats(bpf_map__fd(skel->maps.stats));
+
+	if (!validate_scenario(args.scenario)) {
+		fprintf(stderr, "Validation FAILED - aborting benchmark\n");
+		exit(1);
+	}
+	/* flow_mask is only applied once validation has passed */
+	if (scenarios[args.scenario].flow_mask)
+		skel->bss->flow_mask = scenarios[args.scenario].flow_mask;
+}
+
+static void *xdp_lb_producer(void *input)
+{
+	while (true)	/* input is not used */
+		xdp_lb_run_once(NULL);
+	/* not reached: the loop above has no exit */
+	return NULL;
+}
+
+static void xdp_lb_measure(struct bench_res *res)
+{
+	bpf_bench_timing_measure(&ctx.timing, res);	/* forwards res together with this file's timing state */
+}
+
+static void xdp_lb_report_final(struct bench_res res[], int res_cnt)
+{
+	bpf_bench_timing_report(&ctx.timing, scenarios[args.scenario].name,	/* res and res_cnt are unused */
+				scenarios[args.scenario].description);
+}
+
+enum {	/* option keys, matched in parse_arg() */
+	ARG_SCENARIO         = 9001,
+	ARG_LIST_SCENARIOS   = 9002,
+	ARG_MACHINE_READABLE = 9003,
+};
+
+static const struct argp_option opts[] = {
+	{ "scenario", ARG_SCENARIO, "NAME", 0,
+	  "Scenario to benchmark (required)" },
+	{ "list-scenarios", ARG_LIST_SCENARIOS, NULL, 0,
+	  "List available scenarios and exit" },
+	{ "machine-readable", ARG_MACHINE_READABLE, NULL, 0,
+	  "Print only a machine-readable RESULT line" },
+	{},	/* empty entry ends the table */
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	int i;
+
+	switch (key) {
+	case ARG_SCENARIO:
+		args.scenario = find_scenario(arg);	/* -1 when the name is unknown */
+		if (args.scenario < 0) {
+			fprintf(stderr, "unknown scenario: '%s'\n", arg);
+			fprintf(stderr, "use --list-scenarios to see options\n");
+			argp_usage(state);
+		}
+		break;
+	case ARG_LIST_SCENARIOS:
+		printf("Available scenarios:\n");
+		for (i = 0; i < NUM_SCENARIOS; i++)
+			printf("  %-20s  %s\n", scenarios[i].name, scenarios[i].description);
+		exit(0);	/* list and exit, so no break follows */
+	case ARG_MACHINE_READABLE:
+		args.machine_readable = true;
+		env.quiet = true;	/* this option also turns on env.quiet */
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_xdp_lb_argp = {
+	.options = opts,
+	.parser  = parse_arg,
+};
+
+const struct bench bench_xdp_lb = {
+	.name            = "xdp-lb",	/* the name run_bench_xdp_lb.sh passes to ./bench -a */
+	.argp            = &bench_xdp_lb_argp,
+	.validate        = xdp_lb_validate,
+	.setup           = xdp_lb_setup,
+	.producer_thread = xdp_lb_producer,
+	.measure         = xdp_lb_measure,
+	.report_final    = xdp_lb_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_xdp_lb.sh b/tools/testing/selftests/bpf/benchs/run_bench_xdp_lb.sh
new file mode 100755
index 000000000000..f65cf46214a3
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_xdp_lb.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source ./benchs/run_common.sh
+
+set -eufo pipefail
+# WARMUP defaults to 3 unless it is already set in the environment
+WARMUP=${WARMUP:-3}
+
+RUN="sudo ./bench -q -w${WARMUP} -a xdp-lb --machine-readable"
+# table layout: separator line plus printf formats for header and data rows
+SEP="  +----------------------------------+----------+---------+----------+"
+HDR="  | %-32s | %8s | %7s | %8s |\n"
+ROW="  | %-32s | %8s | %7s | %8s |\n"
+
+function group_header()
+{
+	printf "%s\n" "$SEP"
+	printf "$HDR" "$1" "p50" "stddev" "p99"	# $1 is the group title shown in the first column
+	printf "%s\n" "$SEP"
+}
+
+function rval()
+{
+	echo "$1" | sed -nE "s/.*$2=([^ ]+).*/\1/p"	# print VALUE from "$2=VALUE" in $1; prints nothing if absent
+}
+
+function run_scenario()
+{
+	local sc="$1"
+	shift	# remaining arguments are passed through to the bench command
+	local output rline
+	# "|| true" keeps set -e from aborting on a failed run or a grep miss
+	output=$($RUN --scenario "$sc" "$@" 2>&1) || true
+	rline=$(echo "$output" | grep '^RESULT ' || true)
+
+	if [ -z "$rline" ]; then	# no RESULT line: print an ERR row instead
+		printf "$ROW" "$sc" "ERR" "-" "-"
+		return
+	fi
+
+	printf "$ROW" "$sc" \
+		"$(rval "$rline" median)" \
+		"$(rval "$rline" stddev)" \
+		"$(rval "$rline" p99)"
+}
+
+header "XDP load-balancer benchmark"	# header is not defined in this script; presumably from run_common.sh
+
+group_header "Single-flow baseline"
+for sc in tcp-v4-lru-hit tcp-v4-ch \
+	  tcp-v6-lru-hit tcp-v6-ch \
+	  udp-v4-lru-hit udp-v6-lru-hit \
+	  tcp-v4v6-lru-hit; do
+	run_scenario "$sc"
+done
+
+group_header "Diverse flows (4K src addrs)"
+for sc in tcp-v4-lru-diverse tcp-v4-ch-diverse \
+	  tcp-v6-lru-diverse tcp-v6-ch-diverse \
+	  udp-v4-lru-diverse; do
+	run_scenario "$sc"
+done
+
+group_header "TCP flags"
+run_scenario tcp-v4-syn
+run_scenario tcp-v4-rst-miss
+
+group_header "LRU stress"
+run_scenario tcp-v4-lru-miss
+run_scenario udp-v4-lru-miss
+run_scenario tcp-v4-lru-warmup
+
+group_header "Early exits"
+for sc in pass-v4-no-vip pass-v6-no-vip pass-v4-icmp pass-non-ip drop-v4-frag drop-v4-options \
+	  drop-v6-frag; do
+	run_scenario "$sc"
+done
+printf "%s\n" "$SEP"	# bottom rule of the last group
diff --git a/tools/testing/selftests/bpf/bpf_arena_alloc.h b/tools/testing/selftests/bpf/bpf_arena_alloc.h
index c27678299e0c..cda147fd9d25 100644
--- a/tools/testing/selftests/bpf/bpf_arena_alloc.h
+++ b/tools/testing/selftests/bpf/bpf_arena_alloc.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
 /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
 #pragma once
-#include "bpf_arena_common.h"
+#include <bpf_arena_common.h>
 
 #ifndef __round_mask
 #define __round_mask(x, y) ((__typeof__(x))((y)-1))
diff --git a/tools/testing/selftests/bpf/bpf_arena_htab.h b/tools/testing/selftests/bpf/bpf_arena_htab.h
index acc01a876668..d7ba86362d86 100644
--- a/tools/testing/selftests/bpf/bpf_arena_htab.h
+++ b/tools/testing/selftests/bpf/bpf_arena_htab.h
@@ -14,9 +14,8 @@ struct htab {
 	htab_bucket_t *buckets;
 	int n_buckets;
 };
-typedef struct htab __arena htab_t;
 
-static inline htab_bucket_t *__select_bucket(htab_t *htab, __u32 hash)
+static inline htab_bucket_t *__select_bucket(struct htab __arena *htab, __u32 hash)
 {
 	htab_bucket_t *b = htab->buckets;
 
@@ -24,7 +23,7 @@ static inline htab_bucket_t *__select_bucket(htab_t *htab, __u32 hash)
 	return &b[hash & (htab->n_buckets - 1)];
 }
 
-static inline arena_list_head_t *select_bucket(htab_t *htab, __u32 hash)
+static inline arena_list_head_t *select_bucket(struct htab __arena *htab, __u32 hash)
 {
 	return &__select_bucket(htab, hash)->head;
 }
@@ -53,7 +52,7 @@ static int htab_hash(int key)
 	return key;
 }
 
-__weak int htab_lookup_elem(htab_t *htab __arg_arena, int key)
+__weak int htab_lookup_elem(struct htab __arena *htab, int key)
 {
 	hashtab_elem_t *l_old;
 	arena_list_head_t *head;
@@ -66,7 +65,7 @@ __weak int htab_lookup_elem(htab_t *htab __arg_arena, int key)
 	return 0;
 }
 
-__weak int htab_update_elem(htab_t *htab __arg_arena, int key, int value)
+__weak int htab_update_elem(struct htab __arena *htab, int key, int value)
 {
 	hashtab_elem_t *l_new = NULL, *l_old;
 	arena_list_head_t *head;
@@ -90,7 +89,7 @@ __weak int htab_update_elem(htab_t *htab __arg_arena, int key, int value)
 	return 0;
 }
 
-void htab_init(htab_t *htab)
+void htab_init(struct htab __arena *htab)
 {
 	void __arena *buckets = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0);
 
diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h
index e16fa7d95fcf..1af2ffc27d9c 100644
--- a/tools/testing/selftests/bpf/bpf_arena_list.h
+++ b/tools/testing/selftests/bpf/bpf_arena_list.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
 /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
 #pragma once
-#include "bpf_arena_common.h"
+#include <bpf_arena_common.h>
 
 struct arena_list_node;
 
diff --git a/tools/testing/selftests/bpf/bpf_arena_strsearch.h b/tools/testing/selftests/bpf/bpf_arena_strsearch.h
index c1b6eaa905bb..10a70667c8bf 100644
--- a/tools/testing/selftests/bpf/bpf_arena_strsearch.h
+++ b/tools/testing/selftests/bpf/bpf_arena_strsearch.h
@@ -1,9 +1,9 @@
 /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
 /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
 #pragma once
-#include "bpf_arena_common.h"
+#include <bpf_arena_common.h>
 
-__noinline int bpf_arena_strlen(const char __arena *s __arg_arena)
+__noinline int bpf_arena_strlen(const char __arena *s)
 {
 	const char __arena *sc;
 
@@ -40,7 +40,7 @@ __noinline int bpf_arena_strlen(const char __arena *s __arg_arena)
  *
  * An opening bracket without a matching close is matched literally.
  */
-__noinline bool glob_match(char const __arena *pat __arg_arena, char const __arena *str __arg_arena)
+__noinline bool glob_match(char const __arena *pat, char const __arena *str)
 {
 	/*
 	 * Backtrack to previous * on mismatch and retry starting one
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 2234bd6bc9d3..d1db355e872b 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -5,6 +5,7 @@
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_core_read.h>
+#include <bpf_may_goto.h>
 
 #define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node)))
 
@@ -204,89 +205,6 @@ l_true:												\
        })
 #endif
 
-/*
- * Note that cond_break can only be portably used in the body of a breakable
- * construct, whereas can_loop can be used anywhere.
- */
-#ifdef __BPF_FEATURE_MAY_GOTO
-#define can_loop					\
-	({ __label__ l_break, l_continue;		\
-	bool ret = true;				\
-	asm volatile goto("may_goto %l[l_break]"	\
-		      :::: l_break);			\
-	goto l_continue;				\
-	l_break: ret = false;				\
-	l_continue:;					\
-	ret;						\
-	})
-
-#define __cond_break(expr)				\
-	({ __label__ l_break, l_continue;		\
-	asm volatile goto("may_goto %l[l_break]"	\
-		      :::: l_break);			\
-	goto l_continue;				\
-	l_break: expr;					\
-	l_continue:;					\
-	})
-#else
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define can_loop					\
-	({ __label__ l_break, l_continue;		\
-	bool ret = true;				\
-	asm volatile goto("1:.byte 0xe5;		\
-		      .byte 0;				\
-		      .long ((%l[l_break] - 1b - 8) / 8) & 0xffff;	\
-		      .short 0"				\
-		      :::: l_break);			\
-	goto l_continue;				\
-	l_break: ret = false;				\
-	l_continue:;					\
-	ret;						\
-	})
-
-#define __cond_break(expr)				\
-	({ __label__ l_break, l_continue;		\
-	asm volatile goto("1:.byte 0xe5;		\
-		      .byte 0;				\
-		      .long ((%l[l_break] - 1b - 8) / 8) & 0xffff;	\
-		      .short 0"				\
-		      :::: l_break);			\
-	goto l_continue;				\
-	l_break: expr;					\
-	l_continue:;					\
-	})
-#else
-#define can_loop					\
-	({ __label__ l_break, l_continue;		\
-	bool ret = true;				\
-	asm volatile goto("1:.byte 0xe5;		\
-		      .byte 0;				\
-		      .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16;	\
-		      .short 0"				\
-		      :::: l_break);			\
-	goto l_continue;				\
-	l_break: ret = false;				\
-	l_continue:;					\
-	ret;						\
-	})
-
-#define __cond_break(expr)				\
-	({ __label__ l_break, l_continue;		\
-	asm volatile goto("1:.byte 0xe5;		\
-		      .byte 0;				\
-		      .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16;	\
-		      .short 0"				\
-		      :::: l_break);			\
-	goto l_continue;				\
-	l_break: expr;					\
-	l_continue:;					\
-	})
-#endif
-#endif
-
-#define cond_break __cond_break(break)
-#define cond_break_label(label) __cond_break(goto label)
-
 #ifndef bpf_nop_mov
 #define bpf_nop_mov(var) \
 	asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var))
diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h
index 7dad01439391..ae71e9b69051 100644
--- a/tools/testing/selftests/bpf/bpf_kfuncs.h
+++ b/tools/testing/selftests/bpf/bpf_kfuncs.h
@@ -40,7 +40,7 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u64 offset,
 extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u64 offset, void *buffer,
 				   __u64 buffer__szk) __ksym __weak;
 
-extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak;
+extern int bpf_dynptr_adjust(struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak;
 extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym __weak;
 extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym __weak;
 extern __u64 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak;
@@ -70,13 +70,13 @@ extern void *bpf_rdonly_cast(const void *obj, __u32 btf_id) __ksym __weak;
 
 extern int bpf_get_file_xattr(struct file *file, const char *name,
 			      struct bpf_dynptr *value_ptr) __ksym;
-extern int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *digest_ptr) __ksym;
+extern int bpf_get_fsverity_digest(struct file *file, const struct bpf_dynptr *digest_ptr) __ksym;
 
 extern struct bpf_key *bpf_lookup_user_key(__s32 serial, __u64 flags) __ksym;
 extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym;
 extern void bpf_key_put(struct bpf_key *key) __ksym;
-extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr,
-				      struct bpf_dynptr *sig_ptr,
+extern int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_ptr,
+				      const struct bpf_dynptr *sig_ptr,
 				      struct bpf_key *trusted_keyring) __ksym;
 
 struct dentry;
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 24855381290d..bac60b444551 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -130,4 +130,5 @@ CONFIG_INFINIBAND=y
 CONFIG_SMC=y
 CONFIG_SMC_HS_CTRL_BPF=y
 CONFIG_DIBS=y
-CONFIG_DIBS_LO=y
-\ No newline at end of file
+CONFIG_DIBS_LO=y
+CONFIG_PM_WAKELOCKS=y
diff --git a/tools/testing/selftests/bpf/default.profraw b/tools/testing/selftests/bpf/default.profraw
new file mode 100644
index 000000000000..e865e87829f8
--- /dev/null
+++ b/tools/testing/selftests/bpf/default.profraw
diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c
index 364c557c5115..3558fe10e28c 100644
--- a/tools/testing/selftests/bpf/jit_disasm_helpers.c
+++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c
@@ -96,10 +96,19 @@ static int disasm_one_func(FILE *text_out, uint8_t *image, __u32 len)
 	__u32 *label_pc, pc;
 	int i, cnt, err = 0;
 	char buf[64];
+	char *cpu, *features;
 
 	triple = LLVMGetDefaultTargetTriple();
-	ctx = LLVMCreateDisasm(triple, &labels, 0, NULL, lookup_symbol);
-	if (!ASSERT_OK_PTR(ctx, "LLVMCreateDisasm")) {
+
+	cpu = LLVMGetHostCPUName();
+	features = LLVMGetHostCPUFeatures();
+
+	ctx = LLVMCreateDisasmCPUFeatures(triple, cpu, features, &labels, 0, NULL, lookup_symbol);
+
+	LLVMDisposeMessage(cpu);
+	LLVMDisposeMessage(features);
+
+	if (!ASSERT_OK_PTR(ctx, "LLVMCreateDisasmCPUFeatures")) {
 		err = -EINVAL;
 		goto out;
 	}
diff --git a/tools/testing/selftests/bpf/libarena/Makefile b/tools/testing/selftests/bpf/libarena/Makefile
new file mode 100644
index 000000000000..5e2ab514805e
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/Makefile
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+# Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+
+.PHONY: clean
+
+# Defaults for standalone builds
+
+CLANG ?= clang
+BPFTOOL ?= bpftool
+LDLIBS ?= -lbpf -lelf -lz -lrt -lpthread -lzstd
+
+ifeq ($(V),1)
+Q =
+msg =
+else
+Q ?= @
+msg = @printf '  %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))";
+endif
+
+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \
+			grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__')
+BPF_TARGET_ENDIAN ?= $(if $(IS_LITTLE_ENDIAN),--target=bpfel,--target=bpfeb)
+
+LIBARENA=$(abspath .)
+BPFDIR=$(abspath $(LIBARENA)/..)
+
+INCLUDE_DIR ?= $(BPFDIR)/tools/include
+LIBBPF_INCLUDE ?= $(INCLUDE_DIR)
+
+# Scan src/ and selftests/ to generate the final binaries
+LIBARENA_SOURCES = $(wildcard $(LIBARENA)/src/*.bpf.c) $(wildcard $(LIBARENA)/selftests/*.bpf.c)
+LIBARENA_OBJECTS = $(notdir $(LIBARENA_SOURCES:.bpf.c=.bpf.o))
+LIBARENA_OBJECTS_ASAN = $(notdir $(LIBARENA_SOURCES:.bpf.c=_asan.bpf.o))
+
+INCLUDES = -I$(LIBARENA)/include -I$(BPFDIR)
+ifneq ($(INCLUDE_DIR),)
+INCLUDES += -I$(INCLUDE_DIR)
+endif
+ifneq ($(LIBBPF_INCLUDE),)
+INCLUDES += -I$(LIBBPF_INCLUDE)
+endif
+
+ASAN_FLAGS = -fsanitize=kernel-address -fno-stack-protector -fno-builtin
+ASAN_FLAGS += -mllvm -asan-instrument-address-spaces=1 -mllvm -asan-shadow-addr-space=1
+ASAN_FLAGS += -mllvm -asan-use-stack-safety=0 -mllvm -asan-stack=0
+ASAN_FLAGS += -mllvm -asan-kernel=1
+ASAN_FLAGS += -mllvm -asan-constructor-kind=none
+ASAN_FLAGS += -mllvm -asan-destructor-kind=none
+
+# ENABLE_ATOMICS_TESTS required because we use arena spinlocks
+override BPF_CFLAGS += -DENABLE_ATOMICS_TESTS
+override BPF_CFLAGS += -O2 -g
+override BPF_CFLAGS += -Wno-incompatible-pointer-types-discards-qualifiers
+# Required for suppressing harmless vmlinux.h-related warnings.
+override BPF_CFLAGS += -Wno-missing-declarations
+override BPF_CFLAGS += $(INCLUDES)
+
+CFLAGS = -O2 -no-pie
+CFLAGS += $(INCLUDES)
+
+vpath %.bpf.c $(LIBARENA)/src $(LIBARENA)/selftests
+vpath %.c $(LIBARENA)/src $(LIBARENA)/selftests
+
+skeletons: libarena.skel.h libarena_asan.skel.h
+.PHONY: skeletons
+
+libarena_asan.skel.h: libarena_asan.bpf.o
+	$(call msg,GEN-SKEL,libarena,$@)
+	$(Q)$(BPFTOOL) gen skeleton $< name "libarena_asan" > $@
+
+libarena.skel.h: libarena.bpf.o
+	$(call msg,GEN-SKEL,libarena,$@)
+	$(Q)$(BPFTOOL) gen skeleton $< name "libarena" > $@
+
+libarena_asan.bpf.o: $(LIBARENA_OBJECTS_ASAN)
+	$(call msg,GEN-OBJ,libarena,$@)
+	$(Q)$(BPFTOOL) gen object $@ $^
+
+libarena.bpf.o: $(LIBARENA_OBJECTS)
+	$(call msg,GEN-OBJ,libarena,$@)
+	$(Q)$(BPFTOOL) gen object $@ $^
+
+%_asan.bpf.o: %.bpf.c
+	$(call msg,CLNG-BPF,libarena,$@)
+	$(Q)$(CLANG) $(BPF_CFLAGS) $(ASAN_FLAGS) -DBPF_ARENA_ASAN $(BPF_TARGET_ENDIAN) -c $< -o $@
+
+%.bpf.o: %.bpf.c
+	$(call msg,CLNG-BPF,libarena,$@)
+	$(Q)$(CLANG) $(BPF_CFLAGS) $(BPF_TARGET_ENDIAN) -c $< -o $@
+
+clean:
+	$(Q)rm -f *.skel.h *.bpf.o *.linked*.o
diff --git a/tools/testing/selftests/bpf/bpf_arena_common.h b/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h
index 16f8ce832004..82aafe879fae 100644
--- a/tools/testing/selftests/bpf/bpf_arena_common.h
+++ b/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h
@@ -33,12 +33,12 @@
 #endif
 
 #if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM)
-#define __arena __attribute__((address_space(1)))
+#define __arena __attribute__((address_space(1))) __attribute__((btf_type_tag("arena")))
 #define __arena_global __attribute__((address_space(1)))
 #define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */
 #define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */
 #else
-#define __arena
+#define __arena __attribute__((btf_type_tag("arena")))
 #define __arena_global SEC(".addr_space.1")
 #define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1)
 #define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0)
@@ -54,7 +54,6 @@ void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym _
 #else /* when compiled as user space code */
 
 #define __arena
-#define __arg_arena
 #define cast_kern(ptr) /* nop for user space */
 #define cast_user(ptr) /* nop for user space */
 __weak char arena[1];
diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h
index f90531cf3ee5..ae6b72d15bb6 100644
--- a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h
+++ b/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h
@@ -5,7 +5,7 @@
 
 #include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
-#include "bpf_atomic.h"
+#include <bpf_atomic.h>
 
 #define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
 #define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)
@@ -16,10 +16,6 @@
 #define EOPNOTSUPP 95
 #define ETIMEDOUT 110
 
-#ifndef __arena
-#define __arena __attribute__((address_space(1)))
-#endif
-
 extern unsigned long CONFIG_NR_CPUS __kconfig;
 
 /*
@@ -107,7 +103,12 @@ struct arena_qnode {
 #define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
 #define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)
 
-struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];
+/*
+ * The qnodes are marked __weak so we can define them in the header
+ * while still ensuring all compilation units use the same struct
+ * instance.
+ */
+struct arena_qnode __weak __arena __hidden qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];
 
 static inline u32 encode_tail(int cpu, int idx)
 {
@@ -240,8 +241,8 @@ static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
 	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
 }
 
-__noinline
-int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
+__noinline __weak
+int arena_spin_lock_slowpath(arena_spinlock_t __arena *lock, u32 val)
 {
 	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
 	int ret = -ETIMEDOUT;
diff --git a/tools/testing/selftests/bpf/bpf_atomic.h b/tools/testing/selftests/bpf/libarena/include/bpf_atomic.h
index c550e5711967..b7b230431929 100644
--- a/tools/testing/selftests/bpf/bpf_atomic.h
+++ b/tools/testing/selftests/bpf/libarena/include/bpf_atomic.h
@@ -5,7 +5,7 @@
 
 #include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
-#include "bpf_experimental.h"
+#include <bpf_may_goto.h>
 
 extern bool CONFIG_X86_64 __kconfig __weak;
 
@@ -42,7 +42,9 @@ extern bool CONFIG_X86_64 __kconfig __weak;
 
 #define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
 
+#ifndef WRITE_ONCE
 #define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val))
+#endif
 
 #define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new)
 
diff --git a/tools/testing/selftests/bpf/libarena/include/bpf_may_goto.h b/tools/testing/selftests/bpf/libarena/include/bpf_may_goto.h
new file mode 100644
index 000000000000..9ba90689d6ba
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/include/bpf_may_goto.h
@@ -0,0 +1,84 @@
+#pragma once
+
+/*
+ * Note that cond_break can only be portably used in the body of a breakable
+ * construct, whereas can_loop can be used anywhere.
+ */
+#ifdef __BPF_FEATURE_MAY_GOTO
+#define can_loop					\
+	({ __label__ l_break, l_continue;		\
+	bool ret = true;				\
+	asm volatile goto("may_goto %l[l_break]"	\
+		      :::: l_break);			\
+	goto l_continue;				\
+	l_break: ret = false;				\
+	l_continue:;					\
+	ret;						\
+	})
+
+#define __cond_break(expr)				\
+	({ __label__ l_break, l_continue;		\
+	asm volatile goto("may_goto %l[l_break]"	\
+		      :::: l_break);			\
+	goto l_continue;				\
+	l_break: expr;					\
+	l_continue:;					\
+	})
+#else
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define can_loop					\
+	({ __label__ l_break, l_continue;		\
+	bool ret = true;				\
+	asm volatile goto("1:.byte 0xe5;		\
+		      .byte 0;				\
+		      .long ((%l[l_break] - 1b - 8) / 8) & 0xffff;	\
+		      .short 0"				\
+		      :::: l_break);			\
+	goto l_continue;				\
+	l_break: ret = false;				\
+	l_continue:;					\
+	ret;						\
+	})
+
+#define __cond_break(expr)				\
+	({ __label__ l_break, l_continue;		\
+	asm volatile goto("1:.byte 0xe5;		\
+		      .byte 0;				\
+		      .long ((%l[l_break] - 1b - 8) / 8) & 0xffff;	\
+		      .short 0"				\
+		      :::: l_break);			\
+	goto l_continue;				\
+	l_break: expr;					\
+	l_continue:;					\
+	})
+#else
+#define can_loop					\
+	({ __label__ l_break, l_continue;		\
+	bool ret = true;				\
+	asm volatile goto("1:.byte 0xe5;		\
+		      .byte 0;				\
+		      .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16;	\
+		      .short 0"				\
+		      :::: l_break);			\
+	goto l_continue;				\
+	l_break: ret = false;				\
+	l_continue:;					\
+	ret;						\
+	})
+
+#define __cond_break(expr)				\
+	({ __label__ l_break, l_continue;		\
+	asm volatile goto("1:.byte 0xe5;		\
+		      .byte 0;				\
+		      .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16;	\
+		      .short 0"				\
+		      :::: l_break);			\
+	goto l_continue;				\
+	l_break: expr;					\
+	l_continue:;					\
+	})
+#endif
+#endif
+
+#define cond_break __cond_break(break)
+#define cond_break_label(label) __cond_break(goto label)
diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/asan.h b/tools/testing/selftests/bpf/libarena/include/libarena/asan.h
new file mode 100644
index 000000000000..900267159292
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/include/libarena/asan.h
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#pragma once
+
+struct asan_init_args {
+	u64 arena_all_pages;
+	u64 arena_globals_pages;
+};
+
+int asan_init(struct asan_init_args *args);
+
+extern volatile u64 __asan_shadow_memory_dynamic_address;
+extern volatile u32 asan_reported;
+extern volatile bool asan_inited;
+extern volatile bool asan_report_once;
+
+#ifdef __BPF__
+
+#define ASAN_SHADOW_SHIFT 3
+#define ASAN_SHADOW_SCALE (1ULL << ASAN_SHADOW_SHIFT)
+#define ASAN_GRANULE_MASK ((1ULL << ASAN_SHADOW_SHIFT) - 1)
+#define ASAN_GRANULE(addr) ((s8)((u32)(u64)((addr)) & ASAN_GRANULE_MASK))
+
+#define __noasan __attribute__((no_sanitize("address")))
+
+#ifdef BPF_ARENA_ASAN
+
+static inline
+s8 __arena *mem_to_shadow(void __arena *addr)
+{
+	return (s8 __arena *)(((u32)(u64)addr >> ASAN_SHADOW_SHIFT) +
+			__asan_shadow_memory_dynamic_address);
+}
+
+__weak __noasan
+bool asan_ready(void)
+{
+	return __asan_shadow_memory_dynamic_address;
+}
+
+int asan_poison(void __arena *addr, s8 val, size_t size);
+int asan_unpoison(void __arena *addr, size_t size);
+bool asan_shadow_set(void __arena *addr);
+
+/*
+ * Dummy calls to ensure the ASAN runtime's BTF information is present
+ * in every object file when compiling the runtime and local BPF code
+ * separately. The runtime calls are injected into the LLVM IR file.
+ */
+#define DECLARE_ASAN_LOAD_STORE_SIZE(size)				\
+	void __asan_store##size(intptr_t addr);				\
+	void __asan_store##size##_noabort(intptr_t addr);	\
+	void __asan_load##size(intptr_t addr);				\
+	void __asan_load##size##_noabort(intptr_t addr);	\
+	void __asan_report_store##size(intptr_t addr);			\
+	void __asan_report_store##size##_noabort(intptr_t addr);		\
+	void __asan_report_load##size(intptr_t addr);			\
+	void __asan_report_load##size##_noabort(intptr_t addr);
+
+DECLARE_ASAN_LOAD_STORE_SIZE(1);
+DECLARE_ASAN_LOAD_STORE_SIZE(2);
+DECLARE_ASAN_LOAD_STORE_SIZE(4);
+DECLARE_ASAN_LOAD_STORE_SIZE(8);
+
+void __asan_storeN(intptr_t addr, ssize_t size);
+void __asan_storeN_noabort(intptr_t addr, ssize_t size);
+void __asan_loadN(intptr_t addr, ssize_t size);
+void __asan_loadN_noabort(intptr_t addr, ssize_t size);
+
+/*
+ * Force LLVM to emit BTF information for the stubs,
+ * because the ASAN pass in LLVM by itself doesn't.
+ */
+#define ASAN_LOAD_STORE_SIZE(size)		\
+	__asan_store##size,			\
+	__asan_store##size##_noabort,		\
+	__asan_load##size,			\
+	__asan_load##size##_noabort,		\
+	__asan_report_store##size,		\
+	__asan_report_store##size##_noabort,	\
+	__asan_report_load##size,		\
+	__asan_report_load##size##_noabort
+
+__attribute__((used))
+static void (*__asan_btf_anchors[])(intptr_t) = {
+	ASAN_LOAD_STORE_SIZE(1),
+	ASAN_LOAD_STORE_SIZE(2),
+	ASAN_LOAD_STORE_SIZE(4),
+	ASAN_LOAD_STORE_SIZE(8),
+};
+
+#else /* BPF_ARENA_ASAN */
+
+static inline int asan_poison(void __arena *addr, s8 val, size_t size) { return 0; }
+static inline int asan_unpoison(void __arena *addr, size_t size) { return 0; }
+static inline bool asan_shadow_set(void __arena *addr) { return 0; }
+__weak bool asan_ready(void) { return true; }
+
+#endif /* BPF_ARENA_ASAN */
+
+#endif /* __BPF__ */
diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h b/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h
new file mode 100644
index 000000000000..528c69a1f38e
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#pragma once
+
+enum buddy_consts {
+	/*
+	 * Minimum allocation is 1 << BUDDY_MIN_ALLOC_SHIFT.
+	 * Larger sizes increase internal fragmentation, but smaller
+	 * sizes increase the space overhead of the block metadata.
+	 */
+	BUDDY_MIN_ALLOC_SHIFT	= 4,
+	BUDDY_MIN_ALLOC_BYTES	= 1 << BUDDY_MIN_ALLOC_SHIFT,
+
+	/*
+	 * How many orders the buddy allocator can serve. Minimum block
+	 * size is 1 << BUDDY_MIN_ALLOC_SHIFT, maximum block size is
+	 * 1 << (BUDDY_MIN_ALLOC_SHIFT + BUDDY_CHUNK_NUM_ORDERS - 1):
+	 * Each block has size 1 << BUDDY_MIN_ALLOC_SHIFT, and the
+	 * allocation orders are in [0, BUDDY_CHUNK_NUM_ORDERS).
+	 * We keep two blocks of the maximum size to retain the
+	 * property in the code that all blocks have a buddy.
+	 * Higher values increase the maximum allocation size,
+	 * but also the size of the metadata for each block.
+	 */
+	BUDDY_CHUNK_NUM_ORDERS	= 1 << 4,
+	BUDDY_CHUNK_BYTES	= BUDDY_MIN_ALLOC_BYTES << (BUDDY_CHUNK_NUM_ORDERS),
+
+	/* Offset of the buddy header within a free block, see buddy.bpf.c for details */
+	BUDDY_HEADER_OFF	= 8,
+
+	/* The maximum number of blocks a chunk may have to track. */
+	BUDDY_CHUNK_ITEMS	= 1 << (BUDDY_CHUNK_NUM_ORDERS),
+	BUDDY_CHUNK_OFFSET_MASK	= BUDDY_CHUNK_BYTES - 1,
+
+	/*
+	 * Alignment for chunk allocations based on bpf_arena_alloc_pages.
+	 * The arena allocation kfunc does not have an alignment argument,
+	 * but that is required for all block calculations in the chunk to
+	 * work.
+	 */
+	BUDDY_VADDR_OFFSET	= BUDDY_CHUNK_BYTES,
+
+	/* Total arena virtual address space the allocator can consume. */
+	BUDDY_VADDR_SIZE	= BUDDY_CHUNK_BYTES << 10
+};
+
+struct buddy_header {
+	u32 prev_index;	/* "Pointer" to the previous available allocation of the same size. */
+	u32 next_index; /* Same for the next allocation. */
+};
+
+/*
+ * We bring memory into the allocator 1 MiB at a time.
+ */
+struct buddy_chunk {
+	/* The order of the current allocation for an item. 4 bits per order. */
+	u8		orders[BUDDY_CHUNK_ITEMS / 2];
+	/*
+	 * Bit to denote whether chunk is allocated. Size of the allocated/free
+	 * chunk found from the orders array.
+	 */
+	u8		allocated[BUDDY_CHUNK_ITEMS / 8];
+	/* Freelists for O(1) allocation. */
+	u64		freelists[BUDDY_CHUNK_NUM_ORDERS];
+	struct buddy_chunk __arena	*next;
+};
+
+struct buddy {
+	struct buddy_chunk __arena *first_chunk;		/* Pointer to the chunk linked list. */
+	arena_spinlock_t lock;			/* Allocator lock */
+	u64 vaddr;				/* Allocation into reserved vaddr */
+};
+
+#ifdef __BPF__
+
+int buddy_init(struct buddy __arena *buddy);
+int buddy_destroy(struct buddy __arena *buddy);
+int buddy_free(struct buddy __arena *buddy, void __arena *free);
+void __arena *buddy_alloc(struct buddy __arena *buddy, size_t size);
+
+#endif /* __BPF__  */
diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/common.h b/tools/testing/selftests/bpf/libarena/include/libarena/common.h
new file mode 100644
index 000000000000..a3eb1641ac36
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/include/libarena/common.h
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#pragma once
+
+#ifdef __BPF__
+
+#include <vmlinux.h>
+
+#include <bpf_arena_common.h>
+#include <bpf_arena_spin_lock.h>
+
+#include <asm-generic/errno.h>
+
+#ifndef __BPF_FEATURE_ADDR_SPACE_CAST
+#error "Arena allocators require bpf_addr_space_cast feature"
+#endif
+
+#define arena_stdout(fmt, ...) bpf_stream_printk(1, (fmt), ##__VA_ARGS__)
+#define arena_stderr(fmt, ...) bpf_stream_printk(2, (fmt), ##__VA_ARGS__)
+
+#ifndef __maybe_unused
+#define __maybe_unused __attribute__((__unused__))
+#endif
+
+#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
+
+#define ARENA_PAGES (1UL << (32 - __builtin_ffs(__PAGE_SIZE) + 1))
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARENA);
+	__uint(map_flags, BPF_F_MMAPABLE);
+	__uint(max_entries, ARENA_PAGES); /* number of pages */
+#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
+	__ulong(map_extra, (1ull << 32)); /* start of mmap() region */
+#else
+	__ulong(map_extra, (1ull << 44)); /* start of mmap() region */
+#endif
+} arena __weak SEC(".maps");
+
+/*
+ * This is a variable used to aid verification. The may_goto directive
+ * permits open-coded for loops, but requires that the index variable is
+ * imprecise. To force the variable to be imprecise, initialize it with
+ * the opaque volatile variable zero instead of the constant 0.
+ */
+extern const volatile u32 zero;
+extern volatile u64 asan_violated;
+
+int arena_fls(__u64 word);
+
+void __arena *arena_malloc(size_t size);
+void arena_free(void __arena *ptr);
+
+/*
+ * The verifier associates arenas with programs by checking LD.IMM
+ * instruction operands for an arena and populating the program state
+ * with the first instance it finds. This requires accessing our global
+ * arena variable, but subprogs do not necessarily do so while still
+ * using pointers from that arena. Insert an LD.IMM instruction  to
+ * access the arena and help the verifier.
+ */
+#define arena_subprog_init() do { asm volatile ("" :: "r"(&arena)); } while (0)
+
+#else /* ! __BPF__ */
+
+#include <stdint.h>
+
+#define __arena
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int8_t s8;
+typedef int16_t s16;
+typedef int32_t s32;
+typedef int64_t s64;
+
+/* Dummy "definition" for userspace. */
+#define arena_spinlock_t int
+
+#endif /* __BPF__ */
+
+struct arena_get_info_args {
+	void __arena *arena_base;
+};
+
+struct arena_alloc_reserve_args {
+	u64 nr_pages;
+};
+
+/* Reasonable default number of pages reserved by arena_alloc_reserve. */
+#define ARENA_RESERVE_PAGES_DFL (8)
diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/rbtree.h b/tools/testing/selftests/bpf/libarena/include/libarena/rbtree.h
new file mode 100644
index 000000000000..486428911d96
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/include/libarena/rbtree.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause */
+
+#pragma once
+
+#define RB_MAXLVL_PRINT (16)
+
+struct rbnode;
+
+struct rbnode {
+	struct rbnode __arena *parent;
+	union {
+		struct {
+			struct rbnode __arena *left;
+			struct rbnode __arena *right;
+		};
+
+		struct rbnode __arena *child[2];
+	};
+	uint64_t key;
+	/* Used as a linked list or to store KV pairs. */
+	union {
+		struct rbnode __arena *next;
+		uint64_t value;
+	};
+	bool is_red;
+};
+
+/*
+ * Does the rbtree allocate its own nodes, or do they get
+ * allocated by the caller?
+ */
+enum rbtree_alloc {
+	RB_ALLOC,
+	RB_NOALLOC,
+};
+
+/*
+ * Specify the behavior of rbtree insertions when the key is
+ * already present in the tree.
+ *
+ * RB_DEFAULT: Default behavior, reject the new insert.
+ *
+ * RB_UPDATE: Update the existing value in the rbtree.
+ * This updates the node itself, not just the value in
+ * the existing node.
+ *
+ * RB_DUPLICATE: Allow nodes with identical keys in the rbtree.
+ * Finding/popping/removing a key acts on any of the nodes
+ * with the appropriate key - there is no ordering by time
+ * of insertion.
+ */
+enum rbtree_insert_mode {
+	RB_DEFAULT,
+	RB_UPDATE,
+	RB_DUPLICATE,
+};
+
+struct rbtree {
+	struct rbnode __arena *root;
+	enum rbtree_alloc alloc;
+	enum rbtree_insert_mode insert;
+};
+
+#ifdef __BPF__
+struct rbtree __arena *rb_create(enum rbtree_alloc alloc, enum rbtree_insert_mode insert);
+
+int rb_destroy(struct rbtree __arena *rbtree);
+int rb_insert(struct rbtree __arena *rbtree, u64 key, u64 value);
+int rb_remove(struct rbtree __arena *rbtree, u64 key);
+int rb_find(struct rbtree __arena *rbtree, u64 key, u64 *value);
+int rb_print(struct rbtree __arena *rbtree);
+int rb_least(struct rbtree __arena *rbtree, u64 *key, u64 *value);
+int rb_pop(struct rbtree __arena *rbtree, u64 *key, u64 *value);
+
+int rb_insert_node(struct rbtree __arena *rbtree, struct rbnode __arena *node);
+int rb_remove_node(struct rbtree __arena *rbtree, struct rbnode __arena *node);
+
+struct rbnode __arena *rb_node_alloc(u64 key, u64 value);
+void rb_node_free(struct rbnode __arena *rbnode);
+
+int rb_integrity_check(struct rbtree __arena *rbtree);
+
+#endif /* __BPF__ */
diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/spmc.h b/tools/testing/selftests/bpf/libarena/include/libarena/spmc.h
new file mode 100644
index 000000000000..75611276ce13
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/include/libarena/spmc.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause */
+
+#pragma once
+
+struct spmc_arr;
+
+#define SPMC_ARR_BASESZ 128
+#define SPMC_ARR_ORDERS 10
+
+struct spmc_arr {
+	u64 __arena *data;
+	u64 order;
+};
+
+struct spmc {
+	volatile struct spmc_arr __arena *cur;
+	volatile u64 top;
+	volatile u64 bottom;
+	struct spmc_arr arr[SPMC_ARR_ORDERS];
+};
+
+int spmc_owned_add(struct spmc __arena *spmc, u64 val);
+int spmc_owned_remove(struct spmc __arena *spmc, u64 *val);
+int spmc_steal(struct spmc __arena *spmc, u64 *val);
+
+struct spmc __arena *spmc_create(void);
+int spmc_destroy(struct spmc __arena *spmc);
diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h b/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h
new file mode 100644
index 000000000000..fc27a4bcf5d7
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#pragma once
+
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+
+/*
+ * Test-run @prog_fd without a context. A failure of the test-run call itself
+ * takes precedence; otherwise the program's own return value is reported.
+ */
+static inline int libarena_run_prog(int prog_fd)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, run_opts);
+	int err = bpf_prog_test_run_opts(prog_fd, &run_opts);
+
+	return err ? err : (int)run_opts.retval;
+}
+
+static inline bool libarena_is_test_prog(const char *name)
+{
+	return strncmp(name, "test_", sizeof("test_") - 1) == 0; /* prefix match */
+}
+
+static inline bool libarena_is_asan_test_prog(const char *name)
+{
+	return strncmp(name, "asan_test", sizeof("asan_test") - 1) == 0; /* prefix match */
+}
+
+static inline bool libarena_is_parallel_test_prog(const char *name)
+{
+	return strncmp(name, "parallel_test", sizeof("parallel_test") - 1) == 0; /* prefix match */
+}
+
+
+/* Test-run @prog_fd with @args (@argsize bytes) passed as the context. */
+static inline int libarena_run_prog_args(int prog_fd, void *args, size_t argsize)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, run_opts,
+		    .ctx_in = args, .ctx_size_in = argsize);
+	int err = bpf_prog_test_run_opts(prog_fd, &run_opts);
+
+	/* A failure of the run itself wins over the program's retval. */
+	if (err)
+		return err;
+	return run_opts.retval;
+}
+
+/*
+ * Test-run @arena_get_info_fd on a zeroed arena_get_info_args context and
+ * store the context's arena_base in *@arena_base if run and program succeed.
+ */
+static inline int libarena_get_arena_base(int arena_get_info_fd,
+					  void **arena_base)
+{
+	struct arena_get_info_args info = { .arena_base = NULL };
+	LIBBPF_OPTS(bpf_test_run_opts, run_opts,
+		    .ctx_in = &info, .ctx_size_in = sizeof(info));
+	int err;
+
+	err = bpf_prog_test_run_opts(arena_get_info_fd, &run_opts);
+	if (!err)
+		err = run_opts.retval;
+	if (!err)
+		*arena_base = info.arena_base;
+	return err;
+}
+
+/*
+ * Count the resident pages at the tail of the arena mapping, as reported by
+ * mincore(), into *@globals_pages. Returns 0 on success, an error otherwise.
+ */
+static inline int libarena_get_globals_pages(int arena_get_globals_fd,
+					     size_t arena_all_pages,
+					     u64 *globals_pages)
+{
+	const size_t page_bytes = sysconf(_SC_PAGESIZE);
+	u64 resident = 0;
+	void *base;
+	size_t idx;
+	u8 *incore;
+	int err;
+
+	err = libarena_get_arena_base(arena_get_globals_fd, &base);
+	if (err)
+		return err;
+	if (!base)
+		return -EINVAL;
+
+	incore = calloc(arena_all_pages, sizeof(*incore));
+	if (!incore)
+		return -ENOMEM;
+
+	if (mincore(base, arena_all_pages * page_bytes, incore) < 0) {
+		err = -errno;	/* saved before free() can disturb errno */
+		goto out;
+	}
+	/* Walk backwards from the last page until the first non-resident one. */
+	for (idx = arena_all_pages; idx > 0 && (incore[idx - 1] & 0x1); idx--)
+		resident++;
+	*globals_pages = resident;
+out:
+	free(incore);
+	return err;
+}
+
+/*
+ * Obtain the globals page count from libarena_get_globals_pages() using
+ * @arena_asan_init_fd, then test-run @asan_init_fd with that count and
+ * @arena_all_pages packed into a struct asan_init_args context.
+ */
+static inline int libarena_asan_init(int arena_asan_init_fd,
+				     int asan_init_fd,
+				     size_t arena_all_pages)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, run_opts);
+	struct asan_init_args init = { .arena_all_pages = arena_all_pages };
+	u64 nr_globals;
+	int err;
+
+	err = libarena_get_globals_pages(arena_asan_init_fd, arena_all_pages,
+					 &nr_globals);
+	if (err)
+		return err;
+
+	init.arena_globals_pages = nr_globals;
+
+	run_opts.ctx_in = &init;
+	run_opts.ctx_size_in = sizeof(init);
+
+	err = bpf_prog_test_run_opts(asan_init_fd, &run_opts);
+	return err ? err : (int)run_opts.retval;
+}
diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c
new file mode 100644
index 000000000000..686caba2c643
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <libarena/common.h>
+#include <libarena/asan.h>
+#include <libarena/buddy.h>
+
+/* Required for parsing the ASAN call stacks. */
+#include "test_progs_compat.h"
+
+extern struct buddy __arena buddy;
+
+#ifdef BPF_ARENA_ASAN
+
+#include "st_asan_common.h"
+
+static __always_inline int asan_test_buddy_oob_single(size_t alloc_size)
+{
+	u8 __arena *mem;
+	int ret, i;
+
+	ret = asan_validate();	/* bail out if a violation is already pending */
+	if (ret < 0)
+		return ret;
+
+	mem = buddy_alloc(&buddy, alloc_size);
+	if (!mem) {
+		arena_stdout("buddy_alloc failed for size %lu", alloc_size);
+		return -ENOMEM;
+	}
+
+	ret = asan_validate();	/* the allocation itself must not record a violation */
+	if (ret < 0)
+		return ret;
+
+	for (i = zero; i < alloc_size && can_loop; i++) {	/* in-bounds writes */
+		mem[i] = 0xba;
+		ret = asan_validate_addr(false, &mem[i]);	/* no violation expected */
+		if (ret < 0)
+			return ret;
+	}
+
+	mem[alloc_size] = 0xba;	/* first byte past the requested size */
+	ret = asan_validate_addr(true, &mem[alloc_size]);	/* violation expected */
+	if (ret < 0)
+		return ret;
+
+	buddy_free(&buddy, mem);
+
+	return 0;
+}
+
+/*
+ * Write mem[i] and expect an ASAN report iff @freed. Factored out because
+ * verifying asan_validate_addr with the rest of asan_test_buddy_uaf_single fails.
+ */
+__weak int asan_test_buddy_byte(u8 __arena *mem, int i, bool freed)
+{
+	int ret;
+
+	/* The header in freed blocks doesn't get poisoned. */
+	if (freed && BUDDY_HEADER_OFF <= i &&
+		i < BUDDY_HEADER_OFF + sizeof(struct buddy_header))
+		return 0;
+
+	mem[i] = 0xba;
+	ret = asan_validate_addr(freed, &mem[i]);	/* violation expected only if freed */
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+__weak int asan_test_buddy_uaf_single(size_t alloc_size)
+{
+	u8 __arena *mem;
+	int ret;
+	int i;
+
+	mem = buddy_alloc(&buddy, alloc_size);
+	if (!mem) {
+		arena_stdout("buddy_alloc failed for size %lu", alloc_size);
+		return -ENOMEM;
+	}
+
+	ret = asan_validate();	/* the allocation itself must not record a violation */
+	if (ret < 0)
+		return ret;
+
+	for (i = zero; i < alloc_size && can_loop; i++) {	/* block still allocated */
+		ret = asan_test_buddy_byte(mem, i, false);
+		if (ret)
+			return ret;
+	}
+
+	ret = asan_validate();
+	if (ret < 0)
+		return ret;
+
+	buddy_free(&buddy, mem);	/* mem is deliberately written again below */
+
+	for (i = zero; i < alloc_size && can_loop; i++) {	/* block now freed */
+		ret = asan_test_buddy_byte(mem, i, true);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+struct buddy_blob {
+	volatile u8 mem[48];	/* asan_test_buddy_blob_single allocates sizeof - 1 bytes */
+	u8 oob;			/* last byte of the struct, written expecting an ASAN report */
+};
+
+static __always_inline int asan_test_buddy_blob_single(void)
+{
+	volatile struct buddy_blob __arena *blob;
+	const size_t alloc_size = sizeof(struct buddy_blob) - 1;	/* one byte short of the struct */
+	int ret;
+
+	blob = buddy_alloc(&buddy, alloc_size);
+	if (!blob)
+		return -ENOMEM;
+
+	blob->mem[0] = 0xba;	/* first byte of mem[]: no violation expected */
+	ret = asan_validate_addr(false, &blob->mem[0]);
+	if (ret < 0)
+		return ret;
+
+	blob->mem[47] = 0xba;	/* last byte of mem[]: no violation expected */
+	ret = asan_validate_addr(false, &blob->mem[47]);
+	if (ret < 0)
+		return ret;
+
+	blob->oob = 0;	/* violation expected */
+	ret = asan_validate_addr(true, &blob->oob);
+	if (ret < 0)
+		return ret;
+
+	buddy_free(&buddy, (void __arena *)blob);
+
+	return 0;
+}
+
+SEC("syscall")
+__stderr("Memory violation for address {{.*}} for write of size 1")
+__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}")
+__stderr("Call trace:\n"
+"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n"
+"|[ \t]+[^\n]+\n)*}}")
+__weak int asan_test_buddy_oob(void)
+{
+	size_t sizes[] = {
+		7, 8, 17, 18, 64, 256, 317, 512, 1024,
+	};
+	int ret, i;
+
+	ret = buddy_init(&buddy);
+	if (ret) {
+		arena_stdout("buddy_init failed with %d", ret);
+		return ret;
+	}
+
+	for (i = zero; i < sizeof(sizes) / sizeof(sizes[0]) && can_loop; i++) {
+		ret = asan_test_buddy_oob_single(sizes[i]);
+		if (ret) {
+			arena_stdout("%s:%d Failed for size %lu", __func__,
+				   __LINE__, sizes[i]);
+			buddy_destroy(&buddy);
+			return ret;
+		}
+	}
+
+	buddy_destroy(&buddy);
+
+	ret = asan_validate();
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+SEC("syscall")
+__stderr("Memory violation for address {{.*}} for write of size 1")
+__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}")
+__stderr("Call trace:\n"
+"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n"
+"|[ \t]+[^\n]+\n)*}}")
+__weak int asan_test_buddy_uaf(void)
+{
+	size_t sizes[] = { 16, 32, 64, 128, 256, 512, 1024, 16384 };
+	int ret, i;
+
+	ret = buddy_init(&buddy);
+	if (ret) {
+		arena_stdout("buddy_init failed with %d", ret);
+		return ret;
+	}
+
+	for (i = zero; i < sizeof(sizes) / sizeof(sizes[0]) && can_loop; i++) {
+		ret = asan_test_buddy_uaf_single(sizes[i]);
+		if (ret) {
+			arena_stdout("%s:%d Failed for size %lu", __func__,
+				   __LINE__, sizes[i]);
+			buddy_destroy(&buddy);
+			return ret;
+		}
+	}
+
+	buddy_destroy(&buddy);
+
+	ret = asan_validate();
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+SEC("syscall")
+__stderr("Memory violation for address {{.*}} for write of size 1")
+__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}")
+__stderr("Call trace:\n"
+"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n"
+"|[ \t]+[^\n]+\n)*}}")
+__weak int asan_test_buddy_blob(void)
+{
+	const int iters = 10;
+	int ret, i;
+
+	ret = buddy_init(&buddy);
+	if (ret) {
+		arena_stdout("buddy_init failed with %d", ret);
+		return ret;
+	}
+
+	for (i = zero; i < iters && can_loop; i++) {
+		ret = asan_test_buddy_blob_single();
+		if (ret) {
+			arena_stdout("%s:%d Failed on iteration %d", __func__,
+				   __LINE__, i);
+			buddy_destroy(&buddy);
+			return ret;
+		}
+	}
+
+	buddy_destroy(&buddy);
+
+	ret = asan_validate();
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+#endif
+
+__weak char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h b/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h
new file mode 100644
index 000000000000..34a7918cb4cf
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#pragma once
+
+#define ST_PAGES 64
+
+static inline void print_asan_map_state(void __arena *addr)
+{
+	arena_stdout("%s:%d ASAN %p -> (val: %x gran: %x set: [%s])",
+			__func__, __LINE__, addr,
+			*(s8 __arena *)(addr), ASAN_GRANULE(addr),
+			asan_shadow_set(addr) ? "yes" : "no");
+}
+
+/*
+ * Compare the recorded ASAN violation state against the expectation in
+ * cond. Log and return -EINVAL on a mismatch. The state is always reset.
+ */
+static inline int asan_validate_addr(bool cond, void __arena *addr)
+{
+	bool violated = asan_violated != 0;
+
+	/* Only report when the observed state differs from the expected one. */
+	if (violated != cond) {
+		/* Dump the raw violation value, then the map state for addr. */
+		arena_stdout("%s:%d ASAN asan_violated %lx", __func__, __LINE__,
+				(u64)asan_violated);
+		print_asan_map_state(addr);
+	}
+
+	asan_violated = 0;
+	return violated == cond ? 0 : -EINVAL;
+}
+
+static inline int asan_validate(void)
+{
+	int ret = 0;
+
+	if (asan_violated) {
+		arena_stdout("%s:%d Found ASAN violation at %lx", __func__, __LINE__,
+				asan_violated);
+		asan_violated = 0;
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+struct blob {
+	volatile u8 mem[59];
+	u8 oob;
+};
diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c
new file mode 100644
index 000000000000..b45a306816c0
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <libarena/common.h>
+
+#include <libarena/asan.h>
+#include <libarena/buddy.h>
+
+extern struct buddy __arena buddy;
+
+struct segarr_entry {
+	u8 __arena *block;
+	size_t sz;
+	u8 poison;
+};
+
+#define SEGARRLEN (512)
+static struct segarr_entry __arena segarr[SEGARRLEN];
+static void __arena *ptrs[17];
+size_t __arena alloc_sizes[] = { 3, 17, 1025, 129, 16350, 333, 9, 517 };
+size_t __arena alloc_multiple_sizes[] = { 3, 17, 1025, 129, 16350, 333, 9, 517, 2099 };
+size_t __arena alloc_free_sizes[] = { 3, 17, 64, 129, 256, 333, 512, 517 };
+size_t __arena alignment_sizes[] = { 1, 3, 7, 8, 9, 15, 16, 17, 31,
+				     32, 64, 100, 128, 255, 256, 512, 1000 };
+
+SEC("syscall")
+__weak int test_buddy_create(void)
+{
+	const int rounds = 10;
+	int err = 0, n;
+
+	/* Repeatedly set up and tear down the allocator; stop at the first error. */
+	for (n = zero; n < rounds && can_loop; n++) {
+		err = buddy_init(&buddy);
+		if (err)
+			break;
+
+		err = buddy_destroy(&buddy);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+SEC("syscall")
+__weak int test_buddy_alloc(void)
+{
+	void __arena *chunk;
+	int err, n;
+
+	/* One fresh allocator per size: init, allocate once, destroy. */
+	for (n = zero; n < 8 && can_loop; n++) {
+		err = buddy_init(&buddy);
+		if (err)
+			return err;
+
+		chunk = buddy_alloc(&buddy, alloc_sizes[n]);
+
+		/* Tear down regardless of the allocation outcome. */
+		buddy_destroy(&buddy);
+		if (!chunk)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+SEC("syscall")
+__weak int test_buddy_alloc_free(void)
+{
+	const int rounds = 800;
+	void __arena *chunk;
+	int err, n;
+
+	err = buddy_init(&buddy);
+	if (err)
+		return err;
+
+	/* Allocate and immediately free, cycling through the size table. */
+	for (n = zero; n < rounds && can_loop; n++) {
+		chunk = buddy_alloc(&buddy, alloc_free_sizes[(n * 5) % 8]);
+		if (!chunk) {
+			err = -ENOMEM;
+			break;
+		}
+
+		buddy_free(&buddy, chunk);
+	}
+
+	buddy_destroy(&buddy);
+	return err;
+}
+
+SEC("syscall")
+__weak int test_buddy_alloc_multiple(void)
+{
+	int ret, j;
+	u32 i, idx;
+	u8 __arena *mem;
+	size_t sz;
+	u8 poison;
+
+	ret = buddy_init(&buddy);
+	if (ret)
+		return ret;
+
+	/*
+	 * Cycle through each size, allocating an entry in the
+	 * segarr. Continue for SEGARRLEN iterations. For every
+	 * allocation write down the size, use the current index
+	 * as a poison value, and log it with the pointer in the
+	 * segarr entry. Use the poison value to poison the entire
+	 * allocated memory according to the size given.
+	 */
+	for (i = zero; i < SEGARRLEN && can_loop; i++) {
+		sz = alloc_multiple_sizes[i % 9];
+		poison = (u8)i;
+
+		mem = buddy_alloc(&buddy, sz);
+		if (!mem) {
+			buddy_destroy(&buddy);
+			arena_stdout("%s:%d", __func__, __LINE__);
+			return -ENOMEM;
+		}
+
+		segarr[i].block = mem;
+		segarr[i].sz = sz;
+		segarr[i].poison = poison;
+
+		for (j = zero; j < sz && can_loop; j++) {
+			mem[j] = poison;
+			if (mem[j] != poison) {
+				buddy_destroy(&buddy);
+				return -EINVAL;
+			}
+		}
+	}
+
+	/*
+	 * Go to (i * 17) % SEGARRLEN, and free the block pointed to.
+	 * Before freeing, check all bytes have the poisoned value
+	 * corresponding to the element. If any values are unexpected,
+	 * return an error. Skip some elements to test destroying the
+	 * buddy allocator while data is still allocated.
+	 */
+	for (i = 10; i < SEGARRLEN && can_loop; i++) {
+		idx = (i * 17) % SEGARRLEN;
+
+		mem = segarr[idx].block;
+		sz = segarr[idx].sz;
+		poison = segarr[idx].poison;
+
+		for (j = zero; j < sz && can_loop; j++) {
+			if (mem[j] != poison) {
+				/* Report before destroying: mem[j] is read here. */
+				arena_stdout("%s:%d %lx %u vs %u", __func__,
+					   __LINE__, (uintptr_t)&mem[j],
+					   mem[j], poison);
+				buddy_destroy(&buddy);
+				return -EINVAL;
+			}
+		}
+
+		buddy_free(&buddy, mem);
+	}
+
+	buddy_destroy(&buddy);
+	return 0;
+}
+
+SEC("syscall")
+__weak int test_buddy_alignment(void)
+{
+	int ret, i;
+
+	ret = buddy_init(&buddy);
+	if (ret)
+		return ret;
+
+	/* Allocate various sizes and check alignment */
+	for (i = zero; i < 17 && can_loop; i++) {
+		ptrs[i] = buddy_alloc(&buddy, alignment_sizes[i]);
+		if (!ptrs[i]) {
+			arena_stdout("alignment test: alloc failed for size %lu",
+				   alignment_sizes[i]);
+			buddy_destroy(&buddy);
+			return -ENOMEM;
+		}
+
+		/* Check 8-byte alignment */
+		if ((u64)ptrs[i] & 0x7) {
+			arena_stdout(
+				"alignment test: ptr %llx not 8-byte aligned (size %lu)",
+				(u64)ptrs[i], alignment_sizes[i]);
+			buddy_destroy(&buddy);
+			return -EINVAL;
+		}
+	}
+
+	/* Free all allocations */
+	for (i = zero; i < 17 && can_loop; i++)
+		buddy_free(&buddy, ptrs[i]);
+
+	buddy_destroy(&buddy);
+
+	return 0;
+}
+
+__weak char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c
new file mode 100644
index 000000000000..f08f2a92e194
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c
@@ -0,0 +1,669 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+
+#include <bpf_atomic.h>
+
+#include <libarena/common.h>
+
+#include <libarena/asan.h>
+#include <libarena/spmc.h>
+
+#define TEST_SPMC_THREADS 3
+#define TEST_SPMC_STEALERS (TEST_SPMC_THREADS - 1)
+
+/*
+ * The test requires the stealers/owners to sometimes quiesce
+ * before continuing the benchmark. Normally we'd use something
+ * like a condition variable, but since the benchmark is short-lived
+ * and operations are wait-free we just spin around the quiescence
+ * point instead. If we time out, we just fail the benchmark.
+ */
+#define TEST_SPMC_SYNC_SPINS BPF_MAX_LOOPS
+
+/*
+ * We track all the values we retrieve from the queue
+ * to get some guarantee we're not corrupting data,
+ * e.g., accidentally reusing a past value from a slot.
+ */
+#define TEST_SPMC_MAX_VALUES (1024)
+static u64 __arena seen[TEST_SPMC_MAX_VALUES];
+
+/* The single spmc queue for the benchmark. */
+static struct spmc __arena *spmc;
+
+/* Owner and stealer epochs. The threads spin on these to quiesce. */
+static volatile u64 owner_epoch;
+static volatile u64 stealer_epoch;
+
+/* Map owner epochs to stealer epochs (simply scale by # of stealers). */
+#define STEALER_EPOCH(owner_epoch) ((owner_epoch) * TEST_SPMC_STEALERS)
+
+/* Global abort switch. If any thread fails, all others exit ASAP. */
+static volatile bool test_abort;
+
+/*
+ * Counters useful for ensuring conservation of pushes/pops of unique values
+ * (we're not stealing/popping more/fewer items than were pushed).
+ */
+static volatile u64 expected_total;
+static volatile u64 total_seen;
+
+/* Measure how many pops and steals we've made (irrespective of retrieved value). */
+static volatile u64 pops;
+static volatile u64 steals;
+
+/* Used for the resize selftest, see below. */
+static volatile u64 stealers_started;
+
+/* Used for the burst selftest, see below. */
+static volatile u64 round_steals;
+
+/*
+ * We have multiple stealers and a single owner. We sometimes want the owner
+ * to successfully outproduce the stealers, so we add a busy loop to them.
+ */
+#define TEST_SPMC_WASTE_ROUNDS (1UL << 12)
+
+/*
+ * The spmc data structure depends on the runtime fully
+ * supporting acquire/release semantics, which is not
+ * the case for all architectures.
+ */
+#if defined(ENABLE_ATOMICS_TESTS) &&		  \
+	(defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \
+	 (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64))
+#define SPMC_TESTS_SUPPORTED true
+#else
+#define SPMC_TESTS_SUPPORTED false
+#endif
+
+/* Report whether the spmc tests can run on this target. */
+static bool spmc_tests_enabled(void)
+{
+	return SPMC_TESTS_SUPPORTED;
+}
+
+/*
+ * Scaffolding for each parallel test. Each test has setup/teardown,
+ * a single owner thread (__0) that owns the queue, and TEST_SPMC_STEALERS
+ * stealer threads (__1, __2). The final line must not end in a backslash.
+ */
+#define DEFINE_PARALLEL_SPMC_TEST(prefix, expected_total)		\
+	SEC("syscall") int parallel_test_spmc_##prefix##__enabled(void)	\
+	{								\
+		return spmc_tests_enabled() ? 0 : -EOPNOTSUPP;		\
+	}								\
+	SEC("syscall") int parallel_test_spmc_##prefix##__init(void)	\
+	{								\
+		return spmc_common_init(expected_total);		\
+	}								\
+	SEC("syscall") int parallel_test_spmc_##prefix##__fini(void)	\
+	{								\
+		return spmc_common_fini();				\
+	}								\
+	SEC("syscall") int parallel_test_spmc_##prefix##__0(void)	\
+	{								\
+		return spmc_##prefix##_owner();				\
+	}								\
+	SEC("syscall") int parallel_test_spmc_##prefix##__1(void)	\
+	{								\
+		return spmc_##prefix##_stealer();			\
+	}								\
+	SEC("syscall") int parallel_test_spmc_##prefix##__2(void)	\
+	{								\
+		return spmc_##prefix##_stealer();			\
+	}
+
+static int spmc_common_init(u64 total)
+{
+	u64 i;
+
+	if (total > TEST_SPMC_MAX_VALUES)
+		return -E2BIG;
+
+	owner_epoch = 0;
+	stealer_epoch = 0;
+	test_abort = false;
+	expected_total = total;
+	total_seen = 0;
+	pops = 0;
+	steals = 0;
+	stealers_started = 0;
+	round_steals = 0;
+
+	for (i = zero; i < TEST_SPMC_MAX_VALUES && can_loop; i++)
+		seen[i] = 0;
+
+	spmc = spmc_create();
+	if (!spmc)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int spmc_common_fini(void)
+{
+	struct spmc __arena *queue = spmc;
+	int err = spmc_destroy(queue);
+
+	/* Clear the global handle once the queue has been destroyed. */
+	spmc = NULL;
+	return err;
+}
+
+__weak
+int spmc_quiesce_on_owner(u64 epoch)
+{
+	u64 spin;
+
+	/* Spin until the owner reaches epoch, another thread aborts, or we time out. */
+	bpf_for(spin, 0, TEST_SPMC_SYNC_SPINS) {
+		if (test_abort)
+			return -EINTR;
+		if (smp_load_acquire(&owner_epoch) < epoch)
+			continue;
+		return 0;
+	}
+	test_abort = true;
+	return -ETIMEDOUT;
+}
+
+__weak
+int spmc_quiesce_on_stealer(u64 epoch)
+{
+	const u64 want = STEALER_EPOCH(epoch);
+	int status = -ETIMEDOUT;
+	unsigned int spin;
+	u64 now;
+
+	/* Spin until stealer_epoch hits the scaled target; overshooting is an error. */
+	bpf_for(spin, 0, TEST_SPMC_SYNC_SPINS) {
+		if (test_abort) {
+			status = -EINTR;
+			break;
+		}
+
+		now = smp_load_acquire(&stealer_epoch);
+		if (now == want)
+			return 0;
+
+		if (now > want) {
+			status = -EINVAL;
+			break;
+		}
+	}
+
+	/* Abort, overshoot and timeout all raise the global abort flag. */
+	test_abort = true;
+
+	return status;
+}
+
+static int spmc_update_stats(u64 val, bool owner)
+{
+	u64 limit = expected_total;
+
+	/* Reject values outside the expected range or the tracking array. */
+	if (val >= limit || val >= TEST_SPMC_MAX_VALUES)
+		goto fail;
+
+	/* Each value must be retrieved exactly once. */
+	if (__sync_fetch_and_add(&seen[val], 1) != 0)
+		goto fail;
+
+	__sync_fetch_and_add(&total_seen, 1);
+	if (!owner)
+		__sync_fetch_and_add(&steals, 1);
+	else
+		__sync_fetch_and_add(&pops, 1);
+	return 0;
+
+fail:
+	test_abort = true;
+	return -EINVAL;
+}
+
+static int spmc_validate_owner_empty(void)
+{
+	u64 scratch;
+	int err = spmc_owned_remove(spmc, &scratch);
+
+	/* An empty queue must report -ENOENT; anything else is a failure. */
+	if (err == -ENOENT)
+		return 0;
+
+	test_abort = true;
+
+	/* A successful removal (0) is reported as -EINVAL. */
+	return err ? err : -EINVAL;
+}
+
+__weak
+int spmc_validate_all_seen(void)
+{
+	u64 want = expected_total;
+	u64 idx;
+
+	/* Conservation: the seen count and pops + steals must both match. */
+	if (total_seen != want || pops + steals != want)
+		goto mismatch;
+
+	/* Every value below the expected total was retrieved exactly once. */
+	for (idx = zero; idx < want && can_loop; idx++) {
+		if (seen[idx % TEST_SPMC_MAX_VALUES] == 1)
+			continue;
+		goto mismatch;
+	}
+
+	return 0;
+
+mismatch:
+	test_abort = true;
+
+	return -EINVAL;
+}
+
+/*
+ * Single value benchmark. The owner adds an item then races with
+ * the stealers for it. This way the owner and the stealers race
+ * directly on the same slot.
+ */
+
+
+#define TEST_SPMC_SINGLEVAL_ITERS (64)
+
+__weak
+int spmc_singleval_tryconsume(u64 expected, bool steal)
+{
+	u64 val;
+	int ret;
+
+	while (can_loop) {
+		if (steal)
+			ret = spmc_steal(spmc, &val);
+		else
+			ret = spmc_owned_remove(spmc, &val);
+
+		/* Success. Update and validate. */
+		if (!ret) {
+			if (val != expected)
+				return -EINVAL;
+
+			ret = spmc_update_stats(val, !steal);
+			if (ret)
+				return ret;
+
+			return 0;
+		}
+
+		/*
+		 * If we got -ENOENT, the queue is empty
+		 * and we're good to go.
+		 */
+		if (ret != -EAGAIN)
+			return (ret == -ENOENT) ? 0 : ret;
+	}
+
+	/* Impossible. */
+	return -EINVAL;
+}
+
+static int spmc_singleval_owner(void)
+{
+	int ret;
+	u64 i;
+
+	for (i = zero; i < TEST_SPMC_SINGLEVAL_ITERS && can_loop; i++) {
+		ret = spmc_quiesce_on_stealer(i);
+		if (ret)
+			goto err;
+
+		ret = spmc_owned_add(spmc, i);
+		if (ret)
+			goto err;
+
+		__sync_fetch_and_add(&owner_epoch, 1);
+
+		ret = spmc_singleval_tryconsume(i, false);
+		if (ret)
+			goto err;
+
+		ret = spmc_quiesce_on_stealer(i + 1);
+		if (ret)
+			goto err;
+	}
+
+	ret = spmc_validate_owner_empty();
+	if (ret)
+		return ret;
+
+	return spmc_validate_all_seen();
+
+err:
+	test_abort = true;
+	return -EINVAL;
+}
+
+static int spmc_singleval_stealer(void)
+{
+	int ret;
+	u64 i;
+
+	for (i = zero; i < TEST_SPMC_SINGLEVAL_ITERS && can_loop; i++) {
+		ret = spmc_quiesce_on_owner(i + 1);
+		if (ret)
+			goto err;
+
+		ret = spmc_singleval_tryconsume(i, true);
+		if (ret)
+			goto err;
+
+		__sync_fetch_and_add(&stealer_epoch, 1);
+	}
+
+	return 0;
+
+err:
+	test_abort = true;
+	return -EINVAL;
+}
+
+DEFINE_PARALLEL_SPMC_TEST(singleval, TEST_SPMC_SINGLEVAL_ITERS)
+
+/*
+ * The resize test. Force a resize from the owner even while the stealers
+ * are trying to consume. Then make sure the queue is still consistent
+ * after the resize.
+ *
+ * The owner _doesn't_ consume from the queue. The test makes sure that
+ * switching the array from underneath the stealers works.
+ */
+
+/* Force 2 resizes (since the rate of resize is logarithmic). */
+#define TEST_SPMC_RESIZE_ORDER (2)
+#define TEST_SPMC_RESIZE_PREFILL ((SPMC_ARR_BASESZ << TEST_SPMC_RESIZE_ORDER) - 1)
+
+/* Number of items the owner adds while racing with the stealers. */
+#define TEST_SPMC_RESIZE_TAIL (SPMC_ARR_BASESZ << TEST_SPMC_RESIZE_ORDER)
+#define TEST_SPMC_RESIZE_TOTAL (TEST_SPMC_RESIZE_PREFILL + TEST_SPMC_RESIZE_TAIL)
+
+__weak
+int spmc_wait_for_stealers_to_start(u64 target)
+{
+	u64 i;
+
+	bpf_for(i, 0, TEST_SPMC_SYNC_SPINS) {
+		if (test_abort)
+			return -EINTR;
+		if (READ_ONCE(stealers_started) >= target)
+			return 0;
+	}
+
+	test_abort = true;
+
+	return -ETIMEDOUT;
+}
+
+__weak
+void spmc_waste_time(void)
+{
+	int i;
+	int j;
+
+	for (i = zero; i < TEST_SPMC_WASTE_ROUNDS && can_loop; i++) {
+		/* Random computation. */
+		WRITE_ONCE(j, i * 17 + 23);
+	}
+}
+
+static int spmc_resize_owner(void)
+{
+	bool resized = false;
+	u64 i;
+	int ret;
+
+	/* Prefill the queue to get a head start on the stealers. */
+	for (i = zero; i < TEST_SPMC_RESIZE_PREFILL && can_loop; i++) {
+		ret = spmc_owned_add(spmc, i);
+		if (ret) {
+			test_abort = true;
+			return ret;
+		}
+	}
+
+	__sync_fetch_and_add(&owner_epoch, 1);
+
+	/* Wait for stealers to start then start racing. */
+	ret = spmc_wait_for_stealers_to_start(TEST_SPMC_STEALERS);
+	if (ret)
+		return ret;
+
+	for (i = TEST_SPMC_RESIZE_PREFILL; i < TEST_SPMC_RESIZE_TOTAL && can_loop; i++) {
+		ret = spmc_owned_add(spmc, i);
+		if (ret) {
+			test_abort = true;
+			return ret;
+		}
+
+		if (spmc->cur->order > TEST_SPMC_RESIZE_ORDER)
+			resized = true;
+	}
+
+	/* Fail unless the array order grew past TEST_SPMC_RESIZE_ORDER while racing. */
+	if (!resized) {
+		test_abort = true;
+		return -EINVAL;
+	}
+
+	/*
+	 * Signal that the owner is done adding, wait for the stealers
+	 * to drain and make sure we didn't lose any items along the way.
+	 */
+	__sync_fetch_and_add(&owner_epoch, 1);
+
+	ret = spmc_quiesce_on_stealer(1);
+	if (ret)
+		return ret;
+
+	ret = spmc_validate_owner_empty();
+	if (ret)
+		return ret;
+
+	return spmc_validate_all_seen();
+}
+
+static int spmc_resize_stealer(void)
+{
+	bool owner_done = false;
+	u64 val;
+	int ret;
+
+	arena_subprog_init();
+
+	ret = spmc_quiesce_on_owner(1);
+	if (ret)
+		return ret;
+
+	__sync_fetch_and_add(&stealers_started, 1);
+
+	while (can_loop) {
+		spmc_waste_time();
+		if (test_abort)
+			return -EINTR;
+
+		ret = spmc_steal(spmc, &val);
+		if (!ret) {
+			ret = spmc_update_stats(val, false);
+			if (ret)
+				return ret;
+			continue;
+		}
+
+		if (ret == -EAGAIN)
+			continue;
+
+		if (ret == -ENOENT) {
+			if (owner_done)
+				break;
+			owner_done = owner_epoch >= 2;
+			continue;
+		}
+
+		test_abort = true;
+		return ret;
+	}
+
+	__sync_fetch_and_add(&stealer_epoch, 1);
+
+	return 0;
+}
+
+DEFINE_PARALLEL_SPMC_TEST(resize, TEST_SPMC_RESIZE_TOTAL)
+
+/*
+ * The burst benchmark. The owner generates data all at once,
+ * waits for the stealers to steal half, then starts removing
+ * items until the queue empties. The owner also makes sure the
+ * item order is not jumbled.
+ */
+
+#define TEST_SPMC_BURST_ROUNDS (4)
+#define TEST_SPMC_BURST_BURST (64)
+#define TEST_SPMC_BURST_TOTAL (TEST_SPMC_BURST_ROUNDS * TEST_SPMC_BURST_BURST)
+#define TEST_SPMC_BURST_STEAL_TARGET (TEST_SPMC_BURST_BURST / 2)
+
+static int spmc_wait_for_round_steals(u64 target)
+{
+	u64 i;
+
+	arena_subprog_init();
+
+	bpf_for(i, 0, TEST_SPMC_SYNC_SPINS) {
+		if (test_abort)
+			return -EINTR;
+		if (round_steals >= target)
+			return 0;
+	}
+
+	test_abort = true;
+
+	return -ETIMEDOUT;
+}
+
+__weak int
+spmc_burst_owner_round(u64 round)
+{
+	u64 i, base, stolen, expected, val;
+	int ret;
+
+	base = round * TEST_SPMC_BURST_BURST;
+	round_steals = 0;
+
+	for (i = zero; i < TEST_SPMC_BURST_BURST && can_loop; i++) {
+		ret = spmc_owned_add(spmc, base + i);
+		if (ret)
+			return ret;
+	}
+
+	__sync_fetch_and_add(&owner_epoch, 1);
+
+	ret = spmc_wait_for_round_steals(TEST_SPMC_BURST_STEAL_TARGET);
+	if (ret == -EINTR || ret == -ETIMEDOUT)
+		return ret;
+
+	__sync_fetch_and_add(&owner_epoch, 1);
+
+	ret = spmc_quiesce_on_stealer(round + 1);
+	if (ret)
+		return ret;
+
+	stolen = round_steals;
+	if (stolen > TEST_SPMC_BURST_BURST)
+		return -EINVAL;
+
+	for (i = zero; i < TEST_SPMC_BURST_BURST - stolen && can_loop; i++) {
+		ret = spmc_owned_remove(spmc, &val);
+		if (ret)
+			return ret;
+
+		expected = base + TEST_SPMC_BURST_BURST - 1 - i;
+		if (val != expected)
+			return -EINVAL;
+
+		ret = spmc_update_stats(val, true);
+		if (ret) {
+			test_abort = true;
+			return -EINVAL;
+		}
+	}
+
+	ret = spmc_validate_owner_empty();
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+/* Owner side of the burst test: run every round, then validate the totals. */
+static int spmc_burst_owner(void)
+{
+	u64 r;
+
+	arena_subprog_init();
+
+	/* Any failing round aborts the whole test with -EINVAL. */
+	for (r = zero; r < TEST_SPMC_BURST_ROUNDS && can_loop; r++) {
+		if (!spmc_burst_owner_round(r))
+			continue;
+
+		test_abort = true;
+		return -EINVAL;
+	}
+
+	/* Check that every value was accounted for. */
+	return spmc_validate_all_seen();
+}
+
+static int spmc_burst_stealer(void)
+{
+	u64 round, val, active_epoch;
+	int ret;
+
+	arena_subprog_init();
+
+	for (round = zero; round < TEST_SPMC_BURST_ROUNDS && can_loop; round++) {
+		active_epoch = round * 2 + 1;
+
+		/*
+		 * Wait till the owner prefills the queue then
+		 * start stealing.
+		 */
+		ret = spmc_quiesce_on_owner(active_epoch);
+		if (ret)
+			return ret;
+
+		while (owner_epoch == active_epoch && can_loop) {
+			if (test_abort)
+				return -EINTR;
+
+			ret = spmc_steal(spmc, &val);
+			if (!ret) {
+				ret = spmc_update_stats(val, false);
+				if (ret)
+					return ret;
+				__sync_fetch_and_add(&round_steals, 1);
+				continue;
+			}
+			if (ret == -EAGAIN || ret == -ENOENT)
+				continue;
+
+			test_abort = true;
+			return ret;
+		}
+
+		__sync_fetch_and_add(&stealer_epoch, 1);
+	}
+
+	return 0;
+}
+
+DEFINE_PARALLEL_SPMC_TEST(burst, TEST_SPMC_BURST_TOTAL)
diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_progs_compat.h b/tools/testing/selftests/bpf/libarena/selftests/test_progs_compat.h
new file mode 100644
index 000000000000..9d431376c42f
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/selftests/test_progs_compat.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#pragma once
+
+#ifdef __BPF__
+
+/* Selftests use these tags for compatibility with test_progs. */
+#define __test_tag(tag)		__attribute__((btf_decl_tag("comment:" XSTR(__COUNTER__) ":" tag)))
+#define __stderr(msg)		__test_tag("test_expect_stderr=" msg)
+#define __stderr_unpriv(msg)	__test_tag("test_expect_stderr_unpriv=" msg)
+
+#define XSTR(s) STR(s)
+#define STR(s) #s
+
+#endif
diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_rbtree.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/test_rbtree.bpf.c
new file mode 100644
index 000000000000..856c484a009a
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/selftests/test_rbtree.bpf.c
@@ -0,0 +1,968 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+
+#include <libarena/common.h>
+
+#include <libarena/asan.h>
+#include <libarena/rbtree.h>
+
+typedef struct node_ctx __arena *node_ctx;
+
+struct node_ctx {
+	struct rbnode rbnode;
+	node_ctx next;
+};
+
+static const u64 keys[] = { 51, 43,  37, 3, 301,  46, 383, 990, 776, 729, 871, 96, 189, 213,
+	376, 167, 131, 939, 626, 119, 374, 700, 772, 154, 883, 620, 641, 5,
+	428, 516, 105, 622, 988, 811, 931, 973, 246, 690, 934, 744, 210, 311,
+	32, 255, 960, 830, 523, 429, 541, 738, 705, 774, 715, 446, 98, 578,
+	777, 191, 279, 91, 767 };
+
+static const u64 morekeys[] = { 173, 636, 1201, 8642, 5957, 3617, 4586, 8053, 6551, 7592, 1748, 1589, 8644, 9918, 6977,
+	4448, 5852, 4640, 9717, 2303, 7424, 7695, 2334, 8876, 8618, 5745, 7134, 2178, 5280, 2140, 1138,
+	5083, 8922, 1516, 2437, 2488, 4307, 4329, 5088, 8456, 5938, 1441, 1684, 5750, 721, 1107, 2089,
+	9737, 4687, 5016, 4849, 8193, 9603, 9147, 5992, 166, 6721, 812, 4144, 6237, 6509, 3466, 9255,
+	7767, 3960, 6759, 2968, 6046, 9784, 8395, 2619, 1711, 528, 6424, 9084, 3179, 1342, 5676, 9445,
+	5691, 6678, 8487, 1627, 998, 6178, 2229, 1987, 3319, 572, 169, 2161, 3018, 5439, 7287, 7265, 5995,
+	5003, 5857, 2836, 5634, 4735, 9261, 8287, 5359, 533, 1406, 9573, 4026, 714, 3956, 1722, 6395,
+	9648, 3887, 7185, 470, 4482, 4997, 841, 8913, 9946, 3999, 9357, 9847, 277, 8184, 8704, 6766, 3323,
+	5468, 8638, 7905, 8858, 6142, 3685, 3452, 4689, 8878, 8836, 158, 831, 7914, 3031, 8374, 4921,
+	4207, 3460, 5547, 3358, 1083, 4619, 7818, 2962, 4879, 4583, 2172, 8819, 9830, 1194, 2666, 9812,
+	5704, 8432, 5916, 6007, 6609, 4791, 1985, 3226, 2478, 9605, 5236, 8079, 3042, 1965, 3539, 9704,
+	4267, 6416, 760, 9968, 2983, 1190, 1964, 3211, 2870, 3106, 2794, 1542, 6916, 5986, 9096, 441,
+	5894, 8353, 7765, 3757, 5732, 88, 3091, 5637, 6042, 8447, 4073, 6923, 5491, 7010, 3663, 5029,
+	6162, 822, 4874, 7491, 5100, 3461, 6983, 2170, 1458, 1856, 648, 6272, 4887, 976, 2369, 5909, 4274,
+	3324, 6968, 2312, 2271, 8891, 6268, 6581, 1610, 8880, 6194, 6144, 9764, 6915, 829, 3774, 2265,
+	1752, 1314, 6377, 8760, 8004, 501, 4912, 9278, 1425, 9578, 7337, 307, 1885, 3151, 9617, 1647,
+	2458, 3702, 6091, 8902, 5663, 9378, 7640, 3336, 557, 1644, 6848, 1559, 8821, 266, 4330, 9790,
+	5920, 4222, 1143, 6248, 5792, 4847, 9726, 6303, 821, 6839, 6062, 7133, 3649, 9888, 2528, 1966,
+	5456, 4914, 3615, 1543, 3206, 3353, 6097, 2800, 1424, 9094, 7920, 7243, 1394, 5464, 1707, 576,
+	6524, 4261, 4187, 7889, 5336, 3377, 2921, 7244, 2766, 6584, 5514, 1387, 2957, 2258, 1077, 9979,
+	1128, 876, 4056, 4668, 4532, 1982, 7093, 4184, 5460, 7588, 4704, 6717, 61, 3959, 1826, 2294, 18,
+	8170, 9394, 8796, 7288, 7285, 7143, 148, 6676, 6603, 1051, 8225, 4169, 3230, 7697, 6971, 3454,
+	7501, 9514, 394, 2339, 4993, 5606, 6060, 1297, 8273, 3012, 157, 8181, 6765, 7207, 1005, 8833, 1914,
+	7456, 1846, 8375, 2741, 2074, 1712, 5286 };
+
+SEC("syscall")
+__weak int test_rbtree_find_nonexistent(void)
+{
+	u64 key = 0xdeadbeef;
+	u64 value = 0;
+	int ret;
+
+	struct rbtree __arena *rbtree;
+
+	rbtree = rb_create(RB_ALLOC, RB_DEFAULT);
+	if (!rbtree)
+		return 1;
+
+	/* A lookup in the freshly created tree must fail (any non-zero return). */
+	ret = rb_find(rbtree, key, &value);
+	if (!ret)
+		return 2;
+
+	return rb_destroy(rbtree);
+}
+
+SEC("syscall")
+__weak int test_rbtree_insert_existing(void)
+{
+	struct rbtree __arena *tree;
+	const u64 k = 525252;
+	const u64 v = 24;
+
+	tree = rb_create(RB_ALLOC, RB_DEFAULT);
+	if (!tree)
+		return 1;
+
+	/* The first insertion of the key must succeed. */
+	if (rb_insert(tree, k, v))
+		return 2;
+
+	/*
+	 * With RB_DEFAULT, inserting the same key again
+	 * is expected to fail with -EALREADY.
+	 */
+	if (rb_insert(tree, k, v) != -EALREADY)
+		return 3;
+
+	/* Hand back whatever tearing down the tree reports. */
+	return rb_destroy(tree);
+}
+
+SEC("syscall")
+__weak int test_rbtree_update_existing(void)
+{
+	u64 key = 33333;
+	u64 value;
+	int ret;
+
+	struct rbtree __arena *rbtree;
+
+	rbtree = rb_create(RB_ALLOC, RB_UPDATE);
+	if (!rbtree)
+		return 1;
+
+	value = 52;
+	ret = rb_insert(rbtree, key, value);
+	if (ret)
+		return 2;
+
+	ret = rb_find(rbtree, key, &value);
+	if (ret)
+		return 3;
+
+	if (value != 52)
+		return 4;
+
+	value = 65;
+
+	/* Should succeed. */
+	ret = rb_insert(rbtree, key, value);
+	if (ret)
+		return 5;
+
+	/* Should be updated. */
+	ret = rb_find(rbtree, key, &value);
+	if (ret)
+		return 6;
+
+	if (value != 65)
+		return 7;
+
+	return rb_destroy(rbtree);
+}
+
+SEC("syscall")
+__weak int test_rbtree_insert_one(void)
+{
+	u64 key = 202020;
+	u64 value = 0xbadcafe;
+	int ret;
+
+	struct rbtree __arena *rbtree;
+
+	rbtree = rb_create(RB_ALLOC, RB_UPDATE);
+	if (!rbtree)
+		return 1;
+
+	ret = rb_insert(rbtree, key, value);
+	if (ret)
+		return 2;
+
+	ret = rb_find(rbtree, key, &value);
+	if (ret)
+		return 3;
+
+	if (value != 0xbadcafe)
+		return 4;
+
+	return rb_destroy(rbtree);
+}
+
+SEC("syscall")
+__weak int test_rbtree_insert_ten(void)
+{
+	u64 key, value;
+	int ret, i;
+
+	struct rbtree __arena *rbtree;
+
+	rbtree = rb_create(RB_ALLOC, RB_UPDATE);
+	if (!rbtree)
+		return 1;
+
+	for (i = 0; i < 10 && can_loop; i++) {
+		key = keys[i];
+		ret = rb_insert(rbtree, key, 2 * key);
+		if (ret)
+			return 2 + 3 * i;
+
+		/* Read it back. */
+		ret = rb_find(rbtree, key, &value);
+		if (ret)
+			return 2 + 3 * i + 1;
+
+		if (value != 2 * key)
+			return 2 + 3 * i + 2;
+	}
+
+	/* Go find all inserted pairs. */
+	for (i = 0; i < 10 && can_loop; i++) {
+		key = keys[i];
+
+		ret = rb_find(rbtree, key, &value);
+		if (ret)
+			return 35 + 2 * i;
+
+		if (value != 2 * key)
+			return 35 + 2 * i + 1;
+	}
+
+	return rb_destroy(rbtree);
+}
+
+SEC("syscall")
+__weak int test_rbtree_duplicate(void)
+{
+	u64 key = 0x121212;
+	u64 value;
+	int ret, i;
+
+	struct rbtree __arena *rbtree;
+
+	rbtree = rb_create(RB_ALLOC, RB_DUPLICATE);
+	if (!rbtree)
+		return 1;
+
+	for (i = 0; i < 10 && can_loop; i++) {
+		ret = rb_insert(rbtree, key, 2 * key);
+		if (ret)
+			return 2 + 3 * i;
+
+		/* Read it back. */
+		ret = rb_find(rbtree, key, &value);
+		if (ret)
+			return 2 + 3 * i + 1;
+
+		if (value != 2 * key)
+			return 2 + 3 * i + 2;
+	}
+
+	/* Go find all inserted copies and remove them. */
+	for (i = 0; i < 10 && can_loop; i++) {
+		ret = rb_find(rbtree, key, &value);
+		if (ret) {
+			rb_print(rbtree);
+			return 35 + 3 * i;
+		}
+
+		if (value != 2 * key)
+			return 35 + 3 * i + 1;
+
+		ret = rb_remove(rbtree, key);
+		if (ret)
+			return 35 + 3 * i + 2;
+	}
+
+	return rb_destroy(rbtree);
+}
+
+static inline int
+clean_up_noalloc_tree(struct rbtree __arena *rbtree)
+{
+	node_ctx nodec;
+	int ret;
+
+	if (rbtree->alloc != RB_NOALLOC)
+		return -EINVAL;
+
+	/* Can't destroy an RB_NOALLOC tree that still has nodes. */
+	if (rb_destroy(rbtree) != -EBUSY)
+		return -EINVAL;
+
+	while (rbtree->root && can_loop) {
+		nodec = (node_ctx)arena_container_of(rbtree->root, struct node_ctx, rbnode);
+		ret = rb_remove_node(rbtree, &nodec->rbnode);
+		if (ret)
+			return ret;
+
+		arena_free(nodec);
+	}
+
+	return 0;
+}
+
+int insert_many(enum rbtree_alloc alloc, enum rbtree_insert_mode insert)
+{
+	const size_t numkeys = sizeof(keys) / sizeof(keys[0]);
+	node_ctx nodec;
+	u64 key, value;
+	int ret;
+	int i;
+
+	struct rbtree __arena *rbtree;
+
+	rbtree = rb_create(alloc, insert);
+	if (!rbtree)
+		return 1;
+
+	for (i = 0; i < numkeys && can_loop; i++) {
+		key = keys[i];
+		if (rbtree->alloc != RB_ALLOC) {
+			nodec = arena_malloc(sizeof(*nodec));
+			if (!nodec) {
+				arena_stderr("out of memory\n");
+				return -ENOMEM;
+			}
+			nodec->rbnode.key = key;
+			nodec->rbnode.value = 2 * key;
+			ret = rb_insert_node(rbtree, &nodec->rbnode);
+		} else {
+			ret = rb_insert(rbtree, key, 2 * key);
+		}
+		if (ret)
+			return 2 + 3 * i;
+
+		/* Read it back. */
+		ret = rb_find(rbtree, key, &value);
+		if (ret)
+			return 2 + 3 * i + 1;
+
+		if (value != 2 * key)
+			return 2 + 3 * i + 2;
+	}
+
+	/* Go find all inserted pairs. */
+	for (i = 0; i < numkeys && can_loop; i++) {
+		key = keys[i];
+
+		ret = rb_find(rbtree, key, &value);
+		if (ret)
+			return 302 + 2 * i;
+
+		if (value != 2 * key)
+			return 302 + 2 * i + 1;
+	}
+
+	/* RB_ALLOC trees are destroyed while still having elements. */
+	if (rbtree->alloc == RB_ALLOC)
+		return rb_destroy(rbtree);
+
+	/* Otherwise manually clean up the tree. */
+	if (clean_up_noalloc_tree(rbtree))
+		return 5;
+
+	return rb_destroy(rbtree);
+}
+
+SEC("syscall")
+__weak int test_rbtree_remove_one(void)
+{
+	u64 key = 20, value = 5, newvalue;
+	int ret;
+
+	struct rbtree __arena *rbtree;
+
+	rbtree = rb_create(RB_ALLOC, RB_DEFAULT);
+	if (!rbtree)
+		return 1;
+
+	ret = rb_find(rbtree, key, &newvalue);
+	if (!ret)
+		return 2;
+
+	ret = rb_insert(rbtree, key, value);
+	if (ret)
+		return 3;
+
+	ret = rb_find(rbtree, key, &newvalue);
+	if (ret || value != newvalue)
+		return 4;
+
+	ret = rb_remove(rbtree, key);
+	if (ret)
+		return 5;
+
+	ret = rb_find(rbtree, key, &newvalue);
+	if (!ret)
+		return 6;
+
+	return rb_destroy(rbtree);
+}
+
+static __always_inline int remove_many_verify_all_present(struct rbtree __arena *rbtree)
+{
+	const size_t numkeys = sizeof(morekeys) / sizeof(morekeys[0]);
+	u64 value;
+	int ret;
+	int i;
+
+	for (i = 0; i < numkeys && can_loop; i++) {
+		u64 key = morekeys[i];
+
+		ret = rb_find(rbtree, key, &value);
+		if (ret)
+			return -1;
+
+		if (value != 2 * key)
+			return -1;
+	}
+
+	return 0;
+}
+
+static __always_inline int remove_many_verify_remaining(struct rbtree __arena *rbtree)
+{
+	const size_t numkeys = sizeof(morekeys) / sizeof(morekeys[0]);
+	u64 value;
+	int ret;
+	int i;
+
+	for (i = 0; i < numkeys && can_loop; i += 2) {
+		u64 key = morekeys[i];
+
+		ret = rb_find(rbtree, key, &value);
+		if (!ret)
+			return -1;
+
+		if (i + 1 >= numkeys)
+			break;
+
+		key = morekeys[i + 1];
+		ret = rb_find(rbtree, key, &value);
+		if (ret)
+			return -1;
+
+		if (value != 2 * key)
+			return -1;
+	}
+
+	for (i = 1; i < numkeys && can_loop; i += 2) {
+		u64 key = morekeys[i];
+
+		ret = rb_find(rbtree, key, &value);
+		if (ret)
+			return -1;
+
+		if (value != 2 * key)
+			return -1;
+	}
+
+	return 0;
+}
+
+static __noinline int remove_many_alloc(struct rbtree __arena *rbtree)
+{
+	const size_t numkeys = sizeof(morekeys) / sizeof(morekeys[0]);
+	u64 value;
+	int ret;
+	int i;
+
+	for (i = 0; i < numkeys && can_loop; i++) {
+		u64 key = morekeys[i];
+
+		ret = rb_insert(rbtree, key, 2 * key);
+		if (ret)
+			return -1;
+
+		if (rb_integrity_check(rbtree)) {
+			arena_stderr("iteration %d\n", i);
+			return -EINVAL;
+		}
+
+		ret = rb_find(rbtree, key, &value);
+		if (ret)
+			return -1;
+
+		if (value != 2 * key)
+			return -1;
+	}
+
+	ret = remove_many_verify_all_present(rbtree);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < numkeys && can_loop; i += 2) {
+		u64 key = morekeys[i];
+
+		ret = rb_remove(rbtree, key);
+		if (ret) {
+			arena_stderr("Failed to remove %ld\n", key);
+			return -1;
+		}
+
+		ret = rb_find(rbtree, key, &value);
+		if (!ret)
+			return -1;
+	}
+
+	return remove_many_verify_remaining(rbtree);
+}
+
+static __noinline int remove_many_noalloc(struct rbtree __arena *rbtree)
+{
+	const size_t numkeys = sizeof(morekeys) / sizeof(morekeys[0]);
+	node_ctx first = NULL, last = NULL;
+	u64 value;
+	int ret;
+	int i;
+
+	for (i = 0; i < numkeys && can_loop; i++) {
+		u64 key = morekeys[i];
+		node_ctx nodec = arena_malloc(sizeof(*nodec));
+
+		if (!nodec) {
+			arena_stderr("out of memory\n");
+			return -ENOMEM;
+		}
+		nodec->rbnode.key = key;
+		nodec->rbnode.value = 2 * key;
+		nodec->next = NULL;
+
+		if (!first)
+			first = nodec;
+
+		if (last)
+			last->next = nodec;
+		last = nodec;
+
+		ret = rb_insert_node(rbtree, &nodec->rbnode);
+		if (ret)
+			return -1;
+
+		if (rb_integrity_check(rbtree)) {
+			arena_stderr("iteration %d\n", i);
+			return -EINVAL;
+		}
+
+		ret = rb_find(rbtree, key, &value);
+		if (ret)
+			return -1;
+
+		if (value != 2 * key)
+			return -1;
+	}
+
+	ret = remove_many_verify_all_present(rbtree);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < numkeys && can_loop; i += 2) {
+		u64 key = morekeys[i];
+		node_ctx nodec = first;
+
+		if (!nodec || key != nodec->rbnode.key)
+			return -1;
+
+		first = nodec->next ? nodec->next->next : NULL;
+		ret = rb_remove_node(rbtree, &nodec->rbnode);
+		if (ret) {
+			arena_stderr("Failed to remove %ld\n", key);
+			return -1;
+		}
+
+		ret = rb_find(rbtree, key, &value);
+		if (!ret)
+			return -1;
+	}
+
+	return remove_many_verify_remaining(rbtree);
+}
+
+static inline int remove_many(enum rbtree_alloc alloc,
+			      enum rbtree_insert_mode insert)
+{
+	int ret;
+	struct rbtree __arena *rbtree;
+
+	rbtree = rb_create(alloc, insert);
+	if (!rbtree)
+		return -ENOMEM;
+
+	ret = (alloc == RB_ALLOC) ? remove_many_alloc(rbtree)
+				: remove_many_noalloc(rbtree);
+	if (ret)
+		return ret;
+
+	if (alloc == RB_ALLOC)
+		return rb_destroy(rbtree);
+
+	ret = clean_up_noalloc_tree(rbtree);
+	if (ret)
+		return ret;
+
+	return rb_destroy(rbtree);
+}
+
+SEC("syscall")
+__weak int test_rbtree_insert_many_update(void)
+{
+	return insert_many(RB_ALLOC, RB_UPDATE);
+}
+
+SEC("syscall")
+__weak int test_rbtree_insert_many_noalloc(void)
+{
+	return insert_many(RB_NOALLOC, RB_DUPLICATE);
+}
+
+SEC("syscall")
+__weak int test_rbtree_remove_many_update(void)
+{
+	return remove_many(RB_ALLOC, RB_UPDATE);
+}
+
+SEC("syscall")
+__weak int test_rbtree_remove_many_noalloc(void)
+{
+	return remove_many(RB_NOALLOC, RB_DUPLICATE);
+}
+
+SEC("syscall")
+__weak int test_rbtree_add_remove_circular(void)
+{
+	const size_t iters = 60;
+	const size_t prefill = 10;
+	const size_t numkeys = 50;
+	const size_t prefix = 400000;
+	u64 value, rmval;
+	int errval = 1;
+	u64 key;
+	int ret;
+	int i;
+
+	struct rbtree __arena *rbtree;
+
+	rbtree = rb_create(RB_ALLOC, RB_UPDATE);
+	if (!rbtree)
+		return 1;
+
+	for (i = 0; i < prefill && can_loop; i++) {
+		ret = rb_insert(rbtree, prefix + (i % numkeys), i);
+		if (ret)
+			return errval;
+
+		errval += 1;
+	}
+
+	errval = 2 * 1000 * 1000;
+
+	for (i = 0; i < prefill && can_loop; i++) {
+		/* Read it back. */
+		ret = rb_find(rbtree, prefix + (i % numkeys), &value);
+		if (ret)
+			return errval;
+
+		if (value != i)
+			return errval;
+	}
+
+	errval = 3 * 1000 * 1000;
+
+	for (i = prefill; i < iters && can_loop; i++) {
+		key = prefix + (i % numkeys);
+
+		ret = rb_find(rbtree, key, &value);
+		if (!ret) {
+			arena_stderr("Key %d already present\n", key);
+			return errval;
+		}
+
+		errval += 1;
+
+		ret = rb_insert(rbtree, key, i);
+		if (ret) {
+			arena_stderr("ITERATION %d\n", i);
+			rb_print(rbtree);
+			return errval;
+		}
+
+		rmval = i - prefill;
+
+		errval += 1;
+
+		ret = rb_find(rbtree, prefix + (rmval % numkeys), &value);
+		if (ret)
+			return errval;
+
+		errval += 1;
+
+		if (value != rmval)
+			return errval;
+
+		errval += 1;
+
+		ret = rb_remove(rbtree, prefix + (rmval % numkeys));
+		if (ret) {
+			arena_stderr("ITERATION %d\n", i);
+			return errval;
+		}
+
+		errval += 1;
+	}
+
+	for (i = 0; i < numkeys && can_loop; i++) {
+		rb_remove(rbtree, prefix + i);
+	}
+
+	return rb_destroy(rbtree);
+}
+
+SEC("syscall")
+__weak int test_rbtree_add_remove_circular_reverse(void)
+{
+	const size_t iters = 110;
+	const size_t prefill = 10;
+	const size_t numkeys = 50;
+	const size_t prefix = 500000;
+	u64 value, rmval;
+	int errval = 1;
+	u64 key;
+	int ret;
+	int i;
+
+	struct rbtree __arena *rbtree;
+
+	rbtree = rb_create(RB_ALLOC, RB_UPDATE);
+	if (!rbtree)
+		return 1;
+
+	for (i = 0; i < prefill && can_loop; i++) {
+		ret = rb_insert(rbtree, prefix - (i % numkeys), i);
+		if (ret)
+			return errval;
+
+		errval += 1;
+	}
+
+	errval = 2 * 1000 * 1000;
+
+	for (i = 0; i < prefill && can_loop; i++) {
+		/* Read it back. */
+		ret = rb_find(rbtree, prefix - (i % numkeys), &value);
+		if (ret)
+			return errval;
+
+		if (value != i)
+			return errval;
+	}
+
+	errval = 3 * 1000 * 1000;
+
+	for (i = prefill; i < iters && can_loop; i++) {
+		key = prefix - (i % numkeys);
+
+		ret = rb_find(rbtree, key, &value);
+		if (!ret) {
+			arena_stderr("Key %d already present\n", key);
+			return errval;
+		}
+
+		errval += 1;
+
+		ret = rb_insert(rbtree, key, i);
+		if (ret) {
+			arena_stderr("error %d on insert\n", ret);
+			rb_print(rbtree);
+			return errval;
+		}
+
+		rmval = i - prefill;
+
+		errval += 1;
+
+		ret = rb_find(rbtree, prefix - (rmval % numkeys), &value);
+		if (ret)
+			return errval;
+
+		errval += 1;
+
+		if (value != rmval)
+			return errval;
+
+		errval += 1;
+
+		ret = rb_remove(rbtree, prefix - (rmval % numkeys));
+		if (ret)
+			return errval;
+
+		errval += 1;
+	}
+
+
+	errval = 4 * 1000 * 1000;
+	for (i = 0; i < prefill && can_loop; i++) {
+		ret = rb_remove(rbtree, prefix - i);
+		if (ret) {
+			arena_stderr("Did not remove %d, error %d\n", prefix - i, ret);
+			return errval + i;
+		}
+	}
+
+	return rb_destroy(rbtree);
+}
+
+SEC("syscall")
+__weak int test_rbtree_least_pop(void)
+{
+	const size_t keys = 10;
+	u64 key, value;
+	int errval = 1;
+	int ret, i;
+
+	struct rbtree __arena *rbtree;
+
+	rbtree = rb_create(RB_ALLOC, RB_DEFAULT);
+	if (!rbtree)
+		return errval;
+
+	errval += 1;
+
+	for (i = 0; i < keys / 2 && can_loop; i++) {
+		ret = rb_insert(rbtree, i, i);
+		if (ret)
+			return errval;
+
+		errval += 1;
+
+		ret = rb_insert(rbtree, keys - 1 - i, keys - 1 - i);
+		if (ret)
+			return errval;
+
+		errval += 1;
+
+		ret = rb_least(rbtree, &key, &value);
+		if (ret)
+			return errval;
+
+		errval += 1;
+
+		if (key != 0 || value != 0)
+			return errval;
+
+		errval += 1;
+	}
+
+	errval = 1000;
+
+	for (i = 0; i < keys && can_loop; i++) {
+		ret = rb_least(rbtree, &key, &value);
+		if (ret) {
+			arena_stderr("rb_least failed with %d\n", ret);
+			return errval;
+		}
+
+		errval += 1;
+
+		if (key != i || value != i) {
+			arena_stderr("Got KV %ld/%ld expected %d\n", key, value, i);
+			return errval;
+		}
+
+		errval += 1;
+
+		ret = rb_pop(rbtree, &key, &value);
+		if (ret) {
+			arena_stderr("Error %d during pop on iter %d\n", ret, i);
+			return errval;
+		}
+
+		errval += 1;
+
+		if (key != i || value != i)
+			return errval;
+	}
+
+	return rb_destroy(rbtree);
+}
+
+/* Reject rb_pop() for RB_NOALLOC trees. */
+SEC("syscall")
+__weak int test_rbtree_noalloc_pop(void)
+{
+	const u64 expect_value = 1;
+	const u64 expect_key = 0;
+	struct rbtree __arena *rbtree;
+	struct rbnode __arena *node;
+	u64 value = 0;
+	int ret;
+
+	rbtree = rb_create(RB_NOALLOC, RB_DEFAULT);
+	if (!rbtree)
+		return 1;
+
+	node = rb_node_alloc(expect_key, expect_value);
+	if (!node) {
+		rb_destroy(rbtree);
+		return 2;
+	}
+
+	ret = rb_insert_node(rbtree, node);
+	if (ret) {
+		rb_node_free(node);
+		rb_destroy(rbtree);
+		return 3;
+	}
+
+	ret = rb_pop(rbtree, NULL, &value);
+	if (ret != -EINVAL)
+		return 4;
+
+	ret = rb_find(rbtree, expect_key, &value);
+	if (ret)
+		return 5;
+
+	if (value != expect_value)
+		return 6;
+
+	ret = rb_remove_node(rbtree, node);
+	if (ret)
+		return 7;
+
+	rb_node_free(node);
+
+	return rb_destroy(rbtree);
+}
+
+SEC("syscall")
+__weak int test_rbtree_alloc_check(void)
+{
+	struct rbtree __arena *alloc, *noalloc;
+	struct rbnode __arena *node;
+	int ret;
+
+	alloc = rb_create(RB_ALLOC, RB_DEFAULT);
+	if (!alloc)
+		return 1;
+
+	noalloc = rb_create(RB_NOALLOC, RB_DEFAULT);
+	if (!noalloc)
+		return 2;
+
+
+	node = rb_node_alloc(0, 0);
+	if (!node)
+		return 3;
+
+	/*
+	 * RB_ALLOC trees can use rb_insert, RB_NOALLOC trees can
+	 * use rb_insert_node. RB_ALLOC and RB_NOALLOC trees cannot
+	 * use each other's APIs.
+	 *
+	 * NOTE: This begs the question, why not different types? We
+	 * want to partially share the API and that would require us
+	 * to duplicate it.
+	 */
+	if (rb_insert(alloc, 0, 0))
+		return 4;
+
+	if (!rb_insert_node(alloc, node))
+		return 5;
+
+	if (!rb_remove_node(alloc, node))
+		return 6;
+
+	if (rb_remove(alloc, 0))
+		return 7;
+
+	if (rb_insert_node(noalloc, node))
+		return 8;
+
+	if (!rb_insert(noalloc, 0, 0))
+		return 9;
+
+	if (!rb_remove(noalloc, 0))
+		return 10;
+
+	if (rb_remove_node(noalloc, node))
+		return 11;
+
+	rb_node_free(node);
+
+	ret = rb_destroy(alloc);
+	if (ret)
+		return ret;
+
+	return rb_destroy(noalloc);
+}
diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_spmc.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/test_spmc.bpf.c
new file mode 100644
index 000000000000..4d7a520115d1
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/selftests/test_spmc.bpf.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+
+#include <libarena/common.h>
+
+#include <libarena/asan.h>
+#include <libarena/spmc.h>
+
+/*
+ * NOTE: These selftests only test for the single-threaded use case, which for
+ * Chase-Lev queues is obviously the simplest one. Still, it is important to
+ * exercise the API to ensure it passes verification and basic checks.
+ */
+
+SEC("syscall")
+int test_spmc_remove_empty(void)
+{
+	u64 val;
+	int ret;
+
+	struct spmc __arena *spmc = spmc_create();
+
+	if (!spmc)
+		return 1;
+
+	ret = spmc_owned_remove(spmc, &val);
+	if (ret != -ENOENT)
+		return 1;
+
+	spmc_destroy(spmc);
+
+	return 0;
+}
+
+SEC("syscall")
+int test_spmc_steal_empty(void)
+{
+	u64 val;
+	int ret;
+
+	struct spmc __arena *spmc = spmc_create();
+
+	if (!spmc)
+		return 1;
+
+	ret = spmc_steal(spmc, &val);
+	if (ret != -ENOENT)
+		return 1;
+
+	spmc_destroy(spmc);
+
+	return 0;
+}
+
+SEC("syscall")
+int test_spmc_steal_one(void)
+{
+	u64 val, newval;
+	int ret, i;
+
+	struct spmc __arena *spmc = spmc_create();
+
+	if (!spmc)
+		return 1;
+
+	for (i = 0; i < 10 && can_loop; i++) {
+		val = i;
+
+		ret = spmc_owned_add(spmc, val);
+		if (ret)
+			return 1;
+
+		ret = spmc_steal(spmc, &newval);
+		if (ret)
+			return 2;
+
+		if (val != newval)
+			return 3;
+	}
+
+	spmc_destroy(spmc);
+
+	return 0;
+}
+
+SEC("syscall")
+int test_spmc_remove_one(void)
+{
+	u64 val, newval;
+	int ret, i;
+
+	struct spmc __arena *spmc = spmc_create();
+
+	if (!spmc)
+		return 1;
+
+	for (i = 0; i < 10 && can_loop; i++) {
+		val = i;
+
+		ret = spmc_owned_add(spmc, val);
+		if (ret)
+			return 1;
+
+		ret = spmc_owned_remove(spmc, &newval);
+		if (ret)
+			return 2;
+
+		if (val != newval)
+			return 3;
+	}
+
+	spmc_destroy(spmc);
+
+	return 0;
+}
+
+SEC("syscall")
+int test_spmc_remove_many(void)
+{
+	u64 val, newval;
+	int ret, i;
+	u64 expected;
+
+	struct spmc __arena *spmc = spmc_create();
+
+	if (!spmc)
+		return 1;
+
+	for (i = 0; i < 500 && can_loop; i++) {
+		val = i;
+
+		ret = spmc_owned_add(spmc, val);
+		if (ret) {
+			arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret);
+			return 1;
+		}
+	}
+
+	for (i = 0; i < 500 && can_loop; i++) {
+		ret = spmc_owned_remove(spmc, &newval);
+		if (ret) {
+			arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret);
+			return 1;
+		}
+
+		expected = 500 - 1 - i;
+		if (newval != expected) {
+			arena_stderr("%s:%d expected %llu found %llu\n", __func__, __LINE__, expected, newval);
+			return 1;
+		}
+	}
+
+	spmc_destroy(spmc);
+
+	return 0;
+}
+
+SEC("syscall")
+int test_spmc_steal_many(void)
+{
+	u64 val, newval;
+	int ret, i;
+
+	struct spmc __arena *spmc = spmc_create();
+
+	if (!spmc)
+		return 1;
+
+	for (i = 0; i < 500 && can_loop; i++) {
+		val = i;
+
+		ret = spmc_owned_add(spmc, val);
+		if (ret) {
+			arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret);
+			return 1;
+		}
+	}
+
+	for (i = 0; i < 500 && can_loop; i++) {
+		ret = spmc_steal(spmc, &newval);
+		if (ret) {
+			arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret);
+			return 1;
+		}
+
+		if (newval != i) {
+			arena_stderr("%s:%d expected %d found %llu\n", __func__, __LINE__, i, newval);
+			return 1;
+		}
+	}
+
+	spmc_destroy(spmc);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/libarena/src/asan.bpf.c b/tools/testing/selftests/bpf/libarena/src/asan.bpf.c
new file mode 100644
index 000000000000..5135d5c72a46
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/src/asan.bpf.c
@@ -0,0 +1,553 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <libarena/common.h>
+#include <libarena/asan.h>
+
+
+enum {
+	/*
+	 * Is the access checked by check_region_inline
+	 * a read or a write?
+	 */
+	ASAN_READ		= 0x0U,
+	ASAN_WRITE		= 0x1U,
+};
+
+/*
+ * Address sanitizer (ASAN) for arena-based BPF programs, inspired
+ * by KASAN.
+ *
+ * The API
+ * -------
+ *
+ * The implementation includes two kinds of components: Implementation
+ * of ASAN hooks injected by LLVM into the program, and API calls that
+ * allocators use to mark memory as valid or invalid. The full list is:
+ *
+ * LLVM stubs:
+ *
+ * void __asan_{load, store}<size>(intptr_t addr)
+ *	Checks whether an access is valid. All variations covered
+ *	by check_region_inline().
+ *
+ * void __asan_{load, store}N(intptr_t addr, ssize_t size)
+ *
+ * void __asan_report_{load, store}<size>(intptr_t addr)
+ *	Report an access violation for the program. Used when LLVM
+ *	uses direct code generation for shadow map checks.
+ *
+ * void *__asan_memcpy(void *d, const void *s, size_t n)
+ * void *__asan_memmove(void *d, const void *s, size_t n)
+ * void *__asan_memset(void *p, int c, size_t n)
+ *	Hooks for ASAN instrumentation of the LLVM mem* builtins.
+ *	Currently unimplemented just like the builtins themselves.
+ *
+ * API methods:
+ *
+ * asan_init()
+ *	Initialize the ASAN map for the arena.
+ *
+ * asan_poison()
+ *	Mark a region of memory as poisoned. Accessing poisoned memory
+ *	causes asan_report() to fire. Invoked during free().
+ *
+ * asan_unpoison()
+ *	Mark a region as unpoisoned after alloc().
+ *
+ * asan_shadow_set()
+ *	Check directly whether a single byte is poisoned.
+ *
+ * The Algorithm In Brief
+ * ----------------------
+ * Each group of 8 bytes is mapped to a "granule" in the shadow map. This
+ * granule is one byte in size and describes which bytes of the group are valid.
+ * Possible values are:
+ *
+ * 0: All bytes are valid. Makes checks in the middle of an allocated region
+ * (most of them) fast.
+ * (0, 7]: How many consecutive bytes are valid, starting from the lowest one.
+ * The tradeoff is that we can't poison individual bytes in the middle of a
+ * valid region.
+ * [0x80, 0xff]: Special poison values, can be used to denote specific error
+ * modes (e.g., recently freed vs uninitialized memory).
+ *
+ * The mapping between a memory location and its shadow is:
+ * shadow_addr = shadow_base + (addr >> 3). We retain the 8:1 data:shadow
+ * ratio of existing ASAN implementations as a compromise between tracking
+ * granularity and space usage/scan overhead.
+ */
+
+#ifdef BPF_ARENA_ASAN
+
+#pragma clang attribute push(__attribute__((no_sanitize("address"))), \
+			     apply_to = function)
+
+#define SHADOW_ALL_ZEROES ((u64)-1)
+
+/*
+ * Canary variable for ASAN violations. Set to the offending address.
+ */
+volatile u64 asan_violated = 0;
+
+/*
+ * Base address of the shadow map inside the arena, set by asan_init().
+ */
+volatile u64 __asan_shadow_memory_dynamic_address;
+
+volatile u32 asan_reported = false;
+volatile bool asan_inited = false;
+
+/*
+ * Set during program load.
+ */
+volatile bool asan_report_once = false;
+
+/*
+ * BPF does not currently support the memset/memcpy/memcmp intrinsics.
+ * For large sequential copies, or assignments of large data structures,
+ * the frontend will generate an intrinsic that causes the BPF backend
+ * to exit due to a missing implementation. Provide a simple implementation
+ * just for memset to use it for poisoning/unpoisoning the map.
+ */
+__weak int asan_memset(s8 __arena *dst, s8 val, size_t size)
+{
+	size_t i;
+
+	for (i = zero; i < size && can_loop; i++)
+		dst[i] = val;
+
+	return 0;
+}
+
+/* Validate a 1-byte access, which always maps to a single shadow byte. */
+static __always_inline bool memory_is_poisoned_1(s8 __arena *addr)
+{
+	s8 shadow_value = *(s8 __arena *)mem_to_shadow(addr);
+
+	/* Shadow byte is 0: the whole granule is valid, so is the access. */
+	if (likely(!shadow_value))
+		return false;
+
+	/*
+	 * Byte is non-zero. Access is valid if granule offset in [0, shadow_value),
+	 * so the memory is poisoned if shadow_value is negative or not greater
+	 * than the offset of addr within its granule.
+	 */
+
+	return ASAN_GRANULE(addr) >= shadow_value;
+}
+
+/* Validate a 2-, 4-, or 8-byte access; its shadow spans at most 2 bytes. */
+static __always_inline bool memory_is_poisoned_2_4_8(s8 __arena *addr, u64 size)
+{
+	u64 end = (u64)addr + size - 1;
+
+	/*
+	 * Region fully within a single granule (adding size - 1 did not
+	 * carry past the granule offset), so only the last byte is checked.
+	 */
+	if (likely(ASAN_GRANULE(end) >= size - 1))
+		return memory_is_poisoned_1((s8 __arena *)end);
+
+	/*
+	 * Otherwise the first shadow byte must be zero (granule fully valid), and
+	 * the second granule must be unpoisoned up to the end of the accessed region.
+	 */
+
+	return *(s8 __arena *)mem_to_shadow(addr) || memory_is_poisoned_1((s8 __arena *)end);
+}
+
+__weak bool asan_shadow_set(void __arena *addr)
+{
+	return memory_is_poisoned_1(addr);
+}
+
+static __always_inline u64 first_nonzero_byte(u64 addr, size_t size)
+{
+	while (size && can_loop) {
+		if (unlikely(*(s8 __arena *)addr))
+			return addr;
+		addr += 1;
+		size -= 1;
+	}
+
+	return SHADOW_ALL_ZEROES;
+}
+
+static __always_inline bool memory_is_poisoned_n(s8 __arena *addr, u64 size)
+{
+	u64 ret;
+	u64 start;
+	u64 end;
+
+	/* Size of [start, end] is end - start + 1. */
+	start = (u64)mem_to_shadow(addr);
+	end = (u64)mem_to_shadow(addr + size - 1);
+
+	ret = first_nonzero_byte(start, (end - start) + 1);
+	if (likely(ret == SHADOW_ALL_ZEROES))
+		return false;
+
+	return unlikely(ret != end || ASAN_GRANULE(addr + size - 1) >= *(s8 __arena *)end);
+}
+
+__weak int asan_report(s8 __arena *addr, size_t sz, u32 flags)
+{
+	u32 reported = __sync_val_compare_and_swap(&asan_reported, false, true);
+
+	/* Only report the first ASAN violation. */
+	if (reported && asan_report_once)
+		return 0;
+
+	asan_violated = (u64)addr;
+
+	arena_stderr("Memory violation for address %p (0x%lx) for %s of size %ld\n",
+			addr, (u64)addr,
+			(flags & ASAN_WRITE) ? "write" : "read",
+			sz);
+	bpf_stream_print_stack(BPF_STDERR);
+
+	return 0;
+}
+
+/*
+ * Decide from the arguments alone whether an access is valid. Returns true
+ * and sets *result (true for a valid access) when no shadow lookup is needed;
+ * returns false when the caller still has to check the shadow map.
+ */
+static __always_inline bool check_asan_args(s8 __arena *addr, size_t size,
+					    bool *result)
+{
+	bool valid = true;
+
+	/* Size 0 accesses are valid even if the address is invalid. */
+	if (unlikely(size == 0))
+		goto confirmed_valid;
+
+	/*
+	 * The access covers [addr, addr + size - 1]. It wraps around when its last
+	 * byte lies past the 4GiB edge of the arena (last valid address is
+	 * (1UL << 32) - 1), so compare the last byte and not one past it: an
+	 * access ending exactly on the edge is still valid.
+	 *
+	 * The detection works for small sizes. check_asan_args is always called
+	 * from the builtin ASAN checks, so 1 <= size <= 64, and storeN/loadN are
+	 * not expected to pass a size large enough to wrap past addr again.
+	 */
+	if (unlikely((u32)(u64)(addr + size - 1) < (u32)(u64)addr))
+		goto confirmed_invalid;
+
+	return false;
+
+confirmed_invalid:
+	valid = false;
+
+	/* FALLTHROUGH */
+confirmed_valid:
+	*result = valid;
+
+	return true;
+}
+
+static __always_inline bool check_region_inline(intptr_t ptr, size_t size,
+						u32 flags)
+{
+	s8 __arena *addr = (s8 __arena *)(u64)ptr;
+	bool is_poisoned, is_valid;
+
+	if (check_asan_args(addr, size, &is_valid)) {
+		if (!is_valid)
+			asan_report(addr, size, flags);
+		return is_valid;
+	}
+
+	switch (size) {
+	case 1:
+		is_poisoned = memory_is_poisoned_1(addr);
+		break;
+	case 2:
+	case 4:
+	case 8:
+		is_poisoned = memory_is_poisoned_2_4_8(addr, size);
+		break;
+	default:
+		is_poisoned = memory_is_poisoned_n(addr, size);
+	}
+
+	if (is_poisoned) {
+		asan_report(addr, size, flags);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * __alias is not supported for BPF so define *_noabort() variants as wrappers.
+ */
+#define DEFINE_ASAN_LOAD_STORE(size)                                  \
+	__hidden void __asan_store##size(intptr_t addr)                  \
+	{                                                             \
+		check_region_inline(addr, size, ASAN_WRITE);          \
+	}                                                             \
+	__hidden void __asan_store##size##_noabort(intptr_t addr)        \
+	{                                                             \
+		check_region_inline(addr, size, ASAN_WRITE);          \
+	}                                                             \
+	__hidden void __asan_load##size(intptr_t addr)                   \
+	{                                                             \
+		check_region_inline(addr, size, ASAN_READ);           \
+	}                                                             \
+	__hidden void __asan_load##size##_noabort(intptr_t addr)         \
+	{                                                             \
+		check_region_inline(addr, size, ASAN_READ);           \
+	}                                                             \
+	__hidden void __asan_report_store##size(intptr_t addr)           \
+	{                                                             \
+		asan_report((s8 __arena *)addr, size, ASAN_WRITE);           \
+	}                                                             \
+	__hidden void __asan_report_store##size##_noabort(intptr_t addr) \
+	{                                                             \
+		asan_report((s8 __arena *)addr, size, ASAN_WRITE);           \
+	}                                                             \
+	__hidden void __asan_report_load##size(intptr_t addr)            \
+	{                                                             \
+		asan_report((s8 __arena *)addr, size, ASAN_READ);            \
+	}                                                             \
+	__hidden void __asan_report_load##size##_noabort(intptr_t addr)  \
+	{                                                             \
+		asan_report((s8 __arena *)addr, size, ASAN_READ);            \
+	}
+
+DEFINE_ASAN_LOAD_STORE(1);
+DEFINE_ASAN_LOAD_STORE(2);
+DEFINE_ASAN_LOAD_STORE(4);
+DEFINE_ASAN_LOAD_STORE(8);
+
+void __asan_storeN(intptr_t addr, ssize_t size)
+{
+	check_region_inline(addr, size, ASAN_WRITE);
+}
+
+void __asan_storeN_noabort(intptr_t addr, ssize_t size)
+{
+	check_region_inline(addr, size, ASAN_WRITE);
+}
+
+void __asan_loadN(intptr_t addr, ssize_t size)
+{
+	check_region_inline(addr, size, ASAN_READ);
+}
+
+void __asan_loadN_noabort(intptr_t addr, ssize_t size)
+{
+	check_region_inline(addr, size, ASAN_READ);
+}
+
+/*
+ * We currently do not sanitize globals.
+ */
+void __asan_register_globals(intptr_t globals, size_t n)
+{
+}
+
+void __asan_unregister_globals(intptr_t globals, size_t n)
+{
+}
+
+/*
+ * We do not currently have memcpy/memmove/memset intrinsics in LLVM, so no
+ * sanitization is done: each hook only logs the unexpected call, returns NULL.
+ */
+void *__asan_memcpy(void *d, const void *s, size_t n)
+{
+	arena_stderr("ASAN: Unexpected %s call\n", __func__);
+	return NULL;
+}
+
+void *__asan_memmove(void *d, const void *s, size_t n)
+{
+	arena_stderr("ASAN: Unexpected %s call\n", __func__);
+	return NULL;
+}
+
+void *__asan_memset(void *p, int c, size_t n)
+{
+	arena_stderr("ASAN: Unexpected %s call\n", __func__);
+	return NULL;
+}
+
+/*
+ * Poisoning code, used when we add more freed memory to the allocator by:
+ * 	a) pulling memory from the arena segment using bpf_arena_alloc_pages()
+ * 	b) freeing memory from application code
+ */
+__hidden __noasan int asan_poison(void __arena *addr, s8 val, size_t size)
+{
+	s8 __arena *shadow;
+	size_t len;
+
+	/*
+	 * Poisoning from a non-granule address makes no sense: We can only allocate
+	 * memory to the application that has a granule-aligned starting address,
+	 * and bpf_arena_alloc_pages returns page-aligned memory. A non-aligned
+	 * addr then implies we're freeing a different address than the one we
+	 * allocated.
+	 */
+	if (unlikely((u64)addr & ASAN_GRANULE_MASK))
+		return -EINVAL;
+
+	/*
+	 * We cannot free an unaligned region because it'd be possible that we
+	 * cannot describe the resulting poisoning state of the granule in
+	 * the ASAN encoding.
+	 *
+	 * Every granule represents a region of memory that looks like the
+	 * following (P for poisoned bytes, C for clear):
+	 *
+	 * <Clear>  <Poisoned>
+	 * [ C C C ... P P ]
+	 *
+	 * The value of the granule's shadow map is the number of clear bytes in
+	 * it. We cannot represent granules with the following state:
+	 *
+	 * [ P P ... C C ... P P ]
+	 *
+	 * That would be possible if we could free unaligned regions, so prevent that.
+	 */
+	if (unlikely(size & ASAN_GRANULE_MASK))
+		return -EINVAL;
+
+	shadow = mem_to_shadow(addr);
+	len = size >> ASAN_SHADOW_SHIFT;
+
+	asan_memset(shadow, val, len);
+
+	return 0;
+}
+
+/*
+ * Unpoisoning code for marking memory as valid during allocation calls.
+ *
+ * Very similar to asan_poison, except we need to round up instead of
+ * down, then partially poison the last granule if necessary.
+ *
+ * Partial poisoning is useful for keeping the padding poisoned. Allocations
+ * are granule-aligned, so we are reserving granule-aligned sizes for the
+ * allocation. However, we want to still treat accesses to the padding as
+ * invalid. Partial poisoning takes care of that. Freeing and poisoning the
+ * memory is still done in granule-aligned sizes and repoisons the already
+ * poisoned padding.
+ */
+__hidden __noasan int asan_unpoison(void __arena *addr, size_t size)
+{
+	size_t partial = size & ASAN_GRANULE_MASK;
+	s8 __arena *shadow;
+	size_t len;
+
+	/*
+	 * We cannot allocate in the middle of the granule. The ASAN shadow
+	 * map encoding only describes regions of memory where every granule
+	 * follows this format (P for poisoned, C for clear):
+	 *
+	 * <Clear>  <Poisoned>
+	 * [ C C C ... P P ]
+	 *
+	 * This is so we can use a single number in [0, ASAN_SHADOW_SCALE)
+	 * to represent the poison state of the granule.
+	 */
+	if (unlikely((u64)addr & ASAN_GRANULE_MASK))
+		return -EINVAL;
+
+	shadow = mem_to_shadow(addr);
+	len = size >> ASAN_SHADOW_SHIFT;
+
+	asan_memset(shadow, 0, len);
+
+	/*
+	 * If we are allocating a non-granule aligned region, we need to adjust
+	 * the last byte of the shadow map to list how many bytes in the granule
+	 * are unpoisoned. If the region is aligned, then the memset call above
+	 * was enough.
+	 */
+	if (partial)
+		shadow[len] = partial;
+
+	return 0;
+}
+
+/*
+ * Initialize ASAN state: size and allocate the shadow map. Triggered from
+ * userspace before allocator startup. Returns 0, -EINVAL or -ENOMEM.
+ */
+SEC("syscall")
+__weak __noasan int asan_init(struct asan_init_args *args)
+{
+	u64 globals_pages = args->arena_globals_pages;
+	u64 all_pages = args->arena_all_pages;
+	u64 shadow_map, shadow_pgoff;
+	u64 shadow_pages;
+
+	if (asan_inited)
+		return 0;
+
+	/*
+	 * One shadow page covers (1 << ASAN_SHADOW_SHIFT) arena pages; round up.
+	 */
+	shadow_pages = all_pages >> ASAN_SHADOW_SHIFT;
+	if ((all_pages & ((1 << ASAN_SHADOW_SHIFT) - 1)))
+		shadow_pages += 1;
+
+	if (all_pages > (1ULL << 32) / __PAGE_SIZE) {
+		arena_stderr("error: arena size %lx too large\n", all_pages);
+		return -EINVAL;
+	}
+
+	if (globals_pages > all_pages) {
+		arena_stderr("error: globals %lx do not fit in arena %lx\n",
+				globals_pages, all_pages);
+		return -EINVAL;
+	}
+
+	if (globals_pages + shadow_pages >= all_pages) {
+		arena_stderr("error: globals %lx do not leave room for shadow map %lx "
+				"(arena pages %lx)\n",
+				globals_pages, shadow_pages, all_pages);
+		return -EINVAL;
+	}
+
+	shadow_pgoff = all_pages - shadow_pages - globals_pages;
+	__asan_shadow_memory_dynamic_address = shadow_pgoff * __PAGE_SIZE;
+
+	/*
+	 * Allocate the shadow map at the offset computed above, i.e. the arena's
+	 * page count minus the shadow map and globals pages.
+	 *
+	 * The allocated map pages are zeroed out, meaning all memory is marked as valid
+	 * even if it's not allocated already. This is expected: Since the actual memory
+	 * pages are not allocated, accesses to it will trigger page faults and will be
+	 * reported through BPF streams. Any pages allocated through bpf_arena_alloc_pages
+	 * should be poisoned by the allocator right after the call succeeds.
+	 */
+	shadow_map = (u64)bpf_arena_alloc_pages(
+		&arena, (void __arena *)__asan_shadow_memory_dynamic_address,
+		shadow_pages, NUMA_NO_NODE, 0);
+	if (!shadow_map) {
+		arena_stderr("Could not allocate shadow map\n");
+
+		__asan_shadow_memory_dynamic_address = 0;
+
+		return -ENOMEM;
+	}
+
+	asan_inited = true;
+
+	return 0;
+}
+
+#pragma clang attribute pop
+
+#endif /* BPF_ARENA_ASAN */
+
+__weak char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c b/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c
new file mode 100644
index 000000000000..c674ee5cfcc1
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c
@@ -0,0 +1,903 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <libarena/common.h>
+#include <libarena/asan.h>
+#include <libarena/buddy.h>
+
+/*
+ * Buddy allocator arena-based implementation.
+ *
+ * Memory is organized into chunks. These chunks
+ * cannot be coalesced or split. Allocating
+ * chunks allocates their memory eagerly.
+ *
+ * Internally, each chunk is organized into blocks.
+ * Blocks _can_ be coalesced/split, but only inside
+ * the chunk. Each block can be allocated or
+ * unallocated. If allocated, the entire block holds
+ * user data. If unallocated, the block is mostly
+ * invalid memory, with the exception of a header
+ * used for freelist tracking.
+ *
+ * The header is placed at an offset inside the block
+ * to prevent off-by-one errors from the previous block
+ * from trivially overwriting the header. Such an error
+ * is also not catchable by ASAN, since the header remains
+ * valid memory even after the block is freed. It is still
+ * theoretically possible for the header to be corrupted
+ * without being caught by ASAN, but harder.
+ *
+ * Since the allocator needs to track order information for
+ * both allocated and free blocks, and allocated blocks cannot
+ * store a header, the allocator also stores per-chunk order
+ * information in a reserved region at the beginning of the
+ * chunk. The header includes a bitmap with the order of blocks
+ * and their allocation state. It also includes the freelist
+ * heads for the allocation itself.
+ */
+
+
+enum {
+	BUDDY_POISONED = (s8)0xef,	/* shadow value this allocator hands to asan_poison() */
+
+	/* Number of pages to be allocated per chunk. */
+	BUDDY_CHUNK_PAGES	= BUDDY_CHUNK_BYTES / __PAGE_SIZE
+};
+
+static inline int buddy_lock(struct buddy __arena *buddy)
+{
+	return arena_spin_lock(&buddy->lock);	/* callers treat nonzero as "lock not taken" */
+}
+
+static inline void buddy_unlock(struct buddy __arena *buddy)
+{
+	arena_spin_unlock(&buddy->lock);	/* pairs with a successful buddy_lock() */
+}
+
+/*
+ * Reserve part of the arena address space for the allocator. We use
+ * this to get aligned addresses for the chunks, since the arena
+ * page alloc kfuncs do not support aligning to a boundary (in this
+ * case 1 MiB, see buddy.h on how this is derived).
+ */
+static int buddy_reserve_arena_vaddr(struct buddy __arena *buddy)
+{
+	void __arena *base = (void __arena *)BUDDY_VADDR_OFFSET;
+
+	/* Restart the bump cursor that buddy_alloc_arena_vaddr() advances. */
+	buddy->vaddr = 0;
+	return bpf_arena_reserve_pages(&arena, base, BUDDY_VADDR_SIZE / __PAGE_SIZE);
+}
+
+/*
+ * Free up any unused address space. Used only during teardown.
+ */
+static void buddy_unreserve_arena_vaddr(struct buddy __arena *buddy)
+{
+	bpf_arena_free_pages(
+		&arena, (void __arena *)(BUDDY_VADDR_OFFSET + buddy->vaddr),
+		(BUDDY_VADDR_SIZE - buddy->vaddr) / __PAGE_SIZE);
+	/* Rewind the bump cursor that buddy_alloc_arena_vaddr() advances. */
+	buddy->vaddr = 0;
+}
+
+/*
+ * Carve out part of the reserved address space and hand it over
+ * to the buddy allocator.
+ *
+ * We are assuming the buddy allocator is the only allocator in the
+ * system, so there is no race between this function reserving a
+ * page range and some other allocator actually making the BPF call
+ * to really create and reserve it.
+ *
+ * However, bump allocation must still be atomic because this function
+ * is called without the buddy lock from multiple threads concurrently.
+ */
+__weak int buddy_alloc_arena_vaddr(struct buddy __arena *buddy, u64 *vaddrp)
+{
+	u64 cur, seen, next;
+
+	if (!buddy || !vaddrp)
+		return -EINVAL;
+
+	/* Lock-free bump: retry the CAS until it succeeds or can_loop ends the loop. */
+	do {
+		cur = buddy->vaddr;
+		next = cur + BUDDY_CHUNK_BYTES;
+		if (next > BUDDY_VADDR_SIZE)
+			return -EINVAL;
+
+		seen = __sync_val_compare_and_swap(&buddy->vaddr, cur, next);
+	} while (seen != cur && can_loop);
+
+	/* The loop ended without a successful swap. */
+	if (seen != cur)
+		return -EINVAL;
+
+	*vaddrp = BUDDY_VADDR_OFFSET + cur;
+	return 0;
+}
+
+static u64 arena_next_pow2(__u64 n)
+{
+	__u64 v = n - 1;
+
+	/* Smear the top set bit of n - 1 into every lower bit position. */
+	v |= v >> 1;
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	v |= v >> 32;
+	return v + 1;	/* all ones below the top bit, plus one, is a power of two */
+}
+
+/* Flip the allocation bit of block @idx; -EINVAL on a bad index or a no-op flip. */
+__weak
+int idx_set_allocated(struct buddy_chunk __arena *chunk, u64 idx, bool allocated)
+{
+	bool already_allocated;
+
+	if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) {
+		arena_stderr("setting state of invalid idx (%llu, max %d)\n", idx,
+			     BUDDY_CHUNK_ITEMS);
+		return -EINVAL;
+	}
+
+	already_allocated = chunk->allocated[idx / 8] & (1 << (idx % 8));
+	if (unlikely(already_allocated == allocated)) {
+		arena_stderr("Double %s of idx %llu for chunk %p\n",
+			     allocated ? "alloc" : "free", idx, chunk);
+		return -EINVAL;
+	}
+
+	if (allocated)
+		chunk->allocated[idx / 8] |= 1 << (idx % 8);
+	else
+		chunk->allocated[idx / 8] &= ~(1 << (idx % 8));
+
+	return 0;
+}
+
+static int idx_is_allocated(struct buddy_chunk __arena *chunk, u64 idx, bool *allocated)
+{
+	if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) {
+		arena_stderr("getting state of invalid idx (%llu, max %d)\n", idx,
+			     BUDDY_CHUNK_ITEMS);
+		return -EINVAL;
+	}
+	/* One bit per block, eight blocks per byte; written only on success. */
+	*allocated = !!(chunk->allocated[idx >> 3] & (1 << (idx & 7)));
+	return 0;
+}
+
+/* Store @order in the nibble of block @idx; -EINVAL on a bad order or index. */
+__weak
+int idx_set_order(struct buddy_chunk __arena *chunk, u64 idx, u8 order)
+{
+	u8 prev_order;
+
+	if (unlikely(order >= BUDDY_CHUNK_NUM_ORDERS)) {
+		arena_stderr("setting invalid order %u\n", order);
+		return -EINVAL;
+	}
+
+	if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) {
+		arena_stderr("setting order of invalid idx (%llu, max %d)\n", idx,
+			     BUDDY_CHUNK_ITEMS);
+		return -EINVAL;
+	}
+
+	/*
+	 * Two orders are packed per byte: even indices use the high nibble,
+	 * odd indices the low one. Keep the neighbouring nibble intact.
+	 */
+	prev_order = chunk->orders[idx / 2];
+	if (idx & 0x1) {
+		order &= 0xf;
+		order |= (prev_order & 0xf0);
+	} else {
+		order <<= 4;
+		order |= (prev_order & 0xf);
+	}
+	chunk->orders[idx / 2] = order;
+
+	return 0;
+}
+
+static u8 idx_get_order(struct buddy_chunk __arena *chunk, u64 idx)
+{
+	u8 result;
+
+	_Static_assert(BUDDY_CHUNK_NUM_ORDERS <= 16,
+		       "order must fit in 4 bits");
+
+	if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) {
+		arena_stderr("getting order of invalid idx %llu\n", idx);
+		return BUDDY_CHUNK_NUM_ORDERS;	/* never a valid order, see idx_set_order() */
+	}
+
+	/* Two orders per byte: even idx in the high nibble, odd idx in the low. */
+	result = chunk->orders[idx / 2];
+	return (idx & 0x1) ? (result & 0xf) : (result >> 4);
+}
+
+static void __arena *idx_to_addr(struct buddy_chunk __arena *chunk, size_t idx)
+{
+	u64 address;
+
+	if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) {
+		arena_stderr("translating invalid idx %lu\n", idx);
+		return NULL;
+	}
+
+	/*
+	 * The data blocks start in the chunk after the metadata block.
+	 * We find the actual address by indexing into the region at a
+	 * BUDDY_MIN_ALLOC_BYTES granularity, the minimum allowed.
+	 * The index number already accounts for the fact that the first
+	 * blocks in the chunk are occupied by the metadata, so we do
+	 * not need to offset it.
+	 */
+
+	address = (u64)chunk + (idx * BUDDY_MIN_ALLOC_BYTES);
+
+	return (void __arena *)address;
+}
+
+static struct buddy_header __arena *idx_to_header(struct buddy_chunk __arena *chunk, size_t idx)
+{
+	bool in_use;
+	u64 block;
+
+	/* Only free blocks carry a header: reject bad and allocated indices. */
+	if (unlikely(idx_is_allocated(chunk, idx, &in_use))) {
+		arena_stderr("accessing invalid idx 0x%lx\n", idx);
+		return NULL;
+	}
+
+	if (unlikely(in_use)) {
+		arena_stderr("accessing allocated idx 0x%lx as header\n", idx);
+		return NULL;
+	}
+
+	block = (u64)idx_to_addr(chunk, idx);
+	if (!block)
+		return NULL;
+
+	/*
+	 * The header is not placed at the start of the block, so that an
+	 * off-by-one from the adjacent block cannot trivially clobber it.
+	 *
+	 * The offset is a compromise between two constraints:
+	 * 1) ASAN requires valid data runs to be 8-byte aligned.
+	 * 2) The minimum allocation size should stay low (currently 16).
+	 *
+	 * That leaves bytes 0 and 8 as the only candidates. At byte 0, an
+	 * off-by-one from the previous block touches the header and, since
+	 * the header must stay accessible, ASAN does not trigger. At byte 8
+	 * the same error is caught by ASAN. Accesses reaching back from the
+	 * next block (negative offsets) can still hit the header, but they
+	 * are rarer.
+	 */
+	block += BUDDY_HEADER_OFF;
+
+	return (struct buddy_header __arena *)block;
+}
+
+static void header_add_freelist(struct buddy_chunk __arena *chunk, struct buddy_header __arena *header,
+		u64 idx, u8 order)
+{
+	struct buddy_header __arena *tmp_header;
+
+	/* Record the block's order, then push it at the head of that order's list. */
+	idx_set_order(chunk, idx, order);
+	header->next_index = chunk->freelists[order];
+	header->prev_index = BUDDY_CHUNK_ITEMS;
+
+	if (header->next_index != BUDDY_CHUNK_ITEMS) {
+		tmp_header = idx_to_header(chunk, header->next_index);
+		if (tmp_header)	/* NULL for a bad or allocated index: do not write through it */
+			tmp_header->prev_index = idx;
+	}
+	chunk->freelists[order] = idx;
+}
+
+static void header_remove_freelist(struct buddy_chunk __arena  *chunk,
+				   struct buddy_header __arena *header, u8 order)
+{
+	struct buddy_header __arena *tmp_header;
+
+	if (header->prev_index != BUDDY_CHUNK_ITEMS) {
+		tmp_header = idx_to_header(chunk, header->prev_index);
+		if (tmp_header)	/* NULL for a bad or allocated index */
+			tmp_header->next_index = header->next_index;
+	}
+	if (header->next_index != BUDDY_CHUNK_ITEMS) {
+		tmp_header = idx_to_header(chunk, header->next_index);
+		if (tmp_header)	/* NULL for a bad or allocated index */
+			tmp_header->prev_index = header->prev_index;
+	}
+
+	/* Pop off the list head if necessary. */
+	if (idx_to_header(chunk, chunk->freelists[order]) == header)
+		chunk->freelists[order] = header->next_index;
+	header->prev_index = BUDDY_CHUNK_ITEMS;
+	header->next_index = BUDDY_CHUNK_ITEMS;
+}
+
+static u64 size_to_order(size_t size)
+{
+	u64 shift;
+
+	/*
+	 * Accepted sizes are [1, 4GiB], 4GiB being the biggest possible
+	 * arena. Requests close to 4GiB cannot realistically be served,
+	 * but turning those down is left to the caller.
+	 *
+	 * An illegal size yields 64, which buddy_alloc() rejects as an
+	 * out-of-range order.
+	 */
+	if (unlikely(size == 0 || size > (1UL << 32))) {
+		arena_stderr("illegal size request %lu\n", size);
+		return 64;
+	}
+
+	/*
+	 * Round the request up to a power of two and take its log2. The
+	 * order is that log2 relative to the minimum allocation shift;
+	 * requests no bigger than the minimum allocation all map to
+	 * order 0.
+	 */
+	shift = arena_fls(arena_next_pow2(size)) - 1;
+
+	return (shift < BUDDY_MIN_ALLOC_SHIFT) ? 0 : shift - BUDDY_MIN_ALLOC_SHIFT;
+}
+
+__weak
+int add_leftovers_to_freelist(struct buddy_chunk __arena *chunk, u32 cur_idx,
+		u64 min_order, u64 max_order)
+{
+	struct buddy_header __arena *hdr;
+	u32 free_idx;
+	u64 o;
+
+	/*
+	 * Free the block sitting 1 << order slots past cur_idx for every
+	 * order in [min_order, max_order), adding each to its freelist.
+	 */
+	for (o = min_order; o < max_order && can_loop; o++) {
+		free_idx = cur_idx + (1 << o);
+		hdr = idx_to_header(chunk, free_idx);
+		if (unlikely(!hdr)) {
+			arena_stderr("idx %u has no header", free_idx);
+			return -EINVAL;
+		}
+		asan_unpoison(hdr, sizeof(*hdr));
+		header_add_freelist(chunk, hdr, free_idx, o);
+	}
+
+	return 0;
+}
+
+static struct buddy_chunk __arena *buddy_chunk_get(struct buddy __arena *buddy)
+{
+	u64 order, ord, min_order, max_order;
+	struct buddy_chunk __arena  *chunk;
+	size_t left;
+	int power2;
+	u64 vaddr;
+	u32 idx;
+	int ret;
+	/* Returns an initialized chunk or NULL; linking it into buddy is left to the caller. */
+	/*
+	 * Step 1: Allocate a properly aligned chunk, and
+	 * prepare it for insertion into the buddy allocator.
+	 * We don't need the allocator lock until step 2.
+	 */
+
+	ret = buddy_alloc_arena_vaddr(buddy, &vaddr);
+	if (ret)
+		return NULL;
+
+	/* Addresses must be aligned to the chunk boundary. */
+	if (vaddr % BUDDY_CHUNK_BYTES)
+		return NULL;
+
+	/* Unreserve the chunk's address range so that it can be allocated below. */
+	bpf_arena_free_pages(&arena, (void __arena *)vaddr,
+			     BUDDY_CHUNK_PAGES);
+
+	chunk = bpf_arena_alloc_pages(&arena, (void __arena *)vaddr,
+				      BUDDY_CHUNK_PAGES, NUMA_NO_NODE, 0);
+	if (!chunk) {
+		arena_stderr("[ALLOC FAILED]");
+		return NULL;
+	}
+
+	if (buddy_lock(buddy)) {
+		/*
+		 * We cannot reclaim the vaddr space, but that is ok - this
+		 * operation should always succeed. The error path is to catch
+		 * accidental deadlocks that will cause -ENOMEMs to the program as
+		 * the allocator fails to refill itself, in which case vaddr usage
+		 * is the least of our worries.
+		 */
+		bpf_arena_free_pages(&arena, (void __arena *)vaddr, BUDDY_CHUNK_PAGES);
+		return NULL;
+	}
+
+	asan_poison(chunk, BUDDY_POISONED, BUDDY_CHUNK_PAGES * __PAGE_SIZE);
+
+	/* Unpoison the chunk metadata struct itself. */
+	asan_unpoison(chunk, sizeof(*chunk));
+
+	/* Mark all freelists as empty. */
+	for (ord = zero; ord < BUDDY_CHUNK_NUM_ORDERS && can_loop; ord++)
+		chunk->freelists[ord] = BUDDY_CHUNK_ITEMS;
+
+	/*
+	 * Initialize the chunk by carving out a block range to hold the metadata
+	 * struct above, then dumping the rest of the blocks into the allocator.
+	 */
+
+	_Static_assert(BUDDY_CHUNK_PAGES * __PAGE_SIZE >=
+			       BUDDY_MIN_ALLOC_BYTES *
+				       BUDDY_CHUNK_ITEMS,
+		       "chunk must fit within the allocation");
+
+	/*
+	 * Step 2: Reserve blocks for the chunk metadata, then break
+	 * the rest of the full allocation into the different buckets.
+	 * We allocate the metadata by grabbing blocks of progressively
+	 * smaller sizes from the allocator, which are guaranteed to be
+	 * contiguous.
+	 *
+	 * This operation also populates the allocator.
+	 *
+	 * Algorithm:
+	 *
+	 * - max_order: The last order allocation we made
+	 * - left: How many bytes are left to allocate
+	 * - idx: Current index into the top-level block we are
+	 * allocating from.
+	 *
+	 * For every iteration:
+	 * - Find the largest power-of-2 allocation not larger than left (infimum)
+	 * - Reserve a block of that size, along with its buddy
+	 * - For every order from [infimum + 1, last order), carve out a block
+	 *   and put it into the allocator.
+	 *
+	 *  Example: sizeof(*chunk) == 0b1010000 (80 bytes)
+	 *
+	 *  Iteration 1:
+	 *
+	 *   idx  infimum                             1 << max_order
+	 *   0        64        128                    1 << 20
+	 *   |________|_________|______________________|
+	 *
+	 *   Blocks set aside:
+	 *      [0, 64)         - Completely allocated
+	 *      [64, 128)       - Will be further split in the next iteration
+	 *
+	 *   Blocks added to the allocator:
+	 *      [128, 256)
+	 *      [256, 512)
+	 *      ...
+	 *      [1 << 18, 1 << 19)
+	 *      [1 << 19, 1 << 20)
+	 *
+	 *  Iteration 2:
+	 *
+	 *   idx  infimum                              idx + 1 << max_order
+	 *   64       80        96                     64 + 1 << 6 = 128
+	 *   |________|_________|______________________|
+	 *
+	 *   Blocks set aside:
+	 *      [64, 80)        - Completely allocated
+	 *
+	 *   Blocks added to the allocator:
+	 *      [80, 96) - left == 0 so the buddy is unused and marked as freed
+	 *      [96, 128)
+	 */
+	 max_order = BUDDY_CHUNK_NUM_ORDERS;
+	left = sizeof(*chunk);
+	idx = 0;
+	while (left && can_loop) {
+		power2 = arena_fls(left) - 1;
+		/*
+		 * Note: The condition below only triggers to catch serious bugs
+		 * early. There is no sane way to undo any block insertions from
+		 * the allocated chunk, so just leak any leftover allocations,
+		 * emit a diagnostic, unlock and exit.
+		 *
+		 */
+		if (unlikely(power2 >= BUDDY_CHUNK_NUM_ORDERS)) {
+			arena_stderr(
+				"buddy chunk metadata require allocation of order %d\n",
+				power2);
+			arena_stderr(
+				"chunk has size of 0x%lx bytes (left %lx bytes)\n",
+				sizeof(*chunk), left);
+			buddy_unlock(buddy);
+
+			return NULL;
+		}
+
+		/* Round up allocations that are too small: they use order 0 and finish 'left'. */
+
+		left -= (power2 >= BUDDY_MIN_ALLOC_SHIFT) ? 1 << power2 : left;
+		order = (power2 >= BUDDY_MIN_ALLOC_SHIFT) ? power2 - BUDDY_MIN_ALLOC_SHIFT : 0;
+
+		if (idx_set_allocated(chunk, idx, true)) {
+			buddy_unlock(buddy);
+			return NULL;
+		}
+
+		/*
+		 * Starting an order above the one we allocated, populate
+		 * the allocator with free blocks. If this is the last
+		 * allocation (left == 0), also mark the buddy as free.
+		 *
+		 * See comment above about error handling: The error path
+		 * is only there as a way to mitigate deeply buggy allocator
+		 * states by emitting a diagnostic in add_leftovers_to_freelist()
+		 * and leaking any memory not added in the freelists.
+		 */
+		min_order = left ? order + 1 : order;
+		if (add_leftovers_to_freelist(chunk, idx, min_order, max_order)) {
+			buddy_unlock(buddy);
+			return NULL;
+		}
+
+		/* Move past the block just set aside; it bounds the next round's orders. */
+		idx += 1 << order;
+		max_order = order;
+	}
+
+	buddy_unlock(buddy);
+
+	return chunk;
+}
+
+__weak int buddy_init(struct buddy __arena *buddy)
+{
+	struct buddy_chunk __arena *chunk;
+	int ret;
+
+	if (!buddy || !asan_ready())	/* buddy is dereferenced below */
+		return -EINVAL;
+
+	/* Reserve enough address space to ensure allocations are aligned. */
+	ret = buddy_reserve_arena_vaddr(buddy);
+	if (ret)
+		return ret;
+
+	_Static_assert(BUDDY_CHUNK_PAGES > 0,
+		       "chunk must use one or more pages");
+
+	/* May be NULL; that case is reported as -ENOMEM at the end. */
+	chunk = buddy_chunk_get(buddy);
+	if (buddy_lock(buddy)) {
+		if (chunk)
+			bpf_arena_free_pages(&arena, chunk, BUDDY_CHUNK_PAGES);
+		return -EINVAL;
+	}
+
+	/* Chunk is already properly unpoisoned if allocated. */
+	if (chunk)
+		chunk->next = buddy->first_chunk;
+
+	/* Put the chunk at the beginning of the list. */
+	buddy->first_chunk = chunk;
+	buddy_unlock(buddy);
+
+	return chunk ? 0 : -ENOMEM;
+}
+
+/*
+ * Destroy the allocator. This does not check whether there are any allocations
+ * currently in use, so any pages being accessed will start taking arena faults.
+ * We do not take a lock because we are freeing arena pages, and nobody should
+ * be using the allocator at that point in the execution.
+ */
+__weak int buddy_destroy(struct buddy __arena *buddy)
+{
+	struct buddy_chunk __arena *cur, *following;
+
+	if (!buddy)
+		return -EINVAL;
+
+	/*
+	 * Walk the chunk list and hand every chunk back to the arena in
+	 * one piece, exactly as it was allocated.
+	 */
+	cur = buddy->first_chunk;
+	while (cur && can_loop) {
+		/* Read the link first: the chunk is unusable once freed. */
+		following = cur->next;
+
+		/* Poison the whole chunk before its pages go away. */
+		asan_poison(cur, BUDDY_POISONED, BUDDY_CHUNK_PAGES * __PAGE_SIZE);
+		bpf_arena_free_pages(&arena, cur, BUDDY_CHUNK_PAGES);
+		cur = following;
+	}
+
+	/* Release whatever is left of the reserved address range. */
+	buddy_unreserve_arena_vaddr(buddy);
+	buddy->first_chunk = NULL;
+
+	return 0;
+}
+
+__weak u64 buddy_chunk_alloc(struct buddy_chunk __arena *chunk, int order_req)
+{
+	struct buddy_header __arena *header, *tmp_header, *next_header;
+	u32 idx, tmpidx, retidx;
+	u64 address;
+	u64 order = 0;
+	u64 i;
+
+	/* Find the smallest order >= order_req that has a free block. */
+	for (order = order_req; order < BUDDY_CHUNK_NUM_ORDERS && can_loop; order++) {
+		if (chunk->freelists[order] != BUDDY_CHUNK_ITEMS)
+			break;
+	}
+
+	if (order >= BUDDY_CHUNK_NUM_ORDERS)
+		return (u64)NULL;
+
+	retidx = chunk->freelists[order];
+	header = idx_to_header(chunk, retidx);
+	if (unlikely(!header))
+		return (u64) NULL;
+
+	/* Unlink the block from the head of its freelist. */
+	chunk->freelists[order] = header->next_index;
+	if (header->next_index != BUDDY_CHUNK_ITEMS) {
+		next_header = idx_to_header(chunk, header->next_index);
+		if (next_header)	/* NULL for a bad or allocated index */
+			next_header->prev_index = BUDDY_CHUNK_ITEMS;
+	}
+	header->prev_index = BUDDY_CHUNK_ITEMS;
+	header->next_index = BUDDY_CHUNK_ITEMS;
+	if (idx_set_order(chunk, retidx, order_req))
+		return (u64)NULL;
+
+	if (idx_set_allocated(chunk, retidx, true))
+		return (u64)NULL;
+
+	/*
+	 * Do not unpoison the address yet, will be done by the caller
+	 * because the caller has the exact allocation size requested.
+	 */
+	address = (u64)idx_to_addr(chunk, retidx);
+	if (!address)
+		return (u64)NULL;
+
+	/* If we allocated from a larger-order chunk, split the buddies. */
+	for (i = order_req; i < order && can_loop; i++) {
+		/*
+		 * Flip the bit for the current order (the bit is guaranteed
+		 * to be 0, so just add 1 << i).
+		 */
+		idx = retidx + (1 << i);
+
+		/* The buddy must be a free block, or it has no usable header. */
+		header = idx_to_header(chunk, idx);
+		if (unlikely(!header))
+			return (u64)NULL;
+		/* Unpoison the buddy header */
+		asan_unpoison(header, sizeof(*header));
+		if (idx_set_order(chunk, idx, i))
+			return (u64)NULL;
+
+		/* Push the header to the beginning of the freelists list. */
+		tmpidx = chunk->freelists[i];
+		header->prev_index = BUDDY_CHUNK_ITEMS;
+		header->next_index = tmpidx;
+		if (tmpidx != BUDDY_CHUNK_ITEMS) {
+			tmp_header = idx_to_header(chunk, tmpidx);
+			if (tmp_header)	/* NULL for a bad or allocated index */
+				tmp_header->prev_index = idx;
+		}
+		chunk->freelists[i] = idx;
+	}
+
+	return address;
+}
+
+/* Scan the existing chunks for available memory. */
+static u64 buddy_alloc_from_existing_chunks(struct buddy __arena *buddy, int order)
+{
+	struct buddy_chunk __arena *cur = buddy->first_chunk;
+	u64 found;
+
+	/* First fit: take the block from the first chunk able to serve it. */
+	while (cur != NULL && can_loop) {
+		found = buddy_chunk_alloc(cur, order);
+		if (found)
+			return found;
+		cur = cur->next;
+	}
+	return (u64)NULL;
+}
+
+/*
+ * Try an allocation from a newly allocated chunk. Also
+ * incorporate the chunk into the linked list.
+ */
+static u64 buddy_alloc_from_new_chunk(struct buddy __arena *buddy, struct buddy_chunk __arena *chunk, int order)
+{
+	u64 block;
+
+	/* The list head is shared state; without the lock, give up. */
+	if (buddy_lock(buddy))
+		return (u64)NULL;
+
+	/*
+	 * Publish the chunk as the new list head, then serve this
+	 * request from it before releasing the lock, so no other
+	 * caller can drain the fresh chunk first.
+	 */
+	chunk->next = buddy->first_chunk;
+	buddy->first_chunk = chunk;
+	block = buddy_chunk_alloc(chunk, order);
+
+	buddy_unlock(buddy);
+
+	return block;
+}
+__weak
+void __arena *buddy_alloc(struct buddy __arena *buddy, size_t size)
+{
+	struct buddy_chunk __arena *fresh;
+	void __arena *address = NULL;
+	int order;
+
+	if (!buddy)
+		return NULL;
+
+	order = size_to_order(size);
+	if (order >= BUDDY_CHUNK_NUM_ORDERS || order < 0) {
+		arena_stderr("invalid order %d (sz %lu)\n", order, size);
+		return NULL;
+	}
+
+	/* Fast path: serve the request from a chunk we already own. */
+	if (buddy_lock(buddy))
+		return NULL;
+	address = (u8 __arena *)buddy_alloc_from_existing_chunks(buddy, order);
+	buddy_unlock(buddy);
+
+	/*
+	 * Slow path: grow the allocator by one chunk and retry from it.
+	 * The lock is dropped here because buddy_chunk_get() and
+	 * buddy_alloc_from_new_chunk() each take it themselves.
+	 */
+	if (!address) {
+		fresh = buddy_chunk_get(buddy);
+		if (fresh)
+			address = (u8 __arena *)buddy_alloc_from_new_chunk(buddy, fresh, order);
+	}
+	if (!address)
+		return NULL;
+
+	/*
+	 * Expose exactly the bytes that were asked for. A request that
+	 * ends inside the freelist header must first get the header
+	 * re-poisoned, since it was left accessible while the block was free.
+	 */
+	if (size < BUDDY_HEADER_OFF + sizeof(struct buddy_header __arena))
+		asan_poison(address + BUDDY_HEADER_OFF, BUDDY_POISONED,
+			    sizeof(struct buddy_header __arena));
+	asan_unpoison(address, size);
+
+	return address;
+}
+
+static __always_inline int buddy_free_unlocked(struct buddy __arena *buddy, u64 addr)
+{
+	struct buddy_header __arena *header, *buddy_header;
+	u64 idx, buddy_idx, tmp_idx;
+	struct buddy_chunk __arena *chunk;
+	bool allocated;
+	u8 order;
+	int ret;
+
+	if (!buddy)
+		return -EINVAL;
+
+	if (addr & (BUDDY_MIN_ALLOC_BYTES - 1)) {
+		arena_stderr("Freeing unaligned address %llx\n", addr);
+		return -EINVAL;
+	}
+
+	/* Get (chunk, idx) out of the address. */
+	chunk = (void __arena *)(addr & ~BUDDY_CHUNK_OFFSET_MASK);
+	idx = (addr & BUDDY_CHUNK_OFFSET_MASK) / BUDDY_MIN_ALLOC_BYTES;
+
+	/* Mark the block as unallocated so we can access the header. */
+	ret = idx_set_allocated(chunk, idx, false);
+	if (ret)
+		return ret;
+
+	order  = idx_get_order(chunk, idx);
+	header = idx_to_header(chunk, idx);
+
+	/* The header is in the block itself, keep it unpoisoned. */
+	asan_poison((u8 __arena *)addr, BUDDY_POISONED,
+		    BUDDY_MIN_ALLOC_BYTES << order);
+	asan_unpoison(header, sizeof(*header));
+
+	/*
+	 * Coalescing loop. Merge with free buddies of equal order.
+	 * For every coalescing step, keep the left buddy and
+	 * drop the right buddy's header.
+	 */
+	for (; order < BUDDY_CHUNK_NUM_ORDERS && can_loop; order++) {
+		buddy_idx = idx ^ (1 << order);
+
+		/* Stop if the buddy is in use or the lookup failed ('allocated' is unset then). */
+		if (idx_is_allocated(chunk, buddy_idx, &allocated) || allocated)
+			break;
+
+		/*
+		 * If buddy is not the same order as the chunk
+		 * being freed, then we're done coalescing.
+		 */
+		if (idx_get_order(chunk, buddy_idx) != order)
+			break;
+
+		/* The buddy is about to be merged: take it off its freelist. */
+		buddy_header = idx_to_header(chunk, buddy_idx);
+		header_remove_freelist(chunk, buddy_header, order);
+
+		/* Keep the left header out of the two buddies, drop the other one. */
+		if (buddy_idx < idx) {
+			tmp_idx = idx;
+			idx = buddy_idx;
+			buddy_idx = tmp_idx;
+		}
+
+		/* buddy_idx is now the right half: record its pre-merge order. */
+		idx_set_order(chunk, buddy_idx, order);
+
+		buddy_header = idx_to_header(chunk, buddy_idx);
+		asan_poison(buddy_header, BUDDY_POISONED,
+			    sizeof(*buddy_header));
+	}
+
+	/* Header properly freed but not in any freelists yet. */
+	idx_set_order(chunk, idx, order);
+
+	header = idx_to_header(chunk, idx);
+	header_add_freelist(chunk, header, idx, order);
+
+	return 0;
+}
+
+__weak int buddy_free(struct buddy __arena *buddy, void __arena *addr)
+{
+	int err;
+
+	if (!buddy)
+		return -EINVAL;
+	if (!addr)	/* like free(NULL): nothing to do, not an error */
+		return 0;
+
+	/* Bitmap and freelist updates must not race with other callers. */
+	err = buddy_lock(buddy);
+	if (err)
+		return err;
+
+	/* Address validation and double-free detection happen in the helper. */
+	err = buddy_free_unlocked(buddy, (u64)addr);
+
+	buddy_unlock(buddy);
+
+	return err;
+}
+
+__weak char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/libarena/src/common.bpf.c b/tools/testing/selftests/bpf/libarena/src/common.bpf.c
new file mode 100644
index 000000000000..50be57213dfb
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/src/common.bpf.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <libarena/common.h>
+#include <libarena/asan.h>
+#include <libarena/buddy.h>
+
+const volatile u32 zero = 0;
+
+struct buddy __arena buddy;
+
+int arena_fls(__u64 word)
+{
+	/*
+	 * 1-based index of the top set bit; 0 is special-cased (clz of 0 is undefined).
+	 */
+	return word ? 64 - __builtin_clzll(word) : 0;
+}
+
+SEC("syscall")
+__weak int arena_get_info(struct arena_get_info_args *args)
+{
+	/* Report arena_base(&arena) through the args struct; always returns 0. */
+	args->arena_base = arena_base(&arena);
+	return 0;
+}
+
+SEC("syscall")
+__weak int arena_alloc_reserve(struct arena_alloc_reserve_args *args)
+{
+	return bpf_arena_reserve_pages(&arena, NULL, args->nr_pages);	/* NULL address; result passed through */
+}
+
+SEC("syscall")
+__weak int arena_buddy_reset(void)
+{
+	buddy_destroy(&buddy);
+	/* buddy_destroy()'s status is dropped; only buddy_init()'s result is reported. */
+	return buddy_init(&buddy);
+}
+
+__weak void __arena *arena_malloc(size_t size)
+{
+	return buddy_alloc(&buddy, size);	/* served by the file-scope 'buddy' allocator */
+}
+
+__weak void arena_free(void __arena *ptr)
+{
+	buddy_free(&buddy, ptr);	/* buddy_free()'s error code is discarded */
+}
+
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/libarena/src/rbtree.bpf.c b/tools/testing/selftests/bpf/libarena/src/rbtree.bpf.c
new file mode 100644
index 000000000000..7f0f6dc3e17d
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/src/rbtree.bpf.c
@@ -0,0 +1,1047 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/*
+ * Copyright (c) 2025-2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025-2026 Emil Tsalapatis <emil@etsalapatis.com>
+ */
+
+#include <libarena/common.h>
+
+#include <libarena/asan.h>
+#include <libarena/rbtree.h>
+
+int rb_integrity_check(struct rbtree __arena *rbtree);
+void rbnode_print(size_t depth, struct rbnode __arena *rbn);
+static int rbnode_replace(struct rbtree __arena *rbtree,
+			  struct rbnode __arena *existing,
+			  struct rbnode __arena *replacement);
+
+struct rbtree __arena *rb_create(enum rbtree_alloc alloc,
+				 enum rbtree_insert_mode insert)
+{
+	struct rbtree __arena *rbtree;
+
+	/*
+	 * RB_UPDATE overwrites existing values in the nodes, but RB_NOALLOC
+	 * trees manage the tree nodes directly (including holding pointers
+	 * to them). Disallow mixing the two modes to avoid dealing with
+	 * unintuitive semantics.
+	 * Check this before allocating so the error path has nothing to free.
+	 */
+	if (alloc == RB_NOALLOC && insert == RB_UPDATE) {
+		arena_stderr("WARNING: Cannot combine RB_NOALLOC and RB_UPDATE");
+		return NULL;
+	}
+
+	rbtree = arena_malloc(sizeof(*rbtree));
+	if (unlikely(!rbtree))
+		return NULL;
+
+	rbtree->alloc = alloc;
+	rbtree->insert = insert;
+	rbtree->root = NULL;
+
+	return rbtree;
+}
+
+__weak
+int rb_destroy(struct rbtree __arena *rbtree)
+{
+	int err = 0;
+
+	arena_subprog_init();
+
+	if (unlikely(!rbtree))
+		return -EINVAL;
+
+	if (rbtree->alloc != RB_NOALLOC) {
+		/*
+		 * Drain the tree through rb_remove(), one root at a time,
+		 * until it is empty or a removal fails. The tree itself
+		 * is freed either way, and the error of a failed removal
+		 * is what gets returned.
+		 */
+		while (rbtree->root && can_loop) {
+			err = rb_remove(rbtree, rbtree->root->key);
+			if (err)
+				break;
+		}
+	} else if (rbtree->root) {
+		/*
+		 * RB_NOALLOC nodes belong to the caller that allocated and
+		 * inserted them. Freeing them here would almost certainly
+		 * lead to use-after-free in those callers, so refuse to
+		 * destroy a non-empty tree and leave it untouched.
+		 */
+		arena_stderr("WARNING: Destroying RB_NOALLOC tree with > 0 nodes");
+		return -EBUSY;
+	}
+
+	arena_free(rbtree);
+
+	return err;
+}
+
+static inline int rbnode_dir(struct rbnode __arena *node)
+{
+	struct rbnode __arena *up = node->parent;
+	/* 0 if node is its parent's left child, 1 otherwise; the root reports 0. */
+	if (unlikely(!up))
+		return 0;
+	return up->left != node;
+}
+
+/*
+ * The __noinline is to prevent inlining from bloating the add and
+ * remove calls, which would in turn cause register spills and push
+ * stack usage above what is permitted.
+ */
+__noinline
+int rbnode_rotate(struct rbtree __arena *rbtree,
+		  struct rbnode __arena *node, int dir)
+{
+	struct rbnode __arena *pivot, *inner, *up = node->parent;
+	int opp = 1 - dir;
+	int updir;
+
+	/*
+	 * Note which slot of the parent holds node before any link is
+	 * rewritten. A node without a parent has to be the tree root.
+	 */
+	if (up)
+		updir = rbnode_dir(node);
+	else if (unlikely(rbtree->root != node))
+		return -EINVAL;
+
+	/* The child on the side opposite to the rotation moves up. */
+	pivot = node->child[opp];
+	if (unlikely(!pivot))
+		return -EINVAL;
+
+	/* The pivot's dir-side subtree is handed over to node. */
+	inner = pivot->child[dir];
+	node->child[opp] = inner;
+	if (inner)
+		inner->parent = node;
+
+	/* node drops below the pivot. */
+	pivot->child[dir] = node;
+	node->parent = pivot;
+
+	/* The pivot takes node's former place under up, or becomes the root. */
+	pivot->parent = up;
+	if (up)
+		up->child[updir] = pivot;
+	else
+		rbtree->root = pivot;
+
+	return 0;
+}
+
+static
+struct rbnode __arena *rbnode_find(struct rbnode __arena *subtree, u64 key)
+{
+	struct rbnode __arena *cur = subtree;
+	struct rbnode __arena *next;
+
+	if (!cur)
+		return NULL;
+
+	/*
+	 * Walk down until the key matches or there is no child to follow.
+	 * The last node visited is returned, so callers must check its key.
+	 */
+	while (can_loop) {
+		if (cur->key == key)
+			break;
+		next = cur->child[(key < cur->key) ? 0 : 1];
+		if (!next)
+			break;
+		cur = next;
+	}
+	return cur;
+}
+
+static
+struct rbnode __arena *rbnode_least_upper_bound(struct rbnode __arena *subtree, uint64_t key)
+{
+	struct rbnode __arena *cur = subtree;
+	struct rbnode __arena *next;
+
+	if (!cur)
+		return NULL;
+
+	/*
+	 * Equal keys go left; the walk stops where the chosen side has no child.
+	 */
+	while (can_loop) {
+		next = cur->child[(key <= cur->key) ? 0 : 1];
+		if (!next)
+			break;
+		cur = next;
+	}
+	return cur;
+}
+
+__weak
+int rb_find(struct rbtree __arena *rbtree, u64 key, u64 *value)
+{
+	struct rbnode __arena *hit;
+
+	if (unlikely(!rbtree || !value))
+		return -EINVAL;
+
+	/*
+	 * rbnode_find() hands back the last node it visited when the key is
+	 * absent, so a non-NULL result still needs its key compared.
+	 */
+	hit = rbnode_find(rbtree->root, key);
+	if (!hit || hit->key != key)
+		return -ENOENT;
+
+	*value = hit->value;
+	return 0;
+}
+
+__weak
+struct rbnode __arena *rb_node_alloc(u64 key, u64 value)
+{
+	struct rbnode __arena *rbnode = NULL;
+
+	rbnode = (struct rbnode __arena *)arena_malloc(sizeof(*rbnode));
+	if (!rbnode)
+		return NULL;	/* arena_malloc() failed */
+
+	/*
+	 * WARNING: The order of assignments is weird on purpose.
+	 * See comment in rb_insert_node() for more context.
+	 * TL;DR: Prevent consecutive 0 assignments from being
+	 * promoted into an unverifiable memset by the compiler,
+	 * so keep zero and non-zero stores interleaved.
+	 */
+	rbnode->key = key;
+	rbnode->parent = NULL;
+	rbnode->value = value;
+	rbnode->left = NULL;
+	rbnode->is_red = true;	/* a fresh node is red and has no links */
+	rbnode->right = NULL;
+
+	return rbnode;
+}
+
+__weak
+void rb_node_free(struct rbnode __arena *rbnode)
+{
+	arena_free(rbnode);	/* thin wrapper; pairs with rb_node_alloc() */
+}
+
+static
+int rb_node_insert(struct rbtree __arena *rbtree,
+		   struct rbnode __arena *node)
+{
+	struct rbnode __arena *grandparent, *parent, *uncle;
+	int dir, ret;
+	u64 key;
+
+	/* Validate both pointers before reading through either of them. */
+	if (unlikely(!rbtree || !node))
+		return -EINVAL;
+
+	parent = rbtree->root;
+	key = node->key;
+	if (!parent) {
+		rbtree->root = node;
+		return 0;
+	}
+
+	if (rbtree->insert != RB_DUPLICATE)
+		parent = rbnode_find(parent, key);
+	else
+		parent = rbnode_least_upper_bound(parent, key);
+
+	if (key == parent->key && rbtree->insert != RB_DUPLICATE) {
+		if (rbtree->insert == RB_UPDATE) {
+			/*
+			 * Replace the old node with the new one.
+			 * Free up the old node.
+			 */
+			ret = rbnode_replace(rbtree, parent, node);
+			if (ret)
+				return ret;
+
+			if (rbtree->alloc == RB_ALLOC)
+				rb_node_free(parent);
+
+			return 0;
+		}
+
+		/* Otherwise it's RB_DEFAULT. */
+		return -EALREADY;
+	}
+
+	node->parent = parent;
+	/* Also works if key == parent->key. */
+	if (key <= parent->key)
+		parent->left = node;
+	else
+		parent->right = node;
+
+	while (can_loop) {
+		parent = node->parent;
+		if (!parent)
+			return 0;
+
+		if (!parent->is_red)
+			return 0;
+
+		grandparent = parent->parent;
+		if (!grandparent) {
+			parent->is_red = false;
+			return 0;
+		}
+
+		dir = rbnode_dir(parent);
+		uncle = grandparent->child[1 - dir];
+
+		if (!uncle || !uncle->is_red) {
+			/* Inner grandchild: rotate it to the outside first. */
+			if (node == parent->child[1 - dir]) {
+				rbnode_rotate(rbtree, parent, dir);
+				node = parent;
+				parent = grandparent->child[dir];
+			}
+
+			rbnode_rotate(rbtree, grandparent, 1 - dir);
+			parent->is_red = false;
+			grandparent->is_red = true;
+			return 0;
+		}
+
+		/* Uncle is red: recolor and continue from the grandparent. */
+		parent->is_red = false;
+		uncle->is_red = false;
+		grandparent->is_red = true;
+
+		node = grandparent;
+	}
+
+	return 0;
+}
+
+int rb_insert_node(struct rbtree __arena *rbtree,
+		   struct rbnode __arena *node)
+{
+	if (unlikely(!rbtree))
+		return -EINVAL;
+
+	if (unlikely(rbtree->alloc == RB_ALLOC))
+		return -EINVAL;
+
+	node->left = NULL;
+
+	/*
+	 * Workaround to break an optimization that causes
+	 * verification failures on some compilers. Assignments
+	 * of the kind
+	 *
+	 * *(r0 + 0) = 0;
+	 * *(r0 + 8) = 0;
+	 * *(r0 + 16) = 0;
+	 *
+	 * get promoted into a memset, and that in turn is not
+	 * handled properly for arena memory by LLVM 21 and GCC 15.
+	 * Add a barrier for now to prevent the assignments from being fused.
+	 */
+	barrier();
+
+	node->parent = NULL;
+	node->right = NULL;
+
+	node->is_red = true;
+
+	return rb_node_insert(rbtree, node);
+}
+
+__weak
+int rb_insert(struct rbtree __arena *rbtree, u64 key, u64 value)
+{
+	struct rbnode __arena *node;
+	int ret;
+
+	if (unlikely(!rbtree))
+		return -EINVAL;
+
+	if (unlikely(rbtree->alloc != RB_ALLOC))
+		return -EINVAL;
+
+	node = rb_node_alloc(key, value);
+	if (!node)
+		return -ENOMEM;
+
+	ret = rb_node_insert(rbtree, node);
+	if (ret) {
+		rb_node_free(node);
+		return ret;
+	}
+
+	return 0;
+}
+
+static inline struct rbnode __arena *rbnode_least(struct rbnode __arena *subtree)
+{
+	while (subtree->left && can_loop)
+		subtree = subtree->left;
+
+	return subtree;
+}
+
+__weak int rb_least(struct rbtree __arena *rbtree, u64 *key, u64 *value)
+{
+	struct rbnode __arena *least;
+
+	if (unlikely(!rbtree))
+		return -EINVAL;
+
+	if (!rbtree->root)
+		return -ENOENT;
+
+	least = rbnode_least(rbtree->root);
+	if (key)
+		*key = least->key;
+	if (value)
+		*value = least->value;
+
+	return 0;
+}
+
+
+/*
+ * After a swap, a node that points at itself means a and b are parent
+ * and child; that pointer should refer to the other node instead.
+ */
+static inline void rbnode_fixup_pointers(struct rbnode __arena *a,
+					 struct rbnode __arena *b)
+{
+#define fixup(n1, n2, member) do { if (n1->member == n1) n1->member = n2; } while (0)
+	fixup(a, b, left);
+	fixup(a, b, right);
+	fixup(a, b, parent);
+#undef fixup
+}
+
+static inline void rbnode_swap_values(struct rbnode __arena *a,
+				      struct rbnode __arena *b)
+{
+#define swap(n1, n2, tmp) do { (tmp) = (n1); (n1) = (n2); (n2) = (tmp); } while (0)
+	struct rbnode __arena *tmpnode;
+	u64 tmp;
+
+	/* Swap the colors and the tree pointers; key/value are not touched. */
+	swap(a->is_red, b->is_red, tmp);
+
+	swap(a->left, b->left, tmpnode);
+	swap(a->right, b->right, tmpnode);
+	swap(a->parent, b->parent, tmpnode);
+#undef swap
+
+	/* Account for the nodes being parent and child. */
+	rbnode_fixup_pointers(b, a);
+	rbnode_fixup_pointers(a, b);
+}
+
+static inline void rbnode_adjust_neighbors(struct rbtree __arena *rbtree,
+					   struct rbnode __arena *node, int dir)
+{
+	if (node->left)
+		node->left->parent = node;
+	if (node->right)
+		node->right->parent = node;
+
+	if (node->parent) {
+		node->parent->child[dir] = node;
+		return;
+	}
+
+	rbtree->root = node;
+}
+
+/*
+ * Directly replace an existing node with a replacement. The replacement node
+ * should not already be in the tree.
+ */
+static int rbnode_replace(struct rbtree __arena *rbtree,
+			  struct rbnode __arena *existing,
+			  struct rbnode __arena *replacement)
+{
+	int dir = 0;
+
+	if (unlikely(replacement->parent || replacement->left || replacement->right))
+		return -EINVAL;
+
+	if (existing->parent)
+		dir = rbnode_dir(existing);
+
+	replacement->is_red = existing->is_red;
+	replacement->left = existing->left;
+	replacement->right = existing->right;
+	replacement->parent = existing->parent;
+
+	/* Fix up the new node's neighbors. */
+	rbnode_adjust_neighbors(rbtree, replacement, dir);
+
+	return 0;
+}
+
+/*
+ * Switch two nodes in the tree in place. This is useful during node deletion.
+ * This is more involved than switching the values of the two nodes because we
+ * must update all tree pointers.
+ */
+static void rbnode_switch(struct rbtree __arena *rbtree,
+			  struct rbnode __arena *a,
+			  struct rbnode __arena *b)
+{
+	int adir = 0, bdir = 0;
+
+	/*
+	 * Store the direction in the parent because we will not
+	 * be able to recompute it once we start swapping values.
+	 */
+	if (a->parent)
+		adir = rbnode_dir(a);
+
+	if (b->parent)
+		bdir = rbnode_dir(b);
+
+	rbnode_swap_values(a, b);
+
+	/*
+	 * Fix up the pointers from the children/parent to the
+	 * new nodes.
+	 */
+	rbnode_adjust_neighbors(rbtree, a, bdir);
+	rbnode_adjust_neighbors(rbtree, b, adir);
+}
+
+static inline int rbnode_remove_node_single_child(struct rbtree __arena *rbtree,
+						  struct rbnode __arena *node,
+						  bool free)
+{
+	struct rbnode __arena *child;
+	int dir;
+
+	if (unlikely(node->is_red)) {
+		arena_stderr("Node unexpectedly red\n");
+		return -EINVAL;
+	}
+
+	child = node->left ? node->left : node->right;
+	if (unlikely(!child->is_red)) {
+		arena_stderr("Only child is black\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Since it's the immediate child, we can just
+	 * remove the parent.
+	 */
+	child->parent = node->parent;
+
+	if (node->parent) {
+		dir = rbnode_dir(node);
+		node->parent->child[dir] = child;
+	} else {
+		rbtree->root = child;
+	}
+
+	/* Color the child black. */
+	child->is_red = false;
+
+	/* Only free if called from rb_remove. */
+	if (free)
+		rb_node_free(node);
+
+	return 0;
+}
+
+static inline bool rbnode_has_red_children(struct rbnode __arena *node)
+{
+	if (node->left && node->left->is_red)
+		return true;
+
+	return node->right && node->right->is_red;
+}
+
+static
+int rb_node_remove(struct rbtree __arena *rbtree,
+		   struct rbnode __arena *node)
+{
+	struct rbnode __arena *parent, *sibling, *close_nephew, *distant_nephew;
+	bool free = (rbtree->alloc == RB_ALLOC);
+	struct rbnode __arena *replace, *initial;
+	bool is_red;
+	int dir;
+
+	/* Both children present, replace with next largest key. */
+	if (node->left && node->right) {
+		/*
+		 * Swap the node itself instead of just the
+		 * key/value pair to account for nodes embedded
+		 * in other structs.
+		 */
+
+		replace = rbnode_least(node->right);
+		rbnode_switch(rbtree, replace, node);
+
+		/*
+		 * FALLTHROUGH: We moved the node we are removing to
+		 * the leftmost position of the subtree. We can now
+		 * remove it as if it was always where we moved it to.
+		 */
+	}
+
+	initial = node;
+
+	/* Only one child present, replace with child and paint it black. */
+	if (!node->left != !node->right)
+		return rbnode_remove_node_single_child(rbtree, node, free);
+
+	/* (!node->left && !node->right) */
+
+	parent = node->parent;
+	if (!parent) {
+		/* Check that we're _actually_ the root. */
+		if (rbtree->root == node)
+			rbtree->root = NULL;
+		else
+			arena_stderr("WARNING: Attempting to remove detached node from rbtree\n");
+
+		if (free)
+			rb_node_free(node);
+		return 0;
+	}
+
+	dir = rbnode_dir(node);
+	parent->child[dir] = NULL;
+	is_red = node->is_red;
+
+	if (free)
+		rb_node_free(node);
+
+	/* If we removed a red node, we did not unbalance the tree. */
+	if (is_red)
+		return 0;
+
+	sibling = parent->child[1 - dir];
+	if (unlikely(!sibling)) {
+		arena_stderr("rbtree: removed black node has no sibling\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * We removed a black node, causing a change in path
+	 * weight. Start rebalancing. The invariant is that
+	 * all paths going through the node are shortened
+	 * by one, and the current node is black.
+	 */
+	while (can_loop) {
+
+		/* Balancing reached the root, there can be no imbalance. */
+		if (!parent)
+			return 0;
+
+		/*
+		 * We already determined the dir, either above or
+		 * at the end of the loop.
+		 */
+
+		/*
+		 * If we have no sibling, the tree was
+		 * already unbalanced.
+		 */
+		sibling = parent->child[1 - dir];
+		if (unlikely(!sibling)) {
+			arena_stderr("rbtree: removed black node has no sibling\n");
+			return -EINVAL;
+		}
+
+		/* Sibling is red, turn it into the grandparent. */
+		if (sibling->is_red) {
+			/*
+			 * Sibling is red. Transform the tree to turn
+			 * the sibling into the parent's position, and
+			 * repaint them. This does not balance the tree
+			 * but makes it so we know the sibling is black
+			 * and so can use the transformations to balance.
+			 */
+			rbnode_rotate(rbtree, parent, dir);
+			parent->is_red = true;
+			sibling->is_red = false;
+
+			/* Our new sibling is now the close nephew. */
+			sibling = parent->child[1 - dir];
+			/* If the new sibling has any red children, break out. */
+			if (rbnode_has_red_children(sibling))
+				break;
+
+			/* We can repaint the sibling and parent, we're done. */
+			sibling->is_red = true;
+			parent->is_red = false;
+
+			return 0;
+		}
+
+		/* Sibling guaranteed to be black. If it has red children, break out. */
+		if (rbnode_has_red_children(sibling))
+			break;
+
+		/*
+		 * Both sibling and children are black. If parent is red, swap
+		 * colors with the sibling. Otherwise fall through and go up a level.
+		 */
+		if (parent->is_red) {
+			parent->is_red = false;
+			sibling->is_red = true;
+			return 0;
+		}
+
+		/*
+		 * Parent, sibling, and all its children are black. Repaint the sibling.
+		 * This shortens the paths through it, so pop up a level in the
+		 * tree and repeat the balancing.
+		 */
+		sibling->is_red = true;
+		node = parent;
+		parent = node->parent;
+		dir = rbnode_dir(node);
+	}
+
+	if (node != initial) {
+		dir = rbnode_dir(node);
+		parent = node->parent;
+		sibling = parent->child[1-dir];
+	}
+	/*
+	 * Almost there. We know between the parent, sibling,
+	 * and nephews only one or two of the nephews are red. If
+	 * it is the close one, rotate it to the sibling position,
+	 * paint it black, and paint the previous sibling red.
+	 */
+
+	close_nephew = sibling->child[dir];
+	distant_nephew = sibling->child[1 - dir];
+
+	/*
+	 * If the distant nephew is not red, rotate
+	 * and repaint. We need the distant nephew
+	 * to be red. We know the close nephew is red
+	 * because at least one of them is, so the
+	 * distant one is black if it exists.
+	 */
+	if (!distant_nephew || !distant_nephew->is_red) {
+		rbnode_rotate(rbtree, sibling, 1 - dir);
+		sibling->is_red = true;
+		close_nephew->is_red = false;
+		distant_nephew = sibling;
+		sibling = close_nephew;
+	}
+
+	/*
+	 * We now know it's the distant nephew that's red.
+	 * Rotate the sibling into our parent's position
+	 * and paint both black.
+	 */
+
+	rbnode_rotate(rbtree, parent, dir);
+	sibling->is_red = parent->is_red;
+	parent->is_red = false;
+	distant_nephew->is_red = false;
+
+	return 0;
+}
+
+__weak
+int rb_remove_node(struct rbtree __arena *rbtree,
+		   struct rbnode __arena *node)
+{
+	if (unlikely(!rbtree))
+		return -EINVAL;
+
+	if (unlikely(rbtree->alloc == RB_ALLOC))
+		return -EINVAL;
+
+	return rb_node_remove(rbtree, node);
+}
+
+__weak
+int rb_remove(struct rbtree __arena *rbtree, u64 key)
+{
+	struct rbnode __arena *node;
+
+	if (unlikely(!rbtree))
+		return -EINVAL;
+
+	if (unlikely(rbtree->alloc != RB_ALLOC))
+		return -EINVAL;
+
+	if (!rbtree->root)
+		return -ENOENT;
+
+	node = rbnode_find(rbtree->root, key);
+	if (!node || node->key != key)
+		return -ENOENT;
+
+	return rb_node_remove(rbtree, node);
+}
+
+__weak
+int rb_pop(struct rbtree __arena *rbtree, u64 *key, u64 *value)
+{
+	struct rbnode __arena *node;
+
+	if (unlikely(!rbtree))
+		return -EINVAL;
+
+	if (!rbtree->root)
+		return -ENOENT;
+
+	if (rbtree->alloc != RB_ALLOC)
+		return -EINVAL;
+
+	node = rbnode_least(rbtree->root);
+	if (unlikely(!node))
+		return -ENOENT;
+
+	if (key)
+		*key = node->key;
+	if (value)
+		*value = node->value;
+
+	return rb_node_remove(rbtree, node);
+}
+
+inline void rbnode_print(size_t depth, struct rbnode __arena *rbn)
+{
+	arena_stderr("[DEPTH %d] %p (%s)\n PARENT %p", depth, rbn, rbn->is_red ? "red" : "black", rbn->parent);
+	arena_stderr("\tKV (%ld, %ld)\n LEFT %p RIGHT %p]\n", rbn->key, rbn->value, rbn->left, rbn->right);
+}
+
+enum rb_print_state {
+	RB_NONE_VISITED,
+	RB_LEFT_VISITED,
+	RB_RIGHT_VISITED,
+};
+
+__weak
+enum rb_print_state rb_print_next_state(struct rbnode __arena *rbnode,
+					enum rb_print_state state, u64 *next)
+{
+	if (unlikely(!next))
+		return RB_NONE_VISITED;
+
+	switch (state) {
+	case RB_NONE_VISITED:
+		if (rbnode->left) {
+			*next = (u64)rbnode->left;
+			state = RB_LEFT_VISITED;
+			break;
+		}
+
+		/* FALLTHROUGH */
+
+	case RB_LEFT_VISITED:
+		if (rbnode->right) {
+			*next = (u64)rbnode->right;
+			state = RB_RIGHT_VISITED;
+			break;
+		}
+
+		/* FALLTHROUGH */
+
+	default:
+		*next = 0;
+		state = RB_RIGHT_VISITED;
+	}
+
+	return state;
+}
+
+__weak
+int rb_print_pop_up(struct rbnode __arena **rbnodep, u8 *depthp, enum rb_print_state (*stack)[RB_MAXLVL_PRINT], enum rb_print_state *state)
+{
+	struct rbnode __arena *rbnode;
+	volatile u8 depth;
+	int j;
+
+	if (unlikely(!rbnodep || !depthp || !stack || !state))
+		return -EINVAL;
+
+	rbnode = *rbnodep;
+	depth = *depthp;
+
+	for (j = 0; j < RB_MAXLVL_PRINT && can_loop; j++) {
+		if (*state != RB_RIGHT_VISITED)
+			break;
+
+		depth -= 1;
+		if (depth < 0 || depth >= RB_MAXLVL_PRINT)
+			break;
+
+		*state = (*stack)[depth % RB_MAXLVL_PRINT];
+		rbnode = rbnode->parent;
+	}
+
+	*rbnodep = rbnode;
+	*depthp = depth;
+
+	return 0;
+}
+
+/* Print every node of @rbtree via arena_stderr(), walking depth-first. */
+__weak
+int rb_print(struct rbtree __arena *rbtree)
+{
+	enum rb_print_state stack[RB_MAXLVL_PRINT], state;
+	struct rbnode __arena *rbnode, *next;
+	u64 next_addr;
+	u8 depth;
+	int ret;
+
+	if (unlikely(!rbtree))
+		return -EINVAL;
+
+	depth = 0;
+	state = RB_NONE_VISITED;
+
+	arena_stderr("=== RB TREE START ===\n");
+
+	/* Read the root only after rbtree has been validated. */
+	rbnode = rbtree->root;
+	if (!rbnode)
+		goto out;
+
+	/* Even with can_loop, the verifier doesn't like infinite loops. */
+	while (can_loop) {
+		if (state == RB_NONE_VISITED)
+			rbnode_print(depth, rbnode);
+
+		/* Find which child to traverse next. */
+		state = rb_print_next_state(rbnode, state, &next_addr);
+		next = (struct rbnode __arena *)next_addr;
+
+		/* Child found. Store the node state and go on. */
+		if (next) {
+			if (depth < 0 || depth >= RB_MAXLVL_PRINT)
+				return 0;
+
+			stack[depth++] = state;
+
+			rbnode = next;
+			state = RB_NONE_VISITED;
+
+			continue;
+		}
+
+		/* Otherwise, go as far up as possible. */
+		ret = rb_print_pop_up(&rbnode, &depth, &stack, &state);
+		if (ret)
+			return -EINVAL;
+
+		if (depth < 0 || depth >= RB_MAXLVL_PRINT) {
+			arena_stderr("=== RB TREE END (depth %d) ===\n", depth);
+			return 0;
+		}
+	}
+
+out:
+	arena_stderr("=== RB TREE END ===\n");
+
+	return 0;
+}
+
+/* Walk @rbtree and flag broken links, red-red pairs and lone black nodes. */
+__weak
+int rb_integrity_check(struct rbtree __arena *rbtree)
+{
+	enum rb_print_state stack[RB_MAXLVL_PRINT], state;
+	struct rbnode __arena *rbnode, *next;
+	u64 next_addr;
+	u8 depth;
+	int ret;
+
+	if (unlikely(!rbtree))
+		return -EINVAL;
+
+	/* Read the root only after rbtree has been validated. */
+	rbnode = rbtree->root;
+	if (!rbnode)
+		return 0;
+
+	depth = 0;
+	state = RB_NONE_VISITED;
+
+	/* Even with can_loop, the verifier doesn't like infinite loops. */
+	while (can_loop) {
+		if (rbnode->parent && rbnode->parent->left != rbnode
+			&& rbnode->parent->right != rbnode) {
+			arena_stderr("WARNING: Inconsistent tree. Parent %p has no child %p\n", rbnode->parent, rbnode);
+			return -EINVAL;
+		}
+
+		if (rbnode->parent == rbnode) {
+			arena_stderr("WARNING: Inconsistent tree, node %p is its own parent\n", rbnode);
+			return -EINVAL;
+		}
+
+		if (rbnode->left == rbnode) {
+			arena_stderr("WARNING: Inconsistent tree, node %p is its own left child\n", rbnode);
+			return -EINVAL;
+		}
+
+		if (rbnode->right == rbnode) {
+			arena_stderr("WARNING: Inconsistent tree, node %p is its own right child\n", rbnode);
+			return -EINVAL;
+		}
+
+		if (rbnode->is_red) {
+			if (rbnode->left && rbnode->left->is_red) {
+				arena_stderr("WARNING: Inconsistent tree. Parent has %p has red child %p\n", rbnode, rbnode->left);
+				return -EINVAL;
+			}
+			if (rbnode->right && rbnode->right->is_red) {
+				arena_stderr("WARNING: Inconsistent tree. Parent has %p has red child %p\n", rbnode, rbnode->right);
+				return -EINVAL;
+			}
+		} else if (rbnode->parent && rbnode->parent->child[1 - rbnode_dir(rbnode)] == NULL) {
+			arena_stderr("WARNING: Inconsistent tree. Black node %p has no sibling\n", rbnode);
+			return -EINVAL;
+		}
+
+		/* Find which child to traverse next. */
+		state = rb_print_next_state(rbnode, state, &next_addr);
+		next = (struct rbnode __arena *)next_addr;
+
+		/* Child found. Store the node state and go on. */
+		if (next) {
+			if (depth < 0 || depth >= RB_MAXLVL_PRINT)
+				return 0;
+
+			stack[depth++] = state;
+
+			rbnode = next;
+			state = RB_NONE_VISITED;
+
+			continue;
+		}
+
+		/* Otherwise, go as far up as possible. */
+		ret = rb_print_pop_up(&rbnode, &depth, &stack, &state);
+		if (ret)
+			return -EINVAL;
+
+		if (depth < 0 || depth >= RB_MAXLVL_PRINT) {
+			return 0;
+		}
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/libarena/src/spmc.bpf.c b/tools/testing/selftests/bpf/libarena/src/spmc.bpf.c
new file mode 100644
index 000000000000..42732b7d29a6
--- /dev/null
+++ b/tools/testing/selftests/bpf/libarena/src/spmc.bpf.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/*
+ * Copyright (c) 2025-2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025-2026 Emil Tsalapatis <etsal@meta.com>
+ */
+
+#include <bpf_atomic.h>
+
+#include <libarena/common.h>
+
+#include <libarena/asan.h>
+#include <libarena/spmc.h>
+
+static inline
+u64 spmc_arr_size(volatile struct spmc_arr __arena *spmc_arr)
+{
+	return SPMC_ARR_BASESZ << spmc_arr->order;
+}
+
+static inline
+u64 spmc_arr_get(volatile struct spmc_arr __arena *spmc_arr, u64 ind)
+{
+	u64 ret = READ_ONCE(spmc_arr->data[ind % spmc_arr_size(spmc_arr)]);
+
+	return ret;
+}
+
+static inline
+void spmc_arr_put(volatile struct spmc_arr __arena *spmc_arr, u64 ind, u64 value)
+{
+	WRITE_ONCE(spmc_arr->data[ind % spmc_arr_size(spmc_arr)], value);
+}
+
+static inline
+void spmc_arr_copy(volatile struct spmc_arr __arena *dst,
+		   volatile struct spmc_arr __arena *src, u64 b, u64 t)
+{
+	u64 i;
+
+	for (i = t; i < b && can_loop; i++)
+		spmc_arr_put(dst, i, spmc_arr_get(src, i));
+}
+
+/* Allocate the backing array for @order unless it already exists. */
+static inline
+int spmc_order_init(struct spmc __arena *spmc, int order)
+{
+	volatile struct spmc_arr __arena *arr;
+
+	if (unlikely(!spmc || order < 0))
+		return -EINVAL;
+
+	if (order >= SPMC_ARR_ORDERS)
+		return -E2BIG;
+
+	/* Form the slot pointer only after validation. Already allocated? */
+	arr = &spmc->arr[order];
+	if (arr->data)
+		return 0;
+
+	arr->data = arena_malloc((SPMC_ARR_BASESZ << order) * sizeof(*arr->data));
+
+	return arr->data ? 0 : -ENOMEM;
+}
+
+__weak
+int spmc_owned_add(struct spmc __arena *spmc, u64 val)
+{
+	volatile struct spmc_arr __arena *newarr;
+	volatile struct spmc_arr __arena *arr;
+	ssize_t sz;
+	u64 b, t;
+	int ret;
+
+	if (unlikely(!spmc))
+		return -EINVAL;
+
+	/*
+	 * The owner must always read bottom first; stealers
+	 * read top first instead, see spmc_steal().
+	 */
+	b = smp_load_acquire(&spmc->bottom);
+	t = READ_ONCE(spmc->top);
+	arr = READ_ONCE(spmc->cur);
+
+	sz = b - t;
+	if (sz >= spmc_arr_size(arr) - 1) {
+		ret = spmc_order_init(spmc, arr->order + 1);
+		if (ret)
+			return ret;
+
+		newarr = &spmc->arr[arr->order + 1];
+
+		spmc_arr_copy(newarr, arr, b, t);
+		smp_store_release(&spmc->cur, newarr);
+		arr = newarr;
+	}
+
+	spmc_arr_put(arr, b, val);
+	smp_store_release(&spmc->bottom, b + 1);
+
+	return 0;
+}
+
+
+__weak
+int spmc_owned_remove(struct spmc __arena *spmc, u64 *val)
+{
+	volatile struct spmc_arr __arena *arr;
+	int ret = 0;
+	ssize_t sz;
+	u64 value;
+	u64 b, t;
+
+	if (unlikely(!spmc || !val))
+		return -EINVAL;
+
+	b = READ_ONCE(spmc->bottom) - 1;
+	WRITE_ONCE(spmc->bottom, b);
+	smp_mb();
+
+	t = READ_ONCE(spmc->top);
+	arr = READ_ONCE(spmc->cur);
+
+	sz = b - t;
+	if (sz < 0) {
+		WRITE_ONCE(spmc->bottom, t);
+		return -ENOENT;
+	}
+
+	value = spmc_arr_get(arr, b);
+	if (sz > 0) {
+		*val = value;
+		return 0;
+	}
+
+	if (cmpxchg(&spmc->top, t, t + 1) != t)
+		ret = -EAGAIN;
+
+	WRITE_ONCE(spmc->bottom, t + 1);
+
+	if (ret)
+		return ret;
+
+	*val = value;
+
+	return 0;
+}
+
+__weak
+int spmc_steal(struct spmc __arena *spmc, u64 *val)
+{
+	volatile struct spmc_arr __arena *arr;
+	ssize_t sz;
+	u64 value;
+	u64 b, t;
+
+	if (unlikely(!spmc || !val))
+		return -EINVAL;
+
+	/*
+	 * It is important that t is read before b for
+	 * stealers to avoid racing with the owner.
+	 * Races between stealers are dealt with using
+	 * CAS to increment the top value below.
+	 */
+	t = smp_load_acquire(&spmc->top);
+	b = smp_load_acquire(&spmc->bottom);
+
+	sz = b - t;
+	if (sz <= 0)
+		return -ENOENT;
+
+	arr = smp_load_acquire(&spmc->cur);
+	value = spmc_arr_get(arr, t);
+
+	if (cmpxchg(&spmc->top, t, t + 1) != t)
+		return -EAGAIN;
+
+	*val = value;
+
+	return 0;
+}
+
+
+__weak
+struct spmc __arena *spmc_create(void)
+{
+	/*
+	 * Marked as volatile because otherwise the array
+	 * reference in the internal loop gets demoted to
+	 * scalar and the program fails verification.
+	 */
+	struct spmc __arena *volatile spmc;
+	int ret, i;
+
+	spmc = arena_malloc(sizeof(*spmc));
+	if (!spmc)
+		return NULL;
+
+	spmc->bottom = 0;
+	spmc->top = 0;
+
+	for (i = 0; i < SPMC_ARR_ORDERS && can_loop; i++) {
+		spmc->arr[i].data = NULL;
+		spmc->arr[i].order = i;
+	}
+
+	ret = spmc_order_init((struct spmc __arena *)spmc, 0);
+	if (ret) {
+		arena_free(spmc);
+		return NULL;
+	}
+
+	spmc->cur = &spmc->arr[0];
+
+	return (struct spmc __arena *)spmc;
+}
+
+__weak
+int spmc_destroy(struct spmc __arena *spmc)
+{
+	int i;
+
+	if (unlikely(!spmc))
+		return -EINVAL;
+
+	for (i = 0; i < SPMC_ARR_ORDERS && can_loop; i++)
+		arena_free(spmc->arr[i].data);
+
+	arena_free(spmc);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/arena_direct_value.c b/tools/testing/selftests/bpf/prog_tests/arena_direct_value.c
new file mode 100644
index 000000000000..4b4adb3f4b71
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/arena_direct_value.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include <bpf/bpf.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#define ARENA_PAGES 32
+
+static char log_buf[16384];
+
+static void test_arena_direct_value_one_past_end(void)
+{
+	char expected[128];
+	__u32 arena_sz = ARENA_PAGES * getpagesize();
+	struct bpf_insn insns[] = {
+		BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	LIBBPF_OPTS(bpf_map_create_opts, map_opts);
+	LIBBPF_OPTS(bpf_prog_load_opts, prog_opts);
+	void *arena;
+	int map_fd, prog_fd;
+
+	map_opts.map_flags = BPF_F_MMAPABLE;
+	prog_opts.log_buf = log_buf;
+	prog_opts.log_size = sizeof(log_buf);
+	prog_opts.log_level = 1;
+
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARENA, "arena_direct_value",
+				0, 0, ARENA_PAGES, &map_opts);
+	if (map_fd < 0) {
+		if (errno == EOPNOTSUPP) {
+			test__skip();
+			return;
+		}
+		ASSERT_GE(map_fd, 0, "bpf_map_create");
+		return;
+	}
+
+	arena = mmap(NULL, arena_sz, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
+	if (!ASSERT_NEQ(arena, MAP_FAILED, "arena_mmap"))
+		goto cleanup;
+
+	insns[0].imm = map_fd;
+	insns[1].imm = arena_sz;
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT,
+				"arena_direct_value", "GPL", insns,
+				ARRAY_SIZE(insns), &prog_opts);
+	if (!ASSERT_LT(prog_fd, 0, "prog_load")) {
+		close(prog_fd);
+		goto cleanup;
+	}
+
+	snprintf(expected, sizeof(expected),
+		 "invalid access to map value pointer, value_size=0 off=%u",
+		 arena_sz);
+	ASSERT_HAS_SUBSTR(log_buf, expected, "verifier_log");
+
+cleanup:
+	if (arena != MAP_FAILED)
+		munmap(arena, arena_sz);
+	close(map_fd);
+}
+
+void test_arena_direct_value(void)
+{
+	if (test__start_subtest("one_past_end"))
+		test_arena_direct_value_one_past_end();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c
index 693fd86fbde6..acb9d53b5973 100644
--- a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c
@@ -5,13 +5,6 @@
 #include <sys/sysinfo.h>
 
 struct __qspinlock { int val; };
-typedef struct __qspinlock arena_spinlock_t;
-
-struct arena_qnode {
-	unsigned long next;
-	int count;
-	int locked;
-};
 
 #include "arena_spin_lock.skel.h"
 
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c b/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c
new file mode 100644
index 000000000000..87842c4347a6
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Google LLC */
+#include <linux/bpf.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <test_progs.h>
+#include <cgroup_helpers.h>
+#include "cgroup_skb_direct_packet_access.skel.h"
+
+#define OLD_QUERY_SIZE		offsetofend(union bpf_attr, query.prog_cnt)
+#define FULL_QUERY_SIZE		offsetofend(union bpf_attr, query.revision)
+
+static void test_query_size_boundaries(void)
+{
+	struct cgroup_skb_direct_packet_access *skel;
+	struct bpf_link *link = NULL;
+	union bpf_attr attr;
+	int cg_fd = -1;
+	int err;
+
+	skel = cgroup_skb_direct_packet_access__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_load"))
+		return;
+
+	cg_fd = test__join_cgroup("/attr_size_cg");
+	if (!ASSERT_GE(cg_fd, 0, "join_cgroup"))
+		goto cleanup;
+
+	link = bpf_program__attach_cgroup(skel->progs.direct_packet_access,
+					  cg_fd);
+	if (!ASSERT_OK_PTR(link, "cg_attach"))
+		goto cleanup;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.query.target_fd = cg_fd;
+	attr.query.attach_type = BPF_CGROUP_INET_INGRESS;
+	attr.query.revision = 0xdeadbeefdeadbeefULL;
+
+	err = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, OLD_QUERY_SIZE);
+	if (ASSERT_OK(err, "query_old_size")) {
+		ASSERT_EQ(attr.query.prog_cnt, 1, "prog_cnt_written_old");
+		ASSERT_EQ(attr.query.revision, 0xdeadbeefdeadbeefULL,
+			  "revision_not_written_old");
+	}
+
+	memset(&attr, 0, sizeof(attr));
+	attr.query.target_fd = cg_fd;
+	attr.query.attach_type = BPF_CGROUP_INET_INGRESS;
+
+	err = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, FULL_QUERY_SIZE);
+	if (!ASSERT_OK(err, "query_full_size"))
+		goto cleanup;
+
+	ASSERT_EQ(attr.query.prog_cnt, 1, "prog_cnt_written");
+	ASSERT_GT(attr.query.revision, 0, "revision_written");
+
+cleanup:
+	/* bpf_link__destroy() accepts a NULL link, so no guard is needed. */
+	bpf_link__destroy(link);
+	if (cg_fd >= 0)
+		close(cg_fd);
+	cgroup_skb_direct_packet_access__destroy(skel);
+}
+
+static void test_map_info_tail_zero(void)
+{
+	LIBBPF_OPTS(bpf_map_create_opts, map_opts);
+	struct bpf_map_info_fake {
+		__u8 info[offsetofend(struct bpf_map_info, hash_size)];
+		__u32 pad;
+	} info = {
+		.pad = 1,
+	};
+	int map_fd, err;
+	__u32 info_len;
+
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "arr", sizeof(int), 1, 1, &map_opts);
+	if (!ASSERT_GE(map_fd, 0, "bpf_map_create"))
+		return;
+
+	info_len = sizeof(info);
+	err = bpf_obj_get_info_by_fd(map_fd, &info, &info_len);
+	ASSERT_EQ(err, -E2BIG, "bpf_obj_get_info_by_fd");
+
+	close(map_fd);
+}
+
+static void test_prog_info_tail_zero(void)
+{
+	LIBBPF_OPTS(bpf_prog_load_opts, prog_opts);
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	struct bpf_prog_info_fake {
+		__u8 info[offsetofend(struct bpf_prog_info, attach_btf_id)];
+		__u32 pad;
+	} info = {
+		.pad = 1,
+	};
+	int prog_fd, err;
+	__u32 info_len;
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "test_prog", "GPL", insns,
+				ARRAY_SIZE(insns), &prog_opts);
+	if (!ASSERT_GE(prog_fd, 0, "bpf_prog_load"))
+		return;
+
+	info_len = sizeof(info);
+	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	ASSERT_EQ(err, -E2BIG, "bpf_obj_get_info_by_fd");
+
+	close(prog_fd);
+}
+
+void test_bpf_attr_size(void)
+{
+	if (test__start_subtest("query_size_boundaries"))
+		test_query_size_boundaries();
+	if (test__start_subtest("map_info_tail_zero"))
+		test_map_info_tail_zero();
+	if (test__start_subtest("prog_info_tail_zero"))
+		test_prog_info_tail_zero();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
index 35adc3f6d443..fa484d00a7a5 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c
@@ -252,10 +252,17 @@ cleanup:
 	kprobe_multi__destroy(skel);
 }
 
-/* defined in prog_tests/uprobe_multi_test.c */
-void uprobe_multi_func_1(void);
-void uprobe_multi_func_2(void);
-void uprobe_multi_func_3(void);
+/*
+ * Weak uprobe target stubs. noinline is required because
+ * uprobe_multi_test_run() takes their addresses to configure the BPF
+ * program's attachment points; an inlined function has no stable
+ * address in the binary to probe. The strong definitions in
+ * uprobe_multi_test.c take precedence when that translation unit is
+ * linked.
+ */
+noinline __weak void uprobe_multi_func_1(void) { asm volatile (""); }
+noinline __weak void uprobe_multi_func_2(void) { asm volatile (""); }
+noinline __weak void uprobe_multi_func_3(void) { asm volatile (""); }
 
 static void uprobe_multi_test_run(struct uprobe_multi *skel)
 {
@@ -574,8 +581,6 @@ cleanup:
 		close(fmod_ret_fd);
 }
 
-int stack_mprotect(void);
-
 static void lsm_subtest(struct test_bpf_cookie *skel)
 {
 	__u64 cookie;
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
index 215878ea04de..b33dba4b126e 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
@@ -11,18 +11,18 @@ struct {
 	const char *prog_name;
 	const char *err_msg;
 } test_bpf_nf_fail_tests[] = {
-	{ "alloc_release", "kernel function bpf_ct_release args#0 expected pointer to STRUCT nf_conn but" },
-	{ "insert_insert", "kernel function bpf_ct_insert_entry args#0 expected pointer to STRUCT nf_conn___init but" },
-	{ "lookup_insert", "kernel function bpf_ct_insert_entry args#0 expected pointer to STRUCT nf_conn___init but" },
-	{ "set_timeout_after_insert", "kernel function bpf_ct_set_timeout args#0 expected pointer to STRUCT nf_conn___init but" },
-	{ "set_status_after_insert", "kernel function bpf_ct_set_status args#0 expected pointer to STRUCT nf_conn___init but" },
-	{ "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" },
-	{ "change_status_after_alloc", "kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" },
+	{ "alloc_release", "kernel function bpf_ct_release R1 expected pointer to STRUCT nf_conn but" },
+	{ "insert_insert", "kernel function bpf_ct_insert_entry R1 expected pointer to STRUCT nf_conn___init but" },
+	{ "lookup_insert", "kernel function bpf_ct_insert_entry R1 expected pointer to STRUCT nf_conn___init but" },
+	{ "set_timeout_after_insert", "kernel function bpf_ct_set_timeout R1 expected pointer to STRUCT nf_conn___init but" },
+	{ "set_status_after_insert", "kernel function bpf_ct_set_status R1 expected pointer to STRUCT nf_conn___init but" },
+	{ "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout R1 expected pointer to STRUCT nf_conn but" },
+	{ "change_status_after_alloc", "kernel function bpf_ct_change_status R1 expected pointer to STRUCT nf_conn but" },
 	{ "write_not_allowlisted_field", "no write support to nf_conn at off" },
-	{ "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" },
-	{ "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" },
-	{ "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" },
-	{ "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" },
+	{ "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted R2" },
+	{ "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted R4" },
+	{ "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted R2" },
+	{ "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted R4" },
 };
 
 enum {
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c
index 730357cd0c9a..77f1c0550c9b 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c
@@ -8,6 +8,10 @@
 #include "bpf_qdisc_fifo.skel.h"
 #include "bpf_qdisc_fq.skel.h"
 #include "bpf_qdisc_fail__incompl_ops.skel.h"
+#include "bpf_qdisc_fail__invalid_dynptr.skel.h"
+#include "bpf_qdisc_fail__invalid_dynptr_slice.skel.h"
+#include "bpf_qdisc_fail__invalid_dynptr_cross_frame.skel.h"
+#include "bpf_qdisc_dynptr_use_after_invalidate_clone.skel.h"
 
 #define LO_IFINDEX 1
 
@@ -223,6 +227,10 @@ void test_ns_bpf_qdisc(void)
 		test_qdisc_attach_to_non_root();
 	if (test__start_subtest("incompl_ops"))
 		test_incompl_ops();
+	RUN_TESTS(bpf_qdisc_fail__invalid_dynptr);
+	RUN_TESTS(bpf_qdisc_fail__invalid_dynptr_cross_frame);
+	RUN_TESTS(bpf_qdisc_fail__invalid_dynptr_slice);
+	RUN_TESTS(bpf_qdisc_dynptr_use_after_invalidate_clone);
 }
 
 void serial_test_bpf_qdisc_default(void)
diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
index 054ecb6b1e9f..96f719a0cec9 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -1924,11 +1924,11 @@ static struct btf_raw_test raw_tests[] = {
 },
 
 {
-	.descr = "invalid BTF_INFO",
+	.descr = "invalid BTF kind",
 	.raw_types = {
 		/* int */				/* [1] */
 		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
-		BTF_TYPE_ENC(0, 0x20000000, 4),
+		BTF_TYPE_ENC(0, 0x7f000000, 4),
 		BTF_END_RAW,
 	},
 	.str_sec = "",
@@ -1941,7 +1941,7 @@ static struct btf_raw_test raw_tests[] = {
 	.value_type_id = 1,
 	.max_entries = 4,
 	.btf_load_err = true,
-	.err_str = "Invalid btf_info",
+	.err_str = "Invalid kind",
 },
 
 {
@@ -4258,6 +4258,43 @@ static struct btf_raw_test raw_tests[] = {
 	.max_entries = 1,
 },
 
+{
+	.descr = "struct test repeated fields count overflow",
+	.raw_types = {
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+		BTF_STRUCT_ENC(NAME_TBD, 0, 0),				/* [2] */
+		BTF_TYPE_TAG_ENC(NAME_TBD, 2),				/* [3] */
+		BTF_PTR_ENC(3),						/* [4] */
+		BTF_TYPE_ARRAY_ENC(4, 1, 1),				/* [5] */
+		BTF_STRUCT_ENC(NAME_TBD, 10, 8),			/* [6] */
+		BTF_MEMBER_ENC(NAME_TBD, 5, 0),
+		BTF_MEMBER_ENC(NAME_TBD, 5, 0),
+		BTF_MEMBER_ENC(NAME_TBD, 5, 0),
+		BTF_MEMBER_ENC(NAME_TBD, 5, 0),
+		BTF_MEMBER_ENC(NAME_TBD, 5, 0),
+		BTF_MEMBER_ENC(NAME_TBD, 5, 0),
+		BTF_MEMBER_ENC(NAME_TBD, 5, 0),
+		BTF_MEMBER_ENC(NAME_TBD, 5, 0),
+		BTF_MEMBER_ENC(NAME_TBD, 5, 0),
+		BTF_MEMBER_ENC(NAME_TBD, 5, 0),
+		BTF_TYPE_ARRAY_ENC(6, 1, 0x1999999aU),			/* [7] */
+		BTF_STRUCT_ENC(NAME_TBD, 2, 8 + 8 * 0x1999999aU),	/* [8] */
+		BTF_MEMBER_ENC(NAME_TBD, 4, 0),
+		BTF_MEMBER_ENC(NAME_TBD, 7, 64),
+		BTF_END_RAW,
+	},
+	BTF_STR_SEC("\0int\0prog_test_ref_kfunc\0kptr_untrusted\0elem"
+		    "\0p0\0p1\0p2\0p3\0p4\0p5\0p6\0p7\0p8\0p9"
+		    "\0outer\0trigger\0elems"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "repeat_fields",
+	.key_size = sizeof(int),
+	.value_size = 8 + 8 * 0x1999999aU,
+	.key_type_id = 1,
+	.value_type_id = 8,
+	.max_entries = 1,
+	.btf_load_err = true,
+},
 }; /* struct btf_raw_test raw_tests[] */
 
 static const char *get_next_str(const char *start, const char *end)
@@ -8092,7 +8129,7 @@ static struct btf_dedup_test dedup_tests[] = {
 static int btf_type_size(const struct btf_type *t)
 {
 	int base_size = sizeof(struct btf_type);
-	__u16 vlen = BTF_INFO_VLEN(t->info);
+	__u32 vlen = BTF_INFO_VLEN(t->info);
 	__u16 kind = BTF_INFO_KIND(t->info);
 
 	switch (kind) {
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
index 5bc15bb6b7ce..9d6161151593 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c
@@ -20,18 +20,22 @@ static void test_split_simple() {
 	btf__add_struct(btf1, "s1", 4);			/* [3] struct s1 { */
 	btf__add_field(btf1, "f1", 1, 0, 0);		/*      int f1; */
 							/* } */
+	btf__add_typedef(btf1, "t1", 1);		/* [4] typedef int */
 
 	VALIDATE_RAW_BTF(
 		btf1,
 		"[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
 		"[2] PTR '(anon)' type_id=1",
 		"[3] STRUCT 's1' size=4 vlen=1\n"
-		"\t'f1' type_id=1 bits_offset=0");
+		"\t'f1' type_id=1 bits_offset=0",
+		"[4] TYPEDEF 't1' type_id=1");
 
 	ASSERT_STREQ(btf_type_c_dump(btf1), "\
 struct s1 {\n\
 	int f1;\n\
-};\n\n", "c_dump");
+};\n\
+\n\
+typedef int t1;\n\n", "c_dump");
 
 	btf2 = btf__new_empty_split(btf1);
 	if (!ASSERT_OK_PTR(btf2, "empty_split_btf"))
@@ -49,39 +53,46 @@ struct s1 {\n\
 	ASSERT_EQ(btf_is_int(t), true, "int_kind");
 	ASSERT_STREQ(btf__str_by_offset(btf2, t->name_off), "int", "int_name");
 
-	btf__add_struct(btf2, "s2", 16);		/* [4] struct s2 {	*/
-	btf__add_field(btf2, "f1", 6, 0, 0);		/*      struct s1 f1;	*/
-	btf__add_field(btf2, "f2", 5, 32, 0);		/*      int f2;		*/
+	btf__add_struct(btf2, "s2", 16);		/* [5] struct s2 {	*/
+	btf__add_field(btf2, "f1", 7, 0, 0);		/*      struct s1 f1;	*/
+	btf__add_field(btf2, "f2", 6, 32, 0);		/*      int f2;		*/
 	btf__add_field(btf2, "f3", 2, 64, 0);		/*      int *f3;	*/
 							/* } */
 
 	/* duplicated int */
-	btf__add_int(btf2, "int", 4, BTF_INT_SIGNED);	/* [5] int */
+	btf__add_int(btf2, "int", 4, BTF_INT_SIGNED);	/* [6] int */
 
 	/* duplicated struct s1 */
-	btf__add_struct(btf2, "s1", 4);			/* [6] struct s1 { */
-	btf__add_field(btf2, "f1", 5, 0, 0);		/*      int f1; */
+	btf__add_struct(btf2, "s1", 4);			/* [7] struct s1 { */
+	btf__add_field(btf2, "f1", 6, 0, 0);		/*      int f1; */
 							/* } */
 
+	/* duplicated typedef t1 */
+	btf__add_typedef(btf2, "t1", 6);		/* [8] typedef int */
+
 	VALIDATE_RAW_BTF(
 		btf2,
 		"[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
 		"[2] PTR '(anon)' type_id=1",
 		"[3] STRUCT 's1' size=4 vlen=1\n"
 		"\t'f1' type_id=1 bits_offset=0",
-		"[4] STRUCT 's2' size=16 vlen=3\n"
-		"\t'f1' type_id=6 bits_offset=0\n"
-		"\t'f2' type_id=5 bits_offset=32\n"
+		"[4] TYPEDEF 't1' type_id=1",
+		"[5] STRUCT 's2' size=16 vlen=3\n"
+		"\t'f1' type_id=7 bits_offset=0\n"
+		"\t'f2' type_id=6 bits_offset=32\n"
 		"\t'f3' type_id=2 bits_offset=64",
-		"[5] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
-		"[6] STRUCT 's1' size=4 vlen=1\n"
-		"\t'f1' type_id=5 bits_offset=0");
+		"[6] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+		"[7] STRUCT 's1' size=4 vlen=1\n"
+		"\t'f1' type_id=6 bits_offset=0",
+		"[8] TYPEDEF 't1' type_id=6");
 
 	ASSERT_STREQ(btf_type_c_dump(btf2), "\
 struct s1 {\n\
 	int f1;\n\
 };\n\
 \n\
+typedef int t1;\n\
+\n\
 struct s1___2 {\n\
 	int f1;\n\
 };\n\
@@ -90,7 +101,9 @@ struct s2 {\n\
 	struct s1___2 f1;\n\
 	int f2;\n\
 	int *f3;\n\
-};\n\n", "c_dump");
+};\n\
+\n\
+typedef int t1___2;\n\n", "c_dump");
 
 	err = btf__dedup(btf2, NULL);
 	if (!ASSERT_OK(err, "btf_dedup"))
@@ -102,7 +115,8 @@ struct s2 {\n\
 		"[2] PTR '(anon)' type_id=1",
 		"[3] STRUCT 's1' size=4 vlen=1\n"
 		"\t'f1' type_id=1 bits_offset=0",
-		"[4] STRUCT 's2' size=16 vlen=3\n"
+		"[4] TYPEDEF 't1' type_id=1",
+		"[5] STRUCT 's2' size=16 vlen=3\n"
 		"\t'f1' type_id=3 bits_offset=0\n"
 		"\t'f2' type_id=1 bits_offset=32\n"
 		"\t'f3' type_id=2 bits_offset=64");
@@ -112,6 +126,8 @@ struct s1 {\n\
 	int f1;\n\
 };\n\
 \n\
+typedef int t1;\n\
+\n\
 struct s2 {\n\
 	struct s1 f1;\n\
 	int f2;\n\
@@ -487,9 +503,8 @@ static void test_split_module(void)
 	for (i = 0; i < ARRAY_SIZE(mod_funcs); i++) {
 		const struct btf_param *p;
 		const struct btf_type *t;
-		__u16 vlen;
+		__u32 vlen, j;
 		__u32 id;
-		int j;
 
 		id = btf__find_by_name_kind(btf1, mod_funcs[i], BTF_KIND_FUNC);
 		if (!ASSERT_GE(id, nr_base_types, "func_id"))
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index f1642794f70e..9f1b50e07a29 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -1027,8 +1027,8 @@ static void test_btf_dump_datasec_data(char *str)
 	char license[4] = "GPL";
 	struct btf_dump *d;
 
-	btf = btf__parse("xdping_kern.bpf.o", NULL);
-	if (!ASSERT_OK_PTR(btf, "xdping_kern.bpf.o BTF not found"))
+	btf = btf__parse("xdp_dummy.bpf.o", NULL);
+	if (!ASSERT_OK_PTR(btf, "xdp_dummy.bpf.o BTF not found"))
 		return;
 
 	d = btf_dump__new(btf, btf_dump_snprintf, str, NULL);
diff --git a/tools/testing/selftests/bpf/prog_tests/cb_refs.c b/tools/testing/selftests/bpf/prog_tests/cb_refs.c
index c40df623a8f7..78566b817fd7 100644
--- a/tools/testing/selftests/bpf/prog_tests/cb_refs.c
+++ b/tools/testing/selftests/bpf/prog_tests/cb_refs.c
@@ -11,8 +11,8 @@ struct {
 	const char *prog_name;
 	const char *err_msg;
 } cb_refs_tests[] = {
-	{ "underflow_prog", "must point to scalar, or struct with scalar" },
-	{ "leak_prog", "Possibly NULL pointer passed to helper arg2" },
+	{ "underflow_prog", "release kfunc bpf_kfunc_call_test_release expects referenced PTR_TO_BTF_ID passed to R1" },
+	{ "leak_prog", "Possibly NULL pointer passed to helper R2" },
 	{ "nested_cb", "Unreleased reference id=4 alloc_insn=2" }, /* alloc_insn=2{4,5} */
 	{ "non_cb_transfer_ref", "Unreleased reference id=4 alloc_insn=1" }, /* alloc_insn=1{1,2} */
 };
diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
index 478a77cb67e6..c4398ccf3493 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
@@ -176,7 +176,7 @@ static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id)
 	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
 	union bpf_iter_link_info linfo;
 	struct cgrp_ls_sleepable *skel;
-	struct bpf_link *link;
+	struct bpf_link *link, *fexit_link;
 	int err, iter_fd;
 	char buf[16];
 
@@ -200,16 +200,27 @@ static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id)
 	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		goto out;
 
+	fexit_link = bpf_program__attach(skel->progs.fexit_update);
+	if (!ASSERT_OK_PTR(fexit_link, "attach_fexit"))
+		goto out_link;
+
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
 	if (!ASSERT_GE(iter_fd, 0, "iter_create"))
-		goto out_link;
+		goto out_fexit_link;
+
+	skel->bss->target_pid = sys_gettid();
 
 	/* trigger the program run */
 	(void)read(iter_fd, buf, sizeof(buf));
 
+	skel->bss->target_pid = 0;
+
+	ASSERT_EQ(skel->bss->update_err, 0, "update_err");
 	ASSERT_EQ(skel->bss->cgroup_id, cgroup_id, "cgroup_id");
 
 	close(iter_fd);
+out_fexit_link:
+	bpf_link__destroy(fexit_link);
 out_link:
 	bpf_link__destroy(link);
 out:
diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
index 469e92869523..2c3124092b73 100644
--- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
+++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
@@ -69,19 +69,19 @@ static struct test_case test_cases[] = {
 #if defined(__x86_64__) || defined(__aarch64__)
 	{
 		N(SCHED_CLS, struct __sk_buff, tstamp),
-		.read  = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
-			 "if w11 & 0x4 goto pc+1;"
+		.read  = "r12 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
+			 "if w12 & 0x4 goto pc+1;"
 			 "goto pc+4;"
-			 "if w11 & 0x3 goto pc+1;"
+			 "if w12 & 0x3 goto pc+1;"
 			 "goto pc+2;"
 			 "$dst = 0;"
 			 "goto pc+1;"
 			 "$dst = *(u64 *)($ctx + sk_buff::tstamp);",
-		.write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
-			 "if w11 & 0x4 goto pc+1;"
+		.write = "r12 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
+			 "if w12 & 0x4 goto pc+1;"
 			 "goto pc+2;"
-			 "w11 &= -4;"
-			 "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;"
+			 "w12 &= -4;"
+			 "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r12;"
 			 "*(u64 *)($ctx + sk_buff::tstamp) = $src;",
 	},
 #endif
@@ -253,8 +253,7 @@ static int find_field_offset_aux(struct btf *btf, int btf_id, char *field_name,
 {
 	const struct btf_type *type = btf__type_by_id(btf, btf_id);
 	const struct btf_member *m;
-	__u16 mnum;
-	int i;
+	__u32 mnum, i;
 
 	if (!type) {
 		PRINT_FAIL("Can't find btf_type for id %d\n", btf_id);
diff --git a/tools/testing/selftests/bpf/prog_tests/exceptions.c b/tools/testing/selftests/bpf/prog_tests/exceptions.c
index e8cbaf2a3e82..3588d6f97fd4 100644
--- a/tools/testing/selftests/bpf/prog_tests/exceptions.c
+++ b/tools/testing/selftests/bpf/prog_tests/exceptions.c
@@ -85,6 +85,13 @@ static void test_exceptions_success(void)
 	RUN_SUCCESS(exception_bad_assert_range_with, 10);
 	RUN_SUCCESS(exception_throw_from_void_global, 11);
 
+	if (skel->rodata->has_stack_arg) {
+		RUN_SUCCESS(exception_throw_stack_arg, 56);
+		RUN_SUCCESS(exception_throw_after_stack_arg, 56);
+		RUN_SUCCESS(exception_throw_subprog_stack_arg, 56);
+		RUN_SUCCESS(exception_throw_subprog_after_stack_arg, 56);
+	}
+
 #define RUN_EXT(load_ret, attach_err, expr, msg, after_link)			  \
 	{									  \
 		LIBBPF_OPTS(bpf_object_open_opts, o, .kernel_log_buf = log_buf,		 \
diff --git a/tools/testing/selftests/bpf/prog_tests/file_reader.c b/tools/testing/selftests/bpf/prog_tests/file_reader.c
index 5cde32b35da4..48aae7ea0e4b 100644
--- a/tools/testing/selftests/bpf/prog_tests/file_reader.c
+++ b/tools/testing/selftests/bpf/prog_tests/file_reader.c
@@ -10,6 +10,7 @@
 
 const char *user_ptr = "hello world";
 char file_contents[256000];
+static void *addr;
 
 void *get_executable_base_addr(void)
 {
@@ -26,8 +27,7 @@ void *get_executable_base_addr(void)
 static int initialize_file_contents(void)
 {
 	int fd, page_sz = sysconf(_SC_PAGESIZE);
-	ssize_t n = 0, cur, off;
-	void *addr;
+	ssize_t n = 0, cur;
 
 	fd = open("/proc/self/exe", O_RDONLY);
 	if (!ASSERT_OK_FD(fd, "Open /proc/self/exe\n"))
@@ -52,16 +52,6 @@ static int initialize_file_contents(void)
 	/* page-align base file address */
 	addr = (void *)((unsigned long)addr & ~(page_sz - 1));
 
-	/*
-	 * Page out range 0..512K, use 0..256K for positive tests and
-	 * 256K..512K for negative tests expecting page faults
-	 */
-	for (off = 0; off < sizeof(file_contents) * 2; off += page_sz) {
-		if (!ASSERT_OK(madvise(addr + off, page_sz, MADV_PAGEOUT),
-			       "madvise pageout"))
-			return errno;
-	}
-
 	return 0;
 }
 
@@ -90,6 +80,14 @@ static void run_test(const char *prog_name)
 	if (!ASSERT_OK(err, "file_reader__load"))
 		goto cleanup;
 
+	/*
+	 * Page out range 0..512K, use 0..256K for positive tests and
+	 * 256K..512K for negative tests expecting page faults
+	 */
+	if (!ASSERT_OK(madvise(addr, sizeof(file_contents) * 2, MADV_PAGEOUT),
+		       "madvise pageout"))
+		goto cleanup;
+
 	err = file_reader__attach(skel);
 	if (!ASSERT_OK(err, "file_reader__attach"))
 		goto cleanup;
diff --git a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c
index e40114620751..f589eefbf9fb 100644
--- a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c
+++ b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c
@@ -469,7 +469,7 @@ verify_umulti_link_info(int fd, bool retprobe, __u64 *offsets,
 
 		ASSERT_EQ(info.uprobe_multi.pid, getpid(), "info.uprobe_multi.pid");
 		ASSERT_EQ(info.uprobe_multi.count, 3, "info.uprobe_multi.count");
-		ASSERT_EQ(info.uprobe_multi.flags & BPF_F_KPROBE_MULTI_RETURN,
+		ASSERT_EQ(info.uprobe_multi.flags & BPF_F_UPROBE_MULTI_RETURN,
 			  retprobe, "info.uprobe_multi.flags.retprobe");
 		ASSERT_EQ(info.uprobe_multi.path_size, strlen(path) + 1, "info.uprobe_multi.path_size");
 		ASSERT_STREQ(path_buf, path, "info.uprobe_multi.path");
diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c
index ea1a6766fbe9..0a28d4346924 100644
--- a/tools/testing/selftests/bpf/prog_tests/htab_update.c
+++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c
@@ -23,7 +23,7 @@ static void test_reenter_update(void)
 	if (!ASSERT_OK_PTR(skel, "htab_update__open"))
 		return;
 
-	bpf_program__set_autoload(skel->progs.bpf_obj_free_fields, true);
+	bpf_program__set_autoload(skel->progs.bpf_obj_cancel_fields, true);
 	err = htab_update__load(skel);
 	if (!ASSERT_TRUE(!err, "htab_update__load") || err)
 		goto out;
@@ -50,7 +50,7 @@ static void test_reenter_update(void)
 	/*
 	 * Second update: replace existing element with same key and trigger
 	 * the reentrancy of bpf_map_update_elem().
-	 * check_and_free_fields() calls bpf_obj_free_fields() on the old
+	 * check_and_cancel_fields() calls bpf_obj_cancel_fields() on the old
 	 * value, which is where fentry program runs and performs a nested
 	 * bpf_map_update_elem(), triggering -EDEADLK.
 	 */
diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c
index a539980a2fbe..c0b6082f345a 100644
--- a/tools/testing/selftests/bpf/prog_tests/iters.c
+++ b/tools/testing/selftests/bpf/prog_tests/iters.c
@@ -202,8 +202,6 @@ cleanup:
 	iters_task__destroy(skel);
 }
 
-extern int stack_mprotect(void);
-
 static void subtest_css_task_iters(void)
 {
 	struct iters_css_task *skel = NULL;
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
index 62f3fb79f5d1..3df07680f9e0 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
@@ -68,7 +68,7 @@ static struct kfunc_test_params kfunc_tests[] = {
 	TC_FAIL(kfunc_call_test_get_mem_fail_oob, 0, "min value is outside of the allowed memory range"),
 	TC_FAIL(kfunc_call_test_get_mem_fail_not_const, 0, "is not a const"),
 	TC_FAIL(kfunc_call_test_mem_acquire_fail, 0, "acquire kernel function does not return PTR_TO_BTF_ID"),
-	TC_FAIL(kfunc_call_test_pointer_arg_type_mismatch, 0, "arg#0 expected pointer to ctx, but got scalar"),
+	TC_FAIL(kfunc_call_test_pointer_arg_type_mismatch, 0, "R1 expected pointer to ctx, but got scalar"),
 
 	/* success cases */
 	TC_TEST(kfunc_call_test1, 12),
diff --git a/tools/testing/selftests/bpf/prog_tests/libarena.c b/tools/testing/selftests/bpf/prog_tests/libarena.c
new file mode 100644
index 000000000000..61ea68dce410
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/libarena.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <unistd.h>
+
+#include <libarena/common.h>
+#include <libarena/asan.h>
+#include <libarena/buddy.h>
+#include <libarena/userspace.h>
+
+#include "libarena/libarena.skel.h"
+
+static void run_libarena_test(struct libarena *skel, struct bpf_program *prog,
+		const char *name)
+{
+	int ret;
+
+	/* Run arena_buddy_reset first unless the name contains "test_buddy". */
+	if (!strstr(name, "test_buddy")) {
+		ret = libarena_run_prog(bpf_program__fd(skel->progs.arena_buddy_reset));
+		if (!ASSERT_OK(ret, "arena_buddy_reset"))
+			return;
+	}
+
+	/* Run the test prog itself and record its result under its own name. */
+	ret = libarena_run_prog(bpf_program__fd(prog));
+	ASSERT_OK(ret, name);
+}
+
+static void *run_libarena_parallel_prog(void *arg)
+{
+	struct bpf_program *prog = arg;
+
+	return (void *)(long)libarena_run_prog(bpf_program__fd(prog));
+}
+
+/* Max suffix is ceil((lg 2^32) / (lg 10)) + strlen("__") = 10 + 2 = 12. */
+#define MAX_PARTEST_SUFFIX (12)
+#define MAX_PARTEST_NAME (1024)
+#define MAX_PARTEST_PREFIX (MAX_PARTEST_NAME - MAX_PARTEST_SUFFIX)
+
+static int run_libarena_parallel_fini(struct libarena *skel, const char *name,
+				      size_t prefixlen)
+{
+	char tdname[MAX_PARTEST_NAME];
+	struct bpf_program *fini_prog;
+	int ret;
+
+	ret = snprintf(tdname, sizeof(tdname), "%.*s__fini", (int)prefixlen, name);
+	if (!ASSERT_LT(ret, sizeof(tdname), "partest fini name"))
+		return -ENAMETOOLONG;
+
+	fini_prog = bpf_object__find_program_by_name(skel->obj, tdname);
+	if (!ASSERT_TRUE(fini_prog, "partest fini prog"))
+		return -ENOENT;
+
+	ret = libarena_run_prog(bpf_program__fd(fini_prog));
+	ASSERT_OK(ret, tdname);
+
+	return ret;
+}
+
+static int run_libarena_parallel_test_workers(struct libarena *skel,
+		const char *name, size_t prefixlen)
+{
+	pthread_t *threads = NULL, *tmp_threads;
+	char tdname[MAX_PARTEST_NAME];
+	struct bpf_program *tdprog;
+	uint32_t nthreads;
+	void *thread_ret;
+	int ret, err = 0;
+	uint32_t i;
+
+	for (nthreads = 0; nthreads < UINT_MAX; nthreads++) {
+		ret = snprintf(tdname, sizeof(tdname), "%.*s__%u", (int)prefixlen,
+			       name, nthreads);
+		if (!ASSERT_LT(ret, sizeof(tdname), "test worker name")) {
+			err = -ENAMETOOLONG;
+			break;
+		}
+
+		/*
+		 * We enumerate the worker threads for a given test with __0, __1,
+		 * and so on. The suffixes always start from 0 and are contiguous,
+		 * so if we don't find a program with the requested name we have
+		 * discovered all available worker programs.
+		 */
+		tdprog = bpf_object__find_program_by_name(skel->obj, tdname);
+		if (!tdprog)
+			break;
+
+		/* Bump the alloc array to accommodate the new thread. */
+		tmp_threads = realloc(threads, (nthreads + 1) * sizeof(*threads));
+		if (!ASSERT_TRUE(tmp_threads, "realloc")) {
+			err = -ENOMEM;
+			break;
+		}
+		threads = tmp_threads;
+
+		ret = pthread_create(&threads[nthreads], NULL,
+				     run_libarena_parallel_prog,
+				     tdprog);
+		if (!ASSERT_OK(ret, "pthread_create")) {
+			err = ret;
+			break;
+		}
+	}
+
+	/* Join every thread that was started, keeping the first error seen. */
+	for (i = 0; i < nthreads; i++) {
+		ret = pthread_join(threads[i], &thread_ret);
+		if (!ASSERT_OK(ret, "pthread_join")) {
+			err = err ?: ret;
+			continue;
+		}
+
+		err = err ?: (long)thread_ret;
+	}
+
+	free(threads);
+
+	return err;
+}
+
+static bool libarena_parallel_test_enabled(struct libarena *skel,
+					   const char *prefix,
+					   size_t prefixlen)
+{
+	struct bpf_program *prog;
+	char progname[MAX_PARTEST_NAME];
+	int ret;
+
+	ret = snprintf(progname, sizeof(progname), "%.*s__enabled", (int)prefixlen,
+		       prefix);
+	if (!ASSERT_LT(ret, sizeof(progname), "partest enabled name"))
+		return false;
+
+	prog = bpf_object__find_program_by_name(skel->obj, progname);
+	if (!prog)
+		return true;
+
+	ret = libarena_run_prog(bpf_program__fd(prog));
+	if (ret == -EOPNOTSUPP)
+		return false;
+	if (!ASSERT_OK(ret, progname))
+		return false;
+	return true;
+}
+
+static void run_libarena_parallel_test(struct libarena *skel, struct bpf_program *prog,
+		const char *name)
+{
+	char testname[MAX_PARTEST_NAME];
+	size_t prefixlen;
+	const char *pos;
+	int ret;
+
+	/*
+	 * We annotate the initialization prog with __init. If the current prog does
+	 * not match, it is one of the parallel threads instead and is ignored.
+	 *
+	 * We assume the test writer knows what they are doing and do not add __init
+	 * randomly in the middle of a test name.
+	 */
+	pos = strstr(name, "__init");
+	if (!pos)
+		return;
+
+	prefixlen = pos - name;
+	if (!ASSERT_LT(prefixlen, MAX_PARTEST_PREFIX, "partest prefix too long"))
+		return;
+
+	/* The name of the test without the __init suffix. Looks nicer in the test log. */
+	ret = snprintf(testname, sizeof(testname), "%.*s", (int)prefixlen, name);
+	if (!ASSERT_LT(ret, sizeof(testname), "partest test name"))
+		return;
+
+	if (!test__start_subtest(testname))
+		return;
+
+	if (!libarena_parallel_test_enabled(skel, testname, prefixlen)) {
+		test__skip();
+		return;
+	}
+
+	ret = libarena_run_prog(bpf_program__fd(skel->progs.arena_buddy_reset));
+	if (!ASSERT_OK(ret, "arena_buddy_reset"))
+		return;
+
+	ret = libarena_run_prog(bpf_program__fd(prog));
+	if (!ASSERT_OK(ret, testname))
+		return;
+
+	ret = run_libarena_parallel_test_workers(skel, name, prefixlen);
+
+	ASSERT_OK(ret, testname);
+
+	run_libarena_parallel_fini(skel, name, prefixlen);
+}
+
+void test_libarena(void)
+{
+	struct arena_alloc_reserve_args args = {};
+	struct libarena *skel;
+	struct bpf_program *prog;
+	int ret;
+
+	skel = libarena__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	ret = libarena__attach(skel);
+	if (!ASSERT_OK(ret, "attach"))
+		goto out;
+	/* args is zero-initialized, so only nr_pages needs to be set here. */
+	args.nr_pages = ARENA_RESERVE_PAGES_DFL;
+
+	ret = libarena_run_prog_args(bpf_program__fd(skel->progs.arena_alloc_reserve),
+			&args, sizeof(args));
+	if (!ASSERT_OK(ret, "arena_alloc_reserve"))
+		goto out;
+
+	bpf_object__for_each_program(prog, skel->obj) {
+		const char *name = bpf_program__name(prog);
+
+		/*
+		 * Handle parallel test progs separately. For those
+		 * progs it's not a matter of test/skip, because each
+		 * parallel test prog includes an initialization prog
+		 * and a set of progs to be run in parallel. For the
+		 * latter we do not record them as skipped or run,
+		 * because we run them all at once when we come across
+		 * the initialization prog. For more details on how we
+		 * discover the progs see the comment on
+		 * run_libarena_parallel_test.
+		 */
+		if (libarena_is_parallel_test_prog(name)) {
+			run_libarena_parallel_test(skel, prog, name);
+			continue;
+		}
+
+		if (!libarena_is_test_prog(name))
+			continue;
+
+		if (!test__start_subtest(name))
+			continue;
+
+		run_libarena_test(skel, prog, name);
+	}
+
+out:
+	libarena__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/libarena_asan.c b/tools/testing/selftests/bpf/prog_tests/libarena_asan.c
new file mode 100644
index 000000000000..d59d9dd12ef2
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/libarena_asan.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+
+#ifdef HAS_BPF_ARENA_ASAN
+#include <unistd.h>
+
+#include <libarena/common.h>
+#include <libarena/asan.h>
+#include <libarena/buddy.h>
+#include <libarena/userspace.h>
+
+#include "libarena/libarena_asan.skel.h"
+
+static void run_libarena_asan_test(struct libarena_asan *skel,
+		struct bpf_program *prog, const char *name)
+{
+	int ret;
+
+	if (!strstr(name, "test_buddy")) {
+		ret = libarena_run_prog(bpf_program__fd(skel->progs.arena_buddy_reset));
+		if (!ASSERT_OK(ret, "arena_buddy_reset"))
+			return;
+	}
+
+	ret = libarena_run_prog(bpf_program__fd(prog));
+	ASSERT_OK(ret, name);
+
+	verify_test_stderr(skel->obj, prog);
+}
+
+static void run_test(void)
+{
+	struct arena_alloc_reserve_args args = {};
+	struct libarena_asan *skel;
+	struct bpf_program *prog;
+	int ret;
+
+	skel = libarena_asan__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	ret = libarena_asan__attach(skel);
+	if (!ASSERT_OK(ret, "attach"))
+		goto out;
+	/* args is zero-initialized, so only nr_pages needs to be set here. */
+	args.nr_pages = ARENA_RESERVE_PAGES_DFL;
+
+	ret = libarena_run_prog_args(bpf_program__fd(skel->progs.arena_alloc_reserve),
+			&args, sizeof(args));
+	if (!ASSERT_OK(ret, "arena_alloc_reserve"))
+		goto out;
+
+	ret = libarena_asan_init(
+		bpf_program__fd(skel->progs.arena_get_info),
+		bpf_program__fd(skel->progs.asan_init),
+		(1ULL << 32) / sysconf(_SC_PAGESIZE));
+	if (!ASSERT_OK(ret, "libarena_asan_init"))
+		goto out;
+
+	bpf_object__for_each_program(prog, skel->obj) {
+		const char *name = bpf_program__name(prog);
+
+		if (!libarena_is_asan_test_prog(name))
+			continue;
+
+		if (!test__start_subtest(name))
+			continue;
+
+		run_libarena_asan_test(skel, prog, name);
+	}
+
+out:
+	libarena_asan__destroy(skel);
+}
+
+#endif /* HAS_BPF_ARENA_ASAN */
+
+/*
+ * Entry point for the arena ASAN selftest.
+ *
+ * Run the test depending on whether LLVM can compile arena ASAN
+ * programs: the test body is only built under HAS_BPF_ARENA_ASAN,
+ * otherwise the test is reported as skipped.
+ */
+void test_libarena_asan(void)
+{
+#ifdef HAS_BPF_ARENA_ASAN
+	run_test();
+#else
+	test__skip();
+#endif
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c
index 6f25b5f39a79..8defea0253ed 100644
--- a/tools/testing/selftests/bpf/prog_tests/linked_list.c
+++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c
@@ -81,8 +81,8 @@ static struct {
 	{ "direct_write_node", "direct access to bpf_list_node is disallowed" },
 	{ "use_after_unlock_push_front", "invalid mem access 'scalar'" },
 	{ "use_after_unlock_push_back", "invalid mem access 'scalar'" },
-	{ "double_push_front", "arg#1 expected pointer to allocated object" },
-	{ "double_push_back", "arg#1 expected pointer to allocated object" },
+	{ "double_push_front", "R2 expected pointer to allocated object" },
+	{ "double_push_back", "R2 expected pointer to allocated object" },
 	{ "no_node_value_type", "bpf_list_node not found at offset=0" },
 	{ "incorrect_value_type",
 	  "operation on bpf_list_head expects arg#1 bpf_list_node at offset=48 in struct foo, "
@@ -131,13 +131,14 @@ end:
 	linked_list_fail__destroy(skel);
 }
 
-static void clear_fields(struct bpf_map *map)
+static void clear_fields(struct bpf_program *prog)
 {
-	char buf[24];
-	int key = 0;
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	int ret;
 
-	memset(buf, 0xff, sizeof(buf));
-	ASSERT_OK(bpf_map__update_elem(map, &key, sizeof(key), buf, sizeof(buf), 0), "check_and_free_fields");
+	ret = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts);
+	ASSERT_OK(ret, "clear_fields");
+	ASSERT_OK(opts.retval, "clear_fields retval");
 }
 
 enum {
@@ -170,31 +171,31 @@ static void test_linked_list_success(int mode, bool leave_in_map)
 	ASSERT_OK(ret, "map_list_push_pop");
 	ASSERT_OK(opts.retval, "map_list_push_pop retval");
 	if (!leave_in_map)
-		clear_fields(skel->maps.array_map);
+		clear_fields(skel->progs.clear_map_list);
 
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_push_pop), &opts);
 	ASSERT_OK(ret, "inner_map_list_push_pop");
 	ASSERT_OK(opts.retval, "inner_map_list_push_pop retval");
 	if (!leave_in_map)
-		clear_fields(skel->maps.inner_map);
+		clear_fields(skel->progs.clear_inner_map_list);
 
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop), &opts);
 	ASSERT_OK(ret, "global_list_push_pop");
 	ASSERT_OK(opts.retval, "global_list_push_pop retval");
 	if (!leave_in_map)
-		clear_fields(skel->maps.bss_A);
+		clear_fields(skel->progs.clear_global_list);
 
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop_nested), &opts);
 	ASSERT_OK(ret, "global_list_push_pop_nested");
 	ASSERT_OK(opts.retval, "global_list_push_pop_nested retval");
 	if (!leave_in_map)
-		clear_fields(skel->maps.bss_A);
+		clear_fields(skel->progs.clear_global_nested_list);
 
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_array_push_pop), &opts);
 	ASSERT_OK(ret, "global_list_array_push_pop");
 	ASSERT_OK(opts.retval, "global_list_array_push_pop retval");
 	if (!leave_in_map)
-		clear_fields(skel->maps.bss_A);
+		clear_fields(skel->progs.clear_global_array_list);
 
 	if (mode == PUSH_POP)
 		goto end;
@@ -204,19 +205,19 @@ ppm:
 	ASSERT_OK(ret, "map_list_push_pop_multiple");
 	ASSERT_OK(opts.retval, "map_list_push_pop_multiple retval");
 	if (!leave_in_map)
-		clear_fields(skel->maps.array_map);
+		clear_fields(skel->progs.clear_map_list);
 
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_push_pop_multiple), &opts);
 	ASSERT_OK(ret, "inner_map_list_push_pop_multiple");
 	ASSERT_OK(opts.retval, "inner_map_list_push_pop_multiple retval");
 	if (!leave_in_map)
-		clear_fields(skel->maps.inner_map);
+		clear_fields(skel->progs.clear_inner_map_list);
 
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop_multiple), &opts);
 	ASSERT_OK(ret, "global_list_push_pop_multiple");
 	ASSERT_OK(opts.retval, "global_list_push_pop_multiple retval");
 	if (!leave_in_map)
-		clear_fields(skel->maps.bss_A);
+		clear_fields(skel->progs.clear_global_list);
 
 	if (mode == PUSH_POP_MULT)
 		goto end;
@@ -226,19 +227,19 @@ lil:
 	ASSERT_OK(ret, "map_list_in_list");
 	ASSERT_OK(opts.retval, "map_list_in_list retval");
 	if (!leave_in_map)
-		clear_fields(skel->maps.array_map);
+		clear_fields(skel->progs.clear_map_list);
 
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_in_list), &opts);
 	ASSERT_OK(ret, "inner_map_list_in_list");
 	ASSERT_OK(opts.retval, "inner_map_list_in_list retval");
 	if (!leave_in_map)
-		clear_fields(skel->maps.inner_map);
+		clear_fields(skel->progs.clear_inner_map_list);
 
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_in_list), &opts);
 	ASSERT_OK(ret, "global_list_in_list");
 	ASSERT_OK(opts.retval, "global_list_in_list retval");
 	if (!leave_in_map)
-		clear_fields(skel->maps.bss_A);
+		clear_fields(skel->progs.clear_global_list);
 end:
 	linked_list__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/lru_lock_nmi.c b/tools/testing/selftests/bpf/prog_tests/lru_lock_nmi.c
new file mode 100644
index 000000000000..60666a9ba41f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/lru_lock_nmi.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Stress the LRU lock-failure and orphan-recovery paths.
+ * A perf_event NMI BPF program on each possible CPU (capped at 64) does
+ * update+delete on a small LRU map; userspace threads pinned to the same
+ * CPUs do the same from syscall context.
+ */
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sched.h>
+#include <sys/syscall.h>
+#include <linux/perf_event.h>
+#include <test_progs.h>
+#include "testing_helpers.h"
+#include "lru_lock_nmi.skel.h"
+
+#define MAP_ENTRIES	64
+#define KEY_RANGE	(MAP_ENTRIES * 2)
+#define STRESS_NS	(500 * 1000 * 1000ULL)
+
+struct hammer_arg {
+	int map_fd;
+	int cpu;
+	__u64 deadline_ns;
+};
+
+struct refill_arg {
+	int map_fd;
+	int cpu;
+	int per_cpu_quota;
+	int update_errors;
+};
+
+/*
+ * Pin the calling thread to @cpu using a dynamically sized CPU set, so hosts
+ * with @cpu >= CPU_SETSIZE (default 1024) work. Returns 0 or a negative errno.
+ */
+static int pin_to_cpu(int cpu)
+{
+	cpu_set_t *cs;
+	size_t cs_size;
+	int err;
+
+	cs = CPU_ALLOC(cpu + 1);
+	if (!cs)
+		return -ENOMEM;
+	cs_size = CPU_ALLOC_SIZE(cpu + 1);
+
+	CPU_ZERO_S(cs_size, cs);
+	CPU_SET_S(cpu, cs_size, cs);
+	err = pthread_setaffinity_np(pthread_self(), cs_size, cs);
+	CPU_FREE(cs);
+	return -err;	/* pthread_*() return a positive errno; match -ENOMEM above */
+}
+
+static void *hammer_thread(void *p)	/* pthread start routine; @p is a struct hammer_arg */
+{
+	struct hammer_arg *a = p;
+	int nr_possible_cpus = libbpf_num_possible_cpus();
+	__u64 val[nr_possible_cpus];	/* presumably sized to also fit a per-CPU map value - confirm */
+	unsigned int seed;
+	__u32 key;
+
+	memset(val, 0, sizeof(val));
+	pin_to_cpu(a->cpu);	/* best effort: return value is not checked */
+
+	seed = (unsigned int)a->cpu ^ (unsigned int)(uintptr_t)pthread_self();	/* per-thread rand_r() seed */
+
+	while (get_time_ns() < a->deadline_ns) {	/* hammer keys in [0, KEY_RANGE) until the deadline */
+		bool do_update = rand_r(&seed) & 1;
+
+		key = rand_r(&seed) % KEY_RANGE;
+		if (do_update)
+			bpf_map_update_elem(a->map_fd, &key, val, BPF_ANY);	/* result not checked */
+		else
+			bpf_map_delete_elem(a->map_fd, &key);	/* result not checked */
+	}
+	return NULL;
+}
+
+static void *refill_thread(void *p)	/* pthread start routine; @p is a struct refill_arg */
+{
+	struct refill_arg *a = p;
+	int nr_possible_cpus = libbpf_num_possible_cpus();
+	__u64 val[nr_possible_cpus];	/* presumably sized to also fit a per-CPU map value - confirm */
+	__u32 start, end, key;
+
+	memset(val, 0, sizeof(val));
+	pin_to_cpu(a->cpu);	/* best effort: return value is not checked */
+
+	start = (__u32)a->cpu * (__u32)a->per_cpu_quota;	/* this thread owns keys [start, end) */
+	end   = start + (__u32)a->per_cpu_quota;
+	for (key = start; key < end; key++)
+		if (bpf_map_update_elem(a->map_fd, &key, val, BPF_ANY))
+			a->update_errors++;	/* counted per thread in its own refill_arg */
+	return NULL;
+}
+
+/*
+ * Delete every key in [0, KEY_RANGE), then refill with one thread per CPU,
+ * each inserting its own MAP_ENTRIES / @nr_cpus keys. Then look up every
+ * inserted key - a stranded node on any CPU's pool would have forced eviction.
+ * Returns 0 if all keys are found, -ENOMEM on any update failure, else -EIO.
+ */
+static int drain_then_verify_capacity(int map_fd, int nr_cpus)
+{
+	int per_cpu_quota = MAP_ENTRIES / nr_cpus;
+	int total = per_cpu_quota * nr_cpus;	/* below MAP_ENTRIES if nr_cpus does not divide it */
+	int nr_possible_cpus = libbpf_num_possible_cpus();
+	pthread_t threads[nr_cpus];
+	struct refill_arg args[nr_cpus];
+	__u64 val[nr_possible_cpus];
+	int i, hits = 0, nthreads = 0;
+	__u32 key;
+
+	memset(val, 0, sizeof(val));
+
+	for (key = 0; key < KEY_RANGE; key++)
+		bpf_map_delete_elem(map_fd, &key);	/* drain; result not checked */
+
+	for (i = 0; i < nr_cpus; i++) {
+		args[i] = (struct refill_arg){
+			.map_fd = map_fd,
+			.cpu = i,
+			.per_cpu_quota = per_cpu_quota,
+		};	/* update_errors is zero-initialized by the compound literal */
+		if (pthread_create(&threads[nthreads], NULL, refill_thread, &args[i]) == 0)
+			nthreads++;	/* threads[] stays packed; a failed create inserts no keys */
+	}
+	for (i = 0; i < nthreads; i++)
+		pthread_join(threads[i], NULL);
+
+	for (i = 0; i < nr_cpus; i++)
+		if (args[i].update_errors)
+			return -ENOMEM;
+
+	for (key = 0; key < (__u32)total; key++)
+		if (bpf_map_lookup_elem(map_fd, &key, val) == 0)
+			hits++;
+
+	return hits == total ? 0 : -EIO;
+}
+
+static void run_variant(enum bpf_map_type type, __u32 map_flags, const char *name)
+{
+	struct perf_event_attr attr = {
+		.size = sizeof(attr),
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+		.freq = 1,	/* sample_freq (set below) is a frequency, not a period */
+	};
+	int nr_cpus, max_cpus = 64;	/* max_cpus caps the CPUs exercised and sizes the arrays below */
+	struct bpf_link *links[max_cpus];
+	pthread_t threads[max_cpus];
+	struct hammer_arg args[max_cpus];
+	struct lru_lock_nmi *skel = NULL;
+	int map_fd, i, err, nr_threads = 0, pmu_fd = -1;
+	__u64 deadline;
+
+	nr_cpus = libbpf_num_possible_cpus();
+	if (!ASSERT_GT(nr_cpus, 0, "num_cpus"))
+		return;
+
+	if (nr_cpus > max_cpus)
+		nr_cpus = max_cpus;
+
+	if (!test__start_subtest(name))
+		return;
+
+	memset(links, 0, sizeof(links));	/* cleanup destroys every non-NULL entry */
+	skel = lru_lock_nmi__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		goto cleanup;
+
+	err = bpf_map__set_type(skel->maps.lru_map, type);	/* one skeleton, reconfigured per variant */
+	if (!ASSERT_OK(err, "set_type"))
+		goto cleanup;
+	err = bpf_map__set_map_flags(skel->maps.lru_map, map_flags);
+	if (!ASSERT_OK(err, "set_flags"))
+		goto cleanup;
+	err = bpf_map__set_max_entries(skel->maps.lru_map, MAP_ENTRIES);
+	if (!ASSERT_OK(err, "set_max_entries"))
+		goto cleanup;
+
+	err = lru_lock_nmi__load(skel);
+	if (!ASSERT_OK(err, "skel_load"))
+		goto cleanup;
+
+	skel->bss->hits = 0;
+	map_fd = bpf_map__fd(skel->maps.lru_map);
+	attr.sample_freq = read_perf_max_sample_freq();
+
+	for (i = 0; i < nr_cpus; i++) {
+		pmu_fd = syscall(__NR_perf_event_open, &attr, -1, i, -1, 0);
+		if (pmu_fd < 0) {
+			if (i == 0 &&
+			    (errno == ENOENT || errno == EOPNOTSUPP)) {
+				test__skip();	/* event unavailable on the first CPU: skip the subtest */
+				goto cleanup;
+			}
+			continue;	/* any other failure: leave links[i] NULL and carry on */
+		}
+		/* libbpf takes ownership of pmu_fd on success; on failure we close it */
+		links[i] = bpf_program__attach_perf_event(skel->progs.oncpu, pmu_fd);
+		if (!links[i])
+			close(pmu_fd);
+	}
+
+	deadline = get_time_ns() + STRESS_NS;	/* one common deadline for all hammer threads */
+	for (i = 0; i < nr_cpus; i++) {
+		args[i].map_fd = map_fd;
+		args[i].cpu = i;
+		args[i].deadline_ns = deadline;
+		if (pthread_create(&threads[nr_threads], NULL, hammer_thread, &args[i]) == 0)
+			nr_threads++;	/* threads[] stays packed for the join loop */
+	}
+	for (i = 0; i < nr_threads; i++)
+		pthread_join(threads[i], NULL);
+
+	for (i = 0; i < nr_cpus; i++) {
+		if (links[i]) {
+			bpf_link__destroy(links[i]);
+			links[i] = NULL;	/* avoid a second destroy at cleanup */
+		}
+	}
+
+	ASSERT_GT(skel->bss->hits, 0, "nmi_bpf_ran");
+	ASSERT_OK(drain_then_verify_capacity(map_fd, nr_cpus), "drain_then_verify_capacity");
+
+cleanup:
+	for (i = 0; i < nr_cpus; i++) {
+		if (links[i])
+			bpf_link__destroy(links[i]);
+	}
+	lru_lock_nmi__destroy(skel);
+}
+
+void serial_test_lru_lock_nmi(void)
+{
+	run_variant(BPF_MAP_TYPE_LRU_HASH, 0, "common_lru");
+	run_variant(BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU, "no_common_lru");
+	run_variant(BPF_MAP_TYPE_LRU_PERCPU_HASH, 0, "percpu_lru");
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c
index 6df25de8f080..41e867467f6c 100644
--- a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c
+++ b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c
@@ -2,6 +2,7 @@
 
 #include <sys/types.h>
 #include <sys/socket.h>
+#include <sys/xattr.h>
 #include <test_progs.h>
 #include <bpf/btf.h>
 
@@ -309,11 +310,89 @@ static void test_lsm_cgroup_nonvoid(void)
 	lsm_cgroup_nonvoid__destroy(skel);
 }
 
+static void test_lsm_cgroup_retval(void)
+{
+	struct lsm_cgroup *skel = NULL;
+	int skipcap_prog_fd1, skipcap_prog_fd2, socket_prog_fd1, socket_prog_fd2;
+	int cgroup_fd = -1;
+	int err, fd;
+	char tmpfile[] = "/tmp/test_lsm_cgroup_retval.XXXXXX";
+
+	fd = mkstemp(tmpfile);
+	if (!ASSERT_OK_FD(fd, "mkstemp"))
+		return;
+	close(fd);	/* only the path is needed, for setxattr() below */
+
+	cgroup_fd = test__join_cgroup("/default_retval");
+	if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup"))
+		goto cleanup_tmpfile;
+
+	skel = lsm_cgroup__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		goto cleanup_cgroup;
+
+	skipcap_prog_fd1 = bpf_program__fd(skel->progs.skipcap_first);
+	skipcap_prog_fd2 = bpf_program__fd(skel->progs.skipcap_second);
+	socket_prog_fd1 = bpf_program__fd(skel->progs.socket_first);
+	socket_prog_fd2 = bpf_program__fd(skel->progs.socket_second);
+
+	err = bpf_prog_attach(skipcap_prog_fd1, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI);
+	if (err == -ENOTSUPP) {	/* first attach reports "not supported": skip instead of failing */
+		test__skip();
+		goto cleanup_skeleton;
+	}
+	if (!ASSERT_OK(err, "attach first skipcap prog"))
+		goto cleanup_skeleton;
+
+	err = bpf_prog_attach(skipcap_prog_fd2, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI);
+	if (!ASSERT_OK(err, "attach second skipcap prog"))
+		goto cleanup_skipcap1;
+
+	err = bpf_prog_attach(socket_prog_fd1, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI);
+	if (!ASSERT_OK(err, "attach first sock_create prog"))
+		goto cleanup_skipcap2;
+
+	err = bpf_prog_attach(socket_prog_fd2, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI);
+	if (!ASSERT_OK(err, "attach second sock_create prog"))
+		goto cleanup_sock_create1;
+
+	/* trigger the bool hook by setxattr */
+	err = setxattr(tmpfile, "user.test", "value", 5, 0);
+	if (!ASSERT_OK(err, "setxattr"))
+		goto cleanup_sock_create2;
+
+	/* trigger the errno hook by creating a socket */
+	fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (!ASSERT_OK_FD(fd, "socket"))
+		goto cleanup_sock_create2;
+	close(fd);
+
+	ASSERT_EQ(skel->data->skipcap_retval, 0, "bool_hook_retval_should_be_0");
+	ASSERT_EQ(skel->data->socket_retval, -EPERM, "errno_hook_retval_should_be_EPERM");
+
+cleanup_sock_create2:	/* labels below detach in reverse order of attach */
+	bpf_prog_detach2(socket_prog_fd2, cgroup_fd, BPF_LSM_CGROUP);
+cleanup_sock_create1:
+	bpf_prog_detach2(socket_prog_fd1, cgroup_fd, BPF_LSM_CGROUP);
+cleanup_skipcap2:
+	bpf_prog_detach2(skipcap_prog_fd2, cgroup_fd, BPF_LSM_CGROUP);
+cleanup_skipcap1:
+	bpf_prog_detach2(skipcap_prog_fd1, cgroup_fd, BPF_LSM_CGROUP);
+cleanup_skeleton:
+	lsm_cgroup__destroy(skel);
+cleanup_cgroup:
+	close(cgroup_fd);
+cleanup_tmpfile:
+	unlink(tmpfile);
+}
+
 void test_lsm_cgroup(void)
 {
 	if (test__start_subtest("functional"))
 		test_lsm_cgroup_functional();
 	if (test__start_subtest("nonvoid"))
 		test_lsm_cgroup_nonvoid();
+	if (test__start_subtest("retval"))
+		test_lsm_cgroup_retval();
 	btf__free(btf);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c b/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c
index b6391af5f6f9..6606f0ed9a9a 100644
--- a/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c
+++ b/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c
@@ -3,6 +3,7 @@
 
 #include "network_helpers.h"
 #include "test_progs.h"
+#include "test_lwt_ip_encap.skel.h"
 
 #define BPF_FILE "test_lwt_ip_encap.bpf.o"
 
@@ -32,6 +33,9 @@
 #define IP6_ADDR_8 "fb08::1"
 #define IP6_ADDR_GRE "fb10::1"
 
+#define IP4_ADDR_VXLAN  "172.16.17.100"
+#define IP6_ADDR_VXLAN  "fb11::1"
+
 #define IP6_ADDR_SRC IP6_ADDR_1
 #define IP6_ADDR_DST IP6_ADDR_4
 
@@ -538,3 +542,144 @@ void test_lwt_ip_encap_ipv4(void)
 	if (test__start_subtest("ingress"))
 		lwt_ip_encap(IPV4_ENCAP, INGRESS, "");
 }
+
+/*
+ * VxLAN Setup/topology:
+ *
+ * NS1 (IP*_ADDR_1)                NS2                  NS3 (IP*_ADDR_4)
+ *       [ping src]
+ *           |                          top route
+ *         veth1 (LWT encap)  <<-- veth2        veth3  <<-- veth4 (ping dst)
+ *           |                                                ^
+ *       (bottom route)                                       | (inner pkt)
+ *           v                        bottom route            |
+ *         veth5              -->> veth6        veth7  -->> veth8 (vxlan decap)
+ *                                                          (IP*_ADDR_VXLAN)
+ *
+ * Add the VxLAN endpoint addresses to NS3's veth8, create standard
+ * VxLAN decap devices bound to those addresses, and install routes so
+ * NS1/NS2 can reach the endpoints via the bottom route.  NS2 here is to
+ * make sure the LWT-encap VxLAN packets are routed to NS3 correctly.
+ */
+static int setup_vxlan_routes(const char *ns3, const char *ns1, const char *ns2)	/* 0 or -1 */
+{
+	struct nstoken *nstoken;
+
+	nstoken = open_netns(ns3);	/* the commands up to close_netns() run inside ns3 */
+	if (!ASSERT_OK_PTR(nstoken, "open ns3 for vxlan"))
+		return -1;
+
+	SYS(fail_close, "ip    a add %s/32  dev veth8", IP4_ADDR_VXLAN);
+	SYS(fail_close, "ip -6 a add %s/128 dev veth8", IP6_ADDR_VXLAN);
+	/*
+	 * Standard VxLAN devices to decap the encapsulated packets.  The inner
+	 * Ethernet frame uses a broadcast dst MAC so the IP stack accepts it
+	 * without ARP or FDB configuration.
+	 */
+	SYS(fail_close, "ip link add vxlan4 type vxlan id 1 dstport 4789 local %s dev veth8 nolearning noudpcsum",
+	    IP4_ADDR_VXLAN);
+	SYS(fail_close, "ip link set vxlan4 up");
+	SYS(fail_close, "ip link add vxlan6 type vxlan id 1 dstport 4789 local %s dev veth8 nolearning udp6zerocsumrx",
+	    IP6_ADDR_VXLAN);
+	SYS(fail_close, "ip link set vxlan6 up");
+	close_netns(nstoken);	/* the routes below select their netns with "ip -n" instead */
+
+	SYS(fail, "ip -n %s    route add %s/32  dev veth5 via %s",
+	    ns1, IP4_ADDR_VXLAN, IP4_ADDR_6);
+	SYS(fail, "ip -n %s    route add %s/32  dev veth7 via %s",
+	    ns2, IP4_ADDR_VXLAN, IP4_ADDR_8);
+	SYS(fail, "ip -n %s -6 route add %s/128 dev veth5 via %s",
+	    ns1, IP6_ADDR_VXLAN, IP6_ADDR_6);
+	SYS(fail, "ip -n %s -6 route add %s/128 dev veth7 via %s",
+	    ns2, IP6_ADDR_VXLAN, IP6_ADDR_8);
+	return 0;
+
+fail_close:	/* failed while the ns3 token was still open */
+	close_netns(nstoken);
+fail:
+	return -1;
+}
+
+static void lwt_ip_encap_vxlan(bool ipv4_encap)
+{
+	char ns1[NETNS_NAME_SIZE] = NETNS_BASE "-1-";
+	char ns2[NETNS_NAME_SIZE] = NETNS_BASE "-2-";
+	char ns3[NETNS_NAME_SIZE] = NETNS_BASE "-3-";
+	const char *sec = ipv4_encap ? "encap_vxlan" : "encap_vxlan6";
+	int expected_offset = ipv4_encap ? (int)sizeof(struct iphdr)	/* outer IP header size */
+					 : (int)sizeof(struct ipv6hdr);
+	struct test_lwt_ip_encap *skel = NULL;
+	int thdr_offset, err;
+
+	if (!ASSERT_OK(create_ns(ns1, NETNS_NAME_SIZE), "create ns1"))
+		goto out;
+	if (!ASSERT_OK(create_ns(ns2, NETNS_NAME_SIZE), "create ns2"))
+		goto out;
+	if (!ASSERT_OK(create_ns(ns3, NETNS_NAME_SIZE), "create ns3"))
+		goto out;
+
+	if (!ASSERT_OK(setup_network(ns1, ns2, ns3, ""), "setup network"))
+		goto out;
+
+	if (!ASSERT_OK(setup_vxlan_routes(ns3, ns1, ns2), "setup vxlan routes"))
+		goto out;
+
+	skel = test_lwt_ip_encap__open();
+	if (!ASSERT_OK_PTR(skel, "test_lwt_ip_encap__open"))
+		goto out;
+
+	bpf_program__set_autoload(skel->progs.bpf_lwt_encap_gre, false);	/* only the fexit prog is loaded here */
+	bpf_program__set_autoload(skel->progs.bpf_lwt_encap_gre6, false);
+	bpf_program__set_autoload(skel->progs.bpf_lwt_encap_vxlan, false);
+	bpf_program__set_autoload(skel->progs.bpf_lwt_encap_vxlan6, false);
+	bpf_program__set_autoload(skel->progs.fexit_lwt_push_ip_encap, true);
+	skel->rodata->tgt_ip_version = ipv4_encap ? 4 : 6;
+
+	err = test_lwt_ip_encap__load(skel);
+	if (!ASSERT_OK(err, "test_lwt_ip_encap__load"))
+		goto out;
+
+	err = test_lwt_ip_encap__attach(skel);
+	if (!ASSERT_OK(err, "test_lwt_ip_encap__attach"))
+		goto out;
+
+	/* Remove the direct NS2->DST route so packets must go via LWT encap. */
+	SYS(out, "ip -n %s    route del %s/32  dev veth3", ns2, IP4_ADDR_DST);
+	SYS(out, "ip -n %s -6 route del %s/128 dev veth3", ns2, IP6_ADDR_DST);
+
+	if (ipv4_encap)	/* the encap prog itself is loaded by "ip route" from BPF_FILE */
+		SYS(out, "ip -n %s route add %s encap bpf xmit obj %s sec %s dev veth1",
+		    ns1, IP4_ADDR_DST, BPF_FILE, sec);
+	else
+		SYS(out, "ip -n %s -6 route add %s encap bpf xmit obj %s sec %s dev veth1",
+		    ns1, IP6_ADDR_DST, BPF_FILE, sec);
+
+	skel->bss->fexit_triggered = false;	/* reset right before the ping checked below */
+
+	if (ipv4_encap)
+		SYS(out, "ip netns exec %s ping  -c 1 -W1 %s", ns1, IP4_ADDR_DST);
+	else
+		SYS(out, "ip netns exec %s ping6 -c 1 -W1 %s", ns1, IP6_ADDR_DST);
+
+	if (!ASSERT_TRUE(skel->bss->fexit_triggered, "fexit_triggered"))
+		goto out;
+
+	thdr_offset = (int)skel->bss->transport_hdr - (int)skel->bss->network_hdr;
+	ASSERT_EQ(thdr_offset, expected_offset, "transport_hdr offset");
+
+out:	/* reached from every exit path; skel may still be NULL */
+	test_lwt_ip_encap__destroy(skel);
+	SYS_NOFAIL("ip netns del %s", ns1);
+	SYS_NOFAIL("ip netns del %s", ns2);
+	SYS_NOFAIL("ip netns del %s", ns3);
+}
+
+void test_lwt_ip_encap_vxlan_ipv4(void)
+{
+	lwt_ip_encap_vxlan(IPV4_ENCAP);
+}
+
+void test_lwt_ip_encap_vxlan_ipv6(void)
+{
+	lwt_ip_encap_vxlan(IPV6_ENCAP);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/map_excl.c b/tools/testing/selftests/bpf/prog_tests/map_excl.c
index 6bdc6d6de0da..3f4422b9ffa6 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_excl.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_excl.c
@@ -7,6 +7,11 @@
 #include <bpf/btf.h>
 
 #include "map_excl.skel.h"
+#include "bpf_iter_bpf_array_map.skel.h"
+
+#ifndef SHA256_DIGEST_SIZE
+#define SHA256_DIGEST_SIZE	32
+#endif
 
 static void test_map_excl_allowed(void)
 {
@@ -45,10 +50,127 @@ out:
 
 }
 
+static void test_map_excl_no_map_in_map(void)
+{
+	__u8 hash[SHA256_DIGEST_SIZE] = {};
+	LIBBPF_OPTS(bpf_map_create_opts, excl_opts,
+		    .excl_prog_hash = hash,
+		    .excl_prog_hash_size = sizeof(hash));
+	LIBBPF_OPTS(bpf_map_create_opts, outer_opts);
+	int excl_fd, tmpl_fd = -1, outer_fd = -1, err;
+	__u32 key = 0;
+
+	excl_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl_inner", 4, 4, 1, &excl_opts);
+	if (!ASSERT_OK_FD(excl_fd, "create exclusive map"))
+		return;
+
+	outer_opts.inner_map_fd = excl_fd;	/* case 1: exclusive map as the inner-map template */
+	err = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer_from_excl",
+			     4, 4, 1, &outer_opts);
+	if (err >= 0)
+		close(err);	/* unexpected success: do not leak the fd */
+	ASSERT_EQ(err, -ENOTSUPP, "reject exclusive map as map-in-map template");
+
+	tmpl_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "tmpl", 4, 4, 1, NULL);
+	if (!ASSERT_OK_FD(tmpl_fd, "create inner template"))
+		goto out;
+
+	outer_opts.inner_map_fd = tmpl_fd;	/* case 2: regular template, exclusive map as element */
+	outer_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer", 4, 4, 1, &outer_opts);
+	if (!ASSERT_OK_FD(outer_fd, "create map-of-maps"))
+		goto out;
+
+	err = bpf_map_update_elem(outer_fd, &key, &excl_fd, 0);
+	ASSERT_EQ(err, -ENOTSUPP, "reject exclusive map as map-in-map element");
+out:
+	if (outer_fd >= 0)
+		close(outer_fd);
+	if (tmpl_fd >= 0)
+		close(tmpl_fd);
+	close(excl_fd);
+}
+
+static void test_map_excl_no_map_iter(void)	/* a map iterator must not attach to an exclusive map */
+{
+	__u8 hash[SHA256_DIGEST_SIZE] = {};
+	LIBBPF_OPTS(bpf_map_create_opts, excl_opts,
+		    .excl_prog_hash = hash,
+		    .excl_prog_hash_size = sizeof(hash));
+	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	struct bpf_iter_bpf_array_map *skel = NULL;
+	union bpf_iter_link_info linfo;
+	struct bpf_link *link;
+	int excl_fd;
+
+	excl_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl_iter", 4, 8, 3, &excl_opts);
+	if (!ASSERT_OK_FD(excl_fd, "create exclusive map"))
+		return;
+
+	skel = bpf_iter_bpf_array_map__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_array_map__open_and_load"))
+		goto out;
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.map.map_fd = excl_fd;	/* point the iterator at the exclusive map */
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_array_map, &opts);
+	if (!ASSERT_ERR_PTR(link, "reject exclusive map as iter target")) {
+		bpf_link__destroy(link);	/* attach unexpectedly succeeded: drop the link */
+		goto out;
+	}
+	ASSERT_EQ(libbpf_get_error(link), -EPERM, "iter attach errno");
+out:
+	bpf_iter_bpf_array_map__destroy(skel);
+	close(excl_fd);
+}
+
+static void test_map_excl_create_validation(void)
+{
+	LIBBPF_OPTS(bpf_map_create_opts, o);
+	__u8 hash[SHA256_DIGEST_SIZE] = {};
+	int fd;
+
+	o.excl_prog_hash = hash;
+	o.excl_prog_hash_size = SHA256_DIGEST_SIZE / 2;
+	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o);
+	if (fd >= 0)
+		close(fd);
+	ASSERT_EQ(fd, -EINVAL, "reject short excl_prog_hash_size");
+
+	o.excl_prog_hash = hash;
+	o.excl_prog_hash_size = SHA256_DIGEST_SIZE * 2;
+	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o);
+	if (fd >= 0)
+		close(fd);
+	ASSERT_EQ(fd, -EINVAL, "reject long excl_prog_hash_size");
+
+	o.excl_prog_hash = hash;
+	o.excl_prog_hash_size = 0;
+	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o);
+	if (fd >= 0)
+		close(fd);
+	ASSERT_EQ(fd, -EINVAL, "reject hash pointer with zero size");
+
+	o.excl_prog_hash = NULL;
+	o.excl_prog_hash_size = SHA256_DIGEST_SIZE;
+	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o);
+	if (fd >= 0)
+		close(fd);
+	ASSERT_EQ(fd, -EINVAL, "reject size with NULL hash pointer");
+}
+
 void test_map_excl(void)
 {
 	if (test__start_subtest("map_excl_allowed"))
 		test_map_excl_allowed();
 	if (test__start_subtest("map_excl_denied"))
 		test_map_excl_denied();
+	if (test__start_subtest("map_excl_no_map_in_map"))
+		test_map_excl_no_map_in_map();
+	if (test__start_subtest("map_excl_no_map_iter"))
+		test_map_excl_no_map_iter();
+	if (test__start_subtest("map_excl_create_validation"))
+		test_map_excl_create_validation();
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c
index 14a31109dd0e..c804c3ce9be9 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_init.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_init.c
@@ -212,3 +212,195 @@ void test_map_init(void)
 	if (test__start_subtest("pcpu_lru_map_init"))
 		test_pcpu_lru_map_init();
 }
+
+static void test_map_create(enum bpf_map_type map_type, const char *map_name,
+			    struct bpf_map_create_opts *opts, const char *exp_msg)
+{
+	const int key_size = 4, value_size = 4, max_entries = 1;
+	char log_buf[128];
+	int fd;
+	LIBBPF_OPTS(bpf_log_opts, log_opts);
+
+	log_buf[0] = '\0';	/* so ASSERT_STREQ sees "" if nothing is written to the log */
+	log_opts.buf = log_buf;
+	log_opts.size = sizeof(log_buf);
+	log_opts.level = 1;
+	opts->log_opts = &log_opts;	/* NOTE: points into this frame; @opts must not be reused after return */
+	fd = bpf_map_create(map_type, map_name, key_size, value_size, max_entries, opts);
+	if (!ASSERT_LT(fd, 0, "bpf_map_create")) {	/* creation is expected to fail */
+		close(fd);
+		return;
+	}
+
+	ASSERT_STREQ(log_buf, exp_msg, "log_buf");
+	ASSERT_EQ(log_opts.true_size, strlen(exp_msg) + 1, "true_size");	/* +1: terminating NUL */
+}
+
+static void test_map_create_array(struct bpf_map_create_opts *opts, const char *exp_msg)
+{
+	test_map_create(BPF_MAP_TYPE_ARRAY, "test_map_create", opts, exp_msg);
+}
+
+static void test_invalid_vmlinux_value_type_id_struct_ops(void)
+{
+	const char *msg = "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n";
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
+		    .btf_vmlinux_value_type_id = 1,
+	);
+
+	test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_vmlinux_value_type_id_kv_type_id(void)
+{
+	const char *msg = "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n";
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
+		    .btf_vmlinux_value_type_id = 1,
+		    .btf_key_type_id = 1,
+	);
+
+	test_map_create(BPF_MAP_TYPE_STRUCT_OPS, "test_map_create", &opts, msg);
+}
+
+static void test_invalid_value_type_id(void)
+{
+	const char *msg = "Invalid btf_value_type_id.\n";
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
+		    .btf_key_type_id = 1,
+	);
+
+	test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_map_extra(void)
+{
+	const char *msg = "Invalid map_extra.\n";
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
+		    .map_extra = 1,
+	);
+
+	test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_numa_node(void)
+{
+	const char *msg = "Invalid numa_node.\n";
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
+		    .map_flags = BPF_F_NUMA_NODE,
+		    .numa_node = 0xFF,
+	);
+
+	test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_map_type(void)
+{
+	const char *msg = "Invalid map_type.\n";
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
+
+	test_map_create(__MAX_BPF_MAP_TYPE, "test_map_create", &opts, msg);
+}
+
+static void test_invalid_token_fd(void)
+{
+	const char *msg = "Invalid map_token_fd.\n";
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
+		    .map_flags = BPF_F_TOKEN_FD,
+		    .token_fd = -1,
+	);
+
+	test_map_create_array(&opts, msg);
+}
+
+static void test_invalid_map_name(void)
+{
+	const char *msg = "Invalid map_name.\n";
+	LIBBPF_OPTS(bpf_map_create_opts, opts);
+
+	test_map_create(BPF_MAP_TYPE_ARRAY, "test-!@#", &opts, msg);
+}
+
+static void test_invalid_btf_fd(void)
+{
+	const char *msg = "Invalid btf_fd.\n";
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
+		    .btf_fd = -1,
+		    .btf_key_type_id = 1,
+		    .btf_value_type_id = 1,
+	);
+
+	test_map_create_array(&opts, msg);
+}
+
+static void test_excl_prog_hash_size_1(void)
+{
+	const char *msg = "Invalid excl_prog_hash_size.\n";
+	const char *hash = "DEADCODE";
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
+		    .excl_prog_hash = hash,
+	);
+
+	test_map_create_array(&opts, msg);
+}
+
+static void test_excl_prog_hash_size_2(void)
+{
+	const char *msg = "Invalid excl_prog_hash_size.\n";
+	LIBBPF_OPTS(bpf_map_create_opts, opts,
+		    .excl_prog_hash_size = 1,
+	);
+
+	test_map_create_array(&opts, msg);
+}
+
+static void test_common_attr_padding(void)
+{
+	struct bpf_common_attr_fake {
+		__u8 attrs[offsetofend(struct bpf_common_attr, log_true_size)];
+		__u32 pad;	/* extra word placed right after log_true_size */
+	} attr_common = {
+		.pad = 1,	/* non-zero, so the create below is expected to fail with E2BIG */
+	};
+	union bpf_attr attr = {
+		.map_type    = BPF_MAP_TYPE_ARRAY,
+		.key_size    = 4,
+		.value_size  = 4,
+		.max_entries = 1,
+	};
+	int fd;
+
+	fd = syscall(__NR_bpf, BPF_MAP_CREATE | BPF_COMMON_ATTRS, &attr, sizeof(attr), &attr_common,
+		     sizeof(attr_common));	/* raw syscall: passes the oversized common-attr struct directly */
+	if (!ASSERT_LT(fd, 0, "syscall"))
+		close(fd);	/* unexpected success: do not leak the fd */
+	else
+		ASSERT_EQ(errno, E2BIG, "errno");
+}
+
+void test_map_create_failure(void)
+{
+	if (test__start_subtest("invalid_vmlinux_value_type_id_struct_ops"))
+		test_invalid_vmlinux_value_type_id_struct_ops();
+	if (test__start_subtest("invalid_vmlinux_value_type_id_kv_type_id"))
+		test_invalid_vmlinux_value_type_id_kv_type_id();
+	if (test__start_subtest("invalid_value_type_id"))
+		test_invalid_value_type_id();
+	if (test__start_subtest("invalid_map_extra"))
+		test_invalid_map_extra();
+	if (test__start_subtest("invalid_numa_node"))
+		test_invalid_numa_node();
+	if (test__start_subtest("invalid_map_type"))
+		test_invalid_map_type();
+	if (test__start_subtest("invalid_token_fd"))
+		test_invalid_token_fd();
+	if (test__start_subtest("invalid_map_name"))
+		test_invalid_map_name();
+	if (test__start_subtest("invalid_btf_fd"))
+		test_invalid_btf_fd();
+	if (test__start_subtest("invalid_excl_prog_hash_size_1"))
+		test_excl_prog_hash_size_1();
+	if (test__start_subtest("invalid_excl_prog_hash_size_2"))
+		test_excl_prog_hash_size_2();
+	if (test__start_subtest("common_attr_padding"))
+		test_common_attr_padding();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c
index 03b46f17cf53..17e707dddda8 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c
@@ -51,7 +51,6 @@ static void test_map_kptr_success(bool test_run)
 	ret = bpf_map__update_elem(skel->maps.array_map,
 				   &key, sizeof(key), buf, sizeof(buf), 0);
 	ASSERT_OK(ret, "array_map update");
-	skel->data->ref--;
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts);
 	ASSERT_OK(ret, "test_map_kptr_ref3 refcount");
 	ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval");
@@ -59,49 +58,42 @@ static void test_map_kptr_success(bool test_run)
 	ret = bpf_map__update_elem(skel->maps.pcpu_array_map,
 				   &key, sizeof(key), pbuf, cpu * sizeof(buf), 0);
 	ASSERT_OK(ret, "pcpu_array_map update");
-	skel->data->ref--;
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts);
 	ASSERT_OK(ret, "test_map_kptr_ref3 refcount");
 	ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval");
 
 	ret = bpf_map__delete_elem(skel->maps.hash_map, &key, sizeof(key), 0);
 	ASSERT_OK(ret, "hash_map delete");
-	skel->data->ref--;
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts);
 	ASSERT_OK(ret, "test_map_kptr_ref3 refcount");
 	ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval");
 
 	ret = bpf_map__delete_elem(skel->maps.pcpu_hash_map, &key, sizeof(key), 0);
 	ASSERT_OK(ret, "pcpu_hash_map delete");
-	skel->data->ref--;
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts);
 	ASSERT_OK(ret, "test_map_kptr_ref3 refcount");
 	ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval");
 
 	ret = bpf_map__delete_elem(skel->maps.hash_malloc_map, &key, sizeof(key), 0);
 	ASSERT_OK(ret, "hash_malloc_map delete");
-	skel->data->ref--;
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts);
 	ASSERT_OK(ret, "test_map_kptr_ref3 refcount");
 	ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval");
 
 	ret = bpf_map__delete_elem(skel->maps.pcpu_hash_malloc_map, &key, sizeof(key), 0);
 	ASSERT_OK(ret, "pcpu_hash_malloc_map delete");
-	skel->data->ref--;
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts);
 	ASSERT_OK(ret, "test_map_kptr_ref3 refcount");
 	ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval");
 
 	ret = bpf_map__delete_elem(skel->maps.lru_hash_map, &key, sizeof(key), 0);
 	ASSERT_OK(ret, "lru_hash_map delete");
-	skel->data->ref--;
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts);
 	ASSERT_OK(ret, "test_map_kptr_ref3 refcount");
 	ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval");
 
 	ret = bpf_map__delete_elem(skel->maps.lru_pcpu_hash_map, &key, sizeof(key), 0);
 	ASSERT_OK(ret, "lru_pcpu_hash_map delete");
-	skel->data->ref--;
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts);
 	ASSERT_OK(ret, "test_map_kptr_ref3 refcount");
 	ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval");
@@ -151,12 +143,68 @@ static void wait_for_map_release(void)
 	map_kptr__destroy(skel);
 }
 
+enum map_update_kptr_case {
+	MAP_UPDATE_KPTR_ARRAY,		/* selects test_array_map_update_kptr */
+	MAP_UPDATE_KPTR_HASH,		/* selects test_hash_map_update_kptr */
+	MAP_UPDATE_KPTR_HASH_MALLOC,	/* selects test_hash_malloc_map_update_kptr */
+};
+
+static struct bpf_program *map_update_kptr_prog(struct map_kptr *skel,
+						enum map_update_kptr_case test)
+{
+	/* Pick the skeleton program for this subtest; NULL if unrecognized. */
+	if (test == MAP_UPDATE_KPTR_ARRAY)
+		return skel->progs.test_array_map_update_kptr;
+
+	if (test == MAP_UPDATE_KPTR_HASH)
+		return skel->progs.test_hash_map_update_kptr;
+
+	if (test == MAP_UPDATE_KPTR_HASH_MALLOC)
+		return skel->progs.test_hash_malloc_map_update_kptr;
+	return NULL;
+}
+
+static void test_map_update_kptr(enum map_update_kptr_case test)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, run_opts);
+	struct bpf_program *update_prog;
+	struct map_kptr *skel;
+	int err;
+
+	skel = map_kptr__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "map_kptr__open_and_load"))
+		return;
+
+	update_prog = map_update_kptr_prog(skel, test);
+	if (!ASSERT_OK_PTR(update_prog, "map_update_kptr_prog"))
+		goto destroy;
+
+	/*
+	 * num_of_refs is only checked once the run and its retval were both 0.
+	 */
+	err = bpf_prog_test_run_opts(bpf_program__fd(update_prog), &run_opts);
+	if (ASSERT_OK(err, "map_update_kptr") &&
+	    ASSERT_OK(run_opts.retval, "map_update_kptr retval"))
+		ASSERT_EQ(skel->bss->num_of_refs, 3, "refs_after_update");
+
+destroy:
+	map_kptr__destroy(skel);
+	wait_for_map_release();
+}
+
 void serial_test_map_kptr(void)
 {
 	struct rcu_tasks_trace_gp *skel;
 
 	RUN_TESTS(map_kptr_fail);
 
+	if (test__start_subtest("update_array_map_kptr"))
+		test_map_update_kptr(MAP_UPDATE_KPTR_ARRAY);
+	if (test__start_subtest("update_hash_map_kptr"))
+		test_map_update_kptr(MAP_UPDATE_KPTR_HASH);
+	if (test__start_subtest("update_hash_malloc_map_kptr"))
+		test_map_update_kptr(MAP_UPDATE_KPTR_HASH_MALLOC);
+
 	skel = rcu_tasks_trace_gp__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "rcu_tasks_trace_gp__open_and_load"))
 		return;
@@ -175,7 +223,7 @@ void serial_test_map_kptr(void)
 		ASSERT_OK(kern_sync_rcu(), "sync rcu");
 		wait_for_map_release();
 
-		/* Observe refcount dropping to 1 on synchronous delete elem */
+		/* Observe refcount dropping to 1 on map release. */
 		test_map_kptr_success(true);
 	}
 
diff --git a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
index d2c0542716a8..1737eba34323 100644
--- a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
@@ -57,6 +57,7 @@ void test_percpu_hash_refcounted_kptr_refcount_leak(void)
 		    .data_size_in = sizeof(pkt_v4),
 		    .repeat = 1,
 	);
+	LIBBPF_OPTS(bpf_test_run_opts, syscall_opts);
 
 	cpu_nr = libbpf_num_possible_cpus();
 	if (!ASSERT_GT(cpu_nr, 0, "libbpf_num_possible_cpus"))
@@ -87,8 +88,11 @@ void test_percpu_hash_refcounted_kptr_refcount_leak(void)
 	if (!ASSERT_EQ(opts.retval, 2, "opts.retval"))
 		goto out;
 
-	err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0);
-	if (!ASSERT_OK(err, "bpf_map__update_elem"))
+	fd = bpf_program__fd(skel->progs.clear_percpu_hash_kptr);
+	err = bpf_prog_test_run_opts(fd, &syscall_opts);
+	if (!ASSERT_OK(err, "bpf_prog_test_run_opts"))
+		goto out;
+	if (!ASSERT_EQ(syscall_opts.retval, 1, "syscall_opts.retval"))
 		goto out;
 
 	fd = bpf_program__fd(skel->progs.check_percpu_hash_refcount);
diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
index 71f5240cc5b7..7f170a69d1d8 100644
--- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
+++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
@@ -478,6 +478,52 @@ static struct range range_refine_in_halves(enum num_t x_t, struct range x,
 
 }
 
+static __always_inline u64 next_u32_block(u64 x) { return x + 0x100000000ULL; /* + 2^32 */ }
+static __always_inline u64 prev_u32_block(u64 x) { return x - 0x100000000ULL; /* - 2^32 */ }
+
+/* True when v lies on the circular u64 arc [base, base + len] (may wrap). */
+static __always_inline bool u64_range_contains(u64 v, u64 base, u64 len)
+{
+	return len >= v - base;
+}
+
+/* True when v lies on the circular u32 arc [base, base + len] (may wrap). */
+static __always_inline bool u32_range_contains(u32 v, u32 base, u32 len)
+{
+	return len >= v - base;
+}
+
+static bool range64_range32_intersect(enum num_t a_t,
+				      struct range a /* 64 */,
+				      struct range b /* 32 */,
+				      struct range *out /* 64 */)
+{
+	const u32 span32 = (u32)(b.b - b.a);
+	const u64 span64 = a.b - a.a;
+	u64 first = a.a, last = a.b;
+
+	/* (u32)a.a not in b: try swap_low32(a.a, b.a), then 2^32 above it */
+	if (!u32_range_contains((u32)a.a, (u32)b.a, span32)) {
+		first = swap_low32(a.a, (u32)b.a);
+		if (!u64_range_contains(first, a.a, span64))
+			first = next_u32_block(first);
+		if (!u64_range_contains(first, a.a, span64))
+			return false;
+	}
+
+	/* (u32)a.b not in b: try swap_low32(a.b, b.b), then 2^32 below it */
+	if (!u32_range_contains((u32)a.b, (u32)b.a, span32)) {
+		last = swap_low32(a.b, (u32)b.b);
+		if (!u64_range_contains(last, a.a, span64))
+			last = prev_u32_block(last);
+		if (!u64_range_contains(last, a.a, span64))
+			return false;
+	}
+
+	*out = range(a_t, first, last);
+	return true;
+}
+
 static struct range range_refine(enum num_t x_t, struct range x, enum num_t y_t, struct range y)
 {
 	struct range y_cast;
@@ -533,23 +579,12 @@ static struct range range_refine(enum num_t x_t, struct range x, enum num_t y_t,
 		}
 	}
 
-	/* the case when new range knowledge, *y*, is a 32-bit subregister
-	 * range, while previous range knowledge, *x*, is a full register
-	 * 64-bit range, needs special treatment to take into account upper 32
-	 * bits of full register range
-	 */
 	if (t_is_32(y_t) && !t_is_32(x_t)) {
-		struct range x_swap;
+		struct range x1;
 
-		/* some combinations of upper 32 bits and sign bit can lead to
-		 * invalid ranges, in such cases it's easier to detect them
-		 * after cast/swap than try to enumerate all the conditions
-		 * under which transformation and knowledge transfer is valid
-		 */
-		x_swap = range(x_t, swap_low32(x.a, y_cast.a), swap_low32(x.b, y_cast.b));
-		if (!is_valid_range(x_t, x_swap))
-			return x;
-		return range_intersection(x_t, x, x_swap);
+		if (range64_range32_intersect(x_t, x, y, &x1))
+			return x1;
+		return x;
 	}
 
 	/* otherwise, plain range cast and intersection works */
@@ -1300,6 +1335,26 @@ static bool assert_range_eq(enum num_t t, struct range x, struct range y,
 	return false;
 }
 
+/* Tell whether r2, cast into t1's domain, wraps and meets r1 on both sides. */
+static bool needs_two_arcs(enum num_t t1, struct range r1,
+			   enum num_t t2, struct range r2)
+{
+	const u64 start = cast_t(t1, r2.a), end = cast_t(t1, r2.b);
+
+	if (start <= end) /* no wrap in t1's domain: r2 is a single arc */
+		return false;
+	return end >= r1.a && start <= r1.b; /* r1 meets [0, end] and [start, MAX] */
+}
+
+static bool reg_state_needs_two_arcs(struct reg_state *s)
+{
+	/* an invalid state is never reported; 64-bit pair first, then 32-bit */
+	if (s->valid && needs_two_arcs(U64, s->r[U64], S64, s->r[S64]))
+		return true;
+
+	return s->valid && needs_two_arcs(U32, s->r[U32], S32, s->r[S32]);
+}
+
 /* Validate that register states match, and print details if they don't */
 static bool assert_reg_state_eq(struct reg_state *r, struct reg_state *e, const char *ctx)
 {
@@ -1524,6 +1579,11 @@ static int verify_case_op(enum num_t init_t, enum num_t cond_t,
 	    !assert_reg_state_eq(&fr2, &fe2, "false_reg2") ||
 	    !assert_reg_state_eq(&tr1, &te1, "true_reg1") ||
 	    !assert_reg_state_eq(&tr2, &te2, "true_reg2")) {
+		if (reg_state_needs_two_arcs(&fe1) || reg_state_needs_two_arcs(&fe2) ||
+		    reg_state_needs_two_arcs(&te1) || reg_state_needs_two_arcs(&te2)) {
+			test__skip();
+			return 0;
+		}
 		failed = true;
 	}
 
diff --git a/tools/testing/selftests/bpf/prog_tests/rhash.c b/tools/testing/selftests/bpf/prog_tests/rhash.c
new file mode 100644
index 000000000000..98bb66907b7f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/rhash.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <string.h>
+#include <stdio.h>
+#include "rhash.skel.h"
+#include "bpf_iter_bpf_rhash_map.skel.h"
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+
+static void rhash_run(const char *prog_name)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct bpf_program *prog;
+	struct rhash *skel;
+	int ret;
+
+	skel = rhash__open();
+	if (!ASSERT_OK_PTR(skel, "rhash__open"))
+		return;
+
+	/* autoload is switched on for the named program before the load */
+	prog = bpf_object__find_program_by_name(skel->obj, prog_name);
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto destroy;
+	bpf_program__set_autoload(prog, true);
+
+	ret = rhash__load(skel);
+	if (!ASSERT_OK(ret, "skel_load"))
+		goto destroy;
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(prog), &topts);
+	if (!ASSERT_OK(ret, "prog run"))
+		goto destroy;
+
+	/* bss->err is only looked at once the program's retval was 0 */
+	if (!ASSERT_OK(topts.retval, "prog retval"))
+		goto destroy;
+	ASSERT_OK(skel->bss->err, "bss->err");
+
+destroy:
+	rhash__destroy(skel);
+}
+
+static int rhash_map_create(__u32 max_entries, __u64 map_extra)
+{
+	LIBBPF_OPTS(bpf_map_create_opts, create_opts, .map_extra = map_extra,
+		    .map_flags = BPF_F_NO_PREALLOC);
+
+	/* u32 keys, u64 values; bpf_map_create()'s result is passed through */
+	return bpf_map_create(BPF_MAP_TYPE_RHASH, "rhash_extra", sizeof(__u32),
+			      sizeof(__u64), max_entries, &create_opts);
+}
+
+static void rhash_map_extra_presize(void)
+{
+	const __u32 nelem_hint = 256, max_entries = 1024;
+	struct bpf_map_info info = {};
+	__u32 key, info_len = sizeof(info);
+	__u64 zero = 0;
+	int map_fd, err;
+
+	/* nelem_hint is passed as map_extra and must be reported back as-is */
+	map_fd = rhash_map_create(max_entries, nelem_hint);
+	if (!ASSERT_GE(map_fd, 0, "rhash_map_create presize"))
+		return;
+
+	err = bpf_map_get_info_by_fd(map_fd, &info, &info_len);
+	if (!ASSERT_OK(err, "info"))
+		goto close_map;
+	ASSERT_EQ(info.map_extra, nelem_hint, "info.map_extra");
+
+	/* insert exactly nelem_hint distinct keys: 0 .. nelem_hint - 1 */
+	for (key = 0; key < nelem_hint; key++) {
+		err = bpf_map_update_elem(map_fd, &key, &zero, BPF_NOEXIST);
+		if (!ASSERT_OK(err, "update"))
+			goto close_map;
+	}
+close_map:
+	close(map_fd);
+}
+
+static void rhash_map_extra_too_big(void)
+{
+	int fd = rhash_map_create(1U << 20, 0x10000); /* hint is U16_MAX + 1 */
+
+	if (ASSERT_LT(fd, 0, "rhash_map_create hint > U16_MAX"))
+		return;
+	close(fd); /* creation unexpectedly succeeded: release the fd */
+}
+
+/* Walk a 64-entry rhash map with a BPF map iterator and check its totals. */
+static void rhash_iter_test(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	struct bpf_iter_bpf_rhash_map *skel;
+	int err, i, len, map_fd, iter_fd;
+	union bpf_iter_link_info linfo;
+	u32 expected_key_sum = 0, key;
+	struct bpf_link *link;
+	u64 val = 0;
+	char buf[64];
+
+	skel = bpf_iter_bpf_rhash_map__open();
+	if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_rhash_map__open"))
+		return;
+
+	err = bpf_iter_bpf_rhash_map__load(skel);
+	if (!ASSERT_OK(err, "bpf_iter_bpf_rhash_map__load"))
+		goto out;
+
+	map_fd = bpf_map__fd(skel->maps.rhashmap);
+
+	/* Populate map with keys 1..64 and remember the sum to expect */
+	for (i = 0; i < 64; i++) {
+		key = i + 1;
+		expected_key_sum += key;
+		err = bpf_map_update_elem(map_fd, &key, &val, BPF_NOEXIST);
+		if (!ASSERT_OK(err, "map_update"))
+			goto out;
+	}
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.map.map_fd = map_fd;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_rhash_map, &opts);
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
+		goto out;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (!ASSERT_GE(iter_fd, 0, "create_iter"))
+		goto free_link;
+
+	/* Drain the iterator; a negative len means read() itself failed. */
+	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
+		;
+	ASSERT_GE(len, 0, "read");
+
+	ASSERT_EQ(skel->bss->key_sum, expected_key_sum, "key_sum");
+	ASSERT_EQ(skel->bss->elem_count, 64, "elem_count");
+	close(iter_fd);
+
+free_link:
+	bpf_link__destroy(link);
+out:
+	bpf_iter_bpf_rhash_map__destroy(skel);
+}
+
+/* Entry point: runs each rhash subtest that test__start_subtest() selects. */
+void test_rhash(void)
+{
+	static const char * const prog_subtests[] = {
+		"test_rhash_lookup_update",
+		"test_rhash_update_delete",
+		"test_rhash_update_elements",
+		"test_rhash_update_exist",
+		"test_rhash_update_any",
+		"test_rhash_noexist_duplicate",
+		"test_rhash_delete_nonexistent",
+	};
+	size_t i;
+
+	/* these subtest names double as the BPF program name for rhash_run() */
+	for (i = 0; i < sizeof(prog_subtests) / sizeof(prog_subtests[0]); i++) {
+		const char *name = prog_subtests[i];
+
+		if (test__start_subtest(name))
+			rhash_run(name);
+	}
+
+	/* subtests implemented by dedicated helpers */
+	if (test__start_subtest("test_rhash_map_extra_presize"))
+		rhash_map_extra_presize();
+
+	if (test__start_subtest("test_rhash_map_extra_too_big"))
+		rhash_map_extra_too_big();
+
+	if (test__start_subtest("test_rhash_iter"))
+		rhash_iter_test();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
index 77fe1bfb7504..4e91d9b615ce 100644
--- a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
+++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
@@ -199,6 +199,83 @@ err_out:
 	bpf_link__destroy(getsockopt_link);
 }
 
+static int connect_to_v4mapped_v6_fd(int server_fd)
+{
+	struct sockaddr_in6 dst = { .sin6_family = AF_INET6 };
+	struct sockaddr_storage srv;
+	struct sockaddr_in *srv4 = (void *)&srv;
+	socklen_t srv_len = sizeof(srv);
+	int off = 0, cfd, ret;
+
+	ret = getsockname(server_fd, (struct sockaddr *)&srv, &srv_len);
+	if (!ASSERT_OK(ret, "getsockname"))
+		return -1;
+
+	cfd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (!ASSERT_GE(cfd, 0, "socket"))
+		return -1;
+
+	ret = settimeo(cfd, 0);
+	if (!ASSERT_OK(ret, "settimeo"))
+		goto fail;
+
+	/* IPV6_V6ONLY is cleared explicitly before the v4-mapped connect */
+	ret = setsockopt(cfd, IPPROTO_IPV6, IPV6_V6ONLY, &off, sizeof(off));
+	if (!ASSERT_OK(ret, "clear_v6only"))
+		goto fail;
+
+	/*
+	 * v4-mapped destination ::ffff:a.b.c.d: bytes 10-11 0xff, 12-15 the IPv4 address
+	 */
+	dst.sin6_port = srv4->sin_port;
+	memset(&dst.sin6_addr.s6_addr[10], 0xff, 2);
+	memcpy(&dst.sin6_addr.s6_addr[12], &srv4->sin_addr, sizeof(srv4->sin_addr));
+
+	ret = connect(cfd, (struct sockaddr *)&dst, sizeof(dst));
+	if (ASSERT_OK(ret, "connect"))
+		return cfd;
+
+fail:
+	close(cfd);
+	return -1;
+}
+
+static void test_v4mapped_v6_ip_tos(void)
+{
+	struct setget_sockopt__bss *bss = skel->bss;
+	int srv_fd = -1, cli_fd = -1, tos = 0, want_tos = 0x1c, err;
+	socklen_t tos_len = sizeof(tos);
+
+	/* Zero all of bss, then set the three v4mapped_v6_ip_tos_* inputs. */
+	memset(bss, 0, sizeof(*bss));
+	bss->v4mapped_v6_ip_tos_enable = 1;
+	bss->v4mapped_v6_ip_tos_ret = -1;
+	bss->v4mapped_v6_ip_tos_val = want_tos;
+
+	srv_fd = start_server(AF_INET, SOCK_STREAM, addr4_str, 0, 0);
+	if (!ASSERT_GE(srv_fd, 0, "start_server"))
+		goto done;
+
+	cli_fd = connect_to_v4mapped_v6_fd(srv_fd);
+	if (!ASSERT_GE(cli_fd, 0, "connect_to_v4mapped_v6_fd"))
+		goto done;
+
+	ASSERT_GT(bss->v4mapped_v6_ip_tos_cnt, 0, "v4mapped_v6_ip_tos_cnt");
+	ASSERT_EQ(bss->v4mapped_v6_ip_tos_ret, 0, "v4mapped_v6_ip_tos_ret");
+
+	/* Read IP_TOS back through SOL_IP on the connected client socket. */
+	err = getsockopt(cli_fd, SOL_IP, IP_TOS, &tos, &tos_len);
+	if (ASSERT_OK(err, "getsockopt_ip_tos"))
+		ASSERT_EQ(tos, want_tos, "ip_tos");
+
+done:
+	bss->v4mapped_v6_ip_tos_enable = 0;
+	if (cli_fd >= 0)
+		close(cli_fd);
+	if (srv_fd >= 0)
+		close(srv_fd);
+}
+
 void test_setget_sockopt(void)
 {
 	cg_fd = test__join_cgroup(CG_NAME);
@@ -238,6 +315,7 @@ void test_setget_sockopt(void)
 	test_ktls(AF_INET);
 	test_nonstandard_opt(AF_INET);
 	test_nonstandard_opt(AF_INET6);
+	test_v4mapped_v6_ip_tos();
 
 done:
 	setget_sockopt__destroy(skel);
diff --git a/tools/testing/selftests/bpf/prog_tests/signed_loader.c b/tools/testing/selftests/bpf/prog_tests/signed_loader.c
new file mode 100644
index 000000000000..5fc417e31fc6
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/signed_loader.c
@@ -0,0 +1,1135 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Isovalent */
+
+#include <test_progs.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/keyctl.h>
+#include <linux/bpf.h>
+
+#include "bpf/libbpf_internal.h" /* for libbpf_sha256() */
+#include "bpf/skel_internal.h"	 /* for loader ctx layout (bpf_loader_ctx etc) */
+
+#include "test_signed_loader.skel.h"
+#include "test_signed_loader_map.skel.h"
+#include "test_signed_loader_data.skel.h"
+#include "test_signed_loader_lsm.skel.h"
+
+#define SIG_MATCH_INSNS 33 /* excl (5) + 4 * sha-dword (7) */
+
+enum {	/* NOTE(review): presumably signature states; confirm against users */
+	BPF_SIG_UNSIGNED = 0,
+	BPF_SIG_VERIFIED,		/* == 1 */
+};
+
+enum {	/* NOTE(review): presumably keyring classes; confirm against users */
+	BPF_SIG_KEYRING_NONE = 0,
+	BPF_SIG_KEYRING_BUILTIN,	/* == 1 */
+	BPF_SIG_KEYRING_SECONDARY,	/* == 2 */
+	BPF_SIG_KEYRING_PLATFORM,	/* == 3 */
+	BPF_SIG_KEYRING_USER,		/* == 4 */
+};
+
+static int load_loader(const void *insns, __u32 insns_sz, int map_fd,
+		       const void *sig, __u32 sig_sz, __s32 keyring_id)
+{
+	union bpf_attr attr;
+	int prog_fd;
+
+	memset(&attr, 0, sizeof(attr));
+	memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog"));
+	attr.prog_type = BPF_PROG_TYPE_SYSCALL;
+	attr.prog_flags = BPF_F_SLEEPABLE;
+	attr.license = ptr_to_u64("Dual BSD/GPL");
+	attr.insns = ptr_to_u64(insns);
+	attr.insn_cnt = insns_sz / sizeof(struct bpf_insn);
+	attr.fd_array = ptr_to_u64(&map_fd); /* one-entry fd_array */
+	if (sig != NULL) { /* signature fields stay zeroed for an unsigned load */
+		attr.keyring_id = keyring_id;
+		attr.signature_size = sig_sz;
+		attr.signature = ptr_to_u64(sig);
+	}
+	prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr,
+			  offsetofend(union bpf_attr, keyring_id));
+	return prog_fd < 0 ? -errno : prog_fd;
+}
+
+/*
+ * Create, fill and freeze the "__loader.map" array map with @data, load the
+ * loader program @insns against it via load_loader(), and run it once on @ctx.
+ *
+ * @excl/@excl_sz become the map's excl_prog_hash. When @get_hash is set, the
+ * map info is queried with a hash buffer before the program is loaded. @sig
+ * (may be NULL) is submitted with keyring_id KEY_SPEC_SESSION_KEYRING.
+ *
+ * Returns the program's retval with *@loader_ran set to true, or a negative
+ * errno with *@loader_ran left false when any earlier step failed.
+ */
+static int run_gen_loader(const void *insns, __u32 insns_sz,
+			  const void *data, __u32 data_sz,
+			  const void *excl, __u32 excl_sz,
+			  const void *sig, __u32 sig_sz,
+			  bool get_hash, void *ctx, __u32 ctx_sz, bool *loader_ran)
+{
+	LIBBPF_OPTS(bpf_map_create_opts, mopts,
+		    .excl_prog_hash = excl,
+		    .excl_prog_hash_size = excl_sz);
+	__u8 hbuf[SHA256_DIGEST_LENGTH];
+	struct bpf_map_info info;
+	__u32 ilen = sizeof(info), key = 0;
+	union bpf_attr attr;
+	int map_fd, prog_fd, ret;
+
+	*loader_ran = false;
+
+	/* one-entry array map: 4-byte key, data_sz-byte value */
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map",
+				4, data_sz, 1, &mopts);
+	if (map_fd < 0)
+		return -errno;
+	if (bpf_map_update_elem(map_fd, &key, data, 0)) {
+		ret = -errno;
+		goto out_map;
+	}
+	if (bpf_map_freeze(map_fd)) {
+		ret = -errno;
+		goto out_map;
+	}
+	if (get_hash) {
+		memset(&info, 0, sizeof(info));
+		info.hash = ptr_to_u64(hbuf);
+		info.hash_size = sizeof(hbuf);
+		if (bpf_map_get_info_by_fd(map_fd, &info, &ilen)) {
+			ret = -errno;
+			goto out_map;
+		}
+	}
+
+	/* load_loader() builds the BPF_PROG_LOAD attr; it returns fd or -errno */
+	prog_fd = load_loader(insns, insns_sz, map_fd, sig, sig_sz,
+			      KEY_SPEC_SESSION_KEYRING);
+	if (prog_fd < 0) {
+		ret = prog_fd;
+		goto out_map;
+	}
+
+	memset(&attr, 0, sizeof(attr));
+	attr.test.prog_fd = prog_fd;
+	attr.test.ctx_in = ptr_to_u64(ctx);
+	attr.test.ctx_size_in = ctx_sz;
+	if (syscall(__NR_bpf, BPF_PROG_RUN, &attr,
+		    offsetofend(union bpf_attr, test)) < 0) {
+		ret = -errno;
+		goto out_prog;
+	}
+	*loader_ran = true;
+	ret = (int)attr.test.retval;
+out_prog:
+	close(prog_fd);
+out_map:
+	close(map_fd);
+	return ret;
+}
+
+static void close_loader_ctx_fds(void *ctx, int nr_maps, int nr_progs)
+{
+	/* ctx layout: bpf_loader_ctx header, map descs, then prog descs */
+	struct bpf_map_desc *map = (void *)((char *)ctx + sizeof(struct bpf_loader_ctx));
+	struct bpf_prog_desc *prog = (void *)(map + nr_maps);
+	int left;
+
+	for (left = nr_maps; left > 0; left--, map++)
+		if (map->map_fd > 0)
+			close(map->map_fd);
+	for (left = nr_progs; left > 0; left--, prog++)
+		if (prog->prog_fd > 0)
+			close(prog->prog_fd);
+}
+
+/* Run "./verify_sig_setup.sh <cmd> <dir>"; 0 on exit status 0, else -errno/-EINVAL. */
+static int run_setup(const char *cmd, const char *dir)
+{
+	int pid, status;
+
+	pid = fork();
+	if (pid < 0)
+		return -errno;
+	if (pid == 0) {
+		execlp("./verify_sig_setup.sh", "./verify_sig_setup.sh", cmd, dir, NULL);
+		/* exec failed: _exit() skips inherited atexit handlers and stdio flush */
+		_exit(1);
+	}
+	if (waitpid(pid, &status, 0) < 0)
+		return -errno;
+	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -EINVAL;
+}
+
+/*
+ * Sign @buf via ./sign-file with <dir>/signing_key.pem; copy the .p7s into @sig.
+ * *@sig_sz: capacity of @sig in, signature size out. Returns 0 or -errno.
+ */
+static int sign_buf(const char *dir, const void *buf, __u32 len,
+		    void *sig, __u32 *sig_sz)
+{
+	char data_tmpl[PATH_MAX], key[PATH_MAX];
+	char sigpath[PATH_MAX + sizeof(".p7s")];
+	int fd, pid, status, ret;
+	struct stat st;
+
+	ret = snprintf(data_tmpl, sizeof(data_tmpl), "%s/dataXXXXXX", dir);
+	if (ret < 0 || ret >= (int)sizeof(data_tmpl))
+		return -ENAMETOOLONG;
+	ret = 0;
+	fd = mkstemp(data_tmpl);
+	if (fd < 0)
+		return -errno;
+	if (write(fd, buf, len) != (ssize_t)len) {
+		close(fd);
+		ret = -EIO;
+		goto out;
+	}
+	close(fd);
+	pid = fork();
+	if (pid < 0) {
+		ret = -errno;
+		goto out;
+	}
+	if (pid == 0) {
+		snprintf(key, sizeof(key), "%s/signing_key.pem", dir);
+		execlp("./sign-file", "./sign-file", "-d", "sha256", key, key, data_tmpl, NULL);
+		_exit(1); /* exec failed: skip inherited atexit handlers/stdio flush */
+	}
+	if (waitpid(pid, &status, 0) < 0 || !WIFEXITED(status) || WEXITSTATUS(status)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	snprintf(sigpath, sizeof(sigpath), "%s.p7s", data_tmpl);
+	if (stat(sigpath, &st) < 0) {
+		ret = -errno;
+		goto out;
+	}
+	if (st.st_size > (off_t)*sig_sz) {
+		ret = -E2BIG;
+		goto out_sig;
+	}
+	fd = open(sigpath, O_RDONLY);
+	if (fd < 0) {
+		ret = -errno;
+		goto out_sig;
+	}
+	if (read(fd, sig, st.st_size) != st.st_size) {
+		close(fd);
+		ret = -EIO;
+		goto out_sig;
+	}
+	close(fd);
+	*sig_sz = st.st_size;
+out_sig:
+	unlink(sigpath);
+out:
+	unlink(data_tmpl);
+	return ret;
+}
+
+static void check_sig_match_shape(const struct bpf_insn *in, int n)
+{
+	int a = -1, cleanup = -1, i, base, t, br[5], nb = 0;
+
+	/* BPF_PSEUDO_MAP_IDX (the struct bpf_map * form) is used only here. */
+	for (i = 0; i + 1 < n; i++) {
+		if (in[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+		    in[i].src_reg == BPF_PSEUDO_MAP_IDX) {
+			a = i; /* first insn of the signature-match block */
+			break;
+		}
+	}
+	if (!ASSERT_GE(a, 0, "emit_signature_match present"))
+		return;
+	if (!ASSERT_LE(a + SIG_MATCH_INSNS, n, "block fits in program"))
+		return;
+
+	/* excl check: r2 = *(u32 *)(map + 32); if r2 != 1 goto cleanup */
+	ASSERT_EQ(in[a + 2].code, (BPF_LDX | BPF_MEM | BPF_W), "excl load width");
+	ASSERT_EQ(in[a + 2].off, SHA256_DIGEST_LENGTH, "excl field offset");
+	ASSERT_EQ(in[a + 4].code, (BPF_JMP | BPF_JNE | BPF_K), "excl branch op");
+	ASSERT_EQ(in[a + 4].imm, 1, "excl compared to 1");
+	br[nb++] = a + 4; /* remembered so its jump target is checked below */
+
+	/* 4 sha-dword checks: r2 = *(u64 *)(map + i*8); if r2 != r3 goto cleanup */
+	for (i = 0; i < 4; i++) {
+		base = a + 5 + i * 7; /* 5 excl insns, then 7 insns per sha dword */
+		ASSERT_EQ(in[base + 2].code, (BPF_LDX | BPF_MEM | BPF_DW), "sha load width");
+		ASSERT_EQ(in[base + 2].off, i * 8, "sha dword offset");
+		ASSERT_EQ(in[base + 3].code, (BPF_LD | BPF_IMM | BPF_DW), "sha imm64 (H_meta)");
+		ASSERT_EQ(in[base + 6].code, (BPF_JMP | BPF_JNE | BPF_X), "sha branch op");
+		br[nb++] = base + 6; /* remembered so its jump target is checked below */
+	}
+
+	/*
+	 * Locate the real cleanup label so we can pin the exact jump target,
+	 * not just "some backward label". bpf_gen__init() emits the cleanup
+	 * block as a prog-fd close loop whose first instruction is the label
+	 * every error branch jumps to.
+	 */
+	for (i = 0; i + 2 < a; i++) {
+		if (in[i].code == (BPF_LDX | BPF_MEM | BPF_W) &&
+		    in[i].dst_reg == BPF_REG_1 && in[i].src_reg == BPF_REG_10 &&
+		    in[i + 1].code == (BPF_JMP | BPF_JSLE | BPF_K) &&
+		    in[i + 1].dst_reg == BPF_REG_1 && in[i + 1].imm == 0 &&
+		    in[i + 1].off == 1 &&
+		    in[i + 2].code == (BPF_JMP | BPF_CALL) &&
+		    in[i + 2].imm == BPF_FUNC_sys_close) {
+			cleanup = i;
+			break;
+		}
+	}
+	if (!ASSERT_GE(cleanup, 0, "cleanup label located"))
+		return;
+	for (i = 0; i < nb; i++) {
+		t = br[i] + 1 + in[br[i]].off; /* off is relative to the next insn */
+		ASSERT_EQ(t, cleanup, "sig-match lands on cleanup");
+	}
+	/*
+	 * Same invariant for every other cleanup-bound jump in the program:
+	 * emit_check_err() is the only source of "if (r7 < 0) goto cleanup",
+	 * so each of those must also resolve exactly to cleanup.
+	 */
+	for (i = 0, t = 0; i < n; i++) { /* t is reused as a match counter */
+		if (in[i].code != (BPF_JMP | BPF_JSLT | BPF_K) ||
+		    in[i].dst_reg != BPF_REG_7 || in[i].imm != 0 || in[i].off >= 0)
+			continue;
+		ASSERT_EQ(i + 1 + in[i].off, cleanup, "err-check lands on cleanup");
+		t++;
+	}
+	ASSERT_GT(t, 0, "found emit_check_err jumps");
+}
+
+struct gen_loader_fixture {
+	struct test_signed_loader *skel;	/* from test_signed_loader__open() */
+	struct gen_loader_opts gopts;		/* copied after bpf_object__load() */
+	unsigned char *blob;			/* malloc'ed copy of gopts.data */
+	void *ctx;				/* calloc'ed bpf_loader_ctx + map/prog descs */
+	__u32 data_sz;				/* size of blob (gopts.data_sz) */
+	__u32 ctx_sz;				/* size of ctx, also stored in its ->sz */
+	int nr_maps;				/* maps counted in skel->obj */
+	int nr_progs;				/* programs counted in skel->obj */
+	__u8 excl[SHA256_DIGEST_LENGTH];	/* libbpf_sha256() of gopts.insns */
+};
+
+static int gen_loader_fixture_init(struct gen_loader_fixture *f)
+{
+	LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true);
+	struct bpf_object *obj;
+	struct bpf_program *prog;
+	struct bpf_map *map;
+
+	memset(f, 0, sizeof(*f));
+	f->skel = test_signed_loader__open();
+	if (!ASSERT_OK_PTR(f->skel, "skel_open"))
+		return -1;
+	obj = f->skel->obj;
+	if (!ASSERT_OK(bpf_object__gen_loader(obj, &gopts), "gen_loader") ||
+	    !ASSERT_OK(bpf_object__load(obj), "gen_load"))
+		return -1;
+	f->gopts = gopts;
+
+	/* ctx = bpf_loader_ctx header + one desc per map + one desc per program */
+	bpf_object__for_each_map(map, obj)
+		f->nr_maps++;
+	bpf_object__for_each_program(prog, obj)
+		f->nr_progs++;
+	f->ctx_sz = sizeof(struct bpf_loader_ctx) +
+		    f->nr_maps * sizeof(struct bpf_map_desc) +
+		    f->nr_progs * sizeof(struct bpf_prog_desc);
+	f->ctx = calloc(1, f->ctx_sz);
+	if (!ASSERT_OK_PTR(f->ctx, "ctx_alloc"))
+		return -1;
+	((struct bpf_loader_ctx *)f->ctx)->sz = f->ctx_sz;
+
+	/* heap copy of the generated data blob */
+	f->data_sz = gopts.data_sz;
+	f->blob = malloc(f->data_sz);
+	if (!ASSERT_OK_PTR(f->blob, "blob_alloc"))
+		return -1;
+	memcpy(f->blob, gopts.data, f->data_sz);
+
+	/* excl_prog_hash = SHA256(loader insns) == the loader's prog->digest. */
+	libbpf_sha256(gopts.insns, gopts.insns_sz, f->excl);
+	return 0;
+}
+
+static void gen_loader_fixture_fini(struct gen_loader_fixture *f)
+{
+	if (f->ctx != NULL) /* close the fds recorded in ctx before freeing it */
+		close_loader_ctx_fds(f->ctx, f->nr_maps, f->nr_progs);
+	free(f->ctx);
+	free(f->blob);
+	test_signed_loader__destroy(f->skel);
+}
+
+static void metadata_check_shape(void)
+{
+	struct gen_loader_fixture fx;
+
+	if (!gen_loader_fixture_init(&fx)) /* 0 means the fixture is usable */
+		check_sig_match_shape((const struct bpf_insn *)fx.gopts.insns,
+				      fx.gopts.insns_sz / sizeof(struct bpf_insn));
+	gen_loader_fixture_fini(&fx); /* also runs after a failed init */
+}
+
+static void metadata_match(void)
+{
+	struct gen_loader_fixture fx;
+	bool loader_ran;
+	int retval;
+
+	if (!gen_loader_fixture_init(&fx)) {
+		retval = run_gen_loader(fx.gopts.insns, fx.gopts.insns_sz, fx.blob,
+					fx.data_sz, fx.excl, sizeof(fx.excl), NULL, 0,
+					true, fx.ctx, fx.ctx_sz, &loader_ran);
+		ASSERT_TRUE(loader_ran, "loader ran");
+		ASSERT_EQ(retval, 0, "honest loader retval"); /* untampered: success */
+	}
+	gen_loader_fixture_fini(&fx); /* runs even when init failed part-way */
+}
+
+static void metadata_sha_mismatch(void)
+{
+	struct gen_loader_fixture fx;
+	bool loader_ran;
+	int retval;
+
+	if (gen_loader_fixture_init(&fx) != 0)
+		goto fini;
+	/*
+	 * Corrupt only blob[0]. It sits in the loader's fd_array scratch (first
+	 * add_data in bpf_gen__init) and a 0-map program never reads it, so the
+	 * flip changes nothing but map->sha: emit_signature_match alone can object.
+	 */
+	fx.blob[0] ^= 0xff;
+	retval = run_gen_loader(fx.gopts.insns, fx.gopts.insns_sz, fx.blob,
+				fx.data_sz, fx.excl, sizeof(fx.excl), NULL, 0,
+				true, fx.ctx, fx.ctx_sz, &loader_ran);
+	ASSERT_TRUE(loader_ran, "loader ran");
+	ASSERT_EQ(retval, -EINVAL, "tampered blob rejected by emit_signature_match");
+fini:
+	gen_loader_fixture_fini(&fx);
+}
+
+static void metadata_not_exclusive(void)
+{
+	struct gen_loader_fixture fx;
+	bool loader_ran;
+	int retval;
+
+	if (gen_loader_fixture_init(&fx) != 0)
+		goto fini;
+	/*
+	 * Correct blob, non-exclusive metadata map: the verifier does not reject
+	 * it (excl_prog_sha unset), so the loader's runtime map->excl == 1 check must.
+	 */
+	retval = run_gen_loader(fx.gopts.insns, fx.gopts.insns_sz, fx.blob,
+				fx.data_sz, NULL, 0, NULL, 0, true, fx.ctx,
+				fx.ctx_sz, &loader_ran);
+	ASSERT_TRUE(loader_ran, "loader ran");
+	ASSERT_EQ(retval, -EINVAL, "non-exclusive metadata map rejected");
+fini:
+	gen_loader_fixture_fini(&fx);
+}
+
+static void metadata_hash_not_computed(void)
+{
+	struct gen_loader_fixture fx;
+	bool loader_ran;
+	int retval;
+
+	if (gen_loader_fixture_init(&fx) != 0)
+		goto fini;
+	/*
+	 * Correct, exclusive, frozen map whose hash was never computed (no
+	 * OBJ_GET_INFO_BY_FD): map->sha stays zero and the loader must fail closed.
+	 */
+	retval = run_gen_loader(fx.gopts.insns, fx.gopts.insns_sz, fx.blob,
+				fx.data_sz, fx.excl, sizeof(fx.excl), NULL, 0,
+				false, fx.ctx, fx.ctx_sz, &loader_ran);
+	ASSERT_TRUE(loader_ran, "loader ran");
+	ASSERT_EQ(retval, -EINVAL, "uncomputed metadata hash rejected");
+fini:
+	gen_loader_fixture_fini(&fx);
+}
+
+static void signature_enforced(void)
+{
+	static const __u8 junk[64] = { 0x30, 0x42, 0x13, 0x37, };
+	struct gen_loader_fixture f;
+	int fd;
+
+	if (gen_loader_fixture_init(&f) == 0) {
+		/*
+		 * A present-but-invalid signature (junk bytes, not PKCS#7) must be
+		 * rejected at load: the signature path is honored, not ignored. (The
+		 * valid path is covered by the signed lskels.) Close any stray fd.
+		 */
+		fd = load_loader(f.gopts.insns, f.gopts.insns_sz, -1, junk,
+				 sizeof(junk), KEY_SPEC_SESSION_KEYRING);
+		if (!ASSERT_LT(fd, 0, "invalid signature rejected at load"))
+			close(fd);
+	}
+	gen_loader_fixture_fini(&f);
+}
+
+static void signature_too_large(void)
+{
+	static const __u8 junk[64] = {};
+	struct gen_loader_fixture f;
+	int fd;
+
+	if (gen_loader_fixture_init(&f) == 0) {
+		/*
+		 * signature_size above KMALLOC_MAX_CACHE_SIZE is rejected before junk is read.
+		 */
+		fd = load_loader(f.gopts.insns, f.gopts.insns_sz, -1, junk,
+				 64 << 20, KEY_SPEC_SESSION_KEYRING);
+		if (!ASSERT_EQ(fd, -EINVAL, "oversized signature rejected") && fd >= 0)
+			close(fd); /* load wrongly succeeded: do not leak the prog fd */
+	}
+	gen_loader_fixture_fini(&f);
+}
+
+static void signature_bad_keyring(void)
+{
+	static const __u8 junk[64] = {};
+	struct gen_loader_fixture f;
+	int fd;
+
+	if (gen_loader_fixture_init(&f) == 0) {
+		/*
+		 * A keyring_id resolving to no key is rejected up front (-EINVAL), before
+		 * bpf_prog_verify_signature() reads the signature bytes. A large positive
+		 * serial takes the user-keyring path and won't exist.
+		 */
+		fd = load_loader(f.gopts.insns, f.gopts.insns_sz, -1, junk,
+				 sizeof(junk), INT_MAX);
+		ASSERT_EQ(fd, -EINVAL, "signature with bad keyring_id rejected");
+		if (fd >= 0) /* load wrongly succeeded: do not leak the prog fd */
+			close(fd);
+	}
+	gen_loader_fixture_fini(&f);
+}
+
+/*
+ * A signed loader must ignore ctx-supplied map dimensions: the host cannot
+ * resize a signed program's maps via the loader ctx. Drive a one-map program
+ * through gen_loader, ask (via ctx) for every map to be resized to a bogus
+ * value, and confirm the created maps keep their attested size.
+ */
+#define GATING_BOGUS_MAX 0x4000
+
+static void metadata_ctx_max_entries_ignored(void)
+{
+	LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true);
+	int nr_maps = 0, nr_progs = 0, inspected = 0, idx, retval;
+	struct test_signed_loader_map *skel;
+	__u8 excl[SHA256_DIGEST_LENGTH];
+	struct bpf_map_desc *descs;
+	struct bpf_program *prog;
+	struct bpf_map *map;
+	__u32 ctx_sz, data_sz;
+	unsigned char *blob;
+	bool loader_ran;
+	void *ctx;
+
+	skel = test_signed_loader_map__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+	if (!ASSERT_OK(bpf_object__gen_loader(skel->obj, &gopts), "gen_loader") ||
+	    !ASSERT_OK(bpf_object__load(skel->obj), "gen_load"))
+		goto destroy;
+
+	bpf_object__for_each_map(map, skel->obj)
+		nr_maps++;
+	bpf_object__for_each_program(prog, skel->obj)
+		nr_progs++;
+	ctx_sz = sizeof(struct bpf_loader_ctx) +
+		 nr_maps * sizeof(struct bpf_map_desc) +
+		 nr_progs * sizeof(struct bpf_prog_desc);
+	ctx = calloc(1, ctx_sz);
+	if (!ASSERT_OK_PTR(ctx, "ctx_alloc"))
+		goto destroy;
+	((struct bpf_loader_ctx *)ctx)->sz = ctx_sz;
+
+	/* the map descriptors follow the bpf_loader_ctx header; poison them all */
+	descs = (void *)((char *)ctx + sizeof(struct bpf_loader_ctx));
+	for (idx = 0; idx < nr_maps; idx++)
+		descs[idx].max_entries = GATING_BOGUS_MAX;
+
+	libbpf_sha256(gopts.insns, gopts.insns_sz, excl);
+	data_sz = gopts.data_sz;
+	blob = malloc(data_sz);
+	if (!ASSERT_OK_PTR(blob, "blob_alloc"))
+		goto free_ctx;
+	memcpy(blob, gopts.data, data_sz);
+
+	retval = run_gen_loader(gopts.insns, gopts.insns_sz, blob, data_sz, excl,
+				sizeof(excl), NULL, 0, true, ctx, ctx_sz, &loader_ran);
+	if (!ASSERT_TRUE(loader_ran, "loader ran") ||
+	    !ASSERT_EQ(retval, 0, "loader retval"))
+		goto free_blob;
+
+	for (idx = 0; idx < nr_maps; idx++) {
+		struct bpf_map_info info;
+		__u32 info_len = sizeof(info);
+		int map_fd = descs[idx].map_fd;
+
+		if (map_fd <= 0)
+			continue;
+		memset(&info, 0, sizeof(info));
+		if (!ASSERT_OK(bpf_map_get_info_by_fd(map_fd, &info, &info_len), "map_info"))
+			continue;
+		ASSERT_NEQ(info.max_entries, GATING_BOGUS_MAX,
+			   "ctx max_entries ignored for signed loader");
+		inspected++;
+	}
+	ASSERT_GT(inspected, 0, "inspected a created map");
+
+free_blob:
+	free(blob);
+free_ctx:
+	close_loader_ctx_fds(ctx, nr_maps, nr_progs);
+	free(ctx);
+destroy:
+	test_signed_loader_map__destroy(skel);
+}
+
+/*
+ * A signed loader must also ignore ctx-supplied initial_value: the host cannot
+ * re-seed a signed program's map contents through the loader ctx. Drive a
+ * program with one initialized global (a .data map) through gen_loader, point
+ * every map's ctx initial_value at an adversarial buffer, and confirm the
+ * created map still holds the attested value, never the ctx bytes.
+ */
+#define DATA_MAGIC 0x5eed1234abad1deaULL
+
+static void metadata_ctx_initial_value_ignored(void)
+{
+	LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true);
+	struct test_signed_loader_data *skel;
+	__u8 excl[SHA256_DIGEST_LENGTH], evil[64];
+	int nr_maps = 0, nr_progs = 0, i, found = 0, r;
+	struct bpf_program *p;
+	struct bpf_map *m;
+	struct bpf_map_desc *md;
+	unsigned char *blob;
+	__u32 ctx_sz, data_sz;
+	void *ctx;
+	bool ran;
+
+	skel = test_signed_loader_data__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+	if (!ASSERT_OK(bpf_object__gen_loader(skel->obj, &gopts), "gen_loader"))
+		goto destroy;
+	if (!ASSERT_OK(bpf_object__load(skel->obj), "gen_load"))
+		goto destroy;
+
+	bpf_object__for_each_program(p, skel->obj)
+		nr_progs++;
+	bpf_object__for_each_map(m, skel->obj)
+		nr_maps++;
+	ctx_sz = sizeof(struct bpf_loader_ctx) +	/* header + one desc per map and per prog */
+		 nr_maps * sizeof(struct bpf_map_desc) +
+		 nr_progs * sizeof(struct bpf_prog_desc);
+	ctx = calloc(1, ctx_sz);
+	if (!ASSERT_OK_PTR(ctx, "ctx_alloc"))
+		goto destroy;
+	((struct bpf_loader_ctx *)ctx)->sz = ctx_sz;
+
+	memset(evil, 0xAA, sizeof(evil));	/* recognizable fill, checked for below */
+	md = (struct bpf_map_desc *)((char *)ctx + sizeof(struct bpf_loader_ctx));
+	for (i = 0; i < nr_maps; i++)	/* md[] begins right after the ctx header */
+		md[i].initial_value = ptr_to_u64(evil);	/* aim every map slot at evil[] */
+
+	libbpf_sha256(gopts.insns, gopts.insns_sz, excl);	/* excl = SHA-256 of the loader insns */
+	data_sz = gopts.data_sz;
+	blob = malloc(data_sz);
+	if (!ASSERT_OK_PTR(blob, "blob_alloc"))
+		goto free_ctx;
+	memcpy(blob, gopts.data, data_sz);	/* private copy of the generated data blob */
+
+	r = run_gen_loader(gopts.insns, gopts.insns_sz, blob, data_sz,
+			   excl, sizeof(excl), NULL, 0, true, ctx, ctx_sz, &ran);
+	if (!ASSERT_TRUE(ran, "loader ran") ||
+	    !ASSERT_EQ(r, 0, "loader retval"))
+		goto free_blob;
+
+	for (i = 0; i < nr_maps; i++) {
+		struct bpf_map_info info;
+		__u32 ilen = sizeof(info), key = 0;
+		__u8 value[64] = {};
+		__u64 got;
+		int fd = md[i].map_fd;
+
+		if (fd <= 0)	/* slot still zero from calloc(): nothing to inspect */
+			continue;
+		memset(&info, 0, sizeof(info));
+		if (!ASSERT_OK(bpf_map_get_info_by_fd(fd, &info, &ilen), "map_info"))
+			continue;
+		if (info.value_size <= sizeof(value) &&	/* only maps whose value fits value[] */
+		    bpf_map_lookup_elem(fd, &key, value) == 0) {
+			memcpy(&got, value, sizeof(got));	/* leading 8 bytes of element 0 */
+			/* attested .data survives; ctx bytes (0xAA..) ignored */
+			if (got == DATA_MAGIC)
+				found = 1;
+			ASSERT_NEQ(got, 0xAAAAAAAAAAAAAAAAULL,
+				   "ctx initial_value ignored for signed loader");
+		}
+	}
+	ASSERT_EQ(found, 1, "attested .data value preserved");	/* at least one map held DATA_MAGIC */
+
+free_blob:
+	free(blob);
+free_ctx:
+	close_loader_ctx_fds(ctx, nr_maps, nr_progs);
+	free(ctx);
+destroy:
+	test_signed_loader_data__destroy(skel);
+}
+
+/*
+ * The load-time signature must authenticate the loader instructions: a valid
+ * signature loads, and the very same signature over one-byte-tampered insns is
+ * rejected. Uses ./verify_sig_setup.sh + ./sign-file at runtime, like
+ * verify_pkcs7_sig, and verifies against the session keyring the key was added
+ * to. (signature_enforced/_too_large only cover a malformed signature.)
+ */
+static void signature_authenticates_insns(void)
+{
+	LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true);
+	char dir_tmpl[] = "/tmp/signed_loaderXXXXXX", *dir;
+	struct test_signed_loader *skel = NULL;
+	__u8 excl[SHA256_DIGEST_LENGTH], sig[8192];
+	__u32 sig_sz = sizeof(sig), insns_sz, data_sz, ctx_sz;
+	unsigned char *insns = NULL, *tampered = NULL, *blob = NULL;
+	int nr_maps = 0, nr_progs = 0, r;
+	struct bpf_program *p;
+	struct bpf_map *m;
+	void *ctx = NULL;
+	bool ran;
+
+	syscall(__NR_request_key, "keyring", "_uid.0", NULL,
+		KEY_SPEC_SESSION_KEYRING);	/* return value is not checked */
+	dir = mkdtemp(dir_tmpl);
+	if (!ASSERT_OK_PTR(dir, "mkdtemp"))
+		return;
+	if (!ASSERT_OK(run_setup("setup", dir), "verify_sig_setup")) {
+		rmdir(dir);
+		return;
+	}
+
+	skel = test_signed_loader__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		goto cleanup;
+	if (!ASSERT_OK(bpf_object__gen_loader(skel->obj, &gopts), "gen_loader"))
+		goto cleanup;
+	if (!ASSERT_OK(bpf_object__load(skel->obj), "gen_load"))
+		goto cleanup;
+
+	bpf_object__for_each_program(p, skel->obj)
+		nr_progs++;
+	bpf_object__for_each_map(m, skel->obj)
+		nr_maps++;
+	ctx_sz = sizeof(struct bpf_loader_ctx) +	/* header + one desc per map and per prog */
+		 nr_maps * sizeof(struct bpf_map_desc) +
+		 nr_progs * sizeof(struct bpf_prog_desc);
+	insns_sz = gopts.insns_sz;
+	data_sz = gopts.data_sz;
+	ctx = calloc(1, ctx_sz);
+	insns = malloc(insns_sz);
+	tampered = malloc(insns_sz);
+	blob = malloc(data_sz);
+	if (!ASSERT_OK_PTR(ctx, "ctx") ||	/* all four checked together; cleanup frees whatever exists */
+	    !ASSERT_OK_PTR(insns, "insns") ||
+	    !ASSERT_OK_PTR(tampered, "tampered") ||
+	    !ASSERT_OK_PTR(blob, "blob"))
+		goto cleanup;
+	memcpy(insns, gopts.insns, insns_sz);
+	memcpy(blob, gopts.data, data_sz);
+	libbpf_sha256(insns, insns_sz, excl);	/* excl = SHA-256 of the pristine insns */
+
+	if (!ASSERT_OK(sign_buf(dir, insns, insns_sz, sig, &sig_sz), "sign-file"))
+		goto cleanup;
+
+	memset(ctx, 0, ctx_sz);	/* phase 1: pristine insns with their own signature */
+	((struct bpf_loader_ctx *)ctx)->sz = ctx_sz;
+	r = run_gen_loader(insns, insns_sz, blob, data_sz, excl, sizeof(excl),
+			   sig, sig_sz, true, ctx, ctx_sz, &ran);
+	ASSERT_TRUE(ran, "valid signature: loader loaded and ran");
+	ASSERT_EQ(r, 0, "valid signature accepted");
+	close_loader_ctx_fds(ctx, nr_maps, nr_progs);
+
+	memcpy(tampered, insns, insns_sz);
+	tampered[insns_sz / 2] ^= 0xff;	/* invert one byte in the middle of the insns */
+	memset(ctx, 0, ctx_sz);	/* phase 2: fresh ctx; excl and sig still derive from the pristine insns */
+	((struct bpf_loader_ctx *)ctx)->sz = ctx_sz;
+	r = run_gen_loader(tampered, insns_sz, blob, data_sz, excl, sizeof(excl),
+			   sig, sig_sz, true, ctx, ctx_sz, &ran);
+	ASSERT_FALSE(ran, "tampered loader rejected before run");
+	ASSERT_EQ(r, -EKEYREJECTED, "signature is bound to the instructions");
+cleanup:
+	free(insns);
+	free(tampered);
+	free(blob);
+	free(ctx);
+	test_signed_loader__destroy(skel);
+	run_setup("cleanup", dir);
+}
+
+static int make_excl_map(__u32 flags, __u32 value_size)
+{
+	__u8 digest[SHA256_DIGEST_LENGTH] = { 1 };	/* arbitrary 32-byte value */
+	LIBBPF_OPTS(bpf_map_create_opts, mopts,
+		    .excl_prog_hash = digest,
+		    .excl_prog_hash_size = sizeof(digest),
+		    .map_flags = flags);
+
+	return bpf_map_create(BPF_MAP_TYPE_ARRAY, "md", 4, value_size, 1, &mopts);
+}
+
+static void hash_requires_frozen(void)
+{
+	__u8 digest[SHA256_DIGEST_LENGTH], zeros[64] = {};
+	__u32 info_len = sizeof(struct bpf_map_info), slot = 0;
+	struct bpf_map_info info;
+	int map_fd, err;
+
+	map_fd = make_excl_map(0, sizeof(zeros));
+	if (!ASSERT_OK_FD(map_fd, "excl_map"))
+		return;
+	ASSERT_OK(bpf_map_update_elem(map_fd, &slot, zeros, 0), "update");
+
+	/* Ask for the content hash while the map is still unfrozen. */
+	memset(&info, 0, sizeof(info));
+	info.hash_size = sizeof(digest);
+	info.hash = ptr_to_u64(digest);
+	err = bpf_map_get_info_by_fd(map_fd, &info, &info_len);
+	ASSERT_EQ(err, -EPERM, "hash of unfrozen map rejected");
+	close(map_fd);
+}
+
+static void no_update_after_freeze(void)
+{
+	__u8 zeros[64] = {};
+	__u32 slot = 0;
+	int err, map_fd = make_excl_map(0, sizeof(zeros));
+
+	if (!ASSERT_OK_FD(map_fd, "excl_map"))
+		return;
+	ASSERT_OK(bpf_map_update_elem(map_fd, &slot, zeros, 0), "update");
+	ASSERT_OK(bpf_map_freeze(map_fd), "freeze");
+	/* Same write as before the freeze; this time it has to bounce. */
+	err = bpf_map_update_elem(map_fd, &slot, zeros, 0);
+	ASSERT_EQ(err, -EPERM, "update after freeze rejected");
+	close(map_fd);
+}
+
+static void freeze_writable_mmap(void)
+{
+	const size_t sz = 4096;
+	int map_fd = make_excl_map(BPF_F_MMAPABLE, sz);
+	void *rw;
+
+	if (!ASSERT_OK_FD(map_fd, "excl_mmapable_map"))
+		return;
+	rw = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
+	if (!ASSERT_OK_PTR(rw, "writable_mmap"))
+		goto done;
+	ASSERT_EQ(bpf_map_freeze(map_fd), -EBUSY, "freeze rejected while writable mmap held");
+	munmap(rw, sz);
+done:
+	close(map_fd);
+}
+
+static void no_writable_mmap_frozen(void)
+{
+	const size_t sz = 4096;
+	int map_fd = make_excl_map(BPF_F_MMAPABLE, sz);
+	void *rw;
+
+	if (!ASSERT_OK_FD(map_fd, "excl_mmapable_map"))
+		return;
+	ASSERT_OK(bpf_map_freeze(map_fd), "freeze");
+	rw = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
+	/* A failed assertion here means the mapping exists and must be undone. */
+	if (!ASSERT_EQ(rw, MAP_FAILED, "writable mmap of frozen map rejected"))
+		munmap(rw, sz);
+	close(map_fd);
+}
+
+static void map_hash_matches_libbpf(void)
+{
+	__u8 kdig[SHA256_DIGEST_LENGTH], ldig[SHA256_DIGEST_LENGTH], pattern[64] = {};
+	__u32 info_len = sizeof(struct bpf_map_info), slot = 0;
+	struct bpf_map_info info;
+	int map_fd, n;
+
+	/*
+	 * gen_loader bakes libbpf_sha256(blob) into the loader while the kernel
+	 * recomputes the digest through array_map_get_hash; the signing scheme
+	 * only holds if both produce identical output for identical bytes.
+	 */
+	for (n = 0; n < (int)sizeof(pattern); n++)
+		pattern[n] = n * 7 + 1;
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "h", 4, sizeof(pattern), 1, NULL);
+	if (!ASSERT_OK_FD(map_fd, "array_map"))
+		return;
+	ASSERT_OK(bpf_map_update_elem(map_fd, &slot, pattern, 0), "update");
+	ASSERT_OK(bpf_map_freeze(map_fd), "freeze");
+	memset(&info, 0, sizeof(info));
+	info.hash_size = sizeof(kdig);
+	info.hash = ptr_to_u64(kdig);
+	if (!ASSERT_OK(bpf_map_get_info_by_fd(map_fd, &info, &info_len), "get_hash"))
+		goto done;
+	libbpf_sha256(pattern, sizeof(pattern), ldig);
+	ASSERT_EQ(memcmp(kdig, ldig, sizeof(kdig)), 0,
+		  "kernel map hash matches libbpf_sha256");
+done:
+	close(map_fd);
+}
+
+static void map_hash_multi_element(void)
+{
+	const __u32 nr = 8, value_size = 64;
+	__u8 kdig[SHA256_DIGEST_LENGTH], ldig[SHA256_DIGEST_LENGTH];
+	__u32 info_len = sizeof(struct bpf_map_info), idx, off;
+	struct bpf_map_info info;
+	__u8 *image;
+	int map_fd;
+
+	/*
+	 * The kernel side (array_map_get_hash()) digests the entire value
+	 * area, elem_size * max_entries, rather than only element 0. An
+	 * 8-aligned value_size leaves elem_size unpadded, so the digest of a
+	 * multi-entry array must match libbpf_sha256() over all elements
+	 * laid out back to back.
+	 */
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "h", 4, value_size, nr, NULL);
+	if (!ASSERT_OK_FD(map_fd, "array_map"))
+		return;
+	image = calloc(nr, value_size);
+	if (!ASSERT_OK_PTR(image, "buf"))
+		goto close_fd;
+	for (idx = 0; idx < nr; idx++) {
+		__u8 *elem = &image[idx * value_size];
+
+		for (off = 0; off < value_size; off++)
+			elem[off] = idx * 31 + off * 7 + 1;
+		ASSERT_OK(bpf_map_update_elem(map_fd, &idx, elem, 0), "update");
+	}
+	ASSERT_OK(bpf_map_freeze(map_fd), "freeze");
+	memset(&info, 0, sizeof(info));
+	info.hash_size = sizeof(kdig);
+	info.hash = ptr_to_u64(kdig);
+	if (ASSERT_OK(bpf_map_get_info_by_fd(map_fd, &info, &info_len), "get_hash")) {
+		libbpf_sha256(image, (size_t)nr * value_size, ldig);
+		ASSERT_EQ(memcmp(kdig, ldig, sizeof(kdig)), 0,
+			  "kernel hash covers full multi-element value area");
+	}
+	free(image);
+close_fd:
+	close(map_fd);
+}
+
+static void map_hash_bad_size(void)
+{
+	__u8 kdig[SHA256_DIGEST_LENGTH], zeros[64] = {};
+	__u32 info_len = sizeof(struct bpf_map_info), slot = 0;
+	struct bpf_map_info info;
+	int map_fd, err;
+
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "h", 4, sizeof(zeros), 1, NULL);
+	if (!ASSERT_OK_FD(map_fd, "array_map"))
+		return;
+	ASSERT_OK(bpf_map_update_elem(map_fd, &slot, zeros, 0), "update");
+	ASSERT_OK(bpf_map_freeze(map_fd), "freeze");
+	/* Frozen array and a valid buffer, but hash_size is half a digest. */
+	memset(&info, 0, sizeof(info));
+	info.hash_size = sizeof(kdig) / 2;
+	info.hash = ptr_to_u64(kdig);
+	err = bpf_map_get_info_by_fd(map_fd, &info, &info_len);
+	ASSERT_EQ(err, -EINVAL, "wrong hash_size rejected");
+	close(map_fd);
+}
+
+static void map_hash_unsupported_type(void)
+{
+	__u32 info_len = sizeof(struct bpf_map_info);
+	__u8 kdig[SHA256_DIGEST_LENGTH];
+	struct bpf_map_info info;
+	int map_fd, err;
+
+	/* map_get_hash exists for arrays only, so a hash map has to be refused. */
+	map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, "h", 4, 8, 4, NULL);
+	if (!ASSERT_OK_FD(map_fd, "hash_map"))
+		return;
+	memset(&info, 0, sizeof(info));
+	info.hash_size = sizeof(kdig);
+	info.hash = ptr_to_u64(kdig);
+
+	err = bpf_map_get_info_by_fd(map_fd, &info, &info_len);
+	ASSERT_EQ(err, -EINVAL, "hash unsupported for non-array map");
+	close(map_fd);
+}
+
+static int setup_meta_map(const struct gen_loader_fixture *f)
+{
+	LIBBPF_OPTS(bpf_map_create_opts, mopts, .excl_prog_hash = f->excl,
+		    .excl_prog_hash_size = sizeof(f->excl));
+	__u32 key = 0;
+	int fd, err;
+
+	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4,
+			    f->data_sz, 1, &mopts);
+	if (fd < 0)
+		return -errno;	/* creation failed: report as negative errno */
+	if (bpf_map_update_elem(fd, &key, f->blob, 0) || bpf_map_freeze(fd)) {
+		err = -errno;	/* latch the cause before close() can overwrite errno */
+		close(fd);
+		return err;
+	}
+	return fd;	/* one-element array holding f->blob, frozen, exclusive to f->excl */
+}
+
+static void lsm_signature_verdict(void)
+{
+	char dir_tmpl[] = "/tmp/signed_loader_lsmXXXXXX", *dir = NULL;
+	struct test_signed_loader_lsm *lsm = NULL;
+	int map_fd = -1, prog_fd = -1;
+	bool have_fixture = false;
+	struct gen_loader_fixture f;
+	__u32 sig_sz = 8192;	/* capacity of sig[] declared below */
+	__s32 ses_serial;
+	__u8 sig[8192];
+
+	lsm = test_signed_loader_lsm__open_and_load();
+	if (!ASSERT_OK_PTR(lsm, "lsm_skel_load"))
+		return;
+	lsm->bss->monitored_tid = sys_gettid();	/* NOTE(review): presumably scopes the hook to this thread */
+	if (!ASSERT_OK(test_signed_loader_lsm__attach(lsm), "lsm_attach"))
+		goto out;
+
+	have_fixture = true;	/* NOTE(review): assumes fini is safe after a failed init */
+	if (gen_loader_fixture_init(&f) != 0)
+		goto out;
+
+	map_fd = setup_meta_map(&f);	/* phase 1: load without any signature */
+	if (!ASSERT_OK_FD(map_fd, "meta_map_unsigned"))
+		goto out;
+	lsm->bss->seen = 0;	/* reset right before the load under test */
+	prog_fd = load_loader(f.gopts.insns, f.gopts.insns_sz, map_fd, NULL, 0, 0);
+	close(map_fd);
+	map_fd = -1;	/* keeps the "out" path from closing it again */
+	if (!ASSERT_OK_FD(prog_fd, "unsigned loader load"))
+		goto out;
+	close(prog_fd);
+	prog_fd = -1;
+	if (!ASSERT_NEQ(lsm->bss->seen, 0, "bpf LSM in the active LSM set"))
+		goto out;
+	ASSERT_EQ(lsm->bss->seen, 1, "unsigned: one observed load");
+	ASSERT_EQ(lsm->bss->sig_verdict, BPF_SIG_UNSIGNED, "unsigned verdict");
+	ASSERT_EQ(lsm->bss->sig_keyring_type, BPF_SIG_KEYRING_NONE, "unsigned keyring type");
+	ASSERT_EQ(lsm->bss->sig_keyring_serial, 0, "unsigned: no keyring serial");
+
+	syscall(__NR_request_key, "keyring", "_uid.0", NULL,
+		KEY_SPEC_SESSION_KEYRING);	/* return value is not checked */
+	dir = mkdtemp(dir_tmpl);
+	if (!ASSERT_OK_PTR(dir, "mkdtemp"))
+		goto out;
+	if (!ASSERT_OK(run_setup("setup", dir), "verify_sig_setup")) {
+		rmdir(dir);
+		dir = NULL;	/* skip the "cleanup" script at out: */
+		goto out;
+	}
+	if (!ASSERT_OK(sign_buf(dir, f.gopts.insns, f.gopts.insns_sz, sig,
+				&sig_sz), "sign-file"))
+		goto out;
+
+	map_fd = setup_meta_map(&f);	/* phase 2: same loader, now with a signature */
+	if (!ASSERT_OK_FD(map_fd, "meta_map_signed"))
+		goto out;
+	lsm->bss->seen = 0;	/* reset right before the load under test */
+	prog_fd = load_loader(f.gopts.insns, f.gopts.insns_sz, map_fd, sig,
+			      sig_sz, KEY_SPEC_SESSION_KEYRING);
+	close(map_fd);
+	map_fd = -1;
+	if (!ASSERT_OK_FD(prog_fd, "signed loader load"))
+		goto out;
+	close(prog_fd);
+	prog_fd = -1;
+
+	ses_serial = syscall(__NR_keyctl, KEYCTL_GET_KEYRING_ID,
+			     KEY_SPEC_SESSION_KEYRING, 0);
+	ASSERT_EQ(lsm->bss->seen, 1, "signed: one observed load");
+	ASSERT_EQ(lsm->bss->sig_verdict, BPF_SIG_VERIFIED, "signed verdict");
+	ASSERT_EQ(lsm->bss->sig_keyring_type, BPF_SIG_KEYRING_USER, "signed keyring type");
+	ASSERT_GT(ses_serial, 0, "session keyring serial resolved");
+	ASSERT_EQ(lsm->bss->sig_keyring_serial, ses_serial,
+		  "signed: validated against session keyring");
+out:
+	if (map_fd >= 0)
+		close(map_fd);
+	if (prog_fd >= 0)
+		close(prog_fd);
+	if (have_fixture)
+		gen_loader_fixture_fini(&f);
+	if (dir)
+		run_setup("cleanup", dir);
+	test_signed_loader_lsm__destroy(lsm);
+}
+
+void test_signed_loader(void)
+{
+	if (test__start_subtest("metadata_check_shape"))
+		metadata_check_shape();
+	if (test__start_subtest("metadata_match"))
+		metadata_match();
+	if (test__start_subtest("metadata_sha_mismatch"))
+		metadata_sha_mismatch();
+	if (test__start_subtest("metadata_not_exclusive"))
+		metadata_not_exclusive();
+	if (test__start_subtest("metadata_hash_not_computed"))
+		metadata_hash_not_computed();
+	if (test__start_subtest("signature_enforced"))
+		signature_enforced();
+	if (test__start_subtest("signature_too_large"))
+		signature_too_large();
+	if (test__start_subtest("signature_bad_keyring"))
+		signature_bad_keyring();
+	if (test__start_subtest("metadata_ctx_max_entries_ignored"))
+		metadata_ctx_max_entries_ignored();
+	if (test__start_subtest("metadata_ctx_initial_value_ignored"))
+		metadata_ctx_initial_value_ignored();
+	if (test__start_subtest("signature_authenticates_insns"))
+		signature_authenticates_insns();	/* needs ./verify_sig_setup.sh and ./sign-file */
+	if (test__start_subtest("hash_requires_frozen"))
+		hash_requires_frozen();		/* hash query on an unfrozen map */
+	if (test__start_subtest("no_update_after_freeze"))
+		no_update_after_freeze();
+	if (test__start_subtest("freeze_writable_mmap"))
+		freeze_writable_mmap();		/* freeze while a writable mapping exists */
+	if (test__start_subtest("no_writable_mmap_frozen"))
+		no_writable_mmap_frozen();	/* writable mapping after the freeze */
+	if (test__start_subtest("map_hash_matches_libbpf"))
+		map_hash_matches_libbpf();
+	if (test__start_subtest("map_hash_multi_element"))
+		map_hash_multi_element();
+	if (test__start_subtest("map_hash_bad_size"))
+		map_hash_bad_size();
+	if (test__start_subtest("map_hash_unsupported_type"))
+		map_hash_unsupported_type();
+	if (test__start_subtest("lsm_signature_verdict"))
+		lsm_signature_verdict();	/* attaches the LSM skeleton and signs at runtime */
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c b/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c
new file mode 100644
index 000000000000..19500b785ee3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include <unistd.h>
+#include "test_sleepable_tracepoints.skel.h"
+#include "test_sleepable_tracepoints_fail.skel.h"
+
+static void run_test(struct test_sleepable_tracepoints *skel)
+{
+	char cwd[PATH_MAX] = "/";
+
+	/* Publish our pid, then zero the fields that are asserted on below. */
+	skel->bss->target_pid = getpid();
+	skel->bss->prog_triggered = 0;
+	skel->bss->err = 0;
+	skel->bss->copied_byte = 0;
+	syscall(__NR_getcwd, cwd, sizeof(cwd));
+
+	ASSERT_EQ(skel->bss->prog_triggered, 1, "prog_triggered");
+	ASSERT_EQ(skel->bss->err, 0, "err");
+	ASSERT_EQ(skel->bss->copied_byte, '/', "copied_byte");
+}
+
+static void run_auto_attach_test(struct bpf_program *prog,
+				 struct test_sleepable_tracepoints *skel)
+{
+	struct bpf_link *attached = bpf_program__attach(prog);
+
+	if (!ASSERT_OK_PTR(attached, "prog_attach"))
+		return;
+
+	/* Exercise the program only while its link is alive. */
+	run_test(skel);
+	bpf_link__destroy(attached);
+}
+
+static void test_attach_only(struct bpf_program *prog)
+{
+	struct bpf_link *attached = bpf_program__attach(prog);
+
+	/* Attach-only check: drop the link again as soon as it is known good. */
+	if (ASSERT_OK_PTR(attached, "attach"))
+		bpf_link__destroy(attached);
+}
+
+static void test_attach_reject(struct bpf_program *prog)
+{
+	struct bpf_link *attached = bpf_program__attach(prog);
+
+	/* Attach is expected to fail; clean up only if it slipped through. */
+	if (!ASSERT_ERR_PTR(attached, "attach_should_fail"))
+		bpf_link__destroy(attached);
+}
+
+static void test_raw_tp_bare(struct test_sleepable_tracepoints *skel)
+{
+	struct bpf_program *prog = skel->progs.handle_raw_tp_bare;
+	struct bpf_link *attached;
+
+	attached = bpf_program__attach_raw_tracepoint(prog, "sys_enter");
+	if (ASSERT_OK_PTR(attached, "attach"))
+		bpf_link__destroy(attached);
+}
+
+static void test_tp_bare(struct test_sleepable_tracepoints *skel)
+{
+	struct bpf_program *prog = skel->progs.handle_tp_bare;
+	struct bpf_link *attached;
+
+	attached = bpf_program__attach_tracepoint(prog, "syscalls", "sys_enter_getcwd");
+	if (ASSERT_OK_PTR(attached, "attach"))
+		bpf_link__destroy(attached);
+}
+
+static void test_test_run(struct test_sleepable_tracepoints *skel)
+{
+	__u64 ctx_args[2] = {0x1234ULL, 0x5678ULL};
+	LIBBPF_OPTS(bpf_test_run_opts, run_opts,
+		.ctx_in = ctx_args,
+		.ctx_size_in = sizeof(ctx_args),
+	);
+	int prog_fd = bpf_program__fd(skel->progs.handle_test_run);
+	int err = bpf_prog_test_run_opts(prog_fd, &run_opts);
+
+	/* The retval is expected to be the sum of the two ctx words. */
+	ASSERT_OK(err, "test_run");
+	ASSERT_EQ(run_opts.retval, ctx_args[0] + ctx_args[1], "test_run_retval");
+}
+
+static void test_test_run_on_cpu_reject(struct test_sleepable_tracepoints *skel)
+{
+	__u64 ctx_args[2] = {};
+	LIBBPF_OPTS(bpf_test_run_opts, run_opts,
+		.ctx_in = ctx_args,
+		.ctx_size_in = sizeof(ctx_args),
+		.flags = BPF_F_TEST_RUN_ON_CPU,
+	);
+	int prog_fd = bpf_program__fd(skel->progs.handle_test_run);
+	int err = bpf_prog_test_run_opts(prog_fd, &run_opts);
+
+	/* Same program as test_test_run(), but BPF_F_TEST_RUN_ON_CPU must fail. */
+	ASSERT_ERR(err, "test_run_on_cpu_reject");
+}
+
+void test_sleepable_tracepoints(void)
+{
+	struct test_sleepable_tracepoints *skel;
+
+	skel = test_sleepable_tracepoints__open_and_load();	/* one skeleton shared by the subtests below */
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	if (test__start_subtest("tp_btf"))	/* attach, then trigger via getcwd */
+		run_auto_attach_test(skel->progs.handle_sys_enter_tp_btf, skel);
+	if (test__start_subtest("raw_tp"))
+		run_auto_attach_test(skel->progs.handle_sys_enter_raw_tp, skel);
+	if (test__start_subtest("tracepoint"))
+		run_auto_attach_test(skel->progs.handle_sys_enter_tp, skel);
+	if (test__start_subtest("sys_exit"))
+		run_auto_attach_test(skel->progs.handle_sys_exit_tp, skel);
+	if (test__start_subtest("tracepoint_alias"))	/* attach and detach only */
+		test_attach_only(skel->progs.handle_sys_enter_tp_alias);
+	if (test__start_subtest("raw_tracepoint_alias"))
+		test_attach_only(skel->progs.handle_sys_enter_raw_tp_alias);
+	if (test__start_subtest("raw_tp_bare"))
+		test_raw_tp_bare(skel);
+	if (test__start_subtest("tp_bare"))
+		test_tp_bare(skel);
+	if (test__start_subtest("test_run"))
+		test_test_run(skel);
+	if (test__start_subtest("test_run_on_cpu_reject"))
+		test_test_run_on_cpu_reject(skel);
+	if (test__start_subtest("raw_tp_non_faultable"))	/* attach must be refused */
+		test_attach_reject(skel->progs.handle_raw_tp_non_faultable);
+	if (test__start_subtest("tp_non_syscall"))
+		test_attach_reject(skel->progs.handle_tp_non_syscall);
+	if (test__start_subtest("tp_btf_non_faultable_reject"))
+		RUN_TESTS(test_sleepable_tracepoints_fail);	/* uses the separate _fail skeleton */
+
+	test_sleepable_tracepoints__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index d2846579285f..cb3229711f93 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -14,6 +14,7 @@
 #include "test_sockmap_pass_prog.skel.h"
 #include "test_sockmap_drop_prog.skel.h"
 #include "test_sockmap_change_tail.skel.h"
+#include "test_sockmap_msg_pop_data.skel.h"
 #include "bpf_iter_sockmap.skel.h"
 
 #include "sockmap_helpers.h"
@@ -666,6 +667,51 @@ out:
 	test_sockmap_change_tail__destroy(skel);
 }
 
+static void test_sockmap_msg_verdict_pop_data(void)
+{
+	struct test_sockmap_msg_pop_data *skel;
+	int err, map, verdict;
+	int c1 = -1, p1 = -1, sent;
+	int zero = 0;
+	char *buf;
+	const size_t len = 32 * 1024;	/* 32 KiB zero-filled payload */
+
+	skel = test_sockmap_msg_pop_data__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	verdict = bpf_program__fd(skel->progs.prog_msg_pop_data);
+	map = bpf_map__fd(skel->maps.sock_map);
+
+	err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0);
+	if (!ASSERT_OK(err, "bpf_prog_attach"))
+		goto out;
+
+	err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1);
+	if (!ASSERT_OK(err, "create_pair"))
+		goto out;	/* NOTE(review): assumes create_pair() leaves no open fds on failure */
+
+	err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST);	/* put c1 into slot 0 of the sockmap */
+	if (!ASSERT_OK(err, "bpf_map_update_elem"))
+		goto out_close;
+
+	buf = calloc(len, 1);
+	if (!ASSERT_OK_PTR(buf, "calloc"))
+		goto out_close;
+
+	sent = xsend(c1, buf, len, 0);
+	ASSERT_EQ(sent, (ssize_t)len, "xsend");
+	ASSERT_EQ(skel->data->pop_data_ret, -EINVAL, "pop_data_rejects overflow");	/* read from the skeleton's .data */
+
+	free(buf);
+
+out_close:
+	close(c1);
+	close(p1);
+out:
+	test_sockmap_msg_pop_data__destroy(skel);
+}
+
 static void test_sockmap_skb_verdict_peek_helper(int map)
 {
 	int err, c1, p1, zero = 0, sent, recvd, avail;
@@ -1373,6 +1419,8 @@ void test_sockmap_basic(void)
 		test_sockmap_skb_verdict_fionread(false);
 	if (test__start_subtest("sockmap skb_verdict change tail"))
 		test_sockmap_skb_verdict_change_tail();
+	if (test__start_subtest("sockmap msg_verdict pop_data overflow"))
+		test_sockmap_msg_verdict_pop_data();
 	if (test__start_subtest("sockmap skb_verdict msg_f_peek"))
 		test_sockmap_skb_verdict_peek();
 	if (test__start_subtest("sockmap skb_verdict msg_f_peek with link"))
diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c
index bbe476f4c47d..5c3579438427 100644
--- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c
@@ -13,8 +13,8 @@ static struct {
 	const char *err_msg;
 } spin_lock_fail_tests[] = {
 	{ "lock_id_kptr_preserve",
-	  "[0-9]\\+: (bf) r1 = r0                       ; R0=ptr_foo(id=2,ref_obj_id=2)"
-	  " R1=ptr_foo(id=2,ref_obj_id=2) refs=2\n"
+	  "[0-9]\\+: (bf) r1 = r0                       ; R0=ptr_foo(id=2)"
+	  " R1=ptr_foo(id=2) refs=2\n"
 	  "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n"
 	  "R1 type=ptr_ expected=percpu_ptr_" },
 	{ "lock_id_global_zero",
diff --git a/tools/testing/selftests/bpf/prog_tests/stack_arg.c b/tools/testing/selftests/bpf/prog_tests/stack_arg.c
new file mode 100644
index 000000000000..57193543f260
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/stack_arg.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "stack_arg.skel.h"
+#include "stack_arg_kfunc.skel.h"
+
+static void run_subtest(struct bpf_program *prog, int expected)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, run_opts,
+		.data_in = &pkt_v4,
+		.data_size_in = sizeof(pkt_v4),
+		.repeat = 1,
+	);
+	int rc;
+
+	/* One run with pkt_v4 as input; the retval must equal 'expected'. */
+	rc = bpf_prog_test_run_opts(bpf_program__fd(prog), &run_opts);
+	ASSERT_OK(rc, "test_run");
+	ASSERT_EQ(run_opts.retval, expected, "retval");
+}
+
+static void test_global_many(void)
+{
+	struct stack_arg *obj = stack_arg__open();
+
+	if (!ASSERT_OK_PTR(obj, "open"))
+		return;
+
+	if (!obj->rodata->has_stack_arg) {
+		test__skip();
+		goto destroy;
+	}
+
+	if (!ASSERT_OK(stack_arg__load(obj), "load"))
+		goto destroy;
+
+	/* The second argument is the retval the program must produce. */
+	run_subtest(obj->progs.test_global_many_args, 55);
+
+destroy:
+	stack_arg__destroy(obj);
+}
+
+static void test_async_cb_many(void)
+{
+	struct stack_arg *skel;
+	int i;
+
+	skel = stack_arg__open();
+	if (!ASSERT_OK_PTR(skel, "open"))
+		return;
+
+	if (!skel->rodata->has_stack_arg) {
+		test__skip();
+		goto out;
+	}
+
+	if (!ASSERT_OK(stack_arg__load(skel), "load"))
+		goto out;
+
+	run_subtest(skel->progs.test_async_cb_many_args, 0);
+
+	/* Poll up to ~1s for the timer callback: 10+20+...+90+100 = 550 */
+	for (i = 0; i < 1000 && !skel->bss->timer_result; i++)
+		usleep(1000);
+	ASSERT_EQ(skel->bss->timer_result, 550, "timer_result");
+
+out:
+	stack_arg__destroy(skel);
+}
+
+static void test_bpf2bpf(void)
+{
+	struct stack_arg *obj = stack_arg__open();
+
+	if (!ASSERT_OK_PTR(obj, "open"))
+		return;
+
+	if (!obj->rodata->has_stack_arg) {
+		test__skip();
+		goto destroy;
+	}
+
+	if (!ASSERT_OK(stack_arg__load(obj), "load"))
+		goto destroy;
+
+	/* Each program's retval is checked against the constant given here. */
+	run_subtest(obj->progs.test_bpf2bpf_ptr_stack_arg, 75);
+	run_subtest(obj->progs.test_bpf2bpf_mix_stack_args, 66);
+	run_subtest(obj->progs.test_bpf2bpf_nesting_stack_arg, 84);
+	run_subtest(obj->progs.test_bpf2bpf_dynptr_stack_arg, 99);
+	run_subtest(obj->progs.test_two_callees, 133);
+
+destroy:
+	stack_arg__destroy(obj);
+}
+
+static void test_kfunc(void)
+{
+	struct stack_arg_kfunc *obj = stack_arg_kfunc__open();
+
+	if (!ASSERT_OK_PTR(obj, "open"))
+		return;
+
+	if (!obj->rodata->has_stack_arg) {
+		test__skip();
+		goto destroy;
+	}
+
+	if (!ASSERT_OK(stack_arg_kfunc__load(obj), "load"))
+		goto destroy;
+
+	/* Each program's retval is checked against the constant given here. */
+	run_subtest(obj->progs.test_stack_arg_scalar, 55);
+	run_subtest(obj->progs.test_stack_arg_ptr, 75);
+	run_subtest(obj->progs.test_stack_arg_mix, 66);
+	run_subtest(obj->progs.test_stack_arg_dynptr, 99);
+	run_subtest(obj->progs.test_stack_arg_mem, 151);
+	run_subtest(obj->progs.test_stack_arg_iter, 145);
+	run_subtest(obj->progs.test_stack_arg_const_str, 45);
+	run_subtest(obj->progs.test_stack_arg_timer, 45);
+
+destroy:
+	stack_arg_kfunc__destroy(obj);
+}
+
+void test_stack_arg(void)
+{
+	if (test__start_subtest("global_many_args"))
+		test_global_many();
+	if (test__start_subtest("async_cb_many_args"))
+		test_async_cb_many();	/* also checks the timer callback's result */
+	if (test__start_subtest("bpf2bpf"))
+		test_bpf2bpf();
+	if (test__start_subtest("kfunc"))
+		test_kfunc();		/* uses the separate stack_arg_kfunc skeleton */
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/stack_arg_fail.c b/tools/testing/selftests/bpf/prog_tests/stack_arg_fail.c
new file mode 100644
index 000000000000..090af1330953
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/stack_arg_fail.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include "stack_arg_fail.skel.h"
+
+void test_stack_arg_fail(void)
+{
+	RUN_TESTS(stack_arg_fail);	/* driven entirely by the stack_arg_fail skeleton */
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/stack_arg_precision.c b/tools/testing/selftests/bpf/prog_tests/stack_arg_precision.c
new file mode 100644
index 000000000000..1ab041d66de3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/stack_arg_precision.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include "stack_arg_precision.skel.h"
+
+void test_stack_arg_precision(void)
+{
+	RUN_TESTS(stack_arg_precision);	/* driven entirely by the stack_arg_precision skeleton */
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
index 7d534fde0af9..a5a226d0104c 100644
--- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c
+++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
@@ -8,6 +8,9 @@
 #include "tailcall_freplace.skel.h"
 #include "tc_bpf2bpf.skel.h"
 #include "tailcall_fail.skel.h"
+#include "tailcall_cgrp_storage_owner.skel.h"
+#include "tailcall_cgrp_storage_no_storage.skel.h"
+#include "tailcall_cgrp_storage.skel.h"
 #include "tailcall_sleepable.skel.h"
 
 /* test_tailcall_1 checks basic functionality by patching multiple locations
@@ -1654,6 +1657,179 @@ static void test_tailcall_failure()
 	RUN_TESTS(tailcall_fail);
 }
 
+static void test_tailcall_cgrp_storage(void)
+{
+	struct tailcall_cgrp_storage_owner *owner_skel = NULL;
+	struct tailcall_cgrp_storage *skel = NULL;
+	int err, key = 0, prog_array_fd, prog_fd, storage_map_fd;
+
+	owner_skel = tailcall_cgrp_storage_owner__open_and_load();
+	if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load"))
+		return;
+
+	prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array);
+	storage_map_fd = bpf_map__fd(owner_skel->maps.storage_map);
+
+	skel = tailcall_cgrp_storage__open();
+	if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage__open"))
+		goto out;
+
+	err = bpf_map__reuse_fd(skel->maps.prog_array, prog_array_fd);
+	if (!ASSERT_OK(err, "reuse_prog_array"))
+		goto out;
+
+	err = bpf_map__reuse_fd(skel->maps.storage_map, storage_map_fd);
+	if (!ASSERT_OK(err, "reuse_storage_map"))
+		goto out;
+
+	err = bpf_object__load(skel->obj);
+	if (!ASSERT_OK(err, "tailcall_cgrp_storage__load"))
+		goto out;
+
+	prog_fd = bpf_program__fd(skel->progs.callee_prog);
+	err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY);
+	ASSERT_OK(err, "update_prog_array");
+out:
+	tailcall_cgrp_storage__destroy(skel);
+	tailcall_cgrp_storage_owner__destroy(owner_skel);
+}
+
+static void test_tailcall_cgrp_storage_diff_storage(void)
+{
+	struct tailcall_cgrp_storage_owner *owner_skel = NULL;
+	struct tailcall_cgrp_storage *skel = NULL;
+	int err, prog_array_fd;
+
+	owner_skel = tailcall_cgrp_storage_owner__open_and_load();
+	if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load"))
+		return;
+
+	prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array);
+
+	skel = tailcall_cgrp_storage__open();
+	if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage__open"))
+		goto out;
+
+	err = bpf_map__reuse_fd(skel->maps.prog_array, prog_array_fd);
+	if (!ASSERT_OK(err, "reuse_prog_array"))
+		goto out;
+
+	err = bpf_object__load(skel->obj);
+	ASSERT_ERR(err, "tailcall_cgrp_storage__load");
+out:
+	tailcall_cgrp_storage__destroy(skel);
+	tailcall_cgrp_storage_owner__destroy(owner_skel);
+}
+
+static void test_tailcall_cgrp_storage_no_storage(void)
+{
+	struct tailcall_cgrp_storage_owner *owner_skel = NULL;
+	struct tailcall_cgrp_storage_no_storage *skel = NULL;
+	int err, prog_array_fd;
+
+	owner_skel = tailcall_cgrp_storage_owner__open_and_load();
+	if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load"))
+		return;
+
+	prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array);
+
+	skel = tailcall_cgrp_storage_no_storage__open();
+	if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage_no_storage__open"))
+		goto out;
+
+	err = bpf_map__reuse_fd(skel->maps.prog_array, prog_array_fd);
+	if (!ASSERT_OK(err, "reuse_prog_array"))
+		goto out;
+
+	err = bpf_object__load(skel->obj);
+	ASSERT_ERR(err, "tailcall_cgrp_storage_no_storage__load");
+out:
+	tailcall_cgrp_storage_no_storage__destroy(skel);
+	tailcall_cgrp_storage_owner__destroy(owner_skel);
+}
+
+static void test_tailcall_cgrp_storage_no_storage_leaf(void)
+{
+	struct tailcall_cgrp_storage_owner *owner_skel = NULL;
+	struct tailcall_cgrp_storage_no_storage *skel = NULL;
+	int err, key = 0, prog_array_fd, prog_fd;
+
+	owner_skel = tailcall_cgrp_storage_owner__open_and_load();
+	if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load"))
+		return;
+
+	prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array);
+
+	skel = tailcall_cgrp_storage_no_storage__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage_no_storage__open_and_load"))
+		goto out;
+
+	prog_fd = bpf_program__fd(skel->progs.leaf_prog);
+	err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY);
+	if (!ASSERT_OK(err, "update_prog_array_leaf"))
+		goto out;
+
+	prog_fd = bpf_program__fd(skel->progs.caller_prog);
+	err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY);
+	ASSERT_ERR(err, "update_prog_array_bridge");
+out:
+	tailcall_cgrp_storage_no_storage__destroy(skel);
+	tailcall_cgrp_storage_owner__destroy(owner_skel);
+}
+
+static void test_tailcall_cgrp_storage_no_storage_bridge(void)
+{
+	struct tailcall_cgrp_storage_owner *owner_skel = NULL;
+	struct tailcall_cgrp_storage_no_storage *bridge_skel = NULL;
+	struct tailcall_cgrp_storage *callee_skel = NULL;
+	int err, key = 0, prog_array_fd, prog_fd, storage_map_fd;
+
+	owner_skel = tailcall_cgrp_storage_owner__open_and_load();
+	if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load"))
+		return;
+
+	prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array);
+	storage_map_fd = bpf_map__fd(owner_skel->maps.storage_map);
+
+	callee_skel = tailcall_cgrp_storage__open();
+	if (!ASSERT_OK_PTR(callee_skel, "tailcall_cgrp_storage__open"))
+		goto out;
+
+	bpf_program__set_autoload(callee_skel->progs.caller_prog, false);
+
+	err = bpf_map__reuse_fd(callee_skel->maps.prog_array, prog_array_fd);
+	if (!ASSERT_OK(err, "reuse_prog_array"))
+		goto out;
+
+	err = bpf_map__reuse_fd(callee_skel->maps.storage_map, storage_map_fd);
+	if (!ASSERT_OK(err, "reuse_storage_map"))
+		goto out;
+
+	err = bpf_object__load(callee_skel->obj);
+	if (!ASSERT_OK(err, "tailcall_cgrp_storage__load"))
+		goto out;
+
+	prog_fd = bpf_program__fd(callee_skel->progs.callee_prog);
+	err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY);
+	if (!ASSERT_OK(err, "update_prog_array"))
+		goto out;
+
+	bridge_skel = tailcall_cgrp_storage_no_storage__open();
+	if (!ASSERT_OK_PTR(bridge_skel, "tailcall_cgrp_storage_no_storage__open"))
+		goto out;
+
+	err = bpf_map__reuse_fd(bridge_skel->maps.prog_array, prog_array_fd);
+	if (!ASSERT_OK(err, "reuse_prog_array"))
+		goto out;
+
+	err = bpf_object__load(bridge_skel->obj);
+	ASSERT_ERR(err, "tailcall_cgrp_storage_no_storage_bridge__load");
+out:
+	tailcall_cgrp_storage_no_storage__destroy(bridge_skel);
+	tailcall_cgrp_storage__destroy(callee_skel);
+	tailcall_cgrp_storage_owner__destroy(owner_skel);
+}
+
 noinline void uprobe_sleepable_trigger(void)
 {
 	asm volatile ("");
@@ -1781,4 +1957,14 @@ void test_tailcalls(void)
 		test_tailcall_failure();
 	if (test__start_subtest("tailcall_sleepable"))
 		test_tailcall_sleepable();
+	if (test__start_subtest("tailcall_cgrp_storage"))
+		test_tailcall_cgrp_storage();
+	if (test__start_subtest("tailcall_cgrp_storage_diff_storage"))
+		test_tailcall_cgrp_storage_diff_storage();
+	if (test__start_subtest("tailcall_cgrp_storage_no_storage"))
+		test_tailcall_cgrp_storage_no_storage();
+	if (test__start_subtest("tailcall_cgrp_storage_no_storage_leaf"))
+		test_tailcall_cgrp_storage_no_storage_leaf();
+	if (test__start_subtest("tailcall_cgrp_storage_no_storage_bridge"))
+		test_tailcall_cgrp_storage_no_storage_bridge();
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
index 83b90335967a..e6e95c1416e6 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c
@@ -68,6 +68,36 @@ cleanup:
 	task_kfunc_success__destroy(skel);
 }
 
+static void run_syscall_success_test(const char *prog_name)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	struct task_kfunc_success *skel;
+	struct bpf_program *prog;
+	int err;
+
+	skel = open_load_task_kfunc_skel();
+	if (!ASSERT_OK_PTR(skel, "open_load_skel"))
+		return;
+
+	if (!ASSERT_OK(skel->bss->err, "pre_run_err"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_name(skel->obj, prog_name);
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto cleanup;
+
+	err = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts);
+	if (!ASSERT_OK(err, "bpf_prog_test_run_opts"))
+		goto cleanup;
+	if (!ASSERT_EQ(opts.retval, 0, "retval"))
+		goto cleanup;
+
+	ASSERT_OK(skel->bss->err, "post_run_err");
+
+cleanup:
+	task_kfunc_success__destroy(skel);
+}
+
 static int run_vpid_test(void *prog_name)
 {
 	struct task_kfunc_success *skel;
@@ -140,7 +170,6 @@ static const char * const success_tests[] = {
 	"test_task_acquire_release_argument",
 	"test_task_acquire_release_current",
 	"test_task_acquire_leave_in_map",
-	"test_task_xchg_release",
 	"test_task_map_acquire_release",
 	"test_task_current_acquire_release",
 	"test_task_from_pid_arg",
@@ -151,6 +180,10 @@ static const char * const success_tests[] = {
 	"test_task_kfunc_flavor_relo_not_found",
 };
 
+static const char * const syscall_success_tests[] = {
+	"test_task_xchg_release",
+};
+
 static const char * const vpid_success_tests[] = {
 	"test_task_from_vpid_current",
 	"test_task_from_vpid_invalid",
@@ -167,6 +200,13 @@ void test_task_kfunc(void)
 		run_success_test(success_tests[i]);
 	}
 
+	for (i = 0; i < ARRAY_SIZE(syscall_success_tests); i++) {
+		if (!test__start_subtest(syscall_success_tests[i]))
+			continue;
+
+		run_syscall_success_test(syscall_success_tests[i]);
+	}
+
 	for (i = 0; i < ARRAY_SIZE(vpid_success_tests); i++) {
 		if (!test__start_subtest(vpid_success_tests[i]))
 			continue;
diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index 1b26c12f255a..5b2b56cc3a4f 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -47,6 +47,7 @@ static void test_sys_enter_exit(void)
 	skel->bss->target_pid = 0;
 
 	/* 2x gettid syscalls */
+	ASSERT_EQ(skel->bss->update_err, 0, "update_err");
 	ASSERT_EQ(skel->bss->enter_cnt, 2, "enter_cnt");
 	ASSERT_EQ(skel->bss->exit_cnt, 2, "exit_cnt");
 	ASSERT_EQ(skel->bss->mismatch_cnt, 0, "mismatch_cnt");
diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c
index bdc4fc06bc5a..d7495efd4a56 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c
@@ -5,36 +5,14 @@
  */
 
 #include <test_progs.h>
-#include <sys/mman.h>
 #include <sys/wait.h>
 #include <unistd.h>
-#include <malloc.h>
-#include <stdlib.h>
 
 #include "lsm.skel.h"
 #include "lsm_tailcall.skel.h"
 
 char *CMD_ARGS[] = {"true", NULL};
 
-#define GET_PAGE_ADDR(ADDR, PAGE_SIZE)					\
-	(char *)(((unsigned long) (ADDR + PAGE_SIZE)) & ~(PAGE_SIZE-1))
-
-int stack_mprotect(void)
-{
-	void *buf;
-	long sz;
-	int ret;
-
-	sz = sysconf(_SC_PAGESIZE);
-	if (sz < 0)
-		return sz;
-
-	buf = alloca(sz * 3);
-	ret = mprotect(GET_PAGE_ADDR(buf, sz), sz,
-		       PROT_READ | PROT_WRITE | PROT_EXEC);
-	return ret;
-}
-
 int exec_cmd(int *monitored_pid)
 {
 	int child_pid, child_status;
diff --git a/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c
index 3e98a1665936..1675b32753a8 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c
@@ -456,7 +456,11 @@ static void xdp_veth_egress(u32 flags)
 			.remote_flags = flags,
 		}
 	};
-	const char magic_mac[6] = { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF};
+	const unsigned char egress_macs[VETH_PAIRS_COUNT][ETH_ALEN] = {
+		{ 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x01 },
+		{ 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x02 },
+		{ 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x03 },
+	};
 	struct xdp_redirect_multi_kern *xdp_redirect_multi_kern;
 	struct bpf_object *bpf_objs[VETH_EGRESS_SKEL_NB];
 	struct xdp_redirect_map *xdp_redirect_map;
@@ -512,7 +516,13 @@ static void xdp_veth_egress(u32 flags)
 						 &net_config, prog_cfg, i))
 			goto destroy_xdp_redirect_map;
 
-		err = bpf_map_update_elem(mac_map, &ifindex, magic_mac, 0);
+		{
+			__be64 mac = 0;
+
+			memcpy(&mac, egress_macs[i], ETH_ALEN);
+			err = bpf_map_update_elem(mac_map, &ifindex, &mac, 0);
+		}
+
 		if (!ASSERT_OK(err, "bpf_map_update_elem"))
 			goto destroy_xdp_redirect_map;
 
@@ -531,15 +541,162 @@ static void xdp_veth_egress(u32 flags)
 
 	for (i = 0; i < 2; i++) {
 		u32 key = i;
+		__be64 expected = 0;
 		u64 res;
 
 		err = bpf_map_lookup_elem(res_map, &key, &res);
 		if (!ASSERT_OK(err, "get MAC res"))
 			goto destroy_xdp_redirect_map;
 
-		ASSERT_STRNEQ((const char *)&res, magic_mac, ETH_ALEN, "compare mac");
+		/* store_mac_1/2 run on the second/third remote veths. */
+		memcpy(&expected, egress_macs[i + 1], ETH_ALEN);
+		ASSERT_EQ(res, expected, "compare mac");
+	}
+
+destroy_xdp_redirect_map:
+	close_netns(nstoken);
+	xdp_redirect_map__destroy(xdp_redirect_map);
+destroy_xdp_redirect_multi_kern:
+	xdp_redirect_multi_kern__destroy(xdp_redirect_multi_kern);
+destroy_xdp_dummy:
+	xdp_dummy__destroy(xdp_dummy);
+
+	cleanup_network(&net_config);
+}
+
+static void xdp_veth_egress_last_dst(u32 flags)
+{
+	struct prog_configuration prog_cfg[VETH_PAIRS_COUNT] = {
+		{
+			.local_name = "xdp_redirect_map_all_prog",
+			.remote_name = "xdp_dummy_prog",
+			.local_flags = flags,
+			.remote_flags = flags,
+		},
+		{
+			.local_name = "xdp_redirect_map_all_prog",
+			.remote_name = "store_mac_1",
+			.local_flags = flags,
+			.remote_flags = flags,
+		},
+		{
+			.local_name = "xdp_redirect_map_all_prog",
+			.remote_name = "xdp_dummy_prog",
+			.local_flags = flags,
+			.remote_flags = flags,
+		}
+	};
+	const unsigned char egress_macs[VETH_PAIRS_COUNT][ETH_ALEN] = {
+		{ 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x01 },
+		{ 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x02 },
+		{ 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x03 },
+	};
+	struct xdp_redirect_multi_kern *xdp_redirect_multi_kern;
+	struct bpf_object *bpf_objs[VETH_EGRESS_SKEL_NB];
+	struct xdp_redirect_map *xdp_redirect_map;
+	struct net_configuration net_config = {};
+	int mac_map, egress_map, res_map;
+	struct nstoken *nstoken = NULL;
+	struct xdp_dummy *xdp_dummy;
+	__be64 sentinel_mac = 0;
+	__be64 last_mac = 0;
+	__be64 res;
+	u32 key;
+	int err;
+	int i;
+
+	xdp_dummy = xdp_dummy__open_and_load();
+	if (!ASSERT_OK_PTR(xdp_dummy, "xdp_dummy__open_and_load"))
+		return;
+
+	xdp_redirect_multi_kern = xdp_redirect_multi_kern__open_and_load();
+	if (!ASSERT_OK_PTR(xdp_redirect_multi_kern, "xdp_redirect_multi_kern__open_and_load"))
+		goto destroy_xdp_dummy;
+
+	xdp_redirect_map = xdp_redirect_map__open_and_load();
+	if (!ASSERT_OK_PTR(xdp_redirect_map, "xdp_redirect_map__open_and_load"))
+		goto destroy_xdp_redirect_multi_kern;
+
+	if (!ASSERT_OK(create_network(&net_config), "create network"))
+		goto destroy_xdp_redirect_map;
+
+	mac_map = bpf_map__fd(xdp_redirect_multi_kern->maps.mac_map);
+	if (!ASSERT_OK_FD(mac_map, "open mac_map"))
+		goto destroy_xdp_redirect_map;
+
+	egress_map = bpf_map__fd(xdp_redirect_multi_kern->maps.map_egress);
+	if (!ASSERT_OK_FD(egress_map, "open map_egress"))
+		goto destroy_xdp_redirect_map;
+
+	bpf_objs[0] = xdp_dummy->obj;
+	bpf_objs[1] = xdp_redirect_multi_kern->obj;
+	bpf_objs[2] = xdp_redirect_map->obj;
+
+	nstoken = open_netns(net_config.ns0_name);
+	if (!ASSERT_OK_PTR(nstoken, "open NS0"))
+		goto destroy_xdp_redirect_map;
+
+	for (i = 0; i < VETH_PAIRS_COUNT; i++) {
+		struct bpf_devmap_val devmap_val = {};
+		int ifindex = if_nametoindex(net_config.veth_cfg[i].local_veth);
+		u32 key = i;
+
+		SYS(destroy_xdp_redirect_map,
+		    "ip -n %s neigh add %s lladdr 00:00:00:00:00:01 dev %s",
+		    net_config.veth_cfg[i].namespace, IP_NEIGH,
+		    net_config.veth_cfg[i].remote_veth);
+
+		if (attach_programs_to_veth_pair(bpf_objs, VETH_EGRESS_SKEL_NB,
+						 &net_config, prog_cfg, i))
+			goto destroy_xdp_redirect_map;
+
+		{
+			__be64 mac = 0;
+
+			memcpy(&mac, egress_macs[i], ETH_ALEN);
+			err = bpf_map_update_elem(mac_map, &ifindex, &mac, 0);
+		}
+
+		if (!ASSERT_OK(err, "bpf_map_update_elem"))
+			goto destroy_xdp_redirect_map;
+
+		devmap_val.ifindex = ifindex;
+		devmap_val.bpf_prog.fd = -1;
+
+		if (i == VETH_PAIRS_COUNT - 1)
+			devmap_val.bpf_prog.fd =
+				bpf_program__fd(xdp_redirect_multi_kern->progs.xdp_devmap_prog);
+
+		err = bpf_map_update_elem(egress_map, &key, &devmap_val, 0);
+		if (!ASSERT_OK(err, "bpf_map_update_elem"))
+			goto destroy_xdp_redirect_map;
 	}
 
+	res_map = bpf_map__fd(xdp_redirect_map->maps.rx_mac);
+	if (!ASSERT_OK_FD(res_map, "open rx_map"))
+		goto destroy_xdp_redirect_map;
+
+	/* Sentinel must differ from last_mac, else the last_mac check below can never fire. */
+	memcpy(&sentinel_mac, egress_macs[0], ETH_ALEN);
+	memcpy(&last_mac, egress_macs[VETH_PAIRS_COUNT - 1], ETH_ALEN);
+	key = 0;
+	err = bpf_map_update_elem(res_map, &key, &sentinel_mac, 0);
+	if (!ASSERT_OK(err, "init rx mac"))
+		goto destroy_xdp_redirect_map;
+
+	SYS_NOFAIL("ip netns exec %s ping %s -i 0.1 -c 4 -W1 > /dev/null ",
+		   net_config.veth_cfg[0].namespace, IP_NEIGH);
+
+	err = bpf_map_lookup_elem(res_map, &key, &res);
+	if (!ASSERT_OK(err, "get MAC res"))
+		goto destroy_xdp_redirect_map;
+
+	if (!ASSERT_NEQ(res, sentinel_mac, "rx_mac overwritten by store_mac_1"))
+		goto destroy_xdp_redirect_map;
+
+	if (!ASSERT_NEQ(res, last_mac, "earlier dst not rewritten by last dst"))
+		goto destroy_xdp_redirect_map;
+
 destroy_xdp_redirect_map:
 	close_netns(nstoken);
 	xdp_redirect_map__destroy(xdp_redirect_map);
@@ -596,4 +753,7 @@ void test_xdp_veth_egress(void)
 
 	if (test__start_subtest("SKB_MODE/egress"))
 		xdp_veth_egress(XDP_FLAGS_SKB_MODE);
+
+	if (test__start_subtest("SKB_MODE/egress_last_dst"))
+		xdp_veth_egress_last_dst(XDP_FLAGS_SKB_MODE);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
new file mode 100644
index 000000000000..f02ffc7f41d7
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c
@@ -0,0 +1,960 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include <bpf/btf.h>
+#include <search.h>
+#include "bpf/libbpf_internal.h"
+#include "tracing_multi.skel.h"
+#include "tracing_multi_module.skel.h"
+#include "tracing_multi_intersect.skel.h"
+#include "tracing_multi_session.skel.h"
+#include "tracing_multi_fail.skel.h"
+#include "tracing_multi_verifier.skel.h"
+#include "tracing_multi_bench.skel.h"
+#include "tracing_multi_rollback.skel.h"
+#include "trace_helpers.h"
+
+static __u64 bpf_fentry_test_cookies[] = {
+	8,  /* bpf_fentry_test1 */
+	9,  /* bpf_fentry_test2 */
+	7,  /* bpf_fentry_test3 */
+	5,  /* bpf_fentry_test4 */
+	4,  /* bpf_fentry_test5 */
+	2,  /* bpf_fentry_test6 */
+	3,  /* bpf_fentry_test7 */
+	1,  /* bpf_fentry_test8 */
+	10, /* bpf_fentry_test9 */
+	6,  /* bpf_fentry_test10 */
+};
+
+static const char * const bpf_fentry_test[] = {
+	"bpf_fentry_test1",
+	"bpf_fentry_test2",
+	"bpf_fentry_test3",
+	"bpf_fentry_test4",
+	"bpf_fentry_test5",
+	"bpf_fentry_test6",
+	"bpf_fentry_test7",
+	"bpf_fentry_test8",
+	"bpf_fentry_test9",
+	"bpf_fentry_test10",
+};
+
+static const char * const bpf_testmod_fentry_test[] = {
+	"bpf_testmod_fentry_test1",
+	"bpf_testmod_fentry_test2",
+	"bpf_testmod_fentry_test3",
+	"bpf_testmod_fentry_test7",
+	"bpf_testmod_fentry_test11",
+};
+
+#define FUNCS_CNT (ARRAY_SIZE(bpf_fentry_test))
+
+static int get_random_funcs(const char **funcs)
+{
+	int i, cnt = 0;
+
+	for (i = 0; i < FUNCS_CNT; i++) {
+		if (rand() % 2)
+			funcs[cnt++] = bpf_fentry_test[i];
+	}
+	/* We always need at least one function. */
+	if (!cnt)
+		funcs[cnt++] = bpf_fentry_test[rand() % FUNCS_CNT];
+	return cnt;
+}
+
+static int compare(const void *ppa, const void *ppb)
+{
+	const char *pa = *(const char **) ppa;
+	const char *pb = *(const char **) ppb;
+
+	return strcmp(pa, pb);
+}
+
+static void tdestroy_free_nop(void *ptr)
+{
+}
+
+static __u32 *get_ids(const char * const funcs[], int funcs_cnt, const char *mod)
+{
+	struct btf *btf, *vmlinux_btf = NULL;
+	__u32 nr, type_id, cnt = 0;
+	void *root = NULL;
+	__u32 *ids = NULL;
+	int i, err = 0;
+
+	btf = btf__load_vmlinux_btf();
+	if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf"))
+		return NULL;
+
+	if (mod) {
+		vmlinux_btf = btf;
+		btf = btf__load_module_btf(mod, vmlinux_btf);
+		if (!ASSERT_OK_PTR(btf, "btf__load_module_btf")) {
+			btf__free(vmlinux_btf);
+			return NULL;
+		}
+	}
+
+	ids = calloc(funcs_cnt, sizeof(ids[0]));
+	if (!ids)
+		goto out;
+
+	/*
+	 * Insert the function names into a search tree (tsearch) so each
+	 * BTF function name can be looked up with tfind() below.
+	 */
+	for (i = 0; i < funcs_cnt; i++) {
+		if (!tsearch(&funcs[i], &root, compare)) {
+			ASSERT_FAIL("tsearch failed");
+			err = -1;
+			goto error;
+		}
+	}
+
+	nr = btf__type_cnt(btf);
+	for (type_id = 1; type_id < nr && cnt < funcs_cnt; type_id++) {
+		const struct btf_type *type;
+		const char *str, ***val;
+		unsigned int idx;
+
+		type = btf__type_by_id(btf, type_id);
+		if (!type) {
+			err = -1;
+			break;
+		}
+
+		if (BTF_INFO_KIND(type->info) != BTF_KIND_FUNC)
+			continue;
+
+		str = btf__name_by_offset(btf, type->name_off);
+		if (!str) {
+			err = -1;
+			break;
+		}
+
+		val = tfind(&str, &root, compare);
+		if (!val)
+			continue;
+
+		/*
+		 * We keep pointer for each function name so we can get the original
+		 * array index and have the resulting ids array matching the original
+		 * function array.
+		 *
+	 * Doing it this way allows us to easily test the cookies support,
+		 * because each cookie is attached to particular function/id.
+		 */
+		idx = *val - funcs;
+		ids[idx] = type_id;
+		cnt++;
+	}
+
+error:
+	if (err) {
+		free(ids);
+		ids = NULL;
+	}
+
+out:
+	tdestroy(root, tdestroy_free_nop);
+	btf__free(vmlinux_btf);
+	btf__free(btf);
+	return ids;
+}
+
+static void tracing_multi_test_run(struct tracing_multi *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	prog_fd = bpf_program__fd(skel->progs.test_fentry);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+
+	/* extra +1 count for sleepable programs */
+	ASSERT_EQ(skel->bss->test_result_fentry, FUNCS_CNT + 1, "test_result_fentry");
+	ASSERT_EQ(skel->bss->test_result_fexit, FUNCS_CNT + 1, "test_result_fexit");
+}
+
+static void test_skel_api(void)
+{
+	struct tracing_multi *skel;
+	int err;
+
+	skel = tracing_multi__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	err = tracing_multi__attach(skel);
+	if (!ASSERT_OK(err, "tracing_multi__attach"))
+		goto cleanup;
+
+	tracing_multi_test_run(skel);
+
+cleanup:
+	tracing_multi__destroy(skel);
+}
+
+static void test_link_api_pattern(void)
+{
+	struct tracing_multi *skel;
+
+	skel = tracing_multi__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+					"bpf_fentry_test*", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+					"bpf_fentry_test*", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s,
+					"bpf_fentry_test1", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry_s, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit_s = bpf_program__attach_tracing_multi(skel->progs.test_fexit_s,
+					"bpf_fentry_test1", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit_s, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	tracing_multi_test_run(skel);
+
+cleanup:
+	tracing_multi__destroy(skel);
+}
+
+static void test_link_api_ids(bool test_cookies)
+{
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	struct tracing_multi *skel;
+	size_t cnt = FUNCS_CNT;
+	__u32 *ids;
+
+	skel = tracing_multi__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+	skel->bss->test_cookies = test_cookies;
+
+	ids = get_ids(bpf_fentry_test, cnt, NULL);
+	if (!ASSERT_OK_PTR(ids, "get_ids"))
+		goto cleanup;
+
+	opts.ids = ids;
+	opts.cnt = cnt;
+
+	if (test_cookies)
+		opts.cookies = bpf_fentry_test_cookies;
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	/* Only bpf_fentry_test1 is allowed for sleepable programs. */
+	opts.cnt = 1;
+	skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry_s, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit_s = bpf_program__attach_tracing_multi(skel->progs.test_fexit_s,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit_s, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	tracing_multi_test_run(skel);
+
+cleanup:
+	tracing_multi__destroy(skel);
+	free(ids);
+}
+
+static void test_module_skel_api(void)
+{
+	struct tracing_multi_module *skel = NULL;
+	int err;
+
+	skel = tracing_multi_module__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_module__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	err = tracing_multi_module__attach(skel);
+	if (!ASSERT_OK(err, "tracing_multi_module__attach"))
+		goto cleanup;
+	/* 5 presumably matches ARRAY_SIZE(bpf_testmod_fentry_test) - TODO confirm */
+	ASSERT_OK(trigger_module_test_read(1), "trigger_read");
+	ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry");
+	ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit");
+
+cleanup:
+	tracing_multi_module__destroy(skel);
+}
+
+static void test_module_link_api_pattern(void)
+{
+	struct tracing_multi_module *skel = NULL;
+
+	skel = tracing_multi_module__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_module__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+					"bpf_testmod:bpf_testmod_fentry_test*", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+					"bpf_testmod:bpf_testmod_fentry_test*", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	ASSERT_OK(trigger_module_test_read(1), "trigger_read");
+	ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry");
+	ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit");
+
+cleanup:
+	tracing_multi_module__destroy(skel);
+}
+
+static void test_module_link_api_ids(void)
+{
+	size_t cnt = ARRAY_SIZE(bpf_testmod_fentry_test);
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	struct tracing_multi_module *skel = NULL;
+	__u32 *ids;
+
+	skel = tracing_multi_module__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_module__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	ids = get_ids(bpf_testmod_fentry_test, cnt, "bpf_testmod");
+	if (!ASSERT_OK_PTR(ids, "get_ids"))
+		goto cleanup;
+
+	opts.ids = ids;
+	opts.cnt = cnt;
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+						NULL, &opts);
+	if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	ASSERT_OK(trigger_module_test_read(1), "trigger_read");
+	ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry");
+	ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit");
+
+cleanup:
+	tracing_multi_module__destroy(skel);
+	free(ids);
+}
+
+static bool is_set(__u32 mask, __u32 bit)
+{
+	return mask & (1U << bit); /* unsigned shift: no signed overflow at bit 31 */
+}
+
+static void __test_intersect(__u32 mask, const struct bpf_program *progs[4], __u64 *test_results[4])
+{
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct bpf_link *links[4] = { NULL };
+	const char *funcs[FUNCS_CNT];
+	__u64 expected[4];
+	__u32 *ids, i;
+	int err, cnt;
+
+	/*
+	 * We have 4 programs in progs and the mask bits pick which
+	 * of them gets attached to randomly chosen functions.
+	 */
+	for (i = 0; i < 4; i++) {
+		if (!is_set(mask, i))
+			continue;
+
+		cnt = get_random_funcs(funcs);
+		ids = get_ids(funcs, cnt, NULL);
+		if (!ASSERT_OK_PTR(ids, "get_ids"))
+			goto cleanup;
+
+		opts.ids = ids;
+		opts.cnt = cnt;
+		links[i] = bpf_program__attach_tracing_multi(progs[i], NULL, &opts);
+		free(ids);
+
+		if (!ASSERT_OK_PTR(links[i], "bpf_program__attach_tracing_multi"))
+			goto cleanup;
+
+		expected[i] = *test_results[i] + cnt;
+	}
+
+	err = bpf_prog_test_run_opts(bpf_program__fd(progs[0]), &topts);
+	ASSERT_OK(err, "test_run");
+
+	for (i = 0; i < 4; i++) {
+		if (!is_set(mask, i))
+			continue;
+		ASSERT_EQ(*test_results[i], expected[i], "test_results");
+	}
+
+cleanup:
+	for (i = 0; i < 4; i++)
+		bpf_link__destroy(links[i]);
+}
+
+static void test_intersect(void)
+{
+	struct tracing_multi_intersect *skel;
+	const struct bpf_program *progs[4];
+	__u64 *test_results[4];
+	__u32 i;
+
+	skel = tracing_multi_intersect__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_intersect__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	progs[0] = skel->progs.fentry_1;
+	progs[1] = skel->progs.fexit_1;
+	progs[2] = skel->progs.fentry_2;
+	progs[3] = skel->progs.fexit_2;
+
+	test_results[0] = &skel->bss->test_result_fentry_1;
+	test_results[1] = &skel->bss->test_result_fexit_1;
+	test_results[2] = &skel->bss->test_result_fentry_2;
+	test_results[3] = &skel->bss->test_result_fexit_2;
+
+	for (i = 1; i < 16; i++)
+		__test_intersect(i, progs, test_results);
+
+	tracing_multi_intersect__destroy(skel);
+}
+
+static void test_session(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct tracing_multi_session *skel;
+	int err, prog_fd;
+
+	skel = tracing_multi_session__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_session__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	err = tracing_multi_session__attach(skel);
+	if (!ASSERT_OK(err, "tracing_multi_session__attach"))
+		goto cleanup;
+
+	/* execute kernel session */
+	prog_fd = bpf_program__fd(skel->progs.test_session_1);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+
+	/* 10 for test_session_1, 1 for test_fsession_s */
+	ASSERT_EQ(skel->bss->test_result_fentry, 11, "test_result_fentry");
+	/* extra count (+1 for each fexit execution) for test_result_fexit cookie check/inc */
+	ASSERT_EQ(skel->bss->test_result_fexit, 22, "test_result_fexit");
+
+	skel->bss->test_result_fentry = 0;
+	skel->bss->test_result_fexit = 0;
+
+	/* execute bpf_testmod.ko session */
+	ASSERT_OK(trigger_module_test_read(1), "trigger_read");
+
+	/* 5 for test_session_2 */
+	ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry");
+	/* extra count (+1 for each fexit execution) for test_result_fexit cookie */
+	ASSERT_EQ(skel->bss->test_result_fexit, 10, "test_result_fexit");
+
+
+cleanup:
+	tracing_multi_session__destroy(skel);
+}
+
+static void test_attach_api_fails(void)
+{
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	static const char * const func[] = {
+		"bpf_fentry_test2",
+	};
+	struct tracing_multi_fail *skel = NULL;
+	__u32 ids[2] = {}, *ids2 = NULL;
+	__u64 cookies[2];
+
+	skel = tracing_multi_fail__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_fail__open_and_load"))
+		return;
+
+	/* fail#1 (libbpf) pattern and opts NULL */
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, NULL);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_1"))
+		goto cleanup;
+
+	/* fail#2 (libbpf) pattern and ids */
+	LIBBPF_OPTS_RESET(opts,
+		.ids = ids,
+		.cnt = 2,
+	);
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						"bpf_fentry_test*", &opts);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_2"))
+		goto cleanup;
+
+	/* fail#3 (libbpf) pattern and cookies */
+	LIBBPF_OPTS_RESET(opts,
+		.ids = NULL,
+		.cnt = 2,
+		.cookies = cookies,
+	);
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						"bpf_fentry_test*", &opts);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_3"))
+		goto cleanup;
+
+	/* fail#4 (libbpf) bogus pattern */
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						"bpf_not_really_a_function*", NULL);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_4"))
+		goto cleanup;
+
+	/* fail#5 (kernel) abnormal cnt */
+	LIBBPF_OPTS_RESET(opts,
+		.ids = ids,
+		.cnt = INT_MAX,
+	);
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -E2BIG, "fail_5"))
+		goto cleanup;
+
+	/* fail#6 (kernel) attach sleepable program to not-allowed function */
+	ids2 = get_ids(func, 1, NULL);
+	if (!ASSERT_OK_PTR(ids2, "get_ids"))
+		goto cleanup;
+
+	LIBBPF_OPTS_RESET(opts,
+		.ids = ids2,
+		.cnt = 1,
+	);
+
+	skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s,
+						NULL, &opts);
+	if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry_s), -EINVAL, "fail_6"))
+		goto cleanup;
+
+	/* fail#7 (kernel) attach with duplicate id */
+	ids[0] = ids2[0];
+	ids[1] = ids2[0];
+
+	LIBBPF_OPTS_RESET(opts,
+		.ids = ids,
+		.cnt = 2,
+	);
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_7");
+
+cleanup:
+	tracing_multi_fail__destroy(skel);
+	free(ids2);
+}
+
+void serial_test_tracing_multi_bench_attach(void)
+{
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	struct tracing_multi_bench *skel = NULL;
+	long attach_start_ns, attach_end_ns;
+	long detach_start_ns, detach_end_ns;
+	double attach_delta, detach_delta;
+	struct bpf_link *link = NULL;
+	size_t i, cap = 0, cnt = 0;
+	struct ksyms *ksyms = NULL;
+	void *root = NULL;
+	void *dups = NULL;
+	__u32 *ids = NULL;
+	__u32 nr, type_id;
+	struct btf *btf;
+	int err;
+
+#ifndef __x86_64__
+	test__skip();
+	return;
+#endif
+
+	btf = btf__load_vmlinux_btf();
+	if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf"))
+		return;
+
+	skel = tracing_multi_bench__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_bench__open_and_load"))
+		goto cleanup;
+
+	if (!ASSERT_OK(bpf_get_ksyms(&ksyms, true), "get_syms"))
+		goto cleanup;
+
+	/* Get all ftrace 'safe' symbols.. */
+	for (i = 0; i < ksyms->filtered_cnt; i++) {
+		if (!tsearch(&ksyms->filtered_syms[i], &root, compare)) {
+			ASSERT_FAIL("tsearch failed");
+			goto cleanup;
+		}
+	}
+
+	/*
+	 * Collect names that are not unique in kallsyms. The kernel resolves a
+	 * tracing-multi BTF id to an address with kallsyms_lookup_name(), which
+	 * returns the first symbol of that name. For a duplicate name that may
+	 * be a different (non-ftrace-able) instance than the ftrace-able one in
+	 * available_filter_functions, so attaching to it by BTF id fails with
+	 * -ENOENT (e.g. t_start/t_next/t_stop). ksyms->syms is sorted by name,
+	 * so equal names are adjacent.
+	 */
+	for (i = 1; i < ksyms->sym_cnt; i++) {
+		if (strcmp(ksyms->syms[i].name, ksyms->syms[i - 1].name))
+			continue;
+		if (!tsearch(&ksyms->syms[i].name, &dups, compare)) {
+			ASSERT_FAIL("tsearch failed");
+			goto cleanup;
+		}
+	}
+
+	/* ..and filter them through BTF and btf_type_is_traceable_func. */
+	nr = btf__type_cnt(btf);
+	for (type_id = 1; type_id < nr; type_id++) {
+		const struct btf_type *type;
+		const char *str;
+
+		type = btf__type_by_id(btf, type_id);
+		if (!type)
+			break;
+
+		if (BTF_INFO_KIND(type->info) != BTF_KIND_FUNC)
+			continue;
+
+		str = btf__name_by_offset(btf, type->name_off);
+		if (!str)
+			break;
+
+		if (!tfind(&str, &root, compare))
+			continue;
+
+		/* Skip names that are not unique in kallsyms, see above. */
+		if (tfind(&str, &dups, compare))
+			continue;
+
+		if (!btf_type_is_traceable_func(btf, type))
+			continue;
+
+		err = libbpf_ensure_mem((void **) &ids, &cap, sizeof(*ids), cnt + 1);
+		if (!ASSERT_OK(err, "libbpf_ensure_mem"))
+			goto cleanup;
+
+		ids[cnt++] = type_id;
+	}
+
+	opts.ids = ids;
+	opts.cnt = cnt;
+
+	attach_start_ns = get_time_ns();
+	link = bpf_program__attach_tracing_multi(skel->progs.bench, NULL, &opts);
+	attach_end_ns = get_time_ns();
+
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	detach_start_ns = get_time_ns();
+	bpf_link__destroy(link);
+	detach_end_ns = get_time_ns();
+
+	attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0;
+	detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0;
+
+	printf("%s: found %zu functions\n", __func__, cnt);
+	printf("%s: attached in %7.3lfs\n", __func__, attach_delta);
+	printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
+
+cleanup:
+	tracing_multi_bench__destroy(skel);
+	tdestroy(root, tdestroy_free_nop);
+	tdestroy(dups, tdestroy_free_nop);
+	free_kallsyms_local(ksyms);
+	free(ids);
+	btf__free(btf);
+}
+
+static void tracing_multi_rollback_run(struct tracing_multi_rollback *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	prog_fd = bpf_program__fd(skel->progs.test_fentry);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+
+	/* make sure the rollback code did not leave any program attached */
+	ASSERT_EQ(skel->bss->test_result_fentry, 0, "test_result_fentry");
+	ASSERT_EQ(skel->bss->test_result_fexit, 0, "test_result_fexit");
+}
+
+static void test_rollback_put(void)
+{
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	struct tracing_multi_rollback *skel = NULL;
+	size_t cnt = FUNCS_CNT;
+	__u32 *ids = NULL;
+	int err;
+
+	skel = tracing_multi_rollback__open();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
+		return;
+
+	bpf_program__set_autoload(skel->progs.test_fentry, true);
+	bpf_program__set_autoload(skel->progs.test_fexit, true);
+
+	err = tracing_multi_rollback__load(skel);
+	if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
+		goto cleanup;
+
+	ids = get_ids(bpf_fentry_test, cnt, NULL);
+	if (!ASSERT_OK_PTR(ids, "get_ids"))
+		goto cleanup;
+
+	/*
+	 * Mangle last id to trigger rollback, which needs to do put
+	 * on get-ed trampolines.
+	 */
+	ids[cnt - 1] = 0;
+
+	opts.ids = ids;
+	opts.cnt = cnt;
+
+	skel->bss->pid = getpid();
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	if (!ASSERT_ERR_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+						NULL, &opts);
+	if (!ASSERT_ERR_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	/* We don't really attach any program, but let's make sure. */
+	tracing_multi_rollback_run(skel);
+
+cleanup:
+	tracing_multi_rollback__destroy(skel);
+	free(ids);
+}
+
+static void fillers_cleanup(struct tracing_multi_rollback **skels, int cnt)
+{
+	int i;
+
+	for (i = 0; i < cnt; i++)
+		tracing_multi_rollback__destroy(skels[i]);
+
+	free(skels);
+}
+
+static struct tracing_multi_rollback *extra_load_and_link(void)
+{
+	struct tracing_multi_rollback *skel;
+	int err;
+
+	skel = tracing_multi_rollback__open();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
+		goto cleanup;
+
+	bpf_program__set_autoload(skel->progs.extra, true);
+
+	err = tracing_multi_rollback__load(skel);
+	if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
+		goto cleanup;
+
+	skel->links.extra = bpf_program__attach_trace(skel->progs.extra);
+	if (!ASSERT_OK_PTR(skel->links.extra, "bpf_program__attach_trace"))
+		goto cleanup;
+
+	return skel;
+
+cleanup:
+	tracing_multi_rollback__destroy(skel);
+	return NULL;
+}
+
+static struct tracing_multi_rollback **fillers_load_and_link(int max)
+{
+	struct tracing_multi_rollback **skels, *skel;
+	int i, err;
+
+	skels = calloc(max + 1, sizeof(*skels));
+	if (!ASSERT_OK_PTR(skels, "calloc"))
+		return NULL;
+
+	for (i = 0; i < max; i++) {
+		skel = skels[i] = tracing_multi_rollback__open();
+		if (!ASSERT_OK_PTR(skels[i], "tracing_multi_rollback__open"))
+			goto cleanup;
+
+		bpf_program__set_autoload(skel->progs.filler, true);
+
+		err = tracing_multi_rollback__load(skel);
+		if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
+			goto cleanup;
+
+		skel->links.filler = bpf_program__attach_trace(skel->progs.filler);
+		if (!ASSERT_OK_PTR(skels[i]->links.filler, "bpf_program__attach_trace"))
+			goto cleanup;
+	}
+
+	return skels;
+
+cleanup:
+	fillers_cleanup(skels, i + 1);
+	return NULL;
+}
+
+static void test_rollback_unlink(void)
+{
+	struct tracing_multi_rollback *skel = NULL, *extra;
+	LIBBPF_OPTS(bpf_tracing_multi_opts, opts);
+	struct tracing_multi_rollback **fillers;
+	size_t cnt = FUNCS_CNT;
+	__u32 *ids = NULL;
+	int err, max;
+
+	max = get_bpf_max_tramp_links();
+	if (!ASSERT_GE(max, 1, "bpf_max_tramp_links"))
+		return;
+
+	/* Attach maximum allowed programs to bpf_fentry_test10 */
+	fillers = fillers_load_and_link(max);
+	if (!ASSERT_OK_PTR(fillers, "fillers_load_and_link"))
+		return;
+
+	extra = extra_load_and_link();
+	if (!ASSERT_OK_PTR(extra, "extra_load_and_link"))
+		goto cleanup;
+
+	skel = tracing_multi_rollback__open();
+	if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open"))
+		goto cleanup;
+
+	bpf_program__set_autoload(skel->progs.test_fentry, true);
+	bpf_program__set_autoload(skel->progs.test_fexit, true);
+
+	/*
+	 * Attach tracing_multi link on bpf_fentry_test1-10, which will
+	 * fail on bpf_fentry_test10 function, because it already has
+	 * maximum allowed programs attached.
+	 *
+	 * The rollback needs to unlink already link-ed trampolines and
+	 * put all of them.
+	 */
+	err = tracing_multi_rollback__load(skel);
+	if (!ASSERT_OK(err, "tracing_multi_rollback__load"))
+		goto cleanup;
+
+	ids = get_ids(bpf_fentry_test, cnt, NULL);
+	if (!ASSERT_OK_PTR(ids, "get_ids"))
+		goto cleanup;
+
+	opts.ids = ids;
+	opts.cnt = cnt;
+
+	skel->bss->pid = getpid();
+
+	skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry,
+						NULL, &opts);
+	if (!ASSERT_ERR_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit,
+						NULL, &opts);
+	if (!ASSERT_ERR_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi"))
+		goto cleanup;
+
+	tracing_multi_rollback_run(skel);
+
+cleanup:
+	fillers_cleanup(fillers, max);
+	tracing_multi_rollback__destroy(extra);
+	tracing_multi_rollback__destroy(skel);
+	free(ids);
+}
+
+void serial_test_tracing_multi_attach_rollback(void)
+{
+	if (test__start_subtest("put"))
+		test_rollback_put();
+	if (test__start_subtest("unlink"))
+		test_rollback_unlink();
+}
+
+void test_tracing_multi_test(void)
+{
+#ifndef __x86_64__
+	test__skip();
+	return;
+#endif
+
+	if (test__start_subtest("skel_api"))
+		test_skel_api();
+	if (test__start_subtest("link_api_pattern"))
+		test_link_api_pattern();
+	if (test__start_subtest("link_api_ids"))
+		test_link_api_ids(false);
+	if (test__start_subtest("module_skel_api"))
+		test_module_skel_api();
+	if (test__start_subtest("module_link_api_pattern"))
+		test_module_link_api_pattern();
+	if (test__start_subtest("module_link_api_ids"))
+		test_module_link_api_ids();
+	if (test__start_subtest("intersect"))
+		test_intersect();
+	if (test__start_subtest("cookies"))
+		test_link_api_ids(true);
+	if (test__start_subtest("session"))
+		test_session();
+	if (test__start_subtest("attach_api_fails"))
+		test_attach_api_fails();
+	RUN_TESTS(tracing_multi_verifier);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
index 56cbea280fbd..f0baf5738b75 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
@@ -2,6 +2,7 @@
 
 #include <unistd.h>
 #include <pthread.h>
+#include <fcntl.h>
 #include <test_progs.h>
 #include "uprobe_multi.skel.h"
 #include "uprobe_multi_bench.skel.h"
@@ -536,7 +537,37 @@ static void test_attach_api_fails(void)
 	link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts);
 	if (!ASSERT_ERR(link_fd, "link_fd"))
 		goto cleanup;
-	ASSERT_EQ(link_fd, -EINVAL, "pid_is_wrong");
+	if (!ASSERT_EQ(link_fd, -EINVAL, "pid_is_wrong"))
+		goto cleanup;
+
+	/* wrong path_fd */
+	LIBBPF_OPTS_RESET(opts,
+		.uprobe_multi.path = NULL,
+		.uprobe_multi.path_fd = -1,
+		.uprobe_multi.flags = BPF_F_UPROBE_MULTI_PATH_FD,
+		.uprobe_multi.offsets = (unsigned long *)&offset,
+		.uprobe_multi.cnt = 1,
+	);
+
+	link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts);
+	if (!ASSERT_ERR(link_fd, "link_fd"))
+		goto cleanup;
+	if (!ASSERT_EQ(link_fd, -EBADF, "path_fd_is_wrong"))
+		goto cleanup;
+
+	/* path and path_fd both set with BPF_F_UPROBE_MULTI_PATH_FD flag */
+	LIBBPF_OPTS_RESET(opts,
+		.uprobe_multi.path = path,
+		.uprobe_multi.path_fd = 1,
+		.uprobe_multi.flags = BPF_F_UPROBE_MULTI_PATH_FD,
+		.uprobe_multi.offsets = (unsigned long *)&offset,
+		.uprobe_multi.cnt = 1,
+	);
+
+	link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts);
+	if (!ASSERT_ERR(link_fd, "link_fd"))
+		goto cleanup;
+	ASSERT_EQ(link_fd, -EINVAL, "path_and_path_fd_together");
 
 cleanup:
 	if (link_fd >= 0)
@@ -757,6 +788,65 @@ static void test_link_api(void)
 	__test_link_api(&child);
 }
 
+static void test_link_api_path_fd(void)
+{
+	LIBBPF_OPTS(bpf_link_create_opts, opts);
+	const char *resolve_path = "/proc/self/exe";
+	int prog_fd, link_fd = -1, path_fd = -1;
+	struct uprobe_multi *skel = NULL;
+	unsigned long *offsets = NULL;
+	const char *syms[3] = {
+		"uprobe_multi_func_1",
+		"uprobe_multi_func_2",
+		"uprobe_multi_func_3",
+	};
+	int err;
+
+	err = elf_resolve_syms_offsets(resolve_path, ARRAY_SIZE(syms), syms,
+				       &offsets, STT_FUNC);
+	if (!ASSERT_OK(err, "elf_resolve_syms_offsets"))
+		return;
+
+	path_fd = open(resolve_path, O_RDONLY);
+	if (!ASSERT_GE(path_fd, 0, "path_fd"))
+		goto cleanup;
+
+	opts.uprobe_multi.path_fd = path_fd;
+	opts.uprobe_multi.offsets = offsets;
+	opts.uprobe_multi.cnt = ARRAY_SIZE(syms);
+	opts.uprobe_multi.flags = BPF_F_UPROBE_MULTI_PATH_FD;
+
+	skel = uprobe_multi__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "uprobe_multi__open_and_load"))
+		goto cleanup;
+
+	prog_fd = bpf_program__fd(skel->progs.uprobe);
+	link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts);
+	if (!ASSERT_GE(link_fd, 0, "bpf_link_create"))
+		goto cleanup;
+
+	skel->bss->uprobe_multi_func_1_addr = (__u64)uprobe_multi_func_1;
+	skel->bss->uprobe_multi_func_2_addr = (__u64)uprobe_multi_func_2;
+	skel->bss->uprobe_multi_func_3_addr = (__u64)uprobe_multi_func_3;
+	skel->bss->pid = getpid();
+
+	uprobe_multi_func_1();
+	uprobe_multi_func_2();
+	uprobe_multi_func_3();
+
+	ASSERT_EQ(skel->bss->uprobe_multi_func_1_result, 1, "uprobe_multi_func_1_result");
+	ASSERT_EQ(skel->bss->uprobe_multi_func_2_result, 1, "uprobe_multi_func_2_result");
+	ASSERT_EQ(skel->bss->uprobe_multi_func_3_result, 1, "uprobe_multi_func_3_result");
+
+cleanup:
+	if (link_fd >= 0)
+		close(link_fd);
+	if (path_fd >= 0)
+		close(path_fd);
+	uprobe_multi__destroy(skel);
+	free(offsets);
+}
+
 static struct bpf_program *
 get_program(struct uprobe_multi_consumers *skel, int prog)
 {
@@ -1354,6 +1444,8 @@ void test_uprobe_multi_test(void)
 		test_attach_api_syms();
 	if (test__start_subtest("link_api"))
 		test_link_api();
+	if (test__start_subtest("link_api_path_fd"))
+		test_link_api_path_fd();
 	if (test__start_subtest("bench_uprobe"))
 		test_bench_attach_uprobe();
 	if (test__start_subtest("bench_usdt"))
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 06cd24e37b3f..8a3d69e2453c 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -38,6 +38,7 @@
 #include "verifier_div0.skel.h"
 #include "verifier_div_mod_bounds.skel.h"
 #include "verifier_div_overflow.skel.h"
+#include "verifier_flow_keys.skel.h"
 #include "verifier_global_subprogs.skel.h"
 #include "verifier_global_ptr_args.skel.h"
 #include "verifier_gotol.skel.h"
@@ -92,6 +93,8 @@
 #include "verifier_sockmap_mutate.skel.h"
 #include "verifier_spill_fill.skel.h"
 #include "verifier_spin_lock.skel.h"
+#include "verifier_stack_arg.skel.h"
+#include "verifier_stack_arg_order.skel.h"
 #include "verifier_stack_ptr.skel.h"
 #include "verifier_store_release.skel.h"
 #include "verifier_subprog_precision.skel.h"
@@ -115,6 +118,7 @@
 #include "verifier_xdp.skel.h"
 #include "verifier_xdp_direct_packet_access.skel.h"
 #include "verifier_bits_iter.skel.h"
+#include "verifier_set_retval.skel.h"
 #include "verifier_lsm.skel.h"
 #include "verifier_jit_inline.skel.h"
 #include "irq.skel.h"
@@ -187,6 +191,7 @@ void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_st
 void test_verifier_div0(void)                 { RUN(verifier_div0); }
 void test_verifier_div_mod_bounds(void)       { RUN(verifier_div_mod_bounds); }
 void test_verifier_div_overflow(void)         { RUN(verifier_div_overflow); }
+void test_verifier_flow_keys(void)            { RUN(verifier_flow_keys); }
 void test_verifier_global_subprogs(void)      { RUN(verifier_global_subprogs); }
 void test_verifier_global_ptr_args(void)      { RUN(verifier_global_ptr_args); }
 void test_verifier_gotol(void)                { RUN(verifier_gotol); }
@@ -240,6 +245,8 @@ void test_verifier_sock_addr(void)            { RUN(verifier_sock_addr); }
 void test_verifier_sockmap_mutate(void)       { RUN(verifier_sockmap_mutate); }
 void test_verifier_spill_fill(void)           { RUN(verifier_spill_fill); }
 void test_verifier_spin_lock(void)            { RUN(verifier_spin_lock); }
+void test_verifier_stack_arg(void)            { RUN(verifier_stack_arg); }
+void test_verifier_stack_arg_order(void)      { RUN(verifier_stack_arg_order); }
 void test_verifier_stack_ptr(void)            { RUN(verifier_stack_ptr); }
 void test_verifier_store_release(void)        { RUN(verifier_store_release); }
 void test_verifier_subprog_precision(void)    { RUN(verifier_subprog_precision); }
@@ -262,6 +269,7 @@ void test_verifier_xadd(void)                 { RUN(verifier_xadd); }
 void test_verifier_xdp(void)                  { RUN(verifier_xdp); }
 void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); }
 void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); }
+void test_verifier_set_retval(void)            { RUN(verifier_set_retval); }
 void test_verifier_lsm(void)                  { RUN(verifier_lsm); }
 void test_irq(void)			      { RUN(irq); }
 void test_verifier_mtu(void)		      { RUN(verifier_mtu); }
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_log.c b/tools/testing/selftests/bpf/prog_tests/verifier_log.c
index c01c0114af1b..4542bb586d72 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier_log.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier_log.c
@@ -317,6 +317,7 @@ static void verif_btf_log_subtest(bool bad_btf)
 	res = load_btf(&opts, true);
 	ASSERT_EQ(res, -ENOSPC, "half_log_fd");
 	ASSERT_EQ(strlen(logs.buf), 24, "log_fixed_25");
+	strscpy(op_name, "log_fixed", sizeof(op_name));
 	ASSERT_STRNEQ(logs.buf, logs.reference, 24, op_name);
 
 	/* validate rolling verifier log logic: try all variations of log buf
diff --git a/tools/testing/selftests/bpf/prog_tests/vmlinux.c b/tools/testing/selftests/bpf/prog_tests/vmlinux.c
index 6fb2217d940b..b5fdd593910d 100644
--- a/tools/testing/selftests/bpf/prog_tests/vmlinux.c
+++ b/tools/testing/selftests/bpf/prog_tests/vmlinux.c
@@ -14,21 +14,61 @@ static void nsleep()
 	(void)syscall(__NR_nanosleep, &ts, NULL);
 }
 
+static const char *hrtimer_func = "hrtimer_start_range_ns";
+
+static int setup_hrtimer_progs(struct test_vmlinux *skel)
+{
+	int err;
+
+	if (libbpf_find_vmlinux_btf_id("hrtimer_start_range_ns_user", BPF_TRACE_FENTRY) > 0)
+		hrtimer_func = "hrtimer_start_range_ns_user";
+
+	err = bpf_program__set_attach_target(skel->progs.handle__fentry, 0, hrtimer_func);
+	if (err)
+		return err;
+
+	/*
+	 * Bare SEC("kprobe") has no target function, so attach it manually
+	 * later after selecting the hrtimer function to probe.
+	 */
+	bpf_program__set_autoattach(skel->progs.handle__kprobe, false);
+
+	return 0;
+}
+
 void test_vmlinux(void)
 {
 	int err;
 	struct test_vmlinux* skel;
 	struct test_vmlinux__bss *bss;
+	struct bpf_link *kprobe_link = NULL;
 
-	skel = test_vmlinux__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "test_vmlinux__open_and_load"))
+	skel = test_vmlinux__open();
+	if (!ASSERT_OK_PTR(skel, "test_vmlinux__open"))
 		return;
+
+	err = setup_hrtimer_progs(skel);
+	if (!ASSERT_OK(err, "setup_hrtimer_progs"))
+		goto cleanup;
+
+	err = test_vmlinux__load(skel);
+	if (!ASSERT_OK(err, "test_vmlinux__load"))
+		goto cleanup;
+
 	bss = skel->bss;
 
 	err = test_vmlinux__attach(skel);
 	if (!ASSERT_OK(err, "test_vmlinux__attach"))
 		goto cleanup;
 
+	/* manually attach kprobe with the selected function */
+	if (hrtimer_func) {
+		kprobe_link = bpf_program__attach_kprobe(skel->progs.handle__kprobe,
+							 false /* retprobe */, hrtimer_func);
+		if (!ASSERT_OK_PTR(kprobe_link, "bpf_program__attach_kprobe"))
+			goto cleanup;
+	}
+
 	/* trigger everything */
 	nsleep();
 
@@ -39,5 +79,6 @@ void test_vmlinux(void)
 	ASSERT_TRUE(bss->fentry_called, "fentry");
 
 cleanup:
+	bpf_link__destroy(kprobe_link);
 	test_vmlinux__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/wakeup_source.c b/tools/testing/selftests/bpf/prog_tests/wakeup_source.c
new file mode 100644
index 000000000000..ebfdc03271b9
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/wakeup_source.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2026 Google LLC */
+
+#include <test_progs.h>
+#include <bpf/btf.h>
+#include <fcntl.h>
+#include "test_wakeup_source.skel.h"
+#include "wakeup_source_fail.skel.h"
+#include "progs/wakeup_source.h"
+
+static int lock_ws(const char *name)
+{
+	int fd;
+	ssize_t bytes;
+
+	fd = open("/sys/power/wake_lock", O_WRONLY);
+	if (!ASSERT_OK_FD(fd, "open /sys/power/wake_lock"))
+		return -1;
+
+	bytes = write(fd, name, strlen(name));
+	close(fd);
+	if (!ASSERT_EQ(bytes, strlen(name), "write to wake_lock"))
+		return -1;
+
+	return 0;
+}
+
+static void unlock_ws(const char *name)
+{
+	int fd;
+
+	fd = open("/sys/power/wake_unlock", O_WRONLY);
+	if (fd < 0)
+		return;
+
+	write(fd, name, strlen(name));
+	close(fd);
+}
+
+struct rb_ctx {
+	const char *name;
+	bool found;
+	long long active_time_ns;
+	long long total_time_ns;
+};
+
+static int process_sample(void *ctx, void *data, size_t len)
+{
+	struct rb_ctx *rb_ctx = ctx;
+	struct wakeup_event_t *e = data;
+
+	if (strcmp(e->name, rb_ctx->name) == 0) {
+		rb_ctx->found = true;
+		rb_ctx->active_time_ns = e->active_time_ns;
+		rb_ctx->total_time_ns = e->total_time_ns;
+	}
+	return 0;
+}
+
+void test_wakeup_source(void)
+{
+	struct btf *btf;
+	int id;
+
+	btf = btf__load_vmlinux_btf();
+	if (!ASSERT_OK_PTR(btf, "btf_vmlinux"))
+		return;
+
+	id = btf__find_by_name_kind(btf, "bpf_wakeup_sources_get_head", BTF_KIND_FUNC);
+	btf__free(btf);
+
+	if (id < 0) {
+		printf("%s:SKIP:bpf_wakeup_sources_get_head kfunc not found in BTF\n", __func__);
+		test__skip();
+		return;
+	}
+
+	if (test__start_subtest("iterate_and_verify_times")) {
+		struct test_wakeup_source *skel;
+		struct ring_buffer *rb = NULL;
+		struct rb_ctx rb_ctx = {
+			.name = "bpf_selftest_ws_times",
+			.found = false,
+		};
+		int err;
+
+		skel = test_wakeup_source__open_and_load();
+		if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+			return;
+
+		rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), process_sample, &rb_ctx, NULL);
+		if (!ASSERT_OK_PTR(rb, "ring_buffer__new"))
+			goto destroy;
+
+		/* Create a temporary wakeup source */
+		if (!ASSERT_OK(lock_ws(rb_ctx.name), "lock_ws"))
+			goto unlock;
+
+		err = bpf_prog_test_run_opts(bpf_program__fd(
+				skel->progs.iterate_wakeupsources), NULL);
+		ASSERT_OK(err, "bpf_prog_test_run");
+
+		ring_buffer__consume(rb);
+
+		ASSERT_TRUE(rb_ctx.found, "found_test_ws_in_rb");
+		ASSERT_GT(rb_ctx.active_time_ns, 0, "active_time_gt_0");
+		ASSERT_GT(rb_ctx.total_time_ns, 0, "total_time_gt_0");
+
+unlock:
+		unlock_ws(rb_ctx.name);
+destroy:
+		if (rb)
+			ring_buffer__free(rb);
+		test_wakeup_source__destroy(skel);
+	}
+
+	RUN_TESTS(wakeup_source_fail);
+}
diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c
index d1841aac94a2..2e7751a85399 100644
--- a/tools/testing/selftests/bpf/progs/arena_atomics.c
+++ b/tools/testing/selftests/bpf/progs/arena_atomics.c
@@ -5,7 +5,7 @@
 #include <bpf/bpf_tracing.h>
 #include <stdbool.h>
 #include <stdatomic.h>
-#include "bpf_arena_common.h"
+#include <bpf_arena_common.h>
 #include "../../../include/linux/filter.h"
 #include "bpf_misc.h"
 
diff --git a/tools/testing/selftests/bpf/progs/arena_spin_lock.c b/tools/testing/selftests/bpf/progs/arena_spin_lock.c
index 086b57a426cf..cf7cda79c16c 100644
--- a/tools/testing/selftests/bpf/progs/arena_spin_lock.c
+++ b/tools/testing/selftests/bpf/progs/arena_spin_lock.c
@@ -4,7 +4,8 @@
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
-#include "bpf_arena_spin_lock.h"
+#include <bpf_arena_common.h>
+#include <bpf_arena_spin_lock.h>
 
 struct {
 	__uint(type, BPF_MAP_TYPE_ARENA);
diff --git a/tools/testing/selftests/bpf/progs/bench_bpf_timing.bpf.h b/tools/testing/selftests/bpf/progs/bench_bpf_timing.bpf.h
new file mode 100644
index 000000000000..6a1ad75f1fd7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bench_bpf_timing.bpf.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#ifndef __BENCH_BPF_TIMING_BPF_H__
+#define __BENCH_BPF_TIMING_BPF_H__
+
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf_may_goto.h>
+
+#ifndef BENCH_NR_SAMPLES
+#define BENCH_NR_SAMPLES	4096
+#endif
+#ifndef BENCH_NR_CPUS
+#define BENCH_NR_CPUS		256
+#endif
+#define BENCH_CPU_MASK		(BENCH_NR_CPUS - 1)
+
+__u64 timing_samples[BENCH_NR_CPUS][BENCH_NR_SAMPLES];
+__u32 timing_idx[BENCH_NR_CPUS];
+
+volatile __u32 batch_iters;
+volatile __u32 timing_enabled;
+
+static __always_inline void bench_record_sample(__u64 elapsed_ns)
+{
+	__u32 cpu, idx;
+
+	if (!timing_enabled)
+		return;
+
+	cpu = bpf_get_smp_processor_id() & BENCH_CPU_MASK;
+	idx = timing_idx[cpu];
+
+	if (idx >= BENCH_NR_SAMPLES)
+		return;
+
+	timing_samples[cpu][idx] = elapsed_ns;
+	timing_idx[cpu] = idx + 1;
+}
+
+/*
+ * @body:  expression to time; return value (int) stored in __bench_result.
+ * @reset: undo body's side-effects so each iteration starts identically.
+ *         May reference __bench_result.  Use ({}) for empty reset.
+ *
+ * Runs up to batch_iters timed iterations (can_loop may end the loop early),
+ * then one untimed iteration whose value the macro yields (for validation).
+ */
+#define BENCH_BPF_LOOP(body, reset) ({					\
+	__u64 __bench_start = bpf_ktime_get_ns();			\
+	__u32 __bench_i;						\
+	int __bench_result;						\
+									\
+	for (__bench_i = 0;						\
+	     __bench_i < batch_iters && can_loop;			\
+	     __bench_i++) {						\
+		__bench_result = (body);				\
+		reset;							\
+	}								\
+									\
+	bench_record_sample(bpf_ktime_get_ns() - __bench_start);	\
+									\
+	__bench_result = (body);					\
+	__bench_result;							\
+})
+
+#endif /* __BENCH_BPF_TIMING_BPF_H__ */
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_rhash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_rhash_map.c
new file mode 100644
index 000000000000..86f6c0d5eadb
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_rhash_map.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RHASH);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__uint(max_entries, 64);
+	__type(key, __u32);
+	__type(value, __u64);
+} rhashmap SEC(".maps");
+
+__u32 key_sum = 0;
+__u64 val_sum = 0;
+__u32 elem_count = 0;
+__u32 err = 0;
+
+SEC("iter/bpf_map_elem")
+int dump_bpf_rhash_map(struct bpf_iter__bpf_map_elem *ctx)
+{
+	__u32 *key = ctx->key;
+	__u64 *val = ctx->value;
+
+	if (!key || !val)
+		return 0;
+
+	key_sum += *key;
+	val_sum += *val;
+	elem_count++;
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
index d64ba7ddaed5..d7fb561ed4fb 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
@@ -52,7 +52,7 @@ SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx)
 		bpf_d_path(&file->f_path, d_path_buf, D_PATH_BUF_SIZE);
 
 		BPF_SEQ_PRINTF(seq, "%08llx ", vma->vm_pgoff << 12);
-		BPF_SEQ_PRINTF(seq, "%02x:%02x %u", MAJOR(dev), MINOR(dev),
+		BPF_SEQ_PRINTF(seq, "%02x:%02x %llu", MAJOR(dev), MINOR(dev),
 			       file->f_inode->i_ino);
 		BPF_SEQ_PRINTF(seq, "\t%s\n", d_path_buf);
 	} else {
diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
index a0d7b15a24b1..9eeb5b0b63d6 100644
--- a/tools/testing/selftests/bpf/progs/bpf_misc.h
+++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
@@ -152,6 +152,7 @@
 #define __auxiliary		__test_tag("test_auxiliary")
 #define __auxiliary_unpriv	__test_tag("test_auxiliary_unpriv")
 #define __btf_path(path)	__test_tag("test_btf_path=" path)
+#define __btf_func_path(path)	__test_tag("test_btf_func_path=" path)
 #define __arch(arch)		__test_tag("test_arch=" arch)
 #define __arch_x86_64		__arch("X86_64")
 #define __arch_arm64		__arch("ARM64")
diff --git a/tools/testing/selftests/bpf/progs/bpf_nop_bench.c b/tools/testing/selftests/bpf/progs/bpf_nop_bench.c
new file mode 100644
index 000000000000..01ed284c1bb3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_nop_bench.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bench_bpf_timing.bpf.h"
+
+SEC("syscall")
+int bench_nop(void *ctx)
+{
+	return BENCH_BPF_LOOP(0, ({}));
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_dynptr_use_after_invalidate_clone.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_dynptr_use_after_invalidate_clone.c
new file mode 100644
index 000000000000..ac626cfa2a98
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_dynptr_use_after_invalidate_clone.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include "bpf_experimental.h"
+#include "bpf_qdisc_common.h"
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+int proto;
+
+SEC("struct_ops")
+__success
+int BPF_PROG(dynptr_use_after_invalidate_clone, struct sk_buff *skb, struct Qdisc *sch,
+	     struct bpf_sk_buff_ptr *to_free)
+{
+	struct bpf_dynptr ptr, ptr_clone;
+	struct ethhdr *hdr;
+
+	bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr);
+	/* Clone the skb dynptr; the slice below is taken from the clone, not from ptr. */
+	bpf_dynptr_clone(&ptr, &ptr_clone);
+
+	hdr = bpf_dynptr_slice(&ptr_clone, 0, NULL, sizeof(*hdr));
+	if (!hdr) {
+		bpf_qdisc_skb_drop(skb, to_free);
+		return NET_XMIT_DROP;
+	}
+	/* Overwrite the first int of the parent dynptr object directly. */
+	*(int *)&ptr = 0;
+	/* Tagged __success: hdr, sliced from ptr_clone, is expected to stay readable. */
+	proto = hdr->h_proto;
+
+	bpf_qdisc_skb_drop(skb, to_free);
+
+	return NET_XMIT_DROP;
+}
+
+SEC("struct_ops")
+__auxiliary
+struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch)
+{
+	return NULL; /* stub: always returns NULL */
+}
+
+SEC("struct_ops")
+__auxiliary
+int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt,
+	     struct netlink_ext_ack *extack)
+{
+	return 0; /* stub: always returns 0 */
+}
+
+SEC("struct_ops")
+__auxiliary
+void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) /* stub: empty body */
+{
+}
+
+SEC("struct_ops")
+__auxiliary
+void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) /* stub: empty body */
+{
+}
+
+SEC(".struct_ops")
+struct Qdisc_ops test = {
+	.enqueue   = (void *)dynptr_use_after_invalidate_clone, /* only callback not tagged __auxiliary */
+	.dequeue   = (void *)bpf_qdisc_test_dequeue, /* this and the entries below are the stubs above */
+	.init      = (void *)bpf_qdisc_test_init,
+	.reset     = (void *)bpf_qdisc_test_reset,
+	.destroy   = (void *)bpf_qdisc_test_destroy,
+	.id        = "bpf_qdisc_test",
+};
diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr.c
new file mode 100644
index 000000000000..1d96f7987a3f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include "bpf_experimental.h"
+#include "bpf_qdisc_common.h"
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+int proto;
+
+SEC("struct_ops")
+__failure __msg("Expected an initialized dynptr as R1")
+int BPF_PROG(invalid_dynptr, struct sk_buff *skb, struct Qdisc *sch,
+	     struct bpf_sk_buff_ptr *to_free)
+{
+	struct bpf_dynptr ptr;
+	struct ethhdr *hdr;
+
+	bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr);
+	/* Drop the skb before ptr, which was created from it, is used at all. */
+	bpf_qdisc_skb_drop(skb, to_free);
+	/* Expected rejection point: ptr is the first argument of bpf_dynptr_slice(). */
+	hdr = bpf_dynptr_slice(&ptr, 0, NULL, sizeof(*hdr));
+	if (!hdr)
+		return NET_XMIT_DROP;
+
+	proto = hdr->h_proto;
+
+	return NET_XMIT_DROP;
+}
+
+SEC("struct_ops")
+__auxiliary
+struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch)
+{
+	return NULL; /* stub: always returns NULL */
+}
+
+SEC("struct_ops")
+__auxiliary
+int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt,
+	     struct netlink_ext_ack *extack)
+{
+	return 0; /* stub: always returns 0 */
+}
+
+SEC("struct_ops")
+__auxiliary
+void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) /* stub: empty body */
+{
+}
+
+SEC("struct_ops")
+__auxiliary
+void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) /* stub: empty body */
+{
+}
+
+SEC(".struct_ops")
+struct Qdisc_ops test = {
+	.enqueue   = (void *)invalid_dynptr, /* only callback not tagged __auxiliary */
+	.dequeue   = (void *)bpf_qdisc_test_dequeue, /* this and the entries below are the stubs above */
+	.init      = (void *)bpf_qdisc_test_init,
+	.reset     = (void *)bpf_qdisc_test_reset,
+	.destroy   = (void *)bpf_qdisc_test_destroy,
+	.id        = "bpf_qdisc_test",
+};
diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_cross_frame.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_cross_frame.c
new file mode 100644
index 000000000000..2e23b8593af9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_cross_frame.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include "bpf_experimental.h"
+#include "bpf_qdisc_common.h"
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+int proto;
+
+static __noinline int free_skb(struct sk_buff *skb)
+{
+	bpf_kfree_skb(skb); /* the skb is freed in this separate __noinline function */
+	return 0; /* always 0; the caller in this file ignores it */
+}
+
+SEC("struct_ops")
+__failure __msg("invalid mem access 'scalar'")
+int BPF_PROG(invalid_dynptr_cross_frame, struct sk_buff *skb, struct Qdisc *sch,
+	     struct bpf_sk_buff_ptr *to_free)
+{
+	struct bpf_dynptr ptr;
+	struct ethhdr *hdr;
+
+	bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr);
+
+	hdr = bpf_dynptr_slice(&ptr, 0, NULL, sizeof(*hdr));
+	if (!hdr)
+		return NET_XMIT_DROP;
+	/* Free the skb through the __noinline helper, after hdr was sliced from it. */
+	free_skb(skb);
+	/* Expected rejection point: hdr is dereferenced after free_skb() returned. */
+	proto = hdr->h_proto;
+
+	return NET_XMIT_DROP;
+}
+
+SEC("struct_ops")
+__auxiliary
+struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch)
+{
+	return NULL; /* stub: always returns NULL */
+}
+
+SEC("struct_ops")
+__auxiliary
+int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt,
+	     struct netlink_ext_ack *extack)
+{
+	return 0; /* stub: always returns 0 */
+}
+
+SEC("struct_ops")
+__auxiliary
+void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) /* stub: empty body */
+{
+}
+
+SEC("struct_ops")
+__auxiliary
+void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) /* stub: empty body */
+{
+}
+
+SEC(".struct_ops")
+struct Qdisc_ops test = {
+	.enqueue   = (void *)invalid_dynptr_cross_frame, /* only callback not tagged __auxiliary */
+	.dequeue   = (void *)bpf_qdisc_test_dequeue, /* this and the entries below are the stubs above */
+	.init      = (void *)bpf_qdisc_test_init,
+	.reset     = (void *)bpf_qdisc_test_reset,
+	.destroy   = (void *)bpf_qdisc_test_destroy,
+	.id        = "bpf_qdisc_test",
+};
diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_slice.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_slice.c
new file mode 100644
index 000000000000..731216c4e45a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_slice.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include "bpf_experimental.h"
+#include "bpf_qdisc_common.h"
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+int proto;
+
+SEC("struct_ops")
+__failure __msg("invalid mem access 'scalar'")
+int BPF_PROG(invalid_dynptr_slice, struct sk_buff *skb, struct Qdisc *sch,
+	     struct bpf_sk_buff_ptr *to_free)
+{
+	struct bpf_dynptr ptr;
+	struct ethhdr *hdr;
+
+	bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr);
+
+	hdr = bpf_dynptr_slice(&ptr, 0, NULL, sizeof(*hdr));
+	if (!hdr) {
+		bpf_qdisc_skb_drop(skb, to_free);
+		return NET_XMIT_DROP;
+	}
+	/* hdr is non-NULL here; drop the skb that the slice was taken from. */
+	bpf_qdisc_skb_drop(skb, to_free);
+	/* Expected rejection point: hdr is read after the drop. */
+	proto = hdr->h_proto;
+
+	return NET_XMIT_DROP;
+}
+
+SEC("struct_ops")
+__auxiliary
+struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch)
+{
+	return NULL; /* stub: always returns NULL */
+}
+
+SEC("struct_ops")
+__auxiliary
+int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt,
+	     struct netlink_ext_ack *extack)
+{
+	return 0; /* stub: always returns 0 */
+}
+
+SEC("struct_ops")
+__auxiliary
+void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) /* stub: empty body */
+{
+}
+
+SEC("struct_ops")
+__auxiliary
+void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) /* stub: empty body */
+{
+}
+
+SEC(".struct_ops")
+struct Qdisc_ops test = {
+	.enqueue   = (void *)invalid_dynptr_slice, /* only callback not tagged __auxiliary */
+	.dequeue   = (void *)bpf_qdisc_test_dequeue, /* this and the entries below are the stubs above */
+	.init      = (void *)bpf_qdisc_test_init,
+	.reset     = (void *)bpf_qdisc_test_reset,
+	.destroy   = (void *)bpf_qdisc_test_destroy,
+	.id        = "bpf_qdisc_test",
+};
diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c
index 1a3233a275c7..8107f5934d2d 100644
--- a/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c
+++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c
@@ -196,18 +196,13 @@ fq_flows_remove_front(struct bpf_list_head *head, struct bpf_spin_lock *lock,
 static bool
 fq_flows_is_empty(struct bpf_list_head *head, struct bpf_spin_lock *lock)
 {
-	struct bpf_list_node *node;
+	bool empty;
 
 	bpf_spin_lock(lock);
-	node = bpf_list_pop_front(head);
-	if (node) {
-		bpf_list_push_front(head, node);
-		bpf_spin_unlock(lock);
-		return false;
-	}
+	empty = bpf_list_empty(head); /* sampled while lock is held */
 	bpf_spin_unlock(lock);
 
-	return true;
+	return empty;
 }
 
 /* flow->age is used to denote the state of the flow (not-detached, detached, throttled)
diff --git a/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c b/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c
new file mode 100644
index 000000000000..8d38aafe66a2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "../test_kmods/bpf_testmod_kfunc.h"
+
+#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \
+	defined(__BPF_FEATURE_STACK_ARGUMENT)
+/* Six-parameter form: passes a..e, a zeroed 8-byte buffer and size to the kfunc. */
+long subprog_call_mem_kfunc(long a, long b, long c, long d, long e, long size)
+{
+	char buf[8] = {};
+
+	return bpf_kfunc_call_stack_arg_mem(a, b, c, d, e, buf, size);
+}
+
+#else /* other target arch, or __BPF_FEATURE_STACK_ARGUMENT not defined */
+/* Fallback: same name, no parameters, always returns 0. */
+long subprog_call_mem_kfunc(void)
+{
+	return 0;
+}
+
+#endif /* x86/arm64 && __BPF_FEATURE_STACK_ARGUMENT */
diff --git a/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c b/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c
new file mode 100644
index 000000000000..99bc115f8380
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \
+	defined(__BPF_FEATURE_STACK_ARGUMENT)
+/* Full signatures: three functions summing six ints, plus one empty seven-argument function. */
+int subprog_bad_order_6args(int a, int b, int c, int d, int e, int f)
+{
+	return a + b + c + d + e + f;
+}
+
+int subprog_call_before_load_6args(int a, int b, int c, int d, int e, int f)
+{
+	return a + b + c + d + e + f;
+}
+
+int subprog_pruning_call_before_load_6args(int a, int b, int c, int d, int e, int f)
+{
+	return a + b + c + d + e + f;
+}
+
+void subprog_bad_ptr_7args(long *a, int b, int c, int d, int e, int f, int g)
+{
+}
+
+#else /* other target arch, or __BPF_FEATURE_STACK_ARGUMENT not defined */
+/* Fallbacks: same names, no parameters; the int ones return 0, the void one is empty. */
+int subprog_bad_order_6args(void)
+{
+	return 0;
+}
+
+int subprog_call_before_load_6args(void)
+{
+	return 0;
+}
+
+int subprog_pruning_call_before_load_6args(void)
+{
+	return 0;
+}
+
+void subprog_bad_ptr_7args(void)
+{
+}
+
+#endif /* x86/arm64 && __BPF_FEATURE_STACK_ARGUMENT */
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
index 9fe9c4a4e8f6..d0d65d6d450c 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
@@ -29,7 +29,7 @@ static struct __cgrps_kfunc_map_value *insert_lookup_cgrp(struct cgroup *cgrp)
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *acquired;
@@ -48,7 +48,7 @@ int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(cgrp_kfunc_acquire_no_null_check, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *acquired;
@@ -64,7 +64,7 @@ int BPF_PROG(cgrp_kfunc_acquire_no_null_check, struct cgroup *cgrp, const char *
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("arg#0 pointer type STRUCT cgroup must point")
+__failure __msg("R1 pointer type STRUCT cgroup must point")
 int BPF_PROG(cgrp_kfunc_acquire_fp, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *acquired, *stack_cgrp = (struct cgroup *)&path;
@@ -106,7 +106,7 @@ int BPF_PROG(cgrp_kfunc_acquire_trusted_walked, struct cgroup *cgrp, const char
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(cgrp_kfunc_acquire_null, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *acquired;
@@ -154,7 +154,7 @@ int BPF_PROG(cgrp_kfunc_xchg_unreleased, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("must be referenced or trusted")
+__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1")
 int BPF_PROG(cgrp_kfunc_rcu_get_release, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *kptr;
@@ -175,7 +175,7 @@ int BPF_PROG(cgrp_kfunc_rcu_get_release, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path)
 {
 	struct __cgrps_kfunc_map_value *v;
@@ -191,7 +191,7 @@ int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("arg#0 pointer type STRUCT cgroup must point")
+__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1")
 int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path)
 {
 	struct cgroup *acquired = (struct cgroup *)&path;
@@ -203,7 +203,7 @@ int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path)
 {
 	struct __cgrps_kfunc_map_value local, *v;
@@ -237,7 +237,7 @@ int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("release kernel function bpf_cgroup_release expects")
+__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1")
 int BPF_PROG(cgrp_kfunc_release_unacquired, struct cgroup *cgrp, const char *path)
 {
 	/* Cannot release trusted cgroup pointer which was not acquired. */
diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
index a2de95f85648..37bd6b03ba01 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
@@ -4,6 +4,7 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include "bpf_misc.h"
+#include "err.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -16,6 +17,7 @@ struct {
 
 __s32 target_pid;
 __u64 cgroup_id;
+long update_err;
 int target_hid;
 bool is_cgroup1;
 
@@ -123,3 +125,19 @@ int yes_rcu_lock(void *ctx)
 	bpf_rcu_read_unlock();
 	return 0;
 }
+
+SEC("fexit/bpf_local_storage_update")
+int BPF_PROG(fexit_update, void *owner, struct bpf_local_storage_map *smap,
+	     void *value, u64 map_flags, bool swap_uptrs,
+	     struct bpf_local_storage_data *ret)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+
+	if (task->pid != target_pid)
+		return 0; /* current task is not target_pid: nothing to record */
+	/* If ret carries an error value, store it in update_err; otherwise leave update_err alone. */
+	if (IS_ERR_VALUE(ret))
+		update_err = PTR_ERR(ret);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/compute_live_registers.c b/tools/testing/selftests/bpf/progs/compute_live_registers.c
index f05e120f3450..d055fc7b3b95 100644
--- a/tools/testing/selftests/bpf/progs/compute_live_registers.c
+++ b/tools/testing/selftests/bpf/progs/compute_live_registers.c
@@ -3,7 +3,7 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include "../../../include/linux/filter.h"
-#include "bpf_arena_common.h"
+#include <bpf_arena_common.h>
 #include "bpf_misc.h"
 
 struct {
diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c
index 61c32e91e8c3..4c45346fe6f7 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_failure.c
+++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c
@@ -45,7 +45,7 @@ int BPF_PROG(test_alloc_no_release, struct task_struct *task, u64 clone_flags)
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("NULL pointer passed to trusted arg0")
+__failure __msg("NULL pointer passed to trusted R1")
 int BPF_PROG(test_alloc_double_release, struct task_struct *task, u64 clone_flags)
 {
 	struct bpf_cpumask *cpumask;
@@ -73,7 +73,7 @@ int BPF_PROG(test_acquire_wrong_cpumask, struct task_struct *task, u64 clone_fla
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("bpf_cpumask_set_cpu args#1 expected pointer to STRUCT bpf_cpumask")
+__failure __msg("bpf_cpumask_set_cpu R2 expected pointer to STRUCT bpf_cpumask")
 int BPF_PROG(test_mutate_cpumask, struct task_struct *task, u64 clone_flags)
 {
 	/* Can't set the CPU of a non-struct bpf_cpumask. */
@@ -107,7 +107,7 @@ int BPF_PROG(test_insert_remove_no_release, struct task_struct *task, u64 clone_
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("NULL pointer passed to trusted arg0")
+__failure __msg("NULL pointer passed to trusted R1")
 int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags)
 {
   /* NULL passed to kfunc. */
@@ -151,7 +151,7 @@ int BPF_PROG(test_global_mask_out_of_rcu, struct task_struct *task, u64 clone_fl
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("NULL pointer passed to trusted arg1")
+__failure __msg("NULL pointer passed to trusted R2")
 int BPF_PROG(test_global_mask_no_null_check, struct task_struct *task, u64 clone_flags)
 {
 	struct bpf_cpumask *local, *prev;
@@ -179,7 +179,7 @@ int BPF_PROG(test_global_mask_no_null_check, struct task_struct *task, u64 clone
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("Possibly NULL pointer passed to helper arg2")
+__failure __msg("Possibly NULL pointer passed to helper R2")
 int BPF_PROG(test_global_mask_rcu_no_null_check, struct task_struct *task, u64 clone_flags)
 {
 	struct bpf_cpumask *prev, *curr;
diff --git a/tools/testing/selftests/bpf/progs/cpumask_success.c b/tools/testing/selftests/bpf/progs/cpumask_success.c
index 0e04c31b91c0..774706e7b058 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_success.c
+++ b/tools/testing/selftests/bpf/progs/cpumask_success.c
@@ -866,7 +866,7 @@ int BPF_PROG(test_populate, struct task_struct *task, u64 clone_flags)
 	 * access NR_CPUS, the upper bound for nr_cpus, so we infer
 	 * it from the size of cpumask_t.
 	 */
-	if (nr_cpus < 0 || nr_cpus >= CPUMASK_TEST_MASKLEN * 8) {
+	if (nr_cpus < 0 || nr_cpus > CPUMASK_TEST_MASKLEN * 8) {
 		err = 3;
 		goto out;
 	}
diff --git a/tools/testing/selftests/bpf/progs/crypto_bench.c b/tools/testing/selftests/bpf/progs/crypto_bench.c
index 4ac956b26240..4c0a09aa1e6c 100644
--- a/tools/testing/selftests/bpf/progs/crypto_bench.c
+++ b/tools/testing/selftests/bpf/progs/crypto_bench.c
@@ -11,10 +11,19 @@
 #include "crypto_common.h"
 
 const volatile unsigned int len = 16;
-char cipher[128] = {};
+/*
+ * cipher[] and key[] are 8-byte aligned and 'params' is kept off the stack to
+ * work around an LLVM code generation bug. clang lowers the memcpy() of these
+ * byte-aligned globals into a per-byte load/store sequence staged on the stack,
+ * and additionally materializes the on-stack 'struct bpf_crypto_params' twice.
+ * Both blow the 512-byte BPF stack limit. Aligning the sources lets clang copy
+ * word-wise, and a global 'params' removes the large object from the stack.
+ */
+char cipher[128] __attribute__((aligned(8))) = {};
 u32 key_len, authsize;
 char dst[256] = {};
-u8 key[256] = {};
+u8 key[256] __attribute__((aligned(8))) = {};
+static struct bpf_crypto_params params;
 long hits = 0;
 int status;
 
@@ -22,11 +31,6 @@ SEC("syscall")
 int crypto_setup(void *args)
 {
 	struct bpf_crypto_ctx *cctx;
-	struct bpf_crypto_params params = {
-		.type = "skcipher",
-		.key_len = key_len,
-		.authsize = authsize,
-	};
 	int err = 0;
 
 	status = 0;
@@ -36,6 +40,9 @@ int crypto_setup(void *args)
 		return 0;
 	}
 
+	__builtin_memcpy(&params.type, "skcipher", sizeof("skcipher"));
+	params.key_len = key_len;
+	params.authsize = authsize;
 	__builtin_memcpy(&params.algo, cipher, sizeof(cipher));
 	__builtin_memcpy(&params.key, key, sizeof(key));
 	cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err);
diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c
index dfd8a258f14a..e81f5ac3b1ae 100644
--- a/tools/testing/selftests/bpf/progs/crypto_sanity.c
+++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c
@@ -10,11 +10,20 @@
 #include "bpf_kfuncs.h"
 #include "crypto_common.h"
 
-unsigned char key[256] = {};
+/*
+ * key[] and algo[] are 8-byte aligned and 'params' is kept off the stack to
+ * work around an LLVM code generation bug. clang lowers the memcpy() of these
+ * byte-aligned globals into a per-byte load/store sequence staged on the stack,
+ * and additionally materializes the on-stack 'struct bpf_crypto_params' twice.
+ * Both blow the 512-byte BPF stack limit. Aligning the sources lets clang copy
+ * word-wise, and a global 'params' removes the large object from the stack.
+ */
+unsigned char key[256] __attribute__((aligned(8))) = {};
 u16 udp_test_port = 7777;
 u32 authsize, key_len;
-char algo[128] = {};
+char algo[128] __attribute__((aligned(8))) = {};
 char dst[16] = {}, dst_bad[8] = {};
+static struct bpf_crypto_params params;
 int status;
 
 static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc)
@@ -53,11 +62,6 @@ static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc)
 SEC("syscall")
 int skb_crypto_setup(void *ctx)
 {
-	struct bpf_crypto_params params = {
-		.type = "skcipher",
-		.key_len = key_len,
-		.authsize = authsize,
-	};
 	struct bpf_crypto_ctx *cctx;
 	int err;
 
@@ -67,6 +71,9 @@ int skb_crypto_setup(void *ctx)
 		return 0;
 	}
 
+	__builtin_memcpy(&params.type, "skcipher", sizeof("skcipher"));
+	params.key_len = key_len;
+	params.authsize = authsize;
 	__builtin_memcpy(&params.algo, algo, sizeof(algo));
 	__builtin_memcpy(&params.key, key, sizeof(key));
 
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index b62773ce5219..344fb2aa0813 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -78,7 +78,7 @@ static int get_map_val_dynptr(struct bpf_dynptr *ptr)
  * bpf_ringbuf_submit/discard_dynptr call
  */
 SEC("?raw_tp")
-__failure __msg("Unreleased reference id=2")
+__failure __msg("Unreleased reference id=1")
 int ringbuf_missing_release1(void *ctx)
 {
 	struct bpf_dynptr ptr = {};
@@ -91,7 +91,7 @@ int ringbuf_missing_release1(void *ctx)
 }
 
 SEC("?raw_tp")
-__failure __msg("Unreleased reference id=4")
+__failure __msg("Unreleased reference id=3")
 int ringbuf_missing_release2(void *ctx)
 {
 	struct bpf_dynptr ptr1, ptr2;
@@ -136,7 +136,7 @@ int ringbuf_missing_release_callback(void *ctx)
 
 /* Can't call bpf_ringbuf_submit/discard_dynptr on a non-initialized dynptr */
 SEC("?raw_tp")
-__failure __msg("arg 1 is an unacquired reference")
+__failure __msg("Expected an initialized dynptr as R1")
 int ringbuf_release_uninit_dynptr(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -149,7 +149,7 @@ int ringbuf_release_uninit_dynptr(void *ctx)
 
 /* A dynptr can't be used after it has been invalidated */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #2")
+__failure __msg("Expected an initialized dynptr as R3")
 int use_after_invalid(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -448,7 +448,7 @@ int invalid_helper2(void *ctx)
 
 /* A bpf_dynptr is invalidated if it's been written into */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #0")
+__failure __msg("Expected an initialized dynptr as R1")
 int invalid_write1(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -650,7 +650,7 @@ int invalid_offset(void *ctx)
 
 /* Can't release a dynptr twice */
 SEC("?raw_tp")
-__failure __msg("arg 1 is an unacquired reference")
+__failure __msg("Expected an initialized dynptr as R1")
 int release_twice(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -677,7 +677,7 @@ static int release_twice_callback_fn(__u32 index, void *data)
  * within a callback function, fails
  */
 SEC("?raw_tp")
-__failure __msg("arg 1 is an unacquired reference")
+__failure __msg("Expected an initialized dynptr as R1")
 int release_twice_callback(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -705,6 +705,48 @@ int dynptr_from_mem_invalid_api(void *ctx)
 	return 0;
 }
 
+/* Cannot create a dynptr from memory returned by bpf_dynptr_data() */
+SEC("?raw_tp")
+__failure __msg("Unsupported reg type mem for bpf_dynptr_from_mem data")
+int dynptr_from_dynptr_data(void *ctx)
+{
+	struct bpf_dynptr ptr, ptr2;
+	__u8 *data;
+
+	if (get_map_val_dynptr(&ptr))
+		return 0;
+
+	data = bpf_dynptr_data(&ptr, 0, sizeof(__u32));
+	if (!data)
+		return 0;
+
+	/* this should fail: data was returned by bpf_dynptr_data() on ptr */
+	bpf_dynptr_from_mem(data, sizeof(__u32), 0, &ptr2);
+
+	return 0;
+}
+
+/* Cannot create a dynptr from memory returned by bpf_dynptr_slice_rdwr() */
+SEC("?tc")
+__failure __msg("Unsupported reg type mem for bpf_dynptr_from_mem data")
+int dynptr_from_dynptr_slice(struct __sk_buff *skb)
+{
+	struct bpf_dynptr ptr, ptr2;
+	struct ethhdr *hdr;
+	char buffer[sizeof(*hdr)] = {};
+
+	bpf_dynptr_from_skb(skb, 0, &ptr);
+
+	hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
+	if (!hdr)
+		return SK_DROP;
+
+	/* this should fail: hdr was returned by bpf_dynptr_slice_rdwr() on ptr */
+	bpf_dynptr_from_mem(hdr, sizeof(*hdr), 0, &ptr2);
+
+	return SK_PASS;
+}
+
 SEC("?tc")
 __failure __msg("cannot overwrite referenced dynptr") __log_level(2)
 int dynptr_pruning_overwrite(struct __sk_buff *ctx)
@@ -1642,7 +1684,7 @@ int invalid_slice_rdwr_rdonly(struct __sk_buff *skb)
 
 /* bpf_dynptr_adjust can only be called on initialized dynptrs */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #0")
+__failure __msg("Expected an initialized dynptr as R1")
 int dynptr_adjust_invalid(void *ctx)
 {
 	struct bpf_dynptr ptr = {};
@@ -1655,7 +1697,7 @@ int dynptr_adjust_invalid(void *ctx)
 
 /* bpf_dynptr_is_null can only be called on initialized dynptrs */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #0")
+__failure __msg("Expected an initialized dynptr as R1")
 int dynptr_is_null_invalid(void *ctx)
 {
 	struct bpf_dynptr ptr = {};
@@ -1668,7 +1710,7 @@ int dynptr_is_null_invalid(void *ctx)
 
 /* bpf_dynptr_is_rdonly can only be called on initialized dynptrs */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #0")
+__failure __msg("Expected an initialized dynptr as R1")
 int dynptr_is_rdonly_invalid(void *ctx)
 {
 	struct bpf_dynptr ptr = {};
@@ -1681,7 +1723,7 @@ int dynptr_is_rdonly_invalid(void *ctx)
 
 /* bpf_dynptr_size can only be called on initialized dynptrs */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #0")
+__failure __msg("Expected an initialized dynptr as R1")
 int dynptr_size_invalid(void *ctx)
 {
 	struct bpf_dynptr ptr = {};
@@ -1694,7 +1736,7 @@ int dynptr_size_invalid(void *ctx)
 
 /* Only initialized dynptrs can be cloned */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #0")
+__failure __msg("Expected an initialized dynptr as R1")
 int clone_invalid1(void *ctx)
 {
 	struct bpf_dynptr ptr1 = {};
@@ -1728,7 +1770,7 @@ int clone_invalid2(struct xdp_md *xdp)
 
 /* Invalidating a dynptr should invalidate its clones */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #2")
+__failure __msg("Expected an initialized dynptr as R3")
 int clone_invalidate1(void *ctx)
 {
 	struct bpf_dynptr clone;
@@ -1749,7 +1791,7 @@ int clone_invalidate1(void *ctx)
 
 /* Invalidating a dynptr should invalidate its parent */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #2")
+__failure __msg("Expected an initialized dynptr as R3")
 int clone_invalidate2(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -1770,7 +1812,7 @@ int clone_invalidate2(void *ctx)
 
 /* Invalidating a dynptr should invalidate its siblings */
 SEC("?raw_tp")
-__failure __msg("Expected an initialized dynptr as arg #2")
+__failure __msg("Expected an initialized dynptr as R3")
 int clone_invalidate3(void *ctx)
 {
 	struct bpf_dynptr ptr;
@@ -1981,7 +2023,7 @@ __noinline long global_call_bpf_dynptr(const struct bpf_dynptr *dynptr)
 }
 
 SEC("?raw_tp")
-__failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr")
+__failure __msg("R1 expected pointer to stack or const struct bpf_dynptr")
 int test_dynptr_reg_type(void *ctx)
 {
 	struct task_struct *current = NULL;
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index e0d672d93adf..e0745b6e467e 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -914,7 +914,7 @@ void *user_ptr;
 char expected_str[384];
 __u32 test_len[7] = {0/* placeholder */, 0, 1, 2, 255, 256, 257};
 
-typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u64 off,
+typedef int (*bpf_read_dynptr_fn_t)(const struct bpf_dynptr *dptr, u64 off,
 				    u64 size, const void *unsafe_ptr);
 
 /* Returns the offset just before the end of the maximum sized xdp fragment.
@@ -1106,7 +1106,7 @@ int test_copy_from_user_str_dynptr(void *ctx)
 	return 0;
 }
 
-static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off,
+static int bpf_copy_data_from_user_task(const struct bpf_dynptr *dptr, u64 off,
 					u64 size, const void *unsafe_ptr)
 {
 	struct task_struct *task = bpf_get_current_task_btf();
@@ -1114,7 +1114,7 @@ static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off,
 	return bpf_copy_from_user_task_dynptr(dptr, off, size, unsafe_ptr, task);
 }
 
-static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u64 off,
+static int bpf_copy_data_from_user_task_str(const struct bpf_dynptr *dptr, u64 off,
 					    u64 size, const void *unsafe_ptr)
 {
 	struct task_struct *task = bpf_get_current_task_btf();
diff --git a/tools/testing/selftests/bpf/progs/exceptions.c b/tools/testing/selftests/bpf/progs/exceptions.c
index 4206f59d7b86..c8d716fbd419 100644
--- a/tools/testing/selftests/bpf/progs/exceptions.c
+++ b/tools/testing/selftests/bpf/progs/exceptions.c
@@ -379,4 +379,118 @@ int exception_bad_assert_range_with(struct __sk_buff *ctx)
 	return 1;
 }
 
+#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) \
+	&& defined(__BPF_FEATURE_STACK_ARGUMENT)
+/* Ten-argument exception tests; has_stack_arg is true in this branch, false in the #else. */
+const volatile bool has_stack_arg = true;
+
+long arg1 = 1, arg2 = 2, arg3 = 3, arg4 = 4, arg5 = 5;
+long arg6 = 6, arg7 = 7, arg8 = 8, arg9 = 9, arg10 = 10; /* initial values sum to 55 */
+
+__noinline static long throwing_many_args(long a, long b, long c, long d,
+					  long e, long f, long g, long h,
+					  long i, long j)
+{
+	bpf_throw(a + b + c + d + e + f + g + h + i + j); /* throws the sum of all ten args */
+	return 0; /* NOTE(review): presumably not reached after bpf_throw() - confirm */
+}
+
+__noinline int exception_cb_sa(u64 cookie)
+{
+	return cookie + 1; /* cookie plus one, converted to the int return type */
+}
+
+SEC("tc")
+__exception_cb(exception_cb_sa)
+int exception_throw_stack_arg(struct __sk_buff *ctx)
+{
+	throwing_many_args(arg1, arg2, arg3, arg4, arg5, /* the callee throws */
+			   arg6, arg7, arg8, arg9, arg10);
+	return 0;
+}
+
+__noinline static long no_throw_many_args(long a, long b, long c, long d,
+					  long e, long f, long g, long h,
+					  long i, long j)
+{
+	return a + b + c + d + e + f + g + h + i + j; /* plain sum of all ten args, no throw */
+}
+
+SEC("tc")
+__exception_cb(exception_cb_sa)
+int exception_throw_after_stack_arg(struct __sk_buff *ctx)
+{
+	long ret;
+
+	ret = no_throw_many_args(arg1, arg2, arg3, arg4, arg5,
+				 arg6, arg7, arg8, arg9, arg10);
+	if (ret > 0)
+		bpf_throw(ret); /* thrown here, after the ten-argument call returned */
+	return 0;
+}
+
+__noinline static long subprog_throw_sa(long val)
+{
+	throwing_many_args(val, val + 1, val + 2, val + 3, val + 4, /* passes val..val+9 */
+			   val + 5, val + 6, val + 7, val + 8, val + 9);
+	return 0;
+}
+
+SEC("tc")
+__exception_cb(exception_cb_sa)
+int exception_throw_subprog_stack_arg(struct __sk_buff *ctx)
+{
+	subprog_throw_sa(arg1); /* the throw happens two calls deep, in throwing_many_args() */
+	return 0;
+}
+
+__noinline static long subprog_throw_after_sa(long val)
+{
+	long ret;
+
+	ret = no_throw_many_args(val, val + 1, val + 2, val + 3, val + 4,
+				 val + 5, val + 6, val + 7, val + 8, val + 9);
+	if (ret > 0)
+		bpf_throw(ret); /* thrown here, after the ten-argument call returned */
+	return 0;
+}
+
+SEC("tc")
+__exception_cb(exception_cb_sa)
+int exception_throw_subprog_after_stack_arg(struct __sk_buff *ctx)
+{
+	subprog_throw_after_sa(arg1); /* callee sums val..val+9 and throws the sum if it is > 0 */
+	return 0;
+}
+
+#else /* other target arch, or __BPF_FEATURE_STACK_ARGUMENT not defined */
+/* Fallback: has_stack_arg is false and the four programs below only return 0. */
+const volatile bool has_stack_arg = false;
+
+SEC("tc")
+int exception_throw_stack_arg(struct __sk_buff *ctx)
+{
+	return 0;
+}
+
+SEC("tc")
+int exception_throw_after_stack_arg(struct __sk_buff *ctx)
+{
+	return 0;
+}
+
+SEC("tc")
+int exception_throw_subprog_stack_arg(struct __sk_buff *ctx)
+{
+	return 0;
+}
+
+SEC("tc")
+int exception_throw_subprog_after_stack_arg(struct __sk_buff *ctx)
+{
+	return 0;
+}
+
+#endif /* x86/arm64 && __BPF_FEATURE_STACK_ARGUMENT */
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c
index 983b7c233382..f4bbf87b82dd 100644
--- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c
+++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c
@@ -53,14 +53,23 @@ int BPF_PROG(test_subprog1, struct sk_buff *skb, int ret)
  *   r0 = *(u32 *)(r1 + 0)
  *   w0 <<= 1
  *   exit
- * In such case the verifier falls back to conservative and
+ * Before LLVM 23, in such a case the verifier falls back to conservative and
  * tracing program can access arguments and return value as u64
- * instead of accurate types.
+ * instead of accurate types. With LLVM 23 and later, the true signature
+ *   int test_pkt_access_subprog2(volatile struct __sk_buff *skb)
+ * is available in BTF.
  */
+#if __clang_major__ >= 23
+struct args_subprog2 {
+	__u64 args[1];	/* one slot: the true signature takes a single skb argument */
+	__u64 ret;
+};
+#else	/* older compilers: keep the five-slot layout */
 struct args_subprog2 {
 	__u64 args[5];
 	__u64 ret;
 };
+#endif	/* __clang_major__ >= 23 */
 __u64 test_result_subprog2 = 0;
 SEC("fexit/test_pkt_access_subprog2")
 int test_subprog2(struct args_subprog2 *ctx)
diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c
index 462712ff3b8a..aa2c05cce2b3 100644
--- a/tools/testing/selftests/bpf/progs/file_reader.c
+++ b/tools/testing/selftests/bpf/progs/file_reader.c
@@ -50,7 +50,7 @@ int on_open_expect_fault(void *c)
 		goto out;
 
 	local_err = bpf_dynptr_read(tmp_buf, user_buf_sz, &dynptr, user_buf_sz, 0);
-	if (local_err == -EFAULT) { /* Expect page fault */
+	if (local_err == -EFAULT || local_err == 0) { /* Expect page fault or success */
 		local_err = 0;
 		run_success = 1;
 	}
diff --git a/tools/testing/selftests/bpf/progs/file_reader_fail.c b/tools/testing/selftests/bpf/progs/file_reader_fail.c
index 32fe28ed2439..3bb9e2612f8f 100644
--- a/tools/testing/selftests/bpf/progs/file_reader_fail.c
+++ b/tools/testing/selftests/bpf/progs/file_reader_fail.c
@@ -30,7 +30,7 @@ int on_nanosleep_unreleased_ref(void *ctx)
 
 SEC("xdp")
 __failure
-__msg("Expected a dynptr of type file as arg #0")
+__msg("Expected a dynptr of type file as R1")
 int xdp_wrong_dynptr_type(struct xdp_md *xdp)
 {
 	struct bpf_dynptr dynptr;
@@ -42,7 +42,7 @@ int xdp_wrong_dynptr_type(struct xdp_md *xdp)
 
 SEC("xdp")
 __failure
-__msg("Expected an initialized dynptr as arg #0")
+__msg("Expected an initialized dynptr as R1")
 int xdp_no_dynptr_type(struct xdp_md *xdp)
 {
 	struct bpf_dynptr dynptr;
@@ -50,3 +50,63 @@ int xdp_no_dynptr_type(struct xdp_md *xdp)
 	bpf_dynptr_file_discard(&dynptr);
 	return 0;
 }
+
+SEC("lsm/file_open")
+__failure	/* load is expected to be rejected with the message below */
+__msg("Leaking reference id={{[0-9]+}} alloc_insn={{[0-9]+}}. Release it first.")
+int use_file_dynptr_after_put_file(void *ctx)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+	struct file *file = bpf_get_task_exe_file(task);	/* put again via bpf_put_file() */
+	struct bpf_dynptr dynptr;
+	char buf[64];
+
+	if (!file)
+		return 0;
+
+	if (bpf_dynptr_from_file(file, 0, &dynptr))
+		goto out;
+
+	/* this should fail - file dynptr should be discarded first to prevent resource leak */
+	bpf_put_file(file);
+
+	bpf_dynptr_read(buf, sizeof(buf), &dynptr, 0, 0);	/* dynptr read after the file was put */
+	return 0;	/* this path never calls bpf_dynptr_file_discard() */
+
+out:
+	bpf_dynptr_file_discard(&dynptr);
+	bpf_put_file(file);
+	return 0;
+}
+
+SEC("lsm/file_open")
+__failure	/* load is expected to be rejected with the message below */
+__msg("Leaking reference id={{[0-9]+}} alloc_insn={{[0-9]+}}. Release it first.")
+int use_file_dynptr_slice_after_put_file(void *ctx)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+	struct file *file = bpf_get_task_exe_file(task);	/* put again via bpf_put_file() */
+	struct bpf_dynptr dynptr;
+	char buf[1];
+	const char *data;
+
+	if (!file)
+		return 0;
+
+	if (bpf_dynptr_from_file(file, 0, &dynptr))
+		goto out;
+
+	data = bpf_dynptr_slice(&dynptr, 0, buf, sizeof(buf));	/* one-byte slice at offset 0 */
+	if (!data)
+		goto out;
+
+	/* this should fail - file dynptr should be discarded first to prevent resource leak */
+	bpf_put_file(file);
+
+	return data[0];	/* slice dereferenced after the file was put; dynptr never discarded */
+
+out:
+	bpf_dynptr_file_discard(&dynptr);
+	bpf_put_file(file);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/htab_update.c b/tools/testing/selftests/bpf/progs/htab_update.c
index 195d3b2fba00..62c1b1325ec2 100644
--- a/tools/testing/selftests/bpf/progs/htab_update.c
+++ b/tools/testing/selftests/bpf/progs/htab_update.c
@@ -22,8 +22,8 @@ struct {
 int pid = 0;
 int update_err = 0;
 
-SEC("?fentry/bpf_obj_free_fields")
-int bpf_obj_free_fields(void *ctx)
+SEC("?fentry/bpf_obj_cancel_fields")
+int bpf_obj_cancel_fields(void *ctx)
 {
 	__u32 key = 0;
 	struct val value = { .payload = 1 };
diff --git a/tools/testing/selftests/bpf/progs/irq.c b/tools/testing/selftests/bpf/progs/irq.c
index e11e82d98904..a4a007866a33 100644
--- a/tools/testing/selftests/bpf/progs/irq.c
+++ b/tools/testing/selftests/bpf/progs/irq.c
@@ -15,7 +15,7 @@ struct bpf_res_spin_lock lockA __hidden SEC(".data.A");
 struct bpf_res_spin_lock lockB __hidden SEC(".data.B");
 
 SEC("?tc")
-__failure __msg("arg#0 doesn't point to an irq flag on stack")
+__failure __msg("R1 doesn't point to an irq flag on stack")
 int irq_save_bad_arg(struct __sk_buff *ctx)
 {
 	bpf_local_irq_save(&global_flags);
@@ -23,7 +23,7 @@ int irq_save_bad_arg(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("arg#0 doesn't point to an irq flag on stack")
+__failure __msg("R1 doesn't point to an irq flag on stack")
 int irq_restore_bad_arg(struct __sk_buff *ctx)
 {
 	bpf_local_irq_restore(&global_flags);
diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
index 86b74e3579d9..0fa70b133d93 100644
--- a/tools/testing/selftests/bpf/progs/iters.c
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -1605,7 +1605,7 @@ int iter_subprog_check_stacksafe(const void *ctx)
 struct bpf_iter_num global_it;
 
 SEC("raw_tp")
-__failure __msg("arg#0 expected pointer to an iterator on stack")
+__failure __msg("R1 expected pointer to an iterator on stack")
 int iter_new_bad_arg(const void *ctx)
 {
 	bpf_iter_num_new(&global_it, 0, 1);
@@ -1613,7 +1613,7 @@ int iter_new_bad_arg(const void *ctx)
 }
 
 SEC("raw_tp")
-__failure __msg("arg#0 expected pointer to an iterator on stack")
+__failure __msg("R1 expected pointer to an iterator on stack")
 int iter_next_bad_arg(const void *ctx)
 {
 	bpf_iter_num_next(&global_it);
@@ -1621,7 +1621,7 @@ int iter_next_bad_arg(const void *ctx)
 }
 
 SEC("raw_tp")
-__failure __msg("arg#0 expected pointer to an iterator on stack")
+__failure __msg("R1 expected pointer to an iterator on stack")
 int iter_destroy_bad_arg(const void *ctx)
 {
 	bpf_iter_num_destroy(&global_it);
diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c
index d273b46dfc7c..646026430e9b 100644
--- a/tools/testing/selftests/bpf/progs/iters_state_safety.c
+++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c
@@ -30,7 +30,7 @@ int force_clang_to_emit_btf_for_externs(void *ctx)
 
 SEC("?raw_tp")
 __success __log_level(2)
-__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)")
+__msg("fp-8=iter_num(id=1,state=active,depth=0)")
 int create_and_destroy(void *ctx)
 {
 	struct bpf_iter_num iter;
@@ -73,7 +73,7 @@ int create_and_forget_to_destroy_fail(void *ctx)
 }
 
 SEC("?raw_tp")
-__failure __msg("expected an initialized iter_num as arg #0")
+__failure __msg("expected an initialized iter_num as R1")
 int destroy_without_creating_fail(void *ctx)
 {
 	/* init with zeros to stop verifier complaining about uninit stack */
@@ -91,7 +91,7 @@ int destroy_without_creating_fail(void *ctx)
 }
 
 SEC("?raw_tp")
-__failure __msg("expected an initialized iter_num as arg #0")
+__failure __msg("expected an initialized iter_num as R1")
 int compromise_iter_w_direct_write_fail(void *ctx)
 {
 	struct bpf_iter_num iter;
@@ -143,7 +143,7 @@ int compromise_iter_w_direct_write_and_skip_destroy_fail(void *ctx)
 }
 
 SEC("?raw_tp")
-__failure __msg("expected an initialized iter_num as arg #0")
+__failure __msg("expected an initialized iter_num as R1")
 int compromise_iter_w_helper_write_fail(void *ctx)
 {
 	struct bpf_iter_num iter;
@@ -196,7 +196,7 @@ int leak_iter_from_subprog_fail(void *ctx)
 
 SEC("?raw_tp")
 __success __log_level(2)
-__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)")
+__msg("fp-8=iter_num(id=1,state=active,depth=0)")
 int valid_stack_reuse(void *ctx)
 {
 	struct bpf_iter_num iter;
@@ -230,7 +230,7 @@ int valid_stack_reuse(void *ctx)
 }
 
 SEC("?raw_tp")
-__failure __msg("expected uninitialized iter_num as arg #0")
+__failure __msg("expected uninitialized iter_num as R1")
 int double_create_fail(void *ctx)
 {
 	struct bpf_iter_num iter;
@@ -258,7 +258,7 @@ int double_create_fail(void *ctx)
 }
 
 SEC("?raw_tp")
-__failure __msg("expected an initialized iter_num as arg #0")
+__failure __msg("expected an initialized iter_num as R1")
 int double_destroy_fail(void *ctx)
 {
 	struct bpf_iter_num iter;
@@ -284,7 +284,7 @@ int double_destroy_fail(void *ctx)
 }
 
 SEC("?raw_tp")
-__failure __msg("expected an initialized iter_num as arg #0")
+__failure __msg("expected an initialized iter_num as R1")
 int next_without_new_fail(void *ctx)
 {
 	struct bpf_iter_num iter;
@@ -305,7 +305,7 @@ int next_without_new_fail(void *ctx)
 }
 
 SEC("?raw_tp")
-__failure __msg("expected an initialized iter_num as arg #0")
+__failure __msg("expected an initialized iter_num as R1")
 int next_after_destroy_fail(void *ctx)
 {
 	struct bpf_iter_num iter;
diff --git a/tools/testing/selftests/bpf/progs/iters_testmod.c b/tools/testing/selftests/bpf/progs/iters_testmod.c
index 5379e9960ffd..76012dbbdb41 100644
--- a/tools/testing/selftests/bpf/progs/iters_testmod.c
+++ b/tools/testing/selftests/bpf/progs/iters_testmod.c
@@ -29,7 +29,7 @@ out:
 }
 
 SEC("raw_tp/sys_enter")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int iter_next_trusted_or_null(const void *ctx)
 {
 	struct task_struct *cur_task = bpf_get_current_task_btf();
@@ -67,7 +67,7 @@ out:
 }
 
 SEC("raw_tp/sys_enter")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int iter_next_rcu_or_null(const void *ctx)
 {
 	struct task_struct *cur_task = bpf_get_current_task_btf();
diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c
index 83791348bed5..d00888f6687a 100644
--- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c
+++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c
@@ -20,8 +20,8 @@ __s64 res_empty;
 
 SEC("raw_tp/sys_enter")
 __success __log_level(2)
-__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)")
-__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)")
+__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)")
+__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)")
 __msg("call bpf_iter_testmod_seq_destroy")
 int testmod_seq_empty(const void *ctx)
 {
@@ -38,8 +38,8 @@ __s64 res_full;
 
 SEC("raw_tp/sys_enter")
 __success __log_level(2)
-__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)")
-__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)")
+__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)")
+__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)")
 __msg("call bpf_iter_testmod_seq_destroy")
 int testmod_seq_full(const void *ctx)
 {
@@ -58,8 +58,8 @@ static volatile int zero = 0;
 
 SEC("raw_tp/sys_enter")
 __success __log_level(2)
-__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)")
-__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)")
+__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)")
+__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)")
 __msg("call bpf_iter_testmod_seq_destroy")
 int testmod_seq_truncated(const void *ctx)
 {
@@ -79,7 +79,7 @@ int testmod_seq_truncated(const void *ctx)
 
 SEC("?raw_tp")
 __failure
-__msg("expected an initialized iter_testmod_seq as arg #1")
+__msg("expected an initialized iter_testmod_seq as R2")
 int testmod_seq_getter_before_bad(const void *ctx)
 {
 	struct bpf_iter_testmod_seq it;
@@ -89,7 +89,7 @@ int testmod_seq_getter_before_bad(const void *ctx)
 
 SEC("?raw_tp")
 __failure
-__msg("expected an initialized iter_testmod_seq as arg #1")
+__msg("expected an initialized iter_testmod_seq as R2")
 int testmod_seq_getter_after_bad(const void *ctx)
 {
 	struct bpf_iter_testmod_seq it;
diff --git a/tools/testing/selftests/bpf/progs/linked_list.c b/tools/testing/selftests/bpf/progs/linked_list.c
index 421f40835acd..fa97faa5358b 100644
--- a/tools/testing/selftests/bpf/progs/linked_list.c
+++ b/tools/testing/selftests/bpf/progs/linked_list.c
@@ -290,6 +290,77 @@ int test_list_in_list(struct bpf_spin_lock *lock, struct bpf_list_head *head)
 	return list_in_list(lock, head, true);
 }
 
+#define MAX_LIST_CLEAR_NODES 256	/* cap on pops per clear_list() call */
+
+static __always_inline
+int clear_list(struct bpf_spin_lock *lock, struct bpf_list_head *head)
+{
+	struct bpf_list_node *node;
+	int pops;
+
+	for (pops = 0; pops < MAX_LIST_CLEAR_NODES; pops++) {
+		bpf_spin_lock(lock);
+		node = bpf_list_pop_front(head);
+		bpf_spin_unlock(lock);
+		if (!node)
+			return 0;	/* nothing left to pop */
+		bpf_obj_drop(container_of(node, struct foo, node2));
+	}
+	return 1;	/* cap reached before an empty pop was seen */
+}
+
+SEC("syscall")
+int clear_map_list(void *ctx)
+{
+	struct map_value *val;	/* element 0 of array_map */
+
+	val = bpf_map_lookup_elem(&array_map, &(int){0});
+	if (!val)
+		return 1;
+	return clear_list(&val->lock, &val->head);
+}
+
+SEC("syscall")
+int clear_inner_map_list(void *ctx)
+{
+	struct map_value *val;
+	void *inner;	/* inner map stored at key 0 of map_of_maps */
+
+	inner = bpf_map_lookup_elem(&map_of_maps, &(int){0});
+	if (!inner)
+		return 1;
+	val = bpf_map_lookup_elem(inner, &(int){0});
+	if (!val)
+		return 1;
+	return clear_list(&val->lock, &val->head);
+}
+
+SEC("syscall")
+int clear_global_list(void *ctx)
+{
+	return clear_list(&glock, &ghead);	/* 0 once drained, 1 if the pop cap was hit */
+}
+
+SEC("syscall")
+int clear_global_nested_list(void *ctx)
+{
+	return clear_list(&ghead_nested.inner.lock, &ghead_nested.inner.head);	/* 0 = drained */
+}
+
+SEC("syscall")
+int clear_global_array_list(void *ctx)
+{
+	int err;	/* first non-zero clear_list() result wins */
+
+	err = clear_list(&glock_c, &ghead_array[0]);
+	if (err)
+		return err;
+	err = clear_list(&glock_c, &ghead_array[1]);
+	if (err)
+		return err;
+	return clear_list(&glock_c, &ghead_array_one[0]);
+}
+
 SEC("tc")
 int map_list_push_pop(void *ctx)
 {
diff --git a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c
index a0e6ebd5507a..2831cf4445e8 100644
--- a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c
+++ b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c
@@ -7,7 +7,7 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_core_read.h>
 #include "bpf_misc.h"
-#include "bpf_atomic.h"
+#include <bpf_atomic.h>
 #include "progs/lpm_trie.h"
 
 #define BPF_OBJ_NAME_LEN 16U
diff --git a/tools/testing/selftests/bpf/progs/lru_lock_nmi.c b/tools/testing/selftests/bpf/progs/lru_lock_nmi.c
new file mode 100644
index 000000000000..c0692cd54237
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/lru_lock_nmi.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_LRU_HASH);
+	__uint(max_entries, 64);	/* half of the 128-value key range used by oncpu() */
+	__type(key, __u32);
+	__type(value, __u64);
+} lru_map SEC(".maps");
+
+int hits;
+
+SEC("perf_event")
+int oncpu(void *ctx)
+{
+	/*
+	 * The key range (128) is deliberately wider than max_entries (64)
+	 * so that updates force LRU eviction.
+	 */
+	__u32 slot = bpf_get_prandom_u32() % 128;
+	bool insert = bpf_get_prandom_u32() & 1;
+	__u64 one = 1;
+
+	if (!insert)
+		bpf_map_delete_elem(&lru_map, &slot);
+	else
+		bpf_map_update_elem(&lru_map, &slot, &one, BPF_ANY);
+	__sync_fetch_and_add(&hits, 1);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c
index d7598538aa2d..3bfa479104be 100644
--- a/tools/testing/selftests/bpf/progs/lsm_cgroup.c
+++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c
@@ -35,6 +35,8 @@ int called_socket_bind;
 int called_socket_bind2;
 int called_socket_alloc;
 int called_socket_clone;
+int skipcap_retval = -4095;	/* overwritten by skipcap_second with bpf_get_retval() */
+int socket_retval = -4095;	/* overwritten by socket_second with bpf_get_retval() */
 
 static __always_inline int test_local_storage(void)
 {
@@ -190,3 +192,31 @@ int BPF_PROG(socket_clone, struct sock *newsk, const struct request_sock *req)
 
 	return 1;
 }
+
+SEC("lsm_cgroup/inode_xattr_skipcap")
+int BPF_PROG(skipcap_first, const char *name)
+{
+	return 0;	/* first of two programs on this hook; always returns 0 */
+}
+
+SEC("lsm_cgroup/inode_xattr_skipcap")
+int BPF_PROG(skipcap_second, const char *name)
+{
+	skipcap_retval = bpf_get_retval();	/* record the retval visible to this program */
+	bpf_set_retval(0);	/* then reset it to 0 */
+	return 1;
+}
+
+SEC("lsm_cgroup/socket_create")
+int BPF_PROG(socket_first, int family, int type, int protocol, int kern)
+{
+	return 0;	/* first of two programs on this hook; always returns 0 */
+}
+
+SEC("lsm_cgroup/socket_create")
+int BPF_PROG(socket_second, int family, int type, int protocol, int kern)
+{
+	socket_retval = bpf_get_retval();	/* record the retval visible to this program */
+	bpf_set_retval(0);	/* then reset it to 0 */
+	return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c
index e708ffbe1f61..3fbefc568e0a 100644
--- a/tools/testing/selftests/bpf/progs/map_kptr.c
+++ b/tools/testing/selftests/bpf/progs/map_kptr.c
@@ -489,8 +489,7 @@ int test_map_kptr_ref3(struct __sk_buff *ctx)
 
 int num_of_refs;
 
-SEC("syscall")
-int count_ref(void *ctx)
+static __always_inline int read_ref_count(void)
 {
 	struct prog_test_ref_kfunc *p;
 	unsigned long arg = 0;
@@ -500,12 +499,96 @@ int count_ref(void *ctx)
 		return 1;
 
 	num_of_refs = p->cnt.refs.counter;
-
 	bpf_kfunc_call_test_release(p);
 	return 0;
 }
 
 SEC("syscall")
+int count_ref(void *ctx)
+{
+	return read_ref_count();	/* count is left in num_of_refs; helper status is returned */
+}
+
+static __always_inline int stash_ref_ptr(struct map_value *v)
+{
+	struct prog_test_ref_kfunc *fresh, *prev;
+	unsigned long arg = 0;
+
+	fresh = bpf_kfunc_call_test_acquire(&arg);
+	if (!fresh)
+		return 1;
+
+	prev = bpf_kptr_xchg(&v->ref_ptr, fresh);
+	if (prev) {
+		bpf_kfunc_call_test_release(prev);	/* slot already held a pointer */
+		prev = bpf_kptr_xchg(&v->ref_ptr, NULL);	/* empty the slot again */
+		if (prev)
+			bpf_kfunc_call_test_release(prev);
+		return 2;
+	}
+	return 0;
+}
+
+static __always_inline int check_refs(int expected)
+{
+	int err;	/* status from read_ref_count() */
+
+	err = read_ref_count();
+	if (err)
+		return err;
+	return num_of_refs != expected ? 3 : 0;
+}
+
+SEC("syscall")
+int test_array_map_update_kptr(void *ctx)
+{
+	struct map_value zeroed = {}, *val;
+	int idx = 0, err;
+
+	val = bpf_map_lookup_elem(&array_map, &idx);
+	if (!val)
+		return 1;
+	err = stash_ref_ptr(val);
+	if (err)
+		return err;
+	err = check_refs(3);	/* count expected once the pointer is stashed */
+	if (err)
+		return err;
+	err = bpf_map_update_elem(&array_map, &idx, &zeroed, BPF_EXIST);
+	if (err)
+		return 4;
+	return check_refs(3);	/* same count expected after the in-place update */
+}
+
+#define DEFINE_HASH_UPDATE_KPTR_TEST(name, map)			\
+SEC("syscall")							\
+int name(void *ctx)						\
+{								\
+	struct map_value init = {}, *v;				\
+	int key = 0, ret;					\
+	/* insert key 0, stash a kptr in it, then overwrite it */	\
+	ret = bpf_map_update_elem(&map, &key, &init, BPF_NOEXIST); \
+	if (ret)						\
+		return 1;					\
+	v = bpf_map_lookup_elem(&map, &key);			\
+	if (!v)							\
+		return 2;					\
+	ret = stash_ref_ptr(v);					\
+	if (ret)						\
+		return ret;					\
+	ret = check_refs(3);	/* count after stashing */	\
+	if (ret)						\
+		return ret;					\
+	ret = bpf_map_update_elem(&map, &key, &init, BPF_EXIST); \
+	if (ret)						\
+		return 4;					\
+	return check_refs(3);	/* count after overwrite */	\
+}
+
+DEFINE_HASH_UPDATE_KPTR_TEST(test_hash_map_update_kptr, hash_map)
+DEFINE_HASH_UPDATE_KPTR_TEST(test_hash_malloc_map_update_kptr, hash_malloc_map)
+
+SEC("syscall")
 int test_ls_map_kptr_ref1(void *ctx)
 {
 	struct task_struct *current;
diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
index ee053b24e6ca..f11848dfa78f 100644
--- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
@@ -252,7 +252,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("R2 must be referenced")
+__failure __msg("release helper bpf_kptr_xchg expects referenced PTR_TO_BTF_ID passed to R2")
 int reject_untrusted_xchg(struct __sk_buff *ctx)
 {
 	struct prog_test_ref_kfunc *p;
@@ -364,7 +364,7 @@ int kptr_xchg_ref_state(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("Possibly NULL pointer passed to helper arg2")
+__failure __msg("Possibly NULL pointer passed to helper R2")
 int kptr_xchg_possibly_null(struct __sk_buff *ctx)
 {
 	struct prog_test_ref_kfunc *p;
diff --git a/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c b/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c
index 81813c724fa9..08379c3b6a03 100644
--- a/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c
+++ b/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c
@@ -110,7 +110,7 @@ int BPF_PROG(test_array_map_3)
 }
 
 SEC("?fentry.s/bpf_fentry_test1")
-__failure __msg("arg#0 expected for bpf_percpu_obj_drop()")
+__failure __msg("R1 expected for bpf_percpu_obj_drop()")
 int BPF_PROG(test_array_map_4)
 {
 	struct val_t __percpu_kptr *p;
@@ -124,7 +124,7 @@ int BPF_PROG(test_array_map_4)
 }
 
 SEC("?fentry.s/bpf_fentry_test1")
-__failure __msg("arg#0 expected for bpf_obj_drop()")
+__failure __msg("R1 expected for bpf_obj_drop()")
 int BPF_PROG(test_array_map_5)
 {
 	struct val_t *p;
diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c
index 70b7baf9304b..555379952dcc 100644
--- a/tools/testing/selftests/bpf/progs/rbtree_fail.c
+++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c
@@ -134,7 +134,7 @@ unlock_err:
 }
 
 SEC("?tc")
-__failure __msg("arg#1 expected pointer to allocated object")
+__failure __msg("R2 expected pointer to allocated object")
 long rbtree_api_add_to_multiple_trees(void *ctx)
 {
 	struct node_data *n;
@@ -153,7 +153,7 @@ long rbtree_api_add_to_multiple_trees(void *ctx)
 }
 
 SEC("?tc")
-__failure __msg("Possibly NULL pointer passed to trusted arg1")
+__failure __msg("Possibly NULL pointer passed to trusted R2")
 long rbtree_api_use_unchecked_remove_retval(void *ctx)
 {
 	struct bpf_rb_node *res;
@@ -281,7 +281,7 @@ long add_with_cb(bool (cb)(struct bpf_rb_node *a, const struct bpf_rb_node *b))
 }
 
 SEC("?tc")
-__failure __msg("arg#1 expected pointer to allocated object")
+__failure __msg("R2 expected pointer to allocated object")
 long rbtree_api_add_bad_cb_bad_fn_call_add(void *ctx)
 {
 	return add_with_cb(less__bad_fn_call_add);
diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
index c847398837cc..61906f48025c 100644
--- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
@@ -368,6 +368,427 @@ INSERT_STASH_READ(true, "insert_stash_read: remove from tree");
 INSERT_STASH_READ(false, "insert_stash_read: don't remove from tree");
 
 SEC("tc")
+__description("list_empty_test: list empty before add, non-empty after add")
+__success __retval(0)
+int list_empty_test(void *ctx)
+{
+	struct node_data *fresh;
+
+	bpf_spin_lock(&lock);
+	if (!bpf_list_empty(&head)) {
+		bpf_spin_unlock(&lock);
+		return -1;	/* list was not empty before the push */
+	}
+	bpf_spin_unlock(&lock);
+
+	fresh = bpf_obj_new(typeof(*fresh));
+	if (!fresh)
+		return -2;
+
+	bpf_spin_lock(&lock);
+	bpf_list_push_front(&head, &fresh->l);
+
+	if (bpf_list_empty(&head)) {
+		bpf_spin_unlock(&lock);
+		return -3;	/* list still reports empty after the push */
+	}
+	bpf_spin_unlock(&lock);
+	return 0;
+}
+
+static struct node_data *__add_in_list(struct bpf_list_head *head,
+				       struct bpf_spin_lock *lock)
+{
+	struct node_data *fresh, *extra_ref;
+
+	fresh = bpf_obj_new(typeof(*fresh));
+	if (!fresh)
+		return NULL;
+
+	extra_ref = bpf_refcount_acquire(fresh);	/* second reference handed to the caller */
+
+	bpf_spin_lock(lock);
+	bpf_list_push_front(head, &fresh->l);
+	bpf_spin_unlock(lock);
+	return extra_ref;
+}
+
+SEC("tc")
+__description("list_is_edge_test1: is_first on first node, is_last on last node")
+__success __retval(0)
+int list_is_edge_test1(void *ctx)
+{
+	struct node_data *first, *last;
+	int rc = 0;
+
+	last = __add_in_list(&head, &lock);	/* pushed first, so it ends up at the back */
+	if (!last)
+		return -1;
+
+	first = __add_in_list(&head, &lock);	/* push_front puts this one ahead of 'last' */
+	if (!first) {
+		bpf_obj_drop(last);
+		return -2;
+	}
+
+	bpf_spin_lock(&lock);
+	if (!bpf_list_is_first(&head, &first->l)) {
+		rc = -3;
+		goto unlock;
+	}
+	if (!bpf_list_is_last(&head, &last->l))
+		rc = -4;
+
+unlock:
+	bpf_spin_unlock(&lock);
+	bpf_obj_drop(first);
+	bpf_obj_drop(last);
+	return rc;
+}
+
+SEC("tc")
+__description("list_is_edge_test2: accept list_front/list_back return value")
+__success __retval(0)
+int list_is_edge_test2(void *ctx)
+{
+	struct bpf_list_node *head_node, *tail_node;
+	struct node_data *older, *newer;
+	long rc = 0;
+
+	older = __add_in_list(&head, &lock);
+	if (!older)
+		return -1;
+
+	newer = __add_in_list(&head, &lock);
+	if (!newer) {
+		bpf_obj_drop(older);
+		return -2;
+	}
+
+	bpf_spin_lock(&lock);
+	head_node = bpf_list_front(&head);
+	tail_node = bpf_list_back(&head);
+	if (!head_node || !tail_node) {
+		rc = -3;
+		goto unlock;
+	}
+
+	if (!bpf_list_is_first(&head, head_node) || bpf_list_is_last(&head, head_node)) {
+		rc = -4;
+		goto unlock;
+	}
+
+	if (!bpf_list_is_last(&head, tail_node) || bpf_list_is_first(&head, tail_node)) {
+		rc = -5;
+		goto unlock;
+	}
+
+unlock:
+	bpf_spin_unlock(&lock);
+	bpf_obj_drop(older);
+	bpf_obj_drop(newer);
+	return rc;
+}
+
+SEC("tc")
+__description("list_is_edge_test3: single node is both first and last")
+__success __retval(0)
+int list_is_edge_test3(void *ctx)
+{
+	struct node_data *only;
+	struct bpf_list_node *front;
+	long rc = 0;
+
+	only = __add_in_list(&head, &lock);
+	if (!only)
+		return -1;
+
+	bpf_spin_lock(&lock);
+	front = bpf_list_front(&head);
+	if (!front) {
+		bpf_spin_unlock(&lock);
+		bpf_obj_drop(only);
+		return -2;
+	}
+
+	if (!bpf_list_is_first(&head, front) || !bpf_list_is_last(&head, front))
+		rc = -3;	/* the lone node must satisfy both predicates */
+	bpf_spin_unlock(&lock);
+
+	bpf_obj_drop(only);
+	return rc;
+}
+
+SEC("tc")
+__description("list_del_test1: del returns removed nodes")
+__success __retval(0)
+int list_del_test1(void *ctx)
+{
+	struct node_data *first, *last;
+	struct bpf_list_node *del_first, *del_last;
+	int rc = 0;
+
+	last = __add_in_list(&head, &lock);
+	if (!last)
+		return -1;
+
+	first = __add_in_list(&head, &lock);
+	if (!first) {
+		bpf_obj_drop(last);
+		return -2;
+	}
+
+	bpf_spin_lock(&lock);
+	del_last = bpf_list_del(&head, &last->l);
+	del_first = bpf_list_del(&head, &first->l);
+	bpf_spin_unlock(&lock);
+
+	if (del_first)
+		bpf_obj_drop(container_of(del_first, struct node_data, l));
+	else
+		rc = -3;
+
+	if (del_last)
+		bpf_obj_drop(container_of(del_last, struct node_data, l));
+	else
+		rc = -4;	/* overrides -3 when both deletions returned NULL */
+
+	bpf_obj_drop(first);
+	bpf_obj_drop(last);
+	return rc;
+}
+
+SEC("tc")
+__description("list_del_test2: remove an arbitrary node from the list")
+__success __retval(0)
+int list_del_test2(void *ctx)
+{
+	struct bpf_rb_node *tree_node;
+	struct bpf_list_node *removed;
+	struct node_data *data;
+	long rc;
+
+	rc = __insert_in_tree_and_list(&head, &root, &lock);
+	if (rc)
+		return rc;
+
+	bpf_spin_lock(&lock);
+	tree_node = bpf_rbtree_first(&root);
+	if (!tree_node) {
+		bpf_spin_unlock(&lock);
+		return -4;
+	}
+
+	tree_node = bpf_rbtree_remove(&root, tree_node);
+	if (!tree_node) {
+		bpf_spin_unlock(&lock);
+		return -5;
+	}
+
+	data = container_of(tree_node, struct node_data, r);
+	removed = bpf_list_del(&head, &data->l);	/* same object, reached via its list node */
+	bpf_spin_unlock(&lock);
+	bpf_obj_drop(data);
+	if (!removed)
+		return -6;
+
+	bpf_obj_drop(container_of(removed, struct node_data, l));
+	return 0;
+}
+
+SEC("tc")
+__description("list_del_test3: list_del accepts list_front return value as node")
+__success __retval(0)
+int list_del_test3(void *ctx)
+{
+	struct node_data *owner;
+	struct bpf_list_node *front, *removed;
+	long rc = 0;
+
+	owner = __add_in_list(&head, &lock);
+	if (!owner)
+		return -1;
+
+	bpf_spin_lock(&lock);
+	front = bpf_list_front(&head);
+	if (!front) {
+		bpf_spin_unlock(&lock);
+		rc = -2;
+		goto drop_owner;
+	}
+
+	removed = bpf_list_del(&head, front);	/* node argument comes from bpf_list_front() */
+	bpf_spin_unlock(&lock);
+	if (!removed) {
+		rc = -3;
+		goto drop_owner;
+	}
+
+	bpf_obj_drop(container_of(removed, struct node_data, l));
+	bpf_obj_drop(owner);
+	return 0;
+
+drop_owner:
+	bpf_obj_drop(owner);
+	return rc;
+}
+
+SEC("tc")
+__description("list_add_test1: insert new node after prev")
+__success __retval(0)
+int list_add_test1(void *ctx)
+{
+	struct node_data *anchor;
+	struct node_data *fresh;
+	long rc = 0;
+
+	anchor = __add_in_list(&head, &lock);
+	if (!anchor)
+		return -1;
+
+	fresh = bpf_obj_new(typeof(*fresh));
+	if (!fresh) {
+		rc = -2;
+		goto out;
+	}
+
+	bpf_spin_lock(&lock);
+	rc = bpf_list_add(&head, &fresh->l, &anchor->l);	/* 'anchor' is the prev argument */
+	bpf_spin_unlock(&lock);
+	if (rc) {
+		rc = -3;
+		goto out;
+	}
+
+out:
+	bpf_obj_drop(anchor);
+	return rc;
+}
+
+SEC("tc")
+__description("list_add_test2: list_add accepts list_front return value as prev")
+__success __retval(0)
+int list_add_test2(void *ctx)
+{
+	struct node_data *fresh, *owner;
+	struct bpf_list_node *front;
+	long rc = 0;
+
+	owner = __add_in_list(&head, &lock);
+	if (!owner)
+		return -1;
+
+	fresh = bpf_obj_new(typeof(*fresh));
+	if (!fresh) {
+		rc = -2;
+		goto out;
+	}
+
+	bpf_spin_lock(&lock);
+	front = bpf_list_front(&head);
+	if (!front) {
+		bpf_spin_unlock(&lock);
+		bpf_obj_drop(fresh);
+		rc = -3;
+		goto out;
+	}
+
+	rc = bpf_list_add(&head, &fresh->l, front);	/* prev comes from bpf_list_front() */
+	bpf_spin_unlock(&lock);
+	if (rc) {
+		rc = -4;
+		goto out;
+	}
+
+out:
+	bpf_obj_drop(owner);
+	return rc;
+}
+
+struct uninit_head_val {
+	struct bpf_spin_lock lock;	/* held around the push in list_push_back_uninit_head */
+	struct bpf_list_head head __contains(node_data, l);	/* list of node_data linked via 'l' */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct uninit_head_val);
+	__uint(max_entries, 1);	/* single element, looked up with key 0 */
+} uninit_head_map SEC(".maps");
+
+SEC("tc")
+__description("list_push_back_uninit_head: push_back on 0-initialized list head")
+__success __retval(0)
+int list_push_back_uninit_head(void *ctx)
+{
+	struct uninit_head_val *slot;
+	struct node_data *fresh;
+	int rc = -1, idx = 0;
+
+	slot = bpf_map_lookup_elem(&uninit_head_map, &idx);
+	if (!slot)
+		return -1;
+
+	fresh = bpf_obj_new(typeof(*fresh));
+	if (!fresh)
+		return -1;
+
+	bpf_spin_lock(&slot->lock);
+	rc = bpf_list_push_back(&slot->head, &fresh->l);
+	bpf_spin_unlock(&slot->lock);
+
+	return rc;
+}
+
+SEC("?tc")
+__failure __msg("bpf_spin_lock at off=32 must be held for bpf_list_head")
+long list_del_without_lock_fail(void *ctx)	/* expected to be rejected at load time */
+{
+	struct node_data *n;
+	struct bpf_list_node *l;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return -1;
+
+	/* Error case: delete list node without holding lock */
+	l = bpf_list_del(&head, &n->l);	/* no bpf_spin_lock(&lock) taken before this call */
+	bpf_obj_drop(n);
+	if (!l)
+		return -2;
+	bpf_obj_drop(container_of(l, struct node_data, l));
+
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("bpf_spin_lock at off=32 must be held for bpf_list_head")
+long list_add_without_lock_fail(void *ctx)	/* expected to be rejected at load time */
+{
+	struct node_data *n, *prev;
+	long err;
+
+	n = bpf_obj_new(typeof(*n));
+	if (!n)
+		return -1;
+
+	prev = bpf_obj_new(typeof(*prev));
+	if (!prev) {
+		bpf_obj_drop(n);
+		return -1;
+	}
+
+	/* Error case: add list node without holding lock */
+	err = bpf_list_add(&head, &n->l, &prev->l);	/* no bpf_spin_lock(&lock) taken before this call */
+	bpf_obj_drop(prev);
+	if (err)
+		return -2;
+
+	return 0;
+}
+
+SEC("tc")
 __success
 long rbtree_refcounted_node_ref_escapes(void *ctx)
 {
@@ -615,13 +1036,31 @@ int percpu_hash_refcount_leak(void *ctx)
 	struct map_value *v;
 	int key = 0;
 
-	v = bpf_map_lookup_elem(&percpu_hash, &key);
+	v = bpf_map_lookup_percpu_elem(&percpu_hash, &key, 0);
 	if (!v)
 		return 0;
 
 	return __insert_in_list(&head, &lock, &v->node);
 }
 
+SEC("syscall")
+int clear_percpu_hash_kptr(void *ctx)
+{
+	struct node_data *stashed;
+	struct map_value *val;
+	int idx = 0;
+
+	val = bpf_map_lookup_percpu_elem(&percpu_hash, &idx, 0);	/* CPU 0's copy */
+	if (!val)
+		return 0;
+
+	stashed = bpf_kptr_xchg(&val->node, NULL);	/* take the kptr out of the map value */
+	if (!stashed)
+		return 0;
+	bpf_obj_drop(stashed);
+	return probe_read_refcount();
+}
+
 SEC("tc")
 int check_percpu_hash_refcount(void *ctx)
 {
diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c
index b2808bfcec29..7247a20c0a3b 100644
--- a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c
@@ -54,7 +54,7 @@ long rbtree_refcounted_node_ref_escapes(void *ctx)
 }
 
 SEC("?tc")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 long refcount_acquire_maybe_null(void *ctx)
 {
 	struct node_acquire *n, *m;
diff --git a/tools/testing/selftests/bpf/progs/rhash.c b/tools/testing/selftests/bpf/progs/rhash.c
new file mode 100644
index 000000000000..fc2dac3a719e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/rhash.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <stdbool.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+#define ENOENT 2
+#define EEXIST 17
+
+char _license[] SEC("license") = "GPL";
+
+int err;
+
+struct elem {
+	char arr[128];	/* zero-initialised by every test here, never read back */
+	int val;	/* payload the tests store and compare */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RHASH);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__uint(max_entries, 128);	/* test_rhash_update_elements inserts 128 keys */
+	__type(key, int);
+	__type(value, struct elem);
+} rhmap SEC(".maps");
+
+SEC("syscall")
+int test_rhash_lookup_update(void *ctx)
+{
+	struct elem init = {.val = 3, .arr = {0}};
+	struct elem *found;
+	int key = 5;
+
+	err = 1;
+	found = bpf_map_lookup_elem(&rhmap, &key);
+	if (found)
+		return 1;
+	/* Key is absent, so a BPF_NOEXIST insert is expected to succeed. */
+	err = bpf_map_update_elem(&rhmap, &key, &init, BPF_NOEXIST);
+	if (err)
+		return 1;
+	/* The value just stored must be readable back. */
+	found = bpf_map_lookup_elem(&rhmap, &key);
+	if (!found || found->val != init.val) {
+		err = 2;
+		return 2;
+	}
+
+	err = 0;
+	return 0;
+}
+
+SEC("syscall")
+int test_rhash_update_delete(void *ctx)
+{
+	struct elem init = {.val = 4, .arr = {0}};
+	struct elem *found;
+	int key = 6;
+
+	err = 1;
+	found = bpf_map_lookup_elem(&rhmap, &key);
+	if (found)
+		return 1;
+	/* Insert the key ... */
+	err = bpf_map_update_elem(&rhmap, &key, &init, BPF_NOEXIST);
+	if (err)
+		return 2;
+	/* ... remove it again ... */
+	err = bpf_map_delete_elem(&rhmap, &key);
+	if (err)
+		return 3;
+	/* ... and check that a lookup no longer finds it. */
+	found = bpf_map_lookup_elem(&rhmap, &key);
+	if (found) {
+		err = 4;
+		return 4;
+	}
+
+	err = 0;
+	return 0;
+}
+
+SEC("syscall")
+int test_rhash_update_elements(void *ctx)
+{
+	struct elem value = {.val = 4, .arr = {0}};
+	struct elem *found;
+	int key = 0;
+	int idx;
+
+	err = 1;
+
+	for (idx = 0; idx < 128; ++idx) {
+		key = idx;
+		found = bpf_map_lookup_elem(&rhmap, &key);
+		if (found)
+			return 1;
+		/* Store the key itself as the value so it can be verified. */
+		value.val = key;
+		err = bpf_map_update_elem(&rhmap, &key, &value, BPF_NOEXIST);
+		if (err)
+			return 2;
+
+		found = bpf_map_lookup_elem(&rhmap, &key);
+		if (!found || found->val != key) {
+			err = 4;
+			return 4;
+		}
+	}
+	/* Second pass: every key must delete cleanly and then be absent. */
+	for (idx = 0; idx < 128; ++idx) {
+		key = idx;
+		err = bpf_map_delete_elem(&rhmap, &key);
+		if (err)
+			return 3;
+
+		found = bpf_map_lookup_elem(&rhmap, &key);
+		if (found) {
+			err = 5;
+			return 5;
+		}
+	}
+
+	err = 0;
+	return 0;
+}
+
+SEC("syscall")
+int test_rhash_update_exist(void *ctx)
+{
+	struct elem first = {.val = 100, .arr = {0}};
+	struct elem second = {.val = 200, .arr = {0}};
+	struct elem *found;
+	int key = 10;
+	int rc;
+
+	err = 1;
+
+	/* With no such key yet, BPF_EXIST has to be refused with -ENOENT */
+	rc = bpf_map_update_elem(&rhmap, &key, &first, BPF_EXIST);
+	if (rc != -ENOENT)
+		return 1;
+
+	/* Create the element */
+	rc = bpf_map_update_elem(&rhmap, &key, &first, BPF_NOEXIST);
+	if (rc)
+		return 2;
+
+	/* It must hold the first value */
+	found = bpf_map_lookup_elem(&rhmap, &key);
+	if (!found || found->val != 100)
+		return 3;
+
+	/* Now that the key exists, BPF_EXIST must overwrite it */
+	rc = bpf_map_update_elem(&rhmap, &key, &second, BPF_EXIST);
+	if (rc)
+		return 4;
+
+	/* It must hold the second value */
+	found = bpf_map_lookup_elem(&rhmap, &key);
+	if (!found || found->val != 200)
+		return 5;
+
+	/* Remove the key again; the result of the delete is not checked */
+	bpf_map_delete_elem(&rhmap, &key);
+	err = 0;
+	return 0;
+}
+
+SEC("syscall")
+int test_rhash_update_any(void *ctx)
+{
+	struct elem first = {.val = 111, .arr = {0}};
+	struct elem second = {.val = 222, .arr = {0}};
+	struct elem *found;
+	int key = 11;
+	int rc;
+
+	err = 1;
+
+	/* BPF_ANY with the key absent: acts as an insert */
+	rc = bpf_map_update_elem(&rhmap, &key, &first, BPF_ANY);
+	if (rc)
+		return 1;
+
+	found = bpf_map_lookup_elem(&rhmap, &key);
+	if (!found || found->val != 111)
+		return 2;
+
+	/* BPF_ANY with the key present: acts as an overwrite */
+	rc = bpf_map_update_elem(&rhmap, &key, &second, BPF_ANY);
+	if (rc)
+		return 3;
+
+	found = bpf_map_lookup_elem(&rhmap, &key);
+	if (!found || found->val != 222)
+		return 4;
+
+	/* Remove the key again; the result of the delete is not checked */
+	bpf_map_delete_elem(&rhmap, &key);
+	err = 0;
+	return 0;
+}
+
+SEC("syscall")
+int test_rhash_noexist_duplicate(void *ctx)
+{
+	struct elem value = {.val = 600, .arr = {0}};
+	int key = 12;
+	int rc;
+
+	err = 1;
+
+	/* First insert of the key */
+	rc = bpf_map_update_elem(&rhmap, &key, &value, BPF_NOEXIST);
+	if (rc)
+		return 1;
+
+	/* A second BPF_NOEXIST insert of the same key must give -EEXIST */
+	rc = bpf_map_update_elem(&rhmap, &key, &value, BPF_NOEXIST);
+	if (rc != -EEXIST)
+		return 2;
+
+	/* Remove the key again; the result of the delete is not checked */
+	bpf_map_delete_elem(&rhmap, &key);
+	err = 0;
+	return 0;
+}
+
+SEC("syscall")
+int test_rhash_delete_nonexistent(void *ctx)
+{
+	int missing_key = 99999;
+	int rc;
+
+	err = 1;
+
+	/* Deleting a key that was never inserted must give -ENOENT */
+	rc = bpf_map_delete_elem(&rhmap, &missing_key);
+	if (rc != -ENOENT)
+		return 1;
+
+	err = 0;
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c
index d330b1511979..636a7cd8e2fa 100644
--- a/tools/testing/selftests/bpf/progs/setget_sockopt.c
+++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c
@@ -387,6 +387,24 @@ int _getsockopt(struct bpf_sockopt *ctx)
 	return 1;
 }
 
+int v4mapped_v6_ip_tos_enable;
+int v4mapped_v6_ip_tos_ret;
+int v4mapped_v6_ip_tos_cnt;
+int v4mapped_v6_ip_tos_val;
+
+static void test_v4mapped_v6_ip_tos(struct bpf_sock_ops *skops)
+{
+	int tos_val = v4mapped_v6_ip_tos_val;
+
+	if (!v4mapped_v6_ip_tos_enable)
+		return;
+	if (skops->op != BPF_SOCK_OPS_TCP_CONNECT_CB || skops->family != AF_INET6)
+		return;
+	/* Count the attempt and keep what the IPPROTO_IP-level call returned. */
+	v4mapped_v6_ip_tos_cnt++;
+	v4mapped_v6_ip_tos_ret = bpf_setsockopt(skops, IPPROTO_IP, IP_TOS, &tos_val, sizeof(tos_val));
+}
+
 SEC("sockops")
 int skops_sockopt(struct bpf_sock_ops *skops)
 {
@@ -401,6 +419,11 @@ int skops_sockopt(struct bpf_sock_ops *skops)
 	if (!sk)
 		return 1;
 
+	if (v4mapped_v6_ip_tos_enable) {
+		test_v4mapped_v6_ip_tos(skops);
+		return 1;
+	}
+
 	switch (skops->op) {
 	case BPF_SOCK_OPS_TCP_LISTEN_CB:
 		nr_listen += !(bpf_test_sockopt(skops, sk) ||
diff --git a/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c b/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c
index 09a00d11ffcc..bae5283fca6b 100644
--- a/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c
+++ b/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c
@@ -5,6 +5,7 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <errno.h>
+#include "err.h"
 
 extern int tcp_memory_per_cpu_fw_alloc __ksym;
 extern int udp_memory_per_cpu_fw_alloc __ksym;
@@ -97,6 +98,7 @@ int sock_create(struct bpf_sock *ctx)
 	return 1;
 
 err:
+	set_if_not_errno_or_zero(err, -EFAULT);
 	bpf_set_retval(err);
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/stack_arg.c b/tools/testing/selftests/bpf/progs/stack_arg.c
new file mode 100644
index 000000000000..944e3bb603e7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stack_arg.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <stdbool.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_kfuncs.h"
+
+#define CLOCK_MONOTONIC 1
+
+struct timer_elem {
+	struct bpf_timer timer;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, int);
+	__type(value, struct timer_elem);
+} timer_map SEC(".maps");
+
+int timer_result;
+
+#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \
+	defined(__BPF_FEATURE_STACK_ARGUMENT)
+
+const volatile bool has_stack_arg = true;
+
+__noinline static int static_func_many_args(int a, int b, int c, int d,
+					    int e, int f, int g, int h,
+					    int i, int j)
+{
+	return a + b + c + d + e + f + g + h + i + j;	/* plain sum of all ten */
+}
+
+__noinline int global_calls_many_args(int a, int b, int c)
+{
+	return static_func_many_args(a, b, c, a + 3, a + 4, a + 5, a + 6,
+				     a + 7, a + 8, a + 9);	/* args 4-10 derived from a */
+}
+
+SEC("tc")
+int test_global_many_args(void)
+{
+	return global_calls_many_args(1, 2, 3);	/* 1+2+...+10 = 55 */
+}
+
+struct test_data {
+	long x;
+	long y;
+};
+
+/* As called below: 1+2+3+4+5+6+7+8+9+10+20 = 75; p is the tenth argument */
+__noinline static long func_with_ptr_stack_arg(long a, long b, long c, long d,
+					       long e, long f, long g, long h,
+					       long i, struct test_data *p)
+{
+	return a + b + c + d + e + f + g + h + i + p->x + p->y;
+}
+
+__noinline long global_ptr_stack_arg(long a, long b, long c, long d, long e)
+{
+	struct test_data data = { .x = 10, .y = 20 };	/* local; its address is passed down */
+
+	return func_with_ptr_stack_arg(a, b, c, d, e, a + 5, a + 6, a + 7,
+				      a + 8, &data);
+}
+
+SEC("tc")
+int test_bpf2bpf_ptr_stack_arg(void)
+{
+	return global_ptr_stack_arg(1, 2, 3, 4, 5);	/* 45 + 10 + 20 = 75 */
+}
+
+/* As called below: 1+2+3+4+5+6+7+10+8+20 = 66; pointers are args 8 and 10 */
+__noinline static long func_with_mix_stack_args(long a, long b, long c, long d,
+						long e, long f, long g,
+						struct test_data *p,
+						long h, struct test_data *q)
+{
+	return a + b + c + d + e + f + g + p->x + h + q->y;
+}
+
+__noinline long global_mix_stack_args(long a, long b, long c, long d, long e)
+{
+	struct test_data p = { .x = 10 };	/* callee reads only p->x */
+	struct test_data q = { .y = 20 };	/* callee reads only q->y */
+
+	return func_with_mix_stack_args(a, b, c, d, e, e + 1, e + 2, &p,
+					e + 3, &q);
+}
+
+SEC("tc")
+int test_bpf2bpf_mix_stack_args(void)
+{
+	return global_mix_stack_args(1, 2, 3, 4, 5);	/* 28 + 10 + 8 + 20 = 66 */
+}
+
+/*
+ * Nesting test: func_outer_ptr calls func_inner_ptr, each taking a struct
+ * pointer as its tenth argument.
+ *
+ * func_inner_ptr, as reached from the test: (a+1)+...+(i+1) + p->x + p->y
+ *           = 2+3+4+5+6+7+8+9+10+10+20 = 84
+ */
+__noinline static long func_inner_ptr(long a, long b, long c, long d,
+				      long e, long f, long g, long h,
+				      long i, struct test_data *p)
+{
+	return a + b + c + d + e + f + g + h + i + p->x + p->y;
+}
+
+__noinline static long func_outer_ptr(long a, long b, long c, long d,
+				      long e, long f, long g, long h,
+				      long i, struct test_data *p)
+{
+	return func_inner_ptr(a + 1, b + 1, c + 1, d + 1, e + 1,
+			      f + 1, g + 1, h + 1, i + 1, p);	/* p forwarded unchanged */
+}
+
+__noinline long global_nesting_ptr(long a, long b, long c, long d, long e)
+{
+	struct test_data data = { .x = 10, .y = 20 };	/* local; its address crosses two calls */
+
+	return func_outer_ptr(a, b, c, d, e, a + 5, a + 6, a + 7, a + 8,
+			      &data);
+}
+
+SEC("tc")
+int test_bpf2bpf_nesting_stack_arg(void)
+{
+	return global_nesting_ptr(1, 2, 3, 4, 5);	/* 54 + 10 + 20 = 84 */
+}
+
+/* As called below: 1+2+...+9 + dynptr size = 45+54 = 99 for sizeof(pkt_v4) */
+__noinline static long func_with_dynptr(long a, long b, long c, long d,
+					long e, long f, long g, long h,
+					long i, struct bpf_dynptr *ptr)
+{
+	return a + b + c + d + e + f + g + h + i + bpf_dynptr_size(ptr);
+}
+
+__noinline long global_dynptr_stack_arg(void *ctx __arg_ctx, long a, long b,
+					long c, long d)
+{
+	struct bpf_dynptr ptr;
+
+	bpf_dynptr_from_skb(ctx, 0, &ptr);	/* return value not checked */
+	return func_with_dynptr(a, b, c, d, d + 1, d + 2, d + 3, d + 4,
+				d + 5, &ptr);
+}
+
+SEC("tc")
+int test_bpf2bpf_dynptr_stack_arg(struct __sk_buff *skb)
+{
+	return global_dynptr_stack_arg(skb, 1, 2, 3, 4);	/* 45 + dynptr size */
+}
+
+/* foo1: sum of its ten int arguments, a+b+c+d+e+f+g+h+i+j */
+__noinline static int foo1(int a, int b, int c, int d, int e,
+			   int f, int g, int h, int i, int j)
+{
+	return a + b + c + d + e + f + g + h + i + j;
+}
+
+/* foo2: sum of its twelve int arguments, a+b+c+d+e+f+g+h+i+j+k+l */
+__noinline static int foo2(int a, int b, int c, int d, int e,
+			   int f, int g, int h, int i, int j,
+			   int k, int l)
+{
+	return a + b + c + d + e + f + g + h + i + j + k + l;
+}
+
+/* global_two_callees calls foo1 (5 stack args) and foo2 (7 stack args).
+ * The outgoing stack arg area is sized for foo2 (the larger callee).
+ * Stores for foo1 are a subset of the area used by foo2.
+ * Result for (1, 2, 3, 4, 5): foo1(1..10) + foo2(1..12) = 55 + 78 = 133
+ *
+ * Pass a-e through so the compiler can't constant-fold the stack args away.
+ */
+__noinline int global_two_callees(int a, int b, int c, int d, int e)
+{
+	int ret;
+
+	ret = foo1(a, b, c, d, e, a + 5, a + 6, a + 7, a + 8, a + 9);
+	ret += foo2(a, b, c, d, e, a + 5, a + 6, a + 7, a + 8, a + 9,
+		    a + 10, a + 11);
+	return ret;
+}
+
+SEC("tc")
+int test_two_callees(void)
+{
+	return global_two_callees(1, 2, 3, 4, 5);	/* 55 + 78 = 133 */
+}
+
+const volatile int timer_base = 10;
+
+static int timer_cb_many_args(void *map, int *key, struct bpf_timer *timer)
+{
+	int v = timer_base;	/* const volatile global, so read at run time */
+
+	timer_result = static_func_many_args(v, v * 2, v * 3, v * 4, v * 5,
+					     v * 6, v * 7, v * 8, v * 9,
+					     v * 10);	/* = 55 * v; 550 when v is 10 */
+	return 0;
+}
+
+SEC("tc")
+int test_async_cb_many_args(void)
+{
+	struct timer_elem *elem;
+	int key = 0;
+
+	elem = bpf_map_lookup_elem(&timer_map, &key);
+	if (!elem)
+		return -1;
+	/* Timer helper results are not checked; the callback sets timer_result. */
+	bpf_timer_init(&elem->timer, &timer_map, CLOCK_MONOTONIC);
+	bpf_timer_set_callback(&elem->timer, timer_cb_many_args);
+	bpf_timer_start(&elem->timer, 1, 0);
+	return 0;
+}
+
+#else /* no stack-argument support: same program names, each just returns 0 */
+
+const volatile bool has_stack_arg = false;
+
+SEC("tc")
+int test_global_many_args(void)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_bpf2bpf_ptr_stack_arg(void)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_bpf2bpf_mix_stack_args(void)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_bpf2bpf_nesting_stack_arg(void)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_bpf2bpf_dynptr_stack_arg(struct __sk_buff *skb)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_two_callees(void)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_async_cb_many_args(void)
+{
+	return 0;
+}
+
+#endif /* x86/arm64 target with __BPF_FEATURE_STACK_ARGUMENT */
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/stack_arg_fail.c b/tools/testing/selftests/bpf/progs/stack_arg_fail.c
new file mode 100644
index 000000000000..ad9d4bfe15dc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stack_arg_fail.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "../test_kmods/bpf_testmod_kfunc.h"
+#include "bpf_misc.h"
+
+#if defined(__BPF_FEATURE_STACK_ARGUMENT)
+
+SEC("tc")
+__failure __msg("Unrecognized *(R11-8) type STRUCT")
+int test_stack_arg_big(struct __sk_buff *skb)
+{
+	struct prog_test_big_arg s = { .a = 1, .b = 2 };
+
+	return bpf_kfunc_call_stack_arg_big(1, 2, 3, 4, 5, s);	/* s passed by value as arg 6 */
+}
+
+SEC("socket")
+__description("r11 in ALU instruction")
+__failure __msg("R11 is invalid")
+__naked void r11_alu_reject(void)
+{
+	asm volatile (
+	"r11 += 1;"	/* r11 as the destination of an ALU op */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+SEC("socket")
+__description("r11 store with non-DW size")
+__failure __msg("R11 is invalid")
+__naked void r11_store_non_dw(void)
+{
+	asm volatile (
+	"*(u32 *)(r11 - 8) = r1;"	/* 4-byte store instead of u64 */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+SEC("socket")
+__description("r11 store with unaligned offset")
+__failure __msg("R11 is invalid")
+__naked void r11_store_unaligned(void)
+{
+	asm volatile (
+	"*(u64 *)(r11 - 4) = r1;"	/* offset -4 is not a multiple of 8 */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+SEC("socket")
+__description("r11 store with positive offset")
+__failure __msg("R11 is invalid")
+__naked void r11_store_positive_off(void)
+{
+	asm volatile (
+	"*(u64 *)(r11 + 8) = r1;"	/* u64 store, but at offset +8 */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+SEC("socket")
+__description("r11 load with negative offset")
+__failure __msg("R11 is invalid")
+__naked void r11_load_negative_off(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r11 - 8);"	/* u64 load, but from offset -8 */
+	"exit;"
+	::: __clobber_all);
+}
+
+SEC("socket")
+__description("r11 load with non-DW size")
+__failure __msg("R11 is invalid")
+__naked void r11_load_non_dw(void)
+{
+	asm volatile (
+	"r0 = *(u32 *)(r11 + 8);"	/* 4-byte load instead of u64 */
+	"exit;"
+	::: __clobber_all);
+}
+
+SEC("socket")
+__description("r11 store with zero offset")
+__failure __msg("R11 is invalid")
+__naked void r11_store_zero_off(void)
+{
+	asm volatile (
+	"*(u64 *)(r11 + 0) = r1;"	/* u64 store at offset 0 */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+#else /* !__BPF_FEATURE_STACK_ARGUMENT: one trivially passing program */
+
+SEC("tc")
+__description("stack_arg_fail: not supported, dummy test")
+__success
+int test_stack_arg_big(struct __sk_buff *skb)
+{
+	return 0;
+}
+
+#endif /* __BPF_FEATURE_STACK_ARGUMENT */
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c b/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c
new file mode 100644
index 000000000000..345f2da2e361
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_kfuncs.h"
+#include "../test_kmods/bpf_testmod_kfunc.h"
+
+#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \
+	defined(__BPF_FEATURE_STACK_ARGUMENT)
+
+const volatile bool has_stack_arg = true;
+
+struct bpf_iter_testmod_seq {
+	u64 :64;	/* unnamed: only reserves 8 bytes */
+	u64 :64;	/* unnamed: only reserves 8 bytes */
+};
+
+extern int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) __ksym;
+extern void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) __ksym;
+
+struct timer_map_value {
+	struct bpf_timer timer;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, int);
+	__type(value, struct timer_map_value);
+} kfunc_timer_map SEC(".maps");
+
+SEC("tc")
+int test_stack_arg_scalar(struct __sk_buff *skb)
+{
+	return bpf_kfunc_call_stack_arg(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);	/* ten scalars */
+}
+
+SEC("tc")
+int test_stack_arg_ptr(struct __sk_buff *skb)
+{
+	struct prog_test_pass1 p = { .x0 = 10, .x1 = 20 };
+
+	return bpf_kfunc_call_stack_arg_ptr(1, 2, 3, 4, 5, 6, 7, 8, 9, &p);	/* &p is arg 10 */
+}
+
+SEC("tc")
+int test_stack_arg_mix(struct __sk_buff *skb)
+{
+	struct prog_test_pass1 p = { .x0 = 10 };
+	struct prog_test_pass1 q = { .x1 = 20 };
+
+	return bpf_kfunc_call_stack_arg_mix(1, 2, 3, 4, 5, 6, 7, &p, 8, &q);	/* pointers: args 8, 10 */
+}
+
+/* 1+2+3+4+5+6+7+8+9+sizeof(pkt_v4) = 45+54 = 99 */
+SEC("tc")
+int test_stack_arg_dynptr(struct __sk_buff *skb)
+{
+	struct bpf_dynptr ptr;
+
+	bpf_dynptr_from_skb(skb, 0, &ptr);	/* return value not checked */
+	return bpf_kfunc_call_stack_arg_dynptr(1, 2, 3, 4, 5, 6, 7, 8, 9, &ptr);
+}
+
+/* 1 + 2 + 3 + 4 + 5 + (1 + 2 + ... + 16) = 15 + 136 = 151 */
+SEC("tc")
+int test_stack_arg_mem(struct __sk_buff *skb)
+{
+	char buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+
+	return bpf_kfunc_call_stack_arg_mem(1, 2, 3, 4, 5, buf, sizeof(buf));	/* buf: arg 6, size: arg 7 */
+}
+
+/* 1+2+3+4+5+6+7+8+9+100 = 145 */
+SEC("tc")
+int test_stack_arg_iter(struct __sk_buff *skb)
+{
+	struct bpf_iter_testmod_seq it;
+	u64 ret;
+
+	bpf_iter_testmod_seq_new(&it, 100, 10);	/* return value not checked */
+	ret = bpf_kfunc_call_stack_arg_iter(1, 2, 3, 4, 5, 6, 7, 8, 9, &it);
+	bpf_iter_testmod_seq_destroy(&it);
+	return ret;
+}
+
+const char cstr[] = "hello";
+
+/* 1+2+3+4+5+6+7+8+9 = 45; the string argument adds nothing to the sum */
+SEC("tc")
+int test_stack_arg_const_str(struct __sk_buff *skb)
+{
+	return bpf_kfunc_call_stack_arg_const_str(1, 2, 3, 4, 5, 6, 7, 8, 9,
+						  cstr);
+}
+
+/* 1+2+3+4+5+6+7+8+9 = 45; the timer argument adds nothing to the sum */
+SEC("tc")
+int test_stack_arg_timer(struct __sk_buff *skb)
+{
+	struct timer_map_value *val;
+	int key = 0;
+
+	val = bpf_map_lookup_elem(&kfunc_timer_map, &key);
+	if (!val)
+		return 0;	/* lookup failed: 0 rather than the 45 noted above */
+	return bpf_kfunc_call_stack_arg_timer(1, 2, 3, 4, 5, 6, 7, 8, 9,
+					      &val->timer);
+}
+
+#else /* no stack-argument support: same program names, each just returns 0 */
+
+const volatile bool has_stack_arg = false;
+
+SEC("tc")
+int test_stack_arg_scalar(struct __sk_buff *skb)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_stack_arg_ptr(struct __sk_buff *skb)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_stack_arg_mix(struct __sk_buff *skb)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_stack_arg_dynptr(struct __sk_buff *skb)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_stack_arg_mem(struct __sk_buff *skb)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_stack_arg_iter(struct __sk_buff *skb)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_stack_arg_const_str(struct __sk_buff *skb)
+{
+	return 0;
+}
+
+SEC("tc")
+int test_stack_arg_timer(struct __sk_buff *skb)
+{
+	return 0;
+}
+
+#endif /* x86/arm64 target with __BPF_FEATURE_STACK_ARGUMENT */
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/stack_arg_precision.c b/tools/testing/selftests/bpf/progs/stack_arg_precision.c
new file mode 100644
index 000000000000..bee2eeec021d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stack_arg_precision.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "../test_kmods/bpf_testmod_kfunc.h"
+#include "bpf_misc.h"
+
+#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \
+	defined(__BPF_FEATURE_STACK_ARGUMENT)
+
+/* Force extern BTF generation for bpf_kfunc_call_stack_arg_mem(), which is
+ * otherwise only called from inline asm below. Uses its own SEC so it's not
+ * included as a .text subprog; the '?' prefix makes libbpf skip loading it.
+ */
+SEC("?tc")
+int __btf_kfunc_gen(struct __sk_buff *ctx)
+{
+	char buf[8] = {};
+
+	return bpf_kfunc_call_stack_arg_mem(0, 0, 0, 0, 0, buf, sizeof(buf));
+}
+
+/*
+ * Test precision backtracking across bpf-to-bpf call for kfunc stack arg.
+ * subprog_call_mem_kfunc receives a size as incoming stack arg (arg6)
+ * and forwards it as mem__sz (arg7) to bpf_kfunc_call_stack_arg_mem.
+ */
+__naked __noinline __used
+static long subprog_call_mem_kfunc(long a, long b, long c, long d, long e, long size)
+{
+	asm volatile (
+		"r1 = *(u64 *)(r11 + 8);"	/* r1 = incoming arg6 (size) */
+		"r2 = 0x0807060504030201 ll;"	/* r2 = buf contents */
+		"*(u64 *)(r10 - 8) = r2;"	/* store buf to stack */
+		"r2 = r10;"
+		"r2 += -8;"			/* r2 = &buf */
+		"*(u64 *)(r11 - 8) = r2;"	/* outgoing arg6 = buf */
+		"*(u64 *)(r11 - 16) = r1;"	/* outgoing arg7 = size */
+		"r1 = 1;"			/* register args 1-5 are constants */
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"call %[bpf_kfunc_call_stack_arg_mem];"
+		"exit;"				/* r0 from the kfunc is returned as-is */
+		:
+		: __imm(bpf_kfunc_call_stack_arg_mem)
+		: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: precision backtracking across bpf2bpf call for kfunc")
+__success
+__log_level(2)
+__flag(BPF_F_TEST_STATE_FREQ)
+__btf_func_path("btf__stack_arg_precision.bpf.o")
+__msg("mark_precise: frame1: last_idx 26 first_idx 13 subseq_idx -1")
+__msg("mark_precise: frame1: regs= stack= before 25: (b7) r5 = 5")
+__msg("mark_precise: frame1: regs= stack= before 24: (b7) r4 = 4")
+__msg("mark_precise: frame1: regs= stack= before 23: (b7) r3 = 3")
+__msg("mark_precise: frame1: regs= stack= before 22: (b7) r2 = 2")
+__msg("mark_precise: frame1: regs= stack= before 21: (b7) r1 = 1")
+__msg("mark_precise: frame1: regs= stack= before 20: (7b) *(u64 *)(r11 -16) = r1")
+__msg("mark_precise: frame1: regs=r1 stack= before 19: (7b) *(u64 *)(r11 -8) = r2")
+__msg("mark_precise: frame1: regs=r1 stack= before 18: (07) r2 += -8")
+__msg("mark_precise: frame1: regs=r1 stack= before 17: (bf) r2 = r10")
+__msg("mark_precise: frame1: regs=r1 stack= before 16: (7b) *(u64 *)(r10 -8) = r2")
+__msg("mark_precise: frame1: regs=r1 stack= before 14: (18) r2 = 0x807060504030201")
+__msg("mark_precise: frame1: regs=r1 stack= before 13: (79) r1 = *(u64 *)(r11 +8)")
+__msg("mark_precise: frame1: parent state regs= stack=:  frame1: R10=fp0")
+__msg("mark_precise: frame0: parent state regs= stack=:  R10=fp0")
+__msg("mark_precise: frame1: last_idx 11 first_idx 11 subseq_idx 13")
+__msg("mark_precise: frame1: regs= stack= before 11: (85) call pc+1")
+__msg("mark_precise: frame0: parent state regs= stack=:  R1=1 R2=2 R3=3 R4=4 R5=5 R10=fp0")
+__msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx 11")
+__msg("mark_precise: frame0: regs= stack= before 9: (05) goto pc+1")
+__msg("mark_precise: frame0: regs= stack= before 8: (7a) *(u64 *)(r11 -8) = 4")
+__msg("mark_precise: frame1: last_idx 26 first_idx 13 subseq_idx -1 ")
+__msg("mark_precise: frame1: regs= stack= before 25: (b7) r5 = 5")
+__msg("mark_precise: frame1: regs= stack= before 24: (b7) r4 = 4")
+__msg("mark_precise: frame1: regs= stack= before 23: (b7) r3 = 3")
+__msg("mark_precise: frame1: regs= stack= before 22: (b7) r2 = 2")
+__msg("mark_precise: frame1: regs= stack= before 21: (b7) r1 = 1")
+__msg("mark_precise: frame1: regs= stack= before 20: (7b) *(u64 *)(r11 -16) = r1")
+__msg("mark_precise: frame1: regs=r1 stack= before 19: (7b) *(u64 *)(r11 -8) = r2")
+__msg("mark_precise: frame1: regs=r1 stack= before 18: (07) r2 += -8")
+__msg("mark_precise: frame1: regs=r1 stack= before 17: (bf) r2 = r10")
+__msg("mark_precise: frame1: regs=r1 stack= before 16: (7b) *(u64 *)(r10 -8) = r2")
+__msg("mark_precise: frame1: regs=r1 stack= before 14: (18) r2 = 0x807060504030201")
+__msg("mark_precise: frame1: regs=r1 stack= before 13: (79) r1 = *(u64 *)(r11 +8)")
+__msg("mark_precise: frame1: parent state regs= stack=:  frame1: R10=fp0")
+__msg("mark_precise: frame0: parent state regs= stack=:  R10=fp0")
+__msg("mark_precise: frame1: last_idx 11 first_idx 11 subseq_idx 13 ")
+__msg("mark_precise: frame1: regs= stack= before 11: (85) call pc+1")
+__msg("mark_precise: frame0: parent state regs= stack=:  R1=1 R2=2 R3=3 R4=4 R5=5 R10=fp0")
+__msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx 11 ")
+__msg("mark_precise: frame0: regs= stack= before 10: (7a) *(u64 *)(r11 -8) = 6")
+__naked void stack_arg_precision_bpf2bpf(void)
+{
+	asm volatile (
+		"call %[bpf_get_prandom_u32];"
+		"r6 = r0;"			/* random value picks the branch */
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"if r6 < 2 goto l0_%=;"
+		"*(u64 *)(r11 - 8) = 4;"	/* outgoing arg6 (size) = 4 */
+		"goto l1_%=;"
+	"l0_%=:"
+		"*(u64 *)(r11 - 8) = 6;"	/* outgoing arg6 (size) = 6 */
+	"l1_%=:"
+		"call subprog_call_mem_kfunc;"
+		"exit;"
+		:: __imm(bpf_get_prandom_u32)
+		: __clobber_all
+	);
+}
+
+#else /* no stack-argument support: one trivially passing program */
+
+SEC("socket")
+__description("stack_arg_precision: not supported, dummy test")
+__success
+int dummy_test(void)
+{
+	return 0;
+}
+
+#endif /* x86/arm64 target with __BPF_FEATURE_STACK_ARGUMENT */
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c
index 6f999ba951a3..92ba1d72e0ec 100644
--- a/tools/testing/selftests/bpf/progs/stream.c
+++ b/tools/testing/selftests/bpf/progs/stream.c
@@ -5,7 +5,7 @@
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
 #include "bpf_experimental.h"
-#include "bpf_arena_common.h"
+#include <bpf_arena_common.h>
 
 struct arr_elem {
 	struct bpf_res_spin_lock lock;
diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c
index 8e8249f3521c..21428bb1ee59 100644
--- a/tools/testing/selftests/bpf/progs/stream_fail.c
+++ b/tools/testing/selftests/bpf/progs/stream_fail.c
@@ -23,7 +23,7 @@ int stream_vprintk_scalar_arg(void *ctx)
 }
 
 SEC("syscall")
-__failure __msg("arg#1 doesn't point to a const string")
+__failure __msg("R2 doesn't point to a const string")
 int stream_vprintk_string_arg(void *ctx)
 {
 	bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0);
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c
index ce97d141daee..c4fadee5aadc 100644
--- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c
@@ -13,11 +13,14 @@ struct {
 static __noinline
 int subprog_tail(struct __sk_buff *skb)
 {
+	int ret = 1;
+
 	if (load_byte(skb, 0))
 		bpf_tail_call_static(skb, &jmp_table, 1);
 	else
 		bpf_tail_call_static(skb, &jmp_table, 0);
-	return 1;
+	barrier_var(ret);
+	return ret;
 }
 
 int count = 0;
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c
index d556b19413d7..1fd07824d88a 100644
--- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c
@@ -16,20 +16,25 @@ int count = 0;
 static __noinline
 int subprog_tail(struct __sk_buff *skb)
 {
+	int ret = 0;
+
 	bpf_tail_call_static(skb, &jmp_table, 0);
-	return 0;
+	barrier_var(ret);
+	return ret;
 }
 
 SEC("tc")
 int entry(struct __sk_buff *skb)
 {
-	int ret = 1;
+	int ret = 1, ret1, ret2;
 
 	clobber_regs_stack();
 
 	count++;
-	subprog_tail(skb);
-	subprog_tail(skb);
+	ret1 = subprog_tail(skb);
+	ret2 = subprog_tail(skb);
+	__sink(ret1);
+	__sink(ret2);
 
 	return ret;
 }
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c
index ae94c9c70ab7..6fde0ab92148 100644
--- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c
@@ -25,8 +25,11 @@ int count1 = 0;
 static __noinline
 int subprog_tail0(struct __sk_buff *skb)
 {
+	int ret = 0;
+
 	bpf_tail_call_static(skb, &jmp_table, 0);
-	return 0;
+	barrier_var(ret);
+	return ret;
 }
 
 __auxiliary
@@ -41,16 +44,22 @@ int classifier_0(struct __sk_buff *skb)
 static __noinline
 int subprog_tail1(struct __sk_buff *skb)
 {
+	int ret = 0;
+
 	bpf_tail_call_static(skb, &jmp_table, 1);
-	return 0;
+	barrier_var(ret);
+	return ret;
 }
 
 __auxiliary
 SEC("tc")
 int classifier_1(struct __sk_buff *skb)
 {
+	int ret;
+
 	count1++;
-	subprog_tail1(skb);
+	ret = subprog_tail1(skb);
+	__sink(ret);
 	return 0;
 }
 
@@ -59,13 +68,14 @@ __retval(33)
 SEC("tc")
 int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb)
 {
-	int ret = 0;
+	int ret = 0, ret1, ret2;
 
 	clobber_regs_stack();
 
-	subprog_tail0(skb);
-	subprog_tail1(skb);
-
+	ret1 = subprog_tail0(skb);
+	ret2 = subprog_tail1(skb);
+	__sink(ret1);
+	__sink(ret2);
 	__sink(ret);
 	return (count1 << 16) | count0;
 }
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c
index 56b6b0099840..0ef9cfb2da8d 100644
--- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c
@@ -33,17 +33,24 @@ int count = 0;
 static __noinline
 int subprog_tail(struct __sk_buff *skb, void *jmp_table)
 {
+	int ret = 0;
+
 	bpf_tail_call_static(skb, jmp_table, 0);
-	return 0;
+	barrier_var(ret);
+	return ret;
 }
 
 __auxiliary
 SEC("tc")
 int classifier_0(struct __sk_buff *skb)
 {
+	int ret1, ret2;
+
 	count++;
-	subprog_tail(skb, &jmp_table0);
-	subprog_tail(skb, &jmp_table1);
+	ret1 = subprog_tail(skb, &jmp_table0);
+	ret2 = subprog_tail(skb, &jmp_table1);
+	__sink(ret1);
+	__sink(ret2);
 	return count;
 }
 
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c
index 5261395713cd..6db9afee2095 100644
--- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c
@@ -18,18 +18,25 @@ int count = 0;
 static __noinline
 int subprog_tail(void *ctx)
 {
+	int ret = 0;
+
 	bpf_tail_call_static(ctx, &jmp_table, 0);
-	return 0;
+	barrier_var(ret);
+	return ret;
 }
 
 SEC("fentry/dummy")
 int BPF_PROG(fentry, struct sk_buff *skb)
 {
+	int ret1, ret2;
+
 	clobber_regs_stack();
 
 	count++;
-	subprog_tail(ctx);
-	subprog_tail(ctx);
+	ret1 = subprog_tail(ctx);
+	ret2 = subprog_tail(ctx);
+	__sink(ret1);
+	__sink(ret2);
 
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage.c b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage.c
new file mode 100644
index 000000000000..4dd3a0033d75
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+	__type(key, struct bpf_cgroup_storage_key);
+	__type(value, __u64); /* both programs below store 1 here */
+} storage_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(max_entries, 1); /* caller_prog only ever tail calls index 0 */
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} prog_array SEC(".maps");
+
+SEC("cgroup_skb/egress")
+int caller_prog(struct __sk_buff *skb)
+{
+	__u64 *val = bpf_get_local_storage(&storage_map, 0);
+
+	/* Touch storage_map, then try to tail call through slot 0. */
+	if (val)
+		*val = 1;
+
+	bpf_tail_call(skb, &prog_array, 0);
+	return 1; /* reached only if the tail call did not take place */
+}
+
+SEC("cgroup_skb/egress")
+int callee_prog(struct __sk_buff *skb)
+{
+	__u64 *val = bpf_get_local_storage(&storage_map, 0);
+
+	/* Store 1 into this program's storage_map slot; no tail call here. */
+	if (val)
+		*val = 1;
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_no_storage.c b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_no_storage.c
new file mode 100644
index 000000000000..5c69b0af6ff9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_no_storage.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(max_entries, 1); /* caller_prog only ever tail calls index 0 */
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} prog_array SEC(".maps");
+
+SEC("cgroup_skb/egress")
+int caller_prog(struct __sk_buff *skb)
+{
+	bpf_tail_call(skb, &prog_array, 0); /* jump to slot 0 of prog_array if it is populated */
+	return 1; /* reached only if the tail call did not take place */
+}
+
+SEC("cgroup_skb/egress")
+int leaf_prog(struct __sk_buff *skb)
+{
+	return 1; /* references no maps at all; unconditionally returns 1 */
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_owner.c b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_owner.c
new file mode 100644
index 000000000000..d7e8ec9855c5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_owner.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+	__type(key, struct bpf_cgroup_storage_key);
+	__type(value, __u64); /* prog_array_owner stores 1 here */
+} storage_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(max_entries, 1); /* prog_array_owner only ever tail calls index 0 */
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} prog_array SEC(".maps");
+
+SEC("cgroup_skb/egress")
+int prog_array_owner(struct __sk_buff *skb)
+{
+	__u64 *val = bpf_get_local_storage(&storage_map, 0);
+
+	/* Uses both storage_map and prog_array from a single program. */
+	if (val)
+		*val = 1;
+
+	bpf_tail_call(skb, &prog_array, 0);
+	return 1; /* reached only if the tail call did not take place */
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
index 4c07ea193f72..8942b5478129 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
@@ -5,6 +5,7 @@
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_helpers.h>
 
+#include "../bpf_experimental.h"
 #include "bpf_misc.h"
 #include "task_kfunc_common.h"
 
@@ -28,7 +29,7 @@ static struct __tasks_kfunc_map_value *insert_lookup_task(struct task_struct *ta
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -49,7 +50,7 @@ int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_f
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("arg#0 pointer type STRUCT task_struct must point")
+__failure __msg("R1 pointer type STRUCT task_struct must point")
 int BPF_PROG(task_kfunc_acquire_fp, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired, *stack_task = (struct task_struct *)&clone_flags;
@@ -100,7 +101,7 @@ int BPF_PROG(task_kfunc_acquire_unsafe_kretprobe_rcu, struct task_struct *task,
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(task_kfunc_acquire_null, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -149,7 +150,7 @@ int BPF_PROG(task_kfunc_xchg_unreleased, struct task_struct *task, u64 clone_fla
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -162,7 +163,7 @@ int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task,
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_flags)
 {
 	struct __tasks_kfunc_map_value *v;
@@ -178,7 +179,7 @@ int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_f
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("arg#0 pointer type STRUCT task_struct must point")
+__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1")
 int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired = (struct task_struct *)&clone_flags;
@@ -190,7 +191,7 @@ int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags)
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags)
 {
 	struct __tasks_kfunc_map_value local, *v;
@@ -224,7 +225,7 @@ int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags)
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("release kernel function bpf_task_release expects")
+__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1")
 int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_flags)
 {
 	/* Cannot release trusted task pointer which was not acquired. */
@@ -234,7 +235,46 @@ int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("bpf_obj_drop cannot be used in tracing programs on types with NMI unsafe fields")
+int BPF_PROG(task_kfunc_obj_drop_with_kptr, struct task_struct *task, u64 clone_flags)
+{
+	struct __tasks_kfunc_map_value *local;
+
+	local = bpf_obj_new(typeof(*local)); /* object of the map-value type; its kptr is never set */
+	if (!local)
+		return 0;
+
+	bpf_obj_drop(local); /* the call named in the expected verifier message */
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+__failure __msg("bpf_obj_drop cannot be used in tracing programs on types with NMI unsafe fields")
+int BPF_PROG(task_kfunc_obj_drop_nmi_with_kptr, struct task_struct *task,
+	     u64 clone_flags)
+{
+	struct __tasks_kfunc_map_value *local;
+	struct task_struct *acquired, *old;
+
+	(void)clone_flags;
+
+	local = bpf_obj_new(typeof(*local));
+	if (!local)
+		return 0;
+
+	acquired = bpf_task_acquire(task);
+	if (acquired) {
+		old = bpf_kptr_xchg(&local->task, acquired); /* stash the reference in the object */
+		if (old)
+			bpf_task_release(old); /* release whatever the slot held before */
+	}
+
+	bpf_obj_drop(local); /* drop while local->task may hold a task reference */
+	return 0;
+}
+
+SEC("tp_btf/task_newtask")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -248,7 +288,7 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(task_kfunc_from_vpid_no_null_check, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *acquired;
@@ -313,7 +353,7 @@ int BPF_PROG(task_access_comm4, struct task_struct *task, const char *buf, bool
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("R1 must be referenced or trusted")
+__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1")
 int BPF_PROG(task_kfunc_release_in_map, struct task_struct *task, u64 clone_flags)
 {
 	struct task_struct *local;
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c
index 5fb4fc19d26a..d63a79ee33dc 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c
@@ -140,17 +140,17 @@ int BPF_PROG(test_task_acquire_leave_in_map, struct task_struct *task, u64 clone
 	return 0;
 }
 
-SEC("tp_btf/task_newtask")
-int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags)
+SEC("syscall")
+int test_task_xchg_release(const void *ctx)
 {
-	struct task_struct *kptr, *acquired;
+	struct task_struct *task, *kptr, *acquired;
 	struct __tasks_kfunc_map_value *v, *local;
 	int refcnt, refcnt_after_drop;
 	long status;
 
-	if (!is_test_kfunc_task())
-		return 0;
+	(void)ctx;
 
+	task = bpf_get_current_task_btf();
 	status = tasks_kfunc_map_insert(task);
 	if (status) {
 		err = 1;
@@ -191,7 +191,7 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags)
 		return 0;
 	}
 
-	/* Stash a copy into local kptr and check if it is released recursively */
+	/* Stash a copy into local kptr and check if it is released recursively. */
 	acquired = bpf_task_acquire(kptr);
 	if (!acquired) {
 		err = 7;
@@ -220,7 +220,6 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags)
 	}
 
 	bpf_task_release(kptr);
-
 	return 0;
 }
 
diff --git a/tools/testing/selftests/bpf/progs/task_local_storage.c b/tools/testing/selftests/bpf/progs/task_local_storage.c
index 80a0a20db88d..34fa3d6451d2 100644
--- a/tools/testing/selftests/bpf/progs/task_local_storage.c
+++ b/tools/testing/selftests/bpf/progs/task_local_storage.c
@@ -14,12 +14,15 @@ struct {
 	__type(value, long);
 } enter_id SEC(".maps");
 
+#include "err.h"
+
 #define MAGIC_VALUE 0xabcd1234
 
 pid_t target_pid = 0;
 int mismatch_cnt = 0;
 int enter_cnt = 0;
 int exit_cnt = 0;
+long update_err = 0;
 
 SEC("tp_btf/sys_enter")
 int BPF_PROG(on_enter, struct pt_regs *regs, long id)
@@ -62,3 +65,19 @@ int BPF_PROG(on_exit, struct pt_regs *regs, long id)
 		__sync_fetch_and_add(&mismatch_cnt, 1);
 	return 0;
 }
+
+SEC("fexit/bpf_local_storage_update")
+int BPF_PROG(fexit_update, void *owner, struct bpf_local_storage_map *smap,
+	     void *value, u64 map_flags, bool swap_uptrs,
+	     struct bpf_local_storage_data *ret)
+{
+	struct task_struct *cur = bpf_get_current_task_btf();
+
+	/*
+	 * Ignore calls made by other tasks; for the target task, latch the
+	 * errno carried by an error-valued return pointer into update_err.
+	 */
+	if (cur->pid == target_pid && IS_ERR_VALUE(ret))
+		update_err = PTR_ERR(ret);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c
index 82e4b8913333..3186e7b4b24e 100644
--- a/tools/testing/selftests/bpf/progs/task_work_fail.c
+++ b/tools/testing/selftests/bpf/progs/task_work_fail.c
@@ -58,7 +58,7 @@ int mismatch_map(struct pt_regs *args)
 }
 
 SEC("perf_event")
-__failure __msg("arg#1 doesn't point to a map value")
+__failure __msg("R2 doesn't point to a map value")
 int no_map_task_work(struct pt_regs *args)
 {
 	struct task_struct *task;
@@ -70,7 +70,7 @@ int no_map_task_work(struct pt_regs *args)
 }
 
 SEC("perf_event")
-__failure __msg("Possibly NULL pointer passed to trusted arg1")
+__failure __msg("Possibly NULL pointer passed to trusted R2")
 int task_work_null(struct pt_regs *args)
 {
 	struct task_struct *task;
@@ -81,7 +81,7 @@ int task_work_null(struct pt_regs *args)
 }
 
 SEC("perf_event")
-__failure __msg("Possibly NULL pointer passed to trusted arg2")
+__failure __msg("Possibly NULL pointer passed to trusted R3")
 int map_null(struct pt_regs *args)
 {
 	struct elem *work;
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
index 2c156cd166af..332cda89caba 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
@@ -152,7 +152,7 @@ int change_status_after_alloc(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("Possibly NULL pointer passed to trusted arg1")
+__failure __msg("Possibly NULL pointer passed to trusted R2")
 int lookup_null_bpf_tuple(struct __sk_buff *ctx)
 {
 	struct bpf_ct_opts___local opts = {};
@@ -165,7 +165,7 @@ int lookup_null_bpf_tuple(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("Possibly NULL pointer passed to trusted arg3")
+__failure __msg("Possibly NULL pointer passed to trusted R4")
 int lookup_null_bpf_opts(struct __sk_buff *ctx)
 {
 	struct bpf_sock_tuple tup = {};
@@ -178,7 +178,7 @@ int lookup_null_bpf_opts(struct __sk_buff *ctx)
 }
 
 SEC("?xdp")
-__failure __msg("Possibly NULL pointer passed to trusted arg1")
+__failure __msg("Possibly NULL pointer passed to trusted R2")
 int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx)
 {
 	struct bpf_ct_opts___local opts = {};
@@ -191,7 +191,7 @@ int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx)
 }
 
 SEC("?xdp")
-__failure __msg("Possibly NULL pointer passed to trusted arg3")
+__failure __msg("Possibly NULL pointer passed to trusted R4")
 int xdp_lookup_null_bpf_opts(struct xdp_md *ctx)
 {
 	struct bpf_sock_tuple tup = {};
diff --git a/tools/testing/selftests/bpf/progs/test_fill_link_info.c b/tools/testing/selftests/bpf/progs/test_fill_link_info.c
index fac33a14f200..137bd6292163 100644
--- a/tools/testing/selftests/bpf/progs/test_fill_link_info.c
+++ b/tools/testing/selftests/bpf/progs/test_fill_link_info.c
@@ -12,7 +12,7 @@ extern bool CONFIG_PPC64 __kconfig __weak;
 
 /* This function is here to have CONFIG_X86_KERNEL_IBT,
  * CONFIG_PPC_FTRACE_OUT_OF_LINE, CONFIG_KPROBES_ON_FTRACE,
- * CONFIG_PPC6 used and added to object BTF.
+ * CONFIG_PPC64 used and added to object BTF.
  */
 int unused(void)
 {
diff --git a/tools/testing/selftests/bpf/progs/test_global_func3.c b/tools/testing/selftests/bpf/progs/test_global_func3.c
index 974fd8c19561..b66abb350fb0 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func3.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func3.c
@@ -53,9 +53,57 @@ int f8(struct __sk_buff *skb)
 	return f7(skb);
 }
 
+static __attribute__ ((noinline))
+int f9(struct __sk_buff *skb)
+{
+	return f8(skb); /* f9..f16 each wrap the previous function, one call level apiece */
+}
+
+static __attribute__ ((noinline))
+int f10(struct __sk_buff *skb)
+{
+	return f9(skb);
+}
+
+static __attribute__ ((noinline))
+int f11(struct __sk_buff *skb)
+{
+	return f10(skb);
+}
+
+static __attribute__ ((noinline))
+int f12(struct __sk_buff *skb)
+{
+	return f11(skb);
+}
+
+static __attribute__ ((noinline))
+int f13(struct __sk_buff *skb)
+{
+	return f12(skb);
+}
+
+static __attribute__ ((noinline))
+int f14(struct __sk_buff *skb)
+{
+	return f13(skb);
+}
+
+static __attribute__ ((noinline))
+int f15(struct __sk_buff *skb)
+{
+	return f14(skb);
+}
+
+static __attribute__ ((noinline))
+int f16(struct __sk_buff *skb)
+{
+	return f15(skb); /* outermost wrapper; global_func3 calls this one */
+}
+
 SEC("tc")
-__failure __msg("the call stack of 9 frames")
+__failure __msg("the call stack of 17 frames")
 int global_func3(struct __sk_buff *skb)
 {
-	return f8(skb);
+	return f16(skb);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
index d249113ed657..bf48fc43c7ab 100644
--- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
+++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
@@ -11,12 +11,7 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include "bpf_misc.h"
-
-extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym;
-extern void bpf_key_put(struct bpf_key *key) __ksym;
-extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr,
-				      struct bpf_dynptr *sig_ptr,
-				      struct bpf_key *trusted_keyring) __ksym;
+#include "bpf_kfuncs.h"
 
 struct {
 	__uint(type, BPF_MAP_TYPE_RINGBUF);
@@ -38,14 +33,14 @@ SEC("?lsm.s/bpf")
 __failure __msg("cannot pass in dynptr at an offset=-8")
 int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size, bool kernel)
 {
-	unsigned long val;
+	unsigned long val = 0;
 
 	return bpf_verify_pkcs7_signature((struct bpf_dynptr *)&val,
 					  (struct bpf_dynptr *)&val, NULL);
 }
 
 SEC("?lsm.s/bpf")
-__failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr")
+__failure __msg("R1 expected pointer to stack or const struct bpf_dynptr")
 int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size, bool kernel)
 {
 	static struct bpf_dynptr val;
diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c
index 967081bbcfe1..ca35b92ea095 100644
--- a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c
+++ b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c
@@ -29,7 +29,7 @@ int kfunc_dynptr_nullable_test2(struct __sk_buff *skb)
 }
 
 SEC("tc")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int kfunc_dynptr_nullable_test3(struct __sk_buff *skb)
 {
 	struct bpf_dynptr data;
diff --git a/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c b/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c
index 7a6620671a83..cbe4284c032f 100644
--- a/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c
@@ -13,9 +13,9 @@ int bpf_decoder(unsigned int *sample)
 	if (LIRC_IS_PULSE(*sample)) {
 		unsigned int duration = LIRC_VALUE(*sample);
 
-		if (duration & 0x10000)
+		if (duration & 0x1000)
 			bpf_rc_keydown(sample, 0x40, duration & 0xffff, 0);
-		if (duration & 0x20000)
+		if (duration & 0x2000)
 			bpf_rc_pointer_rel(sample, (duration >> 8) & 0xff,
 					   duration & 0xff);
 	}
diff --git a/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c
index d6cb986e7533..4a934fccf8f5 100644
--- a/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c
+++ b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c
@@ -1,11 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <stddef.h>
+#include "vmlinux.h"
 #include <string.h>
-#include <linux/bpf.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
+#include <bpf/bpf_tracing.h>
 
 struct grehdr {
 	__be16 flags;
@@ -64,13 +62,13 @@ int bpf_lwt_encap_gre6(struct __sk_buff *skb)
 	hdr.ip6hdr.nexthdr = 47;  /* IPPROTO_GRE */
 	hdr.ip6hdr.hop_limit = 0x40;
 	/* fb01::1 */
-	hdr.ip6hdr.saddr.s6_addr[0] = 0xfb;
-	hdr.ip6hdr.saddr.s6_addr[1] = 1;
-	hdr.ip6hdr.saddr.s6_addr[15] = 1;
+	hdr.ip6hdr.saddr.in6_u.u6_addr8[0] = 0xfb;
+	hdr.ip6hdr.saddr.in6_u.u6_addr8[1] = 1;
+	hdr.ip6hdr.saddr.in6_u.u6_addr8[15] = 1;
 	/* fb10::1 */
-	hdr.ip6hdr.daddr.s6_addr[0] = 0xfb;
-	hdr.ip6hdr.daddr.s6_addr[1] = 0x10;
-	hdr.ip6hdr.daddr.s6_addr[15] = 1;
+	hdr.ip6hdr.daddr.in6_u.u6_addr8[0] = 0xfb;
+	hdr.ip6hdr.daddr.in6_u.u6_addr8[1] = 0x10;
+	hdr.ip6hdr.daddr.in6_u.u6_addr8[15] = 1;
 
 	hdr.greh.protocol = skb->protocol;
 
@@ -82,4 +80,141 @@ int bpf_lwt_encap_gre6(struct __sk_buff *skb)
 	return BPF_LWT_REROUTE;
 }
 
+#define VXLAN_PORT  4789	/* IANA-assigned VXLAN UDP port (RFC 7348) */
+#define VXLAN_FLAGS 0x08000000	/* first header word with only the I (valid VNI) flag set */
+#define VXLAN_VNI   1		/* written into vx_vni shifted left by 8 */
+
+#define ETH_ALEN	6		/* Octets in one ethernet addr	 */
+#define ETH_P_IP	0x0800		/* Internet Protocol packet	*/
+#define ETH_P_IPV6	0x86DD		/* IPv6 over bluebook		*/
+
+static const __u8 bcast[ETH_ALEN] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+};
+
+static const __u8 srcmac[ETH_ALEN] = {
+	0x02, 0x00, 0x00, 0x00, 0x00, 0x01,
+};
+
+SEC("encap_vxlan")
+int bpf_lwt_encap_vxlan(struct __sk_buff *skb)
+{
+	struct encap_hdr {
+		struct iphdr    iph;
+		struct udphdr   udph;
+		struct vxlanhdr vxh;
+		struct ethhdr   eth;
+	} __attribute__((__packed__)) hdr;
+	int err;
+
+	memset(&hdr, 0, sizeof(hdr));
+	/* Outer IPv4 header; tot_len covers the original packet plus all of hdr. */
+	hdr.iph.ihl      = 5;
+	hdr.iph.version  = 4;
+	hdr.iph.ttl      = 0x40;
+	hdr.iph.protocol = 17; /* IPPROTO_UDP */
+	hdr.iph.tot_len  = bpf_htons(skb->len + sizeof(hdr));
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	hdr.iph.saddr = 0x640510ac;  /* 172.16.5.100  */
+	hdr.iph.daddr = 0x641110ac;  /* 172.16.17.100 */
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	hdr.iph.saddr = 0xac100564;  /* 172.16.5.100 */
+	hdr.iph.daddr = 0xac101164;  /* 172.16.17.100 */
+#else
+#error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+	/* Outer UDP header: same port both ways; len leaves out the IPv4 header. */
+	hdr.udph.source = bpf_htons(VXLAN_PORT);
+	hdr.udph.dest   = bpf_htons(VXLAN_PORT);
+	hdr.udph.len    = bpf_htons(skb->len + sizeof(hdr.udph) + sizeof(hdr.vxh) +
+				    sizeof(hdr.eth));
+	/* VXLAN header: VXLAN_VNI is shifted clear of the low 8 bits of vx_vni. */
+	hdr.vxh.vx_flags = bpf_htonl(VXLAN_FLAGS);
+	hdr.vxh.vx_vni   = bpf_htonl(VXLAN_VNI << 8);
+	/* Inner Ethernet header: all-ones destination, fixed source, IPv4 ethertype. */
+	__builtin_memcpy(hdr.eth.h_dest, bcast, ETH_ALEN);
+	__builtin_memcpy(hdr.eth.h_source, srcmac, ETH_ALEN);
+	hdr.eth.h_proto = bpf_htons(ETH_P_IP);
+
+	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr));
+	if (err)
+		return BPF_DROP;
+
+	return BPF_LWT_REROUTE;
+}
+
+SEC("encap_vxlan6")
+int bpf_lwt_encap_vxlan6(struct __sk_buff *skb)
+{
+	struct encap_hdr {
+		struct ipv6hdr  ip6hdr;
+		struct udphdr   udph;
+		struct vxlanhdr vxh;
+		struct ethhdr   eth;
+	} __attribute__((__packed__)) hdr;
+	int err;
+
+	memset(&hdr, 0, sizeof(hdr));
+	/* Outer IPv6 header; payload_len counts everything after it. */
+	hdr.ip6hdr.version     = 6;
+	hdr.ip6hdr.nexthdr     = 17; /* IPPROTO_UDP */
+	hdr.ip6hdr.hop_limit   = 0x40;
+	hdr.ip6hdr.payload_len = bpf_htons(skb->len + sizeof(hdr.udph) + sizeof(hdr.vxh) +
+					   sizeof(hdr.eth));
+	/* fb05::1 */
+	hdr.ip6hdr.saddr.in6_u.u6_addr8[0]  = 0xfb;
+	hdr.ip6hdr.saddr.in6_u.u6_addr8[1]  = 0x05;
+	hdr.ip6hdr.saddr.in6_u.u6_addr8[15] = 1;
+	/* fb11::1 */
+	hdr.ip6hdr.daddr.in6_u.u6_addr8[0]  = 0xfb;
+	hdr.ip6hdr.daddr.in6_u.u6_addr8[1]  = 0x11;
+	hdr.ip6hdr.daddr.in6_u.u6_addr8[15] = 1;
+	/* Outer UDP header: same port both ways; len matches the IPv6 payload_len. */
+	hdr.udph.source = bpf_htons(VXLAN_PORT);
+	hdr.udph.dest   = bpf_htons(VXLAN_PORT);
+	hdr.udph.len    = bpf_htons(skb->len + sizeof(hdr.udph) + sizeof(hdr.vxh) +
+				    sizeof(hdr.eth));
+	/* VXLAN header: VXLAN_VNI is shifted clear of the low 8 bits of vx_vni. */
+	hdr.vxh.vx_flags = bpf_htonl(VXLAN_FLAGS);
+	hdr.vxh.vx_vni   = bpf_htonl(VXLAN_VNI << 8);
+	/* Inner Ethernet header: all-ones destination, fixed source, IPv6 ethertype. */
+	__builtin_memcpy(hdr.eth.h_dest, bcast, ETH_ALEN);
+	__builtin_memcpy(hdr.eth.h_source, srcmac, ETH_ALEN);
+	hdr.eth.h_proto = bpf_htons(ETH_P_IPV6);
+
+	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr));
+	if (err)
+		return BPF_DROP;
+
+	return BPF_LWT_REROUTE;
+}
+
+volatile const int tgt_ip_version;
+
+__u16 transport_hdr = 0;
+__u16 network_hdr = 0;
+bool fexit_triggered = false;
+
+SEC("?fexit/bpf_lwt_push_ip_encap")
+int BPF_PROG(fexit_lwt_push_ip_encap, struct sk_buff *skb, void *hdr, u32 len, bool ingress,
+	     int retval)
+{
+	struct iphdr *iph;
+
+	if (retval || fexit_triggered)
+		return 0; /* traced call failed, or a result was already recorded */
+
+	iph = (typeof(iph)) (skb->head + skb->network_header); /* header at network_header */
+	if (iph->version != tgt_ip_version)
+		return 0; /* not the IP version selected through tgt_ip_version */
+
+	if ((iph->version == 4 && iph->protocol == 17 /* IPPROTO_UDP */) ||
+	    (iph->version == 6 && ((struct ipv6hdr *)iph)->nexthdr == 17 /* IPPROTO_UDP */)) {
+		fexit_triggered = true; /* record the header offsets once only */
+		transport_hdr   = skb->transport_header;
+		network_hdr     = skb->network_header;
+	}
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c b/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c
index 21bb7da90ea5..0efafa927a3d 100644
--- a/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c
+++ b/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c
@@ -35,7 +35,7 @@ SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int test_ringbuf_mem_map_key(void *ctx)
 {
 	int cur_pid = bpf_get_current_pid_tgid() >> 32;
-	struct sample *sample, sample_copy;
+	struct sample *sample;
 	int *lookup_val;
 
 	if (cur_pid != pid)
@@ -55,16 +55,11 @@ int test_ringbuf_mem_map_key(void *ctx)
 	lookup_val = (int *)bpf_map_lookup_elem(&hash_map, sample);
 	__sink(lookup_val);
 
-	/* workaround - memcpy is necessary so that verifier doesn't
-	 * complain with:
-	 *   verifier internal error: more than one arg with ref_obj_id R3
-	 * when trying to do bpf_map_update_elem(&hash_map, sample, &sample->seq, BPF_ANY);
-	 *
+	/*
 	 * Since bpf_map_lookup_elem above uses 'sample' as key, test using
 	 * sample field as value below
 	 */
-	__builtin_memcpy(&sample_copy, sample, sizeof(struct sample));
-	bpf_map_update_elem(&hash_map, &sample_copy, &sample->seq, BPF_ANY);
+	bpf_map_update_elem(&hash_map, sample, &sample->seq, BPF_ANY);
 
 	bpf_ringbuf_submit(sample, 0);
 	return 0;
diff --git a/tools/testing/selftests/bpf/progs/test_signed_loader.c b/tools/testing/selftests/bpf/progs/test_signed_loader.c
new file mode 100644
index 000000000000..d9a4b85f9391
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_signed_loader.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+/*
+ * Minimal, map-less program. Driven through libbpf's gen_loader (gen_hash)
+ * by prog_tests/signed_loader.c so the generated light-skeleton loader (with
+ * the emit_signature_match metadata check) can be exercised against good
+ * and tampered metadata. A socket filter needs no load-time attach resolution,
+ * and having no maps keeps the generated loader's ctx trivial (0 maps, 1 prog).
+ */
+SEC("socket")
+int probe(void *ctx)
+{
+	return 0; /* ctx is never read; the program has no other behaviour */
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_signed_loader_data.c b/tools/testing/selftests/bpf/progs/test_signed_loader_data.c
new file mode 100644
index 000000000000..43e2074d0042
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_signed_loader_data.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+/*
+ * A single initialized global, so the generated loader has one internal
+ * (.data) map that it seeds with an initial value while loading.
+ * prog_tests/signed_loader.c uses this to check that a signed loader
+ * keeps the attested contents and ignores a ctx-supplied initial_value:
+ * the host cannot re-seed a signed program's maps through the loader ctx.
+ */
+__u64 magic = 0x5eed1234abad1deaULL;
+
+SEC("socket")
+int probe(void *ctx)
+{
+	return (int)magic; /* reads the global and truncates it to int */
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_signed_loader_lsm.c b/tools/testing/selftests/bpf/progs/test_signed_loader_lsm.c
new file mode 100644
index 000000000000..575a9b7910c8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_signed_loader_lsm.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__u32 monitored_tid;
+
+int sig_keyring_serial;
+int sig_keyring_type;
+int sig_verdict;
+int seen;
+
+SEC("lsm/bpf_prog_load")
+int BPF_PROG(inspect_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
+	     struct bpf_token *token, bool kernel)
+{
+	__u32 tid = bpf_get_current_pid_tgid() & 0xffffffff; /* low 32 bits: thread id */
+
+	if (!monitored_tid || tid != monitored_tid)
+		return 0; /* monitored_tid unset, or the load comes from another thread */
+
+	seen++; /* count the matching load, then copy prog->aux->sig into the globals */
+	sig_keyring_serial = prog->aux->sig.keyring_serial;
+	sig_keyring_type = prog->aux->sig.keyring_type;
+	sig_verdict = prog->aux->sig.verdict;
+	return 0; /* always returns 0 */
+}
diff --git a/tools/testing/selftests/bpf/progs/test_signed_loader_map.c b/tools/testing/selftests/bpf/progs/test_signed_loader_map.c
new file mode 100644
index 000000000000..4478ce6f1fd9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_signed_loader_map.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+/*
+ * One explicit array map and no global variables, so the generated loader
+ * has exactly one map to create (no .rodata/.bss). prog_tests/signed_loader.c
+ * uses this to check that a signed loader ignores ctx-supplied max_entries:
+ * the map must keep its attested size (4), not whatever the host puts in
+ * the loader ctx.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 4); /* the size the map is declared with */
+	__type(key, __u32);
+	__type(value, __u64);
+} amap SEC(".maps");
+
+SEC("socket")
+int probe(void *ctx)
+{
+	__u32 idx = 0;
+	__u64 *slot = bpf_map_lookup_elem(&amap, &idx);
+
+	return slot ? (int)*slot : 0; /* element 0 truncated to int, or 0 if the lookup fails */
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c
new file mode 100644
index 000000000000..254f7fd895d9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <asm/unistd.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+int target_pid;
+int prog_triggered;
+long err;
+char copied_byte;
+
+static int copy_getcwd_arg(char *ubuf)
+{
+	err = bpf_copy_from_user(&copied_byte, sizeof(copied_byte), ubuf); /* one byte only */
+	if (err)
+		return err; /* failure stays visible in the global err */
+
+	prog_triggered = 1; /* set only after a successful copy */
+	return 0;
+}
+
+SEC("tp_btf.s/sys_enter")
+int BPF_PROG(handle_sys_enter_tp_btf, struct pt_regs *regs, long id)
+{
+	if ((bpf_get_current_pid_tgid() >> 32) != target_pid ||
+	    id != __NR_getcwd)
+		return 0; /* another process, or a syscall other than getcwd */
+
+	return copy_getcwd_arg((void *)PT_REGS_PARM1_SYSCALL(regs)); /* first syscall argument */
+}
+
+SEC("raw_tp.s/sys_enter")
+int BPF_PROG(handle_sys_enter_raw_tp, struct pt_regs *regs, long id)
+{
+	if ((bpf_get_current_pid_tgid() >> 32) != target_pid ||
+	    id != __NR_getcwd)
+		return 0; /* another process, or a syscall other than getcwd */
+
+	return copy_getcwd_arg((void *)PT_REGS_PARM1_CORE_SYSCALL(regs)); /* first syscall argument */
+}
+
+SEC("tp.s/syscalls/sys_enter_getcwd")
+int handle_sys_enter_tp(struct syscall_trace_enter *args)
+{
+	if ((bpf_get_current_pid_tgid() >> 32) != target_pid)
+		return 0; /* another process */
+
+	return copy_getcwd_arg((void *)args->args[0]); /* args[0]: first syscall argument */
+}
+
+SEC("tp.s/syscalls/sys_exit_getcwd")
+int handle_sys_exit_tp(struct syscall_trace_exit *args)
+{
+	struct pt_regs *regs;
+
+	if ((bpf_get_current_pid_tgid() >> 32) != target_pid)
+		return 0; /* another process */
+
+	regs = (struct pt_regs *)bpf_task_pt_regs(bpf_get_current_task_btf()); /* current task's pt_regs */
+	return copy_getcwd_arg((void *)PT_REGS_PARM1_CORE_SYSCALL(regs)); /* args is not consulted */
+}
+
+SEC("raw_tp.s")
+int BPF_PROG(handle_raw_tp_bare, struct pt_regs *regs, long id)
+{
+	return 0; /* empty body; the section name carries no tracepoint target */
+}
+
+SEC("tp.s")
+int handle_tp_bare(void *ctx)
+{
+	return 0; /* empty body; the section name carries no tracepoint target */
+}
+
+SEC("tracepoint.s/syscalls/sys_enter_getcwd")
+int handle_sys_enter_tp_alias(struct syscall_trace_enter *args)
+{
+	return 0; /* empty body; uses the long "tracepoint.s/" section prefix */
+}
+
+SEC("raw_tracepoint.s/sys_enter")
+int BPF_PROG(handle_sys_enter_raw_tp_alias, struct pt_regs *regs, long id)
+{
+	return 0; /* empty body; uses the long "raw_tracepoint.s/" section prefix */
+}
+
+SEC("raw_tp.s/sys_enter")
+int BPF_PROG(handle_test_run, struct pt_regs *regs, long id)
+{
+	if ((__u64)regs == 0x1234ULL && (__u64)id == 0x5678ULL)
+		return (__u64)regs + (__u64)id; /* i.e. 0x1234 + 0x5678 for the matching arguments */
+
+	return 0; /* any other argument pair */
+}
+
+SEC("raw_tp.s/sched_switch")
+int BPF_PROG(handle_raw_tp_non_faultable, bool preempt,
+	     struct task_struct *prev, struct task_struct *next)
+{
+	return 0; /* empty body; targets sched_switch rather than sys_enter */
+}
+
+SEC("tp.s/sched/sched_switch")
+int handle_tp_non_syscall(void *ctx)
+{
+	return 0; /* empty body; targets sched/sched_switch, not a syscalls/ tracepoint */
+}
diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c
new file mode 100644
index 000000000000..1a0748a9520b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* A sleepable tp_btf program on the non-faultable sched_switch tracepoint must fail to load. */
+SEC("tp_btf.s/sched_switch")
+__failure __msg("Sleepable program cannot attach to non-faultable tracepoint")
+int BPF_PROG(handle_sched_switch, bool preempt,
+	     struct task_struct *prev, struct task_struct *next)
+{
+	return 0; /* body is irrelevant: the test expects the load itself to be refused */
+}
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c b/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c
new file mode 100644
index 000000000000..301e65b95256
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_SOCKMAP);
+	__uint(max_entries, 1); /* a single slot is declared */
+	__type(key, int);
+	__type(value, int);
+} sock_map SEC(".maps");
+
+#define POP_START 0x48a3
+#define POP_LEN   0xfffffffd
+
+long pop_data_ret = 1;
+
+SEC("sk_msg")
+int prog_msg_pop_data(struct sk_msg_md *msg)
+{
+	/* Messages no larger than POP_START are passed through untouched. */
+	if (msg->size > POP_START)
+		pop_data_ret = bpf_msg_pop_data(msg, POP_START, POP_LEN, 0);
+
+	return SK_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index 32127f1cd687..30f1de458669 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -6,6 +6,7 @@
  * modify it under the terms of version 2 of the GNU General Public
  * License as published by the Free Software Foundation.
  */
+#define BPF_NO_KFUNC_PROTOTYPES
 #include "vmlinux.h"
 #include <bpf/bpf_core_read.h>
 #include <bpf/bpf_helpers.h>
@@ -36,12 +37,10 @@ enum bpf_fou_encap_type___local {
 	FOU_BPF_ENCAP_GUE___local,
 };
 
-struct bpf_fou_encap;
-
 int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
-			  struct bpf_fou_encap *encap, int type) __ksym;
+			  struct bpf_fou_encap___local *encap, int type) __ksym;
 int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
-			  struct bpf_fou_encap *encap) __ksym;
+			  struct bpf_fou_encap___local *encap) __ksym;
 struct xfrm_state *
 bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts,
 		       u32 opts__sz) __ksym;
@@ -781,7 +780,7 @@ int ipip_gue_set_tunnel(struct __sk_buff *skb)
 	encap.sport = 0;
 	encap.dport = bpf_htons(5555);
 
-	ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap,
+	ret = bpf_skb_set_fou_encap(skb, &encap,
 				    bpf_core_enum_value(enum bpf_fou_encap_type___local,
 							FOU_BPF_ENCAP_GUE___local));
 	if (ret < 0) {
@@ -820,7 +819,7 @@ int ipip_fou_set_tunnel(struct __sk_buff *skb)
 	encap.sport = 0;
 	encap.dport = bpf_htons(5555);
 
-	ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap,
+	ret = bpf_skb_set_fou_encap(skb, &encap,
 				    FOU_BPF_ENCAP_FOU___local);
 	if (ret < 0) {
 		log_err(ret);
@@ -843,7 +842,7 @@ int ipip_encap_get_tunnel(struct __sk_buff *skb)
 		return TC_ACT_SHOT;
 	}
 
-	ret = bpf_skb_get_fou_encap(skb, (struct bpf_fou_encap *)&encap);
+	ret = bpf_skb_get_fou_encap(skb, &encap);
 	if (ret < 0) {
 		log_err(ret);
 		return TC_ACT_SHOT;
diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c b/tools/testing/selftests/bpf/progs/test_vmlinux.c
index 78b23934d9f8..eea556940df6 100644
--- a/tools/testing/selftests/bpf/progs/test_vmlinux.c
+++ b/tools/testing/selftests/bpf/progs/test_vmlinux.c
@@ -69,7 +69,7 @@ int BPF_PROG(handle__tp_btf, struct pt_regs *regs, long id)
 	return 0;
 }
 
-SEC("kprobe/hrtimer_start_range_ns")
+SEC("kprobe")
 int BPF_KPROBE(handle__kprobe, struct hrtimer *timer, ktime_t tim, u64 delta_ns,
 	       const enum hrtimer_mode mode)
 {
@@ -78,7 +78,7 @@ int BPF_KPROBE(handle__kprobe, struct hrtimer *timer, ktime_t tim, u64 delta_ns,
 	return 0;
 }
 
-SEC("fentry/hrtimer_start_range_ns")
+SEC("fentry")
 int BPF_PROG(handle__fentry, struct hrtimer *timer, ktime_t tim, u64 delta_ns,
 	     const enum hrtimer_mode mode)
 {
diff --git a/tools/testing/selftests/bpf/progs/test_wakeup_source.c b/tools/testing/selftests/bpf/progs/test_wakeup_source.c
new file mode 100644
index 000000000000..fd2fb6aebd82
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_wakeup_source.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2026 Google LLC */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_experimental.h"
+#include "bpf_misc.h"
+#include "wakeup_source.h"
+
+#define MAX_LOOP_ITER 1000
+#define RB_SIZE (16384 * 4)
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, RB_SIZE); /* RB_SIZE is 16384 * 4 = 65536 */
+} rb SEC(".maps");
+
+struct bpf_ws_lock;
+struct bpf_ws_lock *bpf_wakeup_sources_read_lock(void) __ksym;
+void bpf_wakeup_sources_read_unlock(struct bpf_ws_lock *lock) __ksym;
+void *bpf_wakeup_sources_get_head(void) __ksym;
+
+SEC("syscall")
+__success __retval(0) /* every path below returns 0, including the no-lock path */
+int iterate_wakeupsources(void *ctx)
+{
+	struct list_head *head = bpf_wakeup_sources_get_head();
+	struct list_head *pos = head;
+	struct bpf_ws_lock *lock;
+	int i;
+
+	lock = bpf_wakeup_sources_read_lock();
+	if (!lock)
+		return 0; /* no lock obtained: emit nothing */
+
+	bpf_for(i, 0, MAX_LOOP_ITER) { /* visit at most MAX_LOOP_ITER list nodes */
+		if (bpf_core_read(&pos, sizeof(pos), &pos->next) || !pos || pos == head)
+			break; /* read failure, NULL link, or back at head */
+
+		struct wakeup_event_t *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+
+		if (!e)
+			break; /* reservation failed: stop producing records */
+
+		struct wakeup_source *ws = bpf_core_cast(
+				(void *)pos - bpf_core_field_offset(struct wakeup_source, entry), /* container_of-style step back */
+				struct wakeup_source);
+		s64 active_time = 0;
+		bool active = BPF_CORE_READ_BITFIELD(ws, active);
+		bool autosleep_enable = BPF_CORE_READ_BITFIELD(ws, autosleep_enabled);
+		s64 last_time = ws->last_time;
+		s64 max_time = ws->max_time;
+		s64 prevent_sleep_time = ws->prevent_sleep_time;
+		s64 total_time = ws->total_time;
+
+		if (active) { /* fold the still-running interval into the reported totals */
+			s64 curr_time = bpf_ktime_get_ns();
+			s64 prevent_time = ws->start_prevent_time;
+
+			if (curr_time > last_time)
+				active_time = curr_time - last_time;
+
+			total_time += active_time;
+			if (active_time > max_time)
+				max_time = active_time;
+			if (autosleep_enable && curr_time > prevent_time)
+				prevent_sleep_time += curr_time - prevent_time;
+		}
+
+		e->active_count = ws->active_count;
+		e->active_time_ns = active_time;
+		e->event_count = ws->event_count;
+		e->expire_count = ws->expire_count;
+		e->last_time_ns = last_time;
+		e->max_time_ns = max_time;
+		e->prevent_sleep_time_ns = prevent_sleep_time;
+		e->total_time_ns = total_time;
+		e->wakeup_count = ws->wakeup_count;
+
+		if (bpf_probe_read_kernel_str(
+				e->name, WAKEUP_NAME_LEN, ws->name) < 0)
+			e->name[0] = '\0'; /* name could not be read: report an empty string */
+
+		bpf_ringbuf_submit(e, 0); /* every reserved record reaches this submit */
+	}
+
+	bpf_wakeup_sources_read_unlock(lock);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_attach.c
new file mode 100644
index 000000000000..332d0a423a43
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_attach.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return);
+
+__u64 test_result_fentry = 0;
+__u64 test_result_fexit = 0;
+
+SEC("fentry.multi/bpf_fentry_test*")
+int BPF_PROG(test_fentry)
+{
+	tracing_multi_arg_check(ctx, &test_result_fentry, false); /* entry side; result of the call is ignored */
+	return 0;
+}
+
+SEC("fexit.multi/bpf_fentry_test*")
+int BPF_PROG(test_fexit)
+{
+	tracing_multi_arg_check(ctx, &test_result_fexit, true); /* exit side: is_return is set */
+	return 0;
+}
+
+SEC("fentry.multi.s/bpf_fentry_test1")
+int BPF_PROG(test_fentry_s)
+{
+	tracing_multi_arg_check(ctx, &test_result_fentry, false); /* adds to the same counter as test_fentry */
+	return 0;
+}
+
+SEC("fexit.multi.s/bpf_fentry_test1")
+int BPF_PROG(test_fexit_s)
+{
+	tracing_multi_arg_check(ctx, &test_result_fexit, true); /* adds to the same counter as test_fexit */
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c b/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c
new file mode 100644
index 000000000000..b3374f2db450
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return);
+
+__u64 test_result_fentry = 0;
+__u64 test_result_fexit = 0;
+
+SEC("fentry.multi/bpf_testmod:bpf_testmod_fentry_test*")
+int BPF_PROG(test_fentry)
+{
+	tracing_multi_arg_check(ctx, &test_result_fentry, false); /* entry side; result of the call is ignored */
+	return 0;
+}
+
+SEC("fexit.multi/bpf_testmod:bpf_testmod_fentry_test*")
+int BPF_PROG(test_fexit)
+{
+	tracing_multi_arg_check(ctx, &test_result_fexit, true); /* exit side: is_return is set */
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_bench.c b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c
new file mode 100644
index 000000000000..beae946cb8c4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("fentry.multi")
+int BPF_PROG(bench)
+{
+	return 0; /* empty body; the SEC() name above lists no attach targets */
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_check.c b/tools/testing/selftests/bpf/progs/tracing_multi_check.c
new file mode 100644
index 000000000000..b2959ba71179
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_check.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int pid = 0;
+bool test_cookies = false;
+
+/* bpf_fentry_test1 is exported as kfunc via vmlinux.h */
+extern const void bpf_fentry_test2 __ksym;
+extern const void bpf_fentry_test3 __ksym;
+extern const void bpf_fentry_test4 __ksym;
+extern const void bpf_fentry_test5 __ksym;
+extern const void bpf_fentry_test6 __ksym;
+extern const void bpf_fentry_test7 __ksym;
+extern const void bpf_fentry_test8 __ksym;
+extern const void bpf_fentry_test9 __ksym;
+extern const void bpf_fentry_test10 __ksym;
+
+extern const void bpf_testmod_fentry_test1 __ksym;
+extern const void bpf_testmod_fentry_test2 __ksym;
+extern const void bpf_testmod_fentry_test3 __ksym;
+extern const void bpf_testmod_fentry_test7 __ksym;
+extern const void bpf_testmod_fentry_test11 __ksym;
+
+int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return)
+{
+	void *ip = (void *) bpf_get_func_ip(ctx); /* compared below to pick the per-function checks */
+	__u64 value = 0, ret = 0, cookie = 0;
+	long err = 0;
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 1; /* the only non-zero return: caller's tgid is not the one in pid */
+
+	if (is_return)
+		err |= bpf_get_func_ret(ctx, &ret);
+	if (test_cookies)
+		cookie = bpf_get_attach_cookie(ctx);
+
+	if (ip == &bpf_fentry_test1) {
+		int a;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (int) value;
+
+		err |= is_return ? ret != 2 : 0;
+		err |= test_cookies ? cookie != 8 : 0;
+
+		*test_result += err == 0 && a == 1; /* +1 only when every check for this call passed */
+	} else if (ip == &bpf_fentry_test2) {
+		__u64 b;
+		int a;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (int) value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = value;
+
+		err |= is_return ? ret != 5 : 0;
+		err |= test_cookies ? cookie != 9 : 0;
+
+		*test_result += err == 0 && a == 2 && b == 3;
+	} else if (ip == &bpf_fentry_test3) {
+		__u64 c;
+		char a;
+		int b;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (char) value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (int) value;
+		err |= bpf_get_func_arg(ctx, 2, &value);
+		c = value;
+
+		err |= is_return ? ret != 15 : 0;
+		err |= test_cookies ? cookie != 7 : 0;
+
+		*test_result += err == 0 && a == 4 && b == 5 && c == 6;
+	} else if (ip == &bpf_fentry_test4) {
+		void *a;
+		char b;
+		int c;
+		__u64 d;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (void *) value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (char) value;
+		err |= bpf_get_func_arg(ctx, 2, &value);
+		c = (int) value;
+		err |= bpf_get_func_arg(ctx, 3, &value);
+		d = value;
+
+		err |= is_return ? ret != 34 : 0;
+		err |= test_cookies ? cookie != 5 : 0;
+
+		*test_result += err == 0 && a == (void *) 7 && b == 8 && c == 9 && d == 10;
+	} else if (ip == &bpf_fentry_test5) {
+		__u64 a;
+		void *b;
+		short c;
+		int d;
+		__u64 e;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (void *) value;
+		err |= bpf_get_func_arg(ctx, 2, &value);
+		c = (short) value;
+		err |= bpf_get_func_arg(ctx, 3, &value);
+		d = (int) value;
+		err |= bpf_get_func_arg(ctx, 4, &value);
+		e = value;
+
+		err |= is_return ? ret != 65 : 0;
+		err |= test_cookies ? cookie != 4 : 0;
+
+		*test_result += err == 0 && a == 11 && b == (void *) 12 && c == 13 && d == 14 && e == 15;
+	} else if (ip == &bpf_fentry_test6) {
+		__u64 a;
+		void *b;
+		short c;
+		int d;
+		void *e;
+		__u64 f;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (void *) value;
+		err |= bpf_get_func_arg(ctx, 2, &value);
+		c = (short) value;
+		err |= bpf_get_func_arg(ctx, 3, &value);
+		d = (int) value;
+		err |= bpf_get_func_arg(ctx, 4, &value);
+		e = (void *) value;
+		err |= bpf_get_func_arg(ctx, 5, &value);
+		f = value;
+
+		err |= is_return ? ret != 111 : 0;
+		err |= test_cookies ? cookie != 2 : 0;
+
+		*test_result += err == 0 && a == 16 && b == (void *) 17 && c == 18 && d == 19 && e == (void *) 20 && f == 21;
+	} else if (ip == &bpf_fentry_test7) { /* test7..test10: only ret and cookie are checked */
+		err |= is_return ? ret != 0 : 0;
+		err |= test_cookies ? cookie != 3 : 0;
+
+		*test_result += err == 0 ? 1 : 0;
+	} else if (ip == &bpf_fentry_test8) {
+		err |= is_return ? ret != 0 : 0;
+		err |= test_cookies ? cookie != 1 : 0;
+
+		*test_result += err == 0 ? 1 : 0;
+	} else if (ip == &bpf_fentry_test9) {
+		err |= is_return ? ret != 0 : 0;
+		err |= test_cookies ? cookie != 10 : 0;
+
+		*test_result += err == 0 ? 1 : 0;
+	} else if (ip == &bpf_fentry_test10) {
+		err |= is_return ? ret != 0 : 0;
+		err |= test_cookies ? cookie != 6 : 0;
+
+		*test_result += err == 0 ? 1 : 0;
+	} else if (ip == &bpf_testmod_fentry_test1) { /* bpf_testmod branches never compare cookie */
+		int a;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (int) value;
+
+		err |= is_return ? ret != 2 : 0;
+
+		*test_result += err == 0 && a == 1;
+	} else if (ip == &bpf_testmod_fentry_test2) {
+		int a;
+		__u64 b;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (int) value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (__u64) value;
+
+		err |= is_return ? ret != 5 : 0;
+
+		*test_result += err == 0 && a == 2 && b == 3;
+	} else if (ip == &bpf_testmod_fentry_test3) {
+		char a;
+		int b;
+		__u64 c;
+
+		err |= bpf_get_func_arg(ctx, 0, &value);
+		a = (char) value;
+		err |= bpf_get_func_arg(ctx, 1, &value);
+		b = (int) value;
+		err |= bpf_get_func_arg(ctx, 2, &value);
+		c = (__u64) value;
+
+		err |= is_return ? ret != 15 : 0;
+
+		*test_result += err == 0 && a == 4 && b == 5 && c == 6;
+	} else if (ip == &bpf_testmod_fentry_test7) {
+		err |= is_return ? ret != 133 : 0;
+
+		*test_result += err == 0;
+	} else if (ip == &bpf_testmod_fentry_test11) {
+		err |= is_return ? ret != 231 : 0;
+
+		*test_result += err == 0;
+	}
+
+	return 0; /* also reached, with *test_result untouched, when ip matched no branch */
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_fail.c b/tools/testing/selftests/bpf/progs/tracing_multi_fail.c
new file mode 100644
index 000000000000..7f0375f4213d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_fail.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("fentry.multi")
+int BPF_PROG(test_fentry)
+{
+	return 0; /* empty body; the SEC() name above lists no attach targets */
+}
+
+SEC("fentry.multi.s")
+int BPF_PROG(test_fentry_s)
+{
+	return 0; /* empty body; same as test_fentry but under the ".s" section suffix */
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c
new file mode 100644
index 000000000000..cd5be0bb6ffd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return);
+
+__u64 test_result_fentry_1 = 0;
+__u64 test_result_fentry_2 = 0;
+__u64 test_result_fexit_1 = 0;
+__u64 test_result_fexit_2 = 0;
+
+SEC("fentry.multi")
+int BPF_PROG(fentry_1)
+{
+	tracing_multi_arg_check(ctx, &test_result_fentry_1, false); /* own counter, separate from fentry_2 */
+	return 0;
+}
+
+SEC("fentry.multi")
+int BPF_PROG(fentry_2)
+{
+	tracing_multi_arg_check(ctx, &test_result_fentry_2, false); /* own counter, separate from fentry_1 */
+	return 0;
+}
+
+SEC("fexit.multi")
+int BPF_PROG(fexit_1)
+{
+	tracing_multi_arg_check(ctx, &test_result_fexit_1, true); /* own counter, separate from fexit_2 */
+	return 0;
+}
+
+SEC("fexit.multi")
+int BPF_PROG(fexit_2)
+{
+	tracing_multi_arg_check(ctx, &test_result_fexit_2, true); /* own counter, separate from fexit_1 */
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c b/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c
new file mode 100644
index 000000000000..a49d1d841f3a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int pid = 0;
+
+__u64 test_result_fentry = 0;
+__u64 test_result_fexit = 0;
+
+SEC("?fentry.multi")
+int BPF_PROG(test_fentry)
+{
+	/* Count only invocations whose tgid equals pid. */
+	if (bpf_get_current_pid_tgid() >> 32 == pid)
+		test_result_fentry += 1;
+
+	return 0;
+}
+
+SEC("?fexit.multi")
+int BPF_PROG(test_fexit)
+{
+	/* Count only invocations whose tgid equals pid. */
+	if (bpf_get_current_pid_tgid() >> 32 == pid)
+		test_result_fexit += 1;
+
+	return 0;
+}
+
+SEC("?fentry/bpf_fentry_test1")
+int BPF_PROG(extra)
+{
+	return 0; /* empty body; plain (non-multi) fentry on bpf_fentry_test1 */
+}
+
+SEC("?fentry/bpf_fentry_test10")
+int BPF_PROG(filler)
+{
+	return 0; /* empty body; plain (non-multi) fentry on bpf_fentry_test10 */
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c
new file mode 100644
index 000000000000..7c9a46016ccd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return);
+
+__u64 test_result_fentry = 0;
+__u64 test_result_fexit = 0;
+
+SEC("fsession.multi/bpf_fentry_test*")
+int BPF_PROG(test_session_1)
+{
+	volatile __u64 *cookie = bpf_session_cookie(ctx);
+
+	if (!bpf_session_is_return(ctx)) {
+		/* entry side: store the marker in the session cookie */
+		if (!tracing_multi_arg_check(ctx, &test_result_fentry, false))
+			*cookie = 0xbeafbeafbeafbeaf;
+		return 0;
+	}
+	/* return side: extra count for test_result_fexit if the cookie holds the marker */
+	if (!tracing_multi_arg_check(ctx, &test_result_fexit, true))
+		test_result_fexit += *cookie == 0xbeafbeafbeafbeaf;
+
+	return 0;
+}
+
+SEC("fsession.multi.s/bpf_fentry_test1")
+int BPF_PROG(test_fsession_s)
+{
+	volatile __u64 *cookie = bpf_session_cookie(ctx);
+
+	if (!bpf_session_is_return(ctx)) {
+		/* entry side: store the marker in the session cookie */
+		if (!tracing_multi_arg_check(ctx, &test_result_fentry, false))
+			*cookie = 0xbeafbeafbeafbeaf;
+		return 0;
+	}
+	/* return side: extra count for test_result_fexit if the cookie holds the marker */
+	if (!tracing_multi_arg_check(ctx, &test_result_fexit, true))
+		test_result_fexit += *cookie == 0xbeafbeafbeafbeaf;
+
+	return 0;
+}
+
+SEC("fsession.multi/bpf_testmod:bpf_testmod_fentry_test*")
+int BPF_PROG(test_session_2)
+{
+	volatile __u64 *cookie = bpf_session_cookie(ctx);
+
+	if (!bpf_session_is_return(ctx)) {
+		/* entry side: store the marker in the session cookie */
+		if (!tracing_multi_arg_check(ctx, &test_result_fentry, false))
+			*cookie = 0xbeafbeafbeafbeaf;
+		return 0;
+	}
+	/* return side: extra count for test_result_fexit if the cookie holds the marker */
+	if (!tracing_multi_arg_check(ctx, &test_result_fexit, true))
+		test_result_fexit += *cookie == 0xbeafbeafbeafbeaf;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c b/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c
new file mode 100644
index 000000000000..7b6ed41bf452
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("fentry.multi/bpf_fentry_test1")
+__failure
+__msg("func 'bpf_multi_func' doesn't have 1-th argument")
+int BPF_PROG(fentry_direct_access, int a)
+{
+	return a; /* direct use of a BPF_PROG argument: the access this test expects to be refused */
+}
+
+SEC("fexit.multi/bpf_fentry_test3")
+__failure
+__msg("invalid bpf_context access off=24 size=8")
+int BPF_PROG(fexit_direct_access, char a, int b, __u64 c, int ret)
+{
+	return ret; /* ret is the 4th BPF_PROG argument, matching off=24 in the expected message */
+}
+
+SEC("fsession.multi/bpf_fentry_test4")
+__failure
+__msg("invalid bpf_context access off=16 size=8")
+int BPF_PROG(fsession_direct_access, void *a, char b, int c, __u64 d, int ret)
+{
+	return c; /* c is the 3rd BPF_PROG argument, matching off=16 in the expected message */
+}
diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
index 54de0389f878..c0d0422b8030 100644
--- a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
+++ b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
@@ -146,7 +146,7 @@ try_discard_dynptr(struct bpf_dynptr *dynptr, void *context)
  * not be able to read past the end of the pointer.
  */
 SEC("?raw_tp")
-__failure __msg("cannot release unowned const bpf_dynptr")
+__failure __msg("CONST_PTR_TO_DYNPTR cannot be released")
 int user_ringbuf_callback_discard_dynptr(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, try_discard_dynptr, NULL, 0);
@@ -166,7 +166,7 @@ try_submit_dynptr(struct bpf_dynptr *dynptr, void *context)
  * not be able to read past the end of the pointer.
  */
 SEC("?raw_tp")
-__failure __msg("cannot release unowned const bpf_dynptr")
+__failure __msg("CONST_PTR_TO_DYNPTR cannot be released")
 int user_ringbuf_callback_submit_dynptr(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, try_submit_dynptr, NULL, 0);
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c
index 62e282f4448a..df0e22d1a29b 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena.c
@@ -8,7 +8,7 @@
 #include <bpf/bpf_tracing.h>
 #include "bpf_misc.h"
 #include "bpf_experimental.h"
-#include "bpf_arena_common.h"
+#include <bpf_arena_common.h>
 
 #define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8)))
 
@@ -607,4 +607,71 @@ int non_arena_ptr_add_to_arena_ptr(void *ctx)
 
 #endif
 
+static __noinline
+u32 __arena *check_arena_arg_nonglobal(u32 __arena *arg)
+{
+	volatile u32 val = *arg; /* load through the arena pointer argument */
+
+	*arg = val + 1; /* store the incremented value back through it */
+
+	return arg; /* the same arena pointer is handed back to the caller */
+}
+
+__weak
+u32 __arena *check_arena_arg_global(u32 __arena *arg)
+{
+	volatile u32 val = *arg; /* same body as check_arena_arg_nonglobal; only the linkage differs */
+
+	*arg = val + 1;
+
+	return arg;
+}
+
+__weak
+u32 volatile __arena *check_arena_arg_quals1(u32 volatile __arena *arg1, u32 __arena volatile *arg2)
+{
+	*arg1 = *arg1 + 1;
+	*arg2 = *arg1 + 1; /* note: derived from *arg1, unlike check_arena_arg_quals2 */
+
+	return arg2;
+}
+
+__weak /* return type spells the qualifiers "__arena volatile", the reverse of quals1 */
+u32 __arena volatile *check_arena_arg_quals2(u32 volatile __arena *arg1, u32 __arena volatile *arg2)
+{
+	*arg1 = *arg1 + 1;
+	*arg2 = *arg2 + 1; /* each pointer is incremented through itself */
+
+	return arg2;
+}
+
+SEC("syscall")
+__success __retval(0)
+int check_arena_arg_ret(void *ctx)
+{
+	u32 __arena *page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	u32 __arena *arg = page;
+	u32 __arena volatile *arg1;
+	u32 __arena volatile *ret1;
+	u32 volatile __arena *arg2;
+	u32 volatile __arena *ret2;
+
+	if (!arg)
+		return 1; /* allocation failed; a non-zero result does not match __retval(0) */
+
+	/* Make sure we use {arg, ret}{1, 2}. */
+
+	arg = check_arena_arg_nonglobal(page);
+	arg = check_arena_arg_global(arg);
+
+	arg1 = arg2 = page; /* both qualifier spellings alias the same page */
+	ret1 = check_arena_arg_quals1(arg1, arg2);
+	ret2 = check_arena_arg_quals2(arg1, arg2);
+
+	if (!(*ret1 || *ret2))
+		return -EINVAL;
+
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c
index 83182ddbfb95..45d364b0bc85 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c
@@ -6,7 +6,7 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include "bpf_experimental.h"
-#include "bpf_arena_common.h"
+#include <bpf_arena_common.h>
 #include "bpf_misc.h"
 
 #define ARENA_PAGES (1UL<< (32 - __builtin_ffs(__PAGE_SIZE) + 1))
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c
index e6bd7b61f9f1..b51594dbc005 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c
@@ -7,7 +7,7 @@
 #include <bpf/bpf_tracing.h>
 #include "bpf_misc.h"
 #include "bpf_experimental.h"
-#include "bpf_arena_common.h"
+#include <bpf_arena_common.h>
 
 #define ARENA_PAGES (32)
 
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
index 5f7e7afee169..6ab8730d4878 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
@@ -7,7 +7,7 @@
 #include <bpf/bpf_tracing.h>
 #include "bpf_misc.h"
 #include "bpf_experimental.h"
-#include "bpf_arena_common.h"
+#include <bpf_arena_common.h>
 
 #define ARENA_SIZE (1ull << 32)
 
diff --git a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c
index 8bcddadfc4da..dd97f2027505 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c
@@ -32,7 +32,7 @@ int BPF_PROG(no_destroy, struct bpf_iter_meta *meta, struct cgroup *cgrp)
 
 SEC("iter/cgroup")
 __description("uninitialized iter in ->next()")
-__failure __msg("expected an initialized iter_bits as arg #0")
+__failure __msg("expected an initialized iter_bits as R1")
 int BPF_PROG(next_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp)
 {
 	struct bpf_iter_bits it = {};
@@ -43,7 +43,7 @@ int BPF_PROG(next_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp)
 
 SEC("iter/cgroup")
 __description("uninitialized iter in ->destroy()")
-__failure __msg("expected an initialized iter_bits as arg #0")
+__failure __msg("expected an initialized iter_bits as R1")
 int BPF_PROG(destroy_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp)
 {
 	struct bpf_iter_bits it = {};
diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c
index c1ae013dee29..bc038ac2df98 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bounds.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c
@@ -1239,7 +1239,8 @@ l0_%=:	r0 = 0;						\
 SEC("tc")
 __description("multiply mixed sign bounds. test 1")
 __success __log_level(2)
-__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))")
+__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))")
+/* cnum can't represent both [0, 0xffff_feff] and [0x8000_0000, 0x7fff_feff], so it picks one */
 __naked void mult_mixed0_sign(void)
 {
 	asm volatile (
@@ -1648,7 +1649,8 @@ l0_%=:	r0 = 0;				\
 SEC("socket")
 __description("bounds deduction cross sign boundary, two overlaps")
 __failure
-__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)")
+__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127)")
+/* smin=-128 includes point 0xffffffffffffff80 */
 __msg("frame pointer is read only")
 __naked void bounds_deduct_two_overlaps(void)
 {
@@ -1890,25 +1892,25 @@ __naked void bounds_refinement_tnum_umax(void *ctx)
 /* This test covers the bounds deduction when the u64 range and the tnum
  * overlap only at umin. After instruction 3, the ranges look as follows:
  *
- * 0    umin=0xe00     umax=0xeff                              U64_MAX
+ * 0    umin=0xe1      umax=0xf0                               U64_MAX
  * |    [xxxxxxxxxxxxxx]                                       |
  * |----------------------------|------------------------------|
  * |    x               x                                      | tnum values
  *
- * The verifier can therefore deduce that the R0=0xe0=224.
+ * The verifier can therefore deduce that R0=0xe1=225.
  */
 SEC("socket")
 __description("bounds refinement with single-value tnum on umin")
-__msg("3: (15) if r0 == 0xf0 {{.*}} R0=224")
+__msg("3: (15) if r0 == 0xf1 {{.*}} R0=225")
 __success __log_level(2)
 __naked void bounds_refinement_tnum_umin(void *ctx)
 {
 	asm volatile("			\
 	call %[bpf_get_prandom_u32];	\
-	r0 |= 0xe0;			\
-	r0 &= 0xf0;			\
-	if r0 == 0xf0 goto +2;		\
-	if r0 == 0xe0 goto +1;		\
+	r0 |= 0xe1;			\
+	r0 &= 0xf1;			\
+	if r0 == 0xf1 goto +2;		\
+	if r0 == 0xe1 goto +1;		\
 	r10 = 0;			\
 	exit;				\
 "	:
@@ -2043,7 +2045,8 @@ __naked void signed_unsigned_intersection32_case2(void *ctx)
  */
 SEC("socket")
 __description("bounds refinement: 64bits ranges not overwritten by 32bits ranges")
-__msg("3: (65) if r0 s> 0x2 {{.*}} R0=scalar(smin=0x8000000000000002,smax=2,umin=smin32=umin32=2,umax=0xffffffff00000003,smax32=umax32=3")
+__msg("3: (65) if r0 s> 0x2 {{.*}} R0=scalar(smin=0x8000000000000002,smax=2,smin32=umin32=2,smax32=umax32=3,var_off{{.*}}))")
+/* Can't represent both [S64_MIN+2, 2] and [2, U64_MAX - U32_MAX + 2] at the same time, picks shorter interval */
 __msg("4: (25) if r0 > 0x13 {{.*}} R0=2")
 __success __log_level(2)
 __naked void refinement_32bounds_not_overwriting_64bounds(void *ctx)
@@ -2184,4 +2187,111 @@ __naked void tnums_equal_impossible_constant(void *ctx)
 	: __clobber_all);
 }
 
+/*
+ * 32-bit range starts before 64-bit range low bits in each 2^32 block.
+ *
+ * N*2^32                   (N+1)*2^32                (N+2)*2^32                (N+3)*2^32
+ * ||----|=====|--|----------||----|=====|-------------||--|-|=====|-------------||
+ *       |< b >|  |                |< b >|                 | |< b >|
+ *                |                |     |                 |
+ *                |<---------------+- a -+---------------->|
+ *                                 |     |
+ *                                 |< t >| refined r0 range
+ *
+ * a = u64 [0x1'00000008, 0x3'00000001]
+ * b = u32 [2, 5]
+ * t = u64 [0x2'00000002, 0x2'00000005]
+ */
+SEC("socket")
+__success
+__flag(BPF_F_TEST_REG_INVARIANTS)
+__naked void deduce64_from_32_before_block_start(void)
+{
+	asm volatile ("							\
+	call %[bpf_get_prandom_u32];					\
+	r1 = 0x100000008 ll;						\
+	if r0 < r1 goto 2f;						\
+	r1 = 0x300000001 ll;						\
+	if r0 > r1 goto 2f;	/* u64: [0x1'00000008, 0x3'00000001] */	\
+	if w0 < 2 goto 2f;						\
+	if w0 > 5 goto 2f;	/* u32: [2, 5] */			\
+	r2 = 0x200000002 ll;						\
+	r3 = 0x200000005 ll;						\
+	if r0 >= r2 goto 1f;	/* should be always true */		\
+	r10 = 0;		/* dead code */				\
+1:	if r0 <= r3 goto 2f;	/* should be always true */		\
+	r10 = 0;		/* dead code */				\
+2:	exit;								\
+	"
+	:: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/*
+ * 32-bit range crossing U32_MAX / 0 boundary.
+ *
+ * N*2^32                   (N+1)*2^32                (N+2)*2^32                (N+3)*2^32
+ * ||===|---------|------|===||===|----------------|===||===|---------|------|===||
+ *  |b >|         |      |< b||b >|                |< b||b >|         |      |< b|
+ *                |      |                                  |         |
+ *                |<-----+----------------- a --------------+-------->|
+ *                       |                                  |
+ *                       |<---------------- t ------------->| refined r0 range
+ *
+ * a = u64 [0x1'00000006, 0x2'FFFFFFEF]
+ * b = s32 [-16, 5] (u32 wrapping [0xFFFFFFF0, 0x00000005])
+ * t = u64 [0x1'FFFFFFF0, 0x2'00000005]
+ */
+SEC("socket")
+__success
+__flag(BPF_F_TEST_REG_INVARIANTS)
+__naked void deduce64_from_32_wrapping_32bit(void)
+{
+	asm volatile ("							\
+	call %[bpf_get_prandom_u32];					\
+	r1 = 0x100000006 ll;						\
+	if r0 < r1 goto 2f;						\
+	r1 = 0x2ffffffef ll;						\
+	if r0 > r1 goto 2f;	/* u64: [0x1'00000006, 0x2'FFFFFFEF] */	\
+	if w0 s< -16 goto 2f;						\
+	if w0 s> 5 goto 2f;	/* s32: [-16, 5] */			\
+	r1 = 0x1fffffff0 ll;						\
+	r2 = 0x200000005 ll;						\
+	if r0 >= r1 goto 1f;	/* should be always true */		\
+	r10 = 0;		/* dead code */				\
+1:	if r0 <= r2 goto 2f;	/* should be always true */		\
+	r10 = 0;		/* dead code */				\
+2:	exit;								\
+	"
+	:: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* Check that range_within() compares cnum ranges, not min/max projections. */
+SEC("socket")
+__failure __msg("div by zero")
+__flag(BPF_F_TEST_STATE_FREQ)
+__naked void range_within_cnum_cross_both_boundaries(void)
+{
+	asm volatile ("							\
+	call %[bpf_get_prandom_u32];					\
+	r1 = 0x80000020;						\
+	if r0 > r1 goto 1f;						\
+	r0 += 0x7FFFFFF0;			/* PATH 1 */		\
+	goto 2f;							\
+1:	call %[bpf_get_prandom_u32];		/* PATH 2 */		\
+	if r0 < 0x100 goto 3f;						\
+	if r0 > 0x200 goto 3f;						\
+2:	/* PATH 1: r0 ∈ [0x7FFFFFF0, U32_MAX] ∪ [0, 0x10] */		\
+	/* PATH 2: r0 ∈ [0x100, 0x200] */				\
+	if r0 != 0x100 goto 3f;	/* r0 == 0x100 only on PATH 2 */	\
+	r0 /= 0;							\
+3:	exit;								\
+	"
+	:: __imm(bpf_map_lookup_elem),
+	   __imm_addr(map_hash_8b),
+	   __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c
index fb4fa465d67c..8d7ff38e4c06 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c
@@ -630,13 +630,13 @@ __xlated("...")
 __xlated("4: r0 = &(void __percpu *)(r0)")
 __xlated("...")
 /* may_goto expansion starts */
-__xlated("6: r11 = *(u64 *)(r10 -24)")
-__xlated("7: if r11 == 0x0 goto pc+6")
-__xlated("8: r11 -= 1")
-__xlated("9: if r11 != 0x0 goto pc+2")
-__xlated("10: r11 = -24")
+__xlated("6: r12 = *(u64 *)(r10 -24)")
+__xlated("7: if r12 == 0x0 goto pc+6")
+__xlated("8: r12 -= 1")
+__xlated("9: if r12 != 0x0 goto pc+2")
+__xlated("10: r12 = -24")
 __xlated("11: call unknown")
-__xlated("12: *(u64 *)(r10 -24) = r11")
+__xlated("12: *(u64 *)(r10 -24) = r12")
 /* may_goto expansion ends */
 __xlated("13: *(u64 *)(r10 -8) = r1")
 __xlated("14: exit")
@@ -668,13 +668,13 @@ __xlated("1: *(u64 *)(r10 -16) =")
 __xlated("2: r1 = 1")
 __xlated("3: call bpf_get_smp_processor_id")
 /* may_goto expansion starts */
-__xlated("4: r11 = *(u64 *)(r10 -24)")
-__xlated("5: if r11 == 0x0 goto pc+6")
-__xlated("6: r11 -= 1")
-__xlated("7: if r11 != 0x0 goto pc+2")
-__xlated("8: r11 = -24")
+__xlated("4: r12 = *(u64 *)(r10 -24)")
+__xlated("5: if r12 == 0x0 goto pc+6")
+__xlated("6: r12 -= 1")
+__xlated("7: if r12 != 0x0 goto pc+2")
+__xlated("8: r12 = -24")
 __xlated("9: call unknown")
-__xlated("10: *(u64 *)(r10 -24) = r11")
+__xlated("10: *(u64 *)(r10 -24) = r12")
 /* may_goto expansion ends */
 __xlated("11: *(u64 *)(r10 -8) = r1")
 __xlated("12: exit")
@@ -799,8 +799,7 @@ __naked int bpf_loop_interaction2(void)
 
 SEC("raw_tp")
 __arch_x86_64
-__log_level(4)
-__msg("stack depth 512+0")
+__log_level(4) __msg("stack depth 512+0 max 512")
 /* just to print xlated version when debugging */
 __xlated("r0 = &(void __percpu *)(r0)")
 __success
diff --git a/tools/testing/selftests/bpf/progs/verifier_flow_keys.c b/tools/testing/selftests/bpf/progs/verifier_flow_keys.c
new file mode 100644
index 000000000000..d780a36a6e9a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_flow_keys.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Bounds checks for PTR_TO_FLOW_KEYS pointer arithmetic. */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+/* sizeof(struct bpf_flow_keys) is well under 4096, so +0x1000 is OOB. */
+
+SEC("flow_dissector")
+__description("flow_keys: in-bounds constant pointer arithmetic accepted")
+__success
+__naked void flow_keys_const_inbounds(void)
+{
+	asm volatile ("					\
+	r1 = *(u64 *)(r1 + %[flow_keys]);		\
+	r1 += 8;					\
+	r0 = *(u64 *)(r1 + 0);				\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys))
+	: __clobber_all);
+}
+
+SEC("flow_dissector")
+__description("flow_keys: OOB via constant pointer arithmetic rejected")
+__failure __msg("invalid access to flow keys off=4096 size=8")
+__naked void flow_keys_const_oob_read(void)
+{
+	asm volatile ("					\
+	r1 = *(u64 *)(r1 + %[flow_keys]);		\
+	r1 += 4096;					\
+	r0 = *(u64 *)(r1 + 0);				\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys))
+	: __clobber_all);
+}
+
+SEC("flow_dissector")
+__description("flow_keys: OOB write via constant pointer arithmetic rejected")
+__failure __msg("invalid access to flow keys off=4096 size=8")
+__naked void flow_keys_const_oob_write(void)
+{
+	asm volatile ("					\
+	r1 = *(u64 *)(r1 + %[flow_keys]);		\
+	r1 += 4096;					\
+	r2 = 0;						\
+	*(u64 *)(r1 + 0) = r2;				\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys))
+	: __clobber_all);
+}
+
+/* Equivalent OOB expressed directly in insn->off; this form was always
+ * rejected and is kept to show both forms now share one diagnostic.
+ */
+SEC("flow_dissector")
+__description("flow_keys: OOB via insn->off rejected")
+__failure __msg("invalid access to flow keys off=4096 size=8")
+__naked void flow_keys_insn_off_oob(void)
+{
+	asm volatile ("					\
+	r1 = *(u64 *)(r1 + %[flow_keys]);		\
+	r0 = *(u64 *)(r1 + 4096);			\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys))
+	: __clobber_all);
+}
+
+SEC("flow_dissector")
+__description("flow_keys: variable pointer arithmetic rejected")
+__failure __msg("R1 pointer arithmetic on flow_keys prohibited")
+__naked void flow_keys_var_read(void)
+{
+	asm volatile ("					\
+	r6 = r1;					\
+	call %[bpf_get_prandom_u32];			\
+	r0 &= 0xFFFF;					\
+	r1 = *(u64 *)(r6 + %[flow_keys]);		\
+	r1 += r0;					\
+	r0 = *(u64 *)(r1 + 0);				\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys)),
+	  __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c
index e7dae0cf9c17..0bdeb7bc4687 100644
--- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c
+++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c
@@ -153,7 +153,7 @@ __weak int subprog_trusted_destroy(struct task_struct *task __arg_trusted)
 
 SEC("?tp_btf/task_newtask")
 __failure __log_level(2)
-__msg("release kernel function bpf_task_release expects refcounted PTR_TO_BTF_ID")
+__msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1")
 int BPF_PROG(trusted_destroy_fail, struct task_struct *task, u64 clone_flags)
 {
 	return subprog_trusted_destroy(task);
@@ -287,6 +287,25 @@ int trusted_to_untrusted_mem(void *ctx)
 	return subprog_void_untrusted(bpf_get_current_task_btf());
 }
 
+__weak int subprog_write_mem_arg(int *p)
+{
+	if (!p)
+		return 0;
+
+	*p = 42;
+	return 0;
+}
+
+SEC("?tp_btf/task_newtask")
+__failure
+__msg("only read is supported")
+int trusted_btf_field_to_writable_mem(void *ctx)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+
+	return subprog_write_mem_arg(&task->prio);
+}
+
 SEC("tp_btf/sys_enter")
 __success
 int anything_to_untrusted_mem(void *ctx)
diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c
index 1e08aff7532e..75a2e3f48d0f 100644
--- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c
+++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c
@@ -46,12 +46,13 @@ __noinline long global_dead(void)
 }
 
 SEC("?raw_tp")
-__success __log_level(2)
+__success __log_level(6)
 /* main prog is validated completely first */
 __msg("('global_calls_good_only') is global and assumed valid.")
 /* eventually global_good() is transitively validated as well */
 __msg("Validating global_good() func")
 __msg("('global_good') is safe for any args that match its prototype")
+__msg("insns processed {{[0-9]+\\+[0-9]+\\+[0-9]+$}}")
 int chained_global_func_calls_success(void)
 {
 	int sum = 0;
@@ -151,6 +152,23 @@ int anon_user_mem_valid(void *ctx)
 	return subprog_user_anon_mem(&t);
 }
 
+__noinline __weak int subprog_user_anon_mem_huge(int (*p)[0x3fffffff])
+{
+	return p ? (*p)[1] : 0;
+}
+
+SEC("?tracepoint")
+__failure __log_level(2)
+__msg("R1 memory size 4294967292 is too large")
+int anon_user_mem_huge_size_invalid(void *ctx)
+{
+	int (*p)[0x3fffffff];
+	int tiny = 42;
+
+	p = (void *)&tiny;
+	return subprog_user_anon_mem_huge(p) + tiny;
+}
+
 __noinline __weak int subprog_nonnull_ptr_good(int *p1 __arg_nonnull, int *p2 __arg_nonnull)
 {
 	return (*p1) * (*p2); /* good, no need for NULL checks */
diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c
index 4ea254063646..76d80605ec7f 100644
--- a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c
+++ b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c
@@ -9,7 +9,9 @@ __success __retval(0)
 __arch_x86_64
 __jited("	addq	%gs:{{.*}}, %rax")
 __arch_arm64
-__jited("	mrs	x7, SP_EL0")
+__jited("	mrs	x8, SP_EL0")
+__arch_riscv64
+__jited("	mv	a5, tp")
 int inline_bpf_get_current_task(void)
 {
 	bpf_get_current_task();
diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c
index c8494b682c31..41340877dc9d 100644
--- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c
+++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c
@@ -3,7 +3,7 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
-#include "bpf_arena_common.h"
+#include <bpf_arena_common.h>
 
 #if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \
 	(defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \
@@ -274,11 +274,11 @@ __jited("movslq	0x10(%rdi,%r12), %r15")
 __jited("movswq	0x18(%rdi,%r12), %r15")
 __jited("movsbq	0x20(%rdi,%r12), %r15")
 __arch_arm64
-__jited("add	x11, x7, x28")
+__jited("add	x11, x8, x28")
 __jited("ldrsw	x21, [x11, #0x10]")
-__jited("add	x11, x7, x28")
+__jited("add	x11, x8, x28")
 __jited("ldrsh	x21, [x11, #0x18]")
-__jited("add	x11, x7, x28")
+__jited("add	x11, x8, x28")
 __jited("ldrsb	x21, [x11, #0x20]")
 __jited("add	x11, x0, x28")
 __jited("ldrsw	x22, [x11, #0x10]")
diff --git a/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c b/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c
index b058de623200..72646fa2745e 100644
--- a/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c
+++ b/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c
@@ -15,7 +15,7 @@
  * FP offset at each call site.  arg_track keys on (frame, off[]), so
  * r1=fp-8, r1=fp-16, ... r1=fp-400 produce 50 unique cache keys per level.
  *
- * This test chains 8 subprograms (the MAX_CALL_FRAMES limit).  Each
+ * This test chains 8 subprograms (within the MAX_CALL_FRAMES limit).  Each
  * intermediate function calls the next one 50 times, each time with a
  * different FP-relative offset in r1.
  *
diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c
index 38e8e9176862..2f8103bfa14e 100644
--- a/tools/testing/selftests/bpf/progs/verifier_lsm.c
+++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c
@@ -188,4 +188,13 @@ int BPF_PROG(null_check, struct file *file)
 	return 0;
 }
 
+SEC("lsm_cgroup/file_open")
+__description("sleepable lsm_cgroup program is rejected")
+__failure __msg("Program of this type cannot be sleepable")
+__flag(BPF_F_SLEEPABLE)
+int BPF_PROG(sleepable_lsm_cgroup)
+{
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_map_in_map.c b/tools/testing/selftests/bpf/progs/verifier_map_in_map.c
index 16b761e510f0..b606b5dca734 100644
--- a/tools/testing/selftests/bpf/progs/verifier_map_in_map.c
+++ b/tools/testing/selftests/bpf/progs/verifier_map_in_map.c
@@ -18,6 +18,20 @@ struct {
 	});
 } map_in_map SEC(".maps");
 
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+	__uint(max_entries, 1);
+	__type(key, int);
+	__type(value, int);
+	__array(values, struct {
+		__uint(type, BPF_MAP_TYPE_ARRAY);
+		__uint(map_flags, BPF_F_INNER_MAP);
+		__uint(max_entries, 8);
+		__type(key, int);
+		__type(value, long);
+	});
+} map_in_map_dyn SEC(".maps");
+
 SEC("socket")
 __description("map in map access")
 __success __success_unpriv __retval(0)
@@ -45,6 +59,32 @@ l0_%=:	r0 = 0;						\
 	: __clobber_all);
 }
 
+SEC("socket")
+__description("map in map dynamic inner array lookup is nullable")
+__failure __msg("invalid mem access 'map_value_or_null'")
+__naked void map_in_map_dynamic_inner_array_lookup_is_nullable(void)
+{
+	asm volatile ("					\
+	r1 = 0;						\
+	*(u32*)(r10 - 4) = r1;				\
+	r2 = r10;					\
+	r2 += -4;					\
+	r1 = %[map_in_map_dyn] ll;			\
+	call %[bpf_map_lookup_elem];			\
+	if r0 == 0 goto l0_%=;				\
+	*(u32*)(r10 - 8) = 4;				\
+	r2 = r10;					\
+	r2 += -8;					\
+	r1 = r0;					\
+	call %[bpf_map_lookup_elem];			\
+	r0 = *(u64 *)(r0 + 0);				\
+l0_%=:	exit;						\
+"	:
+	: __imm(bpf_map_lookup_elem),
+	  __imm_addr(map_in_map_dyn)
+	: __clobber_all);
+}
+
 SEC("xdp")
 __description("map in map state pruning")
 __success __msg("processed 15 insns")
diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c
index e2767d27d8aa..166193659870 100644
--- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c
+++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c
@@ -70,13 +70,16 @@ __naked void bpf_map_ptr_write_rejected(void)
 	: __clobber_all);
 }
 
-/* The first element of struct bpf_map is a SHA256 hash of 32 bytes, accessing
- * into this array is valid. The opts field is now at offset 33.
+/*
+ * struct bpf_map starts with the SHA256 hash sha[32] at offset 0 (a readable
+ * byte array), the u32 excl field at offset 32, and the ops pointer at offset
+ * 40. Reading a u32 at offset 41 reaches into the middle of the ops pointer,
+ * i.e. a partial pointer access, which is rejected.
  */
 SEC("socket")
 __description("bpf_map_ptr: read non-existent field rejected")
 __failure
-__msg("cannot access ptr member ops with moff 32 in struct bpf_map with off 33 size 4")
+__msg("cannot access ptr member ops with moff 40 in struct bpf_map with off 41 size 4")
 __failure_unpriv
 __msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN")
 __flag(BPF_F_ANY_ALIGNMENT)
@@ -85,6 +88,31 @@ __naked void read_non_existent_field_rejected(void)
 	asm volatile ("					\
 	r6 = 0;						\
 	r1 = %[map_array_48b] ll;			\
+	r6 = *(u32*)(r1 + 41);				\
+	r0 = 1;						\
+	exit;						\
+"	:
+	: __imm_addr(map_array_48b)
+	: __clobber_all);
+}
+
+/*
+ * The u32 excl field spans offsets 32..35 (mend 36). Reading a u32 at offset
+ * 33 starts inside excl but extends past its end, which the verifier rejects
+ * as an out-of-bounds scalar access.
+ */
+SEC("socket")
+__description("bpf_map_ptr: read beyond excl field rejected")
+__failure
+__msg("access beyond the end of member excl (mend:36) in struct bpf_map with off 33 size 4")
+__failure_unpriv
+__msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN")
+__flag(BPF_F_ANY_ALIGNMENT)
+__naked void read_beyond_excl_field_rejected(void)
+{
+	asm volatile ("					\
+	r6 = 0;						\
+	r1 = %[map_array_48b] ll;			\
 	r6 = *(u32*)(r1 + 33);				\
 	r0 = 1;						\
 	exit;						\
@@ -103,7 +131,7 @@ __naked void ptr_read_ops_field_accepted(void)
 	asm volatile ("					\
 	r6 = 0;						\
 	r1 = %[map_array_48b] ll;			\
-	r6 = *(u64*)(r1 + 0);				\
+	r6 = *(u64*)(r1 + 40);				\
 	r0 = 1;						\
 	exit;						\
 "	:
diff --git a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c
index 6d1edaef9213..4bdf4256a41e 100644
--- a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c
+++ b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c
@@ -81,13 +81,13 @@ __arch_s390x
 __arch_arm64
 __xlated("0: *(u64 *)(r10 -16) = 65535")
 __xlated("1: *(u64 *)(r10 -8) = 0")
-__xlated("2: r11 = *(u64 *)(r10 -16)")
-__xlated("3: if r11 == 0x0 goto pc+6")
-__xlated("4: r11 -= 1")
-__xlated("5: if r11 != 0x0 goto pc+2")
-__xlated("6: r11 = -16")
+__xlated("2: r12 = *(u64 *)(r10 -16)")
+__xlated("3: if r12 == 0x0 goto pc+6")
+__xlated("4: r12 -= 1")
+__xlated("5: if r12 != 0x0 goto pc+2")
+__xlated("6: r12 = -16")
 __xlated("7: call unknown")
-__xlated("8: *(u64 *)(r10 -16) = r11")
+__xlated("8: *(u64 *)(r10 -16) = r12")
 __xlated("9: r0 = 1")
 __xlated("10: r0 = 2")
 __xlated("11: exit")
diff --git a/tools/testing/selftests/bpf/progs/verifier_private_stack.c b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
index 646e8ef82051..bb8206e10880 100644
--- a/tools/testing/selftests/bpf/progs/verifier_private_stack.c
+++ b/tools/testing/selftests/bpf/progs/verifier_private_stack.c
@@ -86,6 +86,7 @@ __naked static void cumulative_stack_depth_subprog(void)
 SEC("kprobe")
 __description("Private stack, subtree > MAX_BPF_STACK")
 __success
+__log_level(4) __msg("stack depth 512+32 max 512")
 __arch_x86_64
 /* private stack fp for the main prog */
 __jited("	movabsq	$0x{{.*}}, %r9")
@@ -93,6 +94,7 @@ __jited("	addq	%gs:{{.*}}, %r9")
 __jited("	movl	$0x2a, %edi")
 __jited("	movq	%rdi, -0x200(%r9)")
 __jited("	pushq	%r9")
+__jited("...")
 __jited("	callq	0x{{.*}}")
 __jited("	popq	%r9")
 __jited("	xorl	%eax, %eax")
@@ -152,11 +154,13 @@ __jited("	endbr64")
 __jited("	movabsq	$0x{{.*}}, %r9")
 __jited("	addq	%gs:{{.*}}, %r9")
 __jited("	pushq	%r9")
+__jited("...")
 __jited("	callq")
 __jited("	popq	%r9")
 __jited("	movl	$0x2a, %edi")
 __jited("	movq	%rdi, -0x200(%r9)")
 __jited("	pushq	%r9")
+__jited("...")
 __jited("	callq")
 __jited("	popq	%r9")
 __arch_arm64
@@ -170,12 +174,12 @@ __jited("	mrs	x10, TPIDR_EL{{[0-1]}}")
 __jited("	add	x27, x27, x10")
 __jited("	add	x25, x27, {{.*}}")
 __jited("	bl	0x{{.*}}")
-__jited("	mov	x7, x0")
+__jited("	mov	x8, x0")
 __jited("	mov	x0, #0x2a")
 __jited("	str	x0, [x27]")
 __jited("	bl	0x{{.*}}")
-__jited("	mov	x7, x0")
-__jited("	mov	x7, #0x0")
+__jited("	mov	x8, x0")
+__jited("	mov	x8, #0x0")
 __jited("	ldp	x25, x27, [sp], {{.*}}")
 __naked void private_stack_callback(void)
 {
@@ -198,6 +202,7 @@ __description("Private stack, exception in main prog")
 __success __retval(0)
 __arch_x86_64
 __jited("	pushq	%r9")
+__jited("...")
 __jited("	callq")
 __jited("	popq	%r9")
 __arch_arm64
@@ -220,7 +225,7 @@ __jited("	mov	x0, #0x2a")
 __jited("	str	x0, [x27]")
 __jited("	mov	x0, #0x0")
 __jited("	bl	0x{{.*}}")
-__jited("	mov	x7, x0")
+__jited("	mov	x8, x0")
 __jited("	ldp	x27, x28, [sp], #0x10")
 int private_stack_exception_main_prog(void)
 {
@@ -245,6 +250,7 @@ __success __retval(0)
 __arch_x86_64
 __jited("	movq	%rdi, -0x200(%r9)")
 __jited("	pushq	%r9")
+__jited("...")
 __jited("	callq")
 __jited("	popq	%r9")
 __arch_arm64
@@ -258,7 +264,7 @@ __jited("	add	x25, x27, {{.*}}")
 __jited("	mov	x0, #0x2a")
 __jited("	str	x0, [x27]")
 __jited("	bl	0x{{.*}}")
-__jited("	mov	x7, x0")
+__jited("	mov	x8, x0")
 __jited("	ldp	x27, x28, [sp], #0x10")
 int private_stack_exception_sub_prog(void)
 {
@@ -324,6 +330,8 @@ int private_stack_async_callback_1(void)
 SEC("fentry/bpf_fentry_test9")
 __description("Private stack, async callback, potential nesting")
 __success __retval(0)
+__load_if_JITed()
+__log_level(4) __msg("stack depth 8+0+256+0 max 272")
 __arch_x86_64
 __jited("	subq	$0x100, %rsp")
 __arch_arm64
@@ -344,6 +352,18 @@ int private_stack_async_callback_2(void)
 	return 0;
 }
 
+SEC("fentry/bpf_fentry_test9")
+__description("private stack, max stack depth is private stack")
+__success
+__log_level(4) __msg("stack depth 8+256+0 max 256")
+int private_stack_max_depth(void)
+{
+	int x = 0;
+
+	subprog1(&x);
+	return 0;
+}
+
 #else
 
 SEC("kprobe")
diff --git a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c
index 910365201f68..199ad18f8eb5 100644
--- a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c
+++ b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c
@@ -263,7 +263,7 @@ l0_%=:	r0 = 0;						\
 
 SEC("lsm.s/bpf")
 __description("reference tracking: release user key reference without check")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 __naked void user_key_reference_without_check(void)
 {
 	asm volatile ("					\
@@ -282,7 +282,7 @@ __naked void user_key_reference_without_check(void)
 
 SEC("lsm.s/bpf")
 __description("reference tracking: release system key reference without check")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 __naked void system_key_reference_without_check(void)
 {
 	asm volatile ("					\
@@ -300,7 +300,7 @@ __naked void system_key_reference_without_check(void)
 
 SEC("lsm.s/bpf")
 __description("reference tracking: release with NULL key pointer")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 __naked void release_with_null_key_pointer(void)
 {
 	asm volatile ("					\
@@ -1288,7 +1288,7 @@ l1_%=:	r1 = r6;					\
 
 SEC("tc")
 __description("reference tracking: bpf_sk_release(listen_sk)")
-__failure __msg("R1 must be referenced when passed to release function")
+__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1")
 __naked void bpf_sk_release_listen_sk(void)
 {
 	asm volatile (
diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
index 70ae14d6084f..e38f102da45f 100644
--- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
+++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
@@ -372,37 +372,36 @@ __naked void precision_two_ids(void)
 SEC("socket")
 __success __log_level(2)
 __flag(BPF_F_TEST_STATE_FREQ)
-/* check that r0 and r6 have different IDs after 'if',
- * collect_linked_regs() can't tie more than 6 registers for a single insn.
+/*
+ * check that r0 and r5 have different IDs after 'if',
+ * collect_linked_regs() can't tie more than 5 registers for a single insn.
  */
-__msg("8: (25) if r0 > 0x7 goto pc+0         ; R0=scalar(id=1")
-__msg("14: (bf) r6 = r6                      ; R6=scalar(id=2")
-/* check that r{0-5} are marked precise after 'if' */
-__msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0")
-__msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:")
+__msg("7: (25) if r0 > 0x7 goto pc+0         ; R0=scalar(id=1")
+__msg("12: (bf) r5 = r5                      ; R5=scalar(id=2")
+/* check that r{0-4} are marked precise after 'if' */
+__msg("frame0: regs=r0 stack= before 7: (25) if r0 > 0x7 goto pc+0")
+__msg("frame0: parent state regs=r0,r1,r2,r3,r4 stack=:")
 __naked void linked_regs_too_many_regs(void)
 {
 	asm volatile (
 	/* r0 = random number up to 0xff */
 	"call %[bpf_ktime_get_ns];"
 	"r0 &= 0xff;"
-	/* tie r{0-6} IDs */
+	/* tie r{0-5} IDs */
 	"r1 = r0;"
 	"r2 = r0;"
 	"r3 = r0;"
 	"r4 = r0;"
 	"r5 = r0;"
-	"r6 = r0;"
-	/* propagate range for r{0-6} */
+	/* propagate range for r{0-5} */
 	"if r0 > 7 goto +0;"
-	/* keep r{1-5} live */
+	/* keep r{1-4} live */
 	"r1 = r1;"
 	"r2 = r2;"
 	"r3 = r3;"
 	"r4 = r4;"
+	/* make r5 appear in the log */
 	"r5 = r5;"
-	/* make r6 appear in the log */
-	"r6 = r6;"
 	/* force r0 to be precise,
 	 * this would cause r{0-4} to be precise because of shared IDs
 	 */
diff --git a/tools/testing/selftests/bpf/progs/verifier_sdiv.c b/tools/testing/selftests/bpf/progs/verifier_sdiv.c
index fd59d57e8e37..95f3239ce228 100644
--- a/tools/testing/selftests/bpf/progs/verifier_sdiv.c
+++ b/tools/testing/selftests/bpf/progs/verifier_sdiv.c
@@ -778,10 +778,10 @@ __arch_x86_64
 __xlated("0: r2 = 0x8000000000000000")
 __xlated("2: r3 = -1")
 __xlated("3: r4 = r2")
-__xlated("4: r11 = r3")
-__xlated("5: r11 += 1")
-__xlated("6: if r11 > 0x1 goto pc+4")
-__xlated("7: if r11 == 0x0 goto pc+1")
+__xlated("4: r12 = r3")
+__xlated("5: r12 += 1")
+__xlated("6: if r12 > 0x1 goto pc+4")
+__xlated("7: if r12 == 0x0 goto pc+1")
 __xlated("8: r2 = 0")
 __xlated("9: r2 = -r2")
 __xlated("10: goto pc+1")
@@ -812,10 +812,10 @@ __success __retval(-5)
 __arch_x86_64
 __xlated("0: r2 = 5")
 __xlated("1: r3 = -1")
-__xlated("2: r11 = r3")
-__xlated("3: r11 += 1")
-__xlated("4: if r11 > 0x1 goto pc+4")
-__xlated("5: if r11 == 0x0 goto pc+1")
+__xlated("2: r12 = r3")
+__xlated("3: r12 += 1")
+__xlated("4: if r12 > 0x1 goto pc+4")
+__xlated("5: if r12 == 0x0 goto pc+1")
 __xlated("6: r2 = 0")
 __xlated("7: r2 = -r2")
 __xlated("8: goto pc+1")
@@ -890,10 +890,10 @@ __arch_x86_64
 __xlated("0: w2 = -2147483648")
 __xlated("1: w3 = -1")
 __xlated("2: w4 = w2")
-__xlated("3: r11 = r3")
-__xlated("4: w11 += 1")
-__xlated("5: if w11 > 0x1 goto pc+4")
-__xlated("6: if w11 == 0x0 goto pc+1")
+__xlated("3: r12 = r3")
+__xlated("4: w12 += 1")
+__xlated("5: if w12 > 0x1 goto pc+4")
+__xlated("6: if w12 == 0x0 goto pc+1")
 __xlated("7: w2 = 0")
 __xlated("8: w2 = -w2")
 __xlated("9: goto pc+1")
@@ -925,10 +925,10 @@ __arch_x86_64
 __xlated("0: w2 = -5")
 __xlated("1: w3 = -1")
 __xlated("2: w4 = w2")
-__xlated("3: r11 = r3")
-__xlated("4: w11 += 1")
-__xlated("5: if w11 > 0x1 goto pc+4")
-__xlated("6: if w11 == 0x0 goto pc+1")
+__xlated("3: r12 = r3")
+__xlated("4: w12 += 1")
+__xlated("5: if w12 > 0x1 goto pc+4")
+__xlated("6: if w12 == 0x0 goto pc+1")
 __xlated("7: w2 = 0")
 __xlated("8: w2 = -w2")
 __xlated("9: goto pc+1")
@@ -1004,10 +1004,10 @@ __arch_x86_64
 __xlated("0: r2 = 0x8000000000000000")
 __xlated("2: r3 = -1")
 __xlated("3: r4 = r2")
-__xlated("4: r11 = r3")
-__xlated("5: r11 += 1")
-__xlated("6: if r11 > 0x1 goto pc+3")
-__xlated("7: if r11 == 0x1 goto pc+3")
+__xlated("4: r12 = r3")
+__xlated("5: r12 += 1")
+__xlated("6: if r12 > 0x1 goto pc+3")
+__xlated("7: if r12 == 0x1 goto pc+3")
 __xlated("8: w2 = 0")
 __xlated("9: goto pc+1")
 __xlated("10: r2 s%= r3")
@@ -1034,10 +1034,10 @@ __arch_x86_64
 __xlated("0: r2 = 5")
 __xlated("1: r3 = -1")
 __xlated("2: r4 = r2")
-__xlated("3: r11 = r3")
-__xlated("4: r11 += 1")
-__xlated("5: if r11 > 0x1 goto pc+3")
-__xlated("6: if r11 == 0x1 goto pc+3")
+__xlated("3: r12 = r3")
+__xlated("4: r12 += 1")
+__xlated("5: if r12 > 0x1 goto pc+3")
+__xlated("6: if r12 == 0x1 goto pc+3")
 __xlated("7: w2 = 0")
 __xlated("8: goto pc+1")
 __xlated("9: r2 s%= r3")
@@ -1108,10 +1108,10 @@ __arch_x86_64
 __xlated("0: w2 = -2147483648")
 __xlated("1: w3 = -1")
 __xlated("2: w4 = w2")
-__xlated("3: r11 = r3")
-__xlated("4: w11 += 1")
-__xlated("5: if w11 > 0x1 goto pc+3")
-__xlated("6: if w11 == 0x1 goto pc+4")
+__xlated("3: r12 = r3")
+__xlated("4: w12 += 1")
+__xlated("5: if w12 > 0x1 goto pc+3")
+__xlated("6: if w12 == 0x1 goto pc+4")
 __xlated("7: w2 = 0")
 __xlated("8: goto pc+1")
 __xlated("9: w2 s%= w3")
@@ -1140,10 +1140,10 @@ __arch_x86_64
 __xlated("0: w2 = -5")
 __xlated("1: w3 = -1")
 __xlated("2: w4 = w2")
-__xlated("3: r11 = r3")
-__xlated("4: w11 += 1")
-__xlated("5: if w11 > 0x1 goto pc+3")
-__xlated("6: if w11 == 0x1 goto pc+4")
+__xlated("3: r12 = r3")
+__xlated("4: w12 += 1")
+__xlated("5: if w12 > 0x1 goto pc+3")
+__xlated("6: if w12 == 0x1 goto pc+4")
 __xlated("7: w2 = 0")
 __xlated("8: goto pc+1")
 __xlated("9: w2 s%= w3")
diff --git a/tools/testing/selftests/bpf/progs/verifier_set_retval.c b/tools/testing/selftests/bpf/progs/verifier_set_retval.c
new file mode 100644
index 000000000000..1415cd15cede
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_set_retval.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+SEC("lsm_cgroup/socket_create")
+__description("lsm_cgroup bpf_set_retval success")
+__success
+int BPF_PROG(lsm_cgroup_set_retval_zero_valid, int family, int type, int protocol, int kern)
+{
+	bpf_set_retval(0);
+	return 0;
+}
+
+SEC("lsm_cgroup/socket_create")
+__description("lsm_cgroup bpf_set_retval valid errno")
+__success
+int BPF_PROG(lsm_cgroup_set_retval_negative_valid, int family, int type, int protocol, int kern)
+{
+	bpf_set_retval(-12);
+	return 0;
+}
+
+SEC("lsm_cgroup/socket_create")
+__description("lsm_cgroup bpf_set_retval invalid negative value")
+__failure __msg("should have been in [-4095, 0]")
+int BPF_PROG(lsm_cgroup_set_retval_negative_invalid, int family, int type, int protocol, int kern)
+{
+	bpf_set_retval(-4096);
+	return 0;
+}
+
+SEC("lsm_cgroup/socket_create")
+__description("lsm_cgroup bpf_set_retval invalid positive value")
+__failure __msg("should have been in [-4095, 0]")
+int BPF_PROG(lsm_cgroup_set_retval_positive_invalid, int family, int type, int protocol, int kern)
+{
+	bpf_set_retval(1);
+	return 0;
+}
+
+SEC("cgroup/dev")
+__description("cgroup_device bpf_set_retval success")
+__success
+int cgroup_dev_set_retval_0(struct bpf_cgroup_dev_ctx *ctx)
+{
+	bpf_set_retval(0);
+	return 1;
+}
+
+SEC("cgroup/dev")
+__description("cgroup_device bpf_set_retval valid errno")
+__success
+int cgroup_dev_set_retval_neg_maxerrno(struct bpf_cgroup_dev_ctx *ctx)
+{
+	bpf_set_retval(-4095);
+	return 1;
+}
+
+SEC("cgroup/dev")
+__description("cgroup_device bpf_set_retval invalid positive value")
+__failure __msg("should have been in [-4095, 0]")
+int cgroup_dev_set_retval_1(struct bpf_cgroup_dev_ctx *ctx)
+{
+	bpf_set_retval(1);
+	return 1;
+}
+
+SEC("cgroup/dev")
+__description("cgroup_device bpf_set_retval invalid negative value")
+__failure __msg("should have been in [-4095, 0]")
+int cgroup_dev_set_retval_neg_4096(struct bpf_cgroup_dev_ctx *ctx)
+{
+	bpf_set_retval(-4096);
+	return 1;
+}
+
+SEC("cgroup/dev")
+__description("bpf_set_retval bounds check survives state pruning")
+__failure __msg("should have been in [-4095, 0]")
+__naked int cgroup_dev_set_retval_pruning_bypass(struct bpf_cgroup_dev_ctx *ctx)
+{
+	asm volatile (
+		"call %[bpf_get_prandom_u32];"
+		"if r0 != 0 goto 1f;"
+		"r0 = r0;"
+		"r0 = r0;"
+		"r0 = r0;"
+		"r0 = r0;"
+		"goto 2f;"
+	"1:"
+		"call %[bpf_get_prandom_u32];"
+	"2:"
+		"r1 = r0;"
+		"call %[bpf_set_retval];"
+		"r0 = 1;"
+		"exit;"
+		:
+		: __imm(bpf_get_prandom_u32),
+		  __imm(bpf_set_retval)
+		: __clobber_common
+	);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c
index a2132c72d3b8..4f2f3209eec8 100644
--- a/tools/testing/selftests/bpf/progs/verifier_sock.c
+++ b/tools/testing/selftests/bpf/progs/verifier_sock.c
@@ -603,7 +603,7 @@ l2_%=:	r0 = *(u32*)(r0 + %[bpf_tcp_sock_snd_cwnd]);	\
 
 SEC("tc")
 __description("bpf_sk_release(skb->sk)")
-__failure __msg("R1 must be referenced when passed to release function")
+__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1")
 __naked void bpf_sk_release_skb_sk(void)
 {
 	asm volatile ("					\
@@ -620,7 +620,7 @@ l0_%=:	r0 = 0;						\
 
 SEC("tc")
 __description("bpf_sk_release(bpf_sk_fullsock(skb->sk))")
-__failure __msg("R1 must be referenced when passed to release function")
+__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1")
 __naked void bpf_sk_fullsock_skb_sk(void)
 {
 	asm volatile ("					\
@@ -644,7 +644,7 @@ l1_%=:	r1 = r0;					\
 
 SEC("tc")
 __description("bpf_sk_release(bpf_tcp_sock(skb->sk))")
-__failure __msg("R1 must be referenced when passed to release function")
+__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1")
 __naked void bpf_tcp_sock_skb_sk(void)
 {
 	asm volatile ("					\
@@ -1120,8 +1120,11 @@ int tail_call(struct __sk_buff *sk)
 static __noinline
 int static_tail_call(struct __sk_buff *sk)
 {
+	int ret = 0;
+
 	bpf_tail_call_static(sk, &jmp_table, 0);
-	return 0;
+	barrier_var(ret);
+	return ret;
 }
 
 /* Tail calls in sub-programs invalidate packet pointers. */
@@ -1144,10 +1147,12 @@ __failure __msg("invalid mem access")
 int invalidate_pkt_pointers_by_static_tail_call(struct __sk_buff *sk)
 {
 	int *p = (void *)(long)sk->data;
+	int ret;
 
 	if ((void *)(p + 1) > (void *)(long)sk->data_end)
 		return TCX_DROP;
-	static_tail_call(sk);
+	ret = static_tail_call(sk);
+	__sink(ret);
 	*p = 42; /* this is unsafe */
 	return TCX_PASS;
 }
diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c
new file mode 100644
index 000000000000..7e0ce5db28a0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c
@@ -0,0 +1,447 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 1);
+	__type(key, long long);
+	__type(value, long long);
+} map_hash_8b SEC(".maps");
+
+#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \
+	defined(__BPF_FEATURE_STACK_ARGUMENT)
+
+__noinline __used
+static int subprog_6args(int a, int b, int c, int d, int e, int f)
+{
+	return a + b + c + d + e + f;
+}
+
+__noinline __used
+static int subprog_7args(int a, int b, int c, int d, int e, int f, int g)
+{
+	return a + b + c + d + e + f + g;
+}
+
+__noinline __used
+static long subprog_deref_arg6(long a, long b, long c, long d, long e, long *f)
+{
+	return *f;
+}
+
+SEC("tc")
+__description("stack_arg: subprog with 6 args")
+__success __retval(21)
+__naked void stack_arg_6args(void)
+{
+	asm volatile (
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"*(u64 *)(r11 - 8) = 6;"
+		"call subprog_6args;"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: two subprogs with >5 args")
+__success __retval(90)
+__naked void stack_arg_two_subprogs(void)
+{
+	asm volatile (
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"*(u64 *)(r11 - 8) = 10;"
+		"call subprog_6args;"
+		"r6 = r0;"
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"*(u64 *)(r11 - 16) = 30;"
+		"*(u64 *)(r11 - 8) = 20;"
+		"call subprog_7args;"
+		"r0 += r6;"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: read from uninitialized stack arg slot")
+__failure
+__msg("invalid read from stack arg off 8 depth 0")
+__naked void stack_arg_read_uninitialized(void)
+{
+	asm volatile (
+		"r0 = *(u64 *)(r11 + 8);"
+		"r0 = 0;"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: gap at offset -8, only wrote -16")
+__failure
+__msg("callee expects 7 args, stack arg1 is not initialized")
+__naked void stack_arg_gap_at_minus8(void)
+{
+	asm volatile (
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"*(u64 *)(r11 - 16) = 30;"
+		"call subprog_7args;"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: pruning with different stack arg types")
+__failure __log_level(2)
+__flag(BPF_F_TEST_STATE_FREQ)
+__msg("arg JOIN insn 9 -> 10 r1: fp0-8 + _ => fp0-8|fp0+0")
+__msg("arg JOIN insn 9 -> 10 sa0: fp0-8 + _ => fp0-8|fp0+0")
+__msg("R{{[0-9]}} invalid mem access 'scalar'")
+__naked void stack_arg_pruning_type_mismatch(void)
+{
+	asm volatile (
+		"call %[bpf_get_prandom_u32];"
+		"r6 = r0;"
+		/* local = 0 on program stack */
+		"r7 = 0;"
+		"*(u64 *)(r10 - 8) = r7;"
+		/* Branch based on random value */
+		"if r6 s> 3 goto l0_%=;"
+		/* Path 1: store stack pointer to outgoing arg6 */
+		"r1 = r10;"
+		"r1 += -8;"
+		"*(u64 *)(r11 - 8) = r1;"
+		"goto l1_%=;"
+	"l0_%=:"
+		/* Path 2: store scalar to outgoing arg6 */
+		"*(u64 *)(r11 - 8) = 42;"
+	"l1_%=:"
+		/* Call subprog that dereferences arg6 */
+		"r1 = r6;"
+		"r2 = 0;"
+		"r3 = 0;"
+		"r4 = 0;"
+		"r5 = 0;"
+		"call subprog_deref_arg6;"
+		"exit;"
+		:: __imm(bpf_get_prandom_u32)
+		: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: release_reference invalidates stack arg slot")
+__failure
+__msg("callee expects 6 args, stack arg1 is not initialized")
+__naked void stack_arg_release_ref(void)
+{
+	asm volatile (
+		"r6 = r1;"
+		/* struct bpf_sock_tuple tuple = {} */
+		"r2 = 0;"
+		"*(u32 *)(r10 - 8) = r2;"
+		"*(u64 *)(r10 - 16) = r2;"
+		"*(u64 *)(r10 - 24) = r2;"
+		"*(u64 *)(r10 - 32) = r2;"
+		"*(u64 *)(r10 - 40) = r2;"
+		"*(u64 *)(r10 - 48) = r2;"
+		/* sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof(tuple), 0, 0) */
+		"r1 = r6;"
+		"r2 = r10;"
+		"r2 += -48;"
+		"r3 = %[sizeof_bpf_sock_tuple];"
+		"r4 = 0;"
+		"r5 = 0;"
+		"call %[bpf_sk_lookup_tcp];"
+		/* r0 = sk (PTR_TO_SOCKET_OR_NULL) */
+		"if r0 == 0 goto l0_%=;"
+		/* Store sock ref to outgoing arg6 slot */
+		"*(u64 *)(r11 - 8) = r0;"
+		/* Release the reference — invalidates the stack arg slot */
+		"r1 = r0;"
+		"call %[bpf_sk_release];"
+		/* Call subprog that dereferences arg6 — should fail */
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"call subprog_deref_arg6;"
+	"l0_%=:"
+		"r0 = 0;"
+		"exit;"
+		:
+		: __imm(bpf_sk_lookup_tcp),
+		  __imm(bpf_sk_release),
+		  __imm_const(sizeof_bpf_sock_tuple, sizeof(struct bpf_sock_tuple))
+		: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: pkt pointer in stack arg slot invalidated after pull_data")
+__failure
+__msg("callee expects 6 args, stack arg1 is not initialized")
+__naked void stack_arg_stale_pkt_ptr(void)
+{
+	asm volatile (
+		"r6 = r1;"
+		"r7 = *(u32 *)(r6 + %[__sk_buff_data]);"
+		"r8 = *(u32 *)(r6 + %[__sk_buff_data_end]);"
+		/* check pkt has at least 8 bytes */
+		"r0 = r7;"
+		"r0 += 8;"
+		"if r0 > r8 goto l0_%=;"
+		/* Store valid pkt pointer to outgoing arg6 slot */
+		"*(u64 *)(r11 - 8) = r7;"
+		/* bpf_skb_pull_data invalidates all pkt pointers */
+		"r1 = r6;"
+		"r2 = 0;"
+		"call %[bpf_skb_pull_data];"
+		/* Call subprog that dereferences arg6 — should fail */
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"call subprog_deref_arg6;"
+	"l0_%=:"
+		"r0 = 0;"
+		"exit;"
+		:
+		: __imm(bpf_skb_pull_data),
+		  __imm_const(__sk_buff_data, offsetof(struct __sk_buff, data)),
+		  __imm_const(__sk_buff_data_end, offsetof(struct __sk_buff, data_end))
+		: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: null propagation rejects deref on null branch")
+__failure
+__msg("R{{[0-9]}} invalid mem access 'scalar'")
+__naked void stack_arg_null_propagation_fail(void)
+{
+	asm volatile (
+		"r1 = 0;"
+		"*(u64 *)(r10 - 8) = r1;"
+		/* r0 = bpf_map_lookup_elem(&map_hash_8b, &key) */
+		"r2 = r10;"
+		"r2 += -8;"
+		"r1 = %[map_hash_8b] ll;"
+		"call %[bpf_map_lookup_elem];"
+		/* Store PTR_TO_MAP_VALUE_OR_NULL to outgoing arg6 slot */
+		"*(u64 *)(r11 - 8) = r0;"
+		/* null check on r0 */
+		"if r0 != 0 goto l0_%=;"
+		/*
+		 * On null branch, outgoing slot is SCALAR(0).
+		 * Call subprog that dereferences arg6 — should fail.
+		 */
+		"r1 = 0;"
+		"r2 = 0;"
+		"r3 = 0;"
+		"r4 = 0;"
+		"r5 = 0;"
+		"call subprog_deref_arg6;"
+	"l0_%=:"
+		"r0 = 0;"
+		"exit;"
+		:
+		: __imm(bpf_map_lookup_elem),
+		  __imm_addr(map_hash_8b)
+		: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: missing store on one branch")
+__failure
+__msg("callee expects 7 args, stack arg1 is not initialized")
+__naked void stack_arg_missing_store_one_branch(void)
+{
+	asm volatile (
+		"call %[bpf_get_prandom_u32];"
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		/* Write arg7 (r11-16) before branch */
+		"*(u64 *)(r11 - 16) = 20;"
+		"if r0 > 0 goto l0_%=;"
+		/* Path 1: write arg6 and call */
+		"*(u64 *)(r11 - 8) = 10;"
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"call subprog_7args;"
+		"goto l1_%=;"
+	"l0_%=:"
+		/* Path 2: missing arg6 store, call should fail */
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"call subprog_7args;"
+	"l1_%=:"
+		"r0 = 0;"
+		"exit;"
+		:: __imm(bpf_get_prandom_u32)
+		: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: share a store for both branches")
+__success __retval(0)
+__naked void stack_arg_shared_store(void)
+{
+	asm volatile (
+		"call %[bpf_get_prandom_u32];"
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		/* Write arg7 (r11-16) before branch */
+		"*(u64 *)(r11 - 16) = 20;"
+		"if r0 > 0 goto l0_%=;"
+		/* Path 1: write arg6 and call */
+		"*(u64 *)(r11 - 8) = 10;"
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"call subprog_7args;"
+		"goto l1_%=;"
+	"l0_%=:"
+		/* Path 2: also write arg6 and call */
+		"*(u64 *)(r11 - 8) = 30;"
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"call subprog_7args;"
+	"l1_%=:"
+		"r0 = 0;"
+		"exit;"
+		:: __imm(bpf_get_prandom_u32)
+		: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: write beyond max outgoing depth")
+__failure
+__msg("stack arg write offset -80 exceeds max 7 stack args")
+__naked void stack_arg_write_beyond_max(void)
+{
+	asm volatile (
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		/* Write to offset -80, way beyond any callee's needs */
+		"*(u64 *)(r11 - 80) = 99;"
+		"*(u64 *)(r11 - 16) = 20;"
+		"*(u64 *)(r11 - 8) = 10;"
+		"call subprog_7args;"
+		"r0 = 0;"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: write unused stack arg slot")
+__failure
+__msg("func#0 writes 5 stack arg slots, but calls only require 2")
+__naked void stack_arg_write_unused_slot(void)
+{
+	asm volatile (
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		/* Write to offset -40, unused for the callee */
+		"*(u64 *)(r11 - 40) = 99;"
+		"*(u64 *)(r11 - 16) = 20;"
+		"*(u64 *)(r11 - 8) = 10;"
+		"call subprog_7args;"
+		"r0 = 0;"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: sequential calls reuse slots")
+__failure
+__msg("callee expects 7 args, stack arg1 is not initialized")
+__naked void stack_arg_sequential_calls(void)
+{
+	asm volatile (
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"*(u64 *)(r11 - 8) = 6;"
+		"*(u64 *)(r11 - 16) = 7;"
+		"call subprog_7args;"
+		"r6 = r0;"
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"call subprog_7args;"
+		"r0 += r6;"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+#else
+
+SEC("socket")
+__description("stack_arg is not supported by compiler or jit, use a dummy test")
+__success
+int dummy_test(void)
+{
+	return 0;
+}
+
+#endif
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c
new file mode 100644
index 000000000000..c9fe4857da3f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \
+	defined(__BPF_FEATURE_STACK_ARGUMENT)
+
+__noinline __used __naked
+static int subprog_bad_order_6args(int a, int b, int c, int d, int e, int f)
+{
+	asm volatile (
+		"*(u64 *)(r11 - 8) = r1;"
+		"r0 = *(u64 *)(r11 + 8);"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: r11 load after r11 store")
+__failure
+__msg("r11 load must be before any r11 store or call insn")
+__btf_func_path("btf__verifier_stack_arg_order.bpf.o")
+__naked void stack_arg_load_after_store(void)
+{
+	asm volatile (
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"*(u64 *)(r11 - 8) = 6;"
+		"call subprog_bad_order_6args;"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+__noinline __used __naked
+static int subprog_call_before_load_6args(int a, int b, int c, int d, int e,
+					  int f)
+{
+	asm volatile (
+		"call %[bpf_get_prandom_u32];"
+		"r0 = *(u64 *)(r11 + 8);"
+		"exit;"
+		:: __imm(bpf_get_prandom_u32)
+		: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: r11 load after a call")
+__failure
+__msg("r11 load must be before any r11 store or call insn")
+__btf_func_path("btf__verifier_stack_arg_order.bpf.o")
+__naked void stack_arg_load_after_call(void)
+{
+	asm volatile (
+		"r1 = 1;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"*(u64 *)(r11 - 8) = 6;"
+		"call subprog_call_before_load_6args;"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+__noinline __used __naked
+static int subprog_pruning_call_before_load_6args(int a, int b, int c, int d,
+						  int e, int f)
+{
+	asm volatile (
+		"if r1 s> 0 goto l0_%=;"
+		"goto l1_%=;"
+	"l0_%=:"
+		"call %[bpf_get_prandom_u32];"
+	"l1_%=:"
+		"r0 = *(u64 *)(r11 + 8);"
+		"exit;"
+		:: __imm(bpf_get_prandom_u32)
+		: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: pruning keeps r11 load ordering")
+__failure
+__flag(BPF_F_TEST_STATE_FREQ)
+__msg("r11 load must be before any r11 store or call insn")
+__btf_func_path("btf__verifier_stack_arg_order.bpf.o")
+__naked void stack_arg_pruning_load_after_call(void)
+{
+	asm volatile (
+		"call %[bpf_get_prandom_u32];"
+		"r1 = r0;"
+		"r2 = 2;"
+		"r3 = 3;"
+		"r4 = 4;"
+		"r5 = 5;"
+		"*(u64 *)(r11 - 8) = 6;"
+		"call subprog_pruning_call_before_load_6args;"
+		"exit;"
+		:: __imm(bpf_get_prandom_u32)
+		: __clobber_all
+	);
+}
+
+/*
+ * "bad_ptr": the first arg is 'long *', which is not a recognized pointer
+ * type for static subprogs (not ctx, dynptr, or tagged).  btf_prepare_func_args()
+ * sets arg_cnt = 7 / stack_arg_cnt = 2, then fails with -EINVAL.  The subprog
+ * is marked unreliable but the call still proceeds for static subprogs.
+ */
+__noinline __used __naked
+static void subprog_bad_ptr_7args(long *a, int b, int c, int d, int e, int f, int g)
+{
+	asm volatile (
+		"r0 = *(u64 *)(r11 + 8);"
+		"r1 = *(u64 *)(r11 + 16);"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: read without caller write")
+__failure
+__msg("callee expects 7 args, stack arg1 is not initialized")
+__btf_func_path("btf__verifier_stack_arg_order.bpf.o")
+__naked void stack_arg_read_without_write_1(void)
+{
+	asm volatile (
+		"r1 = 0;"
+		"r2 = 0;"
+		"r3 = 0;"
+		"r4 = 0;"
+		"r5 = 0;"
+		"call subprog_bad_ptr_7args;"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+SEC("tc")
+__description("stack_arg: read with not-initialized caller write")
+__failure
+__msg("R0 !read_ok")
+__btf_func_path("btf__verifier_stack_arg_order.bpf.o")
+__naked void stack_arg_read_without_write_2(void)
+{
+	asm volatile (
+		"r1 = 0;"
+		"r2 = 0;"
+		"r3 = 0;"
+		"r4 = 0;"
+		"r5 = 0;"
+		"*(u64 *)(r11 - 8) = 0;"
+		"*(u64 *)(r11 - 16) = 0;"
+		"call subprog_bad_ptr_7args;"
+		"call subprog_bad_ptr_7args;"
+		"exit;"
+		::: __clobber_all
+	);
+}
+
+#else
+
+SEC("socket")
+__description("stack_arg order is not supported by compiler or jit, use a dummy test")
+__success
+int dummy_test(void)
+{
+	return 0;
+}
+
+#endif
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c
index 31832a306f91..73b5b0cf6706 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subreg.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c
@@ -558,7 +558,8 @@ __description("arsh32 imm sign negative extend check")
 __success __retval(0)
 __log_level(2)
 __msg("3: (17) r6 -= 4095                    ; R6=scalar(smin=smin32=-4095,smax=smax32=0)")
-__msg("4: (67) r6 <<= 32                     ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,umax=0xffffffff00000000,smin32=0,var_off=(0x0; 0xffffffff00000000))")
+__msg("4: (67) r6 <<= 32                     ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,smin32=0,var_off=(0x0; 0xffffffff00000000))")
+/* represents shorter of signed / unsigned 64-bit ranges */
 __msg("5: (c7) r6 s>>= 32                    ; R6=scalar(smin=smin32=-4095,smax=smax32=0)")
 __naked void arsh32_imm_sign_extend_negative_check(void)
 {
@@ -581,7 +582,8 @@ __description("arsh32 imm sign extend check")
 __success __retval(0)
 __log_level(2)
 __msg("3: (17) r6 -= 2047                    ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)")
-__msg("4: (67) r6 <<= 32                     ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))")
+__msg("4: (67) r6 <<= 32                     ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))")
+/* represents shorter of signed / unsigned 64-bit ranges */
 __msg("5: (c7) r6 s>>= 32                    ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)")
 __naked void arsh32_imm_sign_extend_check(void)
 {
diff --git a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c
index 8d60c634a114..48fa34d2959f 100644
--- a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c
+++ b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c
@@ -56,6 +56,7 @@ __jited("L1:	pushq	%rax")			/* rbp[-16] = rax         */
  * (cause original rax might be clobbered by this point)
  */
 __jited("	movq	-0x10(%rbp), %rax")
+__jited("...")
 __jited("	callq	0x{{.*}}")		/* call to sub()          */
 __jited("	xorl	%eax, %eax")
 __jited("	leave")
diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c
index 4b392c6c8fc4..2870738d93f7 100644
--- a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c
+++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c
@@ -13,7 +13,7 @@
 static char buf[PATH_MAX];
 
 SEC("lsm.s/file_open")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(get_task_exe_file_kfunc_null)
 {
 	struct file *acquired;
@@ -28,7 +28,7 @@ int BPF_PROG(get_task_exe_file_kfunc_null)
 }
 
 SEC("lsm.s/inode_getxattr")
-__failure __msg("arg#0 pointer type STRUCT task_struct must point to scalar, or struct with scalar")
+__failure __msg("R1 pointer type STRUCT task_struct must point to scalar, or struct with scalar")
 int BPF_PROG(get_task_exe_file_kfunc_fp)
 {
 	u64 x;
@@ -80,7 +80,7 @@ int BPF_PROG(get_task_exe_file_kfunc_unreleased)
 }
 
 SEC("lsm.s/file_open")
-__failure __msg("release kernel function bpf_put_file expects")
+__failure __msg("release kfunc bpf_put_file expects referenced PTR_TO_BTF_ID passed to R1")
 int BPF_PROG(put_file_kfunc_unacquired, struct file *file)
 {
 	/* Can't release an unacquired pointer. */
@@ -89,7 +89,7 @@ int BPF_PROG(put_file_kfunc_unacquired, struct file *file)
 }
 
 SEC("lsm.s/file_open")
-__failure __msg("Possibly NULL pointer passed to trusted arg0")
+__failure __msg("Possibly NULL pointer passed to trusted R1")
 int BPF_PROG(path_d_path_kfunc_null)
 {
 	/* Can't pass NULL value to bpf_path_d_path() kfunc. */
@@ -128,7 +128,7 @@ int BPF_PROG(path_d_path_kfunc_untrusted_from_current)
 }
 
 SEC("lsm.s/file_open")
-__failure __msg("kernel function bpf_path_d_path args#0 expected pointer to STRUCT path but R1 has a pointer to STRUCT file")
+__failure __msg("kernel function bpf_path_d_path R1 expected pointer to STRUCT path but R1 has a pointer to STRUCT file")
 int BPF_PROG(path_d_path_kfunc_type_mismatch, struct file *file)
 {
 	bpf_path_d_path((struct path *)&file->f_task_work, buf, sizeof(buf));
diff --git a/tools/testing/selftests/bpf/progs/wakeup_source.h b/tools/testing/selftests/bpf/progs/wakeup_source.h
new file mode 100644
index 000000000000..cd74de92c82f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/wakeup_source.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2026 Google LLC */
+
+#ifndef __WAKEUP_SOURCE_H__
+#define __WAKEUP_SOURCE_H__
+
+#define WAKEUP_NAME_LEN 128
+
+struct wakeup_event_t {
+	unsigned long active_count;
+	long long active_time_ns;
+	unsigned long event_count;
+	unsigned long expire_count;
+	long long last_time_ns;
+	long long max_time_ns;
+	long long prevent_sleep_time_ns;
+	long long total_time_ns;
+	unsigned long wakeup_count;
+	char name[WAKEUP_NAME_LEN];
+};
+
+#endif /* __WAKEUP_SOURCE_H__ */
diff --git a/tools/testing/selftests/bpf/progs/wakeup_source_fail.c b/tools/testing/selftests/bpf/progs/wakeup_source_fail.c
new file mode 100644
index 000000000000..d4d0f1610853
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/wakeup_source_fail.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2026 Google LLC */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+struct bpf_ws_lock;
+
+struct bpf_ws_lock *bpf_wakeup_sources_read_lock(void) __ksym;
+void bpf_wakeup_sources_read_unlock(struct bpf_ws_lock *lock) __ksym;
+void *bpf_wakeup_sources_get_head(void) __ksym;
+
+SEC("syscall")
+__failure __msg("BPF_EXIT instruction in main prog would lead to reference leak")
+int wakeup_source_lock_no_unlock(void *ctx)
+{
+	struct bpf_ws_lock *lock;
+
+	lock = bpf_wakeup_sources_read_lock();
+	if (!lock)
+		return 0;
+
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("access beyond struct")
+int wakeup_source_access_lock_fields(void *ctx)
+{
+	struct bpf_ws_lock *lock;
+	int val;
+
+	lock = bpf_wakeup_sources_read_lock();
+	if (!lock)
+		return 0;
+
+	val = *(int *)lock;
+
+	bpf_wakeup_sources_read_unlock(lock);
+	return val;
+}
+
+SEC("syscall")
+__failure __msg("release kfunc bpf_wakeup_sources_read_unlock expects referenced PTR_TO_BTF_ID passed to R1")
+int wakeup_source_unlock_no_lock(void *ctx)
+{
+	struct bpf_ws_lock *lock = (void *)0x1;
+
+	bpf_wakeup_sources_read_unlock(lock);
+
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("Possibly NULL pointer passed to trusted")
+int wakeup_source_unlock_null(void *ctx)
+{
+	bpf_wakeup_sources_read_unlock(NULL);
+
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("R0 invalid mem access 'scalar'")
+int wakeup_source_unsafe_dereference(void *ctx)
+{
+	struct list_head *head = bpf_wakeup_sources_get_head();
+
+	if (head->next)
+		return 1;
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c
index 3767f5595bbc..32dc8827e128 100644
--- a/tools/testing/selftests/bpf/progs/wq_failures.c
+++ b/tools/testing/selftests/bpf/progs/wq_failures.c
@@ -98,7 +98,7 @@ __failure
  * is a correct bpf_wq pointer.
  */
 __msg(": (85) call bpf_wq_set_callback#") /* anchor message */
-__msg("arg#0 doesn't point to a map value")
+__msg("R1 doesn't point to a map value")
 long test_wrong_wq_pointer(void *ctx)
 {
 	int key = 0;
diff --git a/tools/testing/selftests/bpf/progs/xdp_flowtable.c b/tools/testing/selftests/bpf/progs/xdp_flowtable.c
index 7fdc7b23ee74..e67daa02749d 100644
--- a/tools/testing/selftests/bpf/progs/xdp_flowtable.c
+++ b/tools/testing/selftests/bpf/progs/xdp_flowtable.c
@@ -15,7 +15,10 @@ struct bpf_flowtable_opts___local {
 	s32 error;
 };
 
-struct flow_offload_tuple_rhash *
+struct flow_offload_tuple_rhash___local {
+};
+
+struct flow_offload_tuple_rhash___local *
 bpf_xdp_flow_lookup(struct xdp_md *, struct bpf_fib_lookup *,
 		    struct bpf_flowtable_opts___local *, u32) __ksym;
 
@@ -67,7 +70,7 @@ int xdp_flowtable_do_lookup(struct xdp_md *ctx)
 {
 	void *data_end = (void *)(long)ctx->data_end;
 	struct bpf_flowtable_opts___local opts = {};
-	struct flow_offload_tuple_rhash *tuplehash;
+	struct flow_offload_tuple_rhash___local *tuplehash;
 	struct bpf_fib_lookup tuple = {
 		.ifindex = ctx->ingress_ifindex,
 	};
diff --git a/tools/testing/selftests/bpf/progs/xdp_lb_bench.c b/tools/testing/selftests/bpf/progs/xdp_lb_bench.c
new file mode 100644
index 000000000000..13777b3dcac8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_lb_bench.c
@@ -0,0 +1,647 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "bpf_compiler.h"
+#include "xdp_lb_bench_common.h"
+#include "bench_bpf_timing.bpf.h"
+
+#ifndef IPPROTO_FRAGMENT
+#define IPPROTO_FRAGMENT 44
+#endif
+
+/* jhash helpers; get_packet_dst() uses them to pick a ch_rings slot */
+
+static inline __u32 rol32(__u32 word, unsigned int shift)
+{
+	return (word >> ((-shift) & 31)) | (word << shift);
+}
+
+#define __jhash_mix(a, b, c)			\
+{	/* mixes a, b, c in place */		\
+	a -= c;  a ^= rol32(c, 4);  c += b;	\
+	b -= a;  b ^= rol32(a, 6);  a += c;	\
+	c -= b;  c ^= rol32(b, 8);  b += a;	\
+	a -= c;  a ^= rol32(c, 16); c += b;	\
+	b -= a;  b ^= rol32(a, 19); a += c;	\
+	c -= b;  c ^= rol32(b, 4);  b += a;	\
+}
+
+#define __jhash_final(a, b, c)			\
+{	/* final round; callers return c */	\
+	c ^= b; c -= rol32(b, 14);		\
+	a ^= c; a -= rol32(c, 11);		\
+	b ^= a; b -= rol32(a, 25);		\
+	c ^= b; c -= rol32(b, 16);		\
+	a ^= c; a -= rol32(c, 4);		\
+	b ^= a; b -= rol32(a, 14);		\
+	c ^= b; c -= rol32(b, 24);		\
+}
+
+#define JHASH_INITVAL 0xdeadbeef	/* added to the seed by both wrappers below */
+
+static inline __u32 __jhash_nwords(__u32 a, __u32 b, __u32 c, __u32 initval)
+{
+	__u32 x = a + initval, y = b + initval, z = c + initval;
+
+	/* No mix round for so few words: only the final round is applied. */
+	__jhash_final(x, y, z);
+	return z;
+}
+
+static inline __u32 jhash_2words(__u32 a, __u32 b, __u32 initval)
+{
+	return __jhash_nwords(a, b, 0, JHASH_INITVAL + (2 << 2) + initval);
+}
+
+static inline __u32 jhash2_4words(const __u32 *k, __u32 initval)
+{
+	const __u32 seed = JHASH_INITVAL + (4 << 2) + initval;
+	__u32 a = seed + k[0], b = seed + k[1], c = seed + k[2];
+
+	/* The first three words go through a full mix round ... */
+	__jhash_mix(a, b, c);
+
+	/* ... the fourth is folded in just before the final round. */
+	a += k[3];
+	__jhash_final(a, b, c);
+
+	return c;
+}
+
+static __always_inline void ipv4_csum(struct iphdr *iph)
+{
+	__u16 *next_iph = (__u16 *)iph;
+	__u32 csum = 0;
+	int i;
+
+	__pragma_loop_unroll_full
+	for (i = 0; i < (int)(sizeof(*iph) >> 1); i++)
+		csum += *next_iph++;
+	/* Fold carries back into 16 bits; the second pass absorbs a carry from the first. */
+	csum = (csum & 0xffff) + (csum >> 16);
+	csum = (csum & 0xffff) + (csum >> 16);
+	iph->check = ~csum; /* ->check is part of the sum above, so it must be 0 on entry */
+}
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 64);
+	__type(key, struct vip_definition);
+	__type(value, struct vip_meta);
+} vip_map SEC(".maps"); /* process_packet(): (dst addr, dst port, proto) -> vip_meta */
+
+struct lru_inner_map {
+	__uint(type, BPF_MAP_TYPE_LRU_HASH);
+	__type(key, struct flow_key);
+	__type(value, struct real_pos_lru);
+	__uint(max_entries, DEFAULT_LRU_SIZE);
+} lru_inner SEC(".maps"); /* flow -> cached real position; inner-map type of lru_mapping */
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+	__type(key, __u32);
+	__type(value, __u32);
+	__uint(max_entries, BENCH_NR_CPUS);
+	__array(values, struct lru_inner_map);
+} lru_mapping SEC(".maps"); /* looked up by CPU id in process_packet() */
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, CH_RINGS_SIZE);
+	__type(key, __u32);
+	__type(value, __u32);
+} ch_rings SEC(".maps"); /* slot CH_RING_SIZE * vip_num + hash % CH_RING_SIZE -> index in reals */
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, MAX_REALS);
+	__type(key, __u32);
+	__type(value, struct real_definition);
+} reals SEC(".maps"); /* tunnel destinations, indexed by real position */
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, STATS_SIZE);
+	__type(key, __u32);
+	__type(value, struct lb_stats);
+} stats SEC(".maps"); /* shared counters: STATS_* slots plus one slot per vip_num */
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, MAX_REALS);
+	__type(key, __u32);
+	__type(value, struct lb_stats);
+} reals_stats SEC(".maps"); /* per-real counters, indexed like reals */
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, struct ctl_value);
+} ctl_array SEC(".maps"); /* slot 0 supplies the outer destination MAC */
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, struct vip_definition);
+} vip_miss_stats SEC(".maps"); /* slot 0: the one VIP whose LRU misses are counted */
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, MAX_REALS);
+	__type(key, __u32);
+	__type(value, __u32);
+} lru_miss_stats SEC(".maps"); /* per-real miss count for the VIP in vip_miss_stats */
+
+volatile __u32 flow_mask;	/* non-zero: source address := random & flow_mask */
+volatile __u32 cold_lru;	/* non-zero: XOR a per-batch hash into the source address */
+__u32 batch_gen;		/* bumped once per xdp_lb_bench() run while cold_lru is set */
+
+/*
+ * After bpf_xdp_adjust_head() the old Ethernet header lies inside the new outer IP header,
+ * so everything needed from old_eth MUST be read BEFORE the outer header is written.
+ */
+static __always_inline int encap_v4(struct xdp_md *xdp, __be32 saddr, __be32 daddr,
+				    __u16 payload_len, const __u8 *dst_mac)
+{
+	struct ethhdr *new_eth, *old_eth;
+	void *data, *data_end;
+	struct iphdr *iph;
+
+	if (bpf_xdp_adjust_head(xdp, -(int)sizeof(struct iphdr)))
+		return -1;
+	/* adjust_head invalidates packet pointers: reload them and redo the bounds checks. */
+	data     = (void *)(long)xdp->data;
+	data_end = (void *)(long)xdp->data_end;
+
+	new_eth = data;
+	iph     = data + sizeof(struct ethhdr);
+	old_eth = data + sizeof(struct iphdr);
+
+	if (new_eth + 1 > data_end || old_eth + 1 > data_end || iph + 1 > data_end)
+		return -1;
+	/* New source MAC is the old destination MAC; copy it before iph is written. */
+	__builtin_memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source));
+	__builtin_memcpy(new_eth->h_dest, dst_mac, sizeof(new_eth->h_dest));
+	new_eth->h_proto = bpf_htons(ETH_P_IP);
+
+	__builtin_memset(iph, 0, sizeof(*iph));
+	iph->version  = 4;
+	iph->ihl      = sizeof(*iph) >> 2;
+	iph->protocol = IPPROTO_IPIP;
+	iph->tot_len  = bpf_htons(payload_len + sizeof(*iph));
+	iph->ttl      = 64;
+	iph->saddr    = saddr;
+	iph->daddr    = daddr;
+	ipv4_csum(iph);
+
+	return 0;
+}
+
+static __always_inline int encap_v6(struct xdp_md *xdp, const __be32 saddr[4],
+				    const __be32 daddr[4], __u8 nexthdr, __u16 payload_len,
+				    const __u8 *dst_mac)
+{
+	struct ethhdr *new_eth, *old_eth;
+	void *data, *data_end;
+	struct ipv6hdr *ip6h;
+
+	if (bpf_xdp_adjust_head(xdp, -(int)sizeof(struct ipv6hdr)))
+		return -1;
+	/* adjust_head invalidates packet pointers: reload them and redo the bounds checks. */
+	data     = (void *)(long)xdp->data;
+	data_end = (void *)(long)xdp->data_end;
+
+	new_eth = data;
+	ip6h    = data + sizeof(struct ethhdr);
+	old_eth = data + sizeof(struct ipv6hdr);
+
+	if (new_eth + 1 > data_end || old_eth + 1 > data_end || ip6h + 1 > data_end)
+		return -1;
+	/* New source MAC is the old destination MAC; copy it before ip6h is written. */
+	__builtin_memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source));
+	__builtin_memcpy(new_eth->h_dest, dst_mac, sizeof(new_eth->h_dest));
+	new_eth->h_proto = bpf_htons(ETH_P_IPV6);
+	/* Zero the whole header first: fields not assigned below stay 0. */
+	__builtin_memset(ip6h, 0, sizeof(*ip6h));
+	ip6h->version     = 6;
+	ip6h->nexthdr     = nexthdr;
+	ip6h->payload_len = bpf_htons(payload_len);
+	ip6h->hop_limit   = 64;
+	__builtin_memcpy(&ip6h->saddr, saddr, sizeof(ip6h->saddr));
+	__builtin_memcpy(&ip6h->daddr, daddr, sizeof(ip6h->daddr));
+
+	return 0;
+}
+
+static __always_inline void update_stats(void *map, __u32 key, __u16 bytes)
+{
+	struct lb_stats *entry = bpf_map_lookup_elem(map, &key);
+
+	if (!entry)
+		return;
+	entry->v1++;
+	entry->v2 += bytes;
+}
+
+static __always_inline void count_action(int action)
+{
+	__u32 slot = STATS_XDP_DROP;	/* anything but TX/PASS counts as a drop */
+	struct lb_stats *counter;
+
+	if (action == XDP_PASS)
+		slot = STATS_XDP_PASS;
+	else if (action == XDP_TX)
+		slot = STATS_XDP_TX;
+
+	counter = bpf_map_lookup_elem(&stats, &slot);
+	if (!counter)
+		return;
+
+	counter->v1++;
+}
+
+static __always_inline bool is_under_flood(void)
+{
+	__u32 slot = STATS_NEW_CONN;
+	struct lb_stats *rate = bpf_map_lookup_elem(&stats, &slot);
+	__u64 now;
+
+	if (!rate)
+		return true;
+
+	now = bpf_ktime_get_ns();
+	if (now - rate->v2 > ONE_SEC) {
+		/* Window expired: restart it with this connection as the first. */
+		rate->v1 = 1;
+		rate->v2 = now;
+		return false;
+	}
+
+	rate->v1 += 1;
+	return rate->v1 > MAX_CONN_RATE;
+}
+
+/* Cached real for @flow, or NULL on miss, UDP expiry or a failed reals lookup. */
+static __always_inline struct real_definition *connection_table_lookup(void *lru_map,
+								       struct flow_key *flow,
+								       __u32 *out_pos)
+{
+	struct real_pos_lru *entry = bpf_map_lookup_elem(lru_map, flow);
+	__u32 pos;
+
+	if (!entry)
+		return NULL;
+
+	/* UDP has no FIN/RST, so an entry idle for longer than LRU_UDP_TIMEOUT is a miss. */
+	if (flow->proto == IPPROTO_UDP) {
+		__u64 now = bpf_ktime_get_ns();
+
+		if (now - entry->atime > LRU_UDP_TIMEOUT)
+			return NULL;
+		entry->atime = now;
+	}
+
+	/* *out_pos is stored before the reals lookup, so it is set even if that fails. */
+	pos = entry->pos;
+	*out_pos = pos;
+
+	return bpf_map_lookup_elem(&reals, &pos);
+}
+
+static __always_inline bool get_packet_dst(struct real_definition **real, struct flow_key *flow,
+					   struct vip_meta *vip_info, bool is_v6, void *lru_map,
+					   bool is_rst, __u32 *out_pos)
+{
+	/* Sampled first: is_under_flood() also advances the new-connection counter. */
+	bool flooded = is_under_flood();
+	__u32 hash, slot, pos;
+	__u32 *ring_val;
+
+	if (!is_v6) {
+		hash = jhash_2words(flow->src, flow->ports, CH_RING_SIZE);
+	} else {
+		/* Fold the 128-bit source down to one word before mixing in the ports. */
+		__u32 folded = jhash2_4words((__u32 *)flow->srcv6, MAX_VIPS);
+
+		hash = jhash_2words(folded, flow->ports, CH_RING_SIZE);
+	}
+
+	slot = CH_RING_SIZE * vip_info->vip_num + hash % CH_RING_SIZE;
+	ring_val = bpf_map_lookup_elem(&ch_rings, &slot);
+	if (!ring_val)
+		return false;
+	pos = *ring_val;
+
+	*real = bpf_map_lookup_elem(&reals, &pos);
+	if (!*real)
+		return false;
+
+	/* Cache the pick unless the VIP bypasses the LRU, a flood is on, or this is a RST. */
+	if (!flooded && !is_rst && !(vip_info->flags & F_LRU_BYPASS)) {
+		struct real_pos_lru cached = { .pos = pos };
+
+		if (flow->proto == IPPROTO_UDP)
+			cached.atime = bpf_ktime_get_ns();
+		bpf_map_update_elem(lru_map, flow, &cached, BPF_ANY);
+	}
+
+	*out_pos = pos;
+	return true;
+}
+
+static __always_inline void update_vip_lru_miss_stats(struct vip_definition *vip, bool is_v6,
+						      __u32 real_idx)
+{
+	struct vip_definition *tracked;
+	__u32 zero = 0;
+	__u32 *misses;
+
+	tracked = bpf_map_lookup_elem(&vip_miss_stats, &zero);
+	if (!tracked)
+		return;
+
+	/* Only the single VIP stored in vip_miss_stats[0] is accounted. */
+	if (!is_v6) {
+		if (tracked->vip != vip->vip)
+			return;
+	} else if (tracked->vipv6[0] != vip->vipv6[0] || tracked->vipv6[1] != vip->vipv6[1] ||
+		   tracked->vipv6[2] != vip->vipv6[2] || tracked->vipv6[3] != vip->vipv6[3]) {
+		return;
+	}
+	if (tracked->port != vip->port || tracked->proto != vip->proto)
+		return;
+
+	misses = bpf_map_lookup_elem(&lru_miss_stats, &real_idx);
+	if (!misses)
+		return;
+	*misses += 1;
+}
+
+static __noinline int process_packet(struct xdp_md *xdp)
+{
+	void *data     = (void *)(long)xdp->data;
+	void *data_end = (void *)(long)xdp->data_end;
+	struct ethhdr *eth = data;
+	struct real_definition *dst = NULL;
+	struct vip_definition vip_def = {};
+	struct ctl_value *cval;
+	struct flow_key flow = {};
+	struct vip_meta *vip_info;
+	struct lb_stats *data_stats;
+	struct udphdr *uh;
+	__be32 tnl_src[4];
+	void *lru_map;
+	void *l4;
+	__u16 payload_len;
+	__u32 real_pos = 0, cpu_num, key;
+	__u8 proto;
+	int action = XDP_DROP; /* verdict for every bare "goto out" */
+	bool is_v6, is_syn = false, is_rst = false;
+
+	if (eth + 1 > data_end)
+		goto out;
+
+	if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
+		is_v6 = true;
+	} else if (eth->h_proto == bpf_htons(ETH_P_IP)) {
+		is_v6 = false;
+	} else {
+		action = XDP_PASS;
+		goto out;
+	}
+
+	if (is_v6) {
+		struct ipv6hdr *ip6h = (void *)(eth + 1);
+
+		if (ip6h + 1 > data_end)
+			goto out;
+		if (ip6h->nexthdr == IPPROTO_FRAGMENT)
+			goto out;
+
+		payload_len = sizeof(struct ipv6hdr) + bpf_ntohs(ip6h->payload_len);
+		proto = ip6h->nexthdr;
+
+		__builtin_memcpy(flow.srcv6, &ip6h->saddr, sizeof(flow.srcv6));
+		__builtin_memcpy(flow.dstv6, &ip6h->daddr, sizeof(flow.dstv6));
+		__builtin_memcpy(vip_def.vipv6, &ip6h->daddr, sizeof(vip_def.vipv6));
+		l4 = (void *)(ip6h + 1);
+	} else {
+		struct iphdr *iph = (void *)(eth + 1);
+
+		if (iph + 1 > data_end)
+			goto out;
+		if (iph->ihl != 5)
+			goto out;
+		if (iph->frag_off & bpf_htons(PCKT_FRAGMENTED))
+			goto out;
+
+		payload_len = bpf_ntohs(iph->tot_len);
+		proto = iph->protocol;
+
+		flow.src    = iph->saddr;
+		flow.dst    = iph->daddr;
+		vip_def.vip = iph->daddr;
+		l4 = (void *)(iph + 1);
+	}
+
+	/* TCP and UDP share the same port layout at offset 0 */
+	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
+		action = XDP_PASS;
+		goto out;
+	}
+
+	uh = l4;
+	if ((void *)(uh + 1) > data_end)
+		goto out;
+	flow.port16[0] = uh->source;
+	flow.port16[1] = uh->dest;
+
+	if (proto == IPPROTO_TCP) {
+		struct tcphdr *th = l4;
+
+		if ((void *)(th + 1) > data_end)
+			goto out;
+		is_syn = th->syn;
+		is_rst = th->rst;
+	}
+
+	flow.proto    = proto;
+	vip_def.port  = flow.port16[1];
+	vip_def.proto = proto;
+
+	vip_info = bpf_map_lookup_elem(&vip_map, &vip_def);
+	if (!vip_info) {
+		action = XDP_PASS;
+		goto out;
+	}
+	/* STATS_LRU: v1 = packets matching a VIP, v2 = those resolved via get_packet_dst(). */
+	key = STATS_LRU;
+	data_stats = bpf_map_lookup_elem(&stats, &key);
+	if (!data_stats)
+		goto out;
+	data_stats->v1 += 1;
+	/* One LRU map per CPU, selected by the current CPU id. */
+	cpu_num = bpf_get_smp_processor_id();
+	lru_map = bpf_map_lookup_elem(&lru_mapping, &cpu_num);
+	if (!lru_map)
+		goto out;
+	/* SYNs and F_LRU_BYPASS VIPs skip the connection table. */
+	if (!(vip_info->flags & F_LRU_BYPASS) && !is_syn)
+		dst = connection_table_lookup(lru_map, &flow, &real_pos);
+
+	if (!dst) {
+		if (flow.proto == IPPROTO_TCP) {
+			struct lb_stats *miss_st;
+
+			key = STATS_LRU_MISS;
+			miss_st = bpf_map_lookup_elem(&stats, &key);
+			if (miss_st)
+				miss_st->v1 += 1;
+		}
+
+		if (!get_packet_dst(&dst, &flow, vip_info, is_v6, lru_map, is_rst, &real_pos))
+			goto out;
+
+		update_vip_lru_miss_stats(&vip_def, is_v6, real_pos);
+		data_stats->v2 += 1;
+	}
+	/* ctl_array[0].mac becomes the outer destination MAC. */
+	key = 0;
+	cval = bpf_map_lookup_elem(&ctl_array, &key);
+	if (!cval)
+		goto out;
+
+	update_stats(&stats, vip_info->vip_num, payload_len);
+	update_stats(&reals_stats, real_pos, payload_len);
+	/* Outer header is IPv6 for v6 packets and for F_IPV6 reals, IPv4 otherwise. */
+	if (is_v6) {
+		create_encap_ipv6_src(flow.port16[0], flow.srcv6[0], tnl_src);
+		if (encap_v6(xdp, tnl_src, dst->dstv6, IPPROTO_IPV6, payload_len, cval->mac))
+			goto out;
+	} else if (dst->flags & F_IPV6) {
+		create_encap_ipv6_src(flow.port16[0], flow.src, tnl_src);
+		if (encap_v6(xdp, tnl_src, dst->dstv6, IPPROTO_IPIP, payload_len, cval->mac))
+			goto out;
+	} else {
+		if (encap_v4(xdp, create_encap_ipv4_src(flow.port16[0], flow.src), dst->dst,
+			     payload_len, cval->mac))
+			goto out;
+	}
+
+	action = XDP_TX;
+
+out:
+	count_action(action);
+	return action;
+}
+
+static __always_inline int strip_encap(struct xdp_md *xdp, const struct ethhdr *saved_eth)
+{
+	void *data = (void *)(long)xdp->data;
+	void *data_end = (void *)(long)xdp->data_end;
+	struct ethhdr *eth = data;
+	int hdr_sz;
+
+	if (eth + 1 > data_end)
+		return -1;
+	/* The current EtherType tells which outer IP header (v6 or v4) has to be removed. */
+	hdr_sz = (eth->h_proto == bpf_htons(ETH_P_IPV6)) ? (int)sizeof(struct ipv6hdr)
+							 : (int)sizeof(struct iphdr);
+
+	if (bpf_xdp_adjust_head(xdp, hdr_sz))
+		return -1;
+	/* Packet pointers are stale after adjust_head: reload and re-check. */
+	data     = (void *)(long)xdp->data;
+	data_end = (void *)(long)xdp->data_end;
+	eth      = data;
+
+	if (eth + 1 > data_end)
+		return -1;
+	/* Overwrite whatever now sits at the front with the caller's saved Ethernet header. */
+	__builtin_memcpy(eth, saved_eth, sizeof(*saved_eth));
+	return 0;
+}
+
+static __always_inline void randomize_src(struct xdp_md *xdp, int saddr_off, __u32 *rand_state)
+{
+	void *data     = (void *)(long)xdp->data;
+	void *data_end = (void *)(long)xdp->data_end;
+	__u32 *saddr   = data + saddr_off;
+
+	*rand_state ^= *rand_state << 13;	/* xorshift32 step: shifts 13, 17, 5 */
+	*rand_state ^= *rand_state >> 17;
+	*rand_state ^= *rand_state << 5;
+	/* The state advances even when the bounds check below skips the store. */
+	if ((void *)(saddr + 1) <= data_end)
+		*saddr = *rand_state & flow_mask;
+}
+
+SEC("xdp")
+int xdp_lb_bench(struct xdp_md *xdp)
+{
+	void *data     = (void *)(long)xdp->data;
+	void *data_end = (void *)(long)xdp->data_end;
+	struct ethhdr *eth = data;
+	struct ethhdr saved_eth;
+	__u32 rand_state = 0;
+	__u32 batch_hash = 0;
+	int saddr_off = 0;
+	bool is_v6;
+
+	if (eth + 1 > data_end)
+		return XDP_DROP;
+	/* Snapshot the Ethernet header; strip_encap() writes it back after an XDP_TX result. */
+	__builtin_memcpy(&saved_eth, eth, sizeof(saved_eth));
+
+	is_v6 = (saved_eth.h_proto == bpf_htons(ETH_P_IPV6));
+
+	saddr_off = sizeof(struct ethhdr) + (is_v6 ? offsetof(struct ipv6hdr, saddr) :
+					     offsetof(struct iphdr, saddr));
+	/* "| 1" keeps the state non-zero: a zero rand_state means randomization is off. */
+	if (flow_mask)
+		rand_state = bpf_get_prandom_u32() | 1;
+	/* cold_lru: XOR a per-batch hash into the first 32 bits of the source address. */
+	if (cold_lru) {
+		__u32 *saddr = data + saddr_off;
+
+		batch_gen++;
+		batch_hash = (batch_gen + bpf_get_smp_processor_id()) * KNUTH_HASH_MULT;
+		if ((void *)(saddr + 1) <= data_end)
+			*saddr ^= batch_hash;
+	}
+	/* NOTE(review): BENCH_BPF_LOOP is defined elsewhere; semantics not visible here. */
+	return BENCH_BPF_LOOP(
+		process_packet(xdp),
+		({
+			if (__bench_result == XDP_TX) {
+				if (strip_encap(xdp, &saved_eth))
+					return XDP_DROP;
+				if (rand_state)
+					randomize_src(xdp, saddr_off, &rand_state);
+			}
+			if (cold_lru) {
+				void *d = (void *)(long)xdp->data;
+				void *de = (void *)(long)xdp->data_end;
+				__u32 *__sa = d + saddr_off;
+
+				if ((void *)(__sa + 1) <= de)
+					*__sa ^= batch_hash;
+			}
+		})
+	);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/xdping_kern.c b/tools/testing/selftests/bpf/progs/xdping_kern.c
deleted file mode 100644
index 44e2b0ef23ae..000000000000
--- a/tools/testing/selftests/bpf/progs/xdping_kern.c
+++ /dev/null
@@ -1,183 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
-
-#define KBUILD_MODNAME "foo"
-#include <stddef.h>
-#include <string.h>
-#include <linux/bpf.h>
-#include <linux/icmp.h>
-#include <linux/in.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/if_vlan.h>
-#include <linux/ip.h>
-
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_endian.h>
-
-#include "bpf_compiler.h"
-#include "xdping.h"
-
-struct {
-	__uint(type, BPF_MAP_TYPE_HASH);
-	__uint(max_entries, 256);
-	__type(key, __u32);
-	__type(value, struct pinginfo);
-} ping_map SEC(".maps");
-
-static __always_inline void swap_src_dst_mac(void *data)
-{
-	unsigned short *p = data;
-	unsigned short dst[3];
-
-	dst[0] = p[0];
-	dst[1] = p[1];
-	dst[2] = p[2];
-	p[0] = p[3];
-	p[1] = p[4];
-	p[2] = p[5];
-	p[3] = dst[0];
-	p[4] = dst[1];
-	p[5] = dst[2];
-}
-
-static __always_inline __u16 csum_fold_helper(__wsum sum)
-{
-	sum = (sum & 0xffff) + (sum >> 16);
-	return ~((sum & 0xffff) + (sum >> 16));
-}
-
-static __always_inline __u16 ipv4_csum(void *data_start, int data_size)
-{
-	__wsum sum;
-
-	sum = bpf_csum_diff(0, 0, data_start, data_size, 0);
-	return csum_fold_helper(sum);
-}
-
-#define ICMP_ECHO_LEN		64
-
-static __always_inline int icmp_check(struct xdp_md *ctx, int type)
-{
-	void *data_end = (void *)(long)ctx->data_end;
-	void *data = (void *)(long)ctx->data;
-	struct ethhdr *eth = data;
-	struct icmphdr *icmph;
-	struct iphdr *iph;
-
-	if (data + sizeof(*eth) + sizeof(*iph) + ICMP_ECHO_LEN > data_end)
-		return XDP_PASS;
-
-	if (eth->h_proto != bpf_htons(ETH_P_IP))
-		return XDP_PASS;
-
-	iph = data + sizeof(*eth);
-
-	if (iph->protocol != IPPROTO_ICMP)
-		return XDP_PASS;
-
-	if (bpf_ntohs(iph->tot_len) - sizeof(*iph) != ICMP_ECHO_LEN)
-		return XDP_PASS;
-
-	icmph = data + sizeof(*eth) + sizeof(*iph);
-
-	if (icmph->type != type)
-		return XDP_PASS;
-
-	return XDP_TX;
-}
-
-SEC("xdp")
-int xdping_client(struct xdp_md *ctx)
-{
-	void *data = (void *)(long)ctx->data;
-	struct pinginfo *pinginfo = NULL;
-	struct ethhdr *eth = data;
-	struct icmphdr *icmph;
-	struct iphdr *iph;
-	__u64 recvtime;
-	__be32 raddr;
-	__be16 seq;
-	int ret;
-	__u8 i;
-
-	ret = icmp_check(ctx, ICMP_ECHOREPLY);
-
-	if (ret != XDP_TX)
-		return ret;
-
-	iph = data + sizeof(*eth);
-	icmph = data + sizeof(*eth) + sizeof(*iph);
-	raddr = iph->saddr;
-
-	/* Record time reply received. */
-	recvtime = bpf_ktime_get_ns();
-	pinginfo = bpf_map_lookup_elem(&ping_map, &raddr);
-	if (!pinginfo || pinginfo->seq != icmph->un.echo.sequence)
-		return XDP_PASS;
-
-	if (pinginfo->start) {
-		__pragma_loop_unroll_full
-		for (i = 0; i < XDPING_MAX_COUNT; i++) {
-			if (pinginfo->times[i] == 0)
-				break;
-		}
-		/* verifier is fussy here... */
-		if (i < XDPING_MAX_COUNT) {
-			pinginfo->times[i] = recvtime -
-					     pinginfo->start;
-			pinginfo->start = 0;
-			i++;
-		}
-		/* No more space for values? */
-		if (i == pinginfo->count || i == XDPING_MAX_COUNT)
-			return XDP_PASS;
-	}
-
-	/* Now convert reply back into echo request. */
-	swap_src_dst_mac(data);
-	iph->saddr = iph->daddr;
-	iph->daddr = raddr;
-	icmph->type = ICMP_ECHO;
-	seq = bpf_htons(bpf_ntohs(icmph->un.echo.sequence) + 1);
-	icmph->un.echo.sequence = seq;
-	icmph->checksum = 0;
-	icmph->checksum = ipv4_csum(icmph, ICMP_ECHO_LEN);
-
-	pinginfo->seq = seq;
-	pinginfo->start = bpf_ktime_get_ns();
-
-	return XDP_TX;
-}
-
-SEC("xdp")
-int xdping_server(struct xdp_md *ctx)
-{
-	void *data = (void *)(long)ctx->data;
-	struct ethhdr *eth = data;
-	struct icmphdr *icmph;
-	struct iphdr *iph;
-	__be32 raddr;
-	int ret;
-
-	ret = icmp_check(ctx, ICMP_ECHO);
-
-	if (ret != XDP_TX)
-		return ret;
-
-	iph = data + sizeof(*eth);
-	icmph = data + sizeof(*eth) + sizeof(*iph);
-	raddr = iph->saddr;
-
-	/* Now convert request into echo reply. */
-	swap_src_dst_mac(data);
-	iph->saddr = iph->daddr;
-	iph->daddr = raddr;
-	icmph->type = ICMP_ECHOREPLY;
-	icmph->checksum = 0;
-	icmph->checksum = ipv4_csum(icmph, ICMP_ECHO_LEN);
-
-	return XDP_TX;
-}
-
-char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_kmods/Makefile b/tools/testing/selftests/bpf/test_kmods/Makefile
index 63c4d3f6a12f..031c7454ce65 100644
--- a/tools/testing/selftests/bpf/test_kmods/Makefile
+++ b/tools/testing/selftests/bpf/test_kmods/Makefile
@@ -1,5 +1,16 @@
 TEST_KMOD_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
-KDIR ?= $(abspath $(TEST_KMOD_DIR)/../../../../..)
+SRCTREE_KDIR := $(abspath $(TEST_KMOD_DIR)/../../../../..)
+# Honor O=/KBUILD_OUTPUT only if they point at a prepared kernel build
+# directory (one containing Module.symvers); otherwise treat the value as a
+# selftests-only output directory and fall back to in-tree or distro headers.
+# The parent bpf/Makefile resolves O=/KBUILD_OUTPUT to absolute paths before
+# invoking this sub-make so relative paths still anchor to the user's
+# invocation directory.
+KMOD_O := $(or $(O),$(KBUILD_OUTPUT))
+KMOD_O_VALID := $(if $(KMOD_O),$(if $(wildcard $(KMOD_O)/Module.symvers),$(KMOD_O)))
+KDIR ?= $(if $(KMOD_O_VALID),$(SRCTREE_KDIR), \
+	    $(if $(wildcard $(SRCTREE_KDIR)/Module.symvers),$(SRCTREE_KDIR), \
+		/lib/modules/$(shell uname -r)/build))
 
 ifeq ($(V),1)
 Q =
@@ -14,8 +25,21 @@ $(foreach m,$(MODULES),$(eval obj-m += $(m:.ko=.o)))
 
 CFLAGS_bpf_testmod.o = -I$(src)
 
+# When BPF_STRICT_BUILD != 0, a missing KDIR is fatal (the default).
+# When permissive, skip silently.
+PERMISSIVE := $(filter 0,$(BPF_STRICT_BUILD))
+
 all:
-	$(Q)$(MAKE) -C $(KDIR) M=$(TEST_KMOD_DIR) modules
+ifeq ($(PERMISSIVE),)
+	$(Q)$(MAKE) -C $(KDIR) $(if $(KMOD_O_VALID),O=$(KMOD_O_VALID) KBUILD_OUTPUT=$(KMOD_O_VALID),KBUILD_OUTPUT=) \
+		M=$(TEST_KMOD_DIR) modules
+else ifneq ("$(wildcard $(KDIR))", "")
+	$(Q)$(MAKE) -C $(KDIR) $(if $(KMOD_O_VALID),O=$(KMOD_O_VALID) KBUILD_OUTPUT=$(KMOD_O_VALID),KBUILD_OUTPUT=) \
+		M=$(TEST_KMOD_DIR) modules
+endif
 
 clean:
-	$(Q)$(MAKE) -C $(KDIR) M=$(TEST_KMOD_DIR) clean
+ifneq ("$(wildcard $(KDIR))", "")
+	$(Q)$(MAKE) -C $(KDIR) $(if $(KMOD_O_VALID),O=$(KMOD_O_VALID) KBUILD_OUTPUT=$(KMOD_O_VALID),KBUILD_OUTPUT=) \
+		M=$(TEST_KMOD_DIR) clean
+endif
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index d876314a4d67..30f1cd23093c 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -825,6 +825,76 @@ __bpf_kfunc int bpf_kfunc_call_test5(u8 a, u16 b, u32 c)
 	return 0;
 }
 
+/* Sum of ten scalar arguments (u64 arithmetic, wraps modulo 2^64). */
+__bpf_kfunc u64 bpf_kfunc_call_stack_arg(u64 a, u64 b, u64 c, u64 d, u64 e,
+					 u64 f, u64 g, u64 h, u64 i, u64 j)
+{
+	return (a + b + c + d + e) + (f + g + h + i + j);
+}
+
+/* Nine scalars plus two fields read through the tenth, pointer, argument. */
+__bpf_kfunc u64 bpf_kfunc_call_stack_arg_ptr(u64 a, u64 b, u64 c, u64 d, u64 e,
+					     u64 f, u64 g, u64 h, u64 i, struct prog_test_pass1 *p)
+{
+	return (a + b + c + d + e) + (f + g + h + i) + p->x0 + p->x1;
+}
+
+/* Pointer arguments in 8th and 10th position with a scalar in between. */
+__bpf_kfunc u64 bpf_kfunc_call_stack_arg_mix(u64 a, u64 b, u64 c, u64 d, u64 e,
+					     u64 f, u64 g, struct prog_test_pass1 *p, u64 h,
+					     struct prog_test_pass1 *q)
+{
+	return (a + b + c + d) + (e + f + g + h) + p->x0 + q->x1;
+}
+
+/* Nine scalars plus the low 24 bits of the dynptr's kernel-side size word. */
+__bpf_kfunc u64 bpf_kfunc_call_stack_arg_dynptr(u64 a, u64 b, u64 c, u64 d, u64 e,
+						u64 f, u64 g, u64 h, u64 i, struct bpf_dynptr *ptr)
+{
+	const struct bpf_dynptr_kern *kptr = (void *)ptr;
+
+	return (a + b + c + d + e) + (f + g + h + i) + (kptr->size & 0xFFFFFF);
+}
+
+__bpf_kfunc u64 bpf_kfunc_call_stack_arg_mem(u64 a, u64 b, u64 c, u64 d, u64 e,
+					     void *mem, int mem__sz)
+{
+	const unsigned char *byte = mem;
+	u64 total = a + b + c + d + e;
+	int left;
+
+	for (left = mem__sz; left > 0; left--)
+		total += *byte++;
+	return total;
+}
+
+/* Nine scalars plus ->value of the iterator passed as tenth argument. */
+__bpf_kfunc u64 bpf_kfunc_call_stack_arg_iter(u64 a, u64 b, u64 c, u64 d, u64 e, u64 f, u64 g,
+					      u64 h, u64 i, struct bpf_iter_testmod_seq *it__iter)
+{
+	return (a + b + c + d + e) + (f + g + h + i) + it__iter->value;
+}
+
+/* Sums the nine scalars; the __str argument is accepted but never read. */
+__bpf_kfunc u64 bpf_kfunc_call_stack_arg_const_str(u64 a, u64 b, u64 c, u64 d, u64 e, u64 f,
+						   u64 g, u64 h, u64 i, const char *str__str)
+{
+	return (a + b + c + d + e) + (f + g + h + i);
+}
+
+/* Sums the nine scalars; the bpf_timer argument is accepted but never touched. */
+__bpf_kfunc u64 bpf_kfunc_call_stack_arg_timer(u64 a, u64 b, u64 c, u64 d, u64 e,
+					       u64 f, u64 g, u64 h, u64 i, struct bpf_timer *timer)
+{
+	return (a + b + c + d + e) + (f + g + h + i);
+}
+
+__bpf_kfunc u64 bpf_kfunc_call_stack_arg_big(u64 a, u64 b, u64 c, u64 d, u64 e,
+					     struct prog_test_big_arg s)
+{
+	return (a + b + c) + (d + e) + (s.a + s.b);
+}
+
 static struct prog_test_ref_kfunc prog_test_struct = {
 	.a = 42,
 	.b = 108,
@@ -1288,6 +1358,15 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test2)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test3)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test4)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test5)
+BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg)
+BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_ptr)
+BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_mix)
+BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_dynptr)
+BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_mem)
+BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_iter)
+BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_const_str)
+BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_timer)
+BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_big)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL)
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
index aa0b8d41e71b..c36bb911defa 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
@@ -26,6 +26,8 @@ struct prog_test_ref_kfunc {
 };
 #endif
 
+struct bpf_iter_testmod_seq;
+
 struct prog_test_pass1 {
 	int x0;
 	struct {
@@ -48,6 +50,11 @@ struct prog_test_pass2 {
 	} x;
 };
 
+struct prog_test_big_arg {	/* passed by value to bpf_kfunc_call_stack_arg_big() */
+	__u64 a;
+	__u64 b;
+};
+
 struct prog_test_fail1 {
 	void *p;
 	int x;
@@ -111,6 +118,32 @@ int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym;
 struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym;
 long bpf_kfunc_call_test4(signed char a, short b, int c, long d) __ksym;
 int bpf_kfunc_call_test5(__u8 a, __u16 b, __u32 c) __ksym;
+__u64 bpf_kfunc_call_stack_arg(__u64 a, __u64 b, __u64 c, __u64 d,
+			       __u64 e, __u64 f, __u64 g, __u64 h,
+			       __u64 i, __u64 j) __ksym;
+__u64 bpf_kfunc_call_stack_arg_ptr(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e,
+				   __u64 f, __u64 g, __u64 h, __u64 i,
+				   struct prog_test_pass1 *p) __ksym;
+__u64 bpf_kfunc_call_stack_arg_mix(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e,
+				   __u64 f, __u64 g,
+				   struct prog_test_pass1 *p, __u64 h,
+				   struct prog_test_pass1 *q) __ksym;
+__u64 bpf_kfunc_call_stack_arg_dynptr(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e,
+				      __u64 f, __u64 g, __u64 h, __u64 i,
+				      struct bpf_dynptr *ptr) __ksym;
+__u64 bpf_kfunc_call_stack_arg_mem(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e,
+				   void *mem, int mem__sz) __ksym;
+__u64 bpf_kfunc_call_stack_arg_iter(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e,
+				    __u64 f, __u64 g, __u64 h, __u64 i,
+				    struct bpf_iter_testmod_seq *it__iter) __ksym;
+__u64 bpf_kfunc_call_stack_arg_const_str(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e,
+					 __u64 f, __u64 g, __u64 h, __u64 i,
+					 const char *str__str) __ksym;
+__u64 bpf_kfunc_call_stack_arg_timer(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e,
+				     __u64 f, __u64 g, __u64 h, __u64 i,
+				     struct bpf_timer *timer) __ksym;
+__u64 bpf_kfunc_call_stack_arg_big(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e,
+				   struct prog_test_big_arg s) __ksym;
 
 void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) __ksym;
 void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) __ksym;
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
index 88e4aeab21b7..cd191da20d14 100644
--- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c
+++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
@@ -50,8 +50,8 @@ int main(int argc, char **argv)
 {
 	struct bpf_object *obj;
 	int ret, lircfd, progfd, inputfd;
-	int testir1 = 0x1dead;
-	int testir2 = 0x20101;
+	int testir1 = 0x1ead;
+	int testir2 = 0x2101;
 	u32 prog_ids[10], prog_flags[10], prog_cnt;
 
 	if (argc != 3) {
@@ -125,7 +125,7 @@ int main(int argc, char **argv)
 		}
 
 		if (event.type == EV_MSC && event.code == MSC_SCAN &&
-		    event.value == 0xdead) {
+		    event.value == 0x1ead) {
 			break;
 		}
 	}
diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c
index c4c34cae6102..abdb9e6e3713 100644
--- a/tools/testing/selftests/bpf/test_loader.c
+++ b/tools/testing/selftests/bpf/test_loader.c
@@ -63,6 +63,7 @@ struct test_spec {
 	struct test_subspec priv;
 	struct test_subspec unpriv;
 	const char *btf_custom_path;
+	const char *btf_custom_func_path;
 	int log_level;
 	int prog_flags;
 	int mode_mask;
@@ -93,7 +94,7 @@ void test_loader_fini(struct test_loader *tester)
 	free(tester->log_buf);
 }
 
-static void free_msgs(struct expected_msgs *msgs)
+void free_msgs(struct expected_msgs *msgs)
 {
 	int i;
 
@@ -590,6 +591,8 @@ static int parse_test_spec(struct test_loader *tester,
 			jit_on_next_line = true;
 		} else if ((val = str_has_pfx(s, "test_btf_path="))) {
 			spec->btf_custom_path = val;
+		} else if ((val = str_has_pfx(s, "test_btf_func_path="))) {
+			spec->btf_custom_func_path = val;
 		} else if ((val = str_has_pfx(s, "test_caps_unpriv="))) {
 			err = parse_caps(val, &spec->unpriv.caps, "test caps");
 			if (err)
@@ -789,6 +792,43 @@ static void emit_stderr(const char *stderr, bool force)
 	fprintf(stdout, "STDERR:\n=============\n%s=============\n", stderr);
 }
 
+static void verify_stderr(int prog_fd, struct expected_msgs *msgs)
+{
+	LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts);
+	char *buf;
+	int ret;
+
+	if (!msgs->cnt)
+		return;
+
+	buf = malloc(TEST_LOADER_LOG_BUF_SZ);
+	if (!ASSERT_OK_PTR(buf, "malloc"))
+		return;
+	/* Stream id 2 is read as the program's stderr; one byte is kept for the NUL below. */
+	ret = bpf_prog_stream_read(prog_fd, 2, buf, TEST_LOADER_LOG_BUF_SZ - 1,
+				    &ropts);
+	if (ret > 0) {
+		buf[ret] = '\0';
+		emit_stderr(buf, false);
+		validate_msgs(buf, msgs, emit_stderr);
+	} else {
+		ASSERT_GT(ret, 0, "stderr stream read"); /* always fails here: records the error */
+	}
+
+	free(buf);
+}
+
+void verify_test_stderr(struct bpf_object *obj, struct bpf_program *prog)
+{
+	struct test_spec spec = {};
+
+	if (parse_test_spec(NULL, obj, prog, &spec))
+		return;
+	/* Only the privileged-mode (spec.priv) stderr expectations are checked. */
+	verify_stderr(bpf_program__fd(prog), &spec.priv.stderr);
+	free_test_spec(&spec);
+}
+
 static void emit_stdout(const char *bpf_stdout, bool force)
 {
 	if (!force && env.verbosity == VERBOSE_NONE)
@@ -1138,6 +1178,123 @@ static int get_stream(int stream_id, int prog_fd, char *text, size_t text_sz)
 	return ret;
 }
 
+/*
+ * Fix up the BTF of @obj using parameter names from the BTF file at @path.
+ *
+ * For __naked subprogs, clang drops parameter names from BTF. For each FUNC
+ * whose first parameter is anonymous, add a FUNC_PROTO that takes names from
+ * the same-named FUNC in @path and types from @obj. Returns 0 or an error.
+ */
+static int fixup_btf_from_path(struct bpf_object *obj, const char *path)
+{
+	struct btf *prog_btf, *custom_btf;
+	__u32 i, j, cnt, custom_cnt;
+	int err = 0;
+
+	prog_btf = bpf_object__btf(obj);
+	if (!prog_btf)
+		return 0;
+
+	custom_btf = btf__parse(path, NULL);
+	if (!ASSERT_OK_PTR(custom_btf, "parse_custom_btf"))
+		return -EINVAL;
+
+	cnt = btf__type_cnt(prog_btf);
+	custom_cnt = btf__type_cnt(custom_btf);
+
+	/* Fix up FUNC entries with anonymous params.
+	 * Save all data from prog_btf BEFORE calling btf__add_*,
+	 * since those calls may reallocate the BTF data buffer
+	 * and invalidate any pointers obtained from btf__type_by_id.
+	 */
+	for (i = 1; i < cnt; i++) {
+		const struct btf_type *t = btf__type_by_id(prog_btf, i);
+		const struct btf_type *fp, *custom_t, *custom_fp;
+		const struct btf_param *params, *custom_params;
+		__u32 ret_type_id, vlen;
+		__u32 *prog_param_types = NULL;
+		const char *name;
+		int new_proto_id;
+
+		if (!btf_is_func(t))
+			continue;
+
+		fp = btf__type_by_id(prog_btf, t->type);
+		if (!fp || !btf_is_func_proto(fp) || btf_vlen(fp) == 0)
+			continue;
+
+		/* Only the first param is checked for a missing name */
+		params = btf_params(fp);
+		if (params[0].name_off != 0)
+			continue;
+
+		/* Find matching FUNC by name in custom BTF */
+		name = btf__name_by_offset(prog_btf, t->name_off);
+		if (!name)
+			continue;
+
+		for (j = 1; j < custom_cnt; j++) {
+			const char *cname;
+
+			custom_t = btf__type_by_id(custom_btf, j);
+			if (!btf_is_func(custom_t))
+				continue;
+			cname = btf__name_by_offset(custom_btf, custom_t->name_off);
+			if (cname && strcmp(name, cname) == 0)
+				break;
+		}
+		if (j >= custom_cnt) /* no FUNC with this name in custom BTF */
+			continue;
+
+		custom_fp = btf__type_by_id(custom_btf, custom_t->type);
+		if (!custom_fp || !btf_is_func_proto(custom_fp))
+			continue;
+
+		vlen = btf_vlen(fp);
+		if (vlen != btf_vlen(custom_fp)) /* param counts differ: skip */
+			continue;
+
+		/* Save data before btf__add_* calls invalidate pointers */
+		ret_type_id = fp->type;
+		prog_param_types = malloc(vlen * sizeof(*prog_param_types));
+		if (!prog_param_types) {
+			err = -ENOMEM;
+			break;
+		}
+		for (j = 0; j < vlen; j++)
+			prog_param_types[j] = params[j].type;
+
+		/* Add a new FUNC_PROTO: param names from custom, types from prog */
+		new_proto_id = btf__add_func_proto(prog_btf, ret_type_id);
+		if (new_proto_id < 0) {
+			err = new_proto_id;
+			free(prog_param_types);
+			break;
+		}
+
+		custom_params = btf_params(custom_fp);
+		for (j = 0; j < vlen; j++) {
+			const char *pname;
+
+			pname = btf__name_by_offset(custom_btf, custom_params[j].name_off);
+			err = btf__add_func_param(prog_btf, pname ?: "", prog_param_types[j]);
+			if (err)
+				break;
+		}
+		free(prog_param_types);
+		if (err)
+			break;
+
+		/* Update the FUNC to point to the new FUNC_PROTO (re-fetch
+		 * since btf__add_* may have reallocated the data buffer).
+		 */
+		((struct btf_type *)btf__type_by_id(prog_btf, i))->type = new_proto_id;
+	}
+
+	btf__free(custom_btf);
+	return err;
+}
+
 /* this function is forced noinline and has short generic name to look better
  * in test_progs output (in case of a failure)
  */
@@ -1194,13 +1351,27 @@ void run_subtest(struct test_loader *tester,
 		}
 	}
 
-	/* Implicitly reset to NULL if next test case doesn't specify */
+	/* Implicitly reset to NULL if next test case doesn't specify.
+	 * btf_custom_func_path also serves as btf_custom_path for kfunc resolution.
+	 */
 	open_opts->btf_custom_path = spec->btf_custom_path;
+	if (!open_opts->btf_custom_path)
+		open_opts->btf_custom_path = spec->btf_custom_func_path;
 
 	tobj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, open_opts);
 	if (!ASSERT_OK_PTR(tobj, "obj_open_mem")) /* shouldn't happen */
 		goto subtest_cleanup;
 
+	/* Fix up __naked subprog BTF using a separate file with named params */
+	if (spec->btf_custom_func_path) {
+		err = fixup_btf_from_path(tobj, spec->btf_custom_func_path);
+		if (err) {
+			PRINT_FAIL("failed to fixup BTF from %s: %d\n",
+				   spec->btf_custom_func_path, err);
+			goto tobj_cleanup;
+		}
+	}
+
 	i = 0;
 	bpf_object__for_each_program(tprog_iter, tobj) {
 		spec_iter = &specs[i++];
@@ -1314,17 +1485,7 @@ void run_subtest(struct test_loader *tester,
 			goto tobj_cleanup;
 		}
 
-		if (subspec->stderr.cnt) {
-			err = get_stream(2, bpf_program__fd(tprog),
-					 tester->log_buf, tester->log_buf_sz);
-			if (err <= 0) {
-				PRINT_FAIL("Unexpected retval from get_stream(): %d, errno = %d\n",
-					   err, errno);
-				goto tobj_cleanup;
-			}
-			emit_stderr(tester->log_buf, false /*force*/);
-			validate_msgs(tester->log_buf, &subspec->stderr, emit_stderr);
-		}
+		verify_stderr(bpf_program__fd(tprog), &subspec->stderr);
 
 		if (subspec->stdout.cnt) {
 			err = get_stream(1, bpf_program__fd(tprog),
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index ccc5acd55ff9..c32da7bd8be2 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -260,6 +260,16 @@ static void test_hashmap_percpu(unsigned int task, void *data)
 	close(fd);
 }
 
+#define MAP_RETRIES 20
+
+static bool can_retry(int err)
+{
+	return (err == EAGAIN || err == EBUSY ||
+		((err == ENOMEM || err == E2BIG) &&
+		 map_opts.map_flags == BPF_F_NO_PREALLOC)); /* exact flag match */
+}
+
+
 #define VALUE_SIZE 3
 static int helper_fill_hashmap(int max_entries)
 {
@@ -274,10 +284,11 @@ static int helper_fill_hashmap(int max_entries)
 
 	for (i = 0; i < max_entries; i++) {
 		key = i; value[0] = key;
-		ret = bpf_map_update_elem(fd, &key, value, BPF_NOEXIST);
+		ret = map_update_retriable(fd, &key, value, BPF_NOEXIST,
+					   MAP_RETRIES, can_retry);
 		CHECK(ret != 0,
 		      "can't update hashmap",
-		      "err: %s\n", strerror(ret));
+		      "err: %s\n", strerror(-ret));
 	}
 
 	return fd;
@@ -1392,17 +1403,9 @@ static void test_map_stress(void)
 #define DO_UPDATE 1
 #define DO_DELETE 0
 
-#define MAP_RETRIES 20
 #define MAX_DELAY_US 50000
 #define MIN_DELAY_RANGE_US 5000
 
-static bool can_retry(int err)
-{
-	return (err == EAGAIN || err == EBUSY ||
-		((err == ENOMEM || err == E2BIG) &&
-		 map_opts.map_flags == BPF_F_NO_PREALLOC));
-}
-
 int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts,
 			 retry_for_error_fn need_retry)
 {
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 7fe16b5131b1..7ba82974ee78 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -165,6 +165,8 @@ struct prog_test_def {
 	void (*run_test)(void);
 	void (*run_serial_test)(void);
 	bool should_run;
+	bool not_built;
+	bool selected;
 	bool need_cgroup_cleanup;
 	bool should_tmon;
 };
@@ -372,6 +374,8 @@ static void print_test_result(const struct prog_test_def *test, const struct tes
 	fprintf(env.stdout_saved, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name);
 	if (test_state->error_cnt)
 		fprintf(env.stdout_saved, "FAIL");
+	else if (test->not_built)
+		fprintf(env.stdout_saved, "SKIP (not built)");
 	else if (!skipped_cnt)
 		fprintf(env.stdout_saved, "OK");
 	else if (skipped_cnt == subtests_cnt || !subtests_cnt)
@@ -1257,7 +1261,7 @@ int get_bpf_max_tramp_links_from(struct btf *btf)
 	const struct btf_type *t;
 	__u32 i, type_cnt;
 	const char *name;
-	__u16 j, vlen;
+	__u32 j, vlen;
 
 	for (i = 1, type_cnt = btf__type_cnt(btf); i < type_cnt; i++) {
 		t = btf__type_by_id(btf, i);
@@ -1641,6 +1645,7 @@ static void calculate_summary_and_print_errors(struct test_env *env)
 	json_writer_t *w = NULL;
 
 	for (i = 0; i < prog_test_cnt; i++) {
+		struct prog_test_def *test = &prog_test_defs[i];
 		struct test_state *state = &test_states[i];
 
 		if (!state->tested)
@@ -1651,7 +1656,7 @@ static void calculate_summary_and_print_errors(struct test_env *env)
 
 		if (state->error_cnt)
 			fail_cnt++;
-		else
+		else if (!test->not_built)
 			succ_cnt++;
 	}
 
@@ -1700,8 +1705,13 @@ static void calculate_summary_and_print_errors(struct test_env *env)
 	if (env->json)
 		fclose(env->json);
 
-	printf("Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n",
-	       succ_cnt, sub_succ_cnt, skip_cnt, fail_cnt);
+	if (env->not_built_cnt)
+		printf("Summary: %d/%d PASSED, %d SKIPPED (%d not built), %d FAILED\n",
+		       succ_cnt, sub_succ_cnt, skip_cnt, env->not_built_cnt,
+		       fail_cnt);
+	else
+		printf("Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n",
+		       succ_cnt, sub_succ_cnt, skip_cnt, fail_cnt);
 
 	env->succ_cnt = succ_cnt;
 	env->sub_succ_cnt = sub_succ_cnt;
@@ -1772,6 +1782,19 @@ static void server_main(void)
 		run_one_test(i);
 	}
 
+	/* mark not-built tests as skipped */
+	for (int i = 0; i < prog_test_cnt; i++) {
+		struct prog_test_def *test = &prog_test_defs[i];
+		struct test_state *state = &test_states[i];
+
+		if (test->not_built && test->selected) {
+			state->tested = true;
+			state->skip_cnt = 1;
+			env.not_built_cnt++;
+			print_test_result(test, state);
+		}
+	}
+
 	/* generate summary */
 	fflush(stderr);
 	fflush(stdout);
@@ -2046,15 +2069,20 @@ int main(int argc, char **argv)
 		struct prog_test_def *test = &prog_test_defs[i];
 
 		test->test_num = i + 1;
-		test->should_run = should_run(&env.test_selector,
-					      test->test_num, test->test_name);
+		test->selected = should_run(&env.test_selector,
+					    test->test_num, test->test_name);
+		test->should_run = test->selected;
 
-		if ((test->run_test == NULL && test->run_serial_test == NULL) ||
-		    (test->run_test != NULL && test->run_serial_test != NULL)) {
+		if (test->run_test && test->run_serial_test) {
 			fprintf(stderr, "Test %d:%s must have either test_%s() or serial_test_%sl() defined.\n",
 				test->test_num, test->test_name, test->test_name, test->test_name);
 			exit(EXIT_ERR_SETUP_INFRA);
 		}
+		if (!test->run_test && !test->run_serial_test) {
+			test->not_built = true;
+			test->should_run = false;
+			continue;
+		}
 		if (test->should_run)
 			test->should_tmon = should_tmon(&env.tmon_selector, test->test_name);
 	}
@@ -2106,9 +2134,18 @@ int main(int argc, char **argv)
 
 	for (i = 0; i < prog_test_cnt; i++) {
 		struct prog_test_def *test = &prog_test_defs[i];
+		struct test_state *state = &test_states[i];
 
-		if (!test->should_run)
+		if (!test->should_run) {
+			if (test->not_built && test->selected &&
+			    !env.get_test_cnt && !env.list_test_names) {
+				state->tested = true;
+				state->skip_cnt = 1;
+				env.not_built_cnt++;
+				print_test_result(test, state);
+			}
 			continue;
+		}
 
 		if (env.get_test_cnt) {
 			env.succ_cnt++;
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index 1a44467f4310..2cf950afcd85 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -125,6 +125,7 @@ struct test_env {
 	int sub_succ_cnt; /* successful sub-tests */
 	int fail_cnt; /* total failed tests + sub-tests */
 	int skip_cnt; /* skipped tests */
+	int not_built_cnt; /* tests not built */
 
 	int saved_netns_fd;
 	int workers; /* number of worker process */
@@ -563,5 +564,7 @@ struct expected_msgs {
 
 void validate_msgs(const char *log_buf, struct expected_msgs *msgs,
 		   void (*emit_fn)(const char *buf, bool force));
+void free_msgs(struct expected_msgs *msgs);
+void verify_test_stderr(struct bpf_object *obj, struct bpf_program *prog);
 
 #endif /* __TEST_PROGS_H */
diff --git a/tools/testing/selftests/bpf/test_xdping.sh b/tools/testing/selftests/bpf/test_xdping.sh
deleted file mode 100755
index c3d82e0a7378..000000000000
--- a/tools/testing/selftests/bpf/test_xdping.sh
+++ /dev/null
@@ -1,103 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-# xdping tests
-#   Here we setup and teardown configuration required to run
-#   xdping, exercising its options.
-#
-#   Setup is similar to test_tunnel tests but without the tunnel.
-#
-# Topology:
-# ---------
-#     root namespace   |     tc_ns0 namespace
-#                      |
-#      ----------      |     ----------
-#      |  veth1  | --------- |  veth0  |
-#      ----------    peer    ----------
-#
-# Device Configuration
-# --------------------
-# Root namespace with BPF
-# Device names and addresses:
-#	veth1 IP: 10.1.1.200
-#	xdp added to veth1, xdpings originate from here.
-#
-# Namespace tc_ns0 with BPF
-# Device names and addresses:
-#       veth0 IPv4: 10.1.1.100
-#	For some tests xdping run in server mode here.
-#
-
-readonly TARGET_IP="10.1.1.100"
-readonly TARGET_NS="xdp_ns0"
-
-readonly LOCAL_IP="10.1.1.200"
-
-setup()
-{
-	ip netns add $TARGET_NS
-	ip link add veth0 type veth peer name veth1
-	ip link set veth0 netns $TARGET_NS
-	ip netns exec $TARGET_NS ip addr add ${TARGET_IP}/24 dev veth0
-	ip addr add ${LOCAL_IP}/24 dev veth1
-	ip netns exec $TARGET_NS ip link set veth0 up
-	ip link set veth1 up
-}
-
-cleanup()
-{
-	set +e
-	ip netns delete $TARGET_NS 2>/dev/null
-	ip link del veth1 2>/dev/null
-	if [[ $server_pid -ne 0 ]]; then
-		kill -TERM $server_pid
-	fi
-}
-
-test()
-{
-	client_args="$1"
-	server_args="$2"
-
-	echo "Test client args '$client_args'; server args '$server_args'"
-
-	server_pid=0
-	if [[ -n "$server_args" ]]; then
-		ip netns exec $TARGET_NS ./xdping $server_args &
-		server_pid=$!
-		sleep 10
-	fi
-	./xdping $client_args $TARGET_IP
-
-	if [[ $server_pid -ne 0 ]]; then
-		kill -TERM $server_pid
-		server_pid=0
-	fi
-
-	echo "Test client args '$client_args'; server args '$server_args': PASS"
-}
-
-set -e
-
-server_pid=0
-
-trap cleanup EXIT
-
-setup
-
-for server_args in "" "-I veth0 -s -S" ; do
-	# client in skb mode
-	client_args="-I veth1 -S"
-	test "$client_args" "$server_args"
-
-	# client with count of 10 RTT measurements.
-	client_args="-I veth1 -S -c 10"
-	test "$client_args" "$server_args"
-done
-
-# Test drv mode
-test "-I veth1 -N" "-I veth0 -s -N"
-test "-I veth1 -N -c 10" "-I veth0 -s -N"
-
-echo "OK. All tests passed"
-exit 0
diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
index 6fbe1e995660..c970e7793dfc 100644
--- a/tools/testing/selftests/bpf/testing_helpers.c
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -5,6 +5,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
+#include <sys/mman.h>
+#include <alloca.h>
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
 #include "disasm.h"
@@ -516,3 +518,19 @@ bool is_jit_enabled(void)
 
 	return enabled;
 }
+
+int stack_mprotect(void)
+{
+	void *buf;
+	long sz;
+	int ret;
+
+	sz = sysconf(_SC_PAGESIZE);
+	if (sz < 0) /* sysconf() failed: pass its result through */
+		return sz;
+
+	buf = alloca(sz * 3); /* holds one whole page-aligned page for mprotect() */
+	ret = mprotect((void *)(((unsigned long)(buf + sz)) & ~(sz - 1)), sz,
+		       PROT_READ | PROT_WRITE | PROT_EXEC);
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h
index 2ca2356a0b58..2edc6fb7fc52 100644
--- a/tools/testing/selftests/bpf/testing_helpers.h
+++ b/tools/testing/selftests/bpf/testing_helpers.h
@@ -59,5 +59,6 @@ struct bpf_insn;
 int get_xlated_program(int fd_prog, struct bpf_insn **buf, __u32 *cnt);
 int testing_prog_flags(void);
 bool is_jit_enabled(void);
+int stack_mprotect(void);
 
 #endif /* __TESTING_HELPERS_H */
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 0e63daf83ed5..679008b310d9 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -546,9 +546,10 @@ static const char * const trace_blacklist[] = {
 	"__rcu_read_lock",
 	"__rcu_read_unlock",
 	"bpf_get_numa_node_id",
+	"___migrate_enable",
 };
 
-static bool skip_entry(char *name)
+bool is_unsafe_function(const char *name)
 {
 	int i;
 
@@ -651,7 +652,7 @@ int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel)
 		free(name);
 		if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1)
 			continue;
-		if (skip_entry(name))
+		if (is_unsafe_function(name))
 			continue;
 
 		ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare);
@@ -728,7 +729,7 @@ int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel)
 		free(name);
 		if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
 			continue;
-		if (skip_entry(name))
+		if (is_unsafe_function(name))
 			continue;
 
 		if (cnt == max_cnt) {
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index d5bf1433675d..01c8ecc45627 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -63,4 +63,5 @@ int read_build_id(const char *path, char *build_id, size_t size);
 int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel);
 int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel);
 
+bool is_unsafe_function(const char *name);
 #endif
diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c
index 3e58a86b8e25..0af330b6c364 100644
--- a/tools/testing/selftests/bpf/uprobe_multi.c
+++ b/tools/testing/selftests/bpf/uprobe_multi.c
@@ -144,6 +144,8 @@ int main(int argc, char **argv)
 		return trigger_uprobe(true /* page-in build ID */);
 
 error:
-	fprintf(stderr, "usage: %s <bench|usdt>\n", argv[0]);
+	fprintf(stderr,
+		"usage: %s <bench|usdt|uprobe-paged-out|uprobe-paged-in>\n",
+		argv[0]);
 	return -1;
 }
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index c3164b9b2be5..302d712e0d7e 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -31,7 +31,7 @@
 	},
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.result = REJECT,
-	.errstr = "arg#0 pointer type STRUCT prog_test_fail1 must point to scalar",
+	.errstr = "R1 pointer type STRUCT prog_test_fail1 must point to scalar",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_kfunc_call_test_fail1", 2 },
 	},
@@ -46,7 +46,7 @@
 	},
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.result = REJECT,
-	.errstr = "max struct nesting depth exceeded\narg#0 pointer type STRUCT prog_test_fail2",
+	.errstr = "max struct nesting depth exceeded\nR1 pointer type STRUCT prog_test_fail2",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_kfunc_call_test_fail2", 2 },
 	},
@@ -61,7 +61,7 @@
 	},
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.result = REJECT,
-	.errstr = "arg#0 pointer type STRUCT prog_test_fail3 must point to scalar",
+	.errstr = "R1 pointer type STRUCT prog_test_fail3 must point to scalar",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_kfunc_call_test_fail3", 2 },
 	},
@@ -76,7 +76,7 @@
 	},
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.result = REJECT,
-	.errstr = "arg#0 expected pointer to ctx, but got fp",
+	.errstr = "R1 expected pointer to ctx, but got fp",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_kfunc_call_test_pass_ctx", 2 },
 	},
@@ -91,7 +91,7 @@
 	},
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.result = REJECT,
-	.errstr = "arg#0 pointer type UNKNOWN  must point to scalar",
+	.errstr = "R1 pointer type UNKNOWN  must point to scalar",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_kfunc_call_test_mem_len_fail1", 2 },
 	},
@@ -109,7 +109,7 @@
 	},
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.result = REJECT,
-	.errstr = "Possibly NULL pointer passed to trusted arg0",
+	.errstr = "Possibly NULL pointer passed to trusted R1",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_kfunc_call_test_acquire", 3 },
 		{ "bpf_kfunc_call_test_release", 5 },
@@ -152,7 +152,7 @@
 	},
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.result = REJECT,
-	.errstr = "kernel function bpf_kfunc_call_memb1_release args#0 expected pointer",
+	.errstr = "kernel function bpf_kfunc_call_memb1_release R1 expected pointer",
 	.fixup_kfunc_btf_id = {
 		{ "bpf_kfunc_call_memb_acquire", 1 },
 		{ "bpf_kfunc_call_memb1_release", 5 },
@@ -1219,6 +1219,30 @@
 	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */
 	BPF_EXIT_INSN(),
 	/* H */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call I */
+	BPF_EXIT_INSN(),
+	/* I */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call J */
+	BPF_EXIT_INSN(),
+	/* J */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call K */
+	BPF_EXIT_INSN(),
+	/* K */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call L */
+	BPF_EXIT_INSN(),
+	/* L */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call M */
+	BPF_EXIT_INSN(),
+	/* M */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call N */
+	BPF_EXIT_INSN(),
+	/* N */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call O */
+	BPF_EXIT_INSN(),
+	/* O */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call P */
+	BPF_EXIT_INSN(),
+	/* P */
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
@@ -1257,6 +1281,30 @@
 	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */
 	BPF_EXIT_INSN(),
 	/* H */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call I */
+	BPF_EXIT_INSN(),
+	/* I */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call J */
+	BPF_EXIT_INSN(),
+	/* J */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call K */
+	BPF_EXIT_INSN(),
+	/* K */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call L */
+	BPF_EXIT_INSN(),
+	/* L */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call M */
+	BPF_EXIT_INSN(),
+	/* M */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call N */
+	BPF_EXIT_INSN(),
+	/* N */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call O */
+	BPF_EXIT_INSN(),
+	/* O */
+	BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call P */
+	BPF_EXIT_INSN(),
+	/* P */
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
@@ -2410,27 +2458,3 @@
 	.errstr_unpriv = "",
 	.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
 },
-{
-	"calls: several args with ref_obj_id",
-	.insns = {
-	/* Reserve at least sizeof(struct iphdr) bytes in the ring buffer.
-	 * With a smaller size, the verifier would reject the call to
-	 * bpf_tcp_raw_gen_syncookie_ipv4 before we can reach the
-	 * ref_obj_id error.
-	 */
-	BPF_MOV64_IMM(BPF_REG_2, 20),
-	BPF_MOV64_IMM(BPF_REG_3, 0),
-	BPF_LD_MAP_FD(BPF_REG_1, 0),
-	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve),
-	/* if r0 == 0 goto <exit> */
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
-	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
-	BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
-	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tcp_raw_gen_syncookie_ipv4),
-	BPF_EXIT_INSN(),
-	},
-	.fixup_map_ringbuf = { 2 },
-	.result = REJECT,
-	.errstr = "more than one arg with ref_obj_id",
-	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
-},
diff --git a/tools/testing/selftests/bpf/verifier/sleepable.c b/tools/testing/selftests/bpf/verifier/sleepable.c
index c2b7f5ebf168..6dabc5522945 100644
--- a/tools/testing/selftests/bpf/verifier/sleepable.c
+++ b/tools/testing/selftests/bpf/verifier/sleepable.c
@@ -76,7 +76,20 @@
 	.runs = -1,
 },
 {
-	"sleepable raw tracepoint reject",
+	"sleepable raw tracepoint accept",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_TRACING,
+	.expected_attach_type = BPF_TRACE_RAW_TP,
+	.kfunc = "sys_enter",
+	.result = ACCEPT,
+	.flags = BPF_F_SLEEPABLE,
+	.runs = -1,
+},
+{
+	"sleepable raw tracepoint reject non-faultable",
 	.insns = {
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
@@ -85,7 +98,7 @@
 	.expected_attach_type = BPF_TRACE_RAW_TP,
 	.kfunc = "sched_switch",
 	.result = REJECT,
-	.errstr = "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable",
+	.errstr = "Sleepable program cannot attach to non-faultable tracepoint",
 	.flags = BPF_F_SLEEPABLE,
 	.runs = -1,
 },
diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index 5c82950e6978..a7db6f04f7e1 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -48,6 +48,7 @@ enum stat_id {
 	SIZE,
 	JITED_SIZE,
 	STACK,
+	MAX_STACK,
 	PROG_TYPE,
 	ATTACH_TYPE,
 	MEMORY_PEAK,
@@ -789,13 +790,13 @@ cleanup:
 }
 
 static const struct stat_specs default_csv_output_spec = {
-	.spec_cnt = 15,
+	.spec_cnt = 16,
 	.ids = {
 		FILE_NAME, PROG_NAME, VERDICT, DURATION,
 		TOTAL_INSNS, TOTAL_STATES, PEAK_STATES,
 		MAX_STATES_PER_INSN, MARK_READ_MAX_LEN,
 		SIZE, JITED_SIZE, PROG_TYPE, ATTACH_TYPE,
-		STACK, MEMORY_PEAK,
+		STACK, MAX_STACK, MEMORY_PEAK,
 	},
 };
 
@@ -834,6 +835,7 @@ static struct stat_def {
 	[SIZE] = { "Program size", {"prog_size"}, },
 	[JITED_SIZE] = { "Jited size", {"prog_size_jited"}, },
 	[STACK] = {"Stack depth", {"stack_depth", "stack"}, },
+	[MAX_STACK] = {"Max stack depth", {"max_stack_depth"}, },
 	[PROG_TYPE] = { "Program type", {"prog_type"}, },
 	[ATTACH_TYPE] = { "Attach type", {"attach_type", }, },
 	[MEMORY_PEAK] = { "Peak memory (MiB)", {"mem_peak", }, },
@@ -1023,7 +1025,7 @@ static int parse_verif_log(char * const buf, size_t buf_sz, struct verif_stats *
 				&s->stats[MARK_READ_MAX_LEN]))
 			continue;
 
-		if (1 == sscanf(cur, "stack depth %511s", stack))
+		if (2 == sscanf(cur, "stack depth %511s max %ld", stack, &s->stats[MAX_STACK]))
 			continue;
 	}
 	while ((token = strtok_r(cnt++ ? NULL : stack, "+", &state))) {
@@ -2278,6 +2280,7 @@ static int cmp_stat(const struct verif_stats *s1, const struct verif_stats *s2,
 	case SIZE:
 	case JITED_SIZE:
 	case STACK:
+	case MAX_STACK:
 	case VERDICT:
 	case DURATION:
 	case TOTAL_INSNS:
@@ -2512,6 +2515,7 @@ static void prepare_value(const struct verif_stats *s, enum stat_id id,
 	case MAX_STATES_PER_INSN:
 	case MARK_READ_MAX_LEN:
 	case STACK:
+	case MAX_STACK:
 	case SIZE:
 	case JITED_SIZE:
 	case MEMORY_PEAK:
@@ -2602,7 +2606,8 @@ static int parse_stat_value(const char *str, enum stat_id id, struct verif_stats
 	case SIZE:
 	case JITED_SIZE:
 	case MEMORY_PEAK:
-	case STACK: {
+	case STACK:
+	case MAX_STACK: {
 		long val;
 		int err, n;
 
diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh
index 2f869daf8a06..9ca802285393 100755
--- a/tools/testing/selftests/bpf/vmtest.sh
+++ b/tools/testing/selftests/bpf/vmtest.sh
@@ -382,7 +382,7 @@ main()
 	local exit_command="poweroff -f"
 	local debug_shell="no"
 
-	while getopts ':hskl:id:j:' opt; do
+	while getopts ':hsl:id:j:' opt; do
 		case ${opt} in
 		l)
 			LOCAL_ROOTFS_IMAGE="$OPTARG"
diff --git a/tools/testing/selftests/bpf/xdp_lb_bench_common.h b/tools/testing/selftests/bpf/xdp_lb_bench_common.h
new file mode 100644
index 000000000000..aed20a963701
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdp_lb_bench_common.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#ifndef XDP_LB_BENCH_COMMON_H
+#define XDP_LB_BENCH_COMMON_H
+
+#define F_IPV6		(1 << 0)
+#define F_LRU_BYPASS	(1 << 1)
+
+#define CH_RING_SIZE	65537		/* per-VIP consistent hash ring slots */
+#define MAX_VIPS	16
+#define CH_RINGS_SIZE	(MAX_VIPS * CH_RING_SIZE)
+#define MAX_REALS	512
+#define DEFAULT_LRU_SIZE 100000		/* connection tracking cache size */
+#define ONE_SEC		1000000000U	/* 1 sec in nanosec */
+#define MAX_CONN_RATE	100000000	/* high enough to never trigger in bench */
+#define LRU_UDP_TIMEOUT	30000000000ULL	/* 30 sec in nanosec */
+#define PCKT_FRAGMENTED	0x3FFF
+#define KNUTH_HASH_MULT	2654435761U
+#define IPIP_V4_PREFIX	4268		/* 172.16/12 in network order */
+#define IPIP_V6_PREFIX1	1		/* 0100::/64 (RFC 6666 discard) */
+#define IPIP_V6_PREFIX2	0
+#define IPIP_V6_PREFIX3	0
+
+/* Stats indices (0..MAX_VIPS-1 are per-VIP packet/byte counters) */
+#define STATS_LRU	(MAX_VIPS + 0)	/* v1: total VIP packets, v2: LRU misses */
+#define STATS_XDP_TX	(MAX_VIPS + 1)
+#define STATS_XDP_PASS	(MAX_VIPS + 2)
+#define STATS_XDP_DROP	(MAX_VIPS + 3)
+#define STATS_NEW_CONN	(MAX_VIPS + 4)	/* v1: conn count, v2: last reset ts */
+#define STATS_LRU_MISS	(MAX_VIPS + 5)	/* v1: TCP LRU misses */
+#define STATS_SIZE	(MAX_VIPS + 6)
+
+#ifdef __BPF__
+#define lb_htons(x)	bpf_htons(x)
+#define LB_INLINE	static __always_inline
+#else
+#define lb_htons(x)	htons(x)
+#define LB_INLINE	static inline
+#endif
+
+LB_INLINE __be32 create_encap_ipv4_src(__u16 port, __be32 src)
+{
+	__u32 ip_suffix = lb_htons(port);
+
+	ip_suffix <<= 16; /* lb_htons(port) now sits in the upper 16 bits */
+	ip_suffix ^= src; /* mix in src; only the upper half is kept below */
+	return (0xFFFF0000 & ip_suffix) | IPIP_V4_PREFIX; /* low half = prefix */
+}
+
+LB_INLINE void create_encap_ipv6_src(__u16 port, __be32 src, __be32 *saddr)
+{
+	saddr[0] = IPIP_V6_PREFIX1; /* saddr must have room for four words */
+	saddr[1] = IPIP_V6_PREFIX2;
+	saddr[2] = IPIP_V6_PREFIX3;
+	saddr[3] = src ^ port; /* last word: src mixed with the unswapped port */
+}
+
+struct flow_key {
+	union {
+		__be32 src;
+		__be32 srcv6[4];
+	};
+	union {
+		__be32 dst;
+		__be32 dstv6[4];
+	};
+	union {
+		__u32 ports;
+		__u16 port16[2];
+	};
+	__u8 proto;
+	__u8 pad[3];
+};
+
+struct vip_definition {
+	union {
+		__be32 vip;
+		__be32 vipv6[4];
+	};
+	__u16 port;
+	__u8 proto;
+	__u8 pad;
+};
+
+struct vip_meta {
+	__u32 flags;
+	__u32 vip_num;
+};
+
+struct real_pos_lru {
+	__u32 pos;
+	__u64 atime;
+};
+
+struct real_definition {
+	__be32 dst;
+	__be32 dstv6[4];
+	__u8   flags;
+};
+
+struct lb_stats {
+	__u64 v1;
+	__u64 v2;
+};
+
+struct ctl_value {
+	__u8 mac[6];
+	__u8 pad[2];
+};
+
+#endif /* XDP_LB_BENCH_COMMON_H */
diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c
deleted file mode 100644
index 9ed8c796645d..000000000000
--- a/tools/testing/selftests/bpf/xdping.c
+++ /dev/null
@@ -1,254 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
-
-#include <linux/bpf.h>
-#include <linux/if_link.h>
-#include <arpa/inet.h>
-#include <assert.h>
-#include <errno.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <libgen.h>
-#include <net/if.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netdb.h>
-
-#include "bpf/bpf.h"
-#include "bpf/libbpf.h"
-
-#include "xdping.h"
-#include "testing_helpers.h"
-
-static int ifindex;
-static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
-
-static void cleanup(int sig)
-{
-	bpf_xdp_detach(ifindex, xdp_flags, NULL);
-	if (sig)
-		exit(1);
-}
-
-static int get_stats(int fd, __u16 count, __u32 raddr)
-{
-	struct pinginfo pinginfo = { 0 };
-	char inaddrbuf[INET_ADDRSTRLEN];
-	struct in_addr inaddr;
-	__u16 i;
-
-	inaddr.s_addr = raddr;
-
-	printf("\nXDP RTT data:\n");
-
-	if (bpf_map_lookup_elem(fd, &raddr, &pinginfo)) {
-		perror("bpf_map_lookup elem");
-		return 1;
-	}
-
-	for (i = 0; i < count; i++) {
-		if (pinginfo.times[i] == 0)
-			break;
-
-		printf("64 bytes from %s: icmp_seq=%d ttl=64 time=%#.5f ms\n",
-		       inet_ntop(AF_INET, &inaddr, inaddrbuf,
-				 sizeof(inaddrbuf)),
-		       count + i + 1,
-		       (double)pinginfo.times[i]/1000000);
-	}
-
-	if (i < count) {
-		fprintf(stderr, "Expected %d samples, got %d.\n", count, i);
-		return 1;
-	}
-
-	bpf_map_delete_elem(fd, &raddr);
-
-	return 0;
-}
-
-static void show_usage(const char *prog)
-{
-	fprintf(stderr,
-		"usage: %s [OPTS] -I interface destination\n\n"
-		"OPTS:\n"
-		"    -c count		Stop after sending count requests\n"
-		"			(default %d, max %d)\n"
-		"    -I interface	interface name\n"
-		"    -N			Run in driver mode\n"
-		"    -s			Server mode\n"
-		"    -S			Run in skb mode\n",
-		prog, XDPING_DEFAULT_COUNT, XDPING_MAX_COUNT);
-}
-
-int main(int argc, char **argv)
-{
-	__u32 mode_flags = XDP_FLAGS_DRV_MODE | XDP_FLAGS_SKB_MODE;
-	struct addrinfo *a, hints = { .ai_family = AF_INET };
-	__u16 count = XDPING_DEFAULT_COUNT;
-	struct pinginfo pinginfo = { 0 };
-	const char *optstr = "c:I:NsS";
-	struct bpf_program *main_prog;
-	int prog_fd = -1, map_fd = -1;
-	struct sockaddr_in rin;
-	struct bpf_object *obj;
-	struct bpf_map *map;
-	char *ifname = NULL;
-	char filename[256];
-	int opt, ret = 1;
-	__u32 raddr = 0;
-	int server = 0;
-	char cmd[256];
-
-	while ((opt = getopt(argc, argv, optstr)) != -1) {
-		switch (opt) {
-		case 'c':
-			count = atoi(optarg);
-			if (count < 1 || count > XDPING_MAX_COUNT) {
-				fprintf(stderr,
-					"min count is 1, max count is %d\n",
-					XDPING_MAX_COUNT);
-				return 1;
-			}
-			break;
-		case 'I':
-			ifname = optarg;
-			ifindex = if_nametoindex(ifname);
-			if (!ifindex) {
-				fprintf(stderr, "Could not get interface %s\n",
-					ifname);
-				return 1;
-			}
-			break;
-		case 'N':
-			xdp_flags |= XDP_FLAGS_DRV_MODE;
-			break;
-		case 's':
-			/* use server program */
-			server = 1;
-			break;
-		case 'S':
-			xdp_flags |= XDP_FLAGS_SKB_MODE;
-			break;
-		default:
-			show_usage(basename(argv[0]));
-			return 1;
-		}
-	}
-
-	if (!ifname) {
-		show_usage(basename(argv[0]));
-		return 1;
-	}
-	if (!server && optind == argc) {
-		show_usage(basename(argv[0]));
-		return 1;
-	}
-
-	if ((xdp_flags & mode_flags) == mode_flags) {
-		fprintf(stderr, "-N or -S can be specified, not both.\n");
-		show_usage(basename(argv[0]));
-		return 1;
-	}
-
-	if (!server) {
-		/* Only supports IPv4; see hints initialization above. */
-		if (getaddrinfo(argv[optind], NULL, &hints, &a) || !a) {
-			fprintf(stderr, "Could not resolve %s\n", argv[optind]);
-			return 1;
-		}
-		memcpy(&rin, a->ai_addr, sizeof(rin));
-		raddr = rin.sin_addr.s_addr;
-		freeaddrinfo(a);
-	}
-
-	/* Use libbpf 1.0 API mode */
-	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
-
-	snprintf(filename, sizeof(filename), "%s_kern.bpf.o", argv[0]);
-
-	if (bpf_prog_test_load(filename, BPF_PROG_TYPE_XDP, &obj, &prog_fd)) {
-		fprintf(stderr, "load of %s failed\n", filename);
-		return 1;
-	}
-
-	main_prog = bpf_object__find_program_by_name(obj,
-						     server ? "xdping_server" : "xdping_client");
-	if (main_prog)
-		prog_fd = bpf_program__fd(main_prog);
-	if (!main_prog || prog_fd < 0) {
-		fprintf(stderr, "could not find xdping program");
-		return 1;
-	}
-
-	map = bpf_object__next_map(obj, NULL);
-	if (map)
-		map_fd = bpf_map__fd(map);
-	if (!map || map_fd < 0) {
-		fprintf(stderr, "Could not find ping map");
-		goto done;
-	}
-
-	signal(SIGINT, cleanup);
-	signal(SIGTERM, cleanup);
-
-	printf("Setting up XDP for %s, please wait...\n", ifname);
-
-	printf("XDP setup disrupts network connectivity, hit Ctrl+C to quit\n");
-
-	if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) {
-		fprintf(stderr, "Link set xdp fd failed for %s\n", ifname);
-		goto done;
-	}
-
-	if (server) {
-		close(prog_fd);
-		close(map_fd);
-		printf("Running server on %s; press Ctrl+C to exit...\n",
-		       ifname);
-		do { } while (1);
-	}
-
-	/* Start xdping-ing from last regular ping reply, e.g. for a count
-	 * of 10 ICMP requests, we start xdping-ing using reply with seq number
-	 * 10.  The reason the last "real" ping RTT is much higher is that
-	 * the ping program sees the ICMP reply associated with the last
-	 * XDP-generated packet, so ping doesn't get a reply until XDP is done.
-	 */
-	pinginfo.seq = htons(count);
-	pinginfo.count = count;
-
-	if (bpf_map_update_elem(map_fd, &raddr, &pinginfo, BPF_ANY)) {
-		fprintf(stderr, "could not communicate with BPF map: %s\n",
-			strerror(errno));
-		cleanup(0);
-		goto done;
-	}
-
-	/* We need to wait for XDP setup to complete. */
-	sleep(10);
-
-	snprintf(cmd, sizeof(cmd), "ping -c %d -I %s %s",
-		 count, ifname, argv[optind]);
-
-	printf("\nNormal ping RTT data\n");
-	printf("[Ignore final RTT; it is distorted by XDP using the reply]\n");
-
-	ret = system(cmd);
-
-	if (!ret)
-		ret = get_stats(map_fd, count, raddr);
-
-	cleanup(0);
-
-done:
-	if (prog_fd > 0)
-		close(prog_fd);
-	if (map_fd > 0)
-		close(map_fd);
-
-	return ret;
-}
diff --git a/tools/testing/selftests/bpf/xdping.h b/tools/testing/selftests/bpf/xdping.h
deleted file mode 100644
index afc578df77be..000000000000
--- a/tools/testing/selftests/bpf/xdping.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
-
-#define	XDPING_MAX_COUNT	10
-#define	XDPING_DEFAULT_COUNT	4
-
-struct pinginfo {
-	__u64	start;
-	__be16	seq;
-	__u16	count;
-	__u32	pad;
-	__u64	times[XDPING_MAX_COUNT];
-};