From 9b8367b604c739947ec308874f087fe0eb80f412 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 6 Jun 2025 09:31:36 -0700 Subject: cgroup: Add bpf prog revisions to struct cgroup_bpf One of the key items in the mprog API is the revision for the prog list. The revision number will be increased if the prog list changes, e.g., attach, detach or replace. Add a 'revisions' field to struct cgroup_bpf, representing revisions for all cgroup-related attachment types. The initial revision value is set to 1, the same as in the kernel's mprog implementation. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250606163136.2428732-1-yonghong.song@linux.dev --- include/linux/bpf-cgroup-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h index 0985221d5478..c9e6b26abab6 100644 --- a/include/linux/bpf-cgroup-defs.h +++ b/include/linux/bpf-cgroup-defs.h @@ -63,6 +63,7 @@ struct cgroup_bpf { */ struct hlist_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; u8 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; + u64 revisions[MAX_CGROUP_BPF_ATTACH_TYPE]; /* list of cgroup shared storages */ struct list_head storages; -- cgit v1.2.3 From 03c68a0f8c68936a0bb915b030693923784724cb Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 23:13:18 +0200 Subject: bpf, arm64, powerpc: Add bpf_jit_bypass_spec_v1/v4() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JITs can set bpf_jit_bypass_spec_v1/v4() if they want the verifier to skip analysis/patching for the respective vulnerability. For v4, this will reduce the number of barriers the verifier inserts. For v1, it allows more programs to be accepted. The primary motivation for this is to not regress unpriv BPF's performance on ARM64 in a future commit where BPF_NOSPEC is also used against Spectre v1. This has the user-visible change that v1-induced rejections on non-vulnerable PowerPC CPUs are avoided. For now, this does not change the semantics of BPF_NOSPEC. It is still a v4-only barrier and must not be implemented if bypass_spec_v4 is always true for the arch. Changing it to a v1 AND v4-barrier is done in a future commit. As an alternative to bypass_spec_v1/v4, one could introduce NOSPEC_V1 and NOSPEC_V4 instructions and allow backends to skip their lowering as suggested by commit f5e81d111750 ("bpf: Introduce BPF nospec instruction for mitigating Spectre v4"). Adding bpf_jit_bypass_spec_v1/v4() was found to be preferable for the following reasons: * bypass_spec_v1/v4 benefits non-vulnerable CPUs: Always performing the same analysis (not taking into account whether the current CPU is vulnerable) needlessly restricts users of CPUs that are not vulnerable. The only use case for this would be portability-testing, but this can later be added easily when needed by allowing users to force bypass_spec_v1/v4 to false. * Portability is still acceptable: Directly disabling the analysis instead of skipping the lowering of BPF_NOSPEC(_V1/V4) might allow programs on non-vulnerable CPUs to be accepted while the program will be rejected on vulnerable CPUs. With the fallback to speculation barriers for Spectre v1 implemented in a future commit, this will only affect programs that do variable stack-accesses or are very complex. For PowerPC, the SEC_FTR checking in bpf_jit_bypass_spec_v4() is based on the check that was previously located in the BPF_NOSPEC case.
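For an arch whose CPUs are not affected by either issue, overriding the two __weak defaults is all that would be needed. A minimal sketch (hypothetical, not part of this patch) of such an override:

bool bpf_jit_bypass_spec_v1(void)
{
        /* No CPUs of this arch are vulnerable to Spectre v1, let the
         * verifier skip its v1 analysis/patching entirely.
         */
        return true;
}

bool bpf_jit_bypass_spec_v4(void)
{
        /* Likewise for Spectre v4: no BPF_NOSPEC barriers are needed. */
        return true;
}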
For LoongArch, it would likely be safe to set both bpf_jit_bypass_spec_v1() and _v4() according to commit a6f6a95f2580 ("LoongArch, bpf: Fix jit to skip speculation barrier opcode"). This is omitted here as I am unable to do any testing for LoongArch. Hari's ack concerns the PowerPC part only. Signed-off-by: Luis Gerhorst Acked-by: Hari Bathini Cc: Henriette Herzog Cc: Maximilian Ott Cc: Milan Stephan Link: https://lore.kernel.org/r/20250603211318.337474-1-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 21 ++++++++++++--------- arch/powerpc/net/bpf_jit_comp64.c | 21 +++++++++++++++++---- include/linux/bpf.h | 11 +++++++++-- kernel/bpf/core.c | 15 +++++++++++++++ 4 files changed, 53 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index da8b89dd2910..2cab9063f563 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1632,15 +1632,7 @@ emit_cond_jmp: /* speculation barrier */ case BPF_ST | BPF_NOSPEC: - /* - * Nothing required here. - * - * In case of arm64, we rely on the firmware mitigation of - * Speculative Store Bypass as controlled via the ssbd kernel - * parameter. Whenever the mitigation is enabled, it works - * for all of the kernel code with no need to provide any - * additional instructions. - */ + /* See bpf_jit_bypass_spec_v4() */ break; /* ST: *(size *)(dst + off) = imm */ @@ -2911,6 +2903,17 @@ bool bpf_jit_supports_percpu_insn(void) return true; } +bool bpf_jit_bypass_spec_v4(void) +{ + /* In case of arm64, we rely on the firmware mitigation of Speculative + * Store Bypass as controlled via the ssbd kernel parameter. Whenever + * the mitigation is enabled, it works for all of the kernel code with + * no need to provide any additional instructions. Therefore, skip + * inserting nospec insns against Spectre v4. + */ + return true; +} + bool bpf_jit_inlines_helper_call(s32 imm) { switch (imm) { diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 5daa77aee7f7..a4335761b7f9 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -370,6 +370,23 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o return 0; } +bool bpf_jit_bypass_spec_v1(void) +{ +#if defined(CONFIG_PPC_E500) || defined(CONFIG_PPC_BOOK3S_64) + return !(security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && + security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR)); +#else + return true; +#endif +} + +bool bpf_jit_bypass_spec_v4(void) +{ + return !(security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && + security_ftr_enabled(SEC_FTR_STF_BARRIER) && + stf_barrier_type_get() != STF_BARRIER_NONE); +} + /* * We spill into the redzone always, even if the bpf program has its own stackframe. 
* Offsets hardcoded based on BPF_PPC_STACK_SAVE -- see bpf_jit_stack_local() @@ -791,10 +808,6 @@ emit_clear: * BPF_ST NOSPEC (speculation barrier) */ case BPF_ST | BPF_NOSPEC: - if (!security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) || - !security_ftr_enabled(SEC_FTR_STF_BARRIER)) - break; - switch (stf_barrier) { case STF_BARRIER_EIEIO: EMIT(PPC_RAW_EIEIO() | 0x02000000); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b25d278409b..5dd556e89cce 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2288,6 +2288,9 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array *array, return ret; } +bool bpf_jit_bypass_spec_v1(void); +bool bpf_jit_bypass_spec_v4(void); + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; @@ -2475,12 +2478,16 @@ static inline bool bpf_allow_uninit_stack(const struct bpf_token *token) static inline bool bpf_bypass_spec_v1(const struct bpf_token *token) { - return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); + return bpf_jit_bypass_spec_v1() || + cpu_mitigations_off() || + bpf_token_capable(token, CAP_PERFMON); } static inline bool bpf_bypass_spec_v4(const struct bpf_token *token) { - return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); + return bpf_jit_bypass_spec_v4() || + cpu_mitigations_off() || + bpf_token_capable(token, CAP_PERFMON); } int bpf_map_new_fd(struct bpf_map *map, int flags); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c20babbf998f..f9bd9625438b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3034,6 +3034,21 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* By default, enable the verifier's mitigations against Spectre v1 and v4 for + * all archs. The value returned must not change at runtime as there is + * currently no support for reloading programs that were loaded without + * mitigations. + */ +bool __weak bpf_jit_bypass_spec_v1(void) +{ + return false; +} + +bool __weak bpf_jit_bypass_spec_v4(void) +{ + return false; +} + /* Return true if the JIT inlines the call to the helper corresponding to * the imm. * -- cgit v1.2.3 From dff883d9e93a7f2f2fa4e38a9444b2c79d6da91a Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 23:17:03 +0200 Subject: bpf, arm64, powerpc: Change nospec to include v1 barrier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changes the semantics of BPF_NOSPEC (previously a v4-only barrier) to always emit a speculation barrier that works against both Spectre v1 AND v4. If mitigation is not needed on an architecture, the backend should set bpf_jit_bypass_spec_v4/v1(). As of now, this commit only has the user-visible implication that unpriv BPF's performance on PowerPC is reduced. This is the case because we have to emit additional v1 barrier instructions for BPF_NOSPEC now. This commit is required for a future commit to allow us to rely on BPF_NOSPEC for Spectre v1 mitigation. As of this commit, the feature that nospec acts as a v1 barrier is unused. Commit f5e81d111750 ("bpf: Introduce BPF nospec instruction for mitigating Spectre v4") noted that mitigation instructions for v1 and v4 might be different on some archs. While this would potentially offer improved performance on PowerPC, it was dismissed after the following considerations: * Only having one barrier simplifies the verifier and allows us to easily rely on v4-induced barriers for reducing the complexity of v1-induced speculative path verification. 
* For the architectures that implemented BPF_NOSPEC, only PowerPC has distinct instructions for v1 and v4. Even there, some insns may be shared between the barriers for v1 and v4 (e.g., 'ori 31,31,0' and 'sync'). If this is still found to impact performance in an unacceptable way, BPF_NOSPEC can be split into BPF_NOSPEC_V1 and BPF_NOSPEC_V4 later. As an optimization, we can already skip v1/v4 insns from being emitted for PowerPC with this setup if bypass_spec_v1/v4 is set. Vulnerability-status for BPF_NOSPEC-based Spectre mitigations (v4 as of this commit, v1 in the future) is therefore: * x86 (32-bit and 64-bit), ARM64, and PowerPC (64-bit): Mitigated - This patch implements BPF_NOSPEC for these architectures. The previous v4-only version was supported since commit f5e81d111750 ("bpf: Introduce BPF nospec instruction for mitigating Spectre v4") and commit b7540d625094 ("powerpc/bpf: Emit stf barrier instruction sequences for BPF_NOSPEC"). * LoongArch: Not Vulnerable - Commit a6f6a95f2580 ("LoongArch, bpf: Fix jit to skip speculation barrier opcode") is the only other past commit related to BPF_NOSPEC and indicates that the insn is not required there. * MIPS: Vulnerable (if unprivileged BPF is enabled) - Commit a6f6a95f2580 ("LoongArch, bpf: Fix jit to skip speculation barrier opcode") indicates that it is not vulnerable, but this contradicts the kernel and Debian documentation. Therefore, I assume that there exist vulnerable MIPS CPUs (but maybe not from Loongson?). In the future, BPF_NOSPEC could be implemented for MIPS based on the GCC speculation_barrier [1]. For now, we rely on unprivileged BPF being disabled by default. * Other: Unknown - To the best of my knowledge there is no definitive information available that indicates that any other arch is vulnerable. They are therefore left untouched (BPF_NOSPEC is not implemented, but bypass_spec_v1/v4 is also not set). I did the following testing to ensure the insn encoding is correct: * ARM64: * 'dsb nsh; isb' was successfully tested with the BPF CI in [2] * 'sb' locally using QEMU v7.2.15 -cpu max (emitted sb insn is executed for example with './test_progs -t verifier_array_access') * PowerPC: The following configs were tested locally with ppc64le QEMU v8.2 '-machine pseries -cpu POWER9': * STF_BARRIER_EIEIO + CONFIG_PPC_BOOK3S_64 * STF_BARRIER_SYNC_ORI (forced on) + CONFIG_PPC_BOOK3S_64 * STF_BARRIER_FALLBACK (forced on) + CONFIG_PPC_BOOK3S_64 * CONFIG_PPC_E500 (forced on) + STF_BARRIER_EIEIO * CONFIG_PPC_E500 (forced on) + STF_BARRIER_SYNC_ORI (forced on) * CONFIG_PPC_E500 (forced on) + STF_BARRIER_FALLBACK (forced on) * CONFIG_PPC_E500 (forced on) + STF_BARRIER_NONE (forced on) Most of those combinations should not occur in practice, but I was not able to get a PPC e6500 rootfs (for testing PPC_E500 without forcing it on). In any case, this should ensure that there are no unexpected conflicts between the insns when combined like this. Individual v1/v4 barriers were already emitted elsewhere. Hari's ack is for the PowerPC changes only.
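As a rough orientation (an illustrative sketch only, not the literal patch below; emit_v1_barrier()/emit_v4_barrier() are hypothetical placeholders for the arch-specific insn sequences), the BPF_NOSPEC handling in a JIT backend now conceptually boils down to:

        case BPF_ST | BPF_NOSPEC:
                /* Barrier against Spectre v1, e.g. 'isync; sync' or
                 * 'ori 31,31,0' on PowerPC. Skipped if the arch/CPU is
                 * not vulnerable to v1.
                 */
                if (!bpf_jit_bypass_spec_v1())
                        emit_v1_barrier(ctx);
                /* Barrier against Spectre v4, e.g. the stf barrier on
                 * PowerPC or 'sb' / 'dsb nsh; isb' on arm64. Likewise
                 * skippable if v4 needs no mitigation.
                 */
                if (!bpf_jit_bypass_spec_v4())
                        emit_v4_barrier(ctx);
                break;

The actual PowerPC implementation below additionally shares insns (sync, ori31) between the two barriers where possible.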
[1] https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=29b74545531f6afbee9fc38c267524326dbfbedf ("MIPS: Add speculation_barrier support") [2] https://github.com/kernel-patches/bpf/pull/8576 Signed-off-by: Luis Gerhorst Acked-by: Hari Bathini Cc: Henriette Herzog Cc: Maximilian Ott Cc: Milan Stephan Link: https://lore.kernel.org/r/20250603211703.337860-1-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit.h | 5 ++++ arch/arm64/net/bpf_jit_comp.c | 9 ++++-- arch/powerpc/net/bpf_jit_comp64.c | 59 ++++++++++++++++++++++++++++----------- include/linux/filter.h | 2 +- kernel/bpf/core.c | 17 +++++------ 5 files changed, 65 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index a3b0e693a125..bbea4f36f9f2 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -325,4 +325,9 @@ #define A64_MRS_SP_EL0(Rt) \ aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_SP_EL0) +/* Barriers */ +#define A64_SB aarch64_insn_get_sb_value() +#define A64_DSB_NSH (aarch64_insn_get_dsb_base_value() | 0x7 << 8) +#define A64_ISB aarch64_insn_get_isb_value() + #endif /* _BPF_JIT_H */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 2cab9063f563..b6c42b5c9668 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1630,9 +1630,14 @@ emit_cond_jmp: return ret; break; - /* speculation barrier */ + /* speculation barrier against v1 and v4 */ case BPF_ST | BPF_NOSPEC: - /* See bpf_jit_bypass_spec_v4() */ + if (alternative_has_cap_likely(ARM64_HAS_SB)) { + emit(A64_SB, ctx); + } else { + emit(A64_DSB_NSH, ctx); + emit(A64_ISB, ctx); + } break; /* ST: *(size *)(dst + off) = imm */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index a4335761b7f9..3665ff8bb4bc 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -414,6 +414,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code u32 *addrs, int pass, bool extra_pass) { enum stf_barrier_type stf_barrier = stf_barrier_type_get(); + bool sync_emitted, ori31_emitted; const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; int i, ret; @@ -806,26 +807,52 @@ emit_clear: /* * BPF_ST NOSPEC (speculation barrier) + * + * The following must act as a barrier against both Spectre v1 + * and v4 if we requested both mitigations. Therefore, also emit + * 'isync; sync' on E500 or 'ori31' on BOOK3S_64 in addition to + * the insns needed for a Spectre v4 barrier. + * + * If we requested only !bypass_spec_v1 OR only !bypass_spec_v4, + * we can skip the respective other barrier type as an + * optimization. 
*/ case BPF_ST | BPF_NOSPEC: - switch (stf_barrier) { - case STF_BARRIER_EIEIO: - EMIT(PPC_RAW_EIEIO() | 0x02000000); - break; - case STF_BARRIER_SYNC_ORI: + sync_emitted = false; + ori31_emitted = false; +#ifdef CONFIG_PPC_E500 + if (!bpf_jit_bypass_spec_v1()) { + EMIT(PPC_RAW_ISYNC()); EMIT(PPC_RAW_SYNC()); - EMIT(PPC_RAW_LD(tmp1_reg, _R13, 0)); - EMIT(PPC_RAW_ORI(_R31, _R31, 0)); - break; - case STF_BARRIER_FALLBACK: - ctx->seen |= SEEN_FUNC; - PPC_LI64(_R12, dereference_kernel_function_descriptor(bpf_stf_barrier)); - EMIT(PPC_RAW_MTCTR(_R12)); - EMIT(PPC_RAW_BCTRL()); - break; - case STF_BARRIER_NONE: - break; + sync_emitted = true; + } +#endif + if (!bpf_jit_bypass_spec_v4()) { + switch (stf_barrier) { + case STF_BARRIER_EIEIO: + EMIT(PPC_RAW_EIEIO() | 0x02000000); + break; + case STF_BARRIER_SYNC_ORI: + if (!sync_emitted) + EMIT(PPC_RAW_SYNC()); + EMIT(PPC_RAW_LD(tmp1_reg, _R13, 0)); + EMIT(PPC_RAW_ORI(_R31, _R31, 0)); + ori31_emitted = true; + break; + case STF_BARRIER_FALLBACK: + ctx->seen |= SEEN_FUNC; + PPC_LI64(_R12, dereference_kernel_function_descriptor(bpf_stf_barrier)); + EMIT(PPC_RAW_MTCTR(_R12)); + EMIT(PPC_RAW_BCTRL()); + break; + case STF_BARRIER_NONE: + break; + } } +#ifdef CONFIG_PPC_BOOK3S_64 + if (!bpf_jit_bypass_spec_v1() && !ori31_emitted) + EMIT(PPC_RAW_ORI(_R31, _R31, 0)); +#endif break; /* diff --git a/include/linux/filter.h b/include/linux/filter.h index f5cf4d35d83e..eca229752cbe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -82,7 +82,7 @@ struct ctl_table_header; #define BPF_CALL_ARGS 0xe0 /* unused opcode to mark speculation barrier for mitigating - * Speculative Store Bypass + * Spectre v1 and v4 */ #define BPF_NOSPEC 0xc0 diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f9bd9625438b..e536a34a32c8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2102,14 +2102,15 @@ out: #undef COND_JMP /* ST, STX and LDX*/ ST_NOSPEC: - /* Speculation barrier for mitigating Speculative Store Bypass. - * In case of arm64, we rely on the firmware mitigation as - * controlled via the ssbd kernel parameter. Whenever the - * mitigation is enabled, it works for all of the kernel code - * with no need to provide any additional instructions here. - * In case of x86, we use 'lfence' insn for mitigation. We - * reuse preexisting logic from Spectre v1 mitigation that - * happens to produce the required code on x86 for v4 as well. + /* Speculation barrier for mitigating Speculative Store Bypass, + * Bounds-Check Bypass and Type Confusion. In case of arm64, we + * rely on the firmware mitigation as controlled via the ssbd + * kernel parameter. Whenever the mitigation is enabled, it + * works for all of the kernel code with no need to provide any + * additional instructions here. In case of x86, we use 'lfence' + * insn for mitigation. We reuse preexisting logic from Spectre + * v1 mitigation that happens to produce the required code on + * x86 for v4 as well. */ barrier_nospec(); CONT; -- cgit v1.2.3 From 9124a4508007f146206a279f0c5e81dde314bda1 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 23:20:24 +0200 Subject: bpf: Rename sanitize_stack_spill to nospec_result This is made to clarify that this flag will cause a nospec to be added after this insn and can therefore be relied upon to reduce speculative path analysis. 
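For illustration, a hypothetical sketch (not taken from the patch): when an unprivileged program's stack store is marked with nospec_result, e.g. because the store could speculatively be bypassed and leak a pointer previously spilled to the same slot, the later patching step appends a BPF_ST_NOSPEC() right after it, so the patched sequence corresponds to:

        BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* insn with aux->nospec_result set */
        BPF_ST_NOSPEC(),                                /* barrier patched in by the verifier */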
Signed-off-by: Luis Gerhorst Acked-by: Kumar Kartikeya Dwivedi Cc: Henriette Herzog Cc: Maximilian Ott Cc: Milan Stephan Link: https://lore.kernel.org/r/20250603212024.338154-1-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 256274acb1d8..2b0954202226 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -580,7 +580,7 @@ struct bpf_insn_aux_data { u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ - bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ + bool nospec_result; /* result is unsafe under speculation, nospec must follow */ bool zext_dst; /* this insn zero extends dst reg */ bool needs_zext; /* alu op needs to clear upper bits */ bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04465e317f10..79ae0ee395b0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5027,7 +5027,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (sanitize) - env->insn_aux_data[insn_idx].sanitize_stack_spill = true; + env->insn_aux_data[insn_idx].nospec_result = true; } err = destroy_if_dynptr_stack_slot(env, state, spi); @@ -20930,7 +20930,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (type == BPF_WRITE && - env->insn_aux_data[i + delta].sanitize_stack_spill) { + env->insn_aux_data[i + delta].nospec_result) { struct bpf_insn patch[] = { *insn, BPF_ST_NOSPEC(), -- cgit v1.2.3 From d6f1c85f22534d2d9fea9b32645da19c91ebe7d2 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 23:24:28 +0200 Subject: bpf: Fall back to nospec for Spectre v1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This implements the core of the series and causes the verifier to fall back to mitigating Spectre v1 using speculation barriers. The approach was presented at LPC'24 [1] and RAID'24 [2]. If we find any forbidden behavior on a speculative path, we insert a nospec (e.g., lfence speculation barrier on x86) before the instruction and stop verifying the path. While verifying a speculative path, we can furthermore stop verification of that path whenever we encounter a nospec instruction. A minimal example program would look as follows: A = true B = true if A goto e f() if B goto e unsafe() e: exit There are the following speculative and non-speculative paths (`cur->speculative` and `speculative` referring to the value of the push_stack() parameters): - A = true - B = true - if A goto e - A && !cur->speculative && !speculative - exit - !A && !cur->speculative && speculative - f() - if B goto e - B && cur->speculative && !speculative - exit - !B && cur->speculative && speculative - unsafe() If f() contains any unsafe behavior under Spectre v1 and the unsafe behavior matches `state->speculative && error_recoverable_with_nospec(err)`, do_check() will now add a nospec before f() instead of rejecting the program: A = true B = true if A goto e nospec f() if B goto e unsafe() e: exit Alternatively, the algorithm also takes advantage of nospec instructions inserted for other reasons (e.g., Spectre v4). 
Taking the program above as an example, speculative path exploration can stop before f() if a nospec was inserted there because of Spectre v4 sanitization. In this example, all instructions after the nospec are dead code (and with the nospec they are also dead code speculatively). For this, it relies on the fact that speculation barriers generally prevent all later instructions from executing if the speculation was not correct: * On Intel x86_64, lfence acts as a full speculation barrier, not only as a load fence [3]: An LFENCE instruction or a serializing instruction will ensure that no later instructions execute, even speculatively, until all prior instructions complete locally. [...] Inserting an LFENCE instruction after a bounds check prevents later operations from executing before the bound check completes. This was experimentally confirmed in [4]. * On AMD x86_64, lfence is dispatch-serializing [5] (requires MSR C001_1029[1] to be set if the MSR is supported, this happens in init_amd()). AMD further specifies "A dispatch serializing instruction forces the processor to retire the serializing instruction and all previous instructions before the next instruction is executed" [8]. As dispatch is not specific to memory loads or branches, lfence therefore also affects all instructions there. Also, if retiring a branch means its PC change becomes architectural (as it should be), any "wrong" speculation is aborted as required for this series. * ARM's SB speculation barrier instruction also affects "any instruction that appears later in the program order than the barrier" [6]. * PowerPC's barrier also affects all subsequent instructions [7]: [...] executing an ori R31,R31,0 instruction ensures that all instructions preceding the ori R31,R31,0 instruction have completed before the ori R31,R31,0 instruction completes, and that no subsequent instructions are initiated, even out-of-order, until after the ori R31,R31,0 instruction completes. The ori R31,R31,0 instruction may complete before storage accesses associated with instructions preceding the ori R31,R31,0 instruction have been performed. Regarding the example, this implies that `if B goto e` will not execute before `if A goto e` completes. Once `if A goto e` completes, the CPU should find that the speculation was wrong and continue with `exit`. If there is any other path that leads to `if B goto e` (and therefore `unsafe()`) without going through `if A goto e`, then a nospec will still be needed there. However, this patch assumes this other path will be explored separately and therefore be discovered by the verifier even if the exploration discussed here stops at the nospec. This patch furthermore has the unfortunate consequence that Spectre v1 mitigations now only support architectures which implement BPF_NOSPEC. Before this commit, Spectre v1 mitigations prevented exploits by rejecting the programs on all architectures. Because some JITs do not implement BPF_NOSPEC, this patch therefore may regress unpriv BPF's security to a limited extent: * The regression is limited to systems that are vulnerable to Spectre v1, have unprivileged BPF enabled, and do NOT emit insns for BPF_NOSPEC. The latter is not the case for x86 64- and 32-bit, arm64, and powerpc 64-bit and they are therefore not affected by the regression. According to commit a6f6a95f2580 ("LoongArch, bpf: Fix jit to skip speculation barrier opcode"), LoongArch is not vulnerable to Spectre v1 and therefore also not affected by the regression.
* To the best of my knowledge, this regression may therefore only affect MIPS. This is deemed acceptable because unpriv BPF is still disabled there by default. As stated in a previous commit, BPF_NOSPEC could be implemented for MIPS based on GCC's speculation_barrier implementation. * It is unclear which other architectures (besides x86 64- and 32-bit, ARM64, PowerPC 64-bit, LoongArch, and MIPS) supported by the kernel are vulnerable to Spectre v1. Also, it is not clear if barriers are available on these architectures. Implementing BPF_NOSPEC on these architectures is therefore non-trivial. Searching GCC and the kernel for speculation barrier implementations for these architectures yielded no results. * If any of those regressed systems is also vulnerable to Spectre v4, the system was already vulnerable to Spectre v4 attacks based on unpriv BPF before this patch and the impact is therefore further limited. As an alternative to regressing security, one could still reject programs if the architecture does not emit BPF_NOSPEC (e.g., by removing the empty BPF_NOSPEC-case from all JITs except for LoongArch, where it appears justified). However, this will cause rejections on these archs that are likely unfounded in the vast majority of cases. In the tests, some cases are now successful where we previously had a false positive (i.e., a rejection). Change them to reflect where the nospec should be inserted (using __xlated_unpriv) and modify the error message if the nospec is able to mitigate a problem that previously shadowed another problem (in that case __xlated_unpriv does not work, therefore just add a comment). To avoid duplicating this ifdef whenever we check for nospec insns using __xlated_unpriv, define SPEC_V1 once in bpf_misc.h. This also improves readability. PowerPC can probably also be added here. However, omit it for now because the BPF CI currently does not include a test. Limit error_recoverable_with_nospec() to EPERM, EACCES, and EINVAL (and not everything except for EFAULT and ENOMEM) as it already has the desired effect for most real-world programs. I briefly went through all the occurrences of EPERM, EINVAL, and EACCES in verifier.c to validate that catching them like this makes sense. Thanks to Dustin for their help in checking the vendor documentation.
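Condensed to its core (simplified from the verifier.c hunk below, with error handling and comments trimmed), the control flow added to do_check() is:

        if (state->speculative && cur_aux(env)->nospec)
                goto process_bpf_exit; /* barrier already present, stop this speculative path */

        err = do_check_insn(env, &do_print_state);
        if (state->speculative && error_recoverable_with_nospec(err)) {
                /* Unsafe only on a speculative path: patch in a nospec
                 * before the insn instead of rejecting the program.
                 */
                cur_aux(env)->nospec = true;
                cur_aux(env)->alu_state = 0;
                goto process_bpf_exit;
        } else if (err < 0) {
                return err; /* architecturally unsafe, reject as before */
        }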
[1] https://lpc.events/event/18/contributions/1954/ ("Mitigating Spectre-PHT using Speculation Barriers in Linux eBPF") [2] https://arxiv.org/pdf/2405.00078 ("VeriFence: Lightweight and Precise Spectre Defenses for Untrusted Linux Kernel Extensions") [3] https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/runtime-speculative-side-channel-mitigations.html ("Managed Runtime Speculative Execution Side Channel Mitigations") [4] https://dl.acm.org/doi/pdf/10.1145/3359789.3359837 ("Speculator: a tool to analyze speculative execution attacks and mitigations" - Section 4.6 "Stopping Speculative Execution") [5] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/software-techniques-for-managing-speculation.pdf ("White Paper - SOFTWARE TECHNIQUES FOR MANAGING SPECULATION ON AMD PROCESSORS - REVISION 5.09.23") [6] https://developer.arm.com/documentation/ddi0597/2020-12/Base-Instructions/SB--Speculation-Barrier- ("SB - Speculation Barrier - Arm Armv8-A A32/T32 Instruction Set Architecture (2020-12)") [7] https://wiki.raptorcs.com/w/images/5/5f/OPF_PowerISA_v3.1C.pdf ("Power ISA™ - Version 3.1C - May 26, 2024 - Section 9.2.1 of Book III") [8] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/40332.pdf ("AMD64 Architecture Programmer’s Manual Volumes 1–5 - Revision 4.08 - April 2024 - 7.6.4 Serializing Instructions") Signed-off-by: Luis Gerhorst Acked-by: Kumar Kartikeya Dwivedi Acked-by: Henriette Herzog Cc: Dustin Nguyen Cc: Maximilian Ott Cc: Milan Stephan Link: https://lore.kernel.org/r/20250603212428.338473-1-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 78 ++++++++++++++++++++-- tools/testing/selftests/bpf/progs/bpf_misc.h | 4 ++ tools/testing/selftests/bpf/progs/verifier_and.c | 8 ++- .../testing/selftests/bpf/progs/verifier_bounds.c | 61 +++++++++++++---- tools/testing/selftests/bpf/progs/verifier_movsx.c | 16 ++++- .../testing/selftests/bpf/progs/verifier_unpriv.c | 8 ++- .../selftests/bpf/progs/verifier_value_ptr_arith.c | 16 +++-- tools/testing/selftests/bpf/verifier/dead_code.c | 3 +- tools/testing/selftests/bpf/verifier/jmp32.c | 33 +++------ tools/testing/selftests/bpf/verifier/jset.c | 10 ++- 11 files changed, 184 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2b0954202226..e6c26393c029 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -580,6 +580,7 @@ struct bpf_insn_aux_data { u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ + bool nospec; /* do not execute this instruction speculatively */ bool nospec_result; /* result is unsafe under speculation, nospec must follow */ bool zext_dst; /* this insn zero extends dst reg */ bool needs_zext; /* alu op needs to clear upper bits */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 79ae0ee395b0..b1f797616f20 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2013,6 +2013,18 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, return 0; } +static bool error_recoverable_with_nospec(int err) +{ + /* Should only return true for non-fatal errors that are allowed to + * occur during speculative verification. 
For these we can insert a + * nospec and the program might still be accepted. Do not include + * something like ENOMEM because it is likely to re-occur for the next + * architectural path once it has been recovered-from in all speculative + * paths. + */ + return err == -EPERM || err == -EACCES || err == -EINVAL; +} + static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, bool speculative) @@ -11147,7 +11159,7 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env) { return &env->insn_aux_data[env->insn_idx]; } @@ -14015,7 +14027,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || + BPF_SRC(insn->code) == BPF_K || + cur_aux(env)->nospec; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -19732,10 +19746,41 @@ static int do_check(struct bpf_verifier_env *env) sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; + /* Reduce verification complexity by stopping speculative path + * verification when a nospec is encountered. + */ + if (state->speculative && cur_aux(env)->nospec) + goto process_bpf_exit; + err = do_check_insn(env, &do_print_state); - if (err < 0) { + if (state->speculative && error_recoverable_with_nospec(err)) { + /* Prevent this speculative path from ever reaching the + * insn that would have been unsafe to execute. + */ + cur_aux(env)->nospec = true; + /* If it was an ADD/SUB insn, potentially remove any + * markings for alu sanitization. + */ + cur_aux(env)->alu_state = 0; + goto process_bpf_exit; + } else if (err < 0) { return err; } else if (err == PROCESS_BPF_EXIT) { + goto process_bpf_exit; + } + WARN_ON_ONCE(err); + + if (state->speculative && cur_aux(env)->nospec_result) { + /* If we are on a path that performed a jump-op, this + * may skip a nospec patched-in after the jump. This can + * currently never happen because nospec_result is only + * used for the write-ops + * `*(size*)(dst_reg+off)=src_reg|imm32` which must + * never skip the following insn. Still, add a warning + * to document this in case nospec_result is used + * elsewhere in the future. + */ + WARN_ON_ONCE(env->insn_idx != prev_insn_idx + 1); process_bpf_exit: mark_verifier_state_scratched(env); update_branch_counts(env, env->cur_state); @@ -19753,7 +19798,6 @@ process_bpf_exit: continue; } } - WARN_ON_ONCE(err); } return 0; @@ -20881,6 +20925,29 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) bpf_convert_ctx_access_t convert_ctx_access; u8 mode; + if (env->insn_aux_data[i + delta].nospec) { + WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state); + struct bpf_insn patch[] = { + BPF_ST_NOSPEC(), + *insn, + }; + + cnt = ARRAY_SIZE(patch); + new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + /* This can not be easily merged with the + * nospec_result-case, because an insn may require a + * nospec before and after itself. Therefore also do not + * 'continue' here but potentially apply further + * patching to insn. *insn should equal patch[1] now. 
+ */ + } + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || @@ -20931,6 +20998,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (type == BPF_WRITE && env->insn_aux_data[i + delta].nospec_result) { + /* nospec_result is only used to mitigate Spectre v4 and + * to limit verification-time for Spectre v1. + */ struct bpf_insn patch[] = { *insn, BPF_ST_NOSPEC(), diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 6e208e24ba3b..a678463e972c 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -231,4 +231,8 @@ #define CAN_USE_LOAD_ACQ_STORE_REL #endif +#if defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) +#define SPEC_V1 +#endif + #endif diff --git a/tools/testing/selftests/bpf/progs/verifier_and.c b/tools/testing/selftests/bpf/progs/verifier_and.c index e97e518516b6..2b4fdca162be 100644 --- a/tools/testing/selftests/bpf/progs/verifier_and.c +++ b/tools/testing/selftests/bpf/progs/verifier_and.c @@ -85,8 +85,14 @@ l0_%=: r0 = r0; \ SEC("socket") __description("check known subreg with unknown reg") -__success __failure_unpriv __msg_unpriv("R1 !read_ok") +__success __success_unpriv __retval(0) +#ifdef SPEC_V1 +__xlated_unpriv("if w0 < 0x1 goto pc+2") +__xlated_unpriv("nospec") /* inserted to prevent `R1 !read_ok'` */ +__xlated_unpriv("goto pc-1") /* `r1 = *(u32*)(r1 + 512)`, sanitized dead code */ +__xlated_unpriv("r0 = 0") +#endif __naked void known_subreg_with_unknown_reg(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 0eb33bb801b5..30e16153fdf1 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -620,8 +620,14 @@ l1_%=: exit; \ SEC("socket") __description("bounds check mixed 32bit and 64bit arithmetic. test1") -__success __failure_unpriv __msg_unpriv("R0 invalid mem access 'scalar'") +__success __success_unpriv __retval(0) +#ifdef SPEC_V1 +__xlated_unpriv("goto pc+2") +__xlated_unpriv("nospec") /* inserted to prevent `R0 invalid mem access 'scalar'` */ +__xlated_unpriv("goto pc-1") /* sanitized dead code */ +__xlated_unpriv("exit") +#endif __naked void _32bit_and_64bit_arithmetic_test1(void) { asm volatile (" \ @@ -643,8 +649,14 @@ l1_%=: exit; \ SEC("socket") __description("bounds check mixed 32bit and 64bit arithmetic. 
test2") -__success __failure_unpriv __msg_unpriv("R0 invalid mem access 'scalar'") +__success __success_unpriv __retval(0) +#ifdef SPEC_V1 +__xlated_unpriv("goto pc+2") +__xlated_unpriv("nospec") /* inserted to prevent `R0 invalid mem access 'scalar'` */ +__xlated_unpriv("goto pc-1") /* sanitized dead code */ +__xlated_unpriv("exit") +#endif __naked void _32bit_and_64bit_arithmetic_test2(void) { asm volatile (" \ @@ -691,9 +703,14 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds check for reg = 0, reg xor 1") -__success __failure_unpriv -__msg_unpriv("R0 min value is outside of the allowed memory range") +__success __success_unpriv __retval(0) +#ifdef SPEC_V1 +__xlated_unpriv("if r1 != 0x0 goto pc+2") +__xlated_unpriv("nospec") /* inserted to prevent `R0 min value is outside of the allowed memory range` */ +__xlated_unpriv("goto pc-1") /* sanitized dead code */ +__xlated_unpriv("r0 = 0") +#endif __naked void reg_0_reg_xor_1(void) { asm volatile (" \ @@ -719,9 +736,14 @@ l1_%=: r0 = 0; \ SEC("socket") __description("bounds check for reg32 = 0, reg32 xor 1") -__success __failure_unpriv -__msg_unpriv("R0 min value is outside of the allowed memory range") +__success __success_unpriv __retval(0) +#ifdef SPEC_V1 +__xlated_unpriv("if w1 != 0x0 goto pc+2") +__xlated_unpriv("nospec") /* inserted to prevent `R0 min value is outside of the allowed memory range` */ +__xlated_unpriv("goto pc-1") /* sanitized dead code */ +__xlated_unpriv("r0 = 0") +#endif __naked void reg32_0_reg32_xor_1(void) { asm volatile (" \ @@ -747,9 +769,14 @@ l1_%=: r0 = 0; \ SEC("socket") __description("bounds check for reg = 2, reg xor 3") -__success __failure_unpriv -__msg_unpriv("R0 min value is outside of the allowed memory range") +__success __success_unpriv __retval(0) +#ifdef SPEC_V1 +__xlated_unpriv("if r1 > 0x0 goto pc+2") +__xlated_unpriv("nospec") /* inserted to prevent `R0 min value is outside of the allowed memory range` */ +__xlated_unpriv("goto pc-1") /* sanitized dead code */ +__xlated_unpriv("r0 = 0") +#endif __naked void reg_2_reg_xor_3(void) { asm volatile (" \ @@ -829,9 +856,14 @@ l1_%=: r0 = 0; \ SEC("socket") __description("bounds check for reg > 0, reg xor 3") -__success __failure_unpriv -__msg_unpriv("R0 min value is outside of the allowed memory range") +__success __success_unpriv __retval(0) +#ifdef SPEC_V1 +__xlated_unpriv("if r1 >= 0x0 goto pc+2") +__xlated_unpriv("nospec") /* inserted to prevent `R0 min value is outside of the allowed memory range` */ +__xlated_unpriv("goto pc-1") /* sanitized dead code */ +__xlated_unpriv("r0 = 0") +#endif __naked void reg_0_reg_xor_3(void) { asm volatile (" \ @@ -858,9 +890,14 @@ l1_%=: r0 = 0; \ SEC("socket") __description("bounds check for reg32 > 0, reg32 xor 3") -__success __failure_unpriv -__msg_unpriv("R0 min value is outside of the allowed memory range") +__success __success_unpriv __retval(0) +#ifdef SPEC_V1 +__xlated_unpriv("if w1 >= 0x0 goto pc+2") +__xlated_unpriv("nospec") /* inserted to prevent `R0 min value is outside of the allowed memory range` */ +__xlated_unpriv("goto pc-1") /* sanitized dead code */ +__xlated_unpriv("r0 = 0") +#endif __naked void reg32_0_reg32_xor_3(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_movsx.c b/tools/testing/selftests/bpf/progs/verifier_movsx.c index 994bbc346d25..a4d8814eb5ed 100644 --- a/tools/testing/selftests/bpf/progs/verifier_movsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_movsx.c @@ -245,7 +245,13 @@ l0_%=: \ SEC("socket") __description("MOV32SX, S8, var_off 
not u32_max, positive after s8 extension") __success __retval(0) -__failure_unpriv __msg_unpriv("frame pointer is read only") +__success_unpriv +#ifdef SPEC_V1 +__xlated_unpriv("w0 = 0") +__xlated_unpriv("exit") +__xlated_unpriv("nospec") /* inserted to prevent `frame pointer is read only` */ +__xlated_unpriv("goto pc-1") +#endif __naked void mov64sx_s32_varoff_2(void) { asm volatile (" \ @@ -267,7 +273,13 @@ l0_%=: \ SEC("socket") __description("MOV32SX, S8, var_off not u32_max, negative after s8 extension") __success __retval(0) -__failure_unpriv __msg_unpriv("frame pointer is read only") +__success_unpriv +#ifdef SPEC_V1 +__xlated_unpriv("w0 = 0") +__xlated_unpriv("exit") +__xlated_unpriv("nospec") /* inserted to prevent `frame pointer is read only` */ +__xlated_unpriv("goto pc-1") +#endif __naked void mov64sx_s32_varoff_3(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_unpriv.c b/tools/testing/selftests/bpf/progs/verifier_unpriv.c index db52ba66e880..9bd4aef72140 100644 --- a/tools/testing/selftests/bpf/progs/verifier_unpriv.c +++ b/tools/testing/selftests/bpf/progs/verifier_unpriv.c @@ -572,8 +572,14 @@ l0_%=: exit; \ SEC("socket") __description("alu32: mov u32 const") -__success __failure_unpriv __msg_unpriv("R7 invalid mem access 'scalar'") +__success __success_unpriv __retval(0) +#ifdef SPEC_V1 +__xlated_unpriv("if r0 == 0x0 goto pc+2") +__xlated_unpriv("nospec") /* inserted to prevent `R7 invalid mem access 'scalar'` */ +__xlated_unpriv("goto pc-1") /* sanitized dead code */ +__xlated_unpriv("exit") +#endif __naked void alu32_mov_u32_const(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c index 5ba6e53571c8..fcea9819e359 100644 --- a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c +++ b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c @@ -398,7 +398,8 @@ l2_%=: r0 = 1; \ SEC("socket") __description("map access: mixing value pointer and scalar, 1") -__success __failure_unpriv __msg_unpriv("R2 pointer comparison prohibited") +__success __failure_unpriv +__msg_unpriv("R2 tried to add from different maps, paths or scalars, pointer arithmetic with it prohibited for !root") __retval(0) __naked void value_pointer_and_scalar_1(void) { @@ -433,6 +434,7 @@ l2_%=: /* common instruction */ \ l3_%=: /* branch B */ \ r0 = 0x13371337; \ /* verifier follows fall-through */ \ + /* unpriv: nospec (inserted to prevent `R2 pointer comparison prohibited`) */\ if r2 != 0x100000 goto l4_%=; \ r0 = 0; \ exit; \ @@ -450,7 +452,8 @@ l4_%=: /* fake-dead code; targeted from branch A to \ SEC("socket") __description("map access: mixing value pointer and scalar, 2") -__success __failure_unpriv __msg_unpriv("R0 invalid mem access 'scalar'") +__success __failure_unpriv +__msg_unpriv("R2 tried to add from different maps, paths or scalars, pointer arithmetic with it prohibited for !root") __retval(0) __naked void value_pointer_and_scalar_2(void) { @@ -492,6 +495,7 @@ l4_%=: /* fake-dead code; targeted from branch A to \ * prevent dead code sanitization, rejected \ * via branch B however \ */ \ + /* unpriv: nospec (inserted to prevent `R0 invalid mem access 'scalar'`) */\ r0 = *(u8*)(r0 + 0); \ r0 = 0; \ exit; \ @@ -1296,9 +1300,13 @@ l0_%=: r0 = 1; \ SEC("socket") __description("map access: value_ptr -= unknown scalar, 2") -__success __failure_unpriv -__msg_unpriv("R0 pointer arithmetic of map value goes out of range") +__success 
__success_unpriv __retval(1) +#ifdef SPEC_V1 +__xlated_unpriv("r1 &= 7") +__xlated_unpriv("nospec") /* inserted to prevent `R0 pointer arithmetic of map value goes out of range` */ +__xlated_unpriv("r0 -= r1") +#endif __naked void value_ptr_unknown_scalar_2_2(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index ee454327e5c6..77207b498c6f 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -2,14 +2,13 @@ "dead code: start", .insns = { BPF_JMP_IMM(BPF_JA, 0, 0, 2), + /* unpriv: nospec (inserted to prevent "R9 !read_ok") */ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_JMP_IMM(BPF_JA, 0, 0, 2), BPF_MOV64_IMM(BPF_REG_0, 7), BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, -4), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R9 !read_ok", - .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, }, diff --git a/tools/testing/selftests/bpf/verifier/jmp32.c b/tools/testing/selftests/bpf/verifier/jmp32.c index 43776f6f92f4..91d83e9cb148 100644 --- a/tools/testing/selftests/bpf/verifier/jmp32.c +++ b/tools/testing/selftests/bpf/verifier/jmp32.c @@ -84,11 +84,10 @@ BPF_JMP32_IMM(BPF_JSET, BPF_REG_7, 0x10, 1), BPF_EXIT_INSN(), BPF_JMP32_IMM(BPF_JGE, BPF_REG_7, 0x10, 1), + /* unpriv: nospec (inserted to prevent "R9 !read_ok") */ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R9 !read_ok", - .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -149,11 +148,10 @@ BPF_JMP32_IMM(BPF_JEQ, BPF_REG_7, 0x10, 1), BPF_EXIT_INSN(), BPF_JMP32_IMM(BPF_JSGE, BPF_REG_7, 0xf, 1), + /* unpriv: nospec (inserted to prevent "R9 !read_ok") */ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R9 !read_ok", - .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -214,11 +212,10 @@ BPF_JMP32_IMM(BPF_JNE, BPF_REG_7, 0x10, 1), BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x10, 1), BPF_EXIT_INSN(), + /* unpriv: nospec (inserted to prevent "R9 !read_ok") */ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R9 !read_ok", - .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -283,11 +280,10 @@ BPF_JMP32_REG(BPF_JGE, BPF_REG_7, BPF_REG_8, 1), BPF_EXIT_INSN(), BPF_JMP32_IMM(BPF_JGE, BPF_REG_7, 0x7ffffff0, 1), + /* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 invalid mem access 'scalar'", - .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -354,11 +350,10 @@ BPF_JMP32_REG(BPF_JGT, BPF_REG_7, BPF_REG_8, 1), BPF_EXIT_INSN(), BPF_JMP_IMM(BPF_JGT, BPF_REG_7, 0x7ffffff0, 1), + /* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 invalid mem access 'scalar'", - .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -425,11 +420,10 @@ BPF_JMP32_REG(BPF_JLE, BPF_REG_7, BPF_REG_8, 1), BPF_EXIT_INSN(), BPF_JMP32_IMM(BPF_JLE, BPF_REG_7, 0x7ffffff0, 1), + /* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 invalid mem access 'scalar'", - .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -496,11 +490,10 @@ BPF_JMP32_REG(BPF_JLT, 
BPF_REG_7, BPF_REG_8, 1), BPF_EXIT_INSN(), BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0x7ffffff0, 1), + /* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 invalid mem access 'scalar'", - .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -567,11 +560,10 @@ BPF_JMP32_REG(BPF_JSGE, BPF_REG_7, BPF_REG_8, 1), BPF_EXIT_INSN(), BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0x7ffffff0, 1), + /* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 invalid mem access 'scalar'", - .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -638,11 +630,10 @@ BPF_JMP32_REG(BPF_JSGT, BPF_REG_7, BPF_REG_8, 1), BPF_EXIT_INSN(), BPF_JMP_IMM(BPF_JSGT, BPF_REG_7, -2, 1), + /* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 invalid mem access 'scalar'", - .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -709,11 +700,10 @@ BPF_JMP32_REG(BPF_JSLE, BPF_REG_7, BPF_REG_8, 1), BPF_EXIT_INSN(), BPF_JMP_IMM(BPF_JSLE, BPF_REG_7, 0x7ffffff0, 1), + /* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 invalid mem access 'scalar'", - .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, @@ -780,11 +770,10 @@ BPF_JMP32_REG(BPF_JSLT, BPF_REG_7, BPF_REG_8, 1), BPF_EXIT_INSN(), BPF_JMP32_IMM(BPF_JSLT, BPF_REG_7, -1, 1), + /* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr_unpriv = "R0 invalid mem access 'scalar'", - .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, diff --git a/tools/testing/selftests/bpf/verifier/jset.c b/tools/testing/selftests/bpf/verifier/jset.c index 11fc68da735e..e901eefd774a 100644 --- a/tools/testing/selftests/bpf/verifier/jset.c +++ b/tools/testing/selftests/bpf/verifier/jset.c @@ -78,12 +78,11 @@ .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_JMP_IMM(BPF_JSET, BPF_REG_0, 1, 1), + /* unpriv: nospec (inserted to prevent "R9 !read_ok") */ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "R9 !read_ok", - .result_unpriv = REJECT, .retval = 1, .result = ACCEPT, }, @@ -136,13 +135,12 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), BPF_ALU64_IMM(BPF_OR, BPF_REG_0, 2), BPF_JMP_IMM(BPF_JSET, BPF_REG_0, 3, 1), + /* unpriv: nospec (inserted to prevent "R9 !read_ok") */ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "R9 !read_ok", - .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -154,16 +152,16 @@ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xff), BPF_JMP_IMM(BPF_JSET, BPF_REG_1, 0xf0, 3), BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 0x10, 1), + /* unpriv: nospec (inserted to prevent "R9 !read_ok") */ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), BPF_JMP_IMM(BPF_JSET, BPF_REG_1, 0x10, 1), BPF_EXIT_INSN(), BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0x10, 1), + 
/* unpriv: nospec (inserted to prevent "R9 !read_ok") */ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "R9 !read_ok", - .result_unpriv = REJECT, .result = ACCEPT, }, -- cgit v1.2.3 From baaebe0928bf321a1cd980d569e308dec66be94c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:26 -0700 Subject: Revert "bpf: use common instruction history across all states" This reverts commit 96a30e469ca1d2b8cc7811b40911f8614b558241. Next patches in the series modify propagate_precision() to allow arbitrary starting state. Precision propagation requires access to jump history, and arbitrary states represent history not belonging to `env->cur_state`. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 19 ++++---- kernel/bpf/verifier.c | 109 ++++++++++++++++++++++--------------------- 2 files changed, 64 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e6c26393c029..3e77befdbc4b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -344,7 +344,7 @@ struct bpf_func_state { #define MAX_CALL_FRAMES 8 -/* instruction history flags, used in bpf_insn_hist_entry.flags field */ +/* instruction history flags, used in bpf_jmp_history_entry.flags field */ enum { /* instruction references stack slot through PTR_TO_STACK register; * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) @@ -366,7 +366,7 @@ enum { static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); -struct bpf_insn_hist_entry { +struct bpf_jmp_history_entry { u32 idx; /* insn idx can't be bigger than 1 million */ u32 prev_idx : 20; @@ -459,14 +459,13 @@ struct bpf_verifier_state { * See get_loop_entry() for more information. */ struct bpf_verifier_state *loop_entry; - /* Sub-range of env->insn_hist[] corresponding to this state's - * instruction history. - * Backtracking is using it to go from last to first. - * For most states instruction history is short, 0-3 instructions. + /* jmp history recorded from first to last. + * backtracking is using it to go from last to first. + * For most states jmp_history_cnt is [0-3]. * For loops can go up to ~40. 
*/ - u32 insn_hist_start; - u32 insn_hist_end; + struct bpf_jmp_history_entry *jmp_history; + u32 jmp_history_cnt; u32 dfs_depth; u32 callback_unroll_depth; u32 may_goto_depth; @@ -776,9 +775,7 @@ struct bpf_verifier_env { int cur_postorder; } cfg; struct backtrack_state bt; - struct bpf_insn_hist_entry *insn_hist; - struct bpf_insn_hist_entry *cur_hist_ent; - u32 insn_hist_cap; + struct bpf_jmp_history_entry *cur_hist_ent; u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b1f797616f20..92f2dad5f453 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1660,6 +1660,13 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } +static void clear_jmp_history(struct bpf_verifier_state *state) +{ + kfree(state->jmp_history); + state->jmp_history = NULL; + state->jmp_history_cnt = 0; +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -1670,6 +1677,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, state->frame[i] = NULL; } kfree(state->refs); + clear_jmp_history(state); if (free_self) kfree(state); } @@ -1734,6 +1742,13 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, struct bpf_func_state *dst; int i, err; + dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, + src->jmp_history_cnt, sizeof(*dst_state->jmp_history), + GFP_USER); + if (!dst_state->jmp_history) + return -ENOMEM; + dst_state->jmp_history_cnt = src->jmp_history_cnt; + /* if dst has more stack frames then src frame, free them, this is also * necessary in case of exceptional exits using bpf_throw. */ @@ -1751,8 +1766,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; - dst_state->insn_hist_start = src->insn_hist_start; - dst_state->insn_hist_end = src->insn_hist_end; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; @@ -2820,14 +2833,9 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common(). - * But we do need to make sure to not clobber insn_hist, so we keep - * chaining insn_hist_start/insn_hist_end indices as for a normal - * child state. 
*/ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; - elem->st.insn_hist_start = env->cur_state->insn_hist_end; - elem->st.insn_hist_end = elem->st.insn_hist_start; frame = kzalloc(sizeof(*frame), GFP_KERNEL); if (!frame) goto err; @@ -3856,10 +3864,11 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s) } /* for any branch, call, exit record the history of jmps in the given state */ -static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) +static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags, u64 linked_regs) { - struct bpf_insn_hist_entry *p; + u32 cnt = cur->jmp_history_cnt; + struct bpf_jmp_history_entry *p; size_t alloc_size; /* combine instruction flags if we already recorded this instruction */ @@ -3879,32 +3888,29 @@ static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_s return 0; } - if (cur->insn_hist_end + 1 > env->insn_hist_cap) { - alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p)); - p = kvrealloc(env->insn_hist, alloc_size, GFP_USER); - if (!p) - return -ENOMEM; - env->insn_hist = p; - env->insn_hist_cap = alloc_size / sizeof(*p); - } + cnt++; + alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); + p = krealloc(cur->jmp_history, alloc_size, GFP_USER); + if (!p) + return -ENOMEM; + cur->jmp_history = p; - p = &env->insn_hist[cur->insn_hist_end]; + p = &cur->jmp_history[cnt - 1]; p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; p->linked_regs = linked_regs; - - cur->insn_hist_end++; + cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; return 0; } -static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env, - u32 hist_start, u32 hist_end, int insn_idx) +static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, + u32 hist_end, int insn_idx) { - if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx) - return &env->insn_hist[hist_end - 1]; + if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) + return &st->jmp_history[hist_end - 1]; return NULL; } @@ -3921,26 +3927,25 @@ static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env * * history entry recording a jump from last instruction of parent state and * first instruction of given state. 
*/ -static int get_prev_insn_idx(const struct bpf_verifier_env *env, - struct bpf_verifier_state *st, - int insn_idx, u32 hist_start, u32 *hist_endp) +static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, + u32 *history) { - u32 hist_end = *hist_endp; - u32 cnt = hist_end - hist_start; + u32 cnt = *history; - if (insn_idx == st->first_insn_idx) { + if (i == st->first_insn_idx) { if (cnt == 0) return -ENOENT; - if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx) + if (cnt == 1 && st->jmp_history[0].idx == i) return -ENOENT; } - if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) { - (*hist_endp)--; - return env->insn_hist[hist_end - 1].prev_idx; + if (cnt && st->jmp_history[cnt - 1].idx == i) { + i = st->jmp_history[cnt - 1].prev_idx; + (*history)--; } else { - return insn_idx - 1; + i--; } + return i; } static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) @@ -4121,7 +4126,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) /* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. */ -static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist) +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) { struct linked_regs linked_regs; bool some_precise = false; @@ -4166,7 +4171,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) + struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) { struct bpf_insn *insn = env->prog->insnsi + idx; u8 class = BPF_CLASS(insn->code); @@ -4584,7 +4589,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * SCALARS, as well as any other registers and slots that contribute to * a tracked state of given registers/stack slots, depending on specific BPF * assembly instructions (see backtrack_insns() for exact instruction handling - * logic). This backtracking relies on recorded insn_hist and is able to + * logic). This backtracking relies on recorded jmp_history and is able to * traverse entire chain of parent states. This process ends only when all the * necessary registers/slots and their transitive dependencies are marked as * precise. 
@@ -4701,9 +4706,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for (;;) { DECLARE_BITMAP(mask, 64); - u32 hist_start = st->insn_hist_start; - u32 hist_end = st->insn_hist_end; - struct bpf_insn_hist_entry *hist; + u32 history = st->jmp_history_cnt; + struct bpf_jmp_history_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", @@ -4741,7 +4745,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = 0; skip_first = false; } else { - hist = get_insn_hist_entry(env, hist_start, hist_end, i); + hist = get_jmp_hist_entry(st, history, i); err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { @@ -4758,7 +4762,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) */ return 0; subseq_idx = i; - i = get_prev_insn_idx(env, st, i, hist_start, &hist_end); + i = get_prev_insn_idx(st, i, &history); if (i == -ENOENT) break; if (i >= env->prog->len) { @@ -5122,7 +5126,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_insn_history(env, env->cur_state, insn_flags, 0); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5429,7 +5433,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_insn_history(env, env->cur_state, insn_flags, 0); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -16496,7 +16500,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (insn_flags) { - err = push_insn_history(env, this_branch, insn_flags, 0); + err = push_jmp_history(env, this_branch, insn_flags, 0); if (err) return err; } @@ -16554,7 +16558,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); if (err) return err; } @@ -19052,7 +19056,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || /* Avoid accumulating infinitely long jmp history */ - cur->insn_hist_end - cur->insn_hist_start > 40; + cur->jmp_history_cnt > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -19251,7 +19255,7 @@ hit: * the current state. */ if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_insn_history(env, cur, 0, 0); + err = err ? : push_jmp_history(env, cur, 0, 0); err = err ? : propagate_precision(env, &sl->state); if (err) return err; @@ -19333,8 +19337,8 @@ miss: cur->parent = new; cur->first_insn_idx = insn_idx; - cur->insn_hist_start = cur->insn_hist_end; cur->dfs_depth = new->dfs_depth + 1; + clear_jmp_history(cur); list_add(&new_sl->node, head); /* connect new state to parentage chain. 
Current frame needs all @@ -19704,7 +19708,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_insn_history(env, state, 0, 0); + err = push_jmp_history(env, state, 0, 0); if (err) return err; } @@ -24291,7 +24295,6 @@ err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); - kvfree(env->insn_hist); err_free_env: kvfree(env->cfg.insn_postorder); kvfree(env); -- cgit v1.2.3 From 96c6aa4c63af0bb0675c41b3e61a2fc7f6fed998 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:27 -0700 Subject: bpf: compute SCCs in program control flow graph Compute strongly connected components in the program CFG. Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc. Use Tarjan's algorithm for SCC computation adapted to run non-recursively. For debug purposes print out computed SCCs as a part of full program dump in compute_live_registers() at log level 2, e.g.: func#0 @0 Live regs before insn: 0: .......... (b4) w6 = 10 2 1: ......6... (18) r1 = 0xffff88810bbb5565 2 3: .1....6... (b4) w2 = 2 2 4: .12...6... (85) call bpf_trace_printk#6 2 5: ......6... (04) w6 += -1 2 6: ......6... (56) if w6 != 0x0 goto pc-6 7: .......... (b4) w6 = 5 1 8: ......6... (18) r1 = 0xffff88810bbb5567 1 10: .1....6... (b4) w2 = 2 1 11: .12...6... (85) call bpf_trace_printk#6 1 12: ......6... (04) w6 += -1 1 13: ......6... (56) if w6 != 0x0 goto pc-6 14: .......... (b4) w0 = 0 15: 0......... (95) exit ^^^ SCC number for the instruction Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 ++ kernel/bpf/verifier.c | 182 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3e77befdbc4b..95f5211610f4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -609,6 +609,11 @@ struct bpf_insn_aux_data { * accepts callback function as a parameter. */ bool calls_callback; + /* + * CFG strongly connected component this instruction belongs to, + * zero if it is a singleton SCC. + */ + u32 scc; /* registers alive before this instruction. */ u16 live_regs_before; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92f2dad5f453..75e4f6544b2a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24013,6 +24013,10 @@ static int compute_live_registers(struct bpf_verifier_env *env) if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "Live regs before insn:\n"); for (i = 0; i < insn_cnt; ++i) { + if (env->insn_aux_data[i].scc) + verbose(env, "%3d ", env->insn_aux_data[i].scc); + else + verbose(env, " "); verbose(env, "%3d: ", i); for (j = BPF_REG_0; j < BPF_REG_10; ++j) if (insn_aux[i].live_regs_before & BIT(j)) @@ -24034,6 +24038,180 @@ out: return err; } +/* + * Compute strongly connected components (SCCs) on the CFG. + * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc. + * If instruction is a sole member of its SCC and there are no self edges, + * assign it SCC number of zero. + * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation. 
+ */ +static int compute_scc(struct bpf_verifier_env *env) +{ + const u32 NOT_ON_STACK = U32_MAX; + + struct bpf_insn_aux_data *aux = env->insn_aux_data; + const u32 insn_cnt = env->prog->len; + int stack_sz, dfs_sz, err = 0; + u32 *stack, *pre, *low, *dfs; + u32 succ_cnt, i, j, t, w; + u32 next_preorder_num; + u32 next_scc_id; + bool assign_scc; + u32 succ[2]; + + next_preorder_num = 1; + next_scc_id = 1; + /* + * - 'stack' accumulates vertices in DFS order, see invariant comment below; + * - 'pre[t] == p' => preorder number of vertex 't' is 'p'; + * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n'; + * - 'dfs' DFS traversal stack, used to emulate explicit recursion. + */ + stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL); + if (!stack || !pre || !low || !dfs) { + err = -ENOMEM; + goto exit; + } + /* + * References: + * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms" + * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components" + * + * The algorithm maintains the following invariant: + * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]'; + * - then, vertex 'u' remains on stack while vertex 'v' is on stack. + * + * Consequently: + * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u', + * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack, + * and thus there is an SCC (loop) containing both 'u' and 'v'. + * - If 'low[v] == pre[v]', loops containing 'v' have been explored, + * and 'v' can be considered the root of some SCC. + * + * Here is a pseudo-code for an explicitly recursive version of the algorithm: + * + * NOT_ON_STACK = insn_cnt + 1 + * pre = [0] * insn_cnt + * low = [0] * insn_cnt + * scc = [0] * insn_cnt + * stack = [] + * + * next_preorder_num = 1 + * next_scc_id = 1 + * + * def recur(w): + * nonlocal next_preorder_num + * nonlocal next_scc_id + * + * pre[w] = next_preorder_num + * low[w] = next_preorder_num + * next_preorder_num += 1 + * stack.append(w) + * for s in successors(w): + * # Note: for classic algorithm the block below should look as: + * # + * # if pre[s] == 0: + * # recur(s) + * # low[w] = min(low[w], low[s]) + * # elif low[s] != NOT_ON_STACK: + * # low[w] = min(low[w], pre[s]) + * # + * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])' + * # does not break the invariant and makes itartive version of the algorithm + * # simpler. See 'Algorithm #3' from [2]. + * + * # 's' not yet visited + * if pre[s] == 0: + * recur(s) + * # if 's' is on stack, pick lowest reachable preorder number from it; + * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]', + * # so 'min' would be a noop. + * low[w] = min(low[w], low[s]) + * + * if low[w] == pre[w]: + * # 'w' is the root of an SCC, pop all vertices + * # below 'w' on stack and assign same SCC to them. + * while True: + * t = stack.pop() + * low[t] = NOT_ON_STACK + * scc[t] = next_scc_id + * if t == w: + * break + * next_scc_id += 1 + * + * for i in range(0, insn_cnt): + * if pre[i] == 0: + * recur(i) + * + * Below implementation replaces explicit recusion with array 'dfs'. 
+ */ + for (i = 0; i < insn_cnt; i++) { + if (pre[i]) + continue; + stack_sz = 0; + dfs_sz = 1; + dfs[0] = i; +dfs_continue: + while (dfs_sz) { + w = dfs[dfs_sz - 1]; + if (pre[w] == 0) { + low[w] = next_preorder_num; + pre[w] = next_preorder_num; + next_preorder_num++; + stack[stack_sz++] = w; + } + /* Visit 'w' successors */ + succ_cnt = insn_successors(env->prog, w, succ); + for (j = 0; j < succ_cnt; ++j) { + if (pre[succ[j]]) { + low[w] = min(low[w], low[succ[j]]); + } else { + dfs[dfs_sz++] = succ[j]; + goto dfs_continue; + } + } + /* + * Preserve the invariant: if some vertex above in the stack + * is reachable from 'w', keep 'w' on the stack. + */ + if (low[w] < pre[w]) { + dfs_sz--; + goto dfs_continue; + } + /* + * Assign SCC number only if component has two or more elements, + * or if component has a self reference. + */ + assign_scc = stack[stack_sz - 1] != w; + for (j = 0; j < succ_cnt; ++j) { + if (succ[j] == w) { + assign_scc = true; + break; + } + } + /* Pop component elements from stack */ + do { + t = stack[--stack_sz]; + low[t] = NOT_ON_STACK; + if (assign_scc) + aux[t].scc = next_scc_id; + } while (t != w); + if (assign_scc) + next_scc_id++; + dfs_sz--; + } + } +exit: + kvfree(stack); + kvfree(pre); + kvfree(low); + kvfree(dfs); + return err; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -24155,6 +24333,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret) goto skip_full_check; + ret = compute_scc(env); + if (ret < 0) + goto skip_full_check; + ret = compute_live_registers(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From c9e31900b54cadf5398dfb838c0a63effa1defec Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:33 -0700 Subject: bpf: propagate read/precision marks over state graph backedges Current loop_entry-based exact states comparison logic does not handle the following case: .-> A --. Assume the states are visited in the order A, B, C. | | | Assume that state B reaches a state equivalent to state A. | v v At this point, state C is not processed yet, so state A '-- B C has not received any read or precision marks from C. As a result, these marks won't be propagated to B. If B has incomplete marks, it is unsafe to use it in states_equal() checks. This commit replaces the existing logic with the following: - Strongly connected components (SCCs) are computed over the program's control flow graph (intraprocedurally). - When a verifier state enters an SCC, that state is recorded as the SCC entry point. - When a verifier state is found equivalent to another (e.g., B to A in the example), it is recorded as a states graph backedge. Backedges are accumulated per SCC. - When an SCC entry state reaches `branches == 0`, read and precision marks are propagated through the backedges (e.g., from A to B, from C to A, and then again from A to B). To support nested subprogram calls, the entry state and backedge list are associated not with the SCC itself but with an object called `bpf_scc_callchain`. A callchain is a tuple `(callsite*, scc_id)`, where `callsite` is the index of a call instruction for each frame except the last. See the comments added in `is_state_visited()` and `compute_scc_callchain()` for more details. 
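
To make the fixed-point nature of this propagation concrete, here is a small user-space sketch. It is illustrative only, not kernel code: read/precision marks are reduced to plain bitmasks, the propagate_{liveness,precision} machinery is reduced to a bitwise OR, and the names (struct edge, marks[], dst/src) are invented for the example. It mirrors the A/B/C diagram above: B was found equal to A, and C's marks reach A through its parent link, modelled here as a second edge for simplicity.

#include <stdbool.h>
#include <stdio.h>

struct edge {
	int dst;	/* state whose marks are incomplete    */
	int src;	/* state the marks are propagated from */
};

int main(void)
{
	/* States A, B, C from the diagram above; marks are toy bitmasks. */
	unsigned int marks[3] = { 0x1, 0x0, 0x6 };
	/* B is a backedge to A; C feeds A via its parent link. */
	struct edge edges[] = { { 1, 0 }, { 0, 2 } };
	bool changed;

	do {
		changed = false;
		for (unsigned int i = 0; i < sizeof(edges) / sizeof(edges[0]); i++) {
			unsigned int merged = marks[edges[i].dst] | marks[edges[i].src];

			if (merged != marks[edges[i].dst]) {
				marks[edges[i].dst] = merged;
				changed = true;
			}
		}
	} while (changed);

	for (int i = 0; i < 3; i++)
		printf("state %c marks: 0x%x\n", 'A' + i, marks[i]);
	return 0;
}

The first pass gives B only A's original marks; only after C's marks have reached A does a second pass complete B, which is why the backedge list has to be re-walked until a full pass makes no change.
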
Fixes: 2a0992829ea3 ("bpf: correct loop detection for iterators convergence") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 38 ++++ kernel/bpf/verifier.c | 452 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 422 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 95f5211610f4..b0273f759589 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -459,6 +459,10 @@ struct bpf_verifier_state { * See get_loop_entry() for more information. */ struct bpf_verifier_state *loop_entry; + /* if this state is a backedge state then equal_state + * records cached state to which this state is equal. + */ + struct bpf_verifier_state *equal_state; /* jmp history recorded from first to last. * backtracking is using it to go from last to first. * For most states jmp_history_cnt is [0-3]. @@ -723,6 +727,37 @@ struct bpf_idset { u32 ids[BPF_ID_MAP_SIZE]; }; +/* see verifier.c:compute_scc_callchain() */ +struct bpf_scc_callchain { + /* call sites from bpf_verifier_state->frame[*]->callsite leading to this SCC */ + u32 callsites[MAX_CALL_FRAMES - 1]; + /* last frame in a chain is identified by SCC id */ + u32 scc; +}; + +/* verifier state waiting for propagate_backedges() */ +struct bpf_scc_backedge { + struct bpf_scc_backedge *next; + struct bpf_verifier_state state; +}; + +struct bpf_scc_visit { + struct bpf_scc_callchain callchain; + /* first state in current verification path that entered SCC + * identified by the callchain + */ + struct bpf_verifier_state *entry_state; + struct bpf_scc_backedge *backedges; /* list of backedges */ +}; + +/* An array of bpf_scc_visit structs sharing tht same bpf_scc_callchain->scc + * but having different bpf_scc_callchain->callsites. 
+ */ +struct bpf_scc_info { + u32 num_visits; + struct bpf_scc_visit visits[]; +}; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -819,6 +854,9 @@ struct bpf_verifier_env { char tmp_str_buf[TMP_STR_BUF_LEN]; struct bpf_insn insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; + /* array of pointers to bpf_scc_info indexed by SCC id */ + struct bpf_scc_info **scc_info; + u32 scc_cnt; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 90b3d1a0bd86..aa1bb4be7b8b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1700,6 +1700,9 @@ static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verif return NULL; } +static bool incomplete_read_marks(struct bpf_verifier_env *env, + struct bpf_verifier_state *st); + /* A state can be freed if it is no longer referenced: * - is in the env->free_list; * - has no children states; @@ -1710,20 +1713,14 @@ static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verif static void maybe_free_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state_list *sl) { - struct bpf_verifier_state_list *loop_entry_sl; - - while (sl && sl->in_free_list && - sl->state.branches == 0 && - sl->state.used_as_loop_entry == 0) { - loop_entry_sl = state_loop_entry_as_list(&sl->state); - if (loop_entry_sl) - loop_entry_sl->state.used_as_loop_entry--; - list_del(&sl->node); - free_verifier_state(&sl->state, false); - kfree(sl); - env->free_list_size--; - sl = loop_entry_sl; - } + if (!sl->in_free_list + || sl->state.branches != 0 + || incomplete_read_marks(env, &sl->state)) + return; + list_del(&sl->node); + free_verifier_state(&sl->state, false); + kfree(sl); + env->free_list_size--; } /* copy verifier state from src to dst growing dst stack space @@ -1771,6 +1768,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; dst_state->loop_entry = src->loop_entry; + dst_state->equal_state = src->equal_state; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1972,22 +1970,218 @@ static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame) : st->frame[frame + 1]->callsite; } -static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +/* For state @st look for a topmost frame with frame_insn_idx() in some SCC, + * if such frame exists form a corresponding @callchain as an array of + * call sites leading to this frame and SCC id. + * E.g.: + * + * void foo() { A: loop {... SCC#1 ...}; } + * void bar() { B: loop { C: foo(); ... SCC#2 ... } + * D: loop { E: foo(); ... SCC#3 ... } } + * void main() { F: bar(); } + * + * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending + * on @st frame call sites being (F,C,A) or (F,E,A). 
+ */ +static bool compute_scc_callchain(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + struct bpf_scc_callchain *callchain) +{ + u32 i, scc, insn_idx; + + memset(callchain, 0, sizeof(*callchain)); + for (i = 0; i <= st->curframe; i++) { + insn_idx = frame_insn_idx(st, i); + scc = env->insn_aux_data[insn_idx].scc; + if (scc) { + callchain->scc = scc; + break; + } else if (i < st->curframe) { + callchain->callsites[i] = insn_idx; + } else { + return false; + } + } + return true; +} + +/* Check if bpf_scc_visit instance for @callchain exists. */ +static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env, + struct bpf_scc_callchain *callchain) +{ + struct bpf_scc_info *info = env->scc_info[callchain->scc]; + struct bpf_scc_visit *visits = info->visits; + u32 i; + + if (!info) + return NULL; + for (i = 0; i < info->num_visits; i++) + if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0) + return &visits[i]; + return NULL; +} + +/* Allocate a new bpf_scc_visit instance corresponding to @callchain. + * Allocated instances are alive for a duration of the do_check_common() + * call and are freed by free_states(). + */ +static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env, + struct bpf_scc_callchain *callchain) +{ + struct bpf_scc_visit *visit; + struct bpf_scc_info *info; + u32 scc, num_visits; + u64 new_sz; + + scc = callchain->scc; + info = env->scc_info[scc]; + num_visits = info ? info->num_visits : 0; + new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1); + info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL); + if (!info) + return NULL; + env->scc_info[scc] = info; + info->num_visits = num_visits + 1; + visit = &info->visits[num_visits]; + memset(visit, 0, sizeof(*visit)); + memcpy(&visit->callchain, callchain, sizeof(*callchain)); + return visit; +} + +/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */ +static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain) +{ + char *buf = env->tmp_str_buf; + int i, delta = 0; + + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "("); + for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) { + if (!callchain->callsites[i]) + break; + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,", + callchain->callsites[i]); + } + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc); + return env->tmp_str_buf; +} + +/* If callchain for @st exists (@st is in some SCC), ensure that + * bpf_scc_visit instance for this callchain exists. + * If instance does not exist or is empty, assign visit->entry_state to @st. + */ +static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain callchain; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, &callchain)) + return 0; + visit = scc_visit_lookup(env, &callchain); + visit = visit ?: scc_visit_alloc(env, &callchain); + if (!visit) + return -ENOMEM; + if (!visit->entry_state) { + visit->entry_state = st; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC enter %s\n", format_callchain(env, &callchain)); + } + return 0; +} + +static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit); + +/* If callchain for @st exists (@st is in some SCC), make it empty: + * - set visit->entry_state to NULL; + * - flush accumulated backedges. 
+ */ +static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain callchain; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, &callchain)) + return 0; + visit = scc_visit_lookup(env, &callchain); + if (!visit) { + verifier_bug(env, "scc exit: no visit info for call chain %s", + format_callchain(env, &callchain)); + return -EFAULT; + } + if (visit->entry_state != st) + return 0; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC exit %s\n", format_callchain(env, &callchain)); + visit->entry_state = NULL; + return propagate_backedges(env, visit); +} + +/* Lookup an bpf_scc_visit instance corresponding to @st callchain + * and add @backedge to visit->backedges. @st callchain must exist. + */ +static int add_scc_backedge(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + struct bpf_scc_backedge *backedge) +{ + struct bpf_scc_callchain callchain; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, &callchain)) { + verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d", + st->insn_idx); + return -EFAULT; + } + visit = scc_visit_lookup(env, &callchain); + if (!visit) { + verifier_bug(env, "add backedge: no visit info for call chain %s", + format_callchain(env, &callchain)); + return -EFAULT; + } + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC backedge %s\n", format_callchain(env, &callchain)); + backedge->next = visit->backedges; + visit->backedges = backedge; + return 0; +} + +/* bpf_reg_state->live marks for registers in a state @st are incomplete, + * if state @st is in some SCC and not all execution paths starting at this + * SCC are fully explored. + */ +static bool incomplete_read_marks(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain callchain; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, &callchain)) + return false; + visit = scc_visit_lookup(env, &callchain); + if (!visit) + return false; + return !!visit->backedges; +} + +static void free_backedges(struct bpf_scc_visit *visit) +{ + struct bpf_scc_backedge *backedge, *next; + + for (backedge = visit->backedges; backedge; backedge = next) { + free_verifier_state(&backedge->state, false); + next = backedge->next; + kvfree(backedge); + } + visit->backedges = NULL; +} + +static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_verifier_state_list *sl = NULL, *parent_sl; struct bpf_verifier_state *parent; + int err; while (st) { u32 br = --st->branches; - /* br == 0 signals that DFS exploration for 'st' is finished, - * thus it is necessary to update parent's loop entry if it - * turned out that st is a part of some loop. - * This is a part of 'case A' in get_loop_entry() comment. 
- */ - if (br == 0 && st->parent && st->loop_entry) - update_loop_entry(env, st->parent, st->loop_entry); - /* WARN_ON(br > 1) technically makes sense here, * but see comment in push_stack(), hence: */ @@ -1996,6 +2190,9 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi br); if (br) break; + err = maybe_exit_scc(env, st); + if (err) + return err; parent = st->parent; parent_sl = state_parent_as_list(st); if (sl) @@ -2003,6 +2200,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi st = parent; sl = parent_sl; } + return 0; } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, @@ -18344,7 +18542,6 @@ static void clean_verifier_state(struct bpf_verifier_env *env, static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { - struct bpf_verifier_state *loop_entry; struct bpf_verifier_state_list *sl; struct list_head *pos, *head; @@ -18353,15 +18550,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, sl = container_of(pos, struct bpf_verifier_state_list, node); if (sl->state.branches) continue; - loop_entry = get_loop_entry(env, &sl->state); - if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches) - continue; if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) continue; if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) /* all regs in this state in all frames were already marked */ continue; + if (incomplete_read_marks(env, &sl->state)) + continue; clean_verifier_state(env, &sl->state); } } @@ -18963,6 +19159,46 @@ static int propagate_precision(struct bpf_verifier_env *env, return 0; } +#define MAX_BACKEDGE_ITERS 64 + +/* Propagate read and precision marks from visit->backedges[*].state->equal_state + * to corresponding parent states of visit->backedges[*].state until fixed point is reached, + * then free visit->backedges. + * After execution of this function incomplete_read_marks() will return false + * for all states corresponding to @visit->callchain. 
+ */ +static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit) +{ + struct bpf_scc_backedge *backedge; + struct bpf_verifier_state *st; + bool changed; + int i, err; + + i = 0; + do { + if (i++ > MAX_BACKEDGE_ITERS) { + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "%s: too many iterations\n", __func__); + for (backedge = visit->backedges; backedge; backedge = backedge->next) + mark_all_scalars_precise(env, &backedge->state); + break; + } + changed = false; + for (backedge = visit->backedges; backedge; backedge = backedge->next) { + st = &backedge->state; + err = propagate_liveness(env, st->equal_state, st, &changed); + if (err) + return err; + err = propagate_precision(env, st->equal_state, st, &changed); + if (err) + return err; + } + } while (changed); + + free_backedges(visit); + return 0; +} + static bool states_maybe_looping(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { @@ -19072,9 +19308,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; + struct bpf_verifier_state *cur = env->cur_state, *new; + bool force_new_state, add_new_state, loop; int i, j, n, err, states_cnt = 0; - bool force_new_state, add_new_state, force_exact; struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || @@ -19096,6 +19332,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) clean_live_states(env, insn_idx, cur); + loop = false; head = explored_state(env, insn_idx); list_for_each_safe(pos, tmp, head) { sl = container_of(pos, struct bpf_verifier_state_list, node); @@ -19175,7 +19412,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) spi = __get_spi(iter_reg->off + iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { - update_loop_entry(env, cur, &sl->state); + loop = true; goto hit; } } @@ -19184,7 +19421,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if (is_may_goto_insn_at(env, insn_idx)) { if (sl->state.may_goto_depth != cur->may_goto_depth && states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - update_loop_entry(env, cur, &sl->state); + loop = true; goto hit; } } @@ -19226,38 +19463,9 @@ skip_inf_loop_check: add_new_state = false; goto miss; } - /* If sl->state is a part of a loop and this loop's entry is a part of - * current verification path then states have to be compared exactly. - * 'force_exact' is needed to catch the following case: - * - * initial Here state 'succ' was processed first, - * | it was eventually tracked to produce a - * V state identical to 'hdr'. - * .---------> hdr All branches from 'succ' had been explored - * | | and thus 'succ' has its .branches == 0. - * | V - * | .------... Suppose states 'cur' and 'succ' correspond - * | | | to the same instruction + callsites. - * | V V In such case it is necessary to check - * | ... ... if 'succ' and 'cur' are states_equal(). - * | | | If 'succ' and 'cur' are a part of the - * | V V same loop exact flag has to be set. - * | succ <- cur To check if that is the case, verify - * | | if loop entry of 'succ' is in current - * | V DFS path. - * | ... - * | | - * '----' - * - * Additional details are in the comment before get_loop_entry(). 
- */ - loop_entry = get_loop_entry(env, &sl->state); - if (IS_ERR(loop_entry)) - return PTR_ERR(loop_entry); - force_exact = loop_entry && loop_entry->branches > 0; - if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) { - if (force_exact) - update_loop_entry(env, cur, loop_entry); + /* See comments for mark_all_regs_read_and_precise() */ + loop = incomplete_read_marks(env, &sl->state); + if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: sl->hit_cnt++; /* reached equivalent register/stack state, @@ -19282,6 +19490,94 @@ hit: err = err ? : propagate_precision(env, &sl->state, cur, NULL); if (err) return err; + /* When processing iterator based loops above propagate_liveness and + * propagate_precision calls are not sufficient to transfer all relevant + * read and precision marks. E.g. consider the following case: + * + * .-> A --. Assume the states are visited in the order A, B, C. + * | | | Assume that state B reaches a state equivalent to state A. + * | v v At this point, state C is not processed yet, so state A + * '-- B C has not received any read or precision marks from C. + * Thus, marks propagated from A to B are incomplete. + * + * The verifier mitigates this by performing the following steps: + * + * - Prior to the main verification pass, strongly connected components + * (SCCs) are computed over the program's control flow graph, + * intraprocedurally. + * + * - During the main verification pass, `maybe_enter_scc()` checks + * whether the current verifier state is entering an SCC. If so, an + * instance of a `bpf_scc_visit` object is created, and the state + * entering the SCC is recorded as the entry state. + * + * - This instance is associated not with the SCC itself, but with a + * `bpf_scc_callchain`: a tuple consisting of the call sites leading to + * the SCC and the SCC id. See `compute_scc_callchain()`. + * + * - When a verification path encounters a `states_equal(..., + * RANGE_WITHIN)` condition, there exists a call chain describing the + * current state and a corresponding `bpf_scc_visit` instance. A copy + * of the current state is created and added to + * `bpf_scc_visit->backedges`. + * + * - When a verification path terminates, `maybe_exit_scc()` is called + * from `update_branch_counts()`. For states with `branches == 0`, it + * checks whether the state is the entry state of any `bpf_scc_visit` + * instance. If it is, this indicates that all paths originating from + * this SCC visit have been explored. `propagate_backedges()` is then + * called, which propagates read and precision marks through the + * backedges until a fixed point is reached. + * (In the earlier example, this would propagate marks from A to B, + * from C to A, and then again from A to B.) + * + * A note on callchains + * -------------------- + * + * Consider the following example: + * + * void foo() { loop { ... SCC#1 ... } } + * void main() { + * A: foo(); + * B: ... + * C: foo(); + * } + * + * Here, there are two distinct callchains leading to SCC#1: + * - (A, SCC#1) + * - (C, SCC#1) + * + * Each callchain identifies a separate `bpf_scc_visit` instance that + * accumulates backedge states. The `propagate_{liveness,precision}()` + * functions traverse the parent state of each backedge state, which + * means these parent states must remain valid (i.e., not freed) while + * the corresponding `bpf_scc_visit` instance exists. 
+ * + * Associating `bpf_scc_visit` instances directly with SCCs instead of + * callchains would break this invariant: + * - States explored during `C: foo()` would contribute backedges to + * SCC#1, but SCC#1 would only be exited once the exploration of + * `A: foo()` completes. + * - By that time, the states explored between `A: foo()` and `C: foo()` + * (i.e., `B: ...`) may have already been freed, causing the parent + * links for states from `C: foo()` to become invalid. + */ + if (loop) { + struct bpf_scc_backedge *backedge; + + backedge = kzalloc(sizeof(*backedge), GFP_KERNEL); + if (!backedge) + return -ENOMEM; + err = copy_verifier_state(&backedge->state, cur); + backedge->state.equal_state = &sl->state; + backedge->state.insn_idx = insn_idx; + err = err ?: add_scc_backedge(env, &sl->state, backedge); + if (err) { + free_verifier_state(&backedge->state, false); + kvfree(backedge); + return err; + } + } return 1; } miss: @@ -19357,6 +19653,12 @@ miss: new->insn_idx = insn_idx; WARN_ONCE(new->branches != 1, "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + err = maybe_enter_scc(env, new); + if (err) { + free_verifier_state(new, false); + kvfree(new_sl); + return err; + } cur->parent = new; cur->first_insn_idx = insn_idx; @@ -19811,7 +20113,9 @@ static int do_check(struct bpf_verifier_env *env) WARN_ON_ONCE(env->insn_idx != prev_insn_idx + 1); process_bpf_exit: mark_verifier_state_scratched(env); - update_branch_counts(env, env->cur_state); + err = update_branch_counts(env, env->cur_state); + if (err) + return err; err = pop_stack(env, &prev_insn_idx, &env->insn_idx, pop_log); if (err < 0) { @@ -19819,9 +20123,6 @@ process_bpf_exit: return err; break; } else { - if (verifier_bug_if(env->cur_state->loop_entry, env, - "broken loop detection")) - return -EFAULT; do_print_state = true; continue; } @@ -22787,7 +23088,8 @@ static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl; struct list_head *head, *pos, *tmp; - int i; + struct bpf_scc_info *info; + int i, j; list_for_each_safe(pos, tmp, &env->free_list) { sl = container_of(pos, struct bpf_verifier_state_list, node); @@ -22796,6 +23098,14 @@ static void free_states(struct bpf_verifier_env *env) } INIT_LIST_HEAD(&env->free_list); + for (i = 0; i < env->scc_cnt; ++i) { + info = env->scc_info[i]; + for (j = 0; j < info->num_visits; j++) + free_backedges(&info->visits[j]); + kvfree(info); + env->scc_info[i] = NULL; + } + if (!env->explored_states) return; @@ -24228,6 +24538,11 @@ dfs_continue: dfs_sz--; } } + env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL); + if (!env->scc_info) { + err = -ENOMEM; + goto exit; + } exit: kvfree(stack); kvfree(pre); @@ -24503,6 +24818,7 @@ err_unlock: vfree(env->insn_aux_data); err_free_env: kvfree(env->cfg.insn_postorder); + kvfree(env->scc_info); kvfree(env); return ret; } -- cgit v1.2.3 From 0e0da5f901f582b97bfeefbf1f36a27e9d427ff4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:34 -0700 Subject: bpf: remove {update,get}_loop_entry functions The previous patch switched read and precision tracking for iterator-based loops from state-graph-based loop tracking to control-flow-graph-based loop tracking. This patch removes the now-unused `update_loop_entry()` and `get_loop_entry()` functions, which were part of the state-graph-based logic. 
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 15 ---- kernel/bpf/verifier.c | 165 +------------------------------------------ 2 files changed, 1 insertion(+), 179 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b0273f759589..1ae588679e20 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -449,16 +449,6 @@ struct bpf_verifier_state { /* first and last insn idx of this verifier state */ u32 first_insn_idx; u32 last_insn_idx; - /* If this state is a part of states loop this field points to some - * parent of this state such that: - * - it is also a member of the same states loop; - * - DFS states traversal starting from initial state visits loop_entry - * state before this state. - * Used to compute topmost loop entry for state loops. - * State loops might appear because of open coded iterators logic. - * See get_loop_entry() for more information. - */ - struct bpf_verifier_state *loop_entry; /* if this state is a backedge state then equal_state * records cached state to which this state is equal. */ @@ -473,11 +463,6 @@ struct bpf_verifier_state { u32 dfs_depth; u32 callback_unroll_depth; u32 may_goto_depth; - /* If this state was ever pointed-to by other state's loop_entry field - * this flag would be set to true. Used to avoid freeing such states - * while they are still in use. - */ - u32 used_as_loop_entry; }; #define bpf_get_spilled_reg(slot, frame, mask) \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aa1bb4be7b8b..48847f8da5b1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1682,7 +1682,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, kfree(state); } -/* struct bpf_verifier_state->{parent,loop_entry} refer to states +/* struct bpf_verifier_state->parent refers to states * that are in either of env->{expored_states,free_list}. * In both cases the state is contained in struct bpf_verifier_state_list. */ @@ -1693,22 +1693,12 @@ static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_ return NULL; } -static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st) -{ - if (st->loop_entry) - return container_of(st->loop_entry, struct bpf_verifier_state_list, state); - return NULL; -} - static bool incomplete_read_marks(struct bpf_verifier_env *env, struct bpf_verifier_state *st); /* A state can be freed if it is no longer referenced: * - is in the env->free_list; * - has no children states; - * - is not used as loop_entry. - * - * Freeing a state can make it's loop_entry free-able. 
*/ static void maybe_free_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state_list *sl) @@ -1765,9 +1755,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->last_insn_idx = src->last_insn_idx; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; - dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; - dst_state->loop_entry = src->loop_entry; dst_state->equal_state = src->equal_state; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; @@ -1811,157 +1799,6 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta return true; } -/* Open coded iterators allow back-edges in the state graph in order to - * check unbounded loops that iterators. - * - * In is_state_visited() it is necessary to know if explored states are - * part of some loops in order to decide whether non-exact states - * comparison could be used: - * - non-exact states comparison establishes sub-state relation and uses - * read and precision marks to do so, these marks are propagated from - * children states and thus are not guaranteed to be final in a loop; - * - exact states comparison just checks if current and explored states - * are identical (and thus form a back-edge). - * - * Paper "A New Algorithm for Identifying Loops in Decompilation" - * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient - * algorithm for loop structure detection and gives an overview of - * relevant terminology. It also has helpful illustrations. - * - * [1] https://api.semanticscholar.org/CorpusID:15784067 - * - * We use a similar algorithm but because loop nested structure is - * irrelevant for verifier ours is significantly simpler and resembles - * strongly connected components algorithm from Sedgewick's textbook. - * - * Define topmost loop entry as a first node of the loop traversed in a - * depth first search starting from initial state. The goal of the loop - * tracking algorithm is to associate topmost loop entries with states - * derived from these entries. - * - * For each step in the DFS states traversal algorithm needs to identify - * the following situations: - * - * initial initial initial - * | | | - * V V V - * ... ... .---------> hdr - * | | | | - * V V | V - * cur .-> succ | .------... - * | | | | | | - * V | V | V V - * succ '-- cur | ... ... - * | | | - * | V V - * | succ <- cur - * | | - * | V - * | ... - * | | - * '----' - * - * (A) successor state of cur (B) successor state of cur or it's entry - * not yet traversed are in current DFS path, thus cur and succ - * are members of the same outermost loop - * - * initial initial - * | | - * V V - * ... ... - * | | - * V V - * .------... .------... - * | | | | - * V V V V - * .-> hdr ... ... ... - * | | | | | - * | V V V V - * | succ <- cur succ <- cur - * | | | - * | V V - * | ... ... - * | | | - * '----' exit - * - * (C) successor state of cur is a part of some loop but this loop - * does not include cur or successor state is not in a loop at all. 
- * - * Algorithm could be described as the following python code: - * - * traversed = set() # Set of traversed nodes - * entries = {} # Mapping from node to loop entry - * depths = {} # Depth level assigned to graph node - * path = set() # Current DFS path - * - * # Find outermost loop entry known for n - * def get_loop_entry(n): - * h = entries.get(n, None) - * while h in entries: - * h = entries[h] - * return h - * - * # Update n's loop entry if h comes before n in current DFS path. - * def update_loop_entry(n, h): - * if h in path and depths[entries.get(n, n)] < depths[n]: - * entries[n] = h1 - * - * def dfs(n, depth): - * traversed.add(n) - * path.add(n) - * depths[n] = depth - * for succ in G.successors(n): - * if succ not in traversed: - * # Case A: explore succ and update cur's loop entry - * # only if succ's entry is in current DFS path. - * dfs(succ, depth + 1) - * h = entries.get(succ, None) - * update_loop_entry(n, h) - * else: - * # Case B or C depending on `h1 in path` check in update_loop_entry(). - * update_loop_entry(n, succ) - * path.remove(n) - * - * To adapt this algorithm for use with verifier: - * - use st->branch == 0 as a signal that DFS of succ had been finished - * and cur's loop entry has to be updated (case A), handle this in - * update_branch_counts(); - * - use st->branch > 0 as a signal that st is in the current DFS path; - * - handle cases B and C in is_state_visited(). - */ -static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env, - struct bpf_verifier_state *st) -{ - struct bpf_verifier_state *topmost = st->loop_entry; - u32 steps = 0; - - while (topmost && topmost->loop_entry) { - if (verifier_bug_if(steps++ > st->dfs_depth, env, "infinite loop")) - return ERR_PTR(-EFAULT); - topmost = topmost->loop_entry; - } - return topmost; -} - -static void update_loop_entry(struct bpf_verifier_env *env, - struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) -{ - /* The hdr->branches check decides between cases B and C in - * comment for get_loop_entry(). If hdr->branches == 0 then - * head's topmost loop entry is not in current DFS path, - * hence 'cur' and 'hdr' are not in the same loop and there is - * no need to update cur->loop_entry. - */ - if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) { - if (cur->loop_entry) { - cur->loop_entry->used_as_loop_entry--; - maybe_free_verifier_state(env, state_loop_entry_as_list(cur)); - } - cur->loop_entry = hdr; - hdr->used_as_loop_entry++; - } -} - /* Return IP for a given frame in a call stack */ static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame) { -- cgit v1.2.3 From 0f54ff54700315caa8ed3bea36fa0ff3ebc53f56 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:35 -0700 Subject: bpf: include backedges in peak_states stat Count states accumulated in bpf_scc_visit->backedges in env->peak_states. 
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-10-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/verifier.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1ae588679e20..7e459e839f8b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -733,6 +733,7 @@ struct bpf_scc_visit { */ struct bpf_verifier_state *entry_state; struct bpf_scc_backedge *backedges; /* list of backedges */ + u32 num_backedges; }; /* An array of bpf_scc_visit structs sharing tht same bpf_scc_callchain->scc @@ -822,6 +823,7 @@ struct bpf_verifier_env { u32 longest_mark_read_walk; u32 free_list_size; u32 explored_states_size; + u32 num_backedges; bpfptr_t fd_array; /* bit mask to keep track of whether a register has been accessed diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48847f8da5b1..1d3277bf935e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1648,7 +1648,7 @@ static void update_peak_states(struct bpf_verifier_env *env) { u32 cur_states; - cur_states = env->explored_states_size + env->free_list_size; + cur_states = env->explored_states_size + env->free_list_size + env->num_backedges; env->peak_states = max(env->peak_states, cur_states); } @@ -1949,6 +1949,9 @@ static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_stat if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "SCC exit %s\n", format_callchain(env, &callchain)); visit->entry_state = NULL; + env->num_backedges -= visit->num_backedges; + visit->num_backedges = 0; + update_peak_states(env); return propagate_backedges(env, visit); } @@ -1977,6 +1980,9 @@ static int add_scc_backedge(struct bpf_verifier_env *env, verbose(env, "SCC backedge %s\n", format_callchain(env, &callchain)); backedge->next = visit->backedges; visit->backedges = backedge; + visit->num_backedges++; + env->num_backedges++; + update_peak_states(env); return 0; } -- cgit v1.2.3 From aced132599b3c8884c050218d4c48eef203678f6 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 25 Jun 2025 09:40:24 -0700 Subject: bpf: Add range tracking for BPF_NEG Add range tracking for instruction BPF_NEG. Without this logic, a trivial program like the following will fail volatile bool found_value_b; SEC("lsm.s/socket_connect") int BPF_PROG(test_socket_connect) { if (!found_value_b) return -1; return 0; } with verifier log: "At program exit the register R0 has smin=0 smax=4294967295 should have been in [-4095, 0]". This is because range information is lost in BPF_NEG: 0: R1=ctx() R10=fp0 ; if (!found_value_b) @ xxxx.c:24 0: (18) r1 = 0xffa00000011e7048 ; R1_w=map_value(...) 2: (71) r0 = *(u8 *)(r1 +0) ; R0_w=scalar(smin32=0,smax=255) 3: (a4) w0 ^= 1 ; R0_w=scalar(smin32=0,smax=255) 4: (84) w0 = -w0 ; R0_w=scalar(range info lost) Note that, the log above is manually modified to highlight relevant bits. Fix this by maintaining proper range information with BPF_NEG, so that the verifier will know: 4: (84) w0 = -w0 ; R0_w=scalar(smin32=-255,smax=0) Also updated selftests based on the expected behavior. 
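
The interval arithmetic behind this is easy to illustrate outside the verifier. The sketch below is a user-space approximation, not the kernel helpers (struct srange and srange_neg are invented names): negating a signed range [smin, smax] as 0 - [smin, smax] flips and swaps the bounds, which is how [0, 255] becomes [-255, 0] in the log above. The kernel's subtraction helpers additionally have to handle the overflow case smin == S64_MIN, which this sketch ignores.

#include <stdint.h>
#include <stdio.h>

struct srange {
	int64_t smin;
	int64_t smax;
};

/* 0 - [smin, smax] = [-smax, -smin]; assumes smin > INT64_MIN */
static struct srange srange_neg(struct srange r)
{
	return (struct srange){ .smin = -r.smax, .smax = -r.smin };
}

int main(void)
{
	struct srange r = { 0, 255 };	/* e.g. a value loaded as a u8 */
	struct srange n = srange_neg(r);

	printf("[%lld, %lld] -> [%lld, %lld]\n",
	       (long long)r.smin, (long long)r.smax,
	       (long long)n.smin, (long long)n.smax);
	return 0;
}

With the range preserved, -w0 stays within [-4095, 0], so the exit-value check in the example program above is satisfied.
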
Signed-off-by: Song Liu Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250625164025.3310203-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 2 ++ kernel/bpf/tnum.c | 5 +++++ kernel/bpf/verifier.c | 17 ++++++++++++++++- .../bpf/progs/verifier_bounds_deduction.c | 11 +++++++---- .../selftests/bpf/progs/verifier_value_ptr_arith.c | 22 ++++++++++++++++------ 5 files changed, 46 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 3c13240077b8..57ed3035cc30 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -40,6 +40,8 @@ struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness); struct tnum tnum_add(struct tnum a, struct tnum b); /* Subtract two tnums, return @a - @b */ struct tnum tnum_sub(struct tnum a, struct tnum b); +/* Neg of a tnum, return 0 - @a */ +struct tnum tnum_neg(struct tnum a); /* Bitwise-AND, return @a & @b */ struct tnum tnum_and(struct tnum a, struct tnum b); /* Bitwise-OR, return @a | @b */ diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 9dbc31b25e3d..fa353c5d550f 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -83,6 +83,11 @@ struct tnum tnum_sub(struct tnum a, struct tnum b) return TNUM(dv & ~mu, mu); } +struct tnum tnum_neg(struct tnum a) +{ + return tnum_sub(TNUM(0, 0), a); +} + struct tnum tnum_and(struct tnum a, struct tnum b) { u64 alpha, beta, v; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f403524bd215..2ff22ef42348 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15182,6 +15182,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, switch (BPF_OP(insn->code)) { case BPF_ADD: case BPF_SUB: + case BPF_NEG: case BPF_AND: case BPF_XOR: case BPF_OR: @@ -15250,6 +15251,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; + case BPF_NEG: + env->fake_reg[0] = *dst_reg; + __mark_reg_known(dst_reg, 0); + scalar32_min_max_sub(dst_reg, &env->fake_reg[0]); + scalar_min_max_sub(dst_reg, &env->fake_reg[0]); + dst_reg->var_off = tnum_neg(env->fake_reg[0].var_off); + break; case BPF_MUL: dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); scalar32_min_max_mul(dst_reg, &src_reg); @@ -15473,7 +15481,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - err = check_reg_arg(env, insn->dst_reg, DST_OP); + if (opcode == BPF_NEG) { + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + err = err ?: adjust_scalar_min_max_vals(env, insn, + ®s[insn->dst_reg], + regs[insn->dst_reg]); + } else { + err = check_reg_arg(env, insn->dst_reg, DST_OP); + } if (err) return err; diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c b/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c index c506afbdd936..260a6df264e3 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c @@ -159,13 +159,16 @@ __failure_unpriv __naked void deducing_bounds_from_const_10(void) { asm volatile (" \ + r6 = r1; \ r0 = 0; \ if r0 s<= 0 goto l0_%=; \ -l0_%=: /* Marks reg as unknown. */ \ - r0 = -r0; \ - r0 -= r1; \ +l0_%=: /* Marks r0 as unknown. 
*/ \ + call %[bpf_get_prandom_u32]; \ + r0 -= r6; \ exit; \ -" ::: __clobber_all); +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); } char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c index fcea9819e359..af7938ce56cb 100644 --- a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c +++ b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c @@ -231,6 +231,10 @@ __retval(1) __naked void ptr_unknown_vs_unknown_lt(void) { asm volatile (" \ + r8 = r1; \ + call %[bpf_get_prandom_u32]; \ + r9 = r0; \ + r1 = r8; \ r0 = *(u32*)(r1 + %[__sk_buff_len]); \ r1 = 0; \ *(u64*)(r10 - 8) = r1; \ @@ -245,11 +249,11 @@ l1_%=: call %[bpf_map_lookup_elem]; \ r4 = *(u8*)(r0 + 0); \ if r4 == 1 goto l3_%=; \ r1 = 6; \ - r1 = -r1; \ + r1 = r9; \ r1 &= 0x3; \ goto l4_%=; \ l3_%=: r1 = 6; \ - r1 = -r1; \ + r1 = r9; \ r1 &= 0x7; \ l4_%=: r1 += r0; \ r0 = *(u8*)(r1 + 0); \ @@ -259,7 +263,8 @@ l2_%=: r0 = 1; \ : __imm(bpf_map_lookup_elem), __imm_addr(map_array_48b), __imm_addr(map_hash_16b), - __imm_const(__sk_buff_len, offsetof(struct __sk_buff, len)) + __imm_const(__sk_buff_len, offsetof(struct __sk_buff, len)), + __imm(bpf_get_prandom_u32) : __clobber_all); } @@ -271,6 +276,10 @@ __retval(1) __naked void ptr_unknown_vs_unknown_gt(void) { asm volatile (" \ + r8 = r1; \ + call %[bpf_get_prandom_u32]; \ + r9 = r0; \ + r1 = r8; \ r0 = *(u32*)(r1 + %[__sk_buff_len]); \ r1 = 0; \ *(u64*)(r10 - 8) = r1; \ @@ -285,11 +294,11 @@ l1_%=: call %[bpf_map_lookup_elem]; \ r4 = *(u8*)(r0 + 0); \ if r4 == 1 goto l3_%=; \ r1 = 6; \ - r1 = -r1; \ + r1 = r9; \ r1 &= 0x7; \ goto l4_%=; \ l3_%=: r1 = 6; \ - r1 = -r1; \ + r1 = r9; \ r1 &= 0x3; \ l4_%=: r1 += r0; \ r0 = *(u8*)(r1 + 0); \ @@ -299,7 +308,8 @@ l2_%=: r0 = 1; \ : __imm(bpf_map_lookup_elem), __imm_addr(map_array_48b), __imm_addr(map_hash_16b), - __imm_const(__sk_buff_len, offsetof(struct __sk_buff, len)) + __imm_const(__sk_buff_len, offsetof(struct __sk_buff, len)), + __imm(bpf_get_prandom_u32) : __clobber_all); } -- cgit v1.2.3 From d83caf7c8dad96051267c18786b7bc446b537f3c Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 25 Jun 2025 15:16:21 +0000 Subject: bpf: add btf_type_is_i{32,64} helpers There are places in BPF code which check if a BTF type is an integer of particular size. This code can be made simpler by using helpers. Add new btf_type_is_i{32,64} helpers, and simplify code in a few files. (Suggested by Eduard for a patch which copy-pasted such a check [1].) 
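As a usage sketch (an illustrative kernel-style snippet mirroring the simplifications in the diff below, not an additional hunk), a fixed-size integer key check collapses to a single call:

/* Illustrative only: a map that requires a plain u32 BTF key. */
static int example_check_u32_key(const struct btf_type *key_type)
{
	/* Replaces the open-coded BTF_KIND_INT / BTF_INT_BITS / BTF_INT_OFFSET checks. */
	if (!btf_type_is_i32(key_type))
		return -EINVAL;
	return 0;
}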
v1 -> v2: * export less generic helpers (Eduard) * make subject less generic than in [v1] (Eduard) [1] https://lore.kernel.org/bpf/7edb47e73baa46705119a23c6bf4af26517a640f.camel@gmail.com/ [v1] https://lore.kernel.org/bpf/20250624193655.733050-1-a.s.protopopov@gmail.com/ Suggested-by: Eduard Zingerman Signed-off-by: Anton Protopopov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250625151621.1000584-1-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 2 ++ kernel/bpf/arraymap.c | 11 +++-------- kernel/bpf/bpf_local_storage.c | 8 +------- kernel/bpf/btf.c | 41 ++++++++++++++++++++++++++--------------- kernel/bpf/local_storage.c | 9 +-------- 5 files changed, 33 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index b2983706292f..a40beb9cf160 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -221,6 +221,8 @@ bool btf_is_vmlinux(const struct btf *btf); struct module *btf_try_get_module(const struct btf *btf); u32 btf_nr_types(const struct btf *btf); struct btf *btf_base_btf(const struct btf *btf); +bool btf_type_is_i32(const struct btf_type *t); +bool btf_type_is_i64(const struct btf_type *t); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index eb28c0f219ee..3d080916faf9 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -530,8 +530,6 @@ static int array_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - u32 int_data; - /* One exception for keyless BTF: .bss/.data/.rodata map */ if (btf_type_is_void(key_type)) { if (map->map_type != BPF_MAP_TYPE_ARRAY || @@ -544,14 +542,11 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes sure + /* + * Bpf array can only take a u32 key. This check makes sure * that the btf matches the attr used during map_create. */ - if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index fa56c30833ff..b931fbceb54d 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -722,13 +722,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - u32 int_data; - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 682acb1ed234..05fd64a371af 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -858,26 +858,37 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) EXPORT_SYMBOL_GPL(btf_type_by_id); /* - * Regular int is not a bit field and it must be either - * u8/u16/u32/u64 or __int128. + * Check that the type @t is a regular int. This means that @t is not + * a bit field and it has the same size as either of u8/u16/u32/u64 + * or __int128. If @expected_size is not zero, then size of @t should + * be the same. 
A caller should already have checked that the type @t + * is an integer. */ +static bool __btf_type_int_is_regular(const struct btf_type *t, size_t expected_size) +{ + u32 int_data = btf_type_int(t); + u8 nr_bits = BTF_INT_BITS(int_data); + u8 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + + return BITS_PER_BYTE_MASKED(nr_bits) == 0 && + BTF_INT_OFFSET(int_data) == 0 && + (nr_bytes <= 16 && is_power_of_2(nr_bytes)) && + (expected_size == 0 || nr_bytes == expected_size); +} + static bool btf_type_int_is_regular(const struct btf_type *t) { - u8 nr_bits, nr_bytes; - u32 int_data; + return __btf_type_int_is_regular(t, 0); +} - int_data = btf_type_int(t); - nr_bits = BTF_INT_BITS(int_data); - nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); - if (BITS_PER_BYTE_MASKED(nr_bits) || - BTF_INT_OFFSET(int_data) || - (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) && - nr_bytes != (2 * sizeof(u64)))) { - return false; - } +bool btf_type_is_i32(const struct btf_type *t) +{ + return btf_type_is_int(t) && __btf_type_int_is_regular(t, 4); +} - return true; +bool btf_type_is_i64(const struct btf_type *t) +{ + return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8); } /* diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 3969eb0382af..632d51b05fe9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -394,17 +394,10 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) return -EINVAL; } else { - u32 int_data; - /* * Key is expected to be u64, which stores the cgroup_inode_id */ - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i64(key_type)) return -EINVAL; } -- cgit v1.2.3 From 3a95a561f2763e3854e207de3ea821e795a1f1e0 Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Thu, 26 Jun 2025 08:08:28 +0200 Subject: uaccess: Define pagefault lock guard Define a pagefault lock guard which allows to simplify functions that need to disable page faults. Signed-off-by: Viktor Malik Link: https://lore.kernel.org/r/8a01beb0b671923976f08297d81242bb2129881d.1750917800.git.vmalik@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/uaccess.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 7c06f4795670..1beb5b395d81 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -296,6 +296,8 @@ static inline bool pagefault_disabled(void) */ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) +DEFINE_LOCK_GUARD_0(pagefault, pagefault_disable(), pagefault_enable()) + #ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS /** -- cgit v1.2.3 From 803f0700a3bbf528c4c624a22f87d08178ca0fbe Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Wed, 2 Jul 2025 23:39:56 +0800 Subject: bpf: Show precise link_type for {uprobe,kprobe}_multi fdinfo Alexei suggested, 'link_type' can be more precise and differentiate for human in fdinfo. In fact BPF_LINK_TYPE_KPROBE_MULTI includes kretprobe_multi type, the same as BPF_LINK_TYPE_UPROBE_MULTI, so we can show it more concretely. link_type: kprobe_multi link_id: 1 prog_tag: d2b307e915f0dd37 ... link_type: kretprobe_multi link_id: 2 prog_tag: ab9ea0545870781d ... link_type: uprobe_multi link_id: 9 prog_tag: e729f789e34a8eca ... 
link_type: uretprobe_multi link_id: 10 prog_tag: 7db356c03e61a4d4 Co-developed-by: Jiri Olsa Signed-off-by: Jiri Olsa Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/20250702153958.639852-1-chen.dylane@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 9 ++++++++- kernel/trace/bpf_trace.c | 10 ++++------ 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5dd556e89cce..287c956cdbd2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1702,6 +1702,7 @@ struct bpf_link { * link's semantics is determined by target attach hook */ bool sleepable; + u32 flags; /* rcu is used before freeing, work can be used to schedule that * RCU-based freeing before that, so they never overlap */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 56500381c28a..f1d9ee9717a1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3228,7 +3228,14 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { - seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); + if (link->type == BPF_LINK_TYPE_KPROBE_MULTI) + seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? + "kretprobe_multi" : "kprobe_multi"); + else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) + seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ? + "uretprobe_multi" : "uprobe_multi"); + else + seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); } else { WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); seq_printf(m, "link_type:\t<%u>\n", type); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0a06ea6638fe..81d7a4e5ae15 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2466,7 +2466,6 @@ struct bpf_kprobe_multi_link { u32 cnt; u32 mods_cnt; struct module **mods; - u32 flags; }; struct bpf_kprobe_multi_run_ctx { @@ -2586,7 +2585,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; - info->kprobe_multi.flags = kmulti_link->flags; + info->kprobe_multi.flags = kmulti_link->link.flags; info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) @@ -2976,7 +2975,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->addrs = addrs; link->cookies = cookies; link->cnt = cnt; - link->flags = flags; + link->link.flags = flags; if (cookies) { /* @@ -3045,7 +3044,6 @@ struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; - u32 flags; struct bpf_uprobe *uprobes; struct task_struct *task; }; @@ -3109,7 +3107,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); info->uprobe_multi.count = umulti_link->cnt; - info->uprobe_multi.flags = umulti_link->flags; + info->uprobe_multi.flags = umulti_link->link.flags; info->uprobe_multi.pid = umulti_link->task ? 
task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; @@ -3369,7 +3367,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->uprobes = uprobes; link->path = path; link->task = task; - link->flags = flags; + link->link.flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, &bpf_uprobe_multi_link_lops, prog); -- cgit v1.2.3 From 0426729f46cd1f6354fad07267a21579186a5757 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:07 -0700 Subject: bpf: Refactor bprintf buffer support Refactor code to be able to get and put bprintf buffers and use bpf_printf_prepare independently. This will be used in the next patch to implement BPF streams support, particularly as a staging buffer for strings that need to be formatted and then allocated and pushed into a stream. Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 15 ++++++++++++++- kernel/bpf/helpers.c | 26 +++++++++++--------------- 2 files changed, 25 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 287c956cdbd2..a07451457d9e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3551,6 +3551,16 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #define MAX_BPRINTF_VARARGS 12 #define MAX_BPRINTF_BUF 1024 +/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary + * arguments representation. + */ +#define MAX_BPRINTF_BIN_ARGS 512 + +struct bpf_bprintf_buffers { + char bin_args[MAX_BPRINTF_BIN_ARGS]; + char buf[MAX_BPRINTF_BUF]; +}; + struct bpf_bprintf_data { u32 *bin_args; char *buf; @@ -3558,9 +3568,12 @@ struct bpf_bprintf_data { bool get_buf; }; -int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, +int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data); void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); +int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs); +void bpf_put_buffers(void); + #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5269381d6d3d..da66ce307e75 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -764,22 +764,13 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, return -EINVAL; } -/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary - * arguments representation. 
- */ -#define MAX_BPRINTF_BIN_ARGS 512 - /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 -struct bpf_bprintf_buffers { - char bin_args[MAX_BPRINTF_BIN_ARGS]; - char buf[MAX_BPRINTF_BUF]; -}; static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); -static int try_get_buffers(struct bpf_bprintf_buffers **bufs) +int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; @@ -795,16 +786,21 @@ static int try_get_buffers(struct bpf_bprintf_buffers **bufs) return 0; } -void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +void bpf_put_buffers(void) { - if (!data->bin_args && !data->buf) - return; if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } +void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +{ + if (!data->bin_args && !data->buf) + return; + bpf_put_buffers(); +} + /* * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * @@ -819,7 +815,7 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. */ -int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, +int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; @@ -835,7 +831,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, return -EINVAL; fmt_size = fmt_end - fmt; - if (get_buffers && try_get_buffers(&buffers)) + if (get_buffers && bpf_try_get_buffers(&buffers)) return -EBUSY; if (data->get_bin_args) { -- cgit v1.2.3 From 5ab154f1463a111e1dc8fd5d31eaa7a2a71fe2e6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:08 -0700 Subject: bpf: Introduce BPF standard streams Add support for a stream API to the kernel and expose related kfuncs to BPF programs. Two streams are exposed, BPF_STDOUT and BPF_STDERR. These can be used for printing messages that can be consumed from user space, thus it's similar in spirit to existing trace_pipe interface. The kernel will use the BPF_STDERR stream to notify the program of any errors encountered at runtime. BPF programs themselves may use both streams for writing debug messages. BPF library-like code may use BPF_STDERR to print warnings or errors on misuse at runtime. The implementation of a stream is as follows. Everytime a message is emitted from the kernel (directly, or through a BPF program), a record is allocated by bump allocating from per-cpu region backed by a page obtained using alloc_pages_nolock(). This ensures that we can allocate memory from any context. The eventual plan is to discard this scheme in favor of Alexei's kmalloc_nolock() [0]. This record is then locklessly inserted into a list (llist_add()) so that the printing side doesn't require holding any locks, and works in any context. Each stream has a maximum capacity of 4MB of text, and each printed message is accounted against this limit. Messages from a program are emitted using the bpf_stream_vprintk kfunc, which takes a stream_id argument in addition to working otherwise similar to bpf_trace_vprintk. 
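For a sense of the program-side API, a rough sketch of a direct kfunc call follows. The extern declaration is an assumption that mirrors the kernel-side signature added below; selftest wrappers may differ, and the trailing aux__prog pointer is passed as NULL here on the assumption that the verifier substitutes the program's aux pointer for it.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* Assumed declaration, mirroring the kernel-side kfunc signature. */
extern int bpf_stream_vprintk(int stream_id, const char *fmt__str,
			      const void *args, __u32 len__sz,
			      void *aux__prog) __weak __ksym;

SEC("syscall")
int emit_to_stdout(void *ctx)
{
	static const char fmt[] = "hello from prog, value=%d";
	__u64 args[] = { 42 };

	/* 1 == BPF_STREAM_STDOUT in the uapi enum added by this patch. */
	bpf_stream_vprintk(1, fmt, args, sizeof(args), NULL);
	return 0;
}

char _license[] SEC("license") = "GPL";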
The bprintf buffer helpers are extracted out to be reused for printing the string into them before copying it into the stream, so that we can (with the defined max limit) format a string and know its true length before performing allocations of the stream element. For consuming elements from a stream, we expose a bpf(2) syscall command named BPF_PROG_STREAM_READ_BY_FD, which allows reading data from the stream of a given prog_fd into a user space buffer. The main logic is implemented in bpf_stream_read(). The log messages are queued in bpf_stream::log by the bpf_stream_vprintk kfunc, and then pulled and ordered correctly in the stream backlog. For this purpose, we hold a lock around bpf_stream_backlog_peek(), as llist_del_first() (if we maintained a second lockless list for the backlog) wouldn't be safe from multiple threads anyway. Then, if we fail to find something in the backlog log, we splice out everything from the lockless log, and place it in the backlog log, and then return the head of the backlog. Once the full length of the element is consumed, we will pop it and free it. The lockless list bpf_stream::log is a LIFO stack. Elements obtained using a llist_del_all() operation are in LIFO order, thus would break the chronological ordering if printed directly. Hence, this batch of messages is first reversed. Then, it is stashed into a separate list in the stream, i.e. the backlog_log. The head of this list is the actual message that should always be returned to the caller. All of this is done in bpf_stream_backlog_fill(). From the kernel side, the writing into the stream will be a bit more involved than the typical printk. First, the kernel typically may print a collection of messages into the stream, and parallel writers into the stream may suffer from interleaving of messages. To ensure each group of messages is visible atomically, we can lift the advantage of using a lockless list for pushing in messages. To enable this, we add a bpf_stream_stage() macro, and require kernel users to use bpf_stream_printk statements for the passed expression to write into the stream. Underneath the macro, we have a message staging API, where a bpf_stream_stage object on the stack accumulates the messages being printed into a local llist_head, and then a commit operation splices the whole batch into the stream's lockless log list. This is especially pertinent for rqspinlock deadlock messages printed to program streams. After this change, we see each deadlock invocation as a non-interleaving contiguous message without any confusion on the reader's part, improving their user experience in debugging the fault. While programs cannot benefit from this staged stream writing API, they could just as well hold an rqspinlock around their print statements to serialize messages, hence this is kept kernel-internal for now. Overall, this infrastructure provides NMI-safe any context printing of messages to two dedicated streams. Later patches will add support for printing splats in case of BPF arena page faults, rqspinlock deadlocks, and cond_break timeouts, and integration of this facility into bpftool for dumping messages to user space. 
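On the consumer side, reading a stream amounts to filling the new prog_stream_read attr fields and issuing the new command. A minimal sketch using the raw bpf(2) syscall, assuming uapi headers that already carry the additions below (error handling elided):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Drain up to len bytes of BPF_STDERR for the program behind prog_fd.
 * Returns the number of bytes copied (0 when the stream is empty) or -1.
 */
static long read_prog_stderr(int prog_fd, char *buf, __u32 len)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_stream_read.prog_fd = prog_fd;
	attr.prog_stream_read.stream_id = BPF_STREAM_STDERR;
	attr.prog_stream_read.stream_buf = (__u64)(unsigned long)buf;
	attr.prog_stream_read.stream_buf_len = len;

	return syscall(__NR_bpf, BPF_PROG_STREAM_READ_BY_FD, &attr, sizeof(attr));
}

Repeated calls keep returning data in chronological order until the backlog is drained, per bpf_stream_read() below.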
[0]: https://lore.kernel.org/bpf/20250501032718.65476-1-alexei.starovoitov@gmail.com Reviewed-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 52 +++++ include/uapi/linux/bpf.h | 24 +++ kernel/bpf/Makefile | 2 +- kernel/bpf/core.c | 5 + kernel/bpf/helpers.c | 1 + kernel/bpf/stream.c | 478 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 25 +++ kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 24 +++ 9 files changed, 611 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/stream.c (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a07451457d9e..f61aeccb23c3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1538,6 +1538,37 @@ struct btf_mod_pair { struct bpf_kfunc_desc_tab; +enum bpf_stream_id { + BPF_STDOUT = 1, + BPF_STDERR = 2, +}; + +struct bpf_stream_elem { + struct llist_node node; + int total_len; + int consumed_len; + char str[]; +}; + +enum { + /* 100k bytes */ + BPF_STREAM_MAX_CAPACITY = 100000ULL, +}; + +struct bpf_stream { + atomic_t capacity; + struct llist_head log; /* list of in-flight stream elements in LIFO order */ + + struct mutex lock; /* lock protecting backlog_{head,tail} */ + struct llist_node *backlog_head; /* list of in-flight stream elements in FIFO order */ + struct llist_node *backlog_tail; /* tail of the list above */ +}; + +struct bpf_stream_stage { + struct llist_head log; + int len; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -1646,6 +1677,7 @@ struct bpf_prog_aux { struct work_struct work; struct rcu_head rcu; }; + struct bpf_stream stream[2]; }; struct bpf_prog { @@ -2409,6 +2441,7 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); + int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG @@ -3574,6 +3607,25 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs); void bpf_put_buffers(void); +void bpf_prog_stream_init(struct bpf_prog *prog); +void bpf_prog_stream_free(struct bpf_prog *prog); +int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len); +void bpf_stream_stage_init(struct bpf_stream_stage *ss); +void bpf_stream_stage_free(struct bpf_stream_stage *ss); +__printf(2, 3) +int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...); +int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, + enum bpf_stream_id stream_id); + +#define bpf_stream_printk(ss, ...) bpf_stream_stage_printk(&ss, __VA_ARGS__) + +#define bpf_stream_stage(ss, prog, stream_id, expr) \ + ({ \ + bpf_stream_stage_init(&ss); \ + (expr); \ + bpf_stream_stage_commit(&ss, prog, stream_id); \ + bpf_stream_stage_free(&ss); \ + }) #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 719ba230032f..0670e15a6100 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -906,6 +906,17 @@ union bpf_iter_link_info { * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). 
* + * BPF_PROG_STREAM_READ_BY_FD + * Description + * Read data of a program's BPF stream. The program is identified + * by *prog_fd*, and the stream is identified by the *stream_id*. + * The data is copied to a buffer pointed to by *stream_buf*, and + * filled less than or equal to *stream_buf_len* bytes. + * + * Return + * Number of bytes read from the stream on success, or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -961,6 +972,7 @@ enum bpf_cmd { BPF_LINK_DETACH, BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, + BPF_PROG_STREAM_READ_BY_FD, __MAX_BPF_CMD, }; @@ -1463,6 +1475,11 @@ struct bpf_stack_build_id { #define BPF_OBJ_NAME_LEN 16U +enum { + BPF_STREAM_STDOUT = 1, + BPF_STREAM_STDERR = 2, +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -1849,6 +1866,13 @@ union bpf_attr { __u32 bpffs_fd; } token_create; + struct { + __aligned_u64 stream_buf; + __u32 stream_buf_len; + __u32 stream_id; + __u32 prog_fd; + } prog_stream_read; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3a335c50e6e3..269c04a24664 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o stream.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e536a34a32c8..f0def24573ae 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -134,6 +134,10 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); +#ifdef CONFIG_BPF_SYSCALL + bpf_prog_stream_init(fp); +#endif + return fp; } @@ -2862,6 +2866,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); + bpf_prog_stream_free(aux->prog); #endif #ifdef CONFIG_CGROUP_BPF if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index da66ce307e75..6bfcbcdf6588 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3825,6 +3825,7 @@ BTF_ID_FLAGS(func, bpf_strnstr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif +BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c new file mode 100644 index 000000000000..e434541358db --- /dev/null +++ b/kernel/bpf/stream.c @@ -0,0 +1,478 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe + * try_alloc_pages()/free_pages_nolock() primitives. 
We allocate a page and + * stash it in a local per-CPU variable, and bump allocate from the page + * whenever items need to be printed to a stream. Each page holds a global + * atomic refcount in its first 4 bytes, and then records of variable length + * that describe the printed messages. Once the global refcount has dropped to + * zero, it is a signal to free the page back to the kernel's page allocator, + * given all the individual records in it have been consumed. + * + * It is possible the same page is used to serve allocations across different + * programs, which may be consumed at different times individually, hence + * maintaining a reference count per-page is critical for correct lifetime + * tracking. + * + * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it + * lands. + */ +struct bpf_stream_page { + refcount_t ref; + u32 consumed; + char buf[]; +}; + +/* Available room to add data to a refcounted page. */ +#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed)) + +static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock); +static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page); + +static bool bpf_stream_page_local_lock(unsigned long *flags) +{ + return local_trylock_irqsave(&stream_local_lock, *flags); +} + +static void bpf_stream_page_local_unlock(unsigned long *flags) +{ + local_unlock_irqrestore(&stream_local_lock, *flags); +} + +static void bpf_stream_page_free(struct bpf_stream_page *stream_page) +{ + struct page *p; + + if (!stream_page) + return; + p = virt_to_page(stream_page); + free_pages_nolock(p, 0); +} + +static void bpf_stream_page_get(struct bpf_stream_page *stream_page) +{ + refcount_inc(&stream_page->ref); +} + +static void bpf_stream_page_put(struct bpf_stream_page *stream_page) +{ + if (refcount_dec_and_test(&stream_page->ref)) + bpf_stream_page_free(stream_page); +} + +static void bpf_stream_page_init(struct bpf_stream_page *stream_page) +{ + refcount_set(&stream_page->ref, 1); + stream_page->consumed = 0; +} + +static struct bpf_stream_page *bpf_stream_page_replace(void) +{ + struct bpf_stream_page *stream_page, *old_stream_page; + struct page *page; + + page = alloc_pages_nolock(NUMA_NO_NODE, 0); + if (!page) + return NULL; + stream_page = page_address(page); + bpf_stream_page_init(stream_page); + + old_stream_page = this_cpu_read(stream_pcpu_page); + if (old_stream_page) + bpf_stream_page_put(old_stream_page); + this_cpu_write(stream_pcpu_page, stream_page); + return stream_page; +} + +static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len) +{ + int min = offsetof(struct bpf_stream_elem, str[0]); + int consumed = stream_page->consumed; + int total = BPF_STREAM_PAGE_SZ; + int rem = max(0, total - consumed - min); + + /* Let's give room of at least 8 bytes. */ + WARN_ON_ONCE(rem % 8 != 0); + rem = rem < 8 ? 
0 : rem; + return min(len, rem); +} + +static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) +{ + init_llist_node(&elem->node); + elem->total_len = len; + elem->consumed_len = 0; +} + +static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem) +{ + unsigned long addr = (unsigned long)elem; + + return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr); +} + +static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len) +{ + u32 consumed = stream_page->consumed; + + stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8); + return (struct bpf_stream_elem *)&stream_page->buf[consumed]; +} + +static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len) +{ + struct bpf_stream_elem *elem = NULL; + struct bpf_stream_page *page; + int room = 0; + + page = this_cpu_read(stream_pcpu_page); + if (!page) + page = bpf_stream_page_replace(); + if (!page) + return NULL; + + room = bpf_stream_page_check_room(page, len); + if (room != len) + page = bpf_stream_page_replace(); + if (!page) + return NULL; + bpf_stream_page_get(page); + room = bpf_stream_page_check_room(page, len); + WARN_ON_ONCE(room != len); + + elem = bpf_stream_page_push_elem(page, room); + bpf_stream_elem_init(elem, room); + return elem; +} + +static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) +{ + const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf); + struct bpf_stream_elem *elem; + unsigned long flags; + + BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ); + /* + * Length denotes the amount of data to be written as part of stream element, + * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can + * accomodate, therefore deny allocations that won't fit into them. + */ + if (len < 0 || len > max_len) + return NULL; + + if (!bpf_stream_page_local_lock(&flags)) + return NULL; + elem = bpf_stream_page_reserve_elem(len); + bpf_stream_page_local_unlock(&flags); + return elem; +} + +static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len) +{ + struct bpf_stream_elem *elem = NULL; + + /* + * Allocate a bpf_prog_stream_elem and push it to the bpf_prog_stream + * log, elements will be popped at once and reversed to print the log. 
+ */ + elem = bpf_stream_elem_alloc(len); + if (!elem) + return -ENOMEM; + + memcpy(elem->str, str, len); + llist_add(&elem->node, log); + + return 0; +} + +static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len) +{ + if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY) + return -ENOSPC; + if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) { + atomic_sub(len, &stream->capacity); + return -ENOSPC; + } + return 0; +} + +static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem) +{ + int len = elem->total_len; + + atomic_sub(len, &stream->capacity); +} + +static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len) +{ + int ret = bpf_stream_consume_capacity(stream, len); + + return ret ?: __bpf_stream_push_str(&stream->log, str, len); +} + +static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux) +{ + if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR) + return NULL; + return &aux->stream[stream_id - 1]; +} + +static void bpf_stream_free_elem(struct bpf_stream_elem *elem) +{ + struct bpf_stream_page *p; + + p = bpf_stream_page_from_elem(elem); + bpf_stream_page_put(p); +} + +static void bpf_stream_free_list(struct llist_node *list) +{ + struct bpf_stream_elem *elem, *tmp; + + llist_for_each_entry_safe(elem, tmp, list, node) + bpf_stream_free_elem(elem); +} + +static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream) +{ + return stream->backlog_head; +} + +static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream) +{ + struct llist_node *node; + + node = stream->backlog_head; + if (stream->backlog_head == stream->backlog_tail) + stream->backlog_head = stream->backlog_tail = NULL; + else + stream->backlog_head = node->next; + return node; +} + +static void bpf_stream_backlog_fill(struct bpf_stream *stream) +{ + struct llist_node *head, *tail; + + if (llist_empty(&stream->log)) + return; + tail = llist_del_all(&stream->log); + if (!tail) + return; + head = llist_reverse_order(tail); + + if (!stream->backlog_head) { + stream->backlog_head = head; + stream->backlog_tail = tail; + } else { + stream->backlog_tail->next = head; + stream->backlog_tail = tail; + } + + return; +} + +static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len) +{ + int rem = elem->total_len - elem->consumed_len; + int used = min(rem, *len); + + elem->consumed_len += used; + *len -= used; + + return elem->consumed_len == elem->total_len; +} + +static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len) +{ + int rem_len = len, cons_len, ret = 0; + struct bpf_stream_elem *elem = NULL; + struct llist_node *node; + + mutex_lock(&stream->lock); + + while (rem_len) { + int pos = len - rem_len; + bool cont; + + node = bpf_stream_backlog_peek(stream); + if (!node) { + bpf_stream_backlog_fill(stream); + node = bpf_stream_backlog_peek(stream); + } + if (!node) + break; + elem = container_of(node, typeof(*elem), node); + + cons_len = elem->consumed_len; + cont = bpf_stream_consume_elem(elem, &rem_len) == false; + + ret = copy_to_user(buf + pos, elem->str + cons_len, + elem->consumed_len - cons_len); + /* Restore in case of error. */ + if (ret) { + ret = -EFAULT; + elem->consumed_len = cons_len; + break; + } + + if (cont) + continue; + bpf_stream_backlog_pop(stream); + bpf_stream_release_capacity(stream, elem); + bpf_stream_free_elem(elem); + } + + mutex_unlock(&stream->lock); + return ret ? 
ret : len - rem_len; +} + +int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len) +{ + struct bpf_stream *stream; + + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + return -ENOENT; + return bpf_stream_read(stream, buf, len); +} + +__bpf_kfunc_start_defs(); + +/* + * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the + * enum in headers. + */ +__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog) +{ + struct bpf_bprintf_data data = { + .get_bin_args = true, + .get_buf = true, + }; + struct bpf_prog_aux *aux = aux__prog; + u32 fmt_size = strlen(fmt__str) + 1; + struct bpf_stream *stream; + u32 data_len = len__sz; + int ret, num_args; + + stream = bpf_stream_get(stream_id, aux); + if (!stream) + return -ENOENT; + + if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || + (data_len && !args)) + return -EINVAL; + num_args = data_len / 8; + + ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data); + if (ret < 0) + return ret; + + ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args); + /* Exclude NULL byte during push. */ + ret = bpf_stream_push_str(stream, data.buf, ret); + bpf_bprintf_cleanup(&data); + + return ret; +} + +__bpf_kfunc_end_defs(); + +/* Added kfunc to common_btf_ids */ + +void bpf_prog_stream_init(struct bpf_prog *prog) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + atomic_set(&prog->aux->stream[i].capacity, 0); + init_llist_head(&prog->aux->stream[i].log); + mutex_init(&prog->aux->stream[i].lock); + prog->aux->stream[i].backlog_head = NULL; + prog->aux->stream[i].backlog_tail = NULL; + } +} + +void bpf_prog_stream_free(struct bpf_prog *prog) +{ + struct llist_node *list; + int i; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + list = llist_del_all(&prog->aux->stream[i].log); + bpf_stream_free_list(list); + bpf_stream_free_list(prog->aux->stream[i].backlog_head); + } +} + +void bpf_stream_stage_init(struct bpf_stream_stage *ss) +{ + init_llist_head(&ss->log); + ss->len = 0; +} + +void bpf_stream_stage_free(struct bpf_stream_stage *ss) +{ + struct llist_node *node; + + node = llist_del_all(&ss->log); + bpf_stream_free_list(node); +} + +int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...) +{ + struct bpf_bprintf_buffers *buf; + va_list args; + int ret; + + if (bpf_try_get_buffers(&buf)) + return -EBUSY; + + va_start(args, fmt); + ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args); + va_end(args); + ss->len += ret; + /* Exclude NULL byte during push. 
*/ + ret = __bpf_stream_push_str(&ss->log, buf->buf, ret); + bpf_put_buffers(); + return ret; +} + +int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, + enum bpf_stream_id stream_id) +{ + struct llist_node *list, *head, *tail; + struct bpf_stream *stream; + int ret; + + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + return -EINVAL; + + ret = bpf_stream_consume_capacity(stream, ss->len); + if (ret) + return ret; + + list = llist_del_all(&ss->log); + head = tail = list; + + if (!list) + return 0; + while (llist_next(list)) { + tail = llist_next(list); + list = tail; + } + llist_add_batch(head, tail, &stream->log); + return 0; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f1d9ee9717a1..7db7182a3057 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5943,6 +5943,28 @@ static int token_create(union bpf_attr *attr) return bpf_token_create(attr); } +#define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd + +static int prog_stream_read(union bpf_attr *attr) +{ + char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf); + u32 len = attr->prog_stream_read.stream_buf_len; + struct bpf_prog *prog; + int ret; + + if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD)) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_stream_read.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len); + bpf_prog_put(prog); + + return ret; +} + static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -6079,6 +6101,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) case BPF_TOKEN_CREATE: err = token_create(&attr); break; + case BPF_PROG_STREAM_READ_BY_FD: + err = prog_stream_read(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 52e36fd23f40..9f09dcd2eabb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -46,6 +46,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { enum bpf_features { BPF_FEAT_RDONLY_CAST_TO_VOID = 0, + BPF_FEAT_STREAMS = 1, __MAX_BPF_FEAT, }; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 719ba230032f..0670e15a6100 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -906,6 +906,17 @@ union bpf_iter_link_info { * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_STREAM_READ_BY_FD + * Description + * Read data of a program's BPF stream. The program is identified + * by *prog_fd*, and the stream is identified by the *stream_id*. + * The data is copied to a buffer pointed to by *stream_buf*, and + * filled less than or equal to *stream_buf_len* bytes. + * + * Return + * Number of bytes read from the stream on success, or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. 
* @@ -961,6 +972,7 @@ enum bpf_cmd { BPF_LINK_DETACH, BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, + BPF_PROG_STREAM_READ_BY_FD, __MAX_BPF_CMD, }; @@ -1463,6 +1475,11 @@ struct bpf_stack_build_id { #define BPF_OBJ_NAME_LEN 16U +enum { + BPF_STREAM_STDOUT = 1, + BPF_STREAM_STDERR = 2, +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -1849,6 +1866,13 @@ union bpf_attr { __u32 bpffs_fd; } token_create; + struct { + __aligned_u64 stream_buf; + __u32 stream_buf_len; + __u32 stream_id; + __u32 prog_fd; + } prog_stream_read; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 0e521efaf36350b8f783984541efa56f560c90b0 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:09 -0700 Subject: bpf: Add function to extract program source info Prepare a function for use in future patches that can extract the file info, line info, and the source line number for a given BPF program provided it's program counter. Only the basename of the file path is provided, given it can be excessively long in some cases. This will be used in later patches to print source info to the BPF stream. Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ kernel/bpf/core.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f61aeccb23c3..c3802af11bac 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3661,4 +3661,7 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog) return prog->aux->func_idx != 0; } +int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, + const char **linep, int *nump); + #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f0def24573ae..2dc5b846ae50 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3213,3 +3213,50 @@ EXPORT_SYMBOL(bpf_stats_enabled_key); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); + +#ifdef CONFIG_BPF_SYSCALL + +int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, + const char **linep, int *nump) +{ + int idx = -1, insn_start, insn_end, len; + struct bpf_line_info *linfo; + void **jited_linfo; + struct btf *btf; + + btf = prog->aux->btf; + linfo = prog->aux->linfo; + jited_linfo = prog->aux->jited_linfo; + + if (!btf || !linfo || !jited_linfo) + return -EINVAL; + len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len; + + linfo = &prog->aux->linfo[prog->aux->linfo_idx]; + jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx]; + + insn_start = linfo[0].insn_off; + insn_end = insn_start + len; + + for (int i = 0; i < prog->aux->nr_linfo && + linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) { + if (jited_linfo[i] >= (void *)ip) + break; + idx = i; + } + + if (idx == -1) + return -ENOENT; + + /* Get base component of the file path. */ + *filep = btf_name_by_offset(btf, linfo[idx].file_name_off); + *filep = kbasename(*filep); + /* Obtain the source line, and strip whitespace in prefix. 
*/ + *linep = btf_name_by_offset(btf, linfo[idx].line_off); + while (isspace(**linep)) + *linep += 1; + *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col); + return 0; +} + +#endif -- cgit v1.2.3 From f0c53fd4a742f957da7077a691a85ef9775907dc Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:11 -0700 Subject: bpf: Add function to find program from stack trace In preparation of figuring out the closest program that led to the current point in the kernel, implement a function that scans through the stack trace and finds out the closest BPF program when walking down the stack trace. Special care needs to be taken to skip over kernel and BPF subprog frames. We basically scan until we find a BPF main prog frame. The assumption is that if a program calls into us transitively, we'll hit it along the way. If not, we end up returning NULL. Contextually the function will be used in places where we know the program may have called into us. Due to reliance on arch_bpf_stack_walk(), this function only works on x86 with CONFIG_UNWINDER_ORC, arm64, and s390. Remove the warning from arch_bpf_stack_walk as well since we call it outside bpf_throw() context. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 1 - include/linux/bpf.h | 1 + kernel/bpf/core.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 15672cb926fc..40e1b3b9634f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3845,7 +3845,6 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp } return; #endif - WARN(1, "verification of programs using bpf_throw should have failed\n"); } void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c3802af11bac..b267c378d884 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3663,5 +3663,6 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog) int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep, int *nump); +struct bpf_prog *bpf_prog_find_from_stack(void); #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 833442661742..037d67cf5fb1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3262,4 +3262,37 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * return 0; } +struct walk_stack_ctx { + struct bpf_prog *prog; +}; + +static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct walk_stack_ctx *ctxp = cookie; + struct bpf_prog *prog; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it has an + * active stack frame on the current stack trace, and won't disappear. 
+ */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (!prog) + return true; + if (bpf_is_subprog(prog)) + return true; + ctxp->prog = prog; + return false; +} + +struct bpf_prog *bpf_prog_find_from_stack(void) +{ + struct walk_stack_ctx ctx = {}; + + arch_bpf_stack_walk(find_from_stack_cb, &ctx); + return ctx.prog; +} + #endif -- cgit v1.2.3 From d7c431cafcb4917b0d87b5cd10637cd47b6c8d79 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:12 -0700 Subject: bpf: Add dump_stack() analogue to print to BPF stderr Introduce a kernel function which is the analogue of dump_stack() printing some useful information and the stack trace. This is not exposed to BPF programs yet, but can be made available in the future. When we have a program counter for a BPF program in the stack trace, also additionally output the filename and line number to make the trace helpful. The rest of the trace can be passed into ./decode_stacktrace.sh to obtain the line numbers for kernel symbols. Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/stream.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b267c378d884..34dd90ec7fad 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3616,8 +3616,10 @@ __printf(2, 3) int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...); int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, enum bpf_stream_id stream_id); +int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss); #define bpf_stream_printk(ss, ...) bpf_stream_stage_printk(&ss, __VA_ARGS__) +#define bpf_stream_dump_stack(ss) bpf_stream_stage_dump_stack(&ss) #define bpf_stream_stage(ss, prog, stream_id, expr) \ ({ \ diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index e434541358db..8c842f845245 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -2,6 +2,7 @@ /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ #include +#include #include #include #include @@ -476,3 +477,50 @@ int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, llist_add_batch(head, tail, &stream->log); return 0; } + +struct dump_stack_ctx { + struct bpf_stream_stage *ss; + int err; +}; + +static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct dump_stack_ctx *ctxp = cookie; + const char *file = "", *line = ""; + struct bpf_prog *prog; + int num, ret; + + rcu_read_lock(); + prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (prog) { + ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num); + if (ret < 0) + goto end; + ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n", + (void *)ip, line, file, num); + return !ctxp->err; + } +end: + ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)ip); + return !ctxp->err; +} + +int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss) +{ + struct dump_stack_ctx ctx = { .ss = ss }; + int ret; + + ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n", + raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), + current->pid, current->comm); + if (ret) + return ret; + ret = bpf_stream_stage_printk(ss, "Call trace:\n"); + if (ret) + return ret; + arch_bpf_stack_walk(dump_stack_cb, &ctx); + if (ctx.err) + return ctx.err; + return bpf_stream_stage_printk(ss, "\n"); +} -- cgit v1.2.3 From 82bc4abf28d8147dd5da9ba52f0aa1bac23c125e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 3 Jul 2025 07:11:17 -0700 Subject: bpf: Avoid putting struct bpf_scc_callchain variables on the stack Add a 'struct bpf_scc_callchain callchain_buf' field in bpf_verifier_env. This way, the previous bpf_scc_callchain local variables can be replaced by taking address of env->callchain_buf. 
This can reduce stack usage and fix the following error: kernel/bpf/verifier.c:19921:12: error: stack frame size (1368) exceeds limit (1280) in 'do_check' [-Werror,-Wframe-larger-than] Reported-by: Arnd Bergmann Acked-by: Jiri Olsa Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20250703141117.1485108-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 36 ++++++++++++++++++------------------ 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7e459e839f8b..94defa405c85 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -841,6 +841,7 @@ struct bpf_verifier_env { char tmp_str_buf[TMP_STR_BUF_LEN]; struct bpf_insn insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; + struct bpf_scc_callchain callchain_buf; /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92dba3c9664f..0f6cc2275695 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1914,19 +1914,19 @@ static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callc */ static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { - struct bpf_scc_callchain callchain; + struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; - if (!compute_scc_callchain(env, st, &callchain)) + if (!compute_scc_callchain(env, st, callchain)) return 0; - visit = scc_visit_lookup(env, &callchain); - visit = visit ?: scc_visit_alloc(env, &callchain); + visit = scc_visit_lookup(env, callchain); + visit = visit ?: scc_visit_alloc(env, callchain); if (!visit) return -ENOMEM; if (!visit->entry_state) { visit->entry_state = st; if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "SCC enter %s\n", format_callchain(env, &callchain)); + verbose(env, "SCC enter %s\n", format_callchain(env, callchain)); } return 0; } @@ -1939,21 +1939,21 @@ static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visi */ static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { - struct bpf_scc_callchain callchain; + struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; - if (!compute_scc_callchain(env, st, &callchain)) + if (!compute_scc_callchain(env, st, callchain)) return 0; - visit = scc_visit_lookup(env, &callchain); + visit = scc_visit_lookup(env, callchain); if (!visit) { verifier_bug(env, "scc exit: no visit info for call chain %s", - format_callchain(env, &callchain)); + format_callchain(env, callchain)); return -EFAULT; } if (visit->entry_state != st) return 0; if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "SCC exit %s\n", format_callchain(env, &callchain)); + verbose(env, "SCC exit %s\n", format_callchain(env, callchain)); visit->entry_state = NULL; env->num_backedges -= visit->num_backedges; visit->num_backedges = 0; @@ -1968,22 +1968,22 @@ static int add_scc_backedge(struct bpf_verifier_env *env, struct bpf_verifier_state *st, struct bpf_scc_backedge *backedge) { - struct bpf_scc_callchain callchain; + struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; - if (!compute_scc_callchain(env, st, &callchain)) { + if (!compute_scc_callchain(env, st, callchain)) { verifier_bug(env, "add backedge: 
no SCC in verification path, insn_idx %d", st->insn_idx); return -EFAULT; } - visit = scc_visit_lookup(env, &callchain); + visit = scc_visit_lookup(env, callchain); if (!visit) { verifier_bug(env, "add backedge: no visit info for call chain %s", - format_callchain(env, &callchain)); + format_callchain(env, callchain)); return -EFAULT; } if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "SCC backedge %s\n", format_callchain(env, &callchain)); + verbose(env, "SCC backedge %s\n", format_callchain(env, callchain)); backedge->next = visit->backedges; visit->backedges = backedge; visit->num_backedges++; @@ -1999,12 +1999,12 @@ static int add_scc_backedge(struct bpf_verifier_env *env, static bool incomplete_read_marks(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { - struct bpf_scc_callchain callchain; + struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; - if (!compute_scc_callchain(env, st, &callchain)) + if (!compute_scc_callchain(env, st, callchain)) return false; - visit = scc_visit_lookup(env, &callchain); + visit = scc_visit_lookup(env, callchain); if (!visit) return false; return !!visit->backedges; -- cgit v1.2.3 From c4aa454c64ae022e5a9d55b3c31e9b8dd8a1544f Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 4 Jul 2025 16:03:53 -0700 Subject: bpf: support for void/primitive __arg_untrusted global func params Allow specifying __arg_untrusted for void */char */int */long * parameters. Treat such parameters as PTR_TO_MEM|MEM_RDONLY|PTR_UNTRUSTED of size zero. Intended usage is as follows: int memcmp(char *a __arg_untrusted, char *b __arg_untrusted, size_t n) { bpf_for(i, 0, n) { if (a[i] - b[i]) // load at any offset is allowed return a[i] - b[i]; } return 0; } Allocate register id for ARG_PTR_TO_MEM parameters only when PTR_MAYBE_NULL is set. Register id for PTR_TO_MEM is used only to propagate non-null status after conditionals. Suggested-by: Alexei Starovoitov Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250704230354.1323244-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + kernel/bpf/btf.c | 15 ++++++++++++++- kernel/bpf/verifier.c | 7 ++++--- 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index a40beb9cf160..9eda6b113f9b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -223,6 +223,7 @@ u32 btf_nr_types(const struct btf *btf); struct btf *btf_base_btf(const struct btf *btf); bool btf_type_is_i32(const struct btf_type *t); bool btf_type_is_i64(const struct btf_type *t); +bool btf_type_is_primitive(const struct btf_type *t); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e0414d9f5e29..2dd13eea7b0e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -891,6 +891,12 @@ bool btf_type_is_i64(const struct btf_type *t) return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8); } +bool btf_type_is_primitive(const struct btf_type *t) +{ + return (btf_type_is_int(t) && btf_type_int_is_regular(t)) || + btf_is_any_enum(t); +} + /* * Check that given struct member is a regular int with expected * offset and size. 
@@ -7830,6 +7836,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return -EINVAL; } + ref_t = btf_type_skip_modifiers(btf, t->type, NULL); + if (btf_type_is_void(ref_t) || btf_type_is_primitive(ref_t)) { + sub->args[i].arg_type = ARG_PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED; + sub->args[i].mem_size = 0; + continue; + } + kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); if (kern_type_id < 0) return kern_type_id; @@ -7838,7 +7851,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) ref_t = btf_type_by_id(vmlinux_btf, kern_type_id); if (!btf_type_is_struct(ref_t)) { tname = __btf_name_by_offset(vmlinux_btf, t->name_off); - bpf_log(log, "arg#%d has type %s '%s', but only struct types are allowed\n", + bpf_log(log, "arg#%d has type %s '%s', but only struct or primitive types are allowed\n", i, btf_type_str(ref_t), tname); return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7af902c3ecc3..1e567fff6f23 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23152,11 +23152,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { reg->type = PTR_TO_MEM; - if (arg->arg_type & PTR_MAYBE_NULL) - reg->type |= PTR_MAYBE_NULL; + reg->type |= arg->arg_type & + (PTR_MAYBE_NULL | PTR_UNTRUSTED | MEM_RDONLY); mark_reg_known_zero(env, regs, i); reg->mem_size = arg->mem_size; - reg->id = ++env->id_gen; + if (arg->arg_type & PTR_MAYBE_NULL) + reg->id = ++env->id_gen; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { reg->type = PTR_TO_BTF_ID; if (arg->arg_type & PTR_MAYBE_NULL) -- cgit v1.2.3 From b725441f02c2b31c04a95d0e9ca5420fa029a767 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Thu, 10 Jul 2025 11:20:32 +0800 Subject: bpf: Add attach_type field to bpf_link Attach_type will be set when a link is created by user. It is better to record attach_type in bpf_link generically and have it available universally for all link types. So add the attach_type field in bpf_link and move the sleepable field to avoid unnecessary gap padding. 
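For context, the recorded value originates from user space at link creation time; a minimal sketch using libbpf's low-level wrapper (illustrative only, not part of this patch; prog_fd and cgroup_fd are assumed to already exist):

    /* user space: the attach type passed to BPF_LINK_CREATE ends up in
     * attr->link_create.attach_type and is now saved in the generic bpf_link
     */
    int link_fd = bpf_link_create(prog_fd, cgroup_fd,
                                  BPF_CGROUP_INET_INGRESS, NULL);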
Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250710032038.888700-2-chen.dylane@linux.dev --- drivers/net/netkit.c | 2 +- include/linux/bpf.h | 28 +++++++++++++++++----------- kernel/bpf/bpf_iter.c | 3 ++- kernel/bpf/bpf_struct_ops.c | 5 +++-- kernel/bpf/cgroup.c | 4 ++-- kernel/bpf/net_namespace.c | 2 +- kernel/bpf/syscall.c | 35 ++++++++++++++++++++++------------- kernel/bpf/tcx.c | 3 ++- kernel/bpf/trampoline.c | 10 ++++++---- kernel/trace/bpf_trace.c | 4 ++-- net/bpf/bpf_dummy_struct_ops.c | 3 ++- net/core/dev.c | 3 ++- net/core/sock_map.c | 3 ++- net/netfilter/nf_bpf_link.c | 3 ++- 14 files changed, 66 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index d072a7968f56..5928c99eac73 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -775,7 +775,7 @@ static int netkit_link_init(struct netkit_link *nkl, struct bpf_prog *prog) { bpf_link_init(&nkl->link, BPF_LINK_TYPE_NETKIT, - &netkit_link_lops, prog); + &netkit_link_lops, prog, attr->link_create.attach_type); nkl->location = attr->link_create.attach_type; nkl->dev = dev; return bpf_link_prime(&nkl->link, link_primer); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 34dd90ec7fad..a9ee9c14b486 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1729,12 +1729,10 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; - /* whether BPF link itself has "sleepable" semantics, which can differ - * from underlying BPF program having a "sleepable" semantics, as BPF - * link's semantics is determined by target attach hook - */ - bool sleepable; + u32 flags; + enum bpf_attach_type attach_type; + /* rcu is used before freeing, work can be used to schedule that * RCU-based freeing before that, so they never overlap */ @@ -1742,6 +1740,11 @@ struct bpf_link { struct rcu_head rcu; struct work_struct work; }; + /* whether BPF link itself has "sleepable" semantics, which can differ + * from underlying BPF program having a "sleepable" semantics, as BPF + * link's semantics is determined by target attach hook + */ + bool sleepable; }; struct bpf_link_ops { @@ -2034,11 +2037,13 @@ int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, - int cgroup_atype); + int cgroup_atype, + enum bpf_attach_type attach_type); void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog); #else static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, - int cgroup_atype) + int cgroup_atype, + enum bpf_attach_type attach_type) { return -EOPNOTSUPP; } @@ -2528,10 +2533,11 @@ int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, - const struct bpf_link_ops *ops, struct bpf_prog *prog); + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type); void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, - bool sleepable); + enum bpf_attach_type attach_type, bool sleepable); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); @@ -2883,13 +2889,13 @@ 
bpf_prog_inc_not_zero(struct bpf_prog *prog) static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, - struct bpf_prog *prog) + struct bpf_prog *prog, enum bpf_attach_type attach_type) { } static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, - bool sleepable) + enum bpf_attach_type attach_type, bool sleepable) { } diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 303ab1f42d3a..0cbcae727079 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -552,7 +552,8 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, if (!link) return -ENOMEM; - bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog, + attr->link_create.attach_type); link->tinfo = tinfo; err = bpf_link_prime(&link->link, &link_primer); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 96113633e391..687a3e9c76f5 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -808,7 +808,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, - &bpf_struct_ops_link_lops, prog); + &bpf_struct_ops_link_lops, prog, prog->expected_attach_type); *plink++ = &link->link; ksym = kzalloc(sizeof(*ksym), GFP_USER); @@ -1351,7 +1351,8 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) err = -ENOMEM; goto err_out; } - bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL, + attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cd220e861d67..bacdd0ca7419 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -867,7 +867,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, cgrp->bpf.flags[atype] = saved_flags; if (type == BPF_LSM_CGROUP) { - err = bpf_trampoline_link_cgroup_shim(new_prog, atype); + err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type); if (err) goto cleanup; } @@ -1495,7 +1495,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto out_put_cgroup; } bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, - prog); + prog, attr->link_create.attach_type); link->cgroup = cgrp; link->type = attr->link_create.attach_type; diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 868cc2c43899..63702c862757 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -501,7 +501,7 @@ int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) goto out_put_net; } bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, - &bpf_netns_link_ops, prog); + &bpf_netns_link_ops, prog, type); net_link->net = net; net_link->type = type; net_link->netns_type = netns_type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3f36bfe13266..cd7321fe0ba3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3069,7 +3069,7 @@ static int bpf_obj_get(const union bpf_attr *attr) */ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, - bool sleepable) + enum bpf_attach_type attach_type, bool sleepable) { 
WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); @@ -3078,12 +3078,14 @@ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, link->id = 0; link->ops = ops; link->prog = prog; + link->attach_type = attach_type; } void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, - const struct bpf_link_ops *ops, struct bpf_prog *prog) + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type) { - bpf_link_init_sleepable(link, type, ops, prog, false); + bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); } static void bpf_link_free_id(int id) @@ -3443,7 +3445,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, u32 btf_id, - u64 bpf_cookie) + u64 bpf_cookie, + enum bpf_attach_type attach_type) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; @@ -3511,7 +3514,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog); + &bpf_tracing_link_lops, prog, attach_type); + link->attach_type = prog->expected_attach_type; link->link.cookie = bpf_cookie; @@ -4049,7 +4053,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro err = -ENOMEM; goto out_put_file; } - bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog, + attr->link_create.attach_type); link->perf_file = perf_file; err = bpf_link_prime(&link->link, &link_primer); @@ -4081,7 +4086,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, - const char __user *user_tp_name, u64 cookie) + const char __user *user_tp_name, u64 cookie, + enum bpf_attach_type attach_type) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; @@ -4104,7 +4110,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog, 0, 0, 0); + return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) @@ -4126,7 +4132,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, goto out_put_btp; } bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, - &bpf_raw_tp_link_lops, prog, + &bpf_raw_tp_link_lops, prog, attach_type, tracepoint_is_faultable(btp->tp)); link->btp = btp; link->cookie = cookie; @@ -4168,7 +4174,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); cookie = attr->raw_tracepoint.cookie; - fd = bpf_raw_tp_link_attach(prog, tp_name, cookie); + fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type); if (fd < 0) bpf_prog_put(prog); return fd; @@ -5525,7 +5531,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, - attr->link_create.tracing.cookie); + attr->link_create.tracing.cookie, + attr->link_create.attach_type); break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: @@ -5534,7 +5541,8 @@ static int link_create(union bpf_attr *attr, 
bpfptr_t uattr) goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) - ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie); + ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie, + attr->link_create.attach_type); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) @@ -5543,7 +5551,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, - attr->link_create.tracing.cookie); + attr->link_create.tracing.cookie, + attr->link_create.attach_type); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c index 2e4885e7781f..e6a14f408d94 100644 --- a/kernel/bpf/tcx.c +++ b/kernel/bpf/tcx.c @@ -301,7 +301,8 @@ static int tcx_link_init(struct tcx_link *tcx, struct net_device *dev, struct bpf_prog *prog) { - bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog); + bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog, + attr->link_create.attach_type); tcx->location = attr->link_create.attach_type; tcx->dev = dev; return bpf_link_prime(&tcx->link, link_primer); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index b1e358c16eeb..0e364614c3a2 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -674,7 +674,8 @@ static const struct bpf_link_ops bpf_shim_tramp_link_lops = { static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, bpf_func_t bpf_func, - int cgroup_atype) + int cgroup_atype, + enum bpf_attach_type attach_type) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_prog *p; @@ -701,7 +702,7 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog p->expected_attach_type = BPF_LSM_MAC; bpf_prog_inc(p); bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, - &bpf_shim_tramp_link_lops, p); + &bpf_shim_tramp_link_lops, p, attach_type); bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; @@ -726,7 +727,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, } int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, - int cgroup_atype) + int cgroup_atype, + enum bpf_attach_type attach_type) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_attach_target_info tgt_info = {}; @@ -763,7 +765,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, /* Allocate and install new shim. 
*/ - shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); + shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype, attach_type); if (!shim_link) { err = -ENOMEM; goto err; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e7f97a9a8bbd..ffdde840abb8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2986,7 +2986,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, - &bpf_kprobe_multi_link_lops, prog); + &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) @@ -3441,7 +3441,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->link.flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, - &bpf_uprobe_multi_link_lops, prog); + &bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type); for (i = 0; i < cnt; i++) { uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index f71f67c6896b..812457819b5a 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -171,7 +171,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, } /* prog doesn't take the ownership of the reference from caller */ bpf_prog_inc(prog); - bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog, + prog->expected_attach_type); op_idx = prog->expected_attach_type; err = bpf_struct_ops_prepare_trampoline(tlinks, link, diff --git a/net/core/dev.c b/net/core/dev.c index be97c440ecd5..7969fddc94e3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10364,7 +10364,8 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } - bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog, + attr->link_create.attach_type); link->dev = dev; link->flags = attr->link_create.flags; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 82a14f131d00..fbe9a33ddf18 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1866,7 +1866,8 @@ int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) } attach_type = attr->link_create.attach_type; - bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog, + attach_type); sockmap_link->map = map; sockmap_link->attach_type = attach_type; diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index 06b084844700..a054d3b216d8 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -225,7 +225,8 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!link) return -ENOMEM; - bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog, + attr->link_create.attach_type); link->hook_ops.hook = nf_hook_run_bpf; link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF; -- cgit v1.2.3 From 9b8d543dc2bbf5d3a1e2d60049df94ae2bc68b28 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Thu, 10 Jul 2025 11:20:33 +0800 Subject: bpf: Remove attach_type in bpf_cgroup_link Use 
attach_type in bpf_link, and remove it in bpf_cgroup_link. Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250710032038.888700-3-chen.dylane@linux.dev --- include/linux/bpf-cgroup.h | 1 - kernel/bpf/cgroup.c | 13 ++++++------- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 70c8b94e797a..082ccd8ad96b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -103,7 +103,6 @@ struct bpf_cgroup_storage { struct bpf_cgroup_link { struct bpf_link link; struct cgroup *cgroup; - enum bpf_attach_type type; }; struct bpf_prog_list { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index bacdd0ca7419..72c8b50dca0a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -984,7 +984,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct hlist_head *progs; bool found = false; - atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); + atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -1396,8 +1396,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link) } WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, - cg_link->type, 0)); - if (cg_link->type == BPF_LSM_CGROUP) + link->attach_type, 0)); + if (link->attach_type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; @@ -1439,7 +1439,7 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, "cgroup_id:\t%llu\n" "attach_type:\t%d\n", cg_id, - cg_link->type); + link->attach_type); } static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, @@ -1455,7 +1455,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, cgroup_unlock(); info->cgroup.cgroup_id = cg_id; - info->cgroup.attach_type = cg_link->type; + info->cgroup.attach_type = link->attach_type; return 0; } @@ -1497,7 +1497,6 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, prog, attr->link_create.attach_type); link->cgroup = cgrp; - link->type = attr->link_create.attach_type; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -1506,7 +1505,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, - link->type, BPF_F_ALLOW_MULTI | attr->link_create.flags, + link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags, attr->link_create.cgroup.relative_fd, attr->link_create.cgroup.expected_revision); if (err) { -- cgit v1.2.3 From 0eeeebdcc5feeec48118f7a3df2ac818e694ccc7 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Thu, 10 Jul 2025 11:20:37 +0800 Subject: bpf: Remove attach_type in bpf_tracing_link Use attach_type in bpf_link, and remove it in bpf_tracing_link. 
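The fdinfo and fill_link_info hunks below show the resulting pattern; roughly (a sketch of the code being changed, not a new API):

    struct bpf_tracing_link *tr_link =
            container_of(link, struct bpf_tracing_link, link.link);

    info->tracing.attach_type = link->attach_type;   /* generic bpf_link field */
    info->tracing.cookie = tr_link->link.cookie;     /* still type-specific */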
Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250710032038.888700-7-chen.dylane@linux.dev --- include/linux/bpf.h | 1 - kernel/bpf/syscall.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9ee9c14b486..bc887831eaa5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1784,7 +1784,6 @@ struct bpf_shim_tramp_link { struct bpf_tracing_link { struct bpf_tramp_link link; - enum bpf_attach_type attach_type; struct bpf_trampoline *trampoline; struct bpf_prog *tgt_prog; }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cd7321fe0ba3..1a26d17536be 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3414,7 +3414,7 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, "target_obj_id:\t%u\n" "target_btf_id:\t%u\n" "cookie:\t%llu\n", - tr_link->attach_type, + link->attach_type, target_obj_id, target_btf_id, tr_link->link.cookie); @@ -3426,7 +3426,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); - info->tracing.attach_type = tr_link->attach_type; + info->tracing.attach_type = link->attach_type; info->tracing.cookie = tr_link->link.cookie; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, @@ -3516,7 +3516,6 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog, attach_type); - link->attach_type = prog->expected_attach_type; link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); -- cgit v1.2.3 From 19d18fdfc79217c86802271c9ce5b4ed174628cc Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Wed, 16 Jul 2025 21:46:53 +0800 Subject: bpf: Add struct bpf_token_info Commit 35f96de04127 ("bpf: Introduce BPF token object") added the BPF token as a new kind of BPF kernel object, and BPF_OBJ_GET_INFO_BY_FD is already used to get BPF object info, so we can also get token info with this cmd. One usage scenario: when a program fails with a token because of a permission failure, we can report what the BPF token allows with this API for debugging. 
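A minimal user-space sketch of this usage, assuming libbpf and a token fd obtained earlier via BPF_TOKEN_CREATE (error handling omitted):

    /* dump what the token allows via BPF_OBJ_GET_INFO_BY_FD */
    static void dump_token_info(int token_fd)
    {
            struct bpf_token_info info = {};
            __u32 len = sizeof(info);

            if (!bpf_obj_get_info_by_fd(token_fd, &info, &len))
                    printf("cmds 0x%llx maps 0x%llx progs 0x%llx attachs 0x%llx\n",
                           (unsigned long long)info.allowed_cmds,
                           (unsigned long long)info.allowed_maps,
                           (unsigned long long)info.allowed_progs,
                           (unsigned long long)info.allowed_attachs);
    }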
Acked-by: Andrii Nakryiko Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/20250716134654.1162635-1-chen.dylane@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 +++++++++++ include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/syscall.c | 18 ++++++++++++++++++ kernel/bpf/token.c | 25 ++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 8 ++++++++ 5 files changed, 69 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bc887831eaa5..f9cd2164ed23 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2354,6 +2354,7 @@ extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; +extern const struct file_operations bpf_token_fops; #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ @@ -2551,6 +2552,9 @@ void bpf_token_inc(struct bpf_token *token); void bpf_token_put(struct bpf_token *token); int bpf_token_create(union bpf_attr *attr); struct bpf_token *bpf_token_get_from_fd(u32 ufd); +int bpf_token_get_info_by_fd(struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); @@ -2949,6 +2953,13 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) return ERR_PTR(-EOPNOTSUPP); } +static inline int bpf_token_get_info_by_fd(struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + static inline void __dev_flush(struct list_head *flush_list) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0670e15a6100..233de8677382 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -450,6 +450,7 @@ union bpf_iter_link_info { * * **struct bpf_map_info** * * **struct bpf_btf_info** * * **struct bpf_link_info** + * * **struct bpf_token_info** * * Return * Returns zero on success. On error, -1 is returned and *errno* @@ -6803,6 +6804,13 @@ struct bpf_link_info { }; } __attribute__((aligned(8))); +struct bpf_token_info { + __u64 allowed_cmds; + __u64 allowed_maps; + __u64 allowed_progs; + __u64 allowed_attachs; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach type). 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1a26d17536be..e63039817af3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5239,6 +5239,21 @@ static int bpf_link_get_info_by_fd(struct file *file, } +static int token_get_info_by_fd(struct file *file, + struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); + if (err) + return err; + return bpf_token_get_info_by_fd(token, attr, uattr); +} + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -5262,6 +5277,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); + else if (fd_file(f)->f_op == &bpf_token_fops) + return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data, + attr, uattr); return -EINVAL; } diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 26057aa13503..0bbe412f854e 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -103,7 +103,7 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) static const struct inode_operations bpf_token_iops = { }; -static const struct file_operations bpf_token_fops = { +const struct file_operations bpf_token_fops = { .release = bpf_token_release, .show_fdinfo = bpf_token_show_fdinfo, }; @@ -210,6 +210,29 @@ out_file: return err; } +int bpf_token_get_info_by_fd(struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_token_info info; + u32 info_len = attr->info.info_len; + + info_len = min_t(u32, info_len, sizeof(info)); + memset(&info, 0, sizeof(info)); + + info.allowed_cmds = token->allowed_cmds; + info.allowed_maps = token->allowed_maps; + info.allowed_progs = token->allowed_progs; + info.allowed_attachs = token->allowed_attachs; + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + struct bpf_token *bpf_token_get_from_fd(u32 ufd) { CLASS(fd, f)(ufd); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0670e15a6100..233de8677382 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -450,6 +450,7 @@ union bpf_iter_link_info { * * **struct bpf_map_info** * * **struct bpf_btf_info** * * **struct bpf_link_info** + * * **struct bpf_token_info** * * Return * Returns zero on success. On error, -1 is returned and *errno* @@ -6803,6 +6804,13 @@ struct bpf_link_info { }; } __attribute__((aligned(8))); +struct bpf_token_info { + __u64 allowed_cmds; + __u64 allowed_maps; + __u64 allowed_progs; + __u64 allowed_attachs; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach type). 
-- cgit v1.2.3 From b7b3500bd4eef2c3b5124ed195f26eb048407d9b Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 21 Jul 2025 11:04:42 +0200 Subject: umd: Remove usermode driver framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code is unused since 98e20e5e13d2 ("bpfilter: remove bpfilter"), therefore remove it. Signed-off-by: Thomas Weißschuh Signed-off-by: Daniel Borkmann Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner Acked-by: "Eric W. Biederman" Link: https://lore.kernel.org/bpf/20250721-remove-usermode-driver-v1-2-0d0083334382@linutronix.de --- include/linux/usermode_driver.h | 19 ---- kernel/Makefile | 1 - kernel/bpf/preload/Kconfig | 4 - kernel/usermode_driver.c | 191 ---------------------------------------- 4 files changed, 215 deletions(-) delete mode 100644 include/linux/usermode_driver.h delete mode 100644 kernel/usermode_driver.c (limited to 'include/linux') diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h deleted file mode 100644 index ad970416260d..000000000000 --- a/include/linux/usermode_driver.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __LINUX_USERMODE_DRIVER_H__ -#define __LINUX_USERMODE_DRIVER_H__ - -#include -#include - -struct umd_info { - const char *driver_name; - struct file *pipe_to_umh; - struct file *pipe_from_umh; - struct path wd; - struct pid *tgid; -}; -int umd_load_blob(struct umd_info *info, const void *data, size_t len); -int umd_unload_blob(struct umd_info *info); -int fork_usermode_driver(struct umd_info *info); -void umd_cleanup_helper(struct umd_info *info); - -#endif /* __LINUX_USERMODE_DRIVER_H__ */ diff --git a/kernel/Makefile b/kernel/Makefile index 32e80dd626af..4332de7ffdee 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,7 +12,6 @@ obj-y = fork.o exec_domain.o panic.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o -obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MULTIUSER) += groups.o obj-$(CONFIG_VHOST_TASK) += vhost_task.o diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index f9b11d01c3b5..aef7b0bc96d6 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -1,8 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -config USERMODE_DRIVER - bool - default n - menuconfig BPF_PRELOAD bool "Preload BPF file system with kernel specific program and map iterators" depends on BPF diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c deleted file mode 100644 index 8303f4c7ca71..000000000000 --- a/kernel/usermode_driver.c +++ /dev/null @@ -1,191 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * umd - User mode driver support - */ -#include -#include -#include -#include -#include -#include - -static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name) -{ - struct file_system_type *type; - struct vfsmount *mnt; - struct file *file; - ssize_t written; - loff_t pos = 0; - - type = get_fs_type("tmpfs"); - if (!type) - return ERR_PTR(-ENODEV); - - mnt = kern_mount(type); - put_filesystem(type); - if (IS_ERR(mnt)) - return mnt; - - file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700); - if (IS_ERR(file)) { - kern_unmount(mnt); - return ERR_CAST(file); - } - - written = kernel_write(file, data, len, &pos); - if (written != len) { - int err = written; - if (err >= 0) - err = -ENOMEM; - filp_close(file, NULL); - kern_unmount(mnt); - return ERR_PTR(err); - } - - fput(file); - - /* Flush delayed fput 
so exec can open the file read-only */ - flush_delayed_fput(); - task_work_run(); - return mnt; -} - -/** - * umd_load_blob - Remember a blob of bytes for fork_usermode_driver - * @info: information about usermode driver - * @data: a blob of bytes that can be executed as a file - * @len: The lentgh of the blob - * - */ -int umd_load_blob(struct umd_info *info, const void *data, size_t len) -{ - struct vfsmount *mnt; - - if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt)) - return -EBUSY; - - mnt = blob_to_mnt(data, len, info->driver_name); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - info->wd.mnt = mnt; - info->wd.dentry = mnt->mnt_root; - return 0; -} -EXPORT_SYMBOL_GPL(umd_load_blob); - -/** - * umd_unload_blob - Disassociate @info from a previously loaded blob - * @info: information about usermode driver - * - */ -int umd_unload_blob(struct umd_info *info) -{ - if (WARN_ON_ONCE(!info->wd.mnt || - !info->wd.dentry || - info->wd.mnt->mnt_root != info->wd.dentry)) - return -EINVAL; - - kern_unmount(info->wd.mnt); - info->wd.mnt = NULL; - info->wd.dentry = NULL; - return 0; -} -EXPORT_SYMBOL_GPL(umd_unload_blob); - -static int umd_setup(struct subprocess_info *info, struct cred *new) -{ - struct umd_info *umd_info = info->data; - struct file *from_umh[2]; - struct file *to_umh[2]; - int err; - - /* create pipe to send data to umh */ - err = create_pipe_files(to_umh, 0); - if (err) - return err; - err = replace_fd(0, to_umh[0], 0); - fput(to_umh[0]); - if (err < 0) { - fput(to_umh[1]); - return err; - } - - /* create pipe to receive data from umh */ - err = create_pipe_files(from_umh, 0); - if (err) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - return err; - } - err = replace_fd(1, from_umh[1], 0); - fput(from_umh[1]); - if (err < 0) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - fput(from_umh[0]); - return err; - } - - set_fs_pwd(current->fs, &umd_info->wd); - umd_info->pipe_to_umh = to_umh[1]; - umd_info->pipe_from_umh = from_umh[0]; - umd_info->tgid = get_pid(task_tgid(current)); - return 0; -} - -static void umd_cleanup(struct subprocess_info *info) -{ - struct umd_info *umd_info = info->data; - - /* cleanup if umh_setup() was successful but exec failed */ - if (info->retval) - umd_cleanup_helper(umd_info); -} - -/** - * umd_cleanup_helper - release the resources which were allocated in umd_setup - * @info: information about usermode driver - */ -void umd_cleanup_helper(struct umd_info *info) -{ - fput(info->pipe_to_umh); - fput(info->pipe_from_umh); - put_pid(info->tgid); - info->tgid = NULL; -} -EXPORT_SYMBOL_GPL(umd_cleanup_helper); - -/** - * fork_usermode_driver - fork a usermode driver - * @info: information about usermode driver (shouldn't be NULL) - * - * Returns either negative error or zero which indicates success in - * executing a usermode driver. In such case 'struct umd_info *info' - * is populated with two pipes and a tgid of the process. The caller is - * responsible for health check of the user process, killing it via - * tgid, and closing the pipes when user process is no longer needed. 
- */ -int fork_usermode_driver(struct umd_info *info) -{ - struct subprocess_info *sub_info; - const char *argv[] = { info->driver_name, NULL }; - int err; - - if (WARN_ON_ONCE(info->tgid)) - return -EBUSY; - - err = -ENOMEM; - sub_info = call_usermodehelper_setup(info->driver_name, - (char **)argv, NULL, GFP_KERNEL, - umd_setup, umd_cleanup, info); - if (!sub_info) - goto out; - - err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); -out: - return err; -} -EXPORT_SYMBOL_GPL(fork_usermode_driver); - - -- cgit v1.2.3 From 3ba58312e65665e5b9097c7969a51fa49914d85d Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 24 Jul 2025 12:02:53 +0000 Subject: bpf: Move bpf_jit_get_prog_name() to core.c bpf_jit_get_prog_name() will be used by all JITs when enabling support for private stack. This function is currently implemented in the x86 JIT. Move the function to core.c so that other JITs can easily use it in their implementation of private stack. Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20250724120257.7299-2-puranjay@kernel.org --- arch/x86/net/bpf_jit_comp.c | 9 +-------- include/linux/filter.h | 2 ++ kernel/bpf/core.c | 7 +++++++ 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 40e1b3b9634f..7e3fca164620 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3501,13 +3501,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf); } -static const char *bpf_get_prog_name(struct bpf_prog *prog) -{ - if (prog->aux->ksym.prog) - return prog->aux->ksym.name; - return prog->aux->name; -} - static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size) { int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3; @@ -3531,7 +3524,7 @@ static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size if (stack_ptr[0] != PRIV_STACK_GUARD_VAL || stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) { pr_err("BPF private stack overflow/underflow detected for prog %sx\n", - bpf_get_prog_name(prog)); + bpf_jit_get_prog_name(prog)); break; } } diff --git a/include/linux/filter.h b/include/linux/filter.h index eca229752cbe..5cc7a82ec832 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1278,6 +1278,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); +const char *bpf_jit_get_prog_name(struct bpf_prog *prog); + struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 61613785bdd0..29c0225c14aa 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1297,6 +1297,13 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, return 0; } +const char *bpf_jit_get_prog_name(struct bpf_prog *prog) +{ + if (prog->aux->ksym.prog) + return prog->aux->ksym.name; + return prog->aux->name; +} + static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff, -- cgit v1.2.3
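A sketch of how another arch JIT might use the now-shared helper in its own private stack guard check; the guard constant and index names mirror the x86 code above, and the surrounding function is otherwise hypothetical:

    /* hypothetical arch-side check, following the x86 pattern */
    if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
        stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL)
            pr_err("BPF private stack overflow/underflow detected for prog %s\n",
                   bpf_jit_get_prog_name(prog));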