From e00931c02568dc6ac76f94b1ab471de05e6fdfe8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:12 -0800 Subject: bpf: Enable private stack for eligible subprogs If private stack is used by any subprog, set that subprog prog->aux->jits_use_priv_stack to be true so later jit can allocate private stack for that subprog properly. Also set env->prog->aux->jits_use_priv_stack to be true if any subprog uses private stack. This is a use case for a single main prog (no subprogs) to use private stack, and also a use case for later struct-ops progs where env->prog->aux->jits_use_priv_stack will enable recursion check if any subprog uses private stack. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163912.2224007-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/bpf.h') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7da41ae2eac8..129b29e85cec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1523,6 +1523,7 @@ struct bpf_prog_aux { bool exception_cb; bool exception_boundary; bool is_extended; /* true if extended by freplace program */ + bool jits_use_priv_stack; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; -- cgit v1.2.3 From 7d1cd70d4b16ff0216a5f6c2ae7d0fa9fa978c07 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:22 -0800 Subject: bpf, x86: Support private stack in jit Private stack is allocated in function bpf_int_jit_compile() with alignment 8. Private stack allocation size includes the stack size determined by verifier and additional space to protect stack overflow and underflow. See below an illustration: ---> memory address increasing [8 bytes to protect overflow] [normal stack] [8 bytes to protect underflow] If overflow/underflow is detected, kernel messages will be emited in dmesg like BPF private stack overflow/underflow detected for prog Fx BPF Private stack overflow/underflow detected for prog bpf_prog_a41699c234a1567a_subprog1x Those messages are generated when I made some changes to jitted code to intentially cause overflow for some progs. For the jited prog, The x86 register 9 (X86_REG_R9) is used to replace bpf frame register (BPF_REG_10). The private stack is used per subprog per cpu. The X86_REG_R9 is saved and restored around every func call (not including tailcall) to maintain correctness of X86_REG_R9. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163922.2224385-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 136 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/bpf.h | 1 + 2 files changed, 137 insertions(+) (limited to 'include/linux/bpf.h') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 3ff638c37999..8f896c32172c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -325,6 +325,22 @@ struct jit_context { /* Number of bytes that will be skipped on tailcall */ #define X86_TAIL_CALL_OFFSET (12 + ENDBR_INSN_SIZE) +static void push_r9(u8 **pprog) +{ + u8 *prog = *pprog; + + EMIT2(0x41, 0x51); /* push r9 */ + *pprog = prog; +} + +static void pop_r9(u8 **pprog) +{ + u8 *prog = *pprog; + + EMIT2(0x41, 0x59); /* pop r9 */ + *pprog = prog; +} + static void push_r12(u8 **pprog) { u8 *prog = *pprog; @@ -1404,6 +1420,24 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op) *pprog = prog; } +static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr) +{ + u8 *prog = *pprog; + + /* movabs r9, priv_frame_ptr */ + emit_mov_imm64(&prog, X86_REG_R9, (__force long) priv_frame_ptr >> 32, + (u32) (__force long) priv_frame_ptr); + +#ifdef CONFIG_SMP + /* add , gs:[] */ + EMIT2(0x65, 0x4c); + EMIT3(0x03, 0x0c, 0x25); + EMIT((u32)(unsigned long)&this_cpu_off, 4); +#endif + + *pprog = prog; +} + #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) #define __LOAD_TCC_PTR(off) \ @@ -1412,6 +1446,10 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op) #define LOAD_TAIL_CALL_CNT_PTR(stack) \ __LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack)) +/* Memory size/value to protect private stack overflow/underflow */ +#define PRIV_STACK_GUARD_SZ 8 +#define PRIV_STACK_GUARD_VAL 0xEB9F12345678eb9fULL + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { @@ -1421,7 +1459,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image int insn_cnt = bpf_prog->len; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; + void __percpu *priv_frame_ptr = NULL; u64 arena_vm_start, user_vm_start; + void __percpu *priv_stack_ptr; int i, excnt = 0; int ilen, proglen = 0; u8 *prog = temp; @@ -1429,6 +1469,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image int err; stack_depth = bpf_prog->aux->stack_depth; + priv_stack_ptr = bpf_prog->aux->priv_stack_ptr; + if (priv_stack_ptr) { + priv_frame_ptr = priv_stack_ptr + PRIV_STACK_GUARD_SZ + round_up(stack_depth, 8); + stack_depth = 0; + } arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena); user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena); @@ -1457,6 +1502,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image emit_mov_imm64(&prog, X86_REG_R12, arena_vm_start >> 32, (u32) arena_vm_start); + if (priv_frame_ptr) + emit_priv_frame_ptr(&prog, priv_frame_ptr); + ilen = prog - temp; if (rw_image) memcpy(rw_image + proglen, temp, ilen); @@ -1476,6 +1524,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image u8 *func; int nops; + if (priv_frame_ptr) { + if (src_reg == BPF_REG_FP) + src_reg = X86_REG_R9; + + if (dst_reg == BPF_REG_FP) + dst_reg = X86_REG_R9; + } + switch (insn->code) { /* ALU */ case BPF_ALU | BPF_ADD | BPF_X: @@ -2136,9 +2192,15 @@ populate_extable: } if (!imm32) return -EINVAL; + if (priv_frame_ptr) { + push_r9(&prog); + ip += 2; + } ip += x86_call_depth_emit_accounting(&prog, func, ip); if (emit_call(&prog, func, ip)) return -EINVAL; + if (priv_frame_ptr) + pop_r9(&prog); break; } @@ -3306,6 +3368,42 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf); } +static const char *bpf_get_prog_name(struct bpf_prog *prog) +{ + if (prog->aux->ksym.prog) + return prog->aux->ksym.name; + return prog->aux->name; +} + +static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size) +{ + int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3; + u64 *stack_ptr; + + for_each_possible_cpu(cpu) { + stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu); + stack_ptr[0] = PRIV_STACK_GUARD_VAL; + stack_ptr[underflow_idx] = PRIV_STACK_GUARD_VAL; + } +} + +static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size, + struct bpf_prog *prog) +{ + int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3; + u64 *stack_ptr; + + for_each_possible_cpu(cpu) { + stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu); + if (stack_ptr[0] != PRIV_STACK_GUARD_VAL || + stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) { + pr_err("BPF private stack overflow/underflow detected for prog %sx\n", + bpf_get_prog_name(prog)); + break; + } + } +} + struct x64_jit_data { struct bpf_binary_header *rw_header; struct bpf_binary_header *header; @@ -3323,7 +3421,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) struct bpf_binary_header *rw_header = NULL; struct bpf_binary_header *header = NULL; struct bpf_prog *tmp, *orig_prog = prog; + void __percpu *priv_stack_ptr = NULL; struct x64_jit_data *jit_data; + int priv_stack_alloc_sz; int proglen, oldproglen = 0; struct jit_context ctx = {}; bool tmp_blinded = false; @@ -3359,6 +3459,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } prog->aux->jit_data = jit_data; } + priv_stack_ptr = prog->aux->priv_stack_ptr; + if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) { + /* Allocate actual private stack size with verifier-calculated + * stack size plus two memory guards to protect overflow and + * underflow. + */ + priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) + + 2 * PRIV_STACK_GUARD_SZ; + priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 8, GFP_KERNEL); + if (!priv_stack_ptr) { + prog = orig_prog; + goto out_priv_stack; + } + + priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz); + prog->aux->priv_stack_ptr = priv_stack_ptr; + } addrs = jit_data->addrs; if (addrs) { ctx = jit_data->ctx; @@ -3494,6 +3611,11 @@ out_image: bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: kvfree(addrs); + if (!image && priv_stack_ptr) { + free_percpu(priv_stack_ptr); + prog->aux->priv_stack_ptr = NULL; + } +out_priv_stack: kfree(jit_data); prog->aux->jit_data = NULL; } @@ -3532,6 +3654,8 @@ void bpf_jit_free(struct bpf_prog *prog) if (prog->jited) { struct x64_jit_data *jit_data = prog->aux->jit_data; struct bpf_binary_header *hdr; + void __percpu *priv_stack_ptr; + int priv_stack_alloc_sz; /* * If we fail the final pass of JIT (from jit_subprogs), @@ -3547,6 +3671,13 @@ void bpf_jit_free(struct bpf_prog *prog) prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset(); hdr = bpf_jit_binary_pack_hdr(prog); bpf_jit_binary_pack_free(hdr, NULL); + priv_stack_ptr = prog->aux->priv_stack_ptr; + if (priv_stack_ptr) { + priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) + + 2 * PRIV_STACK_GUARD_SZ; + priv_stack_check_guard(priv_stack_ptr, priv_stack_alloc_sz, prog); + free_percpu(prog->aux->priv_stack_ptr); + } WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog)); } @@ -3562,6 +3693,11 @@ bool bpf_jit_supports_exceptions(void) return IS_ENABLED(CONFIG_UNWINDER_ORC); } +bool bpf_jit_supports_private_stack(void) +{ + return true; +} + void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { #if defined(CONFIG_UNWINDER_ORC) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 129b29e85cec..d32cc373dfd1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1507,6 +1507,7 @@ struct bpf_prog_aux { u32 max_rdwr_access; struct btf *attach_btf; const struct bpf_ctx_arg_aux *ctx_arg_info; + void __percpu *priv_stack_ptr; struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; struct bpf_trampoline *dst_trampoline; -- cgit v1.2.3 From 5bd36da1e37e7a78e8b38efd287de6e1394b7d6e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:33 -0800 Subject: bpf: Support private stack for struct_ops progs For struct_ops progs, whether a particular prog uses private stack depends on prog->aux->priv_stack_requested setting before actual insn-level verification for that prog. One particular implementation is to piggyback on struct_ops->check_member(). The next patch has an example for this. The struct_ops->check_member() sets prog->aux->priv_stack_requested to be true which enables private stack usage. The struct_ops prog follows the same rule as kprobe/tracing progs after function bpf_enable_priv_stack(). For example, even a struct_ops prog requests private stack, it could still use normal kernel stack if the stack size is small (< 64 bytes). Similar to tracing progs, nested same cpu same prog run will be skipped. A field (recursion_detected()) is added to bpf_prog_aux structure. If bpf_prog->aux->recursion_detected is implemented by the struct_ops subsystem and nested same cpu/prog happens, the function will be triggered to report an error, collect related info, etc. Acked-by: Tejun Heo Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163933.2224962-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_verifier.h | 1 + kernel/bpf/trampoline.c | 4 ++++ kernel/bpf/verifier.c | 7 ++++++- 4 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux/bpf.h') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d32cc373dfd1..10945c8858ce 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1525,9 +1525,11 @@ struct bpf_prog_aux { bool exception_boundary; bool is_extended; /* true if extended by freplace program */ bool jits_use_priv_stack; + bool priv_stack_requested; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; + void (*recursion_detected)(struct bpf_prog *prog); /* callback if recursion is detected */ /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d62bb2ca1828..6b7c91629176 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -879,6 +879,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) case BPF_PROG_TYPE_TRACING: return prog->expected_attach_type != BPF_TRACE_ITER; case BPF_PROG_TYPE_STRUCT_OPS: + return prog->aux->jits_use_priv_stack; case BPF_PROG_TYPE_LSM: return false; default: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 9f36c049f4c2..a8d188b31da5 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -899,6 +899,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); + if (prog->aux->recursion_detected) + prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); @@ -975,6 +977,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); + if (prog->aux->recursion_detected) + prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 176d19ad9d07..f4c39bb50511 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6110,7 +6110,7 @@ static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog) case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: - if (bpf_prog_check_recur(prog)) + if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog)) return PRIV_STACK_ADAPTIVE; fallthrough; default: @@ -22053,6 +22053,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } } + if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) { + verbose(env, "Private stack not supported by jit\n"); + return -EACCES; + } + /* btf_ctx_access() used this to provide argument type info */ prog->aux->ctx_arg_info = st_ops_desc->arg_info[member_idx].info; -- cgit v1.2.3