From ad35d8018669fd2eea76e3f74eb050fd3d2fb690 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Sat, 18 Apr 2026 23:09:44 -0400 Subject: libbpf: Report error when a negative kprobe offset is specified In attach_kprobe(), the parsing logic uses sscanf() to extract the target function name and offset from the section definition. Currently, if a user specifies a negative offset (e.g., SEC("kprobe/func+-100")), the input is not explicitly caught and reported as an error. This commit updates the logic to explicitly notify the user when a negative integer is provided. To facilitate this check, the offset variable is changed from unsigned long to long so that sscanf() can accurately capture a negative input for evaluation. If a negative offset is detected, the loader will now print an informative warning stating that the offset must be non-negative, and return -EINVAL. Additionally, free(func) is called in this new error path to prevent a memory leak, as the function name string is dynamically allocated by sscanf(). 
Fixes: e3f9bc35ea7e9 ("libbpf: Allow decimal offset for kprobes") Signed-off-by: Aaron Tomlin Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/bpf/20260419030944.1423642-1-atomlin@atomlin.com Signed-off-by: Kumar Kartikeya Dwivedi --- tools/lib/bpf/libbpf.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3a80a018fc7d..83aae7a39d36 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -12280,7 +12280,7 @@ error: static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link) { DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts); - unsigned long offset = 0; + long offset = 0; const char *func_name; char *func; int n; @@ -12302,6 +12302,13 @@ static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf pr_warn("kprobe name is invalid: %s\n", func_name); return -EINVAL; } + + if (offset < 0) { + free(func); + pr_warn("kprobe offset must be a non-negative integer: %li\n", offset); + return -EINVAL; + } + if (opts.retprobe && offset != 0) { free(func); pr_warn("kretprobes do not support offset specification\n"); -- cgit v1.2.3 From 7c528b364bd8b2e5629aab1d84898c52c2085187 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 17 Apr 2026 16:36:31 -0700 Subject: selftests/bpf: Trace bpf_local_storage_update to debug flaky local storage tests task_local_storage/sys_enter_exit and cgrp_local_storage/ cgroup_iter_sleepable occasionally fail in CI possibly because bpf_{task,cgrp}_storage_get() returns NULL. Add a fexit probe on bpf_local_storage_update() to capture the actual error code when this happens. It will allow us to tell if it is trylock failure in kmalloc_nolock(), timeout/deadlock in rqspinlock or something else. 
Signed-off-by: Amery Hung Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260417233631.1443199-1-ameryhung@gmail.com Signed-off-by: Kumar Kartikeya Dwivedi --- .../selftests/bpf/prog_tests/cgrp_local_storage.c | 15 +++++++++++++-- .../selftests/bpf/prog_tests/task_local_storage.c | 1 + tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c | 18 ++++++++++++++++++ .../testing/selftests/bpf/progs/task_local_storage.c | 19 +++++++++++++++++++ 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c index 478a77cb67e6..c4398ccf3493 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c @@ -176,7 +176,7 @@ static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id) DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); union bpf_iter_link_info linfo; struct cgrp_ls_sleepable *skel; - struct bpf_link *link; + struct bpf_link *link, *fexit_link; int err, iter_fd; char buf[16]; @@ -200,16 +200,27 @@ static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id) if (!ASSERT_OK_PTR(link, "attach_iter")) goto out; + fexit_link = bpf_program__attach(skel->progs.fexit_update); + if (!ASSERT_OK_PTR(fexit_link, "attach_fexit")) + goto out_link; + iter_fd = bpf_iter_create(bpf_link__fd(link)); if (!ASSERT_GE(iter_fd, 0, "iter_create")) - goto out_link; + goto out_fexit_link; + + skel->bss->target_pid = sys_gettid(); /* trigger the program run */ (void)read(iter_fd, buf, sizeof(buf)); + skel->bss->target_pid = 0; + + ASSERT_EQ(skel->bss->update_err, 0, "update_err"); ASSERT_EQ(skel->bss->cgroup_id, cgroup_id, "cgroup_id"); close(iter_fd); +out_fexit_link: + bpf_link__destroy(fexit_link); out_link: bpf_link__destroy(link); out: diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c 
b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c index 1b26c12f255a..5b2b56cc3a4f 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -47,6 +47,7 @@ static void test_sys_enter_exit(void) skel->bss->target_pid = 0; /* 2x gettid syscalls */ + ASSERT_EQ(skel->bss->update_err, 0, "update_err"); ASSERT_EQ(skel->bss->enter_cnt, 2, "enter_cnt"); ASSERT_EQ(skel->bss->exit_cnt, 2, "exit_cnt"); ASSERT_EQ(skel->bss->mismatch_cnt, 0, "mismatch_cnt"); diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c index a2de95f85648..37bd6b03ba01 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c +++ b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c @@ -4,6 +4,7 @@ #include #include #include "bpf_misc.h" +#include "err.h" char _license[] SEC("license") = "GPL"; @@ -16,6 +17,7 @@ struct { __s32 target_pid; __u64 cgroup_id; +long update_err; int target_hid; bool is_cgroup1; @@ -123,3 +125,19 @@ int yes_rcu_lock(void *ctx) bpf_rcu_read_unlock(); return 0; } + +SEC("fexit/bpf_local_storage_update") +int BPF_PROG(fexit_update, void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags, bool swap_uptrs, + struct bpf_local_storage_data *ret) +{ + struct task_struct *task = bpf_get_current_task_btf(); + + if (task->pid != target_pid) + return 0; + + if (IS_ERR_VALUE(ret)) + update_err = PTR_ERR(ret); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_local_storage.c b/tools/testing/selftests/bpf/progs/task_local_storage.c index 80a0a20db88d..34fa3d6451d2 100644 --- a/tools/testing/selftests/bpf/progs/task_local_storage.c +++ b/tools/testing/selftests/bpf/progs/task_local_storage.c @@ -14,12 +14,15 @@ struct { __type(value, long); } enter_id SEC(".maps"); +#include "err.h" + #define MAGIC_VALUE 0xabcd1234 pid_t target_pid = 0; int mismatch_cnt = 0; int enter_cnt = 0; 
int exit_cnt = 0; +long update_err = 0; SEC("tp_btf/sys_enter") int BPF_PROG(on_enter, struct pt_regs *regs, long id) @@ -62,3 +65,19 @@ int BPF_PROG(on_exit, struct pt_regs *regs, long id) __sync_fetch_and_add(&mismatch_cnt, 1); return 0; } + +SEC("fexit/bpf_local_storage_update") +int BPF_PROG(fexit_update, void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags, bool swap_uptrs, + struct bpf_local_storage_data *ret) +{ + struct task_struct *task = bpf_get_current_task_btf(); + + if (task->pid != target_pid) + return 0; + + if (IS_ERR_VALUE(ret)) + update_err = PTR_ERR(ret); + + return 0; +} -- cgit v1.2.3 From 31f61ac33032ee87ea404d6d996ba2c386502a36 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Tue, 14 Apr 2026 12:10:14 -0700 Subject: bpf: Refactor dynptr mutability tracking Redefine dynptr mutability and fix inconsistency in the verifier and kfunc signatures. Dynptr mutability is at two levels. The first is the bpf_dynptr structure and the second is the memory the dynptr points to. The verifier currently tracks the mutability of the bpf_dynptr struct through helper and kfunc prototypes, where "const struct bpf_dynptr *" means the structure itself is immutable. The second level is tracked in the upper bit of bpf_dynptr->size at runtime and is not changed in this patch. There are two types of inconsistency in the verifier regarding the mutability of the bpf_dynptr struct. First, there are many existing kfuncs whose prototypes are wrong. For example, bpf_dynptr_adjust() mutates a dynptr's start and offset but marks the argument as a const pointer. At the same time, many other kfuncs that do not mutate the dynptr mark themselves as mutable. Second, the verifier currently does not honor the const qualifier in kfunc prototypes, as it determines whether to tag the arg_type with MEM_RDONLY based on the register state. 
Since all the verifier cares about is preventing CONST_PTR_TO_DYNPTR from being destroyed in callbacks and global subprograms, redefine the mutability at the bpf_dynptr level to just bpf_dynptr_kern->data. Then, explicitly prohibit passing CONST_PTR_TO_DYNPTR to an argument tagged with MEM_UNINIT or OBJ_RELEASE. The mutability of a dynptr's view is not really interesting, so drop the MEM_RDONLY annotation for dynptr from the helpers and kfuncs. Plus, if the mutability of the entire bpf_dynptr were to be done correctly, it would kill the bpf_dynptr_adjust() usage in callbacks and global subprograms. Implementation-wise: - First, make sure all kfunc args are correctly tagged: Tag the dynptr argument of bpf_dynptr_file_discard() with OBJ_RELEASE. - Then, in process_dynptr_func(), make sure CONST_PTR_TO_DYNPTR cannot be passed to an argument tagged with MEM_UNINIT or OBJ_RELEASE. For MEM_UNINIT, it is already checked by is_dynptr_reg_valid_uninit(). For OBJ_RELEASE, check against OBJ_RELEASE instead of MEM_RDONLY and drop a now identical check in unmark_stack_slots_dynptr(). - Remove the mutually exclusive check between MEM_UNINIT and MEM_RDONLY, but don't add a MEM_UNINIT and OBJ_RELEASE version as that combination is obviously wrong. Note that while this patch stops following the C semantics for the mutability of bpf_dynptr, the prototypes of kfuncs are still fixed to maintain the correct C semantics in the implementation. Adding or removing the const qualifier does not break backward compatibility. In addition, fix kfuncs dropping the const qualifier when casting the opaque bpf_dynptr to bpf_dynptr_kern. In test_kfunc_dynptr_param.c, initialize dynptr to 0 to avoid a -Wuninitialized-const-pointer warning. 
Signed-off-by: Amery Hung Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260414191014.1218567-1-ameryhung@gmail.com Signed-off-by: Kumar Kartikeya Dwivedi --- fs/bpf_fs_kfuncs.c | 2 +- fs/verity/measure.c | 4 +- include/linux/bpf.h | 8 +-- kernel/bpf/btf.c | 2 +- kernel/bpf/helpers.c | 36 +++++------ kernel/bpf/verifier.c | 70 ++++++---------------- kernel/trace/bpf_trace.c | 22 +++---- tools/testing/selftests/bpf/bpf_kfuncs.h | 8 +-- tools/testing/selftests/bpf/progs/dynptr_success.c | 6 +- .../selftests/bpf/progs/test_kfunc_dynptr_param.c | 9 +-- 10 files changed, 65 insertions(+), 102 deletions(-) diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index e4e51a1d0de2..9d27be058494 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -200,7 +200,7 @@ int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, const struct bpf_dynptr *value_p, int flags) { - struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; + const struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; struct inode *inode = d_inode(dentry); const void *value; u32 value_len; diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 6a35623ebdf0..265fa0253e3d 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -118,9 +118,9 @@ __bpf_kfunc_start_defs(); * * Return: 0 on success, a negative value on error. 
*/ -__bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *digest_p) +__bpf_kfunc int bpf_get_fsverity_digest(struct file *file, const struct bpf_dynptr *digest_p) { - struct bpf_dynptr_kern *digest_ptr = (struct bpf_dynptr_kern *)digest_p; + const struct bpf_dynptr_kern *digest_ptr = (struct bpf_dynptr_kern *)digest_p; const struct inode *inode = file_inode(file); u32 dynptr_sz = __bpf_dynptr_size(digest_ptr); struct fsverity_digest *arg; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4b703c90ca9..3cb6b9e70080 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3622,8 +3622,8 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags); struct bpf_key *bpf_lookup_system_key(u64 id); void bpf_key_put(struct bpf_key *bkey); -int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, +int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, + const struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring); #else @@ -3641,8 +3641,8 @@ static inline void bpf_key_put(struct bpf_key *bkey) { } -static inline int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, +static inline int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, + const struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { return -EOPNOTSUPP; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a62d78581207..3c2aaa3c5004 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7973,7 +7973,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } - sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY; + sub->args[i].arg_type = ARG_PTR_TO_DYNPTR; continue; } if (tags & ARG_TAG_TRUSTED) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2bb60200c266..baa12b24bb64 100644 --- 
a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1944,7 +1944,7 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg3_type = ARG_PTR_TO_DYNPTR, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -2001,7 +2001,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg1_type = ARG_PTR_TO_DYNPTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, @@ -2044,7 +2044,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg1_type = ARG_PTR_TO_DYNPTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; @@ -3072,7 +3072,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk); } -__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) +__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 size; @@ -3093,14 +3093,14 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; return !ptr->data; } __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return false; @@ -3110,7 
+3110,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) __bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return -EINVAL; @@ -3122,7 +3122,7 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, struct bpf_dynptr *clone__uninit) { struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit; - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) { bpf_dynptr_set_null(clone); @@ -3145,11 +3145,11 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, - struct bpf_dynptr *src_ptr, u64 src_off, u64 size) +__bpf_kfunc int bpf_dynptr_copy(const struct bpf_dynptr *dst_ptr, u64 dst_off, + const struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { - struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; - struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; + const struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; + const struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; u64 off; @@ -3200,9 +3200,9 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. 
*/ -__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) +__bpf_kfunc int bpf_dynptr_memset(const struct bpf_dynptr *p, u64 offset, u64 size, u8 val) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 chunk_sz, write_off; char buf[256]; void* slice; @@ -4214,13 +4214,13 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) * * Return: 0 on success, a negative value on error. */ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, +__bpf_kfunc int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, + const struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { #ifdef CONFIG_SYSTEM_DATA_VERIFICATION - struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; - struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; + const struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + const struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; const void *data, *sig; u32 data_len, sig_len; int ret; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 69d75515ed3f..185210b73385 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -717,15 +717,6 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re struct bpf_func_state *state = bpf_func(env, reg); int spi, ref_obj_id, i; - /* - * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot - * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr - * is safe to do directly. - */ - if (reg->type == CONST_PTR_TO_DYNPTR) { - verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); - return -EFAULT; - } spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; @@ -7434,23 +7425,12 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * bytes as STACK_DYNPTR in case of PTR_TO_STACK. 
In case of * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. * - * Mutability of bpf_dynptr is at two levels, one is at the level of struct - * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct - * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can - * mutate the view of the dynptr and also possibly destroy it. In the latter - * case, it cannot mutate the bpf_dynptr itself but it can still mutate the - * memory that dynptr points to. - * - * The verifier will keep track both levels of mutation (bpf_dynptr's in - * reg->type and the memory's in reg->dynptr.type), but there is no support for - * readonly dynptr view yet, hence only the first case is tracked and checked. - * - * This is consistent with how C applies the const modifier to a struct object, - * where the pointer itself inside bpf_dynptr becomes const but not what it - * points to. - * - * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument - * type, and declare it as 'const struct bpf_dynptr *' in their prototype. + * Mutability of bpf_dynptr is at two levels: the dynptr and the memory the + * dynptr points to. At the first level, the verifier will make sure a + * CONST_PTR_TO_DYNPTR cannot be reinitialized or destroyed. The mutability of + * a dynptr's view (i.e., start and offset) is not tracked as there is not such + * use case. The second level is tracked using the upper bit of bpf_dynptr->size + * and checked dynamically during runtime. 
*/ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) @@ -7465,14 +7445,6 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn return -EINVAL; } - /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an - * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): - */ - if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { - verifier_bug(env, "misconfigured dynptr helper type flags"); - return -EFAULT; - } - /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. * @@ -7480,13 +7452,12 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn * pointing to a region of at least 16 bytes which doesn't * contain an existing bpf_dynptr. * - * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be - * mutated or destroyed. However, the memory it points to - * may be mutated. + * OBJ_RELEASE - Points to a initialized bpf_dynptr that will be + * destroyed. * - * None - Points to a initialized dynptr that can be mutated and - * destroyed, including mutation of the memory it points - * to. + * None - Points to a initialized dynptr that cannot be + * reinitialized or destroyed. However, the view of the + * dynptr and the memory it points to may be mutated. 
*/ if (arg_type & MEM_UNINIT) { int i; @@ -7505,10 +7476,10 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn } err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id); - } else /* MEM_RDONLY and None case from above */ { + } else /* OBJ_RELEASE and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ - if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { - verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); + if (reg->type == CONST_PTR_TO_DYNPTR && (arg_type & OBJ_RELEASE)) { + verbose(env, "CONST_PTR_TO_DYNPTR cannot be released\n"); return -EINVAL; } @@ -7519,8 +7490,8 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn return -EINVAL; } - /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ - if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { + /* Fold modifiers (in this case, OBJ_RELEASE) when checking expected type */ + if (!is_dynptr_type_expected(env, reg, arg_type & ~OBJ_RELEASE)) { verbose(env, "Expected a dynptr of type %s as arg #%d\n", dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1); @@ -9366,7 +9337,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); return -EINVAL; } - } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { + } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR); if (ret) return ret; @@ -12273,9 +12244,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; int clone_ref_obj_id = 0; - if (reg->type == CONST_PTR_TO_DYNPTR) - dynptr_arg_type |= MEM_RDONLY; - if (is_kfunc_arg_uninit(btf, &args[i])) dynptr_arg_type |= MEM_UNINIT; @@ -12288,7 +12256,7 @@ static int 
check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { dynptr_arg_type |= DYNPTR_TYPE_FILE; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { - dynptr_arg_type |= DYNPTR_TYPE_FILE; + dynptr_arg_type |= DYNPTR_TYPE_FILE | OBJ_RELEASE; meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { @@ -18745,7 +18713,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) } else if (arg->arg_type == ARG_ANYTHING) { reg->type = SCALAR_VALUE; mark_reg_unknown(env, regs, i); - } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { + } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { /* assume unspecial LOCAL dynptr type */ __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index af7079aa0f36..e916f0ccbed9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3397,12 +3397,12 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ -static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, +static __always_inline int __bpf_dynptr_copy_str(const struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { - struct bpf_dynptr_kern *dst; + const struct bpf_dynptr_kern *dst; u64 chunk_sz, off; void *dst_slice; int cnt, err; @@ -3438,7 +3438,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 u64 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { - struct bpf_dynptr_kern *dst; + const struct bpf_dynptr_kern 
*dst; void *dst_slice; char buf[256]; u64 off, chunk_sz; @@ -3539,49 +3539,49 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } -__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_user_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_kernel_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } -__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const 
void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_task_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { @@ -3589,7 +3589,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, copy_user_data_sleepable, tsk); } -__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 7dad01439391..ae71e9b69051 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -40,7 +40,7 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u64 offset, extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u64 offset, void *buffer, __u64 buffer__szk) __ksym __weak; -extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak; +extern int bpf_dynptr_adjust(struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak; extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym __weak; extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym __weak; extern __u64 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak; @@ -70,13 +70,13 @@ extern void *bpf_rdonly_cast(const void *obj, __u32 btf_id) __ksym __weak; extern int bpf_get_file_xattr(struct file *file, const char *name, struct bpf_dynptr *value_ptr) __ksym; -extern int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *digest_ptr) __ksym; +extern int bpf_get_fsverity_digest(struct file 
*file, const struct bpf_dynptr *digest_ptr) __ksym; extern struct bpf_key *bpf_lookup_user_key(__s32 serial, __u64 flags) __ksym; extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym; extern void bpf_key_put(struct bpf_key *key) __ksym; -extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, - struct bpf_dynptr *sig_ptr, +extern int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_ptr, + const struct bpf_dynptr *sig_ptr, struct bpf_key *trusted_keyring) __ksym; struct dentry; diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index e0d672d93adf..e0745b6e467e 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -914,7 +914,7 @@ void *user_ptr; char expected_str[384]; __u32 test_len[7] = {0/* placeholder */, 0, 1, 2, 255, 256, 257}; -typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u64 off, +typedef int (*bpf_read_dynptr_fn_t)(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr); /* Returns the offset just before the end of the maximum sized xdp fragment. 
@@ -1106,7 +1106,7 @@ int test_copy_from_user_str_dynptr(void *ctx) return 0; } -static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off, +static int bpf_copy_data_from_user_task(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); @@ -1114,7 +1114,7 @@ static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off, return bpf_copy_from_user_task_dynptr(dptr, off, size, unsafe_ptr, task); } -static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u64 off, +static int bpf_copy_data_from_user_task_str(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index d249113ed657..1c6cfd0888ba 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -11,12 +11,7 @@ #include #include #include "bpf_misc.h" - -extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym; -extern void bpf_key_put(struct bpf_key *key) __ksym; -extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, - struct bpf_dynptr *sig_ptr, - struct bpf_key *trusted_keyring) __ksym; +#include "bpf_kfuncs.h" struct { __uint(type, BPF_MAP_TYPE_RINGBUF); @@ -38,7 +33,7 @@ SEC("?lsm.s/bpf") __failure __msg("cannot pass in dynptr at an offset=-8") int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { - unsigned long val; + unsigned long val = 0; return bpf_verify_pkcs7_signature((struct bpf_dynptr *)&val, (struct bpf_dynptr *)&val, NULL); -- cgit v1.2.3 From 0aa6378695b8c67146130812f635f07c4898f171 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Mon, 20 Apr 2026 09:37:34 +0000 Subject: selftests/bpf: Fix off-by-one in bpf_cpumask_populate related 
selftest The test_populate test uses >= instead of > when checking if the runtime nr_cpus exceeds the bit capacity of a cpumask_t. On a system where the physical CPU core count perfectly matches the CONFIG_NR_CPUS upper bound (e.g. nr_cpus = 512 and CONFIG_NR_CPUS = 512), the condition nr_cpus >= CPUMASK_TEST_MASKLEN * 8 evaluates to true (512 >= 512). This incorrectly causes the test to fail with an error value of 3. A 512-bit cpumask_t provides enough bits (indices 0 through 511) to represent 512 CPUs. The subsequent bpf_for(i, 0, nr_cpus) loop iterates up to nr_cpus - 1 (511), which perfectly aligns with the maximum valid index of the bitmask. Change the condition to nr_cpus > CPUMASK_TEST_MASKLEN * 8 to fix the false positive failure on these systems. Fixes: 918ba2636d4e ("selftests: bpf: add bpf_cpumask_populate selftests") Signed-off-by: Matt Bobrowski Acked-by: Paul Chaignon Link: https://lore.kernel.org/bpf/20260420093734.2400330-1-mattbobrowski@google.com Signed-off-by: Kumar Kartikeya Dwivedi --- tools/testing/selftests/bpf/progs/cpumask_success.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/cpumask_success.c b/tools/testing/selftests/bpf/progs/cpumask_success.c index 0e04c31b91c0..774706e7b058 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_success.c +++ b/tools/testing/selftests/bpf/progs/cpumask_success.c @@ -866,7 +866,7 @@ int BPF_PROG(test_populate, struct task_struct *task, u64 clone_flags) * access NR_CPUS, the upper bound for nr_cpus, so we infer * it from the size of cpumask_t. 
*/ - if (nr_cpus < 0 || nr_cpus >= CPUMASK_TEST_MASKLEN * 8) { + if (nr_cpus < 0 || nr_cpus > CPUMASK_TEST_MASKLEN * 8) { err = 3; goto out; } -- cgit v1.2.3 From f7a6b9eaff3e6693ba3b19c5812e28538049bbf2 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 17 Apr 2026 15:30:18 +0100 Subject: bpf: Extend BTF UAPI vlen, kinds to use unused bits BTF maximum vlen is encoded using 16 bits with a maximum vlen of 65535. This has sufficed for structs, function parameters and enumerated type values. However, with upcoming BTF location information - in particular information about inline sites - this limit is surpassed. Use bits 16-23 - currently unused in BTF info - to extend to 24 bits, giving a max vlen of (2^24 - 1), or 16 million. Also extend BTF kind encoding from 5 to 7 bits, giving a maximum available number of kinds of 128. Since with the BTF location work we use another 3 kinds, we are fast approaching the current limit of 32. Convert BTF_MAX_* values to enums to allow them to be encoded in kernel BTF; this will allow us to detect if the running kernel supports a 24-bit vlen or not. Add one for max _possible_ (not used) kind. Fix up a few places in the kernel where a 16-bit vlen is assumed; remove BTF_INFO_MASK as now all bits are used. The vlen expansion was suggested by Andrii in [1]; the kind expansion is tackled here too as it may be needed also to support new kinds in BTF. 
[1] https://lore.kernel.org/bpf/CAEf4BzZx=X6vGqcA8SPU6D+v6k+TR=ZewebXMuXtpmML058piw@mail.gmail.com/ Suggested-by: Andrii Nakryiko Signed-off-by: Alan Maguire Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260417143023.1551481-2-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 4 ++-- include/uapi/linux/btf.h | 26 ++++++++++++++------------ kernel/bpf/btf.c | 27 ++++++++++----------------- tools/include/uapi/linux/btf.h | 26 ++++++++++++++------------ 4 files changed, 40 insertions(+), 43 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 48108471c5b1..c82d0d689059 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -415,12 +415,12 @@ static inline bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static inline u16 btf_type_vlen(const struct btf_type *t) +static inline u32 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); } -static inline u16 btf_vlen(const struct btf_type *t) +static inline u32 btf_vlen(const struct btf_type *t) { return btf_type_vlen(t); } diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 638615ebddc2..618167cab4e6 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -33,20 +33,22 @@ struct btf_header { __u32 layout_len; /* length of layout section */ }; -/* Max # of type identifier */ -#define BTF_MAX_TYPE 0x000fffff -/* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x00ffffff -/* Max # of struct/union/enum members or func args */ -#define BTF_MAX_VLEN 0xffff +enum btf_max { + /* Max possible kind */ + BTF_MAX_KIND = 0x0000007f, + /* Max # of type identifier */ + BTF_MAX_TYPE = 0x000fffff, + /* Max offset into the string section */ + BTF_MAX_NAME_OFFSET = 0x00ffffff, + /* Max # of struct/union/enum members or func args */ + BTF_MAX_VLEN = 0x00ffffff, +}; struct btf_type { __u32 name_off; /* "info" bits arrangement - * bits 0-15: vlen 
(e.g. # of struct's members) - * bits 16-23: unused - * bits 24-28: kind (e.g. int, ptr, array...etc) - * bits 29-30: unused + * bits 0-23: vlen (e.g. # of struct's members) + * bits 24-30: kind (e.g. int, ptr, array...etc) * bit 31: kind_flag, currently used by * struct, union, enum, fwd, enum64, * decl_tag and type_tag @@ -65,8 +67,8 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) -#define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x7f) +#define BTF_INFO_VLEN(info) ((info) & 0xffffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) enum { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3c2aaa3c5004..77af44d8a3ad 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -182,7 +182,6 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x9f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) @@ -289,7 +288,7 @@ enum verifier_phase { struct resolve_vertex { const struct btf_type *t; u32 type_id; - u16 next_member; + u32 next_member; }; enum visit_state { @@ -2031,7 +2030,7 @@ static int env_stack_push(struct btf_verifier_env *env, } static void env_stack_set_next_member(struct btf_verifier_env *env, - u16 next_member) + u32 next_member) { env->stack[env->top_stack - 1].next_member = next_member; } @@ -3293,7 +3292,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, struct btf *btf = env->btf; u32 struct_size = t->size; u32 offset; - u16 i; + u32 i; meta_needed = btf_type_vlen(t) * sizeof(*member); if (meta_left < meta_needed) { @@ -3369,7 +3368,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, { const struct btf_member *member; int err; - u16 i; + u32 i; /* Before continue resolving the next_member, * ensure the last member is indeed resolved to a @@ -4447,7 +4446,7 @@ 
static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; const char *fmt_str; - u16 i, nr_enums; + u32 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); @@ -4555,7 +4554,7 @@ static s32 btf_enum64_check_meta(struct btf_verifier_env *env, const struct btf_enum64 *enums = btf_type_enum64(t); struct btf *btf = env->btf; const char *fmt_str; - u16 i, nr_enums; + u32 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); @@ -4683,7 +4682,7 @@ static void btf_func_proto_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_param *args = (const struct btf_param *)(t + 1); - u16 nr_args = btf_type_vlen(t), i; + u32 nr_args = btf_type_vlen(t), i; btf_verifier_log(env, "return=%u args=(", t->type); if (!nr_args) { @@ -4929,7 +4928,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env, { const struct btf_var_secinfo *vsi; struct btf *btf = env->btf; - u16 i; + u32 i; env->resolve_mode = RESOLVE_TBD; for_each_vsi_from(i, v->next_member, v->t, vsi) { @@ -5183,7 +5182,7 @@ static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *ret_type; const struct btf_param *args; const struct btf *btf; - u16 nr_args, i; + u32 nr_args, i; int err; btf = env->btf; @@ -5278,7 +5277,7 @@ static int btf_func_check(struct btf_verifier_env *env, const struct btf_type *proto_type; const struct btf_param *args; const struct btf *btf; - u16 nr_args, i; + u32 nr_args, i; btf = env->btf; proto_type = btf_type_by_id(btf, t->type); @@ -5336,12 +5335,6 @@ static s32 btf_check_meta(struct btf_verifier_env *env, } meta_left -= sizeof(*t); - if (t->info & ~BTF_INFO_MASK) { - btf_verifier_log(env, "[%u] Invalid btf_info:%x", - env->log_type_id, t->info); - return -EINVAL; - } - if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { btf_verifier_log(env, "[%u] Invalid kind:%u", diff --git a/tools/include/uapi/linux/btf.h 
b/tools/include/uapi/linux/btf.h index 638615ebddc2..618167cab4e6 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -33,20 +33,22 @@ struct btf_header { __u32 layout_len; /* length of layout section */ }; -/* Max # of type identifier */ -#define BTF_MAX_TYPE 0x000fffff -/* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x00ffffff -/* Max # of struct/union/enum members or func args */ -#define BTF_MAX_VLEN 0xffff +enum btf_max { + /* Max possible kind */ + BTF_MAX_KIND = 0x0000007f, + /* Max # of type identifier */ + BTF_MAX_TYPE = 0x000fffff, + /* Max offset into the string section */ + BTF_MAX_NAME_OFFSET = 0x00ffffff, + /* Max # of struct/union/enum members or func args */ + BTF_MAX_VLEN = 0x00ffffff, +}; struct btf_type { __u32 name_off; /* "info" bits arrangement - * bits 0-15: vlen (e.g. # of struct's members) - * bits 16-23: unused - * bits 24-28: kind (e.g. int, ptr, array...etc) - * bits 29-30: unused + * bits 0-23: vlen (e.g. # of struct's members) + * bits 24-30: kind (e.g. int, ptr, array...etc) * bit 31: kind_flag, currently used by * struct, union, enum, fwd, enum64, * decl_tag and type_tag @@ -65,8 +67,8 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) -#define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x7f) +#define BTF_INFO_VLEN(info) ((info) & 0xffffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) enum { -- cgit v1.2.3 From cacd6729c09236245d921464eb28e69a6d573412 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 17 Apr 2026 15:30:19 +0100 Subject: libbpf: Adjust btf_vlen() to return a __u32 Now that vlen is 24 bits, btf_vlen() must return a __u32. Adjust use cases in libbpf accordingly. Also add error handling to avoid vlen overflow in btf_type_inc_vlen(). 
Signed-off-by: Alan Maguire Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260417143023.1551481-3-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 50 ++++++++++++++++++++++++++++++----------------- tools/lib/bpf/btf.h | 2 +- tools/lib/bpf/btf_dump.c | 24 +++++++++++------------ tools/lib/bpf/relo_core.c | 16 +++++++-------- 4 files changed, 53 insertions(+), 39 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index ceb57b46a878..267904939098 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -421,7 +421,7 @@ static int btf_type_size_unknown(const struct btf *btf, const struct btf_type *t { __u32 l_cnt = btf->hdr.layout_len / sizeof(struct btf_layout); struct btf_layout *l = btf->layout; - __u16 vlen = btf_vlen(t); + __u32 vlen = btf_vlen(t); __u32 kind = btf_kind(t); /* Fall back to base BTF if needed as they share layout information */ @@ -454,7 +454,7 @@ static int btf_type_size_unknown(const struct btf *btf, const struct btf_type *t static int btf_type_size(const struct btf *btf, const struct btf_type *t) { const int base_size = sizeof(struct btf_type); - __u16 vlen = btf_vlen(t); + __u32 vlen = btf_vlen(t); switch (btf_kind(t)) { case BTF_KIND_FWD: @@ -506,7 +506,7 @@ static int btf_bswap_type_rest(struct btf_type *t) struct btf_array *a; struct btf_param *p; struct btf_enum *e; - __u16 vlen = btf_vlen(t); + __u32 vlen = btf_vlen(t); int i; switch (btf_kind(t)) { @@ -1007,7 +1007,7 @@ int btf__align_of(const struct btf *btf, __u32 id) case BTF_KIND_STRUCT: case BTF_KIND_UNION: { const struct btf_member *m = btf_members(t); - __u16 vlen = btf_vlen(t); + __u32 vlen = btf_vlen(t); int i, max_align = 1, align; for (i = 0; i < vlen; i++, m++) { @@ -2121,9 +2121,12 @@ static void *btf_add_type_mem(struct btf *btf, size_t add_sz) btf->hdr.type_len, UINT_MAX, add_sz); } -static void btf_type_inc_vlen(struct btf_type *t) +static int btf_type_inc_vlen(struct btf_type *t) { + if (btf_vlen(t) == 
BTF_MAX_VLEN) + return -ENOSPC; t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, btf_kflag(t)); + return 0; } static void btf_hdr_update_type_len(struct btf *btf, int new_len) @@ -2652,6 +2655,8 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, t = btf_last_type(btf); if (!btf_is_composite(t)) return libbpf_err(-EINVAL); + if (btf_vlen(t) == BTF_MAX_VLEN) + return libbpf_err(-ENOSPC); if (validate_type_id(type_id)) return libbpf_err(-EINVAL); @@ -2686,6 +2691,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, /* btf_add_type_mem can invalidate t pointer */ t = btf_last_type(btf); + /* update parent type's vlen and kflag */ t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, is_bitfield || btf_kflag(t)); @@ -2796,7 +2802,9 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) /* update parent type's vlen */ t = btf_last_type(btf); - btf_type_inc_vlen(t); + err = btf_type_inc_vlen(t); + if (err) + return libbpf_err(err); /* if negative value, set signedness to signed */ if (value < 0) @@ -2873,7 +2881,9 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) /* update parent type's vlen */ t = btf_last_type(btf); - btf_type_inc_vlen(t); + err = btf_type_inc_vlen(t); + if (err) + return libbpf_err(err); btf_hdr_update_type_len(btf, btf->hdr.type_len + sz); return 0; @@ -3115,7 +3125,9 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id) /* update parent type's vlen */ t = btf_last_type(btf); - btf_type_inc_vlen(t); + err = btf_type_inc_vlen(t); + if (err) + return libbpf_err(err); btf_hdr_update_type_len(btf, btf->hdr.type_len + sz); return 0; @@ -3257,7 +3269,9 @@ int btf__add_datasec_var_info(struct btf *btf, int var_type_id, __u32 offset, __ /* update parent type's vlen */ t = btf_last_type(btf); - btf_type_inc_vlen(t); + err = btf_type_inc_vlen(t); + if (err) + return libbpf_err(err); btf_hdr_update_type_len(btf, btf->hdr.type_len + sz); return 0; @@ 
-4311,7 +4325,7 @@ static long btf_hash_enum(struct btf_type *t) static bool btf_equal_enum_members(struct btf_type *t1, struct btf_type *t2) { const struct btf_enum *m1, *m2; - __u16 vlen; + __u32 vlen; int i; vlen = btf_vlen(t1); @@ -4329,7 +4343,7 @@ static bool btf_equal_enum_members(struct btf_type *t1, struct btf_type *t2) static bool btf_equal_enum64_members(struct btf_type *t1, struct btf_type *t2) { const struct btf_enum64 *m1, *m2; - __u16 vlen; + __u32 vlen; int i; vlen = btf_vlen(t1); @@ -4406,7 +4420,7 @@ static long btf_hash_struct(struct btf_type *t) static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2) { const struct btf_member *m1, *m2; - __u16 vlen; + __u32 vlen; int i; if (!btf_equal_common(t1, t2)) @@ -4482,7 +4496,7 @@ static bool btf_compat_array(struct btf_type *t1, struct btf_type *t2) static long btf_hash_fnproto(struct btf_type *t) { const struct btf_param *member = btf_params(t); - __u16 vlen = btf_vlen(t); + __u32 vlen = btf_vlen(t); long h = btf_hash_common(t); int i; @@ -4504,7 +4518,7 @@ static long btf_hash_fnproto(struct btf_type *t) static bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2) { const struct btf_param *m1, *m2; - __u16 vlen; + __u32 vlen; int i; if (!btf_equal_common(t1, t2)) @@ -4530,7 +4544,7 @@ static bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2) static bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2) { const struct btf_param *m1, *m2; - __u16 vlen; + __u32 vlen; int i; /* skip return type ID */ @@ -5077,7 +5091,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, case BTF_KIND_STRUCT: case BTF_KIND_UNION: { const struct btf_member *cand_m, *canon_m; - __u16 vlen; + __u32 vlen; if (!btf_shallow_equal_struct(cand_type, canon_type)) return 0; @@ -5105,7 +5119,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, case BTF_KIND_FUNC_PROTO: { const struct btf_param *cand_p, *canon_p; - __u16 vlen; + __u32 vlen; 
if (!btf_compat_fnproto(cand_type, canon_type)) return 0; @@ -5439,7 +5453,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_FUNC_PROTO: { struct btf_param *param; - __u16 vlen; + __u32 vlen; int i; ref_type_id = btf_dedup_ref_type(d, t->type); diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index a1f8deca2603..1a31f2da947f 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -435,7 +435,7 @@ static inline __u16 btf_kind(const struct btf_type *t) return BTF_INFO_KIND(t->info); } -static inline __u16 btf_vlen(const struct btf_type *t) +static inline __u32 btf_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); } diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 53c6624161d7..cc1ba65bb6c5 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -316,7 +316,7 @@ static int btf_dump_mark_referenced(struct btf_dump *d) { int i, j, n = btf__type_cnt(d->btf); const struct btf_type *t; - __u16 vlen; + __u32 vlen; for (i = d->last_id + 1; i < n; i++) { t = btf__type_by_id(d->btf, i); @@ -485,7 +485,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) */ struct btf_dump_type_aux_state *tstate = &d->type_states[id]; const struct btf_type *t; - __u16 vlen; + __u32 vlen; int err, i; /* return true, letting typedefs know that it's ok to be emitted */ @@ -798,7 +798,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) */ if (top_level_def || t->name_off == 0) { const struct btf_member *m = btf_members(t); - __u16 vlen = btf_vlen(t); + __u32 vlen = btf_vlen(t); int i, new_cont_id; new_cont_id = t->name_off == 0 ? 
cont_id : id; @@ -820,7 +820,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) break; case BTF_KIND_FUNC_PROTO: { const struct btf_param *p = btf_params(t); - __u16 n = btf_vlen(t); + __u32 n = btf_vlen(t); int i; btf_dump_emit_type(d, t->type, cont_id); @@ -839,7 +839,7 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id, { const struct btf_member *m; int max_align = 1, align, i, bit_sz; - __u16 vlen; + __u32 vlen; m = btf_members(t); vlen = btf_vlen(t); @@ -973,7 +973,7 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, bool is_struct = btf_is_struct(t); bool packed, prev_bitfield = false; int align, i, off = 0; - __u16 vlen = btf_vlen(t); + __u32 vlen = btf_vlen(t); align = btf__align_of(d->btf, id); packed = is_struct ? btf_is_struct_packed(d->btf, id, t) : 0; @@ -1064,7 +1064,7 @@ static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, static void btf_dump_emit_enum32_val(struct btf_dump *d, const struct btf_type *t, - int lvl, __u16 vlen) + int lvl, __u32 vlen) { const struct btf_enum *v = btf_enum(t); bool is_signed = btf_kflag(t); @@ -1089,7 +1089,7 @@ static void btf_dump_emit_enum32_val(struct btf_dump *d, static void btf_dump_emit_enum64_val(struct btf_dump *d, const struct btf_type *t, - int lvl, __u16 vlen) + int lvl, __u32 vlen) { const struct btf_enum64 *v = btf_enum64(t); bool is_signed = btf_kflag(t); @@ -1122,7 +1122,7 @@ static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, const struct btf_type *t, int lvl) { - __u16 vlen = btf_vlen(t); + __u32 vlen = btf_vlen(t); btf_dump_printf(d, "enum%s%s", t->name_off ? 
" " : "", @@ -1542,7 +1542,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, } case BTF_KIND_FUNC_PROTO: { const struct btf_param *p = btf_params(t); - __u16 vlen = btf_vlen(t); + __u32 vlen = btf_vlen(t); int i; /* @@ -2159,7 +2159,7 @@ static int btf_dump_struct_data(struct btf_dump *d, const void *data) { const struct btf_member *m = btf_members(t); - __u16 n = btf_vlen(t); + __u32 n = btf_vlen(t); int i, err = 0; /* note that we increment depth before calling btf_dump_print() below; @@ -2449,7 +2449,7 @@ static int btf_dump_type_data_check_zero(struct btf_dump *d, case BTF_KIND_STRUCT: case BTF_KIND_UNION: { const struct btf_member *m = btf_members(t); - __u16 n = btf_vlen(t); + __u32 n = btf_vlen(t); /* if any struct/union member is non-zero, the struct/union * is considered non-zero and dumped. diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index 0ccc8f548cba..6ae3f2a15ad0 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -191,8 +191,8 @@ recur: case BTF_KIND_FUNC_PROTO: { struct btf_param *local_p = btf_params(local_type); struct btf_param *targ_p = btf_params(targ_type); - __u16 local_vlen = btf_vlen(local_type); - __u16 targ_vlen = btf_vlen(targ_type); + __u32 local_vlen = btf_vlen(local_type); + __u32 targ_vlen = btf_vlen(targ_type); int i, err; if (local_vlen != targ_vlen) @@ -1457,8 +1457,8 @@ static bool bpf_core_names_match(const struct btf *local_btf, size_t local_name_ static int bpf_core_enums_match(const struct btf *local_btf, const struct btf_type *local_t, const struct btf *targ_btf, const struct btf_type *targ_t) { - __u16 local_vlen = btf_vlen(local_t); - __u16 targ_vlen = btf_vlen(targ_t); + __u32 local_vlen = btf_vlen(local_t); + __u32 targ_vlen = btf_vlen(targ_t); int i, j; if (local_t->size != targ_t->size) @@ -1498,8 +1498,8 @@ static int bpf_core_composites_match(const struct btf *local_btf, const struct b bool behind_ptr, int level) { const struct btf_member *local_m = 
btf_members(local_t); - __u16 local_vlen = btf_vlen(local_t); - __u16 targ_vlen = btf_vlen(targ_t); + __u32 local_vlen = btf_vlen(local_t); + __u32 targ_vlen = btf_vlen(targ_t); int i, j, err; if (local_vlen > targ_vlen) @@ -1674,8 +1674,8 @@ recur: case BTF_KIND_FUNC_PROTO: { struct btf_param *local_p = btf_params(local_t); struct btf_param *targ_p = btf_params(targ_t); - __u16 local_vlen = btf_vlen(local_t); - __u16 targ_vlen = btf_vlen(targ_t); + __u32 local_vlen = btf_vlen(local_t); + __u32 targ_vlen = btf_vlen(targ_t); int i, err; if (local_k != targ_k) -- cgit v1.2.3 From 22b402457ee40f64ea220f4b60776a612f084636 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 17 Apr 2026 15:30:20 +0100 Subject: bpftool: Support 24-bit vlen Adjust btf_vlen() usage to handle 24-bit vlen. Signed-off-by: Alan Maguire Link: https://lore.kernel.org/r/20260417143023.1551481-4-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/btf.c | 17 ++++++----------- tools/bpf/bpftool/btf_dumper.c | 4 ++-- tools/bpf/bpftool/gen.c | 16 +++++++++------- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 2e899e940034..6ef908adf3a4 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -179,8 +179,7 @@ static int dump_btf_type(const struct btf *btf, __u32 id, case BTF_KIND_STRUCT: case BTF_KIND_UNION: { const struct btf_member *m = (const void *)(t + 1); - __u16 vlen = BTF_INFO_VLEN(t->info); - int i; + __u32 i, vlen = BTF_INFO_VLEN(t->info); if (json_output) { jsonw_uint_field(w, "size", t->size); @@ -225,9 +224,8 @@ static int dump_btf_type(const struct btf *btf, __u32 id, } case BTF_KIND_ENUM: { const struct btf_enum *v = (const void *)(t + 1); - __u16 vlen = BTF_INFO_VLEN(t->info); + __u32 i, vlen = BTF_INFO_VLEN(t->info); const char *encoding; - int i; encoding = btf_kflag(t) ? 
"SIGNED" : "UNSIGNED"; if (json_output) { @@ -263,9 +261,8 @@ static int dump_btf_type(const struct btf *btf, __u32 id, } case BTF_KIND_ENUM64: { const struct btf_enum64 *v = btf_enum64(t); - __u16 vlen = btf_vlen(t); + __u32 i, vlen = btf_vlen(t); const char *encoding; - int i; encoding = btf_kflag(t) ? "SIGNED" : "UNSIGNED"; if (json_output) { @@ -325,8 +322,7 @@ static int dump_btf_type(const struct btf *btf, __u32 id, } case BTF_KIND_FUNC_PROTO: { const struct btf_param *p = (const void *)(t + 1); - __u16 vlen = BTF_INFO_VLEN(t->info); - int i; + __u32 i, vlen = BTF_INFO_VLEN(t->info); if (json_output) { jsonw_uint_field(w, "ret_type_id", t->type); @@ -369,8 +365,7 @@ static int dump_btf_type(const struct btf *btf, __u32 id, case BTF_KIND_DATASEC: { const struct btf_var_secinfo *v = (const void *)(t + 1); const struct btf_type *vt; - __u16 vlen = BTF_INFO_VLEN(t->info); - int i; + __u32 i, vlen = BTF_INFO_VLEN(t->info); if (json_output) { jsonw_uint_field(w, "size", t->size); @@ -675,7 +670,7 @@ static __u64 btf_name_hasher(__u64 hash, const struct btf *btf, __u32 name_off) static __u64 btf_type_disambig_hash(const struct btf *btf, __u32 id, bool include_members) { const struct btf_type *t = btf__type_by_id(btf, id); - int i; + __u32 i; size_t hash = 0; hash = btf_name_hasher(hash, btf, t->name_off); diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index def297e879f4..9dc8425b1789 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -150,7 +150,7 @@ static int btf_dumper_enum(const struct btf_dumper *d, { const struct btf_enum *enums = btf_enum(t); __s64 value; - __u16 i; + __u32 i; switch (t->size) { case 8: @@ -189,7 +189,7 @@ static int btf_dumper_enum64(const struct btf_dumper *d, const struct btf_enum64 *enums = btf_enum64(t); __u32 val_lo32, val_hi32; __u64 value; - __u16 i; + __u32 i; value = *(__u64 *)data; val_lo32 = (__u32)value; diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c 
index 2f9e10752e28..37159e02f418 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -2094,7 +2094,8 @@ btfgen_mark_type(struct btfgen_info *info, unsigned int type_id, bool follow_poi struct btf_type *cloned_type; struct btf_param *param; struct btf_array *array; - int err, i; + __u32 i; + int err; if (type_id == 0) return 0; @@ -2229,7 +2230,8 @@ static int btfgen_mark_type_match(struct btfgen_info *info, __u32 type_id, bool const struct btf_type *btf_type; struct btf *btf = info->src_btf; struct btf_type *cloned_type; - int i, err; + int err; + __u32 i; if (type_id == 0) return 0; @@ -2249,7 +2251,7 @@ static int btfgen_mark_type_match(struct btfgen_info *info, __u32 type_id, bool case BTF_KIND_STRUCT: case BTF_KIND_UNION: { struct btf_member *m = btf_members(btf_type); - __u16 vlen = btf_vlen(btf_type); + __u32 vlen = btf_vlen(btf_type); if (behind_ptr) break; @@ -2286,7 +2288,7 @@ static int btfgen_mark_type_match(struct btfgen_info *info, __u32 type_id, bool break; } case BTF_KIND_FUNC_PROTO: { - __u16 vlen = btf_vlen(btf_type); + __u32 vlen = btf_vlen(btf_type); struct btf_param *param; /* mark ret type */ @@ -2492,8 +2494,9 @@ static struct btf *btfgen_get_btf(struct btfgen_info *info) { struct btf *btf_new = NULL; unsigned int *ids = NULL; - unsigned int i, n = btf__type_cnt(info->marked_btf); + unsigned int n = btf__type_cnt(info->marked_btf); int err = 0; + __u32 i; btf_new = btf__new_empty(); if (!btf_new) { @@ -2523,8 +2526,7 @@ static struct btf *btfgen_get_btf(struct btfgen_info *info) /* add members for struct and union */ if (btf_is_composite(type)) { struct btf_member *cloned_m, *m; - unsigned short vlen; - int idx_src; + __u32 vlen, idx_src; name = btf__str_by_offset(info->src_btf, type->name_off); -- cgit v1.2.3 From 855af3e775670fa0a2493f3e61f4da38f956ef47 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 17 Apr 2026 15:30:21 +0100 Subject: selftests/bpf: Fix up btf/invalid test for extended kind With extended kinds, 32 
becomes a valid (but not used) BTF info kind value; fix up the test to check for the "Invalid kind" rather than "Invalid btf_info" message. Since all bits are used in BTF info, it is no longer possible to craft an invalid BTF info value. Use 127 (new maximum possible kind value). Signed-off-by: Alan Maguire Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260417143023.1551481-5-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 054ecb6b1e9f..0cc347e32db3 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -1924,11 +1924,11 @@ static struct btf_raw_test raw_tests[] = { }, { - .descr = "invalid BTF_INFO", + .descr = "invalid BTF kind", .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), - BTF_TYPE_ENC(0, 0x20000000, 4), + BTF_TYPE_ENC(0, 0x7f000000, 4), BTF_END_RAW, }, .str_sec = "", @@ -1941,7 +1941,7 @@ static struct btf_raw_test raw_tests[] = { .value_type_id = 1, .max_entries = 4, .btf_load_err = true, - .err_str = "Invalid btf_info", + .err_str = "Invalid kind", }, { -- cgit v1.2.3 From ad256554f1065feb17c094f7aab16d75ad41f60c Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 17 Apr 2026 15:30:22 +0100 Subject: selftests/bpf: Fix up __u16 vlen assumptions Fix up a few cases where we assume vlen is 16 bits. 
Signed-off-by: Alan Maguire Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260417143023.1551481-6-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/btf.c | 2 +- tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c | 3 +-- tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c | 3 +-- tools/testing/selftests/bpf/test_progs.c | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 0cc347e32db3..a9de328a8697 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -8092,7 +8092,7 @@ static struct btf_dedup_test dedup_tests[] = { static int btf_type_size(const struct btf_type *t) { int base_size = sizeof(struct btf_type); - __u16 vlen = BTF_INFO_VLEN(t->info); + __u32 vlen = BTF_INFO_VLEN(t->info); __u16 kind = BTF_INFO_KIND(t->info); switch (kind) { diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c index 5bc15bb6b7ce..6bc31236805c 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c @@ -487,9 +487,8 @@ static void test_split_module(void) for (i = 0; i < ARRAY_SIZE(mod_funcs); i++) { const struct btf_param *p; const struct btf_type *t; - __u16 vlen; + __u32 vlen, j; __u32 id; - int j; id = btf__find_by_name_kind(btf1, mod_funcs[i], BTF_KIND_FUNC); if (!ASSERT_GE(id, nr_base_types, "func_id")) diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c index 469e92869523..5064aeb8fe67 100644 --- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c +++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c @@ -253,8 +253,7 @@ static int find_field_offset_aux(struct btf *btf, int btf_id, char *field_name, { const struct 
btf_type *type = btf__type_by_id(btf, btf_id); const struct btf_member *m; - __u16 mnum; - int i; + __u32 mnum, i; if (!type) { PRINT_FAIL("Can't find btf_type for id %d\n", btf_id); diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 7fe16b5131b1..cc14b13e23fe 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1257,7 +1257,7 @@ int get_bpf_max_tramp_links_from(struct btf *btf) const struct btf_type *t; __u32 i, type_cnt; const char *name; - __u16 j, vlen; + __u32 j, vlen; for (i = 1, type_cnt = btf__type_cnt(btf); i < type_cnt; i++) { t = btf__type_by_id(btf, i); -- cgit v1.2.3 From 65350a0ecd41db80b117f86d65c4d275e2e3a3a5 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 17 Apr 2026 15:30:23 +0100 Subject: Documentation/bpf: Update btf doc with updated vlen, kind sizes Sync doc with updated UAPI changes utilizing unused bits for extended vlen, kind values. Signed-off-by: Alan Maguire Link: https://lore.kernel.org/r/20260417143023.1551481-7-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/btf.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 3b60583f5db2..3f05f17990ad 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -97,10 +97,8 @@ Each type contains the following common data:: struct btf_type { __u32 name_off; /* "info" bits arrangement - * bits 0-15: vlen (e.g. # of struct's members) - * bits 16-23: unused - * bits 24-28: kind (e.g. int, ptr, array...etc) - * bits 29-30: unused + * bits 0-23: vlen (e.g. # of struct's members) + * bits 24-30: kind (e.g.
int, ptr, array...etc) * bit 31: kind_flag, currently used by * struct, union, enum, fwd, enum64, * decl_tag and type_tag -- cgit v1.2.3 From c8f0ee969f76277e562e44a20a6ff8bb47acab15 Mon Sep 17 00:00:00 2001 From: Jie Meng Date: Mon, 13 Apr 2026 10:23:11 -0700 Subject: bpf: Exhaustive test coverage for signed division and modulo Extend lib/test_bpf.c to provide comprehensive test coverage for BPF signed division (SDIV) and signed modulo (SMOD) instructions, both 32-bit and 64-bit variants with immediate operands. Introduce F_ALU32 and F_SIGNED flags to replace the less readable bool alu32 and s16 off parameters throughout the test helpers. The BPF instruction 'off' field is derived from flags only at the point of instruction encoding. Changes: - Add enum { F_ALU32 = 1, F_SIGNED = 2 } for readable test flags. - __bpf_alu_result(): take u32 flags instead of separate signed/alu32 parameters. Narrows operands internally for ALU32 (unsigned via u32 cast, signed via s32 cast) before computing the reference result. - __bpf_emit_alu64_imm(), __bpf_emit_alu32_imm(): pass flags through to __bpf_alu_result, derive 'off' for instruction encoding locally. - __bpf_fill_alu_imm_regs(): take u32 flags, use F_ALU32/F_SIGNED for operand setup and single-line __bpf_alu_result() call. - __bpf_fill_alu_shift(), __bpf_fill_alu_shift_same_reg(): convert bool alu32 parameter to u32 flags for consistency. - New test fill functions: bpf_fill_alu{32,64}_{sdiv,smod}_imm() and bpf_fill_alu{32,64}_{sdiv,smod}_imm_regs(), each testing all immediate value magnitudes and all register pair combinations. - All existing unsigned tests updated to use flags (0 or F_ALU32), preserving backward compatibility. 
8 new test cases added: ALU64_SDIV_K, ALU64_SMOD_K (immediate magnitudes + register combos) ALU32_SDIV_K, ALU32_SMOD_K (immediate magnitudes + register combos) Test results: test_bpf: Summary: 1061 PASSED, 0 FAILED, [1049/1049 JIT'ed] test_bpf: test_tail_calls: Summary: 10 PASSED, 0 FAILED, [10/10 JIT'ed] test_bpf: test_skb_segment: Summary: 2 PASSED, 0 FAILED Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Jie Meng Link: https://lore.kernel.org/r/20260413172311.3918767-1-jmeng@fb.com Signed-off-by: Alexei Starovoitov --- lib/test_bpf.c | 363 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 263 insertions(+), 100 deletions(-) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 5892c0f17ddc..af6f3340c034 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -560,8 +560,23 @@ static int bpf_fill_max_jmp_never_taken(struct bpf_test *self) } /* ALU result computation used in tests */ -static bool __bpf_alu_result(u64 *res, u64 v1, u64 v2, u8 op) +enum { F_ALU32 = 1, F_SIGNED = 2 }; + +static bool __bpf_alu_result(u64 *res, u64 v1, u64 v2, u8 op, u32 flags) { + bool is_signed = flags & F_SIGNED; + + /* Narrow operands for ALU32 */ + if (flags & F_ALU32) { + if (is_signed) { + v1 = (u64)(s32)v1; + v2 = (u64)(s32)v2; + } else { + v1 = (u32)v1; + v2 = (u32)v2; + } + } + *res = 0; switch (op) { case BPF_MOV: @@ -599,12 +614,28 @@ static bool __bpf_alu_result(u64 *res, u64 v1, u64 v2, u8 op) case BPF_DIV: if (v2 == 0) return false; - *res = div64_u64(v1, v2); + if (!is_signed) { + *res = div64_u64(v1, v2); + } else { + if ((s64)v2 == -1) /* Handled by verifier */ + return false; + *res = (u64)div64_s64(v1, v2); + } break; case BPF_MOD: if (v2 == 0) return false; - div64_u64_rem(v1, v2, res); + if (!is_signed) { + div64_u64_rem(v1, v2, res); + } else { + if ((s64)v2 == -1) + return false; + /* + * Avoid s64 % s64 which generates __moddi3 on + * 32-bit architectures. Use div64_s64 instead. 
+ */ + *res = (u64)((s64)v1 - div64_s64(v1, v2) * (s64)v2); + } break; } return true; @@ -612,7 +643,7 @@ static bool __bpf_alu_result(u64 *res, u64 v1, u64 v2, u8 op) /* Test an ALU shift operation for all valid shift values */ static int __bpf_fill_alu_shift(struct bpf_test *self, u8 op, - u8 mode, bool alu32) + u8 mode, u32 flags) { static const s64 regs[] = { 0x0123456789abcdefLL, /* dword > 0, word < 0 */ @@ -620,7 +651,7 @@ static int __bpf_fill_alu_shift(struct bpf_test *self, u8 op, 0xfedcba0198765432LL, /* dword < 0, word < 0 */ 0x0123458967abcdefLL, /* dword > 0, word > 0 */ }; - int bits = alu32 ? 32 : 64; + int bits = (flags & F_ALU32) ? 32 : 64; int len = (2 + 7 * bits) * ARRAY_SIZE(regs) + 3; struct bpf_insn *insn; int imm, k; @@ -643,7 +674,7 @@ static int __bpf_fill_alu_shift(struct bpf_test *self, u8 op, /* Perform operation */ insn[i++] = BPF_ALU64_REG(BPF_MOV, R1, R3); insn[i++] = BPF_ALU64_IMM(BPF_MOV, R2, imm); - if (alu32) { + if (flags & F_ALU32) { if (mode == BPF_K) insn[i++] = BPF_ALU32_IMM(op, R1, imm); else @@ -653,14 +684,14 @@ static int __bpf_fill_alu_shift(struct bpf_test *self, u8 op, reg = (s32)reg; else reg = (u32)reg; - __bpf_alu_result(&val, reg, imm, op); + __bpf_alu_result(&val, reg, imm, op, 0); val = (u32)val; } else { if (mode == BPF_K) insn[i++] = BPF_ALU64_IMM(op, R1, imm); else insn[i++] = BPF_ALU64_REG(op, R1, R2); - __bpf_alu_result(&val, reg, imm, op); + __bpf_alu_result(&val, reg, imm, op, 0); } /* @@ -688,62 +719,62 @@ static int __bpf_fill_alu_shift(struct bpf_test *self, u8 op, static int bpf_fill_alu64_lsh_imm(struct bpf_test *self) { - return __bpf_fill_alu_shift(self, BPF_LSH, BPF_K, false); + return __bpf_fill_alu_shift(self, BPF_LSH, BPF_K, 0); } static int bpf_fill_alu64_rsh_imm(struct bpf_test *self) { - return __bpf_fill_alu_shift(self, BPF_RSH, BPF_K, false); + return __bpf_fill_alu_shift(self, BPF_RSH, BPF_K, 0); } static int bpf_fill_alu64_arsh_imm(struct bpf_test *self) { - return 
__bpf_fill_alu_shift(self, BPF_ARSH, BPF_K, false); + return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_K, 0); } static int bpf_fill_alu64_lsh_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift(self, BPF_LSH, BPF_X, false); + return __bpf_fill_alu_shift(self, BPF_LSH, BPF_X, 0); } static int bpf_fill_alu64_rsh_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift(self, BPF_RSH, BPF_X, false); + return __bpf_fill_alu_shift(self, BPF_RSH, BPF_X, 0); } static int bpf_fill_alu64_arsh_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_X, false); + return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_X, 0); } static int bpf_fill_alu32_lsh_imm(struct bpf_test *self) { - return __bpf_fill_alu_shift(self, BPF_LSH, BPF_K, true); + return __bpf_fill_alu_shift(self, BPF_LSH, BPF_K, F_ALU32); } static int bpf_fill_alu32_rsh_imm(struct bpf_test *self) { - return __bpf_fill_alu_shift(self, BPF_RSH, BPF_K, true); + return __bpf_fill_alu_shift(self, BPF_RSH, BPF_K, F_ALU32); } static int bpf_fill_alu32_arsh_imm(struct bpf_test *self) { - return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_K, true); + return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_K, F_ALU32); } static int bpf_fill_alu32_lsh_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift(self, BPF_LSH, BPF_X, true); + return __bpf_fill_alu_shift(self, BPF_LSH, BPF_X, F_ALU32); } static int bpf_fill_alu32_rsh_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift(self, BPF_RSH, BPF_X, true); + return __bpf_fill_alu_shift(self, BPF_RSH, BPF_X, F_ALU32); } static int bpf_fill_alu32_arsh_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_X, true); + return __bpf_fill_alu_shift(self, BPF_ARSH, BPF_X, F_ALU32); } /* @@ -751,9 +782,9 @@ static int bpf_fill_alu32_arsh_reg(struct bpf_test *self) * for the case when the source and destination are the same. 
*/ static int __bpf_fill_alu_shift_same_reg(struct bpf_test *self, u8 op, - bool alu32) + u32 flags) { - int bits = alu32 ? 32 : 64; + int bits = (flags & F_ALU32) ? 32 : 64; int len = 3 + 6 * bits; struct bpf_insn *insn; int i = 0; @@ -770,14 +801,14 @@ static int __bpf_fill_alu_shift_same_reg(struct bpf_test *self, u8 op, /* Perform operation */ insn[i++] = BPF_ALU64_IMM(BPF_MOV, R1, val); - if (alu32) + if (flags & F_ALU32) insn[i++] = BPF_ALU32_REG(op, R1, R1); else insn[i++] = BPF_ALU64_REG(op, R1, R1); /* Compute the reference result */ - __bpf_alu_result(&res, val, val, op); - if (alu32) + __bpf_alu_result(&res, val, val, op, 0); + if (flags & F_ALU32) res = (u32)res; i += __bpf_ld_imm64(&insn[i], R2, res); @@ -798,32 +829,32 @@ static int __bpf_fill_alu_shift_same_reg(struct bpf_test *self, u8 op, static int bpf_fill_alu64_lsh_same_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift_same_reg(self, BPF_LSH, false); + return __bpf_fill_alu_shift_same_reg(self, BPF_LSH, 0); } static int bpf_fill_alu64_rsh_same_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift_same_reg(self, BPF_RSH, false); + return __bpf_fill_alu_shift_same_reg(self, BPF_RSH, 0); } static int bpf_fill_alu64_arsh_same_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift_same_reg(self, BPF_ARSH, false); + return __bpf_fill_alu_shift_same_reg(self, BPF_ARSH, 0); } static int bpf_fill_alu32_lsh_same_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift_same_reg(self, BPF_LSH, true); + return __bpf_fill_alu_shift_same_reg(self, BPF_LSH, F_ALU32); } static int bpf_fill_alu32_rsh_same_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift_same_reg(self, BPF_RSH, true); + return __bpf_fill_alu_shift_same_reg(self, BPF_RSH, F_ALU32); } static int bpf_fill_alu32_arsh_same_reg(struct bpf_test *self) { - return __bpf_fill_alu_shift_same_reg(self, BPF_ARSH, true); + return __bpf_fill_alu_shift_same_reg(self, BPF_ARSH, F_ALU32); } /* @@ -936,17 +967,20 @@ static int 
__bpf_fill_pattern(struct bpf_test *self, void *arg, static int __bpf_emit_alu64_imm(struct bpf_test *self, void *arg, struct bpf_insn *insns, s64 dst, s64 imm) { - int op = *(int *)arg; + int *a = arg; + int op = a[0]; + u32 flags = a[1]; + s16 off = (flags & F_SIGNED) ? 1 : 0; int i = 0; u64 res; if (!insns) return 7; - if (__bpf_alu_result(&res, dst, (s32)imm, op)) { + if (__bpf_alu_result(&res, dst, (s32)imm, op, flags)) { i += __bpf_ld_imm64(&insns[i], R1, dst); i += __bpf_ld_imm64(&insns[i], R3, res); - insns[i++] = BPF_ALU64_IMM(op, R1, imm); + insns[i++] = BPF_ALU64_IMM_OFF(op, R1, imm, off); insns[i++] = BPF_JMP_REG(BPF_JEQ, R1, R3, 1); insns[i++] = BPF_EXIT_INSN(); } @@ -957,17 +991,20 @@ static int __bpf_emit_alu64_imm(struct bpf_test *self, void *arg, static int __bpf_emit_alu32_imm(struct bpf_test *self, void *arg, struct bpf_insn *insns, s64 dst, s64 imm) { - int op = *(int *)arg; + int *a = arg; + int op = a[0]; + u32 flags = a[1]; + s16 off = (flags & F_SIGNED) ? 1 : 0; int i = 0; u64 res; if (!insns) return 7; - if (__bpf_alu_result(&res, (u32)dst, (u32)imm, op)) { + if (__bpf_alu_result(&res, dst, (s32)imm, op, flags | F_ALU32)) { i += __bpf_ld_imm64(&insns[i], R1, dst); i += __bpf_ld_imm64(&insns[i], R3, (u32)res); - insns[i++] = BPF_ALU32_IMM(op, R1, imm); + insns[i++] = BPF_ALU32_IMM_OFF(op, R1, imm, off); insns[i++] = BPF_JMP_REG(BPF_JEQ, R1, R3, 1); insns[i++] = BPF_EXIT_INSN(); } @@ -985,7 +1022,7 @@ static int __bpf_emit_alu64_reg(struct bpf_test *self, void *arg, if (!insns) return 9; - if (__bpf_alu_result(&res, dst, src, op)) { + if (__bpf_alu_result(&res, dst, src, op, 0)) { i += __bpf_ld_imm64(&insns[i], R1, dst); i += __bpf_ld_imm64(&insns[i], R2, src); i += __bpf_ld_imm64(&insns[i], R3, res); @@ -1007,7 +1044,7 @@ static int __bpf_emit_alu32_reg(struct bpf_test *self, void *arg, if (!insns) return 9; - if (__bpf_alu_result(&res, (u32)dst, (u32)src, op)) { + if (__bpf_alu_result(&res, (u32)dst, (u32)src, op, 0)) { i += 
__bpf_ld_imm64(&insns[i], R1, dst); i += __bpf_ld_imm64(&insns[i], R2, src); i += __bpf_ld_imm64(&insns[i], R3, (u32)res); @@ -1019,16 +1056,20 @@ static int __bpf_emit_alu32_reg(struct bpf_test *self, void *arg, return i; } -static int __bpf_fill_alu64_imm(struct bpf_test *self, int op) +static int __bpf_fill_alu64_imm(struct bpf_test *self, int op, u32 flags) { - return __bpf_fill_pattern(self, &op, 64, 32, + int arg[2] = {op, flags}; + + return __bpf_fill_pattern(self, &arg, 64, 32, PATTERN_BLOCK1, PATTERN_BLOCK2, &__bpf_emit_alu64_imm); } -static int __bpf_fill_alu32_imm(struct bpf_test *self, int op) +static int __bpf_fill_alu32_imm(struct bpf_test *self, int op, u32 flags) { - return __bpf_fill_pattern(self, &op, 64, 32, + int arg[2] = {op, flags}; + + return __bpf_fill_pattern(self, &arg, 64, 32, PATTERN_BLOCK1, PATTERN_BLOCK2, &__bpf_emit_alu32_imm); } @@ -1050,93 +1091,115 @@ static int __bpf_fill_alu32_reg(struct bpf_test *self, int op) /* ALU64 immediate operations */ static int bpf_fill_alu64_mov_imm(struct bpf_test *self) { - return __bpf_fill_alu64_imm(self, BPF_MOV); + return __bpf_fill_alu64_imm(self, BPF_MOV, 0); } static int bpf_fill_alu64_and_imm(struct bpf_test *self) { - return __bpf_fill_alu64_imm(self, BPF_AND); + return __bpf_fill_alu64_imm(self, BPF_AND, 0); } static int bpf_fill_alu64_or_imm(struct bpf_test *self) { - return __bpf_fill_alu64_imm(self, BPF_OR); + return __bpf_fill_alu64_imm(self, BPF_OR, 0); } static int bpf_fill_alu64_xor_imm(struct bpf_test *self) { - return __bpf_fill_alu64_imm(self, BPF_XOR); + return __bpf_fill_alu64_imm(self, BPF_XOR, 0); } static int bpf_fill_alu64_add_imm(struct bpf_test *self) { - return __bpf_fill_alu64_imm(self, BPF_ADD); + return __bpf_fill_alu64_imm(self, BPF_ADD, 0); } static int bpf_fill_alu64_sub_imm(struct bpf_test *self) { - return __bpf_fill_alu64_imm(self, BPF_SUB); + return __bpf_fill_alu64_imm(self, BPF_SUB, 0); } static int bpf_fill_alu64_mul_imm(struct bpf_test *self) { - return 
__bpf_fill_alu64_imm(self, BPF_MUL); + return __bpf_fill_alu64_imm(self, BPF_MUL, 0); } static int bpf_fill_alu64_div_imm(struct bpf_test *self) { - return __bpf_fill_alu64_imm(self, BPF_DIV); + return __bpf_fill_alu64_imm(self, BPF_DIV, 0); } static int bpf_fill_alu64_mod_imm(struct bpf_test *self) { - return __bpf_fill_alu64_imm(self, BPF_MOD); + return __bpf_fill_alu64_imm(self, BPF_MOD, 0); +} + +/* Signed ALU64 immediate operations */ +static int bpf_fill_alu64_sdiv_imm(struct bpf_test *self) +{ + return __bpf_fill_alu64_imm(self, BPF_DIV, F_SIGNED); +} + +static int bpf_fill_alu64_smod_imm(struct bpf_test *self) +{ + return __bpf_fill_alu64_imm(self, BPF_MOD, F_SIGNED); +} + +/* Signed ALU32 immediate operations */ +static int bpf_fill_alu32_sdiv_imm(struct bpf_test *self) +{ + return __bpf_fill_alu32_imm(self, BPF_DIV, F_SIGNED); +} + +static int bpf_fill_alu32_smod_imm(struct bpf_test *self) +{ + return __bpf_fill_alu32_imm(self, BPF_MOD, F_SIGNED); } /* ALU32 immediate operations */ static int bpf_fill_alu32_mov_imm(struct bpf_test *self) { - return __bpf_fill_alu32_imm(self, BPF_MOV); + return __bpf_fill_alu32_imm(self, BPF_MOV, 0); } static int bpf_fill_alu32_and_imm(struct bpf_test *self) { - return __bpf_fill_alu32_imm(self, BPF_AND); + return __bpf_fill_alu32_imm(self, BPF_AND, 0); } static int bpf_fill_alu32_or_imm(struct bpf_test *self) { - return __bpf_fill_alu32_imm(self, BPF_OR); + return __bpf_fill_alu32_imm(self, BPF_OR, 0); } static int bpf_fill_alu32_xor_imm(struct bpf_test *self) { - return __bpf_fill_alu32_imm(self, BPF_XOR); + return __bpf_fill_alu32_imm(self, BPF_XOR, 0); } static int bpf_fill_alu32_add_imm(struct bpf_test *self) { - return __bpf_fill_alu32_imm(self, BPF_ADD); + return __bpf_fill_alu32_imm(self, BPF_ADD, 0); } static int bpf_fill_alu32_sub_imm(struct bpf_test *self) { - return __bpf_fill_alu32_imm(self, BPF_SUB); + return __bpf_fill_alu32_imm(self, BPF_SUB, 0); } static int bpf_fill_alu32_mul_imm(struct bpf_test *self) { 
- return __bpf_fill_alu32_imm(self, BPF_MUL); + return __bpf_fill_alu32_imm(self, BPF_MUL, 0); } static int bpf_fill_alu32_div_imm(struct bpf_test *self) { - return __bpf_fill_alu32_imm(self, BPF_DIV); + return __bpf_fill_alu32_imm(self, BPF_DIV, 0); } static int bpf_fill_alu32_mod_imm(struct bpf_test *self) { - return __bpf_fill_alu32_imm(self, BPF_MOD); + return __bpf_fill_alu32_imm(self, BPF_MOD, 0); } /* ALU64 register operations */ @@ -1235,7 +1298,8 @@ static int bpf_fill_alu32_mod_reg(struct bpf_test *self) * Test JITs that implement complex ALU operations as function * calls, and must re-arrange operands for argument passing. */ -static int __bpf_fill_alu_imm_regs(struct bpf_test *self, u8 op, bool alu32) +static int __bpf_fill_alu_imm_regs(struct bpf_test *self, u8 op, + u32 flags) { int len = 2 + 10 * 10; struct bpf_insn *insns; @@ -1249,28 +1313,37 @@ static int __bpf_fill_alu_imm_regs(struct bpf_test *self, u8 op, bool alu32) return -ENOMEM; /* Operand and result values according to operation */ - if (alu32) - dst = 0x76543210U; - else - dst = 0x7edcba9876543210ULL; + if (flags & F_SIGNED) { + if (flags & F_ALU32) + dst = -76543210; + else + dst = -7654321076543210LL; + } else { + if (flags & F_ALU32) + dst = 0x76543210U; + else + dst = 0x7edcba9876543210ULL; + } imm = 0x01234567U; if (op == BPF_LSH || op == BPF_RSH || op == BPF_ARSH) imm &= 31; - __bpf_alu_result(&res, dst, imm, op); + __bpf_alu_result(&res, dst, imm, op, flags); - if (alu32) + if (flags & F_ALU32) res = (u32)res; /* Check all operand registers */ for (rd = R0; rd <= R9; rd++) { i += __bpf_ld_imm64(&insns[i], rd, dst); - if (alu32) - insns[i++] = BPF_ALU32_IMM(op, rd, imm); + s16 off = (flags & F_SIGNED) ? 
1 : 0; + + if (flags & F_ALU32) + insns[i++] = BPF_ALU32_IMM_OFF(op, rd, imm, off); else - insns[i++] = BPF_ALU64_IMM(op, rd, imm); + insns[i++] = BPF_ALU64_IMM_OFF(op, rd, imm, off); insns[i++] = BPF_JMP32_IMM(BPF_JEQ, rd, res, 2); insns[i++] = BPF_MOV64_IMM(R0, __LINE__); @@ -1295,123 +1368,145 @@ static int __bpf_fill_alu_imm_regs(struct bpf_test *self, u8 op, bool alu32) /* ALU64 K registers */ static int bpf_fill_alu64_mov_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_MOV, false); + return __bpf_fill_alu_imm_regs(self, BPF_MOV, 0); } static int bpf_fill_alu64_and_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_AND, false); + return __bpf_fill_alu_imm_regs(self, BPF_AND, 0); } static int bpf_fill_alu64_or_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_OR, false); + return __bpf_fill_alu_imm_regs(self, BPF_OR, 0); } static int bpf_fill_alu64_xor_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_XOR, false); + return __bpf_fill_alu_imm_regs(self, BPF_XOR, 0); } static int bpf_fill_alu64_lsh_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_LSH, false); + return __bpf_fill_alu_imm_regs(self, BPF_LSH, 0); } static int bpf_fill_alu64_rsh_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_RSH, false); + return __bpf_fill_alu_imm_regs(self, BPF_RSH, 0); } static int bpf_fill_alu64_arsh_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_ARSH, false); + return __bpf_fill_alu_imm_regs(self, BPF_ARSH, 0); } static int bpf_fill_alu64_add_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_ADD, false); + return __bpf_fill_alu_imm_regs(self, BPF_ADD, 0); } static int bpf_fill_alu64_sub_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_SUB, false); + return __bpf_fill_alu_imm_regs(self, BPF_SUB, 0); } static int 
bpf_fill_alu64_mul_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_MUL, false); + return __bpf_fill_alu_imm_regs(self, BPF_MUL, 0); } static int bpf_fill_alu64_div_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_DIV, false); + return __bpf_fill_alu_imm_regs(self, BPF_DIV, 0); } static int bpf_fill_alu64_mod_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_MOD, false); + return __bpf_fill_alu_imm_regs(self, BPF_MOD, 0); +} + +/* Signed ALU64 K registers */ +static int bpf_fill_alu64_sdiv_imm_regs(struct bpf_test *self) +{ + return __bpf_fill_alu_imm_regs(self, BPF_DIV, F_SIGNED); +} + +static int bpf_fill_alu64_smod_imm_regs(struct bpf_test *self) +{ + return __bpf_fill_alu_imm_regs(self, BPF_MOD, F_SIGNED); } /* ALU32 K registers */ static int bpf_fill_alu32_mov_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_MOV, true); + return __bpf_fill_alu_imm_regs(self, BPF_MOV, F_ALU32); } static int bpf_fill_alu32_and_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_AND, true); + return __bpf_fill_alu_imm_regs(self, BPF_AND, F_ALU32); } static int bpf_fill_alu32_or_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_OR, true); + return __bpf_fill_alu_imm_regs(self, BPF_OR, F_ALU32); } static int bpf_fill_alu32_xor_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_XOR, true); + return __bpf_fill_alu_imm_regs(self, BPF_XOR, F_ALU32); } static int bpf_fill_alu32_lsh_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_LSH, true); + return __bpf_fill_alu_imm_regs(self, BPF_LSH, F_ALU32); } static int bpf_fill_alu32_rsh_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_RSH, true); + return __bpf_fill_alu_imm_regs(self, BPF_RSH, F_ALU32); } static int bpf_fill_alu32_arsh_imm_regs(struct bpf_test *self) { - return 
__bpf_fill_alu_imm_regs(self, BPF_ARSH, true); + return __bpf_fill_alu_imm_regs(self, BPF_ARSH, F_ALU32); } static int bpf_fill_alu32_add_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_ADD, true); + return __bpf_fill_alu_imm_regs(self, BPF_ADD, F_ALU32); } static int bpf_fill_alu32_sub_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_SUB, true); + return __bpf_fill_alu_imm_regs(self, BPF_SUB, F_ALU32); } static int bpf_fill_alu32_mul_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_MUL, true); + return __bpf_fill_alu_imm_regs(self, BPF_MUL, F_ALU32); } static int bpf_fill_alu32_div_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_DIV, true); + return __bpf_fill_alu_imm_regs(self, BPF_DIV, F_ALU32); } static int bpf_fill_alu32_mod_imm_regs(struct bpf_test *self) { - return __bpf_fill_alu_imm_regs(self, BPF_MOD, true); + return __bpf_fill_alu_imm_regs(self, BPF_MOD, F_ALU32); +} + +/* Signed ALU32 K registers */ +static int bpf_fill_alu32_sdiv_imm_regs(struct bpf_test *self) +{ + return __bpf_fill_alu_imm_regs(self, BPF_DIV, F_ALU32 | F_SIGNED); +} + +static int bpf_fill_alu32_smod_imm_regs(struct bpf_test *self) +{ + return __bpf_fill_alu_imm_regs(self, BPF_MOD, F_ALU32 | F_SIGNED); } /* @@ -1442,8 +1537,8 @@ static int __bpf_fill_alu_reg_pairs(struct bpf_test *self, u8 op, bool alu32) if (op == BPF_LSH || op == BPF_RSH || op == BPF_ARSH) src &= 31; - __bpf_alu_result(&res, dst, src, op); - __bpf_alu_result(&same, src, src, op); + __bpf_alu_result(&res, dst, src, op, 0); + __bpf_alu_result(&same, src, src, op, 0); if (alu32) { res = (u32)res; @@ -1626,7 +1721,7 @@ static int __bpf_emit_atomic64(struct bpf_test *self, void *arg, res = src; break; default: - __bpf_alu_result(&res, dst, src, BPF_OP(op)); + __bpf_alu_result(&res, dst, src, BPF_OP(op), 0); } keep = 0x0123456789abcdefULL; @@ -1673,7 +1768,7 @@ static int __bpf_emit_atomic32(struct bpf_test 
*self, void *arg, res = src; break; default: - __bpf_alu_result(&res, (u32)dst, (u32)src, BPF_OP(op)); + __bpf_alu_result(&res, (u32)dst, (u32)src, BPF_OP(op), 0); } keep = 0x0123456789abcdefULL; @@ -1939,7 +2034,7 @@ static int __bpf_fill_atomic_reg_pairs(struct bpf_test *self, u8 width, u8 op) res = mem; break; default: - __bpf_alu_result(&res, mem, upd, BPF_OP(op)); + __bpf_alu_result(&res, mem, upd, BPF_OP(op), 0); } /* Test all operand registers */ @@ -12354,6 +12449,22 @@ static struct bpf_test tests[] = { { { 0, 1 } }, .fill_helper = bpf_fill_alu64_mod_imm_regs, }, + { + "ALU64_SDIV_K: registers", + { }, + INTERNAL, + { }, + { { 0, 1 } }, + .fill_helper = bpf_fill_alu64_sdiv_imm_regs, + }, + { + "ALU64_SMOD_K: registers", + { }, + INTERNAL, + { }, + { { 0, 1 } }, + .fill_helper = bpf_fill_alu64_smod_imm_regs, + }, /* ALU32 K registers */ { "ALU32_MOV_K: registers", @@ -12451,6 +12562,22 @@ static struct bpf_test tests[] = { { { 0, 1 } }, .fill_helper = bpf_fill_alu32_mod_imm_regs, }, + { + "ALU32_SDIV_K: registers", + { }, + INTERNAL, + { }, + { { 0, 1 } }, + .fill_helper = bpf_fill_alu32_sdiv_imm_regs, + }, + { + "ALU32_SMOD_K: registers", + { }, + INTERNAL, + { }, + { { 0, 1 } }, + .fill_helper = bpf_fill_alu32_smod_imm_regs, + }, /* ALU64 X register combinations */ { "ALU64_MOV_X: register combinations", @@ -12881,6 +13008,24 @@ static struct bpf_test tests[] = { .fill_helper = bpf_fill_alu64_mod_imm, .nr_testruns = NR_PATTERN_RUNS, }, + { + "ALU64_SDIV_K: all immediate value magnitudes", + { }, + INTERNAL | FLAG_NO_DATA, + { }, + { { 0, 1 } }, + .fill_helper = bpf_fill_alu64_sdiv_imm, + .nr_testruns = NR_PATTERN_RUNS, + }, + { + "ALU64_SMOD_K: all immediate value magnitudes", + { }, + INTERNAL | FLAG_NO_DATA, + { }, + { { 0, 1 } }, + .fill_helper = bpf_fill_alu64_smod_imm, + .nr_testruns = NR_PATTERN_RUNS, + }, /* ALU32 immediate magnitudes */ { "ALU32_MOV_K: all immediate value magnitudes", @@ -12963,6 +13108,24 @@ static struct bpf_test tests[] = { 
.fill_helper = bpf_fill_alu32_mod_imm, .nr_testruns = NR_PATTERN_RUNS, }, + { + "ALU32_SDIV_K: all immediate value magnitudes", + { }, + INTERNAL | FLAG_NO_DATA, + { }, + { { 0, 1 } }, + .fill_helper = bpf_fill_alu32_sdiv_imm, + .nr_testruns = NR_PATTERN_RUNS, + }, + { + "ALU32_SMOD_K: all immediate value magnitudes", + { }, + INTERNAL | FLAG_NO_DATA, + { }, + { { 0, 1 } }, + .fill_helper = bpf_fill_alu32_smod_imm, + .nr_testruns = NR_PATTERN_RUNS, + }, /* ALU64 register magnitudes */ { "ALU64_MOV_X: all register value magnitudes", -- cgit v1.2.3 From 9012cf2491e3c5d28d098b0d6da804af82977032 Mon Sep 17 00:00:00 2001 From: Maxim Khmelevskii Date: Tue, 14 Apr 2026 16:29:26 +0200 Subject: s390/bpf: Inline smp_processor_id and current_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inline these calls in bpf jit: - bpf_get_smp_processor_id() - bpf_get_current_task() - bpf_get_current_task_btf() s390 has an 8 KiB per-CPU prefix area in the CPU's virtual address space, called the lowcore. It is a struct that contains the cpu number and a pointer to the current task. These are exactly the values returned by the BPF helpers. Emit a load from the lowcore instead of a helper function call. JIT output for `bpf_get_smp_processor_id`: Before: After: --------------- ---------------- brasl %r14,0x3ffe0385460 ly %r14,928 lgr %r14,%r2 JIT output for `bpf_get_current_task`: Before: After: --------------- ---------------- brasl %r14,0x3ffe0362a90 lg %r14,832 lgr %r14,%r2 Benchmark using [1] on KVM(virtme-ng).
./benchs/run_bench_trigger.sh glob-arr-inc arr-inc hash-inc +---------------+--------------------+--------------------+--------------+ | Name | Before | After | % change | |---------------+--------------------+--------------------+--------------| | glob-arr-inc | 244.954 ± 0.654M/s | 278.501 ± 0.834M/s | + 13.70% | | arr-inc | 311.597 ± 1.016M/s | 313.610 ± 0.331M/s | + 0.65% | | hash-inc | 47.421 ± 0.017M/s | 47.600 ± 0.004M/s | + 0.38% | +---------------+--------------------+--------------------+--------------+ [1] https://github.com/anakryiko/linux/commit/8dec900975ef Signed-off-by: Maxim Khmelevskii Reviewed-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20260414142930.528751-1-max@linux.ibm.com Signed-off-by: Alexei Starovoitov --- arch/s390/net/bpf_jit_comp.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 94128fe6be23..14eaaa5b2185 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1777,6 +1778,30 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int j, ret; u64 func; + /* Implement helper call to bpf_get_smp_processor_id() inline */ + if (insn->src_reg == 0 && + insn->imm == BPF_FUNC_get_smp_processor_id) { + const u32 *cpu_nr = &get_lowcore()->cpu_nr; + + /* ly %b0, cpu_nr */ + EMIT6_DISP_LH(0xe3000000, 0x0058, BPF_REG_0, REG_0, REG_0, + (unsigned long)cpu_nr); + break; + } + + /* Implement helper call to bpf_get_current_task/_btf() inline */ + if (insn->src_reg == 0 && + (insn->imm == BPF_FUNC_get_current_task || + insn->imm == BPF_FUNC_get_current_task_btf)) { + const u64 *current_task = + &get_lowcore()->current_task; + + /* lg %b0, current_task */ + EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_0, REG_0, REG_0, + (unsigned long)current_task); + break; + } + ret = bpf_jit_get_func_addr(fp, insn, extra_pass, 
&func, &func_addr_fixed); if (ret < 0) @@ -3057,3 +3082,15 @@ bool bpf_jit_supports_timed_may_goto(void) { return true; } + +bool bpf_jit_inlines_helper_call(s32 imm) +{ + switch (imm) { + case BPF_FUNC_get_smp_processor_id: + case BPF_FUNC_get_current_task: + case BPF_FUNC_get_current_task_btf: + return true; + default: + return false; + } +} -- cgit v1.2.3 From 439ebd5b5708f236f7a4a9784194f7ecb77cd814 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 22 Apr 2026 12:41:06 -0700 Subject: bpf: Add sleepable support for raw tracepoint programs Rework __bpf_trace_run() to support sleepable BPF programs by using explicit RCU flavor selection, following the uprobe_prog_run() pattern. For sleepable programs, use rcu_read_lock_tasks_trace() for lifetime protection with migrate_disable(). For non-sleepable programs, use the regular rcu_read_lock_dont_migrate(). Remove the preempt_disable_notrace/preempt_enable_notrace pair from the faultable tracepoint BPF probe wrapper in bpf_probe.h, since migration protection and RCU locking are now handled per-program inside __bpf_trace_run(). Adapt bpf_prog_test_run_raw_tp() for sleepable programs: reject BPF_F_TEST_RUN_ON_CPU since sleepable programs cannot run in hardirq or preempt-disabled context, and call __bpf_prog_test_run_raw_tp() directly instead of via smp_call_function_single(). Rework __bpf_prog_test_run_raw_tp() to select RCU flavor per-program and add per-program recursion context guard for private stack safety. 
Signed-off-by: Mykyta Yatsenko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260422-sleepable_tracepoints-v13-1-99005dff21ef@meta.com Signed-off-by: Kumar Kartikeya Dwivedi --- include/trace/bpf_probe.h | 2 -- kernel/trace/bpf_trace.c | 20 ++++++++++++--- net/bpf/test_run.c | 65 ++++++++++++++++++++++++++++++++++++----------- 3 files changed, 67 insertions(+), 20 deletions(-) diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 9391d54d3f12..d1de8f9aa07f 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -58,9 +58,7 @@ static notrace void \ __bpf_trace_##call(void *__data, proto) \ { \ might_fault(); \ - preempt_disable_notrace(); \ CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \ - preempt_enable_notrace(); \ } #undef DECLARE_EVENT_SYSCALL_CLASS diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e916f0ccbed9..7276c72c1d31 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2072,11 +2072,19 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) static __always_inline void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { + struct srcu_ctr __percpu *scp = NULL; struct bpf_prog *prog = link->link.prog; + bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; - rcu_read_lock_dont_migrate(); + if (sleepable) { + scp = rcu_read_lock_tasks_trace(); + migrate_disable(); + } else { + rcu_read_lock_dont_migrate(); + } + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; @@ -2085,12 +2093,18 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) run_ctx.bpf_cookie = link->cookie; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - (void) bpf_prog_run(prog, args); + (void)bpf_prog_run(prog, args); bpf_reset_run_ctx(old_run_ctx); out: bpf_prog_put_recursion_context(prog); - rcu_read_unlock_migrate(); + + if (sleepable) { + 
migrate_enable(); + rcu_read_unlock_tasks_trace(scp); + } else { + rcu_read_unlock_migrate(); + } } #define UNPACK(...) __VA_ARGS__ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2bc04feadfab..c9aea7052ba7 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -748,14 +748,35 @@ static void __bpf_prog_test_run_raw_tp(void *data) { struct bpf_raw_tp_test_run_info *info = data; + struct srcu_ctr __percpu *scp = NULL; struct bpf_trace_run_ctx run_ctx = {}; struct bpf_run_ctx *old_run_ctx; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - rcu_read_lock(); + if (info->prog->sleepable) { + scp = rcu_read_lock_tasks_trace(); + migrate_disable(); + } else { + rcu_read_lock(); + } + + if (unlikely(!bpf_prog_get_recursion_context(info->prog))) { + bpf_prog_inc_misses_counter(info->prog); + goto out; + } + info->retval = bpf_prog_run(info->prog, info->ctx); - rcu_read_unlock(); + +out: + bpf_prog_put_recursion_context(info->prog); + + if (info->prog->sleepable) { + migrate_enable(); + rcu_read_unlock_tasks_trace(scp); + } else { + rcu_read_unlock(); + } bpf_reset_run_ctx(old_run_ctx); } @@ -783,6 +804,13 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) return -EINVAL; + /* + * Sleepable programs cannot run with preemption disabled or in + * hardirq context (smp_call_function_single), reject the flag. 
+ */ + if (prog->sleepable && (kattr->test.flags & BPF_F_TEST_RUN_ON_CPU)) + return -EINVAL; + if (ctx_size_in) { info.ctx = memdup_user(ctx_in, ctx_size_in); if (IS_ERR(info.ctx)) @@ -791,24 +819,31 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, info.ctx = NULL; } + info.retval = 0; info.prog = prog; - current_cpu = get_cpu(); - if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 || - cpu == current_cpu) { + if (prog->sleepable) { __bpf_prog_test_run_raw_tp(&info); - } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { - /* smp_call_function_single() also checks cpu_online() - * after csd_lock(). However, since cpu is from user - * space, let's do an extra quick check to filter out - * invalid value before smp_call_function_single(). - */ - err = -ENXIO; } else { - err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp, - &info, 1); + current_cpu = get_cpu(); + if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 || + cpu == current_cpu) { + __bpf_prog_test_run_raw_tp(&info); + } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { + /* + * smp_call_function_single() also checks cpu_online() + * after csd_lock(). However, since cpu is from user + * space, let's do an extra quick check to filter out + * invalid value before smp_call_function_single(). + */ + err = -ENXIO; + } else { + err = smp_call_function_single(cpu, + __bpf_prog_test_run_raw_tp, + &info, 1); + } + put_cpu(); } - put_cpu(); if (!err && copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32))) -- cgit v1.2.3 From 12628ffaf98b708a80857a462613119b9e16de4c Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 22 Apr 2026 12:41:07 -0700 Subject: bpf: Add bpf_prog_run_array_sleepable() Add bpf_prog_run_array_sleepable() for running BPF program arrays on faultable tracepoints. Unlike bpf_prog_run_array_uprobe(), it includes per-program recursion checking for private stack safety and hardcodes is_uprobe to false. Skip dummy_bpf_prog at the top of the loop. 
When bpf_prog_array_delete_safe() replaces a detached program with dummy_bpf_prog on allocation failure, the dummy is statically allocated and has NULL active, stats, and aux fields. Identify it by prog->len == 0, since every real program has at least one instruction. Keep bpf_prog_run_array_uprobe() unchanged for uprobe callers. Signed-off-by: Mykyta Yatsenko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260422-sleepable_tracepoints-v13-2-99005dff21ef@meta.com Signed-off-by: Kumar Kartikeya Dwivedi --- include/linux/bpf.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3cb6b9e70080..d3aea3931b85 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3079,6 +3079,56 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip); +static __always_inline u32 +bpf_prog_run_array_sleepable(const struct bpf_prog_array *array, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + struct bpf_prog *prog; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; + u32 ret = 1; + + if (unlikely(!array)) + return ret; + + migrate_disable(); + + run_ctx.is_uprobe = false; + + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + item = &array->items[0]; + while ((prog = READ_ONCE(item->prog))) { + /* Skip dummy_bpf_prog placeholder (len == 0) */ + if (unlikely(!prog->len)) { + item++; + continue; + } + + if (unlikely(!bpf_prog_get_recursion_context(prog))) { + bpf_prog_inc_misses_counter(prog); + bpf_prog_put_recursion_context(prog); + item++; + continue; + } + + run_ctx.bpf_cookie = item->bpf_cookie; + + if (!prog->sleepable) { + guard(rcu)(); + ret &= run_prog(prog, ctx); + } else { + ret &= run_prog(prog, ctx); + } + + 
bpf_prog_put_recursion_context(prog); + item++; + } + bpf_reset_run_ctx(old_run_ctx); + migrate_enable(); + return ret; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { -- cgit v1.2.3 From 57918341dd19e5ca8a77622ffae3db19e5ba4cc7 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 22 Apr 2026 12:41:08 -0700 Subject: bpf: Add sleepable support for classic tracepoint programs Add trace_call_bpf_faultable(), a variant of trace_call_bpf() for faultable tracepoints that supports sleepable BPF programs. It uses rcu_tasks_trace for lifetime protection and bpf_prog_run_array_sleepable() for per-program RCU flavor selection, following the uprobe_prog_run() pattern. Restructure perf_syscall_enter() and perf_syscall_exit() to run BPF programs before perf event processing. Previously, BPF ran after the per-cpu perf trace buffer was allocated under preempt_disable, requiring cleanup via perf_swevent_put_recursion_context() on filter. Now BPF runs in faultable context before preempt_disable, reading syscall arguments from local variables instead of the per-cpu trace record, removing the dependency on buffer allocation. This allows sleepable BPF programs to execute and avoids unnecessary buffer allocation when BPF filters the event. The perf event submission path (buffer allocation, fill, submit) remains under preempt_disable as before. Since BPF no longer runs within the buffer allocation context, the fake_regs output parameter to perf_trace_buf_alloc() is no longer needed and is replaced with NULL. Add an attach-time check in __perf_event_set_bpf_prog() to reject sleepable BPF_PROG_TYPE_TRACEPOINT programs on non-syscall tracepoints, since only syscall tracepoints run in faultable context. This prepares the classic tracepoint runtime and attach paths for sleepable programs. The verifier changes to allow loading sleepable BPF_PROG_TYPE_TRACEPOINT programs are in a subsequent patch. 
To: Peter Zijlstra To: Steven Rostedt Signed-off-by: Mykyta Yatsenko Acked-by: Kumar Kartikeya Dwivedi # for BPF bits Acked-by: Steven Rostedt Link: https://lore.kernel.org/bpf/20260422-sleepable_tracepoints-v13-3-99005dff21ef@meta.com Signed-off-by: Kumar Kartikeya Dwivedi --- include/linux/trace_events.h | 6 +++ kernel/events/core.c | 9 ++++ kernel/trace/bpf_trace.c | 28 +++++++++++ kernel/trace/trace_syscalls.c | 110 ++++++++++++++++++++++-------------------- 4 files changed, 101 insertions(+), 52 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 40a43a4c7caf..d49338c44014 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -770,6 +770,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); @@ -792,6 +793,11 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c return 1; } +static inline unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx) +{ + return 1; +} + static inline int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 6d1f8bad7e1c..0f9cacfa7cb8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11643,6 +11643,15 @@ static int __perf_event_set_bpf_prog(struct perf_event *event, /* only uprobe programs are allowed to be sleepable */ return -EINVAL; + if (prog->type == BPF_PROG_TYPE_TRACEPOINT && prog->sleepable) { + /* + * Sleepable tracepoint programs can only attach to faultable + * tracepoints. 
Currently only syscall tracepoints are faultable. + */ + if (!is_syscall_tp) + return -EINVAL; + } + /* Kprobe override only works for kprobes, not uprobes. */ if (prog->kprobe_override && !is_kprobe) return -EINVAL; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7276c72c1d31..a822c589c9bd 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -152,6 +152,34 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) return ret; } +/** + * trace_call_bpf_faultable - invoke BPF program in faultable context + * @call: tracepoint event + * @ctx: opaque context pointer + * + * Variant of trace_call_bpf() for faultable tracepoints (syscall + * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace + * for lifetime protection and bpf_prog_run_array_sleepable() for per-program + * RCU flavor selection, following the uprobe pattern. + * + * Per-program recursion protection is provided by + * bpf_prog_run_array_sleepable(). Global bpf_prog_active is not + * needed because syscall tracepoints cannot self-recurse. + * + * Must be called from a faultable/preemptible context. 
+ */ +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx) +{ + struct bpf_prog_array *prog_array; + + might_fault(); + guard(rcu_tasks_trace)(); + + prog_array = rcu_dereference_check(call->prog_array, + rcu_read_lock_trace_held()); + return bpf_prog_run_array_sleepable(prog_array, ctx, bpf_prog_run); +} + #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8ad72e17d8eb..e98ee7e1e66f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1371,33 +1371,33 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, +static int perf_call_bpf_enter(struct trace_event_call *call, struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) + int syscall_nr, unsigned long *args) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long args[SYSCALL_DEFINE_MAXARGS]; } __aligned(8) param; + struct pt_regs regs = {}; int i; BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. 
&param) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)&param = regs; - param.syscall_nr = rec->nr; + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(&regs); + *(struct pt_regs **)&param = &regs; + param.syscall_nr = syscall_nr; for (i = 0; i < sys_data->nb_args; i++) - param.args[i] = rec->args[i]; - return trace_call_bpf(call, &param); + param.args[i] = args[i]; + return trace_call_bpf_faultable(call, &param); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; - struct pt_regs *fake_regs; struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; @@ -1410,12 +1410,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size = 0; int uargs = 0; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1429,6 +1424,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, args); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, sys_data, + syscall_nr, args)) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled.
+ */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->enter_event->perf_events); + if (hlist_empty(head)) + return; + /* Check if this syscall event faults in user space memory */ mayfault = sys_data->user_mask != 0; @@ -1438,17 +1453,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; } - head = this_cpu_ptr(sys_data->enter_event->perf_events); - valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); - if (!valid_prog_array && hlist_empty(head)) - return; - /* get the size after alignment with the u32 buffer size field */ size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; @@ -1458,13 +1468,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (mayfault) syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); - if ((valid_prog_array && - !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, head, NULL); @@ -1514,40 +1517,35 @@ static void perf_sysenter_disable(struct trace_event_call *call) syscall_fault_buffer_disable(); } -static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, - struct syscall_trace_exit *rec) +static int perf_call_bpf_exit(struct trace_event_call *call, + int syscall_nr, long ret_val) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long ret; } __aligned(8) param; - - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. 
&param) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)&param = regs; - param.syscall_nr = rec->nr; - param.ret = rec->ret; - return trace_call_bpf(call, &param); + struct pt_regs regs = {}; + + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(&regs); + *(struct pt_regs **)&param = &regs; + param.syscall_nr = syscall_nr; + param.ret = ret_val; + return trace_call_bpf_faultable(call, &param); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; - struct pt_regs *fake_regs; struct hlist_head *head; bool valid_prog_array; int syscall_nr; int rctx; int size; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1559,29 +1557,37 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - head = this_cpu_ptr(sys_data->exit_event->perf_events); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); - if (!valid_prog_array && hlist_empty(head)) + if (valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, syscall_nr, + syscall_get_return_value(current, regs))) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled.
+ */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->exit_event->perf_events); + if (hlist_empty(head)) return; /* We can probably do that at build time */ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((valid_prog_array && - !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1, regs, head, NULL); } -- cgit v1.2.3 From 8cfb77d3092052b52582e804e644202e2b10167a Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 22 Apr 2026 12:41:09 -0700 Subject: bpf: Verifier support for sleepable tracepoint programs Allow BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_TRACEPOINT, and BPF_TRACE_RAW_TP (tp_btf) programs to be sleepable by adding them to can_be_sleepable(). For BTF-based raw tracepoints (tp_btf), add a load-time check in bpf_check_attach_target() that rejects sleepable programs attaching to non-faultable tracepoints with a descriptive error message. For raw tracepoints (raw_tp), add an attach-time check in bpf_raw_tp_link_attach() that rejects sleepable programs on non-faultable tracepoints. The attach-time check is needed because the tracepoint name is not known at load time for raw_tp. The attach-time check for classic tracepoints (tp) in __perf_event_set_bpf_prog() was added in the previous patch. Replace the verbose error message that enumerates allowed program types with a generic "Program of this type cannot be sleepable" message, since the list of sleepable-capable types keeps growing. 
Signed-off-by: Mykyta Yatsenko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260422-sleepable_tracepoints-v13-4-99005dff21ef@meta.com Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/syscall.c | 5 +++++ kernel/bpf/verifier.c | 13 +++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3c0214ca934..3b1f0ba02f61 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4281,6 +4281,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, if (!btp) return -ENOENT; + if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { + bpf_put_raw_tracepoint(btp); + return -EINVAL; + } + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 185210b73385..5b4806fdb648 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19267,6 +19267,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, btp = bpf_get_raw_tracepoint(tname); if (!btp) return -EINVAL; + if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { + bpf_log(log, "Sleepable program cannot attach to non-faultable tracepoint %s\n", + tname); + bpf_put_raw_tracepoint(btp); + return -EINVAL; + } fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL, trace_symbol); bpf_put_raw_tracepoint(btp); @@ -19483,6 +19489,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: case BPF_TRACE_FSESSION: + case BPF_TRACE_RAW_TP: return true; default: return false; @@ -19490,7 +19497,9 @@ static bool can_be_sleepable(struct bpf_prog *prog) } return prog->type == BPF_PROG_TYPE_LSM || prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || - prog->type == BPF_PROG_TYPE_STRUCT_OPS; + prog->type == BPF_PROG_TYPE_STRUCT_OPS || + prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT || + prog->type == BPF_PROG_TYPE_TRACEPOINT; } static int check_attach_btf_id(struct 
bpf_verifier_env *env) @@ -19512,7 +19521,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->sleepable && !can_be_sleepable(prog)) { - verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); + verbose(env, "Program of this type cannot be sleepable\n"); return -EINVAL; } -- cgit v1.2.3 From 0cd420a6f40c7ee4e58c5277df6bf66efcfcdf1a Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 22 Apr 2026 12:41:10 -0700 Subject: libbpf: Add section handlers for sleepable tracepoints Add SEC_DEF entries for sleepable tracepoint variants: - "tp_btf.s+" for sleepable BTF-based raw tracepoints - "raw_tp.s+" for sleepable raw tracepoints - "raw_tracepoint.s+" (alias) - "tp.s+" for sleepable classic tracepoints - "tracepoint.s+" (alias) Extract sec_name_match_prefix() to share the prefix matching logic between attach_tp() and attach_raw_tp(), eliminating duplicated loops and hardcoded strcmp() checks for bare section names. 
Signed-off-by: Mykyta Yatsenko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260422-sleepable_tracepoints-v13-5-99005dff21ef@meta.com Signed-off-by: Kumar Kartikeya Dwivedi --- tools/lib/bpf/libbpf.c | 88 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 83aae7a39d36..ab2071fdd3e8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10018,11 +10018,16 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("netkit/peer", SCHED_CLS, BPF_NETKIT_PEER, SEC_NONE), SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp), SEC_DEF("tp+", TRACEPOINT, 0, SEC_NONE, attach_tp), + SEC_DEF("tracepoint.s+", TRACEPOINT, 0, SEC_SLEEPABLE, attach_tp), + SEC_DEF("tp.s+", TRACEPOINT, 0, SEC_SLEEPABLE, attach_tp), SEC_DEF("raw_tracepoint+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), SEC_DEF("raw_tp+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tracepoint.s+", RAW_TRACEPOINT, 0, SEC_SLEEPABLE, attach_raw_tp), + SEC_DEF("raw_tp.s+", RAW_TRACEPOINT, 0, SEC_SLEEPABLE, attach_raw_tp), SEC_DEF("raw_tracepoint.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), SEC_DEF("raw_tp.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("tp_btf.s+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("fentry+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF, attach_trace), SEC_DEF("fmod_ret+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF, attach_trace), SEC_DEF("fexit+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF, attach_trace), @@ -13152,25 +13157,61 @@ struct bpf_link *bpf_program__attach_tracepoint(const struct bpf_program *prog, return bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL); } +/* + * Match section name against a prefix array. 
Returns pointer past + * "prefix/" on match, empty string for bare sections (exact prefix + * match), or NULL if no prefix matches. + */ +static const char *sec_name_match_prefix(const char *sec_name, + const char *const *prefixes, + size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + size_t pfx_len; + + if (!str_has_pfx(sec_name, prefixes[i])) + continue; + + pfx_len = strlen(prefixes[i]); + if (sec_name[pfx_len] == '\0') + return sec_name + pfx_len; + + if (sec_name[pfx_len] != '/' || sec_name[pfx_len + 1] == '\0') + continue; + + return sec_name + pfx_len + 1; + } + return NULL; +} + static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) { + static const char *const prefixes[] = { + "tp.s", + "tp", + "tracepoint.s", + "tracepoint", + }; char *sec_name, *tp_cat, *tp_name; + const char *match; *link = NULL; - /* no auto-attach for SEC("tp") or SEC("tracepoint") */ - if (strcmp(prog->sec_name, "tp") == 0 || strcmp(prog->sec_name, "tracepoint") == 0) + match = sec_name_match_prefix(prog->sec_name, prefixes, ARRAY_SIZE(prefixes)); + if (!match) { + pr_warn("prog '%s': invalid section name '%s'\n", prog->name, prog->sec_name); + return -EINVAL; + } + if (!match[0]) /* bare section name no autoattach */ + return 0; sec_name = strdup(prog->sec_name); if (!sec_name) return -ENOMEM; - /* extract "tp/<category>/<name>" or "tracepoint/<category>/<name>" */ - if (str_has_pfx(prog->sec_name, "tp/")) - tp_cat = sec_name + sizeof("tp/") - 1; - else - tp_cat = sec_name + sizeof("tracepoint/") - 1; + tp_cat = sec_name + (match - prog->sec_name); tp_name = strchr(tp_cat, '/'); if (!tp_name) { free(sec_name); @@ -13234,37 +13275,22 @@ static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf "raw_tracepoint", "raw_tp.w", "raw_tracepoint.w", + "raw_tp.s", + "raw_tracepoint.s", }; - size_t i; - const char *tp_name = NULL; + const char *match; *link = NULL; - for (i = 0; i < ARRAY_SIZE(prefixes); i++) { - size_t pfx_len; - - if
(!str_has_pfx(prog->sec_name, prefixes[i])) - continue; - - pfx_len = strlen(prefixes[i]); - /* no auto-attach case of, e.g., SEC("raw_tp") */ - if (prog->sec_name[pfx_len] == '\0') - return 0; - - if (prog->sec_name[pfx_len] != '/') - continue; - - tp_name = prog->sec_name + pfx_len + 1; - break; - } - - if (!tp_name) { - pr_warn("prog '%s': invalid section name '%s'\n", - prog->name, prog->sec_name); + match = sec_name_match_prefix(prog->sec_name, prefixes, ARRAY_SIZE(prefixes)); + if (!match) { + pr_warn("prog '%s': invalid section name '%s'\n", prog->name, prog->sec_name); return -EINVAL; } + if (!match[0]) + return 0; - *link = bpf_program__attach_raw_tracepoint(prog, tp_name); + *link = bpf_program__attach_raw_tracepoint(prog, match); return libbpf_get_error(*link); } -- cgit v1.2.3 From 8a20655749c625dcc4debdfdeeaa0cf8bb85c203 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 22 Apr 2026 12:41:11 -0700 Subject: selftests/bpf: Add tests for sleepable tracepoint programs Cover all three sleepable tracepoint types (tp_btf.s, raw_tp.s, tp.s) and sys_exit (via bpf_task_pt_regs) with functional tests using bpf_copy_from_user() on getcwd. Verify alias and bare SEC variants, bpf_prog_test_run_raw_tp() with BPF_F_TEST_RUN_ON_CPU rejection, attach-time rejection on non-faultable tracepoints, and load-time rejection for sleepable tp_btf on non-faultable tracepoints. 
Signed-off-by: Mykyta Yatsenko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260422-sleepable_tracepoints-v13-6-99005dff21ef@meta.com Signed-off-by: Kumar Kartikeya Dwivedi --- .../bpf/prog_tests/sleepable_tracepoints.c | 142 +++++++++++++++++++++ .../bpf/progs/test_sleepable_tracepoints.c | 112 ++++++++++++++++ .../bpf/progs/test_sleepable_tracepoints_fail.c | 18 +++ tools/testing/selftests/bpf/verifier/sleepable.c | 17 ++- 4 files changed, 287 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c create mode 100644 tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c create mode 100644 tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c diff --git a/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c b/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c new file mode 100644 index 000000000000..19500b785ee3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include "test_sleepable_tracepoints.skel.h" +#include "test_sleepable_tracepoints_fail.skel.h" + +static void run_test(struct test_sleepable_tracepoints *skel) +{ + char buf[PATH_MAX] = "/"; + + skel->bss->target_pid = getpid(); + skel->bss->prog_triggered = 0; + skel->bss->err = 0; + skel->bss->copied_byte = 0; + + syscall(__NR_getcwd, buf, sizeof(buf)); + + ASSERT_EQ(skel->bss->prog_triggered, 1, "prog_triggered"); + ASSERT_EQ(skel->bss->err, 0, "err"); + ASSERT_EQ(skel->bss->copied_byte, '/', "copied_byte"); +} + +static void run_auto_attach_test(struct bpf_program *prog, + struct test_sleepable_tracepoints *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "prog_attach")) + return; + + run_test(skel); + bpf_link__destroy(link); +} + +static void test_attach_only(struct bpf_program *prog) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (ASSERT_OK_PTR(link, "attach")) + bpf_link__destroy(link); +} + +static void test_attach_reject(struct bpf_program *prog) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (!ASSERT_ERR_PTR(link, "attach_should_fail")) + bpf_link__destroy(link); +} + +static void test_raw_tp_bare(struct test_sleepable_tracepoints *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach_raw_tracepoint(skel->progs.handle_raw_tp_bare, + "sys_enter"); + if (ASSERT_OK_PTR(link, "attach")) + bpf_link__destroy(link); +} + +static void test_tp_bare(struct test_sleepable_tracepoints *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach_tracepoint(skel->progs.handle_tp_bare, + "syscalls", "sys_enter_getcwd"); + if (ASSERT_OK_PTR(link, "attach")) + bpf_link__destroy(link); +} + +static void test_test_run(struct test_sleepable_tracepoints *skel) +{ + __u64 args[2] = {0x1234ULL, 0x5678ULL}; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .ctx_in = args, + .ctx_size_in = sizeof(args), + ); + int fd, err; + + fd = 
bpf_program__fd(skel->progs.handle_test_run); + err = bpf_prog_test_run_opts(fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, args[0] + args[1], "test_run_retval"); +} + +static void test_test_run_on_cpu_reject(struct test_sleepable_tracepoints *skel) +{ + __u64 args[2] = {}; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .ctx_in = args, + .ctx_size_in = sizeof(args), + .flags = BPF_F_TEST_RUN_ON_CPU, + ); + int fd, err; + + fd = bpf_program__fd(skel->progs.handle_test_run); + err = bpf_prog_test_run_opts(fd, &topts); + ASSERT_ERR(err, "test_run_on_cpu_reject"); +} + +void test_sleepable_tracepoints(void) +{ + struct test_sleepable_tracepoints *skel; + + skel = test_sleepable_tracepoints__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (test__start_subtest("tp_btf")) + run_auto_attach_test(skel->progs.handle_sys_enter_tp_btf, skel); + if (test__start_subtest("raw_tp")) + run_auto_attach_test(skel->progs.handle_sys_enter_raw_tp, skel); + if (test__start_subtest("tracepoint")) + run_auto_attach_test(skel->progs.handle_sys_enter_tp, skel); + if (test__start_subtest("sys_exit")) + run_auto_attach_test(skel->progs.handle_sys_exit_tp, skel); + if (test__start_subtest("tracepoint_alias")) + test_attach_only(skel->progs.handle_sys_enter_tp_alias); + if (test__start_subtest("raw_tracepoint_alias")) + test_attach_only(skel->progs.handle_sys_enter_raw_tp_alias); + if (test__start_subtest("raw_tp_bare")) + test_raw_tp_bare(skel); + if (test__start_subtest("tp_bare")) + test_tp_bare(skel); + if (test__start_subtest("test_run")) + test_test_run(skel); + if (test__start_subtest("test_run_on_cpu_reject")) + test_test_run_on_cpu_reject(skel); + if (test__start_subtest("raw_tp_non_faultable")) + test_attach_reject(skel->progs.handle_raw_tp_non_faultable); + if (test__start_subtest("tp_non_syscall")) + test_attach_reject(skel->progs.handle_tp_non_syscall); + if (test__start_subtest("tp_btf_non_faultable_reject")) + 
RUN_TESTS(test_sleepable_tracepoints_fail); + + test_sleepable_tracepoints__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c new file mode 100644 index 000000000000..254f7fd895d9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +int target_pid; +int prog_triggered; +long err; +char copied_byte; + +static int copy_getcwd_arg(char *ubuf) +{ + err = bpf_copy_from_user(&copied_byte, sizeof(copied_byte), ubuf); + if (err) + return err; + + prog_triggered = 1; + return 0; +} + +SEC("tp_btf.s/sys_enter") +int BPF_PROG(handle_sys_enter_tp_btf, struct pt_regs *regs, long id) +{ + if ((bpf_get_current_pid_tgid() >> 32) != target_pid || + id != __NR_getcwd) + return 0; + + return copy_getcwd_arg((void *)PT_REGS_PARM1_SYSCALL(regs)); +} + +SEC("raw_tp.s/sys_enter") +int BPF_PROG(handle_sys_enter_raw_tp, struct pt_regs *regs, long id) +{ + if ((bpf_get_current_pid_tgid() >> 32) != target_pid || + id != __NR_getcwd) + return 0; + + return copy_getcwd_arg((void *)PT_REGS_PARM1_CORE_SYSCALL(regs)); +} + +SEC("tp.s/syscalls/sys_enter_getcwd") +int handle_sys_enter_tp(struct syscall_trace_enter *args) +{ + if ((bpf_get_current_pid_tgid() >> 32) != target_pid) + return 0; + + return copy_getcwd_arg((void *)args->args[0]); +} + +SEC("tp.s/syscalls/sys_exit_getcwd") +int handle_sys_exit_tp(struct syscall_trace_exit *args) +{ + struct pt_regs *regs; + + if ((bpf_get_current_pid_tgid() >> 32) != target_pid) + return 0; + + regs = (struct pt_regs *)bpf_task_pt_regs(bpf_get_current_task_btf()); + return copy_getcwd_arg((void *)PT_REGS_PARM1_CORE_SYSCALL(regs)); +} + +SEC("raw_tp.s") +int BPF_PROG(handle_raw_tp_bare, struct pt_regs *regs, 
long id) +{ + return 0; +} + +SEC("tp.s") +int handle_tp_bare(void *ctx) +{ + return 0; +} + +SEC("tracepoint.s/syscalls/sys_enter_getcwd") +int handle_sys_enter_tp_alias(struct syscall_trace_enter *args) +{ + return 0; +} + +SEC("raw_tracepoint.s/sys_enter") +int BPF_PROG(handle_sys_enter_raw_tp_alias, struct pt_regs *regs, long id) +{ + return 0; +} + +SEC("raw_tp.s/sys_enter") +int BPF_PROG(handle_test_run, struct pt_regs *regs, long id) +{ + if ((__u64)regs == 0x1234ULL && (__u64)id == 0x5678ULL) + return (__u64)regs + (__u64)id; + + return 0; +} + +SEC("raw_tp.s/sched_switch") +int BPF_PROG(handle_raw_tp_non_faultable, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + return 0; +} + +SEC("tp.s/sched/sched_switch") +int handle_tp_non_syscall(void *ctx) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c new file mode 100644 index 000000000000..1a0748a9520b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +/* Sleepable program on a non-faultable tracepoint should fail to load */ +SEC("tp_btf.s/sched_switch") +__failure __msg("Sleepable program cannot attach to non-faultable tracepoint") +int BPF_PROG(handle_sched_switch, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/verifier/sleepable.c b/tools/testing/selftests/bpf/verifier/sleepable.c index c2b7f5ebf168..6dabc5522945 100644 --- a/tools/testing/selftests/bpf/verifier/sleepable.c +++ b/tools/testing/selftests/bpf/verifier/sleepable.c @@ -76,7 +76,20 @@ .runs = -1, }, { - "sleepable raw tracepoint reject", + "sleepable raw tracepoint accept", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACING, + .expected_attach_type = BPF_TRACE_RAW_TP, + .kfunc = "sys_enter", + .result = ACCEPT, + .flags = BPF_F_SLEEPABLE, + .runs = -1, +}, +{ + "sleepable raw tracepoint reject non-faultable", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), @@ -85,7 +98,7 @@ .expected_attach_type = BPF_TRACE_RAW_TP, .kfunc = "sched_switch", .result = REJECT, - .errstr = "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable", + .errstr = "Sleepable program cannot attach to non-faultable tracepoint", .flags = BPF_F_SLEEPABLE, .runs = -1, }, -- cgit v1.2.3 From a20f97791a786203821570e84941ee7a67fd53e9 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Mon, 20 Apr 2026 15:46:37 +0200 Subject: selftests/bpf: Page out as late as possible in file_reader The file_reader/on_open_expect_fault fails consistently on my system. It expects a page fault on first dynptr read of some range the exe file of the current process because it has paged out that page range earlier. 
However a lot can happen to that range (which depending on the actual memory layout could contain text section, data section, sections related to dynamic linking...) between the moment it was paged out and the moment the bpf program expected to hit a pagefault actually runs. A bit of instrumentation with mincore() shows that pages from that range were accessed several times before the program is run. In particular the call of file_reader__load() seems to fault all the range in. Move the call to madvise(MADV_PAGEOUT) to just before attaching the program to minimize the risk of having those pages pulled back in from under our feet. Signed-off-by: Jerome Marchand Acked-by: Mykyta Yatsenko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20260420134637.2513867-1-jmarchan@redhat.com Signed-off-by: Kumar Kartikeya Dwivedi --- .../testing/selftests/bpf/prog_tests/file_reader.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/file_reader.c b/tools/testing/selftests/bpf/prog_tests/file_reader.c index 5cde32b35da4..48aae7ea0e4b 100644 --- a/tools/testing/selftests/bpf/prog_tests/file_reader.c +++ b/tools/testing/selftests/bpf/prog_tests/file_reader.c @@ -10,6 +10,7 @@ const char *user_ptr = "hello world"; char file_contents[256000]; +void *addr; void *get_executable_base_addr(void) { @@ -26,8 +27,7 @@ void *get_executable_base_addr(void) static int initialize_file_contents(void) { int fd, page_sz = sysconf(_SC_PAGESIZE); - ssize_t n = 0, cur, off; - void *addr; + ssize_t n = 0, cur; fd = open("/proc/self/exe", O_RDONLY); if (!ASSERT_OK_FD(fd, "Open /proc/self/exe\n")) @@ -52,16 +52,6 @@ static int initialize_file_contents(void) /* page-align base file address */ addr = (void *)((unsigned long)addr & ~(page_sz - 1)); - /* - * Page out range 0..512K, use 0..256K for positive tests and - * 256K..512K for negative tests expecting page faults - */ - for (off = 0; off < sizeof(file_contents) * 2; 
off += page_sz) { - if (!ASSERT_OK(madvise(addr + off, page_sz, MADV_PAGEOUT), - "madvise pageout")) - return errno; - } - return 0; } @@ -90,6 +80,14 @@ static void run_test(const char *prog_name) if (!ASSERT_OK(err, "file_reader__load")) goto cleanup; + /* + * Page out range 0..512K, use 0..256K for positive tests and + * 256K..512K for negative tests expecting page faults + */ + if (!ASSERT_OK(madvise(addr, sizeof(file_contents) * 2, MADV_PAGEOUT), + "madvise pageout")) + goto cleanup; + err = file_reader__attach(skel); if (!ASSERT_OK(err, "file_reader__attach")) goto cleanup; -- cgit v1.2.3 From 0831b110eb4591e4ad8c5fd0d8f0f3f9979a5ff5 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 17 Apr 2026 10:33:17 +0200 Subject: libbpf: Fix deduplication of typedef with base definitions When deduplicating definitions for a module, typedef defined in the base are not removed. This is because the hash used for base types differs from the one used in the deduplication logic in btf_dedup_struct_type. This was introduced by the referenced commit when moving the typedef deduplication logic handling from btf_dedup_ref_type to btf_dedup_struct_type, as this also changed the hash logic (btf_hash_common to btf_hash_typedef). This also impacts other types referencing those typedef (e.g. const). In my test, the BTF section size of the openvswitch module went from 31KB to 45KB. Fixes: 3781413465df ("libbpf: Fix BTF dedup to support recursive typedef definitions"). 
Signed-off-by: Antoine Tenart Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/20260417083319.32716-1-atenart@kernel.org --- tools/lib/bpf/btf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 267904939098..823bce895178 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4592,12 +4592,14 @@ static int btf_dedup_prep(struct btf_dedup *d) case BTF_KIND_RESTRICT: case BTF_KIND_PTR: case BTF_KIND_FWD: - case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: case BTF_KIND_FLOAT: case BTF_KIND_TYPE_TAG: h = btf_hash_common(t); break; + case BTF_KIND_TYPEDEF: + h = btf_hash_typedef(t); + break; case BTF_KIND_INT: case BTF_KIND_DECL_TAG: h = btf_hash_int_decl_tag(t); -- cgit v1.2.3 From 1980023d759decc4b5647718d72c94385925fe9c Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 17 Apr 2026 10:33:18 +0200 Subject: selftests/bpf: Ensure typedef are deduplicated in split BTF If a typedef is defined both in a base and in a split BTF, after deduplication a single instance should be found in the base BTF. 
Suggested-by: Alan Maguire Signed-off-by: Antoine Tenart Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260417083319.32716-2-atenart@kernel.org --- .../selftests/bpf/prog_tests/btf_dedup_split.c | 48 ++++++++++++++-------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c index 6bc31236805c..9d6161151593 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c @@ -20,18 +20,22 @@ static void test_split_simple() { btf__add_struct(btf1, "s1", 4); /* [3] struct s1 { */ btf__add_field(btf1, "f1", 1, 0, 0); /* int f1; */ /* } */ + btf__add_typedef(btf1, "t1", 1); /* [4] typedef int */ VALIDATE_RAW_BTF( btf1, "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", "[2] PTR '(anon)' type_id=1", "[3] STRUCT 's1' size=4 vlen=1\n" - "\t'f1' type_id=1 bits_offset=0"); + "\t'f1' type_id=1 bits_offset=0", + "[4] TYPEDEF 't1' type_id=1"); ASSERT_STREQ(btf_type_c_dump(btf1), "\ struct s1 {\n\ int f1;\n\ -};\n\n", "c_dump"); +};\n\ +\n\ +typedef int t1;\n\n", "c_dump"); btf2 = btf__new_empty_split(btf1); if (!ASSERT_OK_PTR(btf2, "empty_split_btf")) @@ -49,39 +53,46 @@ struct s1 {\n\ ASSERT_EQ(btf_is_int(t), true, "int_kind"); ASSERT_STREQ(btf__str_by_offset(btf2, t->name_off), "int", "int_name"); - btf__add_struct(btf2, "s2", 16); /* [4] struct s2 { */ - btf__add_field(btf2, "f1", 6, 0, 0); /* struct s1 f1; */ - btf__add_field(btf2, "f2", 5, 32, 0); /* int f2; */ + btf__add_struct(btf2, "s2", 16); /* [5] struct s2 { */ + btf__add_field(btf2, "f1", 7, 0, 0); /* struct s1 f1; */ + btf__add_field(btf2, "f2", 6, 32, 0); /* int f2; */ btf__add_field(btf2, "f3", 2, 64, 0); /* int *f3; */ /* } */ /* duplicated int */ - btf__add_int(btf2, "int", 4, BTF_INT_SIGNED); /* [5] int */ + btf__add_int(btf2, "int", 4, BTF_INT_SIGNED); /* [6] int */ /* duplicated 
struct s1 */ - btf__add_struct(btf2, "s1", 4); /* [6] struct s1 { */ - btf__add_field(btf2, "f1", 5, 0, 0); /* int f1; */ + btf__add_struct(btf2, "s1", 4); /* [7] struct s1 { */ + btf__add_field(btf2, "f1", 6, 0, 0); /* int f1; */ /* } */ + /* duplicated typedef t1 */ + btf__add_typedef(btf2, "t1", 6); /* [8] typedef int */ + VALIDATE_RAW_BTF( btf2, "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", "[2] PTR '(anon)' type_id=1", "[3] STRUCT 's1' size=4 vlen=1\n" "\t'f1' type_id=1 bits_offset=0", - "[4] STRUCT 's2' size=16 vlen=3\n" - "\t'f1' type_id=6 bits_offset=0\n" - "\t'f2' type_id=5 bits_offset=32\n" + "[4] TYPEDEF 't1' type_id=1", + "[5] STRUCT 's2' size=16 vlen=3\n" + "\t'f1' type_id=7 bits_offset=0\n" + "\t'f2' type_id=6 bits_offset=32\n" "\t'f3' type_id=2 bits_offset=64", - "[5] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", - "[6] STRUCT 's1' size=4 vlen=1\n" - "\t'f1' type_id=5 bits_offset=0"); + "[6] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[7] STRUCT 's1' size=4 vlen=1\n" + "\t'f1' type_id=6 bits_offset=0", + "[8] TYPEDEF 't1' type_id=6"); ASSERT_STREQ(btf_type_c_dump(btf2), "\ struct s1 {\n\ int f1;\n\ };\n\ \n\ +typedef int t1;\n\ +\n\ struct s1___2 {\n\ int f1;\n\ };\n\ @@ -90,7 +101,9 @@ struct s2 {\n\ struct s1___2 f1;\n\ int f2;\n\ int *f3;\n\ -};\n\n", "c_dump"); +};\n\ +\n\ +typedef int t1___2;\n\n", "c_dump"); err = btf__dedup(btf2, NULL); if (!ASSERT_OK(err, "btf_dedup")) @@ -102,7 +115,8 @@ struct s2 {\n\ "[2] PTR '(anon)' type_id=1", "[3] STRUCT 's1' size=4 vlen=1\n" "\t'f1' type_id=1 bits_offset=0", - "[4] STRUCT 's2' size=16 vlen=3\n" + "[4] TYPEDEF 't1' type_id=1", + "[5] STRUCT 's2' size=16 vlen=3\n" "\t'f1' type_id=3 bits_offset=0\n" "\t'f2' type_id=1 bits_offset=32\n" "\t'f3' type_id=2 bits_offset=64"); @@ -112,6 +126,8 @@ struct s1 {\n\ int f1;\n\ };\n\ \n\ +typedef int t1;\n\ +\n\ struct s2 {\n\ struct s1 f1;\n\ int f2;\n\ -- cgit v1.2.3 From 7f843c0584f438c1cc8cbe798ca8ab4207e67509 
Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Tue, 21 Apr 2026 16:33:29 +0200 Subject: selftests/bpf: Fix uprobe_multi usage message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The uprobe_multi usage message is not in sync with the list of subtests it actually supports. Add the missing subtests in the help message. Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/bpf/20260421-uprobe_multi_usage-v1-1-4c51675955e6@bootlin.com Signed-off-by: Kumar Kartikeya Dwivedi --- tools/testing/selftests/bpf/uprobe_multi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c index 3e58a86b8e25..0af330b6c364 100644 --- a/tools/testing/selftests/bpf/uprobe_multi.c +++ b/tools/testing/selftests/bpf/uprobe_multi.c @@ -144,6 +144,8 @@ int main(int argc, char **argv) return trigger_uprobe(true /* page-in build ID */); error: - fprintf(stderr, "usage: %s \n", argv[0]); + fprintf(stderr, + "usage: %s \n", + argv[0]); return -1; } -- cgit v1.2.3 From a7088176d8299ff74276a89dbdef3c5ce8748eeb Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:30 -0700 Subject: bpf: Remove unused parameter from check_map_kptr_access() The parameter 'regno' in check_map_kptr_access() is unused. Remove it. 
Acked-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033430.2537615-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5b4806fdb648..6118743d87e6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4710,7 +4710,7 @@ static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno, return 0; } -static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, +static int check_map_kptr_access(struct bpf_verifier_env *env, int value_regno, int insn_idx, struct btf_field *kptr_field) { @@ -6357,7 +6357,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn kptr_field = btf_record_find(reg->map_ptr->record, off + reg->var_off.value, BPF_KPTR | BPF_UPTR); if (kptr_field) { - err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); + err = check_map_kptr_access(env, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; -- cgit v1.2.3 From 54c27ea6dadbe932a955bf40b7837947c5c202e1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:35 -0700 Subject: bpf: Fix tail_call_reachable leak In check_max_stack_depth_subprog(), the local variable tail_call_reachable is set when entering a callee that has a tail call, but never reset when popping back to the parent. This causes the flag to leak across sibling subprogs in the DFS traversal. This results in unnecessary JIT overhead: the JIT emits tail call counter preservation code for subprogs that can never be reached via a tail call path. Fix this by resetting tail_call_reachable to the parent's actual per-subprog flag when popping a frame. 
If the parent was already marked tail_call_reachable by a previous sibling's traversal, the local variable stays true. Otherwise it resets to false, so subsequent siblings start with a clean state. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033435.2538013-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6118743d87e6..26b6cdfd8613 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5490,6 +5490,9 @@ continue_func: frame = dinfo[idx].frame; i = dinfo[idx].ret_insn; + /* reset tail_call_reachable to the parent's actual state */ + tail_call_reachable = subprog[idx].tail_call_reachable; + goto continue_func; } -- cgit v1.2.3 From 8a16c5b2b22ff22c1a2e3ff92cc991990efceb38 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:41 -0700 Subject: bpf: Remove WARN_ON_ONCE in check_kfunc_mem_size_reg() The warning is too late if it does happen. Remove it. 
Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033441.2538149-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 26b6cdfd8613..d123449f5552 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7134,8 +7134,6 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg struct bpf_call_arg_meta meta; int err; - WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); - memset(&meta, 0, sizeof(meta)); if (may_be_null) { -- cgit v1.2.3 From 6a581b856c9e633c615483ad53910cba8cacbf28 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:46 -0700 Subject: bpf: Refactor to avoid redundant calculation of bpf_reg_state In many cases, once a bpf_reg_state is defined, it can pass to callee's. Otherwise, callee will need to get bpf_reg_state again based on regno. More importantly, this is needed for later stack arguments for kfuncs since the register state for stack arguments does not have a corresponding regno. So it makes sense to pass reg state for callee's. 
The following is the only change to avoid compilation warning: static int sanitize_check_bounds(struct bpf_verifier_env *env, const struct bpf_insn *insn, - const struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg) Acked-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Reviewed-by: Amery Hung Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033446.2538321-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 223 ++++++++++++++++++++++---------------------------- 1 file changed, 98 insertions(+), 125 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d123449f5552..debae6133e4c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3908,7 +3908,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return 0; } -/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is +/* Write the stack: 'stack[ptr_reg + off] = value_regno'. 'ptr_reg' is * known to contain a variable offset. 
* This function checks whether the write is permitted and conservatively * tracks the effects of the write, considering that each stack slot in the @@ -3929,13 +3929,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, static int check_stack_write_var_off(struct bpf_verifier_env *env, /* func where register points to */ struct bpf_func_state *state, - int ptr_regno, int off, int size, + struct bpf_reg_state *ptr_reg, int off, int size, int value_regno, int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int min_off, max_off; int i, err; - struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + struct bpf_reg_state *value_reg = NULL; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; bool writing_zero = false; /* set if the fact that we're writing a zero is used to let any @@ -3944,7 +3944,6 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, bool zero_used = false; cur = env->cur_state->frame[env->cur_state->curframe]; - ptr_reg = &cur->regs[ptr_regno]; min_off = ptr_reg->smin_value + off; max_off = ptr_reg->smax_value + off + size; if (value_regno >= 0) @@ -4241,7 +4240,7 @@ enum bpf_access_src { ACCESS_HELPER = 2, /* the access is performed by a helper */ }; -static int check_stack_range_initialized(struct bpf_verifier_env *env, +static int check_stack_range_initialized(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, @@ -4252,31 +4251,29 @@ static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) return cur_regs(env) + regno; } -/* Read the stack at 'ptr_regno + off' and put the result into the register +/* Read the stack at 'reg + off' and put the result into the register * 'dst_regno'. - * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), + * 'off' includes the pointer register's fixed offset(i.e. 'reg->off'), * but not its variable offset. 
* 'size' is assumed to be <= reg size and the access is assumed to be aligned. * * As opposed to check_stack_read_fixed_off, this function doesn't deal with * filling registers (i.e. reads of spilled register cannot be detected when * the offset is not fixed). We conservatively mark 'dst_regno' as containing - * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable + * SCALAR_VALUE. That's why we assert that the 'reg' has a variable * offset; for a fixed offset check_stack_read_fixed_off should be used * instead. */ -static int check_stack_read_var_off(struct bpf_verifier_env *env, +static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int ptr_regno, int off, int size, int dst_regno) { - /* The state of the source register. */ - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *ptr_state = bpf_func(env, reg); int err; int min_off, max_off; /* Note that we pass a NULL meta, so raw access will not be permitted. */ - err = check_stack_range_initialized(env, ptr_regno, off, size, + err = check_stack_range_initialized(env, reg, ptr_regno, off, size, false, BPF_READ, NULL); if (err) return err; @@ -4298,10 +4295,9 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, * can be -1, meaning that the read value is not going to a register. */ static int check_stack_read(struct bpf_verifier_env *env, - int ptr_regno, int off, int size, + struct bpf_reg_state *reg, int ptr_regno, int off, int size, int dst_regno) { - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *state = bpf_func(env, reg); int err; /* Some accesses are only permitted with a static offset. */ @@ -4337,7 +4333,7 @@ static int check_stack_read(struct bpf_verifier_env *env, * than fixed offset ones. Note that dst_regno >= 0 on this * branch. 
*/ - err = check_stack_read_var_off(env, ptr_regno, off, size, + err = check_stack_read_var_off(env, reg, ptr_regno, off, size, dst_regno); } return err; @@ -4347,17 +4343,16 @@ static int check_stack_read(struct bpf_verifier_env *env, /* check_stack_write dispatches to check_stack_write_fixed_off or * check_stack_write_var_off. * - * 'ptr_regno' is the register used as a pointer into the stack. + * 'reg' is the register used as a pointer into the stack. * 'value_regno' is the register whose value we're writing to the stack. It can * be -1, meaning that we're not writing from a register. * * The caller must ensure that the offset falls within the maximum stack size. */ static int check_stack_write(struct bpf_verifier_env *env, - int ptr_regno, int off, int size, + struct bpf_reg_state *reg, int off, int size, int value_regno, int insn_idx) { - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *state = bpf_func(env, reg); int err; @@ -4370,16 +4365,15 @@ static int check_stack_write(struct bpf_verifier_env *env, * than fixed offset ones. 
*/ err = check_stack_write_var_off(env, state, - ptr_regno, off, size, + reg, off, size, value_regno, insn_idx); } return err; } -static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, +static int check_map_access_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int off, int size, enum bpf_access_type type) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_map *map = reg->map_ptr; u32 cap = bpf_map_flags_to_cap(map); @@ -4399,17 +4393,15 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, } /* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ -static int __check_mem_access(struct bpf_verifier_env *env, int regno, +static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int off, int size, u32 mem_size, bool zero_size_allowed) { bool size_ok = size > 0 || (size == 0 && zero_size_allowed); - struct bpf_reg_state *reg; if (off >= 0 && size_ok && (u64)off + size <= mem_size) return 0; - reg = &cur_regs(env)[regno]; switch (reg->type) { case PTR_TO_MAP_KEY: verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", @@ -4439,13 +4431,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, } /* check read/write into a memory region with possible variable offset */ -static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, +static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, int off, int size, u32 mem_size, bool zero_size_allowed) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg = &state->regs[regno]; int err; /* We may have adjusted the register pointing to memory region, so we @@ -4466,7 +4455,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_mem_access(env, 
regno, reg->smin_value + off, size, + err = __check_mem_access(env, reg, regno, reg->smin_value + off, size, mem_size, zero_size_allowed); if (err) { verbose(env, "R%d min value is outside of the allowed memory range\n", @@ -4483,7 +4472,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_mem_access(env, regno, reg->umax_value + off, size, + err = __check_mem_access(env, reg, regno, reg->umax_value + off, size, mem_size, zero_size_allowed); if (err) { verbose(env, "R%d max value is outside of the allowed memory range\n", @@ -4787,19 +4776,16 @@ static u32 map_mem_size(const struct bpf_map *map) } /* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, +static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, int off, int size, bool zero_size_allowed, enum bpf_access_src src) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; u32 mem_size = map_mem_size(map); struct btf_record *rec; int err, i; - err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed); + err = check_mem_region_access(env, reg, regno, off, size, mem_size, zero_size_allowed); if (err) return err; @@ -4895,10 +4881,9 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, +static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->range < 0) { @@ -4906,7 +4891,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return -EINVAL; } - err = 
check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed); + err = check_mem_region_access(env, reg, regno, off, size, reg->range, zero_size_allowed); if (err) return err; @@ -4961,7 +4946,7 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of return -EACCES; } -static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, +static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, u32 regno, int off, int access_size, enum bpf_access_type t, struct bpf_insn_access_aux *info) { @@ -4971,12 +4956,10 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regn */ bool var_off_ok = is_var_ctx_off_allowed(env->prog); bool fixed_off_ok = !env->ops->convert_ctx_access; - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; int err; if (var_off_ok) - err = check_mem_region_access(env, regno, off, access_size, U16_MAX, false); + err = check_mem_region_access(env, reg, regno, off, access_size, U16_MAX, false); else err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok); if (err) @@ -5002,10 +4985,9 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, } static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, - u32 regno, int off, int size, + struct bpf_reg_state *reg, u32 regno, int off, int size, enum bpf_access_type t) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_insn_access_aux info = {}; bool valid; @@ -5971,12 +5953,11 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, } static int check_ptr_to_btf_access(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, + struct bpf_reg_state *regs, struct bpf_reg_state *reg, int regno, int off, int size, enum bpf_access_type atype, int value_regno) { - struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = 
btf_name_by_offset(reg->btf, t->name_off); const char *field_name = NULL; @@ -6128,12 +6109,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } static int check_ptr_to_map_access(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, + struct bpf_reg_state *regs, struct bpf_reg_state *reg, int regno, int off, int size, enum bpf_access_type atype, int value_regno) { - struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; struct bpf_reg_state map_reg; enum bpf_type_flag flag = 0; @@ -6222,11 +6202,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, * 'off' includes `regno->offset`, but not its dynamic part (if any). */ static int check_stack_access_within_bounds( - struct bpf_verifier_env *env, + struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int off, int access_size, enum bpf_access_type type) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = bpf_func(env, reg); s64 min_off, max_off; int err; @@ -6314,12 +6293,11 @@ static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val) * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once, bool is_ldsx) { struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -6336,7 +6314,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_mem_region_access(env, regno, off, size, + err = check_mem_region_access(env, reg, regno, off, size, reg->map_ptr->key_size, false); if 
(err) return err; @@ -6350,10 +6328,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - err = check_map_access_type(env, regno, off, size, t); + err = check_map_access_type(env, reg, off, size, t); if (err) return err; - err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT); + err = check_map_access(env, reg, regno, off, size, false, ACCESS_DIRECT); if (err) return err; if (tnum_is_const(reg->var_off)) @@ -6422,7 +6400,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * instructions, hence no need to check bounds in that case. */ if (!rdonly_untrusted) - err = check_mem_region_access(env, regno, off, size, + err = check_mem_region_access(env, reg, regno, off, size, reg->mem_size, false); if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); @@ -6440,7 +6418,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_access(env, insn_idx, regno, off, size, t, &info); + err = check_ctx_access(env, insn_idx, reg, regno, off, size, t, &info); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter @@ -6477,15 +6455,15 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == PTR_TO_STACK) { /* Basic bounds checks. 
*/ - err = check_stack_access_within_bounds(env, regno, off, size, t); + err = check_stack_access_within_bounds(env, reg, regno, off, size, t); if (err) return err; if (t == BPF_READ) - err = check_stack_read(env, regno, off, size, + err = check_stack_read(env, reg, regno, off, size, value_regno); else - err = check_stack_write(env, regno, off, size, + err = check_stack_write(env, reg, off, size, value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { @@ -6498,7 +6476,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); return -EACCES; } - err = check_packet_access(env, regno, off, size, false); + err = check_packet_access(env, reg, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_FLOW_KEYS) { @@ -6518,7 +6496,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regno, reg_type_str(env, reg->type)); return -EACCES; } - err = check_sock_access(env, insn_idx, regno, off, size, t); + err = check_sock_access(env, insn_idx, reg, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_TP_BUFFER) { @@ -6527,10 +6505,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && !type_may_be_null(reg->type)) { - err = check_ptr_to_btf_access(env, regs, regno, off, size, t, + err = check_ptr_to_btf_access(env, regs, reg, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { - err = check_ptr_to_map_access(env, regs, regno, off, size, t, + err = check_ptr_to_map_access(env, regs, reg, regno, off, size, t, value_regno); } else if (base_type(reg->type) == PTR_TO_BUF && !type_may_be_null(reg->type)) { @@ -6599,7 +6577,7 @@ 
static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check if (src_reg + off) is readable. The state of dst_reg will be * updated by this call. */ - err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->src_reg, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, strict_alignment_once, is_ldsx); err = err ?: save_aux_ptr_type(env, src_reg_type, @@ -6629,7 +6607,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, dst_reg_type = regs[insn->dst_reg].type; /* Check if (dst_reg + off) is writeable. */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->dst_reg, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, strict_alignment_once, false); err = err ?: save_aux_ptr_type(env, dst_reg_type, false); @@ -6640,6 +6618,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, static int check_atomic_rmw(struct bpf_verifier_env *env, struct bpf_insn *insn) { + struct bpf_reg_state *dst_reg; int load_reg; int err; @@ -6701,13 +6680,15 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, load_reg = -1; } + dst_reg = cur_regs(env) + insn->dst_reg; + /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. 
*/ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, env->insn_idx, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, load_reg, true, false); if (err) @@ -6719,7 +6700,7 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, return err; } /* Check whether we can write into the same memory. */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; @@ -6808,11 +6789,10 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) * read offsets are marked as read. */ static int check_stack_range_initialized( - struct bpf_verifier_env *env, int regno, int off, + struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = bpf_func(env, reg); int err, min_off, max_off, i, j, slot, spi; /* Some accesses can write anything into the stack, others are @@ -6834,7 +6814,7 @@ static int check_stack_range_initialized( return -EACCES; } - err = check_stack_access_within_bounds(env, regno, off, access_size, type); + err = check_stack_access_within_bounds(env, reg, regno, off, access_size, type); if (err) return err; @@ -6965,7 +6945,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, regno, 0, access_size, + return check_packet_access(env, reg, regno, 0, access_size, 
zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) { @@ -6973,12 +6953,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, reg_type_str(env, reg->type)); return -EACCES; } - return check_mem_region_access(env, regno, 0, access_size, + return check_mem_region_access(env, reg, regno, 0, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, 0, access_size, access_type)) + if (check_map_access_type(env, reg, 0, access_size, access_type)) return -EACCES; - return check_map_access(env, regno, 0, access_size, + return check_map_access(env, reg, regno, 0, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { @@ -6988,7 +6968,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; } } - return check_mem_region_access(env, regno, 0, + return check_mem_region_access(env, reg, regno, 0, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: @@ -7008,16 +6988,16 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, max_access); case PTR_TO_STACK: return check_stack_range_initialized( - env, + env, reg, regno, 0, access_size, zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: - return check_ptr_to_btf_access(env, regs, regno, 0, + return check_ptr_to_btf_access(env, regs, reg, regno, 0, access_size, BPF_READ, -1); case PTR_TO_CTX: /* Only permit reading or writing syscall context using helper calls. */ if (is_var_ctx_off_allowed(env->prog)) { - int err = check_mem_region_access(env, regno, 0, access_size, U16_MAX, + int err = check_mem_region_access(env, reg, regno, 0, access_size, U16_MAX, zero_size_allowed); if (err) return err; @@ -7178,11 +7158,10 @@ enum { * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. 
*/ -static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) +static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int flags) { bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); bool is_irq = flags & PROCESS_LOCK_IRQ; @@ -7295,11 +7274,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) } /* Check if @regno is a pointer to a specific field in a map value */ -static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, +static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, enum btf_field_type field_type, struct bpf_map_desc *map_desc) { - struct bpf_reg_state *reg = reg_state(env, regno); bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; @@ -7349,26 +7327,26 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, return 0; } -static int process_timer_func(struct bpf_verifier_env *env, int regno, +static int process_timer_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, struct bpf_map_desc *map) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - return check_map_field_pointer(env, regno, BPF_TIMER, map); + return check_map_field_pointer(env, reg, regno, BPF_TIMER, map); } -static int process_timer_helper(struct bpf_verifier_env *env, int regno, +static int process_timer_helper(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, struct bpf_call_arg_meta *meta) { - return process_timer_func(env, regno, &meta->map); + return process_timer_func(env, reg, regno, &meta->map); 
} -static int process_timer_kfunc(struct bpf_verifier_env *env, int regno, +static int process_timer_kfunc(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, struct bpf_kfunc_call_arg_meta *meta) { - return process_timer_func(env, regno, &meta->map); + return process_timer_func(env, reg, regno, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, @@ -7433,10 +7411,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * use case. The second level is tracked using the upper bit of bpf_dynptr->size * and checked dynamically during runtime. */ -static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, +static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) { - struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { @@ -7470,7 +7447,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { - err = check_mem_access(env, insn_idx, regno, + err = check_mem_access(env, insn_idx, reg, regno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -7540,10 +7517,9 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, return btf_param_match_suffix(meta->btf, arg, "__iter"); } -static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, +static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); const struct btf_type *t; int spi, err, i, nr_slots, btf_id; @@ -7575,7 +7551,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id } for (i = 0; i < nr_slots * 
8; i += BPF_REG_SIZE) { - err = check_mem_access(env, insn_idx, regno, + err = check_mem_access(env, insn_idx, reg, regno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -8014,12 +7990,11 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; -static int check_reg_type(struct bpf_verifier_env *env, u32 regno, +static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, enum bpf_arg_type arg_type, const u32 *arg_btf_id, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const struct bpf_reg_types *compatible; int i, j, err; @@ -8362,7 +8337,7 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EACCES; } - err = check_map_access(env, regno, 0, + err = check_map_access(env, reg, regno, 0, map->value_size - reg->var_off.value, false, ACCESS_HELPER); if (err) @@ -8498,7 +8473,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; - err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); + err = check_reg_type(env, reg, regno, arg_type, arg_btf_id, meta); if (err) return err; @@ -8636,11 +8611,11 @@ skip_type_check: return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { - err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); + err = process_spin_lock(env, reg, regno, PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - err = process_spin_lock(env, regno, 0); + err = process_spin_lock(env, reg, regno, 0); if (err) return err; } else { @@ -8649,7 +8624,7 @@ skip_type_check: } break; case ARG_PTR_TO_TIMER: - err = process_timer_helper(env, regno, meta); + err = process_timer_helper(env, reg, regno, meta); if (err) return err; break; @@ -8684,7 +8659,7 @@ skip_type_check: true, meta); break; case 
ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); + err = process_dynptr_func(env, reg, regno, insn_idx, arg_type, 0); if (err) return err; break; @@ -9343,7 +9318,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (ret) return ret; - ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); + ret = process_dynptr_func(env, reg, regno, -1, arg->arg_type, 0); if (ret) return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { @@ -9354,7 +9329,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ - err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); + err = check_reg_type(env, reg, regno, arg->arg_type, &arg->btf_id, &meta); err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); if (err) return err; @@ -10312,18 +10287,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (err) return err; + regs = cur_regs(env); + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. 
*/ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, + err = check_mem_access(env, insn_idx, regs + meta.regno, meta.regno, i, BPF_B, BPF_WRITE, -1, false, false); if (err) return err; } - regs = cur_regs(env); - if (meta.release_regno) { err = -EINVAL; if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { @@ -11327,11 +11302,10 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, const struct btf_type *t, const struct btf_type *ref_t, const char *ref_tname, const struct btf_param *args, - int argno, int nargs) + int argno, int nargs, struct bpf_reg_state *reg) { u32 regno = argno + 1; struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = &regs[regno]; bool arg_mem_size = false; if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || @@ -11498,10 +11472,9 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, return 0; } -static int process_irq_flag(struct bpf_verifier_env *env, int regno, +static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); int err, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; @@ -11526,7 +11499,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, return -EINVAL; } - err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false); + err = check_mem_access(env, env->insn_idx, reg, regno, 0, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -12114,7 +12087,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs); + kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t,
ref_tname, args, i, nargs, reg); if (kf_arg_type < 0) return kf_arg_type; @@ -12276,7 +12249,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } - ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); + ret = process_dynptr_func(env, reg, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); if (ret < 0) return ret; @@ -12301,7 +12274,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } } - ret = process_iter_arg(env, regno, insn_idx, meta); + ret = process_iter_arg(env, reg, regno, insn_idx, meta); if (ret < 0) return ret; break; @@ -12478,7 +12451,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map); + ret = check_map_field_pointer(env, reg, regno, BPF_WORKQUEUE, &meta->map); if (ret < 0) return ret; break; @@ -12487,7 +12460,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = process_timer_kfunc(env, regno, meta); + ret = process_timer_kfunc(env, reg, regno, meta); if (ret < 0) return ret; break; @@ -12496,7 +12469,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map); + ret = check_map_field_pointer(env, reg, regno, BPF_TASK_WORK, &meta->map); if (ret < 0) return ret; break; @@ -12505,7 +12478,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); return -EINVAL; } - ret = process_irq_flag(env, regno, meta); + ret = process_irq_flag(env, reg, regno, meta); if (ret < 0) return ret; 
break; @@ -12526,7 +12499,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) flags |= PROCESS_LOCK_IRQ; - ret = process_spin_lock(env, regno, flags); + ret = process_spin_lock(env, reg, regno, flags); if (ret < 0) return ret; break; @@ -13660,7 +13633,7 @@ static int check_stack_access_for_ptr_arithmetic( static int sanitize_check_bounds(struct bpf_verifier_env *env, const struct bpf_insn *insn, - const struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg) { u32 dst = insn->dst_reg; @@ -13677,7 +13650,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) { + if (check_map_access(env, dst_reg, dst, 0, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -17563,7 +17536,7 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) dst_reg_type = cur_regs(env)[insn->dst_reg].type; - err = check_mem_access(env, env->insn_idx, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, cur_regs(env) + insn->dst_reg, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, false, false); if (err) -- cgit v1.2.3 From 024c0ab5db9459d114304d070936c440a61d3fd4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:51 -0700 Subject: bpf: Refactor to handle memory and size together Similar to the previous patch, try to pass bpf_reg_state from caller to callee. Both mem_reg and size_reg are passed to helper functions. This is important for stack arguments as they may be beyond registers 1-5. 
Acked-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033451.2539065-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index debae6133e4c..1b9a4918fa57 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6934,12 +6934,12 @@ mark: return 0; } -static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, +static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int access_size, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_reg_state *regs = cur_regs(env); u32 *max_access; switch (base_type(reg->type)) { @@ -7022,12 +7022,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, /* verify arguments to helpers or kfuncs consisting of a pointer and an access * size. * - * @regno is the register containing the access size. regno-1 is the register - * containing the pointer. + * @mem_reg contains the pointer, @size_reg contains the access size. */ static int check_mem_size_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, - enum bpf_access_type access_type, + struct bpf_reg_state *mem_reg, + struct bpf_reg_state *size_reg, u32 mem_regno, + u32 size_regno, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7041,37 +7041,37 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed.
*/ - meta->msize_max_value = reg->umax_value; + meta->msize_max_value = size_reg->umax_value; /* The register is SCALAR_VALUE; the access check happens using * its boundaries. For unprivileged variable accesses, disable * raw mode so that the program is required to initialize all * the memory that the helper could just partially fill up. */ - if (!tnum_is_const(reg->var_off)) + if (!tnum_is_const(size_reg->var_off)) meta = NULL; - if (reg->smin_value < 0) { + if (size_reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", - regno); + size_regno); return -EACCES; } - if (reg->umin_value == 0 && !zero_size_allowed) { + if (size_reg->umin_value == 0 && !zero_size_allowed) { verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n", - regno, reg->umin_value, reg->umax_value); + size_regno, size_reg->umin_value, size_reg->umax_value); return -EACCES; } - if (reg->umax_value >= BPF_MAX_VAR_SIZ) { + if (size_reg->umax_value >= BPF_MAX_VAR_SIZ) { verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - regno); + size_regno); return -EACCES; } - err = check_helper_mem_access(env, regno - 1, reg->umax_value, + err = check_helper_mem_access(env, mem_reg, mem_regno, size_reg->umax_value, access_type, zero_size_allowed, meta); if (!err) - err = mark_chain_precision(env, regno); + err = mark_chain_precision(env, size_regno); return err; } @@ -7096,8 +7096,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg int size = base_type(reg->type) == PTR_TO_STACK ? 
-(int)mem_size : mem_size; - err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL); - err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL); + err = check_helper_mem_access(env, reg, regno, size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, reg, regno, size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7105,10 +7105,9 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg return err; } -static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno) +static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *mem_reg, + struct bpf_reg_state *size_reg, u32 mem_regno, u32 size_regno) { - struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; struct bpf_call_arg_meta meta; @@ -7121,8 +7120,8 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta); - err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta); + err = check_mem_size_reg(env, mem_reg, size_reg, mem_regno, size_regno, BPF_READ, true, &meta); + err = err ?: check_mem_size_reg(env, mem_reg, size_reg, mem_regno, size_regno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; @@ -8566,7 +8565,7 @@ skip_type_check: return -EFAULT; } key_size = meta->map.ptr->key_size; - err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); + err = check_helper_mem_access(env, reg, regno, key_size, BPF_READ, false, NULL); if (err) return err; if (can_elide_value_nullness(meta->map.ptr->map_type)) { @@ -8593,7 +8592,7 @@ skip_type_check: return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, meta->map.ptr->value_size, + err = 
check_helper_mem_access(env, reg, regno, meta->map.ptr->value_size, arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; @@ -8637,7 +8636,7 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, regno, fn->arg_size[arg], + err = check_helper_mem_access(env, reg, regno, fn->arg_size[arg], arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); if (err) @@ -8647,13 +8646,13 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, regno - 1, regno, fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, regno - 1, regno, fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, true, meta); @@ -12384,7 +12383,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ const struct btf_param *size_arg = &args[i + 1]; if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { - ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); + ret = check_kfunc_mem_size_reg(env, buff_reg, size_reg, regno, regno + 1); if (ret < 0) { verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); return ret; -- cgit v1.2.3 From 053a48cb2a5459659a5034ad20e7d9f28bc56f16 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:56 -0700 Subject: bpf: Rename existing argno to arg To support stack arguments, in later patches, argno will represent both registers and stack arguments. To avoid confusion, rename existing argno to arg. 
Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033456.2539340-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1b9a4918fa57..81d77dfaaaf6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11301,9 +11301,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, const struct btf_type *t, const struct btf_type *ref_t, const char *ref_tname, const struct btf_param *args, - int argno, int nargs, struct bpf_reg_state *reg) + int arg, int nargs, struct bpf_reg_state *reg) { - u32 regno = argno + 1; + u32 regno = arg + 1; struct bpf_reg_state *regs = cur_regs(env); bool arg_mem_size = false; @@ -11312,9 +11312,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, meta->func_id == special_kfunc_list[KF_bpf_session_cookie]) return KF_ARG_PTR_TO_CTX; - if (argno + 1 < nargs && - (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) || - is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))) + if (arg + 1 < nargs && + (is_kfunc_arg_mem_size(meta->btf, &args[arg + 1], &regs[regno + 1]) || + is_kfunc_arg_const_mem_size(meta->btf, &args[arg + 1], &regs[regno + 1]))) arg_mem_size = true; /* In this function, we verify the kfunc's BTF as per the argument type, @@ -11322,68 +11322,68 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, * type to our caller. When a set of conditions hold in the BTF type of * arguments, we resolve it to a known kfunc_ptr_arg_type.
*/ - if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) + if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), arg)) return KF_ARG_PTR_TO_CTX; - if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && bpf_register_is_null(reg) && + if (is_kfunc_arg_nullable(meta->btf, &args[arg]) && bpf_register_is_null(reg) && !arg_mem_size) return KF_ARG_PTR_TO_NULL; - if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) + if (is_kfunc_arg_alloc_obj(meta->btf, &args[arg])) return KF_ARG_PTR_TO_ALLOC_BTF_ID; - if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno])) + if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[arg])) return KF_ARG_PTR_TO_REFCOUNTED_KPTR; - if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) + if (is_kfunc_arg_dynptr(meta->btf, &args[arg])) return KF_ARG_PTR_TO_DYNPTR; - if (is_kfunc_arg_iter(meta, argno, &args[argno])) + if (is_kfunc_arg_iter(meta, arg, &args[arg])) return KF_ARG_PTR_TO_ITER; - if (is_kfunc_arg_list_head(meta->btf, &args[argno])) + if (is_kfunc_arg_list_head(meta->btf, &args[arg])) return KF_ARG_PTR_TO_LIST_HEAD; - if (is_kfunc_arg_list_node(meta->btf, &args[argno])) + if (is_kfunc_arg_list_node(meta->btf, &args[arg])) return KF_ARG_PTR_TO_LIST_NODE; - if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno])) + if (is_kfunc_arg_rbtree_root(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RB_ROOT; - if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) + if (is_kfunc_arg_rbtree_node(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RB_NODE; - if (is_kfunc_arg_const_str(meta->btf, &args[argno])) + if (is_kfunc_arg_const_str(meta->btf, &args[arg])) return KF_ARG_PTR_TO_CONST_STR; - if (is_kfunc_arg_map(meta->btf, &args[argno])) + if (is_kfunc_arg_map(meta->btf, &args[arg])) return KF_ARG_PTR_TO_MAP; - if (is_kfunc_arg_wq(meta->btf, &args[argno])) + if (is_kfunc_arg_wq(meta->btf, &args[arg])) return KF_ARG_PTR_TO_WORKQUEUE; - if (is_kfunc_arg_timer(meta->btf, &args[argno])) + if 
(is_kfunc_arg_timer(meta->btf, &args[arg])) return KF_ARG_PTR_TO_TIMER; - if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + if (is_kfunc_arg_task_work(meta->btf, &args[arg])) return KF_ARG_PTR_TO_TASK_WORK; - if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) + if (is_kfunc_arg_irq_flag(meta->btf, &args[arg])) return KF_ARG_PTR_TO_IRQ_FLAG; - if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RES_SPIN_LOCK; if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", - meta->func_name, argno, btf_type_str(ref_t), ref_tname); + meta->func_name, arg, btf_type_str(ref_t), ref_tname); return -EINVAL; } return KF_ARG_PTR_TO_BTF_ID; } - if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) + if (is_kfunc_arg_callback(env, meta->btf, &args[arg])) return KF_ARG_PTR_TO_CALLBACK; /* This is the catch all argument type of register types supported by @@ -11394,7 +11394,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) && (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", - argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); + arg, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); return -EINVAL; } return arg_mem_size ? 
KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM; @@ -11405,7 +11405,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, const struct btf_type *ref_t, const char *ref_tname, u32 ref_id, struct bpf_kfunc_call_arg_meta *meta, - int argno) + int arg) { const struct btf_type *reg_ref_t; bool strict_type_match = false; @@ -11464,7 +11464,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname); if (!taking_projection && !struct_same) { verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", - meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1, + meta->func_name, arg, btf_type_str(ref_t), ref_tname, arg + 1, btf_type_str(reg_ref_t), reg_ref_tname); return -EINVAL; } -- cgit v1.2.3 From 9b9f0b42703ceb88332bcb19453c4288c2683e34 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:35:01 -0700 Subject: bpf: Prepare verifier logs for upcoming kfunc stack arguments This change prepares verifier log reporting for upcoming kfunc stack argument support. Currently verifier log code mostly assumes that an argument can be described directly by a register number. That works for arguments passed in `R1` to `R5`, but it does not work once kfunc arguments can also be passed on the stack. Introduce an opaque `argno_t` type that encodes both register-based and arg-based references. Four helpers form the interface: - argno_from_reg(regno): create from a register number - argno_from_arg(arg): create from a 1-based arg number - reg_from_argno(a): extract register number, or -1 - arg_from_argno(a): extract arg number, or -1 reg_arg_name() converts an argno_t to a human-readable string for verifier logs: "R%d" for register arguments, or "*(R11-off)" for stack arguments beyond R5. Update selftests accordingly. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033501.2539667-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 694 ++++++++++++--------- tools/testing/selftests/bpf/prog_tests/bpf_nf.c | 22 +- tools/testing/selftests/bpf/prog_tests/cb_refs.c | 2 +- .../testing/selftests/bpf/prog_tests/kfunc_call.c | 2 +- .../testing/selftests/bpf/prog_tests/linked_list.c | 4 +- .../selftests/bpf/progs/cgrp_kfunc_failure.c | 14 +- .../testing/selftests/bpf/progs/cpumask_failure.c | 10 +- tools/testing/selftests/bpf/progs/dynptr_fail.c | 22 +- .../testing/selftests/bpf/progs/file_reader_fail.c | 4 +- tools/testing/selftests/bpf/progs/irq.c | 4 +- tools/testing/selftests/bpf/progs/iters.c | 6 +- .../selftests/bpf/progs/iters_state_safety.c | 14 +- tools/testing/selftests/bpf/progs/iters_testmod.c | 4 +- .../selftests/bpf/progs/iters_testmod_seq.c | 4 +- tools/testing/selftests/bpf/progs/map_kptr_fail.c | 2 +- .../selftests/bpf/progs/percpu_alloc_fail.c | 4 +- tools/testing/selftests/bpf/progs/rbtree_fail.c | 6 +- .../selftests/bpf/progs/refcounted_kptr_fail.c | 2 +- tools/testing/selftests/bpf/progs/stream_fail.c | 2 +- .../selftests/bpf/progs/task_kfunc_failure.c | 18 +- tools/testing/selftests/bpf/progs/task_work_fail.c | 6 +- .../testing/selftests/bpf/progs/test_bpf_nf_fail.c | 8 +- .../selftests/bpf/progs/test_kfunc_dynptr_param.c | 2 +- .../bpf/progs/test_kfunc_param_nullable.c | 2 +- .../selftests/bpf/progs/verifier_bits_iter.c | 4 +- .../selftests/bpf/progs/verifier_ref_tracking.c | 6 +- .../selftests/bpf/progs/verifier_vfs_reject.c | 8 +- tools/testing/selftests/bpf/progs/wq_failures.c | 2 +- tools/testing/selftests/bpf/verifier/calls.c | 14 +- 30 files changed, 497 insertions(+), 396 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b148f816f25b..d5b4303315dd 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ 
-913,6 +913,7 @@ struct bpf_verifier_env { * e.g., in reg_type_str() to generate reg_type string */ char tmp_str_buf[TMP_STR_BUF_LEN]; + char tmp_arg_name[32]; struct bpf_insn insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; struct bpf_scc_callchain callchain_buf; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 81d77dfaaaf6..ff6ff1c27517 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -261,6 +261,36 @@ struct bpf_kfunc_meta { struct btf *btf_vmlinux; +typedef struct argno { + int argno; +} argno_t; + +static argno_t argno_from_reg(u32 regno) +{ + return (argno_t){ .argno = regno }; +} + +static argno_t argno_from_arg(u32 arg) +{ + return (argno_t){ .argno = -arg }; +} + +static int reg_from_argno(argno_t a) +{ + if (a.argno >= 0) + return a.argno; + if (a.argno >= -MAX_BPF_FUNC_REG_ARGS) + return -a.argno; + return -1; +} + +static int arg_from_argno(argno_t a) +{ + if (a.argno < 0) + return -a.argno; + return -1; +} + static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); @@ -1742,6 +1772,22 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, return &elem->st; } +static const char *reg_arg_name(struct bpf_verifier_env *env, argno_t argno) +{ + char *buf = env->tmp_arg_name; + int len = sizeof(env->tmp_arg_name); + int arg, regno = reg_from_argno(argno); + + if (regno >= 0) { + snprintf(buf, len, "R%d", regno); + } else { + arg = arg_from_argno(argno); + snprintf(buf, len, "*(R11-%u)", (arg - MAX_BPF_FUNC_REG_ARGS) * BPF_REG_SIZE); + } + + return buf; +} + static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; @@ -4241,7 +4287,7 @@ enum bpf_access_src { }; static int check_stack_range_initialized(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int regno, int off, int access_size, + argno_t argno, int off, int access_size, bool 
zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta); @@ -4265,7 +4311,7 @@ static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) * instead. */ static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int ptr_regno, int off, int size, int dst_regno) + argno_t ptr_argno, int off, int size, int dst_regno) { struct bpf_func_state *ptr_state = bpf_func(env, reg); int err; @@ -4273,7 +4319,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg /* Note that we pass a NULL meta, so raw access will not be permitted. */ - err = check_stack_range_initialized(env, reg, ptr_regno, off, size, + err = check_stack_range_initialized(env, reg, ptr_argno, off, size, false, BPF_READ, NULL); if (err) return err; @@ -4295,7 +4341,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg * can be -1, meaning that the read value is not going to a register. */ static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, int ptr_regno, int off, int size, + struct bpf_reg_state *reg, argno_t ptr_argno, int off, int size, int dst_regno) { struct bpf_func_state *state = bpf_func(env, reg); @@ -4333,7 +4379,7 @@ static int check_stack_read(struct bpf_verifier_env *env, * than fixed offset ones. Note that dst_regno >= 0 on this * branch. 
*/ - err = check_stack_read_var_off(env, reg, ptr_regno, off, size, + err = check_stack_read_var_off(env, reg, ptr_argno, off, size, dst_regno); } return err; @@ -4393,7 +4439,7 @@ static int check_map_access_type(struct bpf_verifier_env *env, struct bpf_reg_st } /* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ -static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, u32 mem_size, bool zero_size_allowed) { @@ -4414,8 +4460,8 @@ static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, off, mem_size); + verbose(env, "invalid access to packet, off=%d size=%d, %s(id=%d,off=%d,r=%d)\n", + off, size, reg_arg_name(env, argno), reg->id, off, mem_size); break; case PTR_TO_CTX: verbose(env, "invalid access to context, ctx_size=%d off=%d size=%d\n", @@ -4431,7 +4477,7 @@ static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state } /* check read/write into a memory region with possible variable offset */ -static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, +static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, u32 mem_size, bool zero_size_allowed) { @@ -4451,15 +4497,15 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_ (reg->smin_value == S64_MIN || (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || reg->smin_value + off < 0)) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); + verbose(env, "%s min value is negative, either use 
unsigned index or do a if (index >=0) check.\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, reg, regno, reg->smin_value + off, size, + err = __check_mem_access(env, reg, argno, reg->smin_value + off, size, mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the allowed memory range\n", - regno); + verbose(env, "%s min value is outside of the allowed memory range\n", + reg_arg_name(env, argno)); return err; } @@ -4468,15 +4514,15 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_ * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", - regno); + verbose(env, "%s unbounded memory access, make sure to bounds check any such access\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, reg, regno, reg->umax_value + off, size, + err = __check_mem_access(env, reg, argno, reg->umax_value + off, size, mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d max value is outside of the allowed memory range\n", - regno); + verbose(env, "%s max value is outside of the allowed memory range\n", + reg_arg_name(env, argno)); return err; } @@ -4484,7 +4530,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_ } static int __check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, + const struct bpf_reg_state *reg, argno_t argno, bool fixed_off_ok) { /* Access to this pointer-typed register or passing it to a helper @@ -4501,14 +4547,14 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, } if (reg->smin_value < 0) { - verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n", - reg_type_str(env, reg->type), regno, reg->var_off.value); + verbose(env, "negative offset %s ptr %s off=%lld disallowed\n", + 
reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value); return -EACCES; } if (!fixed_off_ok && reg->var_off.value != 0) { - verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n", - reg_type_str(env, reg->type), regno, reg->var_off.value); + verbose(env, "dereference of modified %s ptr %s off=%lld disallowed\n", + reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value); return -EACCES; } @@ -4518,7 +4564,7 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, static int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno) { - return __check_ptr_off_reg(env, reg, regno, false); + return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false); } static int map_kptr_match_type(struct bpf_verifier_env *env, @@ -4556,7 +4602,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * Since ref_ptr cannot be accessed directly by BPF insns, check for * reg->ref_obj_id is not needed here. 
*/ - if (__check_ptr_off_reg(env, reg, regno, true)) + if (__check_ptr_off_reg(env, reg, argno_from_reg(regno), true)) return -EACCES; /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and @@ -4776,7 +4822,7 @@ static u32 map_mem_size(const struct bpf_map *map) } /* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, +static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, bool zero_size_allowed, enum bpf_access_src src) { @@ -4785,7 +4831,7 @@ static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state * struct btf_record *rec; int err, i; - err = check_mem_region_access(env, reg, regno, off, size, mem_size, zero_size_allowed); + err = check_mem_region_access(env, reg, argno, off, size, mem_size, zero_size_allowed); if (err) return err; @@ -4881,17 +4927,17 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, int off, +static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, bool zero_size_allowed) { int err; if (reg->range < 0) { - verbose(env, "R%d offset is outside of the packet\n", regno); + verbose(env, "%s offset is outside of the packet\n", reg_arg_name(env, argno)); return -EINVAL; } - err = check_mem_region_access(env, reg, regno, off, size, reg->range, zero_size_allowed); + err = check_mem_region_access(env, reg, argno, off, size, reg->range, zero_size_allowed); if (err) return err; @@ -4946,7 +4992,7 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of return -EACCES; } -static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, u32 regno, +static int check_ctx_access(struct 
bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno, int off, int access_size, enum bpf_access_type t, struct bpf_insn_access_aux *info) { @@ -4959,9 +5005,9 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct b int err; if (var_off_ok) - err = check_mem_region_access(env, reg, regno, off, access_size, U16_MAX, false); + err = check_mem_region_access(env, reg, argno, off, access_size, U16_MAX, false); else - err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok); + err = __check_ptr_off_reg(env, reg, argno, fixed_off_ok); if (err) return err; off += reg->umax_value; @@ -4985,15 +5031,15 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, } static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, - struct bpf_reg_state *reg, u32 regno, int off, int size, + struct bpf_reg_state *reg, argno_t argno, int off, int size, enum bpf_access_type t) { struct bpf_insn_access_aux info = {}; bool valid; if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); + verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n", + reg_arg_name(env, argno)); return -EACCES; } @@ -5021,8 +5067,8 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return 0; } - verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str(env, reg->type), off, size); + verbose(env, "%s invalid %s access off=%d size=%d\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type), off, size); return -EACCES; } @@ -5535,12 +5581,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, - int regno, int off, int size) + argno_t argno, int off, int size) { if (off < 0) { verbose(env, - "R%d invalid %s buffer access: off=%d, 
size=%d\n", - regno, buf_info, off, size); + "%s invalid %s buffer access: off=%d, size=%d\n", + reg_arg_name(env, argno), buf_info, off, size); return -EACCES; } if (!tnum_is_const(reg->var_off)) { @@ -5548,8 +5594,8 @@ static int __check_buffer_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d invalid variable buffer offset: off=%d, var_off=%s\n", - regno, off, tn_buf); + "%s invalid variable buffer offset: off=%d, var_off=%s\n", + reg_arg_name(env, argno), off, tn_buf); return -EACCES; } @@ -5558,11 +5604,11 @@ static int __check_buffer_access(struct bpf_verifier_env *env, static int check_tp_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, - int regno, int off, int size) + argno_t argno, int off, int size) { int err; - err = __check_buffer_access(env, "tracepoint", reg, regno, off, size); + err = __check_buffer_access(env, "tracepoint", reg, argno, off, size); if (err) return err; @@ -5574,14 +5620,14 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, static int check_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, - int regno, int off, int size, + argno_t argno, int off, int size, bool zero_size_allowed, u32 *max_access) { const char *buf_info = type_is_rdonly_mem(reg->type) ? 
"rdonly" : "rdwr"; int err; - err = __check_buffer_access(env, buf_info, reg, regno, off, size); + err = __check_buffer_access(env, buf_info, reg, argno, off, size); if (err) return err; @@ -5954,7 +6000,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, struct bpf_reg_state *reg, - int regno, int off, int size, + argno_t argno, int off, int size, enum bpf_access_type atype, int value_regno) { @@ -5983,8 +6029,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n", - regno, tname, off, tn_buf); + "%s is ptr_%s invalid variable offset: off=%d, var_off=%s\n", + reg_arg_name(env, argno), tname, off, tn_buf); return -EACCES; } @@ -5992,22 +6038,22 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (off < 0) { verbose(env, - "R%d is ptr_%s invalid negative access: off=%d\n", - regno, tname, off); + "%s is ptr_%s invalid negative access: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } if (reg->type & MEM_USER) { verbose(env, - "R%d is ptr_%s access user memory: off=%d\n", - regno, tname, off); + "%s is ptr_%s access user memory: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } if (reg->type & MEM_PERCPU) { verbose(env, - "R%d is ptr_%s access percpu memory: off=%d\n", - regno, tname, off); + "%s is ptr_%s access percpu memory: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } @@ -6110,7 +6156,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, static int check_ptr_to_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, struct bpf_reg_state *reg, - int regno, int off, int size, + argno_t argno, int off, int size, enum bpf_access_type atype, int value_regno) { @@ -6144,8 +6190,8 @@ static int 
check_ptr_to_map_access(struct bpf_verifier_env *env, } if (off < 0) { - verbose(env, "R%d is %s invalid negative access: off=%d\n", - regno, tname, off); + verbose(env, "%s is %s invalid negative access: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } @@ -6203,7 +6249,7 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, */ static int check_stack_access_within_bounds( struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int regno, int off, int access_size, + argno_t argno, int off, int access_size, enum bpf_access_type type) { struct bpf_func_state *state = bpf_func(env, reg); @@ -6222,8 +6268,8 @@ static int check_stack_access_within_bounds( } else { if (reg->smax_value >= BPF_MAX_VAR_OFF || reg->smin_value <= -BPF_MAX_VAR_OFF) { - verbose(env, "invalid unbounded variable-offset%s stack R%d\n", - err_extra, regno); + verbose(env, "invalid unbounded variable-offset%s stack %s\n", + err_extra, reg_arg_name(env, argno)); return -EACCES; } min_off = reg->smin_value + off; @@ -6241,14 +6287,14 @@ static int check_stack_access_within_bounds( if (err) { if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid%s stack R%d off=%lld size=%d\n", - err_extra, regno, min_off, access_size); + verbose(env, "invalid%s stack %s off=%lld size=%d\n", + err_extra, reg_arg_name(env, argno), min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n", - err_extra, regno, tn_buf, off, access_size); + verbose(env, "invalid variable-offset%s stack %s var_off=%s off=%d size=%d\n", + err_extra, reg_arg_name(env, argno), tn_buf, off, access_size); } return err; } @@ -6293,7 +6339,7 @@ static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val) * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int 
check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, u32 regno, +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno, int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once, bool is_ldsx) { @@ -6310,11 +6356,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b if (reg->type == PTR_TO_MAP_KEY) { if (t == BPF_WRITE) { - verbose(env, "write to change key R%d not allowed\n", regno); + verbose(env, "write to change key %s not allowed\n", + reg_arg_name(env, argno)); return -EACCES; } - err = check_mem_region_access(env, reg, regno, off, size, + err = check_mem_region_access(env, reg, argno, off, size, reg->map_ptr->key_size, false); if (err) return err; @@ -6331,7 +6378,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b err = check_map_access_type(env, reg, off, size, t); if (err) return err; - err = check_map_access(env, reg, regno, off, size, false, ACCESS_DIRECT); + err = check_map_access(env, reg, argno, off, size, false, ACCESS_DIRECT); if (err) return err; if (tnum_is_const(reg->var_off)) @@ -6378,14 +6425,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED); if (type_may_be_null(reg->type)) { - verbose(env, "R%d invalid mem access '%s'\n", regno, + verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } if (t == BPF_WRITE && rdonly_mem) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -6400,7 +6447,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b * instructions, hence no need to check bounds in that case. 
*/ if (!rdonly_untrusted) - err = check_mem_region_access(env, reg, regno, off, size, + err = check_mem_region_access(env, reg, argno, off, size, reg->mem_size, false); if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); @@ -6418,7 +6465,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b return -EACCES; } - err = check_ctx_access(env, insn_idx, reg, regno, off, size, t, &info); + err = check_ctx_access(env, insn_idx, reg, argno, off, size, t, &info); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter @@ -6455,12 +6502,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b } else if (reg->type == PTR_TO_STACK) { /* Basic bounds checks. */ - err = check_stack_access_within_bounds(env, reg, regno, off, size, t); + err = check_stack_access_within_bounds(env, reg, argno, off, size, t); if (err) return err; if (t == BPF_READ) - err = check_stack_read(env, reg, regno, off, size, + err = check_stack_read(env, reg, argno, off, size, value_regno); else err = check_stack_write(env, reg, off, size, @@ -6476,7 +6523,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b value_regno); return -EACCES; } - err = check_packet_access(env, reg, regno, off, size, false); + err = check_packet_access(env, reg, argno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_FLOW_KEYS) { @@ -6492,23 +6539,23 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b mark_reg_unknown(env, regs, value_regno); } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, 
reg->type)); return -EACCES; } - err = check_sock_access(env, insn_idx, reg, regno, off, size, t); + err = check_sock_access(env, insn_idx, reg, argno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_TP_BUFFER) { - err = check_tp_buffer_access(env, reg, regno, off, size); + err = check_tp_buffer_access(env, reg, argno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && !type_may_be_null(reg->type)) { - err = check_ptr_to_btf_access(env, regs, reg, regno, off, size, t, + err = check_ptr_to_btf_access(env, regs, reg, argno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { - err = check_ptr_to_map_access(env, regs, reg, regno, off, size, t, + err = check_ptr_to_map_access(env, regs, reg, argno, off, size, t, value_regno); } else if (base_type(reg->type) == PTR_TO_BUF && !type_may_be_null(reg->type)) { @@ -6517,8 +6564,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b if (rdonly_mem) { if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } max_access = &env->prog->aux->max_rdonly_access; @@ -6526,7 +6573,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b max_access = &env->prog->aux->max_rdwr_access; } - err = check_buffer_access(env, reg, regno, off, size, false, + err = check_buffer_access(env, reg, argno, off, size, false, max_access); if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) @@ -6535,7 +6582,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b if (t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { - verbose(env, "R%d invalid mem access '%s'\n", 
regno, + verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -6577,7 +6624,7 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check if (src_reg + off) is readable. The state of dst_reg will be * updated by this call. */ - err = check_mem_access(env, env->insn_idx, regs + insn->src_reg, insn->src_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->src_reg, argno_from_reg(insn->src_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, strict_alignment_once, is_ldsx); err = err ?: save_aux_ptr_type(env, src_reg_type, @@ -6607,7 +6654,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, dst_reg_type = regs[insn->dst_reg].type; /* Check if (dst_reg + off) is writeable. */ - err = check_mem_access(env, env->insn_idx, regs + insn->dst_reg, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, strict_alignment_once, false); err = err ?: save_aux_ptr_type(env, dst_reg_type, false); @@ -6685,10 +6732,10 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. 
*/ - err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, load_reg, true, false); if (err) @@ -6700,7 +6747,7 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, return err; } /* Check whether we can write into the same memory. */ - err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; @@ -6789,7 +6836,7 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) * read offsets are marked as read. 
*/ static int check_stack_range_initialized( - struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int off, + struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta) { @@ -6814,7 +6861,7 @@ static int check_stack_range_initialized( return -EACCES; } - err = check_stack_access_within_bounds(env, reg, regno, off, access_size, type); + err = check_stack_access_within_bounds(env, reg, argno, off, access_size, type); if (err) return err; @@ -6831,8 +6878,8 @@ static int check_stack_range_initialized( char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", - regno, tn_buf); + verbose(env, "%s variable offset stack access prohibited for !root, var_off=%s\n", + reg_arg_name(env, argno), tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@ -6875,7 +6922,7 @@ static int check_stack_range_initialized( } } meta->access_size = access_size; - meta->regno = regno; + meta->regno = reg_from_argno(argno); return 0; } @@ -6915,17 +6962,17 @@ static int check_stack_range_initialized( if (*stype == STACK_POISON) { if (allow_poison) goto mark; - verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n", - regno, min_off, i - min_off, access_size); + verbose(env, "reading from stack %s off %d+%d size %d, slot poisoned by dead code elimination\n", + reg_arg_name(env, argno), min_off, i - min_off, access_size); } else if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid read from stack R%d off %d+%d size %d\n", - regno, min_off, i - min_off, access_size); + verbose(env, "invalid read from stack %s off %d+%d size %d\n", + reg_arg_name(env, argno), min_off, i - min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, 
"invalid read from stack R%d var_off %s+%d size %d\n", - regno, tn_buf, i - min_off, access_size); + verbose(env, "invalid read from stack %s var_off %s+%d size %d\n", + reg_arg_name(env, argno), tn_buf, i - min_off, access_size); } return -EACCES; mark: @@ -6934,7 +6981,7 @@ mark: return 0; } -static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int access_size, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) @@ -6945,37 +6992,37 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, reg, regno, 0, access_size, + return check_packet_access(env, reg, argno, 0, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } - return check_mem_region_access(env, reg, regno, 0, access_size, + return check_mem_region_access(env, reg, argno, 0, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, reg, 0, access_size, access_type)) return -EACCES; - return check_map_access(env, reg, regno, 0, access_size, + return check_map_access(env, reg, argno, 0, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } } - return check_mem_region_access(env, reg, regno, 0, + return 
check_mem_region_access(env, reg, argno, 0, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -6983,21 +7030,21 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ } else { max_access = &env->prog->aux->max_rdwr_access; } - return check_buffer_access(env, reg, regno, 0, + return check_buffer_access(env, reg, argno, 0, access_size, zero_size_allowed, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, reg, - regno, 0, access_size, + argno, 0, access_size, zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: - return check_ptr_to_btf_access(env, regs, reg, regno, 0, + return check_ptr_to_btf_access(env, regs, reg, argno, 0, access_size, BPF_READ, -1); case PTR_TO_CTX: /* Only permit reading or writing syscall context using helper calls. 
*/ if (is_var_ctx_off_allowed(env->prog)) { - int err = check_mem_region_access(env, reg, regno, 0, access_size, U16_MAX, + int err = check_mem_region_access(env, reg, argno, 0, access_size, U16_MAX, zero_size_allowed); if (err) return err; @@ -7012,7 +7059,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ bpf_register_is_null(reg)) return 0; - verbose(env, "R%d type=%s ", regno, + verbose(env, "%s type=%s ", reg_arg_name(env, argno), reg_type_str(env, reg->type)); verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; @@ -7026,8 +7073,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *mem_reg, - struct bpf_reg_state *size_reg, u32 mem_regno, - u32 size_regno, enum bpf_access_type access_type, + struct bpf_reg_state *size_reg, argno_t mem_argno, + argno_t size_argno, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7052,31 +7099,31 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, meta = NULL; if (size_reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", - size_regno); + verbose(env, "%s min value is negative, either use unsigned or 'var &= const'\n", + reg_arg_name(env, size_argno)); return -EACCES; } if (size_reg->umin_value == 0 && !zero_size_allowed) { - verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n", - size_regno, size_reg->umin_value, size_reg->umax_value); + verbose(env, "%s invalid zero-sized read: u64=[%lld,%lld]\n", + reg_arg_name(env, size_argno), size_reg->umin_value, size_reg->umax_value); return -EACCES; } if (size_reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - size_regno); + verbose(env, "%s unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + 
reg_arg_name(env, size_argno)); return -EACCES; } - err = check_helper_mem_access(env, mem_reg, mem_regno, size_reg->umax_value, + err = check_helper_mem_access(env, mem_reg, mem_argno, size_reg->umax_value, access_type, zero_size_allowed, meta); if (!err) - err = mark_chain_precision(env, size_regno); + err = mark_chain_precision(env, reg_from_argno(size_argno)); return err; } static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno, u32 mem_size) + argno_t argno, u32 mem_size) { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; @@ -7096,8 +7143,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg int size = base_type(reg->type) == PTR_TO_STACK ? -(int)mem_size : mem_size; - err = check_helper_mem_access(env, reg, regno, size, BPF_READ, true, NULL); - err = err ?: check_helper_mem_access(env, reg, regno, size, BPF_WRITE, true, NULL); + err = check_helper_mem_access(env, reg, argno, size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, reg, argno, size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7106,7 +7153,7 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg } static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *mem_reg, - struct bpf_reg_state *size_reg, u32 mem_regno, u32 size_regno) + struct bpf_reg_state *size_reg, argno_t mem_argno, argno_t size_argno) { bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; @@ -7120,8 +7167,8 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, mem_reg, size_reg, mem_regno, size_regno, BPF_READ, true, &meta); - err = err ?: check_mem_size_reg(env, mem_reg, size_reg, mem_regno, size_regno, BPF_WRITE, true, &meta); + err = check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_READ, 
true, &meta); + err = err ?: check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; @@ -7157,7 +7204,7 @@ enum { * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. */ -static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int flags) +static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int flags) { bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; @@ -7173,8 +7220,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state if (!is_const) { verbose(env, - "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n", - regno, lock_str); + "%s doesn't have constant offset. %s_lock has to be at the constant offset\n", + reg_arg_name(env, argno), lock_str); return -EINVAL; } if (reg->type == PTR_TO_MAP_VALUE) { @@ -7273,7 +7320,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state } /* Check if @regno is a pointer to a specific field in a map value */ -static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, +static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, enum btf_field_type field_type, struct bpf_map_desc *map_desc) { @@ -7285,8 +7332,8 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_ if (!is_const) { verbose(env, - "R%d doesn't have constant offset. %s has to be at the constant offset\n", - regno, struct_name); + "%s doesn't have constant offset. 
%s has to be at the constant offset\n", + reg_arg_name(env, argno), struct_name); return -EINVAL; } if (!map->btf) { @@ -7326,26 +7373,26 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_ return 0; } -static int process_timer_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int process_timer_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_map_desc *map) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - return check_map_field_pointer(env, reg, regno, BPF_TIMER, map); + return check_map_field_pointer(env, reg, argno, BPF_TIMER, map); } -static int process_timer_helper(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int process_timer_helper(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_call_arg_meta *meta) { - return process_timer_func(env, reg, regno, &meta->map); + return process_timer_func(env, reg, argno, &meta->map); } -static int process_timer_kfunc(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int process_timer_kfunc(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return process_timer_func(env, reg, regno, &meta->map); + return process_timer_func(env, reg, argno, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, @@ -7410,15 +7457,15 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * use case. The second level is tracked using the upper bit of bpf_dynptr->size * and checked dynamically during runtime. 
*/ -static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int insn_idx, +static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) { int err; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, - "arg#%d expected pointer to stack or const struct bpf_dynptr\n", - regno - 1); + "%s expected pointer to stack or const struct bpf_dynptr\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -7446,7 +7493,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { - err = check_mem_access(env, insn_idx, reg, regno, + err = check_mem_access(env, insn_idx, reg, argno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -7461,17 +7508,17 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat } if (!is_dynptr_reg_valid_init(env, reg)) { - verbose(env, - "Expected an initialized dynptr as arg #%d\n", - regno - 1); + verbose(env, "Expected an initialized dynptr as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } /* Fold modifiers (in this case, OBJ_RELEASE) when checking expected type */ if (!is_dynptr_type_expected(env, reg, arg_type & ~OBJ_RELEASE)) { verbose(env, - "Expected a dynptr of type %s as arg #%d\n", - dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1); + "Expected a dynptr of type %s as %s\n", + dynptr_type_str(arg_to_dynptr_type(arg_type)), + reg_arg_name(env, argno)); return -EINVAL; } @@ -7516,14 +7563,16 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, return btf_param_match_suffix(meta->btf, arg, "__iter"); } -static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int insn_idx, +static int process_iter_arg(struct bpf_verifier_env *env, 
struct bpf_reg_state *reg, argno_t argno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { const struct btf_type *t; + u32 arg_idx = arg_from_argno(argno) - 1; int spi, err, i, nr_slots, btf_id; if (reg->type != PTR_TO_STACK) { - verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1); + verbose(env, "%s expected pointer to an iterator on stack\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -7533,9 +7582,10 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * * to any kfunc, if arg has "__iter" suffix, we need to be a bit more * conservative here. */ - btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1); + btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, arg_idx); if (btf_id < 0) { - verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1); + verbose(env, "expected valid iter pointer as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } t = btf_type_by_id(meta->btf, btf_id); @@ -7544,13 +7594,13 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * if (is_iter_new_kfunc(meta)) { /* bpf_iter__new() expects pointer to uninit iter state */ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { - verbose(env, "expected uninitialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno - 1); + verbose(env, "expected uninitialized iter_%s as %s\n", + iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno)); return -EINVAL; } for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { - err = check_mem_access(env, insn_idx, reg, regno, + err = check_mem_access(env, insn_idx, reg, argno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -7568,8 +7618,8 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * case 0: break; case -EINVAL: - verbose(env, "expected an initialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno - 1); + verbose(env, "expected an initialized iter_%s as 
%s\n", + iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno)); return err; case -EPROTO: verbose(env, "expected an RCU CS when using %s\n", meta->func_name); @@ -7989,7 +8039,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; -static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, +static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, enum bpf_arg_type arg_type, const u32 *arg_btf_id, struct bpf_call_arg_meta *meta) @@ -8024,7 +8074,7 @@ static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *re type &= ~DYNPTR_TYPE_FLAG_MASK; /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */ - if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) { + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && reg_from_argno(argno) == BPF_REG_2) { type &= ~MEM_ALLOC; type &= ~MEM_PERCPU; } @@ -8038,7 +8088,7 @@ static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *re goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); + verbose(env, "%s type=%s expected=", reg_arg_name(env, argno), reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); @@ -8051,9 +8101,9 @@ found: if (compatible == &mem_types) { if (!(arg_type & MEM_RDONLY)) { verbose(env, - "%s() may write into memory pointed by R%d type=%s\n", + "%s() may write into memory pointed by %s type=%s\n", func_id_name(meta->func_id), - regno, reg_type_str(env, reg->type)); + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } return 0; @@ -8076,7 +8126,8 @@ found: if (type_may_be_null(reg->type) && (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) { - verbose(env, 
"Possibly NULL pointer passed to helper arg%d\n", regno); + verbose(env, "Possibly NULL pointer passed to helper %s\n", + reg_arg_name(env, argno)); return -EACCES; } @@ -8089,25 +8140,26 @@ found: } if (meta->func_id == BPF_FUNC_kptr_xchg) { - if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) + if (map_kptr_match_type(env, meta->kptr_field, reg, reg_from_argno(argno))) return -EACCES; } else { if (arg_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); - verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n", - regno); + verbose(env, "%s has non-overwritten BPF_PTR_POISON type\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_ptr_off_reg(env, reg, regno, true); + err = __check_ptr_off_reg(env, reg, argno, true); if (err) return err; if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value, btf_vmlinux, *arg_btf_id, strict_type_match)) { - verbose(env, "R%d is of type %s but %s is expected\n", - regno, btf_type_name(reg->btf, reg->btf_id), + verbose(env, "%s is of type %s but %s is expected\n", + reg_arg_name(env, argno), + btf_type_name(reg->btf, reg->btf_id), btf_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } @@ -8124,8 +8176,11 @@ found: return -EFAULT; } /* Check if local kptr in src arg matches kptr in dst arg */ - if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) { - if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) + if (meta->func_id == BPF_FUNC_kptr_xchg) { + int regno = reg_from_argno(argno); + + if (regno == BPF_REG_2 && + map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } break; @@ -8159,7 +8214,7 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields) } static int check_func_arg_reg_off(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, + const struct bpf_reg_state *reg, argno_t argno, enum bpf_arg_type arg_type) { u32 type = reg->type; @@ -8185,8 +8240,8 @@ static int 
check_func_arg_reg_off(struct bpf_verifier_env *env, * to give the user a better error message. */ if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) { - verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", - regno); + verbose(env, "%s must have zero offset when passed to release func or trusted arg to kfunc\n", + reg_arg_name(env, argno)); return -EINVAL; } } @@ -8222,7 +8277,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we * still need to do checks instead of returning. */ - return __check_ptr_off_reg(env, reg, regno, true); + return __check_ptr_off_reg(env, reg, argno, true); case PTR_TO_CTX: /* * Allow fixed and variable offsets for syscall context, but @@ -8234,7 +8289,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, return 0; fallthrough; default: - return __check_ptr_off_reg(env, reg, regno, false); + return __check_ptr_off_reg(env, reg, argno, false); } } @@ -8304,8 +8359,8 @@ static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, return state->stack[spi].spilled_ptr.dynptr.type; } -static int check_reg_const_str(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno) +static int check_arg_const_str(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, argno_t argno) { struct bpf_map *map = reg->map_ptr; int err; @@ -8317,17 +8372,18 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EINVAL; if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { - verbose(env, "R%d points to insn_array map which cannot be used as const string\n", regno); + verbose(env, "%s points to insn_array map which cannot be used as const string\n", + reg_arg_name(env, argno)); return -EACCES; } if (!bpf_map_is_rdonly(map)) { - verbose(env, "R%d does not point to a readonly map'\n", regno); + verbose(env, "%s does not point to a readonly map'\n", reg_arg_name(env, argno)); return 
-EACCES; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d is not a constant address'\n", regno); + verbose(env, "%s is not a constant address'\n", reg_arg_name(env, argno)); return -EACCES; } @@ -8336,7 +8392,7 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EACCES; } - err = check_map_access(env, reg, regno, 0, + err = check_map_access(env, reg, argno, 0, map->value_size - reg->var_off.value, false, ACCESS_HELPER); if (err) @@ -8472,11 +8528,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; - err = check_reg_type(env, reg, regno, arg_type, arg_btf_id, meta); + err = check_reg_type(env, reg, argno_from_reg(regno), arg_type, arg_btf_id, meta); if (err) return err; - err = check_func_arg_reg_off(env, reg, regno, arg_type); + err = check_func_arg_reg_off(env, reg, argno_from_reg(regno), arg_type); if (err) return err; @@ -8565,7 +8621,7 @@ skip_type_check: return -EFAULT; } key_size = meta->map.ptr->key_size; - err = check_helper_mem_access(env, reg, regno, key_size, BPF_READ, false, NULL); + err = check_helper_mem_access(env, reg, argno_from_reg(regno), key_size, BPF_READ, false, NULL); if (err) return err; if (can_elide_value_nullness(meta->map.ptr->map_type)) { @@ -8592,7 +8648,7 @@ skip_type_check: return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, reg, regno, meta->map.ptr->value_size, + err = check_helper_mem_access(env, reg, argno_from_reg(regno), meta->map.ptr->value_size, arg_type & MEM_WRITE ? 
BPF_WRITE : BPF_READ, false, meta); break; @@ -8610,11 +8666,11 @@ skip_type_check: return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { - err = process_spin_lock(env, reg, regno, PROCESS_SPIN_LOCK); + err = process_spin_lock(env, reg, argno_from_reg(regno), PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - err = process_spin_lock(env, reg, regno, 0); + err = process_spin_lock(env, reg, argno_from_reg(regno), 0); if (err) return err; } else { @@ -8623,7 +8679,7 @@ skip_type_check: } break; case ARG_PTR_TO_TIMER: - err = process_timer_helper(env, reg, regno, meta); + err = process_timer_helper(env, reg, argno_from_reg(regno), meta); if (err) return err; break; @@ -8636,7 +8692,7 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, reg, regno, fn->arg_size[arg], + err = check_helper_mem_access(env, reg, argno_from_reg(regno), fn->arg_size[arg], arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); if (err) @@ -8646,19 +8702,21 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, regno - 1, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1), + argno_from_reg(regno), fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, regno - 1, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1), + argno_from_reg(regno), fn->arg_type[arg - 1] & MEM_WRITE ? 
BPF_WRITE : BPF_READ, true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, reg, regno, insn_idx, arg_type, 0); + err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, 0); if (err) return err; break; @@ -8675,7 +8733,7 @@ skip_type_check: break; case ARG_PTR_TO_CONST_STR: { - err = check_reg_const_str(env, reg, regno); + err = check_arg_const_str(env, reg, argno_from_reg(regno)); if (err) return err; break; @@ -9264,13 +9322,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * verifier sees. */ for (i = 0; i < sub->arg_cnt; i++) { + argno_t argno = argno_from_arg(i + 1); u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; struct bpf_subprog_arg_info *arg = &sub->args[i]; if (arg->arg_type == ARG_ANYTHING) { if (reg->type != SCALAR_VALUE) { - bpf_log(log, "R%d is not a scalar\n", regno); + bpf_log(log, "%s is not a scalar\n", reg_arg_name(env, argno)); return -EINVAL; } } else if (arg->arg_type & PTR_UNTRUSTED) { @@ -9280,24 +9339,26 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * invalid memory access. */ } else if (arg->arg_type == ARG_PTR_TO_CTX) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_CTX); + ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_CTX); if (ret < 0) return ret; /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. 
*/ if (reg->type != PTR_TO_CTX) { - bpf_log(log, "arg#%d expects pointer to ctx\n", i); + bpf_log(log, "%s expects pointer to ctx\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); + ret = check_func_arg_reg_off(env, reg, argno, ARG_DONTCARE); if (ret < 0) return ret; - if (check_mem_reg(env, reg, regno, arg->mem_size)) + if (check_mem_reg(env, reg, argno, arg->mem_size)) return -EINVAL; if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) { - bpf_log(log, "arg#%d is expected to be non-NULL\n", i); + bpf_log(log, "%s is expected to be non-NULL\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { @@ -9309,15 +9370,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * run-time debug nightmare. */ if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) { - bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); + bpf_log(log, "%s is not a pointer to arena or scalar.\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR); + ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_DYNPTR); if (ret) return ret; - ret = process_dynptr_func(env, reg, regno, -1, arg->arg_type, 0); + ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, 0); if (ret) return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { @@ -9328,12 +9390,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ - err = check_reg_type(env, reg, regno, arg->arg_type, &arg->btf_id, &meta); - err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); + err = check_reg_type(env, reg, argno, arg->arg_type, &arg->btf_id, &meta); + err = 
err ?: check_func_arg_reg_off(env, reg, argno, arg->arg_type); if (err) return err; } else { - verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type); + verifier_bug(env, "unrecognized %s type %d", + reg_arg_name(env, argno), arg->arg_type); return -EFAULT; } } @@ -10292,7 +10355,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn * is inferred from register state. */ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, insn_idx, regs + meta.regno, meta.regno, i, BPF_B, + err = check_mem_access(env, insn_idx, regs + meta.regno, argno_from_reg(meta.regno), i, BPF_B, BPF_WRITE, -1, false, false); if (err) return err; @@ -11301,7 +11364,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, const struct btf_type *t, const struct btf_type *ref_t, const char *ref_tname, const struct btf_param *args, - int arg, int nargs, struct bpf_reg_state *reg) + int arg, int nargs, argno_t argno, struct bpf_reg_state *reg) { u32 regno = arg + 1; struct bpf_reg_state *regs = cur_regs(env); @@ -11376,8 +11439,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { - verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", - meta->func_name, arg, btf_type_str(ref_t), ref_tname); + verbose(env, "kernel function %s %s pointer type %s %s is not supported\n", + meta->func_name, reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname); return -EINVAL; } return KF_ARG_PTR_TO_BTF_ID; @@ -11393,8 +11457,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, */ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) && (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { - verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", - arg, btf_type_str(ref_t), ref_tname, arg_mem_size ? 
"void, " : ""); + verbose(env, "%s pointer type %s %s must point to %sscalar, or struct with scalar\n", + reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); return -EINVAL; } return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM; @@ -11405,7 +11470,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, const struct btf_type *ref_t, const char *ref_tname, u32 ref_id, struct bpf_kfunc_call_arg_meta *meta, - int arg) + int arg, argno_t argno) { const struct btf_type *reg_ref_t; bool strict_type_match = false; @@ -11463,15 +11528,16 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, */ taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname); if (!taking_projection && !struct_same) { - verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", - meta->func_name, arg, btf_type_str(ref_t), ref_tname, arg + 1, + verbose(env, "kernel function %s %s expected pointer to %s %s but %s has a pointer to %s %s\n", + meta->func_name, reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname, reg_arg_name(env, argno), btf_type_str(reg_ref_t), reg_ref_tname); return -EINVAL; } return 0; } -static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { int err, kfunc_class = IRQ_NATIVE_KFUNC; @@ -11494,11 +11560,13 @@ static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state * if (irq_save) { if (!is_irq_flag_reg_valid_uninit(env, reg)) { - verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1); + verbose(env, "expected uninitialized irq flag as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } - err = check_mem_access(env, env->insn_idx, reg, regno, 0, BPF_DW, BPF_WRITE, -1, false, false); + err = check_mem_access(env, 
env->insn_idx, reg, argno, 0, BPF_DW, + BPF_WRITE, -1, false, false); if (err) return err; @@ -11508,7 +11576,8 @@ static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state * } else { err = is_irq_flag_reg_valid_init(env, reg); if (err) { - verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1); + verbose(env, "expected an initialized irq flag as %s\n", + reg_arg_name(env, argno)); return err; } @@ -11799,7 +11868,7 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, static int __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta, enum btf_field_type head_field_type, struct btf_field **head_field) @@ -11820,8 +11889,8 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, head_type_name = btf_field_type_name(head_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. %s has to be at the constant offset\n", - regno, head_type_name); + "%s doesn't have constant offset. 
%s has to be at the constant offset\n", + reg_arg_name(env, argno), head_type_name); return -EINVAL; } @@ -11849,24 +11918,24 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD, + return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_LIST_HEAD, &meta->arg_list_head.field); } static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT, + return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_RB_ROOT, &meta->arg_rbtree_root.field); } static int __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta, enum btf_field_type head_field_type, enum btf_field_type node_field_type, @@ -11888,8 +11957,8 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, node_type_name = btf_field_type_name(node_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. %s has to be at the constant offset\n", - regno, node_type_name); + "%s doesn't have constant offset. 
%s has to be at the constant offset\n", + reg_arg_name(env, argno), node_type_name); return -EINVAL; } @@ -11930,19 +11999,19 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, } static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta, BPF_LIST_HEAD, BPF_LIST_NODE, &meta->arg_list_head.field); } static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta, BPF_RB_ROOT, BPF_RB_NODE, &meta->arg_rbtree_root.field); } @@ -11994,6 +12063,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[i + 1]; const struct btf_type *t, *ref_t, *resolve_ret; enum bpf_arg_type arg_type = ARG_DONTCARE; + argno_t argno = argno_from_arg(i + 1); u32 regno = i + 1, ref_id, type_size; bool is_ret_buf_sz = false; int kf_arg_type; @@ -12016,7 +12086,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { - verbose(env, "R%d is not a scalar\n", regno); + verbose(env, "%s is not a scalar\n", reg_arg_name(env, argno)); return -EINVAL; } @@ -12026,7 +12096,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d must be a known constant\n", regno); + verbose(env, "%s must be a known constant\n", + reg_arg_name(env, argno)); return -EINVAL; } ret = 
mark_chain_precision(env, regno); @@ -12048,7 +12119,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d is not a const\n", regno); + verbose(env, "%s is not a const\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -12061,20 +12133,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (!btf_type_is_ptr(t)) { - verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t)); + verbose(env, "Unrecognized %s type %s\n", + reg_arg_name(env, argno), btf_type_str(t)); return -EINVAL; } if ((bpf_register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { - verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); + verbose(env, "Possibly NULL pointer passed to trusted %s\n", + reg_arg_name(env, argno)); return -EACCES; } if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { - verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u", - regno, reg->ref_obj_id, + verifier_bug(env, "more than one arg with ref_obj_id %s %u %u", + reg_arg_name(env, argno), reg->ref_obj_id, meta->ref_obj_id); return -EFAULT; } @@ -12086,7 +12160,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs, reg); + kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs, argno, reg); if (kf_arg_type < 0) return kf_arg_type; @@ -12095,7 +12169,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ continue; case KF_ARG_PTR_TO_MAP: if (!reg->map_ptr) { - verbose(env, "pointer in R%d isn't map pointer\n", regno); + verbose(env, "pointer in %s isn't map pointer\n", + reg_arg_name(env, argno)); return -EINVAL; } 
if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || @@ -12133,11 +12208,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_BTF_ID: if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { - verbose(env, "R%d must be referenced or trusted\n", regno); + verbose(env, "%s must be referenced or trusted\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!is_rcu_reg(reg)) { - verbose(env, "R%d must be a rcu pointer\n", regno); + verbose(env, "%s must be a rcu pointer\n", + reg_arg_name(env, argno)); return -EINVAL; } } @@ -12169,15 +12246,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_release(meta) && reg->ref_obj_id) arg_type |= OBJ_RELEASE; - ret = check_func_arg_reg_off(env, reg, regno, arg_type); + ret = check_func_arg_reg_off(env, reg, argno, arg_type); if (ret < 0) return ret; switch (kf_arg_type) { case KF_ARG_PTR_TO_CTX: if (reg->type != PTR_TO_CTX) { - verbose(env, "arg#%d expected pointer to ctx, but got %s\n", - i, reg_type_str(env, reg->type)); + verbose(env, "%s expected pointer to ctx, but got %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EINVAL; } @@ -12191,16 +12268,19 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_ALLOC_BTF_ID: if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { if (!is_bpf_obj_drop_kfunc(meta->func_id)) { - verbose(env, "arg#%d expected for bpf_obj_drop()\n", i); + verbose(env, "%s expected for bpf_obj_drop()\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) { if (!is_bpf_percpu_obj_drop_kfunc(meta->func_id)) { - verbose(env, "arg#%d expected for bpf_percpu_obj_drop()\n", i); + verbose(env, "%s expected for bpf_percpu_obj_drop()\n", + reg_arg_name(env, argno)); return -EINVAL; } } else { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s 
expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!reg->ref_obj_id) { @@ -12248,7 +12328,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } - ret = process_dynptr_func(env, reg, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); + ret = process_dynptr_func(env, reg, argno, insn_idx, + dynptr_arg_type, clone_ref_obj_id); if (ret < 0) return ret; @@ -12273,55 +12354,59 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } } - ret = process_iter_arg(env, reg, regno, insn_idx, meta); + ret = process_iter_arg(env, reg, argno, insn_idx, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + verbose(env, "%s expected pointer to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_list_head(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_ROOT: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + verbose(env, "%s expected pointer to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_rbtree_root(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_NODE: if (reg->type != (PTR_TO_BTF_ID | 
MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!reg->ref_obj_id) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_list_node(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_NODE: if (is_bpf_rbtree_add_kfunc(meta->func_id)) { if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!reg->ref_obj_id) { @@ -12339,7 +12424,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } - ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_rbtree_node(env, reg, argno, meta); if (ret < 0) return ret; break; @@ -12354,24 +12439,26 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if ((base_type(reg->type) != PTR_TO_BTF_ID || (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && !reg2btf_ids[base_type(reg->type)]) { - verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); + verbose(env, "%s is %s ", reg_arg_name(env, argno), + reg_type_str(env, reg->type)); verbose(env, "expected %s or socket\n", reg_type_str(env, base_type(reg->type) | (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); return -EINVAL; } - ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); + ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i, argno); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MEM: resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) { - verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", - 
i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret)); + verbose(env, "%s reference type('%s %s') size cannot be determined: %ld\n", + reg_arg_name(env, argno), btf_type_str(ref_t), + ref_tname, PTR_ERR(resolve_ret)); return -EINVAL; } - ret = check_mem_reg(env, reg, regno, type_size); + ret = check_mem_reg(env, reg, argno, type_size); if (ret < 0) return ret; break; @@ -12381,11 +12468,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ const struct btf_param *buff_arg = &args[i]; struct bpf_reg_state *size_reg = ®s[regno + 1]; const struct btf_param *size_arg = &args[i + 1]; + argno_t next_argno = argno_from_arg(i + 2); if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { - ret = check_kfunc_mem_size_reg(env, buff_reg, size_reg, regno, regno + 1); + ret = check_kfunc_mem_size_reg(env, buff_reg, size_reg, + argno, next_argno); if (ret < 0) { - verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); + verbose(env, "%s and ", reg_arg_name(env, argno)); + verbose(env, "%s memory, len pair leads to invalid memory access\n", + reg_arg_name(env, next_argno)); return ret; } } @@ -12396,7 +12487,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } if (!tnum_is_const(size_reg->var_off)) { - verbose(env, "R%d must be a known constant\n", regno + 1); + verbose(env, "%s must be a known constant\n", + reg_arg_name(env, next_argno)); return -EINVAL; } meta->arg_constant.found = true; @@ -12409,14 +12501,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } case KF_ARG_PTR_TO_CALLBACK: if (reg->type != PTR_TO_FUNC) { - verbose(env, "arg%d expected pointer to func\n", i); + verbose(env, "%s expected pointer to func\n", reg_arg_name(env, argno)); return -EINVAL; } meta->subprogno = reg->subprogno; break; case KF_ARG_PTR_TO_REFCOUNTED_KPTR: if (!type_is_ptr_alloc_obj(reg->type)) { - 
verbose(env, "arg#%d is neither owning or non-owning ref\n", i); + verbose(env, "%s is neither owning or non-owning ref\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!type_is_non_owning_ref(reg->type)) @@ -12429,7 +12522,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (rec->refcount_off < 0) { - verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i); + verbose(env, "%s doesn't point to a type with bpf_refcount field\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -12438,46 +12532,51 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_CONST_STR: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a const string\n", i); + verbose(env, "%s doesn't point to a const string\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = check_reg_const_str(env, reg, regno); + ret = check_arg_const_str(env, reg, argno); if (ret) return ret; break; case KF_ARG_PTR_TO_WORKQUEUE: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = check_map_field_pointer(env, reg, regno, BPF_WORKQUEUE, &meta->map); + ret = check_map_field_pointer(env, reg, argno, BPF_WORKQUEUE, &meta->map); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_TIMER: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = process_timer_kfunc(env, reg, regno, meta); + ret = process_timer_kfunc(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_TASK_WORK: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return 
-EINVAL; } - ret = check_map_field_pointer(env, reg, regno, BPF_TASK_WORK, &meta->map); + ret = check_map_field_pointer(env, reg, argno, BPF_TASK_WORK, &meta->map); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { - verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); + verbose(env, "%s doesn't point to an irq flag on stack\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = process_irq_flag(env, reg, regno, meta); + ret = process_irq_flag(env, reg, argno, meta); if (ret < 0) return ret; break; @@ -12486,7 +12585,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int flags = PROCESS_RES_LOCK; if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); + verbose(env, "%s doesn't point to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -12498,7 +12598,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) flags |= PROCESS_LOCK_IRQ; - ret = process_spin_lock(env, reg, regno, flags); + ret = process_spin_lock(env, reg, argno, flags); if (ret < 0) return ret; break; @@ -13649,7 +13749,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst_reg, dst, 0, 1, false, ACCESS_HELPER)) { + if (check_map_access(env, dst_reg, argno_from_reg(dst), 0, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -16831,7 +16931,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char prog->aux->attach_func_proto->type, NULL); if (ret_type && ret_type == reg_type && 
reg->ref_obj_id) - return __check_ptr_off_reg(env, reg, regno, false); + return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false); } /* eBPF calling convention is such that R0 is used @@ -17535,7 +17635,7 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) dst_reg_type = cur_regs(env)[insn->dst_reg].type; - err = check_mem_access(env, env->insn_idx, cur_regs(env) + insn->dst_reg, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, cur_regs(env) + insn->dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, false, false); if (err) @@ -18714,7 +18814,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_unknown(env, regs, i); } else { verifier_bug(env, "unhandled arg#%d type %d", - i - BPF_REG_1, arg->arg_type); + i - BPF_REG_1 + 1, arg->arg_type); ret = -EFAULT; goto out; } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index 215878ea04de..b33dba4b126e 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -11,18 +11,18 @@ struct { const char *prog_name; const char *err_msg; } test_bpf_nf_fail_tests[] = { - { "alloc_release", "kernel function bpf_ct_release args#0 expected pointer to STRUCT nf_conn but" }, - { "insert_insert", "kernel function bpf_ct_insert_entry args#0 expected pointer to STRUCT nf_conn___init but" }, - { "lookup_insert", "kernel function bpf_ct_insert_entry args#0 expected pointer to STRUCT nf_conn___init but" }, - { "set_timeout_after_insert", "kernel function bpf_ct_set_timeout args#0 expected pointer to STRUCT nf_conn___init but" }, - { "set_status_after_insert", "kernel function bpf_ct_set_status args#0 expected pointer to STRUCT nf_conn___init but" }, - { "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" }, - { "change_status_after_alloc", 
"kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" }, + { "alloc_release", "kernel function bpf_ct_release R1 expected pointer to STRUCT nf_conn but" }, + { "insert_insert", "kernel function bpf_ct_insert_entry R1 expected pointer to STRUCT nf_conn___init but" }, + { "lookup_insert", "kernel function bpf_ct_insert_entry R1 expected pointer to STRUCT nf_conn___init but" }, + { "set_timeout_after_insert", "kernel function bpf_ct_set_timeout R1 expected pointer to STRUCT nf_conn___init but" }, + { "set_status_after_insert", "kernel function bpf_ct_set_status R1 expected pointer to STRUCT nf_conn___init but" }, + { "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout R1 expected pointer to STRUCT nf_conn but" }, + { "change_status_after_alloc", "kernel function bpf_ct_change_status R1 expected pointer to STRUCT nf_conn but" }, { "write_not_allowlisted_field", "no write support to nf_conn at off" }, - { "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, - { "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, - { "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, - { "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, + { "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted R2" }, + { "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted R4" }, + { "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted R2" }, + { "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted R4" }, }; enum { diff --git a/tools/testing/selftests/bpf/prog_tests/cb_refs.c b/tools/testing/selftests/bpf/prog_tests/cb_refs.c index c40df623a8f7..6300b67a3a84 100644 --- a/tools/testing/selftests/bpf/prog_tests/cb_refs.c +++ b/tools/testing/selftests/bpf/prog_tests/cb_refs.c @@ -12,7 +12,7 @@ struct { const char *err_msg; } cb_refs_tests[] = { { "underflow_prog", "must point to scalar, or struct with 
scalar" }, - { "leak_prog", "Possibly NULL pointer passed to helper arg2" }, + { "leak_prog", "Possibly NULL pointer passed to helper R2" }, { "nested_cb", "Unreleased reference id=4 alloc_insn=2" }, /* alloc_insn=2{4,5} */ { "non_cb_transfer_ref", "Unreleased reference id=4 alloc_insn=1" }, /* alloc_insn=1{1,2} */ }; diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index 62f3fb79f5d1..3df07680f9e0 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -68,7 +68,7 @@ static struct kfunc_test_params kfunc_tests[] = { TC_FAIL(kfunc_call_test_get_mem_fail_oob, 0, "min value is outside of the allowed memory range"), TC_FAIL(kfunc_call_test_get_mem_fail_not_const, 0, "is not a const"), TC_FAIL(kfunc_call_test_mem_acquire_fail, 0, "acquire kernel function does not return PTR_TO_BTF_ID"), - TC_FAIL(kfunc_call_test_pointer_arg_type_mismatch, 0, "arg#0 expected pointer to ctx, but got scalar"), + TC_FAIL(kfunc_call_test_pointer_arg_type_mismatch, 0, "R1 expected pointer to ctx, but got scalar"), /* success cases */ TC_TEST(kfunc_call_test1, 12), diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index 6f25b5f39a79..dbff099860ba 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -81,8 +81,8 @@ static struct { { "direct_write_node", "direct access to bpf_list_node is disallowed" }, { "use_after_unlock_push_front", "invalid mem access 'scalar'" }, { "use_after_unlock_push_back", "invalid mem access 'scalar'" }, - { "double_push_front", "arg#1 expected pointer to allocated object" }, - { "double_push_back", "arg#1 expected pointer to allocated object" }, + { "double_push_front", "R2 expected pointer to allocated object" }, + { "double_push_back", "R2 expected pointer to allocated object" }, { 
"no_node_value_type", "bpf_list_node not found at offset=0" }, { "incorrect_value_type", "operation on bpf_list_head expects arg#1 bpf_list_node at offset=48 in struct foo, " diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c index 9fe9c4a4e8f6..a875ba8e5007 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c @@ -29,7 +29,7 @@ static struct __cgrps_kfunc_map_value *insert_lookup_cgrp(struct cgroup *cgrp) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path) { struct cgroup *acquired; @@ -48,7 +48,7 @@ int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_acquire_no_null_check, struct cgroup *cgrp, const char *path) { struct cgroup *acquired; @@ -64,7 +64,7 @@ int BPF_PROG(cgrp_kfunc_acquire_no_null_check, struct cgroup *cgrp, const char * } SEC("tp_btf/cgroup_mkdir") -__failure __msg("arg#0 pointer type STRUCT cgroup must point") +__failure __msg("R1 pointer type STRUCT cgroup must point") int BPF_PROG(cgrp_kfunc_acquire_fp, struct cgroup *cgrp, const char *path) { struct cgroup *acquired, *stack_cgrp = (struct cgroup *)&path; @@ -106,7 +106,7 @@ int BPF_PROG(cgrp_kfunc_acquire_trusted_walked, struct cgroup *cgrp, const char } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_acquire_null, struct cgroup *cgrp, const char *path) { struct cgroup *acquired; @@ -175,7 +175,7 @@ int 
BPF_PROG(cgrp_kfunc_rcu_get_release, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path) { struct __cgrps_kfunc_map_value *v; @@ -191,7 +191,7 @@ int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path } SEC("tp_btf/cgroup_mkdir") -__failure __msg("arg#0 pointer type STRUCT cgroup must point") +__failure __msg("R1 pointer type STRUCT cgroup must point") int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path) { struct cgroup *acquired = (struct cgroup *)&path; @@ -203,7 +203,7 @@ int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path) { struct __cgrps_kfunc_map_value local, *v; diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index 61c32e91e8c3..4c45346fe6f7 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -45,7 +45,7 @@ int BPF_PROG(test_alloc_no_release, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__failure __msg("NULL pointer passed to trusted arg0") +__failure __msg("NULL pointer passed to trusted R1") int BPF_PROG(test_alloc_double_release, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *cpumask; @@ -73,7 +73,7 @@ int BPF_PROG(test_acquire_wrong_cpumask, struct task_struct *task, u64 clone_fla } SEC("tp_btf/task_newtask") -__failure __msg("bpf_cpumask_set_cpu args#1 expected pointer to STRUCT bpf_cpumask") +__failure __msg("bpf_cpumask_set_cpu R2 expected 
pointer to STRUCT bpf_cpumask") int BPF_PROG(test_mutate_cpumask, struct task_struct *task, u64 clone_flags) { /* Can't set the CPU of a non-struct bpf_cpumask. */ @@ -107,7 +107,7 @@ int BPF_PROG(test_insert_remove_no_release, struct task_struct *task, u64 clone_ } SEC("tp_btf/task_newtask") -__failure __msg("NULL pointer passed to trusted arg0") +__failure __msg("NULL pointer passed to trusted R1") int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags) { /* NULL passed to kfunc. */ @@ -151,7 +151,7 @@ int BPF_PROG(test_global_mask_out_of_rcu, struct task_struct *task, u64 clone_fl } SEC("tp_btf/task_newtask") -__failure __msg("NULL pointer passed to trusted arg1") +__failure __msg("NULL pointer passed to trusted R2") int BPF_PROG(test_global_mask_no_null_check, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *local, *prev; @@ -179,7 +179,7 @@ int BPF_PROG(test_global_mask_no_null_check, struct task_struct *task, u64 clone } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to helper arg2") +__failure __msg("Possibly NULL pointer passed to helper R2") int BPF_PROG(test_global_mask_rcu_no_null_check, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *prev, *curr; diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index b62773ce5219..dbd97add5a5a 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -149,7 +149,7 @@ int ringbuf_release_uninit_dynptr(void *ctx) /* A dynptr can't be used after it has been invalidated */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as R3") int use_after_invalid(void *ctx) { struct bpf_dynptr ptr; @@ -448,7 +448,7 @@ int invalid_helper2(void *ctx) /* A bpf_dynptr is invalidated if it's been written into */ SEC("?raw_tp") -__failure __msg("Expected an initialized 
dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int invalid_write1(void *ctx) { struct bpf_dynptr ptr; @@ -1642,7 +1642,7 @@ int invalid_slice_rdwr_rdonly(struct __sk_buff *skb) /* bpf_dynptr_adjust can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_adjust_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1655,7 +1655,7 @@ int dynptr_adjust_invalid(void *ctx) /* bpf_dynptr_is_null can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_is_null_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1668,7 +1668,7 @@ int dynptr_is_null_invalid(void *ctx) /* bpf_dynptr_is_rdonly can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_is_rdonly_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1681,7 +1681,7 @@ int dynptr_is_rdonly_invalid(void *ctx) /* bpf_dynptr_size can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_size_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1694,7 +1694,7 @@ int dynptr_size_invalid(void *ctx) /* Only initialized dynptrs can be cloned */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int clone_invalid1(void *ctx) { struct bpf_dynptr ptr1 = {}; @@ -1728,7 +1728,7 @@ int clone_invalid2(struct xdp_md *xdp) /* Invalidating a dynptr should invalidate its clones */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as 
R3") int clone_invalidate1(void *ctx) { struct bpf_dynptr clone; @@ -1749,7 +1749,7 @@ int clone_invalidate1(void *ctx) /* Invalidating a dynptr should invalidate its parent */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as R3") int clone_invalidate2(void *ctx) { struct bpf_dynptr ptr; @@ -1770,7 +1770,7 @@ int clone_invalidate2(void *ctx) /* Invalidating a dynptr should invalidate its siblings */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as R3") int clone_invalidate3(void *ctx) { struct bpf_dynptr ptr; @@ -1981,7 +1981,7 @@ __noinline long global_call_bpf_dynptr(const struct bpf_dynptr *dynptr) } SEC("?raw_tp") -__failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") +__failure __msg("R1 expected pointer to stack or const struct bpf_dynptr") int test_dynptr_reg_type(void *ctx) { struct task_struct *current = NULL; diff --git a/tools/testing/selftests/bpf/progs/file_reader_fail.c b/tools/testing/selftests/bpf/progs/file_reader_fail.c index 32fe28ed2439..0739620dea8a 100644 --- a/tools/testing/selftests/bpf/progs/file_reader_fail.c +++ b/tools/testing/selftests/bpf/progs/file_reader_fail.c @@ -30,7 +30,7 @@ int on_nanosleep_unreleased_ref(void *ctx) SEC("xdp") __failure -__msg("Expected a dynptr of type file as arg #0") +__msg("Expected a dynptr of type file as R1") int xdp_wrong_dynptr_type(struct xdp_md *xdp) { struct bpf_dynptr dynptr; @@ -42,7 +42,7 @@ int xdp_wrong_dynptr_type(struct xdp_md *xdp) SEC("xdp") __failure -__msg("Expected an initialized dynptr as arg #0") +__msg("Expected an initialized dynptr as R1") int xdp_no_dynptr_type(struct xdp_md *xdp) { struct bpf_dynptr dynptr; diff --git a/tools/testing/selftests/bpf/progs/irq.c b/tools/testing/selftests/bpf/progs/irq.c index e11e82d98904..a4a007866a33 100644 --- a/tools/testing/selftests/bpf/progs/irq.c +++ 
b/tools/testing/selftests/bpf/progs/irq.c @@ -15,7 +15,7 @@ struct bpf_res_spin_lock lockA __hidden SEC(".data.A"); struct bpf_res_spin_lock lockB __hidden SEC(".data.B"); SEC("?tc") -__failure __msg("arg#0 doesn't point to an irq flag on stack") +__failure __msg("R1 doesn't point to an irq flag on stack") int irq_save_bad_arg(struct __sk_buff *ctx) { bpf_local_irq_save(&global_flags); @@ -23,7 +23,7 @@ int irq_save_bad_arg(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("arg#0 doesn't point to an irq flag on stack") +__failure __msg("R1 doesn't point to an irq flag on stack") int irq_restore_bad_arg(struct __sk_buff *ctx) { bpf_local_irq_restore(&global_flags); diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 86b74e3579d9..0fa70b133d93 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1605,7 +1605,7 @@ int iter_subprog_check_stacksafe(const void *ctx) struct bpf_iter_num global_it; SEC("raw_tp") -__failure __msg("arg#0 expected pointer to an iterator on stack") +__failure __msg("R1 expected pointer to an iterator on stack") int iter_new_bad_arg(const void *ctx) { bpf_iter_num_new(&global_it, 0, 1); @@ -1613,7 +1613,7 @@ int iter_new_bad_arg(const void *ctx) } SEC("raw_tp") -__failure __msg("arg#0 expected pointer to an iterator on stack") +__failure __msg("R1 expected pointer to an iterator on stack") int iter_next_bad_arg(const void *ctx) { bpf_iter_num_next(&global_it); @@ -1621,7 +1621,7 @@ int iter_next_bad_arg(const void *ctx) } SEC("raw_tp") -__failure __msg("arg#0 expected pointer to an iterator on stack") +__failure __msg("R1 expected pointer to an iterator on stack") int iter_destroy_bad_arg(const void *ctx) { bpf_iter_num_destroy(&global_it); diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c index d273b46dfc7c..af8f9ec1ea98 100644 --- 
a/tools/testing/selftests/bpf/progs/iters_state_safety.c +++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c @@ -73,7 +73,7 @@ int create_and_forget_to_destroy_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int destroy_without_creating_fail(void *ctx) { /* init with zeros to stop verifier complaining about uninit stack */ @@ -91,7 +91,7 @@ int destroy_without_creating_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int compromise_iter_w_direct_write_fail(void *ctx) { struct bpf_iter_num iter; @@ -143,7 +143,7 @@ int compromise_iter_w_direct_write_and_skip_destroy_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int compromise_iter_w_helper_write_fail(void *ctx) { struct bpf_iter_num iter; @@ -230,7 +230,7 @@ int valid_stack_reuse(void *ctx) } SEC("?raw_tp") -__failure __msg("expected uninitialized iter_num as arg #0") +__failure __msg("expected uninitialized iter_num as R1") int double_create_fail(void *ctx) { struct bpf_iter_num iter; @@ -258,7 +258,7 @@ int double_create_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int double_destroy_fail(void *ctx) { struct bpf_iter_num iter; @@ -284,7 +284,7 @@ int double_destroy_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int next_without_new_fail(void *ctx) { struct bpf_iter_num iter; @@ -305,7 +305,7 @@ int next_without_new_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int 
next_after_destroy_fail(void *ctx) { struct bpf_iter_num iter; diff --git a/tools/testing/selftests/bpf/progs/iters_testmod.c b/tools/testing/selftests/bpf/progs/iters_testmod.c index 5379e9960ffd..76012dbbdb41 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod.c @@ -29,7 +29,7 @@ out: } SEC("raw_tp/sys_enter") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int iter_next_trusted_or_null(const void *ctx) { struct task_struct *cur_task = bpf_get_current_task_btf(); @@ -67,7 +67,7 @@ out: } SEC("raw_tp/sys_enter") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int iter_next_rcu_or_null(const void *ctx) { struct task_struct *cur_task = bpf_get_current_task_btf(); diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 83791348bed5..9b760dac333e 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -79,7 +79,7 @@ int testmod_seq_truncated(const void *ctx) SEC("?raw_tp") __failure -__msg("expected an initialized iter_testmod_seq as arg #1") +__msg("expected an initialized iter_testmod_seq as R2") int testmod_seq_getter_before_bad(const void *ctx) { struct bpf_iter_testmod_seq it; @@ -89,7 +89,7 @@ int testmod_seq_getter_before_bad(const void *ctx) SEC("?raw_tp") __failure -__msg("expected an initialized iter_testmod_seq as arg #1") +__msg("expected an initialized iter_testmod_seq as R2") int testmod_seq_getter_after_bad(const void *ctx) { struct bpf_iter_testmod_seq it; diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index ee053b24e6ca..8f36e74fd8f9 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ 
b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -364,7 +364,7 @@ int kptr_xchg_ref_state(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to helper arg2") +__failure __msg("Possibly NULL pointer passed to helper R2") int kptr_xchg_possibly_null(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; diff --git a/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c b/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c index 81813c724fa9..08379c3b6a03 100644 --- a/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c +++ b/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c @@ -110,7 +110,7 @@ int BPF_PROG(test_array_map_3) } SEC("?fentry.s/bpf_fentry_test1") -__failure __msg("arg#0 expected for bpf_percpu_obj_drop()") +__failure __msg("R1 expected for bpf_percpu_obj_drop()") int BPF_PROG(test_array_map_4) { struct val_t __percpu_kptr *p; @@ -124,7 +124,7 @@ int BPF_PROG(test_array_map_4) } SEC("?fentry.s/bpf_fentry_test1") -__failure __msg("arg#0 expected for bpf_obj_drop()") +__failure __msg("R1 expected for bpf_obj_drop()") int BPF_PROG(test_array_map_5) { struct val_t *p; diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index 70b7baf9304b..555379952dcc 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -134,7 +134,7 @@ unlock_err: } SEC("?tc") -__failure __msg("arg#1 expected pointer to allocated object") +__failure __msg("R2 expected pointer to allocated object") long rbtree_api_add_to_multiple_trees(void *ctx) { struct node_data *n; @@ -153,7 +153,7 @@ long rbtree_api_add_to_multiple_trees(void *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure __msg("Possibly NULL pointer passed to trusted R2") long rbtree_api_use_unchecked_remove_retval(void *ctx) { struct bpf_rb_node *res; @@ -281,7 +281,7 @@ long add_with_cb(bool (cb)(struct bpf_rb_node *a, 
const struct bpf_rb_node *b)) } SEC("?tc") -__failure __msg("arg#1 expected pointer to allocated object") +__failure __msg("R2 expected pointer to allocated object") long rbtree_api_add_bad_cb_bad_fn_call_add(void *ctx) { return add_with_cb(less__bad_fn_call_add); diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c index b2808bfcec29..7247a20c0a3b 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c @@ -54,7 +54,7 @@ long rbtree_refcounted_node_ref_escapes(void *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") long refcount_acquire_maybe_null(void *ctx) { struct node_acquire *n, *m; diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index 8e8249f3521c..21428bb1ee59 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -23,7 +23,7 @@ int stream_vprintk_scalar_arg(void *ctx) } SEC("syscall") -__failure __msg("arg#1 doesn't point to a const string") +__failure __msg("R2 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0); diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index 4c07ea193f72..41047d81ec42 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -28,7 +28,7 @@ static struct __tasks_kfunc_map_value *insert_lookup_task(struct task_struct *ta } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 
clone_flags) { struct task_struct *acquired; @@ -49,7 +49,7 @@ int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_f } SEC("tp_btf/task_newtask") -__failure __msg("arg#0 pointer type STRUCT task_struct must point") +__failure __msg("R1 pointer type STRUCT task_struct must point") int BPF_PROG(task_kfunc_acquire_fp, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired, *stack_task = (struct task_struct *)&clone_flags; @@ -100,7 +100,7 @@ int BPF_PROG(task_kfunc_acquire_unsafe_kretprobe_rcu, struct task_struct *task, } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_acquire_null, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -149,7 +149,7 @@ int BPF_PROG(task_kfunc_xchg_unreleased, struct task_struct *task, u64 clone_fla } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -162,7 +162,7 @@ int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task, } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_flags) { struct __tasks_kfunc_map_value *v; @@ -178,7 +178,7 @@ int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_f } SEC("tp_btf/task_newtask") -__failure __msg("arg#0 pointer type STRUCT task_struct must point") +__failure __msg("R1 pointer type STRUCT task_struct must point") int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired = (struct 
task_struct *)&clone_flags; @@ -190,7 +190,7 @@ int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags) { struct __tasks_kfunc_map_value local, *v; @@ -234,7 +234,7 @@ int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_ } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -248,7 +248,7 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_from_vpid_no_null_check, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 82e4b8913333..3186e7b4b24e 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -58,7 +58,7 @@ int mismatch_map(struct pt_regs *args) } SEC("perf_event") -__failure __msg("arg#1 doesn't point to a map value") +__failure __msg("R2 doesn't point to a map value") int no_map_task_work(struct pt_regs *args) { struct task_struct *task; @@ -70,7 +70,7 @@ int no_map_task_work(struct pt_regs *args) } SEC("perf_event") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure __msg("Possibly NULL pointer passed to trusted R2") int task_work_null(struct pt_regs *args) { struct task_struct *task; @@ -81,7 +81,7 @@ int 
task_work_null(struct pt_regs *args) } SEC("perf_event") -__failure __msg("Possibly NULL pointer passed to trusted arg2") +__failure __msg("Possibly NULL pointer passed to trusted R3") int map_null(struct pt_regs *args) { struct elem *work; diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c index 2c156cd166af..332cda89caba 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c @@ -152,7 +152,7 @@ int change_status_after_alloc(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure __msg("Possibly NULL pointer passed to trusted R2") int lookup_null_bpf_tuple(struct __sk_buff *ctx) { struct bpf_ct_opts___local opts = {}; @@ -165,7 +165,7 @@ int lookup_null_bpf_tuple(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg3") +__failure __msg("Possibly NULL pointer passed to trusted R4") int lookup_null_bpf_opts(struct __sk_buff *ctx) { struct bpf_sock_tuple tup = {}; @@ -178,7 +178,7 @@ int lookup_null_bpf_opts(struct __sk_buff *ctx) } SEC("?xdp") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure __msg("Possibly NULL pointer passed to trusted R2") int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx) { struct bpf_ct_opts___local opts = {}; @@ -191,7 +191,7 @@ int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx) } SEC("?xdp") -__failure __msg("Possibly NULL pointer passed to trusted arg3") +__failure __msg("Possibly NULL pointer passed to trusted R4") int xdp_lookup_null_bpf_opts(struct xdp_md *ctx) { struct bpf_sock_tuple tup = {}; diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index 1c6cfd0888ba..bf48fc43c7ab 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ 
b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -40,7 +40,7 @@ int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size, } SEC("?lsm.s/bpf") -__failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") +__failure __msg("R1 expected pointer to stack or const struct bpf_dynptr") int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { static struct bpf_dynptr val; diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c index 967081bbcfe1..ca35b92ea095 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c @@ -29,7 +29,7 @@ int kfunc_dynptr_nullable_test2(struct __sk_buff *skb) } SEC("tc") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int kfunc_dynptr_nullable_test3(struct __sk_buff *skb) { struct bpf_dynptr data; diff --git a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c index 8bcddadfc4da..dd97f2027505 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c +++ b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c @@ -32,7 +32,7 @@ int BPF_PROG(no_destroy, struct bpf_iter_meta *meta, struct cgroup *cgrp) SEC("iter/cgroup") __description("uninitialized iter in ->next()") -__failure __msg("expected an initialized iter_bits as arg #0") +__failure __msg("expected an initialized iter_bits as R1") int BPF_PROG(next_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) { struct bpf_iter_bits it = {}; @@ -43,7 +43,7 @@ int BPF_PROG(next_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) SEC("iter/cgroup") __description("uninitialized iter in ->destroy()") -__failure __msg("expected an initialized iter_bits as arg #0") +__failure 
__msg("expected an initialized iter_bits as R1") int BPF_PROG(destroy_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) { struct bpf_iter_bits it = {}; diff --git a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c index 910365201f68..139f70bb3595 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c +++ b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c @@ -263,7 +263,7 @@ l0_%=: r0 = 0; \ SEC("lsm.s/bpf") __description("reference tracking: release user key reference without check") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") __naked void user_key_reference_without_check(void) { asm volatile (" \ @@ -282,7 +282,7 @@ __naked void user_key_reference_without_check(void) SEC("lsm.s/bpf") __description("reference tracking: release system key reference without check") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") __naked void system_key_reference_without_check(void) { asm volatile (" \ @@ -300,7 +300,7 @@ __naked void system_key_reference_without_check(void) SEC("lsm.s/bpf") __description("reference tracking: release with NULL key pointer") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") __naked void release_with_null_key_pointer(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c index 4b392c6c8fc4..0990de076844 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c @@ -13,7 +13,7 @@ static char buf[PATH_MAX]; SEC("lsm.s/file_open") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted 
R1") int BPF_PROG(get_task_exe_file_kfunc_null) { struct file *acquired; @@ -28,7 +28,7 @@ int BPF_PROG(get_task_exe_file_kfunc_null) } SEC("lsm.s/inode_getxattr") -__failure __msg("arg#0 pointer type STRUCT task_struct must point to scalar, or struct with scalar") +__failure __msg("R1 pointer type STRUCT task_struct must point to scalar, or struct with scalar") int BPF_PROG(get_task_exe_file_kfunc_fp) { u64 x; @@ -89,7 +89,7 @@ int BPF_PROG(put_file_kfunc_unacquired, struct file *file) } SEC("lsm.s/file_open") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(path_d_path_kfunc_null) { /* Can't pass NULL value to bpf_path_d_path() kfunc. */ @@ -128,7 +128,7 @@ int BPF_PROG(path_d_path_kfunc_untrusted_from_current) } SEC("lsm.s/file_open") -__failure __msg("kernel function bpf_path_d_path args#0 expected pointer to STRUCT path but R1 has a pointer to STRUCT file") +__failure __msg("kernel function bpf_path_d_path R1 expected pointer to STRUCT path but R1 has a pointer to STRUCT file") int BPF_PROG(path_d_path_kfunc_type_mismatch, struct file *file) { bpf_path_d_path((struct path *)&file->f_task_work, buf, sizeof(buf)); diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index 3767f5595bbc..32dc8827e128 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -98,7 +98,7 @@ __failure * is a correct bpf_wq pointer. 
*/ __msg(": (85) call bpf_wq_set_callback#") /* anchor message */ -__msg("arg#0 doesn't point to a map value") +__msg("R1 doesn't point to a map value") long test_wrong_wq_pointer(void *ctx) { int key = 0; diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index c3164b9b2be5..0bb4337552c8 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -31,7 +31,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 pointer type STRUCT prog_test_fail1 must point to scalar", + .errstr = "R1 pointer type STRUCT prog_test_fail1 must point to scalar", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_fail1", 2 }, }, @@ -46,7 +46,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "max struct nesting depth exceeded\narg#0 pointer type STRUCT prog_test_fail2", + .errstr = "max struct nesting depth exceeded\nR1 pointer type STRUCT prog_test_fail2", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_fail2", 2 }, }, @@ -61,7 +61,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 pointer type STRUCT prog_test_fail3 must point to scalar", + .errstr = "R1 pointer type STRUCT prog_test_fail3 must point to scalar", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_fail3", 2 }, }, @@ -76,7 +76,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 expected pointer to ctx, but got fp", + .errstr = "R1 expected pointer to ctx, but got fp", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_pass_ctx", 2 }, }, @@ -91,7 +91,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 pointer type UNKNOWN must point to scalar", + .errstr = "R1 pointer type UNKNOWN must point to scalar", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_mem_len_fail1", 2 }, }, @@ -109,7 +109,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "Possibly NULL pointer 
passed to trusted arg0", + .errstr = "Possibly NULL pointer passed to trusted R1", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_acquire", 3 }, { "bpf_kfunc_call_test_release", 5 }, @@ -152,7 +152,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "kernel function bpf_kfunc_call_memb1_release args#0 expected pointer", + .errstr = "kernel function bpf_kfunc_call_memb1_release R1 expected pointer", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_memb_acquire", 1 }, { "bpf_kfunc_call_memb1_release", 5 }, -- cgit v1.2.3 From 246ad6e5ee259669692bdb7fb353e8c5d5bba628 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:35:06 -0700 Subject: bpf: Introduce bpf register BPF_REG_PARAMS Introduce BPF_REG_PARAMS as a dedicated BPF register for stack argument accesses. It occupies the BPF register number 11 (R11), which is used as the base pointer for the stack argument area, keeping it separate from the R10-based (BPF_REG_FP) program stack. The kernel-internal hidden register BPF_REG_AX previously occupied slot 11 (MAX_BPF_REG). With BPF_REG_PARAMS taking that slot, BPF_REG_AX moves to slot 12 and MAX_BPF_EXT_REG increases accordingly. 
Acked-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033506.2542005-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 5 +- kernel/bpf/core.c | 4 +- .../testing/selftests/bpf/prog_tests/ctx_rewrite.c | 14 ++--- .../selftests/bpf/progs/verifier_bpf_fastcall.c | 24 ++++---- .../selftests/bpf/progs/verifier_may_goto_1.c | 12 ++-- tools/testing/selftests/bpf/progs/verifier_sdiv.c | 64 +++++++++++----------- 6 files changed, 62 insertions(+), 61 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 1ec6d5ba64cc..b77d0b06db6e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -58,8 +58,9 @@ struct ctl_table_header; #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register. */ -#define BPF_REG_AX MAX_BPF_REG -#define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) +#define BPF_REG_PARAMS MAX_BPF_REG +#define BPF_REG_AX (MAX_BPF_REG + 1) +#define MAX_BPF_EXT_REG (MAX_BPF_REG + 2) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b018ff48875..ae10b9ca018d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1299,8 +1299,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, u32 imm_rnd = get_random_u32(); s16 off; - BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); - BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(BPF_REG_PARAMS + 2 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); /* Constraints on AX register: * diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c index 5064aeb8fe67..2c3124092b73 100644 --- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c +++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c @@ -69,19 +69,19 @@ static struct test_case 
test_cases[] = { #if defined(__x86_64__) || defined(__aarch64__) { N(SCHED_CLS, struct __sk_buff, tstamp), - .read = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" - "if w11 & 0x4 goto pc+1;" + .read = "r12 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" + "if w12 & 0x4 goto pc+1;" "goto pc+4;" - "if w11 & 0x3 goto pc+1;" + "if w12 & 0x3 goto pc+1;" "goto pc+2;" "$dst = 0;" "goto pc+1;" "$dst = *(u64 *)($ctx + sk_buff::tstamp);", - .write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" - "if w11 & 0x4 goto pc+1;" + .write = "r12 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" + "if w12 & 0x4 goto pc+1;" "goto pc+2;" - "w11 &= -4;" - "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;" + "w12 &= -4;" + "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r12;" "*(u64 *)($ctx + sk_buff::tstamp) = $src;", }, #endif diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index fb4fa465d67c..0d9e167555b5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -630,13 +630,13 @@ __xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") __xlated("...") /* may_goto expansion starts */ -__xlated("6: r11 = *(u64 *)(r10 -24)") -__xlated("7: if r11 == 0x0 goto pc+6") -__xlated("8: r11 -= 1") -__xlated("9: if r11 != 0x0 goto pc+2") -__xlated("10: r11 = -24") +__xlated("6: r12 = *(u64 *)(r10 -24)") +__xlated("7: if r12 == 0x0 goto pc+6") +__xlated("8: r12 -= 1") +__xlated("9: if r12 != 0x0 goto pc+2") +__xlated("10: r12 = -24") __xlated("11: call unknown") -__xlated("12: *(u64 *)(r10 -24) = r11") +__xlated("12: *(u64 *)(r10 -24) = r12") /* may_goto expansion ends */ __xlated("13: *(u64 *)(r10 -8) = r1") __xlated("14: exit") @@ -668,13 +668,13 @@ __xlated("1: *(u64 *)(r10 -16) =") __xlated("2: r1 = 1") __xlated("3: call bpf_get_smp_processor_id") /* may_goto expansion starts */ -__xlated("4: r11 = *(u64 *)(r10 -24)") 
-__xlated("5: if r11 == 0x0 goto pc+6") -__xlated("6: r11 -= 1") -__xlated("7: if r11 != 0x0 goto pc+2") -__xlated("8: r11 = -24") +__xlated("4: r12 = *(u64 *)(r10 -24)") +__xlated("5: if r12 == 0x0 goto pc+6") +__xlated("6: r12 -= 1") +__xlated("7: if r12 != 0x0 goto pc+2") +__xlated("8: r12 = -24") __xlated("9: call unknown") -__xlated("10: *(u64 *)(r10 -24) = r11") +__xlated("10: *(u64 *)(r10 -24) = r12") /* may_goto expansion ends */ __xlated("11: *(u64 *)(r10 -8) = r1") __xlated("12: exit") diff --git a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c index 6d1edaef9213..4bdf4256a41e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c +++ b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c @@ -81,13 +81,13 @@ __arch_s390x __arch_arm64 __xlated("0: *(u64 *)(r10 -16) = 65535") __xlated("1: *(u64 *)(r10 -8) = 0") -__xlated("2: r11 = *(u64 *)(r10 -16)") -__xlated("3: if r11 == 0x0 goto pc+6") -__xlated("4: r11 -= 1") -__xlated("5: if r11 != 0x0 goto pc+2") -__xlated("6: r11 = -16") +__xlated("2: r12 = *(u64 *)(r10 -16)") +__xlated("3: if r12 == 0x0 goto pc+6") +__xlated("4: r12 -= 1") +__xlated("5: if r12 != 0x0 goto pc+2") +__xlated("6: r12 = -16") __xlated("7: call unknown") -__xlated("8: *(u64 *)(r10 -16) = r11") +__xlated("8: *(u64 *)(r10 -16) = r12") __xlated("9: r0 = 1") __xlated("10: r0 = 2") __xlated("11: exit") diff --git a/tools/testing/selftests/bpf/progs/verifier_sdiv.c b/tools/testing/selftests/bpf/progs/verifier_sdiv.c index fd59d57e8e37..95f3239ce228 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sdiv.c +++ b/tools/testing/selftests/bpf/progs/verifier_sdiv.c @@ -778,10 +778,10 @@ __arch_x86_64 __xlated("0: r2 = 0x8000000000000000") __xlated("2: r3 = -1") __xlated("3: r4 = r2") -__xlated("4: r11 = r3") -__xlated("5: r11 += 1") -__xlated("6: if r11 > 0x1 goto pc+4") -__xlated("7: if r11 == 0x0 goto pc+1") +__xlated("4: r12 = r3") +__xlated("5: r12 += 1") 
+__xlated("6: if r12 > 0x1 goto pc+4") +__xlated("7: if r12 == 0x0 goto pc+1") __xlated("8: r2 = 0") __xlated("9: r2 = -r2") __xlated("10: goto pc+1") @@ -812,10 +812,10 @@ __success __retval(-5) __arch_x86_64 __xlated("0: r2 = 5") __xlated("1: r3 = -1") -__xlated("2: r11 = r3") -__xlated("3: r11 += 1") -__xlated("4: if r11 > 0x1 goto pc+4") -__xlated("5: if r11 == 0x0 goto pc+1") +__xlated("2: r12 = r3") +__xlated("3: r12 += 1") +__xlated("4: if r12 > 0x1 goto pc+4") +__xlated("5: if r12 == 0x0 goto pc+1") __xlated("6: r2 = 0") __xlated("7: r2 = -r2") __xlated("8: goto pc+1") @@ -890,10 +890,10 @@ __arch_x86_64 __xlated("0: w2 = -2147483648") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+4") -__xlated("6: if w11 == 0x0 goto pc+1") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+4") +__xlated("6: if w12 == 0x0 goto pc+1") __xlated("7: w2 = 0") __xlated("8: w2 = -w2") __xlated("9: goto pc+1") @@ -925,10 +925,10 @@ __arch_x86_64 __xlated("0: w2 = -5") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+4") -__xlated("6: if w11 == 0x0 goto pc+1") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+4") +__xlated("6: if w12 == 0x0 goto pc+1") __xlated("7: w2 = 0") __xlated("8: w2 = -w2") __xlated("9: goto pc+1") @@ -1004,10 +1004,10 @@ __arch_x86_64 __xlated("0: r2 = 0x8000000000000000") __xlated("2: r3 = -1") __xlated("3: r4 = r2") -__xlated("4: r11 = r3") -__xlated("5: r11 += 1") -__xlated("6: if r11 > 0x1 goto pc+3") -__xlated("7: if r11 == 0x1 goto pc+3") +__xlated("4: r12 = r3") +__xlated("5: r12 += 1") +__xlated("6: if r12 > 0x1 goto pc+3") +__xlated("7: if r12 == 0x1 goto pc+3") __xlated("8: w2 = 0") __xlated("9: goto pc+1") __xlated("10: r2 s%= r3") @@ -1034,10 +1034,10 @@ __arch_x86_64 __xlated("0: r2 = 5") __xlated("1: r3 = -1") 
__xlated("2: r4 = r2") -__xlated("3: r11 = r3") -__xlated("4: r11 += 1") -__xlated("5: if r11 > 0x1 goto pc+3") -__xlated("6: if r11 == 0x1 goto pc+3") +__xlated("3: r12 = r3") +__xlated("4: r12 += 1") +__xlated("5: if r12 > 0x1 goto pc+3") +__xlated("6: if r12 == 0x1 goto pc+3") __xlated("7: w2 = 0") __xlated("8: goto pc+1") __xlated("9: r2 s%= r3") @@ -1108,10 +1108,10 @@ __arch_x86_64 __xlated("0: w2 = -2147483648") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+3") -__xlated("6: if w11 == 0x1 goto pc+4") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+3") +__xlated("6: if w12 == 0x1 goto pc+4") __xlated("7: w2 = 0") __xlated("8: goto pc+1") __xlated("9: w2 s%= w3") @@ -1140,10 +1140,10 @@ __arch_x86_64 __xlated("0: w2 = -5") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+3") -__xlated("6: if w11 == 0x1 goto pc+4") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+3") +__xlated("6: if w12 == 0x1 goto pc+4") __xlated("7: w2 = 0") __xlated("8: goto pc+1") __xlated("9: w2 s%= w3") -- cgit v1.2.3 From 4439328d3878c97fdf5ddec828a43ea07c388452 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:35:11 -0700 Subject: bpf: Reuse MAX_BPF_FUNC_ARGS for maximum number of arguments Currently, MAX_BPF_FUNC_ARGS is used for tracepoint related progs where the number of parameters cannot exceed MAX_BPF_FUNC_ARGS. Here, MAX_BPF_FUNC_ARGS is reused to set a limit of the number of arguments for bpf functions and kfuncs. The current value for MAX_BPF_FUNC_ARGS is 12 which should be sufficient for majority of bpf functions and kfuncs. 
Acked-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033511.2542870-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d3aea3931b85..715b6df9c403 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1151,6 +1151,11 @@ struct bpf_prog_offload { /* The longest tracepoint has 12 args. * See include/trace/bpf_probe.h + * + * Also reuse this macro for maximum number of arguments a BPF function + * or a kfunc can have. Args 1-5 are passed in registers, args 6-12 via + * stack arg slots. The JIT may map some stack arg slots to registers based + * on the native calling convention (e.g., arg 6 to R9 on x86-64). */ #define MAX_BPF_FUNC_ARGS 12 -- cgit v1.2.3 From d4a2eeb2ac7813ac9374568c71662c630689cc54 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Wed, 22 Apr 2026 18:20:24 +0200 Subject: selftests/bpf: Make btf_dump use xdp_dummy rather than xdping_kern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to prepare xdping tool removal from the BPF selftests directory, make the btf_dump test use another BPF program for the btf datasec dump test. Use xdp_dummy.bpf.o, as it is already used by various other tests. 
Signed-off-by: Alexis Lothoré (eBPF Foundation) Signed-off-by: Martin KaFai Lau Acked-by: Paul Chaignon Link: https://patch.msgid.link/20260422-xdping-v2-1-c0f8ccedcf91@bootlin.com --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index f1642794f70e..9f1b50e07a29 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -1027,8 +1027,8 @@ static void test_btf_dump_datasec_data(char *str) char license[4] = "GPL"; struct btf_dump *d; - btf = btf__parse("xdping_kern.bpf.o", NULL); - if (!ASSERT_OK_PTR(btf, "xdping_kern.bpf.o BTF not found")) + btf = btf__parse("xdp_dummy.bpf.o", NULL); + if (!ASSERT_OK_PTR(btf, "xdp_dummy.bpf.o BTF not found")) return; d = btf_dump__new(btf, btf_dump_snprintf, str, NULL); -- cgit v1.2.3 From feb13b19f3fa7202eba1ab9cc47535e092ef7968 Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Wed, 22 Apr 2026 18:20:25 +0200 Subject: selftests/bpf: Drop xdping tool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of a larger cleanup effort in the bpf selftests directory, tests and scripts are either being converted to the test_progs framework (so they are executed automatically in bpf CI), or removed if not relevant for such integration. The test_xdping.sh script (with the associated xdping.c) acts as an RTT measurement tool by attaching two small xdp programs to two interfaces. Converting this test to test_progs may not make much sense: - RTT measurement does not really fit in the scope of a functional test; it is rather about measuring some performance level.
- there are other existing tests in test_progs that actively validate XDP features like program attachment, return value processing, packet modification, etc Drop test_xdping.sh, the corresponding xdping.c userspace part, the xdping_kern.c program, and the shared header, xdping.h Signed-off-by: Alexis Lothoré (eBPF Foundation) Signed-off-by: Martin KaFai Lau Reviewed-by: Alan Maguire Acked-by: Paul Chaignon Link: https://patch.msgid.link/20260422-xdping-v2-2-c0f8ccedcf91@bootlin.com --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 3 - tools/testing/selftests/bpf/progs/xdping_kern.c | 183 ----------------- tools/testing/selftests/bpf/test_xdping.sh | 103 ---------- tools/testing/selftests/bpf/xdping.c | 254 ------------------------ tools/testing/selftests/bpf/xdping.h | 13 -- 6 files changed, 557 deletions(-) delete mode 100644 tools/testing/selftests/bpf/progs/xdping_kern.c delete mode 100755 tools/testing/selftests/bpf/test_xdping.sh delete mode 100644 tools/testing/selftests/bpf/xdping.c delete mode 100644 tools/testing/selftests/bpf/xdping.h diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index bfdc5518ecc8..986a6389186b 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -21,7 +21,6 @@ test_lirc_mode2_user flow_dissector_load test_tcpnotify_user test_libbpf -xdping test_cpp *.d *.subskel.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6ef6872adbc3..ac676d2a4a29 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -111,7 +111,6 @@ TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ test_lirc_mode2.sh \ - test_xdping.sh \ test_bpftool_build.sh \ test_doc_build.sh \ test_xsk.sh \ @@ -134,7 +133,6 @@ TEST_GEN_PROGS_EXTENDED = \ xdp_features \ 
xdp_hw_metadata \ xdp_synproxy \ - xdping \ xskxceiver TEST_GEN_FILES += $(TEST_KMODS) liburandom_read.so urandom_read sign-file uprobe_multi @@ -320,7 +318,6 @@ $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELP $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tag: $(TESTING_HELPERS) $(OUTPUT)/test_lirc_mode2_user: $(TESTING_HELPERS) -$(OUTPUT)/xdping: $(TESTING_HELPERS) $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS) $(OUTPUT)/test_maps: $(TESTING_HELPERS) $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS) diff --git a/tools/testing/selftests/bpf/progs/xdping_kern.c b/tools/testing/selftests/bpf/progs/xdping_kern.c deleted file mode 100644 index 44e2b0ef23ae..000000000000 --- a/tools/testing/selftests/bpf/progs/xdping_kern.c +++ /dev/null @@ -1,183 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */ - -#define KBUILD_MODNAME "foo" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "bpf_compiler.h" -#include "xdping.h" - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 256); - __type(key, __u32); - __type(value, struct pinginfo); -} ping_map SEC(".maps"); - -static __always_inline void swap_src_dst_mac(void *data) -{ - unsigned short *p = data; - unsigned short dst[3]; - - dst[0] = p[0]; - dst[1] = p[1]; - dst[2] = p[2]; - p[0] = p[3]; - p[1] = p[4]; - p[2] = p[5]; - p[3] = dst[0]; - p[4] = dst[1]; - p[5] = dst[2]; -} - -static __always_inline __u16 csum_fold_helper(__wsum sum) -{ - sum = (sum & 0xffff) + (sum >> 16); - return ~((sum & 0xffff) + (sum >> 16)); -} - -static __always_inline __u16 ipv4_csum(void *data_start, int data_size) -{ - __wsum sum; - - sum = bpf_csum_diff(0, 0, data_start, data_size, 0); - return csum_fold_helper(sum); -} - -#define ICMP_ECHO_LEN 64 - -static __always_inline int icmp_check(struct 
xdp_md *ctx, int type) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - struct icmphdr *icmph; - struct iphdr *iph; - - if (data + sizeof(*eth) + sizeof(*iph) + ICMP_ECHO_LEN > data_end) - return XDP_PASS; - - if (eth->h_proto != bpf_htons(ETH_P_IP)) - return XDP_PASS; - - iph = data + sizeof(*eth); - - if (iph->protocol != IPPROTO_ICMP) - return XDP_PASS; - - if (bpf_ntohs(iph->tot_len) - sizeof(*iph) != ICMP_ECHO_LEN) - return XDP_PASS; - - icmph = data + sizeof(*eth) + sizeof(*iph); - - if (icmph->type != type) - return XDP_PASS; - - return XDP_TX; -} - -SEC("xdp") -int xdping_client(struct xdp_md *ctx) -{ - void *data = (void *)(long)ctx->data; - struct pinginfo *pinginfo = NULL; - struct ethhdr *eth = data; - struct icmphdr *icmph; - struct iphdr *iph; - __u64 recvtime; - __be32 raddr; - __be16 seq; - int ret; - __u8 i; - - ret = icmp_check(ctx, ICMP_ECHOREPLY); - - if (ret != XDP_TX) - return ret; - - iph = data + sizeof(*eth); - icmph = data + sizeof(*eth) + sizeof(*iph); - raddr = iph->saddr; - - /* Record time reply received. */ - recvtime = bpf_ktime_get_ns(); - pinginfo = bpf_map_lookup_elem(&ping_map, &raddr); - if (!pinginfo || pinginfo->seq != icmph->un.echo.sequence) - return XDP_PASS; - - if (pinginfo->start) { - __pragma_loop_unroll_full - for (i = 0; i < XDPING_MAX_COUNT; i++) { - if (pinginfo->times[i] == 0) - break; - } - /* verifier is fussy here... */ - if (i < XDPING_MAX_COUNT) { - pinginfo->times[i] = recvtime - - pinginfo->start; - pinginfo->start = 0; - i++; - } - /* No more space for values? */ - if (i == pinginfo->count || i == XDPING_MAX_COUNT) - return XDP_PASS; - } - - /* Now convert reply back into echo request. 
*/ - swap_src_dst_mac(data); - iph->saddr = iph->daddr; - iph->daddr = raddr; - icmph->type = ICMP_ECHO; - seq = bpf_htons(bpf_ntohs(icmph->un.echo.sequence) + 1); - icmph->un.echo.sequence = seq; - icmph->checksum = 0; - icmph->checksum = ipv4_csum(icmph, ICMP_ECHO_LEN); - - pinginfo->seq = seq; - pinginfo->start = bpf_ktime_get_ns(); - - return XDP_TX; -} - -SEC("xdp") -int xdping_server(struct xdp_md *ctx) -{ - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - struct icmphdr *icmph; - struct iphdr *iph; - __be32 raddr; - int ret; - - ret = icmp_check(ctx, ICMP_ECHO); - - if (ret != XDP_TX) - return ret; - - iph = data + sizeof(*eth); - icmph = data + sizeof(*eth) + sizeof(*iph); - raddr = iph->saddr; - - /* Now convert request into echo reply. */ - swap_src_dst_mac(data); - iph->saddr = iph->daddr; - iph->daddr = raddr; - icmph->type = ICMP_ECHOREPLY; - icmph->checksum = 0; - icmph->checksum = ipv4_csum(icmph, ICMP_ECHO_LEN); - - return XDP_TX; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_xdping.sh b/tools/testing/selftests/bpf/test_xdping.sh deleted file mode 100755 index c3d82e0a7378..000000000000 --- a/tools/testing/selftests/bpf/test_xdping.sh +++ /dev/null @@ -1,103 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# xdping tests -# Here we setup and teardown configuration required to run -# xdping, exercising its options. -# -# Setup is similar to test_tunnel tests but without the tunnel. -# -# Topology: -# --------- -# root namespace | tc_ns0 namespace -# | -# ---------- | ---------- -# | veth1 | --------- | veth0 | -# ---------- peer ---------- -# -# Device Configuration -# -------------------- -# Root namespace with BPF -# Device names and addresses: -# veth1 IP: 10.1.1.200 -# xdp added to veth1, xdpings originate from here. -# -# Namespace tc_ns0 with BPF -# Device names and addresses: -# veth0 IPv4: 10.1.1.100 -# For some tests xdping run in server mode here. 
-# - -readonly TARGET_IP="10.1.1.100" -readonly TARGET_NS="xdp_ns0" - -readonly LOCAL_IP="10.1.1.200" - -setup() -{ - ip netns add $TARGET_NS - ip link add veth0 type veth peer name veth1 - ip link set veth0 netns $TARGET_NS - ip netns exec $TARGET_NS ip addr add ${TARGET_IP}/24 dev veth0 - ip addr add ${LOCAL_IP}/24 dev veth1 - ip netns exec $TARGET_NS ip link set veth0 up - ip link set veth1 up -} - -cleanup() -{ - set +e - ip netns delete $TARGET_NS 2>/dev/null - ip link del veth1 2>/dev/null - if [[ $server_pid -ne 0 ]]; then - kill -TERM $server_pid - fi -} - -test() -{ - client_args="$1" - server_args="$2" - - echo "Test client args '$client_args'; server args '$server_args'" - - server_pid=0 - if [[ -n "$server_args" ]]; then - ip netns exec $TARGET_NS ./xdping $server_args & - server_pid=$! - sleep 10 - fi - ./xdping $client_args $TARGET_IP - - if [[ $server_pid -ne 0 ]]; then - kill -TERM $server_pid - server_pid=0 - fi - - echo "Test client args '$client_args'; server args '$server_args': PASS" -} - -set -e - -server_pid=0 - -trap cleanup EXIT - -setup - -for server_args in "" "-I veth0 -s -S" ; do - # client in skb mode - client_args="-I veth1 -S" - test "$client_args" "$server_args" - - # client with count of 10 RTT measurements. - client_args="-I veth1 -S -c 10" - test "$client_args" "$server_args" -done - -# Test drv mode -test "-I veth1 -N" "-I veth0 -s -N" -test "-I veth1 -N -c 10" "-I veth0 -s -N" - -echo "OK. All tests passed" -exit 0 diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c deleted file mode 100644 index 9ed8c796645d..000000000000 --- a/tools/testing/selftests/bpf/xdping.c +++ /dev/null @@ -1,254 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 
*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "bpf/bpf.h" -#include "bpf/libbpf.h" - -#include "xdping.h" -#include "testing_helpers.h" - -static int ifindex; -static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; - -static void cleanup(int sig) -{ - bpf_xdp_detach(ifindex, xdp_flags, NULL); - if (sig) - exit(1); -} - -static int get_stats(int fd, __u16 count, __u32 raddr) -{ - struct pinginfo pinginfo = { 0 }; - char inaddrbuf[INET_ADDRSTRLEN]; - struct in_addr inaddr; - __u16 i; - - inaddr.s_addr = raddr; - - printf("\nXDP RTT data:\n"); - - if (bpf_map_lookup_elem(fd, &raddr, &pinginfo)) { - perror("bpf_map_lookup elem"); - return 1; - } - - for (i = 0; i < count; i++) { - if (pinginfo.times[i] == 0) - break; - - printf("64 bytes from %s: icmp_seq=%d ttl=64 time=%#.5f ms\n", - inet_ntop(AF_INET, &inaddr, inaddrbuf, - sizeof(inaddrbuf)), - count + i + 1, - (double)pinginfo.times[i]/1000000); - } - - if (i < count) { - fprintf(stderr, "Expected %d samples, got %d.\n", count, i); - return 1; - } - - bpf_map_delete_elem(fd, &raddr); - - return 0; -} - -static void show_usage(const char *prog) -{ - fprintf(stderr, - "usage: %s [OPTS] -I interface destination\n\n" - "OPTS:\n" - " -c count Stop after sending count requests\n" - " (default %d, max %d)\n" - " -I interface interface name\n" - " -N Run in driver mode\n" - " -s Server mode\n" - " -S Run in skb mode\n", - prog, XDPING_DEFAULT_COUNT, XDPING_MAX_COUNT); -} - -int main(int argc, char **argv) -{ - __u32 mode_flags = XDP_FLAGS_DRV_MODE | XDP_FLAGS_SKB_MODE; - struct addrinfo *a, hints = { .ai_family = AF_INET }; - __u16 count = XDPING_DEFAULT_COUNT; - struct pinginfo pinginfo = { 0 }; - const char *optstr = "c:I:NsS"; - struct bpf_program *main_prog; - int prog_fd = -1, map_fd = -1; - struct sockaddr_in rin; - struct bpf_object *obj; - struct bpf_map *map; - char *ifname = NULL; - char 
filename[256]; - int opt, ret = 1; - __u32 raddr = 0; - int server = 0; - char cmd[256]; - - while ((opt = getopt(argc, argv, optstr)) != -1) { - switch (opt) { - case 'c': - count = atoi(optarg); - if (count < 1 || count > XDPING_MAX_COUNT) { - fprintf(stderr, - "min count is 1, max count is %d\n", - XDPING_MAX_COUNT); - return 1; - } - break; - case 'I': - ifname = optarg; - ifindex = if_nametoindex(ifname); - if (!ifindex) { - fprintf(stderr, "Could not get interface %s\n", - ifname); - return 1; - } - break; - case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; - break; - case 's': - /* use server program */ - server = 1; - break; - case 'S': - xdp_flags |= XDP_FLAGS_SKB_MODE; - break; - default: - show_usage(basename(argv[0])); - return 1; - } - } - - if (!ifname) { - show_usage(basename(argv[0])); - return 1; - } - if (!server && optind == argc) { - show_usage(basename(argv[0])); - return 1; - } - - if ((xdp_flags & mode_flags) == mode_flags) { - fprintf(stderr, "-N or -S can be specified, not both.\n"); - show_usage(basename(argv[0])); - return 1; - } - - if (!server) { - /* Only supports IPv4; see hints initialization above. */ - if (getaddrinfo(argv[optind], NULL, &hints, &a) || !a) { - fprintf(stderr, "Could not resolve %s\n", argv[optind]); - return 1; - } - memcpy(&rin, a->ai_addr, sizeof(rin)); - raddr = rin.sin_addr.s_addr; - freeaddrinfo(a); - } - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - snprintf(filename, sizeof(filename), "%s_kern.bpf.o", argv[0]); - - if (bpf_prog_test_load(filename, BPF_PROG_TYPE_XDP, &obj, &prog_fd)) { - fprintf(stderr, "load of %s failed\n", filename); - return 1; - } - - main_prog = bpf_object__find_program_by_name(obj, - server ? 
"xdping_server" : "xdping_client"); - if (main_prog) - prog_fd = bpf_program__fd(main_prog); - if (!main_prog || prog_fd < 0) { - fprintf(stderr, "could not find xdping program"); - return 1; - } - - map = bpf_object__next_map(obj, NULL); - if (map) - map_fd = bpf_map__fd(map); - if (!map || map_fd < 0) { - fprintf(stderr, "Could not find ping map"); - goto done; - } - - signal(SIGINT, cleanup); - signal(SIGTERM, cleanup); - - printf("Setting up XDP for %s, please wait...\n", ifname); - - printf("XDP setup disrupts network connectivity, hit Ctrl+C to quit\n"); - - if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) { - fprintf(stderr, "Link set xdp fd failed for %s\n", ifname); - goto done; - } - - if (server) { - close(prog_fd); - close(map_fd); - printf("Running server on %s; press Ctrl+C to exit...\n", - ifname); - do { } while (1); - } - - /* Start xdping-ing from last regular ping reply, e.g. for a count - * of 10 ICMP requests, we start xdping-ing using reply with seq number - * 10. The reason the last "real" ping RTT is much higher is that - * the ping program sees the ICMP reply associated with the last - * XDP-generated packet, so ping doesn't get a reply until XDP is done. - */ - pinginfo.seq = htons(count); - pinginfo.count = count; - - if (bpf_map_update_elem(map_fd, &raddr, &pinginfo, BPF_ANY)) { - fprintf(stderr, "could not communicate with BPF map: %s\n", - strerror(errno)); - cleanup(0); - goto done; - } - - /* We need to wait for XDP setup to complete. 
*/ - sleep(10); - - snprintf(cmd, sizeof(cmd), "ping -c %d -I %s %s", - count, ifname, argv[optind]); - - printf("\nNormal ping RTT data\n"); - printf("[Ignore final RTT; it is distorted by XDP using the reply]\n"); - - ret = system(cmd); - - if (!ret) - ret = get_stats(map_fd, count, raddr); - - cleanup(0); - -done: - if (prog_fd > 0) - close(prog_fd); - if (map_fd > 0) - close(map_fd); - - return ret; -} diff --git a/tools/testing/selftests/bpf/xdping.h b/tools/testing/selftests/bpf/xdping.h deleted file mode 100644 index afc578df77be..000000000000 --- a/tools/testing/selftests/bpf/xdping.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */ - -#define XDPING_MAX_COUNT 10 -#define XDPING_DEFAULT_COUNT 4 - -struct pinginfo { - __u64 start; - __be16 seq; - __u16 count; - __u32 pad; - __u64 times[XDPING_MAX_COUNT]; -}; -- cgit v1.2.3 From 256f0071f9b61ae5028f749449fd3fdad015889d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Apr 2026 15:52:42 -0700 Subject: bpf: representation and basic operations on circular numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds basic definitions for cnum32/cnum64. This is a unified numeric range representation for signed and unsigned domains. Inspired by an old post from Shung-Hsi Yu [1] and paper [2]. Correctness of the operations is verified using the cbmc model checker; the test source code can be found in a separate repo [3]. The cnum64_cnum32_intersect() function is notable, because it handles several cases verifier.c:deduce_bounds_64_from_32() does not. Given: - a is a 64-bit range - b is a 32-bit range - t is a refined 64-bit range, such that ∀ v ∈ a, (u32)v ∈ b: v ∈ t.
cnum64_cnum32_intersect() makes the following deductions: (A): 'b' is a sub-range of the first or the last 32-bit sub-range of 'a': 64-bit number axis ---> N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 ||------|---|=====|-------||----------|=====|-------||----------|=====|----|--|| | |< b >| |< b >| |< b >| | | | | | |<--+--------------------------- a ---------------------------+--->| | | |<-------------------------- t -------------------------->| (B): 'b' does not intersect with the first or the last 32-bit sub-range of 'a': N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 ||--|=====|----|----------||--|=====|---------------||--|=====|------------|--|| |< b >| | |< b >| |< b >| | | | | | |<-------------+--------- a -------------------|----------->| | | |<-------- t ------------------>| (C) 'b' crosses 0/U32_MAX boundary: N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 ||===|---------|------|===||===|----------------|===||===|---------|------|===|| |b >| | |< b||b >| |< b||b >| | |< b| | | | | |<-----+----------------- a --------------+-------->| | | |<---------------- t ------------->| The current implementation of deduce_bounds_64_from_32() only handles case (A).
[1] https://lore.kernel.org/all/ZTZxoDJJbX9mrQ9w@u94a/ [2] https://jorgenavas.github.io/papers/ACM-TOPLAS-wrapped.pdf [3] https://github.com/eddyz87/cnum-verif/tree/master Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260424-cnums-everywhere-rfc-v1-v3-1-ca434b39a486@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/cnum.h | 80 +++++++++++++++++ kernel/bpf/Makefile | 2 +- kernel/bpf/cnum.c | 120 ++++++++++++++++++++++++++ kernel/bpf/cnum_defs.h | 230 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 431 insertions(+), 1 deletion(-) create mode 100644 include/linux/cnum.h create mode 100644 kernel/bpf/cnum.c create mode 100644 kernel/bpf/cnum_defs.h diff --git a/include/linux/cnum.h b/include/linux/cnum.h new file mode 100644 index 000000000000..a7259b105b45 --- /dev/null +++ b/include/linux/cnum.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#ifndef _LINUX_CNUM_H +#define _LINUX_CNUM_H + +#include + +/* + * cnum32: a circular number. + * A unified representation for signed and unsigned ranges. + * + * Assume that a 32-bit range is a circle, with 0 being in the 12 o'clock + * position, numbers placed sequentially in clockwise order and U32_MAX + * in the 11 o'clock position. Signed values map onto the same circle: + * S32_MAX sits at 5 o'clock, S32_MIN sits at 6 o'clock (opposite 0), + * negative values occupy the left half and positive values the right half. + * + * @cnum32 represents an arc on this circle drawn clockwise. + * @base corresponds to the first value of the range. + * @size corresponds to the number of integers in the range excluding @base. + * (The @base is excluded to avoid integer overflow when representing the full + * 0..U32_MAX range, which corresponds to 2^32, which can't be stored in u32). 
+ * + * For example: {U32_MAX, 1} corresponds to signed range [-1, 0], + * {S32_MAX, 1} corresponds to unsigned range [S32_MAX, S32_MIN]. + */ +struct cnum32 { + u32 base; + u32 size; +}; + +#define CNUM32_UNBOUNDED ((struct cnum32){ .base = 0, .size = U32_MAX }) +#define CNUM32_EMPTY ((struct cnum32){ .base = U32_MAX, .size = U32_MAX }) + +struct cnum32 cnum32_from_urange(u32 min, u32 max); +struct cnum32 cnum32_from_srange(s32 min, s32 max); +u32 cnum32_umin(struct cnum32 cnum); +u32 cnum32_umax(struct cnum32 cnum); +s32 cnum32_smin(struct cnum32 cnum); +s32 cnum32_smax(struct cnum32 cnum); +struct cnum32 cnum32_intersect(struct cnum32 a, struct cnum32 b); +void cnum32_intersect_with(struct cnum32 *dst, struct cnum32 src); +void cnum32_intersect_with_urange(struct cnum32 *dst, u32 min, u32 max); +void cnum32_intersect_with_srange(struct cnum32 *dst, s32 min, s32 max); +bool cnum32_contains(struct cnum32 cnum, u32 v); +bool cnum32_is_const(struct cnum32 cnum); +bool cnum32_is_empty(struct cnum32 cnum); +struct cnum32 cnum32_add(struct cnum32 a, struct cnum32 b); +struct cnum32 cnum32_negate(struct cnum32 a); + +/* Same as cnum32 but for 64-bit ranges */ +struct cnum64 { + u64 base; + u64 size; +}; + +#define CNUM64_UNBOUNDED ((struct cnum64){ .base = 0, .size = U64_MAX }) +#define CNUM64_EMPTY ((struct cnum64){ .base = U64_MAX, .size = U64_MAX }) + +struct cnum64 cnum64_from_urange(u64 min, u64 max); +struct cnum64 cnum64_from_srange(s64 min, s64 max); +u64 cnum64_umin(struct cnum64 cnum); +u64 cnum64_umax(struct cnum64 cnum); +s64 cnum64_smin(struct cnum64 cnum); +s64 cnum64_smax(struct cnum64 cnum); +struct cnum64 cnum64_intersect(struct cnum64 a, struct cnum64 b); +void cnum64_intersect_with(struct cnum64 *dst, struct cnum64 src); +void cnum64_intersect_with_urange(struct cnum64 *dst, u64 min, u64 max); +void cnum64_intersect_with_srange(struct cnum64 *dst, s64 min, s64 max); +bool cnum64_contains(struct cnum64 cnum, u64 v); +bool cnum64_is_const(struct cnum64 
cnum); +bool cnum64_is_empty(struct cnum64 cnum); +struct cnum64 cnum64_add(struct cnum64 a, struct cnum64 b); +struct cnum64 cnum64_negate(struct cnum64 a); + +struct cnum32 cnum32_from_cnum64(struct cnum64 cnum); +struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b); + +#endif /* _LINUX_CNUM_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 399007b67a92..4dc41bf5780c 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o const_fold.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o cnum.o log.o token.o liveness.o const_fold.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o diff --git a/kernel/bpf/cnum.c b/kernel/bpf/cnum.c new file mode 100644 index 000000000000..86142cb2aee5 --- /dev/null +++ b/kernel/bpf/cnum.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include + +#define T 32 +#include "cnum_defs.h" +#undef T + +#define T 64 +#include "cnum_defs.h" +#undef T + +struct cnum32 cnum32_from_cnum64(struct cnum64 cnum) +{ + if (cnum64_is_empty(cnum)) + return CNUM32_EMPTY; + + if (cnum.size >= U32_MAX) + return (struct cnum32){ .base = 0, .size = U32_MAX }; + else + return (struct cnum32){ .base = (u32)cnum.base, .size = cnum.size }; +} + +/* + * Suppose 'a' and 'b' are laid out as follows: + * + * 64-bit number axis ---> + * + * N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 + * ||------|---|=====|-------||----------|=====|-------||----------|=====|----|--|| + * | |< b >| |< b >| |< b >| | + * | | | | + * |<--+--------------------------- a ---------------------------+--->| + * | | + * |<-------------------------- t -------------------------->| + * + * In such a case it is possible to infer a more tight representation t + * such that ∀ v ∈ a, (u32)v ∈ b: v ∈ t. + */ +struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b) +{ + /* + * To simplify reasoning, rotate the circles so that [virtual] a1 starts + * at u32 boundary, b1 represents b in this new frame of reference. 
+ */ + struct cnum32 b1 = { b.base - (u32)a.base, b.size }; + struct cnum64 t = a; + u64 d, b1_max; + + if (cnum64_is_empty(a) || cnum32_is_empty(b)) + return CNUM64_EMPTY; + + if (cnum32_urange_overflow(b1)) { + b1_max = (u32)b1.base + (u32)b1.size; /* overflow here is fine and necessary */ + if ((u32)a.size > b1_max && (u32)a.size < b1.base) { + /* + * N*2^32 (N+1)*2^32 + * ||=====|------------|=====||=====|---------|---|=====|| + * |b1 ->| |<- b1||b1 ->| | |<- b1| + * |<----------------- a1 ------------------>| + * |<-------------- t ------------>|<-- d -->| (after adjustment) + * ^ + * b1_max + */ + d = (u32)a.size - b1_max; + t.size -= d; + } else { + /* + * No adjustments possible in the following cases: + * + * ||=====|------------|=====||===|=|-------------|=|===|| + * |b1 ->| |<- b1||b1 +>| |<+ b1| + * |<----------------- a1 ------>| | + * |<----------------- (or) a1 ------------------->| + */ + } + } else { + if (t.size < b1.base) + /* + * N*2^32 (N+1)*2^32 + * ||----------|--|=======|--||------> + * |<-- a1 -->| |<- b ->| + */ + return CNUM64_EMPTY; + /* + * N*2^32 (N+1)*2^32 + * ||-------------|========|-||-----| -------|========|-|| + * | |<- b1 ->| | |<- b1 ->| + * |<------------+ a1 ------------>| + * |<------ t ------>| (after adjustment) + */ + t.base += b1.base; + t.size -= b1.base; + b1_max = b1.base + b1.size; + d = 0; + if ((u32)a.size < b1.base) + /* + * N*2^32 (N+1)*2^32 + * ||-------------|========|-||------|-------|========|-|| + * | |<- b1 ->| | |<- b1 ->| + * |<------------+-- a1 --+-------->| + * |<- t ->|<-- d -->| (after adjustment) + */ + d = (u32)a.size + (BIT_ULL(32) - b1_max); + else if ((u32)a.size >= b1_max) + /* + * N*2^32 (N+1)*2^32 + * ||--|========|------------||--|========|-------|-----|| + * | |<- b1 ->| |<- b1 ->| | + * |<-+------------------ a1 ------------+------>| + * |<-------------- t --------------->|<- d ->| (after adjustment) + */ + d = (u32)a.size - b1_max; + if (t.size < d) + return CNUM64_EMPTY; + t.size -= d; + 
} + return t; +} diff --git a/kernel/bpf/cnum_defs.h b/kernel/bpf/cnum_defs.h new file mode 100644 index 000000000000..3ebd8f723dbb --- /dev/null +++ b/kernel/bpf/cnum_defs.h @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#ifndef T +#error "Define T (bit width: 32, 64) before including cnum_defs.h" +#endif + +#include +#include +#include +#include + +#define cnum_t __PASTE(cnum, T) +#define ut __PASTE(u, T) +#define st __PASTE(s, T) +#define UT_MAX __PASTE(__PASTE(U, T), _MAX) +#define ST_MAX __PASTE(__PASTE(S, T), _MAX) +#define ST_MIN __PASTE(__PASTE(S, T), _MIN) +#define EMPTY __PASTE(__PASTE(CNUM, T), _EMPTY) +#define FN(name) __PASTE(__PASTE(cnum, T), __PASTE(_, name)) + +struct cnum_t FN(from_urange)(ut min, ut max) +{ + return (struct cnum_t){ .base = min, .size = (ut)max - min }; +} + +struct cnum_t FN(from_srange)(st min, st max) +{ + ut size = (ut)max - (ut)min; + ut base = size == UT_MAX ? 0 : (ut)min; + + return (struct cnum_t){ .base = base, .size = size }; +} + +/* True if this cnum represents two unsigned ranges. */ +static inline bool FN(urange_overflow)(struct cnum_t cnum) +{ + /* Same as cnum.base + cnum.size > UT_MAX but avoids overflow */ + return cnum.size > UT_MAX - (ut)cnum.base; +} + +/* + * cnum{T}_umin / cnum{T}_umax query an unsigned range represented by this cnum. + * If cnum represents a range crossing the UT_MAX/0 boundary, the unbound range + * [0..UT_MAX] is returned. + */ +ut FN(umin)(struct cnum_t cnum) +{ + return FN(urange_overflow)(cnum) ? 0 : cnum.base; +} + +ut FN(umax)(struct cnum_t cnum) +{ + return FN(urange_overflow)(cnum) ? UT_MAX : cnum.base + cnum.size; +} + +/* True if this cnum represents two signed ranges. */ +static inline bool FN(srange_overflow)(struct cnum_t cnum) +{ + return FN(contains)(cnum, (ut)ST_MAX) && FN(contains)(cnum, (ut)ST_MIN); +} + +/* + * cnum{T}_smin / cnum{T}_smax query a signed range represented by this cnum. 
+ * If cnum represents a range crossing the ST_MAX/ST_MIN boundary, the unbound range + * [ST_MIN..ST_MAX] is returned. + */ +st FN(smin)(struct cnum_t cnum) +{ + return FN(srange_overflow)(cnum) + ? ST_MIN + : min((st)cnum.base, (st)(cnum.base + cnum.size)); +} + +st FN(smax)(struct cnum_t cnum) +{ + return FN(srange_overflow)(cnum) + ? ST_MAX + : max((st)cnum.base, (st)(cnum.base + cnum.size)); +} + +/* + * Returns a possibly empty intersection of cnums 'a' and 'b'. + * If 'a' and 'b' intersect in two sub-arcs, the function over-approximates + * and returns either 'a' or 'b', whichever is smaller. + */ +struct cnum_t FN(intersect)(struct cnum_t a, struct cnum_t b) +{ + struct cnum_t b1; + ut dbase; + + if (FN(is_empty)(a) || FN(is_empty)(b)) + return EMPTY; + + if (a.base > b.base) + swap(a, b); + + /* + * Rotate frame of reference such that a.base is 0. + * 'b1' is 'b' in this frame of reference. + */ + dbase = b.base - a.base; + b1 = (struct cnum_t){ dbase, b.size }; + if (FN(urange_overflow)(b1)) { + if (b1.base <= a.size) { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------| + * [=== a ==========================] | + * [= b1 tail =] [========= b1 main ==========>] + * ^-- b1.base <= a.size + * + * 'a' and 'b' intersect in two disjoint arcs, + * can't represent as single cnum, over-approximate + * the result. + */ + return a.size <= b.size ? a : b; + } else { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------| + * [=== a =============] | | + * [= b1 tail =] [======= b1 main ====>] + * ^-- b1.base > a.size + * + * Only 'b' tail intersects 'a'. 
+ */ + return (struct cnum_t) { + .base = a.base, + .size = min(a.size, (ut)(b1.base + b1.size)), + }; + } + } else if (a.size >= b1.base) { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------------| + * [=== a ==================================] | + * [== b1 =====================] + * + * 0 UT_MAX + * |--------------------------------------------------| + * [=== a ==================================] | + * [== b1 ====] + * ^-- b1.base <= a.size + * |<-- a.size - dbase -->| + * + * 'a' and 'b' intersect as one cnum. + */ + return (struct cnum_t) { + .base = b.base, + .size = min((ut)(a.size - dbase), b.size), + }; + } else { + return EMPTY; + } +} + +void FN(intersect_with)(struct cnum_t *dst, struct cnum_t src) +{ + *dst = FN(intersect)(*dst, src); +} + +void FN(intersect_with_urange)(struct cnum_t *dst, ut min, ut max) +{ + FN(intersect_with)(dst, FN(from_urange)(min, max)); +} + +void FN(intersect_with_srange)(struct cnum_t *dst, st min, st max) +{ + FN(intersect_with)(dst, FN(from_srange)(min, max)); +} + +static inline struct cnum_t FN(normalize)(struct cnum_t cnum) +{ + if (cnum.size == UT_MAX && cnum.base != 0 && cnum.base != (ut)ST_MAX) + cnum.base = 0; + return cnum; +} + +struct cnum_t FN(add)(struct cnum_t a, struct cnum_t b) +{ + if (FN(is_empty)(a) || FN(is_empty)(b)) + return EMPTY; + if (a.size > UT_MAX - b.size) + return (struct cnum_t){ 0, (ut)UT_MAX }; + else + return FN(normalize)((struct cnum_t){ a.base + b.base, a.size + b.size }); +} + +struct cnum_t FN(negate)(struct cnum_t a) +{ + if (FN(is_empty)(a)) + return EMPTY; + return FN(normalize)((struct cnum_t){ -((ut)a.base + a.size), a.size }); +} + +bool FN(is_empty)(struct cnum_t cnum) +{ + return cnum.base == EMPTY.base && cnum.size == EMPTY.size; +} + +bool FN(contains)(struct cnum_t cnum, ut v) +{ + if (FN(is_empty)(cnum)) + return false; + if (FN(urange_overflow)(cnum)) + return v >= cnum.base || v <= (ut)cnum.base + cnum.size; + 
else + return v >= cnum.base && v <= (ut)cnum.base + cnum.size; +} + +bool FN(is_const)(struct cnum_t cnum) +{ + return cnum.size == 0; +} + +#undef EMPTY +#undef cnum_t +#undef ut +#undef st +#undef UT_MAX +#undef ST_MAX +#undef ST_MIN +#undef FN -- cgit v1.2.3 From b93f7180f0bc37336cb26b43aa4796973d84852e Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Apr 2026 15:52:43 -0700 Subject: bpf: use accessor functions for bpf_reg_state min/max fields Replace direct access to bpf_reg_state->{smin,smax,umin,umax, s32_min,s32_max,u32_min,u32_max}_value with getter/setter inline functions, preparing for future switch to cnum-based internal representation. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260424-cnums-everywhere-rfc-v1-v3-2-ca434b39a486@gmail.com Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 8 +- include/linux/bpf_verifier.h | 64 ++ kernel/bpf/log.c | 24 +- kernel/bpf/states.c | 16 +- kernel/bpf/verifier.c | 1233 ++++++++++----------- 5 files changed, 678 insertions(+), 667 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 70368fe7c510..1caa87da72b5 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -561,10 +561,10 @@ nfp_bpf_check_alu(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, const struct bpf_reg_state *dreg = cur_regs(env) + meta->insn.dst_reg; - meta->umin_src = min(meta->umin_src, sreg->umin_value); - meta->umax_src = max(meta->umax_src, sreg->umax_value); - meta->umin_dst = min(meta->umin_dst, dreg->umin_value); - meta->umax_dst = max(meta->umax_dst, dreg->umax_value); + meta->umin_src = min(meta->umin_src, reg_umin(sreg)); + meta->umax_src = max(meta->umax_src, reg_umax(sreg)); + meta->umin_dst = min(meta->umin_dst, reg_umin(dreg)); + meta->umax_dst = max(meta->umax_dst, reg_umax(dreg)); /* NFP supports u16 and 
u32 multiplication. * diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d5b4303315dd..bf3ffa56bbe5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -209,6 +209,70 @@ struct bpf_reg_state { bool precise; }; +static inline s64 reg_smin(const struct bpf_reg_state *reg) +{ + return reg->smin_value; +} + +static inline s64 reg_smax(const struct bpf_reg_state *reg) +{ + return reg->smax_value; +} + +static inline u64 reg_umin(const struct bpf_reg_state *reg) +{ + return reg->umin_value; +} + +static inline u64 reg_umax(const struct bpf_reg_state *reg) +{ + return reg->umax_value; +} + +static inline s32 reg_s32_min(const struct bpf_reg_state *reg) +{ + return reg->s32_min_value; +} + +static inline s32 reg_s32_max(const struct bpf_reg_state *reg) +{ + return reg->s32_max_value; +} + +static inline u32 reg_u32_min(const struct bpf_reg_state *reg) +{ + return reg->u32_min_value; +} + +static inline u32 reg_u32_max(const struct bpf_reg_state *reg) +{ + return reg->u32_max_value; +} + +static inline void reg_set_srange32(struct bpf_reg_state *reg, s32 smin, s32 smax) +{ + reg->s32_min_value = smin; + reg->s32_max_value = smax; +} + +static inline void reg_set_urange32(struct bpf_reg_state *reg, u32 umin, u32 umax) +{ + reg->u32_min_value = umin; + reg->u32_max_value = umax; +} + +static inline void reg_set_srange64(struct bpf_reg_state *reg, s64 smin, s64 smax) +{ + reg->smin_value = smin; + reg->smax_value = smax; +} + +static inline void reg_set_urange64(struct bpf_reg_state *reg, u64 umin, u64 umax) +{ + reg->umin_value = umin; + reg->umax_value = umax; +} + enum bpf_stack_slot_type { STACK_INVALID, /* nothing was stored in this stack slot */ STACK_SPILL, /* register spilled into stack */ diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 011e4ec25acd..64566b86dd27 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -571,20 +571,20 @@ static void print_scalar_ranges(struct bpf_verifier_env *env, u64 val; 
bool omit; } minmaxs[] = { - {"smin", reg->smin_value, reg->smin_value == S64_MIN}, - {"smax", reg->smax_value, reg->smax_value == S64_MAX}, - {"umin", reg->umin_value, reg->umin_value == 0}, - {"umax", reg->umax_value, reg->umax_value == U64_MAX}, + {"smin", reg_smin(reg), reg_smin(reg) == S64_MIN}, + {"smax", reg_smax(reg), reg_smax(reg) == S64_MAX}, + {"umin", reg_umin(reg), reg_umin(reg) == 0}, + {"umax", reg_umax(reg), reg_umax(reg) == U64_MAX}, {"smin32", - is_snum_decimal((s64)reg->s32_min_value) - ? (s64)reg->s32_min_value - : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN}, + is_snum_decimal((s64)reg_s32_min(reg)) + ? (s64)reg_s32_min(reg) + : (u32)reg_s32_min(reg), reg_s32_min(reg) == S32_MIN}, {"smax32", - is_snum_decimal((s64)reg->s32_max_value) - ? (s64)reg->s32_max_value - : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX}, - {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, - {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, + is_snum_decimal((s64)reg_s32_max(reg)) + ? 
(s64)reg_s32_max(reg) + : (u32)reg_s32_max(reg), reg_s32_max(reg) == S32_MAX}, + {"umin32", reg_u32_min(reg), reg_u32_min(reg) == 0}, + {"umax32", reg_u32_max(reg), reg_u32_max(reg) == U32_MAX}, }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; bool neg1, neg2; diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 8478d2c6ed5b..a78ae891b743 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -301,14 +301,14 @@ int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_s static bool range_within(const struct bpf_reg_state *old, const struct bpf_reg_state *cur) { - return old->umin_value <= cur->umin_value && - old->umax_value >= cur->umax_value && - old->smin_value <= cur->smin_value && - old->smax_value >= cur->smax_value && - old->u32_min_value <= cur->u32_min_value && - old->u32_max_value >= cur->u32_max_value && - old->s32_min_value <= cur->s32_min_value && - old->s32_max_value >= cur->s32_max_value; + return reg_umin(old) <= reg_umin(cur) && + reg_umax(old) >= reg_umax(cur) && + reg_smin(old) <= reg_smin(cur) && + reg_smax(old) >= reg_smax(cur) && + reg_u32_min(old) <= reg_u32_min(cur) && + reg_u32_max(old) >= reg_u32_max(cur) && + reg_s32_min(old) <= reg_s32_min(cur) && + reg_s32_max(old) >= reg_s32_max(cur); } /* If in the old state two registers had the same id, then they need to have diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff6ff1c27517..b91d2789e7b9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -320,12 +320,12 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, bool unknown = true; verbose(env, "%s the register %s has", ctx, reg_name); - if (reg->smin_value > S64_MIN) { - verbose(env, " smin=%lld", reg->smin_value); + if (reg_smin(reg) > S64_MIN) { + verbose(env, " smin=%lld", reg_smin(reg)); unknown = false; } - if (reg->smax_value < S64_MAX) { - verbose(env, " smax=%lld", reg->smax_value); + if (reg_smax(reg) < S64_MAX) { + verbose(env, " smax=%lld", 
reg_smax(reg)); unknown = false; } if (unknown) @@ -1796,15 +1796,10 @@ static const int caller_saved[CALLER_SAVED_REGS] = { static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const(imm); - reg->smin_value = (s64)imm; - reg->smax_value = (s64)imm; - reg->umin_value = imm; - reg->umax_value = imm; - - reg->s32_min_value = (s32)imm; - reg->s32_max_value = (s32)imm; - reg->u32_min_value = (u32)imm; - reg->u32_max_value = (u32)imm; + reg_set_srange64(reg, (s64)imm, (s64)imm); + reg_set_urange64(reg, imm, imm); + reg_set_srange32(reg, (s32)imm, (s32)imm); + reg_set_urange32(reg, (u32)imm, (u32)imm); } /* Mark the unknown part of a register (variable offset or scalar value) as @@ -1823,10 +1818,8 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const_subreg(reg->var_off, imm); - reg->s32_min_value = (s32)imm; - reg->s32_max_value = (s32)imm; - reg->u32_min_value = (u32)imm; - reg->u32_max_value = (u32)imm; + reg_set_srange32(reg, (s32)imm, (s32)imm); + reg_set_urange32(reg, (u32)imm, (u32)imm); } /* Mark the 'variable offset' part of a register as zero. 
This should be @@ -1937,34 +1930,25 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, tnum_equals_const(reg->var_off, 0); } +static void __mark_reg32_unbounded(struct bpf_reg_state *reg) +{ + reg_set_srange32(reg, S32_MIN, S32_MAX); + reg_set_urange32(reg, 0, U32_MAX); +} + /* Reset the min/max bounds of a register */ static void __mark_reg_unbounded(struct bpf_reg_state *reg) { - reg->smin_value = S64_MIN; - reg->smax_value = S64_MAX; - reg->umin_value = 0; - reg->umax_value = U64_MAX; + reg_set_srange64(reg, S64_MIN, S64_MAX); + reg_set_urange64(reg, 0, U64_MAX); - reg->s32_min_value = S32_MIN; - reg->s32_max_value = S32_MAX; - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + __mark_reg32_unbounded(reg); } static void __mark_reg64_unbounded(struct bpf_reg_state *reg) { - reg->smin_value = S64_MIN; - reg->smax_value = S64_MAX; - reg->umin_value = 0; - reg->umax_value = U64_MAX; -} - -static void __mark_reg32_unbounded(struct bpf_reg_state *reg) -{ - reg->s32_min_value = S32_MIN; - reg->s32_max_value = S32_MAX; - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + reg_set_srange64(reg, S64_MIN, S64_MAX); + reg_set_urange64(reg, 0, U64_MAX); } static void reset_reg64_and_tnum(struct bpf_reg_state *reg) @@ -1983,15 +1967,14 @@ static void __update_reg32_bounds(struct bpf_reg_state *reg) { struct tnum var32_off = tnum_subreg(reg->var_off); - /* min signed is max(sign bit) | min(other bits) */ - reg->s32_min_value = max_t(s32, reg->s32_min_value, - var32_off.value | (var32_off.mask & S32_MIN)); - /* max signed is min(sign bit) | max(other bits) */ - reg->s32_max_value = min_t(s32, reg->s32_max_value, - var32_off.value | (var32_off.mask & S32_MAX)); - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value); - reg->u32_max_value = min(reg->u32_max_value, - (u32)(var32_off.value | var32_off.mask)); + reg_set_srange32(reg, + /* min signed is max(sign bit) | min(other bits) */ + max_t(s32, reg_s32_min(reg), 
var32_off.value | (var32_off.mask & S32_MIN)), + /* max signed is min(sign bit) | max(other bits) */ + min_t(s32, reg_s32_max(reg), var32_off.value | (var32_off.mask & S32_MAX))); + reg_set_urange32(reg, + max_t(u32, reg_u32_min(reg), (u32)var32_off.value), + min(reg_u32_max(reg), (u32)(var32_off.value | var32_off.mask))); } static void __update_reg64_bounds(struct bpf_reg_state *reg) @@ -2000,25 +1983,27 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) bool umin_in_tnum; /* min signed is max(sign bit) | min(other bits) */ - reg->smin_value = max_t(s64, reg->smin_value, - reg->var_off.value | (reg->var_off.mask & S64_MIN)); /* max signed is min(sign bit) | max(other bits) */ - reg->smax_value = min_t(s64, reg->smax_value, - reg->var_off.value | (reg->var_off.mask & S64_MAX)); - reg->umin_value = max(reg->umin_value, reg->var_off.value); - reg->umax_value = min(reg->umax_value, - reg->var_off.value | reg->var_off.mask); + reg_set_srange64(reg, + max_t(s64, reg_smin(reg), + reg->var_off.value | (reg->var_off.mask & S64_MIN)), + min_t(s64, reg_smax(reg), + reg->var_off.value | (reg->var_off.mask & S64_MAX))); + reg_set_urange64(reg, + max(reg_umin(reg), reg->var_off.value), + min(reg_umax(reg), + reg->var_off.value | reg->var_off.mask)); /* Check if u64 and tnum overlap in a single value */ - tnum_next = tnum_step(reg->var_off, reg->umin_value); - umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value; + tnum_next = tnum_step(reg->var_off, reg_umin(reg)); + umin_in_tnum = (reg_umin(reg) & ~reg->var_off.mask) == reg->var_off.value; tmax = reg->var_off.value | reg->var_off.mask; - if (umin_in_tnum && tnum_next > reg->umax_value) { + if (umin_in_tnum && tnum_next > reg_umax(reg)) { /* The u64 range and the tnum only overlap in umin. 
* u64: ---[xxxxxx]----- * tnum: --xx----------x- */ - ___mark_reg_known(reg, reg->umin_value); + ___mark_reg_known(reg, reg_umin(reg)); } else if (!umin_in_tnum && tnum_next == tmax) { /* The u64 range and the tnum only overlap in the maximum value * represented by the tnum, called tmax. @@ -2026,8 +2011,8 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) * tnum: xx-----x-------- */ ___mark_reg_known(reg, tmax); - } else if (!umin_in_tnum && tnum_next <= reg->umax_value && - tnum_step(reg->var_off, tnum_next) > reg->umax_value) { + } else if (!umin_in_tnum && tnum_next <= reg_umax(reg) && + tnum_step(reg->var_off, tnum_next) > reg_umax(reg)) { /* The u64 range and the tnum only overlap in between umin * (excluded) and umax. * u64: ---[xxxxxx]----- @@ -2067,28 +2052,32 @@ static void deduce_bounds_32_from_64(struct bpf_reg_state *reg) * * So we use all these insights to derive bounds for subregisters here. */ - if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) { + if ((reg_umin(reg) >> 32) == (reg_umax(reg) >> 32)) { /* u64 to u32 casting preserves validity of low 32 bits as * a range, if upper 32 bits are the same */ - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value); - reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value); + reg_set_urange32(reg, + max_t(u32, reg_u32_min(reg), (u32)reg_umin(reg)), + min_t(u32, reg_u32_max(reg), (u32)reg_umax(reg))); - if ((s32)reg->umin_value <= (s32)reg->umax_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); + if ((s32)reg_umin(reg) <= (s32)reg_umax(reg)) { + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), (s32)reg_umin(reg)), + min_t(s32, reg_s32_max(reg), (s32)reg_umax(reg))); } } - if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) { + if ((reg_smin(reg) >> 32) == (reg_smax(reg) >> 32)) { /* low 32 bits should form a proper u32 range */ - if 
((u32)reg->smin_value <= (u32)reg->smax_value) { - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value); - reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value); + if ((u32)reg_smin(reg) <= (u32)reg_smax(reg)) { + reg_set_urange32(reg, + max_t(u32, reg_u32_min(reg), (u32)reg_smin(reg)), + min_t(u32, reg_u32_max(reg), (u32)reg_smax(reg))); } /* low 32 bits should form a proper s32 range */ - if ((s32)reg->smin_value <= (s32)reg->smax_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); + if ((s32)reg_smin(reg) <= (s32)reg_smax(reg)) { + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), (s32)reg_smin(reg)), + min_t(s32, reg_s32_max(reg), (s32)reg_smax(reg))); } } /* Special case where upper bits form a small sequence of two @@ -2104,15 +2093,17 @@ static void deduce_bounds_32_from_64(struct bpf_reg_state *reg) * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister. 
*/ - if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) && - (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); + if ((u32)(reg_umin(reg) >> 32) + 1 == (u32)(reg_umax(reg) >> 32) && + (s32)reg_umin(reg) < 0 && (s32)reg_umax(reg) >= 0) { + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), (s32)reg_umin(reg)), + min_t(s32, reg_s32_max(reg), (s32)reg_umax(reg))); } - if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) && - (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); + if ((u32)(reg_smin(reg) >> 32) + 1 == (u32)(reg_smax(reg) >> 32) && + (s32)reg_smin(reg) < 0 && (s32)reg_smax(reg) >= 0) { + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), (s32)reg_smin(reg)), + min_t(s32, reg_s32_max(reg), (s32)reg_smax(reg))); } } @@ -2121,19 +2112,21 @@ static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) /* if u32 range forms a valid s32 range (due to matching sign bit), * try to learn from that */ - if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value); + if ((s32)reg_u32_min(reg) <= (s32)reg_u32_max(reg)) { + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), reg_u32_min(reg)), + min_t(s32, reg_s32_max(reg), reg_u32_max(reg))); } /* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. 
*/ - if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { - reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value); - reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value); + if ((u32)reg_s32_min(reg) <= (u32)reg_s32_max(reg)) { + reg_set_urange32(reg, + max_t(u32, reg_s32_min(reg), reg_u32_min(reg)), + min_t(u32, reg_s32_max(reg), reg_u32_max(reg))); } else { - if (reg->u32_max_value < (u32)reg->s32_min_value) { + if (reg_u32_max(reg) < (u32)reg_s32_min(reg)) { /* See __reg64_deduce_bounds() for detailed explanation. * Refine ranges in the following situation: * @@ -2143,9 +2136,11 @@ static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) * |xxxxx s32 range xxxxxxxxx] [xxxxxxx| * 0 S32_MAX S32_MIN -1 */ - reg->s32_min_value = (s32)reg->u32_min_value; - reg->u32_max_value = min_t(u32, reg->u32_max_value, reg->s32_max_value); - } else if ((u32)reg->s32_max_value < reg->u32_min_value) { + reg_set_srange32(reg, (s32)reg_u32_min(reg), reg_s32_max(reg)); + reg_set_urange32(reg, + reg_u32_min(reg), + min_t(u32, reg_u32_max(reg), reg_s32_max(reg))); + } else if ((u32)reg_s32_max(reg) < reg_u32_min(reg)) { /* * 0 U32_MAX * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | @@ -2153,8 +2148,10 @@ static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) * |xxxxxxxxx] [xxxxxxxxxxxx s32 range | * 0 S32_MAX S32_MIN -1 */ - reg->s32_max_value = (s32)reg->u32_max_value; - reg->u32_min_value = max_t(u32, reg->u32_min_value, reg->s32_min_value); + reg_set_srange32(reg, reg_s32_min(reg), (s32)reg_u32_max(reg)); + reg_set_urange32(reg, + max_t(u32, reg_u32_min(reg), reg_s32_min(reg)), + reg_u32_max(reg)); } } } @@ -2228,17 +2225,19 @@ static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) * casting umin/umax as smin/smax and checking if they form valid * range, and vice versa. Those are equivalent checks. 
*/ - if ((s64)reg->umin_value <= (s64)reg->umax_value) { - reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value); - reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value); + if ((s64)reg_umin(reg) <= (s64)reg_umax(reg)) { + reg_set_srange64(reg, + max_t(s64, reg_smin(reg), reg_umin(reg)), + min_t(s64, reg_smax(reg), reg_umax(reg))); } /* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. */ - if ((u64)reg->smin_value <= (u64)reg->smax_value) { - reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); - reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); + if ((u64)reg_smin(reg) <= (u64)reg_smax(reg)) { + reg_set_urange64(reg, + max_t(u64, reg_smin(reg), reg_umin(reg)), + min_t(u64, reg_smax(reg), reg_umax(reg))); } else { /* If the s64 range crosses the sign boundary, then it's split * between the beginning and end of the U64 domain. In that @@ -2275,10 +2274,10 @@ static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) * The first condition below corresponds to the first diagram * above. 
*/ - if (reg->umax_value < (u64)reg->smin_value) { - reg->smin_value = (s64)reg->umin_value; - reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value); - } else if ((u64)reg->smax_value < reg->umin_value) { + if (reg_umax(reg) < (u64)reg_smin(reg)) { + reg_set_srange64(reg, (s64)reg_umin(reg), reg_smax(reg)); + reg_set_urange64(reg, reg_umin(reg), min_t(u64, reg_umax(reg), reg_smax(reg))); + } else if ((u64)reg_smax(reg) < reg_umin(reg)) { /* This second condition considers the case where the u64 range * overlaps with the negative portion of the s64 range: * @@ -2288,8 +2287,8 @@ static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) * |xxxxxxxxx] [xxxxxxxxxxxx s64 range | * 0 S64_MAX S64_MIN -1 */ - reg->smax_value = (s64)reg->umax_value; - reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value); + reg_set_srange64(reg, reg_smin(reg), (s64)reg_umax(reg)); + reg_set_urange64(reg, max_t(u64, reg_umin(reg), reg_smin(reg)), reg_umax(reg)); } } } @@ -2312,15 +2311,17 @@ static void deduce_bounds_64_from_32(struct bpf_reg_state *reg) __s64 new_smin, new_smax; /* u32 -> u64 tightening, it's always well-formed */ - new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value; - new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value; - reg->umin_value = max_t(u64, reg->umin_value, new_umin); - reg->umax_value = min_t(u64, reg->umax_value, new_umax); + new_umin = (reg_umin(reg) & ~0xffffffffULL) | reg_u32_min(reg); + new_umax = (reg_umax(reg) & ~0xffffffffULL) | reg_u32_max(reg); + reg_set_urange64(reg, + max_t(u64, reg_umin(reg), new_umin), + min_t(u64, reg_umax(reg), new_umax)); /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */ - new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value; - new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value; - reg->smin_value = max_t(s64, reg->smin_value, new_smin); - reg->smax_value = min_t(s64, reg->smax_value, new_smax); + new_smin = 
(reg_smin(reg) & ~0xffffffffULL) | reg_u32_min(reg); + new_smax = (reg_smax(reg) & ~0xffffffffULL) | reg_u32_max(reg); + reg_set_srange64(reg, + max_t(s64, reg_smin(reg), new_smin), + min_t(s64, reg_smax(reg), new_smax)); /* Here we would like to handle a special case after sign extending load, * when upper bits for a 64-bit range are all 1s or all 0s. @@ -2351,13 +2352,11 @@ static void deduce_bounds_64_from_32(struct bpf_reg_state *reg) * - 0x0000_0000_7fff_ffff == (s64)S32_MAX * These relations are used in the conditions below. */ - if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) { - reg->smin_value = reg->s32_min_value; - reg->smax_value = reg->s32_max_value; - reg->umin_value = reg->s32_min_value; - reg->umax_value = reg->s32_max_value; + if (reg_s32_min(reg) >= 0 && reg_smin(reg) >= S32_MIN && reg_smax(reg) <= S32_MAX) { + reg_set_srange64(reg, reg_s32_min(reg), reg_s32_max(reg)); + reg_set_urange64(reg, reg_s32_min(reg), reg_s32_max(reg)); reg->var_off = tnum_intersect(reg->var_off, - tnum_range(reg->smin_value, reg->smax_value)); + tnum_range(reg_smin(reg), reg_smax(reg))); } } @@ -2373,11 +2372,11 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg) static void __reg_bound_offset(struct bpf_reg_state *reg) { struct tnum var64_off = tnum_intersect(reg->var_off, - tnum_range(reg->umin_value, - reg->umax_value)); + tnum_range(reg_umin(reg), + reg_umax(reg))); struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off), - tnum_range(reg->u32_min_value, - reg->u32_max_value)); + tnum_range(reg_u32_min(reg), + reg_u32_max(reg))); reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } @@ -2405,9 +2404,9 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) static bool range_bounds_violation(struct bpf_reg_state *reg) { - return (reg->umin_value > reg->umax_value || reg->smin_value > reg->smax_value || - reg->u32_min_value > reg->u32_max_value || - reg->s32_min_value > reg->s32_max_value); + 
return (reg_umin(reg) > reg_umax(reg) || reg_smin(reg) > reg_smax(reg) || + reg_u32_min(reg) > reg_u32_max(reg) || + reg_s32_min(reg) > reg_s32_max(reg)); } static bool const_tnum_range_mismatch(struct bpf_reg_state *reg) @@ -2418,8 +2417,8 @@ static bool const_tnum_range_mismatch(struct bpf_reg_state *reg) if (!tnum_is_const(reg->var_off)) return false; - return reg->umin_value != uval || reg->umax_value != uval || - reg->smin_value != sval || reg->smax_value != sval; + return reg_umin(reg) != uval || reg_umax(reg) != uval || + reg_smin(reg) != sval || reg_smax(reg) != sval; } static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg) @@ -2430,8 +2429,8 @@ static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg) if (!tnum_subreg_is_const(reg->var_off)) return false; - return reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || - reg->s32_min_value != sval32 || reg->s32_max_value != sval32; + return reg_u32_min(reg) != uval32 || reg_u32_max(reg) != uval32 || + reg_s32_min(reg) != sval32 || reg_s32_max(reg) != sval32; } static int reg_bounds_sanity_check(struct bpf_verifier_env *env, @@ -2458,10 +2457,10 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env, out: verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)", - ctx, msg, reg->umin_value, reg->umax_value, - reg->smin_value, reg->smax_value, - reg->u32_min_value, reg->u32_max_value, - reg->s32_min_value, reg->s32_max_value, + ctx, msg, reg_umin(reg), reg_umax(reg), + reg_smin(reg), reg_smax(reg), + reg_u32_min(reg), reg_u32_max(reg), + reg_s32_min(reg), reg_s32_max(reg), reg->var_off.value, reg->var_off.mask); if (env->test_reg_invariants) return -EFAULT; @@ -2476,21 +2475,17 @@ static bool __reg32_bound_s64(s32 a) static void __reg_assign_32_into_64(struct bpf_reg_state *reg) { - reg->umin_value = reg->u32_min_value; - reg->umax_value = reg->u32_max_value; + 
reg_set_urange64(reg, reg_u32_min(reg), reg_u32_max(reg)); /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must * be positive otherwise set to worse case bounds and refine later * from tnum. */ - if (__reg32_bound_s64(reg->s32_min_value) && - __reg32_bound_s64(reg->s32_max_value)) { - reg->smin_value = reg->s32_min_value; - reg->smax_value = reg->s32_max_value; - } else { - reg->smin_value = 0; - reg->smax_value = U32_MAX; - } + if (__reg32_bound_s64(reg_s32_min(reg)) && + __reg32_bound_s64(reg_s32_max(reg))) + reg_set_srange64(reg, reg_s32_min(reg), reg_s32_max(reg)); + else + reg_set_srange64(reg, 0, U32_MAX); } /* Mark a register as having a completely unknown (scalar) value. */ @@ -2534,11 +2529,12 @@ static int __mark_reg_s32_range(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; - reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min); - reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max); - - reg->smin_value = max_t(s64, reg->smin_value, s32_min); - reg->smax_value = min_t(s64, reg->smax_value, s32_max); + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), s32_min), + min_t(s32, reg_s32_max(reg), s32_max)); + reg_set_srange64(reg, + max_t(s64, reg_smin(reg), s32_min), + min_t(s64, reg_smax(reg), s32_max)); reg_bounds_sync(reg); @@ -3801,7 +3797,7 @@ static bool is_bpf_st_mem(struct bpf_insn *insn) static int get_reg_width(struct bpf_reg_state *reg) { - return fls64(reg->umax_value); + return fls64(reg_umax(reg)); } /* See comment for mark_fastcall_pattern_for_call() */ @@ -3990,8 +3986,8 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, bool zero_used = false; cur = env->cur_state->frame[env->cur_state->curframe]; - min_off = ptr_reg->smin_value + off; - max_off = ptr_reg->smax_value + off + size; + min_off = reg_smin(ptr_reg) + off; + max_off = reg_smax(ptr_reg) + off + size; if (value_regno >= 0) value_reg = &cur->regs[value_regno]; if ((value_reg && 
bpf_register_is_null(value_reg)) || @@ -4324,8 +4320,8 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg if (err) return err; - min_off = reg->smin_value + off; - max_off = reg->smax_value + off; + min_off = reg_smin(reg) + off; + max_off = reg_smax(reg) + off; mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off); return 0; @@ -4425,13 +4421,13 @@ static int check_map_access_type(struct bpf_verifier_env *env, struct bpf_reg_st if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n", - map->value_size, reg->smin_value + off, size); + map->value_size, reg_smin(reg) + off, size); return -EACCES; } if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n", - map->value_size, reg->smin_value + off, size); + map->value_size, reg_smin(reg) + off, size); return -EACCES; } @@ -4493,15 +4489,15 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_ * index'es we need to make sure that whatever we use * will have a set floor within our range. 
*/ - if (reg->smin_value < 0 && - (reg->smin_value == S64_MIN || - (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || - reg->smin_value + off < 0)) { + if (reg_smin(reg) < 0 && + (reg_smin(reg) == S64_MIN || + (off + reg_smin(reg) != (s64)(s32)(off + reg_smin(reg))) || + reg_smin(reg) + off < 0)) { verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n", reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, reg, argno, reg->smin_value + off, size, + err = __check_mem_access(env, reg, argno, reg_smin(reg) + off, size, mem_size, zero_size_allowed); if (err) { verbose(env, "%s min value is outside of the allowed memory range\n", @@ -4511,14 +4507,14 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_ /* If we haven't set a max value then we need to bail since we can't be * sure we won't do bad things. - * If reg->umax_value + off could overflow, treat that as unbounded too. + * If reg_umax(reg) + off could overflow, treat that as unbounded too. */ - if (reg->umax_value >= BPF_MAX_VAR_OFF) { + if (reg_umax(reg) >= BPF_MAX_VAR_OFF) { verbose(env, "%s unbounded memory access, make sure to bounds check any such access\n", reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, reg, argno, reg->umax_value + off, size, + err = __check_mem_access(env, reg, argno, reg_umax(reg) + off, size, mem_size, zero_size_allowed); if (err) { verbose(env, "%s max value is outside of the allowed memory range\n", @@ -4546,7 +4542,7 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, return -EACCES; } - if (reg->smin_value < 0) { + if (reg_smin(reg) < 0) { verbose(env, "negative offset %s ptr %s off=%lld disallowed\n", reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value); return -EACCES; @@ -4846,8 +4842,8 @@ static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state * * this program. 
To check that [x1, x2) overlaps with [y1, y2), * it is sufficient to check x1 < y2 && y1 < x2. */ - if (reg->smin_value + off < p + field->size && - p < reg->umax_value + off + size) { + if (reg_smin(reg) + off < p + field->size && + p < reg_umax(reg) + off + size) { switch (field->type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: @@ -4942,14 +4938,14 @@ static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_stat return err; /* __check_mem_access has made sure "off + size - 1" is within u16. - * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, + * reg_umax(reg) can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info * that __check_mem_access would have rejected this pkt access. - * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. + * Therefore, "off + reg_umax(reg) + size - 1" won't overflow u32. */ env->prog->aux->max_pkt_offset = max_t(u32, env->prog->aux->max_pkt_offset, - off + reg->umax_value + size - 1); + off + reg_umax(reg) + size - 1); return 0; } @@ -5010,7 +5006,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct b err = __check_ptr_off_reg(env, reg, argno, fixed_off_ok); if (err) return err; - off += reg->umax_value; + off += reg_umax(reg); err = __check_ctx_access(env, insn_idx, off, access_size, t, info); if (err) @@ -5037,7 +5033,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn_access_aux info = {}; bool valid; - if (reg->smin_value < 0) { + if (reg_smin(reg) < 0) { verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n", reg_arg_name(env, argno)); return -EACCES; @@ -5655,15 +5651,12 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) /* fix arithmetic bounds */ mask = ((u64)1 << (size * 8)) - 1; - if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { - reg->umin_value &= mask; - 
reg->umax_value &= mask; + if ((reg_umin(reg) & ~mask) == (reg_umax(reg) & ~mask)) { + reg_set_urange64(reg, reg_umin(reg) & mask, reg_umax(reg) & mask); } else { - reg->umin_value = 0; - reg->umax_value = mask; + reg_set_urange64(reg, 0, mask); } - reg->smin_value = reg->umin_value; - reg->smax_value = reg->umax_value; + reg_set_srange64(reg, reg_umin(reg), reg_umax(reg)); /* If size is smaller than 32bit register the 32bit register * values are also truncated so we push 64-bit bounds into @@ -5678,19 +5671,18 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) static void set_sext64_default_val(struct bpf_reg_state *reg, int size) { if (size == 1) { - reg->smin_value = reg->s32_min_value = S8_MIN; - reg->smax_value = reg->s32_max_value = S8_MAX; + reg_set_srange64(reg, S8_MIN, S8_MAX); + reg_set_srange32(reg, S8_MIN, S8_MAX); } else if (size == 2) { - reg->smin_value = reg->s32_min_value = S16_MIN; - reg->smax_value = reg->s32_max_value = S16_MAX; + reg_set_srange64(reg, S16_MIN, S16_MAX); + reg_set_srange32(reg, S16_MIN, S16_MAX); } else { /* size == 4 */ - reg->smin_value = reg->s32_min_value = S32_MIN; - reg->smax_value = reg->s32_max_value = S32_MAX; + reg_set_srange64(reg, S32_MIN, S32_MAX); + reg_set_srange32(reg, S32_MIN, S32_MAX); } - reg->umin_value = reg->u32_min_value = 0; - reg->umax_value = U64_MAX; - reg->u32_max_value = U32_MAX; + reg_set_urange64(reg, 0, U64_MAX); + reg_set_urange32(reg, 0, U32_MAX); reg->var_off = tnum_unknown; } @@ -5711,29 +5703,29 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) reg->var_off = tnum_const((s32)u64_cval); u64_cval = reg->var_off.value; - reg->smax_value = reg->smin_value = u64_cval; - reg->umax_value = reg->umin_value = u64_cval; - reg->s32_max_value = reg->s32_min_value = u64_cval; - reg->u32_max_value = reg->u32_min_value = u64_cval; + reg_set_srange64(reg, u64_cval, u64_cval); + reg_set_urange64(reg, u64_cval, u64_cval); + reg_set_srange32(reg, u64_cval, u64_cval); 
+ reg_set_urange32(reg, u64_cval, u64_cval); return; } - top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits; - top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits; + top_smax_value = ((u64)reg_smax(reg) >> num_bits) << num_bits; + top_smin_value = ((u64)reg_smin(reg) >> num_bits) << num_bits; if (top_smax_value != top_smin_value) goto out; /* find the s64_min and s64_min after sign extension */ if (size == 1) { - init_s64_max = (s8)reg->smax_value; - init_s64_min = (s8)reg->smin_value; + init_s64_max = (s8)reg_smax(reg); + init_s64_min = (s8)reg_smin(reg); } else if (size == 2) { - init_s64_max = (s16)reg->smax_value; - init_s64_min = (s16)reg->smin_value; + init_s64_max = (s16)reg_smax(reg); + init_s64_min = (s16)reg_smin(reg); } else { - init_s64_max = (s32)reg->smax_value; - init_s64_min = (s32)reg->smin_value; + init_s64_max = (s32)reg_smax(reg); + init_s64_min = (s32)reg_smin(reg); } s64_max = max(init_s64_max, init_s64_min); @@ -5741,10 +5733,10 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { - reg->s32_min_value = reg->smin_value = s64_min; - reg->s32_max_value = reg->smax_value = s64_max; - reg->u32_min_value = reg->umin_value = s64_min; - reg->u32_max_value = reg->umax_value = s64_max; + reg_set_srange64(reg, s64_min, s64_max); + reg_set_urange64(reg, s64_min, s64_max); + reg_set_srange32(reg, s64_min, s64_max); + reg_set_urange32(reg, s64_min, s64_max); reg->var_off = tnum_range(s64_min, s64_max); return; } @@ -5755,16 +5747,12 @@ out: static void set_sext32_default_val(struct bpf_reg_state *reg, int size) { - if (size == 1) { - reg->s32_min_value = S8_MIN; - reg->s32_max_value = S8_MAX; - } else { + if (size == 1) + reg_set_srange32(reg, S8_MIN, S8_MAX); + else /* size == 2 */ - reg->s32_min_value = S16_MIN; - reg->s32_max_value = S16_MAX; - } - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + 
reg_set_srange32(reg, S16_MIN, S16_MAX); + reg_set_urange32(reg, 0, U32_MAX); reg->var_off = tnum_subreg(tnum_unknown); } @@ -5782,34 +5770,32 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) reg->var_off = tnum_const((s16)u32_val); u32_val = reg->var_off.value; - reg->s32_min_value = reg->s32_max_value = u32_val; - reg->u32_min_value = reg->u32_max_value = u32_val; + reg_set_srange32(reg, u32_val, u32_val); + reg_set_urange32(reg, u32_val, u32_val); return; } - top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits; - top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits; + top_smax_value = ((u32)reg_s32_max(reg) >> num_bits) << num_bits; + top_smin_value = ((u32)reg_s32_min(reg) >> num_bits) << num_bits; if (top_smax_value != top_smin_value) goto out; /* find the s32_min and s32_min after sign extension */ if (size == 1) { - init_s32_max = (s8)reg->s32_max_value; - init_s32_min = (s8)reg->s32_min_value; + init_s32_max = (s8)reg_s32_max(reg); + init_s32_min = (s8)reg_s32_min(reg); } else { /* size == 2 */ - init_s32_max = (s16)reg->s32_max_value; - init_s32_min = (s16)reg->s32_min_value; + init_s32_max = (s16)reg_s32_max(reg); + init_s32_min = (s16)reg_s32_min(reg); } s32_max = max(init_s32_max, init_s32_min); s32_min = min(init_s32_max, init_s32_min); if ((s32_min >= 0) == (s32_max >= 0)) { - reg->s32_min_value = s32_min; - reg->s32_max_value = s32_max; - reg->u32_min_value = (u32)s32_min; - reg->u32_max_value = (u32)s32_max; + reg_set_srange32(reg, s32_min, s32_max); + reg_set_urange32(reg, (u32)s32_min, (u32)s32_max); reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max)); return; } @@ -6266,14 +6252,14 @@ static int check_stack_access_within_bounds( min_off = (s64)reg->var_off.value + off; max_off = min_off + access_size; } else { - if (reg->smax_value >= BPF_MAX_VAR_OFF || - reg->smin_value <= -BPF_MAX_VAR_OFF) { + if (reg_smax(reg) >= BPF_MAX_VAR_OFF || + reg_smin(reg) <= -BPF_MAX_VAR_OFF) { verbose(env, 
"invalid unbounded variable-offset%s stack %s\n", err_extra, reg_arg_name(env, argno)); return -EACCES; } - min_off = reg->smin_value + off; - max_off = reg->smax_value + off + access_size; + min_off = reg_smin(reg) + off; + max_off = reg_smax(reg) + off + access_size; } err = check_stack_slot_within_bounds(env, min_off, state, type); @@ -6891,8 +6877,8 @@ static int check_stack_range_initialized( if (meta && meta->raw_mode) meta = NULL; - min_off = reg->smin_value + off; - max_off = reg->smax_value + off; + min_off = reg_smin(reg) + off; + max_off = reg_smax(reg) + off; } if (meta && meta->raw_mode) { @@ -7048,8 +7034,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ zero_size_allowed); if (err) return err; - if (env->prog->aux->max_ctx_offset < reg->umax_value + access_size) - env->prog->aux->max_ctx_offset = reg->umax_value + access_size; + if (env->prog->aux->max_ctx_offset < reg_umax(reg) + access_size) + env->prog->aux->max_ctx_offset = reg_umax(reg) + access_size; return 0; } fallthrough; @@ -7088,7 +7074,7 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed. */ - meta->msize_max_value = size_reg->umax_value; + meta->msize_max_value = reg_umax(size_reg); /* The register is SCALAR_VALUE; the access check happens using * its boundaries. 
For unprivileged variable accesses, disable @@ -7098,24 +7084,24 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, if (!tnum_is_const(size_reg->var_off)) meta = NULL; - if (size_reg->smin_value < 0) { + if (reg_smin(size_reg) < 0) { verbose(env, "%s min value is negative, either use unsigned or 'var &= const'\n", reg_arg_name(env, size_argno)); return -EACCES; } - if (size_reg->umin_value == 0 && !zero_size_allowed) { + if (reg_umin(size_reg) == 0 && !zero_size_allowed) { verbose(env, "%s invalid zero-sized read: u64=[%lld,%lld]\n", - reg_arg_name(env, size_argno), size_reg->umin_value, size_reg->umax_value); + reg_arg_name(env, size_argno), reg_umin(size_reg), reg_umax(size_reg)); return -EACCES; } - if (size_reg->umax_value >= BPF_MAX_VAR_SIZ) { + if (reg_umax(size_reg) >= BPF_MAX_VAR_SIZ) { verbose(env, "%s unbounded memory access, use 'var &= const' or 'if (var < const)'\n", reg_arg_name(env, size_argno)); return -EACCES; } - err = check_helper_mem_access(env, mem_reg, mem_argno, size_reg->umax_value, + err = check_helper_mem_access(env, mem_reg, mem_argno, reg_umax(size_reg), access_type, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, reg_from_argno(size_argno)); @@ -9848,9 +9834,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) { if (range.return_32bit) - return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval; + return range.minval <= reg_s32_min(reg) && reg_s32_max(reg) <= range.maxval; else - return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; + return range.minval <= reg_smin(reg) && reg_smax(reg) <= range.maxval; } static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) @@ -9959,21 +9945,15 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, case BPF_FUNC_probe_read_str: case BPF_FUNC_probe_read_kernel_str: case 
BPF_FUNC_probe_read_user_str: - ret_reg->smax_value = meta->msize_max_value; - ret_reg->s32_max_value = meta->msize_max_value; - ret_reg->smin_value = -MAX_ERRNO; - ret_reg->s32_min_value = -MAX_ERRNO; + reg_set_srange64(ret_reg, -MAX_ERRNO, meta->msize_max_value); + reg_set_srange32(ret_reg, -MAX_ERRNO, meta->msize_max_value); reg_bounds_sync(ret_reg); break; case BPF_FUNC_get_smp_processor_id: - ret_reg->umax_value = nr_cpu_ids - 1; - ret_reg->u32_max_value = nr_cpu_ids - 1; - ret_reg->smax_value = nr_cpu_ids - 1; - ret_reg->s32_max_value = nr_cpu_ids - 1; - ret_reg->umin_value = 0; - ret_reg->u32_min_value = 0; - ret_reg->smin_value = 0; - ret_reg->s32_min_value = 0; + reg_set_urange64(ret_reg, 0, nr_cpu_ids - 1); + reg_set_urange32(ret_reg, 0, nr_cpu_ids - 1); + reg_set_srange64(ret_reg, 0, nr_cpu_ids - 1); + reg_set_srange32(ret_reg, 0, nr_cpu_ids - 1); reg_bounds_sync(ret_reg); break; } @@ -10438,7 +10418,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = mark_chain_precision(env, BPF_REG_1); if (err) return err; - if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) { + if (cur_func(env)->callback_depth < reg_umax(&regs[BPF_REG_1])) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_loop_callback_state); } else { @@ -13403,7 +13383,7 @@ static bool check_reg_sane_offset_scalar(struct bpf_verifier_env *env, { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; - s64 smin = reg->smin_value; + s64 smin = reg_smin(reg); if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", @@ -13432,7 +13412,7 @@ static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env, { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; - s64 smin = reg->smin_value; + s64 smin = reg_smin(reg); if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "%s pointer offset %lld is 
not allowed\n", @@ -13474,7 +13454,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; - ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value; + ptr_limit = mask_to_left ? reg_smin(ptr_reg) : reg_umax(ptr_reg); break; default: return REASON_TYPE; @@ -13563,7 +13543,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux; struct bpf_verifier_state *vstate = env->cur_state; bool off_is_imm = tnum_is_const(off_reg->var_off); - bool off_is_neg = off_reg->smin_value < 0; + bool off_is_neg = reg_smin(off_reg) < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; @@ -13582,7 +13562,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (!commit_window) { if (!tnum_is_const(off_reg->var_off) && - (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) + (reg_smin(off_reg) < 0) != (reg_smax(off_reg) < 0)) return REASON_BOUNDS; info->mask_to_left = (opcode == BPF_ADD && off_is_neg) || @@ -13776,10 +13756,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); - s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, - smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; - u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, - umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + s64 smin_val = reg_smin(off_reg), smax_val = reg_smax(off_reg), + smin_ptr = reg_smin(ptr_reg), smax_ptr = reg_smax(ptr_reg); + u64 umin_val = reg_umin(off_reg), umax_val = reg_umax(off_reg), + umin_ptr = reg_umin(ptr_reg), umax_ptr = reg_umax(ptr_reg); struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = 
insn->dst_reg; @@ -13881,15 +13861,22 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * added into the variable offset, and we copy the fixed offset * from ptr_reg. */ - if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) || - check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + { + s64 smin_res, smax_res; + u64 umin_res, umax_res; + + if (check_add_overflow(smin_ptr, smin_val, &smin_res) || + check_add_overflow(smax_ptr, smax_val, &smax_res)) { + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); + } else { + reg_set_srange64(dst_reg, smin_res, smax_res); + } + if (check_add_overflow(umin_ptr, umin_val, &umin_res) || + check_add_overflow(umax_ptr, umax_val, &umax_res)) { + reg_set_urange64(dst_reg, 0, U64_MAX); + } else { + reg_set_urange64(dst_reg, umin_res, umax_res); } - if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) || - check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; @@ -13925,20 +13912,23 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* A new variable offset is created. If the subtrahend is known * nonnegative, then any reg->range we had before is still good. 
*/ - if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) || - check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) { + { + s64 smin_res, smax_res; + + if (check_sub_overflow(smin_ptr, smax_val, &smin_res) || + check_sub_overflow(smax_ptr, smin_val, &smax_res)) { /* Overflow possible, we know nothing */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); + } else { + reg_set_srange64(dst_reg, smin_res, smax_res); + } } if (umin_ptr < umax_val) { /* Overflow possible, we know nothing */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; + reg_set_urange64(dst_reg, 0, U64_MAX); } else { /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->umin_value = umin_ptr - umax_val; - dst_reg->umax_value = umax_ptr - umin_val; + reg_set_urange64(dst_reg, umin_ptr - umax_val, umax_ptr - umin_val); } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; @@ -13996,18 +13986,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; - u32 umax_val = src_reg->u32_max_value; + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + u32 umin = reg_u32_min(dst_reg); + u32 umax = reg_u32_max(dst_reg); + u32 umin_val = reg_u32_min(src_reg); + u32 umax_val = reg_u32_max(src_reg); bool min_overflow, max_overflow; - if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) || - check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) { - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + if (check_add_overflow(smin, reg_s32_min(src_reg), &smin) || + check_add_overflow(smax, reg_s32_max(src_reg), &smax)) { 
+ smin = S32_MIN; + smax = S32_MAX; } /* If either all additions overflow or no additions overflow, then @@ -14015,30 +14005,33 @@ static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, * dst_umax + src_umax. Otherwise (some additions overflow), set * the output bounds to unbounded. */ - min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); - max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + min_overflow = check_add_overflow(umin, umin_val, &umin); + max_overflow = check_add_overflow(umax, umax_val, &umax); if (!min_overflow && max_overflow) { - *dst_umin = 0; - *dst_umax = U32_MAX; + umin = 0; + umax = U32_MAX; } + + reg_set_srange32(dst_reg, smin, smax); + reg_set_urange32(dst_reg, umin, umax); } static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 umin_val = src_reg->umin_value; - u64 umax_val = src_reg->umax_value; + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + u64 umin = reg_umin(dst_reg); + u64 umax = reg_umax(dst_reg); + u64 umin_val = reg_umin(src_reg); + u64 umax_val = reg_umax(src_reg); bool min_overflow, max_overflow; - if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || - check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + if (check_add_overflow(smin, reg_smin(src_reg), &smin) || + check_add_overflow(smax, reg_smax(src_reg), &smax)) { + smin = S64_MIN; + smax = S64_MAX; } /* If either all additions overflow or no additions overflow, then @@ -14046,31 +14039,34 @@ static void scalar_min_max_add(struct bpf_reg_state *dst_reg, * dst_umax + src_umax. Otherwise (some additions overflow), set * the output bounds to unbounded. 
*/ - min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); - max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + min_overflow = check_add_overflow(umin, umin_val, &umin); + max_overflow = check_add_overflow(umax, umax_val, &umax); if (!min_overflow && max_overflow) { - *dst_umin = 0; - *dst_umax = U64_MAX; + umin = 0; + umax = U64_MAX; } + + reg_set_srange64(dst_reg, smin, smax); + reg_set_urange64(dst_reg, umin, umax); } static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; - u32 umax_val = src_reg->u32_max_value; + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + u32 umin = reg_u32_min(dst_reg); + u32 umax = reg_u32_max(dst_reg); + u32 umin_val = reg_u32_min(src_reg); + u32 umax_val = reg_u32_max(src_reg); bool min_underflow, max_underflow; - if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) || - check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) { + if (check_sub_overflow(smin, reg_s32_max(src_reg), &smin) || + check_sub_overflow(smax, reg_s32_min(src_reg), &smax)) { /* Overflow possible, we know nothing */ - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + smin = S32_MIN; + smax = S32_MAX; } /* If either all subtractions underflow or no subtractions @@ -14078,31 +14074,34 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, * dst_umax = dst_umax - src_umin. Otherwise (some subtractions * underflow), set the output bounds to unbounded. 
*/ - min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); - max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + min_underflow = check_sub_overflow(umin, umax_val, &umin); + max_underflow = check_sub_overflow(umax, umin_val, &umax); if (min_underflow && !max_underflow) { - *dst_umin = 0; - *dst_umax = U32_MAX; + umin = 0; + umax = U32_MAX; } + + reg_set_srange32(dst_reg, smin, smax); + reg_set_urange32(dst_reg, umin, umax); } static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 umin_val = src_reg->umin_value; - u64 umax_val = src_reg->umax_value; + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + u64 umin = reg_umin(dst_reg); + u64 umax = reg_umax(dst_reg); + u64 umin_val = reg_umin(src_reg); + u64 umax_val = reg_umax(src_reg); bool min_underflow, max_underflow; - if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || - check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) { + if (check_sub_overflow(smin, reg_smax(src_reg), &smin) || + check_sub_overflow(smax, reg_smin(src_reg), &smax)) { /* Overflow possible, we know nothing */ - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + smin = S64_MIN; + smax = S64_MAX; } /* If either all subtractions underflow or no subtractions @@ -14110,113 +14109,116 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, * dst_umax = dst_umax - src_umin. Otherwise (some subtractions * underflow), set the output bounds to unbounded. 
*/ - min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); - max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + min_underflow = check_sub_overflow(umin, umax_val, &umin); + max_underflow = check_sub_overflow(umax, umin_val, &umax); if (min_underflow && !max_underflow) { - *dst_umin = 0; - *dst_umax = U64_MAX; + umin = 0; + umax = U64_MAX; } + + reg_set_srange64(dst_reg, smin, smax); + reg_set_urange64(dst_reg, umin, umax); } static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + u32 umin = reg_u32_min(dst_reg); + u32 umax = reg_u32_max(dst_reg); s32 tmp_prod[4]; - if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) || - check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) { + if (check_mul_overflow(umax, reg_u32_max(src_reg), &umax) || + check_mul_overflow(umin, reg_u32_min(src_reg), &umin)) { /* Overflow possible, we know nothing */ - *dst_umin = 0; - *dst_umax = U32_MAX; + umin = 0; + umax = U32_MAX; } - if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) || - check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) || - check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) || - check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) { + if (check_mul_overflow(smin, reg_s32_min(src_reg), &tmp_prod[0]) || + check_mul_overflow(smin, reg_s32_max(src_reg), &tmp_prod[1]) || + check_mul_overflow(smax, reg_s32_min(src_reg), &tmp_prod[2]) || + check_mul_overflow(smax, reg_s32_max(src_reg), &tmp_prod[3])) { /* Overflow possible, we know nothing */ - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + smin = S32_MIN; + smax = S32_MAX; } else { - *dst_smin = min_array(tmp_prod, 4); - 
*dst_smax = max_array(tmp_prod, 4); + smin = min_array(tmp_prod, 4); + smax = max_array(tmp_prod, 4); } + + reg_set_srange32(dst_reg, smin, smax); + reg_set_urange32(dst_reg, umin, umax); } static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + u64 umin = reg_umin(dst_reg); + u64 umax = reg_umax(dst_reg); s64 tmp_prod[4]; - if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) || - check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) { + if (check_mul_overflow(umax, reg_umax(src_reg), &umax) || + check_mul_overflow(umin, reg_umin(src_reg), &umin)) { /* Overflow possible, we know nothing */ - *dst_umin = 0; - *dst_umax = U64_MAX; + umin = 0; + umax = U64_MAX; } - if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) || - check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) || - check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) || - check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) { + if (check_mul_overflow(smin, reg_smin(src_reg), &tmp_prod[0]) || + check_mul_overflow(smin, reg_smax(src_reg), &tmp_prod[1]) || + check_mul_overflow(smax, reg_smin(src_reg), &tmp_prod[2]) || + check_mul_overflow(smax, reg_smax(src_reg), &tmp_prod[3])) { /* Overflow possible, we know nothing */ - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + smin = S64_MIN; + smax = S64_MAX; } else { - *dst_smin = min_array(tmp_prod, 4); - *dst_smax = max_array(tmp_prod, 4); + smin = min_array(tmp_prod, 4); + smax = max_array(tmp_prod, 4); } + + reg_set_srange64(dst_reg, smin, smax); + reg_set_urange64(dst_reg, umin, umax); } static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 *dst_umin = &dst_reg->u32_min_value; - 
u32 *dst_umax = &dst_reg->u32_max_value; - u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */ - *dst_umin = *dst_umin / src_val; - *dst_umax = *dst_umax / src_val; + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) / src_val, + reg_u32_max(dst_reg) / src_val); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */ - *dst_umin = div64_u64(*dst_umin, src_val); - *dst_umax = div64_u64(*dst_umax, src_val); + reg_set_urange64(dst_reg, div64_u64(reg_umin(dst_reg), src_val), + div64_u64(reg_umax(dst_reg), src_val)); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */ s32 res1, res2; /* BPF div specification: S32_MIN / -1 = S32_MIN */ - if (*dst_smin == S32_MIN && src_val == -1) { + if (smin == S32_MIN && src_val == -1) { /* * If the dividend range contains more than just S32_MIN, * we cannot precisely track the result, so it becomes unbounded. 
@@ -14225,35 +14227,35 @@ static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX] * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN. */ - if (*dst_smax != S32_MIN) { - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + if (smax != S32_MIN) { + smin = S32_MIN; + smax = S32_MAX; } goto reset; } - res1 = *dst_smin / src_val; - res2 = *dst_smax / src_val; - *dst_smin = min(res1, res2); - *dst_smax = max(res1, res2); + res1 = smin / src_val; + res2 = smax / src_val; + smin = min(res1, res2); + smax = max(res1, res2); reset: + reg_set_srange32(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; + reg_set_urange32(dst_reg, 0, U32_MAX); reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */ s64 res1, res2; /* BPF div specification: S64_MIN / -1 = S64_MIN */ - if (*dst_smin == S64_MIN && src_val == -1) { + if (smin == S64_MIN && src_val == -1) { /* * If the dividend range contains more than just S64_MIN, * we cannot precisely track the result, so it becomes unbounded. @@ -14262,79 +14264,69 @@ static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX] * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN. 
*/ - if (*dst_smax != S64_MIN) { - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + if (smax != S64_MIN) { + smin = S64_MIN; + smax = S64_MAX; } goto reset; } - res1 = div64_s64(*dst_smin, src_val); - res2 = div64_s64(*dst_smax, src_val); - *dst_smin = min(res1, res2); - *dst_smax = max(res1, res2); + res1 = div64_s64(smin, src_val); + res2 = div64_s64(smax, src_val); + smin = min(res1, res2); + smax = max(res1, res2); reset: + reg_set_srange64(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; + reg_set_urange64(dst_reg, 0, U64_MAX); reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */ u32 res_max = src_val - 1; /* * If dst_umax <= res_max, the result remains unchanged. * e.g., [2, 5] % 10 = [2, 5]. */ - if (*dst_umax <= res_max) + if (reg_u32_max(dst_reg) <= res_max) return; - *dst_umin = 0; - *dst_umax = min(*dst_umax, res_max); + reg_set_urange32(dst_reg, 0, min(reg_u32_max(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */ u64 res_max = src_val - 1; /* * If dst_umax <= res_max, the result remains unchanged. * e.g., [2, 5] % 10 = [2, 5]. 
*/ - if (*dst_umax <= res_max) + if (reg_umax(dst_reg) <= res_max) return; - *dst_umin = 0; - *dst_umax = min(*dst_umax, res_max); + reg_set_urange64(dst_reg, 0, min(reg_umax(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */ /* * Safe absolute value calculation: @@ -14354,33 +14346,27 @@ static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, * If the dividend is already within the result range, * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. */ - if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + if (reg_s32_min(dst_reg) >= -res_max_abs && reg_s32_max(dst_reg) <= res_max_abs) return; /* General case: result has the same sign as the dividend. */ - if (*dst_smin >= 0) { - *dst_smin = 0; - *dst_smax = min(*dst_smax, res_max_abs); - } else if (*dst_smax <= 0) { - *dst_smax = 0; - *dst_smin = max(*dst_smin, -res_max_abs); + if (reg_s32_min(dst_reg) >= 0) { + reg_set_srange32(dst_reg, 0, min(reg_s32_max(dst_reg), res_max_abs)); + } else if (reg_s32_max(dst_reg) <= 0) { + reg_set_srange32(dst_reg, max(reg_s32_min(dst_reg), -res_max_abs), 0); } else { - *dst_smin = -res_max_abs; - *dst_smax = res_max_abs; + reg_set_srange32(dst_reg, -res_max_abs, res_max_abs); } /* Reset other ranges/tnum to unbounded/unknown. 
*/ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; + reg_set_urange32(dst_reg, 0, U32_MAX); reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */ /* * Safe absolute value calculation: @@ -14400,24 +14386,20 @@ static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, * If the dividend is already within the result range, * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. */ - if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + if (reg_smin(dst_reg) >= -res_max_abs && reg_smax(dst_reg) <= res_max_abs) return; /* General case: result has the same sign as the dividend. */ - if (*dst_smin >= 0) { - *dst_smin = 0; - *dst_smax = min(*dst_smax, res_max_abs); - } else if (*dst_smax <= 0) { - *dst_smax = 0; - *dst_smin = max(*dst_smin, -res_max_abs); + if (reg_smin(dst_reg) >= 0) { + reg_set_srange64(dst_reg, 0, min(reg_smax(dst_reg), res_max_abs)); + } else if (reg_smax(dst_reg) <= 0) { + reg_set_srange64(dst_reg, max(reg_smin(dst_reg), -res_max_abs), 0); } else { - *dst_smin = -res_max_abs; - *dst_smax = res_max_abs; + reg_set_srange64(dst_reg, -res_max_abs, res_max_abs); } /* Reset other ranges/tnum to unbounded/unknown. 
*/ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; + reg_set_urange64(dst_reg, 0, U64_MAX); reset_reg32_and_tnum(dst_reg); } @@ -14427,7 +14409,7 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - u32 umax_val = src_reg->u32_max_value; + u32 umax_val = reg_u32_max(src_reg); if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -14437,19 +14419,15 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ - dst_reg->u32_min_value = var32_off.value; - dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); + reg_set_urange32(dst_reg, var32_off.value, min(reg_u32_max(dst_reg), umax_val)); /* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. 
*/ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) + reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); + else + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); } static void scalar_min_max_and(struct bpf_reg_state *dst_reg, @@ -14457,7 +14435,7 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - u64 umax_val = src_reg->umax_value; + u64 umax_val = reg_umax(src_reg); if (src_known && dst_known) { __mark_reg_known(dst_reg, dst_reg->var_off.value); @@ -14467,19 +14445,15 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ - dst_reg->umin_value = dst_reg->var_off.value; - dst_reg->umax_value = min(dst_reg->umax_value, umax_val); + reg_set_urange64(dst_reg, dst_reg->var_off.value, min(reg_umax(dst_reg), umax_val)); /* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. 
*/ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } + if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) + reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); + else + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14490,7 +14464,7 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - u32 umin_val = src_reg->u32_min_value; + u32 umin_val = reg_u32_min(src_reg); if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -14500,19 +14474,16 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); - dst_reg->u32_max_value = var32_off.value | var32_off.mask; + reg_set_urange32(dst_reg, max(reg_u32_min(dst_reg), umin_val), + var32_off.value | var32_off.mask); /* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. 
*/ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) + reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); + else + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); } static void scalar_min_max_or(struct bpf_reg_state *dst_reg, @@ -14520,7 +14491,7 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - u64 umin_val = src_reg->umin_value; + u64 umin_val = reg_umin(src_reg); if (src_known && dst_known) { __mark_reg_known(dst_reg, dst_reg->var_off.value); @@ -14530,19 +14501,16 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - dst_reg->umin_value = max(dst_reg->umin_value, umin_val); - dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; + reg_set_urange64(dst_reg, max(reg_umin(dst_reg), umin_val), + dst_reg->var_off.value | dst_reg->var_off.mask); /* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. 
*/ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } + if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) + reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); + else + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14560,19 +14528,15 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, } /* We get both minimum and maximum from the var32_off. */ - dst_reg->u32_min_value = var32_off.value; - dst_reg->u32_max_value = var32_off.value | var32_off.mask; + reg_set_urange32(dst_reg, var32_off.value, var32_off.value | var32_off.mask); /* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. */ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) + reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); + else + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); } static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, @@ -14588,19 +14552,16 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, } /* We get both minimum and maximum from the var_off. */ - dst_reg->umin_value = dst_reg->var_off.value; - dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; + reg_set_urange64(dst_reg, dst_reg->var_off.value, + dst_reg->var_off.value | dst_reg->var_off.mask); /* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. 
*/ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } + if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) + reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); + else + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); __update_reg_bounds(dst_reg); } @@ -14611,23 +14572,20 @@ static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, /* We lose all sign bit information (except what we can pick * up from var_off) */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); /* If we might shift our top bit out, then we know nothing */ - if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) { - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; - } else { - dst_reg->u32_min_value <<= umin_val; - dst_reg->u32_max_value <<= umax_val; - } + if (umax_val > 31 || reg_u32_max(dst_reg) > 1ULL << (31 - umax_val)) + reg_set_urange32(dst_reg, 0, U32_MAX); + else + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) << umin_val, + reg_u32_max(dst_reg) << umax_val); } static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 umax_val = src_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; + u32 umax_val = reg_u32_max(src_reg); + u32 umin_val = reg_u32_min(src_reg); /* u32 alu operation will zext upper bits */ struct tnum subreg = tnum_subreg(dst_reg->var_off); @@ -14649,29 +14607,25 @@ static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, * because s32 bounds don't flip sign when shifting to the left by * 32bits. 
*/ - if (umin_val == 32 && umax_val == 32) { - dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; - dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; - } else { - dst_reg->smax_value = S64_MAX; - dst_reg->smin_value = S64_MIN; - } + if (umin_val == 32 && umax_val == 32) + reg_set_srange64(dst_reg, (s64)reg_s32_min(dst_reg) << 32, + (s64)reg_s32_max(dst_reg) << 32); + else + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); /* If we might shift our top bit out, then we know nothing */ - if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - dst_reg->umin_value <<= umin_val; - dst_reg->umax_value <<= umax_val; - } + if (reg_umax(dst_reg) > 1ULL << (63 - umax_val)) + reg_set_urange64(dst_reg, 0, U64_MAX); + else + reg_set_urange64(dst_reg, reg_umin(dst_reg) << umin_val, + reg_umax(dst_reg) << umax_val); } static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umax_val = src_reg->umax_value; - u64 umin_val = src_reg->umin_value; + u64 umax_val = reg_umax(src_reg); + u64 umin_val = reg_umin(src_reg); /* scalar64 calc uses 32bit unshifted bounds so must be called first */ __scalar64_min_max_lsh(dst_reg, umin_val, umax_val); @@ -14686,8 +14640,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { struct tnum subreg = tnum_subreg(dst_reg->var_off); - u32 umax_val = src_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; + u32 umax_val = reg_u32_max(src_reg); + u32 umin_val = reg_u32_min(src_reg); /* BPF_RSH is an unsigned shift. If the value in dst_reg might * be negative, then either: @@ -14703,12 +14657,11 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. 
*/ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); dst_reg->var_off = tnum_rshift(subreg, umin_val); - dst_reg->u32_min_value >>= umax_val; - dst_reg->u32_max_value >>= umin_val; + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) >> umax_val, + reg_u32_max(dst_reg) >> umin_val); __mark_reg64_unbounded(dst_reg); __update_reg32_bounds(dst_reg); @@ -14717,8 +14670,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umax_val = src_reg->umax_value; - u64 umin_val = src_reg->umin_value; + u64 umax_val = reg_umax(src_reg); + u64 umin_val = reg_umin(src_reg); /* BPF_RSH is an unsigned shift. If the value in dst_reg might * be negative, then either: @@ -14734,11 +14687,10 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); - dst_reg->umin_value >>= umax_val; - dst_reg->umax_value >>= umin_val; + reg_set_urange64(dst_reg, reg_umin(dst_reg) >> umax_val, + reg_umax(dst_reg) >> umin_val); /* Its not easy to operate on alu32 bounds here because it depends * on bits being shifted in. Take easy way out and mark unbounded @@ -14751,21 +14703,21 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umin_val = src_reg->u32_min_value; + u64 umin_val = reg_u32_min(src_reg); /* Upon reaching here, src_known is true and * umax_val is equal to umin_val. 
*/ - dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val); - dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val); + reg_set_srange32(dst_reg, + (u32)(((s32)reg_s32_min(dst_reg)) >> umin_val), + (u32)(((s32)reg_s32_max(dst_reg)) >> umin_val)); dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32); /* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result. */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; + reg_set_urange32(dst_reg, 0, U32_MAX); __mark_reg64_unbounded(dst_reg); __update_reg32_bounds(dst_reg); @@ -14774,21 +14726,20 @@ static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umin_val = src_reg->umin_value; + u64 umin_val = reg_umin(src_reg); /* Upon reaching here, src_known is true and umax_val is equal * to umin_val. */ - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; + reg_set_srange64(dst_reg, reg_smin(dst_reg) >> umin_val, + reg_smax(dst_reg) >> umin_val); dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64); /* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result. */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; + reg_set_urange64(dst_reg, 0, U64_MAX); /* Its not easy to operate on alu32 bounds here because it depends * on bits being shifted in from upper 32-bits. 
Take easy way out @@ -14855,13 +14806,13 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, if (insn_bitness == 32) { if (tnum_subreg_is_const(src_reg->var_off) - && src_reg->s32_min_value == src_reg->s32_max_value - && src_reg->u32_min_value == src_reg->u32_max_value) + && reg_s32_min(src_reg) == reg_s32_max(src_reg) + && reg_u32_min(src_reg) == reg_u32_max(src_reg)) src_is_const = true; } else { if (tnum_is_const(src_reg->var_off) - && src_reg->smin_value == src_reg->smax_value - && src_reg->umin_value == src_reg->umax_value) + && reg_smin(src_reg) == reg_smax(src_reg) + && reg_umin(src_reg) == reg_umax(src_reg)) src_is_const = true; } @@ -14891,7 +14842,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_LSH: case BPF_RSH: case BPF_ARSH: - return (src_is_const && src_reg->umax_value < insn_bitness); + return (src_is_const && reg_umax(src_reg) < insn_bitness); default: return false; } @@ -14904,9 +14855,9 @@ static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *ins struct bpf_reg_state *regs; bool alu32; - if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0) + if (reg_smin(dst_reg) == -1 && reg_smax(dst_reg) == 0) alu32 = false; - else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0) + else if (reg_s32_min(dst_reg) == -1 && reg_s32_max(dst_reg) == 0) alu32 = true; else return 0; @@ -14990,7 +14941,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; case BPF_DIV: /* BPF div specification: x / 0 = 0 */ - if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) { + if ((alu32 && reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0)) { ___mark_reg_known(dst_reg, 0); break; } @@ -15007,7 +14958,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; case BPF_MOD: /* BPF mod specification: x % 0 = x */ - if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) + 
if ((alu32 && reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0)) break; if (alu32) if (off == 1) @@ -15195,7 +15146,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * umax_value before the ALU operation. After adjust_scalar_min_max_vals(), * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX. */ - u64 dst_umax = dst_reg->umax_value; + u64 dst_umax = reg_umax(dst_reg); err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) @@ -15337,7 +15288,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { bool no_sext; - no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); @@ -15372,7 +15323,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ - bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + bool no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); @@ -15454,17 +15405,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *reg; int new_range; - if (dst_reg->umax_value == 0 && range_right_open) + if (reg_umax(dst_reg) == 0 && range_right_open) /* This doesn't give us any range */ return; - if (dst_reg->umax_value > MAX_PACKET_OFF) + if (reg_umax(dst_reg) > MAX_PACKET_OFF) /* Risk of overflow. For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt. 
*/ return; - new_range = dst_reg->umax_value; + new_range = reg_umax(dst_reg); if (range_right_open) new_range++; @@ -15513,7 +15464,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, /* If our ids match, then we must have the same max_value. And we * don't care about the other reg's fixed offset, since if it's too big * the range won't allow anything. - * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16. + * reg_umax(dst_reg) is known < MAX_PACKET_OFF, therefore it fits in a u16. */ bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == type && reg->id == dst_reg->id) @@ -15569,14 +15520,14 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s { struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; - u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value; - u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value; - s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value; - s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value; - u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value; - u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value; - s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; - s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; + u64 umin1 = is_jmp32 ? (u64)reg_u32_min(reg1) : reg_umin(reg1); + u64 umax1 = is_jmp32 ? (u64)reg_u32_max(reg1) : reg_umax(reg1); + s64 smin1 = is_jmp32 ? (s64)reg_s32_min(reg1) : reg_smin(reg1); + s64 smax1 = is_jmp32 ? (s64)reg_s32_max(reg1) : reg_smax(reg1); + u64 umin2 = is_jmp32 ? (u64)reg_u32_min(reg2) : reg_umin(reg2); + u64 umax2 = is_jmp32 ? (u64)reg_u32_max(reg2) : reg_umax(reg2); + s64 smin2 = is_jmp32 ? (s64)reg_s32_min(reg2) : reg_smin(reg2); + s64 smax2 = is_jmp32 ? 
(s64)reg_s32_max(reg2) : reg_smax(reg2); if (reg1 == reg2) { switch (opcode) { @@ -15621,11 +15572,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori */ - if (reg1->u32_min_value > reg2->u32_max_value || - reg1->u32_max_value < reg2->u32_min_value) + if (reg_u32_min(reg1) > reg_u32_max(reg2) || + reg_u32_max(reg1) < reg_u32_min(reg2)) return 0; - if (reg1->s32_min_value > reg2->s32_max_value || - reg1->s32_max_value < reg2->s32_min_value) + if (reg_s32_min(reg1) > reg_s32_max(reg2) || + reg_s32_max(reg1) < reg_s32_min(reg2)) return 0; } break; @@ -15647,11 +15598,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori */ - if (reg1->u32_min_value > reg2->u32_max_value || - reg1->u32_max_value < reg2->u32_min_value) + if (reg_u32_min(reg1) > reg_u32_max(reg2) || + reg_u32_max(reg1) < reg_u32_min(reg2)) return 1; - if (reg1->s32_min_value > reg2->s32_max_value || - reg1->s32_max_value < reg2->s32_min_value) + if (reg_s32_min(reg1) > reg_s32_max(reg2) || + reg_s32_max(reg1) < reg_s32_min(reg2)) return 1; } break; @@ -15878,27 +15829,23 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state switch (opcode) { case BPF_JEQ: if (is_jmp32) { - reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); - reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); - reg2->u32_min_value = reg1->u32_min_value; - reg2->u32_max_value = reg1->u32_max_value; - reg2->s32_min_value = reg1->s32_min_value; - reg2->s32_max_value = reg1->s32_max_value; + reg_set_urange32(reg1, max(reg_u32_min(reg1), reg_u32_min(reg2)), + min(reg_u32_max(reg1), 
reg_u32_max(reg2))); + reg_set_srange32(reg1, max(reg_s32_min(reg1), reg_s32_min(reg2)), + min(reg_s32_max(reg1), reg_s32_max(reg2))); + reg_set_urange32(reg2, reg_u32_min(reg1), reg_u32_max(reg1)); + reg_set_srange32(reg2, reg_s32_min(reg1), reg_s32_max(reg1)); t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); } else { - reg1->umin_value = max(reg1->umin_value, reg2->umin_value); - reg1->umax_value = min(reg1->umax_value, reg2->umax_value); - reg1->smin_value = max(reg1->smin_value, reg2->smin_value); - reg1->smax_value = min(reg1->smax_value, reg2->smax_value); - reg2->umin_value = reg1->umin_value; - reg2->umax_value = reg1->umax_value; - reg2->smin_value = reg1->smin_value; - reg2->smax_value = reg1->smax_value; + reg_set_urange64(reg1, max(reg_umin(reg1), reg_umin(reg2)), + min(reg_umax(reg1), reg_umax(reg2))); + reg_set_srange64(reg1, max(reg_smin(reg1), reg_smin(reg2)), + min(reg_smax(reg1), reg_smax(reg2))); + reg_set_urange64(reg2, reg_umin(reg1), reg_umax(reg1)); + reg_set_srange64(reg2, reg_smin(reg1), reg_smax(reg1)); reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off); reg2->var_off = reg1->var_off; @@ -15915,8 +15862,8 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state */ val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { - /* u32_min_value is not equal to 0xffffffff at this point, - * because otherwise u32_max_value is 0xffffffff as well, + /* u32_min is not equal to 0xffffffff at this point, + * because otherwise u32_max is 0xffffffff as well, * in such a case both reg1 and reg2 would be constants, * jump would be predicted and regs_refine_cond_op() * wouldn't be called. @@ -15924,23 +15871,23 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state * Same reasoning works for all {u,s}{min,max}{32,64} cases * below. 
*/ - if (reg1->u32_min_value == (u32)val) - reg1->u32_min_value++; - if (reg1->u32_max_value == (u32)val) - reg1->u32_max_value--; - if (reg1->s32_min_value == (s32)val) - reg1->s32_min_value++; - if (reg1->s32_max_value == (s32)val) - reg1->s32_max_value--; + if (reg_u32_min(reg1) == (u32)val) + reg_set_urange32(reg1, reg_u32_min(reg1) + 1, reg_u32_max(reg1)); + if (reg_u32_max(reg1) == (u32)val) + reg_set_urange32(reg1, reg_u32_min(reg1), reg_u32_max(reg1) - 1); + if (reg_s32_min(reg1) == (s32)val) + reg_set_srange32(reg1, reg_s32_min(reg1) + 1, reg_s32_max(reg1)); + if (reg_s32_max(reg1) == (s32)val) + reg_set_srange32(reg1, reg_s32_min(reg1), reg_s32_max(reg1) - 1); } else { - if (reg1->umin_value == (u64)val) - reg1->umin_value++; - if (reg1->umax_value == (u64)val) - reg1->umax_value--; - if (reg1->smin_value == (s64)val) - reg1->smin_value++; - if (reg1->smax_value == (s64)val) - reg1->smax_value--; + if (reg_umin(reg1) == (u64)val) + reg_set_urange64(reg1, reg_umin(reg1) + 1, reg_umax(reg1)); + if (reg_umax(reg1) == (u64)val) + reg_set_urange64(reg1, reg_umin(reg1), reg_umax(reg1) - 1); + if (reg_smin(reg1) == (s64)val) + reg_set_srange64(reg1, reg_smin(reg1) + 1, reg_smax(reg1)); + if (reg_smax(reg1) == (s64)val) + reg_set_srange64(reg1, reg_smin(reg1), reg_smax(reg1) - 1); } break; case BPF_JSET: @@ -15987,38 +15934,38 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state break; case BPF_JLE: if (is_jmp32) { - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); - reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); + reg_set_urange32(reg1, reg_u32_min(reg1), min(reg_u32_max(reg1), reg_u32_max(reg2))); + reg_set_urange32(reg2, max(reg_u32_min(reg1), reg_u32_min(reg2)), reg_u32_max(reg2)); } else { - reg1->umax_value = min(reg1->umax_value, reg2->umax_value); - reg2->umin_value = max(reg1->umin_value, reg2->umin_value); + reg_set_urange64(reg1, reg_umin(reg1), min(reg_umax(reg1), 
reg_umax(reg2))); + reg_set_urange64(reg2, max(reg_umin(reg1), reg_umin(reg2)), reg_umax(reg2)); } break; case BPF_JLT: if (is_jmp32) { - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1); - reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value); + reg_set_urange32(reg1, reg_u32_min(reg1), min(reg_u32_max(reg1), reg_u32_max(reg2) - 1)); + reg_set_urange32(reg2, max(reg_u32_min(reg1) + 1, reg_u32_min(reg2)), reg_u32_max(reg2)); } else { - reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1); - reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value); + reg_set_urange64(reg1, reg_umin(reg1), min(reg_umax(reg1), reg_umax(reg2) - 1)); + reg_set_urange64(reg2, max(reg_umin(reg1) + 1, reg_umin(reg2)), reg_umax(reg2)); } break; case BPF_JSLE: if (is_jmp32) { - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); - reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); + reg_set_srange32(reg1, reg_s32_min(reg1), min(reg_s32_max(reg1), reg_s32_max(reg2))); + reg_set_srange32(reg2, max(reg_s32_min(reg1), reg_s32_min(reg2)), reg_s32_max(reg2)); } else { - reg1->smax_value = min(reg1->smax_value, reg2->smax_value); - reg2->smin_value = max(reg1->smin_value, reg2->smin_value); + reg_set_srange64(reg1, reg_smin(reg1), min(reg_smax(reg1), reg_smax(reg2))); + reg_set_srange64(reg2, max(reg_smin(reg1), reg_smin(reg2)), reg_smax(reg2)); } break; case BPF_JSLT: if (is_jmp32) { - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1); - reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value); + reg_set_srange32(reg1, reg_s32_min(reg1), min(reg_s32_max(reg1), reg_s32_max(reg2) - 1)); + reg_set_srange32(reg2, max(reg_s32_min(reg1) + 1, reg_s32_min(reg2)), reg_s32_max(reg2)); } else { - reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1); - reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); + reg_set_srange64(reg1, reg_smin(reg1), 
min(reg_smax(reg1), reg_smax(reg2) - 1)); + reg_set_srange64(reg2, max(reg_smin(reg1) + 1, reg_smin(reg2)), reg_smax(reg2)); } break; default: @@ -17519,16 +17466,16 @@ static int indirect_jump_min_max_index(struct bpf_verifier_env *env, u32 *pmin_index, u32 *pmax_index) { struct bpf_reg_state *reg = reg_state(env, regno); - u64 min_index = reg->umin_value; - u64 max_index = reg->umax_value; + u64 min_index = reg_umin(reg); + u64 max_index = reg_umax(reg); const u32 size = 8; if (min_index > (u64) U32_MAX * size) { - verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value); + verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg_umin(reg)); return -ERANGE; } if (max_index > (u64) U32_MAX * size) { - verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value); + verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg_umax(reg)); return -ERANGE; } -- cgit v1.2.3 From bbc631085503a7fde9617be18b0657cc9a83910a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Apr 2026 15:52:44 -0700 Subject: bpf: replace min/max fields with struct cnum{32,64} Replace eight independent s64, u64, s32, u32 min/max fields in bpf_reg_state with two circular number fields: - cnum64 for a unified signed/unsigned 64-bit range tracking; - cnum32 for a unified signed/unsigned 32-bit range tracking. Each cnum represents a range as a single arc on the circular number line (base + size), from which signed and unsigned bounds are derived on demand via accessor functions introduced in the preceding commit. Notable changes: - Signed<->unsigned deductions in __reg_deduce_bounds() are removed. - 64<->32 bit deductions are replaced with: - reg->r32 = cnum32_intersect(reg->r32, cnum32_from_cnum64(reg->r64)); this is functionally equivalent to the old code. 
- reg->r64 = cnum64_cnum32_intersect(reg->r64, reg->r32); this handles a few additional cases, see commit message for "bpf: representation and basic operations on circular numbers". - regs_refine_cond_op() now computes results in terms of operations on sets, e.g. for JNE: /* Complement of the range [val, val] as cnum64. */ lo = (struct cnum64){ val + 1, U64_MAX - 1 }; reg1->r64 = cnum64_intersect(reg1->r64, lo); - For add, sub operations on scalars replace explicit bounds computations with cnum{32,64}_{add,negate}. - For add, sub operations on pointers deduplicate with arithmetic operations on scalars and use cnum{32,64}_{add,negate}. - For and, or, xor operations on scalars remove explicit signed bounds computations. - range_bounds_violation() reduces to checking cnum_is_empty(). - const_tnum_range_mismatch() reduces to checking cnum_is_const(). Selftest adjustments: a few existing tests are updated because a single cnum arc cannot always represent what the old system expressed as the intersection of independent signed and unsigned ranges. For example, if the old system tracked u64=[0, U64_MAX-U32_MAX+2] and s64=[S64_MIN+2, 2] independently, their intersection is a tight two-point set. A single cnum must pick the shorter arc, losing the other constraint. These cases are documented with comments in the adjusted tests. reg_bounds.c is updated with logic similar to cnum64_cnum32_intersect(). Instead of using cnums it inspects intersection between 'b' and first / last / next-after-first / previous-before-last sub-ranges of 'a'. reg_bounds.c is also updated to skip test cases that rely on signed and unsigned ranges intersecting in two intervals, as such cases are not representable by a single cnum. 
The following "crafted" test cases are affected: - reg_bounds_crafted/(s64)[0xffffffffffff8000; 0x7fff] (u32) [0; 0x1f] - reg_bounds_crafted/(s64)[0; 0x1f] (u32) [0xffffffffffffff80; 0x7f] - reg_bounds_crafted/(s64)[0xffffffffffffff80; 0x7f] (u32) [0; 0x1f] - reg_bounds_crafted/(u64)[0; 1] (s32) [1; 2147483648] - reg_bounds_crafted/(u64)[1; 2147483648] (s32) [0; 1] - reg_bounds_crafted/(u64)[0; 0xffffffff00000000] (s64) 0 - reg_bounds_crafted/(u64)0 (s64) [0; 0xffffffff00000000] - reg_bounds_crafted/(u64)[0; 0xffffffff00000000] (s32) 0 - reg_bounds_crafted/(u64)0 (s32) [0; 0xffffffff00000000] - reg_bounds_crafted/(s64)[S64_MIN; 0] (u64) S64_MIN - reg_bounds_crafted/(s64)S64_MIN (u64) [S64_MIN; 0] - reg_bounds_crafted/(s32)[S32_MIN; 0] (u32) S32_MIN - reg_bounds_crafted/(s32)S32_MIN (u32) [S32_MIN; 0] - reg_bounds_crafted/(s64)[0; 0x1f] (u32) [0xffffffff80000000; 0x7fffffff] - reg_bounds_crafted/(s64)[0xffffffff80000000; 0x7fffffff] (u32) [0; 0x1f] - reg_bounds_crafted/(s64)[0; 0x1f] (u32) [0xffffffffffff8000; 0x7fff] As well as some reg_bounds_rand_{consts,ranges}_A_B, where A and B differ in sign domain. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260424-cnums-everywhere-rfc-v1-v3-3-ca434b39a486@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 39 +- kernel/bpf/verifier.c | 843 +++------------------ .../testing/selftests/bpf/prog_tests/reg_bounds.c | 90 ++- .../testing/selftests/bpf/progs/verifier_bounds.c | 9 +- .../testing/selftests/bpf/progs/verifier_subreg.c | 6 +- 5 files changed, 218 insertions(+), 769 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index bf3ffa56bbe5..101ca6cc5424 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -8,6 +8,7 @@ #include /* for struct btf and btf_id() */ #include /* for MAX_BPF_STACK */ #include +#include /* Maximum variable offset umax_value permitted when resolving memory accesses. 
* In practice this is far bigger than any realistic pointer offset; this limit @@ -120,14 +121,8 @@ struct bpf_reg_state { * These refer to the same value as var_off, not necessarily the actual * contents of the register. */ - s64 smin_value; /* minimum possible (s64)value */ - s64 smax_value; /* maximum possible (s64)value */ - u64 umin_value; /* minimum possible (u64)value */ - u64 umax_value; /* maximum possible (u64)value */ - s32 s32_min_value; /* minimum possible (s32)value */ - s32 s32_max_value; /* maximum possible (s32)value */ - u32 u32_min_value; /* minimum possible (u32)value */ - u32 u32_max_value; /* maximum possible (u32)value */ + struct cnum64 r64; /* 64-bit range as circular number */ + struct cnum32 r32; /* 32-bit range as circular number */ /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we @@ -211,66 +206,62 @@ struct bpf_reg_state { static inline s64 reg_smin(const struct bpf_reg_state *reg) { - return reg->smin_value; + return cnum64_smin(reg->r64); } static inline s64 reg_smax(const struct bpf_reg_state *reg) { - return reg->smax_value; + return cnum64_smax(reg->r64); } static inline u64 reg_umin(const struct bpf_reg_state *reg) { - return reg->umin_value; + return cnum64_umin(reg->r64); } static inline u64 reg_umax(const struct bpf_reg_state *reg) { - return reg->umax_value; + return cnum64_umax(reg->r64); } static inline s32 reg_s32_min(const struct bpf_reg_state *reg) { - return reg->s32_min_value; + return cnum32_smin(reg->r32); } static inline s32 reg_s32_max(const struct bpf_reg_state *reg) { - return reg->s32_max_value; + return cnum32_smax(reg->r32); } static inline u32 reg_u32_min(const struct bpf_reg_state *reg) { - return reg->u32_min_value; + return cnum32_umin(reg->r32); } static inline u32 reg_u32_max(const struct bpf_reg_state *reg) { - return reg->u32_max_value; + return 
cnum32_umax(reg->r32); } static inline void reg_set_srange32(struct bpf_reg_state *reg, s32 smin, s32 smax) { - reg->s32_min_value = smin; - reg->s32_max_value = smax; + reg->r32 = cnum32_from_srange(smin, smax); } static inline void reg_set_urange32(struct bpf_reg_state *reg, u32 umin, u32 umax) { - reg->u32_min_value = umin; - reg->u32_max_value = umax; + reg->r32 = cnum32_from_urange(umin, umax); } static inline void reg_set_srange64(struct bpf_reg_state *reg, s64 smin, s64 smax) { - reg->smin_value = smin; - reg->smax_value = smax; + reg->r64 = cnum64_from_srange(smin, smax); } static inline void reg_set_urange64(struct bpf_reg_state *reg, u64 umin, u64 umax) { - reg->umin_value = umin; - reg->umax_value = umax; + reg->r64 = cnum64_from_urange(umin, umax); } enum bpf_stack_slot_type { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b91d2789e7b9..03f9e16c2abe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1796,10 +1797,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = { static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const(imm); - reg_set_srange64(reg, (s64)imm, (s64)imm); - reg_set_urange64(reg, imm, imm); - reg_set_srange32(reg, (s32)imm, (s32)imm); - reg_set_urange32(reg, (u32)imm, (u32)imm); + reg->r64 = cnum64_from_urange(imm, imm); + reg->r32 = cnum32_from_urange((u32)imm, (u32)imm); } /* Mark the unknown part of a register (variable offset or scalar value) as @@ -1818,8 +1817,7 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const_subreg(reg->var_off, imm); - reg_set_srange32(reg, (s32)imm, (s32)imm); - reg_set_urange32(reg, (u32)imm, (u32)imm); + reg->r32 = cnum32_from_urange((u32)imm, (u32)imm); } /* Mark the 'variable offset' part of a register as zero. 
This should be @@ -1932,23 +1930,19 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, static void __mark_reg32_unbounded(struct bpf_reg_state *reg) { - reg_set_srange32(reg, S32_MIN, S32_MAX); - reg_set_urange32(reg, 0, U32_MAX); + reg->r32 = CNUM32_UNBOUNDED; } -/* Reset the min/max bounds of a register */ -static void __mark_reg_unbounded(struct bpf_reg_state *reg) +static void __mark_reg64_unbounded(struct bpf_reg_state *reg) { - reg_set_srange64(reg, S64_MIN, S64_MAX); - reg_set_urange64(reg, 0, U64_MAX); - - __mark_reg32_unbounded(reg); + reg->r64 = CNUM64_UNBOUNDED; } -static void __mark_reg64_unbounded(struct bpf_reg_state *reg) +/* Reset the min/max bounds of a register */ +static void __mark_reg_unbounded(struct bpf_reg_state *reg) { - reg_set_srange64(reg, S64_MIN, S64_MAX); - reg_set_urange64(reg, 0, U64_MAX); + __mark_reg64_unbounded(reg); + __mark_reg32_unbounded(reg); } static void reset_reg64_and_tnum(struct bpf_reg_state *reg) @@ -1963,18 +1957,32 @@ static void reset_reg32_and_tnum(struct bpf_reg_state *reg) reg->var_off = tnum_unknown; } -static void __update_reg32_bounds(struct bpf_reg_state *reg) +static struct cnum32 cnum32_from_tnum(struct tnum tnum) { - struct tnum var32_off = tnum_subreg(reg->var_off); + tnum = tnum_subreg(tnum); + if ((tnum.mask & S32_MIN) || (tnum.value & S32_MIN)) + /* min signed is max(sign bit) | min(other bits) */ + /* max signed is min(sign bit) | max(other bits) */ + return cnum32_from_srange(tnum.value | (tnum.mask & S32_MIN), + tnum.value | (tnum.mask & S32_MAX)); + else + return cnum32_from_urange(tnum.value, (tnum.value | tnum.mask)); +} - reg_set_srange32(reg, - /* min signed is max(sign bit) | min(other bits) */ - max_t(s32, reg_s32_min(reg), var32_off.value | (var32_off.mask & S32_MIN)), - /* max signed is min(sign bit) | max(other bits) */ - min_t(s32, reg_s32_max(reg), var32_off.value | (var32_off.mask & S32_MAX))); - reg_set_urange32(reg, - max_t(u32, reg_u32_min(reg), 
(u32)var32_off.value), - min(reg_u32_max(reg), (u32)(var32_off.value | var32_off.mask))); +static struct cnum64 cnum64_from_tnum(struct tnum tnum) +{ + if ((tnum.mask & S64_MIN) || (tnum.value & S64_MIN)) + /* min signed is max(sign bit) | min(other bits) */ + /* max signed is min(sign bit) | max(other bits) */ + return cnum64_from_srange(tnum.value | (tnum.mask & S64_MIN), + tnum.value | (tnum.mask & S64_MAX)); + else + return cnum64_from_urange(tnum.value, (tnum.value | tnum.mask)); +} + +static void __update_reg32_bounds(struct bpf_reg_state *reg) +{ + cnum32_intersect_with(&reg->r32, cnum32_from_tnum(reg->var_off)); } static void __update_reg64_bounds(struct bpf_reg_state *reg) @@ -1982,17 +1990,7 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) u64 tnum_next, tmax; bool umin_in_tnum; - /* min signed is max(sign bit) | min(other bits) */ - /* max signed is min(sign bit) | max(other bits) */ - reg_set_srange64(reg, - max_t(s64, reg_smin(reg), - reg->var_off.value | (reg->var_off.mask & S64_MIN)), - min_t(s64, reg_smax(reg), - reg->var_off.value | (reg->var_off.mask & S64_MAX))); - reg_set_urange64(reg, - max(reg_umin(reg), reg->var_off.value), - min(reg_umax(reg), - reg->var_off.value | reg->var_off.mask)); + cnum64_intersect_with(&reg->r64, cnum64_from_tnum(reg->var_off)); /* Check if u64 and tnum overlap in a single value */ tnum_next = tnum_step(reg->var_off, reg_umin(reg)); @@ -2028,343 +2026,19 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) __update_reg64_bounds(reg); } -/* Uses signed min/max values to inform unsigned, and vice-versa */ static void deduce_bounds_32_from_64(struct bpf_reg_state *reg) { - /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 - * bits to improve our u32/s32 boundaries. - * - * E.g., the case where we have upper 32 bits as zero ([10, 20] in - * u64) is pretty trivial, it's obvious that in u32 we'll also have - * [10, 20] range. 
But this property holds for any 64-bit range as - * long as upper 32 bits in that entire range of values stay the same. - * - * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311] - * in decimal) has the same upper 32 bits throughout all the values in - * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15]) - * range. - * - * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32, - * following the rules outlined below about u64/s64 correspondence - * (which equally applies to u32 vs s32 correspondence). In general it - * depends on actual hexadecimal values of 32-bit range. They can form - * only valid u32, or only valid s32 ranges in some cases. - * - * So we use all these insights to derive bounds for subregisters here. - */ - if ((reg_umin(reg) >> 32) == (reg_umax(reg) >> 32)) { - /* u64 to u32 casting preserves validity of low 32 bits as - * a range, if upper 32 bits are the same - */ - reg_set_urange32(reg, - max_t(u32, reg_u32_min(reg), (u32)reg_umin(reg)), - min_t(u32, reg_u32_max(reg), (u32)reg_umax(reg))); - - if ((s32)reg_umin(reg) <= (s32)reg_umax(reg)) { - reg_set_srange32(reg, - max_t(s32, reg_s32_min(reg), (s32)reg_umin(reg)), - min_t(s32, reg_s32_max(reg), (s32)reg_umax(reg))); - } - } - if ((reg_smin(reg) >> 32) == (reg_smax(reg) >> 32)) { - /* low 32 bits should form a proper u32 range */ - if ((u32)reg_smin(reg) <= (u32)reg_smax(reg)) { - reg_set_urange32(reg, - max_t(u32, reg_u32_min(reg), (u32)reg_smin(reg)), - min_t(u32, reg_u32_max(reg), (u32)reg_smax(reg))); - } - /* low 32 bits should form a proper s32 range */ - if ((s32)reg_smin(reg) <= (s32)reg_smax(reg)) { - reg_set_srange32(reg, - max_t(s32, reg_s32_min(reg), (s32)reg_smin(reg)), - min_t(s32, reg_s32_max(reg), (s32)reg_smax(reg))); - } - } - /* Special case where upper bits form a small sequence of two - * sequential numbers (in 32-bit unsigned space, so 0xffffffff to - * 0x00000000 is also valid), while lower bits form a proper s32 range - 
* going from negative numbers to positive numbers. E.g., let's say we - * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]). - * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff, - * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits, - * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]). - * Note that it doesn't have to be 0xffffffff going to 0x00000000 in - * upper 32 bits. As a random example, s64 range - * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range - * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister. - */ - if ((u32)(reg_umin(reg) >> 32) + 1 == (u32)(reg_umax(reg) >> 32) && - (s32)reg_umin(reg) < 0 && (s32)reg_umax(reg) >= 0) { - reg_set_srange32(reg, - max_t(s32, reg_s32_min(reg), (s32)reg_umin(reg)), - min_t(s32, reg_s32_max(reg), (s32)reg_umax(reg))); - } - if ((u32)(reg_smin(reg) >> 32) + 1 == (u32)(reg_smax(reg) >> 32) && - (s32)reg_smin(reg) < 0 && (s32)reg_smax(reg) >= 0) { - reg_set_srange32(reg, - max_t(s32, reg_s32_min(reg), (s32)reg_smin(reg)), - min_t(s32, reg_s32_max(reg), (s32)reg_smax(reg))); - } -} - -static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) -{ - /* if u32 range forms a valid s32 range (due to matching sign bit), - * try to learn from that - */ - if ((s32)reg_u32_min(reg) <= (s32)reg_u32_max(reg)) { - reg_set_srange32(reg, - max_t(s32, reg_s32_min(reg), reg_u32_min(reg)), - min_t(s32, reg_s32_max(reg), reg_u32_max(reg))); - } - /* If we cannot cross the sign boundary, then signed and unsigned bounds - * are the same, so combine. This works even in the negative case, e.g. - * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. - */ - if ((u32)reg_s32_min(reg) <= (u32)reg_s32_max(reg)) { - reg_set_urange32(reg, - max_t(u32, reg_s32_min(reg), reg_u32_min(reg)), - min_t(u32, reg_s32_max(reg), reg_u32_max(reg))); - } else { - if (reg_u32_max(reg) < (u32)reg_s32_min(reg)) { - /* See __reg64_deduce_bounds() for detailed explanation. 
- * Refine ranges in the following situation: - * - * 0 U32_MAX - * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s32 range xxxxxxxxx] [xxxxxxx| - * 0 S32_MAX S32_MIN -1 - */ - reg_set_srange32(reg, (s32)reg_u32_min(reg), reg_s32_max(reg)); - reg_set_urange32(reg, - reg_u32_min(reg), - min_t(u32, reg_u32_max(reg), reg_s32_max(reg))); - } else if ((u32)reg_s32_max(reg) < reg_u32_min(reg)) { - /* - * 0 U32_MAX - * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxxxxxx] [xxxxxxxxxxxx s32 range | - * 0 S32_MAX S32_MIN -1 - */ - reg_set_srange32(reg, reg_s32_min(reg), (s32)reg_u32_max(reg)); - reg_set_urange32(reg, - max_t(u32, reg_u32_min(reg), reg_s32_min(reg)), - reg_u32_max(reg)); - } - } -} - -static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) -{ - /* If u64 range forms a valid s64 range (due to matching sign bit), - * try to learn from that. Let's do a bit of ASCII art to see when - * this is happening. Let's take u64 range first: - * - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * - * Valid u64 range is formed when umin and umax are anywhere in the - * range [0, U64_MAX], and umin <= umax. u64 case is simple and - * straightforward. Let's see how s64 range maps onto the same range - * of values, annotated below the line for comparison: - * - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * 0 S64_MAX S64_MIN -1 - * - * So s64 values basically start in the middle and they are logically - * contiguous to the right of it, wrapping around from -1 to 0, and - * then finishing as S64_MAX (0x7fffffffffffffff) right before - * S64_MIN. We can try drawing the continuity of u64 vs s64 values - * more visually as mapped to sign-agnostic range of hex values. 
- * - * u64 start u64 end - * _______________________________________________________________ - * / \ - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * 0 S64_MAX S64_MIN -1 - * / \ - * >------------------------------ -------------------------------> - * s64 continues... s64 end s64 start s64 "midpoint" - * - * What this means is that, in general, we can't always derive - * something new about u64 from any random s64 range, and vice versa. - * - * But we can do that in two particular cases. One is when entire - * u64/s64 range is *entirely* contained within left half of the above - * diagram or when it is *entirely* contained in the right half. I.e.: - * - * |-------------------------------|--------------------------------| - * ^ ^ ^ ^ - * A B C D - * - * [A, B] and [C, D] are contained entirely in their respective halves - * and form valid contiguous ranges as both u64 and s64 values. [A, B] - * will be non-negative both as u64 and s64 (and in fact it will be - * identical ranges no matter the signedness). [C, D] treated as s64 - * will be a range of negative values, while in u64 it will be - * non-negative range of values larger than 0x8000000000000000. - * - * Now, any other range here can't be represented in both u64 and s64 - * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid - * contiguous u64 ranges, but they are discontinuous in s64. [B, C] - * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX], - * for example. Similarly, valid s64 range [D, A] (going from negative - * to positive values), would be two separate [D, U64_MAX] and [0, A] - * ranges as u64. Currently reg_state can't represent two segments per - * numeric domain, so in such situations we can only derive maximal - * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64). 
- * - * So we use these facts to derive umin/umax from smin/smax and vice - * versa only if they stay within the same "half". This is equivalent - * to checking sign bit: lower half will have sign bit as zero, upper - * half have sign bit 1. Below in code we simplify this by just - * casting umin/umax as smin/smax and checking if they form valid - * range, and vice versa. Those are equivalent checks. - */ - if ((s64)reg_umin(reg) <= (s64)reg_umax(reg)) { - reg_set_srange64(reg, - max_t(s64, reg_smin(reg), reg_umin(reg)), - min_t(s64, reg_smax(reg), reg_umax(reg))); - } - /* If we cannot cross the sign boundary, then signed and unsigned bounds - * are the same, so combine. This works even in the negative case, e.g. - * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. - */ - if ((u64)reg_smin(reg) <= (u64)reg_smax(reg)) { - reg_set_urange64(reg, - max_t(u64, reg_smin(reg), reg_umin(reg)), - min_t(u64, reg_smax(reg), reg_umax(reg))); - } else { - /* If the s64 range crosses the sign boundary, then it's split - * between the beginning and end of the U64 domain. In that - * case, we can derive new bounds if the u64 range overlaps - * with only one end of the s64 range. - * - * In the following example, the u64 range overlaps only with - * positive portion of the s64 range. - * - * 0 U64_MAX - * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s64 range xxxxxxxxx] [xxxxxxx| - * 0 S64_MAX S64_MIN -1 - * - * We can thus derive the following new s64 and u64 ranges. - * - * 0 U64_MAX - * | [xxxxxx u64 range xxxxx] | - * |----------------------------|----------------------------| - * | [xxxxxx s64 range xxxxx] | - * 0 S64_MAX S64_MIN -1 - * - * If they overlap in two places, we can't derive anything - * because reg_state can't represent two ranges per numeric - * domain. 
- * - * 0 U64_MAX - * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx| - * 0 S64_MAX S64_MIN -1 - * - * The first condition below corresponds to the first diagram - * above. - */ - if (reg_umax(reg) < (u64)reg_smin(reg)) { - reg_set_srange64(reg, (s64)reg_umin(reg), reg_smax(reg)); - reg_set_urange64(reg, reg_umin(reg), min_t(u64, reg_umax(reg), reg_smax(reg))); - } else if ((u64)reg_smax(reg) < reg_umin(reg)) { - /* This second condition considers the case where the u64 range - * overlaps with the negative portion of the s64 range: - * - * 0 U64_MAX - * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxxxxxx] [xxxxxxxxxxxx s64 range | - * 0 S64_MAX S64_MIN -1 - */ - reg_set_srange64(reg, reg_smin(reg), (s64)reg_umax(reg)); - reg_set_urange64(reg, max_t(u64, reg_umin(reg), reg_smin(reg)), reg_umax(reg)); - } - } + cnum32_intersect_with(&reg->r32, cnum32_from_cnum64(reg->r64)); } static void deduce_bounds_64_from_32(struct bpf_reg_state *reg) { - /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit - * values on both sides of 64-bit range in hope to have tighter range. - * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from - * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. - * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound - * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of - * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a - * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. - * We just need to make sure that derived bounds we are intersecting - * with are well-formed ranges in respective s64 or u64 domain, just - * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments. 
- */ - __u64 new_umin, new_umax; - __s64 new_smin, new_smax; - - /* u32 -> u64 tightening, it's always well-formed */ - new_umin = (reg_umin(reg) & ~0xffffffffULL) | reg_u32_min(reg); - new_umax = (reg_umax(reg) & ~0xffffffffULL) | reg_u32_max(reg); - reg_set_urange64(reg, - max_t(u64, reg_umin(reg), new_umin), - min_t(u64, reg_umax(reg), new_umax)); - /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */ - new_smin = (reg_smin(reg) & ~0xffffffffULL) | reg_u32_min(reg); - new_smax = (reg_smax(reg) & ~0xffffffffULL) | reg_u32_max(reg); - reg_set_srange64(reg, - max_t(s64, reg_smin(reg), new_smin), - min_t(s64, reg_smax(reg), new_smax)); - - /* Here we would like to handle a special case after sign extending load, - * when upper bits for a 64-bit range are all 1s or all 0s. - * - * Upper bits are all 1s when register is in a range: - * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff] - * Upper bits are all 0s when register is in a range: - * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff] - * Together this forms are continuous range: - * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff] - * - * Now, suppose that register range is in fact tighter: - * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R) - * Also suppose that it's 32-bit range is positive, - * meaning that lower 32-bits of the full 64-bit register - * are in the range: - * [0x0000_0000, 0x7fff_ffff] (W) - * - * If this happens, then any value in a range: - * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff] - * is smaller than a lowest bound of the range (R): - * 0xffff_ffff_8000_0000 - * which means that upper bits of the full 64-bit register - * can't be all 1s, when lower bits are in range (W). - * - * Note that: - * - 0xffff_ffff_8000_0000 == (s64)S32_MIN - * - 0x0000_0000_7fff_ffff == (s64)S32_MAX - * These relations are used in the conditions below. 
- */ - if (reg_s32_min(reg) >= 0 && reg_smin(reg) >= S32_MIN && reg_smax(reg) <= S32_MAX) { - reg_set_srange64(reg, reg_s32_min(reg), reg_s32_max(reg)); - reg_set_urange64(reg, reg_s32_min(reg), reg_s32_max(reg)); - reg->var_off = tnum_intersect(reg->var_off, - tnum_range(reg_smin(reg), reg_smax(reg))); - } + reg->r64 = cnum64_cnum32_intersect(reg->r64, reg->r32); } static void __reg_deduce_bounds(struct bpf_reg_state *reg) { - deduce_bounds_64_from_64(reg); deduce_bounds_32_from_64(reg); - deduce_bounds_32_from_32(reg); deduce_bounds_64_from_32(reg); } @@ -2402,35 +2076,25 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); } -static bool range_bounds_violation(struct bpf_reg_state *reg) -{ - return (reg_umin(reg) > reg_umax(reg) || reg_smin(reg) > reg_smax(reg) || - reg_u32_min(reg) > reg_u32_max(reg) || - reg_s32_min(reg) > reg_s32_max(reg)); -} - static bool const_tnum_range_mismatch(struct bpf_reg_state *reg) { - u64 uval = reg->var_off.value; - s64 sval = (s64)uval; - if (!tnum_is_const(reg->var_off)) return false; - return reg_umin(reg) != uval || reg_umax(reg) != uval || - reg_smin(reg) != sval || reg_smax(reg) != sval; + return !cnum64_is_const(reg->r64) || reg->r64.base != reg->var_off.value; } static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg) { - u32 uval32 = tnum_subreg(reg->var_off).value; - s32 sval32 = (s32)uval32; - if (!tnum_subreg_is_const(reg->var_off)) return false; - return reg_u32_min(reg) != uval32 || reg_u32_max(reg) != uval32 || - reg_s32_min(reg) != sval32 || reg_s32_max(reg) != sval32; + return !cnum32_is_const(reg->r32) || reg->r32.base != tnum_subreg(reg->var_off).value; +} + +static bool range_bounds_violation(struct bpf_reg_state *reg) +{ + return cnum32_is_empty(reg->r32) || cnum64_is_empty(reg->r64); } static int reg_bounds_sanity_check(struct bpf_verifier_env *env, @@ -2455,12 +2119,11 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env, return 0; out: - 
verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " - "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)", - ctx, msg, reg_umin(reg), reg_umax(reg), - reg_smin(reg), reg_smax(reg), - reg_u32_min(reg), reg_u32_max(reg), - reg_s32_min(reg), reg_s32_max(reg), + verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s r64={.base=%#llx, .size=%#llx} " + "r32={.base=%#x, .size=%#x} var_off=(%#llx, %#llx)", + ctx, msg, + reg->r64.base, reg->r64.size, + reg->r32.base, reg->r32.size, reg->var_off.value, reg->var_off.mask); if (env->test_reg_invariants) return -EFAULT; @@ -2468,26 +2131,6 @@ out: return 0; } -static bool __reg32_bound_s64(s32 a) -{ - return a >= 0 && a <= S32_MAX; -} - -static void __reg_assign_32_into_64(struct bpf_reg_state *reg) -{ - reg_set_urange64(reg, reg_u32_min(reg), reg_u32_max(reg)); - - /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must - * be positive otherwise set to worse case bounds and refine later - * from tnum. - */ - if (__reg32_bound_s64(reg_s32_min(reg)) && - __reg32_bound_s64(reg_s32_max(reg))) - reg_set_srange64(reg, reg_s32_min(reg), reg_s32_max(reg)); - else - reg_set_srange64(reg, 0, U32_MAX); -} - /* Mark a register as having a completely unknown (scalar) value. 
*/ void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg) { @@ -5636,7 +5279,7 @@ static int check_buffer_access(struct bpf_verifier_env *env, static void zext_32_to_64(struct bpf_reg_state *reg) { reg->var_off = tnum_subreg(reg->var_off); - __reg_assign_32_into_64(reg); + reg_set_urange64(reg, reg_u32_min(reg), reg_u32_max(reg)); } /* truncate register to smaller size (in bytes) @@ -5651,12 +5294,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) /* fix arithmetic bounds */ mask = ((u64)1 << (size * 8)) - 1; - if ((reg_umin(reg) & ~mask) == (reg_umax(reg) & ~mask)) { + if ((reg_umin(reg) & ~mask) == (reg_umax(reg) & ~mask)) reg_set_urange64(reg, reg_umin(reg) & mask, reg_umax(reg) & mask); - } else { + else reg_set_urange64(reg, 0, mask); - } - reg_set_srange64(reg, reg_umin(reg), reg_umax(reg)); /* If size is smaller than 32bit register the 32bit register * values are also truncated so we push 64-bit bounds into @@ -5681,8 +5322,6 @@ static void set_sext64_default_val(struct bpf_reg_state *reg, int size) reg_set_srange64(reg, S32_MIN, S32_MAX); reg_set_srange32(reg, S32_MIN, S32_MAX); } - reg_set_urange64(reg, 0, U64_MAX); - reg_set_urange32(reg, 0, U32_MAX); reg->var_off = tnum_unknown; } @@ -5703,10 +5342,8 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) reg->var_off = tnum_const((s32)u64_cval); u64_cval = reg->var_off.value; - reg_set_srange64(reg, u64_cval, u64_cval); - reg_set_urange64(reg, u64_cval, u64_cval); - reg_set_srange32(reg, u64_cval, u64_cval); - reg_set_urange32(reg, u64_cval, u64_cval); + reg->r64 = cnum64_from_urange(u64_cval, u64_cval); + reg->r32 = cnum32_from_urange((u32)u64_cval, (u32)u64_cval); return; } @@ -5734,9 +5371,7 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { reg_set_srange64(reg, s64_min, s64_max); - reg_set_urange64(reg, s64_min, s64_max); 
reg_set_srange32(reg, s64_min, s64_max); - reg_set_urange32(reg, s64_min, s64_max); reg->var_off = tnum_range(s64_min, s64_max); return; } @@ -5752,7 +5387,6 @@ static void set_sext32_default_val(struct bpf_reg_state *reg, int size) else /* size == 2 */ reg_set_srange32(reg, S16_MIN, S16_MAX); - reg_set_urange32(reg, 0, U32_MAX); reg->var_off = tnum_subreg(tnum_unknown); } @@ -5771,7 +5405,6 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) u32_val = reg->var_off.value; reg_set_srange32(reg, u32_val, u32_val); - reg_set_urange32(reg, u32_val, u32_val); return; } @@ -5795,7 +5428,6 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) if ((s32_min >= 0) == (s32_max >= 0)) { reg_set_srange32(reg, s32_min, s32_max); - reg_set_urange32(reg, (u32)s32_min, (u32)s32_max); reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max)); return; } @@ -9952,8 +9584,6 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, case BPF_FUNC_get_smp_processor_id: reg_set_urange64(ret_reg, 0, nr_cpu_ids - 1); reg_set_urange32(ret_reg, 0, nr_cpu_ids - 1); - reg_set_srange64(ret_reg, 0, nr_cpu_ids - 1); - reg_set_srange32(ret_reg, 0, nr_cpu_ids - 1); reg_bounds_sync(ret_reg); break; } @@ -13756,10 +13386,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); - s64 smin_val = reg_smin(off_reg), smax_val = reg_smax(off_reg), - smin_ptr = reg_smin(ptr_reg), smax_ptr = reg_smax(ptr_reg); - u64 umin_val = reg_umin(off_reg), umax_val = reg_umax(off_reg), - umin_ptr = reg_umin(ptr_reg), umax_ptr = reg_umax(ptr_reg); + s64 smin_val = reg_smin(off_reg), smax_val = reg_smax(off_reg); + u64 umin_val = reg_umin(off_reg), umax_val = reg_umax(off_reg); struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; @@ -13861,23 +13489,7 @@ 
static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * added into the variable offset, and we copy the fixed offset * from ptr_reg. */ - { - s64 smin_res, smax_res; - u64 umin_res, umax_res; - - if (check_add_overflow(smin_ptr, smin_val, &smin_res) || - check_add_overflow(smax_ptr, smax_val, &smax_res)) { - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); - } else { - reg_set_srange64(dst_reg, smin_res, smax_res); - } - if (check_add_overflow(umin_ptr, umin_val, &umin_res) || - check_add_overflow(umax_ptr, umax_val, &umax_res)) { - reg_set_urange64(dst_reg, 0, U64_MAX); - } else { - reg_set_urange64(dst_reg, umin_res, umax_res); - } - } + dst_reg->r64 = cnum64_add(ptr_reg->r64, off_reg->r64); dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { @@ -13909,27 +13521,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst); return -EACCES; } - /* A new variable offset is created. If the subtrahend is known - * nonnegative, then any reg->range we had before is still good. 
- */ - { - s64 smin_res, smax_res; - - if (check_sub_overflow(smin_ptr, smax_val, &smin_res) || - check_sub_overflow(smax_ptr, smin_val, &smax_res)) { - /* Overflow possible, we know nothing */ - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); - } else { - reg_set_srange64(dst_reg, smin_res, smax_res); - } - } - if (umin_ptr < umax_val) { - /* Overflow possible, we know nothing */ - reg_set_urange64(dst_reg, 0, U64_MAX); - } else { - /* Cannot overflow (as long as bounds are consistent) */ - reg_set_urange64(dst_reg, umin_ptr - umax_val, umax_ptr - umin_val); - } + dst_reg->r64 = cnum64_add(ptr_reg->r64, cnum64_negate(off_reg->r64)); dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { @@ -13986,139 +13578,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 smin = reg_s32_min(dst_reg); - s32 smax = reg_s32_max(dst_reg); - u32 umin = reg_u32_min(dst_reg); - u32 umax = reg_u32_max(dst_reg); - u32 umin_val = reg_u32_min(src_reg); - u32 umax_val = reg_u32_max(src_reg); - bool min_overflow, max_overflow; - - if (check_add_overflow(smin, reg_s32_min(src_reg), &smin) || - check_add_overflow(smax, reg_s32_max(src_reg), &smax)) { - smin = S32_MIN; - smax = S32_MAX; - } - - /* If either all additions overflow or no additions overflow, then - * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = - * dst_umax + src_umax. Otherwise (some additions overflow), set - * the output bounds to unbounded. 
- */ - min_overflow = check_add_overflow(umin, umin_val, &umin); - max_overflow = check_add_overflow(umax, umax_val, &umax); - - if (!min_overflow && max_overflow) { - umin = 0; - umax = U32_MAX; - } - - reg_set_srange32(dst_reg, smin, smax); - reg_set_urange32(dst_reg, umin, umax); + dst_reg->r32 = cnum32_add(dst_reg->r32, src_reg->r32); } static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 smin = reg_smin(dst_reg); - s64 smax = reg_smax(dst_reg); - u64 umin = reg_umin(dst_reg); - u64 umax = reg_umax(dst_reg); - u64 umin_val = reg_umin(src_reg); - u64 umax_val = reg_umax(src_reg); - bool min_overflow, max_overflow; - - if (check_add_overflow(smin, reg_smin(src_reg), &smin) || - check_add_overflow(smax, reg_smax(src_reg), &smax)) { - smin = S64_MIN; - smax = S64_MAX; - } - - /* If either all additions overflow or no additions overflow, then - * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = - * dst_umax + src_umax. Otherwise (some additions overflow), set - * the output bounds to unbounded. 
- */ - min_overflow = check_add_overflow(umin, umin_val, &umin); - max_overflow = check_add_overflow(umax, umax_val, &umax); - - if (!min_overflow && max_overflow) { - umin = 0; - umax = U64_MAX; - } - - reg_set_srange64(dst_reg, smin, smax); - reg_set_urange64(dst_reg, umin, umax); + dst_reg->r64 = cnum64_add(dst_reg->r64, src_reg->r64); } static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 smin = reg_s32_min(dst_reg); - s32 smax = reg_s32_max(dst_reg); - u32 umin = reg_u32_min(dst_reg); - u32 umax = reg_u32_max(dst_reg); - u32 umin_val = reg_u32_min(src_reg); - u32 umax_val = reg_u32_max(src_reg); - bool min_underflow, max_underflow; - - if (check_sub_overflow(smin, reg_s32_max(src_reg), &smin) || - check_sub_overflow(smax, reg_s32_min(src_reg), &smax)) { - /* Overflow possible, we know nothing */ - smin = S32_MIN; - smax = S32_MAX; - } - - /* If either all subtractions underflow or no subtractions - * underflow, it is okay to set: dst_umin = dst_umin - src_umax, - * dst_umax = dst_umax - src_umin. Otherwise (some subtractions - * underflow), set the output bounds to unbounded. 
- */ - min_underflow = check_sub_overflow(umin, umax_val, &umin); - max_underflow = check_sub_overflow(umax, umin_val, &umax); - - if (min_underflow && !max_underflow) { - umin = 0; - umax = U32_MAX; - } - - reg_set_srange32(dst_reg, smin, smax); - reg_set_urange32(dst_reg, umin, umax); + dst_reg->r32 = cnum32_add(dst_reg->r32, cnum32_negate(src_reg->r32)); } static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 smin = reg_smin(dst_reg); - s64 smax = reg_smax(dst_reg); - u64 umin = reg_umin(dst_reg); - u64 umax = reg_umax(dst_reg); - u64 umin_val = reg_umin(src_reg); - u64 umax_val = reg_umax(src_reg); - bool min_underflow, max_underflow; - - if (check_sub_overflow(smin, reg_smax(src_reg), &smin) || - check_sub_overflow(smax, reg_smin(src_reg), &smax)) { - /* Overflow possible, we know nothing */ - smin = S64_MIN; - smax = S64_MAX; - } - - /* If either all subtractions underflow or no subtractions - * underflow, it is okay to set: dst_umin = dst_umin - src_umax, - * dst_umax = dst_umax - src_umin. Otherwise (some subtractions - * underflow), set the output bounds to unbounded. 
- */ - min_underflow = check_sub_overflow(umin, umax_val, &umin); - max_underflow = check_sub_overflow(umax, umin_val, &umax); - - if (min_underflow && !max_underflow) { - umin = 0; - umax = U64_MAX; - } - - reg_set_srange64(dst_reg, smin, smax); - reg_set_urange64(dst_reg, umin, umax); + dst_reg->r64 = cnum64_add(dst_reg->r64, cnum64_negate(src_reg->r64)); } static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, @@ -14148,8 +13626,8 @@ static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, smax = max_array(tmp_prod, 4); } - reg_set_srange32(dst_reg, smin, smax); - reg_set_urange32(dst_reg, umin, umax); + dst_reg->r32 = cnum32_intersect(cnum32_from_urange(umin, umax), + cnum32_from_srange(smin, smax)); } static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, @@ -14179,8 +13657,8 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, smax = max_array(tmp_prod, 4); } - reg_set_srange64(dst_reg, smin, smax); - reg_set_urange64(dst_reg, umin, umax); + dst_reg->r64 = cnum64_intersect(cnum64_from_urange(umin, umax), + cnum64_from_srange(smin, smax)); } static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, @@ -14192,7 +13670,6 @@ static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, reg_u32_max(dst_reg) / src_val); /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); reset_reg64_and_tnum(dst_reg); } @@ -14205,7 +13682,6 @@ static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, div64_u64(reg_umax(dst_reg), src_val)); /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); reset_reg32_and_tnum(dst_reg); } @@ -14242,7 +13718,6 @@ static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, reset: reg_set_srange32(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. 
*/ - reg_set_urange32(dst_reg, 0, U32_MAX); reset_reg64_and_tnum(dst_reg); } @@ -14279,7 +13754,6 @@ static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, reset: reg_set_srange64(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_urange64(dst_reg, 0, U64_MAX); reset_reg32_and_tnum(dst_reg); } @@ -14299,7 +13773,6 @@ static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, reg_set_urange32(dst_reg, 0, min(reg_u32_max(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); reset_reg64_and_tnum(dst_reg); } @@ -14319,7 +13792,6 @@ static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, reg_set_urange64(dst_reg, 0, min(reg_umax(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); reset_reg32_and_tnum(dst_reg); } @@ -14359,7 +13831,6 @@ static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, } /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_urange32(dst_reg, 0, U32_MAX); reset_reg64_and_tnum(dst_reg); } @@ -14399,7 +13870,6 @@ static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, } /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_urange64(dst_reg, 0, U64_MAX); reset_reg32_and_tnum(dst_reg); } @@ -14419,15 +13889,9 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ - reg_set_urange32(dst_reg, var32_off.value, min(reg_u32_max(dst_reg), umax_val)); - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. 
- */ - if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) - reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); - else - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); + reg_set_urange32(dst_reg, + var32_off.value, + min(reg_u32_max(dst_reg), umax_val)); } static void scalar_min_max_and(struct bpf_reg_state *dst_reg, @@ -14445,15 +13909,10 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ - reg_set_urange64(dst_reg, dst_reg->var_off.value, min(reg_umax(dst_reg), umax_val)); + reg_set_urange64(dst_reg, + dst_reg->var_off.value, + min(reg_umax(dst_reg), umax_val)); - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. - */ - if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) - reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); - else - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14474,16 +13933,9 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - reg_set_urange32(dst_reg, max(reg_u32_min(dst_reg), umin_val), + reg_set_urange32(dst_reg, + max(reg_u32_min(dst_reg), umin_val), var32_off.value | var32_off.mask); - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. 
- */ - if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) - reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); - else - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); } static void scalar_min_max_or(struct bpf_reg_state *dst_reg, @@ -14501,16 +13953,10 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - reg_set_urange64(dst_reg, max(reg_umin(dst_reg), umin_val), + reg_set_urange64(dst_reg, + max(reg_umin(dst_reg), umin_val), dst_reg->var_off.value | dst_reg->var_off.mask); - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. - */ - if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) - reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); - else - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14529,14 +13975,6 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, /* We get both minimum and maximum from the var32_off. */ reg_set_urange32(dst_reg, var32_off.value, var32_off.value | var32_off.mask); - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. - */ - if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) - reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); - else - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); } static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, @@ -14552,31 +13990,21 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, } /* We get both minimum and maximum from the var_off. 
*/ - reg_set_urange64(dst_reg, dst_reg->var_off.value, + reg_set_urange64(dst_reg, + dst_reg->var_off.value, dst_reg->var_off.value | dst_reg->var_off.mask); - - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. - */ - if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) - reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); - else - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); - - __update_reg_bounds(dst_reg); } static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { - /* We lose all sign bit information (except what we can pick - * up from var_off) - */ - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); /* If we might shift our top bit out, then we know nothing */ if (umax_val > 31 || reg_u32_max(dst_reg) > 1ULL << (31 - umax_val)) reg_set_urange32(dst_reg, 0, U32_MAX); else + /* We lose all sign bit information (except what we can pick + * up from var_off) + */ reg_set_urange32(dst_reg, reg_u32_min(dst_reg) << umin_val, reg_u32_max(dst_reg) << umax_val); } @@ -14602,23 +14030,27 @@ static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { + struct cnum64 u, s; + /* Special case <<32 because it is a common compiler pattern to sign * extend subreg by doing <<32 s>>32. smin/smax assignments are correct * because s32 bounds don't flip sign when shifting to the left by * 32bits. 
*/ if (umin_val == 32 && umax_val == 32) - reg_set_srange64(dst_reg, (s64)reg_s32_min(dst_reg) << 32, - (s64)reg_s32_max(dst_reg) << 32); + s = cnum64_from_srange((s64)reg_s32_min(dst_reg) << 32, + (s64)reg_s32_max(dst_reg) << 32); else - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); + s = CNUM64_UNBOUNDED; /* If we might shift our top bit out, then we know nothing */ if (reg_umax(dst_reg) > 1ULL << (63 - umax_val)) - reg_set_urange64(dst_reg, 0, U64_MAX); + u = CNUM64_UNBOUNDED; else - reg_set_urange64(dst_reg, reg_umin(dst_reg) << umin_val, - reg_umax(dst_reg) << umax_val); + u = cnum64_from_urange(reg_umin(dst_reg) << umin_val, + reg_umax(dst_reg) << umax_val); + + dst_reg->r64 = cnum64_intersect(u, s); } static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, @@ -14657,7 +14089,6 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. */ - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); dst_reg->var_off = tnum_rshift(subreg, umin_val); reg_set_urange32(dst_reg, reg_u32_min(dst_reg) >> umax_val, @@ -14687,7 +14118,6 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. */ - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); reg_set_urange64(dst_reg, reg_umin(dst_reg) >> umax_val, reg_umax(dst_reg) >> umin_val); @@ -14707,6 +14137,8 @@ static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, /* Upon reaching here, src_known is true and * umax_val is equal to umin_val. + * Blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. 
*/ reg_set_srange32(dst_reg, (u32)(((s32)reg_s32_min(dst_reg)) >> umin_val), @@ -14714,11 +14146,6 @@ static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32); - /* blow away the dst_reg umin_value/umax_value and rely on - * dst_reg var_off to refine the result. - */ - reg_set_urange32(dst_reg, 0, U32_MAX); - __mark_reg64_unbounded(dst_reg); __update_reg32_bounds(dst_reg); } @@ -14736,11 +14163,6 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64); - /* blow away the dst_reg umin_value/umax_value and rely on - * dst_reg var_off to refine the result. - */ - reg_set_urange64(dst_reg, 0, U64_MAX); - /* Its not easy to operate on alu32 bounds here because it depends * on bits being shifted in from upper 32-bits. Take easy way out * and mark unbounded so we can recalculate later from tnum. @@ -15829,23 +15251,15 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state switch (opcode) { case BPF_JEQ: if (is_jmp32) { - reg_set_urange32(reg1, max(reg_u32_min(reg1), reg_u32_min(reg2)), - min(reg_u32_max(reg1), reg_u32_max(reg2))); - reg_set_srange32(reg1, max(reg_s32_min(reg1), reg_s32_min(reg2)), - min(reg_s32_max(reg1), reg_s32_max(reg2))); - reg_set_urange32(reg2, reg_u32_min(reg1), reg_u32_max(reg1)); - reg_set_srange32(reg2, reg_s32_min(reg1), reg_s32_max(reg1)); + reg1->r32 = cnum32_intersect(reg1->r32, reg2->r32); + reg2->r32 = reg1->r32; t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); } else { - reg_set_urange64(reg1, max(reg_umin(reg1), reg_umin(reg2)), - min(reg_umax(reg1), reg_umax(reg2))); - reg_set_srange64(reg1, max(reg_smin(reg1), reg_smin(reg2)), - min(reg_smax(reg1), reg_smax(reg2))); - reg_set_urange64(reg2, reg_umin(reg1), reg_umax(reg1)); 
- reg_set_srange64(reg2, reg_smin(reg1), reg_smax(reg1)); + reg1->r64 = cnum64_intersect(reg1->r64, reg2->r64); + reg2->r64 = reg1->r64; reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off); reg2->var_off = reg1->var_off; @@ -15862,32 +15276,11 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state */ val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { - /* u32_min is not equal to 0xffffffff at this point, - * because otherwise u32_max is 0xffffffff as well, - * in such a case both reg1 and reg2 would be constants, - * jump would be predicted and regs_refine_cond_op() - * wouldn't be called. - * - * Same reasoning works for all {u,s}{min,max}{32,64} cases - * below. - */ - if (reg_u32_min(reg1) == (u32)val) - reg_set_urange32(reg1, reg_u32_min(reg1) + 1, reg_u32_max(reg1)); - if (reg_u32_max(reg1) == (u32)val) - reg_set_urange32(reg1, reg_u32_min(reg1), reg_u32_max(reg1) - 1); - if (reg_s32_min(reg1) == (s32)val) - reg_set_srange32(reg1, reg_s32_min(reg1) + 1, reg_s32_max(reg1)); - if (reg_s32_max(reg1) == (s32)val) - reg_set_srange32(reg1, reg_s32_min(reg1), reg_s32_max(reg1) - 1); + /* Complement of the range [val, val] as cnum32. */ + cnum32_intersect_with(&reg1->r32, (struct cnum32){ val + 1, U32_MAX - 1 }); } else { - if (reg_umin(reg1) == (u64)val) - reg_set_urange64(reg1, reg_umin(reg1) + 1, reg_umax(reg1)); - if (reg_umax(reg1) == (u64)val) - reg_set_urange64(reg1, reg_umin(reg1), reg_umax(reg1) - 1); - if (reg_smin(reg1) == (s64)val) - reg_set_srange64(reg1, reg_smin(reg1) + 1, reg_smax(reg1)); - if (reg_smax(reg1) == (s64)val) - reg_set_srange64(reg1, reg_smin(reg1), reg_smax(reg1) - 1); + /* Complement of the range [val, val] as cnum64. 
*/ + cnum64_intersect_with(&reg1->r64, (struct cnum64){ val + 1, U64_MAX - 1 }); } break; case BPF_JSET: @@ -15934,38 +15327,38 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state break; case BPF_JLE: if (is_jmp32) { - reg_set_urange32(reg1, reg_u32_min(reg1), min(reg_u32_max(reg1), reg_u32_max(reg2))); - reg_set_urange32(reg2, max(reg_u32_min(reg1), reg_u32_min(reg2)), reg_u32_max(reg2)); + cnum32_intersect_with_urange(&reg1->r32, 0, reg_u32_max(reg2)); + cnum32_intersect_with_urange(&reg2->r32, reg_u32_min(reg1), U32_MAX); } else { - reg_set_urange64(reg1, reg_umin(reg1), min(reg_umax(reg1), reg_umax(reg2))); - reg_set_urange64(reg2, max(reg_umin(reg1), reg_umin(reg2)), reg_umax(reg2)); + cnum64_intersect_with_urange(&reg1->r64, 0, reg_umax(reg2)); + cnum64_intersect_with_urange(&reg2->r64, reg_umin(reg1), U64_MAX); } break; case BPF_JLT: if (is_jmp32) { - reg_set_urange32(reg1, reg_u32_min(reg1), min(reg_u32_max(reg1), reg_u32_max(reg2) - 1)); - reg_set_urange32(reg2, max(reg_u32_min(reg1) + 1, reg_u32_min(reg2)), reg_u32_max(reg2)); + cnum32_intersect_with_urange(&reg1->r32, 0, reg_u32_max(reg2) - 1); + cnum32_intersect_with_urange(&reg2->r32, reg_u32_min(reg1) + 1, U32_MAX); } else { - reg_set_urange64(reg1, reg_umin(reg1), min(reg_umax(reg1), reg_umax(reg2) - 1)); - reg_set_urange64(reg2, max(reg_umin(reg1) + 1, reg_umin(reg2)), reg_umax(reg2)); + cnum64_intersect_with_urange(&reg1->r64, 0, reg_umax(reg2) - 1); + cnum64_intersect_with_urange(&reg2->r64, reg_umin(reg1) + 1, U64_MAX); } break; case BPF_JSLE: if (is_jmp32) { - reg_set_srange32(reg1, reg_s32_min(reg1), min(reg_s32_max(reg1), reg_s32_max(reg2))); - reg_set_srange32(reg2, max(reg_s32_min(reg1), reg_s32_min(reg2)), reg_s32_max(reg2)); + cnum32_intersect_with_srange(&reg1->r32, S32_MIN, reg_s32_max(reg2)); + cnum32_intersect_with_srange(&reg2->r32, reg_s32_min(reg1), S32_MAX); } else { - reg_set_srange64(reg1, reg_smin(reg1), min(reg_smax(reg1), reg_smax(reg2))); - reg_set_srange64(reg2, 
max(reg_smin(reg1), reg_smin(reg2)), reg_smax(reg2)); + cnum64_intersect_with_srange(&reg1->r64, S64_MIN, reg_smax(reg2)); + cnum64_intersect_with_srange(&reg2->r64, reg_smin(reg1), S64_MAX); } break; case BPF_JSLT: if (is_jmp32) { - reg_set_srange32(reg1, reg_s32_min(reg1), min(reg_s32_max(reg1), reg_s32_max(reg2) - 1)); - reg_set_srange32(reg2, max(reg_s32_min(reg1) + 1, reg_s32_min(reg2)), reg_s32_max(reg2)); + cnum32_intersect_with_srange(&reg1->r32, S32_MIN, reg_s32_max(reg2) - 1); + cnum32_intersect_with_srange(&reg2->r32, reg_s32_min(reg1) + 1, S32_MAX); } else { - reg_set_srange64(reg1, reg_smin(reg1), min(reg_smax(reg1), reg_smax(reg2) - 1)); - reg_set_srange64(reg2, max(reg_smin(reg1) + 1, reg_smin(reg2)), reg_smax(reg2)); + cnum64_intersect_with_srange(&reg1->r64, S64_MIN, reg_smax(reg2) - 1); + cnum64_intersect_with_srange(&reg2->r64, reg_smin(reg1) + 1, S64_MAX); } break; default: diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index 71f5240cc5b7..7f170a69d1d8 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -478,6 +478,52 @@ static struct range range_refine_in_halves(enum num_t x_t, struct range x, } +static __always_inline u64 next_u32_block(u64 x) { return x + (1ULL << 32); } +static __always_inline u64 prev_u32_block(u64 x) { return x - (1ULL << 32); } + +/* Is v within the circular u64 range [base, base + len]? */ +static __always_inline bool u64_range_contains(u64 v, u64 base, u64 len) +{ + return v - base <= len; +} + +/* Is v within the circular u32 range [base, base + len]? 
*/ +static __always_inline bool u32_range_contains(u32 v, u32 base, u32 len) +{ + return v - base <= len; +} + +static bool range64_range32_intersect(enum num_t a_t, + struct range a /* 64 */, + struct range b /* 32 */, + struct range *out /* 64 */) +{ + u64 b_len = (u32)(b.b - b.a); + u64 a_len = a.b - a.a; + u64 lo, hi; + + if (u32_range_contains((u32)a.a, (u32)b.a, b_len)) { + lo = a.a; + } else { + lo = swap_low32(a.a, (u32)b.a); + if (!u64_range_contains(lo, a.a, a_len)) + lo = next_u32_block(lo); + if (!u64_range_contains(lo, a.a, a_len)) + return false; + } + if (u32_range_contains(a.b, (u32)b.a, b_len)) { + hi = a.b; + } else { + hi = swap_low32(a.b, (u32)b.b); + if (!u64_range_contains(hi, a.a, a_len)) + hi = prev_u32_block(hi); + if (!u64_range_contains(hi, a.a, a_len)) + return false; + } + *out = range(a_t, lo, hi); + return true; +} + static struct range range_refine(enum num_t x_t, struct range x, enum num_t y_t, struct range y) { struct range y_cast; @@ -533,23 +579,12 @@ static struct range range_refine(enum num_t x_t, struct range x, enum num_t y_t, } } - /* the case when new range knowledge, *y*, is a 32-bit subregister - * range, while previous range knowledge, *x*, is a full register - * 64-bit range, needs special treatment to take into account upper 32 - * bits of full register range - */ if (t_is_32(y_t) && !t_is_32(x_t)) { - struct range x_swap; + struct range x1; - /* some combinations of upper 32 bits and sign bit can lead to - * invalid ranges, in such cases it's easier to detect them - * after cast/swap than try to enumerate all the conditions - * under which transformation and knowledge transfer is valid - */ - x_swap = range(x_t, swap_low32(x.a, y_cast.a), swap_low32(x.b, y_cast.b)); - if (!is_valid_range(x_t, x_swap)) - return x; - return range_intersection(x_t, x, x_swap); + if (range64_range32_intersect(x_t, x, y, &x1)) + return x1; + return x; } /* otherwise, plain range cast and intersection works */ @@ -1300,6 +1335,26 @@ static 
bool assert_range_eq(enum num_t t, struct range x, struct range y, return false; } +/* For a pair of signed/unsigned t1/t2 checks if r1/r2 intersect in two intervals. */ +static bool needs_two_arcs(enum num_t t1, struct range r1, + enum num_t t2, struct range r2) +{ + u64 lo = cast_t(t1, r2.a); + u64 hi = cast_t(t1, r2.b); + + /* does r2 wrap in t1's domain: [0, hi] ∪ [lo, MAX]? */ + return lo > hi && r1.a <= hi && r1.b >= lo; +} + +static bool reg_state_needs_two_arcs(struct reg_state *s) +{ + if (!s->valid) + return false; + + return needs_two_arcs(U64, s->r[U64], S64, s->r[S64]) || + needs_two_arcs(U32, s->r[U32], S32, s->r[S32]); +} + /* Validate that register states match, and print details if they don't */ static bool assert_reg_state_eq(struct reg_state *r, struct reg_state *e, const char *ctx) { @@ -1524,6 +1579,11 @@ static int verify_case_op(enum num_t init_t, enum num_t cond_t, !assert_reg_state_eq(&fr2, &fe2, "false_reg2") || !assert_reg_state_eq(&tr1, &te1, "true_reg1") || !assert_reg_state_eq(&tr2, &te2, "true_reg2")) { + if (reg_state_needs_two_arcs(&fe1) || reg_state_needs_two_arcs(&fe2) || + reg_state_needs_two_arcs(&te1) || reg_state_needs_two_arcs(&te2)) { + test__skip(); + return 0; + } failed = true; } diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index c1ae013dee29..f0b3fbbbb627 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1239,7 +1239,8 @@ l0_%=: r0 = 0; \ SEC("tc") __description("multiply mixed sign bounds. 
test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +/* cnum can't represent both [0, 0xffff_feff] and [0x8000_0000, 0x7fff_feff], so it picks one */ __naked void mult_mixed0_sign(void) { asm volatile ( @@ -1648,7 +1649,8 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, two overlaps") __failure -__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127)") +/* smin=-128 includes point 0xffffffffffffff80 */ __msg("frame pointer is read only") __naked void bounds_deduct_two_overlaps(void) { @@ -2043,7 +2045,8 @@ __naked void signed_unsigned_intersection32_case2(void *ctx) */ SEC("socket") __description("bounds refinement: 64bits ranges not overwritten by 32bits ranges") -__msg("3: (65) if r0 s> 0x2 {{.*}} R0=scalar(smin=0x8000000000000002,smax=2,umin=smin32=umin32=2,umax=0xffffffff00000003,smax32=umax32=3") +__msg("3: (65) if r0 s> 0x2 {{.*}} R0=scalar(smin=0x8000000000000002,smax=2,smin32=umin32=2,smax32=umax32=3,var_off{{.*}}))") +/* Can't represent both [S64_MIN+2, 2] and [2, U64_MAX - U32_MAX + 2] at the same time, picks shorter interval */ __msg("4: (25) if r0 > 0x13 {{.*}} R0=2") __success __log_level(2) __naked void refinement_32bounds_not_overwriting_64bounds(void *ctx) diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c index 31832a306f91..73b5b0cf6706 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subreg.c +++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c @@ -558,7 +558,8 @@ __description("arsh32 imm sign 
negative extend check") __success __retval(0) __log_level(2) __msg("3: (17) r6 -= 4095 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") -__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,umax=0xffffffff00000000,smin32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,smin32=0,var_off=(0x0; 0xffffffff00000000))") +/* represents shorter of signed / unsigned 64-bit ranges */ __msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") __naked void arsh32_imm_sign_extend_negative_check(void) { @@ -581,7 +582,8 @@ __description("arsh32 imm sign extend check") __success __retval(0) __log_level(2) __msg("3: (17) r6 -= 2047 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") -__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))") +/* represents shorter of signed / unsigned 64-bit ranges */ __msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") __naked void arsh32_imm_sign_extend_check(void) { -- cgit v1.2.3 From 4c0710ab011ec144fa96670f960a0686bdeb153a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Apr 2026 15:52:45 -0700 Subject: selftests/bpf: new cases handled by 32->64 range refinements 1. 32-bit range starts before 64-bit range's low bits in each block, causing intersection to skip entire blocks. 2. 32-bit range crosses the U32_MAX/0 boundary, represented as s32 range crossing sign boundary. 
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260424-cnums-everywhere-rfc-v1-v3-4-ca434b39a486@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_bounds.c | 80 ++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index f0b3fbbbb627..5dd243e653c9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -2187,4 +2187,84 @@ __naked void tnums_equal_impossible_constant(void *ctx) : __clobber_all); } +/* + * 32-bit range starts before 64-bit range low bits in each 2^32 block. + * + * N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 + * ||----|=====|--|----------||----|=====|-------------||--|-|=====|-------------|| + * |< b >| | |< b >| | |< b >| + * | | | | + * |<---------------+- a -+---------------->| + * | | + * |< t >| refined r0 range + * + * a = u64 [0x1'00000008, 0x3'00000001] + * b = u32 [2, 5] + * t = u64 [0x2'00000002, 0x2'00000005] + */ +SEC("socket") +__success +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void deduce64_from_32_before_block_start(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = 0x100000008 ll; \ + if r0 < r1 goto 2f; \ + r1 = 0x300000001 ll; \ + if r0 > r1 goto 2f; /* u64: [0x1'00000008, 0x3'00000001] */ \ + if w0 < 2 goto 2f; \ + if w0 > 5 goto 2f; /* u32: [2, 5] */ \ + r2 = 0x200000002 ll; \ + r3 = 0x200000005 ll; \ + if r0 >= r2 goto 1f; /* should be always true */ \ + r10 = 0; /* dead code */ \ +1: if r0 <= r3 goto 2f; /* should be always true */ \ + r10 = 0; /* dead code */ \ +2: exit; \ + " + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* + * 32-bit range crossing U32_MAX / 0 boundary. 
+ * + * N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 + * ||===|---------|------|===||===|----------------|===||===|---------|------|===|| + * |b >| | |< b||b >| |< b||b >| | |< b| + * | | | | + * |<-----+----------------- a --------------+-------->| + * | | + * |<---------------- t ------------->| refined r0 range + * + * a = u64 [0x1'00000006, 0x2'FFFFFFEF] + * b = s32 [-16, 5] (u32 wrapping [0xFFFFFFF0, 0x00000005]) + * t = u64 [0x1'FFFFFFF0, 0x2'00000005] + */ +SEC("socket") +__success +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void deduce64_from_32_wrapping_32bit(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = 0x100000006 ll; \ + if r0 < r1 goto 2f; \ + r1 = 0x2ffffffef ll; \ + if r0 > r1 goto 2f; /* u64: [0x1'00000006, 0x2'FFFFFFEF] */ \ + if w0 s< -16 goto 2f; \ + if w0 s> 5 goto 2f; /* s32: [-16, 5] */ \ + r1 = 0x1fffffff0 ll; \ + r2 = 0x200000005 ll; \ + if r0 >= r1 goto 1f; /* should be always true */ \ + r10 = 0; /* dead code */ \ +1: if r0 <= r2 goto 2f; /* should be always true */ \ + r10 = 0; /* dead code */ \ +2: exit; \ + " + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From ac985e7bf840e34a8dafe0808cc571fd85896c30 Mon Sep 17 00:00:00 2001 From: Gregory Bell Date: Fri, 17 Apr 2026 11:41:21 -0400 Subject: selftests/bpf: Use local type for flow_offload_tuple_rhash in xdp_flowtable Define flow_offload_tuple_rhash___local and use it in place of the forward-declared kernel type for the bpf_xdp_flow_lookup kfunc return type and tuplehash variable. This is consistent with how bpf_flowtable_opts___local is already handled in the same file and avoids relying on a forward declaration of the struct. 
Fixes: eeb23b54e447 ("selftests/bpf: fix compilation failure when CONFIG_NF_FLOW_TABLE=m") Signed-off-by: Gregory Bell Link: https://lore.kernel.org/r/20260417154122.2558890-2-grbell@redhat.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/xdp_flowtable.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/xdp_flowtable.c b/tools/testing/selftests/bpf/progs/xdp_flowtable.c index 7fdc7b23ee74..e67daa02749d 100644 --- a/tools/testing/selftests/bpf/progs/xdp_flowtable.c +++ b/tools/testing/selftests/bpf/progs/xdp_flowtable.c @@ -15,7 +15,10 @@ struct bpf_flowtable_opts___local { s32 error; }; -struct flow_offload_tuple_rhash * +struct flow_offload_tuple_rhash___local { +}; + +struct flow_offload_tuple_rhash___local * bpf_xdp_flow_lookup(struct xdp_md *, struct bpf_fib_lookup *, struct bpf_flowtable_opts___local *, u32) __ksym; @@ -67,7 +70,7 @@ int xdp_flowtable_do_lookup(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; struct bpf_flowtable_opts___local opts = {}; - struct flow_offload_tuple_rhash *tuplehash; + struct flow_offload_tuple_rhash___local *tuplehash; struct bpf_fib_lookup tuple = { .ifindex = ctx->ingress_ifindex, }; -- cgit v1.2.3 From afb0450be061907a0f5d36bd8b010ca30eda3d3b Mon Sep 17 00:00:00 2001 From: Gregory Bell Date: Fri, 17 Apr 2026 11:41:22 -0400 Subject: selftests/bpf: Use local type for bpf_fou_encap in test_tunnel_kern Replace the forward-declared struct bpf_fou_encap with the existing bpf_fou_encap___local type in the bpf_skb_set_fou_encap and bpf_skb_get_fou_encap declarations. This removes the need for the forward declaration and the explicit casts at each call. 
Fixes: d17f9b370df6 ("selftests/bpf: Fix compilation failure when CONFIG_NET_FOU!=y") Signed-off-by: Gregory Bell Link: https://lore.kernel.org/r/20260417154122.2558890-3-grbell@redhat.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_tunnel_kern.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 32127f1cd687..30f1de458669 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -6,6 +6,7 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ +#define BPF_NO_KFUNC_PROTOTYPES #include "vmlinux.h" #include #include @@ -36,12 +37,10 @@ enum bpf_fou_encap_type___local { FOU_BPF_ENCAP_GUE___local, }; -struct bpf_fou_encap; - int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx, - struct bpf_fou_encap *encap, int type) __ksym; + struct bpf_fou_encap___local *encap, int type) __ksym; int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx, - struct bpf_fou_encap *encap) __ksym; + struct bpf_fou_encap___local *encap) __ksym; struct xfrm_state * bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32 opts__sz) __ksym; @@ -781,7 +780,7 @@ int ipip_gue_set_tunnel(struct __sk_buff *skb) encap.sport = 0; encap.dport = bpf_htons(5555); - ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap, + ret = bpf_skb_set_fou_encap(skb, &encap, bpf_core_enum_value(enum bpf_fou_encap_type___local, FOU_BPF_ENCAP_GUE___local)); if (ret < 0) { @@ -820,7 +819,7 @@ int ipip_fou_set_tunnel(struct __sk_buff *skb) encap.sport = 0; encap.dport = bpf_htons(5555); - ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap, + ret = bpf_skb_set_fou_encap(skb, &encap, FOU_BPF_ENCAP_FOU___local); if (ret < 0) { log_err(ret); @@ -843,7 +842,7 @@ int 
ipip_encap_get_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - ret = bpf_skb_get_fou_encap(skb, (struct bpf_fou_encap *)&encap); + ret = bpf_skb_get_fou_encap(skb, &encap); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; -- cgit v1.2.3 From 1fb8e9b32e19f7fa444863a251a5310c54585172 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 26 Apr 2026 15:03:31 -0400 Subject: selftests/bpf: Add ifdef guard for WRITE_ONCE macro in bpf_atomic.h The WRITE_ONCE macro is identically defined both in bpf_atomic.h and in bpf_arena_common.h. However, the bpf_atomic.h definition has no ifdef guard. If bpf_atomic.h is included after bpf_arena_common.h, compilation fails because of the duplicate definition. Guard the definition in bpf_atomic.h with an ifdef to let programs include the two headers in any order. Duplicating the definition is the simplest solution out of all the alternatives: - Keeping one of the two existing definitions is not possible because both BPF atomics and arena programs need the macro, and the two features are independent. Using one should not require the header for the other. - Factoring out the definition into a new header that only includes it is more churn than just duplicating it. - Factoring out the definition into bpf_experimental.h requires all users of WRITE_ONCE to include the header. However, the arena library introduced in subsequent commits must be self-contained, while bpf_experimental.h is in the base selftests/bpf directory. Both headers are moved to the arena library in a subsequent patch. 
Signed-off-by: Emil Tsalapatis Reviewed-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260426190338.4615-2-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_atomic.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_atomic.h b/tools/testing/selftests/bpf/bpf_atomic.h index c550e5711967..d89a22d63c1c 100644 --- a/tools/testing/selftests/bpf/bpf_atomic.h +++ b/tools/testing/selftests/bpf/bpf_atomic.h @@ -42,7 +42,9 @@ extern bool CONFIG_X86_64 __kconfig __weak; #define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) +#ifndef WRITE_ONCE #define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val)) +#endif #define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new) -- cgit v1.2.3 From d5327480a12a031f283c85c3c9c9201685099036 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 26 Apr 2026 15:03:32 -0400 Subject: selftests/bpf: Add basic libarena scaffolding Add initial code and a Makefile for an arena-based BPF library. Modules can be added just by including the source file in the library's src/ subdirectory. Future commits will introduce the library code itself. The code includes workarounds that are removed in subsequent patches that ensure bisectability. 
Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260426190338.4615-3-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 27 ++++++ tools/testing/selftests/bpf/libarena/Makefile | 69 +++++++++++++++ .../bpf/libarena/include/libarena/common.h | 79 +++++++++++++++++ .../bpf/libarena/include/libarena/userspace.h | 99 ++++++++++++++++++++++ .../selftests/bpf/libarena/src/common.bpf.c | 29 +++++++ 5 files changed, 303 insertions(+) create mode 100644 tools/testing/selftests/bpf/libarena/Makefile create mode 100644 tools/testing/selftests/bpf/libarena/include/libarena/common.h create mode 100644 tools/testing/selftests/bpf/libarena/include/libarena/userspace.h create mode 100644 tools/testing/selftests/bpf/libarena/src/common.bpf.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ac676d2a4a29..9fe30a665c2e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -151,6 +151,7 @@ override define CLEAN $(Q)$(RM) -r $(TEST_KMODS) $(Q)$(RM) -r $(EXTRA_CLEAN) $(Q)$(MAKE) -C test_kmods clean + $(Q)$(MAKE) -C libarena clean $(Q)$(MAKE) docs-clean endef @@ -522,6 +523,7 @@ LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps)) LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) HEADERS_FOR_BPF_OBJS := $(wildcard $(BPFDIR)/*.bpf.h) \ + $(wildcard $(CURDIR)/libarena/include/*.[ch]) \ $(addprefix $(BPFDIR)/, bpf_core_read.h \ bpf_endian.h \ bpf_helpers.h \ @@ -737,6 +739,29 @@ $(VERIFY_SIG_HDR): $(VERIFICATION_CERT) echo "};"; \ echo "unsigned int test_progs_verification_cert_len = $$(wc -c < $<);") > $@ +LIBARENA_MAKE_ARGS = \ + BPFTOOL="$(BPFTOOL)" \ + INCLUDE_DIR="$(INCLUDE_DIR)" \ + LIBBPF_INCLUDE="$(HOST_INCLUDE_DIR)" \ + BPFOBJ="$(BPFOBJ)" \ + LDLIBS="$(LDLIBS) -lzstd" \ + CLANG="$(CLANG)" \ + BPF_CFLAGS="$(BPF_CFLAGS) $(CLANG_CFLAGS)" \ + BPF_TARGET_ENDIAN="$(BPF_TARGET_ENDIAN)" \ + Q="$(Q)" + 
+LIBARENA_BPF_DEPS := $(wildcard libarena/Makefile \ + libarena/include/* \ + libarena/include/libarena/* \ + libarena/src/* \ + libarena/selftests/* \ + libarena/*.bpf.o) + +LIBARENA_SKEL := libarena/libarena.skel.h + +$(LIBARENA_SKEL): $(INCLUDE_DIR)/vmlinux.h $(BPFOBJ) $(LIBARENA_BPF_DEPS) + +$(MAKE) -C libarena libarena.skel.h $(LIBARENA_MAKE_ARGS) + # Define test_progs test runner. TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs @@ -930,3 +955,5 @@ override define INSTALL_RULE rsync -a $(OUTPUT)/$$DIR/*.bpf.o $(INSTALL_PATH)/$$DIR;\ done endef + +libarena: $(LIBARENA_SKEL) diff --git a/tools/testing/selftests/bpf/libarena/Makefile b/tools/testing/selftests/bpf/libarena/Makefile new file mode 100644 index 000000000000..e85b3ad96890 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/Makefile @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +# Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + +.PHONY: clean + +# Defaults for standalone builds + +CLANG ?= clang +BPFTOOL ?= bpftool +LDLIBS ?= -lbpf -lelf -lz -lrt -lpthread -lzstd + +ifeq ($(V),1) +Q = +msg = +else +Q ?= @ +msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; +endif + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - $@ + +libarena.bpf.o: $(LIBARENA_OBJECTS) + $(call msg,GEN-OBJ,libarena,$@) + $(Q)$(BPFTOOL) gen object $@ $^ + +%.bpf.o: %.bpf.c + $(call msg,CLNG-BPF,libarena,$@) + $(Q)$(CLANG) $(BPF_CFLAGS) $(BPF_TARGET_ENDIAN) -c $< -o $@ + +clean: + $(Q)rm -f *.skel.h *.bpf.o diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/common.h b/tools/testing/selftests/bpf/libarena/include/libarena/common.h new file mode 100644 index 000000000000..92b67b20ed15 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/libarena/common.h @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#pragma once + +#ifdef __BPF__ + +#include + +#include "../../bpf_arena_common.h" +#include "../../progs/bpf_arena_spin_lock.h" + +#include + +#ifndef __BPF_FEATURE_ADDR_SPACE_CAST +#error "Arena allocators require bpf_addr_space_cast feature" +#endif + +#define arena_stdout(fmt, ...) bpf_stream_printk(1, (fmt), ##__VA_ARGS__) +#define arena_stderr(fmt, ...) bpf_stream_printk(2, (fmt), ##__VA_ARGS__) + +#ifndef __maybe_unused +#define __maybe_unused __attribute__((__unused__)) +#endif + +#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) + +#define ARENA_PAGES (1UL << (32 - __builtin_ffs(__PAGE_SIZE) + 1)) + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, ARENA_PAGES); /* number of pages */ +#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__) + __ulong(map_extra, (1ull << 32)); /* start of mmap() region */ +#else + __ulong(map_extra, (1ull << 44)); /* start of mmap() region */ +#endif +} arena __weak SEC(".maps"); + +/* + * This is a variable used to aid verification. The may_goto directive + * permits open-coded for loops, but requires that the index variable is + * imprecise. To force the variable to be imprecise, initialize it with + * the opaque volatile variable 0 instead of the constant 0. + */ +extern const volatile u32 zero; + +int arena_fls(__u64 word); + +#else /* ! __BPF__ */ + +#include + +#define __arena + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +/* Dummy "definition" for userspace. */ +#define arena_spinlock_t int + +#endif /* __BPF__ */ + +struct arena_get_info_args { + void __arena *arena_base; +}; + +struct arena_alloc_reserve_args { + u64 nr_pages; +}; + +/* Reasonable default number of pages reserved by arena_alloc_reserve. 
*/ +#define ARENA_RESERVE_PAGES_DFL (8) diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h b/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h new file mode 100644 index 000000000000..0438a751d5fd --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +static inline int libarena_run_prog(int prog_fd) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + int ret; + + ret = bpf_prog_test_run_opts(prog_fd, &opts); + if (ret) + return ret; + + return opts.retval; +} + +static inline bool libarena_is_test_prog(const char *name) +{ + return strstr(name, "test_") == name; +} + +static inline int libarena_run_prog_args(int prog_fd, void *args, size_t argsize) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + int ret; + + opts.ctx_in = args; + opts.ctx_size_in = argsize; + + ret = bpf_prog_test_run_opts(prog_fd, &opts); + + return ret ?: opts.retval; +} + +static inline int libarena_get_arena_base(int arena_get_info_fd, + void **arena_base) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct arena_get_info_args args = { .arena_base = NULL }; + int ret; + + opts.ctx_in = &args; + opts.ctx_size_in = sizeof(args); + + ret = bpf_prog_test_run_opts(arena_get_info_fd, &opts); + if (ret) + return ret; + if (opts.retval) + return opts.retval; + + *arena_base = args.arena_base; + return 0; +} + +static inline int libarena_get_globals_pages(int arena_get_globals_fd, + size_t arena_all_pages, + u64 *globals_pages) +{ + size_t pgsize = sysconf(_SC_PAGESIZE); + void *arena_base; + ssize_t i; + u8 *vec; + int ret; + + ret = libarena_get_arena_base(arena_get_globals_fd, &arena_base); + if (ret) + return ret; + + if (!arena_base) + return -EINVAL; + + vec = calloc(arena_all_pages, sizeof(*vec)); + if (!vec) + return 
-ENOMEM; + + if (mincore(arena_base, arena_all_pages * pgsize, vec) < 0) { + ret = -errno; + free(vec); + return ret; + } + + *globals_pages = 0; + for (i = arena_all_pages - 1; i >= 0; i--) { + if (!(vec[i] & 0x1)) + break; + *globals_pages += 1; + } + + free(vec); + return 0; +} diff --git a/tools/testing/selftests/bpf/libarena/src/common.bpf.c b/tools/testing/selftests/bpf/libarena/src/common.bpf.c new file mode 100644 index 000000000000..659ccead5624 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/src/common.bpf.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include + +const volatile u32 zero = 0; + +int arena_fls(__u64 word) +{ + if (!word) + return 0; + + return 64 - __builtin_clzll(word); +} + +SEC("syscall") +__weak int arena_get_info(struct arena_get_info_args *args) +{ + args->arena_base = arena_base(&arena); + + return 0; +} + +SEC("syscall") +__weak int arena_alloc_reserve(struct arena_alloc_reserve_args *args) +{ + return bpf_arena_reserve_pages(&arena, NULL, args->nr_pages); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 8c1e1c33fe5ad867bc0b6ba121911d70e7881d88 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 26 Apr 2026 15:03:33 -0400 Subject: selftests/bpf: Move arena-related headers into libarena The BPF selftest headers include functionality that is specific to arenas and is required by libarena. Keep libarena self-contained by moving all functionality into its include/ directory. Also add libarena/include to the standard include paths for the selftests to make the moved headers easy to access by existing selftests. Some functionality is required by libarena but not strictly arena-related. We still move it to the libarena/include path, which is an upgrade from directly accessing them from the selftests/bpf directory using relative paths. A new bpf_may_goto.h file is split off of bpf_experimental.h. 
bpf_arena_spin_lock.h and bpf_arena_common.h are moved to libarena/include. bpf_atomic.h is also moved to libarena because it is necessary for arena spinlocks. For bpf_arena_spin_lock.h, mark the spinlock state array as __weak to define the spinlock state array in the header while also being compatible with multi-compilation unit programs. While we're at it, we remove unnecessary definitions from existing test programs. Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260426190338.4615-4-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/bpf_arena_alloc.h | 2 +- tools/testing/selftests/bpf/bpf_arena_common.h | 75 --- tools/testing/selftests/bpf/bpf_arena_list.h | 2 +- tools/testing/selftests/bpf/bpf_arena_strsearch.h | 2 +- tools/testing/selftests/bpf/bpf_atomic.h | 142 ------ tools/testing/selftests/bpf/bpf_experimental.h | 84 +--- .../bpf/libarena/include/bpf_arena_common.h | 75 +++ .../bpf/libarena/include/bpf_arena_spin_lock.h | 547 +++++++++++++++++++++ .../selftests/bpf/libarena/include/bpf_atomic.h | 142 ++++++ .../selftests/bpf/libarena/include/bpf_may_goto.h | 84 ++++ .../bpf/libarena/include/libarena/common.h | 4 +- .../selftests/bpf/prog_tests/arena_spin_lock.c | 7 - tools/testing/selftests/bpf/progs/arena_atomics.c | 2 +- .../testing/selftests/bpf/progs/arena_spin_lock.c | 2 +- .../selftests/bpf/progs/bpf_arena_spin_lock.h | 542 -------------------- .../selftests/bpf/progs/compute_live_registers.c | 2 +- tools/testing/selftests/bpf/progs/lpm_trie_bench.c | 2 +- tools/testing/selftests/bpf/progs/stream.c | 2 +- tools/testing/selftests/bpf/progs/verifier_arena.c | 2 +- .../selftests/bpf/progs/verifier_arena_globals1.c | 2 +- .../selftests/bpf/progs/verifier_arena_globals2.c | 2 +- .../selftests/bpf/progs/verifier_arena_large.c | 2 +- tools/testing/selftests/bpf/progs/verifier_ldsx.c | 2 +- 24 files changed, 867 insertions(+), 865 deletions(-) delete mode 100644 
tools/testing/selftests/bpf/bpf_arena_common.h delete mode 100644 tools/testing/selftests/bpf/bpf_atomic.h create mode 100644 tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h create mode 100644 tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h create mode 100644 tools/testing/selftests/bpf/libarena/include/bpf_atomic.h create mode 100644 tools/testing/selftests/bpf/libarena/include/bpf_may_goto.h delete mode 100644 tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9fe30a665c2e..71c7873c4b15 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -56,7 +56,8 @@ CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11 \ -Wno-unused-but-set-variable \ $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ - -I$(TOOLSINCDIR) -I$(TOOLSARCHINCDIR) -I$(APIDIR) -I$(OUTPUT) + -I$(TOOLSINCDIR) -I$(TOOLSARCHINCDIR) -I$(APIDIR) -I$(OUTPUT) \ + -I$(CURDIR)/libarena/include LDFLAGS += $(SAN_LDFLAGS) LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread @@ -444,6 +445,7 @@ endif CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(CURDIR)/libarena/include \ -I$(abspath $(OUTPUT)/../usr/include) \ -std=gnu11 \ -fno-strict-aliasing \ diff --git a/tools/testing/selftests/bpf/bpf_arena_alloc.h b/tools/testing/selftests/bpf/bpf_arena_alloc.h index c27678299e0c..cda147fd9d25 100644 --- a/tools/testing/selftests/bpf/bpf_arena_alloc.h +++ b/tools/testing/selftests/bpf/bpf_arena_alloc.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ #pragma once -#include "bpf_arena_common.h" +#include #ifndef __round_mask #define __round_mask(x, y) ((__typeof__(x))((y)-1)) diff --git a/tools/testing/selftests/bpf/bpf_arena_common.h b/tools/testing/selftests/bpf/bpf_arena_common.h deleted file mode 100644 index 16f8ce832004..000000000000 --- a/tools/testing/selftests/bpf/bpf_arena_common.h +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ -/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ -#pragma once - -#ifndef WRITE_ONCE -#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val)) -#endif - -#ifndef NUMA_NO_NODE -#define NUMA_NO_NODE (-1) -#endif - -#ifndef arena_container_of -#define arena_container_of(ptr, type, member) \ - ({ \ - void __arena *__mptr = (void __arena *)(ptr); \ - ((type *)(__mptr - offsetof(type, member))); \ - }) -#endif - -#ifdef __BPF__ /* when compiled as bpf program */ - -#ifndef PAGE_SIZE -#define PAGE_SIZE __PAGE_SIZE -/* - * for older kernels try sizeof(struct genradix_node) - * or flexible: - * static inline long __bpf_page_size(void) { - * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); - * } - * but generated code is not great. - */ -#endif - -#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) -#define __arena __attribute__((address_space(1))) -#define __arena_global __attribute__((address_space(1))) -#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ -#define cast_user(ptr) /* nop for bpf prog. 
emitted by LLVM */ -#else -#define __arena -#define __arena_global SEC(".addr_space.1") -#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) -#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) -#endif - -void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, - int node_id, __u64 flags) __ksym __weak; -int bpf_arena_reserve_pages(void *map, void __arena *addr, __u32 page_cnt) __ksym __weak; -void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; - -#define arena_base(map) ((void __arena *)((struct bpf_arena *)(map))->user_vm_start) - -#else /* when compiled as user space code */ - -#define __arena -#define __arg_arena -#define cast_kern(ptr) /* nop for user space */ -#define cast_user(ptr) /* nop for user space */ -__weak char arena[1]; - -#ifndef offsetof -#define offsetof(type, member) ((unsigned long)&((type *)0)->member) -#endif - -static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, - int node_id, __u64 flags) -{ - return NULL; -} -static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) -{ -} - -#endif diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h index e16fa7d95fcf..1af2ffc27d9c 100644 --- a/tools/testing/selftests/bpf/bpf_arena_list.h +++ b/tools/testing/selftests/bpf/bpf_arena_list.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ #pragma once -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> struct arena_list_node; diff --git a/tools/testing/selftests/bpf/bpf_arena_strsearch.h b/tools/testing/selftests/bpf/bpf_arena_strsearch.h index c1b6eaa905bb..f0d575daef5a 100644 --- a/tools/testing/selftests/bpf/bpf_arena_strsearch.h +++ b/tools/testing/selftests/bpf/bpf_arena_strsearch.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ #pragma once -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> __noinline int bpf_arena_strlen(const char __arena *s __arg_arena) { diff --git a/tools/testing/selftests/bpf/bpf_atomic.h b/tools/testing/selftests/bpf/bpf_atomic.h deleted file mode 100644 index d89a22d63c1c..000000000000 --- a/tools/testing/selftests/bpf/bpf_atomic.h +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ -#ifndef BPF_ATOMIC_H -#define BPF_ATOMIC_H - -#include -#include -#include "bpf_experimental.h" - -extern bool CONFIG_X86_64 __kconfig __weak; - -/* - * __unqual_typeof(x) - Declare an unqualified scalar type, leaving - * non-scalar types unchanged, - * - * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char' - * is not type-compatible with 'signed char', and we define a separate case. - * - * This is copied verbatim from kernel's include/linux/compiler_types.h, but - * with default expression (for pointers) changed from (x) to (typeof(x)0). - * - * This is because LLVM has a bug where for lvalue (x), it does not get rid of - * an extra address_space qualifier, but does in case of rvalue (typeof(x)0). - * Hence, for pointers, we need to create an rvalue expression to get the - * desired type. See https://github.com/llvm/llvm-project/issues/53400.
- */ -#define __scalar_type_to_expr_cases(type) \ - unsigned type : (unsigned type)0, signed type : (signed type)0 - -#define __unqual_typeof(x) \ - typeof(_Generic((x), \ - char: (char)0, \ - __scalar_type_to_expr_cases(char), \ - __scalar_type_to_expr_cases(short), \ - __scalar_type_to_expr_cases(int), \ - __scalar_type_to_expr_cases(long), \ - __scalar_type_to_expr_cases(long long), \ - default: (typeof(x))0)) - -/* No-op for BPF */ -#define cpu_relax() ({}) - -#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) - -#ifndef WRITE_ONCE -#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val)) -#endif - -#define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new) - -#define try_cmpxchg(p, pold, new) \ - ({ \ - __unqual_typeof(*(pold)) __o = *(pold); \ - __unqual_typeof(*(p)) __r = cmpxchg(p, __o, new); \ - if (__r != __o) \ - *(pold) = __r; \ - __r == __o; \ - }) - -#define try_cmpxchg_relaxed(p, pold, new) try_cmpxchg(p, pold, new) - -#define try_cmpxchg_acquire(p, pold, new) try_cmpxchg(p, pold, new) - -#define smp_mb() \ - ({ \ - volatile unsigned long __val; \ - __sync_fetch_and_add(&__val, 0); \ - }) - -#define smp_rmb() \ - ({ \ - if (!CONFIG_X86_64) \ - smp_mb(); \ - else \ - barrier(); \ - }) - -#define smp_wmb() \ - ({ \ - if (!CONFIG_X86_64) \ - smp_mb(); \ - else \ - barrier(); \ - }) - -/* Control dependency provides LOAD->STORE, provide LOAD->LOAD */ -#define smp_acquire__after_ctrl_dep() ({ smp_rmb(); }) - -#define smp_load_acquire(p) \ - ({ \ - __unqual_typeof(*(p)) __v = READ_ONCE(*(p)); \ - if (!CONFIG_X86_64) \ - smp_mb(); \ - barrier(); \ - __v; \ - }) - -#define smp_store_release(p, val) \ - ({ \ - if (!CONFIG_X86_64) \ - smp_mb(); \ - barrier(); \ - WRITE_ONCE(*(p), val); \ - }) - -#define smp_cond_load_relaxed_label(p, cond_expr, label) \ - ({ \ - typeof(p) __ptr = (p); \ - __unqual_typeof(*(p)) VAL; \ - for (;;) { \ - VAL = (__unqual_typeof(*(p)))READ_ONCE(*__ptr); \ - if (cond_expr) \ - break; \ - 
cond_break_label(label); \ - cpu_relax(); \ - } \ - (typeof(*(p)))VAL; \ - }) - -#define smp_cond_load_acquire_label(p, cond_expr, label) \ - ({ \ - __unqual_typeof(*p) __val = \ - smp_cond_load_relaxed_label(p, cond_expr, label); \ - smp_acquire__after_ctrl_dep(); \ - (typeof(*(p)))__val; \ - }) - -#define atomic_read(p) READ_ONCE((p)->counter) - -#define atomic_cond_read_relaxed_label(p, cond_expr, label) \ - smp_cond_load_relaxed_label(&(p)->counter, cond_expr, label) - -#define atomic_cond_read_acquire_label(p, cond_expr, label) \ - smp_cond_load_acquire_label(&(p)->counter, cond_expr, label) - -#define atomic_try_cmpxchg_relaxed(p, pold, new) \ - try_cmpxchg_relaxed(&(p)->counter, pold, new) - -#define atomic_try_cmpxchg_acquire(p, pold, new) \ - try_cmpxchg_acquire(&(p)->counter, pold, new) - -#endif /* BPF_ATOMIC_H */ diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 2234bd6bc9d3..d1db355e872b 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -5,6 +5,7 @@ #include #include #include +#include #define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) @@ -204,89 +205,6 @@ l_true: \ }) #endif -/* - * Note that cond_break can only be portably used in the body of a breakable - * construct, whereas can_loop can be used anywhere. 
- */ -#ifdef __BPF_FEATURE_MAY_GOTO -#define can_loop \ - ({ __label__ l_break, l_continue; \ - bool ret = true; \ - asm volatile goto("may_goto %l[l_break]" \ - :::: l_break); \ - goto l_continue; \ - l_break: ret = false; \ - l_continue:; \ - ret; \ - }) - -#define __cond_break(expr) \ - ({ __label__ l_break, l_continue; \ - asm volatile goto("may_goto %l[l_break]" \ - :::: l_break); \ - goto l_continue; \ - l_break: expr; \ - l_continue:; \ - }) -#else -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define can_loop \ - ({ __label__ l_break, l_continue; \ - bool ret = true; \ - asm volatile goto("1:.byte 0xe5; \ - .byte 0; \ - .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ - .short 0" \ - :::: l_break); \ - goto l_continue; \ - l_break: ret = false; \ - l_continue:; \ - ret; \ - }) - -#define __cond_break(expr) \ - ({ __label__ l_break, l_continue; \ - asm volatile goto("1:.byte 0xe5; \ - .byte 0; \ - .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ - .short 0" \ - :::: l_break); \ - goto l_continue; \ - l_break: expr; \ - l_continue:; \ - }) -#else -#define can_loop \ - ({ __label__ l_break, l_continue; \ - bool ret = true; \ - asm volatile goto("1:.byte 0xe5; \ - .byte 0; \ - .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ - .short 0" \ - :::: l_break); \ - goto l_continue; \ - l_break: ret = false; \ - l_continue:; \ - ret; \ - }) - -#define __cond_break(expr) \ - ({ __label__ l_break, l_continue; \ - asm volatile goto("1:.byte 0xe5; \ - .byte 0; \ - .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ - .short 0" \ - :::: l_break); \ - goto l_continue; \ - l_break: expr; \ - l_continue:; \ - }) -#endif -#endif - -#define cond_break __cond_break(break) -#define cond_break_label(label) __cond_break(goto label) - #ifndef bpf_nop_mov #define bpf_nop_mov(var) \ asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var)) diff --git a/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h b/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h new 
file mode 100644 index 000000000000..16f8ce832004 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef WRITE_ONCE +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val)) +#endif + +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE (-1) +#endif + +#ifndef arena_container_of +#define arena_container_of(ptr, type, member) \ + ({ \ + void __arena *__mptr = (void __arena *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + +#ifdef __BPF__ /* when compiled as bpf program */ + +#ifndef PAGE_SIZE +#define PAGE_SIZE __PAGE_SIZE +/* + * for older kernels try sizeof(struct genradix_node) + * or flexible: + * static inline long __bpf_page_size(void) { + * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); + * } + * but generated code is not great. + */ +#endif + +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) +#define __arena __attribute__((address_space(1))) +#define __arena_global __attribute__((address_space(1))) +#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ +#define cast_user(ptr) /* nop for bpf prog. 
emitted by LLVM */ +#else +#define __arena +#define __arena_global SEC(".addr_space.1") +#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) +#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) +#endif + +void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, + int node_id, __u64 flags) __ksym __weak; +int bpf_arena_reserve_pages(void *map, void __arena *addr, __u32 page_cnt) __ksym __weak; +void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; + +#define arena_base(map) ((void __arena *)((struct bpf_arena *)(map))->user_vm_start) + +#else /* when compiled as user space code */ + +#define __arena +#define __arg_arena +#define cast_kern(ptr) /* nop for user space */ +#define cast_user(ptr) /* nop for user space */ +__weak char arena[1]; + +#ifndef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) +#endif + +static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, + int node_id, __u64 flags) +{ + return NULL; +} +static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) +{ +} + +#endif diff --git a/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h new file mode 100644 index 000000000000..164638690a4d --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h @@ -0,0 +1,547 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ +#ifndef BPF_ARENA_SPIN_LOCK_H +#define BPF_ARENA_SPIN_LOCK_H + +#include +#include +#include + +#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label) +#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) + +#define EBUSY 16 +#define EOPNOTSUPP 95 +#define ETIMEDOUT 110 + +#ifndef __arena +#define __arena __attribute__((address_space(1))) +#endif + +extern unsigned long CONFIG_NR_CPUS __kconfig; + +/* + * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but + * PowerPC overrides the definition to define lock->val as u32 instead of + * atomic_t, leading to compilation errors. Import a local definition below so + * that we don't depend on the vmlinux.h version. + */ + +struct __qspinlock { + union { + atomic_t val; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + struct { + u8 locked; + u8 pending; + }; + struct { + u16 locked_pending; + u16 tail; + }; +#else + struct { + u16 tail; + u16 locked_pending; + }; + struct { + u8 reserved[2]; + u8 pending; + u8 locked; + }; +#endif + }; +}; + +#define arena_spinlock_t struct __qspinlock +/* FIXME: Using typedef causes CO-RE relocation error */ +/* typedef struct qspinlock arena_spinlock_t; */ + +struct arena_mcs_spinlock { + struct arena_mcs_spinlock __arena *next; + int locked; + int count; +}; + +struct arena_qnode { + struct arena_mcs_spinlock mcs; +}; + +#define _Q_MAX_NODES 4 +#define _Q_PENDING_LOOPS 1 + +/* + * Bitfields in the atomic value: + * + * 0- 7: locked byte + * 8: pending + * 9-15: not used + * 16-17: tail index + * 18-31: tail cpu (+1) + */ +#define _Q_MAX_CPUS 1024 + +#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ + << _Q_ ## type ## _OFFSET) +#define _Q_LOCKED_OFFSET 0 +#define _Q_LOCKED_BITS 8 +#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) + +#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) +#define _Q_PENDING_BITS 8 
+#define _Q_PENDING_MASK _Q_SET_MASK(PENDING) + +#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) +#define _Q_TAIL_IDX_BITS 2 +#define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) + +#define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) +#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) +#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) + +#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) + +#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) +#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) + +/* + * The qnodes are marked __weak so we can define them in the header + * while still ensuring all compilation units use the same struct + * instance. + */ +struct arena_qnode __weak __arena __hidden qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; + +static inline u32 encode_tail(int cpu, int idx) +{ + u32 tail; + + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) +{ + u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return &qnodes[cpu][idx].mcs; +} + +static inline +struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) +{ + return &((struct arena_qnode __arena *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; + /* + * We can use relaxed semantics since the 
caller ensures that + * the MCS node is properly initialized before updating the + * tail. + */ + /* These loops are not expected to stall, but we still need to + * prove to the verifier they will terminate eventually. + */ + cond_break_label(out); + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return old; +out: + bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); + return old; +} + +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +static __always_inline +u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = old | _Q_PENDING_VAL; + /* + * These loops are not expected to stall, but we still need to + * prove to the verifier they will terminate eventually. 
+ */ + cond_break_label(out); + } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); + + return old; +out: + bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); + return old; +} + +/** + * arena_spin_trylock - try to acquire the queued spinlock + * @lock : Pointer to queued spinlock structure + * Return: 1 if lock acquired, 0 if failed + */ +static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) +{ + int val = atomic_read(&lock->val); + + if (unlikely(val)) + return 0; + + return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); +} + +__noinline __weak +int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) +{ + struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; + int ret = -ETIMEDOUT; + u32 old, tail; + int idx; + + /* + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. + * + * 0,1,0 -> 0,0,1 + */ + if (val == _Q_PENDING_VAL) { + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed_label(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--, + release_err); + } + + /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* + * trylock || pending + * + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock + */ + val = arena_fetch_set_pending_acquire(lock); + + /* + * If we observe contention, there is a concurrent locker. + * + * Undo and queue; our setting of PENDING might have made the + * n,0,0 -> 0,0,0 transition fail and it will now be waiting + * on @next to become !NULL. + */ + if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); + + goto queue; + } + + /* + * We're pending, wait for the owner to go away. 
+ * + * 0,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. + */ + if (val & _Q_LOCKED_MASK) + (void)smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); + + /* + * take ownership and clear the pending bit. + * + * 0,1,0 -> 0,0,1 + */ + clear_pending_set_locked(lock); + return 0; + + /* + * End of pending bit optimistic spinning and beginning of MCS + * queuing. + */ +queue: + node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; + idx = node0->count++; + tail = encode_tail(bpf_get_smp_processor_id(), idx); + + /* + * 4 nodes are allocated based on the assumption that there will not be + * nested NMIs taking spinlocks. That may not be true in some + * architectures even though the chance of needing more than 4 nodes + * will still be extremely unlikely. When that happens, we simply return + * an error. Original qspinlock has a trylock fallback in this case. + */ + if (unlikely(idx >= _Q_MAX_NODES)) { + ret = -EBUSY; + goto release_node_err; + } + + node = grab_mcs_node(node0, idx); + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + + node->locked = 0; + node->next = NULL; + + /* + * We touched a (possibly) cold cacheline in the per-cpu queue node; + * attempt the trylock once more in the hope someone let go while we + * weren't watching. + */ + if (arena_spin_trylock(lock)) + goto release; + + /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. 
+ * We have already touched the queueing cacheline; don't bother with + * pending stuff. + * + * p,*,* -> n,*,* + */ + old = xchg_tail(lock, tail); + next = NULL; + + /* + * if there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_MASK) { + prev = decode_tail(old); + + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); + + (void)arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We cannot prefetch here + * due to lack of equivalent instruction in BPF ISA. + */ + next = READ_ONCE(node->next); + } + + /* + * we're at the head of the waitqueue, wait for the owner & pending to + * go away. + * + * *,x,y -> *,0,0 + * + * this wait loop must use a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because the set_locked() function below + * does not imply a full barrier. + */ + val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), + release_node_err); + + /* + * claim the lock: + * + * n,0,0 -> 0,0,1 : lock, uncontended + * *,*,0 -> *,*,1 : lock, contended + * + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. + */ + + /* + * In the PV case we might already have _Q_LOCKED_VAL set, because + * of lock stealing; therefore we must also allow: + * + * n,0,1 -> 0,0,1 + * + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the + * above wait condition, therefore any concurrent setting of + * PENDING will make the uncontended transition fail. 
+ */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + } + + /* + * Either somebody is queued behind us or _Q_PENDING_VAL got set + * which will then detect the remaining tail and queue behind us + * ensuring we'll see a @next. + */ + set_locked(lock); + + /* + * contended path; wait for next if not observed yet, release. + */ + if (!next) + next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); + + arch_mcs_spin_unlock_contended(&next->locked); + +release:; + /* + * release the node + * + * Doing a normal dec vs this_cpu_dec is fine. An upper context always + * decrements count it incremented before returning, thus we're fine. + * For contexts interrupting us, they either observe our dec or not. + * Just ensure the compiler doesn't reorder this statement, as a + * this_cpu_dec implicitly implied that. + */ + barrier(); + node0->count--; + return 0; +release_node_err: + barrier(); + node0->count--; + goto release_err; +release_err: + return ret; +} + +/** + * arena_spin_lock - acquire a queued spinlock + * @lock: Pointer to queued spinlock structure + * + * On error, returned value will be negative. + * On success, zero is returned. + * + * The return value _must_ be tested against zero for success, + * instead of checking it against negative, for passing the + * BPF verifier. + * + * The user should do: + * if (arena_spin_lock(...) != 0) // failure + * or + * if (arena_spin_lock(...) == 0) // success + * or + * if (arena_spin_lock(...)) // failure + * or + * if (!arena_spin_lock(...)) // success + * instead of: + * if (arena_spin_lock(...) < 0) // failure + * + * The return value can still be inspected later. 
+ */ +static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) +{ + int val = 0; + + if (CONFIG_NR_CPUS > _Q_MAX_CPUS) + return -EOPNOTSUPP; + + bpf_preempt_disable(); + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) + return 0; + + val = arena_spin_lock_slowpath(lock, val); + /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */ + if (val) + bpf_preempt_enable(); + return val; +} + +/** + * arena_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + */ +static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) +{ + /* + * unlock() needs release semantics: + */ + smp_store_release(&lock->locked, 0); + bpf_preempt_enable(); +} + +#define arena_spin_lock_irqsave(lock, flags) \ + ({ \ + int __ret; \ + bpf_local_irq_save(&(flags)); \ + __ret = arena_spin_lock((lock)); \ + if (__ret) \ + bpf_local_irq_restore(&(flags)); \ + (__ret); \ + }) + +#define arena_spin_unlock_irqrestore(lock, flags) \ + ({ \ + arena_spin_unlock((lock)); \ + bpf_local_irq_restore(&(flags)); \ + }) + +#endif + +#endif /* BPF_ARENA_SPIN_LOCK_H */ diff --git a/tools/testing/selftests/bpf/libarena/include/bpf_atomic.h b/tools/testing/selftests/bpf/libarena/include/bpf_atomic.h new file mode 100644 index 000000000000..b7b230431929 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/bpf_atomic.h @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef BPF_ATOMIC_H +#define BPF_ATOMIC_H + +#include +#include +#include + +extern bool CONFIG_X86_64 __kconfig __weak; + +/* + * __unqual_typeof(x) - Declare an unqualified scalar type, leaving + * non-scalar types unchanged, + * + * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char' + * is not type-compatible with 'signed char', and we define a separate case.
+ * + * This is copied verbatim from kernel's include/linux/compiler_types.h, but + * with default expression (for pointers) changed from (x) to (typeof(x)0). + * + * This is because LLVM has a bug where for lvalue (x), it does not get rid of + * an extra address_space qualifier, but does in case of rvalue (typeof(x)0). + * Hence, for pointers, we need to create an rvalue expression to get the + * desired type. See https://github.com/llvm/llvm-project/issues/53400. + */ +#define __scalar_type_to_expr_cases(type) \ + unsigned type : (unsigned type)0, signed type : (signed type)0 + +#define __unqual_typeof(x) \ + typeof(_Generic((x), \ + char: (char)0, \ + __scalar_type_to_expr_cases(char), \ + __scalar_type_to_expr_cases(short), \ + __scalar_type_to_expr_cases(int), \ + __scalar_type_to_expr_cases(long), \ + __scalar_type_to_expr_cases(long long), \ + default: (typeof(x))0)) + +/* No-op for BPF */ +#define cpu_relax() ({}) + +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) + +#ifndef WRITE_ONCE +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val)) +#endif + +#define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new) + +#define try_cmpxchg(p, pold, new) \ + ({ \ + __unqual_typeof(*(pold)) __o = *(pold); \ + __unqual_typeof(*(p)) __r = cmpxchg(p, __o, new); \ + if (__r != __o) \ + *(pold) = __r; \ + __r == __o; \ + }) + +#define try_cmpxchg_relaxed(p, pold, new) try_cmpxchg(p, pold, new) + +#define try_cmpxchg_acquire(p, pold, new) try_cmpxchg(p, pold, new) + +#define smp_mb() \ + ({ \ + volatile unsigned long __val; \ + __sync_fetch_and_add(&__val, 0); \ + }) + +#define smp_rmb() \ + ({ \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + else \ + barrier(); \ + }) + +#define smp_wmb() \ + ({ \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + else \ + barrier(); \ + }) + +/* Control dependency provides LOAD->STORE, provide LOAD->LOAD */ +#define smp_acquire__after_ctrl_dep() ({ smp_rmb(); }) + +#define smp_load_acquire(p) \ + ({ \ + __unqual_typeof(*(p)) 
__v = READ_ONCE(*(p)); \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + barrier(); \ + __v; \ + }) + +#define smp_store_release(p, val) \ + ({ \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + barrier(); \ + WRITE_ONCE(*(p), val); \ + }) + +#define smp_cond_load_relaxed_label(p, cond_expr, label) \ + ({ \ + typeof(p) __ptr = (p); \ + __unqual_typeof(*(p)) VAL; \ + for (;;) { \ + VAL = (__unqual_typeof(*(p)))READ_ONCE(*__ptr); \ + if (cond_expr) \ + break; \ + cond_break_label(label); \ + cpu_relax(); \ + } \ + (typeof(*(p)))VAL; \ + }) + +#define smp_cond_load_acquire_label(p, cond_expr, label) \ + ({ \ + __unqual_typeof(*(p)) __val = \ + smp_cond_load_relaxed_label(p, cond_expr, label); \ + smp_acquire__after_ctrl_dep(); \ + (typeof(*(p)))__val; \ + }) + +#define atomic_read(p) READ_ONCE((p)->counter) + +#define atomic_cond_read_relaxed_label(p, cond_expr, label) \ + smp_cond_load_relaxed_label(&(p)->counter, cond_expr, label) + +#define atomic_cond_read_acquire_label(p, cond_expr, label) \ + smp_cond_load_acquire_label(&(p)->counter, cond_expr, label) + +#define atomic_try_cmpxchg_relaxed(p, pold, new) \ + try_cmpxchg_relaxed(&(p)->counter, pold, new) + +#define atomic_try_cmpxchg_acquire(p, pold, new) \ + try_cmpxchg_acquire(&(p)->counter, pold, new) + +#endif /* BPF_ATOMIC_H */ diff --git a/tools/testing/selftests/bpf/libarena/include/bpf_may_goto.h b/tools/testing/selftests/bpf/libarena/include/bpf_may_goto.h new file mode 100644 index 000000000000..9ba90689d6ba --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/bpf_may_goto.h @@ -0,0 +1,84 @@ +#pragma once + +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere.
+ */ +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#endif +#endif + +#define cond_break __cond_break(break) +#define cond_break_label(label) __cond_break(goto label) diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/common.h b/tools/testing/selftests/bpf/libarena/include/libarena/common.h index 92b67b20ed15..d088f3e75798 100644 --- 
a/tools/testing/selftests/bpf/libarena/include/libarena/common.h +++ b/tools/testing/selftests/bpf/libarena/include/libarena/common.h @@ -6,8 +6,8 @@ #include -#include "../../bpf_arena_common.h" -#include "../../progs/bpf_arena_spin_lock.h" +#include <bpf_arena_common.h> +#include <bpf_arena_spin_lock.h> #include diff --git a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c index 693fd86fbde6..acb9d53b5973 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c @@ -5,13 +5,6 @@ #include struct __qspinlock { int val; }; -typedef struct __qspinlock arena_spinlock_t; - -struct arena_qnode { - unsigned long next; - int count; - int locked; -}; #include "arena_spin_lock.skel.h" diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c index d1841aac94a2..2e7751a85399 100644 --- a/tools/testing/selftests/bpf/progs/arena_atomics.c +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -5,7 +5,7 @@ #include #include #include -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> #include "../../../include/linux/filter.h" #include "bpf_misc.h" diff --git a/tools/testing/selftests/bpf/progs/arena_spin_lock.c b/tools/testing/selftests/bpf/progs/arena_spin_lock.c index 086b57a426cf..7236d92d382f 100644 --- a/tools/testing/selftests/bpf/progs/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/progs/arena_spin_lock.c @@ -4,7 +4,7 @@ #include #include #include "bpf_misc.h" -#include "bpf_arena_spin_lock.h" +#include <bpf_arena_spin_lock.h> struct { __uint(type, BPF_MAP_TYPE_ARENA); diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h deleted file mode 100644 index f90531cf3ee5..000000000000 --- a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h +++ /dev/null @@ -1,542 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
*/ -#ifndef BPF_ARENA_SPIN_LOCK_H -#define BPF_ARENA_SPIN_LOCK_H - -#include -#include -#include "bpf_atomic.h" - -#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label) -#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) - -#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) - -#define EBUSY 16 -#define EOPNOTSUPP 95 -#define ETIMEDOUT 110 - -#ifndef __arena -#define __arena __attribute__((address_space(1))) -#endif - -extern unsigned long CONFIG_NR_CPUS __kconfig; - -/* - * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but - * PowerPC overrides the definition to define lock->val as u32 instead of - * atomic_t, leading to compilation errors. Import a local definition below so - * that we don't depend on the vmlinux.h version. - */ - -struct __qspinlock { - union { - atomic_t val; -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - struct { - u8 locked; - u8 pending; - }; - struct { - u16 locked_pending; - u16 tail; - }; -#else - struct { - u16 tail; - u16 locked_pending; - }; - struct { - u8 reserved[2]; - u8 pending; - u8 locked; - }; -#endif - }; -}; - -#define arena_spinlock_t struct __qspinlock -/* FIXME: Using typedef causes CO-RE relocation error */ -/* typedef struct qspinlock arena_spinlock_t; */ - -struct arena_mcs_spinlock { - struct arena_mcs_spinlock __arena *next; - int locked; - int count; -}; - -struct arena_qnode { - struct arena_mcs_spinlock mcs; -}; - -#define _Q_MAX_NODES 4 -#define _Q_PENDING_LOOPS 1 - -/* - * Bitfields in the atomic value: - * - * 0- 7: locked byte - * 8: pending - * 9-15: not used - * 16-17: tail index - * 18-31: tail cpu (+1) - */ -#define _Q_MAX_CPUS 1024 - -#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ - << _Q_ ## type ## _OFFSET) -#define _Q_LOCKED_OFFSET 0 -#define _Q_LOCKED_BITS 8 -#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) - -#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) -#define 
_Q_PENDING_BITS 8 -#define _Q_PENDING_MASK _Q_SET_MASK(PENDING) - -#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) -#define _Q_TAIL_IDX_BITS 2 -#define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) - -#define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) -#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) -#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) - -#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET -#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) - -#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) -#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) - -struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; - -static inline u32 encode_tail(int cpu, int idx) -{ - u32 tail; - - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ - - return tail; -} - -static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) -{ - u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; - u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; - - return &qnodes[cpu][idx].mcs; -} - -static inline -struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) -{ - return &((struct arena_qnode __arena *)base + idx)->mcs; -} - -#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) - -/** - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail) - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) -{ - u32 old, new; - - old = atomic_read(&lock->val); - do { - new = (old & _Q_LOCKED_PENDING_MASK) | tail; - /* - * We can use relaxed semantics since the caller ensures that - * the MCS node is properly initialized before updating the - * tail. 
- */ - /* These loops are not expected to stall, but we still need to - * prove to the verifier they will terminate eventually. - */ - cond_break_label(out); - } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); - - return old; -out: - bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); - return old; -} - -/** - * clear_pending - clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(arena_spinlock_t __arena *lock) -{ - WRITE_ONCE(lock->pending, 0); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - * - * Lock stealing is not allowed if this function is used. - */ -static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) -{ - WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); -} - -/** - * set_locked - Set the lock bit and own the lock - * @lock: Pointer to queued spinlock structure - * - * *,*,0 -> *,0,1 - */ -static __always_inline void set_locked(arena_spinlock_t __arena *lock) -{ - WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); -} - -static __always_inline -u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) -{ - u32 old, new; - - old = atomic_read(&lock->val); - do { - new = old | _Q_PENDING_VAL; - /* - * These loops are not expected to stall, but we still need to - * prove to the verifier they will terminate eventually. 
- */ - cond_break_label(out); - } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); - - return old; -out: - bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); - return old; -} - -/** - * arena_spin_trylock - try to acquire the queued spinlock - * @lock : Pointer to queued spinlock structure - * Return: 1 if lock acquired, 0 if failed - */ -static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) -{ - int val = atomic_read(&lock->val); - - if (unlikely(val)) - return 0; - - return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); -} - -__noinline -int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) -{ - struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; - int ret = -ETIMEDOUT; - u32 old, tail; - int idx; - - /* - * Wait for in-progress pending->locked hand-overs with a bounded - * number of spins so that we guarantee forward progress. - * - * 0,1,0 -> 0,0,1 - */ - if (val == _Q_PENDING_VAL) { - int cnt = _Q_PENDING_LOOPS; - val = atomic_cond_read_relaxed_label(&lock->val, - (VAL != _Q_PENDING_VAL) || !cnt--, - release_err); - } - - /* - * If we observe any contention; queue. - */ - if (val & ~_Q_LOCKED_MASK) - goto queue; - - /* - * trylock || pending - * - * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock - */ - val = arena_fetch_set_pending_acquire(lock); - - /* - * If we observe contention, there is a concurrent locker. - * - * Undo and queue; our setting of PENDING might have made the - * n,0,0 -> 0,0,0 transition fail and it will now be waiting - * on @next to become !NULL. - */ - if (unlikely(val & ~_Q_LOCKED_MASK)) { - - /* Undo PENDING if we set it. */ - if (!(val & _Q_PENDING_MASK)) - clear_pending(lock); - - goto queue; - } - - /* - * We're pending, wait for the owner to go away. 
- * - * 0,1,1 -> *,1,0 - * - * this wait loop must be a load-acquire such that we match the - * store-release that clears the locked bit and create lock - * sequentiality; this is because not all - * clear_pending_set_locked() implementations imply full - * barriers. - */ - if (val & _Q_LOCKED_MASK) - (void)smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); - - /* - * take ownership and clear the pending bit. - * - * 0,1,0 -> 0,0,1 - */ - clear_pending_set_locked(lock); - return 0; - - /* - * End of pending bit optimistic spinning and beginning of MCS - * queuing. - */ -queue: - node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; - idx = node0->count++; - tail = encode_tail(bpf_get_smp_processor_id(), idx); - - /* - * 4 nodes are allocated based on the assumption that there will not be - * nested NMIs taking spinlocks. That may not be true in some - * architectures even though the chance of needing more than 4 nodes - * will still be extremely unlikely. When that happens, we simply return - * an error. Original qspinlock has a trylock fallback in this case. - */ - if (unlikely(idx >= _Q_MAX_NODES)) { - ret = -EBUSY; - goto release_node_err; - } - - node = grab_mcs_node(node0, idx); - - /* - * Ensure that we increment the head node->count before initialising - * the actual node. If the compiler is kind enough to reorder these - * stores, then an IRQ could overwrite our assignments. - */ - barrier(); - - node->locked = 0; - node->next = NULL; - - /* - * We touched a (possibly) cold cacheline in the per-cpu queue node; - * attempt the trylock once more in the hope someone let go while we - * weren't watching. - */ - if (arena_spin_trylock(lock)) - goto release; - - /* - * Ensure that the initialisation of @node is complete before we - * publish the updated tail via xchg_tail() and potentially link - * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. - */ - smp_wmb(); - - /* - * Publish the updated tail. 
- * We have already touched the queueing cacheline; don't bother with - * pending stuff. - * - * p,*,* -> n,*,* - */ - old = xchg_tail(lock, tail); - next = NULL; - - /* - * if there was a previous node; link it and wait until reaching the - * head of the waitqueue. - */ - if (old & _Q_TAIL_MASK) { - prev = decode_tail(old); - - /* Link @node into the waitqueue. */ - WRITE_ONCE(prev->next, node); - - (void)arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); - - /* - * While waiting for the MCS lock, the next pointer may have - * been set by another lock waiter. We cannot prefetch here - * due to lack of equivalent instruction in BPF ISA. - */ - next = READ_ONCE(node->next); - } - - /* - * we're at the head of the waitqueue, wait for the owner & pending to - * go away. - * - * *,x,y -> *,0,0 - * - * this wait loop must use a load-acquire such that we match the - * store-release that clears the locked bit and create lock - * sequentiality; this is because the set_locked() function below - * does not imply a full barrier. - */ - val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), - release_node_err); - - /* - * claim the lock: - * - * n,0,0 -> 0,0,1 : lock, uncontended - * *,*,0 -> *,*,1 : lock, contended - * - * If the queue head is the only one in the queue (lock value == tail) - * and nobody is pending, clear the tail code and grab the lock. - * Otherwise, we only need to grab the lock. - */ - - /* - * In the PV case we might already have _Q_LOCKED_VAL set, because - * of lock stealing; therefore we must also allow: - * - * n,0,1 -> 0,0,1 - * - * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the - * above wait condition, therefore any concurrent setting of - * PENDING will make the uncontended transition fail. 
- */ - if ((val & _Q_TAIL_MASK) == tail) { - if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) - goto release; /* No contention */ - } - - /* - * Either somebody is queued behind us or _Q_PENDING_VAL got set - * which will then detect the remaining tail and queue behind us - * ensuring we'll see a @next. - */ - set_locked(lock); - - /* - * contended path; wait for next if not observed yet, release. - */ - if (!next) - next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); - - arch_mcs_spin_unlock_contended(&next->locked); - -release:; - /* - * release the node - * - * Doing a normal dec vs this_cpu_dec is fine. An upper context always - * decrements count it incremented before returning, thus we're fine. - * For contexts interrupting us, they either observe our dec or not. - * Just ensure the compiler doesn't reorder this statement, as a - * this_cpu_dec implicitly implied that. - */ - barrier(); - node0->count--; - return 0; -release_node_err: - barrier(); - node0->count--; - goto release_err; -release_err: - return ret; -} - -/** - * arena_spin_lock - acquire a queued spinlock - * @lock: Pointer to queued spinlock structure - * - * On error, returned value will be negative. - * On success, zero is returned. - * - * The return value _must_ be tested against zero for success, - * instead of checking it against negative, for passing the - * BPF verifier. - * - * The user should do: - * if (arena_spin_lock(...) != 0) // failure - * or - * if (arena_spin_lock(...) == 0) // success - * or - * if (arena_spin_lock(...)) // failure - * or - * if (!arena_spin_lock(...)) // success - * instead of: - * if (arena_spin_lock(...) < 0) // failure - * - * The return value can still be inspected later. 
- */ -static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) -{ - int val = 0; - - if (CONFIG_NR_CPUS > 1024) - return -EOPNOTSUPP; - - bpf_preempt_disable(); - if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) - return 0; - - val = arena_spin_lock_slowpath(lock, val); - /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */ - if (val) - bpf_preempt_enable(); - return val; -} - -/** - * arena_spin_unlock - release a queued spinlock - * @lock : Pointer to queued spinlock structure - */ -static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) -{ - /* - * unlock() needs release semantics: - */ - smp_store_release(&lock->locked, 0); - bpf_preempt_enable(); -} - -#define arena_spin_lock_irqsave(lock, flags) \ - ({ \ - int __ret; \ - bpf_local_irq_save(&(flags)); \ - __ret = arena_spin_lock((lock)); \ - if (__ret) \ - bpf_local_irq_restore(&(flags)); \ - (__ret); \ - }) - -#define arena_spin_unlock_irqrestore(lock, flags) \ - ({ \ - arena_spin_unlock((lock)); \ - bpf_local_irq_restore(&(flags)); \ - }) - -#endif - -#endif /* BPF_ARENA_SPIN_LOCK_H */ diff --git a/tools/testing/selftests/bpf/progs/compute_live_registers.c b/tools/testing/selftests/bpf/progs/compute_live_registers.c index f05e120f3450..d055fc7b3b95 100644 --- a/tools/testing/selftests/bpf/progs/compute_live_registers.c +++ b/tools/testing/selftests/bpf/progs/compute_live_registers.c @@ -3,7 +3,7 @@ #include #include #include "../../../include/linux/filter.h" -#include "bpf_arena_common.h" +#include #include "bpf_misc.h" struct { diff --git a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c index a0e6ebd5507a..2831cf4445e8 100644 --- a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c +++ b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c @@ -7,7 +7,7 @@ #include #include #include "bpf_misc.h" -#include "bpf_atomic.h" +#include #include "progs/lpm_trie.h" 
#define BPF_OBJ_NAME_LEN 16U diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c index 6f999ba951a3..92ba1d72e0ec 100644 --- a/tools/testing/selftests/bpf/progs/stream.c +++ b/tools/testing/selftests/bpf/progs/stream.c @@ -5,7 +5,7 @@ #include #include "bpf_misc.h" #include "bpf_experimental.h" -#include "bpf_arena_common.h" +#include struct arr_elem { struct bpf_res_spin_lock lock; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 62e282f4448a..89d72c8d756a 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -8,7 +8,7 @@ #include #include "bpf_misc.h" #include "bpf_experimental.h" -#include "bpf_arena_common.h" +#include #define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c index 83182ddbfb95..45d364b0bc85 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c @@ -6,7 +6,7 @@ #include #include #include "bpf_experimental.h" -#include "bpf_arena_common.h" +#include #include "bpf_misc.h" #define ARENA_PAGES (1UL<< (32 - __builtin_ffs(__PAGE_SIZE) + 1)) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c index e6bd7b61f9f1..b51594dbc005 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c @@ -7,7 +7,7 @@ #include #include "bpf_misc.h" #include "bpf_experimental.h" -#include "bpf_arena_common.h" +#include #define ARENA_PAGES (32) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index 
5f7e7afee169..6ab8730d4878 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -7,7 +7,7 @@ #include #include "bpf_misc.h" #include "bpf_experimental.h" -#include "bpf_arena_common.h" +#include #define ARENA_SIZE (1ull << 32) diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index c8494b682c31..1026524a1983 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -3,7 +3,7 @@ #include #include #include "bpf_misc.h" -#include "bpf_arena_common.h" +#include #if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ -- cgit v1.2.3 From 9ab78691eb5fd0d3ad0a1994d4103223678eb78b Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 26 Apr 2026 15:03:34 -0400 Subject: selftests/bpf: Add arena ASAN runtime to libarena Add an address sanitizer (ASAN) runtime to the arena library. The ASAN runtime implements the functions injected into BPF binaries by LLVM sanitization when ASAN is enabled during compilation. The runtime also includes functions called explicitly by memory allocation code to mark memory as poisoned/unpoisoned to ASAN. This code is a no-op when sanitization is turned off. 
Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260426190338.4615-5-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/libarena/include/libarena/asan.h | 103 ++++ .../bpf/libarena/include/libarena/common.h | 1 + .../testing/selftests/bpf/libarena/src/asan.bpf.c | 553 +++++++++++++++++++++ 3 files changed, 657 insertions(+) create mode 100644 tools/testing/selftests/bpf/libarena/include/libarena/asan.h create mode 100644 tools/testing/selftests/bpf/libarena/src/asan.bpf.c diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/asan.h b/tools/testing/selftests/bpf/libarena/include/libarena/asan.h new file mode 100644 index 000000000000..eb9fc69d9eb0 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/libarena/asan.h @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#pragma once + +struct asan_init_args { + u64 arena_all_pages; + u64 arena_globals_pages; +}; + +int asan_init(struct asan_init_args *args); + +extern volatile u64 __asan_shadow_memory_dynamic_address; +extern volatile u32 asan_reported; +extern volatile bool asan_inited; +extern volatile bool asan_report_once; + +#ifdef __BPF__ + +#define ASAN_SHADOW_SHIFT 3 +#define ASAN_SHADOW_SCALE (1ULL << ASAN_SHADOW_SHIFT) +#define ASAN_GRANULE_MASK ((1ULL << ASAN_SHADOW_SHIFT) - 1) +#define ASAN_GRANULE(addr) ((s8)((u32)(u64)((addr)) & ASAN_GRANULE_MASK)) + +#define __noasan __attribute__((no_sanitize("address"))) + +#ifdef BPF_ARENA_ASAN + +typedef s8 __arena s8a; + +static inline +s8a *mem_to_shadow(void __arena __arg_arena *addr) +{ + return (s8a *)(((u32)(u64)addr >> ASAN_SHADOW_SHIFT) + + __asan_shadow_memory_dynamic_address); +} + +__weak __noasan +bool asan_ready(void) +{ + return __asan_shadow_memory_dynamic_address; +} + +int asan_poison(void __arena *addr, s8 val, size_t size); +int asan_unpoison(void __arena *addr, size_t size); +bool 
asan_shadow_set(void __arena *addr); + +/* + * Dummy calls to ensure the ASAN runtime's BTF information is present + * in every object file when compiling the runtime and local BPF code + * separately. The runtime calls are injected into the LLVM IR file + */ +#define DECLARE_ASAN_LOAD_STORE_SIZE(size) \ + void __asan_store##size(intptr_t addr); \ + void __asan_store##size##_noabort(intptr_t addr); \ + void __asan_load##size(intptr_t addr); \ + void __asan_load##size##_noabort(intptr_t addr); \ + void __asan_report_store##size(intptr_t addr); \ + void __asan_report_store##size##_noabort(intptr_t addr); \ + void __asan_report_load##size(intptr_t addr); \ + void __asan_report_load##size##_noabort(intptr_t addr); + +DECLARE_ASAN_LOAD_STORE_SIZE(1); +DECLARE_ASAN_LOAD_STORE_SIZE(2); +DECLARE_ASAN_LOAD_STORE_SIZE(4); +DECLARE_ASAN_LOAD_STORE_SIZE(8); + +void __asan_storeN(intptr_t addr, ssize_t size); +void __asan_storeN_noabort(intptr_t addr, ssize_t size); +void __asan_loadN(intptr_t addr, ssize_t size); +void __asan_loadN_noabort(intptr_t addr, ssize_t size); + +/* + * Force LLVM to emit BTF information for the stubs, + * because the ASAN pass in LLVM by itself doesn't. 
+ */ +#define ASAN_LOAD_STORE_SIZE(size) \ + __asan_store##size, \ + __asan_store##size##_noabort, \ + __asan_load##size, \ + __asan_load##size##_noabort, \ + __asan_report_store##size, \ + __asan_report_store##size##_noabort, \ + __asan_report_load##size, \ + __asan_report_load##size##_noabort + +__attribute__((used)) +static void (*__asan_btf_anchors[])(intptr_t) = { + ASAN_LOAD_STORE_SIZE(1), + ASAN_LOAD_STORE_SIZE(2), + ASAN_LOAD_STORE_SIZE(4), + ASAN_LOAD_STORE_SIZE(8), +}; + +#else /* BPF_ARENA_ASAN */ + +static inline int asan_poison(void __arena *addr, s8 val, size_t size) { return 0; } +static inline int asan_unpoison(void __arena *addr, size_t size) { return 0; } +static inline bool asan_shadow_set(void __arena *addr) { return 0; } +__weak bool asan_ready(void) { return true; } + +#endif /* BPF_ARENA_ASAN */ + +#endif /* __BPF__ */ diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/common.h b/tools/testing/selftests/bpf/libarena/include/libarena/common.h index d088f3e75798..21eb18bf4533 100644 --- a/tools/testing/selftests/bpf/libarena/include/libarena/common.h +++ b/tools/testing/selftests/bpf/libarena/include/libarena/common.h @@ -44,6 +44,7 @@ struct { * the opaque volatile variable 0 instead of the constant 0. */ extern const volatile u32 zero; +extern volatile u64 asan_violated; int arena_fls(__u64 word); diff --git a/tools/testing/selftests/bpf/libarena/src/asan.bpf.c b/tools/testing/selftests/bpf/libarena/src/asan.bpf.c new file mode 100644 index 000000000000..64c5b990086c --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/src/asan.bpf.c @@ -0,0 +1,553 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include + + +enum { + /* + * Is the access checked by check_region_inline + * a read or a write? + */ + ASAN_READ = 0x0U, + ASAN_WRITE = 0x1U, +}; + +/* + * Address sanitizer (ASAN) for arena-based BPF programs, inspired + * by KASAN. 
+ * + * The API + * ------- + * + * The implementation includes two kinds of components: Implementation + * of ASAN hooks injected by LLVM into the program, and API calls that + * allocators use to mark memory as valid or invalid. The full list is: + * + * LLVM stubs: + * + * void __asan_{load, store}(intptr_t addr) + * Checks whether an access is valid. All variations covered + * by check_region_inline(). + * + * void __asan_{store, load}N(intptr_t addr, ssize_t size) + * + * void __asan_report_{load, store}(intptr_t addr) + * Report an access violation for the program. Used when LLVM + * uses direct code generation for shadow map checks. + * + * void *__asan_memcpy(void *d, const void *s, size_t n) + * void *__asan_memmove(void *d, const void *s, size_t n) + * void *__asan_memset(void *p, int c, size_t n) + * Hooks for ASAN instrumentation of the LLVM mem* builtins. + * Currently unimplemented just like the builtins themselves. + * + * API methods: + * + * asan_init() + * Initialize the ASAN map for the arena. + * + * asan_poison() + * Mark a region of memory as poisoned. Accessing poisoned memory + * causes asan_report() to fire. Invoked during free(). + * + * asan_unpoison() + * Mark a region as unpoisoned after alloc(). + * + * asan_shadow_set() + * Check a byte's validity directly. + * + * The Algorithm In Brief + * ---------------------- + * Each group of 8 bytes is mapped to a "granule" in the shadow map. This + * granule is the size of a byte and describes which bytes are valid. + * Possible values are: + * + * 0: All bytes are valid. Makes checks in the middle of an allocated region + * (most of them) fast. + * (0, 7]: How many consecutive bytes are valid, starting from the lowest one. + * The tradeoff is that we can't poison individual bytes in the middle of a + * valid region. + * [0x80, 0xff]: Special poison values, can be used to denote specific error + * modes (e.g., recently freed vs uninitialized memory). 
+ * + * The mapping between a memory location and its shadow is: + * shadow_addr = shadow_base + (addr >> 3). We retain the 8:1 data:shadow + * ratio of existing ASAN implementations as a compromise between tracking + * granularity and space usage/scan overhead. + */ + +#ifdef BPF_ARENA_ASAN + +#pragma clang attribute push(__attribute__((no_sanitize("address"))), \ + apply_to = function) + +#define SHADOW_ALL_ZEROES ((u64)-1) + +/* + * Canary variable for ASAN violations. Set to the offending address. + */ +volatile u64 asan_violated = 0; + +/* + * Shadow map occupancy map. + */ +volatile u64 __asan_shadow_memory_dynamic_address; + +volatile u32 asan_reported = false; +volatile bool asan_inited = false; + +/* + * Set during program load. + */ +volatile bool asan_report_once = false; + +/* + * BPF does not currently support the memset/memcpy/memcmp intrinsics. + * For large sequential copies, or assignments of large data structures, + * the frontend will generate an intrinsic that causes the BPF backend + * to exit due to a missing implementation. Provide a simple implementation + * just for memset to use it for poisoning/unpoisoning the map. + */ +__weak int asan_memset(s8a __arg_arena *dst, s8 val, size_t size) +{ + size_t i; + + for (i = zero; i < size && can_loop; i++) + dst[i] = val; + + return 0; +} + +/* Validate a 1-byte access, always within a single byte. */ +static __always_inline bool memory_is_poisoned_1(s8a *addr) +{ + s8 shadow_value = *(s8a *)mem_to_shadow(addr); + + /* Byte is 0, access is valid. */ + if (likely(!shadow_value)) + return false; + + /* + * Byte is non-zero. Access is valid if granule offset in [0, shadow_value), + * so the memory is poisoned if shadow_value is negative or smaller than + * the granule's value. + */ + + return ASAN_GRANULE(addr) >= shadow_value; +} + +/* Validate a 2- 4-, 8-byte access, shadow spans up to 2 bytes. 
*/ +static __always_inline bool memory_is_poisoned_2_4_8(s8a *addr, u64 size) +{ + u64 end = (u64)addr + size - 1; + + /* + * Region fully within a single byte (addition didn't + * overflow above ASAN_GRANULE). + */ + if (likely(ASAN_GRANULE(end) >= size - 1)) + return memory_is_poisoned_1((s8a *)end); + + /* + * Otherwise first byte must be fully unpoisoned, and second byte + * must be unpoisoned up to the end of the accessed region. + */ + + return *(s8a *)mem_to_shadow(addr) || memory_is_poisoned_1((s8a *)end); +} + +__weak bool asan_shadow_set(void __arena __arg_arena *addr) +{ + return memory_is_poisoned_1(addr); +} + +static __always_inline u64 first_nonzero_byte(u64 addr, size_t size) +{ + while (size && can_loop) { + if (unlikely(*(s8a *)addr)) + return addr; + addr += 1; + size -= 1; + } + + return SHADOW_ALL_ZEROES; +} + +static __always_inline bool memory_is_poisoned_n(s8a *addr, u64 size) +{ + u64 ret; + u64 start; + u64 end; + + /* Size of [start, end] is end - start + 1. */ + start = (u64)mem_to_shadow(addr); + end = (u64)mem_to_shadow(addr + size - 1); + + ret = first_nonzero_byte(start, (end - start) + 1); + if (likely(ret == SHADOW_ALL_ZEROES)) + return false; + + return unlikely(ret != end || ASAN_GRANULE(addr + size - 1) >= *(s8a *)end); +} + +__weak int asan_report(s8a __arg_arena *addr, size_t sz, u32 flags) +{ + u32 reported = __sync_val_compare_and_swap(&asan_reported, false, true); + + /* Only report the first ASAN violation. */ + if (reported && asan_report_once) + return 0; + + asan_violated = (u64)addr; + + arena_stderr("Memory violation for address %p (0x%lx) for %s of size %ld\n", + addr, (u64)addr, + (flags & ASAN_WRITE) ? "write" : "read", + sz); + bpf_stream_print_stack(BPF_STDERR); + + return 0; +} + +static __always_inline bool check_asan_args(s8a *addr, size_t size, + bool *result) +{ + bool valid = true; + + /* Size 0 accesses are valid even if the address is invalid. 
*/ + if (unlikely(size == 0)) + goto confirmed_valid; + + /* + * Wraparound is possible for values close to the edge of the + * 4GiB boundary of the arena (last valid address is (1UL << 32) - 1). + * + * + * The wraparound detection below works for small sizes. check_asan_args is + * always called from the builtin ASAN checks, so 1 <= size <= 64. Even + * for storeN/loadN that we do not expect to encounter the intrinsics will + * not have a large enough size that: + * + * - addr + size > MAX_U32 + * - (u32)(addr + size) > (u32) addr + * + * which would defeat wraparound detection. + */ + if (unlikely((u32)(u64)(addr + size) < (u32)(u64)addr)) + goto confirmed_invalid; + + return false; + +confirmed_invalid: + valid = false; + + /* FALLTHROUGH */ +confirmed_valid: + *result = valid; + + return true; +} + +static __always_inline bool check_region_inline(intptr_t ptr, size_t size, + u32 flags) +{ + s8a *addr = (s8a *)(u64)ptr; + bool is_poisoned, is_valid; + + if (check_asan_args(addr, size, &is_valid)) { + if (!is_valid) + asan_report(addr, size, flags); + return is_valid; + } + + switch (size) { + case 1: + is_poisoned = memory_is_poisoned_1(addr); + break; + case 2: + case 4: + case 8: + is_poisoned = memory_is_poisoned_2_4_8(addr, size); + break; + default: + is_poisoned = memory_is_poisoned_n(addr, size); + } + + if (is_poisoned) { + asan_report(addr, size, flags); + return false; + } + + return true; +} + +/* + * __alias is not supported for BPF so define *__noabort() variants as wrappers. 
+ */ +#define DEFINE_ASAN_LOAD_STORE(size) \ + __hidden void __asan_store##size(intptr_t addr) \ + { \ + check_region_inline(addr, size, ASAN_WRITE); \ + } \ + __hidden void __asan_store##size##_noabort(intptr_t addr) \ + { \ + check_region_inline(addr, size, ASAN_WRITE); \ + } \ + __hidden void __asan_load##size(intptr_t addr) \ + { \ + check_region_inline(addr, size, ASAN_READ); \ + } \ + __hidden void __asan_load##size##_noabort(intptr_t addr) \ + { \ + check_region_inline(addr, size, ASAN_READ); \ + } \ + __hidden void __asan_report_store##size(intptr_t addr) \ + { \ + asan_report((s8a *)addr, size, ASAN_WRITE); \ + } \ + __hidden void __asan_report_store##size##_noabort(intptr_t addr) \ + { \ + asan_report((s8a *)addr, size, ASAN_WRITE); \ + } \ + __hidden void __asan_report_load##size(intptr_t addr) \ + { \ + asan_report((s8a *)addr, size, ASAN_READ); \ + } \ + __hidden void __asan_report_load##size##_noabort(intptr_t addr) \ + { \ + asan_report((s8a *)addr, size, ASAN_READ); \ + } + +DEFINE_ASAN_LOAD_STORE(1); +DEFINE_ASAN_LOAD_STORE(2); +DEFINE_ASAN_LOAD_STORE(4); +DEFINE_ASAN_LOAD_STORE(8); + +void __asan_storeN(intptr_t addr, ssize_t size) +{ + check_region_inline(addr, size, ASAN_WRITE); +} + +void __asan_storeN_noabort(intptr_t addr, ssize_t size) +{ + check_region_inline(addr, size, ASAN_WRITE); +} + +void __asan_loadN(intptr_t addr, ssize_t size) +{ + check_region_inline(addr, size, ASAN_READ); +} + +void __asan_loadN_noabort(intptr_t addr, ssize_t size) +{ + check_region_inline(addr, size, ASAN_READ); +} + +/* + * We currently do not sanitize globals. + */ +void __asan_register_globals(intptr_t globals, size_t n) +{ +} + +void __asan_unregister_globals(intptr_t globals, size_t n) +{ +} + +/* + * We do not currently have memcpy/memmove/memset intrinsics + * in LLVM. Do not implement sanitization. 
+ */ +void *__asan_memcpy(void *d, const void *s, size_t n) +{ + arena_stderr("ASAN: Unexpected %s call", __func__); + return NULL; +} + +void *__asan_memmove(void *d, const void *s, size_t n) +{ + arena_stderr("ASAN: Unexpected %s call", __func__); + return NULL; +} + +void *__asan_memset(void *p, int c, size_t n) +{ + arena_stderr("ASAN: Unexpected %s call", __func__); + return NULL; +} + +/* + * Poisoning code, used when we add more freed memory to the allocator by: + * a) pulling memory from the arena segment using bpf_arena_alloc_pages() + * b) freeing memory from application code + */ +__hidden __noasan int asan_poison(void __arena *addr, s8 val, size_t size) +{ + s8a *shadow; + size_t len; + + /* + * Poisoning from a non-granule address makes no sense: We can only allocate + * memory to the application that has a granule-aligned starting address, + * and bpf_arena_alloc_pages returns page-aligned memory. A non-aligned + * addr then implies we're freeing a different address than the one we + * allocated. + */ + if (unlikely((u64)addr & ASAN_GRANULE_MASK)) + return -EINVAL; + + /* + * We cannot free an unaligned region because it'd be possible that we + * cannot describe the resulting poisoning state of the granule in + * the ASAN encoding. + * + * Every granule represents a region of memory that looks like the + * following (P for poisoned bytes, C for clear): + * + * + * [ C C C ... P P ] + * + * The value of the granule's shadow map is the number of clear bytes in + * it. We cannot represent granules with the following state: + * + * [ P P ... C C ... P P ] + * + * That would be possible if we could free unaligned regions, so prevent that. + */ + if (unlikely(size & ASAN_GRANULE_MASK)) + return -EINVAL; + + shadow = mem_to_shadow(addr); + len = size >> ASAN_SHADOW_SHIFT; + + asan_memset(shadow, val, len); + + return 0; +} + +/* + * Unpoisoning code for marking memory as valid during allocation calls. 
+ * + * Very similar to asan_poison, except we need to round up instead of + * down, then partially poison the last granule if necessary. + * + * Partial poisoning is useful for keeping the padding poisoned. Allocations + * are granule-aligned, so we're reserving granule-aligned sizes for the + * allocation. However, we still want to treat accesses to the padding as + * invalid. Partial poisoning takes care of that. Freeing and poisoning the + * memory is still done in granule-aligned sizes and repoisons the already + * poisoned padding. + */ +__hidden __noasan int asan_unpoison(void __arena *addr, size_t size) +{ + size_t partial = size & ASAN_GRANULE_MASK; + s8a *shadow; + size_t len; + + /* + * We cannot allocate in the middle of the granule. The ASAN shadow + * map encoding only describes regions of memory where every granule + * follows this format (P for poisoned, C for clear): + * + * + * [ C C C ... P P ] + * + * This is so we can use a single number in [0, ASAN_SHADOW_SCALE) + * to represent the poison state of the granule. + */ + if (unlikely((u64)addr & ASAN_GRANULE_MASK)) + return -EINVAL; + + shadow = mem_to_shadow(addr); + len = size >> ASAN_SHADOW_SHIFT; + + asan_memset(shadow, 0, len); + + /* + * If we are allocating a non-granule aligned region, we need to adjust + * the last byte of the shadow map to list how many bytes in the granule + * are unpoisoned. If the region is aligned, then the memset call above + * was enough. + */ + if (partial) + shadow[len] = partial; + + return 0; +} + +/* + * Initialize ASAN state when necessary. Triggered from userspace before + * allocator startup. + */ +SEC("syscall") +__weak __noasan int asan_init(struct asan_init_args *args) +{ + u64 globals_pages = args->arena_globals_pages; + u64 all_pages = args->arena_all_pages; + u64 shadow_map, shadow_pgoff; + u64 shadow_pages; + + if (asan_inited) + return 0; + + /* + * Round up the shadow map size to the nearest page. 
+ */ + shadow_pages = all_pages >> ASAN_SHADOW_SHIFT; + if ((all_pages & ((1 << ASAN_SHADOW_SHIFT) - 1))) + shadow_pages += 1; + + if (all_pages > (1ULL << 32) / __PAGE_SIZE) { + arena_stderr("error: arena size %lx too large", all_pages); + return -EINVAL; + } + + if (globals_pages > all_pages) { + arena_stderr("error: globals %lx do not fit in arena %lx", + globals_pages, all_pages); + return -EINVAL; + } + + if (globals_pages + shadow_pages >= all_pages) { + arena_stderr("error: globals %lx do not leave room for shadow map %lx " + "(arena pages %lx)", + globals_pages, shadow_pages, all_pages); + return -EINVAL; + } + + shadow_pgoff = all_pages - shadow_pages - globals_pages; + __asan_shadow_memory_dynamic_address = shadow_pgoff * __PAGE_SIZE; + + /* + * Allocate the last (1/ASAN_SHADOW_SCALE)th of an arena's pages for the map + * We find the offset and size from the arena map. + * + * The allocated map pages are zeroed out, meaning all memory is marked as valid + * even if it's not allocated already. This is expected: Since the actual memory + * pages are not allocated, accesses to it will trigger page faults and will be + * reported through BPF streams. Any pages allocated through bpf_arena_alloc_pages + * should be poisoned by the allocator right after the call succeeds. 
+ */ + shadow_map = (u64)bpf_arena_alloc_pages( + &arena, (void __arena *)__asan_shadow_memory_dynamic_address, + shadow_pages, NUMA_NO_NODE, 0); + if (!shadow_map) { + arena_stderr("Could not allocate shadow map\n"); + + __asan_shadow_memory_dynamic_address = 0; + + return -ENOMEM; + } + + asan_inited = true; + + return 0; +} + +#pragma clang attribute pop + +#endif /* BPF_ARENA_ASAN */ + +__weak char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From cfc00618b9dfc75cd507f1a4f0d83b4429627399 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 26 Apr 2026 15:03:35 -0400 Subject: selftests/bpf: Add ASAN support for libarena selftests Expand the arena library selftest infrastructure to support address sanitization. Add the compiler flags necessary to compile the library under ASAN when supported. Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260426190338.4615-6-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 22 ++++++++- tools/testing/selftests/bpf/libarena/Makefile | 25 ++++++++++- .../bpf/libarena/include/libarena/userspace.h | 33 ++++++++++++++ .../bpf/libarena/selftests/st_asan_common.h | 52 ++++++++++++++++++++++ .../selftests/bpf/libarena/src/common.bpf.c | 2 + 5 files changed, 132 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 71c7873c4b15..97ee61f2ade5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -79,6 +79,12 @@ ifneq ($(shell $(CLANG) --target=bpf -mcpu=help 2>&1 | grep 'v4'),) CLANG_CPUV4 := 1 endif +# Check whether clang supports BPF address sanitizer (requires LLVM 22+) +CLANG_HAS_ARENA_ASAN := $(shell echo 'int x;' | \ + $(CLANG) --target=bpf -fsanitize=kernel-address \ + -mllvm -asan-shadow-addr-space=1 \ + -x c -c - -o /dev/null 2>/dev/null && echo 1) + # Order 
correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \ test_sockmap \ @@ -764,6 +770,14 @@ LIBARENA_SKEL := libarena/libarena.skel.h $(LIBARENA_SKEL): $(INCLUDE_DIR)/vmlinux.h $(BPFOBJ) $(LIBARENA_BPF_DEPS) +$(MAKE) -C libarena libarena.skel.h $(LIBARENA_MAKE_ARGS) +ifneq ($(CLANG_HAS_ARENA_ASAN),) +LIBARENA_ASAN_SKEL := libarena/libarena_asan.skel.h +CFLAGS += -DHAS_BPF_ARENA_ASAN + +$(LIBARENA_ASAN_SKEL): $(INCLUDE_DIR)/vmlinux.h $(BPFOBJ) $(LIBARENA_BPF_DEPS) + +$(MAKE) -C libarena libarena_asan.skel.h $(LIBARENA_MAKE_ARGS) +endif + # Define test_progs test runner. TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs @@ -788,7 +802,9 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ flow_dissector_load.h \ ip_check_defrag_frags.h \ bpftool_helpers.c \ - usdt_1.c usdt_2.c + usdt_1.c usdt_2.c \ + $(LIBARENA_SKEL) \ + $(LIBARENA_ASAN_SKEL) TRUNNER_LIB_SOURCES := find_bit.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/liburandom_read.so \ @@ -959,3 +975,7 @@ override define INSTALL_RULE endef libarena: $(LIBARENA_SKEL) + +ifneq ($(CLANG_HAS_ARENA_ASAN),) +libarena_asan: $(LIBARENA_ASAN_SKEL) +endif diff --git a/tools/testing/selftests/bpf/libarena/Makefile b/tools/testing/selftests/bpf/libarena/Makefile index e85b3ad96890..5e2ab514805e 100644 --- a/tools/testing/selftests/bpf/libarena/Makefile +++ b/tools/testing/selftests/bpf/libarena/Makefile @@ -30,6 +30,7 @@ LIBBPF_INCLUDE ?= $(INCLUDE_DIR) # Scan src/ and selftests/ to generate the final binaries LIBARENA_SOURCES = $(wildcard $(LIBARENA)/src/*.bpf.c) $(wildcard $(LIBARENA)/selftests/*.bpf.c) LIBARENA_OBJECTS = $(notdir $(LIBARENA_SOURCES:.bpf.c=.bpf.o)) +LIBARENA_OBJECTS_ASAN = $(notdir $(LIBARENA_SOURCES:.bpf.c=_asan.bpf.o)) INCLUDES = -I$(LIBARENA)/include -I$(BPFDIR) ifneq ($(INCLUDE_DIR),) @@ -39,6 +40,13 @@ ifneq ($(LIBBPF_INCLUDE),) INCLUDES += -I$(LIBBPF_INCLUDE) endif +ASAN_FLAGS = -fsanitize=kernel-address -fno-stack-protector 
-fno-builtin +ASAN_FLAGS += -mllvm -asan-instrument-address-spaces=1 -mllvm -asan-shadow-addr-space=1 +ASAN_FLAGS += -mllvm -asan-use-stack-safety=0 -mllvm -asan-stack=0 +ASAN_FLAGS += -mllvm -asan-kernel=1 +ASAN_FLAGS += -mllvm -asan-constructor-kind=none +ASAN_FLAGS += -mllvm -asan-destructor-kind=none + # ENABLE_ATOMICS_TESTS required because we use arena spinlocks override BPF_CFLAGS += -DENABLE_ATOMICS_TESTS override BPF_CFLAGS += -O2 -g @@ -53,17 +61,32 @@ CFLAGS += $(INCLUDES) vpath %.bpf.c $(LIBARENA)/src $(LIBARENA)/selftests vpath %.c $(LIBARENA)/src $(LIBARENA)/selftests +skeletons: libarena.skel.h libarena_asan.skel.h +.PHONY: skeletons + +libarena_asan.skel.h: libarena_asan.bpf.o + $(call msg,GEN-SKEL,libarena,$@) + $(Q)$(BPFTOOL) gen skeleton $< name "libarena_asan" > $@ + libarena.skel.h: libarena.bpf.o $(call msg,GEN-SKEL,libarena,$@) $(Q)$(BPFTOOL) gen skeleton $< name "libarena" > $@ +libarena_asan.bpf.o: $(LIBARENA_OBJECTS_ASAN) + $(call msg,GEN-OBJ,libarena,$@) + $(Q)$(BPFTOOL) gen object $@ $^ + libarena.bpf.o: $(LIBARENA_OBJECTS) $(call msg,GEN-OBJ,libarena,$@) $(Q)$(BPFTOOL) gen object $@ $^ +%_asan.bpf.o: %.bpf.c + $(call msg,CLNG-BPF,libarena,$@) + $(Q)$(CLANG) $(BPF_CFLAGS) $(ASAN_FLAGS) -DBPF_ARENA_ASAN $(BPF_TARGET_ENDIAN) -c $< -o $@ + %.bpf.o: %.bpf.c $(call msg,CLNG-BPF,libarena,$@) $(Q)$(CLANG) $(BPF_CFLAGS) $(BPF_TARGET_ENDIAN) -c $< -o $@ clean: - $(Q)rm -f *.skel.h *.bpf.o + $(Q)rm -f *.skel.h *.bpf.o *.linked*.o diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h b/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h index 0438a751d5fd..88b68ac73cca 100644 --- a/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h +++ b/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h @@ -27,6 +27,11 @@ static inline bool libarena_is_test_prog(const char *name) return strstr(name, "test_") == name; } +static inline bool libarena_is_asan_test_prog(const char *name) +{ + return 
strstr(name, "asan_test") == name; +} + static inline int libarena_run_prog_args(int prog_fd, void *args, size_t argsize) { LIBBPF_OPTS(bpf_test_run_opts, opts); @@ -97,3 +102,31 @@ static inline int libarena_get_globals_pages(int arena_get_globals_fd, free(vec); return 0; } + +static inline int libarena_asan_init(int arena_asan_init_fd, + int asan_init_fd, + size_t arena_all_pages) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct asan_init_args args; + u64 globals_pages; + int ret; + + ret = libarena_get_globals_pages(arena_asan_init_fd, + arena_all_pages, &globals_pages); + if (ret) + return ret; + + args = (struct asan_init_args){ + .arena_all_pages = arena_all_pages, + .arena_globals_pages = globals_pages, + }; + + opts.ctx_in = &args; + opts.ctx_size_in = sizeof(args); + + ret = bpf_prog_test_run_opts(asan_init_fd, &opts); + if (ret) + return ret; + return opts.retval; +} diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h b/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h new file mode 100644 index 000000000000..1d3edc4372ac --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#pragma once + +#define ST_PAGES 64 + +static inline void print_asan_map_state(void __arena *addr) +{ + arena_stdout("%s:%d ASAN %p -> (val: %x gran: %x set: [%s])", + __func__, __LINE__, addr, + *(s8a *)(addr), ASAN_GRANULE(addr), + asan_shadow_set(addr) ? "yes" : "no"); +} + +/* + * Emit an error and force the current function to exit if the ASAN + * violation state is unexpected. Reset the violation state after. 
+ */ +static inline int asan_validate_addr(bool cond, void __arena *addr) +{ + if ((asan_violated != 0) == cond) { + asan_violated = 0; + return 0; + } + + arena_stdout("%s:%d ASAN asan_violated %lx", __func__, __LINE__, + (u64)asan_violated); + print_asan_map_state(addr); + + asan_violated = 0; + + return -EINVAL; +} + +static inline int asan_validate(void) +{ + if (!asan_violated) + return 0; + + arena_stdout("%s:%d Found ASAN violation at %lx", __func__, __LINE__, + asan_violated); + + asan_violated = 0; + + return -EINVAL; +} + +struct blob { + volatile u8 mem[59]; + u8 oob; +}; diff --git a/tools/testing/selftests/bpf/libarena/src/common.bpf.c b/tools/testing/selftests/bpf/libarena/src/common.bpf.c index 659ccead5624..84e8a8b7d42e 100644 --- a/tools/testing/selftests/bpf/libarena/src/common.bpf.c +++ b/tools/testing/selftests/bpf/libarena/src/common.bpf.c @@ -2,6 +2,8 @@ /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ #include +#include + const volatile u32 zero = 0; int arena_fls(__u64 word) -- cgit v1.2.3 From 86426a28c52d756a5edbe29885716128b8915991 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 26 Apr 2026 15:03:36 -0400 Subject: selftests/bpf: Add buddy allocator for libarena Add a byte-oriented buddy allocator for libarena. The buddy allocator provides an alloc/free interface for small arena allocations ranging from 16 bytes to 512 KiB. Smaller allocation sizes are rounded up to 16 bytes. The buddy allocator does not handle larger allocations, which can instead use the existing bpf_arena_{alloc, free}_pages() kfuncs. 
Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260426190338.4615-7-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/default.profraw | Bin 0 -> 160 bytes tools/testing/selftests/bpf/libarena/Makefile | 2 + .../bpf/libarena/include/libarena/buddy.h | 92 +++ .../bpf/libarena/include/libarena/common.h | 14 + .../testing/selftests/bpf/libarena/src/buddy.bpf.c | 903 +++++++++++++++++++++ .../selftests/bpf/libarena/src/common.bpf.c | 23 +- 6 files changed, 1033 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/default.profraw create mode 100644 tools/testing/selftests/bpf/libarena/include/libarena/buddy.h create mode 100644 tools/testing/selftests/bpf/libarena/src/buddy.bpf.c diff --git a/tools/testing/selftests/bpf/default.profraw b/tools/testing/selftests/bpf/default.profraw new file mode 100644 index 000000000000..e865e87829f8 Binary files /dev/null and b/tools/testing/selftests/bpf/default.profraw differ diff --git a/tools/testing/selftests/bpf/libarena/Makefile b/tools/testing/selftests/bpf/libarena/Makefile index 5e2ab514805e..3c695f9c0054 100644 --- a/tools/testing/selftests/bpf/libarena/Makefile +++ b/tools/testing/selftests/bpf/libarena/Makefile @@ -51,6 +51,8 @@ ASAN_FLAGS += -mllvm -asan-destructor-kind=none override BPF_CFLAGS += -DENABLE_ATOMICS_TESTS override BPF_CFLAGS += -O2 -g override BPF_CFLAGS += -Wno-incompatible-pointer-types-discards-qualifiers +# Required to define our own arena-based free() +override BPF_CFLAGS += -Wno-incompatible-library-redeclaration # Required for suppressing harmless vmlinux.h-related warnings. 
override BPF_CFLAGS += -Wno-missing-declarations override BPF_CFLAGS += $(INCLUDES) diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h b/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h new file mode 100644 index 000000000000..00e2437128ef --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#pragma once + +struct buddy_chunk; +typedef struct buddy_chunk __arena buddy_chunk_t; + +struct buddy_header; +typedef struct buddy_header __arena buddy_header_t; + +enum buddy_consts { + /* + * Minimum allocation is 1 << BUDDY_MIN_ALLOC_SHIFT. + * Larger sizes increase internal fragmentation, but smaller + * sizes increase the space overhead of the block metadata. + */ + BUDDY_MIN_ALLOC_SHIFT = 4, + BUDDY_MIN_ALLOC_BYTES = 1 << BUDDY_MIN_ALLOC_SHIFT, + + /* + * How many orders the buddy allocator can serve. Minimum block + * size is 1 << BUDDY_MIN_ALLOC_SHIFT, maximum block size is + * 1 << (BUDDY_MIN_ALLOC_SHIFT + BUDDY_CHUNK_NUM_ORDERS - 1): + * Each block has size 1 << BUDDY_MIN_ALLOC_SHIFT, and the + * allocation orders are in [0, BUDDY_CHUNK_NUM_ORDERS). + * We keep two blocks of the maximum size to retain the + * property in the code that all blocks have a buddy. + * Higher values increase the maximum allocation size, + * but also the size of the metadata for each block. + */ + BUDDY_CHUNK_NUM_ORDERS = 1 << 4, + BUDDY_CHUNK_BYTES = BUDDY_MIN_ALLOC_BYTES << (BUDDY_CHUNK_NUM_ORDERS), + + /* Offset of the buddy header within a free block, see buddy.bpf.c for details */ + BUDDY_HEADER_OFF = 8, + + /* The maximum number of blocks a chunk may have to track. */ + BUDDY_CHUNK_ITEMS = 1 << (BUDDY_CHUNK_NUM_ORDERS), + BUDDY_CHUNK_OFFSET_MASK = BUDDY_CHUNK_BYTES - 1, + + /* + * Alignment for chunk allocations based on bpf_arena_alloc_pages. 
+ * The arena allocation kfunc does not have an alignment argument, + * but alignment is required for all block calculations in the chunk to + * work. + */ + BUDDY_VADDR_OFFSET = BUDDY_CHUNK_BYTES, + + /* Total arena virtual address space the allocator can consume. */ + BUDDY_VADDR_SIZE = BUDDY_CHUNK_BYTES << 10 +}; + +struct buddy_header { + u32 prev_index; /* "Pointer" to the previous available allocation of the same size. */ + u32 next_index; /* Same for the next allocation. */ +}; + +/* + * We bring memory into the allocator 1 MiB at a time. + */ +struct buddy_chunk { + /* The order of the current allocation for an item. 4 bits per order. */ + u8 orders[BUDDY_CHUNK_ITEMS / 2]; + /* + * Bit to denote whether a block is allocated. The size of the allocated/free + * block is found from the orders array. + */ + u8 allocated[BUDDY_CHUNK_ITEMS / 8]; + /* Freelists for O(1) allocation. */ + u64 freelists[BUDDY_CHUNK_NUM_ORDERS]; + buddy_chunk_t *next; +}; + +struct buddy { + buddy_chunk_t *first_chunk; /* Pointer to the chunk linked list. 
*/ + arena_spinlock_t lock; /* Allocator lock */ + u64 vaddr; /* Allocation into reserved vaddr */ +}; + +typedef struct buddy __arena buddy_t; + +#ifdef __BPF__ + +int buddy_init(buddy_t *buddy); +int buddy_destroy(buddy_t *buddy); +int buddy_free_internal(buddy_t *buddy, u64 free); +#define buddy_free(buddy, ptr) buddy_free_internal((buddy), (u64)(ptr)) +u64 buddy_alloc_internal(buddy_t *buddy, size_t size); +#define buddy_alloc(alloc, size) ((void __arena *)buddy_alloc_internal((alloc), (size))) + + +#endif /* __BPF__ */ diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/common.h b/tools/testing/selftests/bpf/libarena/include/libarena/common.h index 21eb18bf4533..e54cb7b869bd 100644 --- a/tools/testing/selftests/bpf/libarena/include/libarena/common.h +++ b/tools/testing/selftests/bpf/libarena/include/libarena/common.h @@ -48,6 +48,20 @@ extern volatile u64 asan_violated; int arena_fls(__u64 word); +u64 malloc_internal(size_t size); +#define malloc(size) ((void __arena *)malloc_internal((size))) +void free(void __arena *ptr); + +/* + * The verifier associates arenas with programs by checking LD.IMM + * instruction operands for an arena and populating the program state + * with the first instance it finds. This requires accessing our global + * arena variable, but subprogs do not necessarily do so while still + * using pointers from that arena. Insert an LD.IMM instruction to + * access the arena and help the verifier. + */ +#define arena_subprog_init() do { asm volatile ("" :: "r"(&arena)); } while (0) + #else /* ! __BPF__ */ #include diff --git a/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c b/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c new file mode 100644 index 000000000000..865e00803daa --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c @@ -0,0 +1,903 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include + +/* + * Buddy allocator arena-based implementation. + * + * Memory is organized into chunks. These chunks + * cannot be coalesced or split. Allocating + * chunks allocates their memory eagerly. + * + * Internally, each chunk is organized into blocks. + * Blocks _can_ be coalesced/split, but only inside + * the chunk. Each block can be allocated or + * unallocated. If allocated, the entire block holds + * user data. If unallocated, the block is mostly + * invalid memory, with the exception of a header + * used for freelist tracking. + * + * The header is placed at an offset inside the block + * to prevent off-by-one errors from the previous block + * from trivially overwriting the header. Such an error + * is also not catchable by ASAN, since the header remains + * valid memory even after the block is freed. It is still + * theoretically possible for the header to be corrupted + * without being caught by ASAN, but harder. + * + * Since the allocator needs to track order information for + * both allocated and free blocks, and allocated blocks cannot + * store a header, the allocator also stores per-chunk order + * information in a reserved region at the beginning of the + * chunk. The header includes a bitmap with the order of blocks + * and their allocation state. It also includes the freelist + * heads for the allocation itself. + */ + + +enum { + BUDDY_POISONED = (s8)0xef, + + /* Number of pages to be allocated per chunk. */ + BUDDY_CHUNK_PAGES = BUDDY_CHUNK_BYTES / __PAGE_SIZE +}; + +static inline int buddy_lock(buddy_t *buddy) +{ + return arena_spin_lock(&buddy->lock); +} + +static inline void buddy_unlock(buddy_t *buddy) +{ + arena_spin_unlock(&buddy->lock); +} + +/* + * Reserve part of the arena address space for the allocator. We use + * this to get aligned addresses for the chunks, since the arena + * page alloc kfuncs do not support aligning to a boundary (in this + * case 1 MiB, see buddy.h on how this is derived). 
+ */ +static int buddy_reserve_arena_vaddr(buddy_t *buddy) +{ + buddy->vaddr = 0; + + return bpf_arena_reserve_pages(&arena, + (void __arena *)BUDDY_VADDR_OFFSET, + BUDDY_VADDR_SIZE / __PAGE_SIZE); +} + +/* + * Free up any unused address space. Used only during teardown. + */ +static void buddy_unreserve_arena_vaddr(buddy_t *buddy) +{ + bpf_arena_free_pages( + &arena, (void __arena *)(BUDDY_VADDR_OFFSET + buddy->vaddr), + (BUDDY_VADDR_SIZE - buddy->vaddr) / __PAGE_SIZE); + + buddy->vaddr = 0; +} + +/* + * Carve out part of the reserved address space and hand it over + * to the buddy allocator. + * + * We are assuming the buddy allocator is the only allocator in the + * system, so there is no race between this function reserving a + * page range and some other allocator actually making the BPF call + * to really create and reserve it. + * + * However, bump allocation must still be atomic because this function + * is called without the buddy lock from multiple threads concurrently. + */ +__weak int buddy_alloc_arena_vaddr(buddy_t __arg_arena *buddy, u64 *vaddrp) +{ + u64 vaddr, old, new; + + if (!buddy || !vaddrp) + return -EINVAL; + + do { + vaddr = buddy->vaddr; + new = vaddr + BUDDY_CHUNK_BYTES; + + if (new > BUDDY_VADDR_SIZE) + return -EINVAL; + + old = __sync_val_compare_and_swap(&buddy->vaddr, vaddr, new); + } while (old != vaddr && can_loop); + + if (old != vaddr) + return -EINVAL; + + *vaddrp = BUDDY_VADDR_OFFSET + vaddr; + + return 0; +} + +static u64 arena_next_pow2(__u64 n) +{ + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + n |= n >> 32; + n++; + + return n; +} + +__weak +int idx_set_allocated(buddy_chunk_t __arg_arena *chunk, u64 idx, bool allocated) +{ + bool already_allocated; + + if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) { + arena_stderr("setting state of invalid idx (%ld, max %d)\n", idx, + BUDDY_CHUNK_ITEMS); + return -EINVAL; + } + + already_allocated = chunk->allocated[idx / 8] & (1 << (idx % 8)); + if 
(unlikely(already_allocated == allocated)) { + arena_stderr("Double %s of idx %ld for chunk %p", + allocated ? "alloc" : "free", + idx, chunk); + return -EINVAL; + } + + if (allocated) + chunk->allocated[idx / 8] |= 1 << (idx % 8); + else + chunk->allocated[idx / 8] &= ~(1 << (idx % 8)); + + return 0; +} + +static int idx_is_allocated(buddy_chunk_t *chunk, u64 idx, bool *allocated) +{ + if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) { + arena_stderr("getting state of invalid idx (%llu, max %d)\n", idx, + BUDDY_CHUNK_ITEMS); + return -EINVAL; + } + + *allocated = chunk->allocated[idx / 8] & (1 << (idx % 8)); + return 0; +} + +__weak +int idx_set_order(buddy_chunk_t __arg_arena *chunk, u64 idx, u8 order) +{ + u8 prev_order; + + if (unlikely(order >= BUDDY_CHUNK_NUM_ORDERS)) { + arena_stderr("setting invalid order %u\n", order); + return -EINVAL; + } + + if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) { + arena_stderr("setting order of invalid idx (%d, max %d)\n", idx, + BUDDY_CHUNK_ITEMS); + return -EINVAL; + } + + /* + * We store two order instances per byte, one per nibble. + * Retain the existing nibble. + */ + prev_order = chunk->orders[idx / 2]; + if (idx & 0x1) { + order &= 0xf; + order |= (prev_order & 0xf0); + } else { + order <<= 4; + order |= (prev_order & 0xf); + } + + chunk->orders[idx / 2] = order; + + return 0; +} + +static u8 idx_get_order(buddy_chunk_t *chunk, u64 idx) +{ + u8 result; + + _Static_assert(BUDDY_CHUNK_NUM_ORDERS <= 16, + "order must fit in 4 bits"); + + if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) { + arena_stderr("getting order of invalid idx %u\n", idx); + return BUDDY_CHUNK_NUM_ORDERS; + } + + result = chunk->orders[idx / 2]; + + return (idx & 0x1) ? 
(result & 0xf) : (result >> 4); +} + +static void __arena *idx_to_addr(buddy_chunk_t *chunk, size_t idx) +{ + u64 address; + + if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) { + arena_stderr("translating invalid idx %u\n", idx); + return NULL; + } + + /* + * The data blocks start in the chunk after the metadata block. + * We find the actual address by indexing into the region at a + * BUDDY_MIN_ALLOC_BYTES granularity, the minimum allowed. + * The index number already accounts for the fact that the first + * blocks in the chunk are occupied by the metadata, so we do + * not need to offset it. + */ + + address = (u64)chunk + (idx * BUDDY_MIN_ALLOC_BYTES); + + return (void __arena *)address; +} + +static buddy_header_t *idx_to_header(buddy_chunk_t *chunk, size_t idx) +{ + bool allocated; + u64 address; + + if (unlikely(idx_is_allocated(chunk, idx, &allocated))) { + arena_stderr("accessing invalid idx 0x%lx\n", idx); + return NULL; + } + + if (unlikely(allocated)) { + arena_stderr("accessing allocated idx 0x%lx as header\n", idx); + return NULL; + } + + address = (u64)idx_to_addr(chunk, idx); + if (!address) + return NULL; + + /* + * Offset the header within the block. This avoids accidental overwrites + * to the header because of off-by-one errors when using adjacent blocks. + * + * The offset has been chosen as a compromise between ASAN effectiveness + * and allocator granularity: + * 1) ASAN dictates valid data runs are 8-byte aligned. + * 2) We want to keep a low minimum allocation size (currently 16). + * + * As a result, we have only two possible positions for the header: Bytes + * 0 and 8. Keeping the header at byte 0 means off-by-ones from the previous + * block touch the header, and, since the header must be accessible, ASAN + * will not trigger. Keeping the header at byte 8 means off-by-one errors from + * the previous block are caught by ASAN. 
Negative offsets are rarer, so + * while accesses into the block from the next block are possible, they are + * less probable. + */ + + return (buddy_header_t *)(address + BUDDY_HEADER_OFF); +} + +static void header_add_freelist(buddy_chunk_t *chunk, buddy_header_t *header, + u64 idx, u8 order) +{ + buddy_header_t *tmp_header; + + idx_set_order(chunk, idx, order); + + header->next_index = chunk->freelists[order]; + header->prev_index = BUDDY_CHUNK_ITEMS; + + if (header->next_index != BUDDY_CHUNK_ITEMS) { + tmp_header = idx_to_header(chunk, header->next_index); + tmp_header->prev_index = idx; + } + + chunk->freelists[order] = idx; +} + +static void header_remove_freelist(buddy_chunk_t *chunk, + buddy_header_t *header, u8 order) +{ + buddy_header_t *tmp_header; + + if (header->prev_index != BUDDY_CHUNK_ITEMS) { + tmp_header = idx_to_header(chunk, header->prev_index); + tmp_header->next_index = header->next_index; + } + + if (header->next_index != BUDDY_CHUNK_ITEMS) { + tmp_header = idx_to_header(chunk, header->next_index); + tmp_header->prev_index = header->prev_index; + } + + /* Pop off the list head if necessary. */ + if (idx_to_header(chunk, chunk->freelists[order]) == header) + chunk->freelists[order] = header->next_index; + + header->prev_index = BUDDY_CHUNK_ITEMS; + header->next_index = BUDDY_CHUNK_ITEMS; +} + +static u64 size_to_order(size_t size) +{ + u64 order; + + /* + * Legal sizes are [1, 4GiB] (the biggest possible arena). + * Of course, sizes close to 4GiB are practically impossible + * to fulfill and allocation will fail, but that's taken care + * of by the caller. + */ + + if (unlikely(size == 0 || size > (1UL << 32))) { + arena_stderr("illegal size request %lu\n", size); + return 64; + } + /* + * To find the order of the allocation we find the first power of two + * >= the requested size, take the log2, then adjust it for the minimum + * allocation size by removing the minimum shift from it. 
Requests + * smaller than the minimum allocation size are rounded up. + */ + order = arena_fls(arena_next_pow2(size)) - 1; + if (order < BUDDY_MIN_ALLOC_SHIFT) + return 0; + + return order - BUDDY_MIN_ALLOC_SHIFT; +} + +__weak +int add_leftovers_to_freelist(buddy_chunk_t __arg_arena *chunk, u32 cur_idx, + u64 min_order, u64 max_order) +{ + buddy_header_t *header; + u64 ord; + u32 idx; + + for (ord = min_order; ord < max_order && can_loop; ord++) { + /* Mark the buddy as free and add it to the freelists. */ + idx = cur_idx + (1 << ord); + + header = idx_to_header(chunk, idx); + if (unlikely(!header)) { + arena_stderr("idx %u has no header", idx); + return -EINVAL; + } + + asan_unpoison(header, sizeof(*header)); + + header_add_freelist(chunk, header, idx, ord); + } + + return 0; +} + +static buddy_chunk_t *buddy_chunk_get(buddy_t *buddy) +{ + u64 order, ord, min_order, max_order; + buddy_chunk_t *chunk; + size_t left; + int power2; + u64 vaddr; + u32 idx; + int ret; + + /* + * Step 1: Allocate a properly aligned chunk, and + * prep it for insertion into the buddy allocator. + * We don't need the allocator lock until step 2. + */ + + ret = buddy_alloc_arena_vaddr(buddy, &vaddr); + if (ret) + return NULL; + + /* Addresses must be aligned to the chunk boundary. */ + if (vaddr % BUDDY_CHUNK_BYTES) + return NULL; + + /* Unreserve the address space. */ + bpf_arena_free_pages(&arena, (void __arena *)vaddr, + BUDDY_CHUNK_PAGES); + + chunk = bpf_arena_alloc_pages(&arena, (void __arena *)vaddr, + BUDDY_CHUNK_PAGES, NUMA_NO_NODE, 0); + if (!chunk) { + arena_stderr("[ALLOC FAILED]"); + return NULL; + } + + if (buddy_lock(buddy)) { + /* + * We cannot reclaim the vaddr space, but that is ok - this + * operation should always succeed. The error path is to catch + * accidental deadlocks that will cause -ENOMEMs to the program as + * the allocator fails to refill itself, in which case vaddr usage + * is the least of our worries. 
+ */ + bpf_arena_free_pages(&arena, (void __arena *)vaddr, BUDDY_CHUNK_PAGES); + return NULL; + } + + asan_poison(chunk, BUDDY_POISONED, BUDDY_CHUNK_PAGES * __PAGE_SIZE); + + /* Unpoison the chunk itself. */ + asan_unpoison(chunk, sizeof(*chunk)); + + /* Mark all freelists as empty. */ + for (ord = zero; ord < BUDDY_CHUNK_NUM_ORDERS && can_loop; ord++) + chunk->freelists[ord] = BUDDY_CHUNK_ITEMS; + + /* + * Initialize the chunk by carving out a page range to hold the metadata + * struct above, then dumping the rest of the pages into the allocator. + */ + + _Static_assert(BUDDY_CHUNK_PAGES * __PAGE_SIZE >= + BUDDY_MIN_ALLOC_BYTES * + BUDDY_CHUNK_ITEMS, + "chunk must fit within the allocation"); + + /* + * Step 2: Reserve a chunk for the chunk metadata, then break + * the rest of the full allocation into the different buckets. + * We allocate the memory by grabbing blocks of progressively + * smaller sizes from the allocator, which are guaranteed to be + * contiguous. + * + * This operation also populates the allocator. + * + * Algorithm: + * + * - max_order: The last order allocation we made + * - left: How many bytes are left to allocate + * - cur_index: Current index into the top-level block we are + * allocating from. + * + * Step 3: + * - Find the largest power-of-2 allocation still smaller than left (infimum) + * - Reserve a chunk of that size, along with its buddy + * - For every order from [infimum + 1, last order), carve out a block + * and put it into the allocator. + * + * Example: Chunk size 0b1010000 (80 bytes) + * + * Step 1: + * + * idx infimum 1 << max_order + * 0 64 128 1 << 20 + * |________|_________|______________________| + * + * Blocks set aside: + * [0, 64) - Completely allocated + * [64, 128) - Will be further split in the next iteration + * + * Blocks added to the allocator: + * [128, 256) + * [256, 512) + * ...
+ * [1 << 18, 1 << 19) + * [1 << 19, 1 << 20) + * + * Step 2: + * + * idx infimum idx + 1 << max_order + * 64 80 96 64 + 1 << 6 = 128 + * |________|_________|______________________| + * + * Blocks set aside: + * [64, 80) - Completely allocated + * + * Blocks added to the allocator: + * [80, 96) - left == 0 so the buddy is unused and marked as freed + * [96, 128) + */ + max_order = BUDDY_CHUNK_NUM_ORDERS; + left = sizeof(*chunk); + idx = 0; + while (left && can_loop) { + power2 = arena_fls(left) - 1; + /* + * Note: The condition below only triggers to catch serious bugs + * early. There is no sane way to undo any block insertions from + * the allocated chunk, so just leak any leftover allocations, + * emit a diagnostic, unlock and exit. + * + */ + if (unlikely(power2 >= BUDDY_CHUNK_NUM_ORDERS)) { + arena_stderr( + "buddy chunk metadata require allocation of order %d\n", + power2); + arena_stderr( + "chunk has size of 0x%lx bytes (left %lx bytes)\n", + sizeof(*chunk), left); + buddy_unlock(buddy); + + return NULL; + } + + /* Round up allocations that are too small. */ + + left -= (power2 >= BUDDY_MIN_ALLOC_SHIFT) ? 1 << power2 : left; + order = (power2 >= BUDDY_MIN_ALLOC_SHIFT) ? power2 - BUDDY_MIN_ALLOC_SHIFT : 0; + + if (idx_set_allocated(chunk, idx, true)) { + buddy_unlock(buddy); + return NULL; + } + + /* + * Starting an order above the one we allocated, populate + * the allocator with free blocks. If this is the last + * allocation (left == 0), also mark the buddy as free. + * + * See comment above about error handling: The error path + * is only there as a way to mitigate deeply buggy allocator + * states by emitting a diagnostic in add_leftovers_to_freelist() + * and leaking any memory not added in the freelists. + */ + min_order = left ? order + 1 : order; + if (add_leftovers_to_freelist(chunk, idx, min_order, max_order)) { + buddy_unlock(buddy); + return NULL; + } + + /* Adjust the index. 
*/ + idx += 1 << order; + max_order = order; + } + + buddy_unlock(buddy); + + return chunk; +} + +__weak int buddy_init(buddy_t __arg_arena *buddy) +{ + buddy_chunk_t *chunk; + int ret; + + if (!asan_ready()) + return -EINVAL; + + /* Reserve enough address space to ensure allocations are aligned. */ + ret = buddy_reserve_arena_vaddr(buddy); + if (ret) + return ret; + + _Static_assert(BUDDY_CHUNK_PAGES > 0, + "chunk must use one or more pages"); + + chunk = buddy_chunk_get(buddy); + + if (buddy_lock(buddy)) { + bpf_arena_free_pages(&arena, chunk, BUDDY_CHUNK_PAGES); + return -EINVAL; + } + + /* Chunk is already properly unpoisoned if allocated. */ + if (chunk) + chunk->next = buddy->first_chunk; + + /* Put the chunk at the beginning of the list. */ + buddy->first_chunk = chunk; + + buddy_unlock(buddy); + + return chunk ? 0 : -ENOMEM; +} + +/* + * Destroy the allocator. This does not check whether there are any allocations + * currently in use, so any pages being accessed will start taking arena faults. + * We do not take a lock because we are freeing arena pages, and nobody should + * be using the allocator at that point in the execution. + */ +__weak int buddy_destroy(buddy_t __arg_arena *buddy) +{ + buddy_chunk_t *chunk, *next; + + if (!buddy) + return -EINVAL; + + /* + * Traverse all buddy chunks and free them back to the arena + * with the same granularity they were allocated with. + */ + for (chunk = buddy->first_chunk; chunk && can_loop; chunk = next) { + next = chunk->next; + + /* Wholesale poison the entire block. */ + asan_poison(chunk, BUDDY_POISONED, + BUDDY_CHUNK_PAGES * __PAGE_SIZE); + bpf_arena_free_pages(&arena, chunk, BUDDY_CHUNK_PAGES); + } + + /* Free up any part of the address space that did not get used. */ + buddy_unreserve_arena_vaddr(buddy); + + /* Clear all fields. 
*/ + buddy->first_chunk = NULL; + + return 0; +} + +__weak u64 buddy_chunk_alloc(buddy_chunk_t __arg_arena *chunk, int order_req) +{ + buddy_header_t *header, *tmp_header, *next_header; + u32 idx, tmpidx, retidx; + u64 address; + u64 order = 0; + u64 i; + + for (order = order_req; order < BUDDY_CHUNK_NUM_ORDERS && can_loop; order++) { + if (chunk->freelists[order] != BUDDY_CHUNK_ITEMS) + break; + } + + if (order >= BUDDY_CHUNK_NUM_ORDERS) + return (u64)NULL; + + retidx = chunk->freelists[order]; + header = idx_to_header(chunk, retidx); + if (unlikely(!header)) + return (u64) NULL; + + chunk->freelists[order] = header->next_index; + + if (header->next_index != BUDDY_CHUNK_ITEMS) { + next_header = idx_to_header(chunk, header->next_index); + next_header->prev_index = BUDDY_CHUNK_ITEMS; + } + + header->prev_index = BUDDY_CHUNK_ITEMS; + header->next_index = BUDDY_CHUNK_ITEMS; + if (idx_set_order(chunk, retidx, order_req)) + return (u64)NULL; + + if (idx_set_allocated(chunk, retidx, true)) + return (u64)NULL; + + /* + * Do not unpoison the address yet, will be done by the caller + * because the caller has the exact allocation size requested. + */ + address = (u64)idx_to_addr(chunk, retidx); + if (!address) + return (u64)NULL; + + /* If we allocated from a larger-order chunk, split the buddies. */ + for (i = order_req; i < order && can_loop; i++) { + /* + * Flip the bit for the current order (the bit is guaranteed + * to be 0, so just add 1 << i). + */ + idx = retidx + (1 << i); + + /* Add the buddy of the allocation to the free list. */ + header = idx_to_header(chunk, idx); + /* Unpoison the buddy header */ + asan_unpoison(header, sizeof(*header)); + + if (idx_set_order(chunk, idx, i)) + return (u64)NULL; + + /* Push the header to the beginning of the freelists list. 
*/ + tmpidx = chunk->freelists[i]; + + header->prev_index = BUDDY_CHUNK_ITEMS; + header->next_index = tmpidx; + + if (tmpidx != BUDDY_CHUNK_ITEMS) { + tmp_header = idx_to_header(chunk, tmpidx); + tmp_header->prev_index = idx; + } + + chunk->freelists[i] = idx; + } + + return address; +} + +/* Scan the existing chunks for available memory. */ +static u64 buddy_alloc_from_existing_chunks(buddy_t *buddy, int order) +{ + buddy_chunk_t *chunk; + u64 address; + + for (chunk = buddy->first_chunk; chunk != NULL && can_loop; + chunk = chunk->next) { + address = buddy_chunk_alloc(chunk, order); + if (address) + return address; + } + + return (u64)NULL; +} + +/* + * Try an allocation from a newly allocated chunk. Also + * incorporate the chunk into the linked list. + */ +static u64 buddy_alloc_from_new_chunk(buddy_t *buddy, buddy_chunk_t *chunk, int order) +{ + u64 address; + + if (buddy_lock(buddy)) + return (u64)NULL; + + + /* + * Add the chunk into the allocator and try + * to allocate specifically from that chunk. + */ + chunk->next = buddy->first_chunk; + buddy->first_chunk = chunk; + + address = buddy_chunk_alloc(buddy->first_chunk, order); + + buddy_unlock(buddy); + + return (u64)address; +} +__weak +u64 buddy_alloc_internal(buddy_t __arg_arena *buddy, size_t size) +{ + buddy_chunk_t *chunk; + u64 address = (u64)NULL; + int order; + + if (!buddy) + return (u64)NULL; + + order = size_to_order(size); + if (order >= BUDDY_CHUNK_NUM_ORDERS || order < 0) { + arena_stderr("invalid order %d (sz %lu)\n", order, size); + return (u64)NULL; + } + + if (buddy_lock(buddy)) + return (u64)NULL; + + address = buddy_alloc_from_existing_chunks(buddy, order); + buddy_unlock(buddy); + if (address) + goto done; + + /* Get a new chunk. */ + chunk = buddy_chunk_get(buddy); + if (chunk) + address = buddy_alloc_from_new_chunk(buddy, chunk, order); + +done: + /* If we failed to allocate memory, return NULL. 
*/ + if (!address) + return (u64)NULL; + + /* + * Unpoison exactly the amount of bytes requested. If the + * data is smaller than the header, we must poison any + * unused bytes that were part of the header. + */ + if (size < BUDDY_HEADER_OFF + sizeof(buddy_header_t)) + asan_poison((u8 __arena *)address + BUDDY_HEADER_OFF, + BUDDY_POISONED, sizeof(buddy_header_t)); + + asan_unpoison((u8 __arena *)address, size); + + return address; +} + +static __always_inline int buddy_free_unlocked(buddy_t *buddy, u64 addr) +{ + buddy_header_t *header, *buddy_header; + u64 idx, buddy_idx, tmp_idx; + buddy_chunk_t *chunk; + bool allocated; + u8 order; + int ret; + + if (!buddy) + return -EINVAL; + + if (addr & (BUDDY_MIN_ALLOC_BYTES - 1)) { + arena_stderr("Freeing unaligned address %llx\n", addr); + return -EINVAL; + } + + /* Get (chunk, idx) out of the address. */ + chunk = (void __arena *)(addr & ~BUDDY_CHUNK_OFFSET_MASK); + idx = (addr & BUDDY_CHUNK_OFFSET_MASK) / BUDDY_MIN_ALLOC_BYTES; + + /* Mark the block as unallocated so we can access the header. */ + ret = idx_set_allocated(chunk, idx, false); + if (ret) + return ret; + + order = idx_get_order(chunk, idx); + header = idx_to_header(chunk, idx); + + /* The header is in the block itself, keep it unpoisoned. */ + asan_poison((u8 __arena *)addr, BUDDY_POISONED, + BUDDY_MIN_ALLOC_BYTES << order); + asan_unpoison(header, sizeof(*header)); + + /* + * Coalescing loop. Merge with free buddies of equal order. + * For every coalescing step, keep the left buddy and + * drop the right buddy's header. + */ + for (; order < BUDDY_CHUNK_NUM_ORDERS && can_loop; order++) { + buddy_idx = idx ^ (1 << order); + + /* Check if the buddy is actually free. */ + idx_is_allocated(chunk, buddy_idx, &allocated); + if (allocated) + break; + + /* + * If buddy is not the same order as the chunk + * being freed, then we're done coalescing. 
+ */ + if (idx_get_order(chunk, buddy_idx) != order) + break; + + buddy_header = idx_to_header(chunk, buddy_idx); + header_remove_freelist(chunk, buddy_header, order); + + /* Keep the left header out of the two buddies, drop the other one. */ + if (buddy_idx < idx) { + tmp_idx = idx; + idx = buddy_idx; + buddy_idx = tmp_idx; + } + + /* Remove the buddy from the freelists so that we can merge it. */ + idx_set_order(chunk, buddy_idx, order); + + buddy_header = idx_to_header(chunk, buddy_idx); + asan_poison(buddy_header, BUDDY_POISONED, + sizeof(*buddy_header)); + } + + /* Header properly freed but not in any freelists yet. */ + idx_set_order(chunk, idx, order); + + header = idx_to_header(chunk, idx); + header_add_freelist(chunk, header, idx, order); + + return 0; +} + +__weak int buddy_free_internal(buddy_t __arg_arena *buddy, u64 addr) +{ + int ret; + + if (!buddy) + return -EINVAL; + + /* Freeing NULL is a valid no-op. */ + if (!addr) + return 0; + + ret = buddy_lock(buddy); + if (ret) + return ret; + + ret = buddy_free_unlocked(buddy, addr); + + buddy_unlock(buddy); + + return ret; +} + +__weak char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/libarena/src/common.bpf.c b/tools/testing/selftests/bpf/libarena/src/common.bpf.c index 84e8a8b7d42e..e5da1e37e83e 100644 --- a/tools/testing/selftests/bpf/libarena/src/common.bpf.c +++ b/tools/testing/selftests/bpf/libarena/src/common.bpf.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
*/ #include - #include +#include const volatile u32 zero = 0; +buddy_t buddy; + int arena_fls(__u64 word) { if (!word) @@ -28,4 +30,23 @@ __weak int arena_alloc_reserve(struct arena_alloc_reserve_args *args) return bpf_arena_reserve_pages(&arena, NULL, args->nr_pages); } +SEC("syscall") +__weak int arena_buddy_reset(void) +{ + buddy_destroy(&buddy); + + return buddy_init(&buddy); +} + +__weak u64 malloc_internal(size_t size) +{ + return buddy_alloc_internal(&buddy, size); +} + +__weak void free(void __arg_arena __arena *ptr) +{ + buddy_free_internal(&buddy, (u64)ptr); +} + + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b1487dc1b181ad6aaea95357030a421bb180d8e7 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 26 Apr 2026 15:03:37 -0400 Subject: selftests/bpf: Add selftests for libarena buddy allocator Introduce selftests for the buddy allocator with and without ASAN. Add the libarena selftests both to the libarena test runner and to test_progs, so that they are a) available when libarena is pulled as a standalone library, and b) exercised along with all other test programs in this directory. ASAN for libarena requires LLVM 22. Add logic in the top-level selftests Makefile to only compile the ASAN variant if the compiler supports it, otherwise skip the test. 
Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260426190338.4615-8-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- .../bpf/libarena/selftests/st_asan_buddy.bpf.c | 240 +++++++++++++++++++++ .../bpf/libarena/selftests/st_buddy.bpf.c | 209 ++++++++++++++++++ tools/testing/selftests/bpf/prog_tests/libarena.c | 66 ++++++ .../selftests/bpf/prog_tests/libarena_asan.c | 91 ++++++++ 4 files changed, 606 insertions(+) create mode 100644 tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c create mode 100644 tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c create mode 100644 tools/testing/selftests/bpf/prog_tests/libarena.c create mode 100644 tools/testing/selftests/bpf/prog_tests/libarena_asan.c diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c new file mode 100644 index 000000000000..9dd2980b5d6c --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include + +extern buddy_t buddy; + +#ifdef BPF_ARENA_ASAN + +#include "st_asan_common.h" + +static __always_inline int asan_test_buddy_oob_single(size_t alloc_size) +{ + u8 __arena *mem; + int ret, i; + + ret = asan_validate(); + if (ret < 0) + return ret; + + mem = buddy_alloc(&buddy, alloc_size); + if (!mem) { + arena_stdout("buddy_alloc failed for size %lu", alloc_size); + return -ENOMEM; + } + + ret = asan_validate(); + if (ret < 0) + return ret; + + for (i = zero; i < alloc_size && can_loop; i++) { + mem[i] = 0xba; + ret = asan_validate_addr(false, &mem[i]); + if (ret < 0) + return ret; + } + + mem[alloc_size] = 0xba; + ret = asan_validate_addr(true, &mem[alloc_size]); + if (ret < 0) + return ret; + + buddy_free(&buddy, mem); + + return 0; +} + +/* + * Factored out because asan_validate_addr is complex enough to cause + * verification failures if verified with the rest of asan_test_buddy_uaf_single. + */ +__weak int asan_test_buddy_byte(u8 __arena __arg_arena *mem, int i, bool freed) +{ + int ret; + + /* The header in freed blocks doesn't get poisoned. 
*/ + if (freed && BUDDY_HEADER_OFF <= i && + i < BUDDY_HEADER_OFF + sizeof(struct buddy_header)) + return 0; + + mem[i] = 0xba; + ret = asan_validate_addr(freed, &mem[i]); + if (ret < 0) + return ret; + + return 0; +} + +__weak int asan_test_buddy_uaf_single(size_t alloc_size) +{ + u8 __arena *mem; + int ret; + int i; + + mem = buddy_alloc(&buddy, alloc_size); + if (!mem) { + arena_stdout("buddy_alloc failed for size %lu", alloc_size); + return -ENOMEM; + } + + ret = asan_validate(); + if (ret < 0) + return ret; + + for (i = zero; i < alloc_size && can_loop; i++) { + ret = asan_test_buddy_byte(mem, i, false); + if (ret) + return ret; + } + + ret = asan_validate(); + if (ret < 0) + return ret; + + buddy_free(&buddy, mem); + + for (i = zero; i < alloc_size && can_loop; i++) { + ret = asan_test_buddy_byte(mem, i, true); + if (ret) + return ret; + } + + return 0; +} + +struct buddy_blob { + volatile u8 mem[48]; + u8 oob; +}; + +static __always_inline int asan_test_buddy_blob_single(void) +{ + volatile struct buddy_blob __arena *blob; + const size_t alloc_size = sizeof(struct buddy_blob) - 1; + int ret; + + blob = buddy_alloc(&buddy, alloc_size); + if (!blob) + return -ENOMEM; + + blob->mem[0] = 0xba; + ret = asan_validate_addr(false, &blob->mem[0]); + if (ret < 0) + return ret; + + blob->mem[47] = 0xba; + ret = asan_validate_addr(false, &blob->mem[47]); + if (ret < 0) + return ret; + + blob->oob = 0; + ret = asan_validate_addr(true, &blob->oob); + if (ret < 0) + return ret; + + buddy_free(&buddy, (void __arena *)blob); + + return 0; +} + +SEC("syscall") +__weak int asan_test_buddy_oob(void) +{ + size_t sizes[] = { + 7, 8, 17, 18, 64, 256, 317, 512, 1024, + }; + int ret, i; + + ret = buddy_init(&buddy); + if (ret) { + arena_stdout("buddy_init failed with %d", ret); + return ret; + } + + for (i = zero; i < sizeof(sizes) / sizeof(sizes[0]) && can_loop; i++) { + ret = asan_test_buddy_oob_single(sizes[i]); + if (ret) { + arena_stdout("%s:%d Failed for size %lu", __func__, + 
__LINE__, sizes[i]); + buddy_destroy(&buddy); + return ret; + } + } + + buddy_destroy(&buddy); + + ret = asan_validate(); + if (ret < 0) + return ret; + + return 0; +} + +SEC("syscall") +__weak int asan_test_buddy_uaf(void) +{ + size_t sizes[] = { 16, 32, 64, 128, 256, 512, 1024, 16384 }; + int ret, i; + + ret = buddy_init(&buddy); + if (ret) { + arena_stdout("buddy_init failed with %d", ret); + return ret; + } + + for (i = zero; i < sizeof(sizes) / sizeof(sizes[0]) && can_loop; i++) { + ret = asan_test_buddy_uaf_single(sizes[i]); + if (ret) { + arena_stdout("%s:%d Failed for size %lu", __func__, + __LINE__, sizes[i]); + buddy_destroy(&buddy); + return ret; + } + } + + buddy_destroy(&buddy); + + ret = asan_validate(); + if (ret < 0) + return ret; + + return 0; +} + +SEC("syscall") +__weak int asan_test_buddy_blob(void) +{ + const int iters = 10; + int ret, i; + + ret = buddy_init(&buddy); + if (ret) { + arena_stdout("buddy_init failed with %d", ret); + return ret; + } + + for (i = zero; i < iters && can_loop; i++) { + ret = asan_test_buddy_blob_single(); + if (ret) { + arena_stdout("%s:%d Failed on iteration %d", __func__, + __LINE__, i); + buddy_destroy(&buddy); + return ret; + } + } + + buddy_destroy(&buddy); + + ret = asan_validate(); + if (ret < 0) + return ret; + + return 0; +} + +#endif + +__weak char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c new file mode 100644 index 000000000000..79e6f0baabfe --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include + +#include +#include + +extern buddy_t buddy; + +struct segarr_entry { + u8 __arena *block; + size_t sz; + u8 poison; +}; + +#define SEGARRLEN (512) +static struct segarr_entry __arena segarr[SEGARRLEN]; +static void __arena *ptrs[17]; +size_t __arena alloc_sizes[] = { 3, 17, 1025, 129, 16350, 333, 9, 517 }; +size_t __arena alloc_multiple_sizes[] = { 3, 17, 1025, 129, 16350, 333, 9, 517, 2099 }; +size_t __arena alloc_free_sizes[] = { 3, 17, 64, 129, 256, 333, 512, 517 }; +size_t __arena alignment_sizes[] = { 1, 3, 7, 8, 9, 15, 16, 17, 31, + 32, 64, 100, 128, 255, 256, 512, 1000 }; + +SEC("syscall") +__weak int test_buddy_create(void) +{ + const int iters = 10; + int ret, i; + + for (i = zero; i < iters && can_loop; i++) { + ret = buddy_init(&buddy); + if (ret) + return ret; + + ret = buddy_destroy(&buddy); + if (ret) + return ret; + } + + return 0; +} + +SEC("syscall") +__weak int test_buddy_alloc(void) +{ + void __arena *mem; + int ret, i; + + for (i = zero; i < 8 && can_loop; i++) { + ret = buddy_init(&buddy); + if (ret) + return ret; + + mem = buddy_alloc(&buddy, alloc_sizes[i]); + if (!mem) { + buddy_destroy(&buddy); + return -ENOMEM; + } + + buddy_destroy(&buddy); + } + + return 0; +} + +SEC("syscall") +__weak int test_buddy_alloc_free(void) +{ + const int iters = 800; + void __arena *mem; + int ret, i; + + ret = buddy_init(&buddy); + if (ret) + return ret; + + for (i = zero; i < iters && can_loop; i++) { + mem = buddy_alloc(&buddy, alloc_free_sizes[(i * 5) % 8]); + if (!mem) { + buddy_destroy(&buddy); + return -ENOMEM; + } + + buddy_free(&buddy, mem); + } + + buddy_destroy(&buddy); + + return 0; +} + +SEC("syscall") +__weak int test_buddy_alloc_multiple(void) +{ + int ret, j; + u32 i, idx; + u8 __arena *mem; + size_t sz; + u8 poison; + + ret = buddy_init(&buddy); + if (ret) + return ret; + + /* + * Cycle through each size, allocating an entry in the + * segarr. Continue for SEGARRLEN iterations. 
For every + * allocation write down the size, use the current index + * as a poison value, and log it with the pointer in the + * segarr entry. Use the poison value to poison the entire + * allocated memory according to the size given. + */ + for (i = zero; i < SEGARRLEN && can_loop; i++) { + sz = alloc_multiple_sizes[i % 9]; + poison = (u8)i; + + mem = buddy_alloc(&buddy, sz); + if (!mem) { + buddy_destroy(&buddy); + arena_stdout("%s:%d", __func__, __LINE__); + return -ENOMEM; + } + + segarr[i].block = mem; + segarr[i].sz = sz; + segarr[i].poison = poison; + + for (j = zero; j < sz && can_loop; j++) { + mem[j] = poison; + if (mem[j] != poison) { + buddy_destroy(&buddy); + return -EINVAL; + } + } + } + + /* + * Go to (i * 17) % SEGARRLEN, and free the block pointed to. + * Before freeing, check all bytes have the poisoned value + * corresponding to the element. If any values are unexpected, + * return an error. Skip some elements to test destroying the + * buddy allocator while data is still allocated. 
+ */ + for (i = 10; i < SEGARRLEN && can_loop; i++) { + idx = (i * 17) % SEGARRLEN; + + mem = segarr[idx].block; + sz = segarr[idx].sz; + poison = segarr[idx].poison; + + for (j = zero; j < sz && can_loop; j++) { + if (mem[j] != poison) { + buddy_destroy(&buddy); + arena_stdout("%s:%d %lx %u vs %u", __func__, + __LINE__, (uintptr_t)&mem[j], + mem[j], poison); + return -EINVAL; + } + } + + buddy_free(&buddy, mem); + } + + buddy_destroy(&buddy); + + return 0; +} + +SEC("syscall") +__weak int test_buddy_alignment(void) +{ + int ret, i; + + ret = buddy_init(&buddy); + if (ret) + return ret; + + /* Allocate various sizes and check alignment */ + for (i = zero; i < 17 && can_loop; i++) { + ptrs[i] = buddy_alloc(&buddy, alignment_sizes[i]); + if (!ptrs[i]) { + arena_stdout("alignment test: alloc failed for size %lu", + alignment_sizes[i]); + buddy_destroy(&buddy); + return -ENOMEM; + } + + /* Check 8-byte alignment */ + if ((u64)ptrs[i] & 0x7) { + arena_stdout( + "alignment test: ptr %llx not 8-byte aligned (size %lu)", + (u64)ptrs[i], alignment_sizes[i]); + buddy_destroy(&buddy); + return -EINVAL; + } + } + + /* Free all allocations */ + for (i = zero; i < 17 && can_loop; i++) + buddy_free(&buddy, ptrs[i]); + + buddy_destroy(&buddy); + + return 0; +} + +__weak char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/prog_tests/libarena.c b/tools/testing/selftests/bpf/prog_tests/libarena.c new file mode 100644 index 000000000000..81bdb084c271 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/libarena.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include +#include + +#include +#include +#include +#include + +#include "libarena/libarena.skel.h" + +static void run_libarena_test(struct libarena *skel, struct bpf_program *prog, + const char *name) +{ + int ret; + + if (!strstr(name, "test_buddy")) { + ret = libarena_run_prog(bpf_program__fd(skel->progs.arena_buddy_reset)); + if (!ASSERT_OK(ret, "arena_buddy_reset")) + return; + } + + ret = libarena_run_prog(bpf_program__fd(prog)); + + ASSERT_OK(ret, name); + +} + +void test_libarena(void) +{ + struct arena_alloc_reserve_args args; + struct libarena *skel; + struct bpf_program *prog; + int ret; + + skel = libarena__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + ret = libarena__attach(skel); + if (!ASSERT_OK(ret, "attach")) + goto out; + + args.nr_pages = ARENA_RESERVE_PAGES_DFL; + + ret = libarena_run_prog_args(bpf_program__fd(skel->progs.arena_alloc_reserve), + &args, sizeof(args)); + if (!ASSERT_OK(ret, "arena_alloc_reserve")) + goto out; + + bpf_object__for_each_program(prog, skel->obj) { + const char *name = bpf_program__name(prog); + + if (!libarena_is_test_prog(name)) + continue; + + if (!test__start_subtest(name)) + continue; + + run_libarena_test(skel, prog, name); + } + +out: + libarena__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/libarena_asan.c b/tools/testing/selftests/bpf/prog_tests/libarena_asan.c new file mode 100644 index 000000000000..b4fba10cdfbf --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/libarena_asan.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include + +#ifdef HAS_BPF_ARENA_ASAN +#include + +#include +#include +#include +#include + +#include "libarena/libarena_asan.skel.h" + +static void run_libarena_asan_test(struct libarena_asan *skel, + struct bpf_program *prog, const char *name) +{ + int ret; + + if (!strstr(name, "test_buddy")) { + ret = libarena_run_prog(bpf_program__fd(skel->progs.arena_buddy_reset)); + if (!ASSERT_OK(ret, "arena_buddy_reset")) + return; + } + + ret = libarena_run_prog(bpf_program__fd(prog)); + ASSERT_OK(ret, name); +} + +static void run_test(void) +{ + struct arena_alloc_reserve_args args; + struct libarena_asan *skel; + struct bpf_program *prog; + int ret; + + skel = libarena_asan__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + ret = libarena_asan__attach(skel); + if (!ASSERT_OK(ret, "attach")) + goto out; + + args.nr_pages = ARENA_RESERVE_PAGES_DFL; + + ret = libarena_run_prog_args(bpf_program__fd(skel->progs.arena_alloc_reserve), + &args, sizeof(args)); + if (!ASSERT_OK(ret, "arena_alloc_reserve")) + goto out; + + ret = libarena_asan_init( + bpf_program__fd(skel->progs.arena_get_info), + bpf_program__fd(skel->progs.asan_init), + (1ULL << 32) / sysconf(_SC_PAGESIZE)); + if (!ASSERT_OK(ret, "libarena_asan_init")) + goto out; + + bpf_object__for_each_program(prog, skel->obj) { + const char *name = bpf_program__name(prog); + + if (!libarena_is_asan_test_prog(name)) + continue; + + if (!test__start_subtest(name)) + continue; + + run_libarena_asan_test(skel, prog, name); + } + +out: + libarena_asan__destroy(skel); +} + +#endif /* HAS_BPF_ARENA_ASAN */ + +/* + * Run the test depending on whether LLVM can compile arena ASAN + * programs. 
+ */ +void test_libarena_asan(void) +{ +#ifdef HAS_BPF_ARENA_ASAN + run_test(); +#else + test__skip(); +#endif + + return; +} + -- cgit v1.2.3 From 554e4eb9e4b75358f73733e2be7a59aaf4b7875e Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 26 Apr 2026 15:03:38 -0400 Subject: selftests/bpf: Reuse stderr parsing for libarena ASAN tests Add code to directly test the output of libarena ASAN tests. The code reuses testing infrastructure originally for BPF streams to verify that ASAN emits call stacks when the selftests trigger a memory error. Since stderr() testing uses logic from test_progs, it is only available on the test_progs-based selftest runner. The standalone runner still uses internal ASAN state to verify access errors are triaged as expected. Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260426190338.4615-9-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- .../bpf/libarena/selftests/st_asan_buddy.bpf.c | 18 ++++++++ .../bpf/libarena/selftests/test_progs_compat.h | 15 +++++++ .../selftests/bpf/prog_tests/libarena_asan.c | 2 + tools/testing/selftests/bpf/test_loader.c | 51 +++++++++++++++++----- tools/testing/selftests/bpf/test_progs.h | 2 + 5 files changed, 76 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/bpf/libarena/selftests/test_progs_compat.h diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c index 9dd2980b5d6c..97acd50ffa5c 100644 --- a/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c +++ b/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c @@ -5,6 +5,9 @@ #include #include +/* Required for parsing the ASAN call stacks. 
*/ +#include "test_progs_compat.h" + extern buddy_t buddy; #ifdef BPF_ARENA_ASAN @@ -141,6 +144,11 @@ static __always_inline int asan_test_buddy_blob_single(void) } SEC("syscall") +__stderr("Memory violation for address {{.*}} for write of size 1") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") __weak int asan_test_buddy_oob(void) { size_t sizes[] = { @@ -174,6 +182,11 @@ __weak int asan_test_buddy_oob(void) } SEC("syscall") +__stderr("Memory violation for address {{.*}} for write of size 1") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") __weak int asan_test_buddy_uaf(void) { size_t sizes[] = { 16, 32, 64, 128, 256, 512, 1024, 16384 }; @@ -205,6 +218,11 @@ __weak int asan_test_buddy_uaf(void) } SEC("syscall") +__stderr("Memory violation for address {{.*}} for write of size 1") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") __weak int asan_test_buddy_blob(void) { const int iters = 10; diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_progs_compat.h b/tools/testing/selftests/bpf/libarena/selftests/test_progs_compat.h new file mode 100644 index 000000000000..9d431376c42f --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/test_progs_compat.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifdef __BPF__ + +/* Selftests use these tags for compatibility with test_progs. 
*/ +#define __test_tag(tag) __attribute__((btf_decl_tag("comment:" XSTR(__COUNTER__) ":" tag))) +#define __stderr(msg) __test_tag("test_expect_stderr=" msg) +#define __stderr_unpriv(msg) __test_tag("test_expect_stderr_unpriv=" msg) + +#define XSTR(s) STR(s) +#define STR(s) #s + +#endif diff --git a/tools/testing/selftests/bpf/prog_tests/libarena_asan.c b/tools/testing/selftests/bpf/prog_tests/libarena_asan.c index b4fba10cdfbf..d59d9dd12ef2 100644 --- a/tools/testing/selftests/bpf/prog_tests/libarena_asan.c +++ b/tools/testing/selftests/bpf/prog_tests/libarena_asan.c @@ -25,6 +25,8 @@ static void run_libarena_asan_test(struct libarena_asan *skel, ret = libarena_run_prog(bpf_program__fd(prog)); ASSERT_OK(ret, name); + + verify_test_stderr(skel->obj, prog); } static void run_test(void) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index c4c34cae6102..ee637809a1d4 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -93,7 +93,7 @@ void test_loader_fini(struct test_loader *tester) free(tester->log_buf); } -static void free_msgs(struct expected_msgs *msgs) +void free_msgs(struct expected_msgs *msgs) { int i; @@ -789,6 +789,43 @@ static void emit_stderr(const char *stderr, bool force) fprintf(stdout, "STDERR:\n=============\n%s=============\n", stderr); } +static void verify_stderr(int prog_fd, struct expected_msgs *msgs) +{ + LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts); + char *buf; + int ret; + + if (!msgs->cnt) + return; + + buf = malloc(TEST_LOADER_LOG_BUF_SZ); + if (!ASSERT_OK_PTR(buf, "malloc")) + return; + + ret = bpf_prog_stream_read(prog_fd, 2, buf, TEST_LOADER_LOG_BUF_SZ - 1, + &ropts); + if (ret > 0) { + buf[ret] = '\0'; + emit_stderr(buf, false); + validate_msgs(buf, msgs, emit_stderr); + } else { + ASSERT_GT(ret, 0, "stderr stream read"); + } + + free(buf); +} + +void verify_test_stderr(struct bpf_object *obj, struct bpf_program *prog) +{ + struct 
test_spec spec = {}; + + if (parse_test_spec(NULL, obj, prog, &spec)) + return; + + verify_stderr(bpf_program__fd(prog), &spec.priv.stderr); + free_test_spec(&spec); +} + static void emit_stdout(const char *bpf_stdout, bool force) { if (!force && env.verbosity == VERBOSE_NONE) @@ -1314,17 +1351,7 @@ void run_subtest(struct test_loader *tester, goto tobj_cleanup; } - if (subspec->stderr.cnt) { - err = get_stream(2, bpf_program__fd(tprog), - tester->log_buf, tester->log_buf_sz); - if (err <= 0) { - PRINT_FAIL("Unexpected retval from get_stream(): %d, errno = %d\n", - err, errno); - goto tobj_cleanup; - } - emit_stderr(tester->log_buf, false /*force*/); - validate_msgs(tester->log_buf, &subspec->stderr, emit_stderr); - } + verify_stderr(bpf_program__fd(tprog), &subspec->stderr); if (subspec->stdout.cnt) { err = get_stream(1, bpf_program__fd(tprog), diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 1a44467f4310..37955a8ad385 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -563,5 +563,7 @@ struct expected_msgs { void validate_msgs(const char *log_buf, struct expected_msgs *msgs, void (*emit_fn)(const char *buf, bool force)); +void free_msgs(struct expected_msgs *msgs); +void verify_test_stderr(struct bpf_object *obj, struct bpf_program *prog); #endif /* __TEST_PROGS_H */ -- cgit v1.2.3 From cd5b460ed1eca9e48f3eb07db1ee0a522c0eaa23 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Sat, 25 Apr 2026 15:48:23 -0700 Subject: bpf: range_within() must check cnum ranges instead of min/max pairs states.c:range_within() must be updated to properly check if cnum-based range in an old state is a superset of a range in the cur state. Currently it makes the decision using min/max accessors: reg_umin(old) <= reg_umin(cur) <= reg_umax(old) This is wrong for cnums that cross both UT_MAX/0 and ST_MAX/ST_MIN boundaries. 
Consider cnum32{base=0x7FFFFFF0, size=0x80000020}, which represents values [0x7FFFFFF0, ..., U32_MAX, 0, ..., 0x10]. Its projections are u32_min/max=0/U32_MAX, s32_min/max=S32_MIN/MAX. A register with range [0x100, 0x200] (which lies entirely in the gap of the wrapping range) would pass the min/max check despite having no overlap with the actual cnum arc. This commit replaces min/max comparison with cnum{32,64}_is_subset() operation. The operation implementation is verified using cbmc model checker in [1]. [1] https://github.com/eddyz87/cnum-verif/ Fixes: bbc631085503 ("bpf: replace min/max fields with struct cnum{32,64}") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260425-cnum-range-within-v1-1-2fdca70cb09d@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/cnum.h | 2 ++ kernel/bpf/cnum_defs.h | 14 ++++++++++++++ kernel/bpf/states.c | 11 +++-------- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/linux/cnum.h b/include/linux/cnum.h index a7259b105b45..49b7d0c7645d 100644 --- a/include/linux/cnum.h +++ b/include/linux/cnum.h @@ -48,6 +48,7 @@ bool cnum32_is_const(struct cnum32 cnum); bool cnum32_is_empty(struct cnum32 cnum); struct cnum32 cnum32_add(struct cnum32 a, struct cnum32 b); struct cnum32 cnum32_negate(struct cnum32 a); +bool cnum32_is_subset(struct cnum32 outer, struct cnum32 inner); /* Same as cnum32 but for 64-bit ranges */ struct cnum64 { @@ -73,6 +74,7 @@ bool cnum64_is_const(struct cnum64 cnum); bool cnum64_is_empty(struct cnum64 cnum); struct cnum64 cnum64_add(struct cnum64 a, struct cnum64 b); struct cnum64 cnum64_negate(struct cnum64 a); +bool cnum64_is_subset(struct cnum64 outer, struct cnum64 inner); struct cnum32 cnum32_from_cnum64(struct cnum64 cnum); struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b); diff --git a/kernel/bpf/cnum_defs.h b/kernel/bpf/cnum_defs.h index 3ebd8f723dbb..1f232138b6e9 100644 --- a/kernel/bpf/cnum_defs.h +++ b/kernel/bpf/cnum_defs.h @@ -220,6 
+220,20 @@ bool FN(is_const)(struct cnum_t cnum) return cnum.size == 0; } +bool FN(is_subset)(struct cnum_t bigger, struct cnum_t smaller) +{ + if (FN(is_empty(smaller))) + return true; + if (FN(is_empty(bigger))) + return false; + /* rotate both arcs such that 'bigger' starts at origin, hence does not overflow */ + smaller.base -= bigger.base; + bigger.base = 0; + if (FN(urange_overflow)(smaller) && bigger.size < UT_MAX) + return false; + return smaller.base + smaller.size <= bigger.size; +} + #undef EMPTY #undef cnum_t #undef ut diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index a78ae891b743..bd9c22945050 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -2,6 +2,7 @@ /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ #include #include +#include #include #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) @@ -301,14 +302,8 @@ int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_s static bool range_within(const struct bpf_reg_state *old, const struct bpf_reg_state *cur) { - return reg_umin(old) <= reg_umin(cur) && - reg_umax(old) >= reg_umax(cur) && - reg_smin(old) <= reg_smin(cur) && - reg_smax(old) >= reg_smax(cur) && - reg_u32_min(old) <= reg_u32_min(cur) && - reg_u32_max(old) >= reg_u32_max(cur) && - reg_s32_min(old) <= reg_s32_min(cur) && - reg_s32_max(old) >= reg_s32_max(cur); + return cnum64_is_subset(old->r64, cur->r64) && + cnum32_is_subset(old->r32, cur->r32); } /* If in the old state two registers had the same id, then they need to have -- cgit v1.2.3 From af469e10b4bc1446391514f69eeede843f29cf9c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Sat, 25 Apr 2026 15:48:24 -0700 Subject: selftests/bpf: a test for proper cnums compare in is_state_visited() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test case demonstrating a bug in cnum comparison logic fixed by previous commit. 
A pruning point is reached with r0 in two states: 1. 32-bit range of [0x7FFFFFF0, U32_MAX] ∪ [0, 0x10] 2. 32-bit range of [0x100, 0x200] At the pruning point the buggy is_state_visited() logic would assume range (2) to be a subset of (1) and fail to explore the path performing division by zero. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260425-cnum-range-within-v1-2-2fdca70cb09d@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_bounds.c | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 5dd243e653c9..a3e4c0945137 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -2267,4 +2267,31 @@ __naked void deduce64_from_32_wrapping_32bit(void) : __clobber_all); } +/* Check that range_within() compares cnum ranges, not min/max projections. 
*/ +SEC("socket") +__failure __msg("div by zero") +__flag(BPF_F_TEST_STATE_FREQ) +__naked void range_within_cnum_cross_both_boundaries(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = 0x80000020; \ + if r0 > r1 goto 1f; \ + r0 += 0x7FFFFFF0; /* PATH 1 */ \ + goto 2f; \ +1: call %[bpf_get_prandom_u32]; /* PATH 2 */ \ + if r0 < 0x100 goto 3f; \ + if r0 > 0x200 goto 3f; \ +2: /* PATH 1: r0 ∈ [0x7FFFFFF0, U32_MAX] ∪ [0, 0x10] */ \ + /* PATH 2: r0 ∈ [0x100, 0x200] */ \ + if r0 != 0x100 goto 3f; /* True only on PATH 2 */ \ + r0 /= 0; \ +3: exit; \ + " + :: __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 79b8ebcbe483fee401e1b91dd32470348d9aa5b8 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 27 Apr 2026 12:22:05 +0100 Subject: bpf: Export cnum_umin/umax() helpers for netronome driver ERROR: modpost: "cnum64_umin" [drivers/net/ethernet/netronome/nfp/nfp.ko] undefined! ERROR: modpost: "cnum64_umax" [drivers/net/ethernet/netronome/nfp/nfp.ko] undefined! Export symbols for these references. Reported-by: Kaitao Cheng Fixes: bbc631085503 ("bpf: replace min/max fields with struct cnum{32,64}") Signed-off-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260427112205.1346733-1-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cnum_defs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/cnum_defs.h b/kernel/bpf/cnum_defs.h index 1f232138b6e9..a90e317e3578 100644 --- a/kernel/bpf/cnum_defs.h +++ b/kernel/bpf/cnum_defs.h @@ -6,6 +6,7 @@ #endif #include +#include #include #include #include @@ -48,11 +49,13 @@ ut FN(umin)(struct cnum_t cnum) { return FN(urange_overflow)(cnum) ? 0 : cnum.base; } +EXPORT_SYMBOL_GPL(FN(umin)); ut FN(umax)(struct cnum_t cnum) { return FN(urange_overflow)(cnum) ? 
UT_MAX : cnum.base + cnum.size; } +EXPORT_SYMBOL_GPL(FN(umax)); /* True if this cnum represents two signed ranges. */ static inline bool FN(srange_overflow)(struct cnum_t cnum) -- cgit v1.2.3 From cfeddb4244268c246d67cbe50269a9475cb112fc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 24 Apr 2026 17:39:05 +0200 Subject: bpf: Remove obsolete WARN_ON call The WARN_ON call in bpf_trampoline_update could never hit, because we direct the code path with (total == 0) to out label, which effectively skips the WARN_ON call. The WARN_ON made sense back then when it checked tr->selector, but now with total being set just inside the function it's useless. Signed-off-by: Jiri Olsa Acked-by: Song Liu Link: https://lore.kernel.org/r/20260424153905.354922-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index f02254a21585..a4298a25d4ba 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -685,7 +685,6 @@ again: if (err) goto out_free; - WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ err = modify_fentry(tr, orig_flags, tr->cur_image->image, -- cgit v1.2.3 From 9f5b3ffc3f1dac7204e32eeeff84bc5cc55c393e Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 28 Apr 2026 06:42:52 -0700 Subject: selftests/bpf: Rename libarena malloc/free methods The s390 architecture uses the token "free" for an enum, conflicting with the malloc/free definitions. Rename the calls to arena_malloc and arena_free instead to prevent collisions. 
Reported-by: Ihor Solodrai Signed-off-by: Emil Tsalapatis Fixes: 86426a28c52d ("selftests/bpf: Add buddy allocator for libarena") Acked-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260428134252.2783519-1-etsal@meta.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/libarena/Makefile | 2 -- tools/testing/selftests/bpf/libarena/include/libarena/common.h | 6 +++--- tools/testing/selftests/bpf/libarena/src/common.bpf.c | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/libarena/Makefile b/tools/testing/selftests/bpf/libarena/Makefile index 3c695f9c0054..5e2ab514805e 100644 --- a/tools/testing/selftests/bpf/libarena/Makefile +++ b/tools/testing/selftests/bpf/libarena/Makefile @@ -51,8 +51,6 @@ ASAN_FLAGS += -mllvm -asan-destructor-kind=none override BPF_CFLAGS += -DENABLE_ATOMICS_TESTS override BPF_CFLAGS += -O2 -g override BPF_CFLAGS += -Wno-incompatible-pointer-types-discards-qualifiers -# Required to define our own arena-based free() -override BPF_CFLAGS += -Wno-incompatible-library-redeclaration # Required for suppressing harmless vmlinux.h-related warnings. 
override BPF_CFLAGS += -Wno-missing-declarations override BPF_CFLAGS += $(INCLUDES) diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/common.h b/tools/testing/selftests/bpf/libarena/include/libarena/common.h index e54cb7b869bd..ca1a6c1d6477 100644 --- a/tools/testing/selftests/bpf/libarena/include/libarena/common.h +++ b/tools/testing/selftests/bpf/libarena/include/libarena/common.h @@ -48,9 +48,9 @@ extern volatile u64 asan_violated; int arena_fls(__u64 word); -u64 malloc_internal(size_t size); -#define malloc(size) ((void __arena *)malloc_internal((size))) -void free(void __arena *ptr); +u64 arena_malloc_internal(size_t size); +#define arena_malloc(size) ((void __arena *)arena_malloc_internal((size))) +void arena_free(void __arena *ptr); /* * The verifier associates arenas with programs by checking LD.IMM diff --git a/tools/testing/selftests/bpf/libarena/src/common.bpf.c b/tools/testing/selftests/bpf/libarena/src/common.bpf.c index e5da1e37e83e..544bf9e1cb38 100644 --- a/tools/testing/selftests/bpf/libarena/src/common.bpf.c +++ b/tools/testing/selftests/bpf/libarena/src/common.bpf.c @@ -38,12 +38,12 @@ __weak int arena_buddy_reset(void) return buddy_init(&buddy); } -__weak u64 malloc_internal(size_t size) +__weak u64 arena_malloc_internal(size_t size) { return buddy_alloc_internal(&buddy, size); } -__weak void free(void __arg_arena __arena *ptr) +__weak void arena_free(void __arg_arena __arena *ptr) { buddy_free_internal(&buddy, (u64)ptr); } -- cgit v1.2.3 From f603e84ab7918db6470c0b06b46ece7fbdb71e9a Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 30 Apr 2026 10:44:28 +0200 Subject: bpf: Print breakdown of insns processed by subprogs When using global functions (i.e. subprogs), the verifier performs function-by-function verification. In that case, the sum of the instructions processed in each global function and in the main program counts towards the 1 million instructions limit. Only that sum is reported in the verifier logs. 
While starting to use global functions in Cilium (finally!), we found it can be useful to have the breakdown per global function, to understand exactly where the budget is currently spent. This patch implements this breakdown, under BPF_LOG_STATS, as done for the stack depths. When iterating over subprogs, we need to skip the hidden subprogs at the end because they don't have a corresponding func_info_aux entry and calling bpf_subprog_is_global() would result in an OOB access. Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/bpf/5590f9c67e614ec9054d0c7e74e87cc690a52c56.1777538384.git.paul.chaignon@gmail.com Signed-off-by: Kumar Kartikeya Dwivedi --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 25 ++++++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 101ca6cc5424..976e2b2f40e8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -779,6 +779,7 @@ struct bpf_subprog_info { u32 exit_idx; /* Index of one of the BPF_EXIT instructions in this subprogram */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; + u32 insn_processed; /* offsets in range [stack_depth .. fastcall_stack_off) * are used for bpf_fastcall spills and fills. 
*/ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 03f9e16c2abe..11054ad89c14 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18215,6 +18215,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env) struct bpf_prog_aux *aux = env->prog->aux; struct bpf_func_info_aux *sub_aux; int i, ret, new_cnt; + u32 insn_processed; if (!aux->func_info) return 0; @@ -18229,6 +18230,8 @@ again: if (!bpf_subprog_is_global(env, i)) continue; + insn_processed = env->insn_processed; + sub_aux = subprog_aux(env, i); if (!sub_aux->called || sub_aux->verified) continue; @@ -18236,6 +18239,7 @@ again: env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); ret = do_check_common(env, i); + env->subprog_info[i].insn_processed = env->insn_processed - insn_processed; if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { @@ -18262,10 +18266,12 @@ again: static int do_check_main(struct bpf_verifier_env *env) { + u32 insn_processed = env->insn_processed; int ret; env->insn_idx = 0; ret = do_check_common(env, 0); + env->subprog_info[0].insn_processed = env->insn_processed - insn_processed; if (!ret) env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return ret; @@ -18274,19 +18280,20 @@ static int do_check_main(struct bpf_verifier_env *env) static void print_verification_stats(struct bpf_verifier_env *env) { - int i; + /* Skip over hidden subprogs which are not verified. 
*/ + int i, subprog_cnt = env->subprog_cnt - env->hidden_subprog_cnt; if (env->log.level & BPF_LOG_STATS) { verbose(env, "verification time %lld usec\n", div_u64(env->verification_time, 1000)); - verbose(env, "stack depth "); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } + verbose(env, "stack depth %d", env->subprog_info[0].stack_depth); + for (i = 1; i < subprog_cnt; i++) + verbose(env, "+%d", env->subprog_info[i].stack_depth); + verbose(env, "\n"); + verbose(env, "insns processed %d", env->subprog_info[0].insn_processed); + for (i = 1; i < subprog_cnt; i++) + if (bpf_subprog_is_global(env, i)) + verbose(env, "+%d", env->subprog_info[i].insn_processed); verbose(env, "\n"); } verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " -- cgit v1.2.3 From 2ca6723a5f7b68c739dba47b2639e3eaa7884b09 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 30 Apr 2026 10:45:24 +0200 Subject: selftests/bpf: Test insns processed breakdown This patch covers in global subprog selftests the new verifier log with the breakdown of instructions processed by global subprogs. The test ensures the log line is present and that it has the right number of subcounts. 
Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/bpf/3a5157f4573edaa8846f6fc4041f715136f693b1.1777538384.git.paul.chaignon@gmail.com Signed-off-by: Kumar Kartikeya Dwivedi --- tools/testing/selftests/bpf/progs/verifier_global_subprogs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index 1e08aff7532e..dc09d0e2d8ad 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -46,12 +46,13 @@ __noinline long global_dead(void) } SEC("?raw_tp") -__success __log_level(2) +__success __log_level(6) /* main prog is validated completely first */ __msg("('global_calls_good_only') is global and assumed valid.") /* eventually global_good() is transitively validated as well */ __msg("Validating global_good() func") __msg("('global_good') is safe for any args that match its prototype") +__msg("insns processed {{[0-9]+\\+[0-9]+\\+[0-9]+$}}") int chained_global_func_calls_success(void) { int sum = 0; -- cgit v1.2.3 From 2b6f0a1e4c9e0f618179c4a108249cc4a0442d11 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 27 Apr 2026 16:22:58 -0700 Subject: selftests/bpf: Add bench_force_done() for early benchmark completion The bench framework waits for duration_sec to elapse before collecting results. Benchmarks that know exactly how many samples they need can call bench_force_done() to signal completion early, avoiding wasted wall-clock time. Also refactor collect_measurements() to reuse bench_force_done() instead of open-coding the same mutex/cond_signal sequence. 
Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260427232313.1582588-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 14 +++++++++----- tools/testing/selftests/bpf/bench.h | 1 + 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 029b3e21f438..47a4e72208d6 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -741,6 +741,13 @@ static void setup_benchmark(void) static pthread_mutex_t bench_done_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t bench_done = PTHREAD_COND_INITIALIZER; +void bench_force_done(void) +{ + pthread_mutex_lock(&bench_done_mtx); + pthread_cond_signal(&bench_done); + pthread_mutex_unlock(&bench_done_mtx); +} + static void collect_measurements(long delta_ns) { int iter = state.res_cnt++; struct bench_res *res = &state.results[iter]; @@ -750,11 +757,8 @@ static void collect_measurements(long delta_ns) { if (bench->report_progress) bench->report_progress(iter, res, delta_ns); - if (iter == env.duration_sec + env.warmup_sec) { - pthread_mutex_lock(&bench_done_mtx); - pthread_cond_signal(&bench_done); - pthread_mutex_unlock(&bench_done_mtx); - } + if (iter == env.duration_sec + env.warmup_sec) + bench_force_done(); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index 7cf21936e7ed..89a3fc72f70e 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -70,6 +70,7 @@ extern struct env env; extern const struct bench *bench; void setup_libbpf(void); +void bench_force_done(void); void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns); void hits_drops_report_final(struct bench_res res[], int res_cnt); void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns); -- cgit v1.2.3 From 
08158c111d7d87d88269d9f873a2fc54b87bcb99 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 27 Apr 2026 16:22:59 -0700 Subject: selftests/bpf: Add BPF batch-timing library Add a reusable timing library for BPF benchmarks that need to measure BPF program execution time. The BPF side (progs/bench_bpf_timing.bpf.h) provides per-CPU sample arrays and BENCH_BPF_LOOP(), a macro that brackets batch_iters iterations with bpf_ktime_get_ns() reads and records the elapsed time. One extra untimed iteration runs afterward for output validation. The userspace side (benchs/bench_bpf_timing.c) collects samples from the skeleton BSS, computes percentile statistics, and auto-calibrates batch_iters to target ~10 ms per batch. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260427232313.1582588-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 + tools/testing/selftests/bpf/bench_bpf_timing.h | 50 ++++ .../selftests/bpf/benchs/bench_bpf_timing.c | 272 +++++++++++++++++++++ .../selftests/bpf/progs/bench_bpf_timing.bpf.h | 69 ++++++ 4 files changed, 393 insertions(+) create mode 100644 tools/testing/selftests/bpf/bench_bpf_timing.h create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_timing.c create mode 100644 tools/testing/selftests/bpf/progs/bench_bpf_timing.bpf.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 97ee61f2ade5..3d516f10f29e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -906,6 +906,7 @@ $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench_sockmap.o: $(OUTPUT)/bench_sockmap_prog.skel.h $(OUTPUT)/bench_lpm_trie_map.o: $(OUTPUT)/lpm_trie_bench.skel.h $(OUTPUT)/lpm_trie_map.skel.h +$(OUTPUT)/bench_bpf_timing.o: bench_bpf_timing.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm 
$(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -928,6 +929,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_bpf_crypto.o \ $(OUTPUT)/bench_sockmap.o \ $(OUTPUT)/bench_lpm_trie_map.o \ + $(OUTPUT)/bench_bpf_timing.o \ $(OUTPUT)/usdt_1.o \ $(OUTPUT)/usdt_2.o \ # diff --git a/tools/testing/selftests/bpf/bench_bpf_timing.h b/tools/testing/selftests/bpf/bench_bpf_timing.h new file mode 100644 index 000000000000..6ef23b6d6639 --- /dev/null +++ b/tools/testing/selftests/bpf/bench_bpf_timing.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#ifndef __BENCH_BPF_TIMING_H__ +#define __BENCH_BPF_TIMING_H__ + +#include +#include +#include "bench.h" + +#ifndef BENCH_NR_SAMPLES +#define BENCH_NR_SAMPLES 4096 +#endif +#ifndef BENCH_NR_CPUS +#define BENCH_NR_CPUS 256 +#endif + +typedef void (*bpf_bench_run_fn)(void *ctx); + +struct bpf_bench_timing { + __u64 (*samples)[BENCH_NR_SAMPLES]; /* skel->bss->timing_samples */ + __u32 *idx; /* skel->bss->timing_idx */ + volatile __u32 *timing_enabled; /* &skel->bss->timing_enabled */ + volatile __u32 *batch_iters_bss; /* &skel->bss->batch_iters */ + __u32 batch_iters; + __u32 target_samples; + __u32 nr_cpus; + int warmup_ticks; + bool done; + bool machine_readable; +}; + +#define BENCH_TIMING_INIT(t, skel, iters) do { \ + (t)->samples = (skel)->bss->timing_samples; \ + (t)->idx = (skel)->bss->timing_idx; \ + (t)->timing_enabled = &(skel)->bss->timing_enabled; \ + (t)->batch_iters_bss = &(skel)->bss->batch_iters; \ + (t)->batch_iters = (iters); \ + (t)->target_samples = 200; \ + (t)->nr_cpus = env.nr_cpus; \ + (t)->warmup_ticks = 0; \ + (t)->done = false; \ + (t)->machine_readable = false; \ +} while (0) + +void bpf_bench_timing_measure(struct bpf_bench_timing *t, struct bench_res *res); +void bpf_bench_timing_report(struct bpf_bench_timing *t, const char *name, const char *desc); +void bpf_bench_calibrate(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void 
*ctx); + +#endif /* __BENCH_BPF_TIMING_H__ */ diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c b/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c new file mode 100644 index 000000000000..75a39da69655 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include "bench_bpf_timing.h" +#include "bpf_util.h" + +struct timing_stats { + double min, max; + double median, p99; + double mean, stddev; + int count; +}; + +static int cmp_double(const void *a, const void *b) +{ + double da = *(const double *)a; + double db = *(const double *)b; + + if (da < db) + return -1; + if (da > db) + return 1; + return 0; +} + +static double percentile(const double *sorted, int n, double pct) +{ + int idx = (int)(n * pct / 100.0); + + if (idx >= n) + idx = n - 1; + return sorted[idx]; +} + +static int collect_samples(struct bpf_bench_timing *t, + double *out, int max_out) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u32 timed_iters = t->batch_iters; + int total = 0; + + if (nr_cpus > BENCH_NR_CPUS) + nr_cpus = BENCH_NR_CPUS; + + for (unsigned int cpu = 0; cpu < nr_cpus; cpu++) { + __u32 count = t->idx[cpu]; + + if (count > BENCH_NR_SAMPLES) + count = BENCH_NR_SAMPLES; + + for (__u32 i = 0; i < count && total < max_out; i++) { + __u64 sample = t->samples[cpu][i]; + + if (sample == 0) + continue; + out[total++] = (double)sample / timed_iters; + } + } + + qsort(out, total, sizeof(double), cmp_double); + return total; +} + +static void compute_stats(const double *sorted, int n, + struct timing_stats *s) +{ + double sum = 0, var_sum = 0; + + memset(s, 0, sizeof(*s)); + s->count = n; + + if (n == 0) + return; + + s->min = sorted[0]; + s->max = sorted[n - 1]; + s->median = sorted[n / 2]; + s->p99 = percentile(sorted, n, 99); + + for (int i = 0; i < n; i++) + sum += sorted[i]; + 
s->mean = sum / n; + + for (int i = 0; i < n; i++) { + double d = sorted[i] - s->mean; + + var_sum += d * d; + } + s->stddev = n > 1 ? sqrt(var_sum / (n - 1)) : 0; +} + +void bpf_bench_timing_measure(struct bpf_bench_timing *t, struct bench_res *res) +{ + unsigned int nr_cpus; + __u32 total_samples; + int i; + + t->warmup_ticks++; + + if (t->warmup_ticks < env.warmup_sec) + return; + + if (t->warmup_ticks == env.warmup_sec) { + *t->timing_enabled = 1; + return; + } + + nr_cpus = bpf_num_possible_cpus(); + if (nr_cpus > BENCH_NR_CPUS) + nr_cpus = BENCH_NR_CPUS; + + total_samples = 0; + for (i = 0; i < (int)nr_cpus; i++) { + __u32 cnt = t->idx[i]; + + if (cnt > BENCH_NR_SAMPLES) + cnt = BENCH_NR_SAMPLES; + total_samples += cnt; + } + + if (total_samples >= (__u32)env.producer_cnt * t->target_samples && !t->done) { + t->done = true; + *t->timing_enabled = 0; + bench_force_done(); + } +} + +void bpf_bench_timing_report(struct bpf_bench_timing *t, const char *name, const char *description) +{ + int max_out = BENCH_NR_CPUS * BENCH_NR_SAMPLES; + struct timing_stats s; + double *all; + int total; + + all = calloc(max_out, sizeof(*all)); + if (!all) { + fprintf(stderr, "failed to allocate timing buffer\n"); + return; + } + + total = collect_samples(t, all, max_out); + + if (total == 0) { + printf("No timing samples collected.\n"); + free(all); + return; + } + + compute_stats(all, total, &s); + + if (t->machine_readable) { + printf("RESULT scenario=%s samples=%d median=%.2f stddev=%.2f cv=%.2f min=%.2f " + "p99=%.2f max=%.2f\n", name, total, s.median, s.stddev, + s.mean > 0 ? 
s.stddev / s.mean * 100.0 : 0.0, s.min, s.p99, s.max); + } else { + printf("%s: median %.2f ns/op, stddev %.2f, p99 %.2f (%d samples)\n", name, + s.median, s.stddev, s.p99, total); + } + + free(all); +} + +#define CALIBRATE_SEED_BATCH 100 +#define CALIBRATE_MIN_BATCH 100 +#define CALIBRATE_MAX_BATCH 10000000 +#define CALIBRATE_TARGET_MS 10 +#define CALIBRATE_RUNS 5 +#define PROPORTIONALITY_TOL 0.05 /* 5% */ + +static void reset_timing(struct bpf_bench_timing *t) +{ + *t->timing_enabled = 0; + memset(t->samples, 0, sizeof(__u64) * BENCH_NR_CPUS * BENCH_NR_SAMPLES); + memset(t->idx, 0, sizeof(__u32) * BENCH_NR_CPUS); +} + +static __u64 measure_elapsed(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx, + __u32 iters, int runs) +{ + __u64 buf[CALIBRATE_RUNS]; + int n = 0, i, j; + + reset_timing(t); + *t->batch_iters_bss = iters; + *t->timing_enabled = 1; + + for (i = 0; i < runs; i++) + run_fn(run_ctx); + + *t->timing_enabled = 0; + + for (i = 0; i < BENCH_NR_CPUS && n < runs; i++) { + __u32 cnt = t->idx[i]; + + for (j = 0; j < (int)cnt && n < runs; j++) + buf[n++] = t->samples[i][j]; + } + + if (n == 0) + return 0; + + for (i = 1; i < n; i++) { + __u64 key = buf[i]; + + j = i - 1; + while (j >= 0 && buf[j] > key) { + buf[j + 1] = buf[j]; + j--; + } + buf[j + 1] = key; + } + + return buf[n / 2]; +} + +static __u32 compute_batch_iters(__u64 per_op_ns) +{ + __u64 target_ns = (__u64)CALIBRATE_TARGET_MS * 1000000ULL; + __u32 iters; + + if (per_op_ns == 0) + return CALIBRATE_MIN_BATCH; + + iters = target_ns / per_op_ns; + + if (iters < CALIBRATE_MIN_BATCH) + iters = CALIBRATE_MIN_BATCH; + if (iters > CALIBRATE_MAX_BATCH) + iters = CALIBRATE_MAX_BATCH; + + return iters; +} + +void bpf_bench_calibrate(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx) +{ + __u64 elapsed, per_op_ns; + __u64 time_n, time_2n; + double ratio; + + elapsed = measure_elapsed(t, run_fn, run_ctx, CALIBRATE_SEED_BATCH, CALIBRATE_RUNS); + if (elapsed == 0) { + 
fprintf(stderr, "calibration: no timing samples, using default\n"); + t->batch_iters = 10000; + *t->batch_iters_bss = t->batch_iters; + reset_timing(t); + return; + } + + per_op_ns = elapsed / CALIBRATE_SEED_BATCH; + t->batch_iters = compute_batch_iters(per_op_ns); + + time_n = measure_elapsed(t, run_fn, run_ctx, t->batch_iters, CALIBRATE_RUNS); + time_2n = measure_elapsed(t, run_fn, run_ctx, t->batch_iters * 2, CALIBRATE_RUNS); + + if (time_n > 0 && time_2n > 0) { + ratio = (double)time_2n / (double)time_n; + + if (fabs(ratio - 2.0) / 2.0 > PROPORTIONALITY_TOL) + fprintf(stderr, + "WARNING: proportionality check failed (2N/N ratio=%.3f, " + "expected=2.000, error=%.1f%%)\n System noise may be affecting " + "results.\n", + ratio, fabs(ratio - 2.0) / 2.0 * 100.0); + } + + *t->batch_iters_bss = t->batch_iters; + reset_timing(t); +} diff --git a/tools/testing/selftests/bpf/progs/bench_bpf_timing.bpf.h b/tools/testing/selftests/bpf/progs/bench_bpf_timing.bpf.h new file mode 100644 index 000000000000..6a1ad75f1fd7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bench_bpf_timing.bpf.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#ifndef __BENCH_BPF_TIMING_BPF_H__ +#define __BENCH_BPF_TIMING_BPF_H__ + +#include +#include +#include +#include + +#ifndef BENCH_NR_SAMPLES +#define BENCH_NR_SAMPLES 4096 +#endif +#ifndef BENCH_NR_CPUS +#define BENCH_NR_CPUS 256 +#endif +#define BENCH_CPU_MASK (BENCH_NR_CPUS - 1) + +__u64 timing_samples[BENCH_NR_CPUS][BENCH_NR_SAMPLES]; +__u32 timing_idx[BENCH_NR_CPUS]; + +volatile __u32 batch_iters; +volatile __u32 timing_enabled; + +static __always_inline void bench_record_sample(__u64 elapsed_ns) +{ + __u32 cpu, idx; + + if (!timing_enabled) + return; + + cpu = bpf_get_smp_processor_id() & BENCH_CPU_MASK; + idx = timing_idx[cpu]; + + if (idx >= BENCH_NR_SAMPLES) + return; + + timing_samples[cpu][idx] = elapsed_ns; + timing_idx[cpu] = idx + 1; +} + +/* + * @body: expression to time; return value (int) stored in __bench_result. + * @reset: undo body's side-effects so each iteration starts identically. + * May reference __bench_result. Use ({}) for empty reset. + * + * Runs batch_iters timed iterations, then one untimed iteration whose + * return value the macro evaluates to (for validation). + */ +#define BENCH_BPF_LOOP(body, reset) ({ \ + __u64 __bench_start = bpf_ktime_get_ns(); \ + __u32 __bench_i; \ + int __bench_result; \ + \ + for (__bench_i = 0; \ + __bench_i < batch_iters && can_loop; \ + __bench_i++) { \ + __bench_result = (body); \ + reset; \ + } \ + \ + bench_record_sample(bpf_ktime_get_ns() - __bench_start); \ + \ + __bench_result = (body); \ + __bench_result; \ +}) + +#endif /* __BENCH_BPF_TIMING_BPF_H__ */ -- cgit v1.2.3 From dcf11479c2a8d3520953e8366f587ec2a36505a8 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 27 Apr 2026 16:23:00 -0700 Subject: selftests/bpf: Add bpf-nop benchmark for timing overhead baseline Add a minimal benchmark that measures the overhead of the batch-timing infrastructure itself. 
The BPF program runs an empty BENCH_BPF_LOOP body (~1.5-2 ns/op), establishing the floor cost that all timing-library benchmarks include. [root@virtme-ng tools/testing/selftests/bpf]# sudo ./bench -a -p8 bpf-nop Setting up benchmark 'bpf-nop'... Benchmark 'bpf-nop' started. bpf-nop: median 1.82 ns/op, stddev 0.01, p99 1.86 (1754 samples) Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260427232313.1582588-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 + tools/testing/selftests/bpf/bench.c | 2 + tools/testing/selftests/bpf/benchs/bench_bpf_nop.c | 84 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/bpf_nop_bench.c | 14 ++++ 4 files changed, 102 insertions(+) create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_nop.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_nop_bench.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3d516f10f29e..97f9fbd41244 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -906,6 +906,7 @@ $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench_sockmap.o: $(OUTPUT)/bench_sockmap_prog.skel.h $(OUTPUT)/bench_lpm_trie_map.o: $(OUTPUT)/lpm_trie_bench.skel.h $(OUTPUT)/lpm_trie_map.skel.h +$(OUTPUT)/bench_bpf_nop.o: $(OUTPUT)/bpf_nop_bench.skel.h bench_bpf_timing.h $(OUTPUT)/bench_bpf_timing.o: bench_bpf_timing.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm @@ -930,6 +931,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_sockmap.o \ $(OUTPUT)/bench_lpm_trie_map.o \ $(OUTPUT)/bench_bpf_timing.o \ + $(OUTPUT)/bench_bpf_nop.o \ $(OUTPUT)/usdt_1.o \ $(OUTPUT)/usdt_2.o \ # diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 47a4e72208d6..1696de5d6780 100644 --- a/tools/testing/selftests/bpf/bench.c 
+++ b/tools/testing/selftests/bpf/bench.c @@ -575,6 +575,7 @@ extern const struct bench bench_lpm_trie_insert; extern const struct bench bench_lpm_trie_update; extern const struct bench bench_lpm_trie_delete; extern const struct bench bench_lpm_trie_free; +extern const struct bench bench_bpf_nop; static const struct bench *benchs[] = { &bench_count_global, @@ -653,6 +654,7 @@ static const struct bench *benchs[] = { &bench_lpm_trie_update, &bench_lpm_trie_delete, &bench_lpm_trie_free, + &bench_bpf_nop, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_nop.c b/tools/testing/selftests/bpf/benchs/bench_bpf_nop.c new file mode 100644 index 000000000000..e2d8c2ccf384 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_nop.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include "bench.h" +#include "bench_bpf_timing.h" +#include "bpf_nop_bench.skel.h" +#include "bpf_util.h" + +static struct ctx { + struct bpf_nop_bench *skel; + struct bpf_bench_timing timing; + int prog_fd; +} ctx; + +static void nop_validate(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "benchmark doesn't support consumers\n"); + exit(1); + } +} + +static void nop_run_once(void *unused __always_unused) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + + bpf_prog_test_run_opts(ctx.prog_fd, &topts); +} + +static void nop_setup(void) +{ + struct bpf_nop_bench *skel; + int err; + + setup_libbpf(); + + skel = bpf_nop_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + err = bpf_nop_bench__load(skel); + if (err) { + fprintf(stderr, "failed to load skeleton: %s\n", strerror(-err)); + bpf_nop_bench__destroy(skel); + exit(1); + } + + ctx.skel = skel; + ctx.prog_fd = bpf_program__fd(skel->progs.bench_nop); + + BENCH_TIMING_INIT(&ctx.timing, skel, 0); + bpf_bench_calibrate(&ctx.timing, nop_run_once, NULL); + + env.duration_sec = 600; 
+} + +static void *nop_producer(void *input) +{ + while (true) + nop_run_once(NULL); + + return NULL; +} + +static void nop_measure(struct bench_res *res) +{ + bpf_bench_timing_measure(&ctx.timing, res); +} + +static void nop_report_final(struct bench_res res[], int res_cnt) +{ + bpf_bench_timing_report(&ctx.timing, "bpf-nop", NULL); +} + +const struct bench bench_bpf_nop = { + .name = "bpf-nop", + .validate = nop_validate, + .setup = nop_setup, + .producer_thread = nop_producer, + .measure = nop_measure, + .report_final = nop_report_final, +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_nop_bench.c b/tools/testing/selftests/bpf/progs/bpf_nop_bench.c new file mode 100644 index 000000000000..01ed284c1bb3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_nop_bench.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bench_bpf_timing.bpf.h" + +SEC("syscall") +int bench_nop(void *ctx) +{ + return BENCH_BPF_LOOP(0, ({})); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6b4003a7b333602fb24b514a27067e7a2c98136e Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 27 Apr 2026 16:23:01 -0700 Subject: selftests/bpf: Add XDP load-balancer common definitions Add the shared header for the XDP load-balancer benchmark. This defines the data structures used by both the BPF program and userspace: flow_key, vip_definition, real_definition, and the stats/control structures. Also provides the encapsulation source-address helpers shared between the BPF datapath (for encap) and userspace (for building expected output packets used in validation). 
Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260427232313.1582588-5-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/xdp_lb_bench_common.h | 112 ++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 tools/testing/selftests/bpf/xdp_lb_bench_common.h diff --git a/tools/testing/selftests/bpf/xdp_lb_bench_common.h b/tools/testing/selftests/bpf/xdp_lb_bench_common.h new file mode 100644 index 000000000000..aed20a963701 --- /dev/null +++ b/tools/testing/selftests/bpf/xdp_lb_bench_common.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#ifndef XDP_LB_BENCH_COMMON_H +#define XDP_LB_BENCH_COMMON_H + +#define F_IPV6 (1 << 0) +#define F_LRU_BYPASS (1 << 1) + +#define CH_RING_SIZE 65537 /* per-VIP consistent hash ring slots */ +#define MAX_VIPS 16 +#define CH_RINGS_SIZE (MAX_VIPS * CH_RING_SIZE) +#define MAX_REALS 512 +#define DEFAULT_LRU_SIZE 100000 /* connection tracking cache size */ +#define ONE_SEC 1000000000U /* 1 sec in nanosec */ +#define MAX_CONN_RATE 100000000 /* high enough to never trigger in bench */ +#define LRU_UDP_TIMEOUT 30000000000ULL /* 30 sec in nanosec */ +#define PCKT_FRAGMENTED 0x3FFF +#define KNUTH_HASH_MULT 2654435761U +#define IPIP_V4_PREFIX 4268 /* 172.16/12 in network order */ +#define IPIP_V6_PREFIX1 1 /* 0100::/64 (RFC 6666 discard) */ +#define IPIP_V6_PREFIX2 0 +#define IPIP_V6_PREFIX3 0 + +/* Stats indices (0..MAX_VIPS-1 are per-VIP packet/byte counters) */ +#define STATS_LRU (MAX_VIPS + 0) /* v1: total VIP packets, v2: LRU misses */ +#define STATS_XDP_TX (MAX_VIPS + 1) +#define STATS_XDP_PASS (MAX_VIPS + 2) +#define STATS_XDP_DROP (MAX_VIPS + 3) +#define STATS_NEW_CONN (MAX_VIPS + 4) /* v1: conn count, v2: last reset ts */ +#define STATS_LRU_MISS (MAX_VIPS + 5) /* v1: TCP LRU misses */ +#define STATS_SIZE (MAX_VIPS + 6) + +#ifdef __BPF__ +#define lb_htons(x) bpf_htons(x) +#define LB_INLINE 
static __always_inline +#else +#define lb_htons(x) htons(x) +#define LB_INLINE static inline +#endif + +LB_INLINE __be32 create_encap_ipv4_src(__u16 port, __be32 src) +{ + __u32 ip_suffix = lb_htons(port); + + ip_suffix <<= 16; + ip_suffix ^= src; + return (0xFFFF0000 & ip_suffix) | IPIP_V4_PREFIX; +} + +LB_INLINE void create_encap_ipv6_src(__u16 port, __be32 src, __be32 *saddr) +{ + saddr[0] = IPIP_V6_PREFIX1; + saddr[1] = IPIP_V6_PREFIX2; + saddr[2] = IPIP_V6_PREFIX3; + saddr[3] = src ^ port; +} + +struct flow_key { + union { + __be32 src; + __be32 srcv6[4]; + }; + union { + __be32 dst; + __be32 dstv6[4]; + }; + union { + __u32 ports; + __u16 port16[2]; + }; + __u8 proto; + __u8 pad[3]; +}; + +struct vip_definition { + union { + __be32 vip; + __be32 vipv6[4]; + }; + __u16 port; + __u8 proto; + __u8 pad; +}; + +struct vip_meta { + __u32 flags; + __u32 vip_num; +}; + +struct real_pos_lru { + __u32 pos; + __u64 atime; +}; + +struct real_definition { + __be32 dst; + __be32 dstv6[4]; + __u8 flags; +}; + +struct lb_stats { + __u64 v1; + __u64 v2; +}; + +struct ctl_value { + __u8 mac[6]; + __u8 pad[2]; +}; + +#endif /* XDP_LB_BENCH_COMMON_H */ -- cgit v1.2.3 From 4b4f2229104c9010005d50125ccbfb1b4be68be5 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 27 Apr 2026 16:23:02 -0700 Subject: selftests/bpf: Add XDP load-balancer BPF program Add the BPF datapath for the XDP load-balancer benchmark, a simplified L4 load-balancer inspired by katran. The pipeline: L3/L4 parse -> VIP lookup -> per-CPU LRU connection table or consistent-hash fallback -> real server lookup -> per-VIP and per-real stats -> IPIP/IP6IP6 encapsulation. TCP SYN forces the consistent-hash path (skipping LRU); TCP RST skips LRU insert to avoid polluting the table. process_packet() is marked __noinline so that the BENCH_BPF_LOOP reset block (which strips encapsulation) operates on valid packet pointers after bpf_xdp_adjust_head(). 
Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260427232313.1582588-6-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/xdp_lb_bench.c | 647 +++++++++++++++++++++++ 1 file changed, 647 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/xdp_lb_bench.c diff --git a/tools/testing/selftests/bpf/progs/xdp_lb_bench.c b/tools/testing/selftests/bpf/progs/xdp_lb_bench.c new file mode 100644 index 000000000000..b9fd848c035d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_lb_bench.c @@ -0,0 +1,647 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_compiler.h" +#include "xdp_lb_bench_common.h" +#include "bench_bpf_timing.bpf.h" + +#ifndef IPPROTO_FRAGMENT +#define IPPROTO_FRAGMENT 44 +#endif + +/* jhash helpers */ + +static inline __u32 rol32(__u32 word, unsigned int shift) +{ + return (word << shift) | (word >> ((-shift) & 31)); +} + +#define __jhash_mix(a, b, c) \ +{ \ + a -= c; a ^= rol32(c, 4); c += b; \ + b -= a; b ^= rol32(a, 6); a += c; \ + c -= b; c ^= rol32(b, 8); b += a; \ + a -= c; a ^= rol32(c, 16); c += b; \ + b -= a; b ^= rol32(a, 19); a += c; \ + c -= b; c ^= rol32(b, 4); b += a; \ +} + +#define __jhash_final(a, b, c) \ +{ \ + c ^= b; c -= rol32(b, 14); \ + a ^= c; a -= rol32(c, 11); \ + b ^= a; b -= rol32(a, 25); \ + c ^= b; c -= rol32(b, 16); \ + a ^= c; a -= rol32(c, 4); \ + b ^= a; b -= rol32(a, 14); \ + c ^= b; c -= rol32(b, 24); \ +} + +#define JHASH_INITVAL 0xdeadbeef + +static inline __u32 __jhash_nwords(__u32 a, __u32 b, __u32 c, __u32 initval) +{ + a += initval; + b += initval; + c += initval; + __jhash_final(a, b, c); + return c; +} + +static inline __u32 jhash_2words(__u32 a, __u32 b, __u32 initval) +{ + return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); +} + +static inline 
__u32 jhash2_4words(const __u32 *k, __u32 initval) +{ + __u32 a, b, c; + + a = b = c = JHASH_INITVAL + (4 << 2) + initval; + + a += k[0]; b += k[1]; c += k[2]; + __jhash_mix(a, b, c); + + a += k[3]; + __jhash_final(a, b, c); + + return c; +} + +static __always_inline void ipv4_csum(struct iphdr *iph) +{ + __u16 *next_iph = (__u16 *)iph; + __u32 csum = 0; + int i; + + __pragma_loop_unroll_full + for (i = 0; i < (int)(sizeof(*iph) >> 1); i++) + csum += *next_iph++; + + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + iph->check = ~csum; +} + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 64); + __type(key, struct vip_definition); + __type(value, struct vip_meta); +} vip_map SEC(".maps"); + +struct lru_inner_map { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, struct flow_key); + __type(value, struct real_pos_lru); + __uint(max_entries, DEFAULT_LRU_SIZE); +} lru_inner SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, BENCH_NR_CPUS); + __array(values, struct lru_inner_map); +} lru_mapping SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, CH_RINGS_SIZE); + __type(key, __u32); + __type(value, __u32); +} ch_rings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, MAX_REALS); + __type(key, __u32); + __type(value, struct real_definition); +} reals SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, STATS_SIZE); + __type(key, __u32); + __type(value, struct lb_stats); +} stats SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, MAX_REALS); + __type(key, __u32); + __type(value, struct lb_stats); +} reals_stats SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct ctl_value); +} ctl_array SEC(".maps"); + +struct { 
+ __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct vip_definition); +} vip_miss_stats SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, MAX_REALS); + __type(key, __u32); + __type(value, __u32); +} lru_miss_stats SEC(".maps"); + +volatile __u32 flow_mask; +volatile __u32 cold_lru; +__u32 batch_gen; + +/* + * old_eth MUST be read BEFORE writing the outer header because + * bpf_xdp_adjust_head makes them overlap. + */ +static __always_inline int encap_v4(struct xdp_md *xdp, __be32 saddr, __be32 daddr, + __u16 payload_len, const __u8 *dst_mac) +{ + struct ethhdr *new_eth, *old_eth; + void *data, *data_end; + struct iphdr *iph; + + if (bpf_xdp_adjust_head(xdp, -(int)sizeof(struct iphdr))) + return -1; + + data = (void *)(long)xdp->data; + data_end = (void *)(long)xdp->data_end; + + new_eth = data; + iph = data + sizeof(struct ethhdr); + old_eth = data + sizeof(struct iphdr); + + if (new_eth + 1 > data_end || old_eth + 1 > data_end || iph + 1 > data_end) + return -1; + + __builtin_memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source)); + __builtin_memcpy(new_eth->h_dest, dst_mac, sizeof(new_eth->h_dest)); + new_eth->h_proto = bpf_htons(ETH_P_IP); + + __builtin_memset(iph, 0, sizeof(*iph)); + iph->version = 4; + iph->ihl = sizeof(*iph) >> 2; + iph->protocol = IPPROTO_IPIP; + iph->tot_len = bpf_htons(payload_len + sizeof(*iph)); + iph->ttl = 64; + iph->saddr = saddr; + iph->daddr = daddr; + ipv4_csum(iph); + + return 0; +} + +static __always_inline int encap_v6(struct xdp_md *xdp, const __be32 saddr[4], + const __be32 daddr[4], __u8 nexthdr, __u16 payload_len, + const __u8 *dst_mac) +{ + struct ethhdr *new_eth, *old_eth; + void *data, *data_end; + struct ipv6hdr *ip6h; + + if (bpf_xdp_adjust_head(xdp, -(int)sizeof(struct ipv6hdr))) + return -1; + + data = (void *)(long)xdp->data; + data_end = (void *)(long)xdp->data_end; + + new_eth = data; + ip6h = data + 
sizeof(struct ethhdr); + old_eth = data + sizeof(struct ipv6hdr); + + if (new_eth + 1 > data_end || old_eth + 1 > data_end || ip6h + 1 > data_end) + return -1; + + __builtin_memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source)); + __builtin_memcpy(new_eth->h_dest, dst_mac, sizeof(new_eth->h_dest)); + new_eth->h_proto = bpf_htons(ETH_P_IPV6); + + __builtin_memset(ip6h, 0, sizeof(*ip6h)); + ip6h->version = 6; + ip6h->nexthdr = nexthdr; + ip6h->payload_len = bpf_htons(payload_len); + ip6h->hop_limit = 64; + __builtin_memcpy(&ip6h->saddr, saddr, sizeof(ip6h->saddr)); + __builtin_memcpy(&ip6h->daddr, daddr, sizeof(ip6h->daddr)); + + return 0; +} + +static __always_inline void update_stats(void *map, __u32 key, __u16 bytes) +{ + struct lb_stats *st = bpf_map_lookup_elem(map, &key); + + if (st) { + st->v1 += 1; + st->v2 += bytes; + } +} + +static __always_inline void count_action(int action) +{ + struct lb_stats *st; + __u32 key; + + if (action == XDP_TX) + key = STATS_XDP_TX; + else if (action == XDP_PASS) + key = STATS_XDP_PASS; + else + key = STATS_XDP_DROP; + + st = bpf_map_lookup_elem(&stats, &key); + if (st) + st->v1 += 1; +} + +static __always_inline bool is_under_flood(void) +{ + __u32 key = STATS_NEW_CONN; + struct lb_stats *conn_st = bpf_map_lookup_elem(&stats, &key); + __u64 cur_time; + + if (!conn_st) + return true; + + cur_time = bpf_ktime_get_ns(); + if ((cur_time - conn_st->v2) > ONE_SEC) { + conn_st->v1 = 1; + conn_st->v2 = cur_time; + } else { + conn_st->v1 += 1; + if (conn_st->v1 > MAX_CONN_RATE) + return true; + } + return false; +} + +static __always_inline struct real_definition *connection_table_lookup(void *lru_map, + struct flow_key *flow, + __u32 *out_pos) +{ + struct real_pos_lru *dst_lru; + struct real_definition *real; + __u32 key; + + dst_lru = bpf_map_lookup_elem(lru_map, flow); + if (!dst_lru) + return NULL; + + /* UDP connections use atime-based timeout instead of FIN/RST */ + if (flow->proto == IPPROTO_UDP) { + __u64 
cur_time = bpf_ktime_get_ns(); + + if (cur_time - dst_lru->atime > LRU_UDP_TIMEOUT) + return NULL; + dst_lru->atime = cur_time; + } + + key = dst_lru->pos; + *out_pos = key; + real = bpf_map_lookup_elem(&reals, &key); + return real; +} + +static __always_inline bool get_packet_dst(struct real_definition **real, struct flow_key *flow, + struct vip_meta *vip_info, bool is_v6, void *lru_map, + bool is_rst, __u32 *out_pos) +{ + bool under_flood; + __u32 hash, ch_key; + __u32 *ch_val; + __u32 real_pos; + + under_flood = is_under_flood(); + + if (is_v6) { + __u32 src_hash = jhash2_4words((__u32 *)flow->srcv6, MAX_VIPS); + + hash = jhash_2words(src_hash, flow->ports, CH_RING_SIZE); + } else { + hash = jhash_2words(flow->src, flow->ports, CH_RING_SIZE); + } + + ch_key = CH_RING_SIZE * vip_info->vip_num + hash % CH_RING_SIZE; + ch_val = bpf_map_lookup_elem(&ch_rings, &ch_key); + if (!ch_val) + return false; + real_pos = *ch_val; + + *real = bpf_map_lookup_elem(&reals, &real_pos); + if (!(*real)) + return false; + + if (!(vip_info->flags & F_LRU_BYPASS) && !under_flood && !is_rst) { + struct real_pos_lru new_lru = { .pos = real_pos }; + + if (flow->proto == IPPROTO_UDP) + new_lru.atime = bpf_ktime_get_ns(); + bpf_map_update_elem(lru_map, flow, &new_lru, BPF_ANY); + } + + *out_pos = real_pos; + return true; +} + +static __always_inline void update_vip_lru_miss_stats(struct vip_definition *vip, bool is_v6, + __u32 real_idx) +{ + struct vip_definition *miss_vip; + __u32 key = 0; + __u32 *cnt; + + miss_vip = bpf_map_lookup_elem(&vip_miss_stats, &key); + if (!miss_vip) + return; + + if (is_v6) { + if (miss_vip->vipv6[0] != vip->vipv6[0] || miss_vip->vipv6[1] != vip->vipv6[1] || + miss_vip->vipv6[2] != vip->vipv6[2] || miss_vip->vipv6[3] != vip->vipv6[3]) + return; + } else { + if (miss_vip->vip != vip->vip) + return; + } + + if (miss_vip->port != vip->port || miss_vip->proto != vip->proto) + return; + + cnt = bpf_map_lookup_elem(&lru_miss_stats, &real_idx); + if (cnt) + *cnt += 
1; +} + +static __noinline int process_packet(struct xdp_md *xdp) +{ + void *data = (void *)(long)xdp->data; + void *data_end = (void *)(long)xdp->data_end; + struct ethhdr *eth = data; + struct real_definition *dst = NULL; + struct vip_definition vip_def = {}; + struct ctl_value *cval; + struct flow_key flow = {}; + struct vip_meta *vip_info; + struct lb_stats *data_stats; + struct udphdr *uh; + __be32 tnl_src[4]; + void *lru_map; + void *l4; + __u16 payload_len; + __u32 real_pos = 0, cpu_num, key; + __u8 proto; + int action = XDP_DROP; + bool is_v6, is_syn = false, is_rst = false; + + if (eth + 1 > data_end) + goto out; + + if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { + is_v6 = true; + } else if (eth->h_proto == bpf_htons(ETH_P_IP)) { + is_v6 = false; + } else { + action = XDP_PASS; + goto out; + } + + if (is_v6) { + struct ipv6hdr *ip6h = (void *)(eth + 1); + + if (ip6h + 1 > data_end) + goto out; + if (ip6h->nexthdr == IPPROTO_FRAGMENT) + goto out; + + payload_len = sizeof(struct ipv6hdr) + bpf_ntohs(ip6h->payload_len); + proto = ip6h->nexthdr; + + __builtin_memcpy(flow.srcv6, &ip6h->saddr, sizeof(flow.srcv6)); + __builtin_memcpy(flow.dstv6, &ip6h->daddr, sizeof(flow.dstv6)); + __builtin_memcpy(vip_def.vipv6, &ip6h->daddr, sizeof(vip_def.vipv6)); + l4 = (void *)(ip6h + 1); + } else { + struct iphdr *iph = (void *)(eth + 1); + + if (iph + 1 > data_end) + goto out; + if (iph->ihl != 5) + goto out; + if (iph->frag_off & bpf_htons(PCKT_FRAGMENTED)) + goto out; + + payload_len = bpf_ntohs(iph->tot_len); + proto = iph->protocol; + + flow.src = iph->saddr; + flow.dst = iph->daddr; + vip_def.vip = iph->daddr; + l4 = (void *)(iph + 1); + } + + /* TCP and UDP share the same port layout at offset 0 */ + if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { + action = XDP_PASS; + goto out; + } + + uh = l4; + if ((void *)(uh + 1) > data_end) + goto out; + flow.port16[0] = uh->source; + flow.port16[1] = uh->dest; + + if (proto == IPPROTO_TCP) { + struct tcphdr *th = l4; + + if 
((void *)(th + 1) > data_end) + goto out; + is_syn = th->syn; + is_rst = th->rst; + } + + flow.proto = proto; + vip_def.port = flow.port16[1]; + vip_def.proto = proto; + + vip_info = bpf_map_lookup_elem(&vip_map, &vip_def); + if (!vip_info) { + action = XDP_PASS; + goto out; + } + + key = STATS_LRU; + data_stats = bpf_map_lookup_elem(&stats, &key); + if (!data_stats) + goto out; + data_stats->v1 += 1; + + cpu_num = bpf_get_smp_processor_id(); + lru_map = bpf_map_lookup_elem(&lru_mapping, &cpu_num); + if (!lru_map) + goto out; + + if (!(vip_info->flags & F_LRU_BYPASS) && !is_syn) + dst = connection_table_lookup(lru_map, &flow, &real_pos); + + if (!dst) { + if (flow.proto == IPPROTO_TCP) { + struct lb_stats *miss_st; + + key = STATS_LRU_MISS; + miss_st = bpf_map_lookup_elem(&stats, &key); + if (miss_st) + miss_st->v1 += 1; + } + + if (!get_packet_dst(&dst, &flow, vip_info, is_v6, lru_map, is_rst, &real_pos)) + goto out; + + update_vip_lru_miss_stats(&vip_def, is_v6, real_pos); + data_stats->v2 += 1; + } + + key = 0; + cval = bpf_map_lookup_elem(&ctl_array, &key); + if (!cval) + goto out; + + update_stats(&stats, vip_info->vip_num, payload_len); + update_stats(&reals_stats, real_pos, payload_len); + + if (is_v6) { + create_encap_ipv6_src(flow.port16[0], flow.srcv6[0], tnl_src); + if (encap_v6(xdp, tnl_src, dst->dstv6, IPPROTO_IPV6, payload_len, cval->mac)) + goto out; + } else if (dst->flags & F_IPV6) { + create_encap_ipv6_src(flow.port16[0], flow.src, tnl_src); + if (encap_v6(xdp, tnl_src, dst->dstv6, IPPROTO_IPIP, payload_len, cval->mac)) + goto out; + } else { + if (encap_v4(xdp, create_encap_ipv4_src(flow.port16[0], flow.src), dst->dst, + payload_len, cval->mac)) + goto out; + } + + action = XDP_TX; + +out: + count_action(action); + return action; +} + +static __always_inline int strip_encap(struct xdp_md *xdp, const struct ethhdr *saved_eth) +{ + void *data = (void *)(long)xdp->data; + void *data_end = (void *)(long)xdp->data_end; + struct ethhdr *eth = data; + 
int hdr_sz; + + if (eth + 1 > data_end) + return -1; + + hdr_sz = (eth->h_proto == bpf_htons(ETH_P_IPV6)) ? (int)sizeof(struct ipv6hdr) + : (int)sizeof(struct iphdr); + + if (bpf_xdp_adjust_head(xdp, hdr_sz)) + return -1; + + data = (void *)(long)xdp->data; + data_end = (void *)(long)xdp->data_end; + eth = data; + + if (eth + 1 > data_end) + return -1; + + __builtin_memcpy(eth, saved_eth, sizeof(*saved_eth)); + return 0; +} + +static __always_inline void randomize_src(struct xdp_md *xdp, int saddr_off, __u32 *rand_state) +{ + void *data = (void *)(long)xdp->data; + void *data_end = (void *)(long)xdp->data_end; + __u32 *saddr = data + saddr_off; + + *rand_state ^= *rand_state << 13; + *rand_state ^= *rand_state >> 17; + *rand_state ^= *rand_state << 5; + + if ((void *)(saddr + 1) <= data_end) + *saddr = *rand_state & flow_mask; +} + +SEC("xdp") +int xdp_lb_bench(struct xdp_md *xdp) +{ + void *data = (void *)(long)xdp->data; + void *data_end = (void *)(long)xdp->data_end; + struct ethhdr *eth = data; + struct ethhdr saved_eth; + __u32 rand_state = 0; + __u32 batch_hash = 0; + int saddr_off = 0; + bool is_v6; + + if (eth + 1 > data_end) + return XDP_DROP; + + __builtin_memcpy(&saved_eth, eth, sizeof(saved_eth)); + + is_v6 = (saved_eth.h_proto == bpf_htons(ETH_P_IPV6)); + + saddr_off = sizeof(struct ethhdr) + (is_v6 ? 
offsetof(struct ipv6hdr, saddr) : + offsetof(struct iphdr, saddr)); + + if (flow_mask) + rand_state = bpf_get_prandom_u32() | 1; + + if (cold_lru) { + __u32 *saddr = data + saddr_off; + + batch_gen++; + batch_hash = (batch_gen ^ bpf_get_smp_processor_id()) * KNUTH_HASH_MULT; + if ((void *)(saddr + 1) <= data_end) + *saddr ^= batch_hash; + } + + return BENCH_BPF_LOOP( + process_packet(xdp), + ({ + if (__bench_result == XDP_TX) { + if (strip_encap(xdp, &saved_eth)) + return XDP_DROP; + if (rand_state) + randomize_src(xdp, saddr_off, &rand_state); + } + if (cold_lru) { + void *d = (void *)(long)xdp->data; + void *de = (void *)(long)xdp->data_end; + __u32 *__sa = d + saddr_off; + + if ((void *)(__sa + 1) <= de) + *__sa ^= batch_hash; + } + }) + ); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From a4b5ba8187cb184aacac4ac8c86b4ef4821a4aa6 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 27 Apr 2026 16:23:03 -0700 Subject: selftests/bpf: Add XDP load-balancer benchmark driver Wire up the userspace side of the XDP load-balancer benchmark. 24 scenarios cover the full code-path matrix: TCP/UDP, IPv4/IPv6, cross-AF encap, LRU hit/miss/diverse/cold, consistent-hash bypass, SYN/RST flag handling, and early exits (unknown VIP, non-IP, ICMP, fragments, IP options). Before benchmarking each scenario validates correctness: the output packet is compared byte-for-byte against a pre-built expected packet and BPF map counters are checked against the expected values. 
Usage: sudo ./bench -a -w3 -p1 xdp-lb --scenario tcp-v4-lru-hit sudo ./bench xdp-lb --list-scenarios Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260427232313.1582588-7-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 + tools/testing/selftests/bpf/bench.c | 4 + tools/testing/selftests/bpf/benchs/bench_xdp_lb.c | 1113 +++++++++++++++++++++ 3 files changed, 1119 insertions(+) create mode 100644 tools/testing/selftests/bpf/benchs/bench_xdp_lb.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 97f9fbd41244..bc049620c774 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -907,6 +907,7 @@ $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench_sockmap.o: $(OUTPUT)/bench_sockmap_prog.skel.h $(OUTPUT)/bench_lpm_trie_map.o: $(OUTPUT)/lpm_trie_bench.skel.h $(OUTPUT)/lpm_trie_map.skel.h $(OUTPUT)/bench_bpf_nop.o: $(OUTPUT)/bpf_nop_bench.skel.h bench_bpf_timing.h +$(OUTPUT)/bench_xdp_lb.o: $(OUTPUT)/xdp_lb_bench.skel.h bench_bpf_timing.h $(OUTPUT)/bench_bpf_timing.o: bench_bpf_timing.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm @@ -932,6 +933,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_lpm_trie_map.o \ $(OUTPUT)/bench_bpf_timing.o \ $(OUTPUT)/bench_bpf_nop.o \ + $(OUTPUT)/bench_xdp_lb.o \ $(OUTPUT)/usdt_1.o \ $(OUTPUT)/usdt_2.o \ # diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 1696de5d6780..6155ce455c27 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -286,6 +286,7 @@ extern struct argp bench_trigger_batch_argp; extern struct argp bench_crypto_argp; extern struct argp bench_sockmap_argp; extern struct argp bench_lpm_trie_map_argp; +extern struct argp bench_xdp_lb_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring 
buffers benchmark", 0 }, @@ -302,6 +303,7 @@ static const struct argp_child bench_parsers[] = { { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, { &bench_sockmap_argp, 0, "bpf sockmap benchmark", 0 }, { &bench_lpm_trie_map_argp, 0, "LPM trie map benchmark", 0 }, + { &bench_xdp_lb_argp, 0, "XDP load-balancer benchmark", 0 }, {}, }; @@ -576,6 +578,7 @@ extern const struct bench bench_lpm_trie_update; extern const struct bench bench_lpm_trie_delete; extern const struct bench bench_lpm_trie_free; extern const struct bench bench_bpf_nop; +extern const struct bench bench_xdp_lb; static const struct bench *benchs[] = { &bench_count_global, @@ -655,6 +658,7 @@ static const struct bench *benchs[] = { &bench_lpm_trie_delete, &bench_lpm_trie_free, &bench_bpf_nop, + &bench_xdp_lb, }; static void find_benchmark(void) diff --git a/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c b/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c new file mode 100644 index 000000000000..0b6709a2b03c --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c @@ -0,0 +1,1113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bench.h" +#include "bench_bpf_timing.h" +#include "xdp_lb_bench.skel.h" +#include "xdp_lb_bench_common.h" +#include "bpf_util.h" + +#define IP4(a, b, c, d) (((__u32)(a) << 24) | ((__u32)(b) << 16) | ((__u32)(c) << 8) | (__u32)(d)) + +#define IP6(a, b, c, d) { (__u32)(a), (__u32)(b), (__u32)(c), (__u32)(d) } + +#define TNL_DST IP4(192, 168, 1, 2) +#define REAL_INDEX 1 +#define REAL_INDEX_V6 2 +#define MAX_PKT_SIZE 256 +#define IP_MF 0x2000 + +static const __u32 tnl_dst_v6[4] = { 0xfd000000, 0, 0, 2 }; + +static const __u8 lb_mac[ETH_ALEN] = {0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}; +static const __u8 client_mac[ETH_ALEN] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66}; +static const __u8 router_mac[ETH_ALEN] = {0xde, 0xad, 0xbe, 0xef, 0x00, 0x01}; + +enum scenario_id { + S_TCP_V4_LRU_HIT, + S_TCP_V4_CH, + S_TCP_V6_LRU_HIT, + S_TCP_V6_CH, + S_UDP_V4_LRU_HIT, + S_UDP_V6_LRU_HIT, + S_TCP_V4V6_LRU_HIT, + S_TCP_V4_LRU_DIVERSE, + S_TCP_V4_CH_DIVERSE, + S_TCP_V6_LRU_DIVERSE, + S_TCP_V6_CH_DIVERSE, + S_UDP_V4_LRU_DIVERSE, + S_TCP_V4_LRU_MISS, + S_UDP_V4_LRU_MISS, + S_TCP_V4_LRU_WARMUP, + S_TCP_V4_SYN, + S_TCP_V4_RST_MISS, + S_PASS_V4_NO_VIP, + S_PASS_V6_NO_VIP, + S_PASS_V4_ICMP, + S_PASS_NON_IP, + S_DROP_V4_FRAG, + S_DROP_V4_OPTIONS, + S_DROP_V6_FRAG, + NUM_SCENARIOS, +}; + +enum lru_miss_type { + LRU_MISS_AUTO = 0, /* compute from scenario flags (default) */ + LRU_MISS_NONE, /* 0 misses (all LRU hits) */ + LRU_MISS_ALL, /* batch_iters+1 misses (every op misses) */ + LRU_MISS_FIRST, /* 1 miss (first miss, then hits) */ +}; + +#define S_BASE_ENCAP_V4 \ + .expected_retval = XDP_TX, .expect_encap = true, \ + .tunnel_dst = TNL_DST + +#define S_BASE_ENCAP_V6 \ + .expected_retval = XDP_TX, .expect_encap = true, \ + .is_v6 = true, .encap_v6_outer = true, \ + .tunnel_dst_v6 = { 0xfd000000, 0, 0, 2 } + +#define S_BASE_ENCAP_V4V6 \ + .expected_retval = XDP_TX, .expect_encap = true, \ + 
.encap_v6_outer = true, \ + .tunnel_dst_v6 = { 0xfd000000, 0, 0, 2 } + +struct test_scenario { + const char *name; + const char *description; + int expected_retval; + bool expect_encap; + bool is_v6; + __u32 vip_addr; + __u32 src_addr; + __u32 tunnel_dst; + __u32 vip_addr_v6[4]; + __u32 src_addr_v6[4]; + __u32 tunnel_dst_v6[4]; + __u16 dst_port; + __u16 src_port; + __u8 ip_proto; + __u32 vip_flags; + __u32 vip_num; + bool prepopulate_lru; + bool set_frag; + __u16 eth_proto; + bool encap_v6_outer; + __u32 flow_mask; + bool cold_lru; + bool set_syn; + bool set_rst; + bool set_ip_options; + __u32 fixed_batch_iters; /* 0 = auto-calibrate, >0 = use this value */ + enum lru_miss_type lru_miss; /* expected LRU miss pattern */ +}; + +static const struct test_scenario scenarios[NUM_SCENARIOS] = { + /* Single-flow baseline */ + [S_TCP_V4_LRU_HIT] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-lru-hit", + .description = "IPv4 TCP, LRU hit, IPIP encap", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 1), .src_port = 12345, + .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, + }, + [S_TCP_V4_CH] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-ch", + .description = "IPv4 TCP, CH (LRU bypass), IPIP encap", + .vip_addr = IP4(10, 10, 1, 2), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 2), .src_port = 54321, + .vip_flags = F_LRU_BYPASS, .vip_num = 1, + .lru_miss = LRU_MISS_ALL, + }, + [S_TCP_V6_LRU_HIT] = { + S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, + .name = "tcp-v6-lru-hit", + .description = "IPv6 TCP, LRU hit, IP6IP6 encap", + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 80, + .src_addr_v6 = IP6(0xfd000200, 0, 0, 1), .src_port = 12345, + .vip_num = 10, + .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, + }, + [S_TCP_V6_CH] = { + S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, + .name = "tcp-v6-ch", + .description = "IPv6 TCP, CH (LRU bypass), IP6IP6 encap", + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 2), 
.dst_port = 80, + .src_addr_v6 = IP6(0xfd000200, 0, 0, 2), .src_port = 54321, + .vip_flags = F_LRU_BYPASS, .vip_num = 12, + .lru_miss = LRU_MISS_ALL, + }, + [S_UDP_V4_LRU_HIT] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP, + .name = "udp-v4-lru-hit", + .description = "IPv4 UDP, LRU hit, IPIP encap", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 443, + .src_addr = IP4(10, 10, 3, 1), .src_port = 11111, + .vip_num = 2, + .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, + }, + [S_UDP_V6_LRU_HIT] = { + S_BASE_ENCAP_V6, .ip_proto = IPPROTO_UDP, + .name = "udp-v6-lru-hit", + .description = "IPv6 UDP, LRU hit, IP6IP6 encap", + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 443, + .src_addr_v6 = IP6(0xfd000200, 0, 0, 3), .src_port = 22222, + .vip_num = 14, + .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, + }, + [S_TCP_V4V6_LRU_HIT] = { + S_BASE_ENCAP_V4V6, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4v6-lru-hit", + .description = "IPv4 TCP, LRU hit, IPv4-in-IPv6 encap", + .vip_addr = IP4(10, 10, 1, 4), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 4), .src_port = 12347, + .vip_num = 13, + .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, + }, + + /* Diverse flows (4K src addrs) */ + [S_TCP_V4_LRU_DIVERSE] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-lru-diverse", + .description = "IPv4 TCP, diverse flows, warm LRU", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 1), .src_port = 12345, + .prepopulate_lru = true, .flow_mask = 0xFFF, + .lru_miss = LRU_MISS_NONE, + }, + [S_TCP_V4_CH_DIVERSE] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-ch-diverse", + .description = "IPv4 TCP, diverse flows, CH (LRU bypass)", + .vip_addr = IP4(10, 10, 1, 2), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 2), .src_port = 54321, + .vip_flags = F_LRU_BYPASS, .vip_num = 1, + .flow_mask = 0xFFF, .lru_miss = LRU_MISS_ALL, + }, + [S_TCP_V6_LRU_DIVERSE] = { + S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, + .name = 
"tcp-v6-lru-diverse", + .description = "IPv6 TCP, diverse flows, warm LRU", + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 80, + .src_addr_v6 = IP6(0xfd000200, 0, 0, 1), .src_port = 12345, + .vip_num = 10, + .prepopulate_lru = true, .flow_mask = 0xFFF, + .lru_miss = LRU_MISS_NONE, + }, + [S_TCP_V6_CH_DIVERSE] = { + S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, + .name = "tcp-v6-ch-diverse", + .description = "IPv6 TCP, diverse flows, CH (LRU bypass)", + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 2), .dst_port = 80, + .src_addr_v6 = IP6(0xfd000200, 0, 0, 2), .src_port = 54321, + .vip_flags = F_LRU_BYPASS, .vip_num = 12, + .flow_mask = 0xFFF, .lru_miss = LRU_MISS_ALL, + }, + [S_UDP_V4_LRU_DIVERSE] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP, + .name = "udp-v4-lru-diverse", + .description = "IPv4 UDP, diverse flows, warm LRU", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 443, + .src_addr = IP4(10, 10, 3, 1), .src_port = 11111, + .vip_num = 2, + .prepopulate_lru = true, .flow_mask = 0xFFF, + .lru_miss = LRU_MISS_NONE, + }, + + /* LRU stress */ + [S_TCP_V4_LRU_MISS] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-lru-miss", + .description = "IPv4 TCP, LRU miss (16M flow space), CH lookup", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 1), .src_port = 12345, + .flow_mask = 0xFFFFFF, .cold_lru = true, + .lru_miss = LRU_MISS_FIRST, + }, + [S_UDP_V4_LRU_MISS] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP, + .name = "udp-v4-lru-miss", + .description = "IPv4 UDP, LRU miss (16M flow space), CH lookup", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 443, + .src_addr = IP4(10, 10, 3, 1), .src_port = 11111, + .vip_num = 2, + .flow_mask = 0xFFFFFF, .cold_lru = true, + .lru_miss = LRU_MISS_FIRST, + }, + [S_TCP_V4_LRU_WARMUP] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-lru-warmup", + .description = "IPv4 TCP, 4K flows, ~50% LRU miss", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 
1), .src_port = 12345, + .flow_mask = 0xFFF, .cold_lru = true, + .fixed_batch_iters = 6500, + .lru_miss = LRU_MISS_FIRST, + }, + + /* TCP flags */ + [S_TCP_V4_SYN] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-syn", + .description = "IPv4 TCP SYN, skip LRU, CH + LRU insert", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 8, 2), .src_port = 60001, + .set_syn = true, .lru_miss = LRU_MISS_ALL, + }, + [S_TCP_V4_RST_MISS] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-rst-miss", + .description = "IPv4 TCP RST, CH lookup, no LRU insert", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 8, 1), .src_port = 60000, + .flow_mask = 0xFFFFFF, .cold_lru = true, + .set_rst = true, .lru_miss = LRU_MISS_ALL, + }, + + /* Early exits */ + [S_PASS_V4_NO_VIP] = { + .name = "pass-v4-no-vip", + .description = "IPv4 TCP, unknown VIP, XDP_PASS", + .expected_retval = XDP_PASS, + .ip_proto = IPPROTO_TCP, + .vip_addr = IP4(10, 10, 9, 9), .dst_port = 80, + .src_addr = IP4(10, 10, 4, 1), .src_port = 33333, + }, + [S_PASS_V6_NO_VIP] = { + .name = "pass-v6-no-vip", + .description = "IPv6 TCP, unknown VIP, XDP_PASS", + .expected_retval = XDP_PASS, .is_v6 = true, + .ip_proto = IPPROTO_TCP, + .vip_addr_v6 = IP6(0xfd009900, 0, 0, 1), .dst_port = 80, + .src_addr_v6 = IP6(0xfd000400, 0, 0, 1), .src_port = 33333, + }, + [S_PASS_V4_ICMP] = { + .name = "pass-v4-icmp", + .description = "IPv4 ICMP, non-TCP/UDP protocol, XDP_PASS", + .expected_retval = XDP_PASS, + .ip_proto = IPPROTO_ICMP, + .vip_addr = IP4(10, 10, 1, 1), + .src_addr = IP4(10, 10, 6, 1), + }, + [S_PASS_NON_IP] = { + .name = "pass-non-ip", + .description = "Non-IP (ARP), earliest XDP_PASS exit", + .expected_retval = XDP_PASS, + .eth_proto = ETH_P_ARP, + }, + [S_DROP_V4_FRAG] = { + .name = "drop-v4-frag", + .description = "IPv4 fragmented, XDP_DROP", + .expected_retval = XDP_DROP, .ip_proto = IPPROTO_TCP, + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 
80, + .src_addr = IP4(10, 10, 5, 1), .src_port = 44444, + .set_frag = true, + }, + [S_DROP_V4_OPTIONS] = { + .name = "drop-v4-options", + .description = "IPv4 with IP options (ihl>5), XDP_DROP", + .expected_retval = XDP_DROP, .ip_proto = IPPROTO_TCP, + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 7, 1), .src_port = 55555, + .set_ip_options = true, + }, + [S_DROP_V6_FRAG] = { + .name = "drop-v6-frag", + .description = "IPv6 fragment extension header, XDP_DROP", + .expected_retval = XDP_DROP, .is_v6 = true, + .ip_proto = IPPROTO_TCP, + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 80, + .src_addr_v6 = IP6(0xfd000500, 0, 0, 1), .src_port = 44444, + .set_frag = true, + }, +}; + +#define MAX_ENCAP_SIZE (MAX_PKT_SIZE + sizeof(struct ipv6hdr)) + +static __u8 pkt_buf[NUM_SCENARIOS][MAX_PKT_SIZE]; +static __u32 pkt_len[NUM_SCENARIOS]; +static __u8 expected_buf[NUM_SCENARIOS][MAX_ENCAP_SIZE]; +static __u32 expected_len[NUM_SCENARIOS]; + +static int lru_inner_fds[BENCH_NR_CPUS]; +static int nr_inner_maps; + +static struct ctx { + struct xdp_lb_bench *skel; + struct bpf_bench_timing timing; + int prog_fd; +} ctx; + +static struct { + int scenario; + bool machine_readable; +} args = { + .scenario = -1, +}; + +static __u16 ip_checksum(const void *hdr, int len) +{ + const __u16 *p = hdr; + __u32 csum = 0; + int i; + + for (i = 0; i < len / 2; i++) + csum += p[i]; + + while (csum >> 16) + csum = (csum & 0xffff) + (csum >> 16); + + return ~csum; +} + +static void htonl_v6(__be32 dst[4], const __u32 src[4]) +{ + int i; + + for (i = 0; i < 4; i++) + dst[i] = htonl(src[i]); +} + +static void build_flow_key(struct flow_key *fk, const struct test_scenario *sc) +{ + memset(fk, 0, sizeof(*fk)); + if (sc->is_v6) { + htonl_v6(fk->srcv6, sc->src_addr_v6); + htonl_v6(fk->dstv6, sc->vip_addr_v6); + } else { + fk->src = htonl(sc->src_addr); + fk->dst = htonl(sc->vip_addr); + } + fk->proto = sc->ip_proto; + fk->port16[0] = htons(sc->src_port); + fk->port16[1] = 
htons(sc->dst_port); +} + +static void build_l4(const struct test_scenario *sc, __u8 *p, __u32 *off) +{ + if (sc->ip_proto == IPPROTO_TCP) { + struct tcphdr tcp = {}; + + tcp.source = htons(sc->src_port); + tcp.dest = htons(sc->dst_port); + tcp.doff = 5; + tcp.syn = sc->set_syn ? 1 : 0; + tcp.rst = sc->set_rst ? 1 : 0; + tcp.window = htons(8192); + memcpy(p + *off, &tcp, sizeof(tcp)); + *off += sizeof(tcp); + } else if (sc->ip_proto == IPPROTO_UDP) { + struct udphdr udp = {}; + + udp.source = htons(sc->src_port); + udp.dest = htons(sc->dst_port); + udp.len = htons(sizeof(udp) + 16); + memcpy(p + *off, &udp, sizeof(udp)); + *off += sizeof(udp); + } +} + +static void build_packet(int idx) +{ + const struct test_scenario *sc = &scenarios[idx]; + __u8 *p = pkt_buf[idx]; + struct ethhdr eth = {}; + __u16 proto; + __u32 off = 0; + + memcpy(eth.h_dest, lb_mac, ETH_ALEN); + memcpy(eth.h_source, client_mac, ETH_ALEN); + + if (sc->eth_proto) + proto = sc->eth_proto; + else if (sc->is_v6) + proto = ETH_P_IPV6; + else + proto = ETH_P_IP; + + eth.h_proto = htons(proto); + memcpy(p, &eth, sizeof(eth)); + off += sizeof(eth); + + if (proto != ETH_P_IP && proto != ETH_P_IPV6) { + memcpy(p + off, "bench___payload!", 16); + off += 16; + pkt_len[idx] = off; + return; + } + + if (sc->is_v6) { + struct ipv6hdr ip6h = {}; + __u32 ip6_off = off; + + ip6h.version = 6; + ip6h.nexthdr = sc->set_frag ? 44 : sc->ip_proto; + ip6h.hop_limit = 64; + htonl_v6((__be32 *)&ip6h.saddr, sc->src_addr_v6); + htonl_v6((__be32 *)&ip6h.daddr, sc->vip_addr_v6); + off += sizeof(ip6h); + + if (sc->set_frag) { + memset(p + off, 0, 8); + p[off] = sc->ip_proto; + off += 8; + } + + build_l4(sc, p, &off); + + memcpy(p + off, "bench___payload!", 16); + off += 16; + + ip6h.payload_len = htons(off - ip6_off - sizeof(ip6h)); + memcpy(p + ip6_off, &ip6h, sizeof(ip6h)); + } else { + struct iphdr iph = {}; + __u32 ip_off = off; + + iph.version = 4; + iph.ihl = sc->set_ip_options ? 
6 : 5; + iph.ttl = 64; + iph.protocol = sc->ip_proto; + iph.saddr = htonl(sc->src_addr); + iph.daddr = htonl(sc->vip_addr); + iph.frag_off = sc->set_frag ? htons(IP_MF) : 0; + off += sizeof(iph); + + if (sc->set_ip_options) { + /* NOP option padding (4 bytes = 1 word) */ + __u32 nop = htonl(0x01010101); + + memcpy(p + off, &nop, sizeof(nop)); + off += sizeof(nop); + } + + build_l4(sc, p, &off); + + memcpy(p + off, "bench___payload!", 16); + off += 16; + + iph.tot_len = htons(off - ip_off); + iph.check = ip_checksum(&iph, sizeof(iph)); + memcpy(p + ip_off, &iph, sizeof(iph)); + } + + pkt_len[idx] = off; +} + +static void populate_vip(struct xdp_lb_bench *skel, const struct test_scenario *sc) +{ + struct vip_definition key = {}; + struct vip_meta val = {}; + int err; + + if (sc->is_v6) + htonl_v6(key.vipv6, sc->vip_addr_v6); + else + key.vip = htonl(sc->vip_addr); + key.port = htons(sc->dst_port); + key.proto = sc->ip_proto; + val.flags = sc->vip_flags; + val.vip_num = sc->vip_num; + + err = bpf_map_update_elem(bpf_map__fd(skel->maps.vip_map), &key, &val, BPF_ANY); + if (err) { + fprintf(stderr, "vip_map [%s]: %s\n", sc->name, strerror(errno)); + exit(1); + } +} + +static void create_per_cpu_lru_maps(struct xdp_lb_bench *skel) +{ + int outer_fd = bpf_map__fd(skel->maps.lru_mapping); + unsigned int nr_cpus = bpf_num_possible_cpus(); + int i, inner_fd, err; + __u32 cpu; + + if (nr_cpus > BENCH_NR_CPUS) + nr_cpus = BENCH_NR_CPUS; + + for (i = 0; i < (int)nr_cpus; i++) { + LIBBPF_OPTS(bpf_map_create_opts, opts); + + inner_fd = bpf_map_create(BPF_MAP_TYPE_LRU_HASH, "lru_inner", + sizeof(struct flow_key), + sizeof(struct real_pos_lru), + DEFAULT_LRU_SIZE, &opts); + if (inner_fd < 0) { + fprintf(stderr, "lru_inner[%d]: %s\n", i, strerror(errno)); + exit(1); + } + + cpu = i; + err = bpf_map_update_elem(outer_fd, &cpu, &inner_fd, BPF_ANY); + if (err) { + fprintf(stderr, "lru_mapping[%d]: %s\n", i, strerror(errno)); + close(inner_fd); + exit(1); + } + + lru_inner_fds[i] = 
inner_fd; + } + + nr_inner_maps = nr_cpus; +} + +static void populate_lru(const struct test_scenario *sc, __u32 real_idx) +{ + struct real_pos_lru lru = { .pos = real_idx }; + struct flow_key fk; + int i, err; + + build_flow_key(&fk, sc); + + /* Insert into every per-CPU inner LRU so the entry is found + * regardless of which CPU runs the BPF program. + */ + for (i = 0; i < nr_inner_maps; i++) { + err = bpf_map_update_elem(lru_inner_fds[i], &fk, &lru, BPF_ANY); + if (err) { + fprintf(stderr, "lru_inner[%d] [%s]: %s\n", i, sc->name, + strerror(errno)); + exit(1); + } + } +} + +static void populate_maps(struct xdp_lb_bench *skel) +{ + struct real_definition real_v4 = {}; + struct real_definition real_v6 = {}; + struct ctl_value cval = {}; + __u32 key, real_idx = REAL_INDEX; + int ch_fd, err, i; + + if (scenarios[args.scenario].expect_encap) + populate_vip(skel, &scenarios[args.scenario]); + + ch_fd = bpf_map__fd(skel->maps.ch_rings); + for (i = 0; i < CH_RINGS_SIZE; i++) { + __u32 k = i; + + err = bpf_map_update_elem(ch_fd, &k, &real_idx, BPF_ANY); + if (err) { + fprintf(stderr, "ch_rings[%d]: %s\n", i, strerror(errno)); + exit(1); + } + } + + memcpy(cval.mac, router_mac, ETH_ALEN); + key = 0; + err = bpf_map_update_elem(bpf_map__fd(skel->maps.ctl_array), &key, &cval, BPF_ANY); + if (err) { + fprintf(stderr, "ctl_array: %s\n", strerror(errno)); + exit(1); + } + + key = REAL_INDEX; + real_v4.dst = htonl(TNL_DST); + htonl_v6(real_v4.dstv6, tnl_dst_v6); + err = bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &key, &real_v4, BPF_ANY); + if (err) { + fprintf(stderr, "reals[%d]: %s\n", REAL_INDEX, strerror(errno)); + exit(1); + } + + key = REAL_INDEX_V6; + htonl_v6(real_v6.dstv6, tnl_dst_v6); + real_v6.flags = F_IPV6; + err = bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &key, &real_v6, BPF_ANY); + if (err) { + fprintf(stderr, "reals[%d]: %s\n", REAL_INDEX_V6, strerror(errno)); + exit(1); + } + + create_per_cpu_lru_maps(skel); + + if 
(scenarios[args.scenario].prepopulate_lru) { + const struct test_scenario *sc = &scenarios[args.scenario]; + __u32 ridx = sc->encap_v6_outer ? REAL_INDEX_V6 : REAL_INDEX; + + populate_lru(sc, ridx); + } + + if (scenarios[args.scenario].expect_encap) { + const struct test_scenario *sc = &scenarios[args.scenario]; + struct vip_definition miss_vip = {}; + + if (sc->is_v6) + htonl_v6(miss_vip.vipv6, sc->vip_addr_v6); + else + miss_vip.vip = htonl(sc->vip_addr); + miss_vip.port = htons(sc->dst_port); + miss_vip.proto = sc->ip_proto; + + key = 0; + err = bpf_map_update_elem(bpf_map__fd(skel->maps.vip_miss_stats), + &key, &miss_vip, BPF_ANY); + if (err) { + fprintf(stderr, "vip_miss_stats: %s\n", strerror(errno)); + exit(1); + } + } +} + +static void build_expected_packet(int idx) +{ + const struct test_scenario *sc = &scenarios[idx]; + __u8 *p = expected_buf[idx]; + struct ethhdr eth = {}; + const __u8 *in = pkt_buf[idx]; + __u32 in_len = pkt_len[idx]; + __u32 off = 0; + __u32 inner_len = in_len - sizeof(struct ethhdr); + + if (sc->expected_retval == XDP_DROP) { + expected_len[idx] = 0; + return; + } + + if (sc->expected_retval == XDP_PASS) { + memcpy(p, in, in_len); + expected_len[idx] = in_len; + return; + } + + memcpy(eth.h_dest, router_mac, ETH_ALEN); + memcpy(eth.h_source, lb_mac, ETH_ALEN); + eth.h_proto = htons(sc->encap_v6_outer ? ETH_P_IPV6 : ETH_P_IP); + memcpy(p, &eth, sizeof(eth)); + off += sizeof(eth); + + if (sc->encap_v6_outer) { + struct ipv6hdr ip6h = {}; + __u8 nexthdr = sc->is_v6 ? IPPROTO_IPV6 : IPPROTO_IPIP; + + ip6h.version = 6; + ip6h.nexthdr = nexthdr; + ip6h.payload_len = htons(inner_len); + ip6h.hop_limit = 64; + + create_encap_ipv6_src(htons(sc->src_port), + sc->is_v6 ? 
htonl(sc->src_addr_v6[0]) + : htonl(sc->src_addr), + (__be32 *)&ip6h.saddr); + htonl_v6((__be32 *)&ip6h.daddr, sc->tunnel_dst_v6); + + memcpy(p + off, &ip6h, sizeof(ip6h)); + off += sizeof(ip6h); + } else { + struct iphdr iph = {}; + + iph.version = 4; + iph.ihl = sizeof(iph) >> 2; + iph.protocol = IPPROTO_IPIP; + iph.tot_len = htons(inner_len + sizeof(iph)); + iph.ttl = 64; + iph.saddr = create_encap_ipv4_src(htons(sc->src_port), + htonl(sc->src_addr)); + iph.daddr = htonl(sc->tunnel_dst); + iph.check = ip_checksum(&iph, sizeof(iph)); + + memcpy(p + off, &iph, sizeof(iph)); + off += sizeof(iph); + } + + memcpy(p + off, in + sizeof(struct ethhdr), inner_len); + off += inner_len; + + expected_len[idx] = off; +} + +static void print_hex_diff(const char *name, const __u8 *got, __u32 got_len, const __u8 *exp, + __u32 exp_len) +{ + __u32 max_len = got_len > exp_len ? got_len : exp_len; + __u32 i, ndiffs = 0; + + fprintf(stderr, " [%s] got %u bytes, expected %u bytes\n", + name, got_len, exp_len); + + for (i = 0; i < max_len && ndiffs < 8; i++) { + __u8 g = i < got_len ? got[i] : 0; + __u8 e = i < exp_len ? exp[i] : 0; + + if (g != e || i >= got_len || i >= exp_len) { + fprintf(stderr, " offset 0x%03x: got 0x%02x expected 0x%02x\n", + i, g, e); + ndiffs++; + } + } + + if (ndiffs >= 8 && i < max_len) + fprintf(stderr, " ... 
(more differences)\n"); +} + +static void read_stat(int stats_fd, __u32 key, __u64 *v1_out, __u64 *v2_out) +{ + struct lb_stats values[BENCH_NR_CPUS]; + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u64 v1 = 0, v2 = 0; + unsigned int i; + + if (nr_cpus > BENCH_NR_CPUS) + nr_cpus = BENCH_NR_CPUS; + + if (bpf_map_lookup_elem(stats_fd, &key, values) == 0) { + for (i = 0; i < nr_cpus; i++) { + v1 += values[i].v1; + v2 += values[i].v2; + } + } + + *v1_out = v1; + *v2_out = v2; +} + +static void reset_stats(int stats_fd) +{ + struct lb_stats zeros[BENCH_NR_CPUS]; + __u32 key; + + memset(zeros, 0, sizeof(zeros)); + for (key = 0; key < STATS_SIZE; key++) + bpf_map_update_elem(stats_fd, &key, zeros, BPF_ANY); +} + +static bool validate_counters(int idx) +{ + const struct test_scenario *sc = &scenarios[idx]; + int stats_fd = bpf_map__fd(ctx.skel->maps.stats); + __u64 xdp_tx, xdp_pass, xdp_drop, lru_pkts, lru_misses, tcp_misses; + __u64 expected_misses; + __u64 dummy; + /* + * BENCH_BPF_LOOP runs batch_iters timed + 1 untimed iteration. + * Each iteration calls process_packet -> count_action, so all + * counters are incremented (batch_iters + 1) times. 
+ */ + __u64 n = ctx.timing.batch_iters + 1; + bool pass = true; + + read_stat(stats_fd, STATS_XDP_TX, &xdp_tx, &dummy); + read_stat(stats_fd, STATS_XDP_PASS, &xdp_pass, &dummy); + read_stat(stats_fd, STATS_XDP_DROP, &xdp_drop, &dummy); + read_stat(stats_fd, STATS_LRU, &lru_pkts, &lru_misses); + read_stat(stats_fd, STATS_LRU_MISS, &tcp_misses, &dummy); + + if (sc->expected_retval == XDP_TX && xdp_tx != n) { + fprintf(stderr, " [%s] COUNTER FAIL: STATS_XDP_TX=%llu, expected %llu\n", sc->name, + (unsigned long long)xdp_tx, (unsigned long long)n); + pass = false; + } + if (sc->expected_retval == XDP_PASS && xdp_pass != n) { + fprintf(stderr, " [%s] COUNTER FAIL: STATS_XDP_PASS=%llu, expected %llu\n", + sc->name, (unsigned long long)xdp_pass, (unsigned long long)n); + pass = false; + } + if (sc->expected_retval == XDP_DROP && xdp_drop != n) { + fprintf(stderr, " [%s] COUNTER FAIL: STATS_XDP_DROP=%llu, expected %llu\n", + sc->name, (unsigned long long)xdp_drop, (unsigned long long)n); + pass = false; + } + + if (!sc->expect_encap) + goto out; + + if (lru_pkts != n) { + fprintf(stderr, " [%s] COUNTER FAIL: STATS_LRU.v1=%llu, expected %llu\n", + sc->name, (unsigned long long)lru_pkts, (unsigned long long)n); + pass = false; + } + + switch (sc->lru_miss) { + case LRU_MISS_NONE: + expected_misses = 0; + break; + case LRU_MISS_ALL: + expected_misses = n; + break; + case LRU_MISS_FIRST: + expected_misses = 1; + break; + default: + /* LRU_MISS_AUTO: compute from scenario flags */ + if (sc->prepopulate_lru && !sc->set_syn) + expected_misses = 0; + else if (sc->set_syn || sc->set_rst || + (sc->vip_flags & F_LRU_BYPASS)) + expected_misses = n; + else if (sc->cold_lru) + expected_misses = 1; + else + expected_misses = n; + break; + } + + if (lru_misses != expected_misses) { + fprintf(stderr, " [%s] COUNTER FAIL: LRU misses=%llu, expected %llu\n", + sc->name, (unsigned long long)lru_misses, + (unsigned long long)expected_misses); + pass = false; + } + + if (sc->ip_proto == 
IPPROTO_TCP && lru_misses > 0) { + if (tcp_misses != lru_misses) { + fprintf(stderr, " [%s] COUNTER FAIL: TCP LRU misses=%llu, expected %llu\n", + sc->name, (unsigned long long)tcp_misses, + (unsigned long long)lru_misses); + pass = false; + } + } + +out: + reset_stats(stats_fd); + return pass; +} + +static const char *xdp_action_str(int action) +{ + switch (action) { + case XDP_DROP: return "XDP_DROP"; + case XDP_PASS: return "XDP_PASS"; + case XDP_TX: return "XDP_TX"; + default: return "UNKNOWN"; + } +} + +static bool validate_scenario(int idx) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + const struct test_scenario *sc = &scenarios[idx]; + __u8 out[MAX_ENCAP_SIZE]; + int err; + + topts.data_in = pkt_buf[idx]; + topts.data_size_in = pkt_len[idx]; + topts.data_out = out; + topts.data_size_out = sizeof(out); + topts.repeat = 1; + + err = bpf_prog_test_run_opts(ctx.prog_fd, &topts); + if (err) { + fprintf(stderr, " [%s] FAIL: test_run: %s\n", sc->name, strerror(errno)); + return false; + } + + if ((int)topts.retval != sc->expected_retval) { + fprintf(stderr, " [%s] FAIL: retval %s, expected %s\n", sc->name, + xdp_action_str(topts.retval), xdp_action_str(sc->expected_retval)); + return false; + } + + /* + * Compare output packet when it's deterministic. + * Skip for XDP_DROP (no output) and cold_lru (source IP poisoned). 
+ */ + if (sc->expected_retval != XDP_DROP && !sc->cold_lru) { + if (topts.data_size_out != expected_len[idx] || + memcmp(out, expected_buf[idx], expected_len[idx]) != 0) { + fprintf(stderr, " [%s] FAIL: output packet mismatch\n", sc->name); + print_hex_diff(sc->name, out, topts.data_size_out, expected_buf[idx], + expected_len[idx]); + return false; + } + } + + if (!validate_counters(idx)) + return false; + return true; +} + +static int find_scenario(const char *name) +{ + int i; + + for (i = 0; i < NUM_SCENARIOS; i++) { + if (strcmp(scenarios[i].name, name) == 0) + return i; + } + return -1; +} + +static void xdp_lb_validate(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "benchmark doesn't support consumers\n"); + exit(1); + } + if (bpf_num_possible_cpus() > BENCH_NR_CPUS) { + fprintf(stderr, "too many CPUs (%d > %d), increase BENCH_NR_CPUS\n", + bpf_num_possible_cpus(), BENCH_NR_CPUS); + exit(1); + } +} + +static void xdp_lb_run_once(void *unused __always_unused) +{ + int idx = args.scenario; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = pkt_buf[idx], + .data_size_in = pkt_len[idx], + .repeat = 1, + ); + + bpf_prog_test_run_opts(ctx.prog_fd, &topts); +} + +static void xdp_lb_setup(void) +{ + struct xdp_lb_bench *skel; + int err; + + if (args.scenario < 0) { + fprintf(stderr, "--scenario is required. 
Use --list-scenarios to see options.\n"); + exit(1); + } + + setup_libbpf(); + + skel = xdp_lb_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + err = xdp_lb_bench__load(skel); + if (err) { + fprintf(stderr, "failed to load skeleton: %s\n", strerror(-err)); + xdp_lb_bench__destroy(skel); + exit(1); + } + + ctx.skel = skel; + ctx.prog_fd = bpf_program__fd(skel->progs.xdp_lb_bench); + + build_packet(args.scenario); + build_expected_packet(args.scenario); + + populate_maps(skel); + + BENCH_TIMING_INIT(&ctx.timing, skel, 0); + ctx.timing.machine_readable = args.machine_readable; + + if (scenarios[args.scenario].fixed_batch_iters) { + ctx.timing.batch_iters = scenarios[args.scenario].fixed_batch_iters; + skel->bss->batch_iters = ctx.timing.batch_iters; + } else { + bpf_bench_calibrate(&ctx.timing, xdp_lb_run_once, NULL); + } + + env.duration_sec = 600; + + /* + * Enable cold_lru before validation so LRU miss counters are + * correct. Seed the LRU with one run so the original flow is + * present; validation then sees exactly 1 miss (the poisoned + * flow) regardless of whether calibration ran. 
+ */ + if (scenarios[args.scenario].cold_lru) { + skel->bss->cold_lru = 1; + xdp_lb_run_once(NULL); + } + + reset_stats(bpf_map__fd(skel->maps.stats)); + + if (!validate_scenario(args.scenario)) { + fprintf(stderr, "Validation FAILED - aborting benchmark\n"); + exit(1); + } + + if (scenarios[args.scenario].flow_mask) + skel->bss->flow_mask = scenarios[args.scenario].flow_mask; +} + +static void *xdp_lb_producer(void *input) +{ + while (true) + xdp_lb_run_once(NULL); + + return NULL; +} + +static void xdp_lb_measure(struct bench_res *res) +{ + bpf_bench_timing_measure(&ctx.timing, res); +} + +static void xdp_lb_report_final(struct bench_res res[], int res_cnt) +{ + bpf_bench_timing_report(&ctx.timing, scenarios[args.scenario].name, + scenarios[args.scenario].description); +} + +enum { + ARG_SCENARIO = 9001, + ARG_LIST_SCENARIOS = 9002, + ARG_MACHINE_READABLE = 9003, +}; + +static const struct argp_option opts[] = { + { "scenario", ARG_SCENARIO, "NAME", 0, + "Scenario to benchmark (required)" }, + { "list-scenarios", ARG_LIST_SCENARIOS, NULL, 0, + "List available scenarios and exit" }, + { "machine-readable", ARG_MACHINE_READABLE, NULL, 0, + "Print only a machine-readable RESULT line" }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + int i; + + switch (key) { + case ARG_SCENARIO: + args.scenario = find_scenario(arg); + if (args.scenario < 0) { + fprintf(stderr, "unknown scenario: '%s'\n", arg); + fprintf(stderr, "use --list-scenarios to see options\n"); + argp_usage(state); + } + break; + case ARG_LIST_SCENARIOS: + printf("Available scenarios:\n"); + for (i = 0; i < NUM_SCENARIOS; i++) + printf(" %-20s %s\n", scenarios[i].name, scenarios[i].description); + exit(0); + case ARG_MACHINE_READABLE: + args.machine_readable = true; + env.quiet = true; + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_xdp_lb_argp = { + .options = opts, + .parser = parse_arg, +}; + +const struct bench 
bench_xdp_lb = { + .name = "xdp-lb", + .argp = &bench_xdp_lb_argp, + .validate = xdp_lb_validate, + .setup = xdp_lb_setup, + .producer_thread = xdp_lb_producer, + .measure = xdp_lb_measure, + .report_final = xdp_lb_report_final, +}; -- cgit v1.2.3 From 51312b6360a92e7bccd7b05b028ba2066b093305 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 27 Apr 2026 16:23:04 -0700 Subject: selftests/bpf: Add XDP load-balancer benchmark run script Add a convenience script that runs all 24 XDP load-balancer scenarios and formats the results as a table with median, stddev, and p99 columns. ./benchs/run_bench_xdp_lb.sh Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260427232313.1582588-8-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/benchs/run_bench_xdp_lb.sh | 79 ++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_xdp_lb.sh diff --git a/tools/testing/selftests/bpf/benchs/run_bench_xdp_lb.sh b/tools/testing/selftests/bpf/benchs/run_bench_xdp_lb.sh new file mode 100755 index 000000000000..f65cf46214a3 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_xdp_lb.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source ./benchs/run_common.sh + +set -eufo pipefail + +WARMUP=${WARMUP:-3} + +RUN="sudo ./bench -q -w${WARMUP} -a xdp-lb --machine-readable" + +SEP=" +----------------------------------+----------+---------+----------+" +HDR=" | %-32s | %8s | %7s | %8s |\n" +ROW=" | %-32s | %8s | %7s | %8s |\n" + +function group_header() +{ + printf "%s\n" "$SEP" + printf "$HDR" "$1" "p50" "stddev" "p99" + printf "%s\n" "$SEP" +} + +function rval() +{ + echo "$1" | sed -nE "s/.*$2=([^ ]+).*/\1/p" +} + +function run_scenario() +{ + local sc="$1" + shift + local output rline + + output=$($RUN --scenario "$sc" "$@" 2>&1) || true + rline=$(echo "$output" | grep '^RESULT ' || true) + + if [ -z "$rline" ]; then + printf "$ROW" "$sc" "ERR" "-" "-" 
+ return + fi + + printf "$ROW" "$sc" \ + "$(rval "$rline" median)" \ + "$(rval "$rline" stddev)" \ + "$(rval "$rline" p99)" +} + +header "XDP load-balancer benchmark" + +group_header "Single-flow baseline" +for sc in tcp-v4-lru-hit tcp-v4-ch \ + tcp-v6-lru-hit tcp-v6-ch \ + udp-v4-lru-hit udp-v6-lru-hit \ + tcp-v4v6-lru-hit; do + run_scenario "$sc" +done + +group_header "Diverse flows (4K src addrs)" +for sc in tcp-v4-lru-diverse tcp-v4-ch-diverse \ + tcp-v6-lru-diverse tcp-v6-ch-diverse \ + udp-v4-lru-diverse; do + run_scenario "$sc" +done + +group_header "TCP flags" +run_scenario tcp-v4-syn +run_scenario tcp-v4-rst-miss + +group_header "LRU stress" +run_scenario tcp-v4-lru-miss +run_scenario udp-v4-lru-miss +run_scenario tcp-v4-lru-warmup + +group_header "Early exits" +for sc in pass-v4-no-vip pass-v6-no-vip pass-v4-icmp pass-non-ip drop-v4-frag drop-v4-options \ + drop-v6-frag; do + run_scenario "$sc" +done +printf "%s\n" "$SEP" -- cgit v1.2.3 From 25bb05dd06ccffd209c26465f84851f1fd344c8c Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 8 May 2026 17:57:30 -0700 Subject: selftests/bpf: Use both hrtimer enqueue helpers in vmlinux test The vmlinux selftest triggers nanosleep and checks that both kprobe and fentry programs observe the hrtimer enqueue path. After the hrtimer_start_expires_user() conversion [1], nanosleep reaches hrtimer_start_range_ns_user() instead of hrtimer_start_range_ns(). Hard-coding either symbol makes the test fail either on bpf tree or on linux-next [2]. Update the test to resolve the target symbol at runtime via libbpf_find_vmlinux_btf_id(). This is a nice example of how to modify a BPF program to work on both older and newer kernel revision. 
[1] https://lore.kernel.org/all/20260408114952.062400833@kernel.org/ [2] https://github.com/kernel-patches/bpf/actions/runs/25485909958/job/74782902203 Signed-off-by: Ihor Solodrai Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20260509005730.250956-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/vmlinux.c | 45 ++++++++++++++++++++++-- tools/testing/selftests/bpf/progs/test_vmlinux.c | 4 +-- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/vmlinux.c b/tools/testing/selftests/bpf/prog_tests/vmlinux.c index 6fb2217d940b..b5fdd593910d 100644 --- a/tools/testing/selftests/bpf/prog_tests/vmlinux.c +++ b/tools/testing/selftests/bpf/prog_tests/vmlinux.c @@ -14,21 +14,61 @@ static void nsleep() (void)syscall(__NR_nanosleep, &ts, NULL); } +static const char *hrtimer_func = "hrtimer_start_range_ns"; + +static int setup_hrtimer_progs(struct test_vmlinux *skel) +{ + int err; + + if (libbpf_find_vmlinux_btf_id("hrtimer_start_range_ns_user", BPF_TRACE_FENTRY) > 0) + hrtimer_func = "hrtimer_start_range_ns_user"; + + err = bpf_program__set_attach_target(skel->progs.handle__fentry, 0, hrtimer_func); + if (err) + return err; + + /* + * Bare SEC("kprobe") has no target function, so attach it manually + * later after selecting the hrtimer function to probe. 
+ */ + bpf_program__set_autoattach(skel->progs.handle__kprobe, false); + + return 0; +} + void test_vmlinux(void) { int err; struct test_vmlinux* skel; struct test_vmlinux__bss *bss; + struct bpf_link *kprobe_link = NULL; - skel = test_vmlinux__open_and_load(); - if (!ASSERT_OK_PTR(skel, "test_vmlinux__open_and_load")) + skel = test_vmlinux__open(); + if (!ASSERT_OK_PTR(skel, "test_vmlinux__open")) return; + + err = setup_hrtimer_progs(skel); + if (!ASSERT_OK(err, "setup_hrtimer_progs")) + goto cleanup; + + err = test_vmlinux__load(skel); + if (!ASSERT_OK(err, "test_vmlinux__load")) + goto cleanup; + bss = skel->bss; err = test_vmlinux__attach(skel); if (!ASSERT_OK(err, "test_vmlinux__attach")) goto cleanup; + /* manually attach kprobe with the selected function */ + if (hrtimer_func) { + kprobe_link = bpf_program__attach_kprobe(skel->progs.handle__kprobe, + false /* retprobe */, hrtimer_func); + if (!ASSERT_OK_PTR(kprobe_link, "bpf_program__attach_kprobe")) + goto cleanup; + } + /* trigger everything */ nsleep(); @@ -39,5 +79,6 @@ void test_vmlinux(void) ASSERT_TRUE(bss->fentry_called, "fentry"); cleanup: + bpf_link__destroy(kprobe_link); test_vmlinux__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c b/tools/testing/selftests/bpf/progs/test_vmlinux.c index 78b23934d9f8..eea556940df6 100644 --- a/tools/testing/selftests/bpf/progs/test_vmlinux.c +++ b/tools/testing/selftests/bpf/progs/test_vmlinux.c @@ -69,7 +69,7 @@ int BPF_PROG(handle__tp_btf, struct pt_regs *regs, long id) return 0; } -SEC("kprobe/hrtimer_start_range_ns") +SEC("kprobe") int BPF_KPROBE(handle__kprobe, struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { @@ -78,7 +78,7 @@ int BPF_KPROBE(handle__kprobe, struct hrtimer *timer, ktime_t tim, u64 delta_ns, return 0; } -SEC("fentry/hrtimer_start_range_ns") +SEC("fentry") int BPF_PROG(handle__fentry, struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { -- cgit v1.2.3 
From f28771c0691bcb7f477a0f35550b17b88c32dea8 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:50 +0800 Subject: bpf: Extend BPF syscall with common attributes support Add generic BPF syscall support for passing common attributes. The initial set of common attributes includes: 1. 'log_buf': User-provided buffer for storing logs. 2. 'log_size': Size of the log buffer. 3. 'log_level': Log verbosity level. 4. 'log_true_size': Actual log size reported by kernel. The common-attribute pointer and its size are passed as the 4th and 5th syscall arguments. A new command bit, 'BPF_COMMON_ATTRS' ('1 << 16'), indicates that common attributes are supplied. This commit adds syscall and uapi plumbing. Command-specific handling is added in follow-up patches. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/syscalls.h | 3 ++- include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/syscall.c | 25 +++++++++++++++++++++---- tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f5639d5ac331..50055ab73649 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -936,7 +936,8 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); -asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size); +asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size, + struct bpf_common_attr __user *attr_common, unsigned int size_common); asmlinkage long sys_execveat(int dfd, const char __user *filename, const char __user *const __user *argv, const char __user *const __user *envp, int flags); diff --git a/include/uapi/linux/bpf.h 
b/include/uapi/linux/bpf.h index 552bc5d9afbd..aec171ccb6ef 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -994,6 +994,7 @@ enum bpf_cmd { BPF_PROG_STREAM_READ_BY_FD, BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, + BPF_COMMON_ATTRS = 1 << 16, /* Indicate carrying syscall common attrs. */ }; enum bpf_map_type { @@ -1500,6 +1501,13 @@ struct bpf_stack_build_id { }; }; +struct bpf_common_attr { + __aligned_u64 log_buf; + __u32 log_size; + __u32 log_level; + __u32 log_true_size; +}; + #define BPF_OBJ_NAME_LEN 16U enum { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3b1f0ba02f61..354f6f471a08 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6211,8 +6211,10 @@ put_prog: return ret; } -static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) +static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, + bpfptr_t uattr_common, unsigned int size_common) { + struct bpf_common_attr attr_common; union bpf_attr attr; int err; @@ -6226,6 +6228,20 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; + memset(&attr_common, 0, sizeof(attr_common)); + if (cmd & BPF_COMMON_ATTRS) { + err = bpf_check_uarg_tail_zero(uattr_common, sizeof(attr_common), size_common); + if (err) + return err; + + cmd &= ~BPF_COMMON_ATTRS; + size_common = min_t(u32, size_common, sizeof(attr_common)); + if (copy_from_bpfptr(&attr_common, uattr_common, size_common) != 0) + return -EFAULT; + } else { + size_common = 0; + } + err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; @@ -6361,9 +6377,10 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) return err; } -SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +SYSCALL_DEFINE5(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size, + struct bpf_common_attr __user *, uattr_common, unsigned int, 
size_common) { - return __sys_bpf(cmd, USER_BPFPTR(uattr), size); + return __sys_bpf(cmd, USER_BPFPTR(uattr), size, USER_BPFPTR(uattr_common), size_common); } static bool syscall_prog_is_valid_access(int off, int size, @@ -6393,7 +6410,7 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) default: return -EINVAL; } - return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); + return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size, KERNEL_BPFPTR(NULL), 0); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 677be9a47347..37142e6d911a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -994,6 +994,7 @@ enum bpf_cmd { BPF_PROG_STREAM_READ_BY_FD, BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, + BPF_COMMON_ATTRS = 1 << 16, /* Indicate carrying syscall common attrs. */ }; enum bpf_map_type { @@ -1500,6 +1501,13 @@ struct bpf_stack_build_id { }; }; +struct bpf_common_attr { + __aligned_u64 log_buf; + __u32 log_size; + __u32 log_level; + __u32 log_true_size; +}; + #define BPF_OBJ_NAME_LEN 16U enum { -- cgit v1.2.3 From b1bff40809429bcf80c201255a2bcdf1c5eec06e Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:51 +0800 Subject: libbpf: Add support for extended BPF syscall To support the extended BPF syscall introduced in the previous commit, introduce the following internal APIs: * 'sys_bpf_ext()' * 'sys_bpf_ext_fd()' They wrap the raw 'syscall()' interface to support passing extended attributes. * 'probe_sys_bpf_ext()' Check whether current kernel supports the BPF syscall common attributes. 
Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 36 ++++++++++++++++++++++++++++++++++++ tools/lib/bpf/features.c | 8 ++++++++ tools/lib/bpf/libbpf_internal.h | 3 +++ 3 files changed, 47 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 5846de364209..9d8740761b7a 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -69,6 +69,42 @@ static inline __u64 ptr_to_u64(const void *ptr) return (__u64) (unsigned long) ptr; } +static inline int sys_bpf_ext(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size, + struct bpf_common_attr *attr_common, + unsigned int size_common) +{ + cmd = attr_common ? (cmd | BPF_COMMON_ATTRS) : (cmd & ~BPF_COMMON_ATTRS); + return syscall(__NR_bpf, cmd, attr, size, attr_common, size_common); +} + +static inline int sys_bpf_ext_fd(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size, + struct bpf_common_attr *attr_common, + unsigned int size_common) +{ + int fd; + + fd = sys_bpf_ext(cmd, attr, size, attr_common, size_common); + return ensure_good_fd(fd); +} + +int probe_sys_bpf_ext(void) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); + union bpf_attr attr; + int fd; + + memset(&attr, 0, attr_sz); + fd = syscall(__NR_bpf, BPF_PROG_LOAD | BPF_COMMON_ATTRS, &attr, attr_sz, NULL, + sizeof(struct bpf_common_attr)); + if (fd >= 0) { + close(fd); + return -EINVAL; + } + return errno == EFAULT ? 
1 : 0; +} + static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size) { diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index 4f19a0d79b0c..b7e388f99d0b 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -615,6 +615,11 @@ static int probe_kern_btf_layout(int token_fd) (char *)layout, token_fd)); } +static int probe_bpf_syscall_common_attrs(int token_fd) +{ + return probe_sys_bpf_ext(); +} + typedef int (*feature_probe_fn)(int /* token_fd */); static struct kern_feature_cache feature_cache; @@ -699,6 +704,9 @@ static struct kern_feature_desc { [FEAT_BTF_LAYOUT] = { "kernel supports BTF layout", probe_kern_btf_layout, }, + [FEAT_BPF_SYSCALL_COMMON_ATTRS] = { + "BPF syscall common attributes support", probe_bpf_syscall_common_attrs, + }, }; bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id) diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 3781c45b46d3..7d93c6c01d60 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -398,6 +398,8 @@ enum kern_feature_id { FEAT_UPROBE_SYSCALL, /* Kernel supports BTF layout information */ FEAT_BTF_LAYOUT, + /* Kernel supports BPF syscall common attributes */ + FEAT_BPF_SYSCALL_COMMON_ATTRS, __FEAT_CNT, }; @@ -768,4 +770,5 @@ int probe_fd(int fd); #define SHA256_DWORD_SIZE SHA256_DIGEST_LENGTH / sizeof(__u64) void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]); +int probe_sys_bpf_ext(void); #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ -- cgit v1.2.3 From 503c039ffeca7530ce9d6446a07b4bb776180b45 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:52 +0800 Subject: bpf: Refactor reporting log_true_size for prog_load The next commit will add support for reporting logs via extended common attributes, including 'log_true_size'. 
To prepare for that, refactor the 'log_true_size' reporting logic by introducing a new struct bpf_log_attr to encapsulate log-related behavior: * bpf_log_attr_init(): initialize log fields, which will support extended common attributes in the next commit. * bpf_log_attr_finalize(): handle log finalization and write back 'log_true_size' to userspace. Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-4-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 +++- include/linux/bpf_verifier.h | 12 ++++++++++++ kernel/bpf/log.c | 29 +++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 12 +++++++++--- kernel/bpf/verifier.c | 17 ++++------------- 5 files changed, 57 insertions(+), 17 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 14759972f148..9e16e91647d3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2919,7 +2919,9 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); +struct bpf_log_attr; +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, + struct bpf_log_attr *attr_log); #ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 976e2b2f40e8..8d27ad1f9f94 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -755,6 +755,18 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) return log && log->level; } +struct bpf_log_attr { + char __user *ubuf; + u32 size; + u32 level; + u32 offsetof_true_size; + bpfptr_t uattr; +}; + +int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, + u32 offsetof_log_true_size, bpfptr_t uattr); +int 
bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log); + #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_arg_info { diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 64566b86dd27..1b1efe75398b 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -825,3 +825,32 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st } print_verifier_state(env, vstate, frameno, false); } + +int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, + u32 offsetof_log_true_size, bpfptr_t uattr) +{ + char __user *ubuf = u64_to_user_ptr(log_buf); + + memset(log, 0, sizeof(*log)); + log->ubuf = ubuf; + log->size = log_size; + log->level = log_level; + log->offsetof_true_size = offsetof_log_true_size; + log->uattr = uattr; + return 0; +} + +int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log) +{ + u32 log_true_size; + int err; + + err = bpf_vlog_finalize(log, &log_true_size); + + if (attr->offsetof_true_size && + copy_to_bpfptr_offset(attr->uattr, attr->offsetof_true_size, &log_true_size, + sizeof(log_true_size))) + return -EFAULT; + + return err; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 354f6f471a08..70b78ddcdedb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2861,7 +2861,7 @@ static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id -static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) +static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; @@ -3079,7 +3079,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) goto free_prog_sec; /* run eBPF verifier */ - err = bpf_check(&prog, attr, uattr, uattr_size); + err = bpf_check(&prog, attr, 
uattr, attr_log); if (err < 0) goto free_used_maps; @@ -6215,6 +6215,8 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, bpfptr_t uattr_common, unsigned int size_common) { struct bpf_common_attr attr_common; + u32 offsetof_log_true_size = 0; + struct bpf_log_attr attr_log; union bpf_attr attr; int err; @@ -6266,7 +6268,11 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, err = map_freeze(&attr); break; case BPF_PROG_LOAD: - err = bpf_prog_load(&attr, uattr, size); + if (size >= offsetofend(union bpf_attr, log_true_size)) + offsetof_log_true_size = offsetof(union bpf_attr, log_true_size); + err = bpf_log_attr_init(&attr_log, attr.log_buf, attr.log_size, attr.log_level, + offsetof_log_true_size, uattr); + err = err ?: bpf_prog_load(&attr, uattr, &attr_log); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 11054ad89c14..0e654ef01ae0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19294,12 +19294,12 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, + struct bpf_log_attr *attr_log) { u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; int i, len, ret = -EINVAL, err; - u32 log_true_size; bool is_priv; BTF_TYPE_EMIT(enum bpf_features); @@ -19346,9 +19346,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ - ret = bpf_vlog_init(&env->log, attr->log_level, - (char __user *) (unsigned long) attr->log_buf, - attr->log_size); + ret = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size); if (ret) goto err_unlock; @@ -19510,17 +19508,10 @@ skip_full_check: 
env->prog->aux->verified_insns = env->insn_processed; /* preserve original error even if log finalization is successful */ - err = bpf_vlog_finalize(&env->log, &log_true_size); + err = bpf_log_attr_finalize(attr_log, &env->log); if (err) ret = err; - if (uattr_size >= offsetofend(union bpf_attr, log_true_size) && - copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size), - &log_true_size, sizeof(log_true_size))) { - ret = -EFAULT; - goto err_release_maps; - } - if (ret) goto err_release_maps; -- cgit v1.2.3 From ac89d33fdd8183df39fe92ffa525be7af6feb9d1 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:53 +0800 Subject: bpf: Add syscall common attributes support for prog_load BPF_PROG_LOAD can now take log parameters from both union bpf_attr and struct bpf_common_attr. The merge rules are: - if both sides provide a complete log tuple (buf/size/level) and they match, use it; - if only one side provides log parameters, use that one; - if both sides provide complete tuples but they differ, return -EINVAL. 
Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-5-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 ++- kernel/bpf/log.c | 34 +++++++++++++++++++++++++++------- kernel/bpf/syscall.c | 3 ++- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8d27ad1f9f94..8433430dedb7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -764,7 +764,8 @@ struct bpf_log_attr { }; int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, - u32 offsetof_log_true_size, bpfptr_t uattr); + u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common, + bpfptr_t uattr_common, u32 size_common); int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log); #define BPF_MAX_SUBPROGS 256 diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 1b1efe75398b..fd12ad5a0338 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -13,17 +13,17 @@ #define verbose(env, fmt, args...) 
bpf_verifier_log_write(env, fmt, ##args) -static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +static bool bpf_verifier_log_attr_valid(u32 log_level, char __user *log_buf, u32 log_size) { /* ubuf and len_total should both be specified (or not) together */ - if (!!log->ubuf != !!log->len_total) + if (!!log_buf != !!log_size) return false; /* log buf without log_level is meaningless */ - if (log->ubuf && log->level == 0) + if (log_buf && log_level == 0) return false; - if (log->level & ~BPF_LOG_MASK) + if (log_level & ~BPF_LOG_MASK) return false; - if (log->len_total > UINT_MAX >> 2) + if (log_size > UINT_MAX >> 2) return false; return true; } @@ -36,7 +36,7 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, log->len_total = log_size; /* log attributes have to be sane */ - if (!bpf_verifier_log_attr_valid(log)) + if (!bpf_verifier_log_attr_valid(log_level, log_buf, log_size)) return -EINVAL; return 0; @@ -827,16 +827,36 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st } int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, - u32 offsetof_log_true_size, bpfptr_t uattr) + u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common, + bpfptr_t uattr_common, u32 size_common) { + char __user *ubuf_common = u64_to_user_ptr(common->log_buf); char __user *ubuf = u64_to_user_ptr(log_buf); + if (!bpf_verifier_log_attr_valid(common->log_level, ubuf_common, common->log_size) || + !bpf_verifier_log_attr_valid(log_level, ubuf, log_size)) + return -EINVAL; + + if (ubuf && ubuf_common && (ubuf != ubuf_common || log_size != common->log_size || + log_level != common->log_level)) + return -EINVAL; + memset(log, 0, sizeof(*log)); log->ubuf = ubuf; log->size = log_size; log->level = log_level; log->offsetof_true_size = offsetof_log_true_size; log->uattr = uattr; + + if (!ubuf && ubuf_common) { + log->ubuf = ubuf_common; + log->size = common->log_size; + log->level 
= common->log_level; + log->uattr = uattr_common; + log->offsetof_true_size = 0; + if (size_common >= offsetofend(struct bpf_common_attr, log_true_size)) + log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size); + } return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 70b78ddcdedb..db893cae826c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6271,7 +6271,8 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, if (size >= offsetofend(union bpf_attr, log_true_size)) offsetof_log_true_size = offsetof(union bpf_attr, log_true_size); err = bpf_log_attr_init(&attr_log, attr.log_buf, attr.log_size, attr.log_level, - offsetof_log_true_size, uattr); + offsetof_log_true_size, uattr, &attr_common, uattr_common, + size_common); err = err ?: bpf_prog_load(&attr, uattr, &attr_log); break; case BPF_OBJ_PIN: -- cgit v1.2.3 From ceeb7eda94a3548958b30818495ef7eb12898727 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:54 +0800 Subject: bpf: Add syscall common attributes support for btf_load BPF_BTF_LOAD can now take log parameters from both union bpf_attr and struct bpf_common_attr, with the same merge rules as BPF_PROG_LOAD: - if both sides provide a complete log tuple (buf/size/level) and they match, use it; - if only one side provides log parameters, use that one; - if both sides provide complete tuples but they differ, return -EINVAL. 
Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-6-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 3 ++- kernel/bpf/btf.c | 30 +++++++----------------------- kernel/bpf/syscall.c | 11 ++++++++--- 3 files changed, 17 insertions(+), 27 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index c82d0d689059..240401d9b25b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -145,7 +145,8 @@ const char *btf_get_name(const struct btf *btf); void btf_get(struct btf *btf); void btf_put(struct btf *btf); const struct btf_header *btf_header(const struct btf *btf); -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz); +struct bpf_log_attr; +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log); struct btf *btf_get_by_fd(int fd); int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 77af44d8a3ad..a6bf4781943c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5907,25 +5907,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env, return 0; } -static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size) -{ - u32 log_true_size; - int err; - - err = bpf_vlog_finalize(log, &log_true_size); - - if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && - copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), - &log_true_size, sizeof(log_true_size))) - err = -EFAULT; - - return err; -} - -static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) +static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, + struct bpf_log_attr *attr_log) { bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel); - char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf); struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = 
NULL; struct btf *btf = NULL; @@ -5942,8 +5927,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ - err = bpf_vlog_init(&env->log, attr->btf_log_level, - log_ubuf, attr->btf_log_size); + err = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size); if (err) goto errout_free; @@ -6008,7 +5992,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat } } - err = finalize_log(&env->log, uattr, uattr_size); + err = bpf_log_attr_finalize(attr_log, &env->log); if (err) goto errout_free; @@ -6020,7 +6004,7 @@ errout_meta: btf_free_struct_meta_tab(btf); errout: /* overwrite err with -ENOSPC or -EFAULT */ - ret = finalize_log(&env->log, uattr, uattr_size); + ret = bpf_log_attr_finalize(attr_log, &env->log); if (ret) err = ret; errout_free: @@ -8189,12 +8173,12 @@ static int __btf_new_fd(struct btf *btf) return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); } -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { struct btf *btf; int ret; - btf = btf_parse(attr, uattr, uattr_size); + btf = btf_parse(attr, uattr, attr_log); if (IS_ERR(btf)) return PTR_ERR(btf); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index db893cae826c..2fa05ba8f161 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5474,7 +5474,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd -static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) +static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { struct bpf_token *token = NULL; @@ -5501,7 +5501,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_ 
bpf_token_put(token); - return btf_new_fd(attr, uattr, uattr_size); + return btf_new_fd(attr, uattr, attr_log); } #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd @@ -6318,7 +6318,12 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, err = bpf_raw_tracepoint_open(&attr); break; case BPF_BTF_LOAD: - err = bpf_btf_load(&attr, uattr, size); + if (size >= offsetofend(union bpf_attr, btf_log_true_size)) + offsetof_log_true_size = offsetof(union bpf_attr, btf_log_true_size); + err = bpf_log_attr_init(&attr_log, attr.btf_log_buf, attr.btf_log_size, + attr.btf_log_level, offsetof_log_true_size, uattr, + &attr_common, uattr_common, size_common); + err = err ?: bpf_btf_load(&attr, uattr, &attr_log); break; case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); -- cgit v1.2.3 From 49f9b2b2a18c5ce06b21fc2b3399352d80dee0c6 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:55 +0800 Subject: bpf: Add syscall common attributes support for map_create Many BPF_MAP_CREATE validation failures currently return -EINVAL without any explanation to userspace. Plumb common syscall log attributes into map_create(), create a verifier log from bpf_common_attr::log_buf/log_size/log_level, and report map-creation failure reasons through that buffer. This improves debuggability by allowing userspace to inspect why map creation failed and read back log_true_size from common attributes. 
Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-7-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 ++ kernel/bpf/log.c | 29 +++++++++++++++++++ kernel/bpf/syscall.c | 66 +++++++++++++++++++++++++++++++++++++------- 3 files changed, 88 insertions(+), 10 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8433430dedb7..c15a4c26a43b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -766,6 +766,9 @@ struct bpf_log_attr { int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common, bpfptr_t uattr_common, u32 size_common); +struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *attr_log, + struct bpf_common_attr *common, bpfptr_t uattr, + u32 size); int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log); #define BPF_MAX_SUBPROGS 256 diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index fd12ad5a0338..62fe6ed18374 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -860,6 +860,35 @@ int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 l return 0; } +struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *attr_log, + struct bpf_common_attr *common, bpfptr_t uattr, + u32 size) +{ + struct bpf_verifier_log *log; + int err; + + memset(attr_log, 0, sizeof(*attr_log)); + attr_log->uattr = uattr; + if (size >= offsetofend(struct bpf_common_attr, log_true_size)) + attr_log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size); + + if (!size) + return NULL; + + log = kzalloc_obj(*log, GFP_KERNEL); + if (!log) + return ERR_PTR(-ENOMEM); + + err = bpf_vlog_init(log, common->log_level, u64_to_user_ptr(common->log_buf), + common->log_size); + if (err) { + kfree(log); + return ERR_PTR(err); + } + + return log; +} + int 
bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log) { u32 log_true_size; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2fa05ba8f161..6600e126fbfb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1359,7 +1359,7 @@ free_map_tab: #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int map_create(union bpf_attr *attr, bpfptr_t uattr) +static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1371,8 +1371,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) int err; err = CHECK_ATTR(BPF_MAP_CREATE); - if (err) + if (err) { + bpf_log(log, "Invalid attr.\n"); return -EINVAL; + } /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it * to avoid per-map type checks tripping on unknown flag @@ -1381,17 +1383,25 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) attr->map_flags &= ~BPF_F_TOKEN_FD; if (attr->btf_vmlinux_value_type_id) { - if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || - attr->btf_key_type_id || attr->btf_value_type_id) + if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS) { + bpf_log(log, "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n"); return -EINVAL; + } + if (attr->btf_key_type_id || attr->btf_value_type_id) { + bpf_log(log, "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n"); + return -EINVAL; + } } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { + bpf_log(log, "Invalid btf_value_type_id.\n"); return -EINVAL; } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && attr->map_type != BPF_MAP_TYPE_ARENA && - attr->map_extra != 0) + attr->map_extra != 0) { + bpf_log(log, "Invalid map_extra.\n"); return -EINVAL; + } f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) @@ -1399,13 +1409,17 @@ static int map_create(union bpf_attr *attr, bpfptr_t 
uattr) if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || - !node_online(numa_node))) + !node_online(numa_node))) { + bpf_log(log, "Invalid numa_node.\n"); return -EINVAL; + } /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ map_type = attr->map_type; - if (map_type >= ARRAY_SIZE(bpf_map_types)) + if (map_type >= ARRAY_SIZE(bpf_map_types)) { + bpf_log(log, "Invalid map_type.\n"); return -EINVAL; + } map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); ops = bpf_map_types[map_type]; if (!ops) @@ -1423,8 +1437,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) if (token_flag) { token = bpf_token_get_from_fd(attr->map_token_fd); - if (IS_ERR(token)) + if (IS_ERR(token)) { + bpf_log(log, "Invalid map_token_fd.\n"); return PTR_ERR(token); + } /* if current token doesn't grant map creation permissions, * then we can't use this token, so ignore it and rely on @@ -1507,8 +1523,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); - if (err < 0) + if (err < 0) { + bpf_log(log, "Invalid map_name.\n"); goto free_map; + } preempt_disable(); map->cookie = gen_cookie_next(&bpf_map_cookie); @@ -1531,6 +1549,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { + bpf_log(log, "Invalid btf_fd.\n"); err = PTR_ERR(btf); goto free_map; } @@ -1558,6 +1577,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { + bpf_log(log, "Invalid excl_prog_hash_size.\n"); err = -EINVAL; goto free_map; } @@ -1573,6 +1593,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) goto free_map; } } else if (attr->excl_prog_hash_size) { + bpf_log(log, "Invalid excl_prog_hash_size.\n"); err = -EINVAL; goto free_map; } @@ -1611,6 +1632,31 
@@ put_token: return err; } +static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common, + bpfptr_t uattr_common, u32 size_common) +{ + struct bpf_verifier_log *log; + struct bpf_log_attr attr_log; + int err, ret; + + log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common); + if (IS_ERR(log)) + return PTR_ERR(log); + + err = __map_create(attr, uattr, log); + + /* preserve original error even if log finalization is successful */ + ret = bpf_log_attr_finalize(&attr_log, log); + if (ret) { + if (err >= 0) + close_fd(err); + err = ret; + } + + kfree(log); + return err; +} + void bpf_map_inc(struct bpf_map *map) { atomic64_inc(&map->refcnt); @@ -6250,7 +6296,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr, uattr); + err = map_create(&attr, uattr, &attr_common, uattr_common, size_common); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); -- cgit v1.2.3 From 702259006f9303c8773f99a06d1b698f05f082ac Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:56 +0800 Subject: libbpf: Add syscall common attributes support for map_create With the previous commit adding common attribute support for BPF_MAP_CREATE, users can now retrieve detailed error messages when map creation fails via the log_buf field. Introduce struct bpf_log_opts with the following fields: log_buf, log_size, log_level, and log_true_size. Extend bpf_map_create_opts with a new field log_opts, allowing users to capture and inspect log messages on map creation failures. 
Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-8-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 16 +++++++++++++++- tools/lib/bpf/bpf.h | 17 ++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 9d8740761b7a..483c02cf21d1 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -209,6 +209,9 @@ int bpf_map_create(enum bpf_map_type map_type, const struct bpf_map_create_opts *opts) { const size_t attr_sz = offsetofend(union bpf_attr, excl_prog_hash_size); + const size_t attr_common_sz = sizeof(struct bpf_common_attr); + struct bpf_common_attr attr_common; + struct bpf_log_opts *log_opts; union bpf_attr attr; int fd; @@ -242,7 +245,18 @@ int bpf_map_create(enum bpf_map_type map_type, attr.excl_prog_hash = ptr_to_u64(OPTS_GET(opts, excl_prog_hash, NULL)); attr.excl_prog_hash_size = OPTS_GET(opts, excl_prog_hash_size, 0); - fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); + log_opts = OPTS_GET(opts, log_opts, NULL); + if (log_opts && feat_supported(NULL, FEAT_BPF_SYSCALL_COMMON_ATTRS)) { + memset(&attr_common, 0, attr_common_sz); + attr_common.log_buf = ptr_to_u64(OPTS_GET(log_opts, buf, NULL)); + attr_common.log_size = OPTS_GET(log_opts, size, 0); + attr_common.log_level = OPTS_GET(log_opts, level, 0); + fd = sys_bpf_ext_fd(BPF_MAP_CREATE, &attr, attr_sz, &attr_common, attr_common_sz); + OPTS_SET(log_opts, true_size, attr_common.log_true_size); + } else { + fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); + OPTS_SET(log_opts, true_size, 0); + } return libbpf_err_errno(fd); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 2c8e88ddb674..2312900a3263 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -37,6 +37,18 @@ extern "C" { LIBBPF_API int libbpf_set_memlock_rlim(size_t memlock_bytes); +struct bpf_log_opts { + size_t sz; /* size of this struct for forward/backward 
compatibility */ + + char *buf; + __u32 size; + __u32 level; + __u32 true_size; /* out parameter set by kernel */ + + size_t :0; +}; +#define bpf_log_opts__last_field true_size + struct bpf_map_create_opts { size_t sz; /* size of this struct for forward/backward compatibility */ @@ -57,9 +69,12 @@ struct bpf_map_create_opts { const void *excl_prog_hash; __u32 excl_prog_hash_size; + + struct bpf_log_opts *log_opts; + size_t :0; }; -#define bpf_map_create_opts__last_field excl_prog_hash_size +#define bpf_map_create_opts__last_field log_opts LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, const char *map_name, -- cgit v1.2.3 From f675483cac1d762e11f134be1bbd80f876bf2e2f Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:57 +0800 Subject: selftests/bpf: Add tests to verify map create failure log Add tests to verify that the kernel reports the expected error messages and correct log_true_size when map creation fails. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-9-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/map_init.c | 166 ++++++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c index 14a31109dd0e..5c61c8e37306 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_init.c +++ b/tools/testing/selftests/bpf/prog_tests/map_init.c @@ -212,3 +212,169 @@ void test_map_init(void) if (test__start_subtest("pcpu_lru_map_init")) test_pcpu_lru_map_init(); } + +static void test_map_create(enum bpf_map_type map_type, const char *map_name, + struct bpf_map_create_opts *opts, const char *exp_msg) +{ + const int key_size = 4, value_size = 4, max_entries = 1; + char log_buf[128]; + int fd; + LIBBPF_OPTS(bpf_log_opts, log_opts); + + log_buf[0] = '\0'; + log_opts.buf = log_buf; + log_opts.size = sizeof(log_buf); + log_opts.level = 1; + opts->log_opts = 
&log_opts; + fd = bpf_map_create(map_type, map_name, key_size, value_size, max_entries, opts); + if (!ASSERT_LT(fd, 0, "bpf_map_create")) { + close(fd); + return; + } + + ASSERT_STREQ(log_buf, exp_msg, "log_buf"); + ASSERT_EQ(log_opts.true_size, strlen(exp_msg) + 1, "true_size"); +} + +static void test_map_create_array(struct bpf_map_create_opts *opts, const char *exp_msg) +{ + test_map_create(BPF_MAP_TYPE_ARRAY, "test_map_create", opts, exp_msg); +} + +static void test_invalid_vmlinux_value_type_id_struct_ops(void) +{ + const char *msg = "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .btf_vmlinux_value_type_id = 1, + ); + + test_map_create_array(&opts, msg); +} + +static void test_invalid_vmlinux_value_type_id_kv_type_id(void) +{ + const char *msg = "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .btf_vmlinux_value_type_id = 1, + .btf_key_type_id = 1, + ); + + test_map_create(BPF_MAP_TYPE_STRUCT_OPS, "test_map_create", &opts, msg); +} + +static void test_invalid_value_type_id(void) +{ + const char *msg = "Invalid btf_value_type_id.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .btf_key_type_id = 1, + ); + + test_map_create_array(&opts, msg); +} + +static void test_invalid_map_extra(void) +{ + const char *msg = "Invalid map_extra.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_extra = 1, + ); + + test_map_create_array(&opts, msg); +} + +static void test_invalid_numa_node(void) +{ + const char *msg = "Invalid numa_node.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = BPF_F_NUMA_NODE, + .numa_node = 0xFF, + ); + + test_map_create_array(&opts, msg); +} + +static void test_invalid_map_type(void) +{ + const char *msg = "Invalid map_type.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts); + + test_map_create(__MAX_BPF_MAP_TYPE, "test_map_create", &opts, msg); +} + +static void 
test_invalid_token_fd(void) +{ + const char *msg = "Invalid map_token_fd.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = BPF_F_TOKEN_FD, + .token_fd = 0xFF, + ); + + test_map_create_array(&opts, msg); +} + +static void test_invalid_map_name(void) +{ + const char *msg = "Invalid map_name.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts); + + test_map_create(BPF_MAP_TYPE_ARRAY, "test-!@#", &opts, msg); +} + +static void test_invalid_btf_fd(void) +{ + const char *msg = "Invalid btf_fd.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .btf_fd = -1, + .btf_key_type_id = 1, + .btf_value_type_id = 1, + ); + + test_map_create_array(&opts, msg); +} + +static void test_excl_prog_hash_size_1(void) +{ + const char *msg = "Invalid excl_prog_hash_size.\n"; + const char *hash = "DEADCODE"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .excl_prog_hash = hash, + ); + + test_map_create_array(&opts, msg); +} + +static void test_excl_prog_hash_size_2(void) +{ + const char *msg = "Invalid excl_prog_hash_size.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .excl_prog_hash_size = 1, + ); + + test_map_create_array(&opts, msg); +} + +void test_map_create_failure(void) +{ + if (test__start_subtest("invalid_vmlinux_value_type_id_struct_ops")) + test_invalid_vmlinux_value_type_id_struct_ops(); + if (test__start_subtest("invalid_vmlinux_value_type_id_kv_type_id")) + test_invalid_vmlinux_value_type_id_kv_type_id(); + if (test__start_subtest("invalid_value_type_id")) + test_invalid_value_type_id(); + if (test__start_subtest("invalid_map_extra")) + test_invalid_map_extra(); + if (test__start_subtest("invalid_numa_node")) + test_invalid_numa_node(); + if (test__start_subtest("invalid_map_type")) + test_invalid_map_type(); + if (test__start_subtest("invalid_token_fd")) + test_invalid_token_fd(); + if (test__start_subtest("invalid_map_name")) + test_invalid_map_name(); + if (test__start_subtest("invalid_btf_fd")) + test_invalid_btf_fd(); + if 
(test__start_subtest("invalid_excl_prog_hash_size_1")) + test_excl_prog_hash_size_1(); + if (test__start_subtest("invalid_excl_prog_hash_size_2")) + test_excl_prog_hash_size_2(); +} -- cgit v1.2.3 From ede2dc5c6b571ce6d3aacf5a81933f8c5d5e6c7d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:49:54 -0700 Subject: bpf: Convert bpf_get_spilled_reg macro to static inline function Convert the bpf_get_spilled_reg() macro to a static inline function for better type safety and readability. This also simplifies the macro definition in preparation for upcoming stack argument support which will introduce additional macros. No functional change. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513044954.2382693-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c15a4c26a43b..203fb751eeae 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -552,10 +552,14 @@ struct bpf_verifier_state { u32 may_goto_depth; }; -#define bpf_get_spilled_reg(slot, frame, mask) \ - (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ - ((1 << frame->stack[slot].slot_type[BPF_REG_SIZE - 1]) & (mask))) \ - ? &frame->stack[slot].spilled_ptr : NULL) +static inline struct bpf_reg_state * +bpf_get_spilled_reg(int slot, struct bpf_func_state *frame, u32 mask) +{ + if (slot < frame->allocated_stack / BPF_REG_SIZE && + (1 << frame->stack[slot].slot_type[BPF_REG_SIZE - 1]) & mask) + return &frame->stack[slot].spilled_ptr; + return NULL; +} /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. 
*/ #define bpf_for_each_spilled_reg(iter, frame, reg, mask) \ -- cgit v1.2.3 From 1cb229a54af1e36f19f7e8359692fa0d76fbc360 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:00 -0700 Subject: bpf: Remove copy_register_state wrapper function Remove the copy_register_state() helper which was just a plain struct assignment wrapper and replace all call sites with direct struct assignment. This simplifies the code in preparation for upcoming stack argument support. No functional change. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045000.2382933-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e654ef01ae0..1dd9736c2a13 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3410,12 +3410,6 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, src_reg->id = ++env->id_gen; } -/* Copy src state preserving dst->parent and dst->live fields */ -static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) -{ - *dst = *src; -} - static void save_register_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, @@ -3423,7 +3417,7 @@ static void save_register_state(struct bpf_verifier_env *env, { int i; - copy_register_state(&state->stack[spi].spilled_ptr, reg); + state->stack[spi].spilled_ptr = *reg; for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--) state->stack[spi].slot_type[i - 1] = STACK_SPILL; @@ -3822,7 +3816,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * with the destination register on fill. 
*/ assign_scalar_id_before_mov(env, reg); - copy_register_state(&state->regs[dst_regno], reg); + state->regs[dst_regno] = *reg; state->regs[dst_regno].subreg_def = subreg_def; /* Break the relation on a narrowing fill. @@ -3877,7 +3871,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * with the destination register on fill. */ assign_scalar_id_before_mov(env, reg); - copy_register_state(&state->regs[dst_regno], reg); + state->regs[dst_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions @@ -6031,7 +6025,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b size); return -EACCES; } - copy_register_state(&regs[value_regno], reg); + regs[value_regno] = *reg; add_scalar_to_reg(&regs[value_regno], off); regs[value_regno].type = PTR_TO_INSN; } else { @@ -13248,7 +13242,7 @@ do_sim: */ if (!ptr_is_dst_reg) { tmp = *dst_reg; - copy_register_state(dst_reg, ptr_reg); + *dst_reg = *ptr_reg; } err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); if (err < 0) @@ -14698,7 +14692,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * copy register state to dest reg */ assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* case: R1 = (s8, s16 s32)R2 */ @@ -14713,7 +14707,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; if (!no_sext) clear_scalar_id(dst_reg); coerce_reg_to_size_sx(dst_reg, insn->off >> 3); @@ -14735,7 +14729,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (is_src_reg_u32) 
assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; /* Make sure ID is cleared if src_reg is not in u32 * range otherwise dst_reg min/max could be incorrectly * propagated into src_reg by sync_linked_regs() @@ -14749,7 +14743,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (no_sext) assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; if (!no_sext) clear_scalar_id(dst_reg); dst_reg->subreg_def = env->insn_idx + 1; @@ -15629,7 +15623,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s reg->delta == known_reg->delta) { s32 saved_subreg_def = reg->subreg_def; - copy_register_state(reg, known_reg); + *reg = *known_reg; reg->subreg_def = saved_subreg_def; } else { s32 saved_subreg_def = reg->subreg_def; @@ -15640,7 +15634,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s __mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta); /* reg = known_reg; reg += delta */ - copy_register_state(reg, known_reg); + *reg = *known_reg; /* * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. 
@@ -15743,10 +15737,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - copy_register_state(&env->false_reg1, dst_reg); - copy_register_state(&env->false_reg2, src_reg); - copy_register_state(&env->true_reg1, dst_reg); - copy_register_state(&env->true_reg2, src_reg); + env->false_reg1 = *dst_reg; + env->false_reg2 = *src_reg; + env->true_reg1 = *dst_reg; + env->true_reg2 = *src_reg; pred = is_branch_taken(env, dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because @@ -15815,11 +15809,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; - copy_register_state(dst_reg, &env->false_reg1); - copy_register_state(src_reg, &env->false_reg2); - copy_register_state(&other_branch_regs[insn->dst_reg], &env->true_reg1); + *dst_reg = env->false_reg1; + *src_reg = env->false_reg2; + other_branch_regs[insn->dst_reg] = env->true_reg1; if (BPF_SRC(insn->code) == BPF_X) - copy_register_state(&other_branch_regs[insn->src_reg], &env->true_reg2); + other_branch_regs[insn->src_reg] = env->true_reg2; if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && -- cgit v1.2.3 From 78bbe61632f11b1091c03259f92b6559489222ae Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:05 -0700 Subject: bpf: Add helper functions for r11-based stack argument insns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add three static inline helper functions — is_stack_arg_ldx(), is_stack_arg_st(), and is_stack_arg_stx() — that identify r11-based (BPF_REG_PARAMS) instructions used for stack argument passing. These helpers encapsulate the detailed encoding requirements (operand size, register, offset alignment and sign) and hide raw BPF_REG_PARAMS usage from the verifier, making call sites more readable and explicit. 
A later patch ("bpf: Enable r11 based insns") will wire these helpers into the verifier. Until then, check_and_resolve_insns() rejects any r11-based registers. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045005.2383881-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/include/linux/filter.h b/include/linux/filter.h index b77d0b06db6e..918d9b34eac6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -749,6 +749,27 @@ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, return ret; } +static inline bool is_stack_arg_ldx(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LDX | BPF_MEM | BPF_DW) && + insn->src_reg == BPF_REG_PARAMS && + insn->off > 0 && insn->off % 8 == 0; +} + +static inline bool is_stack_arg_st(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ST | BPF_MEM | BPF_DW) && + insn->dst_reg == BPF_REG_PARAMS && + insn->off < 0 && insn->off % 8 == 0; +} + +static inline bool is_stack_arg_stx(const struct bpf_insn *insn) +{ + return insn->code == (BPF_STX | BPF_MEM | BPF_DW) && + insn->dst_reg == BPF_REG_PARAMS && + insn->off < 0 && insn->off % 8 == 0; +} + #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN struct bpf_skb_data_end { -- cgit v1.2.3 From 3ab5bd317ee280b198b00ea2114adaad7a458ef8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:10 -0700 Subject: bpf: Set sub->arg_cnt earlier in btf_prepare_func_args() Move the "sub->arg_cnt = nargs" assignment to immediately after nargs is computed from btf_type_vlen(), instead of at the end of btf_prepare_func_args(). btf_prepare_func_args() can return -EINVAL early in several cases, e.g. when a static function has some non-int/enum arguments. Since -EINVAL from btf_prepare_func_args() does not immediately reject verification, arg_cnt remains zero after the early return. 
This causes later stack argument based load/store insns to incorrectly assume the function has no arguments. Setting arg_cnt right after nargs ensures it is available regardless of which path btf_prepare_func_args() takes. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045010.2384635-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a6bf4781943c..099d7ca5a980 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7864,6 +7864,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); + sub->arg_cnt = nargs; if (nargs > MAX_BPF_FUNC_REG_ARGS) { if (!is_global) return -EINVAL; @@ -8051,7 +8052,6 @@ skip_pointer: return -EINVAL; } - sub->arg_cnt = nargs; sub->args_cached = true; return 0; -- cgit v1.2.3 From 0f6bd5e7a804af27e7f34b8306afde7a6b269318 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:15 -0700 Subject: bpf: Support stack arguments for bpf functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently BPF functions (subprogs) are limited to 5 register arguments. With [1], the compiler can emit code that passes additional arguments via a dedicated stack area through bpf register BPF_REG_PARAMS (r11), introduced in an earlier patch ([2]). The compiler uses positive r11 offsets for incoming (callee-side) args and negative r11 offsets for outgoing (caller-side) args, following the x86_64/arm64 calling convention direction. There is an 8-byte gap at offset 0 separating two regions: Incoming (callee reads): r11+8 (arg6), r11+16 (arg7), ... Outgoing (caller writes): r11-8 (arg6), r11-16 (arg7), ... 
The following is an example to show how stack arguments are saved and transferred between caller and callee: int foo(int a1, int a2, int a3, int a4, int a5, int a6, int a7) { ... bar(a1, a2, a3, a4, a5, a6, a7, a8); ... } Caller (foo) Callee (bar) ============ ============ Incoming (positive offsets): Incoming (positive offsets): r11+8: [incoming arg 6] r11+8: [incoming arg 6] <-+ r11+16: [incoming arg 7] r11+16: [incoming arg 7] <-|+ r11+24: [incoming arg 8] <-||+ Outgoing (negative offsets): ||| r11-8: [outgoing arg 6 to bar] -------->-------------------------+|| r11-16: [outgoing arg 7 to bar] -------->--------------------------+| r11-24: [outgoing arg 8 to bar] -------->---------------------------+ If the bpf function has more than one call: int foo(int a1, int a2, int a3, int a4, int a5, int a6, int a7) { ... bar1(a1, a2, a3, a4, a5, a6, a7, a8); ... bar2(a1, a2, a3, a4, a5, a6, a7, a8, a9); ... } Caller (foo) Callee (bar2) ============ ============== Incoming (positive offsets): Incoming (positive offsets): r11+8: [incoming arg 6] r11+8: [incoming arg 6] <+ r11+16: [incoming arg 7] r11+16: [incoming arg 7] <|+ r11+24: [incoming arg 8] <||+ Outgoing for bar2 (negative offsets): r11+32: [incoming arg 9] <|||+ r11-8: [outgoing arg 6] ---->----------->-------------------------+||| r11-16: [outgoing arg 7] ---->----------->--------------------------+|| r11-24: [outgoing arg 8] ---->----------->---------------------------+| r11-32: [outgoing arg 9] ---->----------->----------------------------+ The verifier tracks outgoing stack arguments in stack_arg_regs[] and out_stack_arg_cnt in bpf_func_state, separately from the regular r10 stack. The callee does not copy incoming args — it reads them directly from the caller's outgoing slots at positive r11 offsets. Similar to stacksafe(), introduce stack_arg_safe() to do pruning check. Outgoing stack arg slots are invalidated when the callee returns (e.g. in prepare_func_exit), not at call time. 
This allows the callee to read incoming args from the caller's outgoing slots during verification. The following are a few examples. Example 1: *(u64 *)(r11 - 8) = r6; *(u64 *)(r11 - 16) = r7; call bar1; // arg6 = r6, arg7 = r7 call bar2; // expected with 2 stack arguments, failed Example 2: To fix the Example 1: *(u64 *)(r11 - 8) = r6; *(u64 *)(r11 - 16) = r7; call bar1; // arg6 = r6, arg7 = r7 *(u64 *)(r11 - 8) = r8; *(u64 *)(r11 - 16) = r9; call bar2; // arg6 = r8, arg7 = r9 Example 3: The compiler can hoist the shared stack arg stores above the branch: *(u64 *)(r11 - 16) = r7; if cond goto else; *(u64 *)(r11 - 8) = r8; call bar1; // arg6 = r8, arg7 = r7 goto end; else: *(u64 *)(r11 - 8) = r9; call bar2; // arg6 = r9, arg7 = r7 end: Example 4: Within a loop: loop: *(u64 *)(r11 - 8) = r6; // arg6, before loop call bar; // reuses arg6 each iteration if ... goto loop; A separate max_out_stack_arg_cnt field in bpf_subprog_info tracks the deepest outgoing slot actually written. This intends to reject programs that write to slots beyond what any callee expects. It is necessary for JIT. Similar to typical compiler generated code, enforce the following orderings: - all stack arg reads must be ahead of any stack arg write - all stack arg reads must be before any bpf func, kfunc and helpers This is needed as JIT may emit 'mov' insns for read/write with the same register and bpf function, kfunc and helper will invalidate all arguments immediately after the call. Callback functions with stack arguments need kernel setup parameter types (including stack parameters) properly and then callback function can retrieve such information for verification purpose. Global subprogs and freplace with >5 args are not yet supported. 
[1] https://github.com/llvm/llvm-project/pull/189060 [2] https://lore.kernel.org/bpf/20260423033506.2542005-1-yonghong.song@linux.dev/ Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045015.2385013-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 43 ++++++++- kernel/bpf/btf.c | 14 ++- kernel/bpf/fixups.c | 16 +++- kernel/bpf/states.c | 32 +++++++ kernel/bpf/verifier.c | 203 ++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 294 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 203fb751eeae..5398a02a1280 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -402,6 +402,7 @@ struct bpf_func_state { bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; + bool no_stack_arg_load; /* For callback calling functions that limit number of possible * callback executions (e.g. bpf_loop) keeps track of current * simulated iteration number. @@ -427,6 +428,9 @@ struct bpf_func_state { * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE. 
*/ int allocated_stack; + + u16 out_stack_arg_cnt; /* Number of outgoing on-stack argument slots */ + struct bpf_reg_state *stack_arg_regs; /* Outgoing on-stack arguments */ }; #define MAX_CALL_FRAMES 8 @@ -465,8 +469,10 @@ struct bpf_jmp_history_entry { u64 linked_regs; }; -/* Maximum number of register states that can exist at once */ -#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) +/* Maximum number of bpf_reg_state objects that can exist at once */ +#define MAX_STACK_ARG_SLOTS (MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS) +#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE + \ + MAX_STACK_ARG_SLOTS) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; @@ -561,12 +567,27 @@ bpf_get_spilled_reg(int slot, struct bpf_func_state *frame, u32 mask) return NULL; } +static inline struct bpf_reg_state * +bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) +{ + if (slot < frame->out_stack_arg_cnt && + frame->stack_arg_regs[slot].type != NOT_INIT) + return &frame->stack_arg_regs[slot]; + return NULL; +} + /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */ #define bpf_for_each_spilled_reg(iter, frame, reg, mask) \ for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask); \ iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame, mask)) +/* Iterate over 'frame', setting 'reg' to either NULL or a spilled stack arg. 
*/ +#define bpf_for_each_spilled_stack_arg(iter, frame, reg) \ + for (iter = 0, reg = bpf_get_spilled_stack_arg(iter, frame); \ + iter < frame->out_stack_arg_cnt; \ + iter++, reg = bpf_get_spilled_stack_arg(iter, frame)) + #define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr) \ ({ \ struct bpf_verifier_state *___vstate = __vst; \ @@ -584,6 +605,11 @@ bpf_get_spilled_reg(int slot, struct bpf_func_state *frame, u32 mask) continue; \ (void)(__expr); \ } \ + bpf_for_each_spilled_stack_arg(___j, __state, __reg) { \ + if (!__reg) \ + continue; \ + (void)(__expr); \ + } \ } \ }) @@ -815,12 +841,21 @@ struct bpf_subprog_info { bool keep_fastcall_stack: 1; bool changes_pkt_data: 1; bool might_sleep: 1; - u8 arg_cnt:3; + u8 arg_cnt:4; enum priv_stack_mode priv_stack_mode; - struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; + struct bpf_subprog_arg_info args[MAX_BPF_FUNC_ARGS]; + u16 stack_arg_cnt; /* incoming + max outgoing */ + u16 max_out_stack_arg_cnt; }; +static inline u16 bpf_in_stack_arg_cnt(const struct bpf_subprog_info *sub) +{ + if (sub->arg_cnt > MAX_BPF_FUNC_REG_ARGS) + return sub->arg_cnt - MAX_BPF_FUNC_REG_ARGS; + return 0; +} + struct bpf_verifier_env; struct backtrack_state { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 099d7ca5a980..4fb8641546b8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7865,10 +7865,16 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); sub->arg_cnt = nargs; - if (nargs > MAX_BPF_FUNC_REG_ARGS) { - if (!is_global) - return -EINVAL; - bpf_log(log, "Global function %s() with %d > %d args. 
Buggy compiler.\n", + if (nargs > MAX_BPF_FUNC_ARGS) { + bpf_log(log, "kernel supports at most %d parameters, function %s has %d\n", + MAX_BPF_FUNC_ARGS, tname, nargs); + return -EFAULT; + } + if (nargs > MAX_BPF_FUNC_REG_ARGS) + sub->stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS; + + if (is_global && nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "global function %s has %d > %d args, stack args not supported\n", tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index fba9e8c00878..ba86039789fd 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1378,9 +1378,21 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) struct bpf_prog *prog = env->prog; struct bpf_insn *insn = prog->insnsi; bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); - int i, depth; + int depth; #endif - int err = 0; + int i, err = 0; + + for (i = 0; i < env->subprog_cnt; i++) { + struct bpf_subprog_info *subprog = &env->subprog_info[i]; + u16 outgoing = subprog->stack_arg_cnt - bpf_in_stack_arg_cnt(subprog); + + if (subprog->max_out_stack_arg_cnt > outgoing) { + verbose(env, + "func#%d writes %u stack arg slots, but calls only require %u\n", + i, subprog->max_out_stack_arg_cnt, outgoing); + return -EINVAL; + } + } if (env->prog->jit_requested && !bpf_prog_is_offloaded(env->prog->aux)) { diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index bd9c22945050..3ce6d2652b27 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -833,6 +833,32 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, return true; } +/* + * Compare stack arg slots between old and current states. + * Outgoing stack args are path-local state and must agree for pruning. 
+ */ +static bool stack_arg_safe(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, struct bpf_idmap *idmap, + enum exact_level exact) +{ + int i, nslots; + + nslots = max(old->out_stack_arg_cnt, cur->out_stack_arg_cnt); + for (i = 0; i < nslots; i++) { + struct bpf_reg_state *old_arg, *cur_arg; + struct bpf_reg_state not_init = { .type = NOT_INIT }; + + old_arg = i < old->out_stack_arg_cnt ? + &old->stack_arg_regs[i] : &not_init; + cur_arg = i < cur->out_stack_arg_cnt ? + &cur->stack_arg_regs[i] : &not_init; + if (!regsafe(env, old_arg, cur_arg, idmap, exact)) + return false; + } + + return true; +} + static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur, struct bpf_idmap *idmap) { @@ -915,6 +941,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat if (old->callback_depth > cur->callback_depth) return false; + if (!old->no_stack_arg_load && cur->no_stack_arg_load) + return false; + for (i = 0; i < MAX_BPF_REG; i++) if (((1 << i) & live_regs) && !regsafe(env, &old->regs[i], &cur->regs[i], @@ -924,6 +953,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) return false; + if (!stack_arg_safe(env, old, cur, &env->idmap_scratch, exact)) + return false; + return true; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1dd9736c2a13..a29b3003cbec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1362,6 +1362,18 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st return -ENOMEM; dst->allocated_stack = src->allocated_stack; + + /* copy stack args state */ + n = src->out_stack_arg_cnt; + if (n) { + dst->stack_arg_regs = copy_array(dst->stack_arg_regs, src->stack_arg_regs, n, + sizeof(struct bpf_reg_state), + GFP_KERNEL_ACCOUNT); + if (!dst->stack_arg_regs) + return -ENOMEM; + } + + dst->out_stack_arg_cnt = 
src->out_stack_arg_cnt; return 0; } @@ -1403,6 +1415,23 @@ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state return 0; } +static int grow_stack_arg_slots(struct bpf_verifier_env *env, + struct bpf_func_state *state, int cnt) +{ + size_t old_n = state->out_stack_arg_cnt; + + if (old_n >= cnt) + return 0; + + state->stack_arg_regs = realloc_array(state->stack_arg_regs, old_n, cnt, + sizeof(struct bpf_reg_state)); + if (!state->stack_arg_regs) + return -ENOMEM; + + state->out_stack_arg_cnt = cnt; + return 0; +} + /* Acquire a pointer id from the env and update the state->refs to include * this new pointer reference. * On success, returns a valid pointer id to associate with the register @@ -1565,6 +1594,7 @@ static void free_func_state(struct bpf_func_state *state) { if (!state) return; + kfree(state->stack_arg_regs); kfree(state->stack); kfree(state); } @@ -4050,6 +4080,103 @@ static int check_stack_write(struct bpf_verifier_env *env, return err; } +/* + * Write a value to the outgoing stack arg area. + * off is a negative offset from r11 (e.g. -8 for arg6, -16 for arg7). + */ +static int check_stack_arg_write(struct bpf_verifier_env *env, struct bpf_func_state *state, + int off, struct bpf_reg_state *value_reg) +{ + int max_stack_arg_regs = MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS; + struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; + int spi = -off / BPF_REG_SIZE - 1; + struct bpf_reg_state *arg; + int err; + + if (spi >= max_stack_arg_regs) { + verbose(env, "stack arg write offset %d exceeds max %d stack args\n", + off, max_stack_arg_regs); + return -EINVAL; + } + + err = grow_stack_arg_slots(env, state, spi + 1); + if (err) + return err; + + /* Track the max outgoing stack arg slot count. 
*/ + if (spi + 1 > subprog->max_out_stack_arg_cnt) + subprog->max_out_stack_arg_cnt = spi + 1; + + if (value_reg) { + state->stack_arg_regs[spi] = *value_reg; + } else { + /* BPF_ST: store immediate, treat as scalar */ + arg = &state->stack_arg_regs[spi]; + arg->type = SCALAR_VALUE; + __mark_reg_known(arg, env->prog->insnsi[env->insn_idx].imm); + } + state->no_stack_arg_load = true; + return 0; +} + +/* + * Read a value from the incoming stack arg area. + * off is a positive offset from r11 (e.g. +8 for arg6, +16 for arg7). + */ +static int check_stack_arg_read(struct bpf_verifier_env *env, struct bpf_func_state *state, + int off, int dst_regno) +{ + struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; + struct bpf_verifier_state *vstate = env->cur_state; + int spi = off / BPF_REG_SIZE - 1; + struct bpf_func_state *caller, *cur; + struct bpf_reg_state *arg; + + if (state->no_stack_arg_load) { + verbose(env, "r11 load must be before any r11 store or call insn\n"); + return -EINVAL; + } + + if (spi + 1 > bpf_in_stack_arg_cnt(subprog)) { + verbose(env, "invalid read from stack arg off %d depth %d\n", + off, bpf_in_stack_arg_cnt(subprog) * BPF_REG_SIZE); + return -EACCES; + } + + caller = vstate->frame[vstate->curframe - 1]; + arg = &caller->stack_arg_regs[spi]; + cur = vstate->frame[vstate->curframe]; + cur->regs[dst_regno] = *arg; + return 0; +} + +static int check_outgoing_stack_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, + int nargs) +{ + int i, spi; + + for (i = MAX_BPF_FUNC_REG_ARGS; i < nargs; i++) { + spi = i - MAX_BPF_FUNC_REG_ARGS; + if (spi >= caller->out_stack_arg_cnt || + caller->stack_arg_regs[spi].type == NOT_INIT) { + verbose(env, "callee expects %d args, stack arg%d is not initialized\n", + nargs, spi + 1); + return -EFAULT; + } + } + + return 0; +} + +static struct bpf_reg_state *get_func_arg_reg(struct bpf_func_state *caller, + struct bpf_reg_state *regs, int arg) +{ + if (arg < MAX_BPF_FUNC_REG_ARGS) + 
return &regs[arg + 1]; + + return &caller->stack_arg_regs[arg - MAX_BPF_FUNC_REG_ARGS]; +} + static int check_map_access_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int off, int size, enum bpf_access_type type) { @@ -6217,10 +6344,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, bool strict_alignment_once, bool is_ldsx, bool allow_trust_mismatch, const char *ctx) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = cur_regs(env); enum bpf_reg_type src_reg_type; int err; + /* Handle stack arg read */ + if (is_stack_arg_ldx(insn)) { + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + return check_stack_arg_read(env, state, insn->off, insn->dst_reg); + } + /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -6249,10 +6386,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, bool strict_alignment_once) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = cur_regs(env); enum bpf_reg_type dst_reg_type; int err; + /* Handle stack arg write */ + if (is_stack_arg_stx(insn)) { + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + return check_stack_arg_write(env, state, insn->off, regs + insn->src_reg); + } + /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -8860,6 +9007,15 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, } } +static void invalidate_outgoing_stack_args(const struct bpf_verifier_env *env, + struct bpf_func_state *state) +{ + int i, nslots = state->out_stack_arg_cnt; + + for (i = 0; i < nslots; i++) + bpf_mark_reg_not_init(env, &state->stack_arg_regs[i]); +} +
typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -8922,6 +9078,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs) { struct bpf_subprog_info *sub = subprog_info(env, subprog); + struct bpf_func_state *caller = cur_func(env); struct bpf_verifier_log *log = &env->log; u32 i; int ret; @@ -8930,13 +9087,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (ret) return ret; + ret = check_outgoing_stack_args(env, caller, sub->arg_cnt); + if (ret) + return ret; + /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < sub->arg_cnt; i++) { argno_t argno = argno_from_arg(i + 1); - u32 regno = i + 1; - struct bpf_reg_state *reg = &regs[regno]; + struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i); struct bpf_subprog_arg_info *arg = &sub->args[i]; if (arg->arg_type == ARG_ANYTHING) { @@ -9124,6 +9284,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; + struct bpf_subprog_info *caller_info; + u16 callee_incoming, stack_arg_cnt; struct bpf_func_state *caller; int err, subprog, target_insn; @@ -9166,6 +9328,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* mark global subprog for verifying after main prog */ subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); + invalidate_outgoing_stack_args(env, cur_func(env)); /* All non-void global functions return a 64-bit SCALAR_VALUE. */ if (!subprog_returns_void(env, subprog)) { @@ -9177,6 +9340,16 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } + /* + * Track caller's total stack arg count (incoming + max outgoing). + * This is needed so the JIT knows how much stack arg space to allocate.
+ */ + caller_info = &env->subprog_info[caller->subprogno]; + callee_incoming = bpf_in_stack_arg_cnt(&env->subprog_info[subprog]); + stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + callee_incoming; + if (stack_arg_cnt > caller_info->stack_arg_cnt) + caller_info->stack_arg_cnt = stack_arg_cnt; + /* for regular function entry setup new frame and continue * from that frame. */ @@ -9534,6 +9707,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) * bpf_throw, this will be done by copy_verifier_state for extra frames. */ free_func_state(callee); state->frame[state->curframe--] = NULL; + invalidate_outgoing_stack_args(env, caller); /* for callbacks widen imprecise scalars to make programs like below verify: * @@ -10160,6 +10334,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn bpf_mark_reg_not_init(env, &regs[caller_saved[i]]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } + invalidate_outgoing_stack_args(env, cur_func(env)); /* helper call returns 64-bit value.
*/ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; @@ -12842,6 +13017,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, bpf_mark_reg_not_init(env, &regs[regno]); regs[regno].subreg_def = DEF_NOT_SUBREG; } + invalidate_outgoing_stack_args(env, cur_func(env)); /* Check return type */ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); @@ -16961,6 +17137,14 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) return check_store_reg(env, insn, false); case BPF_ST: { + /* Handle stack arg write (store immediate) */ + if (is_stack_arg_st(insn)) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + + return check_stack_arg_write(env, state, insn->off, NULL); + } + enum bpf_reg_type dst_reg_type; err = check_reg_arg(env, insn->dst_reg, SRC_OP); @@ -16995,6 +17179,8 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) } } mark_reg_scratched(env, BPF_REG_0); + if (bpf_in_stack_arg_cnt(&env->subprog_info[cur_func(env)->subprogno])) + cur_func(env)->no_stack_arg_load = true; if (insn->src_reg == BPF_PSEUDO_CALL) return check_func_call(env, insn, &env->insn_idx); if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) @@ -18110,7 +18296,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) goto out; } } - for (i = BPF_REG_1; i <= sub->arg_cnt; i++) { + for (i = BPF_REG_1; i <= min_t(u32, sub->arg_cnt, MAX_BPF_FUNC_REG_ARGS); i++) { arg = &sub->args[i - BPF_REG_1]; reg = &regs[i]; @@ -18153,6 +18339,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) goto out; } } + if (env->prog->type == BPF_PROG_TYPE_EXT && sub->arg_cnt > MAX_BPF_FUNC_REG_ARGS) { + verbose(env, "freplace programs with >%d args not supported yet\n", + MAX_BPF_FUNC_REG_ARGS); + ret = -EINVAL; + goto out; + } } else { /* if main BPF program has associated BTF info, validate that * it's matching expected signature,
and otherwise mark BTF @@ -18160,8 +18352,11 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) */ if (env->prog->aux->func_info_aux) { ret = btf_prepare_func_args(env, 0); - if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) + if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) { env->prog->aux->func_info_aux[0].unreliable = true; + sub->arg_cnt = 1; + sub->stack_arg_cnt = 0; + } } /* 1st arg to a function */ -- cgit v1.2.3 From 3a656670fd6da624f6241038ca4cf350f24fd5e8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:20 -0700 Subject: bpf: Refactor jmp history to use dedicated spi/frame fields Move stack slot index (spi) and frame number out of the flags field in bpf_jmp_history_entry into dedicated bitfields. This simplifies the encoding and makes room for new flags. Previously, spi and frame were packed into the lower 9 bits of the 12-bit flags field (3 bits frame + 6 bits spi), with INSN_F_STACK_ACCESS at BIT(9) and INSN_F_DST/SRC_REG_STACK at BIT(10)/BIT(11). But this has no room for an INSN_F_* flag for stack arguments. To resolve this issue, bpf_jmp_history_entry field idx is narrowed to 20 bits (sufficient for insn indices up to 1M), and the freed bits hold spi (6 bits) and frame (3 bits) as dedicated struct fields. The flags enum is simplified accordingly: INSN_F_STACK_ACCESS -> BIT(0) INSN_F_DST_REG_STACK -> BIT(1) INSN_F_SRC_REG_STACK -> BIT(2) which allows more room for additional INSN_F_* flags. bpf_push_jmp_history() now takes explicit spi and frame parameters instead of encoding them into flags. The insn_stack_access_flags(), insn_stack_access_spi(), and insn_stack_access_frameno() helpers are removed. No functional change. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045020.2385962-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 37 ++++++++++++++++--------------------- kernel/bpf/backtrack.c | 24 +++++++++--------------- kernel/bpf/states.c | 2 +- kernel/bpf/verifier.c | 23 +++++++++++------------ 4 files changed, 37 insertions(+), 49 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5398a02a1280..3ec338169981 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -435,40 +435,35 @@ struct bpf_func_state { #define MAX_CALL_FRAMES 8 -/* instruction history flags, used in bpf_jmp_history_entry.flags field */ +/* instruction history flags, used in bpf_jmp_history_entry.flags field. + * Frame number and SPI are stored in dedicated fields of bpf_jmp_history_entry. + */ enum { - /* instruction references stack slot through PTR_TO_STACK register; - * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) - * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, - * 8 bytes per slot, so slot index (spi) is [0, 63]) - */ - INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ - - INSN_F_SPI_MASK = 0x3f, /* 6 bits */ - INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ + INSN_F_STACK_ACCESS = BIT(0), - INSN_F_STACK_ACCESS = BIT(9), - - INSN_F_DST_REG_STACK = BIT(10), /* dst_reg is PTR_TO_STACK */ - INSN_F_SRC_REG_STACK = BIT(11), /* src_reg is PTR_TO_STACK */ - /* total 12 bits are used now. 
*/ + INSN_F_DST_REG_STACK = BIT(1), /* dst_reg is PTR_TO_STACK */ + INSN_F_SRC_REG_STACK = BIT(2), /* src_reg is PTR_TO_STACK */ }; -static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); -static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); - struct bpf_jmp_history_entry { - u32 idx; /* insn idx can't be bigger than 1 million */ + u32 idx : 20; + u32 frame : 3; /* stack access frame number */ + u32 spi : 6; /* stack slot index (0..63) */ + u32 : 3; u32 prev_idx : 20; /* special INSN_F_xxx flags */ - u32 flags : 12; + u32 flags : 4; + u32 : 8; /* additional registers that need precision tracking when this * jump is backtracked, vector of six 10-bit records */ u64 linked_regs; }; +static_assert(MAX_CALL_FRAMES <= (1 << 3)); +static_assert(MAX_BPF_STACK / 8 <= (1 << 6)); + /* Maximum number of bpf_reg_state objects that can exist at once */ #define MAX_STACK_ARG_SLOTS (MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS) #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE + \ @@ -1198,7 +1193,7 @@ struct list_head *bpf_explored_state(struct bpf_verifier_env *env, int idx); void bpf_free_verifier_state(struct bpf_verifier_state *state, bool free_self); void bpf_free_backedges(struct bpf_scc_visit *visit); int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs); + int insn_flags, int spi, int frame, u64 linked_regs); void bpf_bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist); void bpf_mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c index 854731dc93fe..5e93e57fb7ae 100644 --- a/kernel/bpf/backtrack.c +++ b/kernel/bpf/backtrack.c @@ -9,7 +9,7 @@ /* for any branch, call, exit record the history of jmps in the given state */ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) + int insn_flags, int 
spi, int frame, u64 linked_regs) { u32 cnt = cur->jmp_history_cnt; struct bpf_jmp_history_entry *p; @@ -25,6 +25,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state env, "insn history: insn_idx %d cur flags %x new flags %x", env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; + env->cur_hist_ent->spi = spi; + env->cur_hist_ent->frame = frame; verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, "insn history: insn_idx %d linked_regs: %#llx", env->insn_idx, env->cur_hist_ent->linked_regs); @@ -43,6 +45,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; + p->spi = spi; + p->frame = frame; p->linked_regs = linked_regs; cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; @@ -64,16 +68,6 @@ static bool is_atomic_fetch_insn(const struct bpf_insn *insn) (insn->imm & BPF_FETCH); } -static int insn_stack_access_spi(int insn_flags) -{ - return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; -} - -static int insn_stack_access_frameno(int insn_flags) -{ - return insn_flags & INSN_F_FRAMENO_MASK; -} - /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. * Return -ENOENT if we exhausted all instructions within given state. 
@@ -353,8 +347,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * that [fp - off] slot contains scalar that needs to be * tracked with precision */ - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); + spi = hist->spi; + fr = hist->frame; bpf_bt_set_frame_slot(bt, fr, spi); } else if (class == BPF_STX || class == BPF_ST) { if (bt_is_reg_set(bt, dreg)) @@ -366,8 +360,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, /* scalars can only be spilled into stack */ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); + spi = hist->spi; + fr = hist->frame; if (!bt_is_frame_slot_set(bt, fr, spi)) return 0; bt_clear_frame_slot(bt, fr, spi); diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 3ce6d2652b27..877338136009 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -1403,7 +1403,7 @@ hit: */ err = 0; if (bpf_is_jmp_point(env, env->insn_idx)) - err = bpf_push_jmp_history(env, cur, 0, 0); + err = bpf_push_jmp_history(env, cur, 0, 0, 0, 0); err = err ? 
: propagate_precision(env, &sl->state, cur, NULL); if (err) return err; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a29b3003cbec..d15aef2fe4a1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3198,11 +3198,6 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return __check_reg_arg(env, state->regs, regno, t); } -static int insn_stack_access_flags(int frameno, int spi) -{ - return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno; -} - static void mark_indirect_target(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].indirect_target = true; @@ -3517,7 +3512,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; - int insn_flags = insn_stack_access_flags(state->frameno, spi); + int insn_flags = INSN_F_STACK_ACCESS; + int hist_spi = spi, hist_frame = state->frameno; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits @@ -3613,7 +3609,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, insn_flags, + hist_spi, hist_frame, 0); return 0; } @@ -3809,7 +3806,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; struct bpf_reg_state *reg; u8 *stype, type; - int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); + int insn_flags = INSN_F_STACK_ACCESS; + int hist_spi = spi, hist_frame = reg_state->frameno; stype = reg_state->stack[spi].slot_type; reg = &reg_state->stack[spi].spilled_ptr; @@ -3940,7 +3938,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring
spilled register */ } if (insn_flags) - return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, insn_flags, + hist_spi, hist_frame, 0); return 0; } @@ -15907,7 +15906,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (insn_flags) { - err = bpf_push_jmp_history(env, this_branch, insn_flags, 0); + err = bpf_push_jmp_history(env, this_branch, insn_flags, 0, 0, 0); if (err) return err; } @@ -15971,7 +15970,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(env, this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = bpf_push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = bpf_push_jmp_history(env, this_branch, 0, 0, 0, linked_regs_pack(&linked_regs)); if (err) return err; } @@ -17278,7 +17277,7 @@ static int do_check(struct bpf_verifier_env *env) } if (bpf_is_jmp_point(env, env->insn_idx)) { - err = bpf_push_jmp_history(env, state, 0, 0); + err = bpf_push_jmp_history(env, state, 0, 0, 0, 0); if (err) return err; } -- cgit v1.2.3 From 0a0fdc64b68c28dab40f9deb0cffdf544e04b0ba Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:25 -0700 Subject: bpf: Add precision marking and backtracking for stack argument slots Extend the precision marking and backtracking infrastructure to support stack argument slots (r11-based accesses). Without this, precision demands for scalar values passed through stack arguments are silently dropped, which could allow the verifier to incorrectly prune states with different constant values in stack arg slots. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045025.2387526-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 8 ++++++ kernel/bpf/backtrack.c | 58 +++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 32 ++++++++++++++++++++---- 3 files changed, 92 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3ec338169981..6f12fc40b682 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -443,6 +443,8 @@ enum { INSN_F_DST_REG_STACK = BIT(1), /* dst_reg is PTR_TO_STACK */ INSN_F_SRC_REG_STACK = BIT(2), /* src_reg is PTR_TO_STACK */ + + INSN_F_STACK_ARG_ACCESS = BIT(3), }; struct bpf_jmp_history_entry { @@ -858,6 +860,7 @@ struct backtrack_state { u32 frame; u32 reg_masks[MAX_CALL_FRAMES]; u64 stack_masks[MAX_CALL_FRAMES]; + u8 stack_arg_masks[MAX_CALL_FRAMES]; }; struct bpf_id_pair { @@ -1256,6 +1259,11 @@ static inline void bpf_bt_set_frame_slot(struct backtrack_state *bt, u32 frame, bt->stack_masks[frame] |= 1ull << slot; } +static inline void bt_set_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_arg_masks[frame] |= 1 << slot; +} + static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg) { return bt->reg_masks[frame] & (1 << reg); diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c index 5e93e57fb7ae..2e4ae0ef0860 100644 --- a/kernel/bpf/backtrack.c +++ b/kernel/bpf/backtrack.c @@ -129,11 +129,21 @@ static inline u32 bt_empty(struct backtrack_state *bt) int i; for (i = 0; i <= bt->frame; i++) - mask |= bt->reg_masks[i] | bt->stack_masks[i]; + mask |= bt->reg_masks[i] | bt->stack_masks[i] | bt->stack_arg_masks[i]; return mask == 0; } +static inline void bt_clear_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_arg_masks[frame] &= ~(1 << slot); +} + +static inline bool 
bt_is_frame_stack_arg_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) +{ + return bt->stack_arg_masks[frame] & (1 << slot); +} + static inline int bt_subprog_enter(struct backtrack_state *bt) { if (bt->frame == MAX_CALL_FRAMES - 1) { @@ -194,6 +204,11 @@ static inline u64 bt_stack_mask(struct backtrack_state *bt) return bt->stack_masks[bt->frame]; } +static inline u8 bt_stack_arg_mask(struct backtrack_state *bt) +{ + return bt->stack_arg_masks[bt->frame]; +} + static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) { return bt->reg_masks[bt->frame] & (1 << reg); @@ -335,6 +350,19 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, return 0; bt_clear_reg(bt, load_reg); + if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) { + spi = hist->spi; + /* + * Stack arg read: callee reads from r11+off, but + * the data lives in the caller's stack_arg_regs. + * Set the mask in the caller frame so precision + * is marked in the caller's slot at the callee + * entry checkpoint. + */ + bt_set_frame_stack_arg_slot(bt, bt->frame - 1, spi); + return 0; + } + /* scalars can only be spilled into stack w/o losing precision. * Load from any other memory can be zero extended. * The desire to keep that precision is already indicated @@ -357,6 +385,17 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * encountered a case of pointer subtraction. 
*/ return -ENOTSUPP; + + if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) { + spi = hist->spi; + if (!bt_is_frame_stack_arg_slot_set(bt, bt->frame, spi)) + return 0; + bt_clear_frame_stack_arg_slot(bt, bt->frame, spi); + if (class == BPF_STX) + bt_set_reg(bt, sreg); + return 0; + } + /* scalars can only be spilled into stack */ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; @@ -425,6 +464,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, bpf_bt_set_frame_reg(bt, bt->frame - 1, i); } } + if (bt_stack_arg_mask(bt)) { + verifier_bug(env, + "static subprog leftover stack arg slots %x", + bt_stack_arg_mask(bt)); + return -EFAULT; + } if (bt_subprog_exit(bt)) return -EFAULT; return 0; @@ -895,6 +940,17 @@ int bpf_mark_chain_precision(struct bpf_verifier_env *env, *changed = true; } } + for (i = 0; i < func->out_stack_arg_cnt; i++) { + if (!bt_is_frame_stack_arg_slot_set(bt, fr, i)) + continue; + reg = &func->stack_arg_regs[i]; + if (reg->type != SCALAR_VALUE || reg->precise) { + bt_clear_frame_stack_arg_slot(bt, fr, i); + } else { + reg->precise = true; + *changed = true; + } + } if (env->log.level & BPF_LOG_LEVEL2) { fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_reg_mask(bt, fr)); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d15aef2fe4a1..ebd13661933e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -292,6 +292,11 @@ static int arg_from_argno(argno_t a) return -1; } +static int arg_idx_from_argno(argno_t a) +{ + return arg_from_argno(a) - 1; +} + static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); @@ -4115,7 +4120,8 @@ static int check_stack_arg_write(struct bpf_verifier_env *env, struct bpf_func_s __mark_reg_known(arg, env->prog->insnsi[env->insn_idx].imm); } state->no_stack_arg_load = true; - return 0; + return bpf_push_jmp_history(env, env->cur_state, + INSN_F_STACK_ARG_ACCESS, spi, 0, 
0); } /* @@ -4146,7 +4152,17 @@ static int check_stack_arg_read(struct bpf_verifier_env *env, struct bpf_func_st arg = &caller->stack_arg_regs[spi]; cur = vstate->frame[vstate->curframe]; cur->regs[dst_regno] = *arg; - return 0; + return bpf_push_jmp_history(env, env->cur_state, + INSN_F_STACK_ARG_ACCESS, spi, 0, 0); +} + +static int mark_stack_arg_precision(struct bpf_verifier_env *env, int arg_idx) +{ + struct bpf_func_state *caller = cur_func(env); + int spi = arg_idx - MAX_BPF_FUNC_REG_ARGS; + + bt_set_frame_stack_arg_slot(&env->bt, caller->frameno, spi); + return mark_chain_precision_batch(env, env->cur_state); } static int check_outgoing_stack_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, @@ -6875,8 +6891,14 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, } err = check_helper_mem_access(env, mem_reg, mem_argno, reg_umax(size_reg), access_type, zero_size_allowed, meta); - if (!err) - err = mark_chain_precision(env, reg_from_argno(size_argno)); + if (!err) { + int regno = reg_from_argno(size_argno); + + if (regno >= 0) + err = mark_chain_precision(env, regno); + else + err = mark_stack_arg_precision(env, arg_idx_from_argno(size_argno)); + } return err; } @@ -7325,7 +7347,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * struct bpf_kfunc_call_arg_meta *meta) { const struct btf_type *t; - u32 arg_idx = arg_from_argno(argno) - 1; + u32 arg_idx = arg_idx_from_argno(argno); int spi, err, i, nr_slots, btf_id; if (reg->type != PTR_TO_STACK) { -- cgit v1.2.3 From 84dd7df76efef9fecb6f3e0defe2ea3ad89cd3cb Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:30 -0700 Subject: bpf: Refactor record_call_access() to extract per-arg logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the per-argument FP-derived pointer handling from record_call_access() into a new record_arg_access() helper. 
The existing loop body — checking arg_is_fp, querying stack access bytes, and calling record_stack_access/record_imprecise — will be reused for stack argument slots in the next patch. Factoring it out now avoids duplicating the logic. No functional change. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045030.2388067-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/liveness.c | 65 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 58197d73b120..c81337dfbfc7 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -1343,6 +1343,42 @@ static int record_load_store_access(struct bpf_verifier_env *env, return 0; } +static int record_arg_access(struct bpf_verifier_env *env, + struct func_instance *instance, + struct bpf_insn *insn, + struct arg_track *at, int arg_idx, + int insn_idx) +{ + int depth = instance->depth; + int frame = at->frame; + int err = 0; + s64 bytes; + + if (!arg_is_fp(at)) + return 0; + + if (bpf_helper_call(insn)) { + bytes = bpf_helper_stack_access_bytes(env, insn, arg_idx, insn_idx); + } else if (bpf_pseudo_kfunc_call(insn)) { + bytes = bpf_kfunc_stack_access_bytes(env, insn, arg_idx, insn_idx); + } else { + for (int f = 0; f <= depth; f++) { + err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); + if (err) + return err; + } + return 0; + } + if (bytes == 0) + return 0; + + if (frame >= 0 && frame <= depth) + err = record_stack_access(instance, at, bytes, frame, insn_idx); + else if (frame == ARG_IMPRECISE) + err = record_imprecise(instance, at->mask, insn_idx); + return err; +} + /* Record stack access for a given 'at' state of helper/kfunc 'insn' */ static int record_call_access(struct bpf_verifier_env *env, struct func_instance *instance, @@ -1350,9 +1386,8 @@ static int record_call_access(struct bpf_verifier_env *env, int insn_idx) { struct bpf_insn *insn = 
&env->prog->insnsi[insn_idx]; - int depth = instance->depth; struct bpf_call_summary cs; - int r, err = 0, num_params = 5; + int r, err, num_params = 5; if (bpf_pseudo_call(insn)) return 0; @@ -1361,31 +1396,7 @@ static int record_call_access(struct bpf_verifier_env *env, num_params = cs.num_params; for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) { - int frame = at[r].frame; - s64 bytes; - - if (!arg_is_fp(&at[r])) - continue; - - if (bpf_helper_call(insn)) { - bytes = bpf_helper_stack_access_bytes(env, insn, r - 1, insn_idx); - } else if (bpf_pseudo_kfunc_call(insn)) { - bytes = bpf_kfunc_stack_access_bytes(env, insn, r - 1, insn_idx); - } else { - for (int f = 0; f <= depth; f++) { - err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); - if (err) - return err; - } - return 0; - } - if (bytes == 0) - continue; - - if (frame >= 0 && frame <= depth) - err = record_stack_access(instance, &at[r], bytes, frame, insn_idx); - else if (frame == ARG_IMPRECISE) - err = record_imprecise(instance, at[r].mask, insn_idx); + err = record_arg_access(env, instance, insn, &at[r], r - 1, insn_idx); if (err) return err; } -- cgit v1.2.3 From f44e815e65a30f465fe4dd793df8cb98f1f9c0b1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:35 -0700 Subject: bpf: Use arg_is_fp() in has_fp_args() Replace "frame != ARG_NONE" with arg_is_fp() in has_fp_args(). The function's purpose is to check whether any argument is derived from a frame pointer, which is exactly what arg_is_fp() tests (frame >= 0 || frame == ARG_IMPRECISE). Using the dedicated predicate is clearer and more consistent with the rest of the file. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045035.2388671-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/liveness.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index c81337dfbfc7..13dc5ae44d2b 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -1689,7 +1689,7 @@ err_free: static bool has_fp_args(struct arg_track *args) { for (int r = BPF_REG_1; r <= BPF_REG_5; r++) - if (args[r].frame != ARG_NONE) + if (arg_is_fp(&args[r])) return true; return false; } -- cgit v1.2.3 From 2af4e792773f9fc05e5dbd5f297707cfe15cd817 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:40 -0700 Subject: bpf: Extend liveness analysis to track stack argument slots BPF_REG_PARAMS (R11) is at index MAX_BPF_REG, which is beyond the register tracking arrays in const_fold.c and liveness.c. Handle it explicitly to avoid out-of-bounds accesses. Extend the arg tracking dataflow to cover stack arg slots. Otherwise, pointers passed through stack args are invisible to liveness, causing the pointed-to stack slots to be incorrectly poisoned. Extend the at_out tracking array to MAX_AT_TRACK_REGS (registers plus stack arg slots) so that outgoing stack arg stores are tracked alongside registers. Add a separate at_stack_arg_entry array in compute_subprog_args(), passed to arg_track_xfer(), to restore FP-derived values on incoming stack arg reads. Extend record_call_access() to check stack arg slots for FP-derived pointers at kfunc call sites, reusing the record_arg_access() helper extracted in the previous patch. Pass stack arg state from caller to callee in analyze_subprog() so that callees can track pointers received through stack args, hence avoid poisoning. Skip stack arg instructions in record_load_store_access(). Stack arg STX uses dst_reg=BPF_REG_PARAMS (index 11), but at[11] is repurposed to track the value stored in stack arg slot 0. 
Without the skip, if a prior stack arg STX stored an FP-derived pointer (e.g., fp-64) into slot 0, a subsequent stack arg STX would read that FP-derived value as the base pointer and spuriously mark a regular stack slot (e.g., fp-72 from -64 + -8) as accessed in the liveness bitmap. Extend arg_track_log() to log state transitions for outgoing stack arg slots at indices MAX_BPF_REG through MAX_AT_TRACK_REGS-1. Without this, changes to at_out[11..17] caused by stack arg store instructions are silently omitted from BPF_LOG_LEVEL2 output. For example, when a caller passes fp-64 through a stack argument: subprog#0: 10: (bf) r6 = r10 11: (07) r6 += -64 12: (7b) *(u64 *)(r11 -8) = r6 sa0: none -> fp0-64 13: (85) call pc+5 Without the fix, the "sa0: none -> fp0-64" transition at insn 12 would not appear. Extend print_subprog_arg_access() to include stack arg slots in the per-instruction FP-derived state dump. For example: subprog#0: 12: (7b) *(u64 *)(r11 - 8) = r6 // r6=fp0-64 13: (85) call pc+5 // r6=fp0-64 sa0=fp0-64 Without the fix, the "sa0=fp0-64" annotation at insn 13 would not appear, making it harder to debug liveness analysis for programs that pass FP-derived pointers through stack arguments. Extend has_fp_args() to also check stack arg slots for FP-derived pointers, so that callees receiving pointers only through stack args are still recursively analyzed. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045043.2389049-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/const_fold.c | 8 ++++ kernel/bpf/liveness.c | 114 +++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 106 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/const_fold.c b/kernel/bpf/const_fold.c index db73c4740b1e..b2a19acadb91 100644 --- a/kernel/bpf/const_fold.c +++ b/kernel/bpf/const_fold.c @@ -58,6 +58,14 @@ static void const_reg_xfer(struct bpf_verifier_env *env, struct const_arg_info * u8 opcode = BPF_OP(insn->code) | BPF_SRC(insn->code); int r; + /* Stack arg stores (r11-based) are outside the tracked register set. */ + if (is_stack_arg_st(insn) || is_stack_arg_stx(insn)) + return; + if (is_stack_arg_ldx(insn)) { + ci_out[insn->dst_reg] = unknown; + return; + } + switch (class) { case BPF_ALU: case BPF_ALU64: diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 13dc5ae44d2b..7f4a0e4c2c49 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -610,6 +610,21 @@ enum arg_track_state { /* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */ #define MAX_ARG_SPILL_SLOTS 64 +/* + * Combined register + stack arg tracking: R0-R10 at indices 0-10, + * outgoing stack arg slots at indices MAX_BPF_REG..MAX_BPF_REG+6. + */ +#define MAX_AT_TRACK_REGS (MAX_BPF_REG + MAX_STACK_ARG_SLOTS) + +static int stack_arg_off_to_slot(s16 off) +{ + int aoff = off < 0 ? 
-off : off; + + if (aoff / 8 > MAX_STACK_ARG_SLOTS) + return -1; + return aoff / 8 - 1; +} + static bool arg_is_visited(const struct arg_track *at) { return at->frame != ARG_UNVISITED; @@ -1032,6 +1047,21 @@ static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, i verbose(env, "\tr%d: ", i); verbose_arg_track(env, &at_in[i]); verbose(env, " -> "); verbose_arg_track(env, &at_out[i]); } + /* Log outgoing stack arg slot transitions at indices MAX_BPF_REG..MAX_AT_TRACK_REGS-1 */ + for (i = 0; i < MAX_STACK_ARG_SLOTS; i++) { + int ai = MAX_BPF_REG + i; + + if (arg_track_eq(&at_out[ai], &at_in[ai])) + continue; + if (!printed) { + verbose(env, "%3d: ", idx); + bpf_verbose_insn(env, insn); + bpf_vlog_reset(&env->log, env->log.end_pos - 1); + printed = true; + } + verbose(env, "\tsa%d: ", i); verbose_arg_track(env, &at_in[ai]); + verbose(env, " -> "); verbose_arg_track(env, &at_out[ai]); + } for (i = 0; i < MAX_ARG_SPILL_SLOTS; i++) { if (arg_track_eq(&at_stack_out[i], &at_stack_in[i])) continue; @@ -1062,6 +1092,7 @@ static bool can_be_local_fp(int depth, int regno, struct arg_track *at) static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, int insn_idx, struct arg_track *at_out, struct arg_track *at_stack_out, + const struct arg_track *at_stack_arg_entry, struct func_instance *instance, u32 *callsites) { @@ -1071,9 +1102,21 @@ static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, struct arg_track *dst = &at_out[insn->dst_reg]; struct arg_track *src = &at_out[insn->src_reg]; struct arg_track none = { .frame = ARG_NONE }; - int r; - - if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) { + int r, slot; + + /* Handle stack arg stores and loads. 
*/ + if (is_stack_arg_st(insn) || is_stack_arg_stx(insn)) { + slot = stack_arg_off_to_slot(insn->off); + if (slot >= 0) { + if (is_stack_arg_stx(insn)) + at_out[MAX_BPF_REG + slot] = at_out[insn->src_reg]; + else + at_out[MAX_BPF_REG + slot] = none; + } + } else if (is_stack_arg_ldx(insn)) { + slot = stack_arg_off_to_slot(insn->off); + at_out[insn->dst_reg] = (slot >= 0) ? at_stack_arg_entry[slot] : none; + } else if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) { if (code == BPF_MOV) { *dst = none; } else if (dst->frame >= 0) { @@ -1297,6 +1340,16 @@ static int record_load_store_access(struct bpf_verifier_env *env, struct arg_track resolved, *ptr; int oi; + /* + * Stack arg insns use dst_reg/src_reg=BPF_REG_PARAMS(11). Since at[] + * is extended to MAX_AT_TRACK_REGS, at[11] holds the arg_track for + * outgoing stack arg slot 0 — not the pointer used for the memory + * access. Skip so the slot's tracked value isn't confused with the + * base register that record_stack_access() expects. 
+ */ + if (is_stack_arg_stx(insn) || is_stack_arg_st(insn) || is_stack_arg_ldx(insn)) + return 0; + switch (class) { case BPF_LDX: ptr = &at[insn->src_reg]; @@ -1395,11 +1448,18 @@ static int record_call_access(struct bpf_verifier_env *env, if (bpf_get_call_summary(env, insn, &cs)) num_params = cs.num_params; - for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) { + for (r = BPF_REG_1; r < BPF_REG_1 + min(num_params, MAX_BPF_FUNC_REG_ARGS); r++) { err = record_arg_access(env, instance, insn, &at[r], r - 1, insn_idx); if (err) return err; } + + for (r = 0; r < MAX_STACK_ARG_SLOTS && r < num_params - MAX_BPF_FUNC_REG_ARGS; r++) { + err = record_arg_access(env, instance, insn, &at[MAX_BPF_REG + r], + r + MAX_BPF_FUNC_REG_ARGS, insn_idx); + if (err) + return err; + } return 0; } @@ -1456,7 +1516,7 @@ static int find_callback_subprog(struct bpf_verifier_env *env, /* Per-subprog intermediate state kept alive across analysis phases */ struct subprog_at_info { - struct arg_track (*at_in)[MAX_BPF_REG]; + struct arg_track (*at_in)[MAX_AT_TRACK_REGS]; int len; }; @@ -1490,6 +1550,9 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, for (r = 0; r < MAX_BPF_REG - 1; r++) if (arg_is_fp(&info->at_in[i][r])) has_extra = true; + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) + if (arg_is_fp(&info->at_in[i][MAX_BPF_REG + r])) + has_extra = true; } if (is_ldx_stx_call) { for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) @@ -1514,6 +1577,12 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, verbose(env, " r%d=", r); verbose_arg_track(env, &info->at_in[i][r]); } + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) { + if (!arg_is_fp(&info->at_in[i][MAX_BPF_REG + r])) + continue; + verbose(env, " sa%d=", r); + verbose_arg_track(env, &info->at_in[i][MAX_BPF_REG + r]); + } } if (is_ldx_stx_call) { @@ -1536,7 +1605,7 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, * Runs forward fixed-point with arg_track_xfer(), then records * memory accesses in a 
single linear pass over converged state. * - * @callee_entry: pre-populated entry state for R1-R5 + * @callee_entry: pre-populated entry state for R1-R5 and stack args * NULL for main (subprog 0). * @info: stores at_in, len for debug printing. */ @@ -1554,10 +1623,11 @@ static int compute_subprog_args(struct bpf_verifier_env *env, int end = env->subprog_info[subprog + 1].start; int po_end = env->subprog_info[subprog + 1].postorder_start; int len = end - start; - struct arg_track (*at_in)[MAX_BPF_REG] = NULL; - struct arg_track at_out[MAX_BPF_REG]; + struct arg_track (*at_in)[MAX_AT_TRACK_REGS] = NULL; + struct arg_track at_out[MAX_AT_TRACK_REGS]; struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS] = NULL; struct arg_track *at_stack_out = NULL; + struct arg_track at_stack_arg_entry[MAX_STACK_ARG_SLOTS]; struct arg_track unvisited = { .frame = ARG_UNVISITED }; struct arg_track none = { .frame = ARG_NONE }; bool changed; @@ -1576,13 +1646,13 @@ static int compute_subprog_args(struct bpf_verifier_env *env, goto err_free; for (i = 0; i < len; i++) { - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) at_in[i][r] = unvisited; for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) at_stack_in[i][r] = unvisited; } - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) at_in[0][r] = none; /* Entry: R10 is always precisely the current frame's FP */ @@ -1598,6 +1668,10 @@ static int compute_subprog_args(struct bpf_verifier_env *env, for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) at_stack_in[0][r] = none; + /* Entry: incoming stack args from caller, or ARG_NONE for main */ + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) + at_stack_arg_entry[r] = callee_entry ? 
callee_entry[MAX_BPF_REG + r] : none; + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "subprog#%d: analyzing (depth %d)...\n", subprog, depth); @@ -1616,7 +1690,8 @@ redo: memcpy(at_out, at_in[i], sizeof(at_out)); memcpy(at_stack_out, at_stack_in[i], MAX_ARG_SPILL_SLOTS * sizeof(*at_stack_out)); - arg_track_xfer(env, insn, idx, at_out, at_stack_out, instance, callsites); + arg_track_xfer(env, insn, idx, at_out, at_stack_out, + at_stack_arg_entry, instance, callsites); arg_track_log(env, insn, idx, at_in[i], at_stack_in[i], at_out, at_stack_out); /* Propagate to successors within this subprogram */ @@ -1630,7 +1705,7 @@ redo: continue; ti = target - start; - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) changed |= arg_track_join(env, idx, target, r, &at_in[ti][r], at_out[r]); @@ -1685,12 +1760,15 @@ err_free: return err; } -/* Return true if any of R1-R5 is derived from a frame pointer. */ +/* Return true if any of R1-R5 or stack args is derived from a frame pointer. 
*/ static bool has_fp_args(struct arg_track *args) { for (int r = BPF_REG_1; r <= BPF_REG_5; r++) if (arg_is_fp(&args[r])) return true; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + if (arg_is_fp(&args[MAX_BPF_REG + r])) + return true; return false; } @@ -1814,7 +1892,7 @@ static int analyze_subprog(struct bpf_verifier_env *env, /* For each reachable call site in the subprog, recurse into callees */ for (int p = po_start; p < po_end; p++) { int idx = env->cfg.insn_postorder[p]; - struct arg_track callee_args[BPF_REG_5 + 1]; + struct arg_track callee_args[MAX_AT_TRACK_REGS] = {}; struct arg_track none = { .frame = ARG_NONE }; struct bpf_insn *insn = &insns[idx]; struct func_instance *callee_instance; @@ -1829,9 +1907,11 @@ static int analyze_subprog(struct bpf_verifier_env *env, if (callee < 0) continue; - /* Build entry args: R1-R5 from at_in at call site */ + /* Build entry args: R1-R5 and stack args from at_in at call site */ for (int r = BPF_REG_1; r <= BPF_REG_5; r++) callee_args[r] = info[subprog].at_in[j][r]; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + callee_args[MAX_BPF_REG + r] = info[subprog].at_in[j][MAX_BPF_REG + r]; } else if (bpf_calls_callback(env, idx)) { callee = find_callback_subprog(env, insn, idx, &caller_reg, &cb_callee_reg); if (callee == -2) { @@ -1853,6 +1933,8 @@ static int analyze_subprog(struct bpf_verifier_env *env, for (int r = BPF_REG_1; r <= BPF_REG_5; r++) callee_args[r] = none; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + callee_args[MAX_BPF_REG + r] = none; callee_args[cb_callee_reg] = info[subprog].at_in[j][caller_reg]; } else { continue; @@ -2096,7 +2178,7 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env, def = ALL_CALLER_SAVED_REGS; use = def & ~BIT(BPF_REG_0); if (bpf_get_call_summary(env, insn, &cs)) - use = GENMASK(cs.num_params, 1); + use = GENMASK(min_t(u8, cs.num_params, MAX_BPF_FUNC_REG_ARGS), 1); break; default: def = 0; -- cgit v1.2.3 From dc8f1cf6787c4bb1d8cabfac1e44d2d0ab435caa Mon Sep 
17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:49 -0700 Subject: bpf: Reject stack arguments in non-JITed programs The interpreter does not understand the bpf register r11 (BPF_REG_PARAMS) used for stack arguments. So reject interpreter usage if stack arguments are used either in the main program or any subprogram. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045049.2390444-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- kernel/bpf/fixups.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ae10b9ca018d..958d86f0beac 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2599,7 +2599,7 @@ struct bpf_prog *__bpf_prog_select_runtime(struct bpf_verifier_env *env, struct goto finalize; if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || - bpf_prog_has_kfunc_call(fp)) + bpf_prog_has_kfunc_call(fp) || (env && env->subprog_info[0].stack_arg_cnt)) jit_needed = true; if (!bpf_prog_select_interpreter(fp)) diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index ba86039789fd..19056016eed8 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1407,6 +1407,12 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); return -EINVAL; } + for (i = 1; i < env->subprog_cnt; i++) { + if (bpf_in_stack_arg_cnt(&env->subprog_info[i])) { + verbose(env, "stack args are not supported in non-JITed programs\n"); + return -EINVAL; + } + } if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { /* When JIT fails the progs with bpf2bpf calls and tail_calls * have to be rejected, since interpreter doesn't support them yet. 
-- cgit v1.2.3 From 848d624acf668ae0d71b128f163d1d18d2ac6b90 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:54 -0700 Subject: bpf: Prepare architecture JIT support for stack arguments Add bpf_jit_supports_stack_args() as a weak function defaulting to false. Architectures that implement JIT support for stack arguments override it to return true. Reject BPF functions with more than 5 parameters at verification time if the architecture does not support stack arguments. Acked-by: Puranjay Mohan Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045054.2390945-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 1 + kernel/bpf/btf.c | 8 +++++++- kernel/bpf/core.c | 5 +++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 918d9b34eac6..a515a9769078 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1184,6 +1184,7 @@ bool bpf_jit_inlines_helper_call(s32 imm); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_percpu_insn(void); bool bpf_jit_supports_kfunc_call(void); +bool bpf_jit_supports_stack_args(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4fb8641546b8..17d4ab0a8206 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7870,8 +7870,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) MAX_BPF_FUNC_ARGS, tname, nargs); return -EFAULT; } - if (nargs > MAX_BPF_FUNC_REG_ARGS) + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + if (!bpf_jit_supports_stack_args()) { + bpf_log(log, "JIT does not support function %s() with %d args\n", + tname, nargs); + return -EFAULT; + } sub->stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS; + } if (is_global && nargs > MAX_BPF_FUNC_REG_ARGS) { bpf_log(log, "global function %s has %d > %d args, stack args not 
supported\n", diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 958d86f0beac..e6b836f846eb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3217,6 +3217,11 @@ bool __weak bpf_jit_supports_kfunc_call(void) return false; } +bool __weak bpf_jit_supports_stack_args(void) +{ + return false; +} + bool __weak bpf_jit_supports_far_kfunc_call(void) { return false; -- cgit v1.2.3 From 9fae4cba3bfd583198afe15ed4b4433eafafd11c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:59 -0700 Subject: bpf: Enable r11 based insns BPF_REG_PARAMS (r11) is used for stack argument accesses and the following are the only insns in which r11 appears: - load incoming stack arg - store register to outgoing stack arg - store immediate to outgoing stack arg The detailed insn format can be found in the is_stack_arg_ldx/st/stx() helpers. After this patch, stack arg ldx/st/stx insns become valid for the kernel and these insns can be properly checked by the verifier. The LLVM compiler [1] implemented the above BPF_REG_PARAMS insns. 
[1] https://github.com/llvm/llvm-project/pull/189060 Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045059.2391192-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ebd13661933e..b0d3c2d179e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18006,11 +18006,12 @@ static int check_and_resolve_insns(struct bpf_verifier_env *env) return err; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->dst_reg >= MAX_BPF_REG) { + if (insn->dst_reg >= MAX_BPF_REG && + !is_stack_arg_st(insn) && !is_stack_arg_stx(insn)) { verbose(env, "R%d is invalid\n", insn->dst_reg); return -EINVAL; } - if (insn->src_reg >= MAX_BPF_REG) { + if (insn->src_reg >= MAX_BPF_REG && !is_stack_arg_ldx(insn)) { verbose(env, "R%d is invalid\n", insn->src_reg); return -EINVAL; } -- cgit v1.2.3 From e0b7b91c72db6dae0392dd90db3b866218a7870b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:04 -0700 Subject: bpf: Support stack arguments for kfunc calls Extend the stack argument mechanism to kfunc calls, allowing kfuncs with more than 5 parameters to receive additional arguments via the r11-based stack arg area. For kfuncs, the caller is a BPF program and the callee is a kernel function. The BPF program writes outgoing args at negative r11 offsets, following the same convention as BPF-to-BPF calls: Outgoing: r11 - 8 (arg6), ..., r11 - N*8 (last arg) The following is an example: int foo(int a1, int a2, int a3, int a4, int a5, int a6, int a7) { ... kfunc1(a1, a2, a3, a4, a5, a6, a7, a8); ... kfunc2(a1, a2, a3, a4, a5, a6, a7, a8, a9); ... 
} Caller (foo), generated by llvm =============================== Incoming (positive offsets): r11+8: [incoming arg 6] r11+16: [incoming arg 7] Outgoing for kfunc1 (negative offsets): r11-8: [outgoing arg 6] r11-16: [outgoing arg 7] r11-24: [outgoing arg 8] Outgoing for kfunc2 (negative offsets): r11-8: [outgoing arg 6] r11-16: [outgoing arg 7] r11-24: [outgoing arg 8] r11-32: [outgoing arg 9] Later JIT will marshal outgoing arguments to the native calling convention for kfunc1() and kfunc2(). For kfunc calls where stack args are used as constant or size parameters, a mark_stack_arg_precision() helper is used to propagate precision and do proper backtracking. There are two places where meta->release_regno needs to keep regno for later releasing the reference. Also, 'cur_aux(env)->arg_prog = regno' is also keeping regno for later fixup. Since stack arguments don't have a valid register number (regno is negative), these three cases are rejected for now if the argument is on the stack. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045104.2391543-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 77 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 17 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b0d3c2d179e4..1a734ab91a31 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11160,14 +11160,12 @@ bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta) } static enum kfunc_ptr_arg_type -get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, - struct bpf_kfunc_call_arg_meta *meta, +get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_func_state *caller, + struct bpf_reg_state *regs, struct bpf_kfunc_call_arg_meta *meta, const struct btf_type *t, const struct btf_type *ref_t, const char *ref_tname, const struct btf_param *args, int arg, int nargs, argno_t argno, struct bpf_reg_state *reg) { - u32 regno = arg + 
1; - struct bpf_reg_state *regs = cur_regs(env); bool arg_mem_size = false; if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || @@ -11176,8 +11174,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, return KF_ARG_PTR_TO_CTX; if (arg + 1 < nargs && - (is_kfunc_arg_mem_size(meta->btf, &args[arg + 1], &regs[regno + 1]) || - is_kfunc_arg_const_mem_size(meta->btf, &args[arg + 1], &regs[regno + 1]))) + (is_kfunc_arg_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1)) || + is_kfunc_arg_const_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1)))) arg_mem_size = true; /* In this function, we verify the kfunc's BTF as per the argument type, @@ -11842,6 +11840,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int insn_idx) { const char *func_name = meta->func_name, *ref_tname; + struct bpf_func_state *caller = cur_func(env); + struct bpf_reg_state *regs = cur_regs(env); const struct btf *btf = meta->btf; const struct btf_param *args; struct btf_record *rec; @@ -11850,21 +11850,31 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ args = (const struct btf_param *)(meta->func_proto + 1); nargs = btf_type_vlen(meta->func_proto); - if (nargs > MAX_BPF_FUNC_REG_ARGS) { + if (nargs > MAX_BPF_FUNC_ARGS) { verbose(env, "Function %s has %d > %d args\n", func_name, nargs, - MAX_BPF_FUNC_REG_ARGS); + MAX_BPF_FUNC_ARGS); return -EINVAL; } + if (nargs > MAX_BPF_FUNC_REG_ARGS && !bpf_jit_supports_stack_args()) { + verbose(env, "JIT does not support kfunc %s() with %d args\n", + func_name, nargs); + return -ENOTSUPP; + } + + ret = check_outgoing_stack_args(env, caller, nargs); + if (ret) + return ret; /* Check that BTF function arguments match actual types that the * verifier sees. 
*/ for (i = 0; i < nargs; i++) { - struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1]; + struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i); const struct btf_type *t, *ref_t, *resolve_ret; enum bpf_arg_type arg_type = ARG_DONTCARE; argno_t argno = argno_from_arg(i + 1); - u32 regno = i + 1, ref_id, type_size; + int regno = reg_from_argno(argno); + u32 ref_id, type_size; bool is_ret_buf_sz = false; int kf_arg_type; @@ -11874,6 +11884,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); return -EFAULT; } + if (regno < 0) { + verbose(env, "%s prog->aux cannot be a stack argument\n", + reg_arg_name(env, argno)); + return -EINVAL; + } meta->arg_prog = true; cur_aux(env)->arg_prog = regno; continue; @@ -11900,7 +11915,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ reg_arg_name(env, argno)); return -EINVAL; } - ret = mark_chain_precision(env, regno); + if (regno >= 0) + ret = mark_chain_precision(env, regno); + else + ret = mark_stack_arg_precision(env, i); if (ret < 0) return ret; meta->arg_constant.found = true; @@ -11925,7 +11943,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } meta->r0_size = reg->var_off.value; - ret = mark_chain_precision(env, regno); + if (regno >= 0) + ret = mark_chain_precision(env, regno); + else + ret = mark_stack_arg_precision(env, i); if (ret) return ret; } @@ -11953,14 +11974,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } meta->ref_obj_id = reg->ref_obj_id; - if (is_kfunc_release(meta)) + if (is_kfunc_release(meta)) { + if (regno < 0) { + verbose(env, "%s release arg cannot be a stack argument\n", + reg_arg_name(env, argno)); + return -EINVAL; + } meta->release_regno = regno; + } } ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, 
ref_t->name_off); - kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs, argno, reg); + kf_arg_type = get_kfunc_ptr_arg_type(env, caller, regs, meta, t, ref_t, ref_tname, + args, i, nargs, argno, reg); if (kf_arg_type < 0) return kf_arg_type; @@ -12110,6 +12138,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_FILE; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { dynptr_arg_type |= DYNPTR_TYPE_FILE | OBJ_RELEASE; + if (regno < 0) { + verbose(env, "%s release arg cannot be a stack argument\n", + reg_arg_name(env, argno)); + return -EINVAL; + } meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { @@ -12264,9 +12297,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_MEM_SIZE: { - struct bpf_reg_state *buff_reg = &regs[regno]; + struct bpf_reg_state *buff_reg = reg; const struct btf_param *buff_arg = &args[i]; - struct bpf_reg_state *size_reg = &regs[regno + 1]; + struct bpf_reg_state *size_reg = get_func_arg_reg(caller, regs, i + 1); const struct btf_param *size_arg = &args[i + 1]; argno_t next_argno = argno_from_arg(i + 2); @@ -13171,8 +13204,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, clear_all_pkt_pointers(env); nargs = btf_type_vlen(meta.func_proto); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + struct bpf_func_state *caller = cur_func(env); + struct bpf_subprog_info *caller_info = &env->subprog_info[caller->subprogno]; + u16 out_stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS; + u16 stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + out_stack_arg_cnt; + + if (stack_arg_cnt > caller_info->stack_arg_cnt) + caller_info->stack_arg_cnt = stack_arg_cnt; + } + args = (const struct btf_param *)(meta.func_proto + 1); - for (i = 0; i < nargs; i++) { + for (i = 0; i < 
min_t(int, nargs, MAX_BPF_FUNC_REG_ARGS); i++) { u32 regno = i + 1; t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL); -- cgit v1.2.3 From 35b78733160c120767332d924a0447a87109bbde Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:09 -0700 Subject: bpf: Reject stack arguments if tail call reachable Tail calls are deprecated and will be replaced by indirect calls in the future. Reject programs that combine tail calls with stack arguments rather than adding complexity for a deprecated feature. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045109.2392108-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1a734ab91a31..a10cc045057d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5267,14 +5267,23 @@ continue_func: * this info will be utilized by JIT so that we will be preserving the * tail call counter throughout bpf2bpf calls combined with tailcalls */ - if (tail_call_reachable) + if (tail_call_reachable) { for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) { if (subprog[tmp].is_exception_cb) { verbose(env, "cannot tail call within exception cb\n"); return -EINVAL; } + if (subprog[tmp].stack_arg_cnt) { + verbose(env, "tail_calls are not allowed in programs with stack args\n"); + return -EINVAL; + } subprog[tmp].tail_call_reachable = true; } + } else if (!idx && subprog[0].has_tail_call && subprog[0].stack_arg_cnt) { + verbose(env, "tail_calls are not allowed in programs with stack args\n"); + return -EINVAL; + } + if (subprog[0].tail_call_reachable) env->prog->aux->tail_call_reachable = true; -- cgit v1.2.3 From cb6af5314056cb06456cfa8774aa158d61929bcd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:14 -0700 Subject: bpf: Disable private stack for x86_64 if stack arguments used Other architectures like arm64, 
riscv, etc. have enough registers and for them private stack can be used together with stack arguments. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045114.2392291-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a10cc045057d..82b9531f87f6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5163,7 +5163,10 @@ process_func: } subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth); - if (priv_stack_supported) { + if (IS_ENABLED(CONFIG_X86_64) && subprog[idx].stack_arg_cnt) { + /* x86-64 uses R9 for both private stack frame pointer and arg6. */ + subprog[idx].priv_stack_mode = NO_PRIV_STACK; + } else if (priv_stack_supported) { /* Request private stack support only if the subprog stack * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to * avoid jit penalty if the stack usage is small. -- cgit v1.2.3 From 324c3ca6eed6fb7ec4e50f31d537953038b13c5f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:19 -0700 Subject: bpf,x86: Implement JIT support for stack arguments Add x86_64 JIT support for BPF functions and kfuncs with more than 5 arguments. The extra arguments are passed through a stack area addressed by register r11 (BPF_REG_PARAMS) in BPF bytecode, which the JIT translates to native code. The JIT follows the x86-64 calling convention for both BPF-to-BPF and kfunc calls: - Arg 6 is passed in the R9 register - Args 7+ are passed on the stack Incoming arg 6 (BPF r11+8) is translated to a MOV from R9 rather than a memory load. Incoming args 7+ (BPF r11+16, r11+24, ...) map directly to [rbp + 16], [rbp + 24], ..., matching the x86-64 stack layout after CALL + PUSH RBP, so no offset adjustment is needed. 
tail_call_reachable is rejected by the verifier and priv_stack is disabled by the JIT when stack args exist, so R9 is always available. When BPF bytecode writes to the arg-6 stack slot (offset -8), the JIT emits a MOV into R9 instead of a memory store. Outgoing args 7+ are placed at [rsp] in a pre-allocated area below callee-saved registers, using: native_off = outgoing_arg_base - outgoing_rsp - bpf_off - 16 The native x86_64 stack layout with stack arguments: high address +-------------------------+ | incoming stack arg N | [rbp + 16 + (N-7)*8] (from caller) | ... | | incoming stack arg 7 | [rbp + 16] +-------------------------+ | return address | [rbp + 8] | saved rbp | [rbp] +-------------------------+ | BPF program stack | (round_up(stack_depth, 8) bytes) +-------------------------+ | callee-saved regs | (r12, rbx, r13, r14, r15 as needed) +-------------------------+ | outgoing arg M | [rsp + (M-7)*8] | ... | | outgoing arg 7 | [rsp] +-------------------------+ rsp low address Acked-by: Puranjay Mohan Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045122.2393118-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 149 ++++++++++++++++++++++++++++++++++++++++++-- include/linux/bpf.h | 1 + kernel/bpf/core.c | 10 +++ 3 files changed, 154 insertions(+), 6 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ea9e707e8abf..ceefefb4da21 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -390,6 +391,34 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) *pprog = prog; } +/* add rsp, depth */ +static void emit_add_rsp(u8 **pprog, u16 depth) +{ + u8 *prog = *pprog; + + if (!depth) + return; + if (is_imm8(depth)) + EMIT4(0x48, 0x83, 0xC4, depth); /* add rsp, imm8 */ + else + EMIT3_off32(0x48, 0x81, 0xC4, depth); /* add rsp, imm32 */ + *pprog = prog; +} 
+ +/* sub rsp, depth */ +static void emit_sub_rsp(u8 **pprog, u16 depth) +{ + u8 *prog = *pprog; + + if (!depth) + return; + if (is_imm8(depth)) + EMIT4(0x48, 0x83, 0xEC, depth); /* sub rsp, imm8 */ + else + EMIT3_off32(0x48, 0x81, 0xEC, depth); /* sub rsp, imm32 */ + *pprog = prog; +} + static void emit_nops(u8 **pprog, int len) { u8 *prog = *pprog; @@ -1659,21 +1688,47 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int * bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; void __percpu *priv_frame_ptr = NULL; + u16 out_stack_arg_cnt, outgoing_rsp; u64 arena_vm_start, user_vm_start; void __percpu *priv_stack_ptr; int i, excnt = 0; int ilen, proglen = 0; u8 *ip, *prog = temp; u32 stack_depth; + int callee_saved_size; + s32 outgoing_arg_base; int err; stack_depth = bpf_prog->aux->stack_depth; + out_stack_arg_cnt = bpf_out_stack_arg_cnt(env, bpf_prog); priv_stack_ptr = bpf_prog->aux->priv_stack_ptr; if (priv_stack_ptr) { priv_frame_ptr = priv_stack_ptr + PRIV_STACK_GUARD_SZ + round_up(stack_depth, 8); stack_depth = 0; } + /* + * Follow x86-64 calling convention for both BPF-to-BPF and + * kfunc calls: + * - Arg 6 is passed in R9 register + * - Args 7+ are passed on the stack at [rsp] + * + * Incoming arg 6 is read from R9 (BPF r11+8 → MOV from R9). + * Incoming args 7+ are read from [rbp + 16], [rbp + 24], ... + * (BPF r11+16, r11+24, ... map directly with no offset change). + * + * tail_call_reachable is rejected by the verifier and priv_stack + * is disabled by the JIT when stack args exist, so R9 is always + * available. + * + * Stack layout (high to low): + * [rbp + 16 + ...] 
incoming stack args 7+ (from caller) + * [rbp + 8] return address + * [rbp] saved rbp + * [rbp - prog_stack] program stack + * [below] callee-saved regs + * [below] outgoing args 7+ (= rsp) + */ arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena); user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena); @@ -1700,6 +1755,42 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int * push_r12(&prog); push_callee_regs(&prog, callee_regs_used); } + + /* Compute callee-saved register area size. */ + callee_saved_size = 0; + if (bpf_prog->aux->exception_boundary || arena_vm_start) + callee_saved_size += 8; /* r12 */ + if (bpf_prog->aux->exception_boundary) { + callee_saved_size += 4 * 8; /* rbx, r13, r14, r15 */ + } else { + int j; + + for (j = 0; j < 4; j++) + if (callee_regs_used[j]) + callee_saved_size += 8; + } + /* + * Base offset from rbp for translating BPF outgoing args 7+ + * to native offsets. BPF uses negative offsets from r11 + * (r11-8 for arg6, r11-16 for arg7, ...) while x86 uses + * positive offsets from rsp ([rsp+0] for arg7, [rsp+8] for + * arg8, ...). Arg 6 goes to R9 directly. + * + * The translation reverses direction: + * native_off = outgoing_arg_base - outgoing_rsp - bpf_off - 16 + * + * Note that tail_call_reachable is guaranteed to be false when + * stack args exist, so tcc pushes need not be accounted for. + */ + outgoing_arg_base = -(round_up(stack_depth, 8) + callee_saved_size); + + /* + * Allocate outgoing stack arg area for args 7+ only. + * Arg 6 goes into r9 register, not on stack. + */ + outgoing_rsp = out_stack_arg_cnt > 1 ? 
(out_stack_arg_cnt - 1) * 8 : 0; + emit_sub_rsp(&prog, outgoing_rsp); + if (arena_vm_start) emit_mov_imm64(&prog, X86_REG_R12, arena_vm_start >> 32, (u32) arena_vm_start); @@ -1721,7 +1812,7 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int * u8 b2 = 0, b3 = 0; u8 *start_of_ldx; s64 jmp_offset; - s16 insn_off; + s32 insn_off; u8 jmp_cond; u8 *func; int nops; @@ -2134,12 +2225,27 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int * EMIT1(0xC7); goto st; case BPF_ST | BPF_MEM | BPF_DW: + if (dst_reg == BPF_REG_PARAMS && insn->off == -8) { + /* Arg 6: store immediate in r9 register */ + emit_mov_imm64(&prog, X86_REG_R9, imm32 >> 31, (u32)imm32); + break; + } EMIT2(add_1mod(0x48, dst_reg), 0xC7); -st: if (is_imm8(insn->off)) - EMIT2(add_1reg(0x40, dst_reg), insn->off); +st: insn_off = insn->off; + if (dst_reg == BPF_REG_PARAMS) { + /* + * Args 7+: reverse BPF negative offsets to + * x86 positive rsp offsets. + * BPF off=-16 → [rsp+0], off=-24 → [rsp+8], ... 
+ */ + insn_off = outgoing_arg_base - outgoing_rsp - insn_off - 16; + dst_reg = BPF_REG_FP; + } + if (is_imm8(insn_off)) + EMIT2(add_1reg(0x40, dst_reg), insn_off); else - EMIT1_off32(add_1reg(0x80, dst_reg), insn->off); + EMIT1_off32(add_1reg(0x80, dst_reg), insn_off); EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(insn->code))); break; @@ -2149,7 +2255,17 @@ st: if (is_imm8(insn->off)) case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_W: case BPF_STX | BPF_MEM | BPF_DW: - emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + if (dst_reg == BPF_REG_PARAMS && insn->off == -8) { + /* Arg 6: store register value in r9 */ + EMIT_mov(X86_REG_R9, src_reg); + break; + } + insn_off = insn->off; + if (dst_reg == BPF_REG_PARAMS) { + insn_off = outgoing_arg_base - outgoing_rsp - insn_off - 16; + dst_reg = BPF_REG_FP; + } + emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off); break; case BPF_ST | BPF_PROBE_MEM32 | BPF_B: @@ -2248,6 +2364,19 @@ populate_extable: case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: insn_off = insn->off; + if (src_reg == BPF_REG_PARAMS) { + if (insn_off == 8) { + /* Incoming arg 6: read from r9 */ + EMIT_mov(dst_reg, X86_REG_R9); + break; + } + src_reg = BPF_REG_FP; + /* + * Incoming args 7+: native_off == bpf_off + * (r11+16 → [rbp+16], r11+24 → [rbp+24], ...) + * No offset adjustment needed. + */ + } if (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEMSX) { @@ -2736,6 +2865,8 @@ emit_jmp: if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog)) return -EINVAL; } + /* Deallocate outgoing args 7+ area. 
*/ + emit_add_rsp(&prog, outgoing_rsp); if (bpf_prog->aux->exception_boundary) { pop_callee_regs(&prog, all_callee_regs_used); pop_r12(&prog); @@ -3793,7 +3924,8 @@ skip_init_addrs: for (pass = 0; pass < MAX_PASSES || image; pass++) { if (!padding && pass >= PADDING_PASSES) padding = true; - proglen = do_jit(env, prog, addrs, image, rw_image, oldproglen, &ctx, padding); + proglen = do_jit(env, prog, addrs, image, rw_image, oldproglen, + &ctx, padding); if (proglen <= 0) { out_image: image = NULL; @@ -3910,6 +4042,11 @@ bool bpf_jit_supports_kfunc_call(void) return true; } +bool bpf_jit_supports_stack_args(void) +{ + return true; +} + void *bpf_arch_text_copy(void *dst, void *src, size_t len) { if (text_poke_copy(dst, src, len) == NULL) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e16e91647d3..242f9597d9ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1548,6 +1548,7 @@ void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struct bpf_prog *prog, int insn_idx); +u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e6b836f846eb..427a6d828e01 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1582,6 +1582,16 @@ bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struc insn_idx += prog->aux->subprog_start; return env->insn_aux_data[insn_idx].indirect_target; } + +u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog) +{ + const struct bpf_subprog_info *sub; + + if (!env) + return 0; + sub = &env->subprog_info[prog->aux->func_idx]; + return sub->stack_arg_cnt - bpf_in_stack_arg_cnt(sub); +} #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. 
Needs to go into .text section, -- cgit v1.2.3 From 79e7ec00634e95e20217ba922906574041b9bbf0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:27 -0700 Subject: selftests/bpf: Add tests for BPF function stack arguments Add selftests covering stack argument passing for both BPF-to-BPF subprog calls and kfunc calls with more than 5 arguments. All tests are guarded by __BPF_FEATURE_STACK_ARGUMENT and __TARGET_ARCH_x86. BPF-to-BPF subprog call tests (stack_arg.c): - Scalar stack args - Pointer stack args - Mixed pointer/scalar stack args - Nested calls - Dynptr stack arg - Two callees with different stack arg counts - Async callback Kfunc call tests (stack_arg_kfunc.c, with bpf_testmod kfuncs): - Scalar stack args - Pointer stack args - Mixed pointer/scalar stack args - Dynptr stack arg - Memory buffer + size pair - Iterator - Const string pointer - Timer pointer Acked-by: Puranjay Mohan Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045127.2397187-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/stack_arg.c | 139 ++++++++++++ tools/testing/selftests/bpf/progs/stack_arg.c | 252 +++++++++++++++++++++ .../testing/selftests/bpf/progs/stack_arg_kfunc.c | 163 +++++++++++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 65 ++++++ .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 20 +- 5 files changed, 638 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/stack_arg.c create mode 100644 tools/testing/selftests/bpf/progs/stack_arg.c create mode 100644 tools/testing/selftests/bpf/progs/stack_arg_kfunc.c diff --git a/tools/testing/selftests/bpf/prog_tests/stack_arg.c b/tools/testing/selftests/bpf/prog_tests/stack_arg.c new file mode 100644 index 000000000000..d61bac33f809 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/stack_arg.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. 
and affiliates. */ + +#include +#include +#include "stack_arg.skel.h" +#include "stack_arg_kfunc.skel.h" + +static void run_subtest(struct bpf_program *prog, int expected) +{ + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, expected, "retval"); +} + +static void test_global_many(void) +{ + struct stack_arg *skel; + + skel = stack_arg__open(); + if (!ASSERT_OK_PTR(skel, "open")) + return; + + if (!skel->rodata->has_stack_arg) { + test__skip(); + goto out; + } + + if (!ASSERT_OK(stack_arg__load(skel), "load")) + goto out; + + run_subtest(skel->progs.test_global_many_args, 36); + +out: + stack_arg__destroy(skel); +} + +static void test_async_cb_many(void) +{ + struct stack_arg *skel; + + skel = stack_arg__open(); + if (!ASSERT_OK_PTR(skel, "open")) + return; + + if (!skel->rodata->has_stack_arg) { + test__skip(); + goto out; + } + + if (!ASSERT_OK(stack_arg__load(skel), "load")) + goto out; + + run_subtest(skel->progs.test_async_cb_many_args, 0); + + /* Wait for the timer callback to fire and verify the result. 
+ * 10+20+30+40+50+60+70+80 = 360 + */ + usleep(50); + ASSERT_EQ(skel->bss->timer_result, 360, "timer_result"); + +out: + stack_arg__destroy(skel); +} + +static void test_bpf2bpf(void) +{ + struct stack_arg *skel; + + skel = stack_arg__open(); + if (!ASSERT_OK_PTR(skel, "open")) + return; + + if (!skel->rodata->has_stack_arg) { + test__skip(); + goto out; + } + + if (!ASSERT_OK(stack_arg__load(skel), "load")) + goto out; + + run_subtest(skel->progs.test_bpf2bpf_ptr_stack_arg, 45); + run_subtest(skel->progs.test_bpf2bpf_mix_stack_args, 51); + run_subtest(skel->progs.test_bpf2bpf_nesting_stack_arg, 50); + run_subtest(skel->progs.test_bpf2bpf_dynptr_stack_arg, 69); + run_subtest(skel->progs.test_two_callees, 91); + +out: + stack_arg__destroy(skel); +} + +static void test_kfunc(void) +{ + struct stack_arg_kfunc *skel; + + skel = stack_arg_kfunc__open(); + if (!ASSERT_OK_PTR(skel, "open")) + return; + + if (!skel->rodata->has_stack_arg) { + test__skip(); + goto out; + } + + if (!ASSERT_OK(stack_arg_kfunc__load(skel), "load")) + goto out; + + run_subtest(skel->progs.test_stack_arg_scalar, 36); + run_subtest(skel->progs.test_stack_arg_ptr, 45); + run_subtest(skel->progs.test_stack_arg_mix, 51); + run_subtest(skel->progs.test_stack_arg_dynptr, 69); + run_subtest(skel->progs.test_stack_arg_mem, 151); + run_subtest(skel->progs.test_stack_arg_iter, 115); + run_subtest(skel->progs.test_stack_arg_const_str, 15); + run_subtest(skel->progs.test_stack_arg_timer, 15); + +out: + stack_arg_kfunc__destroy(skel); +} + +void test_stack_arg(void) +{ + if (test__start_subtest("global_many_args")) + test_global_many(); + if (test__start_subtest("async_cb_many_args")) + test_async_cb_many(); + if (test__start_subtest("bpf2bpf")) + test_bpf2bpf(); + if (test__start_subtest("kfunc")) + test_kfunc(); +} diff --git a/tools/testing/selftests/bpf/progs/stack_arg.c b/tools/testing/selftests/bpf/progs/stack_arg.c new file mode 100644 index 000000000000..ab6240b997c5 --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/stack_arg.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include "bpf_kfuncs.h" + +#define CLOCK_MONOTONIC 1 + +struct timer_elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct timer_elem); +} timer_map SEC(".maps"); + +int timer_result; + +#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) + +const volatile bool has_stack_arg = true; + +__noinline static int static_func_many_args(int a, int b, int c, int d, + int e, int f, int g, int h) +{ + return a + b + c + d + e + f + g + h; +} + +__noinline int global_calls_many_args(int a, int b, int c) +{ + return static_func_many_args(a, b, c, 4, 5, 6, 7, 8); +} + +SEC("tc") +int test_global_many_args(void) +{ + return global_calls_many_args(1, 2, 3); +} + +struct test_data { + long x; + long y; +}; + +/* 1 + 2 + 3 + 4 + 5 + 10 + 20 = 45 */ +__noinline static long func_with_ptr_stack_arg(long a, long b, long c, long d, + long e, struct test_data *p) +{ + return a + b + c + d + e + p->x + p->y; +} + +__noinline long global_ptr_stack_arg(long a, long b, long c, long d, long e) +{ + struct test_data data = { .x = 10, .y = 20 }; + + return func_with_ptr_stack_arg(a, b, c, d, e, &data); +} + +SEC("tc") +int test_bpf2bpf_ptr_stack_arg(void) +{ + return global_ptr_stack_arg(1, 2, 3, 4, 5); +} + +/* 1 + 2 + 3 + 4 + 5 + 10 + 6 + 20 = 51 */ +__noinline static long func_with_mix_stack_args(long a, long b, long c, long d, + long e, struct test_data *p, + long f, struct test_data *q) +{ + return a + b + c + d + e + p->x + f + q->y; +} + +__noinline long global_mix_stack_args(long a, long b, long c, long d, long e) +{ + struct test_data p = { .x = 10 }; + struct test_data q = { .y = 20 }; + + return func_with_mix_stack_args(a, b, c, d, e, &p, e + 1, &q); +} + +SEC("tc") +int 
test_bpf2bpf_mix_stack_args(void) +{ + return global_mix_stack_args(1, 2, 3, 4, 5); +} + +/* + * Nesting test: func_outer calls func_inner, both with struct pointer + * as stack arg. + * + * func_inner: (a+1) + (b+1) + (c+1) + (d+1) + (e+1) + p->x + p->y + * = 2 + 3 + 4 + 5 + 6 + 10 + 20 = 50 + */ +__noinline static long func_inner_ptr(long a, long b, long c, long d, + long e, struct test_data *p) +{ + return a + b + c + d + e + p->x + p->y; +} + +__noinline static long func_outer_ptr(long a, long b, long c, long d, + long e, struct test_data *p) +{ + return func_inner_ptr(a + 1, b + 1, c + 1, d + 1, e + 1, p); +} + +__noinline long global_nesting_ptr(long a, long b, long c, long d, long e) +{ + struct test_data data = { .x = 10, .y = 20 }; + + return func_outer_ptr(a, b, c, d, e, &data); +} + +SEC("tc") +int test_bpf2bpf_nesting_stack_arg(void) +{ + return global_nesting_ptr(1, 2, 3, 4, 5); +} + +/* 1 + 2 + 3 + 4 + 5 + sizeof(pkt_v4) = 15 + 54 = 69 */ +__noinline static long func_with_dynptr(long a, long b, long c, long d, + long e, struct bpf_dynptr *ptr) +{ + return a + b + c + d + e + bpf_dynptr_size(ptr); +} + +__noinline long global_dynptr_stack_arg(void *ctx __arg_ctx, long a, long b, + long c, long d) +{ + struct bpf_dynptr ptr; + + bpf_dynptr_from_skb(ctx, 0, &ptr); + return func_with_dynptr(a, b, c, d, d + 1, &ptr); +} + +SEC("tc") +int test_bpf2bpf_dynptr_stack_arg(struct __sk_buff *skb) +{ + return global_dynptr_stack_arg(skb, 1, 2, 3, 4); +} + +/* foo1: a+b+c+d+e+f+g+h */ +__noinline static int foo1(int a, int b, int c, int d, + int e, int f, int g, int h) +{ + return a + b + c + d + e + f + g + h; +} + +/* foo2: a+b+c+d+e+f+g+h+i+j */ +__noinline static int foo2(int a, int b, int c, int d, int e, + int f, int g, int h, int i, int j) +{ + return a + b + c + d + e + f + g + h + i + j; +} + +/* global_two_callees calls foo1 (3 stack args) and foo2 (5 stack args). + * The outgoing stack arg area is sized for foo2 (the larger callee). 
+ * Stores for foo1 are a subset of the area used by foo2. + * Result: foo1(1,2,3,4,5,6,7,8) + foo2(1,2,3,4,5,6,7,8,9,10) = 36 + 55 = 91 + * + * Pass a-e through so the compiler can't constant-fold the stack args away. + */ +__noinline int global_two_callees(int a, int b, int c, int d, int e) +{ + int ret; + + ret = foo1(a, b, c, d, e, a + 5, a + 6, a + 7); + ret += foo2(a, b, c, d, e, a + 5, a + 6, a + 7, a + 8, a + 9); + return ret; +} + +SEC("tc") +int test_two_callees(void) +{ + return global_two_callees(1, 2, 3, 4, 5); +} + +static int timer_cb_many_args(void *map, int *key, struct bpf_timer *timer) +{ + timer_result = static_func_many_args(10, 20, 30, 40, 50, 60, 70, 80); + return 0; +} + +SEC("tc") +int test_async_cb_many_args(void) +{ + struct timer_elem *elem; + int key = 0; + + elem = bpf_map_lookup_elem(&timer_map, &key); + if (!elem) + return -1; + + bpf_timer_init(&elem->timer, &timer_map, CLOCK_MONOTONIC); + bpf_timer_set_callback(&elem->timer, timer_cb_many_args); + bpf_timer_start(&elem->timer, 1, 0); + return 0; +} + +#else + +const volatile bool has_stack_arg = false; + +SEC("tc") +int test_global_many_args(void) +{ + return 0; +} + +SEC("tc") +int test_bpf2bpf_ptr_stack_arg(void) +{ + return 0; +} + +SEC("tc") +int test_bpf2bpf_mix_stack_args(void) +{ + return 0; +} + +SEC("tc") +int test_bpf2bpf_nesting_stack_arg(void) +{ + return 0; +} + +SEC("tc") +int test_bpf2bpf_dynptr_stack_arg(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_two_callees(void) +{ + return 0; +} + +SEC("tc") +int test_async_cb_many_args(void) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c b/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c new file mode 100644 index 000000000000..fa9def876ea5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. 
and affiliates. */ + +#include +#include +#include "bpf_kfuncs.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) + +const volatile bool has_stack_arg = true; + +struct bpf_iter_testmod_seq { + u64 :64; + u64 :64; +}; + +extern int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) __ksym; +extern void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) __ksym; + +struct timer_map_value { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct timer_map_value); +} kfunc_timer_map SEC(".maps"); + +SEC("tc") +int test_stack_arg_scalar(struct __sk_buff *skb) +{ + return bpf_kfunc_call_stack_arg(1, 2, 3, 4, 5, 6, 7, 8); +} + +SEC("tc") +int test_stack_arg_ptr(struct __sk_buff *skb) +{ + struct prog_test_pass1 p = { .x0 = 10, .x1 = 20 }; + + return bpf_kfunc_call_stack_arg_ptr(1, 2, 3, 4, 5, &p); +} + +SEC("tc") +int test_stack_arg_mix(struct __sk_buff *skb) +{ + struct prog_test_pass1 p = { .x0 = 10 }; + struct prog_test_pass1 q = { .x1 = 20 }; + + return bpf_kfunc_call_stack_arg_mix(1, 2, 3, 4, 5, &p, 6, &q); +} + +/* 1 + 2 + 3 + 4 + 5 + sizeof(pkt_v4) = 15 + 54 = 69 */ +SEC("tc") +int test_stack_arg_dynptr(struct __sk_buff *skb) +{ + struct bpf_dynptr ptr; + + bpf_dynptr_from_skb(skb, 0, &ptr); + return bpf_kfunc_call_stack_arg_dynptr(1, 2, 3, 4, 5, &ptr); +} + +/* 1 + 2 + 3 + 4 + 5 + (1 + 2 + ... 
+ 16) = 15 + 136 = 151 */ +SEC("tc") +int test_stack_arg_mem(struct __sk_buff *skb) +{ + char buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + + return bpf_kfunc_call_stack_arg_mem(1, 2, 3, 4, 5, buf, sizeof(buf)); +} + +/* 1 + 2 + 3 + 4 + 5 + 100 = 115 */ +SEC("tc") +int test_stack_arg_iter(struct __sk_buff *skb) +{ + struct bpf_iter_testmod_seq it; + u64 ret; + + bpf_iter_testmod_seq_new(&it, 100, 10); + ret = bpf_kfunc_call_stack_arg_iter(1, 2, 3, 4, 5, &it); + bpf_iter_testmod_seq_destroy(&it); + return ret; +} + +const char cstr[] = "hello"; + +/* 1 + 2 + 3 + 4 + 5 = 15 */ +SEC("tc") +int test_stack_arg_const_str(struct __sk_buff *skb) +{ + return bpf_kfunc_call_stack_arg_const_str(1, 2, 3, 4, 5, cstr); +} + +/* 1 + 2 + 3 + 4 + 5 = 15 */ +SEC("tc") +int test_stack_arg_timer(struct __sk_buff *skb) +{ + struct timer_map_value *val; + int key = 0; + + val = bpf_map_lookup_elem(&kfunc_timer_map, &key); + if (!val) + return 0; + return bpf_kfunc_call_stack_arg_timer(1, 2, 3, 4, 5, &val->timer); +} + +#else + +const volatile bool has_stack_arg = false; + +SEC("tc") +int test_stack_arg_scalar(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_ptr(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_mix(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_dynptr(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_mem(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_iter(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_const_str(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_timer(struct __sk_buff *skb) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index d876314a4d67..aef2f68b7e83 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c 
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -825,6 +825,63 @@ __bpf_kfunc int bpf_kfunc_call_test5(u8 a, u16 b, u32 c) return 0; } +__bpf_kfunc u64 bpf_kfunc_call_stack_arg(u64 a, u64 b, u64 c, u64 d, + u64 e, u64 f, u64 g, u64 h) +{ + return a + b + c + d + e + f + g + h; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_ptr(u64 a, u64 b, u64 c, u64 d, u64 e, + struct prog_test_pass1 *p) +{ + return a + b + c + d + e + p->x0 + p->x1; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_mix(u64 a, u64 b, u64 c, u64 d, u64 e, + struct prog_test_pass1 *p, u64 f, + struct prog_test_pass1 *q) +{ + return a + b + c + d + e + p->x0 + f + q->x1; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_dynptr(u64 a, u64 b, u64 c, u64 d, u64 e, + struct bpf_dynptr *ptr) +{ + const struct bpf_dynptr_kern *kern_ptr = (void *)ptr; + + return a + b + c + d + e + (kern_ptr->size & 0xFFFFFF); +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_mem(u64 a, u64 b, u64 c, u64 d, u64 e, + void *mem, int mem__sz) +{ + const unsigned char *p = mem; + u64 sum = a + b + c + d + e; + int i; + + for (i = 0; i < mem__sz; i++) + sum += p[i]; + return sum; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_iter(u64 a, u64 b, u64 c, u64 d, u64 e, + struct bpf_iter_testmod_seq *it__iter) +{ + return a + b + c + d + e + it__iter->value; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_const_str(u64 a, u64 b, u64 c, u64 d, u64 e, + const char *str__str) +{ + return a + b + c + d + e; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_timer(u64 a, u64 b, u64 c, u64 d, u64 e, + struct bpf_timer *timer) +{ + return a + b + c + d + e; +} + static struct prog_test_ref_kfunc prog_test_struct = { .a = 42, .b = 108, @@ -1288,6 +1345,14 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test2) BTF_ID_FLAGS(func, bpf_kfunc_call_test3) BTF_ID_FLAGS(func, bpf_kfunc_call_test4) BTF_ID_FLAGS(func, bpf_kfunc_call_test5) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_ptr) +BTF_ID_FLAGS(func, 
bpf_kfunc_call_stack_arg_mix) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_dynptr) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_mem) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_iter) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_const_str) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_timer) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index aa0b8d41e71b..2c1cb118f886 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -26,6 +26,8 @@ struct prog_test_ref_kfunc { }; #endif +struct bpf_iter_testmod_seq; + struct prog_test_pass1 { int x0; struct { @@ -111,7 +113,23 @@ int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym; struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; long bpf_kfunc_call_test4(signed char a, short b, int c, long d) __ksym; int bpf_kfunc_call_test5(__u8 a, __u16 b, __u32 c) __ksym; - +__u64 bpf_kfunc_call_stack_arg(__u64 a, __u64 b, __u64 c, __u64 d, + __u64 e, __u64 f, __u64 g, __u64 h) __ksym; +__u64 bpf_kfunc_call_stack_arg_ptr(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + struct prog_test_pass1 *p) __ksym; +__u64 bpf_kfunc_call_stack_arg_mix(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + struct prog_test_pass1 *p, __u64 f, + struct prog_test_pass1 *q) __ksym; +__u64 bpf_kfunc_call_stack_arg_dynptr(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + struct bpf_dynptr *ptr) __ksym; +__u64 bpf_kfunc_call_stack_arg_mem(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + void *mem, int mem__sz) __ksym; +__u64 bpf_kfunc_call_stack_arg_iter(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + struct bpf_iter_testmod_seq *it__iter) __ksym; +__u64 bpf_kfunc_call_stack_arg_const_str(__u64 a, __u64 
b, __u64 c, __u64 d, __u64 e, + const char *str__str) __ksym; +__u64 bpf_kfunc_call_stack_arg_timer(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + struct bpf_timer *timer) __ksym; void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) __ksym; void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) __ksym; void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym; -- cgit v1.2.3 From 9f42204c62d51d666df0acb83af8d154c7580ace Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:32 -0700 Subject: selftests/bpf: Add tests for stack argument validation Add negative tests that verify the kfunc (rejecting kfunc call with >8 byte struct as stack argument) and the verifier (rejecting invalid uses of r11 for stack arguments). Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045132.2398371-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/stack_arg_fail.c | 10 ++ tools/testing/selftests/bpf/progs/stack_arg_fail.c | 114 +++++++++++++++++++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 7 ++ .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 8 ++ 4 files changed, 139 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/stack_arg_fail.c create mode 100644 tools/testing/selftests/bpf/progs/stack_arg_fail.c diff --git a/tools/testing/selftests/bpf/prog_tests/stack_arg_fail.c b/tools/testing/selftests/bpf/prog_tests/stack_arg_fail.c new file mode 100644 index 000000000000..090af1330953 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/stack_arg_fail.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include "stack_arg_fail.skel.h" + +void test_stack_arg_fail(void) +{ + RUN_TESTS(stack_arg_fail); +} diff --git a/tools/testing/selftests/bpf/progs/stack_arg_fail.c b/tools/testing/selftests/bpf/progs/stack_arg_fail.c new file mode 100644 index 000000000000..ad9d4bfe15dc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/stack_arg_fail.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "../test_kmods/bpf_testmod_kfunc.h" +#include "bpf_misc.h" + +#if defined(__BPF_FEATURE_STACK_ARGUMENT) + +SEC("tc") +__failure __msg("Unrecognized *(R11-8) type STRUCT") +int test_stack_arg_big(struct __sk_buff *skb) +{ + struct prog_test_big_arg s = { .a = 1, .b = 2 }; + + return bpf_kfunc_call_stack_arg_big(1, 2, 3, 4, 5, s); +} + +SEC("socket") +__description("r11 in ALU instruction") +__failure __msg("R11 is invalid") +__naked void r11_alu_reject(void) +{ + asm volatile ( + "r11 += 1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__description("r11 store with non-DW size") +__failure __msg("R11 is invalid") +__naked void r11_store_non_dw(void) +{ + asm volatile ( + "*(u32 *)(r11 - 8) = r1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__description("r11 store with unaligned offset") +__failure __msg("R11 is invalid") +__naked void r11_store_unaligned(void) +{ + asm volatile ( + "*(u64 *)(r11 - 4) = r1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__description("r11 store with positive offset") +__failure __msg("R11 is invalid") +__naked void r11_store_positive_off(void) +{ + asm volatile ( + "*(u64 *)(r11 + 8) = r1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__description("r11 load with negative offset") +__failure __msg("R11 is invalid") +__naked void r11_load_negative_off(void) +{ + asm volatile ( + "r0 = *(u64 *)(r11 - 8);" + "exit;" + ::: __clobber_all); +} + +SEC("socket") 
+__description("r11 load with non-DW size") +__failure __msg("R11 is invalid") +__naked void r11_load_non_dw(void) +{ + asm volatile ( + "r0 = *(u32 *)(r11 + 8);" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__description("r11 store with zero offset") +__failure __msg("R11 is invalid") +__naked void r11_store_zero_off(void) +{ + asm volatile ( + "*(u64 *)(r11 + 0) = r1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +#else + +SEC("tc") +__description("stack_arg_fail: not supported, dummy test") +__success +int test_stack_arg_big(struct __sk_buff *skb) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index aef2f68b7e83..0be918fe3021 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -882,6 +882,12 @@ __bpf_kfunc u64 bpf_kfunc_call_stack_arg_timer(u64 a, u64 b, u64 c, u64 d, u64 e return a + b + c + d + e; } +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_big(u64 a, u64 b, u64 c, u64 d, u64 e, + struct prog_test_big_arg s) +{ + return a + b + c + d + e + s.a + s.b; +} + static struct prog_test_ref_kfunc prog_test_struct = { .a = 42, .b = 108, @@ -1353,6 +1359,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_mem) BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_iter) BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_const_str) BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_timer) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_big) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 2c1cb118f886..2edc36b66de9 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ 
b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -50,6 +50,11 @@ struct prog_test_pass2 { } x; }; +struct prog_test_big_arg { + __u64 a; + __u64 b; +}; + struct prog_test_fail1 { void *p; int x; @@ -130,6 +135,9 @@ __u64 bpf_kfunc_call_stack_arg_const_str(__u64 a, __u64 b, __u64 c, __u64 d, __u const char *str__str) __ksym; __u64 bpf_kfunc_call_stack_arg_timer(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, struct bpf_timer *timer) __ksym; +__u64 bpf_kfunc_call_stack_arg_big(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + struct prog_test_big_arg s) __ksym; + void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) __ksym; void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) __ksym; void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym; -- cgit v1.2.3 From 5b31de88920b867edcbcd8d6d77b8be5b822b3dd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:38 -0700 Subject: selftests/bpf: Add BTF fixup for __naked subprog parameter names When __naked subprogs are used in verifier tests, clang drops parameter names from their BTF FUNC_PROTO entries. This prevents the verifier from resolving stack argument slots by name. Add a __btf_func_path(path) annotation that points to a separate BTF file containing properly-named FUNC entries. The test_loader matches FUNC entries by name, detects anonymous parameters, and replaces the FUNC_PROTO with a new one that carries parameter names from the custom file while preserving the original type IDs. The custom BTF file also serves as btf_custom_path for kfunc resolution when no separate btf_custom_path is specified. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045138.2398886-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 1 + tools/testing/selftests/bpf/test_loader.c | 136 ++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index a0d7b15a24b1..9eeb5b0b63d6 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -152,6 +152,7 @@ #define __auxiliary __test_tag("test_auxiliary") #define __auxiliary_unpriv __test_tag("test_auxiliary_unpriv") #define __btf_path(path) __test_tag("test_btf_path=" path) +#define __btf_func_path(path) __test_tag("test_btf_func_path=" path) #define __arch(arch) __test_tag("test_arch=" arch) #define __arch_x86_64 __arch("X86_64") #define __arch_arm64 __arch("ARM64") diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index ee637809a1d4..abdb9e6e3713 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -63,6 +63,7 @@ struct test_spec { struct test_subspec priv; struct test_subspec unpriv; const char *btf_custom_path; + const char *btf_custom_func_path; int log_level; int prog_flags; int mode_mask; @@ -590,6 +591,8 @@ static int parse_test_spec(struct test_loader *tester, jit_on_next_line = true; } else if ((val = str_has_pfx(s, "test_btf_path="))) { spec->btf_custom_path = val; + } else if ((val = str_has_pfx(s, "test_btf_func_path="))) { + spec->btf_custom_func_path = val; } else if ((val = str_has_pfx(s, "test_caps_unpriv="))) { err = parse_caps(val, &spec->unpriv.caps, "test caps"); if (err) @@ -1175,6 +1178,123 @@ static int get_stream(int stream_id, int prog_fd, char *text, size_t text_sz) return ret; } +/* + * Fix up the program's BTF using BTF from a separate file. 
+ * + * For __naked subprogs, clang drops parameter names from BTF. Find FUNC + * entries with anonymous parameters and replace their FUNC_PROTO with the + * properly-named version from the custom file. + */ +static int fixup_btf_from_path(struct bpf_object *obj, const char *path) +{ + struct btf *prog_btf, *custom_btf; + __u32 i, j, cnt, custom_cnt; + int err = 0; + + prog_btf = bpf_object__btf(obj); + if (!prog_btf) + return 0; + + custom_btf = btf__parse(path, NULL); + if (!ASSERT_OK_PTR(custom_btf, "parse_custom_btf")) + return -EINVAL; + + cnt = btf__type_cnt(prog_btf); + custom_cnt = btf__type_cnt(custom_btf); + + /* Fix up FUNC entries with anonymous params. + * Save all data from prog_btf BEFORE calling btf__add_*, + * since those calls may reallocate the BTF data buffer + * and invalidate any pointers obtained from btf__type_by_id. + */ + for (i = 1; i < cnt; i++) { + const struct btf_type *t = btf__type_by_id(prog_btf, i); + const struct btf_type *fp, *custom_t, *custom_fp; + const struct btf_param *params, *custom_params; + __u32 ret_type_id, vlen; + __u32 *prog_param_types = NULL; + const char *name; + int new_proto_id; + + if (!btf_is_func(t)) + continue; + + fp = btf__type_by_id(prog_btf, t->type); + if (!fp || !btf_is_func_proto(fp) || btf_vlen(fp) == 0) + continue; + + /* Check if any param is anonymous */ + params = btf_params(fp); + if (params[0].name_off != 0) + continue; + + /* Find matching FUNC by name in custom BTF */ + name = btf__name_by_offset(prog_btf, t->name_off); + if (!name) + continue; + + for (j = 1; j < custom_cnt; j++) { + const char *cname; + + custom_t = btf__type_by_id(custom_btf, j); + if (!btf_is_func(custom_t)) + continue; + cname = btf__name_by_offset(custom_btf, custom_t->name_off); + if (cname && strcmp(name, cname) == 0) + break; + } + if (j >= custom_cnt) + continue; + + custom_fp = btf__type_by_id(custom_btf, custom_t->type); + if (!custom_fp || !btf_is_func_proto(custom_fp)) + continue; + + vlen = btf_vlen(fp); + if 
(vlen != btf_vlen(custom_fp)) + continue; + + /* Save data before btf__add_* calls invalidate pointers */ + ret_type_id = fp->type; + prog_param_types = malloc(vlen * sizeof(*prog_param_types)); + if (!prog_param_types) { + err = -ENOMEM; + break; + } + for (j = 0; j < vlen; j++) + prog_param_types[j] = params[j].type; + + /* Add a new FUNC_PROTO: param names from custom, types from prog */ + new_proto_id = btf__add_func_proto(prog_btf, ret_type_id); + if (new_proto_id < 0) { + err = new_proto_id; + free(prog_param_types); + break; + } + + custom_params = btf_params(custom_fp); + for (j = 0; j < vlen; j++) { + const char *pname; + + pname = btf__name_by_offset(custom_btf, custom_params[j].name_off); + err = btf__add_func_param(prog_btf, pname ?: "", prog_param_types[j]); + if (err) + break; + } + free(prog_param_types); + if (err) + break; + + /* Update the FUNC to point to the new FUNC_PROTO (re-fetch + * since btf__add_* may have reallocated the data buffer). + */ + ((struct btf_type *)btf__type_by_id(prog_btf, i))->type = new_proto_id; + } + + btf__free(custom_btf); + return err; +} + /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -1231,13 +1351,27 @@ void run_subtest(struct test_loader *tester, } } - /* Implicitly reset to NULL if next test case doesn't specify */ + /* Implicitly reset to NULL if next test case doesn't specify. + * btf_custom_func_path also serves as btf_custom_path for kfunc resolution. 
+ */ open_opts->btf_custom_path = spec->btf_custom_path; + if (!open_opts->btf_custom_path) + open_opts->btf_custom_path = spec->btf_custom_func_path; tobj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, open_opts); if (!ASSERT_OK_PTR(tobj, "obj_open_mem")) /* shouldn't happen */ goto subtest_cleanup; + /* Fix up __naked subprog BTF using a separate file with named params */ + if (spec->btf_custom_func_path) { + err = fixup_btf_from_path(tobj, spec->btf_custom_func_path); + if (err) { + PRINT_FAIL("failed to fixup BTF from %s: %d\n", + spec->btf_custom_func_path, err); + goto tobj_cleanup; + } + } + i = 0; bpf_object__for_each_program(tprog_iter, tobj) { spec_iter = &specs[i++]; -- cgit v1.2.3 From 00c3ac4292a6bc3039008cdb45bd423087acb98e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:43 -0700 Subject: selftests/bpf: Add verifier tests for stack argument validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add inline-asm based verifier tests that exercise stack argument validation logic directly. 
Positive tests: - subprog call with 6 args - Two sequential calls to different subprogs (6-arg and 7-arg) - An r11 store shared by both branches Negative tests — verifier rejection: - Read from uninitialized incoming stack arg slot - Gap in outgoing slots: only r11-16 written, r11-8 missing - Write at r11-80, exceeding max 7 stack args - Missing store on one branch with a shared store - Sequential calls: the first call stores its stack arguments, the second call tries to reuse them without storing again and is rejected - r11 load ordering issue Negative tests — pointer/ref tracking: - Pruning type mismatch: one branch stores PTR_TO_STACK, the other stores a scalar, callee dereferences — must not prune - Release invalidation: bpf_sk_release invalidates a socket pointer stored in a stack arg slot - Packet pointer invalidation: bpf_skb_pull_data invalidates a packet pointer stored in a stack arg slot - Null propagation: PTR_TO_MAP_VALUE_OR_NULL stored in stack arg slot, null branch attempts dereference via callee Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045143.2399278-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 4 + .../bpf/progs/btf__verifier_stack_arg_order.c | 40 ++ .../selftests/bpf/progs/verifier_stack_arg.c | 444 +++++++++++++++++++++ .../selftests/bpf/progs/verifier_stack_arg_order.c | 126 ++++++ 4 files changed, 614 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_stack_arg.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index a96b25ebff23..ee3d929fac8a 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -91,6 +91,8 @@ #include "verifier_sockmap_mutate.skel.h" 
#include "verifier_spill_fill.skel.h" #include "verifier_spin_lock.skel.h" +#include "verifier_stack_arg.skel.h" +#include "verifier_stack_arg_order.skel.h" #include "verifier_stack_ptr.skel.h" #include "verifier_store_release.skel.h" #include "verifier_subprog_precision.skel.h" @@ -238,6 +240,8 @@ void test_verifier_sock_addr(void) { RUN(verifier_sock_addr); } void test_verifier_sockmap_mutate(void) { RUN(verifier_sockmap_mutate); } void test_verifier_spill_fill(void) { RUN(verifier_spill_fill); } void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } +void test_verifier_stack_arg(void) { RUN(verifier_stack_arg); } +void test_verifier_stack_arg_order(void) { RUN(verifier_stack_arg_order); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } void test_verifier_store_release(void) { RUN(verifier_store_release); } void test_verifier_subprog_precision(void) { RUN(verifier_subprog_precision); } diff --git a/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c b/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c new file mode 100644 index 000000000000..83692570d5bc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include +#include + +#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) + +int subprog_bad_order_6args(int a, int b, int c, int d, int e, int f) +{ + return a + b + c + d + e + f; +} + +int subprog_call_before_load_6args(int a, int b, int c, int d, int e, int f) +{ + return a + b + c + d + e + f; +} + +int subprog_pruning_call_before_load_6args(int a, int b, int c, int d, int e, int f) +{ + return a + b + c + d + e + f; +} + +#else + +int subprog_bad_order_6args(void) +{ + return 0; +} + +int subprog_call_before_load_6args(void) +{ + return 0; +} + +int subprog_pruning_call_before_load_6args(void) +{ + return 0; +} + +#endif diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c new file mode 100644 index 000000000000..6587bf912bc0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c @@ -0,0 +1,444 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, long long); + __type(value, long long); +} map_hash_8b SEC(".maps"); + +#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) + +__noinline __used +static int subprog_6args(int a, int b, int c, int d, int e, int f) +{ + return a + b + c + d + e + f; +} + +__noinline __used +static int subprog_7args(int a, int b, int c, int d, int e, int f, int g) +{ + return a + b + c + d + e + f + g; +} + +__noinline __used +static long subprog_deref_arg6(long a, long b, long c, long d, long e, long *f) +{ + return *f; +} + +SEC("tc") +__description("stack_arg: subprog with 6 args") +__success __retval(21) +__naked void stack_arg_6args(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 6;" + "call subprog_6args;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: two subprogs with >5 args") +__success __retval(90) +__naked void stack_arg_two_subprogs(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 10;" + "call subprog_6args;" + "r6 = r0;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 16) = 30;" + "*(u64 *)(r11 - 8) = 20;" + "call subprog_7args;" + "r0 += r6;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: read from uninitialized stack arg slot") +__failure +__msg("invalid read from stack arg off 8 depth 0") +__naked void stack_arg_read_uninitialized(void) +{ + asm volatile ( + "r0 = *(u64 *)(r11 + 8);" + "r0 = 0;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: gap at offset -8, only wrote -16") +__failure +__msg("callee expects 7 args, stack arg1 is not initialized") +__naked void stack_arg_gap_at_minus8(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + 
"*(u64 *)(r11 - 16) = 30;" + "call subprog_7args;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: pruning with different stack arg types") +__failure +__flag(BPF_F_TEST_STATE_FREQ) +__msg("R{{[0-9]}} invalid mem access 'scalar'") +__naked void stack_arg_pruning_type_mismatch(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r6 = r0;" + /* local = 0 on program stack */ + "r7 = 0;" + "*(u64 *)(r10 - 8) = r7;" + /* Branch based on random value */ + "if r6 s> 3 goto l0_%=;" + /* Path 1: store stack pointer to outgoing arg6 */ + "r1 = r10;" + "r1 += -8;" + "*(u64 *)(r11 - 8) = r1;" + "goto l1_%=;" + "l0_%=:" + /* Path 2: store scalar to outgoing arg6 */ + "*(u64 *)(r11 - 8) = 42;" + "l1_%=:" + /* Call subprog that dereferences arg6 */ + "r1 = r6;" + "r2 = 0;" + "r3 = 0;" + "r4 = 0;" + "r5 = 0;" + "call subprog_deref_arg6;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: release_reference invalidates stack arg slot") +__failure +__msg("R{{[0-9]}} !read_ok") +__naked void stack_arg_release_ref(void) +{ + asm volatile ( + "r6 = r1;" + /* struct bpf_sock_tuple tuple = {} */ + "r2 = 0;" + "*(u32 *)(r10 - 8) = r2;" + "*(u64 *)(r10 - 16) = r2;" + "*(u64 *)(r10 - 24) = r2;" + "*(u64 *)(r10 - 32) = r2;" + "*(u64 *)(r10 - 40) = r2;" + "*(u64 *)(r10 - 48) = r2;" + /* sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof(tuple), 0, 0) */ + "r1 = r6;" + "r2 = r10;" + "r2 += -48;" + "r3 = %[sizeof_bpf_sock_tuple];" + "r4 = 0;" + "r5 = 0;" + "call %[bpf_sk_lookup_tcp];" + /* r0 = sk (PTR_TO_SOCK_OR_NULL) */ + "if r0 == 0 goto l0_%=;" + /* Store sock ref to outgoing arg6 slot */ + "*(u64 *)(r11 - 8) = r0;" + /* Release the reference — invalidates the stack arg slot */ + "r1 = r0;" + "call %[bpf_sk_release];" + /* Call subprog that dereferences arg6 — should fail */ + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_deref_arg6;" + "l0_%=:" + "r0 = 0;" + "exit;" + : + : 
__imm(bpf_sk_lookup_tcp), + __imm(bpf_sk_release), + __imm_const(sizeof_bpf_sock_tuple, sizeof(struct bpf_sock_tuple)) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: pkt pointer in stack arg slot invalidated after pull_data") +__failure +__msg("R{{[0-9]}} !read_ok") +__naked void stack_arg_stale_pkt_ptr(void) +{ + asm volatile ( + "r6 = r1;" + "r7 = *(u32 *)(r6 + %[__sk_buff_data]);" + "r8 = *(u32 *)(r6 + %[__sk_buff_data_end]);" + /* check pkt has at least 1 byte */ + "r0 = r7;" + "r0 += 8;" + "if r0 > r8 goto l0_%=;" + /* Store valid pkt pointer to outgoing arg6 slot */ + "*(u64 *)(r11 - 8) = r7;" + /* bpf_skb_pull_data invalidates all pkt pointers */ + "r1 = r6;" + "r2 = 0;" + "call %[bpf_skb_pull_data];" + /* Call subprog that dereferences arg6 — should fail */ + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_deref_arg6;" + "l0_%=:" + "r0 = 0;" + "exit;" + : + : __imm(bpf_skb_pull_data), + __imm_const(__sk_buff_data, offsetof(struct __sk_buff, data)), + __imm_const(__sk_buff_data_end, offsetof(struct __sk_buff, data_end)) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: null propagation rejects deref on null branch") +__failure +__msg("R{{[0-9]}} invalid mem access 'scalar'") +__naked void stack_arg_null_propagation_fail(void) +{ + asm volatile ( + "r1 = 0;" + "*(u64 *)(r10 - 8) = r1;" + /* r0 = bpf_map_lookup_elem(&map_hash_8b, &key) */ + "r2 = r10;" + "r2 += -8;" + "r1 = %[map_hash_8b] ll;" + "call %[bpf_map_lookup_elem];" + /* Store PTR_TO_MAP_VALUE_OR_NULL to outgoing arg6 slot */ + "*(u64 *)(r11 - 8) = r0;" + /* null check on r0 */ + "if r0 != 0 goto l0_%=;" + /* + * On null branch, outgoing slot is SCALAR(0). + * Call subprog that dereferences arg6 — should fail. 
+ */ + "r1 = 0;" + "r2 = 0;" + "r3 = 0;" + "r4 = 0;" + "r5 = 0;" + "call subprog_deref_arg6;" + "l0_%=:" + "r0 = 0;" + "exit;" + : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: missing store on one branch") +__failure +__msg("callee expects 7 args, stack arg1 is not initialized") +__naked void stack_arg_missing_store_one_branch(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + /* Write arg7 (r11-16) before branch */ + "*(u64 *)(r11 - 16) = 20;" + "if r0 > 0 goto l0_%=;" + /* Path 1: write arg6 and call */ + "*(u64 *)(r11 - 8) = 10;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_7args;" + "goto l1_%=;" + "l0_%=:" + /* Path 2: missing arg6 store, call should fail */ + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_7args;" + "l1_%=:" + "r0 = 0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: share a store for both branches") +__success __retval(0) +__naked void stack_arg_shared_store(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + /* Write arg7 (r11-16) before branch */ + "*(u64 *)(r11 - 16) = 20;" + "if r0 > 0 goto l0_%=;" + /* Path 1: write arg6 and call */ + "*(u64 *)(r11 - 8) = 10;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_7args;" + "goto l1_%=;" + "l0_%=:" + /* Path 2: also write arg6 and call */ + "*(u64 *)(r11 - 8) = 30;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_7args;" + "l1_%=:" + "r0 = 0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: write beyond max outgoing depth") +__failure +__msg("stack arg write offset -80 exceeds max 7 stack args") +__naked void stack_arg_write_beyond_max(void) +{ + asm 
volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + /* Write to offset -80, way beyond any callee's needs */ + "*(u64 *)(r11 - 80) = 99;" + "*(u64 *)(r11 - 16) = 20;" + "*(u64 *)(r11 - 8) = 10;" + "call subprog_7args;" + "r0 = 0;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: write unused stack arg slot") +__failure +__msg("func#0 writes 5 stack arg slots, but calls only require 2") +__naked void stack_arg_write_unused_slot(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + /* Write to offset -40, unused for the callee */ + "*(u64 *)(r11 - 40) = 99;" + "*(u64 *)(r11 - 16) = 20;" + "*(u64 *)(r11 - 8) = 10;" + "call subprog_7args;" + "r0 = 0;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: sequential calls reuse slots") +__failure +__msg("callee expects 7 args, stack arg1 is not initialized") +__naked void stack_arg_sequential_calls(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 6;" + "*(u64 *)(r11 - 16) = 7;" + "call subprog_7args;" + "r6 = r0;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_7args;" + "r0 += r6;" + "exit;" + ::: __clobber_all + ); +} + +#else + +SEC("socket") +__description("stack_arg is not supported by compiler or jit, use a dummy test") +__success +int dummy_test(void) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c new file mode 100644 index 000000000000..938f4a2f5482 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include +#include +#include "bpf_misc.h" + +#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) + +__noinline __used __naked +static int subprog_bad_order_6args(int a, int b, int c, int d, int e, int f) +{ + asm volatile ( + "*(u64 *)(r11 - 8) = r1;" + "r0 = *(u64 *)(r11 + 8);" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: r11 load after r11 store") +__failure +__msg("r11 load must be before any r11 store or call insn") +__btf_func_path("btf__verifier_stack_arg_order.bpf.o") +__naked void stack_arg_load_after_store(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 6;" + "call subprog_bad_order_6args;" + "exit;" + ::: __clobber_all + ); +} + +__noinline __used __naked +static int subprog_call_before_load_6args(int a, int b, int c, int d, int e, + int f) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r0 = *(u64 *)(r11 + 8);" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: r11 load after a call") +__failure +__msg("r11 load must be before any r11 store or call insn") +__btf_func_path("btf__verifier_stack_arg_order.bpf.o") +__naked void stack_arg_load_after_call(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 6;" + "call subprog_call_before_load_6args;" + "exit;" + ::: __clobber_all + ); +} + +__noinline __used __naked +static int subprog_pruning_call_before_load_6args(int a, int b, int c, int d, + int e, int f) +{ + asm volatile ( + "if r1 s> 0 goto l0_%=;" + "goto l1_%=;" + "l0_%=:" + "call %[bpf_get_prandom_u32];" + "l1_%=:" + "r0 = *(u64 *)(r11 + 8);" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: pruning keeps r11 load ordering") +__failure +__flag(BPF_F_TEST_STATE_FREQ) +__msg("r11 load must be before any r11 store or call insn") 
+__btf_func_path("btf__verifier_stack_arg_order.bpf.o") +__naked void stack_arg_pruning_load_after_call(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r1 = r0;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 6;" + "call subprog_pruning_call_before_load_6args;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +#else + +SEC("socket") +__description("stack_arg order is not supported by compiler or jit, use a dummy test") +__success +int dummy_test(void) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6e277efbb19dd1a536cbffd9ea5c049a427dc7cb Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:48 -0700 Subject: selftests/bpf: Add precision backtracking test for stack arguments Add a test that verifies precision backtracking works correctly across BPF-to-BPF calls when stack arguments are involved. The test passes a size value as incoming stack arg (arg6) to a subprog, which forwards it as the mem__sz parameter (outgoing arg7) to bpf_kfunc_call_stack_arg_mem. The expected __msg annotations verify that precision propagates from the kfunc's mem__sz argument back through the subprog frame to the caller's outgoing stack arg store. A companion BTF file (btf__stack_arg_precision.c) provides named parameter BTF for the __naked subprog via __btf_func_path. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045148.2400087-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/stack_arg_precision.c | 10 ++ .../selftests/bpf/progs/btf__stack_arg_precision.c | 23 ++++ .../selftests/bpf/progs/stack_arg_precision.c | 134 +++++++++++++++++++++ 3 files changed, 167 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/stack_arg_precision.c create mode 100644 tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c create mode 100644 tools/testing/selftests/bpf/progs/stack_arg_precision.c diff --git a/tools/testing/selftests/bpf/prog_tests/stack_arg_precision.c b/tools/testing/selftests/bpf/prog_tests/stack_arg_precision.c new file mode 100644 index 000000000000..1ab041d66de3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/stack_arg_precision.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include "stack_arg_precision.skel.h" + +void test_stack_arg_precision(void) +{ + RUN_TESTS(stack_arg_precision); +} diff --git a/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c b/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c new file mode 100644 index 000000000000..296fddfe6804 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include +#include +#include "../test_kmods/bpf_testmod_kfunc.h" + +#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) + +long subprog_call_mem_kfunc(long a, long b, long c, long d, long e, long size) +{ + char buf[8] = {}; + + return bpf_kfunc_call_stack_arg_mem(a, b, c, d, e, buf, size); +} + +#else + +long subprog_call_mem_kfunc(void) +{ + return 0; +} + +#endif diff --git a/tools/testing/selftests/bpf/progs/stack_arg_precision.c b/tools/testing/selftests/bpf/progs/stack_arg_precision.c new file mode 100644 index 000000000000..2a0a344c83ca --- /dev/null +++ b/tools/testing/selftests/bpf/progs/stack_arg_precision.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "../test_kmods/bpf_testmod_kfunc.h" +#include "bpf_misc.h" + +#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) + +/* Force kfunc extern BTF generation for inline asm call below. + * Uses its own SEC so it's not included as a .text subprog. + * The '?' prefix sets autoload=false so libbpf won't load it. + */ +SEC("?tc") +int __btf_kfunc_gen(struct __sk_buff *ctx) +{ + char buf[8] = {}; + + return bpf_kfunc_call_stack_arg_mem(0, 0, 0, 0, 0, buf, sizeof(buf)); +} + +/* + * Test precision backtracking across bpf-to-bpf call for kfunc stack arg. + * subprog_call_mem_kfunc receives a size as incoming stack arg (arg6) + * and forwards it as mem__sz (arg7) to bpf_kfunc_call_stack_arg_mem. 
+ */ +__naked __noinline __used +static long subprog_call_mem_kfunc(long a, long b, long c, long d, long e, long size) +{ + asm volatile ( + "r1 = *(u64 *)(r11 + 8);" /* r1 = incoming arg6 (size) */ + "r2 = 0x0807060504030201 ll;" /* r2 = buf contents */ + "*(u64 *)(r10 - 8) = r2;" /* store buf to stack */ + "r2 = r10;" + "r2 += -8;" /* r2 = &buf */ + "*(u64 *)(r11 - 8) = r2;" /* outgoing arg6 = buf */ + "*(u64 *)(r11 - 16) = r1;" /* outgoing arg7 = size */ + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call %[bpf_kfunc_call_stack_arg_mem];" + "exit;" + : + : __imm(bpf_kfunc_call_stack_arg_mem) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: precision backtracking across bpf2bpf call for kfunc") +__success +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__btf_func_path("btf__stack_arg_precision.bpf.o") +__msg("mark_precise: frame1: last_idx 26 first_idx 13 subseq_idx -1") +__msg("mark_precise: frame1: regs= stack= before 25: (b7) r5 = 5") +__msg("mark_precise: frame1: regs= stack= before 24: (b7) r4 = 4") +__msg("mark_precise: frame1: regs= stack= before 23: (b7) r3 = 3") +__msg("mark_precise: frame1: regs= stack= before 22: (b7) r2 = 2") +__msg("mark_precise: frame1: regs= stack= before 21: (b7) r1 = 1") +__msg("mark_precise: frame1: regs= stack= before 20: (7b) *(u64 *)(r11 -16) = r1") +__msg("mark_precise: frame1: regs=r1 stack= before 19: (7b) *(u64 *)(r11 -8) = r2") +__msg("mark_precise: frame1: regs=r1 stack= before 18: (07) r2 += -8") +__msg("mark_precise: frame1: regs=r1 stack= before 17: (bf) r2 = r10") +__msg("mark_precise: frame1: regs=r1 stack= before 16: (7b) *(u64 *)(r10 -8) = r2") +__msg("mark_precise: frame1: regs=r1 stack= before 14: (18) r2 = 0x807060504030201") +__msg("mark_precise: frame1: regs=r1 stack= before 13: (79) r1 = *(u64 *)(r11 +8)") +__msg("mark_precise: frame1: parent state regs= stack=: frame1: R10=fp0") +__msg("mark_precise: frame0: parent state regs= stack=: R10=fp0") +__msg("mark_precise: frame1: 
last_idx 11 first_idx 11 subseq_idx 13") +__msg("mark_precise: frame1: regs= stack= before 11: (85) call pc+1") +__msg("mark_precise: frame0: parent state regs= stack=: R1=1 R2=2 R3=3 R4=4 R5=5 R10=fp0") +__msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx 11") +__msg("mark_precise: frame0: regs= stack= before 9: (05) goto pc+1") +__msg("mark_precise: frame0: regs= stack= before 8: (7a) *(u64 *)(r11 -8) = 4") +__msg("mark_precise: frame1: last_idx 26 first_idx 13 subseq_idx -1 ") +__msg("mark_precise: frame1: regs= stack= before 25: (b7) r5 = 5") +__msg("mark_precise: frame1: regs= stack= before 24: (b7) r4 = 4") +__msg("mark_precise: frame1: regs= stack= before 23: (b7) r3 = 3") +__msg("mark_precise: frame1: regs= stack= before 22: (b7) r2 = 2") +__msg("mark_precise: frame1: regs= stack= before 21: (b7) r1 = 1") +__msg("mark_precise: frame1: regs= stack= before 20: (7b) *(u64 *)(r11 -16) = r1") +__msg("mark_precise: frame1: regs=r1 stack= before 19: (7b) *(u64 *)(r11 -8) = r2") +__msg("mark_precise: frame1: regs=r1 stack= before 18: (07) r2 += -8") +__msg("mark_precise: frame1: regs=r1 stack= before 17: (bf) r2 = r10") +__msg("mark_precise: frame1: regs=r1 stack= before 16: (7b) *(u64 *)(r10 -8) = r2") +__msg("mark_precise: frame1: regs=r1 stack= before 14: (18) r2 = 0x807060504030201") +__msg("mark_precise: frame1: regs=r1 stack= before 13: (79) r1 = *(u64 *)(r11 +8)") +__msg("mark_precise: frame1: parent state regs= stack=: frame1: R10=fp0") +__msg("mark_precise: frame0: parent state regs= stack=: R10=fp0") +__msg("mark_precise: frame1: last_idx 11 first_idx 11 subseq_idx 13 ") +__msg("mark_precise: frame1: regs= stack= before 11: (85) call pc+1") +__msg("mark_precise: frame0: parent state regs= stack=: R1=1 R2=2 R3=3 R4=4 R5=5 R10=fp0") +__msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx 11 ") +__msg("mark_precise: frame0: regs= stack= before 10: (7a) *(u64 *)(r11 -8) = 6") +__naked void stack_arg_precision_bpf2bpf(void) +{ + asm 
volatile ( + "call %[bpf_get_prandom_u32];" + "r6 = r0;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "if r6 < 2 goto l0_%=;" + "*(u64 *)(r11 - 8) = 4;" + "goto l1_%=;" + "l0_%=:" + "*(u64 *)(r11 - 8) = 6;" + "l1_%=:" + "call subprog_call_mem_kfunc;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +#else + +SEC("socket") +__description("stack_arg_precision: not supported, dummy test") +__success +int dummy_test(void) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 68e5627579d788d9e992cc06a69760f20b6841d6 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 12 May 2026 21:51:53 -0700 Subject: bpf, arm64: Map BPF_REG_0 to x8 instead of x7 Move the BPF return value register from x7 to x8, freeing x7 for use as an argument register. AAPCS64 designates x8 as the indirect result location register; it is caller-saved and not used for argument passing, making it a suitable home for BPF_REG_0. This is a prerequisite for stack argument support, which needs x5-x7 to pass arguments 6-8 to native kfuncs following the AAPCS64 calling convention. 
Signed-off-by: Puranjay Mohan Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045153.2402197-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 4 ++-- arch/arm64/net/bpf_timed_may_goto.S | 8 ++++---- tools/testing/selftests/bpf/progs/verifier_jit_inline.c | 2 +- tools/testing/selftests/bpf/progs/verifier_ldsx.c | 6 +++--- tools/testing/selftests/bpf/progs/verifier_private_stack.c | 10 +++++----- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 0816c40fc7af..085e650662e3 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -47,7 +47,7 @@ /* Map BPF registers to A64 registers */ static const int bpf2a64[] = { /* return value from in-kernel function, and exit value from eBPF */ - [BPF_REG_0] = A64_R(7), + [BPF_REG_0] = A64_R(8), /* arguments from eBPF program to in-kernel function */ [BPF_REG_1] = A64_R(0), [BPF_REG_2] = A64_R(1), @@ -1048,7 +1048,7 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic) /* Restore FP/LR registers */ emit(A64_POP(A64_FP, A64_LR, A64_SP), ctx); - /* Move the return value from bpf:r0 (aka x7) to x0 */ + /* Move the return value from bpf:r0 (aka x8) to x0 */ emit(A64_MOV(1, A64_R(0), r0), ctx); /* Authenticate lr */ diff --git a/arch/arm64/net/bpf_timed_may_goto.S b/arch/arm64/net/bpf_timed_may_goto.S index 894cfcd7b241..a9a802711a7f 100644 --- a/arch/arm64/net/bpf_timed_may_goto.S +++ b/arch/arm64/net/bpf_timed_may_goto.S @@ -8,8 +8,8 @@ SYM_FUNC_START(arch_bpf_timed_may_goto) stp x29, x30, [sp, #-64]! mov x29, sp - /* Save BPF registers R0 - R5 (x7, x0-x4)*/ - stp x7, x0, [sp, #16] + /* Save BPF registers R0 - R5 (x8, x0-x4)*/ + stp x8, x0, [sp, #16] stp x1, x2, [sp, #32] stp x3, x4, [sp, #48] @@ -28,8 +28,8 @@ SYM_FUNC_START(arch_bpf_timed_may_goto) /* BPF_REG_AX(x9) will be stored into count, so move return value to it. 
*/ mov x9, x0 - /* Restore BPF registers R0 - R5 (x7, x0-x4) */ - ldp x7, x0, [sp, #16] + /* Restore BPF registers R0 - R5 (x8, x0-x4) */ + ldp x8, x0, [sp, #16] ldp x1, x2, [sp, #32] ldp x3, x4, [sp, #48] diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c index 4ea254063646..885ff69a3a62 100644 --- a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c +++ b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c @@ -9,7 +9,7 @@ __success __retval(0) __arch_x86_64 __jited(" addq %gs:{{.*}}, %rax") __arch_arm64 -__jited(" mrs x7, SP_EL0") +__jited(" mrs x8, SP_EL0") int inline_bpf_get_current_task(void) { bpf_get_current_task(); diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index 1026524a1983..41340877dc9d 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -274,11 +274,11 @@ __jited("movslq 0x10(%rdi,%r12), %r15") __jited("movswq 0x18(%rdi,%r12), %r15") __jited("movsbq 0x20(%rdi,%r12), %r15") __arch_arm64 -__jited("add x11, x7, x28") +__jited("add x11, x8, x28") __jited("ldrsw x21, [x11, #0x10]") -__jited("add x11, x7, x28") +__jited("add x11, x8, x28") __jited("ldrsh x21, [x11, #0x18]") -__jited("add x11, x7, x28") +__jited("add x11, x8, x28") __jited("ldrsb x21, [x11, #0x20]") __jited("add x11, x0, x28") __jited("ldrsw x22, [x11, #0x10]") diff --git a/tools/testing/selftests/bpf/progs/verifier_private_stack.c b/tools/testing/selftests/bpf/progs/verifier_private_stack.c index 646e8ef82051..c5078face38d 100644 --- a/tools/testing/selftests/bpf/progs/verifier_private_stack.c +++ b/tools/testing/selftests/bpf/progs/verifier_private_stack.c @@ -170,12 +170,12 @@ __jited(" mrs x10, TPIDR_EL{{[0-1]}}") __jited(" add x27, x27, x10") __jited(" add x25, x27, {{.*}}") __jited(" bl 0x{{.*}}") -__jited(" mov x7, x0") +__jited(" mov x8, x0") __jited(" mov x0, 
#0x2a") __jited(" str x0, [x27]") __jited(" bl 0x{{.*}}") -__jited(" mov x7, x0") -__jited(" mov x7, #0x0") +__jited(" mov x8, x0") +__jited(" mov x8, #0x0") __jited(" ldp x25, x27, [sp], {{.*}}") __naked void private_stack_callback(void) { @@ -220,7 +220,7 @@ __jited(" mov x0, #0x2a") __jited(" str x0, [x27]") __jited(" mov x0, #0x0") __jited(" bl 0x{{.*}}") -__jited(" mov x7, x0") +__jited(" mov x8, x0") __jited(" ldp x27, x28, [sp], #0x10") int private_stack_exception_main_prog(void) { @@ -258,7 +258,7 @@ __jited(" add x25, x27, {{.*}}") __jited(" mov x0, #0x2a") __jited(" str x0, [x27]") __jited(" bl 0x{{.*}}") -__jited(" mov x7, x0") +__jited(" mov x8, x0") __jited(" ldp x27, x28, [sp], #0x10") int private_stack_exception_sub_prog(void) { -- cgit v1.2.3 From 235b2fe772f559416a5dfda33cf141ee07ce78d6 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 12 May 2026 21:51:58 -0700 Subject: bpf, arm64: Add JIT support for stack arguments Implement stack argument passing for BPF-to-BPF and kfunc calls with more than 5 parameters on arm64, following the AAPCS64 calling convention. BPF R1-R5 already map to x0-x4. With BPF_REG_0 moved to x8 by the previous commit, x5-x7 are free for arguments 6-8. Arguments 9-12 spill onto the stack at [SP+0], [SP+8], ... and the callee reads them from [FP+16], [FP+24], ... (above the saved FP/LR pair). BPF convention uses fixed offsets from BPF_REG_PARAMS (r11): off=-8 is always arg 6, off=-16 arg 7, etc. The verifier invalidates all outgoing stack arg slots after each call, so the compiler must re-store before every call. This means x5-x7 don't need to be saved on stack. 
Signed-off-by: Puranjay Mohan Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045158.2402494-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 88 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 085e650662e3..e3bbeaa94590 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -86,6 +86,7 @@ struct jit_ctx { __le32 *image; __le32 *ro_image; u32 stack_size; + u16 stack_arg_size; u64 user_vm_start; u64 arena_vm_start; bool fp_used; @@ -533,13 +534,19 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) * | | * +-----+ <= (BPF_FP - prog->aux->stack_depth) * |RSVD | padding - * current A64_SP => +-----+ <= (BPF_FP - ctx->stack_size) + * +-----+ <= (BPF_FP - ctx->stack_size) + * | | + * | ... | outgoing stack args (9+, if any) + * | | + * current A64_SP => +-----+ * | | * | ... | Function call stack * | | * +-----+ * low * + * Stack args 6-8 are passed in x5-x7, args 9+ at [SP]. + * Incoming args 9+ are at [FP + 16], [FP + 24], ... */ emit_kcfi(is_main_prog ? cfi_bpf_hash : cfi_bpf_subprog_hash, ctx); @@ -613,6 +620,9 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) if (ctx->stack_size && !ctx->priv_sp_used) emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); + if (ctx->stack_arg_size) + emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_arg_size), ctx); + if (ctx->arena_vm_start) emit_a64_mov_i64(arena_vm_base, ctx->arena_vm_start, ctx); @@ -673,6 +683,9 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) /* Update tail_call_cnt if the slot is populated. 
*/ emit(A64_STR64I(tcc, ptr, 0), ctx); + if (ctx->stack_arg_size) + emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_arg_size), ctx); + /* restore SP */ if (ctx->stack_size && !ctx->priv_sp_used) emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); @@ -1034,6 +1047,9 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic) const u8 r0 = bpf2a64[BPF_REG_0]; const u8 ptr = bpf2a64[TCCNT_PTR]; + if (ctx->stack_arg_size) + emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_arg_size), ctx); + /* We're done with BPF stack */ if (ctx->stack_size && !ctx->priv_sp_used) emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); @@ -1191,6 +1207,41 @@ static int add_exception_handler(const struct bpf_insn *insn, return 0; } +static const u8 stack_arg_reg[] = { A64_R(5), A64_R(6), A64_R(7) }; + +#define NR_STACK_ARG_REGS ARRAY_SIZE(stack_arg_reg) + +static void emit_stack_arg_load(u8 dst, s16 bpf_off, struct jit_ctx *ctx) +{ + int idx = bpf_off / sizeof(u64) - 1; + + if (idx < NR_STACK_ARG_REGS) + emit(A64_MOV(1, dst, stack_arg_reg[idx]), ctx); + else + emit(A64_LDR64I(dst, A64_FP, (idx - NR_STACK_ARG_REGS) * sizeof(u64) + 16), ctx); +} + +static void emit_stack_arg_store(u8 src_a64, s16 bpf_off, struct jit_ctx *ctx) +{ + int idx = -bpf_off / sizeof(u64) - 1; + + if (idx < NR_STACK_ARG_REGS) + emit(A64_MOV(1, stack_arg_reg[idx], src_a64), ctx); + else + emit(A64_STR64I(src_a64, A64_SP, (idx - NR_STACK_ARG_REGS) * sizeof(u64)), ctx); +} + +static void emit_stack_arg_store_imm(s32 imm, s16 bpf_off, const u8 tmp, struct jit_ctx *ctx) +{ + int idx = -bpf_off / sizeof(u64) - 1; + + emit_a64_mov_i(1, tmp, imm, ctx); + if (idx < NR_STACK_ARG_REGS) + emit(A64_MOV(1, stack_arg_reg[idx], tmp), ctx); + else + emit(A64_STR64I(tmp, A64_SP, (idx - NR_STACK_ARG_REGS) * sizeof(u64)), ctx); +} + /* JITs an eBPF instruction. * Returns: * 0 - successfully JITed an 8-byte eBPF instruction. 
@@ -1646,6 +1697,11 @@ emit_cond_jmp: case BPF_LDX | BPF_MEM | BPF_H: case BPF_LDX | BPF_MEM | BPF_B: case BPF_LDX | BPF_MEM | BPF_DW: + if (insn->src_reg == BPF_REG_PARAMS) { + emit_stack_arg_load(dst, off, ctx); + break; + } + fallthrough; case BPF_LDX | BPF_PROBE_MEM | BPF_DW: case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_PROBE_MEM | BPF_H: @@ -1672,6 +1728,8 @@ emit_cond_jmp: if (src == fp) { src_adj = ctx->priv_sp_used ? priv_sp : A64_SP; off_adj = off + ctx->stack_size; + if (!ctx->priv_sp_used) + off_adj += ctx->stack_arg_size; } else { src_adj = src; off_adj = off; @@ -1752,6 +1810,11 @@ emit_cond_jmp: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_DW: + if (insn->dst_reg == BPF_REG_PARAMS) { + emit_stack_arg_store_imm(imm, off, tmp, ctx); + break; + } + fallthrough; case BPF_ST | BPF_PROBE_MEM32 | BPF_B: case BPF_ST | BPF_PROBE_MEM32 | BPF_H: case BPF_ST | BPF_PROBE_MEM32 | BPF_W: @@ -1763,6 +1826,8 @@ emit_cond_jmp: if (dst == fp) { dst_adj = ctx->priv_sp_used ? priv_sp : A64_SP; off_adj = off + ctx->stack_size; + if (!ctx->priv_sp_used) + off_adj += ctx->stack_arg_size; } else { dst_adj = dst; off_adj = off; @@ -1814,6 +1879,11 @@ emit_cond_jmp: case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_B: case BPF_STX | BPF_MEM | BPF_DW: + if (insn->dst_reg == BPF_REG_PARAMS) { + emit_stack_arg_store(src, off, ctx); + break; + } + fallthrough; case BPF_STX | BPF_PROBE_MEM32 | BPF_B: case BPF_STX | BPF_PROBE_MEM32 | BPF_H: case BPF_STX | BPF_PROBE_MEM32 | BPF_W: @@ -1825,6 +1895,8 @@ emit_cond_jmp: if (dst == fp) { dst_adj = ctx->priv_sp_used ? 
priv_sp : A64_SP; off_adj = off + ctx->stack_size; + if (!ctx->priv_sp_used) + off_adj += ctx->stack_arg_size; } else { dst_adj = dst; off_adj = off; @@ -2018,6 +2090,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr u8 *ro_image_ptr; int body_idx; int exentry_idx; + int out_cnt; if (!prog->jit_requested) return prog; @@ -2065,6 +2138,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena); ctx.arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena); + out_cnt = bpf_out_stack_arg_cnt(env, prog); + if (out_cnt) { + int nr_on_stack = out_cnt - NR_STACK_ARG_REGS; + + if (nr_on_stack > 0) + ctx.stack_arg_size = round_up(nr_on_stack * sizeof(u64), 16); + } + if (priv_stack_ptr) ctx.priv_sp_used = true; @@ -2229,6 +2310,11 @@ bool bpf_jit_supports_kfunc_call(void) return true; } +bool bpf_jit_supports_stack_args(void) +{ + return true; +} + void *bpf_arch_text_copy(void *dst, void *src, size_t len) { if (!aarch64_insn_copy(dst, src, len)) -- cgit v1.2.3 From 90e43f1b47535cc7aceef3add1a61ba3260b7aee Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 12 May 2026 21:52:04 -0700 Subject: selftests/bpf: Enable stack argument tests for arm64 Now that arm64 supports stack arguments, enable the existing stack_arg, stack_arg_kfunc and verifier_stack_arg tests for __TARGET_ARCH_arm64. 
Signed-off-by: Puranjay Mohan Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045204.2403441-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c | 3 ++- tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c | 3 ++- tools/testing/selftests/bpf/progs/stack_arg.c | 3 ++- tools/testing/selftests/bpf/progs/stack_arg_kfunc.c | 3 ++- tools/testing/selftests/bpf/progs/stack_arg_precision.c | 3 ++- tools/testing/selftests/bpf/progs/verifier_stack_arg.c | 3 ++- tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c | 3 ++- 7 files changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c b/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c index 296fddfe6804..8d38aafe66a2 100644 --- a/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c +++ b/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c @@ -4,7 +4,8 @@ #include #include "../test_kmods/bpf_testmod_kfunc.h" -#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) long subprog_call_mem_kfunc(long a, long b, long c, long d, long e, long size) { diff --git a/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c b/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c index 83692570d5bc..da34e8456b6c 100644 --- a/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c +++ b/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c @@ -3,7 +3,8 @@ #include #include -#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) int subprog_bad_order_6args(int a, int b, int c, int d, int e, int f) { diff --git 
a/tools/testing/selftests/bpf/progs/stack_arg.c b/tools/testing/selftests/bpf/progs/stack_arg.c index ab6240b997c5..b5e9929a4d63 100644 --- a/tools/testing/selftests/bpf/progs/stack_arg.c +++ b/tools/testing/selftests/bpf/progs/stack_arg.c @@ -21,7 +21,8 @@ struct { int timer_result; -#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) const volatile bool has_stack_arg = true; diff --git a/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c b/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c index fa9def876ea5..da0d4f91d273 100644 --- a/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c +++ b/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c @@ -6,7 +6,8 @@ #include "bpf_kfuncs.h" #include "../test_kmods/bpf_testmod_kfunc.h" -#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) const volatile bool has_stack_arg = true; diff --git a/tools/testing/selftests/bpf/progs/stack_arg_precision.c b/tools/testing/selftests/bpf/progs/stack_arg_precision.c index 2a0a344c83ca..bee2eeec021d 100644 --- a/tools/testing/selftests/bpf/progs/stack_arg_precision.c +++ b/tools/testing/selftests/bpf/progs/stack_arg_precision.c @@ -6,7 +6,8 @@ #include "../test_kmods/bpf_testmod_kfunc.h" #include "bpf_misc.h" -#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) /* Force kfunc extern BTF generation for inline asm call below. * Uses its own SEC so it's not included as a .text subprog. 
diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c index 6587bf912bc0..d43a9b42034c 100644 --- a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c +++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c @@ -12,7 +12,8 @@ struct { __type(value, long long); } map_hash_8b SEC(".maps"); -#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) __noinline __used static int subprog_6args(int a, int b, int c, int d, int e, int f) diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c index 938f4a2f5482..1240cf8a40d6 100644 --- a/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c +++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c @@ -5,7 +5,8 @@ #include #include "bpf_misc.h" -#if defined(__TARGET_ARCH_x86) && defined(__BPF_FEATURE_STACK_ARGUMENT) +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) __noinline __used __naked static int subprog_bad_order_6args(int a, int b, int c, int d, int e, int f) -- cgit v1.2.3 From 74a9bb761a434ea3be1e0c59cd67b37217eb042c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 May 2026 22:08:07 -0700 Subject: libbpf: Use strscpy() in kernel code for skel_map_create() Linux has deprecated[1] strncpy(), and the use in skel_map_create() is best replaced with strscpy(). Since we still need to build this file in userspace, leave the strncpy() in place in that case. This is the last use of strncpy() in the kernel. 
Link: https://github.com/KSPP/linux/issues/90 [1] Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20260513050806.do.620-kees@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/skel_internal.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h index 6a8f5c7a02eb..74503d358bc8 100644 --- a/tools/lib/bpf/skel_internal.h +++ b/tools/lib/bpf/skel_internal.h @@ -243,7 +243,12 @@ static inline int skel_map_create(enum bpf_map_type map_type, attr.excl_prog_hash = (unsigned long) excl_prog_hash; attr.excl_prog_hash_size = excl_prog_hash_sz; +#ifdef __KERNEL__ + if (strscpy(attr.map_name, map_name) < 0) + return -EINVAL; +#else strncpy(attr.map_name, map_name, sizeof(attr.map_name)); +#endif attr.key_size = key_size; attr.value_size = value_size; attr.max_entries = max_entries; -- cgit v1.2.3 From f41f34ec64748e16e5a90ab391cec39e30942f32 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 13 May 2026 21:34:50 +0200 Subject: bpf: Report maximum combined stack depth We've hit the 512 bytes limit on stack depth a few times in Cilium recently. As a result, we started reporting in CI our current maximum stack depth across all configurations for each BPF program. Unfortunately, that is not trivial to compute in userspace. The verifier reports the stack depths of individual subprogs at the end of the logs. However the maximum combined stack depth also depends on the callgraph of those subprogs (the max combined stack depth is the height of the callgraph weighted by per-subprog stack depths). We can compute a callgraph in userspace from the loaded instructions, but it often doesn't match the verifier's own callgraph because of dead code elimination. Our current approach relies on dumping the BPF_LOG_LEVEL2 logs, but this feels overkill considering the verifier already has the information we need. 
The patch lets the verifier dump the maximum combined stack depth in the logs, on the same line as the per-subprog stack depths: stack depth 16+256 max 272 The per-subprog stack depths and the new max stack depth are not directly comparable. The former is sometimes updated during fixups, while the latter is not. As a result, even with a single subprog, we may end up with two slightly different values. The aim of the new max value is to be closest to what is actually enforced by the verifier. Signed-off-by: Paul Chaignon Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/d3d23a0410f87f116f3bbaa98a815dbae113bda2.1778700777.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/verifier.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6f12fc40b682..20c421b43849 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -989,6 +989,8 @@ struct bpf_verifier_env { u32 prev_insn_processed, insn_processed; /* number of jmps, calls, exits analyzed so far */ u32 prev_jmps_processed, jmps_processed; + /* maximum combined stack depth */ + u32 max_stack_depth; /* total verification time */ u64 verification_time; /* maximum number of verifier states kept in 'branching' instructions */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 82b9531f87f6..76a07f09ab64 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5177,6 +5177,8 @@ process_func: } if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + if (subprog_depth > env->max_stack_depth) + env->max_stack_depth = subprog_depth; if (subprog_depth > MAX_BPF_STACK) { verbose(env, "stack size of subprog %d is %d. 
Too large\n", idx, subprog_depth); @@ -5184,6 +5186,8 @@ process_func: } } else { depth += subprog_depth; + if (depth > env->max_stack_depth) + env->max_stack_depth = depth; if (depth > MAX_BPF_STACK) { total = 0; for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) @@ -18555,7 +18559,7 @@ static void print_verification_stats(struct bpf_verifier_env *env) verbose(env, "stack depth %d", env->subprog_info[0].stack_depth); for (i = 1; i < subprog_cnt; i++) verbose(env, "+%d", env->subprog_info[i].stack_depth); - verbose(env, "\n"); + verbose(env, " max %d\n", env->max_stack_depth); verbose(env, "insns processed %d", env->subprog_info[0].insn_processed); for (i = 1; i < subprog_cnt; i++) if (bpf_subprog_is_global(env, i)) -- cgit v1.2.3 From 2a5b22e87ba5aeb5cad8acb1c7d9866981c37d1b Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 13 May 2026 21:35:01 +0200 Subject: selftests/bpf: Test reported max stack depth This patch tests the maximum stack depth reporting in verifier logs, with a couple special cases covered: fastcall, private stacks (main subprog & callee), and rounding up to 16 bytes. For that last one, we need to skip the test when JIT compilation is disabled as the rounding is then to 32 bytes. 
Signed-off-by: Paul Chaignon Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/075d22efd4338385a92f13b7817025cc3f04ec60.1778700777.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c | 3 +-- .../testing/selftests/bpf/progs/verifier_private_stack.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index 0d9e167555b5..8d7ff38e4c06 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -799,8 +799,7 @@ __naked int bpf_loop_interaction2(void) SEC("raw_tp") __arch_x86_64 -__log_level(4) -__msg("stack depth 512+0") +__log_level(4) __msg("stack depth 512+0 max 512") /* just to print xlated version when debugging */ __xlated("r0 = &(void __percpu *)(r0)") __success diff --git a/tools/testing/selftests/bpf/progs/verifier_private_stack.c b/tools/testing/selftests/bpf/progs/verifier_private_stack.c index c5078face38d..046f7445a458 100644 --- a/tools/testing/selftests/bpf/progs/verifier_private_stack.c +++ b/tools/testing/selftests/bpf/progs/verifier_private_stack.c @@ -86,6 +86,7 @@ __naked static void cumulative_stack_depth_subprog(void) SEC("kprobe") __description("Private stack, subtree > MAX_BPF_STACK") __success +__log_level(4) __msg("stack depth 512+32 max 512") __arch_x86_64 /* private stack fp for the main prog */ __jited(" movabsq $0x{{.*}}, %r9") @@ -324,6 +325,8 @@ int private_stack_async_callback_1(void) SEC("fentry/bpf_fentry_test9") __description("Private stack, async callback, potential nesting") __success __retval(0) +__load_if_JITed() +__log_level(4) __msg("stack depth 8+0+256+0 max 272") __arch_x86_64 __jited(" subq $0x100, %rsp") __arch_arm64 @@ -344,6 +347,18 @@ int private_stack_async_callback_2(void) return 0; } +SEC("fentry/bpf_fentry_test9") 
+__description("private stack, max stack depth is private stack") +__success +__log_level(4) __msg("stack depth 8+256+0 max 256") +int private_stack_max_depth(void) +{ + int x = 0; + + subprog1(&x); + return 0; +} + #else SEC("kprobe") -- cgit v1.2.3 From f0015ffbf40c7c6db148163bd6f8c53f14933b53 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 13 May 2026 21:35:36 +0200 Subject: veristat: Report max stack depth This patch adds a new "Max stack depth" field to the set of gathered statistics. This field reports the maximum combined stack depth compared to the 512 bytes limit. It is null for rejected programs. Suggested-by: Eduard Zingerman Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/a27ed8f336669152c4b1b05e920aee4438e3e2b3.1778700777.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 5c82950e6978..a7db6f04f7e1 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -48,6 +48,7 @@ enum stat_id { SIZE, JITED_SIZE, STACK, + MAX_STACK, PROG_TYPE, ATTACH_TYPE, MEMORY_PEAK, @@ -789,13 +790,13 @@ cleanup: } static const struct stat_specs default_csv_output_spec = { - .spec_cnt = 15, + .spec_cnt = 16, .ids = { FILE_NAME, PROG_NAME, VERDICT, DURATION, TOTAL_INSNS, TOTAL_STATES, PEAK_STATES, MAX_STATES_PER_INSN, MARK_READ_MAX_LEN, SIZE, JITED_SIZE, PROG_TYPE, ATTACH_TYPE, - STACK, MEMORY_PEAK, + STACK, MAX_STACK, MEMORY_PEAK, }, }; @@ -834,6 +835,7 @@ static struct stat_def { [SIZE] = { "Program size", {"prog_size"}, }, [JITED_SIZE] = { "Jited size", {"prog_size_jited"}, }, [STACK] = {"Stack depth", {"stack_depth", "stack"}, }, + [MAX_STACK] = {"Max stack depth", {"max_stack_depth"}, }, [PROG_TYPE] = { "Program type", {"prog_type"}, }, [ATTACH_TYPE] = { "Attach type", {"attach_type", }, }, 
[MEMORY_PEAK] = { "Peak memory (MiB)", {"mem_peak", }, }, @@ -1023,7 +1025,7 @@ static int parse_verif_log(char * const buf, size_t buf_sz, struct verif_stats * &s->stats[MARK_READ_MAX_LEN])) continue; - if (1 == sscanf(cur, "stack depth %511s", stack)) + if (2 == sscanf(cur, "stack depth %511s max %ld", stack, &s->stats[MAX_STACK])) continue; } while ((token = strtok_r(cnt++ ? NULL : stack, "+", &state))) { @@ -2278,6 +2280,7 @@ static int cmp_stat(const struct verif_stats *s1, const struct verif_stats *s2, case SIZE: case JITED_SIZE: case STACK: + case MAX_STACK: case VERDICT: case DURATION: case TOTAL_INSNS: @@ -2512,6 +2515,7 @@ static void prepare_value(const struct verif_stats *s, enum stat_id id, case MAX_STATES_PER_INSN: case MARK_READ_MAX_LEN: case STACK: + case MAX_STACK: case SIZE: case JITED_SIZE: case MEMORY_PEAK: @@ -2602,7 +2606,8 @@ static int parse_stat_value(const char *str, enum stat_id id, struct verif_stats case SIZE: case JITED_SIZE: case MEMORY_PEAK: - case STACK: { + case STACK: + case MAX_STACK: { long val; int err, n; -- cgit v1.2.3 From 5ff44955447eb04f77161736ff5729c8c0994f7f Mon Sep 17 00:00:00 2001 From: Samuel Wu Date: Mon, 11 May 2026 10:45:56 -0700 Subject: PM: wakeup: Add kfuncs to traverse over wakeup_sources Iterating through wakeup sources via sysfs or debugfs can be inefficient or restricted. Introduce BPF kfuncs to allow high-performance and safe in-kernel traversal of the wakeup_sources list. There is at least a 30x speedup for walking 150 wakeup sources and all their attributes. The new kfuncs include: - bpf_wakeup_sources_get_head() to obtain the list head. - bpf_wakeup_sources_read_lock/unlock() to manage the SRCU lock. For verifier safety, the underlying SRCU index is wrapped in an opaque 'struct bpf_ws_lock' pointer. This enables the use of KF_ACQUIRE and KF_RELEASE flags, allowing the BPF verifier to strictly enforce paired lock/unlock cycles and prevent resource leaks. 
Signed-off-by: Samuel Wu Acked-by: Kumar Kartikeya Dwivedi Acked-by: Rafael J. Wysocki (Intel) Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20260511174559.659782-2-wusamuel@google.com Signed-off-by: Alexei Starovoitov --- drivers/base/power/power.h | 7 +++++ drivers/base/power/wakeup.c | 71 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index 922ed457db19..8823aceeac8b 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -168,3 +168,10 @@ static inline void device_pm_init(struct device *dev) device_pm_sleep_init(dev); pm_runtime_init(dev); } + +#ifdef CONFIG_BPF_SYSCALL +struct bpf_ws_lock { }; +struct bpf_ws_lock *bpf_wakeup_sources_read_lock(void); +void bpf_wakeup_sources_read_unlock(struct bpf_ws_lock *lock); +void *bpf_wakeup_sources_get_head(void); +#endif diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index b8e48a023bf0..80b497de2deb 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -1168,11 +1168,78 @@ static const struct file_operations wakeup_sources_stats_fops = { .release = seq_release_private, }; -static int __init wakeup_sources_debugfs_init(void) +#ifdef CONFIG_BPF_SYSCALL +#include + +__bpf_kfunc_start_defs(); + +/** + * bpf_wakeup_sources_read_lock - Acquire the SRCU lock for wakeup sources + * + * The underlying SRCU lock returns an integer index. However, the BPF verifier + * requires a pointer (PTR_TO_BTF_ID) to strictly track the state of acquired + * resources using KF_ACQUIRE and KF_RELEASE semantics. We use an opaque + * structure pointer (struct bpf_ws_lock *) to satisfy the verifier while + * safely encoding the integer index within the pointer address itself. + * + * Return: An opaque pointer encoding the SRCU lock index + 1 (to avoid NULL). 
+ */ +__bpf_kfunc struct bpf_ws_lock *bpf_wakeup_sources_read_lock(void) +{ + return (struct bpf_ws_lock *)(long)(wakeup_sources_read_lock() + 1); +} + +/** + * bpf_wakeup_sources_read_unlock - Release the SRCU lock for wakeup sources + * @lock: The opaque pointer returned by bpf_wakeup_sources_read_lock() + * + * The BPF verifier guarantees that @lock is a valid, unreleased pointer from + * the acquire function. We decode the pointer back into the integer SRCU index + * by subtracting 1 and release the lock. + */ +__bpf_kfunc void bpf_wakeup_sources_read_unlock(struct bpf_ws_lock *lock) +{ + wakeup_sources_read_unlock((int)(long)lock - 1); +} + +/** + * bpf_wakeup_sources_get_head - Get the head of the wakeup sources list + * + * Return: The head of the wakeup sources list. + */ +__bpf_kfunc void *bpf_wakeup_sources_get_head(void) +{ + return &wakeup_sources; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(wakeup_source_kfunc_ids) +BTF_ID_FLAGS(func, bpf_wakeup_sources_read_lock, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_wakeup_sources_read_unlock, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_wakeup_sources_get_head) +BTF_KFUNCS_END(wakeup_source_kfunc_ids) + +static const struct btf_kfunc_id_set wakeup_source_kfunc_set = { + .set = &wakeup_source_kfunc_ids, +}; + +static void __init wakeup_sources_bpf_init(void) +{ + if (register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &wakeup_source_kfunc_set)) + pm_pr_dbg("Wakeup: failed to register BTF kfuncs\n"); +} +#else +static inline void wakeup_sources_bpf_init(void) {} +#endif /* CONFIG_BPF_SYSCALL */ + +static int __init wakeup_sources_init(void) { debugfs_create_file("wakeup_sources", 0444, NULL, NULL, &wakeup_sources_stats_fops); + wakeup_sources_bpf_init(); + return 0; } -postcore_initcall(wakeup_sources_debugfs_init); +postcore_initcall(wakeup_sources_init); -- cgit v1.2.3 From 9ef647114201b50b60a43054506af893f74ae8b8 Mon Sep 17 00:00:00 2001 From: Samuel Wu Date: Mon, 11 May 2026 10:45:57 -0700 Subject: selftests/bpf: Add 
tests for wakeup_sources kfuncs Introduce a set of BPF selftests to verify the safety and functionality of wakeup_source kfuncs. The suite includes: 1. A functional test (test_wakeup_source.c) that iterates over the global wakeup_sources list. It uses CO-RE to read timing statistics and validates them in user-space via the BPF ring buffer. 2. A negative test suite (wakeup_source_fail.c) ensuring the BPF verifier correctly enforces reference tracking and type safety. 3. Enable CONFIG_PM_WAKELOCKS in the test config, allowing creation of wakeup sources via /sys/power/wake_lock. A shared header (wakeup_source.h) is introduced to ensure consistent memory layout for the Ring Buffer data between BPF and user-space. Signed-off-by: Samuel Wu Link: https://lore.kernel.org/r/20260511174559.659782-3-wusamuel@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/config | 3 +- .../selftests/bpf/prog_tests/wakeup_source.c | 118 +++++++++++++++++++++ .../selftests/bpf/progs/test_wakeup_source.c | 92 ++++++++++++++++ tools/testing/selftests/bpf/progs/wakeup_source.h | 22 ++++ .../selftests/bpf/progs/wakeup_source_fail.c | 76 +++++++++++++ 5 files changed, 310 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/wakeup_source.c create mode 100644 tools/testing/selftests/bpf/progs/test_wakeup_source.c create mode 100644 tools/testing/selftests/bpf/progs/wakeup_source.h create mode 100644 tools/testing/selftests/bpf/progs/wakeup_source_fail.c diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 24855381290d..bac60b444551 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -130,4 +130,5 @@ CONFIG_INFINIBAND=y CONFIG_SMC=y CONFIG_SMC_HS_CTRL_BPF=y CONFIG_DIBS=y -CONFIG_DIBS_LO=y \ No newline at end of file +CONFIG_DIBS_LO=y +CONFIG_PM_WAKELOCKS=y diff --git a/tools/testing/selftests/bpf/prog_tests/wakeup_source.c 
b/tools/testing/selftests/bpf/prog_tests/wakeup_source.c new file mode 100644 index 000000000000..ebfdc03271b9 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/wakeup_source.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2026 Google LLC */ + +#include +#include +#include +#include "test_wakeup_source.skel.h" +#include "wakeup_source_fail.skel.h" +#include "progs/wakeup_source.h" + +static int lock_ws(const char *name) +{ + int fd; + ssize_t bytes; + + fd = open("/sys/power/wake_lock", O_WRONLY); + if (!ASSERT_OK_FD(fd, "open /sys/power/wake_lock")) + return -1; + + bytes = write(fd, name, strlen(name)); + close(fd); + if (!ASSERT_EQ(bytes, strlen(name), "write to wake_lock")) + return -1; + + return 0; +} + +static void unlock_ws(const char *name) +{ + int fd; + + fd = open("/sys/power/wake_unlock", O_WRONLY); + if (fd < 0) + return; + + write(fd, name, strlen(name)); + close(fd); +} + +struct rb_ctx { + const char *name; + bool found; + long long active_time_ns; + long long total_time_ns; +}; + +static int process_sample(void *ctx, void *data, size_t len) +{ + struct rb_ctx *rb_ctx = ctx; + struct wakeup_event_t *e = data; + + if (strcmp(e->name, rb_ctx->name) == 0) { + rb_ctx->found = true; + rb_ctx->active_time_ns = e->active_time_ns; + rb_ctx->total_time_ns = e->total_time_ns; + } + return 0; +} + +void test_wakeup_source(void) +{ + struct btf *btf; + int id; + + btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK_PTR(btf, "btf_vmlinux")) + return; + + id = btf__find_by_name_kind(btf, "bpf_wakeup_sources_get_head", BTF_KIND_FUNC); + btf__free(btf); + + if (id < 0) { + printf("%s:SKIP:bpf_wakeup_sources_get_head kfunc not found in BTF\n", __func__); + test__skip(); + return; + } + + if (test__start_subtest("iterate_and_verify_times")) { + struct test_wakeup_source *skel; + struct ring_buffer *rb = NULL; + struct rb_ctx rb_ctx = { + .name = "bpf_selftest_ws_times", + .found = false, + }; + int err; + + skel = 
test_wakeup_source__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), process_sample, &rb_ctx, NULL); + if (!ASSERT_OK_PTR(rb, "ring_buffer__new")) + goto destroy; + + /* Create a temporary wakeup source */ + if (!ASSERT_OK(lock_ws(rb_ctx.name), "lock_ws")) + goto unlock; + + err = bpf_prog_test_run_opts(bpf_program__fd( + skel->progs.iterate_wakeupsources), NULL); + ASSERT_OK(err, "bpf_prog_test_run"); + + ring_buffer__consume(rb); + + ASSERT_TRUE(rb_ctx.found, "found_test_ws_in_rb"); + ASSERT_GT(rb_ctx.active_time_ns, 0, "active_time_gt_0"); + ASSERT_GT(rb_ctx.total_time_ns, 0, "total_time_gt_0"); + +unlock: + unlock_ws(rb_ctx.name); +destroy: + if (rb) + ring_buffer__free(rb); + test_wakeup_source__destroy(skel); + } + + RUN_TESTS(wakeup_source_fail); +} diff --git a/tools/testing/selftests/bpf/progs/test_wakeup_source.c b/tools/testing/selftests/bpf/progs/test_wakeup_source.c new file mode 100644 index 000000000000..fd2fb6aebd82 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_wakeup_source.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2026 Google LLC */ + +#include "vmlinux.h" +#include +#include +#include "bpf_experimental.h" +#include "bpf_misc.h" +#include "wakeup_source.h" + +#define MAX_LOOP_ITER 1000 +#define RB_SIZE (16384 * 4) + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, RB_SIZE); +} rb SEC(".maps"); + +struct bpf_ws_lock; +struct bpf_ws_lock *bpf_wakeup_sources_read_lock(void) __ksym; +void bpf_wakeup_sources_read_unlock(struct bpf_ws_lock *lock) __ksym; +void *bpf_wakeup_sources_get_head(void) __ksym; + +SEC("syscall") +__success __retval(0) +int iterate_wakeupsources(void *ctx) +{ + struct list_head *head = bpf_wakeup_sources_get_head(); + struct list_head *pos = head; + struct bpf_ws_lock *lock; + int i; + + lock = bpf_wakeup_sources_read_lock(); + if (!lock) + return 0; + + bpf_for(i, 0, MAX_LOOP_ITER) { + 
if (bpf_core_read(&pos, sizeof(pos), &pos->next) || !pos || pos == head) + break; + + struct wakeup_event_t *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + + if (!e) + break; + + struct wakeup_source *ws = bpf_core_cast( + (void *)pos - bpf_core_field_offset(struct wakeup_source, entry), + struct wakeup_source); + s64 active_time = 0; + bool active = BPF_CORE_READ_BITFIELD(ws, active); + bool autosleep_enable = BPF_CORE_READ_BITFIELD(ws, autosleep_enabled); + s64 last_time = ws->last_time; + s64 max_time = ws->max_time; + s64 prevent_sleep_time = ws->prevent_sleep_time; + s64 total_time = ws->total_time; + + if (active) { + s64 curr_time = bpf_ktime_get_ns(); + s64 prevent_time = ws->start_prevent_time; + + if (curr_time > last_time) + active_time = curr_time - last_time; + + total_time += active_time; + if (active_time > max_time) + max_time = active_time; + if (autosleep_enable && curr_time > prevent_time) + prevent_sleep_time += curr_time - prevent_time; + } + + e->active_count = ws->active_count; + e->active_time_ns = active_time; + e->event_count = ws->event_count; + e->expire_count = ws->expire_count; + e->last_time_ns = last_time; + e->max_time_ns = max_time; + e->prevent_sleep_time_ns = prevent_sleep_time; + e->total_time_ns = total_time; + e->wakeup_count = ws->wakeup_count; + + if (bpf_probe_read_kernel_str( + e->name, WAKEUP_NAME_LEN, ws->name) < 0) + e->name[0] = '\0'; + + bpf_ringbuf_submit(e, 0); + } + + bpf_wakeup_sources_read_unlock(lock); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/wakeup_source.h b/tools/testing/selftests/bpf/progs/wakeup_source.h new file mode 100644 index 000000000000..cd74de92c82f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wakeup_source.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright 2026 Google LLC */ + +#ifndef __WAKEUP_SOURCE_H__ +#define __WAKEUP_SOURCE_H__ + +#define WAKEUP_NAME_LEN 128 + +struct wakeup_event_t { + unsigned 
long active_count; + long long active_time_ns; + unsigned long event_count; + unsigned long expire_count; + long long last_time_ns; + long long max_time_ns; + long long prevent_sleep_time_ns; + long long total_time_ns; + unsigned long wakeup_count; + char name[WAKEUP_NAME_LEN]; +}; + +#endif /* __WAKEUP_SOURCE_H__ */ diff --git a/tools/testing/selftests/bpf/progs/wakeup_source_fail.c b/tools/testing/selftests/bpf/progs/wakeup_source_fail.c new file mode 100644 index 000000000000..b8bbb61d4d4e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wakeup_source_fail.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2026 Google LLC */ + +#include +#include +#include "bpf_misc.h" + +struct bpf_ws_lock; + +struct bpf_ws_lock *bpf_wakeup_sources_read_lock(void) __ksym; +void bpf_wakeup_sources_read_unlock(struct bpf_ws_lock *lock) __ksym; +void *bpf_wakeup_sources_get_head(void) __ksym; + +SEC("syscall") +__failure __msg("BPF_EXIT instruction in main prog would lead to reference leak") +int wakeup_source_lock_no_unlock(void *ctx) +{ + struct bpf_ws_lock *lock; + + lock = bpf_wakeup_sources_read_lock(); + if (!lock) + return 0; + + return 0; +} + +SEC("syscall") +__failure __msg("access beyond struct") +int wakeup_source_access_lock_fields(void *ctx) +{ + struct bpf_ws_lock *lock; + int val; + + lock = bpf_wakeup_sources_read_lock(); + if (!lock) + return 0; + + val = *(int *)lock; + + bpf_wakeup_sources_read_unlock(lock); + return val; +} + +SEC("syscall") +__failure __msg("type=scalar expected=fp") +int wakeup_source_unlock_no_lock(void *ctx) +{ + struct bpf_ws_lock *lock = (void *)0x1; + + bpf_wakeup_sources_read_unlock(lock); + + return 0; +} + +SEC("syscall") +__failure __msg("Possibly NULL pointer passed to trusted") +int wakeup_source_unlock_null(void *ctx) +{ + bpf_wakeup_sources_read_unlock(NULL); + + return 0; +} + +SEC("syscall") +__failure __msg("R0 invalid mem access 'scalar'") +int wakeup_source_unsafe_dereference(void *ctx) +{ + struct 
list_head *head = bpf_wakeup_sources_get_head(); + + if (head->next) + return 1; + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From c8993263ffd3831c96f258ca76e59303122a47bb Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 11 May 2026 14:10:22 +0200 Subject: bpf: Add Jiayuan Chen to sockmap maintainers Nominate Jiayuan Chen as a sockmap co-maintainer. Jiayuan has been a regular contributor and reviewer for the sockmap and networking code. Since we are now down to just two maintainers, and John has to split his time between BPF core, BPF networking, and sockmap, having three maintainers again will help with the review load. Signed-off-by: Jakub Sitnicki Acked-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260511-sockmap-ktls-fix-1-v1-1-96ff8c1906e4@cloudflare.com Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b2040011a386..dfc621ff629d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4861,6 +4861,7 @@ F: kernel/bpf/*iter.c BPF [L7 FRAMEWORK] (sockmap) M: John Fastabend M: Jakub Sitnicki +M: Jiayuan Chen L: netdev@vger.kernel.org L: bpf@vger.kernel.org S: Maintained -- cgit v1.2.3 From 4286f5deee14b26a9f0447b566d4c7cb7e2e2702 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 May 2026 15:50:40 -0700 Subject: bpf: Validate outgoing stack args when btf_prepare_func_args fails btf_prepare_func_args() sets sub->arg_cnt before validating arg types. If validation fails (e.g. unsupported pointer type in a static subprog), check_outgoing_stack_args() is skipped because btf_check_func_arg_match() returns early. For static subprogs, check_func_call() ignores non-EFAULT errors and proceeds with the call. This causes the callee to read stack arg slots that the caller never stored to or never initialized, potentially dereferencing a NULL caller->stack_arg_regs or reading an uninitialized value. 
To fix the issue, when btf_prepare_func_args() fails and the subprog expects stack args, call check_outgoing_stack_args() to verify the caller initialized the slots. Return -EFAULT on failure so the error is not ignored. Fixes: 3ab5bd317ee2 ("bpf: Set sub->arg_cnt earlier in btf_prepare_func_args()") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260515225040.821515-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 76a07f09ab64..8dd79b735a69 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9118,11 +9118,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_func_state *caller = cur_func(env); struct bpf_verifier_log *log = &env->log; u32 i; - int ret; + int ret, err; ret = btf_prepare_func_args(env, subprog); - if (ret) + if (ret) { + if (bpf_in_stack_arg_cnt(sub) > 0) { + err = check_outgoing_stack_args(env, caller, sub->arg_cnt); + if (err) + return err; + } return ret; + } ret = check_outgoing_stack_args(env, caller, sub->arg_cnt); if (ret) -- cgit v1.2.3 From ef1b54e0db671a161887475ef70cd570cbb2a6ab Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 May 2026 15:50:45 -0700 Subject: selftests/bpf: Add test for stack arg read without caller write Add negative tests for the outgoing stack arg validation. A static subprog with a 'long *' arg causes btf_prepare_func_args() to fail after setting arg_cnt. The validation ensures check_outgoing_stack_args() still runs. Also update two existing tests (release_ref, stale_pkt_ptr) whose expected error messages changed: invalidated stack arg slots are now caught by check_outgoing_stack_args() at the call site instead of at the callee's dereference. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260515225045.822104-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../bpf/progs/btf__verifier_stack_arg_order.c | 8 +++ .../selftests/bpf/progs/verifier_stack_arg.c | 4 +- .../selftests/bpf/progs/verifier_stack_arg_order.c | 58 ++++++++++++++++++++++ 3 files changed, 68 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c b/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c index da34e8456b6c..99bc115f8380 100644 --- a/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c +++ b/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c @@ -21,6 +21,10 @@ int subprog_pruning_call_before_load_6args(int a, int b, int c, int d, int e, in return a + b + c + d + e + f; } +void subprog_bad_ptr_7args(long *a, int b, int c, int d, int e, int f, int g) +{ +} + #else int subprog_bad_order_6args(void) @@ -38,4 +42,8 @@ int subprog_pruning_call_before_load_6args(void) return 0; } +void subprog_bad_ptr_7args(void) +{ +} + #endif diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c index d43a9b42034c..d45339b83795 100644 --- a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c +++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c @@ -152,7 +152,7 @@ __naked void stack_arg_pruning_type_mismatch(void) SEC("tc") __description("stack_arg: release_reference invalidates stack arg slot") __failure -__msg("R{{[0-9]}} !read_ok") +__msg("callee expects 6 args, stack arg1 is not initialized") __naked void stack_arg_release_ref(void) { asm volatile ( @@ -201,7 +201,7 @@ __naked void stack_arg_release_ref(void) SEC("tc") __description("stack_arg: pkt pointer in stack arg slot invalidated after pull_data") __failure -__msg("R{{[0-9]}} !read_ok") +__msg("callee expects 6 args, stack arg1 is not initialized") __naked void 
stack_arg_stale_pkt_ptr(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c index 1240cf8a40d6..c9fe4857da3f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c +++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c @@ -112,6 +112,64 @@ __naked void stack_arg_pruning_load_after_call(void) ); } +/* + * "bad_ptr": the first arg is 'long *', which is not a recognized pointer + * type for static subprogs (not ctx, dynptr, or tagged). btf_prepare_func_args() + * sets arg_cnt = 7 / stack_arg_cnt = 2, then fails with -EINVAL. The subprog + * is marked unreliable but the call still proceeds for static subprogs. + */ +__noinline __used __naked +static void subprog_bad_ptr_7args(long *a, int b, int c, int d, int e, int f, int g) +{ + asm volatile ( + "r0 = *(u64 *)(r11 + 8);" + "r1 = *(u64 *)(r11 + 16);" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: read without caller write") +__failure +__msg("callee expects 7 args, stack arg1 is not initialized") +__btf_func_path("btf__verifier_stack_arg_order.bpf.o") +__naked void stack_arg_read_without_write_1(void) +{ + asm volatile ( + "r1 = 0;" + "r2 = 0;" + "r3 = 0;" + "r4 = 0;" + "r5 = 0;" + "call subprog_bad_ptr_7args;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: read with not-initialized caller write") +__failure +__msg("R0 !read_ok") +__btf_func_path("btf__verifier_stack_arg_order.bpf.o") +__naked void stack_arg_read_without_write_2(void) +{ + asm volatile ( + "r1 = 0;" + "r2 = 0;" + "r3 = 0;" + "r4 = 0;" + "r5 = 0;" + "*(u64 *)(r11 - 8) = 0;" + "*(u64 *)(r11 - 16) = 0;" + "call subprog_bad_ptr_7args;" + "call subprog_bad_ptr_7args;" + "exit;" + ::: __clobber_all + ); +} + #else SEC("socket") -- cgit v1.2.3 From 0e2647792f60df746422d6089daf9d56945d5f91 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 May 2026 
15:50:51 -0700 Subject: selftests/bpf: Log arg_track_join for stack arg slots in liveness analysis Commit 2af4e792773f ("bpf: Extend liveness analysis to track stack argument slots") added stack arg support. For the selftest verifier_stack_arg/stack_arg: pruning with different stack arg types, the following are the two arg JOIN messages: arg JOIN insn 9 -> 10 r1: fp0-8 + _ => fp0-8|fp0+0 arg JOIN insn 9 -> 10 r11: fp0-8 + _ => fp0-8|fp0+0 Here the "r11:" label for stack arg slot 0 is misleading since r11 is a special register (BPF_REG_PARAMS). The next patch corrects this to "sa0:", properly representing the 'stack arg slot 0'. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260515225051.822739-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_stack_arg.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c index d45339b83795..df0c3438529e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c +++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c @@ -114,8 +114,10 @@ __naked void stack_arg_gap_at_minus8(void) SEC("tc") __description("stack_arg: pruning with different stack arg types") -__failure +__failure __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) +__msg("arg JOIN insn 9 -> 10 r1: fp0-8 + _ => fp0-8|fp0+0") +__msg("arg JOIN insn 9 -> 10 r11: fp0-8 + _ => fp0-8|fp0+0") __msg("R{{[0-9]}} invalid mem access 'scalar'") __naked void stack_arg_pruning_type_mismatch(void) { -- cgit v1.2.3 From d1dbe443a0abb4ea3ec35a16e36efe6d3bbf72f6 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 May 2026 15:50:56 -0700 Subject: bpf: Fix arg_track_join log to use sa prefix for stack arg slots arg_track_join() logs state transitions at CFG merge points. 
For stack arg slots (r >= MAX_BPF_REG), it printed "r11:", "r12:", etc., which is misleading since r11 is a special register (BPF_REG_PARAMS) not meaningful to the user. Fix it to print "sa0:", "sa1:", etc., matching the per-instruction transition log in arg_track_log() which already uses the "sa" prefix. Update the existing stack_arg_pruning_type_mismatch selftest to expect the corrected format. Fixes: 2af4e792773f ("bpf: Extend liveness analysis to track stack argument slots") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260515225056.823086-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/liveness.c | 4 +++- tools/testing/selftests/bpf/progs/verifier_stack_arg.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 7f4a0e4c2c49..0aadfbae0acc 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -806,7 +806,9 @@ static bool arg_track_join(struct bpf_verifier_env *env, int idx, int target, in return true; verbose(env, "arg JOIN insn %d -> %d ", idx, target); - if (r >= 0) + if (r >= MAX_BPF_REG) + verbose(env, "sa%d: ", r - MAX_BPF_REG); + else if (r >= 0) verbose(env, "r%d: ", r); else verbose(env, "fp%+d: ", r * 8); diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c index df0c3438529e..7e0ce5db28a0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c +++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c @@ -117,7 +117,7 @@ __description("stack_arg: pruning with different stack arg types") __failure __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) __msg("arg JOIN insn 9 -> 10 r1: fp0-8 + _ => fp0-8|fp0+0") -__msg("arg JOIN insn 9 -> 10 r11: fp0-8 + _ => fp0-8|fp0+0") +__msg("arg JOIN insn 9 -> 10 sa0: fp0-8 + _ => fp0-8|fp0+0") __msg("R{{[0-9]}} invalid mem access 'scalar'") __naked void stack_arg_pruning_type_mismatch(void) { -- cgit v1.2.3 From 
98540a12823a016e2e1fa0db15543b22ac1fa056 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 May 2026 15:51:01 -0700 Subject: bpf: Clean up redundant stack arg checks for non-JITed programs Remove a redundant stack_arg_cnt check in __bpf_prog_select_runtime() and start the stack arg loop from index 0 in bpf_fixup_call_args(). Both changes are no-ops that simplify the code: In __bpf_prog_select_runtime(), the subprog_info[0].stack_arg_cnt check is unreachable: - when there is only a main program (no bpf-to-bpf calls), subprog_info[0].stack_arg_cnt is always 0 because the main program's arg_cnt is forced to 1 - when bpf-to-bpf calls use stack args and JIT succeeds, fp->bpf_func is set and this code is skipped - when JIT fails, bpf_fixup_call_args() rejects the program before we get to __bpf_prog_select_runtime(). In bpf_fixup_call_args(), starting the loop at i=1 skipped subprog 0, which is safe since the main program always has arg_cnt=1 and thus bpf_in_stack_arg_cnt() returns 0. Starting at i=0 removes the need to reason about this invariant. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260515225101.824054-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- kernel/bpf/fixups.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 427a6d828e01..cdbe9fdf474f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2609,7 +2609,7 @@ struct bpf_prog *__bpf_prog_select_runtime(struct bpf_verifier_env *env, struct goto finalize; if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || - bpf_prog_has_kfunc_call(fp) || (env && env->subprog_info[0].stack_arg_cnt)) + bpf_prog_has_kfunc_call(fp)) jit_needed = true; if (!bpf_prog_select_interpreter(fp)) diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 19056016eed8..2cec4e8cd4a0 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1407,7 +1407,7 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); return -EINVAL; } - for (i = 1; i < env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { if (bpf_in_stack_arg_cnt(&env->subprog_info[i])) { verbose(env, "stack args are not supported in non-JITed programs\n"); return -EINVAL; -- cgit v1.2.3 From 18a37465b0ab5237a1d0ebf93a2a3b6a2da540b3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 17 May 2026 08:07:02 -0700 Subject: bpf,x86: Fix exception unwinding with outgoing stack arguments When a main program with exception_boundary has outgoing stack arguments (e.g. from calling subprogs with >5 args), bpf_throw() fails to correctly restore callee-saved registers, causing a kernel crash. The x86 JIT allocates the outgoing stack arg area below the callee-saved registers via 'sub rsp, outgoing_rsp' in the prologue. When bpf_throw() unwinds, it captures the main program's sp (which includes this outgoing area) and passes it to the exception callback. 
The callback gets rsp and rbp, followed by pop_callee_regs, but rsp points into the outgoing arg area rather than the callee-saved registers, so the pops restore garbage values. Returning to the kernel with corrupted callee-saved registers causes a crash. Fix this by adjusting the sp (adding stack_arg_sp_adjust) passed to the exception callback, so it points to the bottom of the callee-saved registers instead of the outgoing arg area. When stack_arg_sp_adjust is 0 (the common case), this is a no-op. Fixes: 324c3ca6eed6 ("bpf,x86: Implement JIT support for stack arguments") Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260517150702.288031-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 2 ++ include/linux/bpf.h | 1 + kernel/bpf/fixups.c | 1 + kernel/bpf/helpers.c | 2 +- 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ceefefb4da21..a0c541a441cf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1789,6 +1789,8 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int * * Arg 6 goes into r9 register, not on stack. */ outgoing_rsp = out_stack_arg_cnt > 1 ? 
(out_stack_arg_cnt - 1) * 8 : 0; + if (bpf_prog->aux->exception_boundary) + bpf_prog->aux->stack_arg_sp_adjust = outgoing_rsp; emit_sub_rsp(&prog, outgoing_rsp); if (arena_vm_start) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 242f9597d9ab..1b28cacc3075 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1736,6 +1736,7 @@ struct bpf_prog_aux { struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64); + u16 stack_arg_sp_adjust; #ifdef CONFIG_SECURITY void *security; #endif diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 2cec4e8cd4a0..52535671cb9a 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1265,6 +1265,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->aux->real_func_cnt = env->subprog_cnt; prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func; prog->aux->exception_boundary = func[0]->aux->exception_boundary; + prog->aux->stack_arg_sp_adjust = func[0]->aux->stack_arg_sp_adjust; bpf_prog_jit_attempt_done(prog); return 0; out_free: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index baa12b24bb64..07de26e7314c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3301,7 +3301,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) * which skips compiler generated instrumentation to do the same. 
*/ kasan_unpoison_task_stack_below((void *)(long)ctx.sp); - ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); + ctx.aux->bpf_exception_cb(cookie, ctx.sp + ctx.aux->stack_arg_sp_adjust, ctx.bp, 0, 0); WARN(1, "A call to BPF exception callback should never return\n"); } -- cgit v1.2.3 From 576482b55c19e7ec00e162a0fde4c4f1a95128c7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 17 May 2026 08:07:07 -0700 Subject: selftests/bpf: Add exception tests with stack arguments Add tests to verify that bpf_throw() correctly unwinds the stack when the program uses outgoing stack arguments (functions with >5 args). Without the preceding x86 fix, these tests crash the kernel on x86 due to corrupted callee-saved register restore. There is no change for arm64 to support exception with stack arguments. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260517150707.289273-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/exceptions.c | 7 ++ tools/testing/selftests/bpf/progs/exceptions.c | 114 +++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/exceptions.c b/tools/testing/selftests/bpf/prog_tests/exceptions.c index e8cbaf2a3e82..3588d6f97fd4 100644 --- a/tools/testing/selftests/bpf/prog_tests/exceptions.c +++ b/tools/testing/selftests/bpf/prog_tests/exceptions.c @@ -85,6 +85,13 @@ static void test_exceptions_success(void) RUN_SUCCESS(exception_bad_assert_range_with, 10); RUN_SUCCESS(exception_throw_from_void_global, 11); + if (skel->rodata->has_stack_arg) { + RUN_SUCCESS(exception_throw_stack_arg, 56); + RUN_SUCCESS(exception_throw_after_stack_arg, 56); + RUN_SUCCESS(exception_throw_subprog_stack_arg, 56); + RUN_SUCCESS(exception_throw_subprog_after_stack_arg, 56); + } + #define RUN_EXT(load_ret, attach_err, expr, msg, after_link) \ { \ LIBBPF_OPTS(bpf_object_open_opts, o, .kernel_log_buf = log_buf, \ diff --git 
a/tools/testing/selftests/bpf/progs/exceptions.c b/tools/testing/selftests/bpf/progs/exceptions.c index 4206f59d7b86..c8d716fbd419 100644 --- a/tools/testing/selftests/bpf/progs/exceptions.c +++ b/tools/testing/selftests/bpf/progs/exceptions.c @@ -379,4 +379,118 @@ int exception_bad_assert_range_with(struct __sk_buff *ctx) return 1; } +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) \ + && defined(__BPF_FEATURE_STACK_ARGUMENT) + +const volatile bool has_stack_arg = true; + +long arg1 = 1, arg2 = 2, arg3 = 3, arg4 = 4, arg5 = 5; +long arg6 = 6, arg7 = 7, arg8 = 8, arg9 = 9, arg10 = 10; + +__noinline static long throwing_many_args(long a, long b, long c, long d, + long e, long f, long g, long h, + long i, long j) +{ + bpf_throw(a + b + c + d + e + f + g + h + i + j); + return 0; +} + +__noinline int exception_cb_sa(u64 cookie) +{ + return cookie + 1; +} + +SEC("tc") +__exception_cb(exception_cb_sa) +int exception_throw_stack_arg(struct __sk_buff *ctx) +{ + throwing_many_args(arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9, arg10); + return 0; +} + +__noinline static long no_throw_many_args(long a, long b, long c, long d, + long e, long f, long g, long h, + long i, long j) +{ + return a + b + c + d + e + f + g + h + i + j; +} + +SEC("tc") +__exception_cb(exception_cb_sa) +int exception_throw_after_stack_arg(struct __sk_buff *ctx) +{ + long ret; + + ret = no_throw_many_args(arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9, arg10); + if (ret > 0) + bpf_throw(ret); + return 0; +} + +__noinline static long subprog_throw_sa(long val) +{ + throwing_many_args(val, val + 1, val + 2, val + 3, val + 4, + val + 5, val + 6, val + 7, val + 8, val + 9); + return 0; +} + +SEC("tc") +__exception_cb(exception_cb_sa) +int exception_throw_subprog_stack_arg(struct __sk_buff *ctx) +{ + subprog_throw_sa(arg1); + return 0; +} + +__noinline static long subprog_throw_after_sa(long val) +{ + long ret; + + ret = no_throw_many_args(val, val + 1, val + 2, val + 3, 
val + 4, + val + 5, val + 6, val + 7, val + 8, val + 9); + if (ret > 0) + bpf_throw(ret); + return 0; +} + +SEC("tc") +__exception_cb(exception_cb_sa) +int exception_throw_subprog_after_stack_arg(struct __sk_buff *ctx) +{ + subprog_throw_after_sa(arg1); + return 0; +} + +#else + +const volatile bool has_stack_arg = false; + +SEC("tc") +int exception_throw_stack_arg(struct __sk_buff *ctx) +{ + return 0; +} + +SEC("tc") +int exception_throw_after_stack_arg(struct __sk_buff *ctx) +{ + return 0; +} + +SEC("tc") +int exception_throw_subprog_stack_arg(struct __sk_buff *ctx) +{ + return 0; +} + +SEC("tc") +int exception_throw_subprog_after_stack_arg(struct __sk_buff *ctx) +{ + return 0; +} + +#endif + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From f05ddc6771c4a2eb1801dfdd0f7a212a78fa18a7 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Mon, 18 May 2026 22:54:42 +0800 Subject: bpf: Check tail zero of bpf_common_attr using offsetofend Because of the 8-byte alignment, the compiler will pad struct bpf_common_attr to 24 bytes. That is, sizeof(attr_common) is 24 instead of 20. When checking the tail for zeroes using sizeof(attr_common) in bpf_check_uarg_tail_zero(), there will be 4 bytes that won't be checked. To also check the 4 padding bytes, replace sizeof(attr_common) with offsetofend(struct bpf_common_attr, log_true_size).
Fixes: f28771c0691b ("bpf: Extend BPF syscall with common attributes support") Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260518145446.6794-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6600e126fbfb..83de8fb9b9aa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6278,7 +6278,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, memset(&attr_common, 0, sizeof(attr_common)); if (cmd & BPF_COMMON_ATTRS) { - err = bpf_check_uarg_tail_zero(uattr_common, sizeof(attr_common), size_common); + err = bpf_check_uarg_tail_zero(uattr_common, + offsetofend(struct bpf_common_attr, log_true_size), + size_common); if (err) return err; -- cgit v1.2.3 From b4844cb6d1ecff732c99b70998749973c6f50591 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Mon, 18 May 2026 22:54:44 +0800 Subject: libbpf: Add OPTS_VALID() for log_opts in bpf_map_create There should be an OPTS_VALID() check for log_opts before extracting its fields. Without such an OPTS_VALID() check, if an application compiled against a future libbpf header passes a log_opts with new, non-zero fields to libbpf.so, those fields will be silently ignored.
Fixes: 702259006f93 ("libbpf: Add syscall common attributes support for map_create") Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260518145446.6794-4-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 483c02cf21d1..3cd705802330 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -246,6 +246,9 @@ int bpf_map_create(enum bpf_map_type map_type, attr.excl_prog_hash_size = OPTS_GET(opts, excl_prog_hash_size, 0); log_opts = OPTS_GET(opts, log_opts, NULL); + if (!OPTS_VALID(log_opts, bpf_log_opts)) + return libbpf_err(-EINVAL); + if (log_opts && feat_supported(NULL, FEAT_BPF_SYSCALL_COMMON_ATTRS)) { memset(&attr_common, 0, attr_common_sz); attr_common.log_buf = ptr_to_u64(OPTS_GET(log_opts, buf, NULL)); -- cgit v1.2.3 From 652f0c2c999d28d820bbe2e1aa16d8e0fea369ea Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Mon, 18 May 2026 22:54:45 +0800 Subject: selftests/bpf: Use -1 as token_fd in map create failure test Because 0xFF can be an open BPF token fd in the test runner that will fail test_invalid_token_fd(), change token_fd from 0xFF to -1 to avoid such test failure. 
Fixes: f675483cac1d ("selftests/bpf: Add tests to verify map create failure log") Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260518145446.6794-5-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/map_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c index 5c61c8e37306..b0b902d5783d 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_init.c +++ b/tools/testing/selftests/bpf/prog_tests/map_init.c @@ -306,7 +306,7 @@ static void test_invalid_token_fd(void) const char *msg = "Invalid map_token_fd.\n"; LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_TOKEN_FD, - .token_fd = 0xFF, + .token_fd = -1, ); test_map_create_array(&opts, msg); -- cgit v1.2.3 From 7732ad2412fd402913976e490921f7e792a0a33b Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Mon, 18 May 2026 22:54:46 +0800 Subject: selftests/bpf: Add test to verify checking padding bytes for BPF syscall common attributes Add a test to verify that the 4 trailing padding bytes are checked in syscall.c::__sys_bpf() using bpf_check_uarg_tail_zero().
Without the fix, the test fails with: test_common_attr_padding:FAIL:syscall unexpected syscall: actual 4 >= expected 0 #213/12 map_create_failure/common_attr_padding:FAIL Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260518145446.6794-6-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/map_init.c | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c index b0b902d5783d..c804c3ce9be9 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_init.c +++ b/tools/testing/selftests/bpf/prog_tests/map_init.c @@ -353,6 +353,30 @@ static void test_excl_prog_hash_size_2(void) test_map_create_array(&opts, msg); } +static void test_common_attr_padding(void) +{ + struct bpf_common_attr_fake { + __u8 attrs[offsetofend(struct bpf_common_attr, log_true_size)]; + __u32 pad; + } attr_common = { + .pad = 1, + }; + union bpf_attr attr = { + .map_type = BPF_MAP_TYPE_ARRAY, + .key_size = 4, + .value_size = 4, + .max_entries = 1, + }; + int fd; + + fd = syscall(__NR_bpf, BPF_MAP_CREATE | BPF_COMMON_ATTRS, &attr, sizeof(attr), &attr_common, + sizeof(attr_common)); + if (!ASSERT_LT(fd, 0, "syscall")) + close(fd); + else + ASSERT_EQ(errno, E2BIG, "errno"); +} + void test_map_create_failure(void) { if (test__start_subtest("invalid_vmlinux_value_type_id_struct_ops")) @@ -377,4 +401,6 @@ void test_map_create_failure(void) test_excl_prog_hash_size_1(); if (test__start_subtest("invalid_excl_prog_hash_size_2")) test_excl_prog_hash_size_2(); + if (test__start_subtest("common_attr_padding")) + test_common_attr_padding(); } -- cgit v1.2.3 From 879daba303f7d7c3057f4d218921621e751f1912 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 13 May 2026 13:24:37 +0200 Subject: selftests/bpf: Override EXTRA_LDFLAGS for static builds When running vmtest.sh with static linking, the bpftool_map_access selftests fail. 
These selftests are calling the bpftool binary in tools/sbin/ directly, which results in the following error: error while loading shared libraries: libLLVM.so.21.1: cannot open shared object file: No such file or directory To fix this, we need to also build bpftool statically. That can be done by setting EXTRA_LDFLAGS=-static. Fixes: 2d96bbdfd3b5 ("selftests/bpf: convert test_bpftool_map_access.sh into test_progs framework") Signed-off-by: Paul Chaignon Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/714556da329c812988010ffe53173d9152570a78.1778669303.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 776fbe3cb8f9..37164322a102 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -77,7 +77,7 @@ In case of linker errors when running selftests, try using static linking: .. code-block:: console - $ LDLIBS=-static PKG_CONFIG='pkg-config --static' vmtest.sh + $ LDLIBS=-static EXTRA_LDFLAGS=-static PKG_CONFIG='pkg-config --static' vmtest.sh .. note:: Some distros may not support static linking. -- cgit v1.2.3 From 6df582112aa9ac9d190169abdb0e42e496659ec9 Mon Sep 17 00:00:00 2001 From: Roman Kvasnytskyi Date: Sat, 16 May 2026 14:06:25 +0200 Subject: selftests/bpf: Reject unsupported -k option in vmtest.sh vmtest.sh does not document a -k option and does not handle it in the getopts case statement. However, the getopts optstring includes k, which causes the script to accept -k silently instead of reporting it as an invalid option. Remove k from the optstring so unsupported options are rejected through the existing invalid-option path. 
Fixes: c9709f52386d ("bpf: Helper script for running BPF presubmit tests") Signed-off-by: Roman Kvasnytskyi Acked-by: Paul Chaignon Link: https://lore.kernel.org/r/20260516120625.80839-1-roman@kvasnytskyi.net Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/vmtest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 2f869daf8a06..9ca802285393 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -382,7 +382,7 @@ main() local exit_command="poweroff -f" local debug_shell="no" - while getopts ':hskl:id:j:' opt; do + while getopts ':hsl:id:j:' opt; do case ${opt} in l) LOCAL_ROOTFS_IMAGE="$OPTARG" -- cgit v1.2.3 From 523d2f42b406f5be2989f436b03eacebf3679835 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 18 May 2026 18:26:35 +0200 Subject: selftests/bpf: Fix test for refinement of single-value tnum This patch fixes the "bounds refinement with single-value tnum on umin" verifier selftest. This selftest was introduced in commit e6ad477d1bf8 ("selftests/bpf: Test refinement of single-value tnum") to cover the logic from __update_reg64_bounds(), introduced in commit efc11a667878 ("bpf: Improve bounds when tnum has a single possible value"). However, the test still passes if that last commit is reverted. The test is supposed to cover the case when the tnum and u64 range (or cnum64 now) overlap in a single value. __update_reg64_bounds() detects that case and refines the bounds to a known constant. However, the constants for the test were poorly chosen and the bounds get refined to a known constant even without __update_reg64_bounds(). 
The code is as follows: 0: call bpf_get_prandom_u32#7 ; R0=scalar() 1: r0 |= 224 ; R0=scalar(umin=umin32=224,var_off=(0xe0; 0xffffffffffffff1f)) 2: r0 &= 240 ; R0=scalar(smin=umin=smin32=umin32=224,smax=umax=smax32=umax32=240,var_off=(0xe0; 0x10)) 3: if r0 == 0xf0 goto pc+2 ; R0=224 After instruction 3, we have u64=[0xe0; 0xef] and tnum=(0xe0; 0x10). __reg_bound_offset() is able to deduce a new tnum from the u64, tnum=(0xe0; 0x0f), which combined with the existing tnum gives us a constant: 0xe0 or 224. We can easily fix this by choosing different starting bounds. If we make it u64=[0xe1; 0xf0], then __reg_bound_offset() doesn't have any impact. Fixes: e6ad477d1bf8 ("selftests/bpf: Test refinement of single-value tnum") Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/be2dc2c3d85120286e60b3029b3338fff339f942.1779121582.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_bounds.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index a3e4c0945137..bc038ac2df98 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1892,25 +1892,25 @@ __naked void bounds_refinement_tnum_umax(void *ctx) /* This test covers the bounds deduction when the u64 range and the tnum * overlap only at umin. After instruction 3, the ranges look as follows: * - * 0 umin=0xe00 umax=0xeff U64_MAX + * 0 umin=0xe1 umax=0xf0 U64_MAX * | [xxxxxxxxxxxxxx] | * |----------------------------|------------------------------| * | x x | tnum values * - * The verifier can therefore deduce that the R0=0xe0=224. + * The verifier can therefore deduce that the R0=0xe1=225. 
*/ SEC("socket") __description("bounds refinement with single-value tnum on umin") -__msg("3: (15) if r0 == 0xf0 {{.*}} R0=224") +__msg("3: (15) if r0 == 0xf1 {{.*}} R0=225") __success __log_level(2) __naked void bounds_refinement_tnum_umin(void *ctx) { asm volatile(" \ call %[bpf_get_prandom_u32]; \ - r0 |= 0xe0; \ - r0 &= 0xf0; \ - if r0 == 0xf0 goto +2; \ - if r0 == 0xe0 goto +1; \ + r0 |= 0xe1; \ + r0 &= 0xf1; \ + if r0 == 0xf1 goto +2; \ + if r0 == 0xe1 goto +1; \ r10 = 0; \ exit; \ " : -- cgit v1.2.3 From fa747e9f843ba3a0fa4d3fabaf50c9e11aaf963f Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 20 May 2026 06:33:30 -0700 Subject: selftests/bpf: Fix cold_lru producing zero batch_hash in XDP LB benchmark batch_hash = (batch_gen ^ cpu_id) * KNUTH_HASH_MULT; When batch_gen == cpu_id the XOR produces zero, batch_hash is zero, and *saddr ^= 0 is a no-op. Every iteration hits the warm LRU entry. During validation batch_gen is 2, so running on CPU 2 triggers: [udp-v4-lru-miss] COUNTER FAIL: LRU misses=0, expected 1 Replace XOR with addition so the multiplier input is always >= 1. This also preserves the per-CPU salt for multi-producer runs. 
Fixes: 4b4f2229104c ("selftests/bpf: Add XDP load-balancer BPF program") Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260520133338.3392667-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/xdp_lb_bench.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/xdp_lb_bench.c b/tools/testing/selftests/bpf/progs/xdp_lb_bench.c index b9fd848c035d..13777b3dcac8 100644 --- a/tools/testing/selftests/bpf/progs/xdp_lb_bench.c +++ b/tools/testing/selftests/bpf/progs/xdp_lb_bench.c @@ -618,7 +618,7 @@ int xdp_lb_bench(struct xdp_md *xdp) __u32 *saddr = data + saddr_off; batch_gen++; - batch_hash = (batch_gen ^ bpf_get_smp_processor_id()) * KNUTH_HASH_MULT; + batch_hash = (batch_gen + bpf_get_smp_processor_id()) * KNUTH_HASH_MULT; if ((void *)(saddr + 1) <= data_end) *saddr ^= batch_hash; } -- cgit v1.2.3 From 12e896b9794bbd88f56aeac2a5807ae8d4bb5ad8 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 20 May 2026 06:33:31 -0700 Subject: selftests/bpf: Fix expired UDP LRU entries in XDP LB benchmark populate_lru() zero-initializes atime: struct real_pos_lru lru = { .pos = real_idx }; connection_table_lookup() treats UDP entries with cur_time - atime > 30s as expired, so every pre-populated entry expires immediately. Calibration masks this on the CPU it runs on, but if validation migrates to another CPU: [udp-v4-lru-hit] COUNTER FAIL: LRU misses=1, expected 0 Initialize atime from CLOCK_MONOTONIC for UDP flows. 
Fixes: a4b5ba8187cb ("selftests/bpf: Add XDP load-balancer benchmark driver") Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260520133338.3392667-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/benchs/bench_xdp_lb.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c b/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c index 0b6709a2b03c..8e25bccbde92 100644 --- a/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c +++ b/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c @@ -563,12 +563,23 @@ static void create_per_cpu_lru_maps(struct xdp_lb_bench *skel) nr_inner_maps = nr_cpus; } +static __u64 ktime_get_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return (__u64)ts.tv_sec * 1000000000ULL + ts.tv_nsec; +} + static void populate_lru(const struct test_scenario *sc, __u32 real_idx) { struct real_pos_lru lru = { .pos = real_idx }; struct flow_key fk; int i, err; + if (sc->ip_proto == IPPROTO_UDP) + lru.atime = ktime_get_ns(); + build_flow_key(&fk, sc); /* Insert into every per-CPU inner LRU so the entry is found -- cgit v1.2.3 From abac8acb633a9448369d658889ac2bcfbd96f54b Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 20 May 2026 06:33:32 -0700 Subject: selftests/bpf: Filter timing outliers with IQR in batch-timing library System noise (timer interrupts, scheduling) can inflate the reported stddev. tcp-v4-syn showed stddev 37.86 ns without filtering vs 0.16 ns with filtering on the same run data. Filter samples outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] before computing statistics. Scenarios with genuinely wide distributions have large IQR so the fences stay wide and the filter has minimal effect. 
Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260520133338.3392667-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/benchs/bench_bpf_timing.c | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c b/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c index 75a39da69655..e02ad324f7bc 100644 --- a/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c @@ -65,6 +65,31 @@ static int collect_samples(struct bpf_bench_timing *t, return total; } +static int filter_outliers_iqr(double *sorted, int n) +{ + double q1, q3, iqr, lo, hi; + int start = 0, end = n; + + if (n < 8) + return n; + + q1 = sorted[n / 4]; + q3 = sorted[3 * n / 4]; + iqr = q3 - q1; + lo = q1 - 1.5 * iqr; + hi = q3 + 1.5 * iqr; + + while (start < end && sorted[start] < lo) + start++; + while (end > start && sorted[end - 1] > hi) + end--; + + if (start > 0) + memmove(sorted, sorted + start, (end - start) * sizeof(double)); + + return end - start; +} + static void compute_stats(const double *sorted, int n, struct timing_stats *s) { @@ -150,6 +175,7 @@ void bpf_bench_timing_report(struct bpf_bench_timing *t, const char *name, const return; } + total = filter_outliers_iqr(all, total); compute_stats(all, total, &s); if (t->machine_readable) { -- cgit v1.2.3 From cb339ac61d72f7fb7f57bfc0516b7b2b65bc1bad Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:22:59 +0800 Subject: bpf: refactor __bpf_list_del to take list node pointer Refactor __bpf_list_del to accept (head, struct list_head *n) instead of (head, bool tail). The caller now passes the specific node to remove: bpf_list_pop_front passes h->next, bpf_list_pop_back passes h->prev. Prepares for introducing bpf_list_del(head, node) kfunc to remove an arbitrary node when the user holds ownership. 
Signed-off-by: Kaitao Cheng Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260521032306.97118-2-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 07de26e7314c..094457c3e6d3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2550,37 +2550,44 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, return bpf_list_push_back(head, node, meta__ign, off); } -static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) +static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, + struct list_head *n) { - struct list_head *n, *h = (void *)head; + struct list_head *h = (void *)head; struct bpf_list_node_kern *node; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ - if (unlikely(!h->next)) + if (unlikely(!h->next)) { INIT_LIST_HEAD(h); + return NULL; + } if (list_empty(h)) return NULL; - n = tail ? h->prev : h->next; node = container_of(n, struct bpf_list_node_kern, list_head); - if (WARN_ON_ONCE(READ_ONCE(node->owner) != head)) + if (unlikely(READ_ONCE(node->owner) != head)) return NULL; list_del_init(n); - WRITE_ONCE(node->owner, NULL); + /* Ensure __bpf_list_add() sees the node as unlinked. 
*/ + smp_store_release(&node->owner, NULL); return (struct bpf_list_node *)n; } __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) { - return __bpf_list_del(head, false); + struct list_head *h = (void *)head; + + return __bpf_list_del(head, h->next); } __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) { - return __bpf_list_del(head, true); + struct list_head *h = (void *)head; + + return __bpf_list_del(head, h->prev); } __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) -- cgit v1.2.3 From cfa6afa4b931aed08288454943e5077f114fd7f3 Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:00 +0800 Subject: bpf: clear list node owner and unlink before drop The issue only becomes exposed once bpf_list_del() is available: callers can pass an arbitrary bpf_list_head and bpf_list_node pair, including nodes that are not actually linked to the supplied head, or nodes that outlive their original head after refcount-based retention. This was not practically reachable for callers restricted to pop-style helpers alone; bpf_list_del() widens the API surface. A failure mode appears when bpf_list_head_free() runs while a program still holds an independent refcount on a node (for example via bpf_refcount_acquire()). The list head value embedded in map memory can go away while the node object survives. If node->owner is left pointing at the old head address until drop completes, that pointer becomes stale. If a new bpf_list_head is later allocated at the same address and the stale node is passed to bpf_list_del(), the owner comparison can succeed even though the node is not really linked to the new head, and list_del_init() will follow bogus next/prev pointers with the risk of memory corruption. 
When draining a bpf_list_head, mark each node owner with BPF_PTR_POISON under the map spinlock while moving it to a private drain list, then list_del_init() the node and clear owner to NULL before calling __bpf_obj_drop_impl(). Concurrent readers therefore never observe a node that appears linked to a head while its list_head is inconsistent, and surviving refcounted nodes never retain a stale non-NULL owner. Signed-off-by: Kaitao Cheng Link: https://lore.kernel.org/r/20260521032306.97118-3-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 094457c3e6d3..59855b434f0b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2247,10 +2247,11 @@ EXPORT_SYMBOL_GPL(bpf_base_func_proto); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { - struct list_head *head = list_head, *orig_head = list_head; + struct list_head *head = list_head, drain, *pos, *n; BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); + INIT_LIST_HEAD(&drain); /* Do the actual list draining outside the lock to not hold the lock for * too long, and also prevent deadlocks if tracing programs end up @@ -2261,20 +2262,30 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head, __bpf_spin_lock_irqsave(spin_lock); if (!head->next || list_empty(head)) goto unlock; - head = head->next; + list_for_each_safe(pos, n, head) { + struct bpf_list_node_kern *node; + + node = container_of(pos, struct bpf_list_node_kern, list_head); + WRITE_ONCE(node->owner, BPF_PTR_POISON); + list_move_tail(pos, &drain); + } unlock: - INIT_LIST_HEAD(orig_head); + INIT_LIST_HEAD(head); __bpf_spin_unlock_irqrestore(spin_lock); - while (head != orig_head) { - void *obj = head; + while 
(!list_empty(&drain)) { + struct bpf_list_node_kern *node; - obj -= field->graph_root.node_offset; - head = head->next; + pos = drain.next; + node = container_of(pos, struct bpf_list_node_kern, list_head); + list_del_init(pos); + /* Ensure __bpf_list_add() sees the node as unlinked. */ + smp_store_release(&node->owner, NULL); /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. */ - __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); + __bpf_obj_drop_impl((char *)pos - field->graph_root.node_offset, + field->graph_root.value_rec, false); } } -- cgit v1.2.3 From 7c8c71591b768284caa81e92cb47a6912952d3f2 Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:01 +0800 Subject: bpf: allow non-owning list-node args via __nonown_allowed KF_ARG_PTR_TO_LIST_NODE normally requires an owning reference (PTR_TO_BTF_ID | MEM_ALLOC with ref_obj_id). Introduce the __nonown_allowed annotation on selected list-node arguments so non-owning references with ref_obj_id==0 are accepted as well. This patch only adds the generic verifier support and documents the annotation. Later patches in the series will apply it to bpf_list_add /del(), and bpf_list_is_first/last(), allowing bpf_list_front/back() results to be used as the insertion point, deletion target, or query target for those kfuncs. Verifier keeps existing owning-ref checks by default; only arguments annotated with __nonown_allowed bypass MEM_ALLOC/ref_obj_id checks and then follow the same list-node validation path. 
Signed-off-by: Kaitao Cheng Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260521032306.97118-4-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 22 ++++++++++++++++++++-- kernel/bpf/verifier.c | 13 +++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 75e6c078e0e7..3a9db1108b95 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -207,8 +207,26 @@ Here, the buffer may be NULL. If the buffer is not NULL, it must be at least buffer__szk bytes in size. The kfunc is responsible for checking if the buffer is NULL before using it. -2.3.5 __str Annotation ----------------------------- +2.3.5 __nonown_allowed Annotation +--------------------------------- + +This annotation is used to indicate that the parameter may be a non-owning reference. + +An example is given below:: + + __bpf_kfunc int bpf_list_add(..., struct bpf_list_node + *prev__nonown_allowed, ...) + { + ... + } + +For the ``prev__nonown_allowed`` parameter (resolved as ``KF_ARG_PTR_TO_LIST_NODE``), +suffix ``__nonown_allowed`` retains the usual owning-pointer rules and also +permits a non-owning reference with no ref_obj_id (e.g. the return value of +bpf_list_front() / bpf_list_back()). + +2.3.6 __str Annotation +---------------------- This annotation is used to indicate that the argument is a constant string. 
An example is given below:: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8dd79b735a69..f3cf8d85bea0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10714,6 +10714,11 @@ static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__nullable"); } +static bool is_kfunc_arg_nonown_allowed(const struct btf *btf, const struct btf_param *arg) +{ + return btf_param_match_suffix(btf, arg, "__nonown_allowed"); +} + static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__str"); @@ -12244,6 +12249,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_LIST_NODE: + if (is_kfunc_arg_nonown_allowed(btf, &args[i]) && + type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + /* Allow bpf_list_front/back return value for + * __nonown_allowed list-node arguments. + */ + goto check_ok; + } if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "%s expected pointer to allocated object\n", reg_arg_name(env, argno)); @@ -12253,6 +12265,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "allocated object must be referenced\n"); return -EINVAL; } +check_ok: ret = process_kf_arg_ptr_to_list_node(env, reg, argno, meta); if (ret < 0) return ret; -- cgit v1.2.3 From 187baa10963ac9f1db5123fa2ab761ab34ea06b9 Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:02 +0800 Subject: bpf: Introduce the bpf_list_del kfunc. Allow users to remove any node from a linked list. We have added an additional parameter bpf_list_head *head to bpf_list_del, as the verifier requires the head parameter to check whether the lock is being held. 
Signed-off-by: Kaitao Cheng Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260521032306.97118-5-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 10 ++++++++++ kernel/bpf/verifier.c | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 59855b434f0b..804c201c28f3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2601,6 +2601,15 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) return __bpf_list_del(head, h->prev); } +__bpf_kfunc struct bpf_list_node *bpf_list_del(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct bpf_list_node_kern *kn = (void *)node__nonown_allowed; + + /* verifier guarantees node is a list node rather than list head */ + return __bpf_list_del(head, &kn->list_head); +} + __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) { struct list_head *h = (struct list_head *)head; @@ -4733,6 +4742,7 @@ BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3cf8d85bea0..35eebb5e7769 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10961,6 +10961,7 @@ enum special_kfunc_type { KF_bpf_list_push_back, KF_bpf_list_pop_front, KF_bpf_list_pop_back, + KF_bpf_list_del, KF_bpf_list_front, KF_bpf_list_back, KF_bpf_cast_to_kern_ctx, @@ -11029,6 +11030,7 @@ BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_push_back) BTF_ID(func, bpf_list_pop_front) 
BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_del) BTF_ID(func, bpf_list_front) BTF_ID(func, bpf_list_back) BTF_ID(func, bpf_cast_to_kern_ctx) @@ -11549,6 +11551,7 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) return is_bpf_list_push_kfunc(btf_id) || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || btf_id == special_kfunc_list[KF_bpf_list_pop_back] || + btf_id == special_kfunc_list[KF_bpf_list_del] || btf_id == special_kfunc_list[KF_bpf_list_front] || btf_id == special_kfunc_list[KF_bpf_list_back]; } @@ -11671,7 +11674,8 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, switch (node_field_type) { case BPF_LIST_NODE: - ret = is_bpf_list_push_kfunc(kfunc_btf_id); + ret = is_bpf_list_push_kfunc(kfunc_btf_id) || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_del]; break; case BPF_RB_NODE: ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) || -- cgit v1.2.3 From e6919ff67c1e612ec1ce3be51dba6e7ffc47997a Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:03 +0800 Subject: bpf: refactor __bpf_list_add to take insertion point via **prev_ptr Refactor __bpf_list_add to accept (node, head, struct list_head **prev_ptr, ..) instead of (node, head, bool tail, ..). Load prev from *prev_ptr after INIT_LIST_HEAD(h), so we never dereference an uninitialized h->prev when head was 0-initialized (e.g. push_back passes &h->prev). When prev is not the list head, validate that prev is in the list via its owner. Prepares for bpf_list_add(head, new, prev, ..) to insert after a given list node. 
Signed-off-by: Kaitao Cheng Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260521032306.97118-6-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 804c201c28f3..1c69476c8a09 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2478,9 +2478,11 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta static int __bpf_list_add(struct bpf_list_node_kern *node, struct bpf_list_head *head, - bool tail, struct btf_record *rec, u64 off) + struct list_head **prev_ptr, + struct btf_record *rec, u64 off) { struct list_head *n = &node->list_head, *h = (void *)head; + struct list_head *prev; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here @@ -2488,19 +2490,31 @@ static int __bpf_list_add(struct bpf_list_node_kern *node, if (unlikely(!h->next)) INIT_LIST_HEAD(h); + prev = *prev_ptr; + + /* When prev is not the list head, it must be a node in this list. */ + if (prev != h) { + struct bpf_list_node_kern *prev_kn = + container_of(prev, struct bpf_list_node_kern, list_head); + + if (unlikely(READ_ONCE(prev_kn->owner) != head)) + goto fail; + } + /* node->owner != NULL implies !list_empty(n), no need to separately * check the latter */ - if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { - /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec, false); - return -EINVAL; - } + if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) + goto fail; - tail ? 
list_add_tail(n, h) : list_add(n, h); + list_add(n, prev); WRITE_ONCE(node->owner, head); - return 0; + +fail: + /* Only called from BPF prog, no need to migrate_disable */ + __bpf_obj_drop_impl((void *)n - off, rec, false); + return -EINVAL; } /** @@ -2521,8 +2535,9 @@ __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head, u64 off) { struct bpf_list_node_kern *n = (void *)node; + struct list_head *h = (void *)head; - return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, &h, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, @@ -2550,8 +2565,9 @@ __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head, u64 off) { struct bpf_list_node_kern *n = (void *)node; + struct list_head *h = (void *)head; - return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, &h->prev, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, -- cgit v1.2.3 From a3493ca504f16877bf29a123f27835c3f841a05f Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:04 +0800 Subject: bpf: Add bpf_list_add to insert node after a given list node Add a new kfunc bpf_list_add(head, new, prev, meta, off) that inserts 'new' after 'prev' in the BPF linked list. Both must be in the same list; 'prev' must already be in the list. The new node must be an owning reference (e.g. from bpf_obj_new); the kfunc consumes that reference and the node becomes non-owning once inserted. We have added an additional parameter bpf_list_head *head to bpf_list_add, as the verifier requires the head parameter to check whether the lock is being held. Returns 0 on success, -EINVAL if 'prev' is not in a list or 'new' is already in a list (or duplicate insertion). On failure, the kernel drops the passed-in node. 
Signed-off-by: Kaitao Cheng Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260521032306.97118-7-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 11 +++++++++++ kernel/bpf/verifier.c | 12 +++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1c69476c8a09..89579165ef4d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2577,6 +2577,16 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, return bpf_list_push_back(head, node, meta__ign, off); } +__bpf_kfunc int bpf_list_add(struct bpf_list_head *head, struct bpf_list_node *new, + struct bpf_list_node *prev__nonown_allowed, + struct btf_struct_meta *meta, u64 off) +{ + struct bpf_list_node_kern *n = (void *)new, *p = (void *)prev__nonown_allowed; + struct list_head *prev_ptr = &p->list_head; + + return __bpf_list_add(n, head, &prev_ptr, meta ? meta->record : NULL, off); +} + static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, struct list_head *n) { @@ -4756,6 +4766,7 @@ BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_back_impl) +BTF_ID_FLAGS(func, bpf_list_add, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 35eebb5e7769..662ad7312697 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10959,6 +10959,7 @@ enum special_kfunc_type { KF_bpf_list_push_front, KF_bpf_list_push_back_impl, KF_bpf_list_push_back, + KF_bpf_list_add, KF_bpf_list_pop_front, KF_bpf_list_pop_back, KF_bpf_list_del, @@ -11028,6 +11029,7 @@ BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, 
bpf_list_push_front) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_add) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) BTF_ID(func, bpf_list_del) @@ -11140,7 +11142,8 @@ static bool is_bpf_list_push_kfunc(u32 func_id) return func_id == special_kfunc_list[KF_bpf_list_push_front] || func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || func_id == special_kfunc_list[KF_bpf_list_push_back] || - func_id == special_kfunc_list[KF_bpf_list_push_back_impl]; + func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || + func_id == special_kfunc_list[KF_bpf_list_add]; } static bool is_bpf_rbtree_add_kfunc(u32 func_id) @@ -19524,8 +19527,11 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int struct_meta_reg = BPF_REG_3; int node_offset_reg = BPF_REG_4; - /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */ - if (is_bpf_rbtree_add_kfunc(desc->func_id)) { + /* list_add/rbtree_add have an extra arg (prev/less), + * so args-to-fixup are in diff regs. + */ + if (desc->func_id == special_kfunc_list[KF_bpf_list_add] || + is_bpf_rbtree_add_kfunc(desc->func_id)) { struct_meta_reg = BPF_REG_4; node_offset_reg = BPF_REG_5; } -- cgit v1.2.3 From 745515d386eb5e6891d9f91a92ad15dace3a33ef Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:05 +0800 Subject: bpf: add bpf_list_is_first/last/empty kfuncs Add three kfuncs for BPF linked list queries: - bpf_list_is_first(head, node): true if node is the first in the list. - bpf_list_is_last(head, node): true if node is the last in the list. - bpf_list_empty(head): true if the list has no entries. Currently, without these kfuncs, to implement the above functionality it is necessary to first call bpf_list_pop_front/back to retrieve the first or last node before checking whether the passed-in node was the first or last one. 
After the check, the node had to be pushed back into the list using bpf_list_push_front/back, which was very inefficient. Now, with the bpf_list_is_first/last/empty kfuncs, we can directly check whether a node is the first, last, or whether the list is empty, without having to first retrieve the node. Signed-off-by: Kaitao Cheng Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260521032306.97118-8-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 15 +++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 89579165ef4d..b6c3d02d5593 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2656,6 +2656,43 @@ __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) return (struct bpf_list_node *)h->prev; } +__bpf_kfunc bool bpf_list_is_first(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct list_head *h = (struct list_head *)head; + struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed; + + if (READ_ONCE(kn->owner) != head) + return false; + + return list_is_first(&kn->list_head, h); +} + +__bpf_kfunc bool bpf_list_is_last(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct list_head *h = (struct list_head *)head; + struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed; + + if (READ_ONCE(kn->owner) != head) + return false; + + return list_is_last(&kn->list_head, h); +} + +__bpf_kfunc bool bpf_list_empty(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't + * called on its fields, so init here + */ + if (unlikely(!h->next)) + INIT_LIST_HEAD(h); + + return list_empty(h); +} + __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct 
bpf_rb_root *root, struct bpf_rb_node *node) { @@ -4772,6 +4809,9 @@ BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_is_first) +BTF_ID_FLAGS(func, bpf_list_is_last) +BTF_ID_FLAGS(func, bpf_list_empty) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 662ad7312697..d9bdc3b32c05 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10965,6 +10965,9 @@ enum special_kfunc_type { KF_bpf_list_del, KF_bpf_list_front, KF_bpf_list_back, + KF_bpf_list_is_first, + KF_bpf_list_is_last, + KF_bpf_list_empty, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, @@ -11035,6 +11038,9 @@ BTF_ID(func, bpf_list_pop_back) BTF_ID(func, bpf_list_del) BTF_ID(func, bpf_list_front) BTF_ID(func, bpf_list_back) +BTF_ID(func, bpf_list_is_first) +BTF_ID(func, bpf_list_is_last) +BTF_ID(func, bpf_list_empty) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) @@ -11556,7 +11562,10 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_list_pop_back] || btf_id == special_kfunc_list[KF_bpf_list_del] || btf_id == special_kfunc_list[KF_bpf_list_front] || - btf_id == special_kfunc_list[KF_bpf_list_back]; + btf_id == special_kfunc_list[KF_bpf_list_back] || + btf_id == special_kfunc_list[KF_bpf_list_is_first] || + btf_id == special_kfunc_list[KF_bpf_list_is_last] || + btf_id == special_kfunc_list[KF_bpf_list_empty]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) @@ -11678,7 +11687,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, switch (node_field_type) { case BPF_LIST_NODE: ret = 
is_bpf_list_push_kfunc(kfunc_btf_id) || - kfunc_btf_id == special_kfunc_list[KF_bpf_list_del]; + kfunc_btf_id == special_kfunc_list[KF_bpf_list_del] || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_first] || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_last]; break; case BPF_RB_NODE: ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) || -- cgit v1.2.3 From ba3dc064f4065471487a8cc93c47efda4fe358dd Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:06 +0800 Subject: selftests/bpf: Add test cases for bpf_list_del/add/is_first/is_last/empty Extend refcounted_kptr with tests for bpf_list_add (including prev from bpf_list_front and bpf_refcount_acquire), bpf_list_del (including node from bpf_list_front, bpf_rbtree_remove and bpf_refcount_acquire), bpf_list_empty, bpf_list_is_first/last, and push_back on uninit head. To verify the validity of bpf_list_del/add, the test also expects the verifier to reject calls to bpf_list_del/add made without holding the spin_lock. 
Signed-off-by: Kaitao Cheng Link: https://lore.kernel.org/r/20260521032306.97118-9-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/refcounted_kptr.c | 421 +++++++++++++++++++++ 1 file changed, 421 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c index c847398837cc..13de169ad68f 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c @@ -367,6 +367,427 @@ long insert_rbtree_and_stash__del_tree_##rem_tree(void *ctx) \ INSERT_STASH_READ(true, "insert_stash_read: remove from tree"); INSERT_STASH_READ(false, "insert_stash_read: don't remove from tree"); +SEC("tc") +__description("list_empty_test: list empty before add, non-empty after add") +__success __retval(0) +int list_empty_test(void *ctx) +{ + struct node_data *node_new; + + bpf_spin_lock(&lock); + if (!bpf_list_empty(&head)) { + bpf_spin_unlock(&lock); + return -1; + } + bpf_spin_unlock(&lock); + + node_new = bpf_obj_new(typeof(*node_new)); + if (!node_new) + return -2; + + bpf_spin_lock(&lock); + bpf_list_push_front(&head, &node_new->l); + + if (bpf_list_empty(&head)) { + bpf_spin_unlock(&lock); + return -3; + } + bpf_spin_unlock(&lock); + return 0; +} + +static struct node_data *__add_in_list(struct bpf_list_head *head, + struct bpf_spin_lock *lock) +{ + struct node_data *node_new, *node_ref; + + node_new = bpf_obj_new(typeof(*node_new)); + if (!node_new) + return NULL; + + node_ref = bpf_refcount_acquire(node_new); + + bpf_spin_lock(lock); + bpf_list_push_front(head, &node_new->l); + bpf_spin_unlock(lock); + return node_ref; +} + +SEC("tc") +__description("list_is_edge_test1: is_first on first node, is_last on last node") +__success __retval(0) +int list_is_edge_test1(void *ctx) +{ + struct node_data *node_first, *node_last; + int err = 0; + + node_last = __add_in_list(&head, &lock); + if (!node_last) + return -1; + + 
node_first = __add_in_list(&head, &lock); + if (!node_first) { + bpf_obj_drop(node_last); + return -2; + } + + bpf_spin_lock(&lock); + if (!bpf_list_is_first(&head, &node_first->l)) { + err = -3; + goto fail; + } + if (!bpf_list_is_last(&head, &node_last->l)) + err = -4; + +fail: + bpf_spin_unlock(&lock); + bpf_obj_drop(node_first); + bpf_obj_drop(node_last); + return err; +} + +SEC("tc") +__description("list_is_edge_test2: accept list_front/list_back return value") +__success __retval(0) +int list_is_edge_test2(void *ctx) +{ + struct bpf_list_node *front, *back; + struct node_data *a, *b; + long err = 0; + + a = __add_in_list(&head, &lock); + if (!a) + return -1; + + b = __add_in_list(&head, &lock); + if (!b) { + bpf_obj_drop(a); + return -2; + } + + bpf_spin_lock(&lock); + front = bpf_list_front(&head); + back = bpf_list_back(&head); + if (!front || !back) { + err = -3; + goto out_unlock; + } + + if (!bpf_list_is_first(&head, front) || bpf_list_is_last(&head, front)) { + err = -4; + goto out_unlock; + } + + if (!bpf_list_is_last(&head, back) || bpf_list_is_first(&head, back)) { + err = -5; + goto out_unlock; + } + +out_unlock: + bpf_spin_unlock(&lock); + bpf_obj_drop(a); + bpf_obj_drop(b); + return err; +} + +SEC("tc") +__description("list_is_edge_test3: single node is both first and last") +__success __retval(0) +int list_is_edge_test3(void *ctx) +{ + struct node_data *tmp; + struct bpf_list_node *node; + long err = 0; + + tmp = __add_in_list(&head, &lock); + if (!tmp) + return -1; + + bpf_spin_lock(&lock); + node = bpf_list_front(&head); + if (!node) { + bpf_spin_unlock(&lock); + bpf_obj_drop(tmp); + return -2; + } + + if (!bpf_list_is_first(&head, node) || !bpf_list_is_last(&head, node)) + err = -3; + bpf_spin_unlock(&lock); + + bpf_obj_drop(tmp); + return err; +} + +SEC("tc") +__description("list_del_test1: del returns removed nodes") +__success __retval(0) +int list_del_test1(void *ctx) +{ + struct node_data *node_first, *node_last; + struct bpf_list_node 
*bpf_node_first, *bpf_node_last; + int err = 0; + + node_last = __add_in_list(&head, &lock); + if (!node_last) + return -1; + + node_first = __add_in_list(&head, &lock); + if (!node_first) { + bpf_obj_drop(node_last); + return -2; + } + + bpf_spin_lock(&lock); + bpf_node_last = bpf_list_del(&head, &node_last->l); + bpf_node_first = bpf_list_del(&head, &node_first->l); + bpf_spin_unlock(&lock); + + if (bpf_node_first) + bpf_obj_drop(container_of(bpf_node_first, struct node_data, l)); + else + err = -3; + + if (bpf_node_last) + bpf_obj_drop(container_of(bpf_node_last, struct node_data, l)); + else + err = -4; + + bpf_obj_drop(node_first); + bpf_obj_drop(node_last); + return err; +} + +SEC("tc") +__description("list_del_test2: remove an arbitrary node from the list") +__success __retval(0) +int list_del_test2(void *ctx) +{ + struct bpf_rb_node *rb; + struct bpf_list_node *l; + struct node_data *n; + long err; + + err = __insert_in_tree_and_list(&head, &root, &lock); + if (err) + return err; + + bpf_spin_lock(&lock); + rb = bpf_rbtree_first(&root); + if (!rb) { + bpf_spin_unlock(&lock); + return -4; + } + + rb = bpf_rbtree_remove(&root, rb); + if (!rb) { + bpf_spin_unlock(&lock); + return -5; + } + + n = container_of(rb, struct node_data, r); + l = bpf_list_del(&head, &n->l); + bpf_spin_unlock(&lock); + bpf_obj_drop(n); + if (!l) + return -6; + + bpf_obj_drop(container_of(l, struct node_data, l)); + return 0; +} + +SEC("tc") +__description("list_del_test3: list_del accepts list_front return value as node") +__success __retval(0) +int list_del_test3(void *ctx) +{ + struct node_data *tmp; + struct bpf_list_node *bpf_node, *l; + long err = 0; + + tmp = __add_in_list(&head, &lock); + if (!tmp) + return -1; + + bpf_spin_lock(&lock); + bpf_node = bpf_list_front(&head); + if (!bpf_node) { + bpf_spin_unlock(&lock); + err = -2; + goto fail; + } + + l = bpf_list_del(&head, bpf_node); + bpf_spin_unlock(&lock); + if (!l) { + err = -3; + goto fail; + } + + 
bpf_obj_drop(container_of(l, struct node_data, l)); + bpf_obj_drop(tmp); + return 0; + +fail: + bpf_obj_drop(tmp); + return err; +} + +SEC("tc") +__description("list_add_test1: insert new node after prev") +__success __retval(0) +int list_add_test1(void *ctx) +{ + struct node_data *node_first; + struct node_data *new_node; + long err = 0; + + node_first = __add_in_list(&head, &lock); + if (!node_first) + return -1; + + new_node = bpf_obj_new(typeof(*new_node)); + if (!new_node) { + err = -2; + goto fail; + } + + bpf_spin_lock(&lock); + err = bpf_list_add(&head, &new_node->l, &node_first->l); + bpf_spin_unlock(&lock); + if (err) { + err = -3; + goto fail; + } + +fail: + bpf_obj_drop(node_first); + return err; +} + +SEC("tc") +__description("list_add_test2: list_add accepts list_front return value as prev") +__success __retval(0) +int list_add_test2(void *ctx) +{ + struct node_data *new_node, *tmp; + struct bpf_list_node *bpf_node; + long err = 0; + + tmp = __add_in_list(&head, &lock); + if (!tmp) + return -1; + + new_node = bpf_obj_new(typeof(*new_node)); + if (!new_node) { + err = -2; + goto fail; + } + + bpf_spin_lock(&lock); + bpf_node = bpf_list_front(&head); + if (!bpf_node) { + bpf_spin_unlock(&lock); + bpf_obj_drop(new_node); + err = -3; + goto fail; + } + + err = bpf_list_add(&head, &new_node->l, bpf_node); + bpf_spin_unlock(&lock); + if (err) { + err = -4; + goto fail; + } + +fail: + bpf_obj_drop(tmp); + return err; +} + +struct uninit_head_val { + struct bpf_spin_lock lock; + struct bpf_list_head head __contains(node_data, l); +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct uninit_head_val); + __uint(max_entries, 1); +} uninit_head_map SEC(".maps"); + +SEC("tc") +__description("list_push_back_uninit_head: push_back on 0-initialized list head") +__success __retval(0) +int list_push_back_uninit_head(void *ctx) +{ + struct uninit_head_val *st; + struct node_data *node; + int ret = -1, key = 0; + + st = 
bpf_map_lookup_elem(&uninit_head_map, &key); + if (!st) + return -1; + + node = bpf_obj_new(typeof(*node)); + if (!node) + return -1; + + bpf_spin_lock(&st->lock); + ret = bpf_list_push_back(&st->head, &node->l); + bpf_spin_unlock(&st->lock); + + return ret; +} + +SEC("?tc") +__failure __msg("bpf_spin_lock at off=32 must be held for bpf_list_head") +long list_del_without_lock_fail(void *ctx) +{ + struct node_data *n; + struct bpf_list_node *l; + + n = bpf_obj_new(typeof(*n)); + if (!n) + return -1; + + /* Error case: delete list node without holding lock */ + l = bpf_list_del(&head, &n->l); + bpf_obj_drop(n); + if (!l) + return -2; + bpf_obj_drop(container_of(l, struct node_data, l)); + + return 0; +} + +SEC("?tc") +__failure __msg("bpf_spin_lock at off=32 must be held for bpf_list_head") +long list_add_without_lock_fail(void *ctx) +{ + struct node_data *n, *prev; + long err; + + n = bpf_obj_new(typeof(*n)); + if (!n) + return -1; + + prev = bpf_obj_new(typeof(*prev)); + if (!prev) { + bpf_obj_drop(n); + return -1; + } + + /* Error case: add list node without holding lock */ + err = bpf_list_add(&head, &n->l, &prev->l); + bpf_obj_drop(prev); + if (err) + return -2; + + return 0; +} + SEC("tc") __success long rbtree_refcounted_node_ref_escapes(void *ctx) -- cgit v1.2.3 From 258df8fce42fecc23cd04242de3d39f1fe836433 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2026 07:22:12 -1000 Subject: mm: Add ptep_try_set() for lockless empty-slot installs Add ptep_try_set(ptep, new_pte): atomically set *ptep to new_pte iff it is currently pte_none(). Returns true on success, false if the slot was already populated or the arch has no implementation. The intended caller is the upcoming bpf_arena kernel-side fault recovery path. The install runs from a page fault that can be nested under locks held by the faulting kernel caller (e.g. a BPF program holding raw_res_spin_lock_irqsave on its arena's spinlock), so trylock-and-retry would A-A deadlock. 
Lock-free cmpxchg is the only viable option, which constrains this helper to special kernel page tables where concurrent writers cooperate via atomic accessors. The generic version in <linux/pgtable.h> returns false. x86 and arm64 override with try_cmpxchg-based implementations on the underlying pteval. Other architectures get the false stub - the callers there already fall through to oops. v2: Rename to ptep_try_set(). Tighten kerneldoc. (David, Alexei) v3: Note that strict-zero cmpxchg is narrower than pte_none(). (Andrea) Suggested-by: Kumar Kartikeya Dwivedi Suggested-by: Alexei Starovoitov Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi Cc: David Hildenbrand Acked-by: David Hildenbrand (arm) Link: https://lore.kernel.org/r/20260522172219.1423324-2-tj@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/pgtable.h | 12 ++++++++++++ arch/x86/include/asm/pgtable.h | 12 ++++++++++++ include/linux/pgtable.h | 25 +++++++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4dfa42b7d053..984f0502c9d0 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1830,6 +1830,18 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, return __ptep_get_and_clear(mm, addr, ptep); } +/* + * Note: strictly-zero compare is narrower than pte_none(), but the gap is + * harmless: a fresh kernel PTE has no software bits set. 
+ */ +static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) +{ + pteval_t old = 0; + + return try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte)); +} +#define ptep_try_set ptep_try_set + #define test_and_clear_young_ptes test_and_clear_young_ptes static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2187e9cfcefa..ac295ca6c92f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1284,6 +1284,18 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte)); } +/* + * Note: strictly-zero compare is narrower than pte_none(), but the gap is + * harmless: _PAGE_DIRTY and _PAGE_ACCESSED aren't set on untouched kernel PTEs. + */ +static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) +{ + pte_t old_pte = __pte(0); + + return try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte); +} +#define ptep_try_set ptep_try_set + #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index cdd68ed3ae1a..b5739bb99fc1 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1036,6 +1036,31 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef ptep_try_set +/** + * ptep_try_set - atomically set an empty kernel PTE + * @ptep: page table entry + * @new_pte: value to install + * + * Atomically set *@ptep to @new_pte iff *@ptep is pte_none(). Return true on + * success, false if the slot was already populated or the arch has no + * implementation. + * + * For special kernel page tables only - never user page tables. The caller must + * prevent concurrent teardown of @ptep and must accept that other writers may + * race. 
Concurrent clearers must use ptep_get_and_clear() so racing accesses + * agree on the outcome. + * + * Architectures opt in by providing a cmpxchg-based override and defining + * ptep_try_set as an identity macro. The generic stub returns false, which is + * correct for callers that fall through to oops on failure. + */ +static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) +{ + return false; +} +#endif + #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same -- cgit v1.2.3 From dc11a4dba2464e5144c318ffaf7fb16b1a5c74d6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 22 May 2026 07:22:13 -1000 Subject: bpf: Recover arena kernel faults with scratch page BPF arena usage is becoming more prevalent, but kernel <-> BPF communication over arena memory is awkward today. Data has to be staged through a trusted kernel pointer with extra code and copying on the BPF side. While reads through arena pointers can use a fault-safe helper, writes don't have a good solution. The in-line alternative would need instruction emulation or asm fixup labels. Enable direct kernel-side reads and writes within GUARD_SZ / 2 of any handed-in arena pointer, without bounds checking. A per-arena scratch page is installed by the arch fault path into empty arena kernel PTEs - x86 from page_fault_oops() for not-present faults, arm64 from __do_kernel_fault() for translation faults, both after the existing exception-table and KFENCE handling. The faulting instruction retries and the access is also reported through the program's BPF stream, preserving error reporting. bpf_prog_find_from_stack() resolves the current BPF program (and its arena) from the kernel stack - no new bpf_run_ctx state is added. Recovery covers the 4 GiB arena plus the upper half-guard (GUARD_SZ / 2). The lower half-guard is excluded because well-behaved kfuncs only access forward from arena pointers. 
The kfunc-author contract - access at most GUARD_SZ / 2 past a handed-in pointer - is documented in Documentation/bpf/kfuncs.rst. The install is lock-free via ptep_try_set(). On race-loss the winning installer's PTE is already valid, so the access retry succeeds. The arena clear path uses ptep_get_and_clear() so installer and clearer race through atomic accessors. No flush_tlb_kernel_range() afterwards. Stale "not mapped" entries just cause one extra re-fault, cheaper than a global IPI on every install. Scratch exists only to keep the kernel from oopsing on an in-line arena access. Its presence at a PTE means the BPF program has already malfunctioned, and the violation is reported through the program's BPF stream. The only requirement for behavior on a scratched PTE is that the kernel doesn't crash. In particular, any user-side access through such a PTE may segfault. The shared scratch page is freed once during map destruction. BPF instruction faults continue to use the existing JIT exception-table path. This patch changes only the kernel-text fault path. No UAPI flag is added. The new behavior is the default. v2: Use ptep_get_and_clear() in apply_range_clear_cb(). (David) v3: Stub bpf_arena_handle_page_fault() for !CONFIG_BPF_SYSCALL. 
(lkp) Suggested-by: Alexei Starovoitov Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Tejun Heo Reviewed-by: Emil Tsalapatis Cc: David Hildenbrand Link: https://lore.kernel.org/r/20260522172219.1423324-3-tj@kernel.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 14 ++++ arch/arm64/mm/fault.c | 10 ++- arch/x86/mm/fault.c | 12 ++- include/linux/bpf.h | 1 + include/linux/bpf_defs.h | 19 +++++ kernel/bpf/arena.c | 177 +++++++++++++++++++++++++++++++++---------- kernel/bpf/core.c | 5 ++ 7 files changed, 191 insertions(+), 47 deletions(-) create mode 100644 include/linux/bpf_defs.h diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 75e6c078e0e7..6d497e720998 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -462,6 +462,20 @@ In order to accommodate such requirements, the verifier will enforce strict PTR_TO_BTF_ID type matching if two types have the exact same name, with one being suffixed with ``___init``. +2.8 Accessing arena memory through kfunc arguments +-------------------------------------------------- + +A read or write at any address inside an arena does not oops the kernel. +Unallocated arena pages are lazily backed by a scratch page and the +access is reported through the program's BPF stream as an error. Only +the BPF program's correctness is affected; the kernel itself remains +intact. + +The arena is followed by a ``GUARD_SZ / 2`` (32 KiB) guard region that +is also covered by this recovery. A kfunc handed an arena pointer may +therefore access up to ``GUARD_SZ / 2`` past it without bounds-checking +against the arena. Larger accesses must verify the range explicitly. + .. _BPF_kfunc_lifecycle_expectations: 3. 
kfunc lifecycle expectations diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 0f3c5c7ca054..b4290d16ff92 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -9,6 +9,7 @@ #include #include +#include <linux/bpf_defs.h> #include #include #include @@ -436,9 +437,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, } else if (is_pkvm_stage2_abort(esr)) { msg = "access to hypervisor-protected memory"; } else { - if (esr_fsc_is_translation_fault(esr) && - kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) - return; + if (esr_fsc_is_translation_fault(esr)) { + if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) + return; + if (bpf_arena_handle_page_fault(addr, esr & ESR_ELx_WNR, regs->pc)) + return; + } msg = "paging request"; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 63de8e8684f2..7ea6a9362173 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -8,6 +8,7 @@ #include /* task_stack_*(), ... */ #include /* oops_begin/end, ... */ #include /* max_low_pfn */ +#include <linux/bpf_defs.h> /* bpf_arena_handle_page_fault */ #include /* kfence_handle_page_fault */ #include /* NOKPROBE_SYMBOL, ... */ #include /* kmmio_handler, ... */ @@ -688,10 +689,13 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, if (IS_ENABLED(CONFIG_EFI)) efi_crash_gracefully_on_page_fault(address, regs); - /* Only not-present faults should be handled by KFENCE. */ - if (!(error_code & X86_PF_PROT) && - kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) - return; + /* Only not-present faults should be handled by KFENCE or BPF arena. 
*/ + if (!(error_code & X86_PF_PROT)) { + if (kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) + return; + if (bpf_arena_handle_page_fault(address, error_code & X86_PF_WRITE, regs->ip)) + return; + } oops: /* diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 01e203964892..bb4261a5df64 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -6,6 +6,7 @@ #include #include +#include #include #include diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h new file mode 100644 index 000000000000..2185cd3966d4 --- /dev/null +++ b/include/linux/bpf_defs.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Subset of bpf.h declarations, split out so files that need only these + * declarations can avoid bpf.h's full include cost. + */ +#ifndef _LINUX_BPF_DEFS_H +#define _LINUX_BPF_DEFS_H + +#ifdef CONFIG_BPF_SYSCALL +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip); +#else +static inline bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, + unsigned long fault_ip) +{ + return false; +} +#endif + +#endif /* _LINUX_BPF_DEFS_H */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 49a8f7b1beef..2da2c275cff6 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -53,6 +53,7 @@ struct bpf_arena { u64 user_vm_start; u64 user_vm_end; struct vm_struct *kern_vm; + struct page *scratch_page; struct range_tree rt; /* protects rt */ rqspinlock_t spinlock; @@ -118,6 +119,11 @@ struct apply_range_data { int i; }; +struct clear_range_data { + struct llist_head *free_pages; + struct page *scratch_page; +}; + static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) { struct apply_range_data *d = data; @@ -144,33 +150,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size) flush_cache_vmap(start, start + size); } -static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages) +static int 
apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) { + struct clear_range_data *d = data; pte_t old_pte; struct page *page; - /* sanity check */ - old_pte = ptep_get(pte); + /* + * Pairs with ptep_try_set() in the kernel-fault scratch installer. + * Both sides must be atomic. + */ + old_pte = ptep_get_and_clear(&init_mm, addr, pte); if (pte_none(old_pte) || !pte_present(old_pte)) - return 0; /* nothing to do */ + return 0; page = pte_page(old_pte); if (WARN_ON_ONCE(!page)) return -EINVAL; - pte_clear(&init_mm, addr, pte); + /* + * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr + * scratches its PTE. A later bpf_arena_free_pages() over that range walks + * here. Without the skip, scratch_page would be freed. + */ + if (page == d->scratch_page) + return 0; + + __llist_add(&page->pcp_llist, d->free_pages); + return 0; +} - /* Add page to the list so it is freed later */ - if (free_pages) - __llist_add(&page->pcp_llist, free_pages); +static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data) +{ + struct page *scratch_page = data; + if (!pte_none(ptep_get(pte))) + return 0; + /* + * Best-effort install. ptep_try_set() returns false only if another + * installer (real allocation or concurrent fault) won the cmpxchg. + * Their PTE is already valid, so the access retry succeeds. + * + * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just + * cause one extra re-fault through this same path. + */ + ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL)); return 0; } static int populate_pgtable_except_pte(struct bpf_arena *arena) { + /* Populate intermediates for the recovery range (4 GiB + upper half-guard). 
*/ return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), - KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL); + SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL); } static struct bpf_map *arena_map_alloc(union bpf_attr *attr) @@ -221,22 +253,29 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) init_irq_work(&arena->free_irq, arena_free_irq); INIT_WORK(&arena->free_work, arena_free_worker); bpf_map_init_from_attr(&arena->map, attr); + + err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page); + if (err) + goto err_free_arena; + range_tree_init(&arena->rt); err = range_tree_set(&arena->rt, 0, attr->max_entries); - if (err) { - bpf_map_area_free(arena); - goto err; - } + if (err) + goto err_free_scratch; mutex_init(&arena->lock); raw_res_spin_lock_init(&arena->spinlock); err = populate_pgtable_except_pte(arena); - if (err) { - range_tree_destroy(&arena->rt); - bpf_map_area_free(arena); - goto err; - } + if (err) + goto err_destroy_rt; return &arena->map; + +err_destroy_rt: + range_tree_destroy(&arena->rt); +err_free_scratch: + __free_page(arena->scratch_page); +err_free_arena: + bpf_map_area_free(arena); err: free_vm_area(kern_vm); return ERR_PTR(err); @@ -244,6 +283,7 @@ err: static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) { + struct bpf_arena *arena = data; struct page *page; pte_t pte; @@ -251,6 +291,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) if (!pte_present(pte)) /* sanity check */ return 0; page = pte_page(pte); + /* + * Skip the scratch page. The walk is page-table-driven, not range-tree-driven, + * so it can visit scratch PTEs at uaddrs the BPF program never allocated. + */ + if (page == arena->scratch_page) + return 0; /* * We do not update pte here: * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug @@ -286,9 +332,10 @@ static void arena_map_free(struct bpf_map *map) * free those pages. 
*/ apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), - KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); + SZ_4G + GUARD_SZ / 2, existing_page_cb, arena); free_vm_area(arena->kern_vm); range_tree_destroy(&arena->rt); + __free_page(arena->scratch_page); bpf_map_area_free(arena); } @@ -384,33 +431,37 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_RETRY; page = vmalloc_to_page((void *)kaddr); - if (page) + if (page) { + if (page == arena->scratch_page) + /* BPF triggered scratch here; don't lazy-alloc over it */ + goto out_sigsegv; /* already have a page vmap-ed */ goto out; + } bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; struct apply_range_data data = { .pages = &page, .i = 0 }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; } ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); free_pages_nolock(page, 0); - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; } flush_vmap_cache(kaddr, PAGE_SIZE); bpf_map_memcg_exit(old_memcg, new_memcg); @@ -419,8 +470,9 @@ out: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); vmf->page = page; return 0; -out_unlock_sigsegv: +out_sigsegv_memcg: bpf_map_memcg_exit(old_memcg, new_memcg); +out_sigsegv: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); return VM_FAULT_SIGSEGV; } @@ -685,6 +737,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, struct llist_head free_pages; struct 
llist_node *pos, *t; struct arena_free_span *s; + struct clear_range_data cdata; unsigned long flags; int ret = 0; @@ -713,9 +766,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, range_tree_set(&arena->rt, pgoff, page_cnt); init_llist_head(&free_pages); + cdata.free_pages = &free_pages; + cdata.scratch_page = arena->scratch_page; /* clear ptes and collect struct pages */ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, - apply_range_clear_cb, &free_pages); + apply_range_clear_cb, &cdata); /* drop the lock to do the tlb flush and zap pages */ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); @@ -805,6 +860,7 @@ static void arena_free_worker(struct work_struct *work) struct arena_free_span *s; u64 arena_vm_start, user_vm_start; struct llist_head free_pages; + struct clear_range_data cdata; struct page *page; unsigned long full_uaddr; long kaddr, page_cnt, pgoff; @@ -818,6 +874,8 @@ static void arena_free_worker(struct work_struct *work) bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); init_llist_head(&free_pages); + cdata.free_pages = &free_pages; + cdata.scratch_page = arena->scratch_page; arena_vm_start = bpf_arena_get_kern_vm_start(arena); user_vm_start = bpf_arena_get_user_vm_start(arena); @@ -830,7 +888,7 @@ static void arena_free_worker(struct work_struct *work) /* clear ptes and collect pages in free_pages llist */ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, - apply_range_clear_cb, &free_pages); + apply_range_clear_cb, &cdata); range_tree_set(&arena->rt, pgoff, page_cnt); } @@ -945,23 +1003,12 @@ static int __init kfunc_init(void) } late_initcall(kfunc_init); -void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write, + unsigned long addr, unsigned long fault_ip) { struct bpf_stream_stage ss; - struct bpf_prog *prog; u64 user_vm_start; - /* - * 
The RCU read lock is held to safely traverse the latch tree, but we - * don't need its protection when accessing the prog, since it will not - * disappear while we are handling the fault. - */ - rcu_read_lock(); - prog = bpf_prog_ksym_find(fault_ip); - rcu_read_unlock(); - if (!prog) - return; - /* Use main prog for stream access */ prog = prog->aux->main_prog_aux->prog; @@ -974,3 +1021,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo bpf_stream_dump_stack(ss); })); } + +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip) +{ + struct bpf_arena *arena; + struct bpf_prog *prog; + unsigned long kbase; + unsigned long page_addr = addr & PAGE_MASK; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return false; + + arena = prog->aux->arena; + /* a prog not using arena may be on stack, so arena can be NULL */ + if (!arena) + return false; + + kbase = bpf_arena_get_kern_vm_start(arena); + + /* + * Recovery covers the 4 GiB mappable band plus the upper half-guard. + * Lower guard is unreachable from kfuncs; an address there indicates + * a different bug class - leave it to the regular kernel oops path. + */ + if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2) + return false; + + apply_to_page_range(&init_mm, page_addr, PAGE_SIZE, + apply_range_set_scratch_cb, arena->scratch_page); + flush_vmap_cache(page_addr, PAGE_SIZE); + __bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip); + return true; +} + +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +{ + struct bpf_prog *prog; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it will not + * disappear while we are handling the fault. 
+ */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(fault_ip); + rcu_read_unlock(); + if (!prog) + return; + __bpf_prog_report_arena_violation(prog, write, addr, fault_ip); +} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b018ff48875..fc3ee67486ce 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3350,6 +3350,11 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) { return 0; } +__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, + unsigned long fault_ip) +{ + return false; +} #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) -- cgit v1.2.3 From f211c81ddc368e5cc6ad69d171bca0fa52e71ad7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2026 07:22:14 -1000 Subject: bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers The existing kernel-side export of bpf_arena_alloc_pages is _non_sleepable only - it's used by the verifier to inline the kfunc when the call site is non-sleepable. There is no sleepable equivalent for kernel callers. The kfunc bpf_arena_alloc_pages itself is BPF-only. sched_ext needs sleepable kernel-side allocs for its arena pool init/grow paths. Add bpf_arena_alloc_pages_sleepable() mirroring the _non_sleepable wrapper but passing sleepable=true to arena_alloc_pages(). 
Signed-off-by: Tejun Heo Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260522172219.1423324-4-tj@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ kernel/bpf/arena.c | 13 +++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bb4261a5df64..c00be24e7244 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -679,6 +679,8 @@ int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, u64 flags); void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt); +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, + u64 flags); #else static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, u64 flags) @@ -689,6 +691,12 @@ static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) { } + +static inline void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + return NULL; +} #endif extern const struct bpf_map_ops bpf_map_offload_ops; diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 2da2c275cff6..9e379ef27d41 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -951,6 +951,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false); } + +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void 
*)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true); +} + __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) { struct bpf_map *map = p__map; -- cgit v1.2.3 From 7c48a28c1bbe26e272bc978a42adb757fc6aa639 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2026 07:22:15 -1000 Subject: bpf: Add bpf_struct_ops_for_each_prog() Add a helper that walks the member progs of the struct_ops map containing a given @kdata vmtable. struct_ops ->reg() callbacks (and similar) sometimes need to inspect the loaded BPF programs, e.g. to discover maps they reference via prog->aux->used_maps. The implementation mirrors bpf_struct_ops_id(): container_of @kdata to recover the bpf_struct_ops_map, then iterate st_map->links[i]->prog for i in [0, funcs_cnt). Same access pattern, no new locking - by the time ->reg() fires st_map is fully populated and stable. A sched_ext follow-up walks the member progs of a cid-form scheduler's struct_ops map, reads prog->aux->arena directly, and requires all member progs to reference exactly one arena, without requiring the BPF program to call a registration kfunc. 
Signed-off-by: Tejun Heo Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260522172219.1423324-5-tj@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ kernel/bpf/bpf_struct_ops.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c00be24e7244..491cc6750504 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2131,6 +2131,9 @@ int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map); void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog); void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux); u32 bpf_struct_ops_id(const void *kdata); +int bpf_struct_ops_for_each_prog(const void *kdata, + int (*cb)(struct bpf_prog *prog, void *data), + void *data); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 521cb9d7e8c7..5e51c1211673 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1204,6 +1204,42 @@ u32 bpf_struct_ops_id(const void *kdata) } EXPORT_SYMBOL_GPL(bpf_struct_ops_id); +/** + * bpf_struct_ops_for_each_prog - Invoke @cb for each member prog + * @kdata: kernel-side struct_ops vmtable (the @kdata arg to ->reg/->update/->unreg) + * @cb: callback invoked once per member prog; non-zero return stops iteration + * @data: opaque argument passed to @cb + * + * Walks the struct_ops member progs registered on the map containing @kdata. + * Intended for use from struct_ops ->reg() callbacks (and similar) that need to + * inspect the loaded BPF programs (for example to discover maps they reference + * via @prog->aux->used_maps). + * + * Return 0 if iteration completed, otherwise the first non-zero @cb return. 
+ */ +int bpf_struct_ops_for_each_prog(const void *kdata, + int (*cb)(struct bpf_prog *prog, void *data), + void *data) +{ + struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; + u32 i; + int ret; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + continue; + ret = cb(st_map->links[i]->prog, data); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog); + static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; -- cgit v1.2.3 From 53cc12a2dc88c2c6f62f507548640885a70a56a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2026 07:22:16 -1000 Subject: bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena() struct bpf_arena is opaque to callers outside arena.c. Add two helpers for struct_ops subsystems that need to reach into an arena: bpf_arena_map_kern_vm_start(struct bpf_map *map) returns @map's kern_vm_start. A sched_ext follow-up needs this to translate kern_va <-> uaddr. bpf_prog_arena(struct bpf_prog *prog) returns the bpf_map of the arena referenced by @prog (NULL if @prog references no arena). The verifier enforces at most one arena per program. Used by struct_ops callers that auto-discover an arena from a member prog and need to take a map reference. 
Suggested-by: Kumar Kartikeya Dwivedi Signed-off-by: Tejun Heo Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260522172219.1423324-6-tj@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/arena.c | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 491cc6750504..c323b3e027fe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -618,6 +618,8 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock); u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena); u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena); +u64 bpf_arena_map_kern_vm_start(struct bpf_map *map); +struct bpf_map *bpf_prog_arena(struct bpf_prog *prog); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 9e379ef27d41..1727503b25d8 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -84,6 +84,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) return arena ? arena->user_vm_start : 0; } +/** + * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map * + * @map: a BPF_MAP_TYPE_ARENA map + * + * Return @map's kern_vm_start. + */ +u64 bpf_arena_map_kern_vm_start(struct bpf_map *map) +{ + return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map)); +} + +/** + * bpf_prog_arena - return the bpf_map of the arena referenced by @prog + * @prog: a loaded BPF program + * + * The verifier enforces at most one arena per program and stores it in + * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if + * @prog does not reference an arena. + */ +struct bpf_map *bpf_prog_arena(struct bpf_prog *prog) +{ + struct bpf_arena *arena = prog->aux->arena; + + return arena ? 
&arena->map : NULL; +} + static long arena_map_peek_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; -- cgit v1.2.3 From e42e53ae23b7d41df22ccd7788192bf578f24da2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 May 2026 09:26:32 -1000 Subject: bpf: Fix bpf_arena_handle_page_fault() redefinition without CONFIG_BPF_SYSCALL On configs with CONFIG_BPF=y but CONFIG_BPF_SYSCALL=n (e.g. arm multi_v7_defconfig), kernel/bpf/core.c defines a __weak bpf_arena_handle_page_fault() while bpf_defs.h already supplies a static inline stub for it, causing a redefinition error. Build the __weak definition only under CONFIG_BPF_SYSCALL, matching the bpf_defs.h declaration and the CONFIG_BPF_SYSCALL-gated strong definition in arena.c. Fixes: dc11a4dba246 ("bpf: Recover arena kernel faults with scratch page") Reported-by: Mark Brown Signed-off-by: Tejun Heo Acked-by: Song Liu Link: https://lore.kernel.org/r/20260527192632.2109419-1-tj@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8ecba2989d88..a656a8572bdb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3376,13 +3376,14 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) { return 0; } + +#ifdef CONFIG_BPF_SYSCALL __weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip) { return false; } -#ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { int ret; -- cgit v1.2.3 From fee9a38174f4c6454fb1fbaf2b9b5a1cca9070d0 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Fri, 22 May 2026 16:13:53 -0400 Subject: libbpf: Harden parse_vma_segs() path parsing parse_vma_segs() in tools/lib/bpf/usdt.c parses /proc/<pid>/maps with two widthless scansets, "%s" into mode[16] and "%[^\n]" into line[4096].
A VMA name in maps is not limited to that local buffer; a deeply nested backing path can produce a maps record long enough to overflow the stack buffer. Bound both scansets to the declared buffer sizes ("%15s" for mode[16] and "%4095[^\n]" for line[4096]) and drain any residue past line[4094] with "%*[^\n]" before the trailing "\n". Without the drain, the residue of an over-long record would stay in the stream and break the next "%zx-%zx" parse, so the loop would exit early and silently skip later maps records. Also stop using sscanf(..., "%s") to peel the /proc/<pid>/root prefix from lib_path. Parse the pid and prefix length with "%n", check for the following slash, and copy the remainder with libbpf_strlcpy(). That removes a second unbounded stack write and preserves paths containing spaces. Fixes: 74cc6311cec9 ("libbpf: Add USDT notes parsing and resolution logic") Signed-off-by: Michael Bommarito Signed-off-by: Andrii Nakryiko Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/bpf/20260522201353.1454653-1-michael.bommarito@gmail.com --- tools/lib/bpf/usdt.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index e3710933fd52..57fb82bb81b5 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -468,10 +468,10 @@ static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, siz static int parse_vma_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) { - char path[PATH_MAX], line[PATH_MAX], mode[16]; + char path[PATH_MAX], line[4096], mode[16]; size_t seg_start, seg_end, seg_off; struct elf_seg *seg; - int tmp_pid, i, err; + int tmp_pid, n, i, err; FILE *f; *seg_cnt = 0; @@ -480,8 +480,13 @@ static int parse_vma_segs(int pid, const char *lib_path, struct elf_seg **segs, * /proc/<pid>/root/<path>. They will be reported as just /<path> in * /proc/<pid>/maps.
*/ - if (sscanf(lib_path, "/proc/%d/root%s", &tmp_pid, path) == 2 && pid == tmp_pid) + /* %n is not counted in sscanf() return value, so initialize it. */ + n = 0; + if (sscanf(lib_path, "/proc/%d/root%n", &tmp_pid, &n) == 1 && + n > 0 && pid == tmp_pid && lib_path[n] == '/') { + libbpf_strlcpy(path, lib_path + n, sizeof(path)); goto proceed; + } if (!realpath(lib_path, path)) { pr_warn("usdt: failed to get absolute path of '%s' (err %s), using path as is...\n", @@ -504,8 +509,11 @@ proceed: * 7f5c6f5d1000-7f5c6f5d3000 rw-p 001c7000 08:04 21238613 /usr/lib64/libc-2.17.so * 7f5c6f5d3000-7f5c6f5d8000 rw-p 00000000 00:00 0 * 7f5c6f5d8000-7f5c6f5d9000 r-xp 00000000 103:01 362990598 /data/users/andriin/linux/tools/bpf/usdt/libhello_usdt.so + * + * Some VMA names can be longer than the local buffer. Bound the + * writes, but still consume the rest of the line. */ - while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n", + while (fscanf(f, "%zx-%zx %15s %zx %*s %*d%4095[^\n]%*[^\n]\n", &seg_start, &seg_end, mode, &seg_off, line) == 5) { void *tmp; -- cgit v1.2.3 From be4c6c7bc42952b71188894933946b410deadcfe Mon Sep 17 00:00:00 2001 From: Siddharth Nayyar Date: Wed, 20 May 2026 09:40:44 +0000 Subject: bpftool: Fix typo in struct_ops map FD generation for light skeleton When generating light skeletons for BPF programs containing struct_ops maps, bpftool incorrectly outputs a stray literal 't' instead of a tab character for the map file descriptor member in the links structure. This causes a compilation error when the generated light skeleton is used. Correct the format string by replacing 't' with '\t'. 
Fixes: 08ac454e258e ("libbpf: Auto-attach struct_ops BPF maps in BPF skeleton") Signed-off-by: Siddharth Nayyar Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20260520-struct_ops_gen_typo_fix-v1-1-4dee3771da46@google.com --- tools/bpf/bpftool/gen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 37159e02f418..6ae7262ebe0c 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1399,7 +1399,7 @@ static int do_skeleton(int argc, char **argv) continue; if (use_loader) - printf("t\tint %s_fd;\n", ident); + printf("\t\tint %s_fd;\n", ident); else printf("\t\tstruct bpf_link *%s;\n", ident); } -- cgit v1.2.3 From b23705e6afb6ac4ae6d220dcb35975698667dd76 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Sat, 23 May 2026 16:27:21 +0000 Subject: libbpf: Fix UAF in strset__add_str() strset_add_str_mem() might reallocate the strset data buffer in order to accommodate the provided string 's'. However, if 's' points to a string already present in the buffer, it becomes dangling after the realloc. This leads to a use-after-free when attempting to memcpy() the string into the new buffer. 
One scenario that triggers this problematic path is when resolve_btfids attempts to patch kfunc prototypes using existing BTF parameter names: | resolve_btfids: function bpf_list_push_back_impl already exists in BTF | Segmentation fault (core dumped) Compiling resolve_btfids with fsanitize=address generates a detailed report of the UAF: | ================================================================= | ERROR: AddressSanitizer: heap-use-after-free on address 0x7f4c4a500bd4 | ==1507892==ERROR: AddressSanitizer: heap-use-after-free on address 0x7f4c4a500bd4 at pc 0x55d25155a2a8 bp 0x7ffcef879060 sp 0x7ffcef878818 | READ of size 5 at 0x7f4c4a500bd4 thread T0 | #0 0x55d25155a2a7 in memcpy (tools/bpf/resolve_btfids/resolve_btfids+0xcf2a7) | #1 0x55d2515d708e in strset__add_str tools/lib/bpf/strset.c:162:2 | #2 0x55d2515c730b in btf__add_str tools/lib/bpf/btf.c:2109:8 | #3 0x55d2515c9020 in btf__add_func_param tools/lib/bpf/btf.c:3108:14 | #4 0x55d25159f0b5 in process_kfunc_with_implicit_args tools/bpf/resolve_btfids/main.c:1196:9 | #5 0x55d25159e004 in btf2btf tools/bpf/resolve_btfids/main.c:1229:9 | #6 0x55d25159cee7 in main tools/bpf/resolve_btfids/main.c:1535:6 | #7 0x7f4c78e29f76 in __libc_start_call_main csu/../sysdeps/nptl/libc_start_call_main.h:58:16 | #8 0x7f4c78e2a026 in __libc_start_main csu/../csu/libc-start.c:360:3 | #9 0x55d2514bb860 in _start (tools/bpf/resolve_btfids/resolve_btfids+0x30860) | | 0x7f4c4a500bd4 is located 13268 bytes inside of 2829000-byte region [0x7f4c4a4fd800,0x7f4c4a7b02c8) | freed by thread T0 here: | #0 0x55d25155b700 in realloc (tools/bpf/resolve_btfids/resolve_btfids+0xd0700) | #1 0x55d2515c426c in libbpf_reallocarray tools/lib/bpf/./libbpf_internal.h:220:9 | #2 0x55d2515c426c in libbpf_add_mem tools/lib/bpf/btf.c:224:13 | | previously allocated by thread T0 here: | #0 0x55d25155b2e3 in malloc (tools/bpf/resolve_btfids/resolve_btfids+0xd02e3) | #1 0x55d2515d6e7d in strset__new tools/lib/bpf/strset.c:58:20 While resolve_btfids 
could be refactored to avoid this call path, let's instead fix this issue at the source in strset__add_str() and avoid similar scenarios. Let's check if set->strs_data was reallocated and whether 's' points to an internal string within the old strset buffer. In such case, 's' is reconstructed to point to the new buffer. While already here, also fix strset__find_str() which suffers from the same problem by factoring out the common operations into a new helper function strset_str_append(). Fixes: 90d76d3ececc ("libbpf: Extract internal set-of-strings datastructure APIs") Suggested-by: Andrii Nakryiko Suggested-by: Mykyta Yatsenko Signed-off-by: Carlos Llamas Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260523162722.2718940-1-cmllamas@google.com --- tools/lib/bpf/strset.c | 62 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/tools/lib/bpf/strset.c b/tools/lib/bpf/strset.c index 2464bcbd04e0..ace73c6b3d62 100644 --- a/tools/lib/bpf/strset.c +++ b/tools/lib/bpf/strset.c @@ -107,6 +107,41 @@ static void *strset_add_str_mem(struct strset *set, size_t add_sz) set->strs_data_len, set->strs_data_max_len, add_sz); } +static long strset_str_append(struct strset *set, const char *s) +{ + uintptr_t old_data = (uintptr_t)set->strs_data; + size_t old_data_len = set->strs_data_len; + uintptr_t old_s = (uintptr_t)s; + long len = strlen(s) + 1; + void *p; + + /* + * Hashmap keys are always offsets within set->strs_data, so to even + * look up some string from the "outside", we need to first append it + * at the end, so that it can be addressed with an offset. Luckily, + * until set->strs_data_len is incremented, that string is just a piece + * of garbage for the rest of the code, so no harm, no foul. On the + * other hand, if the string is unique, it's already appended and + * ready to be used, only a simple set->strs_data_len increment away. 
+ */ + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + /* + * The set->strs_data might have reallocated and if 's' pointed + * to an internal string within the old buffer, then it became + * dangling and needs to be reconstructed before the copy. + */ + if (old_data && old_data != (uintptr_t)set->strs_data && + old_s >= old_data && old_s < old_data + old_data_len) + s = set->strs_data + (old_s - old_data); + + memcpy(p, s, len); + + return len; +} + /* Find string offset that corresponds to a given string *s*. * Returns: * - >0 offset into string data, if string is found; @@ -116,16 +151,12 @@ static void *strset_add_str_mem(struct strset *set, size_t add_sz) int strset__find_str(struct strset *set, const char *s) { long old_off, new_off, len; - void *p; - /* see strset__add_str() for why we do this */ - len = strlen(s) + 1; - p = strset_add_str_mem(set, len); - if (!p) - return -ENOMEM; + len = strset_str_append(set, s); + if (len < 0) + return len; new_off = set->strs_data_len; - memcpy(p, s, len); if (hashmap__find(set->strs_hash, new_off, &old_off)) return old_off; @@ -142,24 +173,13 @@ int strset__find_str(struct strset *set, const char *s) int strset__add_str(struct strset *set, const char *s) { long old_off, new_off, len; - void *p; int err; - /* Hashmap keys are always offsets within set->strs_data, so to even - * look up some string from the "outside", we need to first append it - * at the end, so that it can be addressed with an offset. Luckily, - * until set->strs_data_len is incremented, that string is just a piece - * of garbage for the rest of the code, so no harm, no foul. On the - * other hand, if the string is unique, it's already appended and - * ready to be used, only a simple set->strs_data_len increment away. 
- */ - len = strlen(s) + 1; - p = strset_add_str_mem(set, len); - if (!p) - return -ENOMEM; + len = strset_str_append(set, s); + if (len < 0) + return len; new_off = set->strs_data_len; - memcpy(p, s, len); /* Now attempt to add the string, but only if the string with the same * contents doesn't exist already (HASHMAP_ADD strategy). If such -- cgit v1.2.3 From a4a5d4ee061240a1d39053db0a87f841d43277c0 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 26 May 2026 14:39:36 +0800 Subject: libbpf: Add __NR_bpf definition for LoongArch LoongArch uses the generic syscall table, where __NR_bpf is defined as 280 in include/uapi/asm-generic/unistd.h. To align with other architectures, add the __NR_bpf definition for LoongArch to avoid a potential compilation failure: "error __NR_bpf not defined. libbpf does not support your arch." This is a follow up patch of: commit b0c47807d31d ("bpf: Add sparc support to tools and samples.") commit bad1926dd2f6 ("bpf, s390: fix build for libbpf and selftest suite") commit ca31ca8247e2 ("tools/bpf: fix perf build error with uClibc (seen on ARC)") commit e32cb12ff52a ("bpf, mips: Fix build errors about __NR_bpf undeclared") Signed-off-by: Tiezhu Yang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260526063936.16769-1-yangtiezhu@loongson.cn --- tools/build/feature/test-bpf.c | 2 ++ tools/lib/bpf/bpf.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c index e7a405f83af6..89d59674f39b 100644 --- a/tools/build/feature/test-bpf.c +++ b/tools/build/feature/test-bpf.c @@ -20,6 +20,8 @@ # define __NR_bpf 6319 # elif defined(__mips__) && defined(_ABI64) # define __NR_bpf 5315 +# elif defined(__loongarch__) +# define __NR_bpf 280 # else # error __NR_bpf not defined. libbpf does not support your arch. 
# endif diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 3cd705802330..bc513aa8f404 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -59,6 +59,8 @@ # define __NR_bpf 6319 # elif defined(__mips__) && defined(_ABI64) # define __NR_bpf 5315 +# elif defined(__loongarch__) +# define __NR_bpf 280 # else # error __NR_bpf not defined. libbpf does not support your arch. # endif -- cgit v1.2.3 From fc99547a8bda22a6a489284641385d8dcfb3ecd8 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 25 May 2026 15:39:46 -0700 Subject: bpf: Factor out stack_map build ID helpers Factor out helpers from stack_map_get_build_id_offset() in preparation for adding a sleepable build ID resolution path: stack_map_build_id_set_ip(), stack_map_build_id_offset(), and stack_map_build_id_set_valid(). While here, refactor stack_map_get_build_id_offset(): * use continue-driven control flow in the main loop and remove build_id_valid label * update prev_vma and prev_build_id on the fall-back-to-IP branch so the cache reflects the actual VMA seen on the previous IP [1] * guard fetch_build_id() with vma_is_anonymous() [2] to skip parse attempts that would otherwise fail the ELF magic check [1] https://lore.kernel.org/bpf/CAEf4Bzac9uWWqBvzH0iFzKvJcq3vxscZ3pKm0sUHmN-F-z9wVQ@mail.gmail.com/ [2] https://lore.kernel.org/bpf/226398c1ff3f2b686c0aeb010408d85fb15df13f9ff60a045bee31e79b9e41e9@mail.kernel.org/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/bpf/20260525223948.1920986-2-ihor.solodrai@linux.dev --- kernel/bpf/stackmap.c | 57 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index da3d328f5c15..e23be7d44503 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -152,6 +152,28 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b : 
build_id_parse_nofault(vma, build_id, NULL); } +static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id) +{ + id->status = BPF_STACK_BUILD_ID_IP; + memset(id->build_id, 0, BUILD_ID_SIZE_MAX); +} + +static inline u64 stack_map_build_id_offset(unsigned long vm_pgoff, + unsigned long vm_start, u64 ip) +{ + return (vm_pgoff << PAGE_SHIFT) + ip - vm_start; +} + +static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id, + u64 offset, + const unsigned char *build_id) +{ + id->status = BPF_STACK_BUILD_ID_VALID; + id->offset = offset; + if (id->build_id != build_id) + memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX); +} + /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: @@ -165,44 +187,45 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u32 trace_nr, bool user, bool may_fault) { - int i; struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); + bool has_user_ctx = user && current && current->mm; struct vm_area_struct *vma, *prev_vma = NULL; - const char *prev_build_id; + const unsigned char *prev_build_id = NULL; + int i; /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with * build_id. 
*/ - if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock(current->mm)) { + if (!has_user_ctx || irq_work_busy || !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ - for (i = 0; i < trace_nr; i++) { - id_offs[i].status = BPF_STACK_BUILD_ID_IP; - memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); - } + for (i = 0; i < trace_nr; i++) + stack_map_build_id_set_ip(&id_offs[i]); return; } for (i = 0; i < trace_nr; i++) { u64 ip = READ_ONCE(id_offs[i].ip); + u64 offset; - if (range_in_vma(prev_vma, ip, ip)) { + if (prev_build_id && range_in_vma(prev_vma, ip, ip)) { vma = prev_vma; - memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX); - goto build_id_valid; + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, prev_build_id); + continue; } vma = find_vma(current->mm, ip); - if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) { + if (!vma || vma_is_anonymous(vma) || + fetch_build_id(vma, id_offs[i].build_id, may_fault)) { /* per entry fall back to ips */ - id_offs[i].status = BPF_STACK_BUILD_ID_IP; - memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); + stack_map_build_id_set_ip(&id_offs[i]); + prev_vma = vma; + prev_build_id = NULL; continue; } -build_id_valid: - id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start; - id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); prev_vma = vma; prev_build_id = id_offs[i].build_id; } -- cgit v1.2.3 From fad3021faf7b0b64e9daea41c5662b65c8ad7379 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 25 May 2026 15:39:47 -0700 Subject: bpf: Avoid faultable build ID reads under mm locks Sleepable build ID parsing can block in __kernel_read() [1], so the stackmap sleepable path must not call it while holding mmap_lock or a per-VMA 
read lock. The issue and the fix are conceptually similar to a recent procfs patch [2]. A similar VMA locking pattern has already been used in PROCMAP_QUERY [3]. Resolve each covered VMA with a stable read-side reference, preferring lock_vma_under_rcu() and falling back to mmap_read_trylock() only long enough to acquire the VMA read lock. Take a reference to the backing file, drop the VMA lock, and then parse the build ID through (sleepable) build_id_parse_file(). We have to use mmap_read_trylock() (and give up on failure) in this context because taking mmap_read_lock() is generally unsafe on code paths reachable from BPF programs [4], and may lead to deadlocks. [1] https://lore.kernel.org/all/20251218005818.614819-1-shakeel.butt@linux.dev/ [2] https://lore.kernel.org/all/20260128183232.2854138-1-andrii@kernel.org/ [3] https://lore.kernel.org/all/20250808152850.2580887-1-surenb@google.com/ [4] https://lore.kernel.org/bpf/2895ecd8-df1e-4cc0-b9f9-aef893dc2360@linux.dev/ Fixes: d4dd9775ec24 ("bpf: wire up sleepable bpf_get_stack() and bpf_get_task_stack() helpers") Suggested-by: Puranjay Mohan Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260525223948.1920986-3-ihor.solodrai@linux.dev --- kernel/bpf/stackmap.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index e23be7d44503..c53cfd9a67cf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "percpu_freelist.h" #include "mmap_unlock_work.h" @@ -174,6 +175,109 @@ static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id, memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX); } +struct stack_map_vma_lock { + struct vm_area_struct *vma; + struct mm_struct *mm; +}; + +/* + * Acquire a stable read-side reference on the VMA covering @ip. 
+ * + * With CONFIG_PER_VMA_LOCK=y this returns a VMA with its per-VMA read + * lock held and mmap_lock dropped, so the caller may sleep. + * + * With CONFIG_PER_VMA_LOCK=n it returns a VMA with mmap_lock still + * held; the caller must snapshot any fields it needs and pin vm_file + * with get_file() before stack_map_unlock_vma() drops mmap_lock, as + * the VMA may be split, merged, or freed after that. + * + * Returns NULL on failure, in which case no lock is held. + */ +static struct vm_area_struct * +stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip) +{ + struct mm_struct *mm = lock->mm; + struct vm_area_struct *vma; + + /* noop under !CONFIG_PER_VMA_LOCK */ + vma = lock_vma_under_rcu(mm, ip); + if (vma) { + lock->vma = vma; + return vma; + } + + /* + * Taking mmap_read_lock() is unsafe here, because the caller BPF + * program might already hold it, causing a deadlock. + */ + if (!mmap_read_trylock(mm)) + return NULL; + + vma = vma_lookup(mm, ip); + if (!vma) { + mmap_read_unlock(mm); + return NULL; + } + +#ifdef CONFIG_PER_VMA_LOCK + if (!vma_start_read_locked(vma)) { + mmap_read_unlock(mm); + return NULL; + } + mmap_read_unlock(mm); +#endif + + lock->vma = vma; + return vma; +} + +static void stack_map_unlock_vma(struct stack_map_vma_lock *lock) +{ +#ifdef CONFIG_PER_VMA_LOCK + vma_end_read(lock->vma); +#else + mmap_read_unlock(lock->mm); +#endif + lock->vma = NULL; +} + +static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs, + u32 trace_nr) +{ + struct mm_struct *mm = current->mm; + struct stack_map_vma_lock lock = { .mm = mm }; + struct vm_area_struct *vma; + struct file *file; + u64 offset; + u64 ip; + + for (u32 i = 0; i < trace_nr; i++) { + ip = READ_ONCE(id_offs[i].ip); + + vma = stack_map_lock_vma(&lock, ip); + if (!vma) { + stack_map_build_id_set_ip(&id_offs[i]); + continue; + } + if (vma_is_anonymous(vma) || !vma->vm_file) { + stack_map_build_id_set_ip(&id_offs[i]); + stack_map_unlock_vma(&lock); + 
continue; + } + + file = get_file(vma->vm_file); + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_unlock_vma(&lock); + + /* build_id_parse_file() may block on filesystem reads */ + if (build_id_parse_file(file, id_offs[i].build_id, NULL)) + stack_map_build_id_set_ip(&id_offs[i]); + else + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); + fput(file); + } +} + /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: @@ -194,6 +298,11 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, const unsigned char *prev_build_id = NULL; int i; + if (may_fault && has_user_ctx) { + stack_map_get_build_id_offset_sleepable(id_offs, trace_nr); + return; + } + /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with * build_id. -- cgit v1.2.3 From 5e9099d8ff24ec32aa5af872a4d84d086bd70579 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 25 May 2026 15:39:48 -0700 Subject: bpf: Cache build IDs in sleepable stackmap path Stack traces often contain adjacent IPs from the same VMA or from different VMAs backed by the same ELF file. Cache the last successfully parsed build id together with the resolved VMA range and backing file so the sleepable build id path can avoid repeated VMA locking and file parsing in common cases. 
Suggested-by: Mykyta Yatsenko Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Acked-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260525223948.1920986-4-ihor.solodrai@linux.dev --- kernel/bpf/stackmap.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index c53cfd9a67cf..77ba03216c09 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -246,6 +246,14 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i { struct mm_struct *mm = current->mm; struct stack_map_vma_lock lock = { .mm = mm }; + struct { + struct file *file; + const unsigned char *build_id; + unsigned long vm_start; + unsigned long vm_end; + unsigned long vm_pgoff; + } cache = {}; + unsigned long vm_pgoff, vm_start, vm_end; struct vm_area_struct *vma; struct file *file; u64 offset; @@ -254,6 +262,17 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i for (u32 i = 0; i < trace_nr; i++) { ip = READ_ONCE(id_offs[i].ip); + /* + * Range cache fast path: if ip falls within the previously + * resolved VMA range, reuse the cache build_id without + * re-acquiring the VMA lock. 
+ */ + if (cache.build_id && ip >= cache.vm_start && ip < cache.vm_end) { + offset = stack_map_build_id_offset(cache.vm_pgoff, cache.vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); + continue; + } + vma = stack_map_lock_vma(&lock, ip); if (!vma) { stack_map_build_id_set_ip(&id_offs[i]); @@ -265,17 +284,47 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i continue; } - file = get_file(vma->vm_file); - offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + file = vma->vm_file; + vm_pgoff = vma->vm_pgoff; + vm_start = vma->vm_start; + vm_end = vma->vm_end; + offset = stack_map_build_id_offset(vm_pgoff, vm_start, ip); + + /* + * Same backing file as previous (e.g. different VMAs + * of the same ELF binary). Reuse the cache build_id. + */ + if (file == cache.file) { + stack_map_unlock_vma(&lock); + stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); + cache.vm_start = vm_start; + cache.vm_end = vm_end; + cache.vm_pgoff = vm_pgoff; + continue; + } + + file = get_file(file); stack_map_unlock_vma(&lock); /* build_id_parse_file() may block on filesystem reads */ - if (build_id_parse_file(file, id_offs[i].build_id, NULL)) + if (build_id_parse_file(file, id_offs[i].build_id, NULL)) { stack_map_build_id_set_ip(&id_offs[i]); - else - stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); - fput(file); + fput(file); + continue; + } + + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); + if (cache.file) + fput(cache.file); + cache.file = file; + cache.build_id = id_offs[i].build_id; + cache.vm_start = vm_start; + cache.vm_end = vm_end; + cache.vm_pgoff = vm_pgoff; } + + if (cache.file) + fput(cache.file); } /* -- cgit v1.2.3 From 9b435d23f51e55b62dda3a345a9f8931248ca514 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Thu, 21 May 2026 22:29:09 +0800 Subject: bpf: Fix race between bpf_map_new_fd() and close_fd() Because there is time gap 
between bpf_map_new_fd() and close_fd(), a concurrent thread is able to close the new fd and opens a new, unrelated file with the exact same fd number. Thereafter, this close_fd() might inadvertently close the unrelated file. To avoid such regression, do finalize log before security_bpf_map_create(). However, in order to achieve it, move bpf_get_file_flag(), security_bpf_map_create(), bpf_map_alloc_id(), and bpf_map_new_fd() from __map_create() to map_create(). And, rename __map_create() to map_create_alloc() meanwhile. Then, in order to reuse the map and token when all checks pass in map_create_alloc(), pass "struct bpf_map **" and "struct bpf_token **" to map_create_alloc(). Fixes: 49f9b2b2a18c ("bpf: Add syscall common attributes support for map_create") Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260521142909.95818-1-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 80 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 33 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8d4ea700aac6..9e91fb2fb492 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1359,7 +1359,8 @@ free_map_tab: #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log) +static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log, + struct bpf_map **mapp, struct bpf_token **tokenp) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1367,7 +1368,6 @@ static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifie u32 map_type = attr->map_type; struct bpf_map *map; bool token_flag; - int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); @@ -1403,10 +1403,6 @@ static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifie return -EINVAL; } - f_flags = 
bpf_get_file_flag(attr->map_flags); - if (f_flags < 0) - return f_flags; - if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) { @@ -1598,6 +1594,49 @@ static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifie goto free_map; } + *mapp = map; + *tokenp = token; + return 0; + +free_map: + bpf_map_free(map); +put_token: + bpf_token_put(token); + return err; +} + +static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common, + bpfptr_t uattr_common, u32 size_common) +{ + struct bpf_token *token = NULL; + struct bpf_verifier_log *log; + struct bpf_log_attr attr_log; + struct bpf_map *map = NULL; + int err, ret; + int f_flags; + + log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common); + if (IS_ERR(log)) + return PTR_ERR(log); + + err = map_create_alloc(attr, uattr, log, &map, &token); + + /* preserve original error even if log finalization is successful */ + ret = bpf_log_attr_finalize(&attr_log, log); + if (ret) + err = ret; + + kfree(log); + + if (err) + goto free_map; + + f_flags = bpf_get_file_flag(attr->map_flags); + if (f_flags < 0) { + err = f_flags; + goto free_map; + } + err = security_bpf_map_create(map, attr, token, uattr.is_kernel); if (err) goto free_map_sec; @@ -1626,37 +1665,12 @@ static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifie free_map_sec: security_bpf_map_free(map); free_map: - bpf_map_free(map); -put_token: + if (map) + bpf_map_free(map); bpf_token_put(token); return err; } -static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common, - bpfptr_t uattr_common, u32 size_common) -{ - struct bpf_verifier_log *log; - struct bpf_log_attr attr_log; - int err, ret; - - log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common); - if (IS_ERR(log)) - return PTR_ERR(log); - - err = __map_create(attr, uattr, log); - - /* preserve original 
error even if log finalization is successful */ - ret = bpf_log_attr_finalize(&attr_log, log); - if (ret) { - if (err >= 0) - close_fd(err); - err = ret; - } - - kfree(log); - return err; -} - void bpf_map_inc(struct bpf_map *map) { atomic64_inc(&map->refcnt); -- cgit v1.2.3 From 9a720e090eb5155fbd584a3f7eca18f82610a2b3 Mon Sep 17 00:00:00 2001 From: Suchit Karunakaran Date: Sun, 24 May 2026 08:28:53 +0530 Subject: bpf: replace pop/push emptiness check with bpf_list_empty() Simplify fq_flows_is_empty() by replacing the pop/push based emptiness check with a direct call to bpf_list_empty(). This avoids unnecessary list mutation and simplifies the code while preserving correctness. Signed-off-by: Suchit Karunakaran Changes since v1: - Removed unused variable node Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260524025853.13786-1-suchitkarunakaran@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c index 1a3233a275c7..8107f5934d2d 100644 --- a/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c @@ -196,18 +196,13 @@ fq_flows_remove_front(struct bpf_list_head *head, struct bpf_spin_lock *lock, static bool fq_flows_is_empty(struct bpf_list_head *head, struct bpf_spin_lock *lock) { - struct bpf_list_node *node; + bool empty; bpf_spin_lock(lock); - node = bpf_list_pop_front(head); - if (node) { - bpf_list_push_front(head, node); - bpf_spin_unlock(lock); - return false; - } + empty = bpf_list_empty(head); bpf_spin_unlock(lock); - return true; + return empty; } /* flow->age is used to denote the state of the flow (not-detached, detached, throttled) -- cgit v1.2.3 From 21c4b99b27f3f85b89256e81b3e997dec0a460d0 Mon Sep 17 00:00:00 2001 From: Yuyang Huang Date: Sun, 31 May 2026 15:55:59 +0800 
Subject: bpf: fix BPF_PROG_QUERY OOB write and cgroup backward compat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF_PROG_QUERY writes back the 'query.revision' field unconditionally to userspace. If userspace passes a smaller 'bpf_attr' structure (e.g. 40 bytes, which was the layout before the addition of 'query.revision'), the kernel performs an out-of-bounds write. Fix this by propagating the user-provided attribute size 'uattr_size' down to the cgroup query handlers, and conditionally skipping writing the revision field to userspace when the provided buffer size is insufficient. query.revision in bpf_mprog_query is structurally identical to the cgroup case: a late tail field, written unconditionally. But the backward-compat hazard is not the same. The min-historical-size test is per command, and bpf_mprog_query only serves attach types that were born with revision in the struct: - tcx_prog_query -> BPF_TCX_INGRESS/EGRESS - netkit_prog_query -> BPF_NETKIT_PRIMARY/PEER tcx, netkit, the revision field, and bpf_mprog_query itself all landed in the same v6.6 merge window (053c8e1f235d added the mprog query API + revision; tcx in e420bed02507, netkit in 35dfaad7188c). There has never been a tcx/netkit BPF_PROG_QUERY userspace that doesn't know about revision. So for these commands the minimum legitimate struct already covers offset 56-64 — no old binary can be broken here. Contrast with cgroup: BPF_PROG_QUERY on cgroup attach types shipped in 2017; revision write-back was bolted on years later (120933984460). That path has a real population of pre-revision callers. 
Fixes: 120933984460 ("bpf: Implement mprog API on top of existing cgroup progs") Cc: Maciej Żenczykowski Cc: Lorenzo Colitti Signed-off-by: Yuyang Huang Link: https://lore.kernel.org/r/20260531075600.4058207-2-yuyanghuang@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 5 +++-- kernel/bpf/cgroup.c | 13 +++++++------ kernel/bpf/syscall.c | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index b2e79c2b41d5..4d0cc65976a1 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -421,7 +421,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr); + union bpf_attr __user *uattr, u32 uattr_size); const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -452,7 +452,8 @@ static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, } static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, + u32 uattr_size) { return -EINVAL; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 876f6a81a9b6..2c2bdaa86aa7 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1208,7 +1208,7 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* Must be called with cgroup_mutex held to avoid races. 
*/ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; @@ -1259,7 +1259,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return -EFAULT; if (!effective_query && from_atype == to_atype) revision = cgrp->bpf.revisions[from_atype]; - if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) + if (uattr_size >= offsetofend(union bpf_attr, query.revision) && + copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) return -EFAULT; if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ @@ -1312,12 +1313,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { int ret; cgroup_lock(); - ret = __cgroup_bpf_query(cgrp, attr, uattr); + ret = __cgroup_bpf_query(cgrp, attr, uattr, uattr_size); cgroup_unlock(); return ret; } @@ -1520,7 +1521,7 @@ out_put_cgroup: } int cgroup_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { struct cgroup *cgrp; int ret; @@ -1529,7 +1530,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - ret = cgroup_bpf_query(cgrp, attr, uattr); + ret = cgroup_bpf_query(cgrp, attr, uattr, uattr_size); cgroup_put(cgrp); return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9e91fb2fb492..93bbbe610a7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4719,7 +4719,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) #define 
BPF_PROG_QUERY_LAST_FIELD query.revision static int bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { if (!bpf_net_capable()) return -EPERM; @@ -4758,7 +4758,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: case BPF_LSM_CGROUP: - return cgroup_bpf_prog_query(attr, uattr); + return cgroup_bpf_prog_query(attr, uattr, uattr_size); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: @@ -6376,7 +6376,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, err = bpf_prog_detach(&attr); break; case BPF_PROG_QUERY: - err = bpf_prog_query(&attr, uattr.user); + err = bpf_prog_query(&attr, uattr.user, size); break; case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr.user); -- cgit v1.2.3 From 5add3a4ad1a3bc15404e8bd338813ed0a636f5c9 Mon Sep 17 00:00:00 2001 From: Yuyang Huang Date: Sun, 31 May 2026 15:56:00 +0800 Subject: selftests/bpf: add verification for BPF_PROG_QUERY attr size boundaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new selftest to verify that the BPF syscall (specifically BPF_PROG_QUERY) correctly handles different user-declared attribute sizes. Specifically, verify that: - For cgroup queries, a query with a size that covers 'prog_cnt' but is smaller than 'revision' (OLD_QUERY_SIZE) succeeds, but does not write to 'revision' (verifying backward compatibility). - A query with full size (FULL_QUERY_SIZE) succeeds and writes both 'prog_cnt' and 'revision'. 
Fixes: 120933984460 ("bpf: Implement mprog API on top of existing cgroup progs") Cc: Maciej Żenczykowski Cc: Lorenzo Colitti Signed-off-by: Yuyang Huang Link: https://lore.kernel.org/r/20260531075600.4058207-3-yuyanghuang@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_attr_size.c | 69 ++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c b/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c new file mode 100644 index 000000000000..32159dc64da8 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Google LLC */ +#include +#include +#include +#include +#include +#include "cgroup_skb_direct_packet_access.skel.h" + +#define OLD_QUERY_SIZE offsetofend(union bpf_attr, query.prog_cnt) +#define FULL_QUERY_SIZE offsetofend(union bpf_attr, query.revision) + +static void test_query_size_boundaries(void) +{ + struct cgroup_skb_direct_packet_access *skel; + struct bpf_link *link = NULL; + union bpf_attr attr; + int cg_fd = -1; + int err; + + skel = cgroup_skb_direct_packet_access__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + return; + + cg_fd = test__join_cgroup("/attr_size_cg"); + if (!ASSERT_GE(cg_fd, 0, "join_cgroup")) + goto cleanup; + + link = bpf_program__attach_cgroup(skel->progs.direct_packet_access, + cg_fd); + if (!ASSERT_OK_PTR(link, "cg_attach")) + goto cleanup; + + memset(&attr, 0, sizeof(attr)); + attr.query.target_fd = cg_fd; + attr.query.attach_type = BPF_CGROUP_INET_INGRESS; + attr.query.revision = 0xdeadbeefdeadbeefULL; + + err = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, OLD_QUERY_SIZE); + if (ASSERT_OK(err, "query_old_size")) { + ASSERT_EQ(attr.query.prog_cnt, 1, "prog_cnt_written_old"); + ASSERT_EQ(attr.query.revision, 0xdeadbeefdeadbeefULL, + 
"revision_not_written_old"); + } + + memset(&attr, 0, sizeof(attr)); + attr.query.target_fd = cg_fd; + attr.query.attach_type = BPF_CGROUP_INET_INGRESS; + + err = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, FULL_QUERY_SIZE); + if (!ASSERT_OK(err, "query_full_size")) + goto cleanup; + + ASSERT_EQ(attr.query.prog_cnt, 1, "prog_cnt_written"); + ASSERT_GT(attr.query.revision, 0, "revision_written"); + +cleanup: + if (link) + bpf_link__destroy(link); + if (cg_fd >= 0) + close(cg_fd); + cgroup_skb_direct_packet_access__destroy(skel); +} + +void test_bpf_attr_size(void) +{ + if (test__start_subtest("query_size_boundaries")) + test_query_size_boundaries(); +} -- cgit v1.2.3 From 7c7c42d606ed540301b14571ae000041a2d6f39d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 29 May 2026 13:39:09 -0700 Subject: bpf: Update bpf maintainers I am making a life change and will take a long break from my current work, so I will step down from the "M:" responsibility. I am currently a "R:" in "BPF [GENERAL]", this part stays unchanged. I am folding most of the parts into "BPF [GENERAL]". For "BPF [BTF]", it is long overdue as I am no longer involved. It is folded into the "BPF [GENERAL]". The "BPF [STORAGE & CGROUPS]" will also be covered by "BPF [GENERAL]". For struct_ops, its usage is no longer limited to networking, so this naturally should move back to "BPF [GENERAL]". For the reuseport, it will continue to be maintained together by "BPF [GENERAL]" and the "NETWORKING [SOCKETS]". For other "BPF [NETWORKING]...", I am moving myself to "R:". Thanks! 
Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260529203909.1222164-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 67ef966f77b9..afae76600fc6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4775,13 +4775,6 @@ S: Supported F: arch/x86/net/ X: arch/x86/net/bpf_jit_comp32.c -BPF [BTF] -M: Martin KaFai Lau -L: bpf@vger.kernel.org -S: Maintained -F: include/linux/btf* -F: kernel/bpf/btf.c - BPF [CORE] M: Alexei Starovoitov M: Daniel Borkmann @@ -4914,18 +4907,11 @@ F: tools/testing/selftests/bpf/prog_tests/tc_netkit.c F: tools/testing/selftests/drivers/net/hw/nk_qlease.py F: tools/testing/selftests/net/nk_qlease.py -BPF [NETWORKING] (struct_ops, reuseport) -M: Martin KaFai Lau -L: bpf@vger.kernel.org -L: netdev@vger.kernel.org -S: Maintained -F: kernel/bpf/bpf_struct* - BPF [NETWORKING] (tcx & tc BPF, sock_addr) -M: Martin KaFai Lau M: Daniel Borkmann R: John Fastabend R: Stanislav Fomichev +R: Martin KaFai Lau L: bpf@vger.kernel.org L: netdev@vger.kernel.org S: Maintained @@ -4960,14 +4946,6 @@ L: bpf@vger.kernel.org S: Maintained F: tools/testing/selftests/bpf/ -BPF [STORAGE & CGROUPS] -M: Martin KaFai Lau -L: bpf@vger.kernel.org -S: Maintained -F: kernel/bpf/*storage.c -F: kernel/bpf/bpf_lru* -F: kernel/bpf/cgroup.c - BPF [TOOLING] (bpftool) M: Quentin Monnet L: bpf@vger.kernel.org -- cgit v1.2.3 From e2c88266147ff92ca25e6577158a9a0b3b261a30 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 May 2026 11:41:16 +0200 Subject: libbpf: Drop redundant self-loop in emit_check_err When the cleanup-label jump offset does not fit in s16, emit_check_err() sets gen->error = -ERANGE and then emits a BPF_JMP_IMM(BPF_JA, 0, 0, -1) self-loop. 
The latter emit() is dead: gen->error is assigned on the preceding line, and emit() then bails out early in realloc_insn_buf() the moment gen->error is set, so the jump is never written into the instruction stream. gen->error alone already marks the generation as failed. This is a follow-up to 7dd62566e0d1 ("libbpf: fix off-by-one in emit_signature_match jump offset") which removed the jump in emit_signature_match() but not in other locations. Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260529094119.307264-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/gen_loader.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 9478b8f78f26..7b95ced7bcba 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -293,7 +293,6 @@ static void emit_check_err(struct bpf_gen *gen) emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, off)); } else { gen->error = -ERANGE; - emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, -1)); } } -- cgit v1.2.3 From 3c5e2f1a85844abbb65df4694f5ebad0a13e219c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 May 2026 11:41:17 +0200 Subject: libbpf: Skip hash computation when loader generation failed bpf_gen__finish() calls compute_sha_update_offsets() gated only on the gen_hash option, without first consulting gen->error. On a failed generation this is buggy: a failed realloc_data_buf() sets gen->data_start to NULL (leaving gen->data_cur dangling), so compute_sha_update_offsets() runs libbpf_sha256() over a NULL buffer with a bogus length; a failed realloc_insn_buf() likewise sets gen->insn_start to NULL and the hash immediates get patched through that NULL base. The computed program is discarded in either case, since the following "if (!gen->error)" block does not publish opts->insns once an error is set. Thus, skip the hash pass when generation has already failed. 
Fixes: ea923080c145 ("libbpf: Embed and verify the metadata hash in the loader") Reported-by: sashiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260529094119.307264-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/gen_loader.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 7b95ced7bcba..3a6e1d53f287 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -397,13 +397,12 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) blob_fd_array_off(gen, i)); emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0)); emit(gen, BPF_EXIT_INSN()); - if (OPTS_GET(gen->opts, gen_hash, false)) - compute_sha_update_offsets(gen); - - pr_debug("gen: finish %s\n", errstr(gen->error)); if (!gen->error) { struct gen_loader_opts *opts = gen->opts; + if (OPTS_GET(opts, gen_hash, false)) + compute_sha_update_offsets(gen); + opts->insns = gen->insn_start; opts->insns_sz = gen->insn_cur - gen->insn_start; opts->data = gen->data_start; @@ -418,6 +417,7 @@ int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) bpf_insn_bswap(insn++); } } + pr_debug("gen: finish %s\n", errstr(gen->error)); return gen->error; } -- cgit v1.2.3 From d2f7bd066ed492aeaf82864fbf1f06770f9d9f9d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 May 2026 11:41:18 +0200 Subject: libbpf: Also reset {insn,data}_cur on realloc failure realloc_insn_buf() as well as realloc_data_buf() free and NULL gen->insn_start / gen->data_start on -ENOMEM but leave gen->insn_cur / gen->data_cur pointing into the old, freed buffer. Just reset the cursors to NULL alongside the base pointers so the freed state is coherent. 
Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260529094119.307264-3-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/gen_loader.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 3a6e1d53f287..492360ca07ea 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -63,6 +63,7 @@ static int realloc_insn_buf(struct bpf_gen *gen, __u32 size) gen->error = -ENOMEM; free(gen->insn_start); gen->insn_start = NULL; + gen->insn_cur = NULL; return -ENOMEM; } gen->insn_start = insn_start; @@ -86,6 +87,7 @@ static int realloc_data_buf(struct bpf_gen *gen, __u32 size) gen->error = -ENOMEM; free(gen->data_start); gen->data_start = NULL; + gen->data_cur = NULL; return -ENOMEM; } gen->data_start = data_start; -- cgit v1.2.3 From 41300d032a1b1d91a3ed996ad21905463e344beb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 May 2026 18:28:29 +0200 Subject: libbpf: Skip endianness swap when loader generation failed bpf_gen__prog_load() byte-swaps the program insns and the {func,line}_info and CO-RE relo blobs in place for cross-endian targets. The blob offsets come from add_data(), which returns 0 on failure: realloc_data_buf() either frees and NULLs gen->data_start (realloc OOM) or returns early on an already-latched gen->error, leaving a stale, possibly too-small buffer. Neither bswap site checked for this. With gen->swapped_endian set and a failed generation, "gen->data_start + off" becomes NULL + 0. Guard the same way via !gen->error so they are skipped once generation has failed. 
Fixes: 8ca3323dce43 ("libbpf: Support creating light skeleton of either endianness") Reported-by: sashiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260529162829.315921-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/gen_loader.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 492360ca07ea..3702c5944bc0 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -1054,7 +1054,7 @@ void bpf_gen__prog_load(struct bpf_gen *gen, prog_idx, prog_type, insns_off, insn_cnt, license_off); /* convert blob insns to target endianness */ - if (gen->swapped_endian) { + if (gen->swapped_endian && !gen->error) { struct bpf_insn *insn = gen->data_start + insns_off; int i; @@ -1092,7 +1092,7 @@ void bpf_gen__prog_load(struct bpf_gen *gen, sizeof(struct bpf_core_relo)); /* convert all info blobs to target endianness */ - if (gen->swapped_endian) + if (gen->swapped_endian && !gen->error) info_blob_bswap(gen, func_info, line_info, core_relos, load_attr); libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name)); -- cgit v1.2.3 From 12a585e607fa6e3fbe2c02158c7ad284cbf75792 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 28 May 2026 09:17:47 -0700 Subject: bpf, arm64: Fix redundant MOV and clarify stack arg comments emit_stack_arg_store_imm() materializes the immediate into tmp and then moves tmp to the target register (x5-x7). Emit the immediate directly into the target register to avoid the redundant MOV. While here, qualify the bare "FP" in the stack-layout ASCII art as "A64_FP" so it is not confused with BPF_FP, and note that incoming stack arguments sit above the FP/LR pair pushed by the callee prologue. 
Suggested-by: Will Deacon Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260528161750.1900674-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index e3bbeaa94590..b4abc3138f37 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -546,7 +546,8 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) * low * * Stack args 6-8 are passed in x5-x7, args 9+ at [SP]. - * Incoming args 9+ are at [FP + 16], [FP + 24], ... + * Incoming args 9+ are at [A64_FP + 16], [A64_FP + 24], ... + * (above the saved FP/LR pair pushed in the callee prologue). */ emit_kcfi(is_main_prog ? cfi_bpf_hash : cfi_bpf_subprog_hash, ctx); @@ -1235,11 +1236,12 @@ static void emit_stack_arg_store_imm(s32 imm, s16 bpf_off, const u8 tmp, struct { int idx = -bpf_off / sizeof(u64) - 1; - emit_a64_mov_i(1, tmp, imm, ctx); - if (idx < NR_STACK_ARG_REGS) - emit(A64_MOV(1, stack_arg_reg[idx], tmp), ctx); - else + if (idx < NR_STACK_ARG_REGS) { + emit_a64_mov_i(1, stack_arg_reg[idx], imm, ctx); + } else { + emit_a64_mov_i(1, tmp, imm, ctx); emit(A64_STR64I(tmp, A64_SP, (idx - NR_STACK_ARG_REGS) * sizeof(u64)), ctx); + } } /* JITs an eBPF instruction. -- cgit v1.2.3 From 157317ba662a7c476320fdb334216154eaa8b856 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 28 May 2026 09:17:48 -0700 Subject: selftests/bpf: Use at least 10 args in stack argument tests On arm64, the first 8 arguments are passed in registers (x0-x7), so tests with 8 or fewer arguments never exercise the native stack argument path in the JIT. Increase argument counts to at least 10 across all BPF-to-BPF subprog and kfunc stack argument tests so that at least 2 arguments land on the arm64 stack. 
For the two-callees test, bump foo1 from 8 to 10 and foo2 from 10 to 12 args to preserve the different-stack-depth flavor of the test. The bpf_kfunc_call_stack_arg_mem kfunc is left unchanged at 7 args to avoid breaking the precision backtracking test which relies on hardcoded verifier log instruction indices. Suggested-by: Will Deacon Signed-off-by: Puranjay Mohan Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260528161750.1900674-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/stack_arg.c | 30 ++++---- tools/testing/selftests/bpf/progs/stack_arg.c | 90 +++++++++++++--------- .../testing/selftests/bpf/progs/stack_arg_kfunc.c | 24 +++--- .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 25 +++--- .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 11 ++- 5 files changed, 108 insertions(+), 72 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/stack_arg.c b/tools/testing/selftests/bpf/prog_tests/stack_arg.c index d61bac33f809..57193543f260 100644 --- a/tools/testing/selftests/bpf/prog_tests/stack_arg.c +++ b/tools/testing/selftests/bpf/prog_tests/stack_arg.c @@ -37,7 +37,7 @@ static void test_global_many(void) if (!ASSERT_OK(stack_arg__load(skel), "load")) goto out; - run_subtest(skel->progs.test_global_many_args, 36); + run_subtest(skel->progs.test_global_many_args, 55); out: stack_arg__destroy(skel); @@ -62,10 +62,10 @@ static void test_async_cb_many(void) run_subtest(skel->progs.test_async_cb_many_args, 0); /* Wait for the timer callback to fire and verify the result. 
- * 10+20+30+40+50+60+70+80 = 360 + * 10+20+30+40+50+60+70+80+90+100 = 550 */ usleep(50); - ASSERT_EQ(skel->bss->timer_result, 360, "timer_result"); + ASSERT_EQ(skel->bss->timer_result, 550, "timer_result"); out: stack_arg__destroy(skel); @@ -87,11 +87,11 @@ static void test_bpf2bpf(void) if (!ASSERT_OK(stack_arg__load(skel), "load")) goto out; - run_subtest(skel->progs.test_bpf2bpf_ptr_stack_arg, 45); - run_subtest(skel->progs.test_bpf2bpf_mix_stack_args, 51); - run_subtest(skel->progs.test_bpf2bpf_nesting_stack_arg, 50); - run_subtest(skel->progs.test_bpf2bpf_dynptr_stack_arg, 69); - run_subtest(skel->progs.test_two_callees, 91); + run_subtest(skel->progs.test_bpf2bpf_ptr_stack_arg, 75); + run_subtest(skel->progs.test_bpf2bpf_mix_stack_args, 66); + run_subtest(skel->progs.test_bpf2bpf_nesting_stack_arg, 84); + run_subtest(skel->progs.test_bpf2bpf_dynptr_stack_arg, 99); + run_subtest(skel->progs.test_two_callees, 133); out: stack_arg__destroy(skel); @@ -113,14 +113,14 @@ static void test_kfunc(void) if (!ASSERT_OK(stack_arg_kfunc__load(skel), "load")) goto out; - run_subtest(skel->progs.test_stack_arg_scalar, 36); - run_subtest(skel->progs.test_stack_arg_ptr, 45); - run_subtest(skel->progs.test_stack_arg_mix, 51); - run_subtest(skel->progs.test_stack_arg_dynptr, 69); + run_subtest(skel->progs.test_stack_arg_scalar, 55); + run_subtest(skel->progs.test_stack_arg_ptr, 75); + run_subtest(skel->progs.test_stack_arg_mix, 66); + run_subtest(skel->progs.test_stack_arg_dynptr, 99); run_subtest(skel->progs.test_stack_arg_mem, 151); - run_subtest(skel->progs.test_stack_arg_iter, 115); - run_subtest(skel->progs.test_stack_arg_const_str, 15); - run_subtest(skel->progs.test_stack_arg_timer, 15); + run_subtest(skel->progs.test_stack_arg_iter, 145); + run_subtest(skel->progs.test_stack_arg_const_str, 45); + run_subtest(skel->progs.test_stack_arg_timer, 45); out: stack_arg_kfunc__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/stack_arg.c 
b/tools/testing/selftests/bpf/progs/stack_arg.c index b5e9929a4d63..944e3bb603e7 100644 --- a/tools/testing/selftests/bpf/progs/stack_arg.c +++ b/tools/testing/selftests/bpf/progs/stack_arg.c @@ -27,14 +27,16 @@ int timer_result; const volatile bool has_stack_arg = true; __noinline static int static_func_many_args(int a, int b, int c, int d, - int e, int f, int g, int h) + int e, int f, int g, int h, + int i, int j) { - return a + b + c + d + e + f + g + h; + return a + b + c + d + e + f + g + h + i + j; } __noinline int global_calls_many_args(int a, int b, int c) { - return static_func_many_args(a, b, c, 4, 5, 6, 7, 8); + return static_func_many_args(a, b, c, a + 3, a + 4, a + 5, a + 6, + a + 7, a + 8, a + 9); } SEC("tc") @@ -48,18 +50,20 @@ struct test_data { long y; }; -/* 1 + 2 + 3 + 4 + 5 + 10 + 20 = 45 */ +/* 1+2+3+4+5+6+7+8+9+10+20 = 75 */ __noinline static long func_with_ptr_stack_arg(long a, long b, long c, long d, - long e, struct test_data *p) + long e, long f, long g, long h, + long i, struct test_data *p) { - return a + b + c + d + e + p->x + p->y; + return a + b + c + d + e + f + g + h + i + p->x + p->y; } __noinline long global_ptr_stack_arg(long a, long b, long c, long d, long e) { struct test_data data = { .x = 10, .y = 20 }; - return func_with_ptr_stack_arg(a, b, c, d, e, &data); + return func_with_ptr_stack_arg(a, b, c, d, e, a + 5, a + 6, a + 7, + a + 8, &data); } SEC("tc") @@ -68,12 +72,13 @@ int test_bpf2bpf_ptr_stack_arg(void) return global_ptr_stack_arg(1, 2, 3, 4, 5); } -/* 1 + 2 + 3 + 4 + 5 + 10 + 6 + 20 = 51 */ +/* 1+2+3+4+5+6+7+10+8+20 = 66 */ __noinline static long func_with_mix_stack_args(long a, long b, long c, long d, - long e, struct test_data *p, - long f, struct test_data *q) + long e, long f, long g, + struct test_data *p, + long h, struct test_data *q) { - return a + b + c + d + e + p->x + f + q->y; + return a + b + c + d + e + f + g + p->x + h + q->y; } __noinline long global_mix_stack_args(long a, long b, long c, long d, long 
e) @@ -81,7 +86,8 @@ __noinline long global_mix_stack_args(long a, long b, long c, long d, long e) struct test_data p = { .x = 10 }; struct test_data q = { .y = 20 }; - return func_with_mix_stack_args(a, b, c, d, e, &p, e + 1, &q); + return func_with_mix_stack_args(a, b, c, d, e, e + 1, e + 2, &p, + e + 3, &q); } SEC("tc") @@ -94,26 +100,30 @@ int test_bpf2bpf_mix_stack_args(void) * Nesting test: func_outer calls func_inner, both with struct pointer * as stack arg. * - * func_inner: (a+1) + (b+1) + (c+1) + (d+1) + (e+1) + p->x + p->y - * = 2 + 3 + 4 + 5 + 6 + 10 + 20 = 50 + * func_inner: (a+1)+...+(i+1) + p->x + p->y + * = 2+3+4+5+6+7+8+9+10+10+20 = 84 */ __noinline static long func_inner_ptr(long a, long b, long c, long d, - long e, struct test_data *p) + long e, long f, long g, long h, + long i, struct test_data *p) { - return a + b + c + d + e + p->x + p->y; + return a + b + c + d + e + f + g + h + i + p->x + p->y; } __noinline static long func_outer_ptr(long a, long b, long c, long d, - long e, struct test_data *p) + long e, long f, long g, long h, + long i, struct test_data *p) { - return func_inner_ptr(a + 1, b + 1, c + 1, d + 1, e + 1, p); + return func_inner_ptr(a + 1, b + 1, c + 1, d + 1, e + 1, + f + 1, g + 1, h + 1, i + 1, p); } __noinline long global_nesting_ptr(long a, long b, long c, long d, long e) { struct test_data data = { .x = 10, .y = 20 }; - return func_outer_ptr(a, b, c, d, e, &data); + return func_outer_ptr(a, b, c, d, e, a + 5, a + 6, a + 7, a + 8, + &data); } SEC("tc") @@ -122,11 +132,12 @@ int test_bpf2bpf_nesting_stack_arg(void) return global_nesting_ptr(1, 2, 3, 4, 5); } -/* 1 + 2 + 3 + 4 + 5 + sizeof(pkt_v4) = 15 + 54 = 69 */ +/* 1+2+3+4+5+6+7+8+9+sizeof(pkt_v4) = 45+54 = 99 */ __noinline static long func_with_dynptr(long a, long b, long c, long d, - long e, struct bpf_dynptr *ptr) + long e, long f, long g, long h, + long i, struct bpf_dynptr *ptr) { - return a + b + c + d + e + bpf_dynptr_size(ptr); + return a + b + c + d + e + f + g + 
h + i + bpf_dynptr_size(ptr); } __noinline long global_dynptr_stack_arg(void *ctx __arg_ctx, long a, long b, @@ -135,7 +146,8 @@ __noinline long global_dynptr_stack_arg(void *ctx __arg_ctx, long a, long b, struct bpf_dynptr ptr; bpf_dynptr_from_skb(ctx, 0, &ptr); - return func_with_dynptr(a, b, c, d, d + 1, &ptr); + return func_with_dynptr(a, b, c, d, d + 1, d + 2, d + 3, d + 4, + d + 5, &ptr); } SEC("tc") @@ -144,24 +156,25 @@ int test_bpf2bpf_dynptr_stack_arg(struct __sk_buff *skb) return global_dynptr_stack_arg(skb, 1, 2, 3, 4); } -/* foo1: a+b+c+d+e+f+g+h */ -__noinline static int foo1(int a, int b, int c, int d, - int e, int f, int g, int h) +/* foo1: a+b+c+d+e+f+g+h+i+j */ +__noinline static int foo1(int a, int b, int c, int d, int e, + int f, int g, int h, int i, int j) { - return a + b + c + d + e + f + g + h; + return a + b + c + d + e + f + g + h + i + j; } -/* foo2: a+b+c+d+e+f+g+h+i+j */ +/* foo2: a+b+c+d+e+f+g+h+i+j+k+l */ __noinline static int foo2(int a, int b, int c, int d, int e, - int f, int g, int h, int i, int j) + int f, int g, int h, int i, int j, + int k, int l) { - return a + b + c + d + e + f + g + h + i + j; + return a + b + c + d + e + f + g + h + i + j + k + l; } -/* global_two_callees calls foo1 (3 stack args) and foo2 (5 stack args). +/* global_two_callees calls foo1 (5 stack args) and foo2 (7 stack args). * The outgoing stack arg area is sized for foo2 (the larger callee). * Stores for foo1 are a subset of the area used by foo2. - * Result: foo1(1,2,3,4,5,6,7,8) + foo2(1,2,3,4,5,6,7,8,9,10) = 36 + 55 = 91 + * Result: foo1(1..10) + foo2(1..12) = 55 + 78 = 133 * * Pass a-e through so the compiler can't constant-fold the stack args away. 
*/ @@ -169,8 +182,9 @@ __noinline int global_two_callees(int a, int b, int c, int d, int e) { int ret; - ret = foo1(a, b, c, d, e, a + 5, a + 6, a + 7); - ret += foo2(a, b, c, d, e, a + 5, a + 6, a + 7, a + 8, a + 9); + ret = foo1(a, b, c, d, e, a + 5, a + 6, a + 7, a + 8, a + 9); + ret += foo2(a, b, c, d, e, a + 5, a + 6, a + 7, a + 8, a + 9, + a + 10, a + 11); return ret; } @@ -180,9 +194,15 @@ int test_two_callees(void) return global_two_callees(1, 2, 3, 4, 5); } +const volatile int timer_base = 10; + static int timer_cb_many_args(void *map, int *key, struct bpf_timer *timer) { - timer_result = static_func_many_args(10, 20, 30, 40, 50, 60, 70, 80); + int v = timer_base; + + timer_result = static_func_many_args(v, v * 2, v * 3, v * 4, v * 5, + v * 6, v * 7, v * 8, v * 9, + v * 10); return 0; } diff --git a/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c b/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c index da0d4f91d273..345f2da2e361 100644 --- a/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c +++ b/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c @@ -33,7 +33,7 @@ struct { SEC("tc") int test_stack_arg_scalar(struct __sk_buff *skb) { - return bpf_kfunc_call_stack_arg(1, 2, 3, 4, 5, 6, 7, 8); + return bpf_kfunc_call_stack_arg(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); } SEC("tc") @@ -41,7 +41,7 @@ int test_stack_arg_ptr(struct __sk_buff *skb) { struct prog_test_pass1 p = { .x0 = 10, .x1 = 20 }; - return bpf_kfunc_call_stack_arg_ptr(1, 2, 3, 4, 5, &p); + return bpf_kfunc_call_stack_arg_ptr(1, 2, 3, 4, 5, 6, 7, 8, 9, &p); } SEC("tc") @@ -50,17 +50,17 @@ int test_stack_arg_mix(struct __sk_buff *skb) struct prog_test_pass1 p = { .x0 = 10 }; struct prog_test_pass1 q = { .x1 = 20 }; - return bpf_kfunc_call_stack_arg_mix(1, 2, 3, 4, 5, &p, 6, &q); + return bpf_kfunc_call_stack_arg_mix(1, 2, 3, 4, 5, 6, 7, &p, 8, &q); } -/* 1 + 2 + 3 + 4 + 5 + sizeof(pkt_v4) = 15 + 54 = 69 */ +/* 1+2+3+4+5+6+7+8+9+sizeof(pkt_v4) = 45+54 = 99 */ SEC("tc") int 
test_stack_arg_dynptr(struct __sk_buff *skb) { struct bpf_dynptr ptr; bpf_dynptr_from_skb(skb, 0, &ptr); - return bpf_kfunc_call_stack_arg_dynptr(1, 2, 3, 4, 5, &ptr); + return bpf_kfunc_call_stack_arg_dynptr(1, 2, 3, 4, 5, 6, 7, 8, 9, &ptr); } /* 1 + 2 + 3 + 4 + 5 + (1 + 2 + ... + 16) = 15 + 136 = 151 */ @@ -72,7 +72,7 @@ int test_stack_arg_mem(struct __sk_buff *skb) return bpf_kfunc_call_stack_arg_mem(1, 2, 3, 4, 5, buf, sizeof(buf)); } -/* 1 + 2 + 3 + 4 + 5 + 100 = 115 */ +/* 1+2+3+4+5+6+7+8+9+100 = 145 */ SEC("tc") int test_stack_arg_iter(struct __sk_buff *skb) { @@ -80,21 +80,22 @@ int test_stack_arg_iter(struct __sk_buff *skb) u64 ret; bpf_iter_testmod_seq_new(&it, 100, 10); - ret = bpf_kfunc_call_stack_arg_iter(1, 2, 3, 4, 5, &it); + ret = bpf_kfunc_call_stack_arg_iter(1, 2, 3, 4, 5, 6, 7, 8, 9, &it); bpf_iter_testmod_seq_destroy(&it); return ret; } const char cstr[] = "hello"; -/* 1 + 2 + 3 + 4 + 5 = 15 */ +/* 1+2+3+4+5+6+7+8+9 = 45 */ SEC("tc") int test_stack_arg_const_str(struct __sk_buff *skb) { - return bpf_kfunc_call_stack_arg_const_str(1, 2, 3, 4, 5, cstr); + return bpf_kfunc_call_stack_arg_const_str(1, 2, 3, 4, 5, 6, 7, 8, 9, + cstr); } -/* 1 + 2 + 3 + 4 + 5 = 15 */ +/* 1+2+3+4+5+6+7+8+9 = 45 */ SEC("tc") int test_stack_arg_timer(struct __sk_buff *skb) { @@ -104,7 +105,8 @@ int test_stack_arg_timer(struct __sk_buff *skb) val = bpf_map_lookup_elem(&kfunc_timer_map, &key); if (!val) return 0; - return bpf_kfunc_call_stack_arg_timer(1, 2, 3, 4, 5, &val->timer); + return bpf_kfunc_call_stack_arg_timer(1, 2, 3, 4, 5, 6, 7, 8, 9, + &val->timer); } #else diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 0be918fe3021..30f1cd23093c 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -826,30 +826,34 @@ __bpf_kfunc int bpf_kfunc_call_test5(u8 a, u16 b, u32 c) } __bpf_kfunc u64 bpf_kfunc_call_stack_arg(u64 a, 
u64 b, u64 c, u64 d, - u64 e, u64 f, u64 g, u64 h) + u64 e, u64 f, u64 g, u64 h, + u64 i, u64 j) { - return a + b + c + d + e + f + g + h; + return a + b + c + d + e + f + g + h + i + j; } __bpf_kfunc u64 bpf_kfunc_call_stack_arg_ptr(u64 a, u64 b, u64 c, u64 d, u64 e, + u64 f, u64 g, u64 h, u64 i, struct prog_test_pass1 *p) { - return a + b + c + d + e + p->x0 + p->x1; + return a + b + c + d + e + f + g + h + i + p->x0 + p->x1; } __bpf_kfunc u64 bpf_kfunc_call_stack_arg_mix(u64 a, u64 b, u64 c, u64 d, u64 e, - struct prog_test_pass1 *p, u64 f, + u64 f, u64 g, + struct prog_test_pass1 *p, u64 h, struct prog_test_pass1 *q) { - return a + b + c + d + e + p->x0 + f + q->x1; + return a + b + c + d + e + f + g + p->x0 + h + q->x1; } __bpf_kfunc u64 bpf_kfunc_call_stack_arg_dynptr(u64 a, u64 b, u64 c, u64 d, u64 e, + u64 f, u64 g, u64 h, u64 i, struct bpf_dynptr *ptr) { const struct bpf_dynptr_kern *kern_ptr = (void *)ptr; - return a + b + c + d + e + (kern_ptr->size & 0xFFFFFF); + return a + b + c + d + e + f + g + h + i + (kern_ptr->size & 0xFFFFFF); } __bpf_kfunc u64 bpf_kfunc_call_stack_arg_mem(u64 a, u64 b, u64 c, u64 d, u64 e, @@ -865,21 +869,24 @@ __bpf_kfunc u64 bpf_kfunc_call_stack_arg_mem(u64 a, u64 b, u64 c, u64 d, u64 e, } __bpf_kfunc u64 bpf_kfunc_call_stack_arg_iter(u64 a, u64 b, u64 c, u64 d, u64 e, + u64 f, u64 g, u64 h, u64 i, struct bpf_iter_testmod_seq *it__iter) { - return a + b + c + d + e + it__iter->value; + return a + b + c + d + e + f + g + h + i + it__iter->value; } __bpf_kfunc u64 bpf_kfunc_call_stack_arg_const_str(u64 a, u64 b, u64 c, u64 d, u64 e, + u64 f, u64 g, u64 h, u64 i, const char *str__str) { - return a + b + c + d + e; + return a + b + c + d + e + f + g + h + i; } __bpf_kfunc u64 bpf_kfunc_call_stack_arg_timer(u64 a, u64 b, u64 c, u64 d, u64 e, + u64 f, u64 g, u64 h, u64 i, struct bpf_timer *timer) { - return a + b + c + d + e; + return a + b + c + d + e + f + g + h + i; } __bpf_kfunc u64 bpf_kfunc_call_stack_arg_big(u64 a, u64 b, u64 
c, u64 d, u64 e, diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 2edc36b66de9..c36bb911defa 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -119,21 +119,28 @@ struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; long bpf_kfunc_call_test4(signed char a, short b, int c, long d) __ksym; int bpf_kfunc_call_test5(__u8 a, __u16 b, __u32 c) __ksym; __u64 bpf_kfunc_call_stack_arg(__u64 a, __u64 b, __u64 c, __u64 d, - __u64 e, __u64 f, __u64 g, __u64 h) __ksym; + __u64 e, __u64 f, __u64 g, __u64 h, + __u64 i, __u64 j) __ksym; __u64 bpf_kfunc_call_stack_arg_ptr(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + __u64 f, __u64 g, __u64 h, __u64 i, struct prog_test_pass1 *p) __ksym; __u64 bpf_kfunc_call_stack_arg_mix(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, - struct prog_test_pass1 *p, __u64 f, + __u64 f, __u64 g, + struct prog_test_pass1 *p, __u64 h, struct prog_test_pass1 *q) __ksym; __u64 bpf_kfunc_call_stack_arg_dynptr(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + __u64 f, __u64 g, __u64 h, __u64 i, struct bpf_dynptr *ptr) __ksym; __u64 bpf_kfunc_call_stack_arg_mem(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, void *mem, int mem__sz) __ksym; __u64 bpf_kfunc_call_stack_arg_iter(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + __u64 f, __u64 g, __u64 h, __u64 i, struct bpf_iter_testmod_seq *it__iter) __ksym; __u64 bpf_kfunc_call_stack_arg_const_str(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + __u64 f, __u64 g, __u64 h, __u64 i, const char *str__str) __ksym; __u64 bpf_kfunc_call_stack_arg_timer(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + __u64 f, __u64 g, __u64 h, __u64 i, struct bpf_timer *timer) __ksym; __u64 bpf_kfunc_call_stack_arg_big(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, struct prog_test_big_arg s) __ksym; -- cgit v1.2.3 From de36adca634634c205a9eb8b56a28175ab7abf5f Mon Sep 
17 00:00:00 2001 From: Taegu Ha Date: Thu, 28 May 2026 15:21:55 +0900 Subject: bpf: reject overlarge global subprog argument sizes Global subprogram argument checking derives generic pointer sizes from BTF and passes the resolved size to check_mem_reg() as a u32. The access-size validation path then uses a signed int, and stack pointers negate the value before calling check_helper_mem_access(). This creates a wrap when BTF describes a pointee size larger than S32_MAX. For example, a global subprogram argument of type: int (*p)[0x3fffffff] has a BTF-resolved pointee size of 0xfffffffc bytes. At a call site the caller can pass a pointer to a 4-byte stack slot at fp-4. The current PTR_TO_STACK path computes: size = -(int)mem_size so 0xfffffffc becomes -4 as a signed int and the negation validates only a 4-byte stack range. That range is covered by the caller's stack slot, so the call is accepted. The callee is then verified independently with R1 as PTR_TO_MEM and mem_size 0xfffffffc. A small instruction such as: r0 = *(u32 *)(r1 + 4) is accepted as being inside that BTF-described memory region. At run time, however, the actual argument value is still fp-4, so r1 + 4 addresses fp+0, outside the 4-byte object that the caller provided. Reject sizes that cannot be represented by the verifier's signed access-size API before the stack-specific negation. Add a verifier regression test for the oversized BTF argument. 
Fixes: 2cb27158adb3 ("bpf: poison dead stack slots") Signed-off-by: Taegu Ha Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260528062155.3988156-1-hataegu0826@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++++ .../selftests/bpf/progs/verifier_global_subprogs.c | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c8d980fdd709..3a270bc485c2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6927,6 +6927,12 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg if (bpf_register_is_null(reg)) return 0; + if (mem_size > S32_MAX) { + verbose(env, "%s memory size %u is too large\n", + reg_arg_name(env, argno), mem_size); + return -EACCES; + } + /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index dc09d0e2d8ad..75a2e3f48d0f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -152,6 +152,23 @@ int anon_user_mem_valid(void *ctx) return subprog_user_anon_mem(&t); } +__noinline __weak int subprog_user_anon_mem_huge(int (*p)[0x3fffffff]) +{ + return p ? 
(*p)[1] : 0; +} + +SEC("?tracepoint") +__failure __log_level(2) +__msg("R1 memory size 4294967292 is too large") +int anon_user_mem_huge_size_invalid(void *ctx) +{ + int (*p)[0x3fffffff]; + int tiny = 42; + + p = (void *)&tiny; + return subprog_user_anon_mem_huge(p) + tiny; +} + __noinline __weak int subprog_nonnull_ptr_good(int *p1 __arg_nonnull, int *p2 __arg_nonnull) { return (*p1) * (*p2); /* good, no need for NULL checks */ -- cgit v1.2.3 From 868d43cf8f970b456fd93334bee40f792cf27e4d Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sat, 23 May 2026 12:00:26 -0400 Subject: bpf: Fix security_bpf_prog_load() error handling If security_bpf_prog_load() fails there is no need to call into security_bpf_prog_free() as the LSM will handle the cleanup of any partial LSM state before returning to the caller with an error. Thankfully this isn't an issue with any of the existing code as the LSMs which currently provide BPF hook callback implementations don't allocate any internal state, but this is something we want to fix for potential future users. 
Signed-off-by: Paul Moore Link: https://lore.kernel.org/r/20260523160025.16363-2-paul@paul-moore.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 93bbbe610a7a..2aafd2131983 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3136,7 +3136,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) - goto free_prog_sec; + goto free_prog; /* run eBPF verifier */ err = bpf_check(&prog, attr, uattr, attr_log); @@ -3182,8 +3182,6 @@ free_used_maps: __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; -free_prog_sec: - security_bpf_prog_free(prog); free_prog: free_uid(prog->aux->user); if (prog->aux->attach_btf) -- cgit v1.2.3 From bab08d1f70438cf06de9ab438313b7ef3099514c Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:24 -0700 Subject: bpf: Simplify mark_stack_slot_obj_read() and callers Rename mark_stack_slot_obj_read() as mark_stack_slots_scratched() and directly call it from functions processing iter, dynptr and irq_flag. Commit 6762e3a0bce5 ("bpf: simplify liveness to use (callsite, depth) keyed func_instances") has removed the dynamic liveness component in mark_stack_slot_obj_read(). The function effectively only marks stack slots as scratched and always succeed. Therefore, return void, drop the unused bpf_reg_state argument and rename it to mark_stack_slots_scratched() to reflect what it does now. In addition, to prepare for unifying dynptr handling, dynptr_get_spi() will be moved out of mark_dynptr_read(). As mark_dynptr_read() would join mark_iter_read() as a thin wrapper of mark_stack_slots_scratched(), just open code these helpers. 
Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-2-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 69 ++++++++++++++++----------------------------------- 1 file changed, 21 insertions(+), 48 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3a270bc485c2..d048f3033220 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3006,50 +3006,13 @@ out: return ret; } -static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi, int nr_slots) +static void mark_stack_slots_scratched(struct bpf_verifier_env *env, + int spi, int nr_slots) { int i; for (i = 0; i < nr_slots; i++) mark_stack_slot_scratched(env, spi - i); - return 0; -} - -static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - int spi; - - /* For CONST_PTR_TO_DYNPTR, it must have already been done by - * check_reg_arg in check_helper_call and mark_btf_func_reg_size in - * check_kfunc_call. - */ - if (reg->type == CONST_PTR_TO_DYNPTR) - return 0; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - /* Caller ensures dynptr is valid and initialized, which means spi is in - * bounds and spi is the first dynptr slot. Simply mark stack slot as - * read. 
- */ - return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS); -} - -static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi, int nr_slots) -{ - return mark_stack_slot_obj_read(env, reg, spi, nr_slots); -} - -static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - int spi; - - spi = irq_flag_get_spi(env, reg); - if (spi < 0) - return spi; - return mark_stack_slot_obj_read(env, reg, spi, 1); } /* This function is supposed to be used by the following 32-bit optimization @@ -7261,7 +7224,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) { - int err; + int spi, err = 0; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, @@ -7323,7 +7286,17 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat return -EINVAL; } - err = mark_dynptr_read(env, reg); + if (reg->type != CONST_PTR_TO_DYNPTR) { + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + + /* + * For CONST_PTR_TO_DYNPTR, reg is already scratched by check_reg_arg + * in check_helper_call and mark_btf_func_reg_size in check_kfunc_call. 
+ */ + mark_stack_slots_scratched(env, spi, BPF_DYNPTR_NR_SLOTS); + } } return err; } @@ -7433,9 +7406,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * if (spi < 0) return spi; - err = mark_iter_read(env, reg, spi, nr_slots); - if (err) - return err; + mark_stack_slots_scratched(env, spi, nr_slots); /* remember meta->iter info for process_iter_next_call() */ meta->iter.spi = spi; @@ -11399,7 +11370,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - int err, kfunc_class = IRQ_NATIVE_KFUNC; + int err, spi, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || @@ -11440,9 +11411,11 @@ static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state * return err; } - err = mark_irq_flag_read(env, reg); - if (err) - return err; + spi = irq_flag_get_spi(env, reg); + if (spi < 0) + return spi; + + mark_stack_slots_scratched(env, spi, 1); err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); if (err) -- cgit v1.2.3 From b5c0a07eb2c23bfd0c42ad6b461e6881b4b0995b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:25 -0700 Subject: bpf: Unify dynptr handling in the verifier Simplify dynptr checking for helper and kfunc by unifying it. Remember the initialized dynptr (i.e., !(arg_type & MEM_UNINIT)) passed to a dynptr kfunc during process_dynptr_func() so that we can easily retrieve the information for verification later. By saving it in meta->dynptr, there is no need to call dynptr helpers such as dynptr_id(), dynptr_ref_obj_id() and dynptr_type() in check_func_arg(). Remove and open code the helpers in process_dynptr_func() when saving id, ref_obj_id, and type.
Besides, since dynptr ref_obj_id information is now passed around in meta->bpf_dynptr_desc, drop the check in helper_multiple_ref_obj_use. Acked-by: Eduard Zingerman Acked-by: Mykyta Yatsenko Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-3-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 13 ++-- kernel/bpf/verifier.c | 178 ++++++++----------------------------------- 2 files changed, 40 insertions(+), 151 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5cbad3b64130..3a5c226bf1c3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1438,6 +1438,13 @@ struct bpf_map_desc { int uid; }; +/* The last initialized dynptr; Populated by process_dynptr_func() */ +struct bpf_dynptr_desc { + enum bpf_dynptr_type type; + u32 id; + u32 ref_obj_id; +}; + struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; @@ -1478,16 +1485,12 @@ struct bpf_kfunc_call_arg_meta { struct { struct btf_field *field; } arg_rbtree_root; - struct { - enum bpf_dynptr_type type; - u32 id; - u32 ref_obj_id; - } initialized_dynptr; struct { u8 spi; u8 frameno; } iter; struct bpf_map_desc map; + struct bpf_dynptr_desc dynptr; u64 mem_size; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d048f3033220..0c9792d42668 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -233,6 +233,7 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) struct bpf_call_arg_meta { struct bpf_map_desc map; + struct bpf_dynptr_desc dynptr; bool raw_mode; bool pkt_access; u8 release_regno; @@ -241,7 +242,6 @@ struct bpf_call_arg_meta { int mem_size; u64 msize_max_value; int ref_obj_id; - int dynptr_id; int func_id; struct btf *btf; u32 btf_id; @@ -470,11 +470,6 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; } -static bool is_dynptr_ref_function(enum bpf_func_id
func_id) -{ - return func_id == BPF_FUNC_dynptr_data; -} - static bool is_sync_callback_calling_kfunc(u32 btf_id); static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 btf_id); @@ -542,8 +537,6 @@ static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, ref_obj_uses++; if (is_acquire_function(func_id, map)) ref_obj_uses++; - if (is_dynptr_ref_function(func_id)) - ref_obj_uses++; return ref_obj_uses > 1; } @@ -7221,8 +7214,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * use case. The second level is tracked using the upper bit of bpf_dynptr->size * and checked dynamically during runtime. */ -static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, - enum bpf_arg_type arg_type, int clone_ref_obj_id) +static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + argno_t argno, int insn_idx, enum bpf_arg_type arg_type, + int clone_ref_obj_id, struct bpf_dynptr_desc *dynptr) { int spi, err = 0; @@ -7287,6 +7281,8 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat } if (reg->type != CONST_PTR_TO_DYNPTR) { + struct bpf_func_state *state = bpf_func(env, reg); + spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; @@ -7296,6 +7292,14 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat * in check_helper_call and mark_btf_func_reg_size in check_kfunc_call. 
*/ mark_stack_slots_scratched(env, spi, BPF_DYNPTR_NR_SLOTS); + + reg = &state->stack[spi].spilled_ptr; + } + + if (dynptr) { + dynptr->type = reg->dynptr.type; + dynptr->id = reg->id; + dynptr->ref_obj_id = reg->ref_obj_id; } } return err; @@ -8065,72 +8069,6 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, } } -static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env, - const struct bpf_func_proto *fn, - struct bpf_reg_state *regs) -{ - struct bpf_reg_state *state = NULL; - int i; - - for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) - if (arg_type_is_dynptr(fn->arg_type[i])) { - if (state) { - verbose(env, "verifier internal error: multiple dynptr args\n"); - return NULL; - } - state = &regs[BPF_REG_1 + i]; - } - - if (!state) - verbose(env, "verifier internal error: no dynptr arg found\n"); - - return state; -} - -static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->id; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - return state->stack[spi].spilled_ptr.id; -} - -static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->ref_obj_id; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - return state->stack[spi].spilled_ptr.ref_obj_id; -} - -static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->dynptr.type; - - spi = bpf_get_spi(reg->var_off.value); - if (spi < 0) { - verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); - return BPF_DYNPTR_TYPE_INVALID; - } - - return state->stack[spi].spilled_ptr.dynptr.type; -} -
static int check_arg_const_str(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno) { @@ -8488,7 +8426,8 @@ skip_type_check: true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, 0); + err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, 0, + &meta->dynptr); if (err) return err; break; @@ -9170,7 +9109,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (ret) return ret; - ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, 0); + ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, 0, NULL); if (ret) return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { @@ -10278,52 +10217,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } break; - case BPF_FUNC_dynptr_data: - { - struct bpf_reg_state *reg; - int id, ref_obj_id; - - reg = get_dynptr_arg_reg(env, fn, regs); - if (!reg) - return -EFAULT; - - - if (meta.dynptr_id) { - verifier_bug(env, "meta.dynptr_id already set"); - return -EFAULT; - } - if (meta.ref_obj_id) { - verifier_bug(env, "meta.ref_obj_id already set"); - return -EFAULT; - } - - id = dynptr_id(env, reg); - if (id < 0) { - verifier_bug(env, "failed to obtain dynptr id"); - return id; - } - - ref_obj_id = dynptr_ref_obj_id(env, reg); - if (ref_obj_id < 0) { - verifier_bug(env, "failed to obtain dynptr ref_obj_id"); - return ref_obj_id; - } - - meta.dynptr_id = id; - meta.ref_obj_id = ref_obj_id; - - break; - } case BPF_FUNC_dynptr_write: { - enum bpf_dynptr_type dynptr_type; - struct bpf_reg_state *reg; + enum bpf_dynptr_type dynptr_type = meta.dynptr.type; - reg = get_dynptr_arg_reg(env, fn, regs); - if (!reg) - return -EFAULT; - - dynptr_type = dynptr_get_type(env, reg); if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; @@ -10515,10 +10412,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn 
return -EFAULT; } - if (is_dynptr_ref_function(func_id)) - regs[BPF_REG_0].dynptr_id = meta.dynptr_id; - - if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; } else if (is_acquire_function(func_id, meta.map.ptr)) { @@ -10532,6 +10426,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].ref_obj_id = id; } + if (func_id == BPF_FUNC_dynptr_data) { + regs[BPF_REG_0].dynptr_id = meta.dynptr.id; + regs[BPF_REG_0].ref_obj_id = meta.dynptr.ref_obj_id; + } + err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); if (err) return err; @@ -12187,7 +12086,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { - enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; + enum bpf_dynptr_type parent_type = meta->dynptr.type; if (parent_type == BPF_DYNPTR_TYPE_INVALID) { verifier_bug(env, "no dynptr type for parent of clone"); @@ -12195,30 +12094,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); - clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; + clone_ref_obj_id = meta->dynptr.ref_obj_id; if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { verifier_bug(env, "missing ref obj id for parent of clone"); return -EFAULT; } } - ret = process_dynptr_func(env, reg, argno, insn_idx, - dynptr_arg_type, clone_ref_obj_id); + ret = process_dynptr_func(env, reg, argno, insn_idx, dynptr_arg_type, + clone_ref_obj_id, &meta->dynptr); if (ret < 0) return ret; - - if (!(dynptr_arg_type & MEM_UNINIT)) { - int id = dynptr_id(env, reg); - - if (id < 0) { - verifier_bug(env, "failed to obtain dynptr id"); - 
return id; - } - meta->initialized_dynptr.id = id; - meta->initialized_dynptr.type = dynptr_get_type(env, reg); - meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg); - } - break; } case KF_ARG_PTR_TO_ITER: @@ -12849,7 +12735,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { - enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); + enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->dynptr.type); mark_reg_known_zero(env, regs, BPF_REG_0); @@ -12873,11 +12759,11 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } } - if (!meta->initialized_dynptr.id) { + if (!meta->dynptr.id) { verifier_bug(env, "no dynptr id"); return -EFAULT; } - regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; + regs[BPF_REG_0].dynptr_id = meta->dynptr.id; /* we don't need to set BPF_REG_0's ref obj id * because packet slices are not refcounted (see @@ -13063,7 +12949,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.release_regno) { struct bpf_reg_state *reg = &regs[meta.release_regno]; - if (meta.initialized_dynptr.ref_obj_id) { + if (meta.dynptr.ref_obj_id) { err = unmark_stack_slots_dynptr(env, reg); } else { err = release_reference(env, reg->ref_obj_id); -- cgit v1.2.3 From 94ac7553361f3f20b0a60c13f9e7d0a859c73c12 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:26 -0700 Subject: bpf: Assign reg->id when getting referenced kptr from ctx Assign reg->id when getting referenced kptr from read program context to be consistent with R0 of KF_ACQUIRE kfunc. skb dynptr will track the referenced skb in qdisc programs using a new field reg->parent_id in a later patch.
Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-4-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0c9792d42668..a6974ff50514 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6208,8 +6208,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b } else { mark_reg_known_zero(env, regs, value_regno); - if (type_may_be_null(info.reg_type)) - regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the * insn. When the dst is PTR, it is for sure not @@ -6219,8 +6217,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b if (base_type(info.reg_type) == PTR_TO_BTF_ID) { regs[value_regno].btf = info.btf; regs[value_regno].btf_id = info.btf_id; + regs[value_regno].id = info.ref_obj_id; regs[value_regno].ref_obj_id = info.ref_obj_id; } + if (type_may_be_null(info.reg_type) && !regs[value_regno].id) + regs[value_regno].id = ++env->id_gen; } regs[value_regno].type = info.reg_type; } -- cgit v1.2.3 From 06d518a5583fa681dc5c204d578a894f5b11ffa5 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:27 -0700 Subject: bpf: Preserve reg->id of pointer objects after null-check Preserve reg->id of pointer objects after null-checking the register so that children objects derived from it can still refer to it in the new object relationship tracking mechanism introduced in a later patch. This change incurs a slight increase in the number of states in one selftest bpf object, rbtree_search.bpf.o. For Meta bpf objects, the increase of states is also negligible. 
Selftest BPF objects with insns_diff > 0 Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) ------------------------ --------- --------- -------------- ---------- ---------- ------------- rbtree_search 6820 7326 +506 (+7.42%) 379 398 +19 (+5.01%) Meta BPF objects with insns_diff > 0 Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) ------------------------ --------- --------- -------------- ---------- ---------- ------------- ned_imex_be_tclass 52 57 +5 (+9.62%) 5 6 +1 (+20.00%) ned_imex_be_tclass 52 57 +5 (+9.62%) 5 6 +1 (+20.00%) ned_skop_auto_flowlabel 523 526 +3 (+0.57%) 39 40 +1 (+2.56%) ned_skop_mss 289 292 +3 (+1.04%) 20 20 +0 (+0.00%) ned_skopt_bet_classifier 78 82 +4 (+5.13%) 8 8 +0 (+0.00%) dctcp_update_alpha 252 320 +68 (+26.98%) 21 27 +6 (+28.57%) dctcp_update_alpha 252 320 +68 (+26.98%) 21 27 +6 (+28.57%) ned_ts_func 119 126 +7 (+5.88%) 6 7 +1 (+16.67%) tw_egress 1119 1128 +9 (+0.80%) 95 96 +1 (+1.05%) tw_ingress 1128 1137 +9 (+0.80%) 95 96 +1 (+1.05%) tw_tproxy_router 4380 4465 +85 (+1.94%) 114 118 +4 (+3.51%) tw_tproxy_router4 3093 3170 +77 (+2.49%) 83 88 +5 (+6.02%) ttls_tc_ingress 34656 35717 +1061 (+3.06%) 936 970 +34 (+3.63%) tw_twfw_egress 222327 222338 +11 (+0.00%) 10563 10564 +1 (+0.01%) tw_twfw_ingress 78295 78299 +4 (+0.01%) 3825 3826 +1 (+0.03%) tw_twfw_tc_eg 222839 222859 +20 (+0.01%) 10584 10585 +1 (+0.01%) tw_twfw_tc_in 78295 78299 +4 (+0.01%) 3825 3826 +1 (+0.03%) tw_twfw_egress 8080 8085 +5 (+0.06%) 456 456 +0 (+0.00%) tw_twfw_ingress 8053 8056 +3 (+0.04%) 454 454 +0 (+0.00%) tw_twfw_tc_eg 8154 8174 +20 (+0.25%) 456 457 +1 (+0.22%) tw_twfw_tc_in 8060 8063 +3 (+0.04%) 455 455 +0 (+0.00%) tw_twfw_egress 222327 222338 +11 (+0.00%) 10563 10564 +1 (+0.01%) tw_twfw_ingress 78295 78299 +4 (+0.01%) 3825 3826 +1 (+0.03%) tw_twfw_tc_eg 222839 222859 +20 (+0.01%) 10584 10585 +1 (+0.01%) tw_twfw_tc_in 78295 78299 +4 (+0.01%) 3825 3826 +1 (+0.03%) tw_twfw_egress 8080 8085 +5 (+0.06%) 456 456 +0 
(+0.00%) tw_twfw_ingress 8053 8056 +3 (+0.04%) 454 454 +0 (+0.00%) tw_twfw_tc_eg 8154 8174 +20 (+0.25%) 456 457 +1 (+0.22%) tw_twfw_tc_in 8060 8063 +3 (+0.04%) 455 455 +0 (+0.00%) Looking into rbtree_search, the reason for such increase is that the verifier has to explore the main loop shown below for one more iteration until state pruning decides the current state is safe. long rbtree_search(void *ctx) { ... bpf_spin_lock(&glock0); rb_n = bpf_rbtree_root(&groot0); while (can_loop) { if (!rb_n) { bpf_spin_unlock(&glock0); return __LINE__; } n = rb_entry(rb_n, struct node_data, r0); if (lookup_key == n->key0) break; if (nr_gc < NR_NODES) gc_ns[nr_gc++] = rb_n; if (lookup_key < n->key0) rb_n = bpf_rbtree_left(&groot0, rb_n); else rb_n = bpf_rbtree_right(&groot0, rb_n); } ... } Below is what the verifier sees at the start of each iteration (65: may_goto) after preserving id of rb_n. Without id of rb_n, the verifier stops exploring the loop at iter 16. rb_n gc_ns[15] iter 15 257 257 iter 16 290 257 rb_n: idmap add 257->290 gc_ns[15]: check 257 != 290 --> state not equal iter 17 325 257 rb_n: idmap add 290->325 gc_ns[15]: idmap add 257->257 --> state safe Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-5-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a6974ff50514..0d8be0b68bd8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15576,15 +15576,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, mark_ptr_not_null_reg(reg); - if (!reg_may_point_to_spin_lock(reg)) { - /* For not-NULL ptr, reg->ref_obj_id will be reset - * in release_reference(). - * - * reg->id is still used by spin_lock ptr. Other - * than spin_lock ptr type, reg->id can be reset. 
- */ - reg->id = 0; - } + /* + * reg->id is preserved for object relationship tracking + * and spin_lock lock state tracking + */ } } -- cgit v1.2.3 From 308c7a0ae8859b34d9d90a3dff953b2d14242145 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:28 -0700 Subject: bpf: Refactor object relationship tracking and fix dynptr UAF bug Refactor object relationship tracking in the verifier and fix a dynptr use-after-free bug where file/skb dynptrs are not invalidated when the parent referenced object is freed. Add parent_id to bpf_reg_state to precisely track child-parent relationships. A child object's parent_id points to the parent object's id. This replaces the PTR_TO_MEM-specific dynptr_id. Remove ref_obj_id from bpf_reg_state by folding its role into the existing id field. Previously, id tracked pointer identity for null checking while ref_obj_id tracked the owning reference for lifetime management. These are now unified: acquire helpers and kfuncs set id to the acquired reference id, and release paths use id directly. Add reg_is_referenced() which checks if a register is referenced by looking up its id in the reference array. This replaces all former ref_obj_id checks. For release_reference(), invalidating an object now also invalidates all descendants by traversing the object tree. This is done using stack-based DFS to avoid recursive call chains of release_reference() -> unmark_stack_slots_dynptr() -> release_reference(). Referenced objects encountered during tree traversal are reported as leaked references. Add parent_id to bpf_reference_state to enable hierarchical reference tracking. When acquiring a reference, a parent_id can be specified to link the new reference to an existing one (e.g., referenced dynptrs acquire a reference with parent_id linking to the parent object's reference). 
Pointer casting: For pointer casting helpers (bpf_sk_fullsock, bpf_tcp_sock), instead of propagating ref_obj_id, the cast result reuses the same reference id as the source pointer. Since the cast may return NULL for a non-NULL input, the NULL case is explored as a separate verifier branch. This allows releasing any of the original or cast pointers to invalidate all others. Referenced dynptrs: When constructing a referenced dynptr, acquire an intermediate reference with parent_id linking to the parent referenced object. The dynptr and all clones share the same parent_id (pointing to the intermediate ref) but get unique ids for independent slice tracking. Releasing a referenced dynptr releases the parent reference, which in turn invalidates all clones and their derived slices. Owning to non-owning reference conversion: After converting owning to non-owning by clearing id (e.g., object(id=1) -> object(id=0)), the verifier releases the reference state via release_reference_nomark(). Note that the error message "reference has not been acquired before" in the helper and kfunc release paths is removed. This message was already unreachable. The verifier only calls release_reference() after confirming the reference is valid, so the condition could never trigger in practice.
Fixes: 870c28588afa ("bpf: net_sched: Add basic bpf qdisc kfuncs") Signed-off-by: Amery Hung Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260529014936.2811085-6-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 +- include/linux/bpf_verifier.h | 77 ++- kernel/bpf/btf.c | 2 +- kernel/bpf/fixups.c | 2 +- kernel/bpf/log.c | 18 +- kernel/bpf/states.c | 11 +- kernel/bpf/verifier.c | 560 ++++++++++----------- tools/testing/selftests/bpf/prog_tests/spin_lock.c | 4 +- tools/testing/selftests/bpf/progs/dynptr_fail.c | 4 +- .../selftests/bpf/progs/iters_state_safety.c | 4 +- .../selftests/bpf/progs/iters_testmod_seq.c | 12 +- 11 files changed, 338 insertions(+), 360 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1c6863ce89e0..d1a17c118316 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1062,7 +1062,7 @@ struct bpf_insn_access_aux { struct { struct btf *btf; u32 btf_id; - u32 ref_obj_id; + u32 ref_id; }; }; struct bpf_verifier_log *log; /* for verbose logs */ @@ -1631,7 +1631,7 @@ struct bpf_ctx_arg_aux { enum bpf_reg_type reg_type; struct btf *btf; u32 btf_id; - u32 ref_obj_id; + u32 ref_id; bool refcounted; }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3a5c226bf1c3..75b287d8d92f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -66,7 +66,6 @@ struct bpf_reg_state { struct { /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ u32 mem_size; - u32 dynptr_id; /* for dynptr slices */ }; /* For dynptr stack slots */ @@ -148,46 +147,14 @@ struct bpf_reg_state { #define BPF_ADD_CONST32 (1U << 30) #define BPF_ADD_CONST (BPF_ADD_CONST64 | BPF_ADD_CONST32) u32 id; - /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned - * from a pointer-cast helper, bpf_sk_fullsock() and - * bpf_tcp_sock(). 
- * - * Consider the following where "sk" is a reference counted - * pointer returned from "sk = bpf_sk_lookup_tcp();": - * - * 1: sk = bpf_sk_lookup_tcp(); - * 2: if (!sk) { return 0; } - * 3: fullsock = bpf_sk_fullsock(sk); - * 4: if (!fullsock) { bpf_sk_release(sk); return 0; } - * 5: tp = bpf_tcp_sock(fullsock); - * 6: if (!tp) { bpf_sk_release(sk); return 0; } - * 7: bpf_sk_release(sk); - * 8: snd_cwnd = tp->snd_cwnd; // verifier will complain - * - * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and - * "tp" ptr should be invalidated also. In order to do that, - * the reg holding "fullsock" and "sk" need to remember - * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id - * such that the verifier can reset all regs which have - * ref_obj_id matching the sk_reg->id. - * - * sk_reg->ref_obj_id is set to sk_reg->id at line 1. - * sk_reg->id will stay as NULL-marking purpose only. - * After NULL-marking is done, sk_reg->id can be reset to 0. - * - * After "fullsock = bpf_sk_fullsock(sk);" at line 3, - * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id. - * - * After "tp = bpf_tcp_sock(fullsock);" at line 5, - * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id - * which is the same as sk_reg->ref_obj_id. - * - * From the verifier perspective, if sk, fullsock and tp - * are not NULL, they are the same ptr with different - * reg->type. In particular, bpf_sk_release(tp) is also - * allowed and has the same effect as bpf_sk_release(sk). + /* + * Tracks the parent object this register was derived from. + * Used for cascading invalidation: when the parent object is + * released or invalidated, all registers with matching parent_id + * are also invalidated. For example, a slice from bpf_dynptr_data() + * gets parent_id set to the dynptr's id. 
*/ - u32 ref_obj_id; + u32 parent_id; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' @@ -364,10 +331,14 @@ struct bpf_reference_state { * is used purely to inform the user of a reference leak. */ int insn_idx; - /* Use to keep track of the source object of a lock, to ensure - * it matches on unlock. - */ - void *ptr; + union { + /* For REF_TYPE_PTR */ + int parent_id; + /* Use to keep track of the source object of a lock, to ensure + * it matches on unlock. + */ + void *ptr; + }; }; struct bpf_retval_range { @@ -585,7 +556,7 @@ bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) iter < frame->out_stack_arg_cnt; \ iter++, reg = bpf_get_spilled_stack_arg(iter, frame)) -#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr) \ +#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __stack, __mask, __expr) \ ({ \ struct bpf_verifier_state *___vstate = __vst; \ int ___i, ___j; \ @@ -593,6 +564,7 @@ bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) struct bpf_reg_state *___regs; \ __state = ___vstate->frame[___i]; \ ___regs = __state->regs; \ + __stack = NULL; \ for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ __reg = &___regs[___j]; \ (void)(__expr); \ @@ -600,8 +572,10 @@ bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \ if (!__reg) \ continue; \ + __stack = &__state->stack[___j]; \ (void)(__expr); \ } \ + __stack = NULL; \ bpf_for_each_spilled_stack_arg(___j, __state, __reg) { \ if (!__reg) \ continue; \ @@ -611,8 +585,13 @@ bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) }) /* Invoke __expr over regsiters in __vst, setting __state and __reg */ -#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ - bpf_for_each_reg_in_vstate_mask(__vst, __state, 
__reg, 1 << STACK_SPILL, __expr) +#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ + ({ \ + struct bpf_stack_state * ___stack; \ + (void)___stack; \ + bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, ___stack,\ + 1 << STACK_SPILL, __expr); \ + }) /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { @@ -1442,7 +1421,7 @@ struct bpf_map_desc { struct bpf_dynptr_desc { enum bpf_dynptr_type type; u32 id; - u32 ref_obj_id; + u32 parent_id; }; struct bpf_kfunc_call_arg_meta { @@ -1453,7 +1432,7 @@ struct bpf_kfunc_call_arg_meta { const struct btf_type *func_proto; const char *func_name; /* Out parameters */ - u32 ref_obj_id; + u32 id; u8 release_regno; bool r0_rdonly; u32 ret_btf_id; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 17d4ab0a8206..f429f6f58cb2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6957,7 +6957,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = ctx_arg_info->reg_type; info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; - info->ref_obj_id = ctx_arg_info->ref_obj_id; + info->ref_id = ctx_arg_info->ref_id; return true; } } diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 12739add2dda..5aa3f7d99ac9 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -870,7 +870,7 @@ int bpf_convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_UNTRUSTED: /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike - * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot + * PTR_TO_BTF_ID, and an active referenced id, but the same cannot * be said once it is marked PTR_UNTRUSTED, hence we must handle * any faults for loads into such types. BPF_WRITE is disallowed * for this case. 
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 62fe6ed18374..b740fa73ee26 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -665,8 +665,8 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose_a("id=%d", reg->id & ~BPF_ADD_CONST); if (reg->id & BPF_ADD_CONST) verbose(env, "%+d", reg->delta); - if (reg->ref_obj_id) - verbose_a("ref_obj_id=%d", reg->ref_obj_id); + if (reg->parent_id) + verbose_a("parent_id=%d", reg->parent_id); if (type_is_non_owning_ref(reg->type)) verbose_a("%s", "non_own_ref"); if (type_is_map_ptr(t)) { @@ -768,21 +768,19 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); if (reg->id) verbose_a("id=%d", reg->id); - if (reg->ref_obj_id) - verbose_a("ref_id=%d", reg->ref_obj_id); - if (reg->dynptr_id) - verbose_a("dynptr_id=%d", reg->dynptr_id); + if (reg->parent_id) + verbose_a("parent_id=%d", reg->parent_id); verbose(env, ")"); break; case STACK_ITER: - /* only main slot has ref_obj_id set; skip others */ - if (!reg->ref_obj_id) + /* only main slot has id set; skip others */ + if (!reg->id) continue; - verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)", + verbose(env, " fp%d=iter_%s(id=%d,state=%s,depth=%u)", (-i - 1) * BPF_REG_SIZE, iter_type_str(reg->iter.btf, reg->iter.btf_id), - reg->ref_obj_id, iter_state_str(reg->iter.state), + reg->id, iter_state_str(reg->iter.state), reg->iter.depth); break; case STACK_MISC: diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 877338136009..5945956a7573 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -489,7 +489,7 @@ static bool regs_exact(const struct bpf_reg_state *rold, { return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); + check_ids(rold->parent_id, rcur->parent_id, idmap); } enum exact_level { @@ -614,7 +614,7 @@ static bool 
regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off) && check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); + check_ids(rold->parent_id, rcur->parent_id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: /* We must have at least as much range as the old ptr @@ -794,7 +794,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, cur_reg = &cur->stack[spi].spilled_ptr; if (old_reg->dynptr.type != cur_reg->dynptr.type || old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + !check_ids(old_reg->id, cur_reg->id, idmap) || + !check_ids(old_reg->parent_id, cur_reg->parent_id, idmap)) return false; break; case STACK_ITER: @@ -810,13 +811,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, old_reg->iter.btf_id != cur_reg->iter.btf_id || old_reg->iter.state != cur_reg->iter.state || /* ignore {old_reg,cur_reg}->iter.depth, see above */ - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + !check_ids(old_reg->id, cur_reg->id, idmap)) return false; break; case STACK_IRQ_FLAG: old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || + if (!check_ids(old_reg->id, cur_reg->id, idmap) || old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) return false; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0d8be0b68bd8..6d82ca5acacb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -200,14 +200,14 @@ struct bpf_verifier_stack_elem { #define BPF_PRIV_STACK_MIN_SIZE 64 -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx); -static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id); -static int release_reference(struct bpf_verifier_env *env, int 
ref_obj_id); +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id); +static int release_reference_nomark(struct bpf_verifier_state *state, int id); +static int release_reference(struct bpf_verifier_env *env, int id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); -static bool is_trusted_reg(const struct bpf_reg_state *reg); +static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg); static inline bool in_sleepable_context(struct bpf_verifier_env *env); static const char *non_sleepable_context_description(struct bpf_verifier_env *env); static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg); @@ -241,7 +241,7 @@ struct bpf_call_arg_meta { int access_size; int mem_size; u64 msize_max_value; - int ref_obj_id; + u32 id; int func_id; struct btf *btf; u32 btf_id; @@ -339,7 +339,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval); } -static bool reg_not_null(const struct bpf_reg_state *reg) +static bool reg_not_null(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { enum bpf_reg_type type; @@ -353,7 +353,7 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MAP_VALUE || type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || - (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || + (type == PTR_TO_BTF_ID && is_trusted_reg(env, reg)) || (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) || type == CONST_PTR_TO_MAP; } @@ -638,43 +638,44 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) } } -static bool dynptr_type_refcounted(enum bpf_dynptr_type type) +static bool dynptr_type_referenced(enum bpf_dynptr_type type) { return type == 
BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE; } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot, int dynptr_id); + bool first_slot, int id, int parent_id); static void mark_dynptr_stack_regs(struct bpf_verifier_env *env, struct bpf_reg_state *sreg1, struct bpf_reg_state *sreg2, - enum bpf_dynptr_type type) + enum bpf_dynptr_type type, int parent_id) { int id = ++env->id_gen; - __mark_dynptr_reg(sreg1, type, true, id); - __mark_dynptr_reg(sreg2, type, false, id); + __mark_dynptr_reg(sreg1, type, true, id, parent_id); + __mark_dynptr_reg(sreg2, type, false, id, parent_id); } static void mark_dynptr_cb_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_dynptr_type type) { - __mark_dynptr_reg(reg, type, true, ++env->id_gen); + __mark_dynptr_reg(reg, type, true, ++env->id_gen, 0); } static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi); static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id) + enum bpf_arg_type arg_type, int insn_idx, int parent_id, + struct bpf_dynptr_desc *dynptr) { struct bpf_func_state *state = bpf_func(env, reg); - enum bpf_dynptr_type type; int spi, i, err; + enum bpf_dynptr_type type; spi = dynptr_get_spi(env, reg); if (spi < 0) @@ -705,85 +706,62 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr, - &state->stack[spi - 1].spilled_ptr, type); - - if (dynptr_type_refcounted(type)) { - /* The id is used to track proper releasing */ - int id; + if (dynptr->type == BPF_DYNPTR_TYPE_INVALID) { /* dynptr constructors */ + if (dynptr_type_referenced(type)) { + int id; - if (clone_ref_obj_id) - id = clone_ref_obj_id; - else - id = acquire_reference(env, insn_idx); - - if (id 
< 0) - return id; + /* + * Create an intermediate reference that tracks the referenced + * object for the referenced dynptr. Freeing a referenced dynptr + * through helpers/kfuncs will invalidate all clones. + */ + id = acquire_reference(env, insn_idx, parent_id); + if (id < 0) + return id; - state->stack[spi].spilled_ptr.ref_obj_id = id; - state->stack[spi - 1].spilled_ptr.ref_obj_id = id; + parent_id = id; + } + } else { /* bpf_dynptr_clone() */ + parent_id = dynptr->parent_id; } + mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type, parent_id); + return 0; } -static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) +static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_stack_state *stack) { int i; for (i = 0; i < BPF_REG_SIZE; i++) { - state->stack[spi].slot_type[i] = STACK_INVALID; - state->stack[spi - 1].slot_type[i] = STACK_INVALID; + stack[0].slot_type[i] = STACK_INVALID; + stack[1].slot_type[i] = STACK_INVALID; } - bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + bpf_mark_reg_not_init(env, &stack[0].spilled_ptr); + bpf_mark_reg_not_init(env, &stack[1].spilled_ptr); } static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = bpf_func(env, reg); - int spi, ref_obj_id, i; + int spi; spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; - if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - invalidate_dynptr(env, state, spi); - return 0; - } - - ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; - - /* If the dynptr has a ref_obj_id, then we need to invalidate - * two things: - * - * 1) Any dynptrs with a matching ref_obj_id (clones) - * 2) Any slices derived from this dynptr. 
+ /* + * For referenced dynptr, release the parent ref which cascades to + * all clones and derived slices. For non-referenced dynptr, only + * the dynptr and slices derived from it will be invalidated. */ - - /* Invalidate any slices associated with this dynptr */ - WARN_ON_ONCE(release_reference(env, ref_obj_id)); - - /* Invalidate any dynptr clones */ - for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id) - continue; - - /* it should always be the case that if the ref obj id - * matches then the stack slot also belongs to a - * dynptr - */ - if (state->stack[i].slot_type[0] != STACK_DYNPTR) { - verifier_bug(env, "misconfigured ref_obj_id"); - return -EFAULT; - } - if (state->stack[i].spilled_ptr.dynptr.first_slot) - invalidate_dynptr(env, state, i); - } - - return 0; + reg = &state->stack[spi].spilled_ptr; + return release_reference(env, dynptr_type_referenced(reg->dynptr.type) + ? reg->parent_id + : reg->id); } static void __mark_reg_unknown(const struct bpf_verifier_env *env, @@ -800,9 +778,7 @@ static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { - struct bpf_func_state *fstate; - struct bpf_reg_state *dreg; - int i, dynptr_id; + int i, err = 0; /* We always ensure that STACK_DYNPTR is never set partially, * hence just checking for slot_type[0] is enough. 
This is @@ -816,13 +792,13 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, if (!state->stack[spi].spilled_ptr.dynptr.first_slot) spi = spi + 1; - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - int ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; + if (dynptr_type_referenced(state->stack[spi].spilled_ptr.dynptr.type)) { + int v_parent_id = state->stack[spi].spilled_ptr.parent_id; int ref_cnt = 0; /* * A referenced dynptr can be overwritten only if there is at - * least one other dynptr sharing the same ref_obj_id, + * least one other dynptr sharing the same virtual ref parent, * ensuring the reference can still be properly released. */ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { @@ -830,7 +806,7 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, continue; if (!state->stack[i].spilled_ptr.dynptr.first_slot) continue; - if (state->stack[i].spilled_ptr.ref_obj_id == ref_obj_id) + if (state->stack[i].spilled_ptr.parent_id == v_parent_id) ref_cnt++; } @@ -840,32 +816,14 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, } } - mark_stack_slot_scratched(env, spi); - mark_stack_slot_scratched(env, spi - 1); - - /* Writing partially to one dynptr stack slot destroys both. 
*/ - for (i = 0; i < BPF_REG_SIZE; i++) { - state->stack[spi].slot_type[i] = STACK_INVALID; - state->stack[spi - 1].slot_type[i] = STACK_INVALID; + /* Invalidate the dynptr and any derived slices */ + err = release_reference(env, state->stack[spi].spilled_ptr.id); + if (!err) { + mark_stack_slot_scratched(env, spi); + mark_stack_slot_scratched(env, spi - 1); } - dynptr_id = state->stack[spi].spilled_ptr.id; - /* Invalidate any slices associated with this dynptr */ - bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({ - /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */ - if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM) - continue; - if (dreg->dynptr_id == dynptr_id) - mark_reg_invalid(env, dreg); - })); - - /* Do not release reference state, we are destroying dynptr on stack, - * not using some helper to release it. Just reset register. - */ - bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - - return 0; + return err; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) @@ -965,7 +923,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, if (spi < 0) return spi; - id = acquire_reference(env, insn_idx); + id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; @@ -981,7 +939,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, else st->type |= PTR_UNTRUSTED; } - st->ref_obj_id = i == 0 ? id : 0; + st->id = i == 0 ? 
id : 0; st->iter.btf = btf; st->iter.btf_id = btf_id; st->iter.state = BPF_ITER_STATE_ACTIVE; @@ -1011,7 +969,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_reg_state *st = &slot->spilled_ptr; if (i == 0) - WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); + WARN_ON_ONCE(release_reference(env, st->id)); bpf_mark_reg_not_init(env, st); @@ -1067,10 +1025,10 @@ static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_s if (st->type & PTR_UNTRUSTED) return -EPROTO; - /* only main (first) slot has ref_obj_id set */ - if (i == 0 && !st->ref_obj_id) + /* only main (first) slot has id set */ + if (i == 0 && !st->id) return -EINVAL; - if (i != 0 && st->ref_obj_id) + if (i != 0 && st->id) return -EINVAL; if (st->iter.btf != btf || st->iter.btf_id != btf_id) return -EINVAL; @@ -1109,7 +1067,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ - st->ref_obj_id = id; + st->id = id; st->irq.kfunc_class = kfunc_class; for (i = 0; i < BPF_REG_SIZE; i++) @@ -1143,7 +1101,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r return -EINVAL; } - err = release_irq_state(env->cur_state, st->ref_obj_id); + err = release_irq_state(env->cur_state, st->id); WARN_ON_ONCE(err && err != -EACCES); if (err) { int insn_idx = 0; @@ -1207,7 +1165,7 @@ static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_r slot = &state->stack[spi]; st = &slot->spilled_ptr; - if (!st->ref_obj_id) + if (!st->id) return -EINVAL; for (i = 0; i < BPF_REG_SIZE; i++) @@ -1448,7 +1406,7 @@ static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_e return &state->refs[new_ofs]; } -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id) { struct bpf_reference_state *s; @@ 
-1457,6 +1415,7 @@ static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; s->type = REF_TYPE_PTR; s->id = ++env->id_gen; + s->parent_id = parent_id; return s->id; } @@ -1513,17 +1472,25 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx) return; } -static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id) +static bool find_reference_state(struct bpf_verifier_state *state, int id) { int i; - for (i = 0; i < state->acquired_refs; i++) - if (state->refs[i].id == ptr_id) + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != REF_TYPE_PTR) + continue; + if (state->refs[i].id == id) return true; + } return false; } +static bool reg_is_referenced(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) +{ + return find_reference_state(env->cur_state, reg->id); +} + static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) { void *prev_ptr = NULL; @@ -1837,7 +1804,7 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) memset(((u8 *)reg) + sizeof(reg->type), 0, offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->id = 0; - reg->ref_obj_id = 0; + reg->parent_id = 0; ___mark_reg_known(reg, imm); } @@ -1872,7 +1839,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot, int dynptr_id) + bool first_slot, int id, int parent_id) { /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply @@ -1881,7 +1848,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty __mark_reg_known_zero(reg); reg->type = CONST_PTR_TO_DYNPTR; /* Give each dynptr a unique id to uniquely associate slices to it. 
*/ - reg->id = dynptr_id; + reg->id = id; + reg->parent_id = parent_id; reg->dynptr.type = type; reg->dynptr.first_slot = first_slot; } @@ -2161,17 +2129,12 @@ out: /* Mark a register as having a completely unknown (scalar) value. */ void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg) { - /* - * Clear type, off, and union(map_ptr, range) and - * padding between 'type' and union - */ - memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); + s32 subreg_def = reg->subreg_def; + + memset(reg, 0, sizeof(*reg)); reg->type = SCALAR_VALUE; - reg->id = 0; - reg->ref_obj_id = 0; reg->var_off = tnum_unknown; - reg->frameno = 0; - reg->precise = false; + reg->subreg_def = subreg_def; __mark_reg_unbounded(reg); } @@ -4330,7 +4293,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the * normal store of unreferenced kptr, we must ensure var_off is zero. * Since ref_ptr cannot be accessed directly by BPF insns, check for - * reg->ref_obj_id is not needed here. + * reg->id is not needed here. */ if (__check_ptr_off_reg(env, reg, argno_from_reg(regno), true)) return -EACCES; @@ -4703,8 +4666,8 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of * type of narrower access. */ if (base_type(info->reg_type) == PTR_TO_BTF_ID) { - if (info->ref_obj_id && - !find_reference_state(env->cur_state, info->ref_obj_id)) { + if (info->ref_id && + !find_reference_state(env->cur_state, info->ref_id)) { verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n", off); return -EACCES; @@ -4873,10 +4836,10 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { [CONST_PTR_TO_MAP] = btf_bpf_map_id, }; -static bool is_trusted_reg(const struct bpf_reg_state *reg) +static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { /* A referenced register is always trusted. 
*/ - if (reg->ref_obj_id) + if (reg_is_referenced(env, reg)) return true; /* Types listed in the reg2btf_ids are always trusted */ @@ -5790,7 +5753,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, ret = env->ops->btf_struct_access(&env->log, reg, off, size); } else { /* Writes are permitted with default btf_struct_access for - * program allocated objects (which always have ref_obj_id > 0), + * program allocated objects (which always have id > 0), * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC. */ if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) { @@ -5799,8 +5762,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && - !(reg->type & MEM_RCU) && !reg->ref_obj_id) { - verifier_bug(env, "ref_obj_id for allocated object must be non-zero"); + !(reg->type & MEM_RCU) && !reg_is_referenced(env, reg)) { + verifier_bug(env, "allocated object must have a referenced id"); return -EFAULT; } @@ -5819,7 +5782,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, */ flag = PTR_UNTRUSTED; - } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) { + } else if (is_trusted_reg(env, reg) || is_rcu_reg(reg)) { /* By default any pointer obtained from walking a trusted pointer is no * longer trusted, unless the field being accessed has explicitly been * marked as inheriting its parent's state of trust (either full or RCU). 
@@ -6217,8 +6180,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b if (base_type(info.reg_type) == PTR_TO_BTF_ID) { regs[value_regno].btf = info.btf; regs[value_regno].btf_id = info.btf_id; - regs[value_regno].id = info.ref_obj_id; - regs[value_regno].ref_obj_id = info.ref_obj_id; + regs[value_regno].id = info.ref_id; } if (type_may_be_null(info.reg_type) && !regs[value_regno].id) regs[value_regno].id = ++env->id_gen; @@ -7201,7 +7163,16 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } -/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK +/* + * Validate dynptr arguments for helper, kfunc and subprog. + * + * @dynptr is both input and output. It is populated when the argument is + * tagged with MEM_UNINIT (i.e., the dynptr argument that will be constructed) + * and consumed when the argument is expecting to be an initialized dynptr. + * @parent_id is used to track the referenced parent object (e.g., file or skb in + * qdisc program) when constructing a dynptr. + * + * There are two register types representing a bpf_dynptr, one is PTR_TO_STACK * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. 
* * In both cases we deal with the first 8 bytes, but need to mark the next 8 @@ -7217,7 +7188,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, */ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, enum bpf_arg_type arg_type, - int clone_ref_obj_id, struct bpf_dynptr_desc *dynptr) + int parent_id, struct bpf_dynptr_desc *dynptr) { int spi, err = 0; @@ -7258,7 +7229,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat return err; } - err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id); + err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, parent_id, dynptr); } else /* OBJ_RELEASE and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && (arg_type & OBJ_RELEASE)) { @@ -7300,17 +7271,17 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat if (dynptr) { dynptr->type = reg->dynptr.type; dynptr->id = reg->id; - dynptr->ref_obj_id = reg->ref_obj_id; + dynptr->parent_id = reg->parent_id; } } return err; } -static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) +static u32 iter_ref_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) { struct bpf_func_state *state = bpf_func(env, reg); - return state->stack[spi].spilled_ptr.ref_obj_id; + return state->stack[spi].spilled_ptr.id; } static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta) @@ -7416,7 +7387,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * /* remember meta->iter info for process_iter_next_call() */ meta->iter.spi = spi; meta->iter.frameno = reg->frameno; - meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); + meta->id = iter_ref_id(env, reg, spi); if (is_iter_destroy_kfunc(meta)) { err = unmark_stack_slots_iter(env, reg, nr_slots); @@ -7999,7 
+7970,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, /* When referenced register is passed to release function, its fixed * offset must be 0. * - * We will check arg_type_is_release reg has ref_obj_id when storing + * We will check arg_type_is_release reg has id when storing * meta->release_regno. */ if (arg_type_is_release(arg_type)) { @@ -8260,7 +8231,7 @@ skip_type_check: */ if (reg->type == PTR_TO_STACK) { spi = dynptr_get_spi(env, reg); - if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) { + if (spi < 0 || !state->stack[spi].spilled_ptr.id) { verbose(env, "arg %d is an unacquired reference\n", regno); return -EINVAL; } @@ -8268,7 +8239,7 @@ skip_type_check: verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } - } else if (!reg->ref_obj_id && !bpf_register_is_null(reg)) { + } else if (!reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { verbose(env, "R%d must be referenced when passed to release function\n", regno); return -EINVAL; @@ -8280,14 +8251,14 @@ skip_type_check: meta->release_regno = regno; } - if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { - if (meta->ref_obj_id) { - verbose(env, "more than one arg with ref_obj_id R%d %u %u", - regno, reg->ref_obj_id, - meta->ref_obj_id); + if (reg_is_referenced(env, reg) && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { + if (meta->id) { + verbose(env, "more than one arg with referenced id R%d %u %u", + regno, reg->id, + meta->id); return -EACCES; } - meta->ref_obj_id = reg->ref_obj_id; + meta->id = reg->id; } switch (base_type(arg_type)) { @@ -8898,14 +8869,14 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range reg->range = AT_PKT_END; } -static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id) +static int release_reference_nomark(struct bpf_verifier_state *state, int id) { int i; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; 
- if (state->refs[i].id == ref_obj_id) { + if (state->refs[i].id == id) { release_reference_state(state, i); return 0; } @@ -8913,26 +8884,83 @@ static int release_reference_nomark(struct bpf_verifier_state *state, int ref_ob return -EINVAL; } -/* The pointer with the specified id has released its reference to kernel - * resources. Identify all copies of the same pointer and clear the reference. - * - * This is the release function corresponding to acquire_reference(). Idempotent. - */ -static int release_reference(struct bpf_verifier_env *env, int ref_obj_id) +static int idstack_push(struct bpf_idmap *idmap, u32 id) +{ + int i; + + if (!id) + return 0; + + for (i = 0; i < idmap->cnt; i++) + if (idmap->map[i].old == id) + return 0; + + if (WARN_ON_ONCE(idmap->cnt >= BPF_ID_MAP_SIZE)) + return -EFAULT; + + idmap->map[idmap->cnt++].old = id; + return 0; +} + +static int idstack_pop(struct bpf_idmap *idmap) { + if (!idmap->cnt) + return 0; + + return idmap->map[--idmap->cnt].old; +} + +/* Release id and objects derived from it iteratively in a DFS manner */ +static int release_reference(struct bpf_verifier_env *env, int id) +{ + u32 mask = (1 << STACK_SPILL) | (1 << STACK_DYNPTR); struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_idmap *idstack = &env->idmap_scratch; + struct bpf_stack_state *stack; struct bpf_func_state *state; struct bpf_reg_state *reg; - int err; + int i, err; - err = release_reference_nomark(vstate, ref_obj_id); + idstack->cnt = 0; + err = idstack_push(idstack, id); if (err) return err; - bpf_for_each_reg_in_vstate(vstate, state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) - mark_reg_invalid(env, reg); - })); + if (find_reference_state(vstate, id)) + WARN_ON_ONCE(release_reference_nomark(vstate, id)); + + while ((id = idstack_pop(idstack))) { + /* + * Child references are inaccessible after parent is released, + * any child references that exist at this point are a leak. 
+ */ + for (i = 0; i < vstate->acquired_refs; i++) { + if (vstate->refs[i].type != REF_TYPE_PTR) + continue; + if (vstate->refs[i].parent_id != id) + continue; + verbose(env, "Leaking reference id=%d alloc_insn=%d. Release it first.\n", + vstate->refs[i].id, vstate->refs[i].insn_idx); + return -EINVAL; + } + + bpf_for_each_reg_in_vstate_mask(vstate, state, reg, stack, mask, ({ + if (reg->id != id && reg->parent_id != id) + continue; + + /* Free objects derived from the current object */ + if (reg->parent_id == id) { + err = idstack_push(idstack, reg->id); + if (err) + return err; + } + + if (!stack || stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL) + mark_reg_invalid(env, reg); + else if (stack->slot_type[BPF_REG_SIZE - 1] == STACK_DYNPTR) + invalidate_dynptr(env, stack); + })); + } return 0; } @@ -9833,7 +9861,7 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi * kernel. Type checks are performed later in check_return_code. */ if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit && - reg->ref_obj_id == state->refs[i].id) + reg->id == state->refs[i].id) continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); @@ -10116,18 +10144,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = -EINVAL; if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { - u32 ref_obj_id = meta.ref_obj_id; + } else if (func_id == BPF_FUNC_kptr_xchg && meta.id) { + u32 id = meta.id; bool in_rcu = in_rcu_cs(env); struct bpf_func_state *state; struct bpf_reg_state *reg; - err = release_reference_nomark(env->cur_state, ref_obj_id); + err = release_reference_nomark(env->cur_state, id); if (!err) { bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) { + if (reg->id == id) { if 
(in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { - reg->ref_obj_id = 0; + reg->id = 0; reg->type &= ~MEM_ALLOC; reg->type |= MEM_RCU; } else { @@ -10136,19 +10164,16 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } })); } - } else if (meta.ref_obj_id) { - err = release_reference(env, meta.ref_obj_id); + } else if (meta.id) { + err = release_reference(env, meta.id); } else if (bpf_register_is_null(&regs[meta.release_regno])) { - /* meta.ref_obj_id can only be 0 if register that is meant to be + /* meta.id can only be 0 if register that is meant to be * released is NULL, which must be > R0. */ err = 0; } - if (err) { - verbose(env, "func %s#%d reference has not been acquired before\n", - func_id_name(func_id), func_id); + if (err) return err; - } } switch (func_id) { @@ -10413,24 +10438,40 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } - if (is_ptr_cast_function(func_id)) { - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + if (is_ptr_cast_function(func_id) && + find_reference_state(env->cur_state, meta.id)) { + struct bpf_verifier_state *branch; + struct bpf_reg_state *r0; + + /* + * In order for a release of any of the original or cast pointers + * to invalidate all other pointers, reuse the same reference id for + * the cast result. + * This reference id can't be used for nullness propagation, + * as cast might return NULL for a non-NULL input. + * Hence, explore the NULL case as a separate branch. 
+ */ + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + + r0 = &branch->frame[branch->curframe]->regs[BPF_REG_0]; + __mark_reg_known_zero(r0); + r0->type = SCALAR_VALUE; + + regs[BPF_REG_0].type &= ~PTR_MAYBE_NULL; + regs[BPF_REG_0].id = meta.id; } else if (is_acquire_function(func_id, meta.map.ptr)) { - int id = acquire_reference(env, insn_idx); + int id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; - /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = id; } - if (func_id == BPF_FUNC_dynptr_data) { - regs[BPF_REG_0].dynptr_id = meta.dynptr.id; - regs[BPF_REG_0].ref_obj_id = meta.dynptr.ref_obj_id; - } + if (func_id == BPF_FUNC_dynptr_data) + regs[BPF_REG_0].parent_id = meta.dynptr.id; err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); if (err) @@ -11242,7 +11283,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, * btf_struct_ids_match() to walk the struct at the 0th offset, and * resolve types. 
*/ - if ((is_kfunc_release(meta) && reg->ref_obj_id) || + if ((is_kfunc_release(meta) && reg_is_referenced(env, reg)) || btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; @@ -11346,36 +11387,21 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state return 0; } -static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id) +static void ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 id) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *unused; struct bpf_reg_state *reg; - int i; - if (!ref_obj_id) { - verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion"); - return -EFAULT; - } + WARN_ON_ONCE(release_reference_nomark(env->cur_state, id)); - for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].id != ref_obj_id) - continue; - - /* Clear ref_obj_id here so release_reference doesn't clobber - * the whole reg - */ - bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ - if (reg->ref_obj_id == ref_obj_id) { - reg->ref_obj_id = 0; - ref_set_non_owning(env, reg); - } - })); - return 0; - } + bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ + if (reg->id == id) { + reg->id = 0; + ref_set_non_owning(env, reg); + } + })); - verifier_bug(env, "ref state missing for ref_obj_id"); - return -EFAULT; + return; } /* Implementation details: @@ -11907,14 +11933,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EACCES; } - if (reg->ref_obj_id) { - if (is_kfunc_release(meta) && meta->ref_obj_id) { - verifier_bug(env, "more than one arg with ref_obj_id %s %u %u", - reg_arg_name(env, argno), reg->ref_obj_id, - meta->ref_obj_id); + if (reg_is_referenced(env, reg)) { + if (is_kfunc_release(meta) && meta->id) { + verifier_bug(env, "more than one arg with referenced id %s %u %u", + reg_arg_name(env, argno), reg->id, + meta->id); return -EFAULT; } - 
meta->ref_obj_id = reg->ref_obj_id; + meta->id = reg->id; if (is_kfunc_release(meta)) { if (regno < 0) { verbose(env, "%s release arg cannot be a stack argument\n", @@ -11975,7 +12001,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: - if (!is_trusted_reg(reg)) { + if (!is_trusted_reg(env, reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "%s must be referenced or trusted\n", reg_arg_name(env, argno)); @@ -12013,7 +12039,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } - if (is_kfunc_release(meta) && reg->ref_obj_id) + if (is_kfunc_release(meta) && reg_is_referenced(env, reg)) arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, argno, arg_type); if (ret < 0) @@ -12052,7 +12078,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } @@ -12064,7 +12090,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_DYNPTR: { enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; - int clone_ref_obj_id = 0; if (is_kfunc_arg_uninit(btf, &args[i])) dynptr_arg_type |= MEM_UNINIT; @@ -12095,15 +12120,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); - clone_ref_obj_id = meta->dynptr.ref_obj_id; - if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { - verifier_bug(env, "missing ref obj id for parent of clone"); - return -EFAULT; - } } ret = process_dynptr_func(env, reg, argno, insn_idx, dynptr_arg_type, - clone_ref_obj_id, &meta->dynptr); + meta->id, &meta->dynptr); if (ret < 0) return ret; break; @@ -12126,7 +12146,8 @@ static int 
check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ reg_arg_name(env, argno)); return -EINVAL; } - if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && + !reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } @@ -12141,7 +12162,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ reg_arg_name(env, argno)); return -EINVAL; } - if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && + !reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } @@ -12151,7 +12173,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_LIST_NODE: if (is_kfunc_arg_nonown_allowed(btf, &args[i]) && - type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + type_is_non_owning_ref(reg->type) && !reg_is_referenced(env, reg)) { /* Allow bpf_list_front/back return value for * __nonown_allowed list-node arguments. 
*/ @@ -12162,7 +12184,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } @@ -12178,12 +12200,13 @@ check_ok: reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } } else { - if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + if (!type_is_non_owning_ref(reg->type) && + !reg_is_referenced(env, reg)) { verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL; } @@ -12764,12 +12787,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca verifier_bug(env, "no dynptr id"); return -EFAULT; } - regs[BPF_REG_0].dynptr_id = meta->dynptr.id; - - /* we don't need to set BPF_REG_0's ref obj id - * because packet slices are not refcounted (see - * dynptr_type_refcounted) - */ + regs[BPF_REG_0].parent_id = meta->dynptr.id; } else { return 0; } @@ -12783,13 +12801,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; - u32 i, nargs, ptr_type_id, release_ref_obj_id; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; const struct btf_type *t, *ptr_type; struct bpf_kfunc_call_arg_meta meta; struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; + u32 i, nargs, ptr_type_id, id; const struct btf_param *args; struct btf *desc_btf; @@ -12902,6 +12920,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (rcu_lock) { env->cur_state->active_rcu_locks++; } else if (rcu_unlock) { + struct bpf_stack_state *stack; struct bpf_func_state *state; struct bpf_reg_state 
*reg; u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); @@ -12911,7 +12930,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } if (--env->cur_state->active_rcu_locks == 0) { - bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, clear_mask, ({ if (reg->type & MEM_RCU) { reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; @@ -12950,35 +12969,20 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.release_regno) { struct bpf_reg_state *reg = &regs[meta.release_regno]; - if (meta.dynptr.ref_obj_id) { + if (meta.dynptr.id) { err = unmark_stack_slots_dynptr(env, reg); } else { - err = release_reference(env, reg->ref_obj_id); - if (err) - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); + err = release_reference(env, reg->id); } if (err) return err; } if (is_bpf_list_push_kfunc(meta.func_id) || is_bpf_rbtree_add_kfunc(meta.func_id)) { - release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; + id = regs[BPF_REG_2].id; insn_aux->insert_off = regs[BPF_REG_2].var_off.value; insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); - err = ref_convert_owning_non_owning(env, release_ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", - func_name, meta.func_id); - return err; - } - - err = release_reference(env, release_ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - return err; - } + ref_convert_owning_non_owning(env, id); } if (meta.func_id == special_kfunc_list[KF_bpf_throw]) { @@ -13063,8 +13067,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type |= MEM_RDONLY; /* Ensures we don't access the memory after a release_reference() 
*/ - if (meta.ref_obj_id) - regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + if (meta.id) + regs[BPF_REG_0].parent_id = meta.id; if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; @@ -13110,13 +13114,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); if (is_kfunc_acquire(&meta)) { - int id = acquire_reference(env, insn_idx); - + id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; - if (is_kfunc_ret_null(&meta)) - regs[BPF_REG_0].id = id; - regs[BPF_REG_0].ref_obj_id = id; + regs[BPF_REG_0].id = id; } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) { ref_set_non_owning(env, &regs[BPF_REG_0]); } @@ -15347,7 +15348,7 @@ static int is_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *r if (!is_reg_const(reg2, is_jmp32)) return -1; - if (!reg_not_null(reg1)) + if (!reg_not_null(env, reg1)) return -1; /* If pointer is valid tests against zero will fail so we can @@ -15564,7 +15565,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0))) return; if (is_null) { - /* We don't need id and ref_obj_id from this point + /* We don't need id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ @@ -15591,10 +15592,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, { struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; - u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - if (ref_obj_id && ref_obj_id == id && is_null) + if (is_null && find_reference_state(vstate, id)) /* regs[regno] is in the " == NULL" branch. * No one could have freed the reference state before * doing the NULL check. 
@@ -16433,7 +16433,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, prog->aux->attach_func_proto->type, NULL); - if (ret_type && ret_type == reg_type && reg->ref_obj_id) + if (ret_type && ret_type == reg_type && reg_is_referenced(env, reg)) return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false); } @@ -18302,7 +18302,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_unknown(env, regs, i); } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { /* assume unspecial LOCAL dynptr type */ - __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); + __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen, 0); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { reg->type = PTR_TO_MEM; reg->type |= arg->arg_type & @@ -18361,8 +18361,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* Acquire references for struct_ops program arguments tagged with "__ref" */ if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { for (i = 0; i < aux->ctx_arg_info_size; i++) - aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? - acquire_reference(env, 0) : 0; + aux->ctx_arg_info[i].ref_id = aux->ctx_arg_info[i].refcounted ? 
+ acquire_reference(env, 0, 0) : 0; } ret = do_check(env); diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index bbe476f4c47d..5c3579438427 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -13,8 +13,8 @@ static struct { const char *err_msg; } spin_lock_fail_tests[] = { { "lock_id_kptr_preserve", - "[0-9]\\+: (bf) r1 = r0 ; R0=ptr_foo(id=2,ref_obj_id=2)" - " R1=ptr_foo(id=2,ref_obj_id=2) refs=2\n" + "[0-9]\\+: (bf) r1 = r0 ; R0=ptr_foo(id=2)" + " R1=ptr_foo(id=2) refs=2\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=ptr_ expected=percpu_ptr_" }, { "lock_id_global_zero", diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index dbd97add5a5a..fa0beeaad1be 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -78,7 +78,7 @@ static int get_map_val_dynptr(struct bpf_dynptr *ptr) * bpf_ringbuf_submit/discard_dynptr call */ SEC("?raw_tp") -__failure __msg("Unreleased reference id=2") +__failure __msg("Unreleased reference id=1") int ringbuf_missing_release1(void *ctx) { struct bpf_dynptr ptr = {}; @@ -91,7 +91,7 @@ int ringbuf_missing_release1(void *ctx) } SEC("?raw_tp") -__failure __msg("Unreleased reference id=4") +__failure __msg("Unreleased reference id=3") int ringbuf_missing_release2(void *ctx) { struct bpf_dynptr ptr1, ptr2; diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c index af8f9ec1ea98..646026430e9b 100644 --- a/tools/testing/selftests/bpf/progs/iters_state_safety.c +++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c @@ -30,7 +30,7 @@ int force_clang_to_emit_btf_for_externs(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") 
+__msg("fp-8=iter_num(id=1,state=active,depth=0)") int create_and_destroy(void *ctx) { struct bpf_iter_num iter; @@ -196,7 +196,7 @@ int leak_iter_from_subprog_fail(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(id=1,state=active,depth=0)") int valid_stack_reuse(void *ctx) { struct bpf_iter_num iter; diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 9b760dac333e..d00888f6687a 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -20,8 +20,8 @@ __s64 res_empty; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") -__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_empty(const void *ctx) { @@ -38,8 +38,8 @@ __s64 res_full; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") -__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_full(const void *ctx) { @@ -58,8 +58,8 @@ static volatile int zero = 0; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") -__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_truncated(const void *ctx) { -- cgit v1.2.3 From 92d681b42746d4497dcc8afb45edd4af5737542f Mon 
Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:29 -0700 Subject: bpf: Remove redundant dynptr arg check for helper unmark_stack_slots_dynptr() already makes sure that CONST_PTR_TO_DYNPTR cannot be released. process_dynptr_func() also prevents passing uninitialized dynptr to helpers expecting initialized dynptr. Now that unmark_stack_slots_dynptr() also reports error returned from release_reference(), there should be no reason to keep these redundant checks. Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-7-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +-------------------- tools/testing/selftests/bpf/progs/dynptr_fail.c | 6 +++--- .../testing/selftests/bpf/progs/user_ringbuf_fail.c | 4 ++-- 3 files changed, 6 insertions(+), 25 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d82ca5acacb..4f75e5f95d27 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8220,26 +8220,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, skip_type_check: if (arg_type_is_release(arg_type)) { - if (arg_type_is_dynptr(arg_type)) { - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - /* Only dynptr created on stack can be released, thus - * the get_spi and stack state checks for spilled_ptr - * should only be done before process_dynptr_func for - * PTR_TO_STACK. 
- */ - if (reg->type == PTR_TO_STACK) { - spi = dynptr_get_spi(env, reg); - if (spi < 0 || !state->stack[spi].spilled_ptr.id) { - verbose(env, "arg %d is an unacquired reference\n", regno); - return -EINVAL; - } - } else { - verbose(env, "cannot release unowned const bpf_dynptr\n"); - return -EINVAL; - } - } else if (!reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { + if (!arg_type_is_dynptr(arg_type) && !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { verbose(env, "R%d must be referenced when passed to release function\n", regno); return -EINVAL; diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index fa0beeaad1be..40a14a5174a5 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -136,7 +136,7 @@ int ringbuf_missing_release_callback(void *ctx) /* Can't call bpf_ringbuf_submit/discard_dynptr on a non-initialized dynptr */ SEC("?raw_tp") -__failure __msg("arg 1 is an unacquired reference") +__failure __msg("Expected an initialized dynptr as R1") int ringbuf_release_uninit_dynptr(void *ctx) { struct bpf_dynptr ptr; @@ -650,7 +650,7 @@ int invalid_offset(void *ctx) /* Can't release a dynptr twice */ SEC("?raw_tp") -__failure __msg("arg 1 is an unacquired reference") +__failure __msg("Expected an initialized dynptr as R1") int release_twice(void *ctx) { struct bpf_dynptr ptr; @@ -677,7 +677,7 @@ static int release_twice_callback_fn(__u32 index, void *data) * within a callback function, fails */ SEC("?raw_tp") -__failure __msg("arg 1 is an unacquired reference") +__failure __msg("Expected an initialized dynptr as R1") int release_twice_callback(void *ctx) { struct bpf_dynptr ptr; diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c index 54de0389f878..c0d0422b8030 100644 --- a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c +++ 
b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c @@ -146,7 +146,7 @@ try_discard_dynptr(struct bpf_dynptr *dynptr, void *context) * not be able to read past the end of the pointer. */ SEC("?raw_tp") -__failure __msg("cannot release unowned const bpf_dynptr") +__failure __msg("CONST_PTR_TO_DYNPTR cannot be released") int user_ringbuf_callback_discard_dynptr(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, try_discard_dynptr, NULL, 0); @@ -166,7 +166,7 @@ try_submit_dynptr(struct bpf_dynptr *dynptr, void *context) * not be able to read past the end of the pointer. */ SEC("?raw_tp") -__failure __msg("cannot release unowned const bpf_dynptr") +__failure __msg("CONST_PTR_TO_DYNPTR cannot be released") int user_ringbuf_callback_submit_dynptr(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, try_submit_dynptr, NULL, 0); -- cgit v1.2.3 From b7dd2b388657d99689161e82ed13515505838232 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:30 -0700 Subject: bpf: Unify referenced object tracking in verifier Helpers and kfuncs independently tracked referenced object metadata using standalone id fields in their respective arg_meta structs. This led to duplicated logic and inconsistent error handling between the two paths. Introduce struct ref_obj_desc to consolidate id and parent_id along with a count of how many arguments carry a reference. Add update_ref_obj() to populate it from a bpf_reg_state, replacing open-coded assignments in check_func_arg(), check_kfunc_args(), and process_iter_arg(). Add validate_ref_obj() to check for ambiguous ref_obj before using it. For ref_obj releasing helpers and kfuncs, keep checking it before calling update_ref_obj() for now. A later patch will make these functions not depend on ref_obj. For other users of ref_obj, move the checks to the use locations. For helpers, this means moving the checks inside helper_multiple_ref_obj_use() to the use locations. is_acquire_function() is dropped as ref_obj is never used. 
Pass ref_obj_desc into process_dynptr_func()/mark_stack_slots_dynptr() instead of a bare parent_id to make it less confusing. Drop the selftest introduced in 7ec899ac90a2 ("selftests/bpf: Negative test case for ref_obj_id in args") since the verifier no longer complains about ambiguous ref_obj if it is not used. Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-8-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 14 ++- kernel/bpf/verifier.c | 122 +++++++++++---------- .../selftests/bpf/progs/test_ringbuf_map_key.c | 11 +- tools/testing/selftests/bpf/verifier/calls.c | 24 ---- 4 files changed, 78 insertions(+), 93 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 75b287d8d92f..b0521ba7787a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1424,6 +1424,18 @@ struct bpf_dynptr_desc { u32 parent_id; }; +/* + * The last seen referenced object; updated by update_ref_obj() when a register refers to a + * referenced object. Used when the helper or kfunc is releasing a referenced object, casting + * a referenced object, returning allocated memory derived from referenced object or creating + * a dynptr with a referenced object as parent. 
+ */ +struct ref_obj_desc { + u32 id; + u32 parent_id; + u8 cnt; +}; + struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; @@ -1432,7 +1444,6 @@ struct bpf_kfunc_call_arg_meta { const struct btf_type *func_proto; const char *func_name; /* Out parameters */ - u32 id; u8 release_regno; bool r0_rdonly; u32 ret_btf_id; @@ -1470,6 +1481,7 @@ struct bpf_kfunc_call_arg_meta { } iter; struct bpf_map_desc map; struct bpf_dynptr_desc dynptr; + struct ref_obj_desc ref_obj; u64 mem_size; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4f75e5f95d27..bc8a09c858d8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -231,9 +231,28 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } +static void update_ref_obj(struct ref_obj_desc *ref_obj, struct bpf_reg_state *reg) +{ + ref_obj->id = reg->id; + ref_obj->parent_id = reg->parent_id; + ref_obj->cnt++; +} + +static int validate_ref_obj(struct bpf_verifier_env *env, struct ref_obj_desc *ref_obj) +{ + if (ref_obj->cnt > 1) { + verifier_bug(env, "function expects only one referenced object but got %d\n", + ref_obj->cnt); + return -EFAULT; + } + + return 0; +} + struct bpf_call_arg_meta { struct bpf_map_desc map; struct bpf_dynptr_desc dynptr; + struct ref_obj_desc ref_obj; bool raw_mode; bool pkt_access; u8 release_regno; @@ -241,7 +260,6 @@ struct bpf_call_arg_meta { int access_size; int mem_size; u64 msize_max_value; - u32 id; int func_id; struct btf *btf; u32 btf_id; @@ -528,20 +546,6 @@ bool bpf_is_may_goto_insn(struct bpf_insn *insn) return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; } -static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, - const struct bpf_map *map) -{ - int ref_obj_uses = 0; - - if (is_ptr_cast_function(func_id)) - ref_obj_uses++; - if (is_acquire_function(func_id, map)) - ref_obj_uses++; - - return ref_obj_uses > 1; -} - - static bool 
is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) { int allocated_slots = state->allocated_stack / BPF_REG_SIZE; @@ -670,11 +674,11 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi); static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - enum bpf_arg_type arg_type, int insn_idx, int parent_id, - struct bpf_dynptr_desc *dynptr) + enum bpf_arg_type arg_type, int insn_idx, + struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr) { struct bpf_func_state *state = bpf_func(env, reg); - int spi, i, err; + int spi, i, err, parent_id = 0; enum bpf_dynptr_type type; spi = dynptr_get_spi(env, reg); @@ -707,6 +711,13 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ return -EINVAL; if (dynptr->type == BPF_DYNPTR_TYPE_INVALID) { /* dynptr constructors */ + err = validate_ref_obj(env, ref_obj); + if (err) + return err; + + /* Track parent's id if the parent is a referenced object */ + parent_id = ref_obj->id; + if (dynptr_type_referenced(type)) { int id; @@ -7188,7 +7199,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, */ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, enum bpf_arg_type arg_type, - int parent_id, struct bpf_dynptr_desc *dynptr) + struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr) { int spi, err = 0; @@ -7229,7 +7240,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat return err; } - err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, parent_id, dynptr); + err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, ref_obj, dynptr); } else /* OBJ_RELEASE and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && (arg_type & OBJ_RELEASE)) { @@ -7277,13 +7288,6 @@ static int 
process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat return err; } -static u32 iter_ref_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) -{ - struct bpf_func_state *state = bpf_func(env, reg); - - return state->stack[spi].spilled_ptr.id; -} - static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); @@ -7316,6 +7320,7 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { + struct bpf_func_state *state = bpf_func(env, reg); const struct btf_type *t; u32 arg_idx = arg_idx_from_argno(argno); int spi, err, i, nr_slots, btf_id; @@ -7387,7 +7392,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * /* remember meta->iter info for process_iter_next_call() */ meta->iter.spi = spi; meta->iter.frameno = reg->frameno; - meta->id = iter_ref_id(env, reg, spi); + update_ref_obj(&meta->ref_obj, &state->stack[spi].spilled_ptr); if (is_iter_destroy_kfunc(meta)) { err = unmark_stack_slots_iter(env, reg, nr_slots); @@ -8166,6 +8171,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_arg_type arg_type = fn->arg_type[arg]; + argno_t argno = argno_from_arg(arg + 1); enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; u32 key_size; @@ -8232,15 +8238,8 @@ skip_type_check: meta->release_regno = regno; } - if (reg_is_referenced(env, reg) && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { - if (meta->id) { - verbose(env, "more than one arg with referenced id R%d %u %u", - regno, reg->id, - meta->id); - return -EACCES; - } - meta->id = reg->id; - } + if (reg_is_referenced(env, reg)) + update_ref_obj(&meta->ref_obj, reg); switch (base_type(arg_type)) { 
case ARG_CONST_MAP_PTR: @@ -8379,7 +8378,7 @@ skip_type_check: true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, 0, + err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, &meta->ref_obj, &meta->dynptr); if (err) return err; @@ -9042,6 +9041,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_func_state *caller = cur_func(env); struct bpf_verifier_log *log = &env->log; + struct ref_obj_desc ref_obj = {}; u32 i; int ret, err; @@ -9119,7 +9119,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (ret) return ret; - ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, 0, NULL); + ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, &ref_obj, NULL); if (ret) return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { @@ -10125,8 +10125,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = -EINVAL; if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]); - } else if (func_id == BPF_FUNC_kptr_xchg && meta.id) { - u32 id = meta.id; + } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj.id) { + u32 id = meta.ref_obj.id; bool in_rcu = in_rcu_cs(env); struct bpf_func_state *state; struct bpf_reg_state *reg; @@ -10145,10 +10145,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } })); } - } else if (meta.id) { - err = release_reference(env, meta.id); + } else if (meta.ref_obj.id) { + err = release_reference(env, meta.ref_obj.id); } else if (bpf_register_is_null(&regs[meta.release_regno])) { - /* meta.id can only be 0 if register that is meant to be + /* meta.ref_obj.id can only be 0 if register that is meant to be * released is NULL, which must be > R0. 
*/ err = 0; @@ -10413,17 +10413,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; - if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) { - verifier_bug(env, "func %s#%d sets ref_obj_id more than once", - func_id_name(func_id), func_id); - return -EFAULT; - } - if (is_ptr_cast_function(func_id) && - find_reference_state(env->cur_state, meta.id)) { + find_reference_state(env->cur_state, meta.ref_obj.id)) { struct bpf_verifier_state *branch; struct bpf_reg_state *r0; + err = validate_ref_obj(env, &meta.ref_obj); + if (err) + return err; + /* * In order for a release of any of the original or cast pointers * to invalidate all other pointers, reuse the same reference id for @@ -10441,7 +10439,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn r0->type = SCALAR_VALUE; regs[BPF_REG_0].type &= ~PTR_MAYBE_NULL; - regs[BPF_REG_0].id = meta.id; + regs[BPF_REG_0].id = meta.ref_obj.id; } else if (is_acquire_function(func_id, meta.map.ptr)) { int id = acquire_reference(env, insn_idx, 0); @@ -11915,13 +11913,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (reg_is_referenced(env, reg)) { - if (is_kfunc_release(meta) && meta->id) { - verifier_bug(env, "more than one arg with referenced id %s %u %u", - reg_arg_name(env, argno), reg->id, - meta->id); + if (is_kfunc_release(meta) && meta->ref_obj.cnt) { + verbose(env, "more than one arg with referenced id %s %u %u", + reg_arg_name(env, argno), reg->id, + meta->ref_obj.id); return -EFAULT; } - meta->id = reg->id; + update_ref_obj(&meta->ref_obj, reg); if (is_kfunc_release(meta)) { if (regno < 0) { verbose(env, "%s release arg cannot be a stack argument\n", @@ -12104,7 +12102,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } ret = process_dynptr_func(env, reg, argno, insn_idx, dynptr_arg_type, - 
meta->id, &meta->dynptr); + &meta->ref_obj, &meta->dynptr); if (ret < 0) return ret; break; @@ -13048,8 +13046,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type |= MEM_RDONLY; /* Ensures we don't access the memory after a release_reference() */ - if (meta.id) - regs[BPF_REG_0].parent_id = meta.id; + if (meta.ref_obj.id) { + err = validate_ref_obj(env, &meta.ref_obj); + if (err) + return err; + regs[BPF_REG_0].parent_id = meta.ref_obj.id; + } if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c b/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c index 21bb7da90ea5..0efafa927a3d 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c @@ -35,7 +35,7 @@ SEC("fentry/" SYS_PREFIX "sys_getpgid") int test_ringbuf_mem_map_key(void *ctx) { int cur_pid = bpf_get_current_pid_tgid() >> 32; - struct sample *sample, sample_copy; + struct sample *sample; int *lookup_val; if (cur_pid != pid) @@ -55,16 +55,11 @@ int test_ringbuf_mem_map_key(void *ctx) lookup_val = (int *)bpf_map_lookup_elem(&hash_map, sample); __sink(lookup_val); - /* workaround - memcpy is necessary so that verifier doesn't - * complain with: - * verifier internal error: more than one arg with ref_obj_id R3 - * when trying to do bpf_map_update_elem(&hash_map, sample, &sample->seq, BPF_ANY); - * + /* * Since bpf_map_lookup_elem above uses 'sample' as key, test using * sample field as value below */ - __builtin_memcpy(&sample_copy, sample, sizeof(struct sample)); - bpf_map_update_elem(&hash_map, &sample_copy, &sample->seq, BPF_ANY); + bpf_map_update_elem(&hash_map, sample, &sample->seq, BPF_ANY); bpf_ringbuf_submit(sample, 0); return 0; diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 0bb4337552c8..42d523a21a43 100644 --- 
a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -2410,27 +2410,3 @@ .errstr_unpriv = "", .prog_type = BPF_PROG_TYPE_CGROUP_SKB, }, -{ - "calls: several args with ref_obj_id", - .insns = { - /* Reserve at least sizeof(struct iphdr) bytes in the ring buffer. - * With a smaller size, the verifier would reject the call to - * bpf_tcp_raw_gen_syncookie_ipv4 before we can reach the - * ref_obj_id error. - */ - BPF_MOV64_IMM(BPF_REG_2, 20), - BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), - /* if r0 == 0 goto */ - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), - BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tcp_raw_gen_syncookie_ipv4), - BPF_EXIT_INSN(), - }, - .fixup_map_ringbuf = { 2 }, - .result = REJECT, - .errstr = "more than one arg with ref_obj_id", - .prog_type = BPF_PROG_TYPE_SCHED_CLS, -}, -- cgit v1.2.3 From bcfcb15fde94ed39068eb1d6e4b9b37d27111965 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:31 -0700 Subject: bpf: Unify release handling for helpers and kfuncs Introduce release_reg() to consolidate the release logic shared by both helpers and kfuncs: dynptr release, kptr_xchg percpu-to-RCU conversion, regular reference release, and NULL pass-through. NULL pass-through is only allowed if the prototype indicates the argument may be null. Determine release_regno from the function prototype/metadata before argument checking, rather than discovering it dynamically during argument processing. For helpers, scan the arg_type array in check_func_proto() via check_proto_release_reg(). For kfuncs, set release_regno to BPF_REG_1 in bpf_fetch_kfunc_arg_meta() when KF_RELEASE is set. In the future when we start adding decl_tag to kfunc arguments, we can just look at the function prototype instead of a release_regno. 
Extract ref_convert_alloc_rcu_protected() and invalidate_rcu_protected_refs() to make it more clear what the code is doing. For ref_convert_alloc_rcu_protected(), it pre-converts MEM_ALLOC | MEM_PERCPU registers to MEM_RCU (clearing id so they survive), then calls release_reference() to invalidate the remaining registers and release the reference state. Add KF_RELEASE to bpf_dynptr_file_discard() so its release_regno is set via fetch_kfunc_meta rather than being assigned manually in the dynptr argument processing. Set arg_type to ARG_PTR_TO_DYNPTR for KF_ARG_PTR_TO_DYNPTR so that check_func_arg_reg_off() correctly allows non-zero stack offsets for dynptr release arguments same as helper. Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-9-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 198 +++++++++++---------- tools/testing/selftests/bpf/prog_tests/cb_refs.c | 2 +- .../selftests/bpf/progs/cgrp_kfunc_failure.c | 6 +- tools/testing/selftests/bpf/progs/map_kptr_fail.c | 2 +- .../selftests/bpf/progs/task_kfunc_failure.c | 6 +- .../selftests/bpf/progs/verifier_global_ptr_args.c | 2 +- .../selftests/bpf/progs/verifier_ref_tracking.c | 2 +- tools/testing/selftests/bpf/progs/verifier_sock.c | 6 +- .../selftests/bpf/progs/verifier_vfs_reject.c | 2 +- .../selftests/bpf/progs/wakeup_source_fail.c | 2 +- 12 files changed, 122 insertions(+), 114 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b0521ba7787a..3dd2d21230af 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1426,9 +1426,9 @@ struct bpf_dynptr_desc { /* * The last seen rereferenced object; Updated by update_ref_obj() when a register refers to a - * referenced object. 
Used when the helper or kfunc is releasing a referenced object, casting - * a referenced object, returning allocated memory derived from referenced object or creating - * a dynptr with a referenced object as parent. + * referenced object. Used when the helper or kfunc is casting a referenced object, returning + * allocated memory derived from referenced object or creating a dynptr with a referenced + * object as parent. */ struct ref_obj_desc { u32 id; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9ca195104667..03004e4451f5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4957,7 +4957,7 @@ BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) -BTF_ID_FLAGS(func, bpf_dynptr_file_discard) +BTF_ID_FLAGS(func, bpf_dynptr_file_discard, KF_RELEASE) BTF_ID_FLAGS(func, bpf_timer_cancel_async) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc8a09c858d8..caa455fad877 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8225,17 +8225,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; skip_type_check: - if (arg_type_is_release(arg_type)) { - if (!arg_type_is_dynptr(arg_type) && !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { - verbose(env, "R%d must be referenced when passed to release function\n", - regno); - return -EINVAL; - } - if (meta->release_regno) { - verifier_bug(env, "more than one release argument"); - return -EFAULT; - } - meta->release_regno = regno; + if (arg_type_is_release(arg_type) && !arg_type_is_dynptr(arg_type) && + !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { + verbose(env, "release helper %s expects referenced PTR_TO_BTF_ID passed to %s\n", + func_id_name(meta->func_id), reg_arg_name(env, argno)); + return -EINVAL; } 
if (reg_is_referenced(env, reg)) @@ -8798,11 +8792,29 @@ static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn) return true; } -static int check_func_proto(const struct bpf_func_proto *fn) +static bool check_proto_release_reg(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { + enum bpf_arg_type arg_type = fn->arg_type[i]; + + if (arg_type_is_release(arg_type)) { + if (meta->release_regno) + return false; + meta->release_regno = i + 1; + } + } + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && check_mem_arg_rw_flag_ok(fn) && + check_proto_release_reg(fn, meta) && check_btf_id_ok(fn) ? 0 : -EINVAL; } @@ -8956,6 +8968,42 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env) })); } +static void invalidate_rcu_protected_refs(struct bpf_verifier_env *env) +{ + struct bpf_stack_state *stack; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); + + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, clear_mask, ({ + if (reg->type & MEM_RCU) { + reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); + reg->type |= PTR_UNTRUSTED; + } + })); +} + +static int ref_convert_alloc_rcu_protected(struct bpf_verifier_env *env, u32 id) +{ + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int err; + + err = release_reference_nomark(env->cur_state, id); + + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->id != id) + continue; + if ((reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { + reg->id = 0; + reg->type &= ~MEM_ALLOC; + reg->type |= MEM_RCU; + } + })); + + return err; +} + static void clear_caller_saved_regs(struct bpf_verifier_env *env, struct bpf_reg_state *regs) { @@ -10028,6 +10076,24 @@ static const char 
*non_sleepable_context_description(struct bpf_verifier_env *en return "non-sleepable prog"; } +static int release_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + bool convert_rcu, bool release_dynptr) +{ + int err = -EINVAL; + + if (bpf_register_is_null(reg)) + return 0; + + if (release_dynptr) + err = unmark_stack_slots_dynptr(env, reg); + else if (convert_rcu) + err = ref_convert_alloc_rcu_protected(env, reg->id); + else if (reg_is_referenced(env, reg)) + err = release_reference(env, reg->id); + + return err; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -10077,7 +10143,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn); + err = check_func_proto(fn, &meta); if (err) { verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; @@ -10122,37 +10188,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } if (meta.release_regno) { - err = -EINVAL; - if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { - err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]); - } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj.id) { - u32 id = meta.ref_obj.id; - bool in_rcu = in_rcu_cs(env); - struct bpf_func_state *state; - struct bpf_reg_state *reg; - - err = release_reference_nomark(env->cur_state, id); - if (!err) { - bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg->id == id) { - if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { - reg->id = 0; - reg->type &= ~MEM_ALLOC; - reg->type |= MEM_RCU; - } else { - mark_reg_invalid(env, reg); - } - } - })); - } - } else if (meta.ref_obj.id) { - err = release_reference(env, meta.ref_obj.id); - } else if (bpf_register_is_null(&regs[meta.release_regno])) { - /* meta.ref_obj.id can only be 0 if register that
is meant to be - * released is NULL, which must be > R0. - */ - err = 0; - } + struct bpf_reg_state *reg = &regs[meta.release_regno]; + bool convert_rcu = (func_id == BPF_FUNC_kptr_xchg) && in_rcu_cs(env) && + (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU); + + err = release_reg(env, reg, convert_rcu, !!meta.dynptr.id); if (err) return err; } @@ -10547,7 +10587,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RELEASE; } - static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_DESTRUCTIVE; @@ -11912,24 +11951,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EACCES; } - if (reg_is_referenced(env, reg)) { - if (is_kfunc_release(meta) && meta->ref_obj.cnt) { - verbose(env, "more than one arg with referenced id %s %u %u", - reg_arg_name(env, argno), reg->id, - meta->ref_obj.id); - return -EFAULT; - } - update_ref_obj(&meta->ref_obj, reg); - if (is_kfunc_release(meta)) { - if (regno < 0) { - verbose(env, "%s release arg cannot be a stack argument\n", - reg_arg_name(env, argno)); - return -EINVAL; - } - meta->release_regno = regno; - } + if (regno == meta->release_regno && !is_kfunc_arg_dynptr(meta->btf, &args[i]) && + !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { + verbose(env, "release kfunc %s expects referenced PTR_TO_BTF_ID passed to %s\n", + func_name, reg_arg_name(env, argno)); + return -EINVAL; } + if (reg_is_referenced(env, reg)) + update_ref_obj(&meta->ref_obj, reg); + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); @@ -11993,7 +12024,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } fallthrough; - case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: case KF_ARG_PTR_TO_LIST_NODE: @@ -12010,6 +12040,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct
bpf_kfunc_call_ case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; + case KF_ARG_PTR_TO_DYNPTR: + arg_type = ARG_PTR_TO_DYNPTR; + break; case KF_ARG_PTR_TO_CTX: arg_type = ARG_PTR_TO_CTX; break; @@ -12018,7 +12051,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } - if (is_kfunc_release(meta) && reg_is_referenced(env, reg)) + if (regno == meta->release_regno) arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, argno, arg_type); if (ret < 0) @@ -12083,12 +12116,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_FILE; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { dynptr_arg_type |= DYNPTR_TYPE_FILE | OBJ_RELEASE; - if (regno < 0) { - verbose(env, "%s release arg cannot be a stack argument\n", - reg_arg_name(env, argno)); - return -EINVAL; - } - meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->dynptr.type; @@ -12377,12 +12404,6 @@ check_ok: } } - if (is_kfunc_release(meta) && !meta->release_regno) { - verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", - func_name); - return -EINVAL; - } - return 0; } @@ -12409,6 +12430,10 @@ int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env, meta->kfunc_flags = *kfunc.flags; + /* Only support release referenced argument passed by register */ + if (is_kfunc_release(meta)) + meta->release_regno = BPF_REG_1; + return 0; } @@ -12899,23 +12924,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (rcu_lock) { env->cur_state->active_rcu_locks++; } else if (rcu_unlock) { - struct bpf_stack_state *stack; - struct bpf_func_state *state; - struct bpf_reg_state *reg; - u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); - if (env->cur_state->active_rcu_locks == 0) { 
verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); return -EINVAL; } - if (--env->cur_state->active_rcu_locks == 0) { - bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, clear_mask, ({ - if (reg->type & MEM_RCU) { - reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); - reg->type |= PTR_UNTRUSTED; - } - })); - } + if (--env->cur_state->active_rcu_locks == 0) + invalidate_rcu_protected_refs(env); } else if (preempt_disable) { env->cur_state->active_preempt_locks++; } else if (preempt_enable) { @@ -12946,13 +12960,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ if (meta.release_regno) { - struct bpf_reg_state *reg = &regs[meta.release_regno]; - - if (meta.dynptr.id) { - err = unmark_stack_slots_dynptr(env, reg); - } else { - err = release_reference(env, reg->id); - } + err = release_reg(env, &regs[meta.release_regno], false, !!meta.dynptr.id); if (err) return err; } diff --git a/tools/testing/selftests/bpf/prog_tests/cb_refs.c b/tools/testing/selftests/bpf/prog_tests/cb_refs.c index 6300b67a3a84..78566b817fd7 100644 --- a/tools/testing/selftests/bpf/prog_tests/cb_refs.c +++ b/tools/testing/selftests/bpf/prog_tests/cb_refs.c @@ -11,7 +11,7 @@ struct { const char *prog_name; const char *err_msg; } cb_refs_tests[] = { - { "underflow_prog", "must point to scalar, or struct with scalar" }, + { "underflow_prog", "release kfunc bpf_kfunc_call_test_release expects referenced PTR_TO_BTF_ID passed to R1" }, { "leak_prog", "Possibly NULL pointer passed to helper R2" }, { "nested_cb", "Unreleased reference id=4 alloc_insn=2" }, /* alloc_insn=2{4,5} */ { "non_cb_transfer_ref", "Unreleased reference id=4 alloc_insn=1" }, /* alloc_insn=1{1,2} */ diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c index a875ba8e5007..d0d65d6d450c 100644 ---
a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c @@ -154,7 +154,7 @@ int BPF_PROG(cgrp_kfunc_xchg_unreleased, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("must be referenced or trusted") +__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(cgrp_kfunc_rcu_get_release, struct cgroup *cgrp, const char *path) { struct cgroup *kptr; @@ -191,7 +191,7 @@ int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path } SEC("tp_btf/cgroup_mkdir") -__failure __msg("R1 pointer type STRUCT cgroup must point") +__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path) { struct cgroup *acquired = (struct cgroup *)&path; @@ -237,7 +237,7 @@ int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("release kernel function bpf_cgroup_release expects") +__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(cgrp_kfunc_release_unacquired, struct cgroup *cgrp, const char *path) { /* Cannot release trusted cgroup pointer which was not acquired. 
*/ diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 8f36e74fd8f9..f11848dfa78f 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -252,7 +252,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("R2 must be referenced") +__failure __msg("release helper bpf_kptr_xchg expects referenced PTR_TO_BTF_ID passed to R2") int reject_untrusted_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index 41047d81ec42..8e947d445f8e 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -178,7 +178,7 @@ int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_f } SEC("tp_btf/task_newtask") -__failure __msg("R1 pointer type STRUCT task_struct must point") +__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired = (struct task_struct *)&clone_flags; @@ -224,7 +224,7 @@ int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__failure __msg("release kernel function bpf_task_release expects") +__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_flags) { /* Cannot release trusted task pointer which was not acquired. 
*/ @@ -313,7 +313,7 @@ int BPF_PROG(task_access_comm4, struct task_struct *task, const char *buf, bool } SEC("tp_btf/task_newtask") -__failure __msg("R1 must be referenced or trusted") +__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(task_kfunc_release_in_map, struct task_struct *task, u64 clone_flags) { struct task_struct *local; diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index e7dae0cf9c17..ea273e152209 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -153,7 +153,7 @@ __weak int subprog_trusted_destroy(struct task_struct *task __arg_trusted) SEC("?tp_btf/task_newtask") __failure __log_level(2) -__msg("release kernel function bpf_task_release expects refcounted PTR_TO_BTF_ID") +__msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(trusted_destroy_fail, struct task_struct *task, u64 clone_flags) { return subprog_trusted_destroy(task); diff --git a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c index 139f70bb3595..199ad18f8eb5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c +++ b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c @@ -1288,7 +1288,7 @@ l1_%=: r1 = r6; \ SEC("tc") __description("reference tracking: bpf_sk_release(listen_sk)") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_sk_release_listen_sk(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index a2132c72d3b8..9f680cf44512 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ 
b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -603,7 +603,7 @@ l2_%=: r0 = *(u32*)(r0 + %[bpf_tcp_sock_snd_cwnd]); \ SEC("tc") __description("bpf_sk_release(skb->sk)") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_sk_release_skb_sk(void) { asm volatile (" \ @@ -620,7 +620,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("bpf_sk_release(bpf_sk_fullsock(skb->sk))") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_sk_fullsock_skb_sk(void) { asm volatile (" \ @@ -644,7 +644,7 @@ l1_%=: r1 = r0; \ SEC("tc") __description("bpf_sk_release(bpf_tcp_sock(skb->sk))") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_tcp_sock_skb_sk(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c index 0990de076844..2870738d93f7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c @@ -80,7 +80,7 @@ int BPF_PROG(get_task_exe_file_kfunc_unreleased) } SEC("lsm.s/file_open") -__failure __msg("release kernel function bpf_put_file expects") +__failure __msg("release kfunc bpf_put_file expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(put_file_kfunc_unacquired, struct file *file) { /* Can't release an unacquired pointer. 
*/ diff --git a/tools/testing/selftests/bpf/progs/wakeup_source_fail.c b/tools/testing/selftests/bpf/progs/wakeup_source_fail.c index b8bbb61d4d4e..d4d0f1610853 100644 --- a/tools/testing/selftests/bpf/progs/wakeup_source_fail.c +++ b/tools/testing/selftests/bpf/progs/wakeup_source_fail.c @@ -42,7 +42,7 @@ int wakeup_source_access_lock_fields(void *ctx) } SEC("syscall") -__failure __msg("type=scalar expected=fp") +__failure __msg("release kfunc bpf_wakeup_sources_read_unlock expects referenced PTR_TO_BTF_ID passed to R1") int wakeup_source_unlock_no_lock(void *ctx) { struct bpf_ws_lock *lock = (void *)0x1; -- cgit v1.2.3 From 2eee6fe8ac2cd21e931c009d475e5a8407d42d76 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:32 -0700 Subject: bpf: Fix dynptr ref counting to scan all call frames When checking whether a referenced dynptr can be overwritten, destroy_if_dynptr_stack_slot only counted sibling dynptrs in the current call frame. If a clone sharing the same virtual ref parent existed in a different frame (e.g., passed to a subprog), it would not be counted, causing the verifier to incorrectly reject the overwrite with "cannot overwrite referenced dynptr". Fix by extracting the counting into dynptr_ref_cnt() which uses bpf_for_each_reg_in_vstate_mask() to scan dynptr stack slots across all call frames. 
Fixes: 017f5c4ef73c ("bpf: Allow overwriting referenced dynptr when refcnt > 1") Reported-by: Eduard Zingerman Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-10-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 52 ++++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index caa455fad877..5d8f2656dbfd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -786,10 +786,29 @@ static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_ __mark_reg_unknown(env, reg); } +static int dynptr_ref_cnt(struct bpf_verifier_env *env, int v_parent_id) +{ + struct bpf_stack_state *stack; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int ref_cnt = 0; + + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, 1 << STACK_DYNPTR, ({ + if (!stack || stack->slot_type[0] != STACK_DYNPTR) + continue; + if (!stack->spilled_ptr.dynptr.first_slot) + continue; + if (stack->spilled_ptr.parent_id == v_parent_id) + ref_cnt++; + })); + + return ref_cnt; +} + static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { - int i, err = 0; + int err = 0; /* We always ensure that STACK_DYNPTR is never set partially, * hence just checking for slot_type[0] is enough. This is @@ -803,28 +822,15 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, if (!state->stack[spi].spilled_ptr.dynptr.first_slot) spi = spi + 1; - if (dynptr_type_referenced(state->stack[spi].spilled_ptr.dynptr.type)) { - int v_parent_id = state->stack[spi].spilled_ptr.parent_id; - int ref_cnt = 0; - - /* - * A referenced dynptr can be overwritten only if there is at - * least one other dynptr sharing the same virtual ref parent, - * ensuring the reference can still be properly released. 
- */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_DYNPTR) - continue; - if (!state->stack[i].spilled_ptr.dynptr.first_slot) - continue; - if (state->stack[i].spilled_ptr.parent_id == v_parent_id) - ref_cnt++; - } - - if (ref_cnt <= 1) { - verbose(env, "cannot overwrite referenced dynptr\n"); - return -EINVAL; - } + /* + * A referenced dynptr can be overwritten only if there is at + * least one other dynptr sharing the same virtual ref parent, + * ensuring the reference can still be properly released. + */ + if (dynptr_type_referenced(state->stack[spi].spilled_ptr.dynptr.type) && + dynptr_ref_cnt(env, state->stack[spi].spilled_ptr.parent_id) <= 1) { + verbose(env, "cannot overwrite referenced dynptr\n"); + return -EINVAL; } /* Invalidate the dynptr and any derived slices */ -- cgit v1.2.3 From fbcc68af60479c4beebe411c1ee5e3c873e3adcf Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:33 -0700 Subject: selftests/bpf: Test creating dynptr from dynptr data and slice The verifier currently does not allow creating dynptr from dynptr data or slice. Add a selftest to test this explicitly. 
Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-11-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/dynptr_fail.c | 42 +++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index 40a14a5174a5..344fb2aa0813 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -705,6 +705,48 @@ int dynptr_from_mem_invalid_api(void *ctx) return 0; } +/* Cannot create dynptr from dynptr data */ +SEC("?raw_tp") +__failure __msg("Unsupported reg type mem for bpf_dynptr_from_mem data") +int dynptr_from_dynptr_data(void *ctx) +{ + struct bpf_dynptr ptr, ptr2; + __u8 *data; + + if (get_map_val_dynptr(&ptr)) + return 0; + + data = bpf_dynptr_data(&ptr, 0, sizeof(__u32)); + if (!data) + return 0; + + /* this should fail */ + bpf_dynptr_from_mem(data, sizeof(__u32), 0, &ptr2); + + return 0; +} + +/* Cannot create dynptr from dynptr slice */ +SEC("?tc") +__failure __msg("Unsupported reg type mem for bpf_dynptr_from_mem data") +int dynptr_from_dynptr_slice(struct __sk_buff *skb) +{ + struct bpf_dynptr ptr, ptr2; + struct ethhdr *hdr; + char buffer[sizeof(*hdr)] = {}; + + bpf_dynptr_from_skb(skb, 0, &ptr); + + hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer)); + if (!hdr) + return SK_DROP; + + /* this should fail */ + bpf_dynptr_from_mem(hdr, sizeof(*hdr), 0, &ptr2); + + return SK_PASS; +} + SEC("?tc") __failure __msg("cannot overwrite referenced dynptr") __log_level(2) int dynptr_pruning_overwrite(struct __sk_buff *ctx) -- cgit v1.2.3 From 925320666e0644c2e884a0dc49ab2dc22b061891 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:34 -0700 Subject: selftests/bpf: Test using slice after invalidating dynptr clone The parent object of a cloned dynptr is skb not the original dynptr. 
Invalidating the original dynptr should not prevent the program from using the slice derived from the cloned dynptr. Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-12-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c | 8 +++ .../bpf_qdisc_dynptr_use_after_invalidate_clone.c | 74 ++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_qdisc_dynptr_use_after_invalidate_clone.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c index 730357cd0c9a..77f1c0550c9b 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c @@ -8,6 +8,10 @@ #include "bpf_qdisc_fifo.skel.h" #include "bpf_qdisc_fq.skel.h" #include "bpf_qdisc_fail__incompl_ops.skel.h" +#include "bpf_qdisc_fail__invalid_dynptr.skel.h" +#include "bpf_qdisc_fail__invalid_dynptr_slice.skel.h" +#include "bpf_qdisc_fail__invalid_dynptr_cross_frame.skel.h" +#include "bpf_qdisc_dynptr_use_after_invalidate_clone.skel.h" #define LO_IFINDEX 1 @@ -223,6 +227,10 @@ void test_ns_bpf_qdisc(void) test_qdisc_attach_to_non_root(); if (test__start_subtest("incompl_ops")) test_incompl_ops(); + RUN_TESTS(bpf_qdisc_fail__invalid_dynptr); + RUN_TESTS(bpf_qdisc_fail__invalid_dynptr_cross_frame); + RUN_TESTS(bpf_qdisc_fail__invalid_dynptr_slice); + RUN_TESTS(bpf_qdisc_dynptr_use_after_invalidate_clone); } void serial_test_bpf_qdisc_default(void) diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_dynptr_use_after_invalidate_clone.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_dynptr_use_after_invalidate_clone.c new file mode 100644 index 000000000000..ac626cfa2a98 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_dynptr_use_after_invalidate_clone.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "bpf_experimental.h"
+#include "bpf_qdisc_common.h" +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +int proto; + +SEC("struct_ops") +__success +int BPF_PROG(dynptr_use_after_invalidate_clone, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct bpf_dynptr ptr, ptr_clone; + struct ethhdr *hdr; + + bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr); + + bpf_dynptr_clone(&ptr, &ptr_clone); + + hdr = bpf_dynptr_slice(&ptr_clone, 0, NULL, sizeof(*hdr)); + if (!hdr) { + bpf_qdisc_skb_drop(skb, to_free); + return NET_XMIT_DROP; + } + + *(int *)&ptr = 0; + + proto = hdr->h_proto; + + bpf_qdisc_skb_drop(skb, to_free); + + return NET_XMIT_DROP; +} + +SEC("struct_ops") +__auxiliary +struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch) +{ + return NULL; +} + +SEC("struct_ops") +__auxiliary +int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + return 0; +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) +{ +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) +{ +} + +SEC(".struct_ops") +struct Qdisc_ops test = { + .enqueue = (void *)dynptr_use_after_invalidate_clone, + .dequeue = (void *)bpf_qdisc_test_dequeue, + .init = (void *)bpf_qdisc_test_init, + .reset = (void *)bpf_qdisc_test_reset, + .destroy = (void *)bpf_qdisc_test_destroy, + .id = "bpf_qdisc_test", +}; -- cgit v1.2.3 From 3f75a757a3afa9c0d5a2637910659e92d236e7f2 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:35 -0700 Subject: selftests/bpf: Test using file dynptr after the reference on file is dropped File dynptr and slice should be invalidated when the parent file's reference is dropped in the program. Without the verifier tracking dynptr's parent referenced object, the dynptr would continue to be incorrectly used even if the underlying file is being torn down or is gone.
Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-13-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/file_reader_fail.c | 60 ++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/file_reader_fail.c b/tools/testing/selftests/bpf/progs/file_reader_fail.c index 0739620dea8a..d5fae5e4cf9a 100644 --- a/tools/testing/selftests/bpf/progs/file_reader_fail.c +++ b/tools/testing/selftests/bpf/progs/file_reader_fail.c @@ -50,3 +50,63 @@ int xdp_no_dynptr_type(struct xdp_md *xdp) bpf_dynptr_file_discard(&dynptr); return 0; } + +SEC("lsm/file_open") +__failure +__msg("Leaking reference id={{[0-9]+}} alloc_insn={{[0-9]+}}. Release it first.") +int use_file_dynptr_after_put_file(void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct file *file = bpf_get_task_exe_file(task); + struct bpf_dynptr dynptr; + char buf[64]; + + if (!file) + return 0; + + if (bpf_dynptr_from_file(file, 0, &dynptr)) + goto out; + + /* this should fail - file dynptr should be discarded first to prevent resource leak */ + bpf_put_file(file); + + bpf_dynptr_read(buf, sizeof(buf), &dynptr, 0, 0); + return 0; + +out: + bpf_dynptr_file_discard(&dynptr); + bpf_put_file(file); + return 0; +} + +SEC("lsm/file_open") +__failure +__msg("Leaking reference id={{[0-9]+}} alloc_insn={{[0-9]+}}. 
Release it first.") +int use_file_dynptr_slice_after_put_file(void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct file *file = bpf_get_task_exe_file(task); + struct bpf_dynptr dynptr; + char *data; + + if (!file) + return 0; + + if (bpf_dynptr_from_file(file, 0, &dynptr)) + goto out; + + data = bpf_dynptr_data(&dynptr, 0, 1); + if (!data) + goto out; + + /* this should fail - file dynptr should be discarded first to prevent resource leak */ + bpf_put_file(file); + + *data = 'x'; + return 0; + +out: + bpf_dynptr_file_discard(&dynptr); + bpf_put_file(file); + return 0; +} -- cgit v1.2.3 From 60c7c3b880c8b3ad7fe025bb68b13bfbc440ceaf Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:36 -0700 Subject: selftests/bpf: Test using dynptr after freeing the underlying object Make sure the verifier invalidates the dynptr and dynptr slice derived from an skb after the skb is freed. Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-14-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/progs/bpf_qdisc_fail__invalid_dynptr.c | 68 ++++++++++++++++++++ .../bpf_qdisc_fail__invalid_dynptr_cross_frame.c | 74 ++++++++++++++++++++++ .../progs/bpf_qdisc_fail__invalid_dynptr_slice.c | 70 ++++++++++++++++++++ 3 files changed, 212 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_cross_frame.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_slice.c diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr.c new file mode 100644 index 000000000000..1d96f7987a3f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "bpf_experimental.h" 
+#include "bpf_qdisc_common.h" +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +int proto; + +SEC("struct_ops") +__failure __msg("Expected an initialized dynptr as R1") +int BPF_PROG(invalid_dynptr, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct bpf_dynptr ptr; + struct ethhdr *hdr; + + bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr); + + bpf_qdisc_skb_drop(skb, to_free); + + hdr = bpf_dynptr_slice(&ptr, 0, NULL, sizeof(*hdr)); + if (!hdr) + return NET_XMIT_DROP; + + proto = hdr->h_proto; + + return NET_XMIT_DROP; +} + +SEC("struct_ops") +__auxiliary +struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch) +{ + return NULL; +} + +SEC("struct_ops") +__auxiliary +int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + return 0; +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) +{ +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) +{ +} + +SEC(".struct_ops") +struct Qdisc_ops test = { + .enqueue = (void *)invalid_dynptr, + .dequeue = (void *)bpf_qdisc_test_dequeue, + .init = (void *)bpf_qdisc_test_init, + .reset = (void *)bpf_qdisc_test_reset, + .destroy = (void *)bpf_qdisc_test_destroy, + .id = "bpf_qdisc_test", +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_cross_frame.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_cross_frame.c new file mode 100644 index 000000000000..2e23b8593af9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_cross_frame.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +int proto; + +static __noinline int free_skb(struct sk_buff *skb) +{ + bpf_kfree_skb(skb); + return 0; +} + 
+SEC("struct_ops") +__failure __msg("invalid mem access 'scalar'") +int BPF_PROG(invalid_dynptr_cross_frame, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct bpf_dynptr ptr; + struct ethhdr *hdr; + + bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr); + + hdr = bpf_dynptr_slice(&ptr, 0, NULL, sizeof(*hdr)); + if (!hdr) + return NET_XMIT_DROP; + + free_skb(skb); + + proto = hdr->h_proto; + + return NET_XMIT_DROP; +} + +SEC("struct_ops") +__auxiliary +struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch) +{ + return NULL; +} + +SEC("struct_ops") +__auxiliary +int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + return 0; +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) +{ +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) +{ +} + +SEC(".struct_ops") +struct Qdisc_ops test = { + .enqueue = (void *)invalid_dynptr_cross_frame, + .dequeue = (void *)bpf_qdisc_test_dequeue, + .init = (void *)bpf_qdisc_test_init, + .reset = (void *)bpf_qdisc_test_reset, + .destroy = (void *)bpf_qdisc_test_destroy, + .id = "bpf_qdisc_test", +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_slice.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_slice.c new file mode 100644 index 000000000000..731216c4e45a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_slice.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +int proto; + +SEC("struct_ops") +__failure __msg("invalid mem access 'scalar'") +int BPF_PROG(invalid_dynptr_slice, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct bpf_dynptr ptr; + struct ethhdr *hdr; + + 
bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr); + + hdr = bpf_dynptr_slice(&ptr, 0, NULL, sizeof(*hdr)); + if (!hdr) { + bpf_qdisc_skb_drop(skb, to_free); + return NET_XMIT_DROP; + } + + bpf_qdisc_skb_drop(skb, to_free); + + proto = hdr->h_proto; + + return NET_XMIT_DROP; +} + +SEC("struct_ops") +__auxiliary +struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch) +{ + return NULL; +} + +SEC("struct_ops") +__auxiliary +int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + return 0; +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) +{ +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) +{ +} + +SEC(".struct_ops") +struct Qdisc_ops test = { + .enqueue = (void *)invalid_dynptr_slice, + .dequeue = (void *)bpf_qdisc_test_dequeue, + .init = (void *)bpf_qdisc_test_init, + .reset = (void *)bpf_qdisc_test_reset, + .destroy = (void *)bpf_qdisc_test_destroy, + .id = "bpf_qdisc_test", +}; -- cgit v1.2.3 From 9a3c3c49c333760c8944dadacbe114c1884546ef Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 1 Jun 2026 17:02:42 +0200 Subject: bpf: Reject exclusive maps as inner maps in map-in-map An exclusive map (created with excl_prog_hash) is bound to a single program by hash: check_map_prog_compatibility() refuses to load any program whose digest does not match map->excl_prog_sha. That check only runs for maps a program references directly, i.e. its used_maps. A map reached at runtime through a map-of-maps is never in used_maps, and bpf_map_meta_equal() does not consider excl_prog_sha, so an exclusive map can be inserted into a non-exclusive outer map and then looked up and mutated by an unrelated program, bypassing the exclusivity guarantee. 
For the signed loader this defeats the metadata map exclusivity check added in the signed loader: the cached map->sha[] is validated against the signed hash while another program on a hostile host rewrites the frozen map's contents through the outer map. Fixes: baefdbdf6812 ("bpf: Implement exclusive map creation") Reported-by: sashiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260601150248.394863-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_in_map.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 645bd30bc9a9..d2cbab4bdf64 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -20,7 +20,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) return ERR_PTR(-EINVAL); - + if (inner_map->excl_prog_sha) + return ERR_PTR(-ENOTSUPP); if (!inner_map->ops->map_meta_equal) return ERR_PTR(-ENOTSUPP); @@ -101,6 +102,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; + if (inner_map->excl_prog_sha) + return ERR_PTR(-ENOTSUPP); inner_map_meta = map->inner_map_meta; if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map)) -- cgit v1.2.3 From c48c3a7e7d5bed644208ed443d63bb6a6f411676 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 1 Jun 2026 17:02:43 +0200 Subject: bpf: Drop redundant hash_buf from map_get_hash operation bpf_map_get_info_by_fd() is the only caller of the ->map_get_hash and always invokes it with hash_buf == map->sha and hash_buf_size of SHA256_DIGEST_SIZE. array_map_get_hash() in turn lets sha256() write the digest directly into that buffer (map->sha) and then performs a trailing memcpy(), which evaluates to memcpy(map->sha, map->sha, 32): a redundant self-copy. The hash_buf_size argument was never used at all. Simplify this a bit, no functional change. 
Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260601150248.394863-3-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/arraymap.c | 6 ++---- kernel/bpf/syscall.c | 8 +++----- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d1a17c118316..c0510d223685 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -111,7 +111,7 @@ struct bpf_map_ops { long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); - int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); + int (*map_get_hash)(struct bpf_map *map); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index dfb2110ab733..e6271a2bf6d6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -175,14 +175,12 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } -static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, - void *hash_buf) +static int array_map_get_hash(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); sha256(array->value, (u64)array->elem_size * array->map.max_entries, - hash_buf); - memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + array->map.sha); return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2aafd2131983..a27fa2b9b405 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5434,18 +5434,16 @@ static int bpf_map_get_info_by_fd(struct file *file, if (!map->ops->map_get_hash) return -EINVAL; - - if (info.hash_size != SHA256_DIGEST_SIZE) + if (info.hash_size != sizeof(map->sha)) return -EINVAL; - 
if (!READ_ONCE(map->frozen)) return -EPERM; - err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); + err = map->ops->map_get_hash(map); if (err != 0) return err; - if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) + if (copy_to_user(uhash, map->sha, sizeof(map->sha)) != 0) return -EFAULT; } else if (info.hash_size) { return -EINVAL; -- cgit v1.2.3 From 0fb6c9ed6493b4af01be8bb0a384574eba7df636 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Mon, 1 Jun 2026 17:02:44 +0200 Subject: libbpf: Reject non-exclusive metadata maps in the signed loader The loader verifies map->sha against the metadata hash in its instructions. map->sha is calculated when BPF_OBJ_GET_INFO_BY_FD is called on the frozen map. While the map is frozen, the /signed loader/ must also ensure the map is exclusive, as, without exclusivity (which a hostile host could just omit when loading the loader), another BPF program with map access can mutate the contents afterwards, so the check passes on stale data. With the extra check as part of the signed loader, it now refuses to move on with map->sha validation if the host set it up wrongly. 
Fixes: fb2b0e290147 ("libbpf: Update light skeleton for signing") Signed-off-by: KP Singh Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260601150248.394863-4-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 7 +++++++ tools/lib/bpf/gen_loader.c | 17 +++++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c0510d223685..8599b451dd7a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -296,6 +296,7 @@ struct bpf_map_owner { struct bpf_map { u8 sha[SHA256_DIGEST_SIZE]; + u32 excl; const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a27fa2b9b405..625a4366fe6d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1588,6 +1588,13 @@ static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_ver err = -EFAULT; goto free_map; } + + /* See libbpf: emit_signature_match() */ + BUILD_BUG_ON(offsetof(struct bpf_map, excl) != SHA256_DIGEST_SIZE); + BUILD_BUG_ON(!__same_type(map->excl, u32)); + BUILD_BUG_ON(offsetof(struct bpf_map, sha) != 0); + BUILD_BUG_ON(!__same_type(map->sha, u8[SHA256_DIGEST_SIZE])); + map->excl = 1; } else if (attr->excl_prog_hash_size) { bpf_log(log, "Invalid excl_prog_hash_size.\n"); err = -EINVAL; diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 3702c5944bc0..66a02039da8c 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -586,6 +586,23 @@ static void emit_signature_match(struct bpf_gen *gen) __s64 off; int i; + /* + * Reject if the metadata map is not exclusive. Without exclusivity + * the cached map->sha[] verified above can be stale: another BPF + * program with map access could have mutated the contents between + * BPF_OBJ_GET_INFO_BY_FD and loader execution. 
+ */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX, + 0, 0, 0, 0)); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, SHA256_DIGEST_LENGTH)); + off = -(gen->insn_cur - gen->insn_start - gen->cleanup_label) / 8 - 2; + if (is_simm16(off)) { + emit(gen, BPF_MOV64_IMM(BPF_REG_7, -EINVAL)); + emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, off)); + } else { + gen->error = -ERANGE; + } + for (i = 0; i < SHA256_DWORD_SIZE; i++) { emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX, 0, 0, 0, 0)); -- cgit v1.2.3 From 61e084152328867fe2279cc790573aae39959cd5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 1 Jun 2026 17:02:45 +0200 Subject: libbpf: Skip initial_value override on signed loaders bpf_gen__map_update_elem() emits code that, when the host-supplied loader ctx provides a non-NULL map_desc[idx].initial_value, overwrites the blob value with bytes read from the host (bpf_copy_from_user / bpf_probe_read_kernel) before the BPF_MAP_UPDATE_ELEM that populates the program's .data/.rodata/.bss maps. This override runs after emit_signature_match() has validated map->sha[], and initial_value is part of neither the signed loader instructions nor the hashed data blob. For a signed loader this lets an untrusted host substitute global-variable contents into a program whose code carries a valid signature, thus weakening what the signature attests to. The blob already contains the signer-provided value (added via add_data() and covered by the embedded, signed hash), so simply skip emitting the override for signed loaders (gen_hash). Runtime initialization stays available for the unsigned light-skeleton path as before. The jump offsets within the override block are internal to it, so guarding the whole block leaves them unchanged. 
Fixes: ea923080c145 ("libbpf: Embed and verify the metadata hash in the loader") Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260601150248.394863-5-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/gen_loader.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 66a02039da8c..a5d9c7a5261b 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -1187,27 +1187,36 @@ void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *pvalue, value = add_data(gen, pvalue, value_size); key = add_data(gen, &zero, sizeof(zero)); - /* if (map_desc[map_idx].initial_value) { + /* + * if (map_desc[map_idx].initial_value) { * if (ctx->flags & BPF_SKEL_KERNEL) * bpf_probe_read_kernel(value, value_size, initial_value); * else * bpf_copy_from_user(value, value_size, initial_value); * } + * + * The runtime initial_value comes from the host-supplied loader + * ctx and would overwrite the blob value after emit_signature_match() + * has already validated map->sha[]. For a signed loader (gen_hash) + * the attested blob value must be authoritative, so skip the override + * and leave the hashed value in place. 
*/ - emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_6, - sizeof(struct bpf_loader_ctx) + - sizeof(struct bpf_map_desc) * map_idx + - offsetof(struct bpf_map_desc, initial_value))); - emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 8)); - emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, - 0, 0, 0, value)); - emit(gen, BPF_MOV64_IMM(BPF_REG_2, value_size)); - emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, - offsetof(struct bpf_loader_ctx, flags))); - emit(gen, BPF_JMP_IMM(BPF_JSET, BPF_REG_0, BPF_SKEL_KERNEL, 2)); - emit(gen, BPF_EMIT_CALL(BPF_FUNC_copy_from_user)); - emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 1)); - emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel)); + if (!OPTS_GET(gen->opts, gen_hash, false)) { + emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_6, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * map_idx + + offsetof(struct bpf_map_desc, initial_value))); + emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 8)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, value)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, value_size)); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, + offsetof(struct bpf_loader_ctx, flags))); + emit(gen, BPF_JMP_IMM(BPF_JSET, BPF_REG_0, BPF_SKEL_KERNEL, 2)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_copy_from_user)); + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 1)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel)); + } map_update_attr = add_data(gen, &attr, attr_size); pr_debug("gen: map_update_elem: idx %d, value: off %d size %d, attr: off %d size %d\n", -- cgit v1.2.3 From 60214435b365ecdd40b2f96d4e54564b5c927645 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 1 Jun 2026 17:02:46 +0200 Subject: libbpf: Skip max_entries override on signed loaders bpf_gen__map_create() lets the host-supplied loader ctx override a map's max_entries at runtime (map_desc[idx].max_entries, when non-zero). 
This is how the light skeleton sizes maps to the target machine, but it happens after emit_signature_match() and is covered by neither the signed loader instructions nor the hashed blob. For a signed loader this means an untrusted host can re-dimension the program's maps, outside what the signature attests to. Gate the override on gen_hash so signed loaders use the signer-provided max_entries baked into the blob. Fixes: ea923080c145 ("libbpf: Embed and verify the metadata hash in the loader") Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260601150248.394863-6-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/gen_loader.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index a5d9c7a5261b..66e13566bc31 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -546,13 +546,22 @@ void bpf_gen__map_create(struct bpf_gen *gen, default: break; } - /* conditionally update max_entries */ - if (map_idx >= 0) + + /* + * Conditionally update max_entries from the host-supplied loader + * ctx. This sizes the map at runtime, but for a signed loader + * (gen_hash) it would let an untrusted host re-dimension the + * program's maps after emit_signature_match(), outside what the + * signature attests to. Keep the signer-provided max_entries + * baked into the blob in that case. 
+ */ + if (map_idx >= 0 && !OPTS_GET(gen->opts, gen_hash, false)) move_ctx2blob(gen, attr_field(map_create_attr, max_entries), 4, sizeof(struct bpf_loader_ctx) + sizeof(struct bpf_map_desc) * map_idx + offsetof(struct bpf_map_desc, max_entries), true /* check that max_entries != 0 */); + /* emit MAP_CREATE command */ emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size); debug_ret(gen, "map_create %s idx %d type %d value_size %d value_btf_id %d", -- cgit v1.2.3 From 38498c0ebacd54dbaac3513a548a13f1a8455c4e Mon Sep 17 00:00:00 2001 From: KP Singh Date: Mon, 1 Jun 2026 17:02:47 +0200 Subject: selftests/bpf: Adjust verifier_map_ptr for the map's excl field Adding the u32 excl field at offset 32 of struct bpf_map right after the sha[SHA256_DIGEST_SIZE] hash shifts the ops pointer from offset 32 to 40. Therefore, fix up the test case. # LDLIBS=-static PKG_CONFIG='pkg-config --static' ./vmtest.sh -- ./test_progs -t verifier_map_ptr [...] #637/1 verifier_map_ptr/bpf_map_ptr: read with negative offset rejected:OK #637/2 verifier_map_ptr/bpf_map_ptr: read with negative offset rejected @unpriv:OK #637/3 verifier_map_ptr/bpf_map_ptr: write rejected:OK #637/4 verifier_map_ptr/bpf_map_ptr: write rejected @unpriv:OK #637/5 verifier_map_ptr/bpf_map_ptr: read non-existent field rejected:OK #637/6 verifier_map_ptr/bpf_map_ptr: read non-existent field rejected @unpriv:OK #637/7 verifier_map_ptr/bpf_map_ptr: read ops field accepted:OK #637/8 verifier_map_ptr/bpf_map_ptr: read ops field accepted @unpriv:OK [...] 
Summary: 2/18 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260601150248.394863-7-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_map_ptr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c index e2767d27d8aa..d8e822d1a8ba 100644 --- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c @@ -70,13 +70,15 @@ __naked void bpf_map_ptr_write_rejected(void) : __clobber_all); } -/* The first element of struct bpf_map is a SHA256 hash of 32 bytes, accessing - * into this array is valid. The opts field is now at offset 33. +/* + * struct bpf_map starts with the SHA256 hash sha[32] at offset 0 (a readable + * byte array), followed by the u32 excl field at offset 32. Reading a u32 at + * offset 33 runs past the end of excl and is rejected. */ SEC("socket") __description("bpf_map_ptr: read non-existent field rejected") __failure -__msg("cannot access ptr member ops with moff 32 in struct bpf_map with off 33 size 4") +__msg("access beyond the end of member excl (mend:36) in struct bpf_map with off 33 size 4") __failure_unpriv __msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN") __flag(BPF_F_ANY_ALIGNMENT) -- cgit v1.2.3 From 32f725458a1ab5973c64e4636659ca2c0db42f48 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 1 Jun 2026 17:02:48 +0200 Subject: selftests/bpf: Test that exclusive maps are rejected in map-in-map Add a subtest to map_excl that verifies an exclusive map (created with excl_prog_hash) cannot be used in a map-of-maps, covering both kernel enforcement points: i) the inner-map template at map-of-maps creation and, ii) the element inserted into an existing map-of-maps. 
# LDLIBS=-static PKG_CONFIG='pkg-config --static' ./vmtest.sh -- ./test_progs -t map_excl ./test_progs -t map_excl [ 1.728106] bpf_testmod: loading out-of-tree module taints kernel. [ 1.730473] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel #215/1 map_excl/map_excl_allowed:OK #215/2 map_excl/map_excl_denied:OK #215/3 map_excl/map_excl_no_map_in_map:OK #215 map_excl:OK Summary: 1/3 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260601150248.394863-8-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/map_excl.c | 46 +++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/map_excl.c b/tools/testing/selftests/bpf/prog_tests/map_excl.c index 6bdc6d6de0da..a213dd559aae 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_excl.c +++ b/tools/testing/selftests/bpf/prog_tests/map_excl.c @@ -8,6 +8,10 @@ #include "map_excl.skel.h" +#ifndef SHA256_DIGEST_SIZE +#define SHA256_DIGEST_SIZE 32 +#endif + static void test_map_excl_allowed(void) { struct map_excl *skel = map_excl__open(); @@ -45,10 +49,52 @@ out: } +static void test_map_excl_no_map_in_map(void) +{ + __u8 hash[SHA256_DIGEST_SIZE] = {}; + LIBBPF_OPTS(bpf_map_create_opts, excl_opts, + .excl_prog_hash = hash, + .excl_prog_hash_size = sizeof(hash)); + LIBBPF_OPTS(bpf_map_create_opts, outer_opts); + int excl_fd, tmpl_fd = -1, outer_fd = -1, err; + __u32 key = 0; + + excl_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl_inner", 4, 4, 1, &excl_opts); + if (!ASSERT_OK_FD(excl_fd, "create exclusive map")) + return; + + outer_opts.inner_map_fd = excl_fd; + err = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer_from_excl", + 4, 4, 1, &outer_opts); + if (err >= 0) + close(err); + ASSERT_EQ(err, -ENOTSUPP, "reject exclusive map as map-in-map template"); + + tmpl_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "tmpl", 4, 4, 1, NULL); + if 
(!ASSERT_OK_FD(tmpl_fd, "create inner template")) + goto out; + + outer_opts.inner_map_fd = tmpl_fd; + outer_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer", 4, 4, 1, &outer_opts); + if (!ASSERT_OK_FD(outer_fd, "create map-of-maps")) + goto out; + + err = bpf_map_update_elem(outer_fd, &key, &excl_fd, 0); + ASSERT_EQ(err, -ENOTSUPP, "reject exclusive map as map-in-map element"); +out: + if (outer_fd >= 0) + close(outer_fd); + if (tmpl_fd >= 0) + close(tmpl_fd); + close(excl_fd); +} + void test_map_excl(void) { if (test__start_subtest("map_excl_allowed")) test_map_excl_allowed(); if (test__start_subtest("map_excl_denied")) test_map_excl_denied(); + if (test__start_subtest("map_excl_no_map_in_map")) + test_map_excl_no_map_in_map(); } -- cgit v1.2.3 From a0fa68d8ce759dbf6aaf19a043ddd77a2128c26c Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Mon, 1 Jun 2026 20:41:15 -0400 Subject: selftests/bpf: libarena: Add "arena" BTF type tag to __arena qualifier The arena qualifier currently designates its associated type as belonging to address space 1. This property affects code generation, but is not reflected in the BTF information of the function. This lack of information at the BTF level prevents us from returning arena pointers from global subprograms. Subprogs cannot return any data structure more complex than a scalar, so pointers to structs are rejected as a return type. We have no way of marking the return type as a pointer to an arena, which is safe provided the two subprogs have the same arena. Expand the __arena qualifier to also attach a BTF type tag to the type. This lets us determine whether a variable belongs to an arena from its type alone through BTF parsing. 
Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260602004120.17087-2-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h b/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h index 16f8ce832004..445be3c4edec 100644 --- a/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h +++ b/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h @@ -33,7 +33,7 @@ #endif #if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) -#define __arena __attribute__((address_space(1))) +#define __arena __attribute__((address_space(1))) __attribute__((btf_type_tag("arena"))) #define __arena_global __attribute__((address_space(1))) #define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ #define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */ -- cgit v1.2.3 From 5ab4bc67d818ba27388d64cb9c52cb0c3bdac254 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Mon, 1 Jun 2026 20:41:16 -0400 Subject: verifier: parse BTF type tags for function arguments The BTF parsing logic for function arguments goes through the arguments' decl tags, but does not go into their type tags. Add type tag parsing for function arguments. 
Acked-by: Eduard Zingerman Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260602004120.17087-3-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 120 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 35 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f429f6f58cb2..6fb3461e3ac2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7802,6 +7802,84 @@ enum btf_arg_tag { ARG_TAG_ARENA = BIT_ULL(5), }; +static int btf_scan_decl_tags(struct bpf_verifier_env *env, + const struct btf *btf, + const struct btf_type *fn_t, + u32 arg_idx, bool is_global, u32 *tags) +{ + int id = btf_named_start_id(btf, false) - 1; + + /* + * The 'arg:' decl_tag takes precedence over the derivation + * of the register type from the BTF type itself. + */ + while ((id = btf_find_next_decl_tag(btf, fn_t, arg_idx, "arg:", id)) > 0) { + const struct btf_type *tag_t = btf_type_by_id(btf, id); + const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; + + /* disallow arg tags in static subprogs */ + if (!is_global) { + bpf_log(&env->log, + "arg#%d type tag is not supported in static functions\n", + arg_idx); + return -EOPNOTSUPP; + } + + if (strcmp(tag, "ctx") == 0) { + *tags |= ARG_TAG_CTX; + } else if (strcmp(tag, "trusted") == 0) { + *tags |= ARG_TAG_TRUSTED; + } else if (strcmp(tag, "untrusted") == 0) { + *tags |= ARG_TAG_UNTRUSTED; + } else if (strcmp(tag, "nonnull") == 0) { + *tags |= ARG_TAG_NONNULL; + } else if (strcmp(tag, "nullable") == 0) { + *tags |= ARG_TAG_NULLABLE; + } else if (strcmp(tag, "arena") == 0) { + *tags |= ARG_TAG_ARENA; + } else { + bpf_log(&env->log, "arg#%d has unsupported set of tags\n", arg_idx); + return -EOPNOTSUPP; + } + } + if (id != -ENOENT) { + bpf_log(&env->log, "arg#%d type tag fetching failure: %d\n", arg_idx, id); + return id; + } + + return 0; +} + +static int btf_scan_type_tags(struct bpf_verifier_env *env, + const struct btf *btf, u32 type_id, 
+ u32 *tags) +{ + const struct btf_type *t; + + /* Find the first pointer type in the chain. */ + t = btf_type_skip_modifiers(btf, type_id, NULL); + if (!t || !btf_type_is_ptr(t)) + return 0; + + /* We got a pointer, get all associated type tags. */ + t = btf_type_by_id(btf, t->type); + while (t && btf_type_is_type_tag(t)) { + const char *tag = __btf_name_by_offset(btf, t->name_off); + + if (strcmp(tag, "arena") == 0) { + *tags |= ARG_TAG_ARENA; + } else { + bpf_log(&env->log, "function signature member has unsupported type tag '%s'\n", + tag); + return -EOPNOTSUPP; + } + + t = btf_type_by_id(btf, t->type); + } + + return 0; +} + /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. @@ -7820,6 +7898,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) struct btf *btf = prog->aux->btf; const struct btf_param *args; const struct btf_type *t, *ref_t, *fn_t; + int err; u32 i, nargs, btf_id; const char *tname; @@ -7903,42 +7982,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) */ for (i = 0; i < nargs; i++) { u32 tags = 0; - int id = btf_named_start_id(btf, false) - 1; - - /* 'arg:' decl_tag takes precedence over derivation of - * register type from BTF type itself - */ - while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) { - const struct btf_type *tag_t = btf_type_by_id(btf, id); - const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; - - /* disallow arg tags in static subprogs */ - if (!is_global) { - bpf_log(log, "arg#%d type tag is not supported in static functions\n", i); - return -EOPNOTSUPP; - } + err = btf_scan_decl_tags(env, btf, fn_t, i, is_global, &tags); + if (err) + return err; - if (strcmp(tag, "ctx") == 0) { - tags |= ARG_TAG_CTX; - } else if (strcmp(tag, "trusted") == 0) { - tags |= ARG_TAG_TRUSTED; - } else if (strcmp(tag, "untrusted") == 0) { - 
tags |= ARG_TAG_UNTRUSTED; - } else if (strcmp(tag, "nonnull") == 0) { - tags |= ARG_TAG_NONNULL; - } else if (strcmp(tag, "nullable") == 0) { - tags |= ARG_TAG_NULLABLE; - } else if (strcmp(tag, "arena") == 0) { - tags |= ARG_TAG_ARENA; - } else { - bpf_log(log, "arg#%d has unsupported set of tags\n", i); - return -EOPNOTSUPP; - } - } - if (id != -ENOENT) { - bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id); - return id; - } + err = btf_scan_type_tags(env, btf, args[i].type, &tags); + if (err) + return err; t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) -- cgit v1.2.3 From 3e924e9272c80939677aa6902aced311c85fe48c Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Mon, 1 Jun 2026 20:41:17 -0400 Subject: bpf: Allow subprogs to return arena pointers BPF subprogs currently only return void or scalar values. However, it is also safe to return arena pointers between subprogs in the same BPF program: Arena pointers are guaranteed to be safe for both programs at any point. Expand the verifier to permit returning an arena pointer to the caller. The main subprog is still not allowed to return an arena pointer because arena pointers are internal to the BPF program, and the return values permitted for each main subprog depend on the program type anyway. Acked-by: Eduard Zingerman Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260602004120.17087-4-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 66 ++++++++++++++++++++++++++++++++++++++------------- kernel/bpf/verifier.c | 4 ++++ 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6fb3461e3ac2..68921d9172b5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7858,12 +7858,22 @@ static int btf_scan_type_tags(struct bpf_verifier_env *env, /* Find the first pointer type in the chain. 
*/ t = btf_type_skip_modifiers(btf, type_id, NULL); + + /* + * We currently reject type tags on non-pointer types, + * which neither LLVM nor GCC support anyway. + */ if (!t || !btf_type_is_ptr(t)) return 0; /* We got a pointer, get all associated type tags. */ - t = btf_type_by_id(btf, t->type); - while (t && btf_type_is_type_tag(t)) { + for (t = btf_type_by_id(btf, t->type); t && btf_type_is_modifier(t); + t = btf_type_by_id(btf, t->type)) { + + /* Skip non-type tag modifiers. */ + if (!btf_type_is_type_tag(t)) + continue; + const char *tag = __btf_name_by_offset(btf, t->name_off); if (strcmp(tag, "arena") == 0) { @@ -7873,13 +7883,39 @@ static int btf_scan_type_tags(struct bpf_verifier_env *env, tag); return -EOPNOTSUPP; } - - t = btf_type_by_id(btf, t->type); } return 0; } +/* Check whether the type is a valid return type. */ +static int btf_validate_return_type(struct bpf_verifier_env *env, struct btf *btf, + const struct btf_type *t, int subprog) +{ + u32 tags = 0; + int err; + + err = btf_scan_type_tags(env, btf, t->type, &tags); + if (err) + return err; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + /* + * We allow all subprogs except for the main one to return any kind of arena pointer. + * General arena variables are not allowed, since it makes no sense to return by value + * a variable that's on the heap in the first place. + */ + if (subprog && (tags & ARG_TAG_ARENA) && btf_type_is_ptr(t)) + return 0; + + /* We always accept void or scalars. */ + if (btf_type_is_void(t) || btf_type_is_int(t) || btf_is_any_enum(t)) + return 0; + + return -EOPNOTSUPP; +} + /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. 
@@ -7963,18 +7999,16 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } - /* check that function is void or returns int, exception cb also requires this */ - t = btf_type_by_id(btf, t->type); - while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf, t->type); - if (!btf_type_is_void(t) && !btf_type_is_int(t) && !btf_is_any_enum(t)) { - if (!is_global) - return -EINVAL; - bpf_log(log, - "Global function %s() return value not void or scalar. " - "Only those are supported.\n", - tname); - return -EINVAL; + + err = btf_validate_return_type(env, btf, t, subprog); + if (err) { + if (is_global) { + bpf_log(log, + "Global function %s() return value not void or scalar. " + "Only those are supported.\n", + tname); + } + return err; } /* Convert BTF function arguments into verifier types. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5d8f2656dbfd..8ed484cb1a8a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16503,6 +16503,10 @@ static int check_global_subprog_return_code(struct bpf_verifier_env *env) if (err) return err; + /* Pointers to arena are safe to pass between subprograms. */ + if (is_arena_reg(env, BPF_REG_0)) + return 0; + if (is_pointer_value(env, BPF_REG_0)) { verbose(env, "R%d leaks addr as return value\n", BPF_REG_0); return -EACCES; -- cgit v1.2.3 From b9b23fe1761117f4a0109a25d16d337c900437ad Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Mon, 1 Jun 2026 20:41:18 -0400 Subject: selftests/bpf: Remove __arg_arena from the codebase Now that BPF __arg_arena has been subsumed by __arena, remove __arg_arena from the codebase. This way the user has one fewer annotation to worry about. To remove __arg_arena we remove the typedefs we were previously using to minimize __arena annotations. This is because __arena now also includes a BTF type tag, which is ignored for non-pointer types. 
As a result, we cannot capture the whole __arena annotation inside a typedef and need to directly annotate the pointer type when declaring the variable. The extra verbosity is worth it because the use of the __arena tag is intuitive to the programmer and removes the __arg_arena tag that has been a consistent source of confusion for users. The typedefs can be reintroduced later (without __arg_arena) once compilers start supporting BTF type tags for non-pointer types. Acked-by: Eduard Zingerman Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260602004120.17087-5-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_arena_htab.h | 11 ++- tools/testing/selftests/bpf/bpf_arena_strsearch.h | 4 +- .../bpf/libarena/include/bpf_arena_common.h | 3 +- .../bpf/libarena/include/bpf_arena_spin_lock.h | 6 +- .../selftests/bpf/libarena/include/libarena/asan.h | 6 +- .../bpf/libarena/include/libarena/buddy.h | 22 ++---- .../bpf/libarena/selftests/st_asan_buddy.bpf.c | 4 +- .../bpf/libarena/selftests/st_asan_common.h | 2 +- .../bpf/libarena/selftests/st_buddy.bpf.c | 2 +- .../testing/selftests/bpf/libarena/src/asan.bpf.c | 38 +++++----- .../testing/selftests/bpf/libarena/src/buddy.bpf.c | 80 +++++++++++----------- .../selftests/bpf/libarena/src/common.bpf.c | 6 +- .../testing/selftests/bpf/progs/arena_spin_lock.c | 1 + 13 files changed, 84 insertions(+), 101 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_arena_htab.h b/tools/testing/selftests/bpf/bpf_arena_htab.h index acc01a876668..d7ba86362d86 100644 --- a/tools/testing/selftests/bpf/bpf_arena_htab.h +++ b/tools/testing/selftests/bpf/bpf_arena_htab.h @@ -14,9 +14,8 @@ struct htab { htab_bucket_t *buckets; int n_buckets; }; -typedef struct htab __arena htab_t; -static inline htab_bucket_t *__select_bucket(htab_t *htab, __u32 hash) +static inline htab_bucket_t *__select_bucket(struct htab __arena *htab, __u32 hash) { htab_bucket_t *b = htab->buckets; @@ -24,7 +23,7 @@ 
static inline htab_bucket_t *__select_bucket(htab_t *htab, __u32 hash) return &b[hash & (htab->n_buckets - 1)]; } -static inline arena_list_head_t *select_bucket(htab_t *htab, __u32 hash) +static inline arena_list_head_t *select_bucket(struct htab __arena *htab, __u32 hash) { return &__select_bucket(htab, hash)->head; } @@ -53,7 +52,7 @@ static int htab_hash(int key) return key; } -__weak int htab_lookup_elem(htab_t *htab __arg_arena, int key) +__weak int htab_lookup_elem(struct htab __arena *htab, int key) { hashtab_elem_t *l_old; arena_list_head_t *head; @@ -66,7 +65,7 @@ __weak int htab_lookup_elem(htab_t *htab __arg_arena, int key) return 0; } -__weak int htab_update_elem(htab_t *htab __arg_arena, int key, int value) +__weak int htab_update_elem(struct htab __arena *htab, int key, int value) { hashtab_elem_t *l_new = NULL, *l_old; arena_list_head_t *head; @@ -90,7 +89,7 @@ __weak int htab_update_elem(htab_t *htab __arg_arena, int key, int value) return 0; } -void htab_init(htab_t *htab) +void htab_init(struct htab __arena *htab) { void __arena *buckets = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0); diff --git a/tools/testing/selftests/bpf/bpf_arena_strsearch.h b/tools/testing/selftests/bpf/bpf_arena_strsearch.h index f0d575daef5a..10a70667c8bf 100644 --- a/tools/testing/selftests/bpf/bpf_arena_strsearch.h +++ b/tools/testing/selftests/bpf/bpf_arena_strsearch.h @@ -3,7 +3,7 @@ #pragma once #include -__noinline int bpf_arena_strlen(const char __arena *s __arg_arena) +__noinline int bpf_arena_strlen(const char __arena *s) { const char __arena *sc; @@ -40,7 +40,7 @@ __noinline int bpf_arena_strlen(const char __arena *s __arg_arena) * * An opening bracket without a matching close is matched literally. 
*/ -__noinline bool glob_match(char const __arena *pat __arg_arena, char const __arena *str __arg_arena) +__noinline bool glob_match(char const __arena *pat, char const __arena *str) { /* * Backtrack to previous * on mismatch and retry starting one diff --git a/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h b/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h index 445be3c4edec..82aafe879fae 100644 --- a/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h +++ b/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h @@ -38,7 +38,7 @@ #define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ #define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */ #else -#define __arena +#define __arena __attribute__((btf_type_tag("arena"))) #define __arena_global SEC(".addr_space.1") #define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) #define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) @@ -54,7 +54,6 @@ void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym _ #else /* when compiled as user space code */ #define __arena -#define __arg_arena #define cast_kern(ptr) /* nop for user space */ #define cast_user(ptr) /* nop for user space */ __weak char arena[1]; diff --git a/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h index 164638690a4d..ae6b72d15bb6 100644 --- a/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h +++ b/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h @@ -16,10 +16,6 @@ #define EOPNOTSUPP 95 #define ETIMEDOUT 110 -#ifndef __arena -#define __arena __attribute__((address_space(1))) -#endif - extern unsigned long CONFIG_NR_CPUS __kconfig; /* @@ -246,7 +242,7 @@ static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) } __noinline __weak -int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) +int 
arena_spin_lock_slowpath(arena_spinlock_t __arena *lock, u32 val) { struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; int ret = -ETIMEDOUT; diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/asan.h b/tools/testing/selftests/bpf/libarena/include/libarena/asan.h index eb9fc69d9eb0..900267159292 100644 --- a/tools/testing/selftests/bpf/libarena/include/libarena/asan.h +++ b/tools/testing/selftests/bpf/libarena/include/libarena/asan.h @@ -25,12 +25,10 @@ extern volatile bool asan_report_once; #ifdef BPF_ARENA_ASAN -typedef s8 __arena s8a; - static inline -s8a *mem_to_shadow(void __arena __arg_arena *addr) +s8 __arena *mem_to_shadow(void __arena *addr) { - return (s8a *)(((u32)(u64)addr >> ASAN_SHADOW_SHIFT) + + return (s8 __arena *)(((u32)(u64)addr >> ASAN_SHADOW_SHIFT) + __asan_shadow_memory_dynamic_address); } diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h b/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h index 00e2437128ef..4d57fc1b5c26 100644 --- a/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h +++ b/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h @@ -2,12 +2,6 @@ /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ #pragma once -struct buddy_chunk; -typedef struct buddy_chunk __arena buddy_chunk_t; - -struct buddy_header; -typedef struct buddy_header __arena buddy_header_t; - enum buddy_consts { /* * Minimum allocation is 1 << BUDDY_MIN_ALLOC_SHIFT. @@ -68,25 +62,21 @@ struct buddy_chunk { u8 allocated[BUDDY_CHUNK_ITEMS / 8]; /* Freelists for O(1) allocation. */ u64 freelists[BUDDY_CHUNK_NUM_ORDERS]; - buddy_chunk_t *next; + struct buddy_chunk __arena *next; }; struct buddy { - buddy_chunk_t *first_chunk; /* Pointer to the chunk linked list. */ + struct buddy_chunk __arena *first_chunk; /* Pointer to the chunk linked list. 
*/ arena_spinlock_t lock; /* Allocator lock */ u64 vaddr; /* Allocation into reserved vaddr */ }; -typedef struct buddy __arena buddy_t; - #ifdef __BPF__ -int buddy_init(buddy_t *buddy); -int buddy_destroy(buddy_t *buddy); -int buddy_free_internal(buddy_t *buddy, u64 free); -#define buddy_free(buddy, ptr) buddy_free_internal((buddy), (u64)(ptr)) -u64 buddy_alloc_internal(buddy_t *buddy, size_t size); +int buddy_init(struct buddy __arena *buddy); +int buddy_destroy(struct buddy __arena *buddy); +int buddy_free(struct buddy __arena *buddy, void __arena *free); +u64 buddy_alloc_internal(struct buddy __arena *buddy, size_t size); #define buddy_alloc(alloc, size) ((void __arena *)buddy_alloc_internal((alloc), (size))) - #endif /* __BPF__ */ diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c index 97acd50ffa5c..686caba2c643 100644 --- a/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c +++ b/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c @@ -8,7 +8,7 @@ /* Required for parsing the ASAN call stacks. */ #include "test_progs_compat.h" -extern buddy_t buddy; +extern struct buddy __arena buddy; #ifdef BPF_ARENA_ASAN @@ -54,7 +54,7 @@ static __always_inline int asan_test_buddy_oob_single(size_t alloc_size) * Factored out because asan_validate_addr is complex enough to cause * verification failures if verified with the rest of asan_test_buddy_uaf_single. 
*/ -__weak int asan_test_buddy_byte(u8 __arena __arg_arena *mem, int i, bool freed) +__weak int asan_test_buddy_byte(u8 __arena *mem, int i, bool freed) { int ret; diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h b/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h index 1d3edc4372ac..34a7918cb4cf 100644 --- a/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h +++ b/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h @@ -9,7 +9,7 @@ static inline void print_asan_map_state(void __arena *addr) { arena_stdout("%s:%d ASAN %p -> (val: %x gran: %x set: [%s])", __func__, __LINE__, addr, - *(s8a *)(addr), ASAN_GRANULE(addr), + *(s8 __arena *)(addr), ASAN_GRANULE(addr), asan_shadow_set(addr) ? "yes" : "no"); } diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c index 79e6f0baabfe..b45a306816c0 100644 --- a/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c +++ b/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c @@ -6,7 +6,7 @@ #include #include -extern buddy_t buddy; +extern struct buddy __arena buddy; struct segarr_entry { u8 __arena *block; diff --git a/tools/testing/selftests/bpf/libarena/src/asan.bpf.c b/tools/testing/selftests/bpf/libarena/src/asan.bpf.c index 64c5b990086c..5135d5c72a46 100644 --- a/tools/testing/selftests/bpf/libarena/src/asan.bpf.c +++ b/tools/testing/selftests/bpf/libarena/src/asan.bpf.c @@ -110,7 +110,7 @@ volatile bool asan_report_once = false; * to exit due to a missing implementation. Provide a simple implementation * just for memset to use it for poisoning/unpoisoning the map. */ -__weak int asan_memset(s8a __arg_arena *dst, s8 val, size_t size) +__weak int asan_memset(s8 __arena *dst, s8 val, size_t size) { size_t i; @@ -121,9 +121,9 @@ __weak int asan_memset(s8a __arg_arena *dst, s8 val, size_t size) } /* Validate a 1-byte access, always within a single byte. 
*/ -static __always_inline bool memory_is_poisoned_1(s8a *addr) +static __always_inline bool memory_is_poisoned_1(s8 __arena *addr) { - s8 shadow_value = *(s8a *)mem_to_shadow(addr); + s8 shadow_value = *(s8 __arena *)mem_to_shadow(addr); /* Byte is 0, access is valid. */ if (likely(!shadow_value)) @@ -139,7 +139,7 @@ static __always_inline bool memory_is_poisoned_1(s8a *addr) } /* Validate a 2- 4-, 8-byte access, shadow spans up to 2 bytes. */ -static __always_inline bool memory_is_poisoned_2_4_8(s8a *addr, u64 size) +static __always_inline bool memory_is_poisoned_2_4_8(s8 __arena *addr, u64 size) { u64 end = (u64)addr + size - 1; @@ -148,17 +148,17 @@ static __always_inline bool memory_is_poisoned_2_4_8(s8a *addr, u64 size) * overflow above ASAN_GRANULE). */ if (likely(ASAN_GRANULE(end) >= size - 1)) - return memory_is_poisoned_1((s8a *)end); + return memory_is_poisoned_1((s8 __arena *)end); /* * Otherwise first byte must be fully unpoisoned, and second byte * must be unpoisoned up to the end of the accessed region. 
*/ - return *(s8a *)mem_to_shadow(addr) || memory_is_poisoned_1((s8a *)end); + return *(s8 __arena *)mem_to_shadow(addr) || memory_is_poisoned_1((s8 __arena *)end); } -__weak bool asan_shadow_set(void __arena __arg_arena *addr) +__weak bool asan_shadow_set(void __arena *addr) { return memory_is_poisoned_1(addr); } @@ -166,7 +166,7 @@ __weak bool asan_shadow_set(void __arena __arg_arena *addr) static __always_inline u64 first_nonzero_byte(u64 addr, size_t size) { while (size && can_loop) { - if (unlikely(*(s8a *)addr)) + if (unlikely(*(s8 __arena *)addr)) return addr; addr += 1; size -= 1; @@ -175,7 +175,7 @@ static __always_inline u64 first_nonzero_byte(u64 addr, size_t size) return SHADOW_ALL_ZEROES; } -static __always_inline bool memory_is_poisoned_n(s8a *addr, u64 size) +static __always_inline bool memory_is_poisoned_n(s8 __arena *addr, u64 size) { u64 ret; u64 start; @@ -189,10 +189,10 @@ static __always_inline bool memory_is_poisoned_n(s8a *addr, u64 size) if (likely(ret == SHADOW_ALL_ZEROES)) return false; - return unlikely(ret != end || ASAN_GRANULE(addr + size - 1) >= *(s8a *)end); + return unlikely(ret != end || ASAN_GRANULE(addr + size - 1) >= *(s8 __arena *)end); } -__weak int asan_report(s8a __arg_arena *addr, size_t sz, u32 flags) +__weak int asan_report(s8 __arena *addr, size_t sz, u32 flags) { u32 reported = __sync_val_compare_and_swap(&asan_reported, false, true); @@ -211,7 +211,7 @@ __weak int asan_report(s8a __arg_arena *addr, size_t sz, u32 flags) return 0; } -static __always_inline bool check_asan_args(s8a *addr, size_t size, +static __always_inline bool check_asan_args(s8 __arena *addr, size_t size, bool *result) { bool valid = true; @@ -253,7 +253,7 @@ confirmed_valid: static __always_inline bool check_region_inline(intptr_t ptr, size_t size, u32 flags) { - s8a *addr = (s8a *)(u64)ptr; + s8 __arena *addr = (s8 __arena *)(u64)ptr; bool is_poisoned, is_valid; if (check_asan_args(addr, size, &is_valid)) { @@ -305,19 +305,19 @@ static 
__always_inline bool check_region_inline(intptr_t ptr, size_t size, } \ __hidden void __asan_report_store##size(intptr_t addr) \ { \ - asan_report((s8a *)addr, size, ASAN_WRITE); \ + asan_report((s8 __arena *)addr, size, ASAN_WRITE); \ } \ __hidden void __asan_report_store##size##_noabort(intptr_t addr) \ { \ - asan_report((s8a *)addr, size, ASAN_WRITE); \ + asan_report((s8 __arena *)addr, size, ASAN_WRITE); \ } \ __hidden void __asan_report_load##size(intptr_t addr) \ { \ - asan_report((s8a *)addr, size, ASAN_READ); \ + asan_report((s8 __arena *)addr, size, ASAN_READ); \ } \ __hidden void __asan_report_load##size##_noabort(intptr_t addr) \ { \ - asan_report((s8a *)addr, size, ASAN_READ); \ + asan_report((s8 __arena *)addr, size, ASAN_READ); \ } DEFINE_ASAN_LOAD_STORE(1); @@ -385,7 +385,7 @@ void *__asan_memset(void *p, int c, size_t n) */ __hidden __noasan int asan_poison(void __arena *addr, s8 val, size_t size) { - s8a *shadow; + s8 __arena *shadow; size_t len; /* @@ -443,7 +443,7 @@ __hidden __noasan int asan_poison(void __arena *addr, s8 val, size_t size) __hidden __noasan int asan_unpoison(void __arena *addr, size_t size) { size_t partial = size & ASAN_GRANULE_MASK; - s8a *shadow; + s8 __arena *shadow; size_t len; /* diff --git a/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c b/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c index 865e00803daa..f4ed4c3abb4b 100644 --- a/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c +++ b/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c @@ -45,12 +45,12 @@ enum { BUDDY_CHUNK_PAGES = BUDDY_CHUNK_BYTES / __PAGE_SIZE }; -static inline int buddy_lock(buddy_t *buddy) +static inline int buddy_lock(struct buddy __arena *buddy) { return arena_spin_lock(&buddy->lock); } -static inline void buddy_unlock(buddy_t *buddy) +static inline void buddy_unlock(struct buddy __arena *buddy) { arena_spin_unlock(&buddy->lock); } @@ -61,7 +61,7 @@ static inline void buddy_unlock(buddy_t *buddy) * page alloc kfuncs do not support 
aligning to a boundary (in this * case 1 MiB, see buddy.h on how this is derived). */ -static int buddy_reserve_arena_vaddr(buddy_t *buddy) +static int buddy_reserve_arena_vaddr(struct buddy __arena *buddy) { buddy->vaddr = 0; @@ -73,7 +73,7 @@ static int buddy_reserve_arena_vaddr(buddy_t *buddy) /* * Free up any unused address space. Used only during teardown. */ -static void buddy_unreserve_arena_vaddr(buddy_t *buddy) +static void buddy_unreserve_arena_vaddr(struct buddy __arena *buddy) { bpf_arena_free_pages( &arena, (void __arena *)(BUDDY_VADDR_OFFSET + buddy->vaddr), @@ -94,7 +94,7 @@ static void buddy_unreserve_arena_vaddr(buddy_t *buddy) * However, bump allocation must still be atomic because this function * is called without the buddy lock from multiple threads concurrently. */ -__weak int buddy_alloc_arena_vaddr(buddy_t __arg_arena *buddy, u64 *vaddrp) +__weak int buddy_alloc_arena_vaddr(struct buddy __arena *buddy, u64 *vaddrp) { u64 vaddr, old, new; @@ -134,7 +134,7 @@ static u64 arena_next_pow2(__u64 n) } __weak -int idx_set_allocated(buddy_chunk_t __arg_arena *chunk, u64 idx, bool allocated) +int idx_set_allocated(struct buddy_chunk __arena *chunk, u64 idx, bool allocated) { bool already_allocated; @@ -160,7 +160,7 @@ int idx_set_allocated(buddy_chunk_t __arg_arena *chunk, u64 idx, bool allocated) return 0; } -static int idx_is_allocated(buddy_chunk_t *chunk, u64 idx, bool *allocated) +static int idx_is_allocated(struct buddy_chunk __arena *chunk, u64 idx, bool *allocated) { if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) { arena_stderr("getting state of invalid idx (%llu, max %d)\n", idx, @@ -173,7 +173,7 @@ static int idx_is_allocated(buddy_chunk_t *chunk, u64 idx, bool *allocated) } __weak -int idx_set_order(buddy_chunk_t __arg_arena *chunk, u64 idx, u8 order) +int idx_set_order(struct buddy_chunk __arena *chunk, u64 idx, u8 order) { u8 prev_order; @@ -206,7 +206,7 @@ int idx_set_order(buddy_chunk_t __arg_arena *chunk, u64 idx, u8 order) return 0; } -static 
u8 idx_get_order(buddy_chunk_t *chunk, u64 idx) +static u8 idx_get_order(struct buddy_chunk __arena *chunk, u64 idx) { u8 result; @@ -223,7 +223,7 @@ static u8 idx_get_order(buddy_chunk_t *chunk, u64 idx) return (idx & 0x1) ? (result & 0xf) : (result >> 4); } -static void __arena *idx_to_addr(buddy_chunk_t *chunk, size_t idx) +static void __arena *idx_to_addr(struct buddy_chunk __arena *chunk, size_t idx) { u64 address; @@ -246,7 +246,7 @@ static void __arena *idx_to_addr(buddy_chunk_t *chunk, size_t idx) return (void __arena *)address; } -static buddy_header_t *idx_to_header(buddy_chunk_t *chunk, size_t idx) +static struct buddy_header __arena *idx_to_header(struct buddy_chunk __arena *chunk, size_t idx) { bool allocated; u64 address; @@ -283,13 +283,13 @@ static buddy_header_t *idx_to_header(buddy_chunk_t *chunk, size_t idx) * less probable. */ - return (buddy_header_t *)(address + BUDDY_HEADER_OFF); + return (struct buddy_header __arena *)(address + BUDDY_HEADER_OFF); } -static void header_add_freelist(buddy_chunk_t *chunk, buddy_header_t *header, +static void header_add_freelist(struct buddy_chunk __arena *chunk, struct buddy_header __arena *header, u64 idx, u8 order) { - buddy_header_t *tmp_header; + struct buddy_header __arena *tmp_header; idx_set_order(chunk, idx, order); @@ -304,10 +304,10 @@ static void header_add_freelist(buddy_chunk_t *chunk, buddy_header_t *header, chunk->freelists[order] = idx; } -static void header_remove_freelist(buddy_chunk_t *chunk, - buddy_header_t *header, u8 order) +static void header_remove_freelist(struct buddy_chunk __arena *chunk, + struct buddy_header __arena *header, u8 order) { - buddy_header_t *tmp_header; + struct buddy_header __arena *tmp_header; if (header->prev_index != BUDDY_CHUNK_ITEMS) { tmp_header = idx_to_header(chunk, header->prev_index); @@ -356,10 +356,10 @@ static u64 size_to_order(size_t size) } __weak -int add_leftovers_to_freelist(buddy_chunk_t __arg_arena *chunk, u32 cur_idx, +int 
add_leftovers_to_freelist(struct buddy_chunk __arena *chunk, u32 cur_idx, u64 min_order, u64 max_order) { - buddy_header_t *header; + struct buddy_header __arena *header; u64 ord; u32 idx; @@ -381,10 +381,10 @@ int add_leftovers_to_freelist(buddy_chunk_t __arg_arena *chunk, u32 cur_idx, return 0; } -static buddy_chunk_t *buddy_chunk_get(buddy_t *buddy) +static struct buddy_chunk __arena *buddy_chunk_get(struct buddy __arena *buddy) { u64 order, ord, min_order, max_order; - buddy_chunk_t *chunk; + struct buddy_chunk __arena *chunk; size_t left; int power2; u64 vaddr; @@ -561,9 +561,9 @@ static buddy_chunk_t *buddy_chunk_get(buddy_t *buddy) return chunk; } -__weak int buddy_init(buddy_t __arg_arena *buddy) +__weak int buddy_init(struct buddy __arena *buddy) { - buddy_chunk_t *chunk; + struct buddy_chunk __arena *chunk; int ret; if (!asan_ready()) @@ -602,9 +602,9 @@ __weak int buddy_init(buddy_t __arg_arena *buddy) * We do not take a lock because we are freeing arena pages, and nobody should * be using the allocator at that point in the execution. */ -__weak int buddy_destroy(buddy_t __arg_arena *buddy) +__weak int buddy_destroy(struct buddy __arena *buddy) { - buddy_chunk_t *chunk, *next; + struct buddy_chunk __arena *chunk, *next; if (!buddy) return -EINVAL; @@ -631,9 +631,9 @@ __weak int buddy_destroy(buddy_t __arg_arena *buddy) return 0; } -__weak u64 buddy_chunk_alloc(buddy_chunk_t __arg_arena *chunk, int order_req) +__weak u64 buddy_chunk_alloc(struct buddy_chunk __arena *chunk, int order_req) { - buddy_header_t *header, *tmp_header, *next_header; + struct buddy_header __arena *header, *tmp_header, *next_header; u32 idx, tmpidx, retidx; u64 address; u64 order = 0; @@ -709,9 +709,9 @@ __weak u64 buddy_chunk_alloc(buddy_chunk_t __arg_arena *chunk, int order_req) } /* Scan the existing chunks for available memory. 
*/ -static u64 buddy_alloc_from_existing_chunks(buddy_t *buddy, int order) +static u64 buddy_alloc_from_existing_chunks(struct buddy __arena *buddy, int order) { - buddy_chunk_t *chunk; + struct buddy_chunk __arena *chunk; u64 address; for (chunk = buddy->first_chunk; chunk != NULL && can_loop; @@ -728,7 +728,7 @@ static u64 buddy_alloc_from_existing_chunks(buddy_t *buddy, int order) * Try an allocation from a newly allocated chunk. Also * incorporate the chunk into the linked list. */ -static u64 buddy_alloc_from_new_chunk(buddy_t *buddy, buddy_chunk_t *chunk, int order) +static u64 buddy_alloc_from_new_chunk(struct buddy __arena *buddy, struct buddy_chunk __arena *chunk, int order) { u64 address; @@ -750,10 +750,10 @@ static u64 buddy_alloc_from_new_chunk(buddy_t *buddy, buddy_chunk_t *chunk, int return (u64)address; } __weak -u64 buddy_alloc_internal(buddy_t __arg_arena *buddy, size_t size) +u64 buddy_alloc_internal(struct buddy __arena *buddy, size_t size) { - buddy_chunk_t *chunk; u64 address = (u64)NULL; + struct buddy_chunk __arena *chunk; int order; if (!buddy) @@ -788,20 +788,20 @@ done: * data is smaller than the header, we must poison any * unused bytes that were part of the header. 
*/ - if (size < BUDDY_HEADER_OFF + sizeof(buddy_header_t)) - asan_poison((u8 __arena *)address + BUDDY_HEADER_OFF, - BUDDY_POISONED, sizeof(buddy_header_t)); + if (size < BUDDY_HEADER_OFF + sizeof(struct buddy_header __arena)) + asan_poison((u8 __arena *)address + BUDDY_HEADER_OFF, BUDDY_POISONED, + sizeof(struct buddy_header __arena)); asan_unpoison((u8 __arena *)address, size); return address; } -static __always_inline int buddy_free_unlocked(buddy_t *buddy, u64 addr) +static __always_inline int buddy_free_unlocked(struct buddy __arena *buddy, u64 addr) { - buddy_header_t *header, *buddy_header; + struct buddy_header __arena *header, *buddy_header; u64 idx, buddy_idx, tmp_idx; - buddy_chunk_t *chunk; + struct buddy_chunk __arena *chunk; bool allocated; u8 order; int ret; @@ -878,7 +878,7 @@ static __always_inline int buddy_free_unlocked(buddy_t *buddy, u64 addr) return 0; } -__weak int buddy_free_internal(buddy_t __arg_arena *buddy, u64 addr) +__weak int buddy_free(struct buddy __arena *buddy, void __arena *addr) { int ret; @@ -893,7 +893,7 @@ __weak int buddy_free_internal(buddy_t __arg_arena *buddy, u64 addr) if (ret) return ret; - ret = buddy_free_unlocked(buddy, addr); + ret = buddy_free_unlocked(buddy, (u64)addr); buddy_unlock(buddy); diff --git a/tools/testing/selftests/bpf/libarena/src/common.bpf.c b/tools/testing/selftests/bpf/libarena/src/common.bpf.c index 544bf9e1cb38..ec9de29e6f3e 100644 --- a/tools/testing/selftests/bpf/libarena/src/common.bpf.c +++ b/tools/testing/selftests/bpf/libarena/src/common.bpf.c @@ -6,7 +6,7 @@ const volatile u32 zero = 0; -buddy_t buddy; +struct buddy __arena buddy; int arena_fls(__u64 word) { @@ -43,9 +43,9 @@ __weak u64 arena_malloc_internal(size_t size) return buddy_alloc_internal(&buddy, size); } -__weak void arena_free(void __arg_arena __arena *ptr) +__weak void arena_free(void __arena *ptr) { - buddy_free_internal(&buddy, (u64)ptr); + buddy_free(&buddy, ptr); } diff --git 
a/tools/testing/selftests/bpf/progs/arena_spin_lock.c b/tools/testing/selftests/bpf/progs/arena_spin_lock.c index 7236d92d382f..cf7cda79c16c 100644 --- a/tools/testing/selftests/bpf/progs/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/progs/arena_spin_lock.c @@ -4,6 +4,7 @@ #include #include #include "bpf_misc.h" +#include #include struct { -- cgit v1.2.3 From 367e6e4a8173d47b4c57181cdd9dcbfc291755f0 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Mon, 1 Jun 2026 20:41:19 -0400 Subject: selftests/bpf: libarena: Directly return arena pointers from functions Now that the __arena annotation includes a BTF type tag, and the verifier can identify arena pointers at BTF loading time, return arena pointers as their true type instead of casting to u64. Remove the preprocessor typecast wrappers used to hide this from the caller. Acked-by: Eduard Zingerman Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260602004120.17087-6-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/libarena/include/libarena/buddy.h | 3 +-- .../selftests/bpf/libarena/include/libarena/common.h | 3 +-- tools/testing/selftests/bpf/libarena/src/buddy.bpf.c | 20 ++++++++++---------- .../testing/selftests/bpf/libarena/src/common.bpf.c | 4 ++-- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h b/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h index 4d57fc1b5c26..528c69a1f38e 100644 --- a/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h +++ b/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h @@ -76,7 +76,6 @@ struct buddy { int buddy_init(struct buddy __arena *buddy); int buddy_destroy(struct buddy __arena *buddy); int buddy_free(struct buddy __arena *buddy, void __arena *free); -u64 buddy_alloc_internal(struct buddy __arena *buddy, size_t size); -#define buddy_alloc(alloc, size) ((void __arena *)buddy_alloc_internal((alloc), (size))) +void 
__arena *buddy_alloc(struct buddy __arena *buddy, size_t size); #endif /* __BPF__ */ diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/common.h b/tools/testing/selftests/bpf/libarena/include/libarena/common.h index ca1a6c1d6477..a3eb1641ac36 100644 --- a/tools/testing/selftests/bpf/libarena/include/libarena/common.h +++ b/tools/testing/selftests/bpf/libarena/include/libarena/common.h @@ -48,8 +48,7 @@ extern volatile u64 asan_violated; int arena_fls(__u64 word); -u64 arena_malloc_internal(size_t size); -#define arena_malloc(size) ((void __arena *)arena_malloc_internal((size))) +void __arena *arena_malloc(size_t size); void arena_free(void __arena *ptr); /* diff --git a/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c b/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c index f4ed4c3abb4b..c674ee5cfcc1 100644 --- a/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c +++ b/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c @@ -750,25 +750,25 @@ static u64 buddy_alloc_from_new_chunk(struct buddy __arena *buddy, struct buddy_ return (u64)address; } __weak -u64 buddy_alloc_internal(struct buddy __arena *buddy, size_t size) +void __arena *buddy_alloc(struct buddy __arena *buddy, size_t size) { - u64 address = (u64)NULL; + void __arena *address = NULL; struct buddy_chunk __arena *chunk; int order; if (!buddy) - return (u64)NULL; + return NULL; order = size_to_order(size); if (order >= BUDDY_CHUNK_NUM_ORDERS || order < 0) { arena_stderr("invalid order %d (sz %lu)\n", order, size); - return (u64)NULL; + return NULL; } if (buddy_lock(buddy)) - return (u64)NULL; + return NULL; - address = buddy_alloc_from_existing_chunks(buddy, order); + address = (u8 __arena *)buddy_alloc_from_existing_chunks(buddy, order); buddy_unlock(buddy); if (address) goto done; @@ -776,12 +776,12 @@ u64 buddy_alloc_internal(struct buddy __arena *buddy, size_t size) /* Get a new chunk. 
*/ chunk = buddy_chunk_get(buddy); if (chunk) - address = buddy_alloc_from_new_chunk(buddy, chunk, order); + address = (u8 __arena *)buddy_alloc_from_new_chunk(buddy, chunk, order); done: /* If we failed to allocate memory, return NULL. */ if (!address) - return (u64)NULL; + return NULL; /* * Unpoison exactly the amount of bytes requested. If the @@ -789,10 +789,10 @@ done: * unused bytes that were part of the header. */ if (size < BUDDY_HEADER_OFF + sizeof(struct buddy_header __arena)) - asan_poison((u8 __arena *)address + BUDDY_HEADER_OFF, BUDDY_POISONED, + asan_poison(address + BUDDY_HEADER_OFF, BUDDY_POISONED, sizeof(struct buddy_header __arena)); - asan_unpoison((u8 __arena *)address, size); + asan_unpoison(address, size); return address; } diff --git a/tools/testing/selftests/bpf/libarena/src/common.bpf.c b/tools/testing/selftests/bpf/libarena/src/common.bpf.c index ec9de29e6f3e..50be57213dfb 100644 --- a/tools/testing/selftests/bpf/libarena/src/common.bpf.c +++ b/tools/testing/selftests/bpf/libarena/src/common.bpf.c @@ -38,9 +38,9 @@ __weak int arena_buddy_reset(void) return buddy_init(&buddy); } -__weak u64 arena_malloc_internal(size_t size) +__weak void __arena *arena_malloc(size_t size) { - return buddy_alloc_internal(&buddy, size); + return buddy_alloc(&buddy, size); } __weak void arena_free(void __arena *ptr) -- cgit v1.2.3 From 9fd5bf96ac4be2ec784598c818f672422182042c Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Mon, 1 Jun 2026 20:41:20 -0400 Subject: selftests/bpf: Add tests for the new type-tag based __arena identifier Add selftests that combine the new type-based __arena identifier with the volatile qualifier both in functions' arguments and return values. This way we test both that they are recognized as arena arguments and that they are not sensitive to the position they are placed in the type compared to other qualifiers. 
Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260602004120.17087-7-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_arena.c | 67 ++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 89d72c8d756a..df0e22d1a29b 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -607,4 +607,71 @@ int non_arena_ptr_add_to_arena_ptr(void *ctx) #endif +static __noinline +u32 __arena *check_arena_arg_nonglobal(u32 __arena *arg) +{ + volatile u32 val = *arg; + + *arg = val + 1; + + return arg; +} + +__weak +u32 __arena *check_arena_arg_global(u32 __arena *arg) +{ + volatile u32 val = *arg; + + *arg = val + 1; + + return arg; +} + +__weak +u32 volatile __arena *check_arena_arg_quals1(u32 volatile __arena *arg1, u32 __arena volatile *arg2) +{ + *arg1 = *arg1 + 1; + *arg2 = *arg1 + 1; + + return arg2; +} + +__weak +u32 __arena volatile *check_arena_arg_quals2(u32 volatile __arena *arg1, u32 __arena volatile *arg2) +{ + *arg1 = *arg1 + 1; + *arg2 = *arg2 + 1; + + return arg2; +} + +SEC("syscall") +__success __retval(0) +int check_arena_arg_ret(void *ctx) +{ + u32 __arena *page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + u32 __arena *arg = page; + u32 __arena volatile *arg1; + u32 __arena volatile *ret1; + u32 volatile __arena *arg2; + u32 volatile __arena *ret2; + + if (!arg) + return 1; + + /* Make sure we use {arg, ret}{1, 2}. 
*/ + + arg = check_arena_arg_nonglobal(page); + arg = check_arena_arg_global(arg); + + arg1 = arg2 = page; + ret1 = check_arena_arg_quals1(arg1, arg2); + ret2 = check_arena_arg_quals2(arg1, arg2); + + if (!(*ret1 ||*ret2)) + return -EINVAL; + + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b93c55b4932dd7e32dca8cf34a3443cc87a02906 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Tue, 2 Jun 2026 08:22:49 +0530 Subject: bpf: fix UAF by restoring RCU-delayed inode freeing in bpffs commit 4f375ade6aa9 ("bpf: Avoid RCU context warning when unpinning htab with internal structs") moved inode cleanup from ->free_inode() into ->destroy_inode() to avoid sleeping in RCU context when calling bpf_any_put(). However this removed the RCU delay on freeing the inode itself and the cached symlink body (i_link), both of which can be accessed by RCU pathwalk (pick_link, may_lookup etc.). This causes a use-after-free when a concurrent unlinkat() drops the last inode reference and destroy_inode() frees the inode immediately, while another task is still walking the path in RCU mode and reads inode->i_opflags (offset +2) inside current_time() -> is_mgtime(). KASAN reports: BUG: KASAN: slab-use-after-free in is_mgtime include/linux/fs.h:2313 Read of size 2 at addr ffff8880407e4282 (offset +2 = i_opflags) The rules (per Al Viro): ->destroy_inode() called immediately, can sleep, use for blocking cleanup e.g. bpf_any_put() ->free_inode() called after RCU grace period, use for freeing inode and anything RCU-accessible e.g. 
i_link Fix: split the two concerns properly: - keep bpf_any_put() in bpf_destroy_inode() since it is blocking and needs to run promptly - introduce bpf_free_inode() to handle kfree(i_link) and free_inode_nonrcu() with proper RCU delay, preventing the UAF Fixes: 4f375ade6aa9 ("bpf: Avoid RCU context warning when unpinning htab with internal structs") Reported-by: syzbot+36e50496c8ac4bcde3f9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=36e50496c8ac4bcde3f9 Suggested-by: Al Viro Link: https://lore.kernel.org/all/20260423043906.GN3518998@ZenIV/ Link: https://lore.kernel.org/all/20260602002607.110866-1-kartikey406@gmail.com/T/ [v1] Signed-off-by: Deepanshu Kartikey Acked-by: Al Viro Link: https://lore.kernel.org/r/20260602025249.113828-1-kartikey406@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 25c06a011825..188c774a469c 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -766,10 +766,18 @@ static void bpf_destroy_inode(struct inode *inode) { enum bpf_type type; - if (S_ISLNK(inode->i_mode)) - kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); +} + +/* + * Called after RCU grace period - safe to free inode and anything + * that might be accessed by RCU pathwalk (inode fields, i_link). 
+ */ +static void bpf_free_inode(struct inode *inode) +{ + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); free_inode_nonrcu(inode); } @@ -778,6 +786,7 @@ const struct super_operations bpf_super_ops = { .drop_inode = inode_just_drop, .show_options = bpf_show_options, .destroy_inode = bpf_destroy_inode, + .free_inode = bpf_free_inode, }; enum { -- cgit v1.2.3 From 3c56ee343f9412d81918635c3e25e22a5dd6d87e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2026 15:30:49 +0200 Subject: bpf: Reject exclusive maps for bpf_map_elem iterators Exclusive maps (aka excl_prog_hash) are meant to be reachable only from the single program whose hash matches. This is enforced by check_map_prog_compatibility() when the map is referenced from a program such as signed BPF loaders. A bpf_map_elem iterator, however, binds its target map at attach time in bpf_iter_attach_map() instead of referencing it from the program, so the exclusivity check is never reached. On top of that, the iterator exposes the map value as a writable buffer. 
Fixes: baefdbdf6812 ("bpf: Implement exclusive map creation") Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260602133052.423725-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_iter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 261a03ea73d3..ae0741a09c6d 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -112,6 +112,10 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); + if (map->excl_prog_sha) { + err = -EPERM; + goto put_map; + } if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || -- cgit v1.2.3 From 7fef1796ec4d8c4cce70c374efafdbbc8d6d6cbc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2026 15:30:50 +0200 Subject: libbpf: Guard add_data() against size overflow add_data() computes size8 = roundup(size, 8) and then hands size8 to realloc_data_buf() before doing memcpy(gen->data_cur, data, size) with the original size. A wrapped size8 passes through the realloc_data_buf() INT32_MAX check. Harden this against overflow, though not realistic to happen in practice. 
Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260602133052.423725-3-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/gen_loader.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 66e13566bc31..d79695f01c87 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -160,10 +160,16 @@ void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps static int add_data(struct bpf_gen *gen, const void *data, __u32 size) { - __u32 size8 = roundup(size, 8); __u64 zero = 0; + __u32 size8; void *prev; + if (size > INT32_MAX) { + gen->error = -ERANGE; + return 0; + } + size8 = roundup(size, 8); + if (realloc_data_buf(gen, size8)) return 0; prev = gen->data_cur; -- cgit v1.2.3 From 082c412097716b93ff1365689fc4ddcd1ce8296f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2026 15:30:51 +0200 Subject: selftests/bpf: Keep verifier_map_ptr exercising ops pointer access sashiko complained that 38498c0ebacd ("selftests/bpf: Adjust verifier_map_ptr for the map's excl field") would slightly decrease the test coverage given before the test was against the verifier rejecting the ops pointer. Recover the old test with the right offsets and add the existing one as an additional test case. 
# LDLIBS=-static PKG_CONFIG='pkg-config --static' ./vmtest.sh -- ./test_progs -t verifier_map_ptr [ 1.672932] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel #637/1 verifier_map_ptr/bpf_map_ptr: read with negative offset rejected:OK #637/2 verifier_map_ptr/bpf_map_ptr: read with negative offset rejected @unpriv:OK #637/3 verifier_map_ptr/bpf_map_ptr: write rejected:OK #637/4 verifier_map_ptr/bpf_map_ptr: write rejected @unpriv:OK #637/5 verifier_map_ptr/bpf_map_ptr: read non-existent field rejected:OK #637/6 verifier_map_ptr/bpf_map_ptr: read non-existent field rejected @unpriv:OK #637/7 verifier_map_ptr/bpf_map_ptr: read beyond excl field rejected:OK #637/8 verifier_map_ptr/bpf_map_ptr: read beyond excl field rejected @unpriv:OK #637/9 verifier_map_ptr/bpf_map_ptr: read ops field accepted:OK #637/10 verifier_map_ptr/bpf_map_ptr: read ops field accepted @unpriv:OK #637/11 verifier_map_ptr/bpf_map_ptr: r = 0, map_ptr = map_ptr + r:OK #637/12 verifier_map_ptr/bpf_map_ptr: r = 0, map_ptr = map_ptr + r @unpriv:OK #637/13 verifier_map_ptr/bpf_map_ptr: r = 0, r = r + map_ptr:OK #637/14 verifier_map_ptr/bpf_map_ptr: r = 0, r = r + map_ptr @unpriv:OK #637 verifier_map_ptr:OK [...] 
Summary: 2/20 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260602133052.423725-4-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_map_ptr.c | 34 +++++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c index d8e822d1a8ba..166193659870 100644 --- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c @@ -72,17 +72,43 @@ __naked void bpf_map_ptr_write_rejected(void) /* * struct bpf_map starts with the SHA256 hash sha[32] at offset 0 (a readable - * byte array), followed by the u32 excl field at offset 32. Reading a u32 at - * offset 33 runs past the end of excl and is rejected. + * byte array), the u32 excl field at offset 32, and the ops pointer at offset + * 40. Reading a u32 at offset 41 reaches into the middle of the ops pointer, + * i.e. a partial pointer access, which is rejected. */ SEC("socket") __description("bpf_map_ptr: read non-existent field rejected") __failure -__msg("access beyond the end of member excl (mend:36) in struct bpf_map with off 33 size 4") +__msg("cannot access ptr member ops with moff 40 in struct bpf_map with off 41 size 4") __failure_unpriv __msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN") __flag(BPF_F_ANY_ALIGNMENT) __naked void read_non_existent_field_rejected(void) +{ + asm volatile (" \ + r6 = 0; \ + r1 = %[map_array_48b] ll; \ + r6 = *(u32*)(r1 + 41); \ + r0 = 1; \ + exit; \ +" : + : __imm_addr(map_array_48b) + : __clobber_all); +} + +/* + * The u32 excl field spans offsets 32..35 (mend 36). Reading a u32 at offset + * 33 starts inside excl but extends past its end, which the verifier rejects + * as an out-of-bounds scalar access. 
+ */ +SEC("socket") +__description("bpf_map_ptr: read beyond excl field rejected") +__failure +__msg("access beyond the end of member excl (mend:36) in struct bpf_map with off 33 size 4") +__failure_unpriv +__msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN") +__flag(BPF_F_ANY_ALIGNMENT) +__naked void read_beyond_excl_field_rejected(void) { asm volatile (" \ r6 = 0; \ @@ -105,7 +131,7 @@ __naked void ptr_read_ops_field_accepted(void) asm volatile (" \ r6 = 0; \ r1 = %[map_array_48b] ll; \ - r6 = *(u64*)(r1 + 0); \ + r6 = *(u64*)(r1 + 40); \ r0 = 1; \ exit; \ " : -- cgit v1.2.3 From 8dedd34122d0950c6b69785db0fa740fdbbf5b2c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2026 15:30:52 +0200 Subject: selftests/bpf: Test that exclusive maps are rejected as iter targets Add a subtest to map_excl that creates an exclusive map and verifies a bpf_map_elem iterator cannot be attached to it, which would otherwise let an unrelated program read and overwrite the map's contents through the iterator's writable value buffer. # LDLIBS=-static PKG_CONFIG='pkg-config --static' ./vmtest.sh -- ./test_progs -t map_excl [...] ./test_progs -t map_excl [ 1.704382] bpf_testmod: loading out-of-tree module taints kernel. 
[ 1.706068] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel #215/1 map_excl/map_excl_allowed:OK #215/2 map_excl/map_excl_denied:OK #215/3 map_excl/map_excl_no_map_in_map:OK #215/4 map_excl/map_excl_no_map_iter:OK #215 map_excl:OK Summary: 1/4 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260602133052.423725-5-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/map_excl.c | 39 +++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/map_excl.c b/tools/testing/selftests/bpf/prog_tests/map_excl.c index a213dd559aae..3088668e2e45 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_excl.c +++ b/tools/testing/selftests/bpf/prog_tests/map_excl.c @@ -7,6 +7,7 @@ #include #include "map_excl.skel.h" +#include "bpf_iter_bpf_array_map.skel.h" #ifndef SHA256_DIGEST_SIZE #define SHA256_DIGEST_SIZE 32 @@ -89,6 +90,42 @@ out: close(excl_fd); } +static void test_map_excl_no_map_iter(void) +{ + __u8 hash[SHA256_DIGEST_SIZE] = {}; + LIBBPF_OPTS(bpf_map_create_opts, excl_opts, + .excl_prog_hash = hash, + .excl_prog_hash_size = sizeof(hash)); + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + struct bpf_iter_bpf_array_map *skel = NULL; + union bpf_iter_link_info linfo; + struct bpf_link *link; + int excl_fd; + + excl_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl_iter", 4, 8, 3, &excl_opts); + if (!ASSERT_OK_FD(excl_fd, "create exclusive map")) + return; + + skel = bpf_iter_bpf_array_map__open_and_load(); + if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_array_map__open_and_load")) + goto out; + + memset(&linfo, 0, sizeof(linfo)); + linfo.map.map_fd = excl_fd; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.dump_bpf_array_map, &opts); + if (!ASSERT_ERR_PTR(link, "reject exclusive map as iter target")) { + bpf_link__destroy(link); + 
goto out; + } + ASSERT_EQ(libbpf_get_error(link), -EPERM, "iter attach errno"); +out: + bpf_iter_bpf_array_map__destroy(skel); + close(excl_fd); +} + void test_map_excl(void) { if (test__start_subtest("map_excl_allowed")) @@ -97,4 +134,6 @@ void test_map_excl(void) test_map_excl_denied(); if (test__start_subtest("map_excl_no_map_in_map")) test_map_excl_no_map_in_map(); + if (test__start_subtest("map_excl_no_map_iter")) + test_map_excl_no_map_iter(); } -- cgit v1.2.3 From c169a2a5fd9cfdb2ae93cf6d86be4d2a5e3d813c Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Tue, 2 Jun 2026 10:52:04 -0700 Subject: bpf: Silence unused-but-set-variable warning in bpf_for_each_reg_in_vstate_mask The macro requires callers to pass a stack variable, but not all callbacks use it. Add (void)__stack to suppress the clang W=1 warning. Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260602175204.624401-1-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3dd2d21230af..c248ff41f42a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -582,6 +582,7 @@ bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) (void)(__expr); \ } \ } \ + (void)__stack; \ }) /* Invoke __expr over regsiters in __vst, setting __state and __reg */ -- cgit v1.2.3 From 8a7f2bff2165e53595d1e91c160b340f978c0ab7 Mon Sep 17 00:00:00 2001 From: Woojin Ji Date: Wed, 3 Jun 2026 09:33:39 +0900 Subject: bpftool: Use libbpf error code for flow dissector query bpf_prog_query() returns a negative errno on failure. query_flow_dissector() currently closes the namespace fd and then reads errno to decide whether -EINVAL means that the running kernel does not support flow dissector queries. 
That errno check controls behavior, not just diagnostics: -EINVAL is handled as a non-fatal old-kernel case, while any other error makes bpftool net fail. The namespace fd is opened read-only, so close() is not expected to commonly fail in normal use. Still, the BPF_PROG_QUERY error is already available in err, and reading errno after an intervening close() is fragile. If close() does change errno, the compatibility branch may be based on close()'s error instead of the BPF_PROG_QUERY result. This was reproduced with an LD_PRELOAD fault injector that forced BPF_PROG_QUERY for BPF_FLOW_DISSECTOR to fail with EINVAL and then forced close() on the netns fd to fail with EIO. The unpatched bpftool reported "can't query prog: Input/output error". With this change, the same injected failure is handled as the intended non-fatal EINVAL compatibility case. Use the libbpf-returned error code instead. Keep the existing errno reset in the non-fatal path to preserve batch mode behavior. The success path is unchanged. Fixes: 7f0c57fec80f ("bpftool: show flow_dissector attachment status") Signed-off-by: Woojin Ji Signed-off-by: Andrii Nakryiko Acked-by: Leon Hwang Acked-by: Yonghong Song Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20260603003339.33791-1-random6.xyz@gmail.com Assisted-by: ChatGPT:gpt-5.5 --- tools/bpf/bpftool/net.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 974189da8a91..dba28755d284 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -603,14 +603,14 @@ static int query_flow_dissector(struct bpf_attach_info *attach_info) &attach_flags, prog_ids, &prog_cnt); close(fd); if (err) { - if (errno == EINVAL) { + if (err == -EINVAL) { /* Older kernel's don't support querying * flow dissector programs. 
*/ errno = 0; return 0; } - p_err("can't query prog: %s", strerror(errno)); + p_err("can't query prog: %s", strerror(-err)); return -1; } -- cgit v1.2.3 From e87d898bc766a6dc3cec63478b4cdf4e6286aff1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 3 Jun 2026 23:16:57 +0200 Subject: selftests/bpf: Cover exclusive map create-time validation map_excl exercises exclusive-map binding (allowed/denied), map-in-map and map iterator rejection. It does not cover the create-time validation of excl_prog_hash: the kernel only accepts a SHA-256-sized hash and requires the pointer and size to be consistent. Add map_excl_create_validation to check the rejected combinations: # LDLIBS=-static PKG_CONFIG='pkg-config --static' ./vmtest.sh -- ./test_progs -t map_excl [...] [ 1.780305] clocksource: Switched to clocksource tsc #215/1 map_excl/map_excl_allowed:OK #215/2 map_excl/map_excl_denied:OK #215/3 map_excl/map_excl_no_map_in_map:OK #215/4 map_excl/map_excl_no_map_iter:OK #215/5 map_excl/map_excl_create_validation:OK #215 map_excl:OK Summary: 1/5 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260603211658.471212-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/map_excl.c | 37 +++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/map_excl.c b/tools/testing/selftests/bpf/prog_tests/map_excl.c index 3088668e2e45..3f4422b9ffa6 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_excl.c +++ b/tools/testing/selftests/bpf/prog_tests/map_excl.c @@ -126,6 +126,41 @@ out: close(excl_fd); } +static void test_map_excl_create_validation(void) +{ + LIBBPF_OPTS(bpf_map_create_opts, o); + __u8 hash[SHA256_DIGEST_SIZE] = {}; + int fd; + + o.excl_prog_hash = hash; + o.excl_prog_hash_size = SHA256_DIGEST_SIZE / 2; + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o); + if (fd >= 0) + close(fd); + ASSERT_EQ(fd, -EINVAL, "reject 
short excl_prog_hash_size"); + + o.excl_prog_hash = hash; + o.excl_prog_hash_size = SHA256_DIGEST_SIZE * 2; + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o); + if (fd >= 0) + close(fd); + ASSERT_EQ(fd, -EINVAL, "reject long excl_prog_hash_size"); + + o.excl_prog_hash = hash; + o.excl_prog_hash_size = 0; + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o); + if (fd >= 0) + close(fd); + ASSERT_EQ(fd, -EINVAL, "reject hash pointer with zero size"); + + o.excl_prog_hash = NULL; + o.excl_prog_hash_size = SHA256_DIGEST_SIZE; + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o); + if (fd >= 0) + close(fd); + ASSERT_EQ(fd, -EINVAL, "reject size with NULL hash pointer"); +} + void test_map_excl(void) { if (test__start_subtest("map_excl_allowed")) @@ -136,4 +171,6 @@ void test_map_excl(void) test_map_excl_no_map_in_map(); if (test__start_subtest("map_excl_no_map_iter")) test_map_excl_no_map_iter(); + if (test__start_subtest("map_excl_create_validation")) + test_map_excl_create_validation(); } -- cgit v1.2.3 From 5b88319e4775ee1924d5b709084b25f72e6fe78d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 3 Jun 2026 23:16:58 +0200 Subject: selftests/bpf: Test signed loader error paths The positive path for signed BPF loaders is covered today by the signed lskels (fentry_test, fexit_test, atomics). But the runtime metadata check the generated loader performs (libbpf gen_loader's emit_signature_match), the map content hash it relies on, the load-time signature, and the immutability invariants of its metadata map are not yet covered. Thus, add a new, extensive test suite which drives libbpf's gen_loader (bpf_object__gen_loader, gen_hash=true), the same machinery which bpftool uses for signed light skeletons, and exercise corner cases so that we can assert this in BPF CI: # LDLIBS=-static PKG_CONFIG='pkg-config --static' ./vmtest.sh -- ./test_progs -t signed_loader [...] 
[ 1.840842] clocksource: Switched to clocksource tsc #405/1 signed_loader/metadata_check_shape:OK #405/2 signed_loader/metadata_match:OK #405/3 signed_loader/metadata_sha_mismatch:OK #405/4 signed_loader/metadata_not_exclusive:OK #405/5 signed_loader/metadata_hash_not_computed:OK #405/6 signed_loader/signature_enforced:OK #405/7 signed_loader/signature_too_large:OK #405/8 signed_loader/signature_bad_keyring:OK #405/9 signed_loader/metadata_ctx_max_entries_ignored:OK #405/10 signed_loader/metadata_ctx_initial_value_ignored:OK #405/11 signed_loader/signature_authenticates_insns:OK #405/12 signed_loader/hash_requires_frozen:OK #405/13 signed_loader/no_update_after_freeze:OK #405/14 signed_loader/freeze_writable_mmap:OK #405/15 signed_loader/no_writable_mmap_frozen:OK #405/16 signed_loader/map_hash_matches_libbpf:OK #405/17 signed_loader/map_hash_multi_element:OK #405/18 signed_loader/map_hash_bad_size:OK #405/19 signed_loader/map_hash_unsupported_type:OK #405 signed_loader:OK Summary: 1/19 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260603211658.471212-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/signed_loader.c | 1013 ++++++++++++++++++++ .../selftests/bpf/progs/test_signed_loader.c | 18 + .../selftests/bpf/progs/test_signed_loader_data.c | 20 + .../selftests/bpf/progs/test_signed_loader_map.c | 28 + 4 files changed, 1079 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/signed_loader.c create mode 100644 tools/testing/selftests/bpf/progs/test_signed_loader.c create mode 100644 tools/testing/selftests/bpf/progs/test_signed_loader_data.c create mode 100644 tools/testing/selftests/bpf/progs/test_signed_loader_map.c diff --git a/tools/testing/selftests/bpf/prog_tests/signed_loader.c b/tools/testing/selftests/bpf/prog_tests/signed_loader.c new file mode 100644 index 000000000000..dcfdd2d96b05 --- /dev/null +++ 
b/tools/testing/selftests/bpf/prog_tests/signed_loader.c @@ -0,0 +1,1013 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Isovalent */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf/libbpf_internal.h" /* for libbpf_sha256() */ +#include "bpf/skel_internal.h" /* for loader ctx layout (bpf_loader_ctx etc) */ + +#include "test_signed_loader.skel.h" +#include "test_signed_loader_map.skel.h" +#include "test_signed_loader_data.skel.h" + +#define SIG_MATCH_INSNS 33 /* excl (5) + 4 * sha-dword (7) */ + +static int load_loader(const void *insns, __u32 insns_sz, int map_fd, + const void *sig, __u32 sig_sz, __s32 keyring_id) +{ + union bpf_attr attr; + int fd; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SYSCALL; + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = insns_sz / sizeof(struct bpf_insn); + attr.license = ptr_to_u64("Dual BSD/GPL"); + attr.prog_flags = BPF_F_SLEEPABLE; + attr.fd_array = ptr_to_u64(&map_fd); + if (sig) { + attr.signature = ptr_to_u64(sig); + attr.signature_size = sig_sz; + attr.keyring_id = keyring_id; + } + memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog")); + fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, + offsetofend(union bpf_attr, keyring_id)); + return fd < 0 ? 
-errno : fd; +} + +static int run_gen_loader(const void *insns, __u32 insns_sz, + const void *data, __u32 data_sz, + const void *excl, __u32 excl_sz, + const void *sig, __u32 sig_sz, + bool get_hash, void *ctx, __u32 ctx_sz, bool *loader_ran) +{ + LIBBPF_OPTS(bpf_map_create_opts, mopts, + .excl_prog_hash = excl, + .excl_prog_hash_size = excl_sz); + __u8 hbuf[SHA256_DIGEST_LENGTH]; + struct bpf_map_info info; + __u32 ilen = sizeof(info), key = 0; + union bpf_attr attr; + int map_fd, prog_fd, ret; + + *loader_ran = false; + + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", + 4, data_sz, 1, &mopts); + if (map_fd < 0) + return -errno; + if (bpf_map_update_elem(map_fd, &key, data, 0)) { + ret = -errno; + goto out_map; + } + if (bpf_map_freeze(map_fd)) { + ret = -errno; + goto out_map; + } + if (get_hash) { + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(hbuf); + info.hash_size = sizeof(hbuf); + if (bpf_map_get_info_by_fd(map_fd, &info, &ilen)) { + ret = -errno; + goto out_map; + } + } + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SYSCALL; + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = insns_sz / sizeof(struct bpf_insn); + attr.license = ptr_to_u64("Dual BSD/GPL"); + attr.prog_flags = BPF_F_SLEEPABLE; + attr.fd_array = ptr_to_u64(&map_fd); + if (sig) { + attr.signature = ptr_to_u64(sig); + attr.signature_size = sig_sz; + attr.keyring_id = KEY_SPEC_SESSION_KEYRING; + } + memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog")); + prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, + offsetofend(union bpf_attr, keyring_id)); + if (prog_fd < 0) { + ret = -errno; + goto out_map; + } + + memset(&attr, 0, sizeof(attr)); + attr.test.prog_fd = prog_fd; + attr.test.ctx_in = ptr_to_u64(ctx); + attr.test.ctx_size_in = ctx_sz; + if (syscall(__NR_bpf, BPF_PROG_RUN, &attr, + offsetofend(union bpf_attr, test)) < 0) { + ret = -errno; + goto out_prog; + } + *loader_ran = true; + ret = (int)attr.test.retval; +out_prog: + 
close(prog_fd); +out_map: + close(map_fd); + return ret; +} + +static void close_loader_ctx_fds(void *ctx, int nr_maps, int nr_progs) +{ + struct bpf_map_desc *md = (struct bpf_map_desc *)((char *)ctx + + sizeof(struct bpf_loader_ctx)); + struct bpf_prog_desc *pd = (struct bpf_prog_desc *)(md + nr_maps); + int i; + + for (i = 0; i < nr_maps; i++) + if (md[i].map_fd > 0) + close(md[i].map_fd); + for (i = 0; i < nr_progs; i++) + if (pd[i].prog_fd > 0) + close(pd[i].prog_fd); +} + +static int run_setup(const char *cmd, const char *dir) +{ + int pid, status; + + pid = fork(); + if (pid < 0) + return -errno; + if (pid == 0) { + execlp("./verify_sig_setup.sh", "./verify_sig_setup.sh", + cmd, dir, NULL); + exit(1); + } + if (waitpid(pid, &status, 0) < 0) + return -errno; + return (WIFEXITED(status) && + WEXITSTATUS(status) == 0) ? 0 : -EINVAL; +} + +static int sign_buf(const char *dir, const void *buf, __u32 len, + void *sig, __u32 *sig_sz) +{ + char data_tmpl[PATH_MAX], key[PATH_MAX]; + char sigpath[PATH_MAX + sizeof(".p7s")]; + int fd, pid, status, ret; + struct stat st; + + ret = snprintf(data_tmpl, sizeof(data_tmpl), "%s/dataXXXXXX", dir); + if (ret < 0 || ret >= (int)sizeof(data_tmpl)) + return -ENAMETOOLONG; + ret = 0; + + fd = mkstemp(data_tmpl); + if (fd < 0) + return -errno; + if (write(fd, buf, len) != (ssize_t)len) { + close(fd); + ret = -EIO; + goto out; + } + close(fd); + + pid = fork(); + if (pid < 0) { + ret = -errno; + goto out; + } + if (pid == 0) { + snprintf(key, sizeof(key), "%s/signing_key.pem", dir); + execlp("./sign-file", "./sign-file", "-d", "sha256", + key, key, data_tmpl, NULL); + exit(1); + } + if (waitpid(pid, &status, 0) < 0 || + !WIFEXITED(status) || WEXITSTATUS(status)) { + ret = -EINVAL; + goto out; + } + + snprintf(sigpath, sizeof(sigpath), "%s.p7s", data_tmpl); + if (stat(sigpath, &st) < 0) { + ret = -errno; + goto out; + } + if (st.st_size > (off_t)*sig_sz) { + ret = -E2BIG; + goto out_sig; + } + fd = open(sigpath, O_RDONLY); + if (fd < 
0) { + ret = -errno; + goto out_sig; + } + if (read(fd, sig, st.st_size) != st.st_size) { + close(fd); + ret = -EIO; + goto out_sig; + } + close(fd); + *sig_sz = st.st_size; +out_sig: + unlink(sigpath); +out: + unlink(data_tmpl); + return ret; +} + +static void check_sig_match_shape(const struct bpf_insn *in, int n) +{ + int a = -1, cleanup = -1, i, base, t, br[5], nb = 0; + + /* BPF_PSEUDO_MAP_IDX (the struct bpf_map * form) is used only here. */ + for (i = 0; i + 1 < n; i++) { + if (in[i].code == (BPF_LD | BPF_IMM | BPF_DW) && + in[i].src_reg == BPF_PSEUDO_MAP_IDX) { + a = i; + break; + } + } + if (!ASSERT_GE(a, 0, "emit_signature_match present")) + return; + if (!ASSERT_LE(a + SIG_MATCH_INSNS, n, "block fits in program")) + return; + + /* excl check: r2 = *(u32 *)(map + 32); if r2 != 1 goto cleanup */ + ASSERT_EQ(in[a + 2].code, (BPF_LDX | BPF_MEM | BPF_W), "excl load width"); + ASSERT_EQ(in[a + 2].off, SHA256_DIGEST_LENGTH, "excl field offset"); + ASSERT_EQ(in[a + 4].code, (BPF_JMP | BPF_JNE | BPF_K), "excl branch op"); + ASSERT_EQ(in[a + 4].imm, 1, "excl compared to 1"); + br[nb++] = a + 4; + + /* 4 sha-dword checks: r2 = *(u64 *)(map + i*8); if r2 != r3 goto cleanup */ + for (i = 0; i < 4; i++) { + base = a + 5 + i * 7; + ASSERT_EQ(in[base + 2].code, (BPF_LDX | BPF_MEM | BPF_DW), "sha load width"); + ASSERT_EQ(in[base + 2].off, i * 8, "sha dword offset"); + ASSERT_EQ(in[base + 3].code, (BPF_LD | BPF_IMM | BPF_DW), "sha imm64 (H_meta)"); + ASSERT_EQ(in[base + 6].code, (BPF_JMP | BPF_JNE | BPF_X), "sha branch op"); + br[nb++] = base + 6; + } + + /* + * Locate the real cleanup label so we can pin the exact jump target, + * not just "some backward label". bpf_gen__init() emits the cleanup + * block as a prog-fd close loop whose first instruction is the label + * every error branch jumps to. 
+ */ + for (i = 0; i + 2 < a; i++) { + if (in[i].code == (BPF_LDX | BPF_MEM | BPF_W) && + in[i].dst_reg == BPF_REG_1 && in[i].src_reg == BPF_REG_10 && + in[i + 1].code == (BPF_JMP | BPF_JSLE | BPF_K) && + in[i + 1].dst_reg == BPF_REG_1 && in[i + 1].imm == 0 && + in[i + 1].off == 1 && + in[i + 2].code == (BPF_JMP | BPF_CALL) && + in[i + 2].imm == BPF_FUNC_sys_close) { + cleanup = i; + break; + } + } + if (!ASSERT_GE(cleanup, 0, "cleanup label located")) + return; + for (i = 0; i < nb; i++) { + t = br[i] + 1 + in[br[i]].off; + ASSERT_EQ(t, cleanup, "sig-match lands on cleanup"); + } + /* + * Same invariant for every other cleanup-bound jump in the program: + * emit_check_err() is the only source of "if (r7 < 0) goto cleanup", + * so each of those must also resolve exactly to cleanup. + */ + for (i = 0, t = 0; i < n; i++) { + if (in[i].code != (BPF_JMP | BPF_JSLT | BPF_K) || + in[i].dst_reg != BPF_REG_7 || in[i].imm != 0 || in[i].off >= 0) + continue; + ASSERT_EQ(i + 1 + in[i].off, cleanup, "err-check lands on cleanup"); + t++; + } + ASSERT_GT(t, 0, "found emit_check_err jumps"); +} + +struct gen_loader_fixture { + struct test_signed_loader *skel; + struct gen_loader_opts gopts; + unsigned char *blob; + void *ctx; + __u32 data_sz; + __u32 ctx_sz; + int nr_maps; + int nr_progs; + __u8 excl[SHA256_DIGEST_LENGTH]; +}; + +static int gen_loader_fixture_init(struct gen_loader_fixture *f) +{ + LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true); + int nr_maps = 0, nr_progs = 0; + struct bpf_program *p; + struct bpf_map *m; + + memset(f, 0, sizeof(*f)); + f->skel = test_signed_loader__open(); + if (!ASSERT_OK_PTR(f->skel, "skel_open")) + return -1; + if (!ASSERT_OK(bpf_object__gen_loader(f->skel->obj, &gopts), "gen_loader")) + return -1; + if (!ASSERT_OK(bpf_object__load(f->skel->obj), "gen_load")) + return -1; + f->gopts = gopts; + + bpf_object__for_each_program(p, f->skel->obj) + nr_progs++; + bpf_object__for_each_map(m, f->skel->obj) + nr_maps++; + f->nr_maps = nr_maps; 
+ f->nr_progs = nr_progs; + f->ctx_sz = sizeof(struct bpf_loader_ctx) + + nr_maps * sizeof(struct bpf_map_desc) + + nr_progs * sizeof(struct bpf_prog_desc); + f->ctx = calloc(1, f->ctx_sz); + if (!ASSERT_OK_PTR(f->ctx, "ctx_alloc")) + return -1; + ((struct bpf_loader_ctx *)f->ctx)->sz = f->ctx_sz; + + f->data_sz = gopts.data_sz; + f->blob = malloc(f->data_sz); + if (!ASSERT_OK_PTR(f->blob, "blob_alloc")) + return -1; + memcpy(f->blob, gopts.data, f->data_sz); + + /* excl_prog_hash = SHA256(loader insns) == the loader's prog->digest. */ + libbpf_sha256(gopts.insns, gopts.insns_sz, f->excl); + return 0; +} + +static void gen_loader_fixture_fini(struct gen_loader_fixture *f) +{ + if (f->ctx) + close_loader_ctx_fds(f->ctx, f->nr_maps, f->nr_progs); + free(f->blob); + free(f->ctx); + test_signed_loader__destroy(f->skel); +} + +static void metadata_check_shape(void) +{ + struct gen_loader_fixture f; + + if (gen_loader_fixture_init(&f) == 0) + check_sig_match_shape((const struct bpf_insn *)f.gopts.insns, + f.gopts.insns_sz / sizeof(struct bpf_insn)); + gen_loader_fixture_fini(&f); +} + +static void metadata_match(void) +{ + struct gen_loader_fixture f; + bool ran; + int r; + + if (gen_loader_fixture_init(&f) == 0) { + r = run_gen_loader(f.gopts.insns, f.gopts.insns_sz, f.blob, + f.data_sz, f.excl, sizeof(f.excl), NULL, 0, + true, f.ctx, f.ctx_sz, &ran); + ASSERT_TRUE(ran, "loader ran"); + ASSERT_EQ(r, 0, "honest loader retval"); + } + gen_loader_fixture_fini(&f); +} + +static void metadata_sha_mismatch(void) +{ + struct gen_loader_fixture f; + bool ran; + int r; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * blob[0] lives in the loader's fd_array scratch (first add_data in + * bpf_gen__init); a 0-map program never reads it, so flipping it + * changes only map->sha. The metadata check is the only thing that + * can notice -> isolates emit_signature_match. 
+ */ + f.blob[0] ^= 0xff; + r = run_gen_loader(f.gopts.insns, f.gopts.insns_sz, f.blob, + f.data_sz, f.excl, sizeof(f.excl), NULL, 0, + true, f.ctx, f.ctx_sz, &ran); + ASSERT_TRUE(ran, "loader ran"); + ASSERT_EQ(r, -EINVAL, "tampered blob rejected by emit_signature_match"); + } + gen_loader_fixture_fini(&f); +} + +static void metadata_not_exclusive(void) +{ + struct gen_loader_fixture f; + bool ran; + int r; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * Correct blob but a non-exclusive metadata map: the verifier does + * not reject (excl_prog_sha unset), so the runtime map->excl == 1 + * check in the loader must. + */ + r = run_gen_loader(f.gopts.insns, f.gopts.insns_sz, f.blob, + f.data_sz, NULL, 0, NULL, 0, true, f.ctx, + f.ctx_sz, &ran); + ASSERT_TRUE(ran, "loader ran"); + ASSERT_EQ(r, -EINVAL, "non-exclusive metadata map rejected"); + } + gen_loader_fixture_fini(&f); +} + +static void metadata_hash_not_computed(void) +{ + struct gen_loader_fixture f; + bool ran; + int r; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * Correct, exclusive, frozen map, but its hash was never computed + * (no OBJ_GET_INFO_BY_FD), so map->sha stays zero. The loader must + * fail closed rather than treat an unset hash as a match. + */ + r = run_gen_loader(f.gopts.insns, f.gopts.insns_sz, f.blob, + f.data_sz, f.excl, sizeof(f.excl), NULL, 0, + false, f.ctx, f.ctx_sz, &ran); + ASSERT_TRUE(ran, "loader ran"); + ASSERT_EQ(r, -EINVAL, "uncomputed metadata hash rejected"); + } + gen_loader_fixture_fini(&f); +} + +static void signature_enforced(void) +{ + static const __u8 junk[64] = { 0x30, 0x42, 0x13, 0x37, }; + struct gen_loader_fixture f; + int fd; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * A present-but-invalid signature (the cert bytes are not a + * PKCS#7 signature) must be rejected at load: the signature + * path is honored, not ignored. (The valid path is covered by + * the signed lskels.) 
+ */ + fd = load_loader(f.gopts.insns, f.gopts.insns_sz, -1, junk, + sizeof(junk), KEY_SPEC_SESSION_KEYRING); + ASSERT_LT(fd, 0, "invalid signature rejected at load"); + } + gen_loader_fixture_fini(&f); +} + +static void signature_too_large(void) +{ + static const __u8 junk[64] = {}; + struct gen_loader_fixture f; + int fd; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * signature_size beyond the kernel's bound (KMALLOC_MAX_CACHE_SIZE) + * is rejected before the buffer is read. + */ + fd = load_loader(f.gopts.insns, f.gopts.insns_sz, -1, junk, + 64 << 20, KEY_SPEC_SESSION_KEYRING); + ASSERT_EQ(fd, -EINVAL, "oversized signature rejected"); + } + gen_loader_fixture_fini(&f); +} + +static void signature_bad_keyring(void) +{ + static const __u8 junk[64] = {}; + struct gen_loader_fixture f; + int fd; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * A present signature with a keyring_id that resolves to no key is + * rejected up front: bpf_prog_verify_signature() fails the keyring + * lookup (-EINVAL) before it ever looks at the signature bytes. A + * large positive serial takes the user-keyring path and won't exist. + */ + fd = load_loader(f.gopts.insns, f.gopts.insns_sz, -1, junk, + sizeof(junk), INT_MAX); + ASSERT_EQ(fd, -EINVAL, "signature with bad keyring_id rejected"); + } + gen_loader_fixture_fini(&f); +} + +/* + * A signed loader must ignore ctx-supplied map dimensions: the host cannot + * resize a signed program's maps via the loader ctx. Drive a one-map program + * through gen_loader, ask (via ctx) for every map to be resized to a bogus + * value, and confirm the created maps keep their attested size. 
+ */ +#define GATING_BOGUS_MAX 0x4000 + +static void metadata_ctx_max_entries_ignored(void) +{ + LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true); + struct test_signed_loader_map *skel; + __u8 excl[SHA256_DIGEST_LENGTH]; + int nr_maps = 0, nr_progs = 0, i, checked = 0, r; + struct bpf_program *p; + struct bpf_map *m; + struct bpf_map_desc *md; + unsigned char *blob; + __u32 ctx_sz, data_sz; + void *ctx; + bool ran; + + skel = test_signed_loader_map__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + if (!ASSERT_OK(bpf_object__gen_loader(skel->obj, &gopts), "gen_loader")) + goto destroy; + if (!ASSERT_OK(bpf_object__load(skel->obj), "gen_load")) + goto destroy; + + bpf_object__for_each_program(p, skel->obj) + nr_progs++; + bpf_object__for_each_map(m, skel->obj) + nr_maps++; + ctx_sz = sizeof(struct bpf_loader_ctx) + + nr_maps * sizeof(struct bpf_map_desc) + + nr_progs * sizeof(struct bpf_prog_desc); + ctx = calloc(1, ctx_sz); + if (!ASSERT_OK_PTR(ctx, "ctx_alloc")) + goto destroy; + ((struct bpf_loader_ctx *)ctx)->sz = ctx_sz; + + md = (struct bpf_map_desc *)((char *)ctx + sizeof(struct bpf_loader_ctx)); + for (i = 0; i < nr_maps; i++) + md[i].max_entries = GATING_BOGUS_MAX; + + libbpf_sha256(gopts.insns, gopts.insns_sz, excl); + data_sz = gopts.data_sz; + blob = malloc(data_sz); + if (!ASSERT_OK_PTR(blob, "blob_alloc")) + goto free_ctx; + memcpy(blob, gopts.data, data_sz); + + r = run_gen_loader(gopts.insns, gopts.insns_sz, blob, data_sz, + excl, sizeof(excl), NULL, 0, true, ctx, ctx_sz, &ran); + if (!ASSERT_TRUE(ran, "loader ran") || + !ASSERT_EQ(r, 0, "loader retval")) + goto free_blob; + + for (i = 0; i < nr_maps; i++) { + struct bpf_map_info info; + __u32 ilen = sizeof(info); + int fd = md[i].map_fd; + + if (fd <= 0) + continue; + memset(&info, 0, sizeof(info)); + if (ASSERT_OK(bpf_map_get_info_by_fd(fd, &info, &ilen), "map_info")) { + ASSERT_NEQ(info.max_entries, GATING_BOGUS_MAX, + "ctx max_entries ignored for signed loader"); + checked++; + } 
+ } + ASSERT_GT(checked, 0, "inspected a created map"); + +free_blob: + free(blob); +free_ctx: + close_loader_ctx_fds(ctx, nr_maps, nr_progs); + free(ctx); +destroy: + test_signed_loader_map__destroy(skel); +} + +/* + * A signed loader must also ignore ctx-supplied initial_value: the host cannot + * re-seed a signed program's map contents through the loader ctx. Drive a + * program with one initialized global (a .data map) through gen_loader, point + * every map's ctx initial_value at an adversarial buffer, and confirm the + * created map still holds the attested value, never the ctx bytes. + */ +#define DATA_MAGIC 0x5eed1234abad1deaULL + +static void metadata_ctx_initial_value_ignored(void) +{ + LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true); + struct test_signed_loader_data *skel; + __u8 excl[SHA256_DIGEST_LENGTH], evil[64]; + int nr_maps = 0, nr_progs = 0, i, found = 0, r; + struct bpf_program *p; + struct bpf_map *m; + struct bpf_map_desc *md; + unsigned char *blob; + __u32 ctx_sz, data_sz; + void *ctx; + bool ran; + + skel = test_signed_loader_data__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + if (!ASSERT_OK(bpf_object__gen_loader(skel->obj, &gopts), "gen_loader")) + goto destroy; + if (!ASSERT_OK(bpf_object__load(skel->obj), "gen_load")) + goto destroy; + + bpf_object__for_each_program(p, skel->obj) + nr_progs++; + bpf_object__for_each_map(m, skel->obj) + nr_maps++; + ctx_sz = sizeof(struct bpf_loader_ctx) + + nr_maps * sizeof(struct bpf_map_desc) + + nr_progs * sizeof(struct bpf_prog_desc); + ctx = calloc(1, ctx_sz); + if (!ASSERT_OK_PTR(ctx, "ctx_alloc")) + goto destroy; + ((struct bpf_loader_ctx *)ctx)->sz = ctx_sz; + + memset(evil, 0xAA, sizeof(evil)); + md = (struct bpf_map_desc *)((char *)ctx + sizeof(struct bpf_loader_ctx)); + for (i = 0; i < nr_maps; i++) + md[i].initial_value = ptr_to_u64(evil); + + libbpf_sha256(gopts.insns, gopts.insns_sz, excl); + data_sz = gopts.data_sz; + blob = malloc(data_sz); + if (!ASSERT_OK_PTR(blob, 
"blob_alloc")) + goto free_ctx; + memcpy(blob, gopts.data, data_sz); + + r = run_gen_loader(gopts.insns, gopts.insns_sz, blob, data_sz, + excl, sizeof(excl), NULL, 0, true, ctx, ctx_sz, &ran); + if (!ASSERT_TRUE(ran, "loader ran") || + !ASSERT_EQ(r, 0, "loader retval")) + goto free_blob; + + for (i = 0; i < nr_maps; i++) { + struct bpf_map_info info; + __u32 ilen = sizeof(info), key = 0; + __u8 value[64] = {}; + __u64 got; + int fd = md[i].map_fd; + + if (fd <= 0) + continue; + memset(&info, 0, sizeof(info)); + if (!ASSERT_OK(bpf_map_get_info_by_fd(fd, &info, &ilen), "map_info")) + continue; + if (info.value_size <= sizeof(value) && + bpf_map_lookup_elem(fd, &key, value) == 0) { + memcpy(&got, value, sizeof(got)); + /* attested .data survives; ctx bytes (0xAA..) ignored */ + if (got == DATA_MAGIC) + found = 1; + ASSERT_NEQ(got, 0xAAAAAAAAAAAAAAAAULL, + "ctx initial_value ignored for signed loader"); + } + } + ASSERT_EQ(found, 1, "attested .data value preserved"); + +free_blob: + free(blob); +free_ctx: + close_loader_ctx_fds(ctx, nr_maps, nr_progs); + free(ctx); +destroy: + test_signed_loader_data__destroy(skel); +} + +/* + * The load-time signature must authenticate the loader instructions: a valid + * signature loads, and the very same signature over one-byte-tampered insns is + * rejected. Uses ./verify_sig_setup.sh + ./sign-file at runtime, like + * verify_pkcs7_sig, and verifies against the session keyring the key was added + * to. (signature_enforced/_too_large only cover a malformed signature.) 
+ */ +static void signature_authenticates_insns(void) +{ + LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true); + char dir_tmpl[] = "/tmp/signed_loaderXXXXXX", *dir; + struct test_signed_loader *skel = NULL; + __u8 excl[SHA256_DIGEST_LENGTH], sig[8192]; + __u32 sig_sz = sizeof(sig), insns_sz, data_sz, ctx_sz; + unsigned char *insns = NULL, *tampered = NULL, *blob = NULL; + int nr_maps = 0, nr_progs = 0, r; + struct bpf_program *p; + struct bpf_map *m; + void *ctx = NULL; + bool ran; + + syscall(__NR_request_key, "keyring", "_uid.0", NULL, + KEY_SPEC_SESSION_KEYRING); + dir = mkdtemp(dir_tmpl); + if (!ASSERT_OK_PTR(dir, "mkdtemp")) + return; + if (!ASSERT_OK(run_setup("setup", dir), "verify_sig_setup")) { + rmdir(dir); + return; + } + + skel = test_signed_loader__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + goto cleanup; + if (!ASSERT_OK(bpf_object__gen_loader(skel->obj, &gopts), "gen_loader")) + goto cleanup; + if (!ASSERT_OK(bpf_object__load(skel->obj), "gen_load")) + goto cleanup; + + bpf_object__for_each_program(p, skel->obj) + nr_progs++; + bpf_object__for_each_map(m, skel->obj) + nr_maps++; + ctx_sz = sizeof(struct bpf_loader_ctx) + + nr_maps * sizeof(struct bpf_map_desc) + + nr_progs * sizeof(struct bpf_prog_desc); + insns_sz = gopts.insns_sz; + data_sz = gopts.data_sz; + ctx = calloc(1, ctx_sz); + insns = malloc(insns_sz); + tampered = malloc(insns_sz); + blob = malloc(data_sz); + if (!ASSERT_OK_PTR(ctx, "ctx") || + !ASSERT_OK_PTR(insns, "insns") || + !ASSERT_OK_PTR(tampered, "tampered") || + !ASSERT_OK_PTR(blob, "blob")) + goto cleanup; + memcpy(insns, gopts.insns, insns_sz); + memcpy(blob, gopts.data, data_sz); + libbpf_sha256(insns, insns_sz, excl); + + if (!ASSERT_OK(sign_buf(dir, insns, insns_sz, sig, &sig_sz), "sign-file")) + goto cleanup; + + memset(ctx, 0, ctx_sz); + ((struct bpf_loader_ctx *)ctx)->sz = ctx_sz; + r = run_gen_loader(insns, insns_sz, blob, data_sz, excl, sizeof(excl), + sig, sig_sz, true, ctx, ctx_sz, &ran); + ASSERT_TRUE(ran, 
"valid signature: loader loaded and ran"); + ASSERT_EQ(r, 0, "valid signature accepted"); + close_loader_ctx_fds(ctx, nr_maps, nr_progs); + + memcpy(tampered, insns, insns_sz); + tampered[insns_sz / 2] ^= 0xff; + memset(ctx, 0, ctx_sz); + ((struct bpf_loader_ctx *)ctx)->sz = ctx_sz; + r = run_gen_loader(tampered, insns_sz, blob, data_sz, excl, sizeof(excl), + sig, sig_sz, true, ctx, ctx_sz, &ran); + ASSERT_FALSE(ran, "tampered loader rejected before run"); + ASSERT_EQ(r, -EKEYREJECTED, "signature is bound to the instructions"); +cleanup: + free(insns); + free(tampered); + free(blob); + free(ctx); + test_signed_loader__destroy(skel); + run_setup("cleanup", dir); +} + +static int make_excl_map(__u32 flags, __u32 value_size) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts); + __u8 hash[SHA256_DIGEST_LENGTH] = { 1 }; /* any 32-byte value */ + + opts.excl_prog_hash = hash; + opts.excl_prog_hash_size = sizeof(hash); + opts.map_flags = flags; + return bpf_map_create(BPF_MAP_TYPE_ARRAY, "md", 4, value_size, 1, &opts); +} + +static void hash_requires_frozen(void) +{ + __u8 hbuf[SHA256_DIGEST_LENGTH], val[64] = {}; + struct bpf_map_info info; + __u32 ilen, key = 0; + int fd; + + fd = make_excl_map(0, sizeof(val)); + if (!ASSERT_OK_FD(fd, "excl_map")) + return; + ASSERT_OK(bpf_map_update_elem(fd, &key, val, 0), "update"); + + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(hbuf); + info.hash_size = sizeof(hbuf); + ilen = sizeof(info); + ASSERT_EQ(bpf_map_get_info_by_fd(fd, &info, &ilen), -EPERM, + "hash of unfrozen map rejected"); + close(fd); +} + +static void no_update_after_freeze(void) +{ + __u8 val[64] = {}; + __u32 key = 0; + int fd; + + fd = make_excl_map(0, sizeof(val)); + if (!ASSERT_OK_FD(fd, "excl_map")) + return; + ASSERT_OK(bpf_map_update_elem(fd, &key, val, 0), "update"); + ASSERT_OK(bpf_map_freeze(fd), "freeze"); + ASSERT_EQ(bpf_map_update_elem(fd, &key, val, 0), -EPERM, + "update after freeze rejected"); + close(fd); +} + +static void 
freeze_writable_mmap(void) +{ + void *w; + int fd; + + fd = make_excl_map(BPF_F_MMAPABLE, 4096); + if (!ASSERT_OK_FD(fd, "excl_mmapable_map")) + return; + w = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ASSERT_OK_PTR(w, "writable_mmap")) { + ASSERT_EQ(bpf_map_freeze(fd), -EBUSY, + "freeze rejected while writable mmap held"); + munmap(w, 4096); + } + close(fd); +} + +static void no_writable_mmap_frozen(void) +{ + void *w; + int fd; + + fd = make_excl_map(BPF_F_MMAPABLE, 4096); + if (!ASSERT_OK_FD(fd, "excl_mmapable_map")) + return; + ASSERT_OK(bpf_map_freeze(fd), "freeze"); + w = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + ASSERT_EQ(w, MAP_FAILED, "writable mmap of frozen map rejected"); + if (w != MAP_FAILED) + munmap(w, 4096); + close(fd); +} + +static void map_hash_matches_libbpf(void) +{ + __u8 kbuf[SHA256_DIGEST_LENGTH], lbuf[SHA256_DIGEST_LENGTH], val[64] = {}; + struct bpf_map_info info; + __u32 ilen, key = 0; + int fd, i; + + /* + * The signing scheme assumes the kernel's map hash equals what libbpf + * computes over the same bytes (gen_loader bakes libbpf_sha256(blob); + * the kernel recomputes via array_map_get_hash). Pin that they agree. 
+ */ + for (i = 0; i < (int)sizeof(val); i++) + val[i] = i * 7 + 1; + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "h", 4, sizeof(val), 1, NULL); + if (!ASSERT_OK_FD(fd, "array_map")) + return; + ASSERT_OK(bpf_map_update_elem(fd, &key, val, 0), "update"); + ASSERT_OK(bpf_map_freeze(fd), "freeze"); + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(kbuf); + info.hash_size = sizeof(kbuf); + ilen = sizeof(info); + if (ASSERT_OK(bpf_map_get_info_by_fd(fd, &info, &ilen), "get_hash")) { + libbpf_sha256(val, sizeof(val), lbuf); + ASSERT_EQ(memcmp(kbuf, lbuf, sizeof(kbuf)), 0, + "kernel map hash matches libbpf_sha256"); + } + close(fd); +} + +static void map_hash_multi_element(void) +{ + const __u32 nr = 8, value_size = 64; + __u8 kbuf[SHA256_DIGEST_LENGTH], lbuf[SHA256_DIGEST_LENGTH]; + struct bpf_map_info info; + __u32 ilen, i, j; + __u8 *full; + int fd; + + /* + * array_map_get_hash() hashes elem_size * max_entries (the whole value + * area), not just element 0. With an 8-aligned value_size elem_size has + * no padding, so pin that a >1-entry array's kernel hash equals + * libbpf_sha256() over the full, concatenated element contents. 
+ */ + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "h", 4, value_size, nr, NULL); + if (!ASSERT_OK_FD(fd, "array_map")) + return; + full = calloc(nr, value_size); + if (!ASSERT_OK_PTR(full, "buf")) + goto close_fd; + for (i = 0; i < nr; i++) { + __u8 *v = full + i * value_size; + + for (j = 0; j < value_size; j++) + v[j] = i * 31 + j * 7 + 1; + ASSERT_OK(bpf_map_update_elem(fd, &i, v, 0), "update"); + } + ASSERT_OK(bpf_map_freeze(fd), "freeze"); + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(kbuf); + info.hash_size = sizeof(kbuf); + ilen = sizeof(info); + if (ASSERT_OK(bpf_map_get_info_by_fd(fd, &info, &ilen), "get_hash")) { + libbpf_sha256(full, (size_t)nr * value_size, lbuf); + ASSERT_EQ(memcmp(kbuf, lbuf, sizeof(kbuf)), 0, + "kernel hash covers full multi-element value area"); + } + free(full); +close_fd: + close(fd); +} + +static void map_hash_bad_size(void) +{ + __u8 kbuf[SHA256_DIGEST_LENGTH], val[64] = {}; + struct bpf_map_info info; + __u32 ilen, key = 0; + int fd; + + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "h", 4, sizeof(val), 1, NULL); + if (!ASSERT_OK_FD(fd, "array_map")) + return; + ASSERT_OK(bpf_map_update_elem(fd, &key, val, 0), "update"); + ASSERT_OK(bpf_map_freeze(fd), "freeze"); + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(kbuf); + info.hash_size = sizeof(kbuf) / 2; + ilen = sizeof(info); + ASSERT_EQ(bpf_map_get_info_by_fd(fd, &info, &ilen), -EINVAL, + "wrong hash_size rejected"); + close(fd); +} + +static void map_hash_unsupported_type(void) +{ + __u8 kbuf[SHA256_DIGEST_LENGTH]; + struct bpf_map_info info; + __u32 ilen; + int fd; + + /* Only arrays implement map_get_hash; a hash map must be refused. 
*/ + fd = bpf_map_create(BPF_MAP_TYPE_HASH, "h", 4, 8, 4, NULL); + if (!ASSERT_OK_FD(fd, "hash_map")) + return; + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(kbuf); + info.hash_size = sizeof(kbuf); + ilen = sizeof(info); + ASSERT_EQ(bpf_map_get_info_by_fd(fd, &info, &ilen), -EINVAL, + "hash unsupported for non-array map"); + close(fd); +} + +void test_signed_loader(void) +{ + if (test__start_subtest("metadata_check_shape")) + metadata_check_shape(); + if (test__start_subtest("metadata_match")) + metadata_match(); + if (test__start_subtest("metadata_sha_mismatch")) + metadata_sha_mismatch(); + if (test__start_subtest("metadata_not_exclusive")) + metadata_not_exclusive(); + if (test__start_subtest("metadata_hash_not_computed")) + metadata_hash_not_computed(); + if (test__start_subtest("signature_enforced")) + signature_enforced(); + if (test__start_subtest("signature_too_large")) + signature_too_large(); + if (test__start_subtest("signature_bad_keyring")) + signature_bad_keyring(); + if (test__start_subtest("metadata_ctx_max_entries_ignored")) + metadata_ctx_max_entries_ignored(); + if (test__start_subtest("metadata_ctx_initial_value_ignored")) + metadata_ctx_initial_value_ignored(); + if (test__start_subtest("signature_authenticates_insns")) + signature_authenticates_insns(); + if (test__start_subtest("hash_requires_frozen")) + hash_requires_frozen(); + if (test__start_subtest("no_update_after_freeze")) + no_update_after_freeze(); + if (test__start_subtest("freeze_writable_mmap")) + freeze_writable_mmap(); + if (test__start_subtest("no_writable_mmap_frozen")) + no_writable_mmap_frozen(); + if (test__start_subtest("map_hash_matches_libbpf")) + map_hash_matches_libbpf(); + if (test__start_subtest("map_hash_multi_element")) + map_hash_multi_element(); + if (test__start_subtest("map_hash_bad_size")) + map_hash_bad_size(); + if (test__start_subtest("map_hash_unsupported_type")) + map_hash_unsupported_type(); +} diff --git 
a/tools/testing/selftests/bpf/progs/test_signed_loader.c b/tools/testing/selftests/bpf/progs/test_signed_loader.c new file mode 100644 index 000000000000..d9a4b85f9391 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_signed_loader.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include + +/* + * Minimal, map-less program. Driven through libbpf's gen_loader (gen_hash) + * by prog_tests/signed_loader.c so the generated light-skeleton loader (with + * the emit_signature_match metadata check) can be exercised against good + * and tampered metadata. A socket filter needs no load-time attach resolution, + * and having no maps keeps the generated loader's ctx trivial (0 maps, 1 prog). + */ +SEC("socket") +int probe(void *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_signed_loader_data.c b/tools/testing/selftests/bpf/progs/test_signed_loader_data.c new file mode 100644 index 000000000000..43e2074d0042 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_signed_loader_data.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include + +/* + * A single initialized global, so the generated loader has one internal + * (.data) map that it seeds with an initial value while loading. + * prog_tests/signed_loader.c uses this to check that a signed loader + * keeps the attested contents and ignores a ctx-supplied initial_value: + * the host cannot re-seed a signed program's maps through the loader ctx. 
+ */ +__u64 magic = 0x5eed1234abad1deaULL; + +SEC("socket") +int probe(void *ctx) +{ + return (int)magic; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_signed_loader_map.c b/tools/testing/selftests/bpf/progs/test_signed_loader_map.c new file mode 100644 index 000000000000..4478ce6f1fd9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_signed_loader_map.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include + +/* + * One explicit array map and no global variables, so the generated loader + * has exactly one map to create (no .rodata/.bss). prog_tests/signed_loader.c + * uses this to check that a signed loader ignores ctx-supplied max_entries: + * the map must keep its attested size (4), not whatever the host puts in + * the loader ctx. + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 4); + __type(key, __u32); + __type(value, __u64); +} amap SEC(".maps"); + +SEC("socket") +int probe(void *ctx) +{ + __u32 key = 0; + __u64 *val = bpf_map_lookup_elem(&amap, &key); + + return val ? (int)*val : 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From fbd6dc50d9aedc594ec3196211a190170a275ab6 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Wed, 3 Jun 2026 20:18:22 +0000 Subject: bpf: clean up btf_scan_decl_tags() Refactor the newly introduced btf_scan_decl_tags() to improve readability and maintainability. The current implementation uses a manual if-else chain and a magic number offset to strip the "arg:" prefix from declaration tags. Replace the if-else logic with a table-driven approach using a static const array. This separates the tag data from the scanning logic, making the helper more extensible for future tags. Additionally, replace the magic number '4' with a sizeof-based calculation on the prefix string to ensure the offset remains synchronized with the search key. 
Finally, optimize the loop by moving the is_global check to the top of the block. This allows the verifier to fail-fast on static subprograms without performing unnecessary BTF string and type lookups. Signed-off-by: Matt Bobrowski Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260603201822.770596-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 68921d9172b5..55aa3ba1b1e0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7808,14 +7808,28 @@ static int btf_scan_decl_tags(struct bpf_verifier_env *env, u32 arg_idx, bool is_global, u32 *tags) { int id = btf_named_start_id(btf, false) - 1; + const char tag_key[] = "arg:"; + static const struct { + const char *tag_value; + enum btf_arg_tag arg_tag; + } tag_values[] = { + { "ctx", ARG_TAG_CTX }, + { "trusted", ARG_TAG_TRUSTED }, + { "untrusted", ARG_TAG_UNTRUSTED }, + { "nonnull", ARG_TAG_NONNULL }, + { "nullable", ARG_TAG_NULLABLE }, + { "arena", ARG_TAG_ARENA }, + }; /* * The 'arg:' decl_tag takes precedence over the derivation * of the register type from the BTF type itself. 
*/ - while ((id = btf_find_next_decl_tag(btf, fn_t, arg_idx, "arg:", id)) > 0) { - const struct btf_type *tag_t = btf_type_by_id(btf, id); - const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; + while ((id = btf_find_next_decl_tag(btf, fn_t, arg_idx, tag_key, id)) > 0) { + const struct btf_type *tag_t; + const char *tag; + int i; + bool found; /* disallow arg tags in static subprogs */ if (!is_global) { @@ -7825,19 +7839,19 @@ static int btf_scan_decl_tags(struct bpf_verifier_env *env, return -EOPNOTSUPP; } - if (strcmp(tag, "ctx") == 0) { - *tags |= ARG_TAG_CTX; - } else if (strcmp(tag, "trusted") == 0) { - *tags |= ARG_TAG_TRUSTED; - } else if (strcmp(tag, "untrusted") == 0) { - *tags |= ARG_TAG_UNTRUSTED; - } else if (strcmp(tag, "nonnull") == 0) { - *tags |= ARG_TAG_NONNULL; - } else if (strcmp(tag, "nullable") == 0) { - *tags |= ARG_TAG_NULLABLE; - } else if (strcmp(tag, "arena") == 0) { - *tags |= ARG_TAG_ARENA; - } else { + tag_t = btf_type_by_id(btf, id); + tag = __btf_name_by_offset(btf, tag_t->name_off) + (sizeof(tag_key) - 1); + + found = false; + for (i = 0; i < ARRAY_SIZE(tag_values); ++i) { + if (!strcmp(tag, tag_values[i].tag_value)) { + *tags |= tag_values[i].arg_tag; + found = true; + break; + } + } + + if (!found) { bpf_log(&env->log, "arg#%d has unsupported set of tags\n", arg_idx); return -EOPNOTSUPP; } -- cgit v1.2.3 From 80b89d0226a05e8b67969de99c31b51fcd54f76a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 28 May 2026 15:20:14 -0700 Subject: bpf: Take mmap_lock in zap_pages() zap_vma_range() requires the owning mm's mmap_lock to be held. Taking mmap_read_lock under arena->lock would AB-BA against arena_vm_close() and arena_map_mmap(), both of which run with mmap_write_lock held and then acquire arena->lock. Instead drop arena->lock, mmget_not_zero() the vma's mm, take mmap_read_lock, and re-resolve the vma via find_vma() since it may have been unmapped or replaced while waiting. 
Track processed vmls with a per-call generation in vml->zap_gen and serialize zap_pages() callers with a new arena->zap_mutex so concurrent callers on different uaddr ranges do not mark each other's vmls processed before the zap is done. Reported-by: David Hildenbrand Fixes: 317460317a02 ("bpf: Introduce bpf_arena.") Signed-off-by: Alexei Starovoitov Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260528222014.38980-1-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 1727503b25d8..9b2dea229b38 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -60,6 +60,8 @@ struct bpf_arena { struct list_head vma_list; /* protects vma_list */ struct mutex lock; + u64 zap_gen; + struct mutex zap_mutex; struct irq_work free_irq; struct work_struct free_work; struct llist_head free_spans; @@ -289,6 +291,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) if (err) goto err_free_scratch; mutex_init(&arena->lock); + mutex_init(&arena->zap_mutex); raw_res_spin_lock_init(&arena->spinlock); err = populate_pgtable_except_pte(arena); if (err) @@ -391,6 +394,7 @@ struct vma_list { struct vm_area_struct *vma; struct list_head head; refcount_t mmap_count; + u64 zap_gen; }; static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) @@ -403,6 +407,7 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) refcount_set(&vml->mmap_count, 1); vma->vm_private_data = vml; vml->vma = vma; + vml->zap_gen = 0; list_add(&vml->head, &arena->vma_list); return 0; } @@ -746,12 +751,60 @@ out_free_pages: */ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { + unsigned long size = (unsigned long)page_cnt << PAGE_SHIFT; + struct vm_area_struct *vma; + struct mm_struct *mm; struct vma_list *vml; + unsigned long 
vm_start; + u64 my_gen; - guard(mutex)(&arena->lock); - /* iterate link list under lock */ - list_for_each_entry(vml, &arena->vma_list, head) - zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt); + /* + * Taking mmap_read_lock() under arena->lock would deadlock against + * arena_vm_close(), which runs with mmap_write_lock held and then + * acquires arena->lock. Drop arena->lock for mmap_read_lock(). + * + * Use per-call my_gen, recorded in vml->zap_gen, to remember which + * vmls this invocation has already processed across the lock drop. + * Hold zap_mutex around the whole walk so concurrent zap_pages() + * callers cannot overwrite each other's marks on shared vmls -- + * otherwise call B's mark would make call A skip a vml that A has + * not yet zapped for A's uaddr range. + */ + mutex_lock(&arena->zap_mutex); + mutex_lock(&arena->lock); + my_gen = ++arena->zap_gen; + for (;;) { + mm = NULL; + list_for_each_entry(vml, &arena->vma_list, head) { + if (vml->zap_gen >= my_gen) + continue; + vml->zap_gen = my_gen; + if (!mmget_not_zero(vml->vma->vm_mm)) + continue; + mm = vml->vma->vm_mm; + vm_start = vml->vma->vm_start; + break; + } + if (!mm) + break; + mutex_unlock(&arena->lock); + + mmap_read_lock(mm); + /* + * Re-resolve: while we waited the VMA could have been unmapped + * and a different mapping installed at the same address. 
+ */ + vma = find_vma(mm, vm_start); + if (vma && vma->vm_start == vm_start && + vma->vm_file && vma->vm_file->private_data == &arena->map) + zap_vma_range(vma, uaddr, size); + mmap_read_unlock(mm); + mmput(mm); + + mutex_lock(&arena->lock); + } + mutex_unlock(&arena->lock); + mutex_unlock(&arena->zap_mutex); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) -- cgit v1.2.3 From bf29346fc39355cc57118e4e825109f66ac3542d Mon Sep 17 00:00:00 2001 From: "Alexis Lothoré (eBPF Foundation)" Date: Thu, 28 May 2026 15:27:14 +0200 Subject: selftests/bpf: ignore call depth accounting for retbleed in verifier tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running the selftests on a retbleed-affected platform (eg: Skylake), with call depth accounting enabled (CONFIG_CALL_DEPTH_TRACKING=y) _and_ with retbleed=stuff, some verifier selftests fail to validate the jited instructions. For example: MATCHED SUBSTR: ' endbr64' MATCHED SUBSTR: ' nopl (%rax,%rax)' MATCHED SUBSTR: ' xorq %rax, %rax' MATCHED SUBSTR: ' pushq %rbp' MATCHED SUBSTR: ' movq %rsp, %rbp' MATCHED SUBSTR: ' endbr64' MATCHED SUBSTR: ' cmpq $0x21, %rax' MATCHED SUBSTR: ' ja L0' MATCHED SUBSTR: ' pushq %rax' MATCHED SUBSTR: ' movq %rsp, %rax' MATCHED SUBSTR: ' jmp L1' MATCHED SUBSTR: 'L0: pushq %rax' MATCHED SUBSTR: 'L1: pushq %rax' MATCHED SUBSTR: ' movq -0x10(%rbp), %rax' WRONG LINE REGEX: ' callq 0x{{.*}}' Those affected selftests always fail on some call instruction: this failure is due to the JIT compiler emitting call depth accounting for retbleed mitigation (see x86_call_depth_emit_accounting calls in bpf_jit_comp.c), resulting in an additional instruction being inserted in front of every call instruction, similar to this one: sarq $0x5, %gs:-0x39882741(%rip) Fix those selftests by allowing them to ignore this possibly present call depth accounting instruction. 
Signed-off-by: Alexis Lothoré (eBPF Foundation) Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260528-fix_tests_for_retbleed_stuff-v1-1-c2022a1f3bee@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_private_stack.c | 5 +++++ tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c | 1 + 2 files changed, 6 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_private_stack.c b/tools/testing/selftests/bpf/progs/verifier_private_stack.c index 046f7445a458..bb8206e10880 100644 --- a/tools/testing/selftests/bpf/progs/verifier_private_stack.c +++ b/tools/testing/selftests/bpf/progs/verifier_private_stack.c @@ -94,6 +94,7 @@ __jited(" addq %gs:{{.*}}, %r9") __jited(" movl $0x2a, %edi") __jited(" movq %rdi, -0x200(%r9)") __jited(" pushq %r9") +__jited("...") __jited(" callq 0x{{.*}}") __jited(" popq %r9") __jited(" xorl %eax, %eax") @@ -153,11 +154,13 @@ __jited(" endbr64") __jited(" movabsq $0x{{.*}}, %r9") __jited(" addq %gs:{{.*}}, %r9") __jited(" pushq %r9") +__jited("...") __jited(" callq") __jited(" popq %r9") __jited(" movl $0x2a, %edi") __jited(" movq %rdi, -0x200(%r9)") __jited(" pushq %r9") +__jited("...") __jited(" callq") __jited(" popq %r9") __arch_arm64 @@ -199,6 +202,7 @@ __description("Private stack, exception in main prog") __success __retval(0) __arch_x86_64 __jited(" pushq %r9") +__jited("...") __jited(" callq") __jited(" popq %r9") __arch_arm64 @@ -246,6 +250,7 @@ __success __retval(0) __arch_x86_64 __jited(" movq %rdi, -0x200(%r9)") __jited(" pushq %r9") +__jited("...") __jited(" callq") __jited(" popq %r9") __arch_arm64 diff --git a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c index 8d60c634a114..48fa34d2959f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c +++ b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c @@ -56,6 +56,7 @@ __jited("L1: pushq %rax") /* rbp[-16] = rax */ 
* (cause original rax might be clobbered by this point) */ __jited(" movq -0x10(%rbp), %rax") +__jited("...") __jited(" callq 0x{{.*}}") /* call to sub() */ __jited(" xorl %eax, %eax") __jited(" leave") -- cgit v1.2.3 From 8f4fa9f89b72845fa8ac956bff2e1d2ba5722f2e Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:18 -0700 Subject: rhashtable: Add rhashtable_next_key() API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a simpler iteration mechanism for rhashtable that lets the caller continue from an arbitrary position by supplying the previous key, without the per-iterator state of the rhashtable_walk_* API. void *rhashtable_next_key(struct rhashtable *ht, const void *prev_key); Caller holds RCU; passes NULL prev_key for the first element or the previously returned key to advance. Walks tbl->future_tbl chain so in-flight rehashes are observed. Best-effort: in case of concurrent resize, provides no guarantees: - may produce duplicate elements - may skip any amount of elements - termination of the loop is not guaranteed in case of sustained rehash. Callers are advised to bound loop externally or avoid inserting new elements during such loop. Returns ERR_PTR(-ENOENT) if prev_key is not found. Behavior on tables with duplicate keys is undefined. rhltable is not supported — returns ERR_PTR(-EOPNOTSUPP). 
Signed-off-by: Mykyta Yatsenko Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20260605-rhash-v7-1-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- include/linux/rhashtable.h | 40 +++++++++++++++++++++++++++ lib/rhashtable.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index ef5230cece36..6f3aea498515 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -650,6 +650,46 @@ restart: return NULL; } +/** + * rhashtable_next_key - return next element after a given key + * @ht: hash table + * @prev_key: pointer to previous key, or NULL for the first element + * + * WARNING: this walk is highly unstable. Unlike rhashtable_walk_*(), + * it cannot detect a concurrent resize or rehash, so a full iteration + * is NOT guaranteed to terminate under adversarial or sustained + * rehashing. Callers MUST tolerate skipped and duplicated elements and + * SHOULD bound their loop externally. + * + * Returns the next element in best-effort iteration order, walking the + * @tbl chain (including any future_tbl in flight). Caller must hold RCU. + * + * Pass @prev_key == NULL to obtain the first element. To iterate, set + * @prev_key to the key of the previously returned element on each call, + * and stop when NULL is returned. + * + * Best-effort semantics: + * - Across the tbl->future_tbl chain, an element being migrated may + * transiently appear in both tables and be observed twice. + * - Concurrent inserts may or may not be observed. + * - Termination of a full iteration loop is NOT guaranteed under + * adversarial continuous rehash; callers MUST tolerate skips and + * repeats and SHOULD bound their loop externally. + * - Behavior on tables that contain duplicate keys is undefined: + * duplicates may be skipped, repeated, or trap the walk in a + * cycle. Callers requiring duplicate-key iteration must use + * rhashtable_walk_*() instead. 
+ * - rhltable instances are not supported and return + * ERR_PTR(-EOPNOTSUPP). + * - If prev_key was concurrently deleted and is not present in any + * in-flight table, returns ERR_PTR(-ENOENT). + * + * Returns entry of the next element, or NULL when iteration is exhausted, + * or ERR_PTR(-ENOENT) if prev_key is not found, or + * ERR_PTR(-EOPNOTSUPP) if @ht is an rhltable. + */ +void *rhashtable_next_key(struct rhashtable *ht, const void *prev_key); + /** * rhashtable_lookup - search hash table * @ht: hash table diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 04b3a808fca9..dd6eaa09c55d 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -687,6 +687,75 @@ void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); +/* Scan one element forward from prev_key's position in @tbl. + * Returns first rhash_head whose bucket > prev_key's bucket, or the + * element immediately after prev_key inside prev_key's bucket. + * Returns the first element if prev_key is NULL, NULL when @tbl is + * exhausted, or ERR_PTR(-ENOENT) if prev_key is not found in @tbl. + */ +static struct rhash_head *__rhashtable_next_in_table( + struct rhashtable *ht, struct bucket_table *tbl, + const void *prev_key) +{ + struct rhashtable_compare_arg arg = { .ht = ht, .key = prev_key }; + const struct rhashtable_params params = ht->p; + struct rhash_head *he; + unsigned int b = 0; + bool found = false; + + if (prev_key) { + b = rht_key_hashfn(ht, tbl, prev_key, params); + rht_for_each_rcu(he, tbl, b) { + bool match = params.obj_cmpfn + ? 
!params.obj_cmpfn(&arg, rht_obj(ht, he)) + : !rhashtable_compare(&arg, rht_obj(ht, he)); + if (found) { + if (match) + continue; + return he; + } + if (match) + found = true; + } + if (!found) + return ERR_PTR(-ENOENT); + b++; + } + + for (; b < tbl->size; b++) + rht_for_each_rcu(he, tbl, b) + return he; + return NULL; +} + +/** + * rhashtable_next_key - return next element after a given key + * + * See include/linux/rhashtable.h for the full contract. + */ +void *rhashtable_next_key(struct rhashtable *ht, const void *prev_key) +{ + struct bucket_table *tbl; + struct rhash_head *he; + + if (unlikely(ht->rhlist)) + return ERR_PTR(-EOPNOTSUPP); + + tbl = rht_dereference_rcu(ht->tbl, ht); + do { + he = __rhashtable_next_in_table(ht, tbl, prev_key); + if (!IS_ERR_OR_NULL(he)) + return rht_obj(ht, he); + if (!he) + prev_key = NULL; + /* See any new future_tbl attached during a rehash. */ + smp_rmb(); + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + } while (tbl); + return he; /* NULL or -ENOENT */ +} +EXPORT_SYMBOL_GPL(rhashtable_next_key); + /** * rhashtable_walk_enter - Initialise an iterator * @ht: Table to walk over -- cgit v1.2.3 From e673eee0f49ea41bf1af7aab08682ef98876c792 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:19 -0700 Subject: rhashtable: Add selftest for rhashtable_next_key() Insert n elements, then verify: - NULL prev_key walks from the beginning, visiting all n - non-existing prev_key returns ERR_PTR(-ENOENT) Signed-off-by: Mykyta Yatsenko Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20260605-rhash-v7-2-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- lib/test_rhashtable.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 0b33559a910b..b767a38a74f9 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -679,6 +679,78 @@ out: return err; } +static int __init test_rhashtable_next_key(void) 
+{ + struct rhashtable_params params = test_rht_params; + struct test_obj_val key_missing = { .id = 99999, .tid = 0 }; + struct test_obj_val *prev_key = NULL; + struct rhashtable ht; + struct test_obj *objs, *cur; + int i, count = 0, err; + int visited_keys[8] = { 0 }; + const int n = ARRAY_SIZE(visited_keys); + + params.nelem_hint = n; + + err = rhashtable_init(&ht, ¶ms); + if (err) + return err; + + objs = kcalloc(n, sizeof(*objs), GFP_KERNEL); + if (!objs) { + rhashtable_destroy(&ht); + return -ENOMEM; + } + + for (i = 0; i < n; i++) { + objs[i].value.id = i; + err = rhashtable_insert_fast(&ht, &objs[i].node, params); + if (err) + goto out; + } + + rcu_read_lock(); + + /* NULL prev_key: walk from the beginning, expect all n elements. */ + while ((cur = rhashtable_next_key(&ht, prev_key))) { + if (IS_ERR(cur)) { + err = -EINVAL; + goto unlock; + } + count++; + prev_key = &cur->value; + visited_keys[cur->value.id] = 1; + if (count > n) + break; + } + + if (count != n) { + err = -EINVAL; + goto unlock; + } + + for (i = 0; i < n; i++) { + if (!visited_keys[i]) { + err = -EINVAL; + goto unlock; + } + } + + /* Non-existing prev_key: must return ERR_PTR(-ENOENT). */ + cur = rhashtable_next_key(&ht, &key_missing); + if (!IS_ERR(cur) || PTR_ERR(cur) != -ENOENT) + err = -EINVAL; + +unlock: + rcu_read_unlock(); +out: + for (i = 0; i < n; i++) + rhashtable_remove_fast(&ht, &objs[i].node, params); + kfree(objs); + rhashtable_destroy(&ht); + return err; +} + static int __init test_rht_init(void) { unsigned int entries; @@ -738,6 +810,9 @@ static int __init test_rht_init(void) test_insert_duplicates_run(); + pr_info("Testing rhashtable_next_key: %s\n", + test_rhashtable_next_key() == 0 ? 
"pass" : "FAIL"); + if (!tcount) return 0; -- cgit v1.2.3 From 46730ee6e884be667365e4d3a380ac504697559a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 5 Jun 2026 04:41:20 -0700 Subject: rhashtable: Use irq work for shrinking Use irq work for automatic shrinking so that this may be called in NMI context. Signed-off-by: Herbert Xu Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-3-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- include/linux/rhashtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 6f3aea498515..3de3412d53c8 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1157,7 +1157,7 @@ unlocked: atomic_dec(&ht->nelems); if (unlikely(ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))) - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); err = 0; } -- cgit v1.2.3 From 16b4d3e2fb24aac3e68a8d86e3bc5e302e1b5cb7 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:21 -0700 Subject: bpf: Implement resizable hashmap basic functions Use rhashtable_lookup_likely() for lookups, rhashtable_remove_fast() for deletes, and rhashtable_lookup_get_insert_fast() for inserts. Updates modify values in place under RCU rather than allocating a new element and swapping the pointer (as regular htab does). This trades read consistency for performance: concurrent readers may see partial updates. BPF_F_LOCK support and special-field handling (timers, kptrs, etc.) follow in a later commit. Initialize rhashtable with bpf_mem_alloc element cache. Require BPF_F_NO_PREALLOC. Limit max_entries to 2^31. Free elements via rhashtable_free_and_destroy(). 
Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-4-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 6 + kernel/bpf/hashtab.c | 311 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 + kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 6 + 6 files changed, 328 insertions(+) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b13de31e163f..56e4c3f983d3 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -134,6 +134,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_RHASH, rhtab_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aec171ccb6ef..bed9b1b4d5ef 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1047,6 +1047,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_INSN_ARRAY, + BPF_MAP_TYPE_RHASH, __MAX_BPF_MAP_TYPE }; @@ -1545,6 +1546,11 @@ union bpf_attr { * * BPF_MAP_TYPE_ARENA - contains the address where user space * is going to mmap() the arena. It has to be page aligned. + * + * BPF_MAP_TYPE_RHASH - initial table size hint + * (nelem_hint). 0 = use rhashtable default. Must be + * <= min(max_entries, U16_MAX). Upper 32 bits reserved, + * must be zero. 
*/ __u64 map_extra; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3dd9b4924ae4..10f3a058747b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -2739,3 +2740,313 @@ const struct bpf_map_ops htab_of_maps_map_ops = { BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], }; + +struct rhtab_elem { + struct rhash_head node; + /* key bytes, then value bytes follow */ + u8 data[] __aligned(8); +}; + +struct bpf_rhtab { + struct bpf_map map; + struct rhashtable ht; + struct bpf_mem_alloc ma; + u32 elem_size; +}; + +static const struct rhashtable_params rhtab_params = { + .head_offset = offsetof(struct rhtab_elem, node), + .key_offset = offsetof(struct rhtab_elem, data), +}; + +static inline void *rhtab_elem_value(struct rhtab_elem *l, u32 key_size) +{ + return l->data + round_up(key_size, 8); +} + +static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr) +{ + struct rhashtable_params params; + struct bpf_rhtab *rhtab; + int err = 0; + + rhtab = bpf_map_area_alloc(sizeof(*rhtab), NUMA_NO_NODE); + if (!rhtab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rhtab->map, attr); + + if (rhtab->map.max_entries > 1UL << 31) { + err = -E2BIG; + goto free_rhtab; + } + + rhtab->elem_size = sizeof(struct rhtab_elem) + round_up(rhtab->map.key_size, 8) + + round_up(rhtab->map.value_size, 8); + + params = rhtab_params; + params.key_len = rhtab->map.key_size; + params.nelem_hint = (u32)attr->map_extra; + params.automatic_shrinking = true; + + err = rhashtable_init(&rhtab->ht, ¶ms); + if (err) + goto free_rhtab; + + /* Set max_elems after rhashtable_init() since init zeroes the struct */ + rhtab->ht.max_elems = rhtab->map.max_entries; + + err = bpf_mem_alloc_init(&rhtab->ma, rhtab->elem_size, false); + if (err) + goto destroy_rhtab; + + return &rhtab->map; + +destroy_rhtab: + rhashtable_destroy(&rhtab->ht); +free_rhtab: + bpf_map_area_free(rhtab); + return ERR_PTR(err); 
+} + +static int rhtab_map_alloc_check(union bpf_attr *attr) +{ + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) + return -EINVAL; + + if (attr->map_flags & BPF_F_ZERO_SEED) + return -EINVAL; + + if (attr->key_size > U16_MAX) + return -E2BIG; + + if (attr->map_extra >> 32) + return -EINVAL; + + if ((u32)attr->map_extra > U16_MAX) + return -E2BIG; + + if ((u32)attr->map_extra > attr->max_entries) + return -EINVAL; + + return htab_map_alloc_check(attr); +} + +static void rhtab_free_elem(void *ptr, void *arg) +{ + struct bpf_rhtab *rhtab = arg; + struct rhtab_elem *elem = ptr; + + bpf_mem_cache_free_rcu(&rhtab->ma, elem); +} + +static void rhtab_map_free(struct bpf_map *map) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + rhashtable_free_and_destroy(&rhtab->ht, rhtab_free_elem, rhtab); + bpf_mem_alloc_destroy(&rhtab->ma); + bpf_map_area_free(rhtab); +} + +static void *rhtab_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + /* Hold RCU lock in case sleepable program calls via gen_lookup */ + guard(rcu)(); + + return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params); +} + +static void *rhtab_map_lookup_elem(struct bpf_map *map, void *key) __must_hold(RCU) +{ + struct rhtab_elem *l; + + l = rhtab_lookup_elem(map, key); + return l ? rhtab_elem_value(l, map->key_size) : NULL; +} + +static void rhtab_read_elem_value(struct bpf_map *map, void *dst, struct rhtab_elem *elem, + u64 flags) +{ + void *src = rhtab_elem_value(elem, map->key_size); + + if (flags & BPF_F_LOCK) + copy_map_value_locked(map, dst, src, true); + else + copy_map_value(map, dst, src); +} + +static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, void *copy, + u64 flags) +{ + int err; + + /* + * disable_instrumentation() mitigates the deadlock for programs running in NMI context. + * rhashtable locks bucket with local_irq_save(). 
Only NMI programs may reenter + * rhashtable code, bpf_disable_instrumentation() disables programs running in NMI, except + * raw tracepoints, which we don't have in rhashtable. + */ + bpf_disable_instrumentation(); + err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params); + bpf_enable_instrumentation(); + + if (err) + return err; + + if (copy) { + rhtab_read_elem_value(&rhtab->map, copy, elem, flags); + check_and_init_map_value(&rhtab->map, copy); + } + + bpf_mem_cache_free_rcu(&rhtab->ma, elem); + return 0; +} + + +static long rhtab_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + + guard(rcu)(); + + elem = rhtab_lookup_elem(map, key); + if (!elem) + return -ENOENT; + + return rhtab_delete_elem(rhtab, elem, NULL, 0); +} + +static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + int err; + + err = bpf_map_check_op_flags(map, flags, BPF_F_LOCK); + if (err) + return err; + + guard(rcu)(); + + elem = rhtab_lookup_elem(map, key); + if (!elem) + return -ENOENT; + + return rhtab_delete_elem(rhtab, elem, value, flags); +} + +static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value, + u64 map_flags) +{ + void *old_val = rhtab_elem_value(elem, map->key_size); + + if (map_flags & BPF_NOEXIST) + return -EEXIST; + + if (map_flags & BPF_F_LOCK) + copy_map_value_locked(map, old_val, value, false); + else + copy_map_value(map, old_val, value); + return 0; +} + +static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem, *tmp; + + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) + return -EINVAL; + + if ((map_flags & BPF_F_LOCK) && 
!btf_record_has_field(map->record, BPF_SPIN_LOCK)) + return -EINVAL; + + guard(rcu)(); + elem = rhtab_lookup_elem(map, key); + if (elem) + return rhtab_map_update_existing(map, elem, value, map_flags); + + if (map_flags & BPF_EXIST) + return -ENOENT; + + /* Check max_entries limit before inserting new element */ + if (atomic_read(&rhtab->ht.nelems) >= map->max_entries) + return -E2BIG; + + elem = bpf_mem_cache_alloc(&rhtab->ma); + if (!elem) + return -ENOMEM; + + memcpy(elem->data, key, map->key_size); + copy_map_value(map, rhtab_elem_value(elem, map->key_size), value); + + /* Prevent deadlock for NMI programs attempting to take bucket lock */ + bpf_disable_instrumentation(); + tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params); + bpf_enable_instrumentation(); + + if (tmp) { + bpf_mem_cache_free(&rhtab->ma, elem); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + return rhtab_map_update_existing(map, tmp, value, map_flags); + } + + return 0; +} + +static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + BUILD_BUG_ON(!__same_type(&rhtab_lookup_elem, + (void *(*)(struct bpf_map *map, void *key)) NULL)); + *insn++ = BPF_EMIT_CALL(rhtab_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct rhtab_elem, data) + round_up(map->key_size, 8)); + + return insn - insn_buf; +} + +static void rhtab_map_free_internal_structs(struct bpf_map *map) +{ +} + +static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EOPNOTSUPP; +} + +static u64 rhtab_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + u64 num_entries; + + /* Excludes rhashtable bucket overhead (~ nelems * sizeof(void *) at 75% load). 
*/ + num_entries = atomic_read(&rhtab->ht.nelems); + return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries; +} + +BTF_ID_LIST_SINGLE(rhtab_map_btf_ids, struct, bpf_rhtab) +const struct bpf_map_ops rhtab_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = rhtab_map_alloc_check, + .map_alloc = rhtab_map_alloc, + .map_free = rhtab_map_free, + .map_get_next_key = rhtab_map_get_next_key, + .map_release_uref = rhtab_map_free_internal_structs, + .map_lookup_elem = rhtab_map_lookup_elem, + .map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem, + .map_update_elem = rhtab_map_update_elem, + .map_delete_elem = rhtab_map_delete_elem, + .map_gen_lookup = rhtab_map_gen_lookup, + .map_mem_usage = rhtab_map_mem_usage, + .map_btf_id = &rhtab_map_btf_ids[0], +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 625a4366fe6d..1faae184de48 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1398,6 +1398,7 @@ static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_ver if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && attr->map_type != BPF_MAP_TYPE_ARENA && + attr->map_type != BPF_MAP_TYPE_RHASH && attr->map_extra != 0) { bpf_log(log, "Invalid map_extra.\n"); return -EINVAL; @@ -1469,6 +1470,7 @@ static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_ver case BPF_MAP_TYPE_CGROUP_ARRAY: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_RHASH: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: @@ -2259,6 +2261,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_RHASH || map->map_type == BPF_MAP_TYPE_STACK_TRACE) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c 
index 8ed484cb1a8a..7d27ba396d32 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17657,6 +17657,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, if (prog->sleepable) switch (map->map_type) { case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_RHASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_HASH: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 37142e6d911a..7d0b282ba674 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1047,6 +1047,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_INSN_ARRAY, + BPF_MAP_TYPE_RHASH, __MAX_BPF_MAP_TYPE }; @@ -1545,6 +1546,11 @@ union bpf_attr { * * BPF_MAP_TYPE_ARENA - contains the address where user space * is going to mmap() the arena. It has to be page aligned. + * + * BPF_MAP_TYPE_RHASH - initial table size hint + * (nelem_hint). 0 = use rhashtable default. Must be + * <= min(max_entries, U16_MAX). Upper 32 bits reserved, + * must be zero. */ __u64 map_extra; -- cgit v1.2.3 From 818e0084822742fc00eacbf5df3476a5e72c7d0e Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:22 -0700 Subject: bpf: Implement iteration ops for resizable hashtab MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement get_next_key, batch lookup/lookup-and-delete, for_each_map_elem callback, and the seq_file BPF iterator for BPF_MAP_TYPE_RHASH. get_next_key() and batch use rhashtable_next_key() — stateless, matches the syscall UAPI shape (no kernel-side iterator state). get_next_key falls back to the first key when prev_key was concurrently deleted (matches htab semantics). Batch reports cursor loss as -EAGAIN so userspace can distinguish it from end-of-iteration (-ENOENT) and restart from NULL. The seq_file BPF iterator uses rhashtable_walk_* instead. 
It runs only from read() syscall context, so the walker's spin_lock is safe, and seq_file's per-fd state lets the walker handle rehash correctly (retry on -EAGAIN) for stronger coverage than the stateless API can provide. Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-5-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 347 +++++++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/map_iter.c | 3 +- 2 files changed, 348 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 10f3a058747b..a149713d0953 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -3020,8 +3020,79 @@ static void rhtab_map_free_internal_structs(struct bpf_map *map) } static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) + __must_hold_shared(RCU) { - return -EOPNOTSUPP; + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + + elem = rhashtable_next_key(&rhtab->ht, key); + + /* if not found, return the first key */ + if (PTR_ERR(elem) == -ENOENT) + elem = rhashtable_next_key(&rhtab->ht, NULL); + + if (IS_ERR(elem)) + return PTR_ERR(elem); + if (!elem) + return -ENOENT; + + memcpy(next_key, elem->data, map->key_size); + return 0; +} + +static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) +{ + void *value; + + /* Guarantee that hashtab value is not freed */ + guard(rcu)(); + + value = rhtab_map_lookup_elem(map, key); + if (!value) + return; + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_putc(m, '\n'); +} + +static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn, + void *callback_ctx, u64 flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void *prev_key = NULL; + struct rhtab_elem *elem; + int num_elems = 0; + u64 ret 
= 0; + + cant_migrate(); + + if (flags != 0) + return -EINVAL; + + rcu_read_lock(); + /* + * Best-effort iteration: if rhashtable is concurrently resized or + * elements are deleted/inserted, there may be missed or duplicate + * elements visited. + */ + while ((elem = rhashtable_next_key(&rhtab->ht, prev_key))) { + if (IS_ERR(elem)) + break; + num_elems++; + ret = callback_fn((u64)(long)map, + (u64)(long)elem->data, + (u64)(long)rhtab_elem_value(elem, map->key_size), + (u64)(long)callback_ctx, 0); + if (ret) + break; + + prev_key = elem->data; /* valid while RCU held */ + } + rcu_read_unlock(); + + return num_elems; } static u64 rhtab_map_mem_usage(const struct bpf_map *map) @@ -3034,6 +3105,275 @@ static u64 rhtab_map_mem_usage(const struct bpf_map *map) return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries; } +static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr, + bool do_delete) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void __user *uvalues = u64_to_user_ptr(attr->batch.values); + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void *cursor = NULL, *keys = NULL, *values = NULL, *dst_key, *dst_val; + struct rhtab_elem **del_elems = NULL; + u32 max_count, total, key_size, value_size, i; + bool has_next_cursor = false; + struct rhtab_elem *elem; + u64 elem_map_flags, map_flags; + int ret = 0; + + elem_map_flags = attr->batch.elem_flags; + ret = bpf_map_check_op_flags(map, elem_map_flags, BPF_F_LOCK); + if (ret) + return ret; + + map_flags = attr->batch.flags; + if (map_flags) + return -EINVAL; + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + key_size = map->key_size; + value_size = map->value_size; + + keys = kvmalloc_array(max_count, key_size, GFP_USER | __GFP_NOWARN); + values = 
kvmalloc_array(max_count, value_size, GFP_USER | __GFP_NOWARN); + if (do_delete) + del_elems = kvmalloc_array(max_count, sizeof(void *), + GFP_USER | __GFP_NOWARN); + cursor = kmalloc(key_size, GFP_USER | __GFP_NOWARN); + + if (!keys || !values || !cursor || (do_delete && !del_elems)) { + ret = -ENOMEM; + goto free; + } + + if (ubatch && copy_from_user(cursor, ubatch, key_size)) { + ret = -EFAULT; + goto free; + } + + dst_key = keys; + dst_val = values; + total = 0; + + rcu_read_lock(); + + /* + * Cursor stores the key of the next-to-process element (stashed by + * the previous batch). Look it up directly so the element is included + * here rather than skipped by next_key(). If the cursor was deleted + * concurrently (or by the previous do_delete batch), return -EAGAIN + * so userspace can distinguish a lost cursor from end-of-iteration + * (-ENOENT) and restart from a NULL cursor. + */ + if (ubatch) { + elem = rhtab_lookup_elem(map, cursor); + if (!elem) { + rcu_read_unlock(); + ret = -EAGAIN; + goto free; + } + } else { + elem = rhashtable_next_key(&rhtab->ht, NULL); + } + + while (elem && !IS_ERR(elem) && total < max_count) { + memcpy(dst_key, elem->data, key_size); + rhtab_read_elem_value(map, dst_val, elem, elem_map_flags); + check_and_init_map_value(map, dst_val); + + if (do_delete) + del_elems[total] = elem; + + elem = rhashtable_next_key(&rhtab->ht, dst_key); + dst_key += key_size; + dst_val += value_size; + total++; + + /* Bail to userspace to avoid stalls. */ + if (need_resched()) + break; + } + + if (elem && !IS_ERR(elem)) { + /* Stash next-to-process key as cursor for the next batch. */ + memcpy(cursor, elem->data, key_size); + has_next_cursor = true; + } + + if (do_delete) { + for (i = 0; i < total; i++) + rhtab_delete_elem(rhtab, del_elems[i], NULL, 0); + } + + rcu_read_unlock(); + + if (total == 0) { + ret = -ENOENT; + goto free; + } + + /* No more elements after this batch. 
*/ + if (!has_next_cursor) + ret = -ENOENT; + + if (copy_to_user(ukeys, keys, (size_t)total * key_size) || + copy_to_user(uvalues, values, (size_t)total * value_size) || + put_user(total, &uattr->batch.count) || + (has_next_cursor && + copy_to_user(u64_to_user_ptr(attr->batch.out_batch), + cursor, key_size))) { + ret = -EFAULT; + goto free; + } + +free: + kfree(cursor); + kvfree(keys); + kvfree(values); + kvfree(del_elems); + return ret; +} + +static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, false); +} + +static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, true); +} + +struct bpf_iter_seq_rhash_map_info { + struct bpf_map *map; + struct bpf_rhtab *rhtab; + struct rhashtable_iter iter; +}; + +static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct rhtab_elem *elem; + + rhashtable_walk_start(&info->iter); + /* + * Re-deliver the element returned by walk_next() at the end of the + * previous read() — bpf_seq_read may have stopped before show() + * consumed it. Rehash rewinds the walker; retry on -EAGAIN. + */ + do { + elem = rhashtable_walk_peek(&info->iter); + } while (PTR_ERR(elem) == -EAGAIN); + + if (IS_ERR(elem)) + return NULL; + + if (elem && *pos == 0) + ++*pos; + return elem; +} + +static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct rhtab_elem *elem; + + ++*pos; + + /* Rehash rewinds the walker; retry until it stops returning -EAGAIN. 
*/ + do { + elem = rhashtable_walk_next(&info->iter); + } while (PTR_ERR(elem) == -EAGAIN); + + if (IS_ERR(elem)) + return NULL; + return elem; +} + +static int __bpf_rhash_map_seq_show(struct seq_file *seq, + struct rhtab_elem *elem) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, elem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + ctx.key = elem->data; + ctx.value = rhtab_elem_value(elem, info->map->key_size); + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_rhash_map_seq_show(seq, v); +} + +static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + + if (!v) + (void)__bpf_rhash_map_seq_show(seq, NULL); + + rhashtable_walk_stop(&info->iter); +} + +static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_rhash_map_info *info = priv_data; + struct bpf_map *map = aux->map; + + bpf_map_inc_with_uref(map); + info->map = map; + info->rhtab = container_of(map, struct bpf_rhtab, map); + rhashtable_walk_enter(&info->rhtab->ht, &info->iter); + return 0; +} + +static void bpf_iter_fini_rhash_map(void *priv_data) +{ + struct bpf_iter_seq_rhash_map_info *info = priv_data; + + rhashtable_walk_exit(&info->iter); + bpf_map_put_with_uref(info->map); +} + +static const struct seq_operations bpf_rhash_map_seq_ops = { + .start = bpf_rhash_map_seq_start, + .next = bpf_rhash_map_seq_next, + .stop = bpf_rhash_map_seq_stop, + .show = bpf_rhash_map_seq_show, +}; + +static const struct bpf_iter_seq_info rhash_iter_seq_info = { + .seq_ops = &bpf_rhash_map_seq_ops, + .init_seq_private = bpf_iter_init_rhash_map, + .fini_seq_private = 
bpf_iter_fini_rhash_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_rhash_map_info), +}; + BTF_ID_LIST_SINGLE(rhtab_map_btf_ids, struct, bpf_rhtab) const struct bpf_map_ops rhtab_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -3047,6 +3387,11 @@ const struct bpf_map_ops rhtab_map_ops = { .map_update_elem = rhtab_map_update_elem, .map_delete_elem = rhtab_map_delete_elem, .map_gen_lookup = rhtab_map_gen_lookup, + .map_seq_show_elem = rhtab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_each_rhash_elem, .map_mem_usage = rhtab_map_mem_usage, + BATCH_OPS(rhtab), .map_btf_id = &rhtab_map_btf_ids[0], + .iter_seq_info = &rhash_iter_seq_info, }; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index ae0741a09c6d..c19b360bad9e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -123,7 +123,8 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, is_percpu = true; else if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_RHASH) goto put_map; key_acc_size = prog->aux->max_rdonly_access; -- cgit v1.2.3 From 6905f8601298ecd2d1932a4b4849bf265201118e Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:23 -0700 Subject: bpf: Allow special fields in resizable hashtab Add support for timers, workqueues, task work, spin locks and kptrs. Without this, users needing deferred callbacks, BPF_F_LOCK, or refcounted kernel pointers in a dynamically-sized map have no option - fixed-size htab is the only map supporting these field types. Resizable hashtab should offer the same capability. kptr semantics under in-place updates are identical to array map. 
Properly clean up BTF record fields on element delete and map teardown by wiring up bpf_obj_free_fields through a memory allocator destructor, matching the pattern used by htab for non-prealloc maps. Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-6-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/bpf/syscall.c | 3 ++ 2 files changed, 104 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a149713d0953..7b9408b8320c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -497,28 +497,26 @@ static void htab_dtor_ctx_free(void *ctx) kfree(ctx); } -static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *)) +static int bpf_ma_set_dtor(struct bpf_map *map, struct bpf_mem_alloc *ma, + void (*dtor)(void *, void *)) { - u32 key_size = htab->map.key_size; - struct bpf_mem_alloc *ma; struct htab_btf_record *hrec; int err; /* No need for dtors. */ - if (IS_ERR_OR_NULL(htab->map.record)) + if (IS_ERR_OR_NULL(map->record)) return 0; hrec = kzalloc(sizeof(*hrec), GFP_KERNEL); if (!hrec) return -ENOMEM; - hrec->key_size = key_size; - hrec->record = btf_record_dup(htab->map.record); + hrec->key_size = map->key_size; + hrec->record = btf_record_dup(map->record); if (IS_ERR(hrec->record)) { err = PTR_ERR(hrec->record); kfree(hrec); return err; } - ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma; bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec); return 0; } @@ -535,9 +533,9 @@ static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf, * populated in htab_map_alloc(), so it will always appear as NULL. 
*/ if (htab_is_percpu(htab)) - return htab_set_dtor(htab, htab_pcpu_mem_dtor); + return bpf_ma_set_dtor(map, &htab->pcpu_ma, htab_pcpu_mem_dtor); else - return htab_set_dtor(htab, htab_mem_dtor); + return bpf_ma_set_dtor(map, &htab->ma, htab_mem_dtor); } static struct bpf_map *htab_map_alloc(union bpf_attr *attr) @@ -2752,6 +2750,7 @@ struct bpf_rhtab { struct rhashtable ht; struct bpf_mem_alloc ma; u32 elem_size; + bool freeing_internal; }; static const struct rhashtable_params rhtab_params = { @@ -2832,11 +2831,34 @@ static int rhtab_map_alloc_check(union bpf_attr *attr) return htab_map_alloc_check(attr); } +static void rhtab_check_and_free_fields(struct bpf_rhtab *rhtab, + struct rhtab_elem *elem) +{ + if (IS_ERR_OR_NULL(rhtab->map.record)) + return; + + bpf_obj_free_fields(rhtab->map.record, + rhtab_elem_value(elem, rhtab->map.key_size)); +} + +static void rhtab_mem_dtor(void *obj, void *ctx) +{ + struct htab_btf_record *hrec = ctx; + struct rhtab_elem *elem = obj; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + bpf_obj_free_fields(hrec->record, + rhtab_elem_value(elem, hrec->key_size)); +} + static void rhtab_free_elem(void *ptr, void *arg) { struct bpf_rhtab *rhtab = arg; struct rhtab_elem *elem = ptr; + bpf_map_free_internal_structs(&rhtab->map, rhtab_elem_value(elem, rhtab->map.key_size)); bpf_mem_cache_free_rcu(&rhtab->ma, elem); } @@ -2900,7 +2922,8 @@ static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, v rhtab_read_elem_value(&rhtab->map, copy, elem, flags); check_and_init_map_value(&rhtab->map, copy); } - + /* Release internal structs: kptr, bpf_timer, task_work, wq */ + rhtab_check_and_free_fields(rhtab, elem); bpf_mem_cache_free_rcu(&rhtab->ma, elem); return 0; } @@ -2942,6 +2965,7 @@ static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value, u64 map_flags) { + struct bpf_rhtab *rhtab = container_of(map, 
struct bpf_rhtab, map); void *old_val = rhtab_elem_value(elem, map->key_size); if (map_flags & BPF_NOEXIST) @@ -2951,6 +2975,17 @@ static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *el copy_map_value_locked(map, old_val, value, false); else copy_map_value(map, old_val, value); + + /* + * Torn reads: a concurrent reader without BPF_F_LOCK may observe + * the value mid-copy. Callers requiring consistent reads must use + * BPF_F_LOCK, matching arraymap semantics. + * + * copy_map_value() skips special-field offsets, so old timers/ + * kptrs/etc. still sit in the slot. Cancel them after the copy + * to match arraymap's update semantics. + */ + rhtab_check_and_free_fields(rhtab, elem); return 0; } @@ -2973,6 +3008,14 @@ static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u if (map_flags & BPF_EXIST) return -ENOENT; + /* + * Reject new insertions while map_release_uref cleanup walks the + * table. Without this, new elements could keep triggering rehash + * and prevent the walk from terminating. 
+ */ + if (READ_ONCE(rhtab->freeing_internal)) + return -EBUSY; + /* Check max_entries limit before inserting new element */ if (atomic_read(&rhtab->ht.nelems) >= map->max_entries) return -E2BIG; @@ -2983,6 +3026,7 @@ static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u memcpy(elem->data, key, map->key_size); copy_map_value(map, rhtab_elem_value(elem, map->key_size), value); + check_and_init_map_value(map, rhtab_elem_value(elem, map->key_size)); /* Prevent deadlock for NMI programs attempting to take bucket lock */ bpf_disable_instrumentation(); @@ -3015,8 +3059,54 @@ static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } +static int rhtab_map_check_btf(struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + return bpf_ma_set_dtor(map, &rhtab->ma, rhtab_mem_dtor); +} + static void rhtab_map_free_internal_structs(struct bpf_map *map) { + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhashtable_iter iter; + struct rhtab_elem *elem; + + if (!bpf_map_has_internal_structs(map)) + return; + + /* + * Block new insertions. Once observed, no new growth is triggered, + * so any in-flight rehash will drain and the walker is guaranteed + * to stop returning -EAGAIN. Treat -EAGAIN as "rehash in progress, + * retry"; do not wait for the worker. 
+ */ + WRITE_ONCE(rhtab->freeing_internal, true); + + rhashtable_walk_enter(&rhtab->ht, &iter); + rhashtable_walk_start(&iter); + + while ((elem = rhashtable_walk_next(&iter))) { + if (IS_ERR(elem)) { + if (PTR_ERR(elem) == -EAGAIN) + continue; + break; + } + + bpf_map_free_internal_structs(map, rhtab_elem_value(elem, map->key_size)); + + if (need_resched()) { /* Avoid stalls on large maps */ + rhashtable_walk_stop(&iter); + cond_resched(); + rhashtable_walk_start(&iter); + } + } + + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + WRITE_ONCE(rhtab->freeing_internal, false); } static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -3382,6 +3472,7 @@ const struct bpf_map_ops rhtab_map_ops = { .map_free = rhtab_map_free, .map_get_next_key = rhtab_map_get_next_key, .map_release_uref = rhtab_map_free_internal_structs, + .map_check_btf = rhtab_map_check_btf, .map_lookup_elem = rhtab_map_lookup_elem, .map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem, .map_update_elem = rhtab_map_update_elem, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1faae184de48..31a3b70a0b5d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1280,6 +1280,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && @@ -1294,6 +1295,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_WORKQUEUE: case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { ret = -EOPNOTSUPP; @@ -1305,6 +1307,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_KPTR_PERCPU: case 
BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && -- cgit v1.2.3 From 9dcbb5045fe5a00f04c99aede5a10726f8bb937b Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:24 -0700 Subject: bpf: Optimize word-sized keys for resizable hashtable Specialize the lookup/update/delete paths for keys whose size matches sizeof(long) (4 bytes on 32-bit, 8 bytes on 64-bit). A static-const rhashtable_params lets the compiler inline a custom XOR-fold hashfn and a single-word equality cmpfn, eliminating the indirect jhash dispatch. The same hashfn and cmpfn are installed into rhashtable's stored params at rhashtable_init time, so the rehash worker, slow-path inserts, and rhashtable_next_key() all agree with the inlined fast paths. The seq_file BPF iterator uses rhashtable_walk_* and is unaffected. Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-7-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 7b9408b8320c..b4366cad3cfa 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2763,6 +2763,31 @@ static inline void *rhtab_elem_value(struct rhtab_elem *l, u32 key_size) return l->data + round_up(key_size, 8); } +/* Specialize hash function and objcmp for long sized key */ +static __always_inline int rhtab_key_cmp_long(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const unsigned long key1 = *(const unsigned long *)arg->key; + const struct rhtab_elem *key2 = ptr; + + return key1 != *(const unsigned long *)key2->data; +} + +static __always_inline u32 rhtab_hashfn_long(const void *data, u32 len, u32 seed) +{ + u64 k = *(const unsigned long 
*)data; + + return (u32)(k ^ (k >> 32)) ^ seed; +} + +static const struct rhashtable_params rhtab_params_long = { + .head_offset = offsetof(struct rhtab_elem, node), + .key_offset = offsetof(struct rhtab_elem, data), + .key_len = sizeof(long), + .hashfn = rhtab_hashfn_long, + .obj_cmpfn = rhtab_key_cmp_long, +}; + static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr) { struct rhashtable_params params; @@ -2788,6 +2813,11 @@ static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr) params.nelem_hint = (u32)attr->map_extra; params.automatic_shrinking = true; + if (rhtab->map.key_size == sizeof(long)) { + params.hashfn = rhtab_hashfn_long; + params.obj_cmpfn = rhtab_key_cmp_long; + } + err = rhashtable_init(&rhtab->ht, &params); if (err) goto free_rhtab; @@ -2878,6 +2908,9 @@ static void *rhtab_lookup_elem(struct bpf_map *map, void *key) /* Hold RCU lock in case sleepable program calls via gen_lookup */ guard(rcu)(); + if (map->key_size == sizeof(long)) + return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params_long); + return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params); } @@ -2912,7 +2945,12 @@ static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, v * raw tracepoints, which we don't have in rhashtable. 
*/ bpf_disable_instrumentation(); - err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params); + + if (rhtab->map.key_size == sizeof(long)) + err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params_long); + else + err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params); + bpf_enable_instrumentation(); if (err) @@ -3030,7 +3068,12 @@ static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u /* Prevent deadlock for NMI programs attempting to take bucket lock */ bpf_disable_instrumentation(); - tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params); + + if (map->key_size == sizeof(long)) + tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params_long); + else + tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params); + bpf_enable_instrumentation(); if (tmp) { -- cgit v1.2.3 From 6e46ff0abefde32c2341ca2c61ab1f8855e8cac9 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:25 -0700 Subject: libbpf: Support resizable hashtable Add BPF_MAP_TYPE_RHASH to libbpf's map type name table and feature probing so that libbpf-based tools can create and identify resizable hash maps. 
Signed-off-by: Mykyta Yatsenko Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260605-rhash-v7-8-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 1 + tools/lib/bpf/libbpf_probes.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ab2071fdd3e8..1354bcbc8b30 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -192,6 +192,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage", [BPF_MAP_TYPE_ARENA] = "arena", [BPF_MAP_TYPE_INSN_ARRAY] = "insn_array", + [BPF_MAP_TYPE_RHASH] = "rhash", }; static const char * const prog_type_name[] = { diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index b70d9637ecf5..e40819465ddc 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -309,6 +309,9 @@ static int probe_map_create(enum bpf_map_type map_type) value_size = sizeof(__u64); opts.map_flags = BPF_F_NO_PREALLOC; break; + case BPF_MAP_TYPE_RHASH: + opts.map_flags = BPF_F_NO_PREALLOC; + break; case BPF_MAP_TYPE_CGROUP_STORAGE: case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: key_size = sizeof(struct bpf_cgroup_storage_key); -- cgit v1.2.3 From 249996365b66d09db31bbf3a86c07715f47ea133 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:26 -0700 Subject: selftests/bpf: Add basic tests for resizable hash map Test basic map operations (lookup, update, delete) for BPF_MAP_TYPE_RHASH including boundary conditions like duplicate key insertion and deletion of nonexistent keys. 
Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-9-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/rhash.c | 120 ++++++++++++ tools/testing/selftests/bpf/progs/rhash.c | 248 +++++++++++++++++++++++++ 2 files changed, 368 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/rhash.c create mode 100644 tools/testing/selftests/bpf/progs/rhash.c diff --git a/tools/testing/selftests/bpf/prog_tests/rhash.c b/tools/testing/selftests/bpf/prog_tests/rhash.c new file mode 100644 index 000000000000..69686bf69ba5 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/rhash.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include "rhash.skel.h" +#include +#include +#include + +static void rhash_run(const char *prog_name) +{ + struct rhash *skel; + struct bpf_program *prog; + LIBBPF_OPTS(bpf_test_run_opts, opts); + int err; + + skel = rhash__open(); + if (!ASSERT_OK_PTR(skel, "rhash__open")) + return; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto cleanup; + bpf_program__set_autoload(prog, true); + + err = rhash__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + if (!ASSERT_OK(err, "prog run")) + goto cleanup; + + if (!ASSERT_OK(opts.retval, "prog retval")) + goto cleanup; + + if (!ASSERT_OK(skel->bss->err, "bss->err")) + goto cleanup; + +cleanup: + rhash__destroy(skel); +} + +static int rhash_map_create(__u32 max_entries, __u64 map_extra) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = BPF_F_NO_PREALLOC, + .map_extra = map_extra); + + return bpf_map_create(BPF_MAP_TYPE_RHASH, "rhash_extra", + sizeof(__u32), sizeof(__u64), max_entries, &opts); +} + +static void rhash_map_extra_presize(void) +{ + 
const __u32 max_entries = 1024; + const __u32 nelem_hint = 256; + struct bpf_map_info info = {}; + __u32 info_len = sizeof(info); + __u64 val = 0; + __u32 key; + int fd, i; + + fd = rhash_map_create(max_entries, nelem_hint); + if (!ASSERT_GE(fd, 0, "rhash_map_create presize")) + return; + + if (!ASSERT_OK(bpf_map_get_info_by_fd(fd, &info, &info_len), "info")) + goto close; + ASSERT_EQ(info.map_extra, nelem_hint, "info.map_extra"); + + for (i = 0; i < (int)nelem_hint; i++) { + key = i; + if (!ASSERT_OK(bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST), + "update")) + goto close; + } +close: + close(fd); +} + +static void rhash_map_extra_too_big(void) +{ + int fd; + + fd = rhash_map_create(1U << 20, 0x10000); + if (!ASSERT_LT(fd, 0, "rhash_map_create hint > U16_MAX")) + close(fd); +} + +void test_rhash(void) +{ + if (test__start_subtest("test_rhash_lookup_update")) + rhash_run("test_rhash_lookup_update"); + + if (test__start_subtest("test_rhash_update_delete")) + rhash_run("test_rhash_update_delete"); + + if (test__start_subtest("test_rhash_update_elements")) + rhash_run("test_rhash_update_elements"); + + if (test__start_subtest("test_rhash_update_exist")) + rhash_run("test_rhash_update_exist"); + + if (test__start_subtest("test_rhash_update_any")) + rhash_run("test_rhash_update_any"); + + if (test__start_subtest("test_rhash_noexist_duplicate")) + rhash_run("test_rhash_noexist_duplicate"); + + if (test__start_subtest("test_rhash_delete_nonexistent")) + rhash_run("test_rhash_delete_nonexistent"); + + if (test__start_subtest("test_rhash_map_extra_presize")) + rhash_map_extra_presize(); + + if (test__start_subtest("test_rhash_map_extra_too_big")) + rhash_map_extra_too_big(); +} diff --git a/tools/testing/selftests/bpf/progs/rhash.c b/tools/testing/selftests/bpf/progs/rhash.c new file mode 100644 index 000000000000..fc2dac3a719e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/rhash.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta 
Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include "bpf_misc.h" + +#define ENOENT 2 +#define EEXIST 17 + +char _license[] SEC("license") = "GPL"; + +int err; + +struct elem { + char arr[128]; + int val; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RHASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 128); + __type(key, int); + __type(value, struct elem); +} rhmap SEC(".maps"); + +SEC("syscall") +int test_rhash_lookup_update(void *ctx) +{ + int key = 5; + struct elem empty = {.val = 3, .arr = {0}}; + struct elem *e; + + err = 1; + e = bpf_map_lookup_elem(&rhmap, &key); + if (e) + return 1; + + err = bpf_map_update_elem(&rhmap, &key, &empty, BPF_NOEXIST); + if (err) + return 1; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != empty.val) { + err = 2; + return 2; + } + + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_update_delete(void *ctx) +{ + int key = 6; + struct elem empty = {.val = 4, .arr = {0}}; + struct elem *e; + + err = 1; + e = bpf_map_lookup_elem(&rhmap, &key); + if (e) + return 1; + + err = bpf_map_update_elem(&rhmap, &key, &empty, BPF_NOEXIST); + if (err) + return 2; + + err = bpf_map_delete_elem(&rhmap, &key); + if (err) + return 3; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (e) { + err = 4; + return 4; + } + + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_update_elements(void *ctx) +{ + int key = 0; + struct elem empty = {.val = 4, .arr = {0}}; + struct elem *e; + int i; + + err = 1; + + for (i = 0; i < 128; ++i) { + key = i; + e = bpf_map_lookup_elem(&rhmap, &key); + if (e) + return 1; + + empty.val = key; + err = bpf_map_update_elem(&rhmap, &key, &empty, BPF_NOEXIST); + if (err) + return 2; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != key) { + err = 4; + return 4; + } + } + + for (i = 0; i < 128; ++i) { + key = i; + err = bpf_map_delete_elem(&rhmap, &key); + if (err) + return 3; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (e) { + 
err = 5; + return 5; + } + } + + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_update_exist(void *ctx) +{ + int key = 10; + struct elem val1 = {.val = 100, .arr = {0}}; + struct elem val2 = {.val = 200, .arr = {0}}; + struct elem *e; + int ret; + + err = 1; + + /* BPF_EXIST on non-existent key should fail with -ENOENT */ + ret = bpf_map_update_elem(&rhmap, &key, &val1, BPF_EXIST); + if (ret != -ENOENT) + return 1; + + /* Insert element first */ + ret = bpf_map_update_elem(&rhmap, &key, &val1, BPF_NOEXIST); + if (ret) + return 2; + + /* Verify initial value */ + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != 100) + return 3; + + /* BPF_EXIST on existing key should succeed and update value */ + ret = bpf_map_update_elem(&rhmap, &key, &val2, BPF_EXIST); + if (ret) + return 4; + + /* Verify value was updated */ + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != 200) + return 5; + + /* Cleanup */ + bpf_map_delete_elem(&rhmap, &key); + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_update_any(void *ctx) +{ + int key = 11; + struct elem val1 = {.val = 111, .arr = {0}}; + struct elem val2 = {.val = 222, .arr = {0}}; + struct elem *e; + int ret; + + err = 1; + + /* BPF_ANY on non-existent key should insert */ + ret = bpf_map_update_elem(&rhmap, &key, &val1, BPF_ANY); + if (ret) + return 1; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != 111) + return 2; + + /* BPF_ANY on existing key should update */ + ret = bpf_map_update_elem(&rhmap, &key, &val2, BPF_ANY); + if (ret) + return 3; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != 222) + return 4; + + /* Cleanup */ + bpf_map_delete_elem(&rhmap, &key); + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_noexist_duplicate(void *ctx) +{ + int key = 12; + struct elem val = {.val = 600, .arr = {0}}; + int ret; + + err = 1; + + /* Insert element */ + ret = bpf_map_update_elem(&rhmap, &key, &val, BPF_NOEXIST); + if (ret) + return 1; + + /* Try 
to insert again with BPF_NOEXIST - should fail with -EEXIST */ + ret = bpf_map_update_elem(&rhmap, &key, &val, BPF_NOEXIST); + if (ret != -EEXIST) + return 2; + + /* Cleanup */ + bpf_map_delete_elem(&rhmap, &key); + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_delete_nonexistent(void *ctx) +{ + int key = 99999; + int ret; + + err = 1; + + /* Delete non-existent key should return -ENOENT */ + ret = bpf_map_delete_elem(&rhmap, &key); + if (ret != -ENOENT) + return 1; + + err = 0; + return 0; +} -- cgit v1.2.3 From a996794fda8463afbc2bc70fbc7f6a2a9c1547ef Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:27 -0700 Subject: selftests/bpf: Add BPF iterator tests for resizable hash map Test basic BPF iterator functionality for BPF_MAP_TYPE_RHASH, verifying all elements are visited. Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-10-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/rhash.c | 63 ++++++++++++++++++++++ .../selftests/bpf/progs/bpf_iter_bpf_rhash_map.c | 34 ++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_rhash_map.c diff --git a/tools/testing/selftests/bpf/prog_tests/rhash.c b/tools/testing/selftests/bpf/prog_tests/rhash.c index 69686bf69ba5..98bb66907b7f 100644 --- a/tools/testing/selftests/bpf/prog_tests/rhash.c +++ b/tools/testing/selftests/bpf/prog_tests/rhash.c @@ -4,6 +4,7 @@ #include #include #include "rhash.skel.h" +#include "bpf_iter_bpf_rhash_map.skel.h" #include #include #include @@ -89,6 +90,65 @@ static void rhash_map_extra_too_big(void) close(fd); } +static void rhash_iter_test(void) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + struct bpf_iter_bpf_rhash_map *skel; + int err, i, len, map_fd, iter_fd; + union bpf_iter_link_info linfo; + u32 expected_key_sum = 0, key; + struct bpf_link *link; + u64 val = 0; + char buf[64]; + + skel = 
bpf_iter_bpf_rhash_map__open(); + if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_rhash_map__open")) + return; + + err = bpf_iter_bpf_rhash_map__load(skel); + if (!ASSERT_OK(err, "bpf_iter_bpf_rhash_map__load")) + goto out; + + map_fd = bpf_map__fd(skel->maps.rhashmap); + + /* Populate map with test data */ + for (i = 0; i < 64; i++) { + key = i + 1; + expected_key_sum += key; + + err = bpf_map_update_elem(map_fd, &key, &val, BPF_NOEXIST); + if (!ASSERT_OK(err, "map_update")) + goto out; + } + + memset(&linfo, 0, sizeof(linfo)); + linfo.map.map_fd = map_fd; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.dump_bpf_rhash_map, &opts); + if (!ASSERT_OK_PTR(link, "attach_iter")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_GE(iter_fd, 0, "create_iter")) + goto free_link; + + do { + len = read(iter_fd, buf, sizeof(buf)); + } while (len > 0); + + ASSERT_EQ(skel->bss->key_sum, expected_key_sum, "key_sum"); + ASSERT_EQ(skel->bss->elem_count, 64, "elem_count"); + + close(iter_fd); + +free_link: + bpf_link__destroy(link); +out: + bpf_iter_bpf_rhash_map__destroy(skel); +} + void test_rhash(void) { if (test__start_subtest("test_rhash_lookup_update")) @@ -117,4 +177,7 @@ void test_rhash(void) if (test__start_subtest("test_rhash_map_extra_too_big")) rhash_map_extra_too_big(); + + if (test__start_subtest("test_rhash_iter")) + rhash_iter_test(); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_rhash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_rhash_map.c new file mode 100644 index 000000000000..86f6c0d5eadb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_rhash_map.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RHASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 64); + __type(key, __u32); + __type(value, __u64); +} rhashmap SEC(".maps"); + +__u32 key_sum = 0; +__u64 val_sum = 0; +__u32 elem_count = 0; +__u32 err = 0; + +SEC("iter/bpf_map_elem") +int dump_bpf_rhash_map(struct bpf_iter__bpf_map_elem *ctx) +{ + __u32 *key = ctx->key; + __u64 *val = ctx->value; + + if (!key || !val) + return 0; + + key_sum += *key; + val_sum += *val; + elem_count++; + return 0; +} -- cgit v1.2.3 From 2bea44ea3c4ef6cee3a7c8b6bd74ace093632bef Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:28 -0700 Subject: bpftool: Add rhash map documentation Make bpftool documentation aware of the resizable hash map. Signed-off-by: Mykyta Yatsenko Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260605-rhash-v7-11-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 +- tools/bpf/bpftool/map.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 1af3305ea2b2..5daf3de5c744 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -56,7 +56,7 @@ MAP COMMANDS | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** | | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** | **arena** -| | **insn_array** } +| | **insn_array** | **rhash** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 7ebf7dbcfba4..71a45d96617e 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1478,7 +1478,7 @@ static int do_help(int argc, char **argv) 
" cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" " task_storage | bloom_filter | user_ringbuf | cgrp_storage | arena |\n" - " insn_array }\n" + " insn_array | rhash }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", -- cgit v1.2.3 From 84f7a49e76ec8e0a1e18f3758e89800f8cf8cfc6 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:29 -0700 Subject: selftests/bpf: Add resizable hashmap to benchmarks Support resizable hashmap in BPF map benchmarks. 1. LOOKUP (single producer, M events/sec) key | max | nr | htab | rhtab | ratio | delta ----+-----+-------+---------+---------+-------+------- 8 | 1K | 750 | 99.85 | 81.92 | 0.82x | -18 % 8 | 1K | 1K | 100.71 | 80.19 | 0.80x | -20 % 8 | 1M | 750K | 23.37 | 72.09 | 3.08x | +208 % 8 | 1M | 1M | 13.39 | 53.72 | 4.01x | +301 % 32 | 1K | 750 | 51.57 | 42.78 | 0.83x | -17 % 32 | 1K | 1K | 50.81 | 45.83 | 0.90x | -10 % 32 | 1M | 750K | 11.27 | 15.29 | 1.36x | +36 % 32 | 1M | 1M | 7.32 | 8.75 | 1.19x | +19 % 256 | 1K | 750 | 7.58 | 7.88 | 1.04x | +4 % 256 | 1K | 1K | 7.43 | 7.81 | 1.05x | +5 % 256 | 1M | 750K | 3.69 | 4.27 | 1.16x | +16 % 256 | 1M | 1M | 2.60 | 3.12 | 1.20x | +20 % Pattern: * Small map (1K): htab wins for 8 / 32 byte keys by 10-20% * Large map (1M): rhtab wins everywhere, up to 4x at high load factor with 8 byte keys. * Higher load factor amplifies rhtab's lead: rhtab grows the bucket array; htab stays at user-declared max. 2. FULL UPDATE (M events/sec per producer) htab per-producer: 20.33 22.02 19.27 23.61 24.18 23.17 21.07 mean 21.94 range 19.27 - 24.18 rhtab per-producer: 133.51 129.47 74.52 129.29 102.26 129.98 107.64 mean 115.24 range 74.52 - 133.51 speedup (mean): 5.25x (+425 %) In-place memcpy avoids the per-update alloc + RCU pointer swap that htab pays. 3. 
MEMORY

value_size | htab ops/s  | rhtab ops/s | htab mem | rhtab mem
-----------+-------------+-------------+----------+----------
      32 B |  122.87 k/s |  133.04 k/s | 2.47 MiB | 2.49 MiB
    4096 B |   64.43 k/s |   65.38 k/s | 6.74 MiB | 6.44 MiB

rhtab/htab : +8 % ops, +0.8 % mem (32 B)
             +1 % ops, -4 % mem (4096 B)

Throughput effectively tied

SUMMARY

* Small / well-fitting map: htab is faster (cache-friendly fixed bucket array), but only by ~10-20 %.
* Large / high-load-factor map: rhtab is dramatically faster (1.2x to 4x) because rhashtable resizes to keep the load factor sane while htab stays stuck at user-declared max.
* Update-heavy workloads: rhtab is ~5x faster per producer via in-place memcpy.
* Memory benchmark: effectively on par.

Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-12-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bench.c | 6 ++++ .../bpf/benchs/bench_bpf_hashmap_full_update.c | 34 +++++++++++++++++++-- .../bpf/benchs/bench_bpf_hashmap_lookup.c | 31 +++++++++++++++++-- .../testing/selftests/bpf/benchs/bench_htab_mem.c | 35 ++++++++++++++++++++-- 4 files changed, 100 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 6155ce455c27..3d9d2cd7764b 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -560,13 +560,16 @@ extern const struct bench bench_bpf_loop; extern const struct bench bench_strncmp_no_helper; extern const struct bench bench_strncmp_helper; extern const struct bench bench_bpf_hashmap_full_update; +extern const struct bench bench_bpf_rhashmap_full_update; extern const struct bench bench_local_storage_cache_seq_get; extern const struct bench bench_local_storage_cache_interleaved_get; extern const struct bench bench_local_storage_cache_hashmap_control; extern const struct bench bench_local_storage_tasks_trace; extern const struct bench bench_bpf_hashmap_lookup;
+extern const struct bench bench_bpf_rhashmap_lookup; extern const struct bench bench_local_storage_create; extern const struct bench bench_htab_mem; +extern const struct bench bench_rhtab_mem; extern const struct bench bench_crypto_encrypt; extern const struct bench bench_crypto_decrypt; extern const struct bench bench_sockmap; @@ -640,13 +643,16 @@ static const struct bench *benchs[] = { &bench_strncmp_no_helper, &bench_strncmp_helper, &bench_bpf_hashmap_full_update, + &bench_bpf_rhashmap_full_update, &bench_local_storage_cache_seq_get, &bench_local_storage_cache_interleaved_get, &bench_local_storage_cache_hashmap_control, &bench_local_storage_tasks_trace, &bench_bpf_hashmap_lookup, + &bench_bpf_rhashmap_lookup, &bench_local_storage_create, &bench_htab_mem, + &bench_rhtab_mem, &bench_crypto_encrypt, &bench_crypto_decrypt, &bench_sockmap, diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c index ee1dc12c5e5e..7278fa860397 100644 --- a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c @@ -34,19 +34,29 @@ static void measure(struct bench_res *res) { } -static void setup(void) +static void hashmap_full_update_setup(enum bpf_map_type map_type) { struct bpf_link *link; int map_fd, i, max_entries; setup_libbpf(); - ctx.skel = bpf_hashmap_full_update_bench__open_and_load(); + ctx.skel = bpf_hashmap_full_update_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + bpf_map__set_type(ctx.skel->maps.hash_map_bench, map_type); + if (map_type == BPF_MAP_TYPE_RHASH) + bpf_map__set_map_flags(ctx.skel->maps.hash_map_bench, + BPF_F_NO_PREALLOC); + + if (bpf_hashmap_full_update_bench__load(ctx.skel)) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + ctx.skel->bss->nr_loops = MAX_LOOP_NUM; link = bpf_program__attach(ctx.skel->progs.benchmark); @@ 
-62,6 +72,16 @@ static void setup(void) bpf_map_update_elem(map_fd, &i, &i, BPF_ANY); } +static void setup(void) +{ + hashmap_full_update_setup(BPF_MAP_TYPE_HASH); +} + +static void rhash_setup(void) +{ + hashmap_full_update_setup(BPF_MAP_TYPE_RHASH); +} + static void hashmap_report_final(struct bench_res res[], int res_cnt) { unsigned int nr_cpus = bpf_num_possible_cpus(); @@ -87,3 +107,13 @@ const struct bench bench_bpf_hashmap_full_update = { .report_progress = NULL, .report_final = hashmap_report_final, }; + +const struct bench bench_bpf_rhashmap_full_update = { + .name = "bpf-rhashmap-full-update", + .validate = validate, + .setup = rhash_setup, + .producer_thread = producer, + .measure = measure, + .report_progress = NULL, + .report_final = hashmap_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c index 279ff1b8b5b2..5264b7b20e39 100644 --- a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c @@ -148,9 +148,10 @@ static inline void patch_key(u32 i, u32 *key) /* the rest of key is random */ } -static void setup(void) +static void hashmap_lookup_setup(enum bpf_map_type map_type) { struct bpf_link *link; + __u32 map_flags; int map_fd; int ret; int i; @@ -163,10 +164,15 @@ static void setup(void) exit(1); } + map_flags = args.map_flags; + if (map_type == BPF_MAP_TYPE_RHASH) + map_flags |= BPF_F_NO_PREALLOC; + + bpf_map__set_type(ctx.skel->maps.hash_map_bench, map_type); bpf_map__set_max_entries(ctx.skel->maps.hash_map_bench, args.max_entries); bpf_map__set_key_size(ctx.skel->maps.hash_map_bench, args.key_size); bpf_map__set_value_size(ctx.skel->maps.hash_map_bench, 8); - bpf_map__set_map_flags(ctx.skel->maps.hash_map_bench, args.map_flags); + bpf_map__set_map_flags(ctx.skel->maps.hash_map_bench, map_flags); ctx.skel->bss->nr_entries = args.nr_entries; ctx.skel->bss->nr_loops = 
args.nr_loops / args.nr_entries; @@ -197,6 +203,16 @@ static void setup(void) } } +static void setup(void) +{ + hashmap_lookup_setup(BPF_MAP_TYPE_HASH); +} + +static void rhash_setup(void) +{ + hashmap_lookup_setup(BPF_MAP_TYPE_RHASH); +} + static inline double events_from_time(u64 time) { if (time) @@ -275,3 +291,14 @@ const struct bench bench_bpf_hashmap_lookup = { .report_progress = NULL, .report_final = hashmap_report_final, }; + +const struct bench bench_bpf_rhashmap_lookup = { + .name = "bpf-rhashmap-lookup", + .argp = &bench_hashmap_lookup_argp, + .validate = validate, + .setup = rhash_setup, + .producer_thread = producer, + .measure = measure, + .report_progress = NULL, + .report_final = hashmap_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_htab_mem.c b/tools/testing/selftests/bpf/benchs/bench_htab_mem.c index 297e32390cd1..1ee217d97434 100644 --- a/tools/testing/selftests/bpf/benchs/bench_htab_mem.c +++ b/tools/testing/selftests/bpf/benchs/bench_htab_mem.c @@ -152,7 +152,7 @@ static const struct htab_mem_use_case *htab_mem_find_use_case_or_exit(const char exit(1); } -static void htab_mem_setup(void) +static void htab_mem_setup_impl(enum bpf_map_type map_type) { struct bpf_map *map; const char **names; @@ -178,10 +178,11 @@ static void htab_mem_setup(void) } map = ctx.skel->maps.htab; + bpf_map__set_type(map, map_type); bpf_map__set_value_size(map, args.value_size); /* Ensure that different CPUs can operate on different subset */ bpf_map__set_max_entries(map, MAX(8192, 64 * env.nr_cpus)); - if (args.preallocated) + if (map_type != BPF_MAP_TYPE_RHASH && args.preallocated) bpf_map__set_map_flags(map, bpf_map__map_flags(map) & ~BPF_F_NO_PREALLOC); names = ctx.uc->progs; @@ -220,6 +221,16 @@ cleanup: exit(1); } +static void htab_mem_setup(void) +{ + htab_mem_setup_impl(BPF_MAP_TYPE_HASH); +} + +static void rhtab_mem_setup(void) +{ + htab_mem_setup_impl(BPF_MAP_TYPE_RHASH); +} + static void htab_mem_add_fn(pthread_barrier_t *notify) { 
while (true) { @@ -338,6 +349,15 @@ static void htab_mem_report_final(struct bench_res res[], int res_cnt) cleanup_cgroup_environment(); } +static void rhtab_mem_validate(void) +{ + if (args.preallocated) { + fprintf(stderr, "rhash map does not support preallocation\n"); + exit(1); + } + htab_mem_validate(); +} + const struct bench bench_htab_mem = { .name = "htab-mem", .argp = &bench_htab_mem_argp, @@ -348,3 +368,14 @@ const struct bench bench_htab_mem = { .report_progress = htab_mem_report_progress, .report_final = htab_mem_report_final, }; + +const struct bench bench_rhtab_mem = { + .name = "rhtab-mem", + .argp = &bench_htab_mem_argp, + .validate = rhtab_mem_validate, + .setup = rhtab_mem_setup, + .producer_thread = htab_mem_producer, + .measure = htab_mem_measure, + .report_progress = htab_mem_report_progress, + .report_final = htab_mem_report_final, +}; -- cgit v1.2.3 From 390dc36ccfa1a46b19de994181fc5248aff4c177 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Thu, 4 Jun 2026 14:42:52 -0400 Subject: MAINTAINERS: BPF: Add self as reviewer and run parse_maintainers.pl Add myself as a reviewer for the BPF subsystem. While at it, run ./scripts/parse_maintainers.pl --order and reorder the BPF-related entries in the file accordingly. 
Signed-off-by: Emil Tsalapatis Acked-by: Eduard Zingerman Acked-by: Kumar Kartikeya Dwivedi Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20260604184252.9917-1-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index afae76600fc6..87ac5cafb55b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4811,12 +4811,13 @@ BPF [GENERAL] (Safe Dynamic Programs and Tools) M: Alexei Starovoitov M: Daniel Borkmann M: Andrii Nakryiko -R: Martin KaFai Lau M: Eduard Zingerman M: Kumar Kartikeya Dwivedi +R: Martin KaFai Lau R: Song Liu R: Yonghong Song R: Jiri Olsa +R: Emil Tsalapatis L: bpf@vger.kernel.org S: Supported W: https://bpf.io/ @@ -4826,7 +4827,9 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git F: Documentation/bpf/ F: Documentation/networking/filter.rst F: Documentation/userspace-api/ebpf/ +F: arch/*/include/asm/rqspinlock.h F: arch/*/net/* +F: include/asm-generic/rqspinlock.h F: include/linux/bpf* F: include/linux/btf* F: include/linux/buildid.h @@ -4838,17 +4841,15 @@ F: include/uapi/linux/filter.h F: kernel/bpf/ F: kernel/trace/bpf_trace.c F: lib/buildid.c -F: arch/*/include/asm/rqspinlock.h -F: include/asm-generic/rqspinlock.h F: lib/test_bpf.c F: net/bpf/ F: net/core/filter.c F: net/sched/act_bpf.c F: net/sched/cls_bpf.c F: samples/bpf/ +F: scripts/Makefile.btf F: scripts/bpf_doc.py F: scripts/gen-btf.sh -F: scripts/Makefile.btf F: scripts/pahole-version.sh F: tools/bpf/ F: tools/lib/bpf/ -- cgit v1.2.3 From 27ffbfd14d774adfc64ae1f8f76aa6195411087a Mon Sep 17 00:00:00 2001 From: Song Chen Date: Wed, 3 Jun 2026 17:19:10 +0800 Subject: bpf: Reject registration of duplicated kfunc Search btf_vmlinux and btf_modules for a duplicate kfunc before a kernel module attempts to register a kfunc. If the new kfunc would shadow an existing kfunc, report it with pr_err() and reject the module load.
Reviewed-by: Yonghong Song Signed-off-by: Song Chen Link: https://lore.kernel.org/r/20260603091910.7212-1-chensong_2000@126.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 55aa3ba1b1e0..ef4402274786 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8771,6 +8771,39 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, return 0; } +static int btf_check_kfunc_name(struct btf *btf, const char *func_name, u32 kind) +{ +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; +#endif + s32 id; + + if (!btf_is_module(btf)) + return 0; + + id = btf_find_by_name_kind(bpf_get_btf_vmlinux(), func_name, kind); + if (id >= 0) { + pr_err("kfunc %s (id: %d) is already present in vmlinux.\n", + func_name, id); + return -EINVAL; + } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + guard(mutex)(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->btf == btf) + continue; + id = btf_find_by_name_kind(btf_mod->btf, func_name, kind); + if (id >= 0) { + pr_err("kfunc %s (id: %d) is already present in module %s.\n", + func_name, id, btf_mod->module->name); + return -EINVAL; + } + } +#endif + return 0; +} + static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) { const struct btf_type *func; @@ -8784,7 +8817,8 @@ static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) /* sanity check kfunc name */ func_name = btf_name_by_offset(btf, func->name_off); - if (!func_name || !func_name[0]) + if (!func_name || !func_name[0] || + btf_check_kfunc_name(btf, func_name, BTF_INFO_KIND(func->info))) return -EINVAL; func = btf_type_by_id(btf, func->type); -- cgit v1.2.3 From aa496720618f1a6054f1c870bf10b4f6c99bf656 Mon Sep 17 00:00:00 2001 From: Zhao Zhang Date: Tue, 2 Jun 2026 16:43:33 +0800 Subject: bpf: Reject 
fragmented frames in devmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Devmap broadcast redirects clone the packet for all but the last destination. For native XDP, that clone path copies only the linear xdp_frame data, while fragmented frames keep skb_shared_info in tailroom outside the linear area. Cloning such a frame leaves XDP_FLAGS_HAS_FRAGS set but without valid frag metadata, and the later free path can interpret uninitialized tail data as skb_shared_info, leading to an out-of-bounds access during frame return. Reject fragmented native XDP frames in dev_map_enqueue_clone(). Add the same restriction to the generic XDP clone path in dev_map_redirect_clone(). Generic XDP represents fragmented packets as nonlinear skbs, and rejecting them here keeps clone-based broadcast support aligned between native and generic XDP. Fixes: e624d4ed4aa8 ("xdp: Extend xdp_redirect_map with broadcast support") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Zhengchuan Liang Reported-by: Xin Liu Assisted-by: Codex:GPT-5.4 Signed-off-by: Zhao Zhang Signed-off-by: Ren Wei Reviewed-by: Emil Tsalapatis Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/21c2d153dd25603d359069a02bf06779b51f6423.1780385378.git.zzhan461@ucr.edu Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index cc0a43ebab6b..5b9eac5342a9 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -581,6 +581,10 @@ static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, { struct xdp_frame *nxdpf; + /* Frags live outside the linear frame and cannot be cloned safely. 
*/ + if (unlikely(xdp_frame_has_frags(xdpf))) + return -EOPNOTSUPP; + nxdpf = xdpf_clone(xdpf); if (!nxdpf) return -ENOMEM; @@ -726,6 +730,9 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *nskb; int err; + if (unlikely(skb_is_nonlinear(skb))) + return -EOPNOTSUPP; + nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; -- cgit v1.2.3 From f64c723741c911544cca4c838d7a291b06b3ad1d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Jun 2026 08:37:28 -1000 Subject: bpf: Replace scratch PTE atomically when allocating arena pages apply_range_set_cb() maps the pages for a new arena allocation and returned -EBUSY when the target PTE was already populated. Kernel-fault recovery leaves the per-arena scratch page in unallocated arena PTEs, so a later bpf_arena_alloc_pages() over such a page hits that -EBUSY, and every subsequent allocation of it fails the same way. Allocation must install the real page over scratch instead. Overwriting the scratch PTE in place is a valid->valid change, which arm64 forbids without break-before-make. Route through an invalid entry instead: ptep_try_set() fills only a none slot, so the PTE goes scratch->none->page. On finding scratch, clear it and flush_tlb_before_set() before retrying. The new flush_tlb_before_set() is a no-op except on arches like arm64 that need the break-before-make TLB invalidate. The loop also copes with a concurrent fault re-scratching the slot. Arches without ptep_try_set() never install the scratch page, so keep the must-be-empty check and set_pte_at() for them. 
Fixes: dc11a4dba246 ("bpf: Recover arena kernel faults with scratch page") Signed-off-by: Tejun Heo Cc: Alexei Starovoitov Cc: David Hildenbrand Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260601183728.1800490-1-tj@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/pgtable.h | 11 +++++++++++ include/linux/pgtable.h | 18 ++++++++++++++++++ kernel/bpf/arena.c | 38 +++++++++++++++++++++++++++++++++----- 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 984f0502c9d0..3ce0f2a6cab6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1842,6 +1842,17 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) } #define ptep_try_set ptep_try_set +/* + * arm64 mandates break-before-make: a cleared kernel PTE must have its TLB + * invalidated before a different page is installed in its place. The broadcast + * TLBI is an instruction, not an IPI, so this is safe with interrupts disabled. + */ +static inline void flush_tlb_before_set(unsigned long addr) +{ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); +} +#define flush_tlb_before_set flush_tlb_before_set + #define test_and_clear_young_ptes test_and_clear_young_ptes static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index b5739bb99fc1..4c6c4081ef71 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1061,6 +1061,24 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) } #endif +#ifndef flush_tlb_before_set +/** + * flush_tlb_before_set - invalidate a kernel PTE's TLB before re-setting it + * @addr: kernel virtual address whose PTE was just cleared + * + * Some architectures (e.g. arm64) do not allow a live page-table entry to be + * repointed at a different page in one step. 
The old entry must first be made + * invalid and its translation flushed from every TLB, and only then may the new + * entry be written. + * + * This is only for the lockless atomic kernel-PTE installers (ptep_try_set()). + * It must be callable with interrupts disabled. + */ +static inline void flush_tlb_before_set(unsigned long addr) +{ +} +#endif + #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 9b2dea229b38..af49c154473d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -144,6 +144,7 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr) struct apply_range_data { struct page **pages; + struct page *scratch_page; int i; }; @@ -156,19 +157,44 @@ static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) { struct apply_range_data *d = data; struct page *page; + pte_t pteval; if (!data) return 0; - /* sanity check */ - if (unlikely(!pte_none(ptep_get(pte)))) - return -EBUSY; page = d->pages[d->i]; /* paranoia, similar to vmap_pages_pte_range() */ if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page)))) return -EINVAL; - set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); + pteval = mk_pte(page, PAGE_KERNEL); +#ifdef ptep_try_set + /* + * Kernel-fault recovery may have installed the scratch page here, and + * some architectures (arm64) prohibit valid->valid PTE transitions. + * Install atomically into a none slot. If scratch is present, clear it + * and flush_tlb_before_set() (break-before-make) before retrying. 
+ */ + while (!ptep_try_set(pte, pteval)) { + pte_t old = ptep_get(pte); + + if (pte_none(old)) + continue; + if (WARN_ON_ONCE(pte_page(old) != d->scratch_page)) + return -EBUSY; + ptep_get_and_clear(&init_mm, addr, pte); + flush_tlb_before_set(addr); + } +#else + /* + * Without ptep_try_set() there is no atomic installer, but such arches + * also do not wire up bpf_arena_handle_page_fault(), so no scratch page + * is ever installed and the slot is always none here. + */ + if (unlikely(!pte_none(ptep_get(pte)))) + return -EBUSY; + set_pte_at(&init_mm, addr, pte, pteval); +#endif d->i++; return 0; } @@ -480,7 +506,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) if (ret) goto out_sigsegv_memcg; - struct apply_range_data data = { .pages = &page, .i = 0 }; + struct apply_range_data data = { .pages = &page, .i = 0, + .scratch_page = arena->scratch_page }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { @@ -670,6 +697,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } data.pages = pages; + data.scratch_page = arena->scratch_page; if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) goto out_free_pages; -- cgit v1.2.3 From aa22d619ba22177f430693cf5e9495052d996644 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 3 Jun 2026 07:39:15 -0700 Subject: selftests/bpf: Fix flaky file_reader test file_reader/on_open_expect_fault test expects page fault when reading pages from the test harness executable. It is not guaranteed that those are paged out, even after madvise(MADV_PAGEOUT). Relax the condition in the test to succeed with both 0 and -EFAULT returned. 
Fixes: 784cdf931543 ("selftests/bpf: add file dynptr tests") Reported-by: Shung-Hsi Yu Closes: https://lore.kernel.org/all/ah6g7JSYOWGp2oAG@u94a/ Signed-off-by: Mykyta Yatsenko Tested-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260603-file_reader_flake-v1-1-7f3f52d1e388@meta.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/file_reader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c index 462712ff3b8a..aa2c05cce2b3 100644 --- a/tools/testing/selftests/bpf/progs/file_reader.c +++ b/tools/testing/selftests/bpf/progs/file_reader.c @@ -50,7 +50,7 @@ int on_open_expect_fault(void *c) goto out; local_err = bpf_dynptr_read(tmp_buf, user_buf_sz, &dynptr, user_buf_sz, 0); - if (local_err == -EFAULT) { /* Expect page fault */ + if (local_err == -EFAULT || local_err == 0) { /* Expect page fault or success */ local_err = 0; run_success = 1; } -- cgit v1.2.3 From 231fc9bc27fd03db171cab4e75116923250af7a7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 30 May 2026 18:07:50 -0700 Subject: bpftool: Restrict feature tests during bootstrap compilation When the perf build executes 'make -C ../bpf/bpftool bootstrap', bpftool's Makefile unconditionally evaluated feature checks for llvm, libcap, libbfd, and disassembler libraries because the bootstrap target was not exempted. Since the bootstrap bpftool strictly compiles minimal AST parsing and C code generation logic without linking LLVM or disassembler libraries, these feature check sub-makes are completely redundant. Exempt the bootstrap target from non-essential feature tests to eliminate unneeded sub-make fork overhead during Kbuild startup. 
Tested-by: James Clark Assisted-by: Gemini:gemini-3.1-pro-preview Signed-off-by: Ian Rogers Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20260531010750.525160-1-irogers@google.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 0febf60e1b64..8f50bc163bb2 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -106,6 +106,10 @@ ifneq ($(SKIP_CRYPTO),1) CRYPTO_LIBS := -lcrypto endif +ifeq ($(MAKECMDGOALS),bootstrap) +FEATURE_TESTS := libelf-zstd +FEATURE_DISPLAY := +else FEATURE_TESTS := clang-bpf-co-re FEATURE_TESTS += llvm FEATURE_TESTS += libcap @@ -122,6 +126,7 @@ FEATURE_DISPLAY += libcap FEATURE_DISPLAY += libbfd FEATURE_DISPLAY += libbfd-liberty FEATURE_DISPLAY += libbfd-liberty-z +endif check_feat := 1 NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall -- cgit v1.2.3 From a3863aa4f55e5c17f32e1fd64a0a64adf2af16d9 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 5 Jun 2026 13:20:52 -0700 Subject: bpf: Fix dead error check on acquire_reference() in check_kfunc_call acquire_reference() returns a signed int that may be a negative errno but was converted to unsigned, which makes the subsequent error check dead code. Fix it by declaring 'id' as int so the error path is taken correctly.
Fixes: 308c7a0ae885 ("bpf: Refactor object relationship tracking and fix dynptr UAF bug") Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260605202056.1780352-2-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d27ba396d32..a741bf447931 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12817,9 +12817,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_kfunc_call_arg_meta meta; struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; - u32 i, nargs, ptr_type_id, id; const struct btf_param *args; + u32 i, nargs, ptr_type_id; struct btf *desc_btf; + int id; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) -- cgit v1.2.3 From 73d475dc6c13177fce0d9d892bff33299c8ad56a Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 5 Jun 2026 13:20:53 -0700 Subject: bpf: Check acquire_reference() error for "__ref" struct_ops arguments When acquiring references for struct_ops program arguments tagged with "__ref", the return value of acquire_reference() was stored directly into u32 ctx_arg_info[i].ref_id without checking for failure. acquire_reference() returns -ENOMEM when acquire_reference_state() fails to allocate, so the error was silently stored as a ref_id instead of aborting verification. Fix it by checking the return. 
Fixes: a687df2008f6 ("bpf: Support getting referenced kptr from struct_ops argument") Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260605202056.1780352-3-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a741bf447931..3b874bbbaac0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18363,9 +18363,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* Acquire references for struct_ops program arguments tagged with "__ref" */ if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { - for (i = 0; i < aux->ctx_arg_info_size; i++) - aux->ctx_arg_info[i].ref_id = aux->ctx_arg_info[i].refcounted ? - acquire_reference(env, 0, 0) : 0; + for (i = 0; i < aux->ctx_arg_info_size; i++) { + ret = aux->ctx_arg_info[i].refcounted ? acquire_reference(env, 0, 0) : 0; + if (ret < 0) + goto out; + + aux->ctx_arg_info[i].ref_id = ret; + } } ret = do_check(env); -- cgit v1.2.3 From 41025f441fe6addd93d2c333a3a184331e8ef6cf Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 5 Jun 2026 13:20:54 -0700 Subject: bpf: Compare parent_id in refsafe() for REF_TYPE_PTR refsafe() compared each reference's id and type but not its parent_id, so two states whose PTR references differ only in the parent object they were derived from could be wrongly treated as equivalent and pruned. Fix it by checking parent_id too. 
Fixes: 308c7a0ae885 ("bpf: Refactor object relationship tracking and fix dynptr UAF bug") Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260605202056.1780352-4-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/states.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 5945956a7573..06d9ae24f006 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -890,6 +890,9 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c return false; switch (old->refs[i].type) { case REF_TYPE_PTR: + if (!check_ids(old->refs[i].parent_id, cur->refs[i].parent_id, idmap)) + return false; + break; case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: -- cgit v1.2.3 From ac7f6c9da6b6b46bba34a45c51603c81e7d42eb2 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 5 Jun 2026 13:20:55 -0700 Subject: bpf: Remove WARN_ON_ONCE in check_ids() check_ids() warned when it ran out of idmap slots, assuming this was impossible because the slots are bounded by the number of registers and stack slots. That assumption no longer holds: referenced dynptrs acquire an intermediate reference that lives in refs[] but is not backed by any register or stack slot [0], so a program can accumulate more reference ids than the idmap can hold and exhaust it. Exhaustion is fine for verification correctness. check_ids() already returns false, which makes the states compare as not equivalent and prevents unsound pruning. The only effect of the WARN_ON_ONCE() is log noise, or a panic under panic_on_warn. Drop the warning and keep returning false. 
[0] 308c7a0ae885 ("bpf: Refactor object relationship tracking and fix dynptr UAF bug") Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260605202056.1780352-5-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/states.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 06d9ae24f006..32f346ce3ffc 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -343,8 +343,12 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) return true; } - /* We ran out of idmap slots, which should be impossible */ - WARN_ON_ONCE(1); + /* + * idmap slots are bounded by the number of registers and stack slots. + * Referenced dynptrs, however, acquire intermediate references that do + * not live in either, so the map can be exhausted. Since that is unlikely, + * simply treat the states as not equivalent. + */ return false; } -- cgit v1.2.3 From d83d4f63cb8f92aa6254dfc001eac0e41f5b2c35 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 5 Jun 2026 13:20:56 -0700 Subject: selftests/bpf: Use bpf_dynptr_slice() to read file dynptr in leak test use_file_dynptr_slice_after_put_file() reads the dynptr via bpf_dynptr_data(), which always returns NULL for a read-only file dynptr, making the example confusing. Switch to bpf_dynptr_slice(), the correct read API for file dynptrs, and read (rather than write) the slice since it is read-only. The test still fails as expected.
Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260605202056.1780352-6-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/file_reader_fail.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/file_reader_fail.c b/tools/testing/selftests/bpf/progs/file_reader_fail.c index d5fae5e4cf9a..3bb9e2612f8f 100644 --- a/tools/testing/selftests/bpf/progs/file_reader_fail.c +++ b/tools/testing/selftests/bpf/progs/file_reader_fail.c @@ -87,7 +87,8 @@ int use_file_dynptr_slice_after_put_file(void *ctx) struct task_struct *task = bpf_get_current_task_btf(); struct file *file = bpf_get_task_exe_file(task); struct bpf_dynptr dynptr; - char *data; + char buf[1]; + const char *data; if (!file) return 0; @@ -95,15 +96,14 @@ int use_file_dynptr_slice_after_put_file(void *ctx) if (bpf_dynptr_from_file(file, 0, &dynptr)) goto out; - data = bpf_dynptr_data(&dynptr, 0, 1); + data = bpf_dynptr_slice(&dynptr, 0, buf, sizeof(buf)); if (!data) goto out; /* this should fail - file dynptr should be discarded first to prevent resource leak */ bpf_put_file(file); - *data = 'x'; - return 0; + return data[0]; out: bpf_dynptr_file_discard(&dynptr); -- cgit v1.2.3 From 4a7910ee060d8ce55612f5b3cc267f3a265a3cec Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Fri, 5 Jun 2026 17:41:43 +0800 Subject: bpf: Clear rb node linkage when freeing bpf_rb_root bpf_rb_root_free() detaches the root by copying the current rb_root_cached and then replacing the live root with RB_ROOT_CACHED. It then walks the copied root and drops each object contained in the tree. This leaves the rb node state intact while dropping the object. If the object is refcounted and survives the drop, its bpf_rb_node_kern still contains an owner pointer to the freed root and stale rb tree linkage. 
If a later bpf_rb_root allocation reuses the same address, bpf_rbtree_remove() can incorrectly pass the owner check and call rb_erase_cached() on a node whose rb pointers belong to the old tree. Mirror the list draining behavior by marking nodes as busy while the root is being detached, then clear the rb node and release the owner before dropping the containing object. This makes surviving nodes unowned and safe to reject from remove or accept for a later add. Fixes: 9c395c1b99bd ("bpf: Add basic bpf_rb_{root,node} support") Signed-off-by: Kaitao Cheng Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260605094143.5509-1-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 03004e4451f5..8ba2b8965caf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2306,6 +2306,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock) { struct rb_root_cached orig_root, *root = rb_root; + struct bpf_rb_node_kern *node; struct rb_node *pos, *n; void *obj; @@ -2314,14 +2315,20 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, __bpf_spin_lock_irqsave(spin_lock); orig_root = *root; + bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { + node = rb_entry(pos, struct bpf_rb_node_kern, rb_node); + WRITE_ONCE(node->owner, BPF_PTR_POISON); + } *root = RB_ROOT_CACHED; __bpf_spin_unlock_irqrestore(spin_lock); bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { obj = pos; obj -= field->graph_root.node_offset; - - + node = rb_entry(pos, struct bpf_rb_node_kern, rb_node); + RB_CLEAR_NODE(pos); + /* Ensure __bpf_rbtree_add() sees the node as unlinked. 
*/ + smp_store_release(&node->owner, NULL); __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } -- cgit v1.2.3 From a6850fa388f6f6ff365b3b72cb71e6d9a8a614ed Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 2 Jun 2026 10:02:50 -0300 Subject: selftests/bpf: Add BPF_STRICT_BUILD toggle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Distro kernels often lack BTF types or kernel features required by some BPF selftests, causing the build to abort on the first failure and preventing the remaining tests from running. Add BPF_STRICT_BUILD (default 1) to control build failure tolerance. When set to 0, the PERMISSIVE make variable is assigned a non-empty value that subsequent Makefile rules use to make individual build steps non-fatal. When set to 1 (the default), the build fails on any error, preserving the existing behavior for CI and direct builds. Users can opt in to permissive mode on the command line: make -C tools/testing/selftests \ TARGETS=bpf SKIP_TARGETS= BPF_STRICT_BUILD=0 Suggested-by: Alan Maguire Signed-off-by: Ricardo B. Marlière Link: https://lore.kernel.org/r/20260602-selftests-bpf_misconfig-v12-1-27f898b3ba26@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index bc049620c774..75036c1b5c4f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -44,6 +44,12 @@ SKIP_LLVM ?= SKIP_LIBBFD ?= SKIP_CRYPTO ?= +# When BPF_STRICT_BUILD is 1, any BPF object, skeleton, test object, or +# benchmark compilation failure is fatal. Set to 0 to tolerate failures +# and continue building the remaining tests. 
+BPF_STRICT_BUILD ?= 1 +PERMISSIVE := $(filter 0,$(BPF_STRICT_BUILD)) + ifeq ($(srctree),) srctree := $(patsubst %/,%,$(dir $(CURDIR))) srctree := $(patsubst %/,%,$(dir $(srctree))) -- cgit v1.2.3 From 9779193e871b144e34ec4a3e50109b3778a51a69 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 2 Jun 2026 10:02:51 -0300 Subject: selftests/bpf: Fix test_kmods KDIR to honor O= and distro kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_kmods/Makefile always pointed KDIR at the kernel source tree root, ignoring O= and KBUILD_OUTPUT. On distro kernels where the source tree has not been built, the Makefile had no fallback and would fail unconditionally. When O= or KBUILD_OUTPUT is set and points at a prepared kernel build directory (one containing Module.symvers), pass it through so kbuild can locate the correct build infrastructure (scripts, Kconfig, etc.). Note that the module artifacts themselves still land in the M= directory, which is test_kmods/; O= only controls where kbuild finds its build infrastructure. Fall back to /lib/modules/$(uname -r)/build when neither an explicit valid build directory nor an in-tree Module.symvers is present. A selftests-only O= value (one that does not contain Module.symvers, e.g. a private output directory) is intentionally not treated as a kernel build directory. Without this guard, a user invoking "make -C tools/testing/selftests/bpf O=/tmp/out" would have test_kmods try to use /tmp/out as the kernel build dir and fail. The parent bpf/Makefile resolves O= and KBUILD_OUTPUT to absolute paths before invoking the test_kmods sub-make. Without this, $(abspath ...) inside test_kmods/Makefile would resolve relative paths against the sub-make's CWD (test_kmods/) rather than the user's invocation directory. When O= is passed to kbuild, also pass KBUILD_OUTPUT=$(KMOD_O_VALID) explicitly. 
The parent invocation lifts KBUILD_OUTPUT into MAKEFLAGS as a command-line variable, which would otherwise suppress kbuild's own "KBUILD_OUTPUT := $(O)" assignment and cause it to use the inherited KBUILD_OUTPUT instead of the validated O=. Guard both all and clean against a missing KDIR so the step is silently skipped rather than fatal. Make the parent Makefile's cp conditional so it does not abort when modules were not built. Signed-off-by: Ricardo B. Marlière Link: https://lore.kernel.org/r/20260602-selftests-bpf_misconfig-v12-2-27f898b3ba26@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 10 +++++---- tools/testing/selftests/bpf/test_kmods/Makefile | 30 ++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 75036c1b5c4f..e912526a65dd 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -296,13 +296,15 @@ $(OUTPUT)/sign-file: ../../../../scripts/sign-file.c # subst() turns the rule into a pattern matching rule $(addprefix test_kmods/,$(subst .ko,%ko,$(TEST_KMODS))): $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard test_kmods/Makefile test_kmods/*.[ch]) $(Q)$(RM) test_kmods/*.ko test_kmods/*.mod.o # force re-compilation - $(Q)$(MAKE) $(submake_extras) -C test_kmods \ - RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) \ + $(Q)$(MAKE) $(submake_extras) -C test_kmods \ + $(if $(O),O=$(abspath $(O))) \ + $(if $(KBUILD_OUTPUT),KBUILD_OUTPUT=$(abspath $(KBUILD_OUTPUT)))\ + RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) \ EXTRA_CFLAGS='' EXTRA_LDFLAGS='' $(TEST_KMOD_TARGETS): $(addprefix test_kmods/,$(TEST_KMODS)) $(call msg,MOD,,$@) - $(Q)cp test_kmods/$(@F) $@ + $(Q)$(if $(PERMISSIVE),if [ -f test_kmods/$(@F) ]; then )cp test_kmods/$(@F) $@$(if $(PERMISSIVE),; fi) DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool @@ -718,7 +720,7 @@ $(TRUNNER_LIB_OBJS): $(TRUNNER_OUTPUT)/%.o:$(TOOLSDIR)/lib/%.c 
$(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) ifneq ($2:$(OUTPUT),:$(shell pwd)) $$(call msg,EXT-COPY,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES)) - $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ + $(Q)rsync -aq $(if $(PERMISSIVE),--ignore-missing-args) $$^ $(TRUNNER_OUTPUT)/ endif # some X.test.o files have runtime dependencies on Y.bpf.o files diff --git a/tools/testing/selftests/bpf/test_kmods/Makefile b/tools/testing/selftests/bpf/test_kmods/Makefile index 63c4d3f6a12f..031c7454ce65 100644 --- a/tools/testing/selftests/bpf/test_kmods/Makefile +++ b/tools/testing/selftests/bpf/test_kmods/Makefile @@ -1,5 +1,16 @@ TEST_KMOD_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) -KDIR ?= $(abspath $(TEST_KMOD_DIR)/../../../../..) +SRCTREE_KDIR := $(abspath $(TEST_KMOD_DIR)/../../../../..) +# Honor O=/KBUILD_OUTPUT only if they point at a prepared kernel build +# directory (one containing Module.symvers); otherwise treat the value as a +# selftests-only output directory and fall back to in-tree or distro headers. +# The parent bpf/Makefile resolves O=/KBUILD_OUTPUT to absolute paths before +# invoking this sub-make so relative paths still anchor to the user's +# invocation directory. +KMOD_O := $(or $(O),$(KBUILD_OUTPUT)) +KMOD_O_VALID := $(if $(KMOD_O),$(if $(wildcard $(KMOD_O)/Module.symvers),$(KMOD_O))) +KDIR ?= $(if $(KMOD_O_VALID),$(SRCTREE_KDIR), \ + $(if $(wildcard $(SRCTREE_KDIR)/Module.symvers),$(SRCTREE_KDIR), \ + /lib/modules/$(shell uname -r)/build)) ifeq ($(V),1) Q = @@ -14,8 +25,21 @@ $(foreach m,$(MODULES),$(eval obj-m += $(m:.ko=.o))) CFLAGS_bpf_testmod.o = -I$(src) +# When BPF_STRICT_BUILD != 0, a missing KDIR is fatal (the default). +# When permissive, skip silently. 
+PERMISSIVE := $(filter 0,$(BPF_STRICT_BUILD)) + all: - $(Q)$(MAKE) -C $(KDIR) M=$(TEST_KMOD_DIR) modules +ifeq ($(PERMISSIVE),) + $(Q)$(MAKE) -C $(KDIR) $(if $(KMOD_O_VALID),O=$(KMOD_O_VALID) KBUILD_OUTPUT=$(KMOD_O_VALID),KBUILD_OUTPUT=) \ + M=$(TEST_KMOD_DIR) modules +else ifneq ("$(wildcard $(KDIR))", "") + $(Q)$(MAKE) -C $(KDIR) $(if $(KMOD_O_VALID),O=$(KMOD_O_VALID) KBUILD_OUTPUT=$(KMOD_O_VALID),KBUILD_OUTPUT=) \ + M=$(TEST_KMOD_DIR) modules +endif clean: - $(Q)$(MAKE) -C $(KDIR) M=$(TEST_KMOD_DIR) clean +ifneq ("$(wildcard $(KDIR))", "") + $(Q)$(MAKE) -C $(KDIR) $(if $(KMOD_O_VALID),O=$(KMOD_O_VALID) KBUILD_OUTPUT=$(KMOD_O_VALID),KBUILD_OUTPUT=) \ + M=$(TEST_KMOD_DIR) clean +endif -- cgit v1.2.3 From c476bdf27657c6ea4a447c18de169c7bdcdd419d Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 2 Jun 2026 10:02:52 -0300 Subject: selftests/bpf: Tolerate BPF and skeleton generation failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some BPF programs cannot be built on distro kernels because required BTF types or features are missing. A single failure currently aborts the selftests/bpf build. Make BPF object and skeleton generation best effort in permissive mode: emit SKIP-BPF or SKIP-SKEL to stderr, remove failed outputs so downstream rules can detect absence, and continue with remaining tests. Apply the same tolerance to linked skeletons (TRUNNER_BPF_SKELS_LINKED), which depend on multiple .bpf.o files and abort the build when any dependency is missing. Note that progress messages (GEN-SKEL, LINK-BPF) are also redirected to stderr as a side effect of rewriting the recipes into single-shell pipelines; the $(call msg,...) macro is a make-recipe construct that cannot be used inside an &&-chained shell command sequence. Signed-off-by: Ricardo B. 
Marlière Link: https://lore.kernel.org/r/20260602-selftests-bpf_misconfig-v12-3-27f898b3ba26@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 108 +++++++++++++++++++++++------------ 1 file changed, 73 insertions(+), 35 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e912526a65dd..dc1f4a4a3582 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -485,22 +485,26 @@ $(OUTPUT)/cgroup_getset_retval_hooks.o: cgroup_getset_retval_hooks.h # $4 - binary name define CLANG_BPF_BUILD_RULE $(call msg,CLNG-BPF,$4,$2) - $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v3 -o $2 + $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v3 -o $2 $(if $(PERMISSIVE),|| \ + ($(RM) $2; printf ' %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2)) endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE $(call msg,CLNG-BPF,$4,$2) - $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v2 -o $2 + $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v2 -o $2 $(if $(PERMISSIVE),|| \ + ($(RM) $2; printf ' %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2)) endef # Similar to CLANG_BPF_BUILD_RULE, but with cpu-v4 define CLANG_CPUV4_BPF_BUILD_RULE $(call msg,CLNG-BPF,$4,$2) - $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v4 -o $2 + $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v4 -o $2 $(if $(PERMISSIVE),|| \ + ($(RM) $2; printf ' %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2)) endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE $(call msg,GCC-BPF,$4,$2) - $(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 + $(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 $(if $(PERMISSIVE),|| \ + ($(RM) $2; printf ' %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2)) endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c @@ -607,47 +611,81 @@ 
$(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ $$($$<-$2-CFLAGS),$(TRUNNER_BINARY)) $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) - $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$< - $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o) - $(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) - $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$@ - $(Q)$$(BPFTOOL) gen subskeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$(@:.skel.h=.subskel.h) - $(Q)rm -f $$(<:.o=.linked1.o) $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) + $(Q)$(if $(PERMISSIVE),if [ ! -f $$< ]; then \ + $$(RM) $$@ $$(@:.skel.h=.subskel.h); \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + exit 0; \ + fi;) \ + printf ' %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY)] $$(notdir $$@)' 1>&2; \ + $$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$< && \ + $$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o) && \ + $$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o) && \ + diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) && \ + $$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$@ && \ + $$(BPFTOOL) gen subskeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$(@:.skel.h=.subskel.h) $(if $(PERMISSIVE),|| { \ + $$(RM) $$@ $$(@:.skel.h=.subskel.h); \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + }) && \ + rm -f $$(<:.o=.linked1.o) $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) $(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) - $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) - $(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) - $(Q)$$(BPFTOOL) 
gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ - $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) + $(Q)$(if $(PERMISSIVE),if [ ! -f $$< ]; then \ + $$(RM) $$@; \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + exit 0; \ + fi;) \ + printf ' %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY)] $$(notdir $$@)' 1>&2; \ + $$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< && \ + $$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) && \ + $$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) && \ + diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) && \ + $$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(if $(PERMISSIVE),|| { \ + $$(RM) $$@; \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + }) && \ + rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) $(TRUNNER_BPF_LSKELS_SIGNED): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) - $$(call msg,GEN-SKEL,$(TRUNNER_BINARY) (signed),$$@) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) - $(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) - $(Q)$$(BPFTOOL) gen skeleton $(LSKEL_SIGN) $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ - $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) + $(Q)$(if $(PERMISSIVE),if [ ! 
-f $$< ]; then \ + $$(RM) $$@; \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + exit 0; \ + fi;) \ + printf ' %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY) (signed)] $$(notdir $$@)' 1>&2; \ + $$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< && \ + $$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) && \ + $$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) && \ + diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) && \ + $$(BPFTOOL) gen skeleton $(LSKEL_SIGN) $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(if $(PERMISSIVE),|| { \ + $$(RM) $$@; \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + }) && \ + rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) $(LINKED_BPF_OBJS): %: $(TRUNNER_OUTPUT)/% # .SECONDEXPANSION here allows to correctly expand %-deps variables as prerequisites .SECONDEXPANSION: $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_OUTPUT)/%: $$$$(%-deps) $(BPFTOOL) | $(TRUNNER_OUTPUT) - $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.bpf.o)) - $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) - $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) - $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked3.o) $$(@:.skel.h=.linked2.o) - $(Q)diff $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) - $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) - $(Q)$$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ - $(Q)$$(BPFTOOL) gen subskeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$(@:.skel.h=.subskel.h) - $(Q)rm -f $$(@:.skel.h=.linked1.o) $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) + $(Q)$(if $(PERMISSIVE),for f in $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)); do \ + if [ ! 
-f $$$$f ]; then \ + $$(RM) $$@ $$(@:.skel.h=.subskel.h); \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + exit 0; \ + fi; \ + done;) \ + printf ' %-12s %s\n' 'LINK-BPF' '[$(TRUNNER_BINARY)] $$(notdir $$(@:.skel.h=.bpf.o))' 1>&2; \ + $$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) && \ + $$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) && \ + $$(BPFTOOL) gen object $$(@:.skel.h=.linked3.o) $$(@:.skel.h=.linked2.o) && \ + diff $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) && \ + printf ' %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY)] $$(notdir $$@)' 1>&2 && \ + $$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ && \ + $$(BPFTOOL) gen subskeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$(@:.skel.h=.subskel.h) $(if $(PERMISSIVE),|| { \ + $$(RM) $$@ $$(@:.skel.h=.subskel.h); \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + }) && \ + rm -f $$(@:.skel.h=.linked1.o) $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) # When the compiler generates a %.d file, only skel basenames (not # full paths) are specified as prerequisites for corresponding %.o -- cgit v1.2.3 From a97bfc9aae076f49f0bcad713bde02b87553b995 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 2 Jun 2026 10:02:53 -0300 Subject: selftests/bpf: Avoid rebuilds when running emit_tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit emit_tests is used while installing selftests to generate the kselftest list. Pulling in .d files for this goal can trigger BPF rebuild rules and mix build output into list generation. Skip dependency file inclusion for emit_tests, like clean goals, so list generation stays side-effect free. 
Also add emit_tests to NON_CHECK_FEAT_TARGETS so that feature detection is skipped; without this, Makefile.feature's $(info) output leaks into stdout and corrupts the test list captured by the top-level selftests Makefile. Signed-off-by: Ricardo B. Marlière Link: https://lore.kernel.org/r/20260602-selftests-bpf_misconfig-v12-4-27f898b3ba26@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index dc1f4a4a3582..49f6a5503e84 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -170,7 +170,7 @@ endef include ../lib.mk -NON_CHECK_FEAT_TARGETS := clean docs-clean +NON_CHECK_FEAT_TARGETS := clean docs-clean emit_tests CHECK_FEAT := $(filter-out $(NON_CHECK_FEAT_TARGETS),$(or $(MAKECMDGOALS), "none")) ifneq ($(CHECK_FEAT),) FEATURE_USER := .selftests @@ -732,7 +732,7 @@ $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_BPF_SKELS_LINKED) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) -ifeq ($(filter clean docs-clean,$(MAKECMDGOALS)),) +ifeq ($(filter clean docs-clean emit_tests,$(MAKECMDGOALS)),) include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) endif -- cgit v1.2.3 From 5498e47741c8a742f730bf9996234bdae1c08ccc Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 2 Jun 2026 10:02:54 -0300 Subject: selftests/bpf: Make skeleton headers order-only prerequisites of .test.d MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .test.d dependency files are generated by the C preprocessor and list the headers each test file actually #includes. Skeleton headers appear in those generated lists, so the .test.o -> .skel.h dependency is already tracked by the .d file content. 
Making skeletons order-only prerequisites of .test.d means that a missing or skipped skeleton does not prevent .test.d generation, and regenerating a skeleton does not force .test.d to be recreated. This avoids unnecessary recompilation and, more importantly, avoids build errors when a skeleton was intentionally skipped due to a BPF compilation failure. $$(BPFOBJ) is intentionally kept as a normal prerequisite: a libbpf rebuild legitimately invalidates .test.d, since libbpf header changes can affect the headers .test.o sees. Only the skeleton headers are moved to order-only. Note that adding a new BPF skeleton via a modified existing local header still works correctly: GNU make builds order-only prerequisites that do not exist (the order-only qualifier only suppresses timestamp-driven rebuilds, not existence-driven builds), so a brand-new .skel.h listed in TRUNNER_BPF_SKELS is generated even when .test.d is otherwise up to date. The modified local header invalidates .test.o through the previously included .d content, forcing a recompile that regenerates .test.d with the new .skel.h dependency captured by gcc -MMD. Signed-off-by: Ricardo B. 
Marlière Link: https://lore.kernel.org/r/20260602-selftests-bpf_misconfig-v12-5-27f898b3ba26@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 49f6a5503e84..09de69a81112 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -726,11 +726,11 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ $(TRUNNER_EXTRA_HDRS) \ + $$(BPFOBJ) | $(TRUNNER_OUTPUT) \ $(TRUNNER_BPF_SKELS) \ $(TRUNNER_BPF_LSKELS) \ $(TRUNNER_BPF_LSKELS_SIGNED) \ - $(TRUNNER_BPF_SKELS_LINKED) \ - $$(BPFOBJ) | $(TRUNNER_OUTPUT) + $(TRUNNER_BPF_SKELS_LINKED) ifeq ($(filter clean docs-clean emit_tests,$(MAKECMDGOALS)),) include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) -- cgit v1.2.3 From 9c4de137a9a5280c95515e83e97838826603ea93 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 2 Jun 2026 10:02:55 -0300 Subject: selftests/bpf: Tolerate test file compilation failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Individual test files may fail to compile when headers or kernel features required by that test are absent. Currently this aborts the entire build. Make the per-test compilation non-fatal: remove the output object on failure and print a SKIP-TEST marker to stderr. Guard the BTFIDS post-processing step so it is skipped when the object file is absent. The linker step will later ignore absent objects, allowing the remaining tests to build and run. Group cd and CC in a sub-shell so a cd failure cannot leak into the error-handling branch and operate in the original working directory; use $@ (absolute path) for $(RM) so it cannot match an unrelated file there. Replace the $(call msg,...) 
in the BTFIDS block with a plain printf (the msg macro expands to @printf, which is a make-recipe construct and is invalid inside a shell if-then-fi body) and gate the printf on $(filter 1,$(V)) so verbose mode (V=1) does not double-print the line that the recipe shell already echoes; non-verbose modes (V unset, V=0, V=2, ...) still print the BTFIDS marker, matching the convention of the shared msg macro. Restrict tolerance to test_progs and its flavors via an inlined $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),...)) check: runners with strong cross-object references (e.g. test_maps) would link-fail with a partial object set, so they keep strict semantics even when BPF_STRICT_BUILD=0. The check is inlined rather than stored in a helper variable so $1 is substituted at $(call) time and the per-runner result is baked into each recipe. Note on bisectability: this change is gated entirely behind PERMISSIVE for test_progs%, so default builds (BPF_STRICT_BUILD!=0) compile and run identically at every commit in the series. Bisecting in PERMISSIVE mode at this commit still requires the next two patches ("selftests/bpf: Skip tests whose objects were not built" and "selftests/bpf: Allow test_progs to link with a partial object set") to avoid the linker rejecting missing objects and the runtime aborting on NULL function pointers. Signed-off-by: Ricardo B. 
Marlière Link: https://lore.kernel.org/r/20260602-selftests-bpf_misconfig-v12-6-27f898b3ba26@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 09de69a81112..7739799c2566 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -588,6 +588,12 @@ endef # $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER_RULES +# Permissive build behaviour (skip-on-failure compile, partial-link) only +# applies to test_progs and its flavors; runners that use strong cross-object +# references (e.g. test_maps) keep strict semantics even when permissive. +# The check is inlined per-runner so $1 is substituted at $(call) time and +# the result is baked into each rule's recipe. + ifeq ($($(TRUNNER_OUTPUT)-dir),) $(TRUNNER_OUTPUT)-dir := y $(TRUNNER_OUTPUT): @@ -717,11 +723,14 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_TESTS_DIR)/%.c \ | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) - $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + $(Q)(cd $$(@D) && $$(CC) -I. 
$$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F)) $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),|| \ + ($(RM) $$@; printf ' %-12s %s\n' 'SKIP-TEST' '$$(notdir $$@)' 1>&2))) $$(if $$(TEST_NEEDS_BTFIDS), \ - $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ + $(Q)if [ -f $$@ ]; then \ + $(if $(filter 1,$(V)),true,printf ' %-8s%s %s\n' "BTFIDS" " [$(TRUNNER_BINARY)]" "$$(notdir $$@)"); \ $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@; \ - $(RESOLVE_BTFIDS) --patch_btfids $$@.BTF_ids $$@) + $(RESOLVE_BTFIDS) --patch_btfids $$@.BTF_ids $$@; \ + fi) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ -- cgit v1.2.3 From aeb73a9f301de4f0df7c858ea465a7a9f5d09fd7 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 2 Jun 2026 10:02:56 -0300 Subject: selftests/bpf: Skip tests whose objects were not built MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When both run_test and run_serial_test are NULL (because the corresponding .test.o was not compiled), mark the test as not built instead of fatally aborting. Report these tests as "SKIP (not built)" in per-test output and include them in the skip count so they remain visible in CI results and JSON output. The summary line shows the not-built count when nonzero: Summary: 50/55 PASSED, 5 SKIPPED (3 not built), 0 FAILED Tests filtered out by -t/-n remain invisible as before; only genuinely unbuilt tests are surfaced. Signed-off-by: Ricardo B. 
Marlière Link: https://lore.kernel.org/r/20260602-selftests-bpf_misconfig-v12-7-27f898b3ba26@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_progs.c | 53 +++++++++++++++++++++++++++----- tools/testing/selftests/bpf/test_progs.h | 1 + 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index cc14b13e23fe..7ba82974ee78 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -165,6 +165,8 @@ struct prog_test_def { void (*run_test)(void); void (*run_serial_test)(void); bool should_run; + bool not_built; + bool selected; bool need_cgroup_cleanup; bool should_tmon; }; @@ -372,6 +374,8 @@ static void print_test_result(const struct prog_test_def *test, const struct tes fprintf(env.stdout_saved, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); if (test_state->error_cnt) fprintf(env.stdout_saved, "FAIL"); + else if (test->not_built) + fprintf(env.stdout_saved, "SKIP (not built)"); else if (!skipped_cnt) fprintf(env.stdout_saved, "OK"); else if (skipped_cnt == subtests_cnt || !subtests_cnt) @@ -1641,6 +1645,7 @@ static void calculate_summary_and_print_errors(struct test_env *env) json_writer_t *w = NULL; for (i = 0; i < prog_test_cnt; i++) { + struct prog_test_def *test = &prog_test_defs[i]; struct test_state *state = &test_states[i]; if (!state->tested) @@ -1651,7 +1656,7 @@ static void calculate_summary_and_print_errors(struct test_env *env) if (state->error_cnt) fail_cnt++; - else + else if (!test->not_built) succ_cnt++; } @@ -1700,8 +1705,13 @@ static void calculate_summary_and_print_errors(struct test_env *env) if (env->json) fclose(env->json); - printf("Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", - succ_cnt, sub_succ_cnt, skip_cnt, fail_cnt); + if (env->not_built_cnt) + printf("Summary: %d/%d PASSED, %d SKIPPED (%d not built), %d FAILED\n", + succ_cnt, sub_succ_cnt, skip_cnt, 
env->not_built_cnt, + fail_cnt); + else + printf("Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", + succ_cnt, sub_succ_cnt, skip_cnt, fail_cnt); env->succ_cnt = succ_cnt; env->sub_succ_cnt = sub_succ_cnt; @@ -1772,6 +1782,19 @@ static void server_main(void) run_one_test(i); } + /* mark not-built tests as skipped */ + for (int i = 0; i < prog_test_cnt; i++) { + struct prog_test_def *test = &prog_test_defs[i]; + struct test_state *state = &test_states[i]; + + if (test->not_built && test->selected) { + state->tested = true; + state->skip_cnt = 1; + env.not_built_cnt++; + print_test_result(test, state); + } + } + /* generate summary */ fflush(stderr); fflush(stdout); @@ -2046,15 +2069,20 @@ int main(int argc, char **argv) struct prog_test_def *test = &prog_test_defs[i]; test->test_num = i + 1; - test->should_run = should_run(&env.test_selector, - test->test_num, test->test_name); + test->selected = should_run(&env.test_selector, + test->test_num, test->test_name); + test->should_run = test->selected; - if ((test->run_test == NULL && test->run_serial_test == NULL) || - (test->run_test != NULL && test->run_serial_test != NULL)) { + if (test->run_test && test->run_serial_test) { fprintf(stderr, "Test %d:%s must have either test_%s() or serial_test_%sl() defined.\n", test->test_num, test->test_name, test->test_name, test->test_name); exit(EXIT_ERR_SETUP_INFRA); } + if (!test->run_test && !test->run_serial_test) { + test->not_built = true; + test->should_run = false; + continue; + } if (test->should_run) test->should_tmon = should_tmon(&env.tmon_selector, test->test_name); } @@ -2106,9 +2134,18 @@ int main(int argc, char **argv) for (i = 0; i < prog_test_cnt; i++) { struct prog_test_def *test = &prog_test_defs[i]; + struct test_state *state = &test_states[i]; - if (!test->should_run) + if (!test->should_run) { + if (test->not_built && test->selected && + !env.get_test_cnt && !env.list_test_names) { + state->tested = true; + state->skip_cnt = 1; + env.not_built_cnt++; + 
print_test_result(test, state); + } continue; + } if (env.get_test_cnt) { env.succ_cnt++; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 37955a8ad385..2cf950afcd85 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -125,6 +125,7 @@ struct test_env { int sub_succ_cnt; /* successful sub-tests */ int fail_cnt; /* total failed tests + sub-tests */ int skip_cnt; /* skipped tests */ + int not_built_cnt; /* tests not built */ int saved_netns_fd; int workers; /* number of worker process */ -- cgit v1.2.3 From af490669fd339988765d87de9dd1b25e62ec64cf Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 2 Jun 2026 10:02:57 -0300 Subject: selftests/bpf: Allow test_progs to link with a partial object set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When individual test files are skipped due to compilation failures, their .test.o files are absent. The linker step currently lists all expected .test.o files as explicit prerequisites, so make considers any missing one an error. In permissive mode, declare the test objects that already exist on disk (via parse-time $(wildcard ...)) as normal prerequisites of the binary so that modifications to a test source still trigger a relink, and keep the full TRUNNER_TEST_OBJS list as order-only prerequisites so that initial fresh builds still produce them and missing objects do not abort the link. The recipe filter is split per mode: in permissive mode it combines a recipe-time $(wildcard ...) (which catches objects freshly produced via the order-only path on a fresh build) with $(filter-out $(TRUNNER_TEST_OBJS),$^) (which keeps the non-test inputs from $^ but drops the parse-time wildcard duplicates). 
This avoids passing the same .test.o twice to the linker while still presenting test objects before libbpf.a so that GNU ld, which scans static archives left-to-right, pulls in archive members referenced exclusively by test objects (e.g. ring_buffer__new from ringbuf.c). In default (strict) mode the recipe remains the simple $(filter %.a %.o,$^) since TRUNNER_TEST_OBJS is part of $^ exactly once. Gate the partial-link behavior on $(if $(filter test_progs%,$1),...) so it only applies to test_progs and its flavors. test_maps and similar runners using strong cross-object references would link-fail with a partial set and intentionally retain strict link semantics. Note: adding a brand-new test_*.c file in permissive mode requires removing the binary (or a clean rebuild) before the new test is linked in, because the parse-time $(wildcard ...) is evaluated when the Makefile is read and will not yet see the new .test.o. This is acceptable since permissive mode targets tolerant CI builds rather than incremental development. Signed-off-by: Ricardo B. 
Marlière Link: https://lore.kernel.org/r/20260602-selftests-bpf_misconfig-v12-8-27f898b3ba26@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7739799c2566..bc845022a7ef 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -773,14 +773,15 @@ endif # some X.test.o files have runtime dependencies on Y.bpf.o files $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) -$(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ +$(OUTPUT)/$(TRUNNER_BINARY): $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),$$(wildcard $(TRUNNER_TEST_OBJS)),$(TRUNNER_TEST_OBJS)),$(TRUNNER_TEST_OBJS)) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ $(TRUNNER_LIB_OBJS) \ $(TRUNNER_BPFTOOL) \ $(OUTPUT)/veristat \ - | $(TRUNNER_BINARY)-extras + | $(TRUNNER_BINARY)-extras \ + $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),$(TRUNNER_TEST_OBJS))) $$(call msg,BINARY,,$$@) - $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) $$(LLVM_LDLIBS) $$(LDFLAGS) $$(LLVM_LDFLAGS) -o $$@ + $(Q)$$(CC) $$(CFLAGS) $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),$$(filter %.a %.o,$$(wildcard $(TRUNNER_TEST_OBJS)) $$(filter-out $(TRUNNER_TEST_OBJS),$$^)),$$(filter %.a %.o,$$^)),$$(filter %.a %.o,$$^)) $$(LDLIBS) $$(LLVM_LDLIBS) $$(LDFLAGS) $$(LLVM_LDFLAGS) -o $$@ $(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \ $(OUTPUT)/$(if $2,$2/)bpftool -- cgit v1.2.3 From f813a4d6877e9197f6e85120c144738e3c1c3b80 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 2 Jun 2026 10:02:58 -0300 Subject: selftests/bpf: Tolerate benchmark build failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Benchmark objects depend on skeletons that may be missing when some BPF programs fail to build. 
In that case, benchmark object compilation or final bench linking should not abort the full selftests/bpf build. Keep both steps non-fatal, emit SKIP-BENCH or SKIP-LINK, and remove failed outputs so stale objects or binaries are not reused by later incremental builds. Note that because bench.c statically references every benchmark via extern symbols, partial linking is not possible: if any single benchmark object fails, the entire bench binary is skipped. This is by design -- the error handler catches all compilation failures including genuine ones, but those are caught by full-config CI runs. Signed-off-by: Ricardo B. Marlière Link: https://lore.kernel.org/r/20260602-selftests-bpf_misconfig-v12-9-27f898b3ba26@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index bc845022a7ef..4eebc15670e3 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -945,7 +945,8 @@ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) # Benchmark runner $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) $(call msg,CC,,$@) - $(Q)$(CC) $(CFLAGS) -O2 -c $(filter %.c,$^) $(LDLIBS) -o $@ + $(Q)$(CC) $(CFLAGS) -O2 -c $(filter %.c,$^) $(LDLIBS) -o $@ $(if $(PERMISSIVE),|| \ + ($(RM) $@; printf ' %-12s %s\n' 'SKIP-BENCH' '$(notdir $@)' 1>&2)) $(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \ @@ -994,7 +995,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/usdt_2.o \ # $(call msg,BINARY,,$@) - $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ + $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ $(if $(PERMISSIVE),|| \ + ($(RM) $@; printf ' %-12s %s\n' 'SKIP-LINK' '$(notdir $@) (some 
benchmarks may have been skipped)' 1>&2)) # This works around GCC warning about snprintf truncating strings like: # -- cgit v1.2.3 From b85e63cb65f96df373b034cc347b0e18231cb0d5 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 2 Jun 2026 10:02:59 -0300 Subject: selftests/bpf: Provide weak definitions for cross-test functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some test files reference functions defined in other translation units that may not be compiled when skeletons are missing. Replace forward declarations of uprobe_multi_func_{1,2,3}() with weak no-op stubs so the linker resolves them regardless of which objects are present. The stub bodies are `asm volatile ("")` rather than empty, matching the shape of the strong definitions in prog_tests/uprobe_multi_test.c. This keeps the weak and strong sides on the same footing for the optimiser (noinline + asm-barrier), which is the form upstream already relies on for these functions. Move stack_mprotect() from test_lsm.c into testing_helpers.c so it is always available. The previous weak-stub approach returned 0, which would cause callers expecting -1/EPERM to fail their assertions deterministically. Having the real implementation in a shared utility avoids this problem entirely. Include <alloca.h> for alloca() so the build does not rely on glibc's implicit declaration via <stdlib.h>. Signed-off-by: Ricardo B. 
Marlière Link: https://lore.kernel.org/r/20260602-selftests-bpf_misconfig-v12-10-27f898b3ba26@suse.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/bpf_cookie.c | 17 +++++++++++------ tools/testing/selftests/bpf/prog_tests/iters.c | 2 -- tools/testing/selftests/bpf/prog_tests/test_lsm.c | 22 ---------------------- tools/testing/selftests/bpf/testing_helpers.c | 18 ++++++++++++++++++ tools/testing/selftests/bpf/testing_helpers.h | 1 + 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 35adc3f6d443..fa484d00a7a5 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -252,10 +252,17 @@ cleanup: kprobe_multi__destroy(skel); } -/* defined in prog_tests/uprobe_multi_test.c */ -void uprobe_multi_func_1(void); -void uprobe_multi_func_2(void); -void uprobe_multi_func_3(void); +/* + * Weak uprobe target stubs. noinline is required because + * uprobe_multi_test_run() takes their addresses to configure the BPF + * program's attachment points; an inlined function has no stable + * address in the binary to probe. The strong definitions in + * uprobe_multi_test.c take precedence when that translation unit is + * linked. 
+ */ +noinline __weak void uprobe_multi_func_1(void) { asm volatile (""); } +noinline __weak void uprobe_multi_func_2(void) { asm volatile (""); } +noinline __weak void uprobe_multi_func_3(void) { asm volatile (""); } static void uprobe_multi_test_run(struct uprobe_multi *skel) { @@ -574,8 +581,6 @@ cleanup: close(fmod_ret_fd); } -int stack_mprotect(void); - static void lsm_subtest(struct test_bpf_cookie *skel) { __u64 cookie; diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index a539980a2fbe..c0b6082f345a 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -202,8 +202,6 @@ cleanup: iters_task__destroy(skel); } -extern int stack_mprotect(void); - static void subtest_css_task_iters(void) { struct iters_css_task *skel = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index bdc4fc06bc5a..d7495efd4a56 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -5,36 +5,14 @@ */ #include -#include #include #include -#include -#include #include "lsm.skel.h" #include "lsm_tailcall.skel.h" char *CMD_ARGS[] = {"true", NULL}; -#define GET_PAGE_ADDR(ADDR, PAGE_SIZE) \ - (char *)(((unsigned long) (ADDR + PAGE_SIZE)) & ~(PAGE_SIZE-1)) - -int stack_mprotect(void) -{ - void *buf; - long sz; - int ret; - - sz = sysconf(_SC_PAGESIZE); - if (sz < 0) - return sz; - - buf = alloca(sz * 3); - ret = mprotect(GET_PAGE_ADDR(buf, sz), sz, - PROT_READ | PROT_WRITE | PROT_EXEC); - return ret; -} - int exec_cmd(int *monitored_pid) { int child_pid, child_status; diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 6fbe1e995660..c970e7793dfc 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -5,6 +5,8 @@ #include 
#include #include +#include +#include #include #include #include "disasm.h" @@ -516,3 +518,19 @@ bool is_jit_enabled(void) return enabled; } + +int stack_mprotect(void) +{ + void *buf; + long sz; + int ret; + + sz = sysconf(_SC_PAGESIZE); + if (sz < 0) + return sz; + + buf = alloca(sz * 3); + ret = mprotect((void *)(((unsigned long)(buf + sz)) & ~(sz - 1)), sz, + PROT_READ | PROT_WRITE | PROT_EXEC); + return ret; +} diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h index 2ca2356a0b58..2edc6fb7fc52 100644 --- a/tools/testing/selftests/bpf/testing_helpers.h +++ b/tools/testing/selftests/bpf/testing_helpers.h @@ -59,5 +59,6 @@ struct bpf_insn; int get_xlated_program(int fd_prog, struct bpf_insn **buf, __u32 *cnt); int testing_prog_flags(void); bool is_jit_enabled(void); +int stack_mprotect(void); #endif /* __TESTING_HELPERS_H */ -- cgit v1.2.3 From 3ca6543464f8f396eee018399b5e266196b0a9a7 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marlière" Date: Tue, 2 Jun 2026 10:03:00 -0300 Subject: selftests/bpf: Tolerate missing files during install MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With partial builds, some TEST_GEN_FILES entries can be absent at install time. rsync treats missing source arguments as fatal and aborts kselftest installation. Override INSTALL_SINGLE_RULE in selftests/bpf to use --ignore-missing-args, while keeping the existing bpf-specific INSTALL_RULE extension logic. Also add --ignore-missing-args to the TEST_INST_SUBDIRS rsync loop so that subdirectories with no .bpf.o files (e.g. when a test runner flavor was skipped) do not abort installation. Note that the INSTALL_SINGLE_RULE override applies globally to all file categories including static source files (TEST_PROGS, TEST_FILES). These are version-controlled and should always be present, so the practical risk is negligible. Signed-off-by: Ricardo B. 
Marlière Link: https://lore.kernel.org/r/20260602-selftests-bpf_misconfig-v12-11-27f898b3ba26@suse.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4eebc15670e3..42d9cf848b25 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1029,12 +1029,23 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ # Delete partially updated (corrupted) files on error .DELETE_ON_ERROR: +# When permissive, tell rsync to ignore missing source arguments so that +# partial builds do not abort installation. +ifneq ($(PERMISSIVE),) +override define INSTALL_SINGLE_RULE + $(if $(INSTALL_LIST),@mkdir -p $(INSTALL_PATH)) + $(if $(INSTALL_LIST),rsync -a --copy-unsafe-links --ignore-missing-args $(INSTALL_LIST) $(INSTALL_PATH)/) +endef +endif + DEFAULT_INSTALL_RULE := $(INSTALL_RULE) override define INSTALL_RULE $(DEFAULT_INSTALL_RULE) - @for DIR in $(TEST_INST_SUBDIRS); do \ - mkdir -p $(INSTALL_PATH)/$$DIR; \ - rsync -a $(OUTPUT)/$$DIR/*.bpf.o $(INSTALL_PATH)/$$DIR;\ + @for DIR in $(TEST_INST_SUBDIRS); do \ + mkdir -p $(INSTALL_PATH)/$$DIR; \ + rsync -a $(if $(PERMISSIVE),--ignore-missing-args) \ + $(OUTPUT)/$$DIR/*.bpf.o \ + $(INSTALL_PATH)/$$DIR; \ done endef -- cgit v1.2.3 From e2a49fdb1beed150125b4104c90eb2a96ec7f63a Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Fri, 5 Jun 2026 23:52:47 +0800 Subject: bpf: Check tail zero of bpf_map_info Since there are 4 bytes of padding at the end of struct bpf_map_info, those bytes are not checked by bpf_check_uarg_tail_zero(). pahole -C bpf_map_info ./vmlinux struct bpf_map_info { ...
__u64 hash __attribute__((__aligned__(8))); /* 88 8 */ __u32 hash_size; /* 96 4 */ /* size: 104, cachelines: 2, members: 18 */ /* padding: 4 */ /* forced alignments: 1 */ /* last cacheline: 40 bytes */ } __attribute__((__aligned__(8))); If a future kernel extension adds a new 4-byte field, older userspace programs allocating this structure on the stack might inadvertently pass uninitialized stack garbage into the new field, permanently breaking backward compatibility. -- sashiko [1] Fix it by changing sizeof(info) to offsetofend(struct bpf_map_info, hash_size). And, add "__u32 :32" to the tail of struct bpf_map_info. [1] https://lore.kernel.org/bpf/20260513224823.6494FC19425@smtp.kernel.org/ Fixes: ea2e6467ac36 ("bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD") Acked-by: Mykyta Yatsenko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260605155249.20772-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 5 +++-- tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bed9b1b4d5ef..e1730f449d9e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6733,6 +6733,7 @@ struct bpf_map_info { __u64 map_extra; __aligned_u64 hash; __u32 hash_size; + __u32 :32; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 31a3b70a0b5d..89f020a44fc9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5406,10 +5406,11 @@ static int bpf_map_get_info_by_fd(struct file *file, { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_map_info info; - u32 info_len = attr->info.info_len; + u32 info_len = attr->info.info_len, len; int err; - err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); + len = offsetofend(struct bpf_map_info, hash_size); + err = 
bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7d0b282ba674..7caf667e86fe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6733,6 +6733,7 @@ struct bpf_map_info { __u64 map_extra; __aligned_u64 hash; __u32 hash_size; + __u32 :32; } __attribute__((aligned(8))); struct bpf_btf_info { -- cgit v1.2.3 From 786be2b05980a5828e67fc564ad7517e2adbe9bd Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Fri, 5 Jun 2026 23:52:48 +0800 Subject: bpf: Check tail zero of bpf_prog_info Since there are 4 bytes of padding at the end of struct bpf_prog_info, those bytes are not checked by bpf_check_uarg_tail_zero(). pahole -C bpf_prog_info ./vmlinux struct bpf_prog_info { ... __u32 attach_btf_obj_id; /* 220 4 */ __u32 attach_btf_id; /* 224 4 */ /* size: 232, cachelines: 4, members: 38 */ /* sum members: 224 */ /* sum bitfield members: 1 bits, bit holes: 1, sum bit holes: 31 bits */ /* padding: 4 */ /* forced alignments: 9 */ /* last cacheline: 40 bytes */ } __attribute__((__aligned__(8))); If a future kernel extension adds a new 4-byte field, older userspace programs allocating this structure on the stack might inadvertently pass uninitialized stack garbage into the new field, permanently breaking backward compatibility. -- sashiko [1] Fix it by changing sizeof(info) to offsetofend(struct bpf_prog_info, attach_btf_id). And, add "__u32 :32" to the tail of struct bpf_prog_info.
[1] https://lore.kernel.org/bpf/20260513224823.6494FC19425@smtp.kernel.org/ Fixes: aba64c7da983 ("bpf: Add verified_insns to bpf_prog_info and fdinfo") Acked-by: Mykyta Yatsenko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260605155249.20772-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 5 +++-- tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e1730f449d9e..d5238df5e5eb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6712,6 +6712,7 @@ struct bpf_prog_info { __u32 verified_insns; __u32 attach_btf_obj_id; __u32 attach_btf_id; + __u32 :32; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 89f020a44fc9..c5d4ae957e87 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5121,10 +5121,11 @@ static int bpf_prog_get_info_by_fd(struct file *file, u32 info_len = attr->info.info_len; struct bpf_prog_kstats stats; char __user *uinsns; - u32 ulen; + u32 ulen, len; int err; - err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); + len = offsetofend(struct bpf_prog_info, attach_btf_id); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7caf667e86fe..3829db087449 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6712,6 +6712,7 @@ struct bpf_prog_info { __u32 verified_insns; __u32 attach_btf_obj_id; __u32 attach_btf_id; + __u32 :32; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3 From d47e67a487bfb6952a7831a6b36b7a90534c6044 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Fri, 5 Jun 2026 23:52:49 +0800 Subject: selftests/bpf: Add tests to verify 
checking padding bytes for bpf_[map,prog]_info Add two tests to verify that the tail padding 4 bytes of struct bpf_map_info and bpf_prog_info are checked in syscall.c using bpf_check_uarg_tail_zero(). Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260605155249.20772-4-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_attr_size.c | 55 ++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c b/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c index 32159dc64da8..87842c4347a6 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c @@ -62,8 +62,63 @@ cleanup: cgroup_skb_direct_packet_access__destroy(skel); } +static void test_map_info_tail_zero(void) +{ + LIBBPF_OPTS(bpf_map_create_opts, map_opts); + struct bpf_map_info_fake { + __u8 info[offsetofend(struct bpf_map_info, hash_size)]; + __u32 pad; + } info = { + .pad = 1, + }; + int map_fd, err; + __u32 info_len; + + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "arr", sizeof(int), 1, 1, &map_opts); + if (!ASSERT_GE(map_fd, 0, "bpf_map_create")) + return; + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(map_fd, &info, &info_len); + ASSERT_EQ(err, -E2BIG, "bpf_obj_get_info_by_fd"); + + close(map_fd); +} + +static void test_prog_info_tail_zero(void) +{ + LIBBPF_OPTS(bpf_prog_load_opts, prog_opts); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + struct bpf_prog_info_fake { + __u8 info[offsetofend(struct bpf_prog_info, attach_btf_id)]; + __u32 pad; + } info = { + .pad = 1, + }; + int prog_fd, err; + __u32 info_len; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "test_prog", "GPL", insns, + ARRAY_SIZE(insns), &prog_opts); + if (!ASSERT_GE(prog_fd, 0, "bpf_prog_load")) + return; + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + 
ASSERT_EQ(err, -E2BIG, "bpf_obj_get_info_by_fd"); + + close(prog_fd); +} + void test_bpf_attr_size(void) { if (test__start_subtest("query_size_boundaries")) test_query_size_boundaries(); + if (test__start_subtest("map_info_tail_zero")) + test_map_info_tail_zero(); + if (test__start_subtest("prog_info_tail_zero")) + test_prog_info_tail_zero(); } -- cgit v1.2.3 From 557d0cc3f2520feba45360beeafb93203b3230e0 Mon Sep 17 00:00:00 2001 From: Varun R Mallya Date: Wed, 3 Jun 2026 02:28:46 +0530 Subject: selftests/bpf: use host CPU features in JIT disassembler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass the host CPU name and feature string to LLVMCreateDisasmCPUFeatures() instead of using LLVMCreateDisasm(), so the disassembler correctly decodes CPU-specific instructions and extensions such as RISC-V compressed and vector instructions. Signed-off-by: Varun R Mallya Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20260602205847.102825-2-varunrmallya@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/jit_disasm_helpers.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c index 364c557c5115..3558fe10e28c 100644 --- a/tools/testing/selftests/bpf/jit_disasm_helpers.c +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c @@ -96,10 +96,19 @@ static int disasm_one_func(FILE *text_out, uint8_t *image, __u32 len) __u32 *label_pc, pc; int i, cnt, err = 0; char buf[64]; + char *cpu, *features; triple = LLVMGetDefaultTargetTriple(); - ctx = LLVMCreateDisasm(triple, &labels, 0, NULL, lookup_symbol); - if (!ASSERT_OK_PTR(ctx, "LLVMCreateDisasm")) { + + cpu = LLVMGetHostCPUName(); + features = LLVMGetHostCPUFeatures(); + + ctx = LLVMCreateDisasmCPUFeatures(triple, cpu, features, &labels, 0, NULL, lookup_symbol); + + LLVMDisposeMessage(cpu); + LLVMDisposeMessage(features); + + if 
(!ASSERT_OK_PTR(ctx, "LLVMCreateDisasmCPUFeatures")) { err = -EINVAL; goto out; } -- cgit v1.2.3 From 6d13ddb1d46525931d2324d9358721eb3c495d72 Mon Sep 17 00:00:00 2001 From: Varun R Mallya Date: Wed, 3 Jun 2026 02:28:47 +0530 Subject: bpf, riscv: inline bpf_get_current_task() and bpf_get_current_task_btf() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On RISC-V, the current task pointer is stored in the thread pointer register (tp). Emit a single `mv a5, tp` instead of a full helper call for BPF_FUNC_get_current_task and BPF_FUNC_get_current_task_btf. Register bpf_jit_inlines_helper_call() entries for both helpers so the verifier treats them as inlined, and add the expected `mv a5, tp` annotation to the riscv64 selftests. The following show changes before and after this patch. Before patch: auipc t1,0x817a # load upper PC-relative address jalr -2004(t1) # call bpf_get_current_task helper mv a5,a0 # move return value to BPF_REG_0 After patch: mv a5,tp # directly: a5 = current (tp = thread pointer) Benchmark (bpf_prog_test_run wrapping bpf_get_current_task in loop, batch=100, 10s, QEMU RISC-V): | runs/sec | helper-calls/sec | ns/call -------------+-----------+------------------+--------- Before patch | 173,490 | 17,349,090 | 57 After patch | 320,497 | 32,049,780 | 31 -------------+-----------+------------------+--------- Improvement | +84.7% | +84.7% | -45.6% Signed-off-by: Varun R Mallya Acked-by: Björn Töpel Link: https://lore.kernel.org/r/20260602205847.102825-3-varunrmallya@gmail.com Signed-off-by: Alexei Starovoitov --- arch/riscv/net/bpf_jit_comp64.c | 9 +++++++++ tools/testing/selftests/bpf/progs/verifier_jit_inline.c | 2 ++ 2 files changed, 11 insertions(+) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 2f1109dbf105..e2c70c70cca8 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1808,6 +1808,13 @@ int bpf_jit_emit_insn(const struct bpf_insn 
*insn, struct rv_jit_context *ctx, break; } + /* Implement helper call to bpf_get_current_task/_btf() inline */ + if (insn->src_reg == 0 && (insn->imm == BPF_FUNC_get_current_task || + insn->imm == BPF_FUNC_get_current_task_btf)) { + emit_mv(bpf_to_rv_reg(BPF_REG_0, ctx), RV_REG_TP, ctx); + break; + } + mark_call(ctx); ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, &addr, &fixed_addr); @@ -2138,6 +2145,8 @@ bool bpf_jit_inlines_helper_call(s32 imm) { switch (imm) { case BPF_FUNC_get_smp_processor_id: + case BPF_FUNC_get_current_task: + case BPF_FUNC_get_current_task_btf: return true; default: return false; diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c index 885ff69a3a62..76d80605ec7f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c +++ b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c @@ -10,6 +10,8 @@ __arch_x86_64 __jited(" addq %gs:{{.*}}, %rax") __arch_arm64 __jited(" mrs x8, SP_EL0") +__arch_riscv64 +__jited(" mv a5, tp") int inline_bpf_get_current_task(void) { bpf_get_current_task(); -- cgit v1.2.3 From 82d7d0adbc678064543e9d254864f6b4ea4a388c Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 2 Jun 2026 23:09:30 +0800 Subject: bpf: Update transport_header when encapsulating UDP tunnel in lwt Currently, bpf_lwt_push_ip_encap() does not update skb->transport_header. When a driver, e.g. ice, reuses the stale skb->transport_header to offload checksum computation to NIC hardware, VxLAN packets encapsulated by bpf_lwt_push_encap() helper may be dropped due to incorrect checksum. Update skb->transport_header in bpf_lwt_push_ip_encap() whenever the encapsulated packet uses UDP, so checksum offload works correctly. 
Fixes: 52f278774e79 ("bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap") Cc: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260602150931.49629-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- net/core/lwt_bpf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index f71ef82a5f3d..bf588f508b79 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -599,6 +599,7 @@ static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) { + bool is_udp_tunnel; struct iphdr *iph; bool ipv4; int err; @@ -612,10 +613,16 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) ipv4 = true; if (unlikely(len < iph->ihl * 4)) return -EINVAL; + is_udp_tunnel = iph->protocol == IPPROTO_UDP; + if (unlikely(is_udp_tunnel && len < iph->ihl * 4 + sizeof(struct udphdr))) + return -EINVAL; } else if (iph->version == 6) { ipv4 = false; if (unlikely(len < sizeof(struct ipv6hdr))) return -EINVAL; + is_udp_tunnel = ((struct ipv6hdr *)iph)->nexthdr == NEXTHDR_UDP; + if (unlikely(is_udp_tunnel && len < sizeof(struct ipv6hdr) + sizeof(struct udphdr))) + return -EINVAL; } else { return -EINVAL; } @@ -637,6 +644,11 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) if (ingress) skb_postpush_rcsum(skb, iph, len); skb_reset_network_header(skb); + if (is_udp_tunnel) { + size_t iph_sz = ipv4 ? 
iph->ihl * 4 : sizeof(struct ipv6hdr); + + skb_set_transport_header(skb, skb_network_offset(skb) + iph_sz); + } memcpy(skb_network_header(skb), hdr, len); bpf_compute_data_pointers(skb); skb_clear_hash(skb); -- cgit v1.2.3 From 5477d55f351fea3eeb2c5c77a9224eed0fd4d6a9 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 2 Jun 2026 23:09:31 +0800 Subject: selftests/bpf: Add tests to verify the fix of encapsulating VxLAN in lwt Add two tests to verify the transport header of skb has been set when encapsulating VxLAN using the bpf_lwt_push_encap() helper. 1. VxLAN over IPv4. 2. VxLAN over IPv6. Without the fix, the tests would fail: lwt_ip_encap_vxlan:FAIL:transport_hdr offset unexpected transport_hdr offset: actual 70 != expected 20 #208 lwt_ip_encap_vxlan_ipv4:FAIL lwt_ip_encap_vxlan:FAIL:transport_hdr offset unexpected transport_hdr offset: actual 110 != expected 40 #209 lwt_ip_encap_vxlan_ipv6:FAIL The unexpected offsets are: outer encap headers (IPv4: iphdr+udp+vxlan+eth = 50 bytes, IPv6: ipv6hdr+udp+vxlan+eth = 70 bytes) plus the inner IP header (20 or 40 bytes), because without the fix transport_header still points at the inner transport layer instead of the outer UDP header.
Assisted-by: Claude:claude-sonnet-4-6 Cc: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260602150931.49629-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/lwt_ip_encap.c | 145 +++++++++++++++++++ .../selftests/bpf/progs/test_lwt_ip_encap.c | 155 +++++++++++++++++++-- 2 files changed, 290 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c b/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c index b6391af5f6f9..6606f0ed9a9a 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c @@ -3,6 +3,7 @@ #include "network_helpers.h" #include "test_progs.h" +#include "test_lwt_ip_encap.skel.h" #define BPF_FILE "test_lwt_ip_encap.bpf.o" @@ -32,6 +33,9 @@ #define IP6_ADDR_8 "fb08::1" #define IP6_ADDR_GRE "fb10::1" +#define IP4_ADDR_VXLAN "172.16.17.100" +#define IP6_ADDR_VXLAN "fb11::1" + #define IP6_ADDR_SRC IP6_ADDR_1 #define IP6_ADDR_DST IP6_ADDR_4 @@ -538,3 +542,144 @@ void test_lwt_ip_encap_ipv4(void) if (test__start_subtest("ingress")) lwt_ip_encap(IPV4_ENCAP, INGRESS, ""); } + +/* + * VxLAN Setup/topology: + * + * NS1 (IP*_ADDR_1) NS2 NS3 (IP*_ADDR_4) + * [ping src] + * | top route + * veth1 (LWT encap) <<-- veth2 veth3 <<-- veth4 (ping dst) + * | ^ + * (bottom route) | (inner pkt) + * v bottom route | + * veth5 -->> veth6 veth7 -->> veth8 (vxlan decap) + * (IP*_ADDR_VXLAN) + * + * Add the VxLAN endpoint addresses to NS3's veth8, create standard + * VxLAN decap devices bound to those addresses, and install routes so + * NS1/NS2 can reach the endpoints via the bottom route. NS2 here is to + * make sure the LWT-encap VxLAN packets are routed to NS3 correctly. 
+ */ +static int setup_vxlan_routes(const char *ns3, const char *ns1, const char *ns2) +{ + struct nstoken *nstoken; + + nstoken = open_netns(ns3); + if (!ASSERT_OK_PTR(nstoken, "open ns3 for vxlan")) + return -1; + + SYS(fail_close, "ip a add %s/32 dev veth8", IP4_ADDR_VXLAN); + SYS(fail_close, "ip -6 a add %s/128 dev veth8", IP6_ADDR_VXLAN); + /* + * Standard VxLAN devices to decap the encapsulated packets. The inner + * Ethernet frame uses a broadcast dst MAC so the IP stack accepts it + * without ARP or FDB configuration. + */ + SYS(fail_close, "ip link add vxlan4 type vxlan id 1 dstport 4789 local %s dev veth8 nolearning noudpcsum", + IP4_ADDR_VXLAN); + SYS(fail_close, "ip link set vxlan4 up"); + SYS(fail_close, "ip link add vxlan6 type vxlan id 1 dstport 4789 local %s dev veth8 nolearning udp6zerocsumrx", + IP6_ADDR_VXLAN); + SYS(fail_close, "ip link set vxlan6 up"); + close_netns(nstoken); + + SYS(fail, "ip -n %s route add %s/32 dev veth5 via %s", + ns1, IP4_ADDR_VXLAN, IP4_ADDR_6); + SYS(fail, "ip -n %s route add %s/32 dev veth7 via %s", + ns2, IP4_ADDR_VXLAN, IP4_ADDR_8); + SYS(fail, "ip -n %s -6 route add %s/128 dev veth5 via %s", + ns1, IP6_ADDR_VXLAN, IP6_ADDR_6); + SYS(fail, "ip -n %s -6 route add %s/128 dev veth7 via %s", + ns2, IP6_ADDR_VXLAN, IP6_ADDR_8); + return 0; + +fail_close: + close_netns(nstoken); +fail: + return -1; +} + +static void lwt_ip_encap_vxlan(bool ipv4_encap) +{ + char ns1[NETNS_NAME_SIZE] = NETNS_BASE "-1-"; + char ns2[NETNS_NAME_SIZE] = NETNS_BASE "-2-"; + char ns3[NETNS_NAME_SIZE] = NETNS_BASE "-3-"; + const char *sec = ipv4_encap ? "encap_vxlan" : "encap_vxlan6"; + int expected_offset = ipv4_encap ? 
(int)sizeof(struct iphdr) + : (int)sizeof(struct ipv6hdr); + struct test_lwt_ip_encap *skel = NULL; + int thdr_offset, err; + + if (!ASSERT_OK(create_ns(ns1, NETNS_NAME_SIZE), "create ns1")) + goto out; + if (!ASSERT_OK(create_ns(ns2, NETNS_NAME_SIZE), "create ns2")) + goto out; + if (!ASSERT_OK(create_ns(ns3, NETNS_NAME_SIZE), "create ns3")) + goto out; + + if (!ASSERT_OK(setup_network(ns1, ns2, ns3, ""), "setup network")) + goto out; + + if (!ASSERT_OK(setup_vxlan_routes(ns3, ns1, ns2), "setup vxlan routes")) + goto out; + + skel = test_lwt_ip_encap__open(); + if (!ASSERT_OK_PTR(skel, "test_lwt_ip_encap__open")) + goto out; + + bpf_program__set_autoload(skel->progs.bpf_lwt_encap_gre, false); + bpf_program__set_autoload(skel->progs.bpf_lwt_encap_gre6, false); + bpf_program__set_autoload(skel->progs.bpf_lwt_encap_vxlan, false); + bpf_program__set_autoload(skel->progs.bpf_lwt_encap_vxlan6, false); + bpf_program__set_autoload(skel->progs.fexit_lwt_push_ip_encap, true); + skel->rodata->tgt_ip_version = ipv4_encap ? 4 : 6; + + err = test_lwt_ip_encap__load(skel); + if (!ASSERT_OK(err, "test_lwt_ip_encap__load")) + goto out; + + err = test_lwt_ip_encap__attach(skel); + if (!ASSERT_OK(err, "test_lwt_ip_encap__attach")) + goto out; + + /* Remove the direct NS2->DST route so packets must go via LWT encap. 
*/ + SYS(out, "ip -n %s route del %s/32 dev veth3", ns2, IP4_ADDR_DST); + SYS(out, "ip -n %s -6 route del %s/128 dev veth3", ns2, IP6_ADDR_DST); + + if (ipv4_encap) + SYS(out, "ip -n %s route add %s encap bpf xmit obj %s sec %s dev veth1", + ns1, IP4_ADDR_DST, BPF_FILE, sec); + else + SYS(out, "ip -n %s -6 route add %s encap bpf xmit obj %s sec %s dev veth1", + ns1, IP6_ADDR_DST, BPF_FILE, sec); + + skel->bss->fexit_triggered = false; + + if (ipv4_encap) + SYS(out, "ip netns exec %s ping -c 1 -W1 %s", ns1, IP4_ADDR_DST); + else + SYS(out, "ip netns exec %s ping6 -c 1 -W1 %s", ns1, IP6_ADDR_DST); + + if (!ASSERT_TRUE(skel->bss->fexit_triggered, "fexit_triggered")) + goto out; + + thdr_offset = (int)skel->bss->transport_hdr - (int)skel->bss->network_hdr; + ASSERT_EQ(thdr_offset, expected_offset, "transport_hdr offset"); + +out: + test_lwt_ip_encap__destroy(skel); + SYS_NOFAIL("ip netns del %s", ns1); + SYS_NOFAIL("ip netns del %s", ns2); + SYS_NOFAIL("ip netns del %s", ns3); +} + +void test_lwt_ip_encap_vxlan_ipv4(void) +{ + lwt_ip_encap_vxlan(IPV4_ENCAP); +} + +void test_lwt_ip_encap_vxlan_ipv6(void) +{ + lwt_ip_encap_vxlan(IPV6_ENCAP); +} diff --git a/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c index d6cb986e7533..4a934fccf8f5 100644 --- a/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c +++ b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c @@ -1,11 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include "vmlinux.h" #include -#include -#include -#include #include #include +#include struct grehdr { __be16 flags; @@ -64,13 +62,13 @@ int bpf_lwt_encap_gre6(struct __sk_buff *skb) hdr.ip6hdr.nexthdr = 47; /* IPPROTO_GRE */ hdr.ip6hdr.hop_limit = 0x40; /* fb01::1 */ - hdr.ip6hdr.saddr.s6_addr[0] = 0xfb; - hdr.ip6hdr.saddr.s6_addr[1] = 1; - hdr.ip6hdr.saddr.s6_addr[15] = 1; + hdr.ip6hdr.saddr.in6_u.u6_addr8[0] = 0xfb; + hdr.ip6hdr.saddr.in6_u.u6_addr8[1] = 1; + 
hdr.ip6hdr.saddr.in6_u.u6_addr8[15] = 1; /* fb10::1 */ - hdr.ip6hdr.daddr.s6_addr[0] = 0xfb; - hdr.ip6hdr.daddr.s6_addr[1] = 0x10; - hdr.ip6hdr.daddr.s6_addr[15] = 1; + hdr.ip6hdr.daddr.in6_u.u6_addr8[0] = 0xfb; + hdr.ip6hdr.daddr.in6_u.u6_addr8[1] = 0x10; + hdr.ip6hdr.daddr.in6_u.u6_addr8[15] = 1; hdr.greh.protocol = skb->protocol; @@ -82,4 +80,141 @@ int bpf_lwt_encap_gre6(struct __sk_buff *skb) return BPF_LWT_REROUTE; } +#define VXLAN_PORT 4789 +#define VXLAN_FLAGS 0x08000000 +#define VXLAN_VNI 1 + +#define ETH_ALEN 6 /* Octets in one ethernet addr */ +#define ETH_P_IP 0x0800 /* Internet Protocol packet */ +#define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */ + +static const __u8 bcast[ETH_ALEN] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; + +static const __u8 srcmac[ETH_ALEN] = { + 0x02, 0x00, 0x00, 0x00, 0x00, 0x01, +}; + +SEC("encap_vxlan") +int bpf_lwt_encap_vxlan(struct __sk_buff *skb) +{ + struct encap_hdr { + struct iphdr iph; + struct udphdr udph; + struct vxlanhdr vxh; + struct ethhdr eth; + } __attribute__((__packed__)) hdr; + int err; + + memset(&hdr, 0, sizeof(hdr)); + + hdr.iph.ihl = 5; + hdr.iph.version = 4; + hdr.iph.ttl = 0x40; + hdr.iph.protocol = 17; /* IPPROTO_UDP */ + hdr.iph.tot_len = bpf_htons(skb->len + sizeof(hdr)); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + hdr.iph.saddr = 0x640510ac; /* 172.16.5.100 */ + hdr.iph.daddr = 0x641110ac; /* 172.16.17.100 */ +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + hdr.iph.saddr = 0xac100564; /* 172.16.5.100 */ + hdr.iph.daddr = 0xac101164; /* 172.16.17.100 */ +#else +#error "Fix your compiler's __BYTE_ORDER__?!" 
+#endif + + hdr.udph.source = bpf_htons(VXLAN_PORT); + hdr.udph.dest = bpf_htons(VXLAN_PORT); + hdr.udph.len = bpf_htons(skb->len + sizeof(hdr.udph) + sizeof(hdr.vxh) + + sizeof(hdr.eth)); + + hdr.vxh.vx_flags = bpf_htonl(VXLAN_FLAGS); + hdr.vxh.vx_vni = bpf_htonl(VXLAN_VNI << 8); + + __builtin_memcpy(hdr.eth.h_dest, bcast, ETH_ALEN); + __builtin_memcpy(hdr.eth.h_source, srcmac, ETH_ALEN); + hdr.eth.h_proto = bpf_htons(ETH_P_IP); + + err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr)); + if (err) + return BPF_DROP; + + return BPF_LWT_REROUTE; +} + +SEC("encap_vxlan6") +int bpf_lwt_encap_vxlan6(struct __sk_buff *skb) +{ + struct encap_hdr { + struct ipv6hdr ip6hdr; + struct udphdr udph; + struct vxlanhdr vxh; + struct ethhdr eth; + } __attribute__((__packed__)) hdr; + int err; + + memset(&hdr, 0, sizeof(hdr)); + + hdr.ip6hdr.version = 6; + hdr.ip6hdr.nexthdr = 17; /* IPPROTO_UDP */ + hdr.ip6hdr.hop_limit = 0x40; + hdr.ip6hdr.payload_len = bpf_htons(skb->len + sizeof(hdr.udph) + sizeof(hdr.vxh) + + sizeof(hdr.eth)); + /* fb05::1 */ + hdr.ip6hdr.saddr.in6_u.u6_addr8[0] = 0xfb; + hdr.ip6hdr.saddr.in6_u.u6_addr8[1] = 0x05; + hdr.ip6hdr.saddr.in6_u.u6_addr8[15] = 1; + /* fb11::1 */ + hdr.ip6hdr.daddr.in6_u.u6_addr8[0] = 0xfb; + hdr.ip6hdr.daddr.in6_u.u6_addr8[1] = 0x11; + hdr.ip6hdr.daddr.in6_u.u6_addr8[15] = 1; + + hdr.udph.source = bpf_htons(VXLAN_PORT); + hdr.udph.dest = bpf_htons(VXLAN_PORT); + hdr.udph.len = bpf_htons(skb->len + sizeof(hdr.udph) + sizeof(hdr.vxh) + + sizeof(hdr.eth)); + + hdr.vxh.vx_flags = bpf_htonl(VXLAN_FLAGS); + hdr.vxh.vx_vni = bpf_htonl(VXLAN_VNI << 8); + + __builtin_memcpy(hdr.eth.h_dest, bcast, ETH_ALEN); + __builtin_memcpy(hdr.eth.h_source, srcmac, ETH_ALEN); + hdr.eth.h_proto = bpf_htons(ETH_P_IPV6); + + err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr)); + if (err) + return BPF_DROP; + + return BPF_LWT_REROUTE; +} + +volatile const int tgt_ip_version; + +__u16 transport_hdr = 0; +__u16 network_hdr = 0; +bool 
fexit_triggered = false; + +SEC("?fexit/bpf_lwt_push_ip_encap") +int BPF_PROG(fexit_lwt_push_ip_encap, struct sk_buff *skb, void *hdr, u32 len, bool ingress, + int retval) +{ + struct iphdr *iph; + + if (retval || fexit_triggered) + return 0; + + iph = (typeof(iph)) (skb->head + skb->network_header); + if (iph->version != tgt_ip_version) + return 0; + + if ((iph->version == 4 && iph->protocol == 17 /* IPPROTO_UDP */) || + (iph->version == 6 && ((struct ipv6hdr *)iph)->nexthdr == 17 /* IPPROTO_UDP */)) { + fexit_triggered = true; + transport_hdr = skb->transport_header; + network_hdr = skb->network_header; + } + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From a66e3b5bacf38d6ab29fa05a9754f7a114485605 Mon Sep 17 00:00:00 2001 From: Dawei Feng Date: Wed, 3 Jun 2026 18:53:15 +0800 Subject: bpf: NUL-terminate replaced sysctl value When writing to sysctls, proc_sys_call_handler() guarantees that the buffer passed to proc handlers is NUL-terminated. If bpf_sysctl_set_new_value() replaces the pending sysctl value, it can hand a replacement buffer directly to proc handlers. However, the helper currently copies only buf_len bytes into that buffer without appending a NUL terminator, leaving downstream parsers vulnerable to out-of-bounds access. Fix this by appending a '\0' after the replaced value to restore the expected sysctl semantics. Since the helper already rejects buf_len greater than PAGE_SIZE - 1, there is always room for the extra byte. Reproduced in a QEMU x86_64 guest booted with KASAN while exercising the sysctl replacement path with a cgroup/sysctl BPF program. The reproducer targets `/proc/sys/net/core/flow_limit_cpu_bitmap`, fills the original user write buffer with non-zero bytes, and overrides the sysctl value so the replacement buffer lacks a terminating NUL. 
Under that setup, the pre-fix kernel reported: BUG: KASAN: slab-out-of-bounds in strnchrnul+0x72/0x90 Read of size 1 at addr ffff88800de57000 by task repro_patch3/66 CPU: 0 UID: 0 PID: 66 Comm: repro_patch3 Not tainted 7.1.0-rc3-00269-g8370ca1f87cc #6 PREEMPT(lazy) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: dump_stack_lvl+0x68/0xa0 print_report+0xcb/0x5e0 ? __virt_addr_valid+0x21d/0x3f0 ? strnchrnul+0x72/0x90 ? strnchrnul+0x72/0x90 kasan_report+0xca/0x100 ? strnchrnul+0x72/0x90 strnchrnul+0x72/0x90 bitmap_parse+0x37/0x2e0 flow_limit_cpu_sysctl+0xc6/0x840 ? __pfx_flow_limit_cpu_sysctl+0x10/0x10 ? __kvmalloc_node_noprof+0x5ba/0x870 proc_sys_call_handler+0x31d/0x480 ? __pfx_proc_sys_call_handler+0x10/0x10 ? selinux_file_permission+0x39f/0x500 ? lock_is_held_type+0x9e/0x120 vfs_write+0x98e/0x1000 ... The buggy address is located 0 bytes to the right of allocated 4096-byte region [ffff88800de56000, ffff88800de57000) With this fix applied, rerunning the same sysctl-targeted path yields no corresponding KASAN reports. 
Signed-off-by: Zilin Guan Signed-off-by: Dawei Feng Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260603105317.944304-2-dawei.feng@seu.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 2c2bdaa86aa7..3fe87b9d368e 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2343,6 +2343,7 @@ BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, return -E2BIG; memcpy(ctx->new_val, buf, buf_len); + ((char *)ctx->new_val)[buf_len] = '\0'; ctx->new_len = buf_len; ctx->new_updated = 1; -- cgit v1.2.3 From 4c21b5927d4364bfe7365f2700da5fea0ed0d004 Mon Sep 17 00:00:00 2001 From: Dawei Feng Date: Wed, 3 Jun 2026 18:53:16 +0800 Subject: bpf: use kvfree() for replaced sysctl write buffer proc_sys_call_handler() allocates its temporary sysctl buffer with kvzalloc() and passes it to __cgroup_bpf_run_filter_sysctl(). Since kvzalloc() may fall back to vmalloc() for large allocations, freeing that buffer with kfree() is wrong and can corrupt memory. Use kvfree() to safely handle both kmalloc and kvzalloc()/vmalloc allocations. The bug was first flagged by an experimental analysis tool we are developing for kernel memory-management bugs while analyzing v6.13-rc1. The tool is still under development and is not yet publicly available. Manual inspection confirms that the bug is still present in v7.1-rc5. Reproduced the bug based on v7.1-rc4 in a QEMU x86_64 guest booted with KASAN and CONFIG_FAILSLAB enabled. To exercise the replacement path, the test tree also included the accompanying fix for the stale ret == 1 check in __cgroup_bpf_run_filter_sysctl(). The reproducer confines failslab injections to the proc_sys_call_handler() range, uses stacktrace-depth=32, and injects fail-nth=1 while writing 8191 bytes to /proc/sys/kernel/domainname from a task in the target cgroup. 
Under that setup, fail-nth=1 triggered the fault: BUG: unable to handle page fault for address: ffffeb0200024d48 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 SMP KASAN NOPTI CPU: 2 UID: 0 PID: 209 Comm: repro_proc_sys_ Not tainted 7.1.0-rc4-00686-g97625979a5d4 PREEMPT(lazy) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 RIP: 0010:kfree+0x6e/0x510 ... Call Trace: ? __cgroup_bpf_run_filter_sysctl+0x626/0xc30 __cgroup_bpf_run_filter_sysctl+0x74d/0xc30 ? __pfx___cgroup_bpf_run_filter_sysctl+0x10/0x10 ? srso_return_thunk+0x5/0x5f ? __kvmalloc_node_noprof+0x345/0x870 ? proc_sys_call_handler+0x250/0x480 ? srso_return_thunk+0x5/0x5f proc_sys_call_handler+0x3a2/0x480 ? __pfx_proc_sys_call_handler+0x10/0x10 ? srso_return_thunk+0x5/0x5f ? selinux_file_permission+0x39f/0x500 ? srso_return_thunk+0x5/0x5f ? lock_is_held_type+0x9e/0x120 vfs_write+0x98e/0x1000 ... With this fix applied on top of the same test setup, rerunning the reproducer with fail-nth=1 yields no corresponding Oops reports. 
Fixes: 4508943794ef ("proc: use kvzalloc for our kernel buffer") Cc: stable@vger.kernel.org Reviewed-by: Emil Tsalapatis Reviewed-by: Jiayuan Chen Acked-by: Yonghong Song Signed-off-by: Zilin Guan Signed-off-by: Dawei Feng Link: https://lore.kernel.org/r/20260603105317.944304-3-dawei.feng@seu.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 3fe87b9d368e..4bf0ec94e719 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1937,7 +1937,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); if (ret == 1 && ctx.new_updated) { - kfree(*buf); + kvfree(*buf); *buf = ctx.new_val; *pcount = ctx.new_len; } else { -- cgit v1.2.3 From 2566c3b24219c5b30e35205cba029ff34ff7c78b Mon Sep 17 00:00:00 2001 From: Dawei Feng Date: Wed, 3 Jun 2026 18:53:17 +0800 Subject: bpf: Restore sysctl new-value from 1 to 0 Commit 4e63acdff864 ("bpf: Introduce bpf_sysctl_{get,set}_new_value helpers") changed the success return value to 0, but failed to update the corresponding check in __cgroup_bpf_run_filter_sysctl(). Since bpf_prog_run_array_cg() now returns 0 on success, the legacy ret == 1 condition is never satisfied. As a result, the modified value is ignored, and bpf_sysctl_set_new_value() fails to replace the write buffer. Fix this by checking for a return value of 0 instead, so cgroup/sysctl programs can correctly replace the pending sysctl buffer. This bug was discovered during a manual code review. Tested via a cgroup/sysctl BPF reproducer overriding writes to a target sysctl. Pre-fix, bpf_sysctl_set_new_value("foo") was silently ignored: the write returned 8192 and the value remained "600". Post-fix, the BPF replacement buffer properly propagates: the write returns 3 and the value updates to "foo". 
Fixes: f10d05966196 ("bpf: Make BPF_PROG_RUN_ARRAY return -err instead of allow boolean") Cc: stable@vger.kernel.org Acked-by: Yonghong Song Signed-off-by: Zilin Guan Signed-off-by: Dawei Feng Reviewed-by: Jiayuan Chen Acked-by: Xu Kuohai Link: https://lore.kernel.org/r/20260603105317.944304-4-dawei.feng@seu.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4bf0ec94e719..35d1f1428ef3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1936,7 +1936,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); - if (ret == 1 && ctx.new_updated) { + if (!ret && ctx.new_updated) { kvfree(*buf); *buf = ctx.new_val; *pcount = ctx.new_len; -- cgit v1.2.3 From 6fa2839893e3db43566e623f12805daeca64d9c4 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 5 Jun 2026 14:02:41 +0000 Subject: selftests/bpf: Restrict bpf_set_retval argument in sk_bypass_prot_mem Test sk_bypass_prot_mem passes an unchecked value as argument to helper bpf_set_retval(). The argument can be outside the valid range enforced by the strict retval validation added in the next patch. Restrict the argument to -EFAULT when it is outside the valid range, so the test will not be rejected by the verifier when retval validation is enforced. 
Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260605140243.664590-2-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c b/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c index 09a00d11ffcc..bae5283fca6b 100644 --- a/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c +++ b/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c @@ -5,6 +5,7 @@ #include #include #include +#include "err.h" extern int tcp_memory_per_cpu_fw_alloc __ksym; extern int udp_memory_per_cpu_fw_alloc __ksym; @@ -97,6 +98,7 @@ int sock_create(struct bpf_sock *ctx) return 1; err: + set_if_not_errno_or_zero(err, -EFAULT); bpf_set_retval(err); return 0; } -- cgit v1.2.3 From b1f7f67b74c2e29e9c83f7952290a3c7f156bf9a Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 5 Jun 2026 14:02:42 +0000 Subject: bpf: Add validation for bpf_set_retval argument The bpf_set_retval() helper is used by cgroup BPF programs to set the return value of the target hook. The argument type for this helper is ARG_ANYTHING. This allows setting a positive value, which no cgroup hook expects and can cause issues, such as: - BPF_LSM_CGROUP: a positive value from bpf_lsm_socket_create bypasses the err < 0 check in __sock_create(), leaving the socket object unallocated. The positive return value is then propagated to the syscall entry __sys_socket(), which also bypasses the IS_ERR() guard and ultimately causes a NULL pointer dereference. - BPF_CGROUP_DEVICE: a positive value can be returned through cgroup device bpf prog -> devcgroup_check_permission() -> bdev_permission() -> bdev_file_open_by_dev(), where ERR_PTR(positive) produces a pointer that IS_ERR() does not catch, leading to a wild pointer dereference. 
- BPF_CGROUP_SOCK: a positive value can be returned through cgroup sock bpf prog -> __cgroup_bpf_run_filter_sk() -> inet_create() -> __sock_create(), where inet_create() frees the newly allocated sk via sk_common_release() and sets sock->sk = NULL on the non-zero return, but __sock_create() only checks err < 0 for cleanup, so a positive retval bypasses cleanup and returns a socket with NULL sk to userspace, triggering a NULL pointer dereference on subsequent socket operations. - BPF_CGROUP_SYSCTL: a positive value can be returned through the cgroup bpf prog -> __cgroup_bpf_run_filter_sysctl() -> proc_sys_call_handler(), where a non-zero return bypasses the normal sysctl proc_handler and is returned directly to userspace as return value of read() or write() syscall. So add validation for the argument of the bpf_set_retval() helper. For BPF_LSM_CGROUP, enforce the LSM hook specific range returned by bpf_lsm_get_retval_range(). For all other cgroup program types, restrict the argument to [-MAX_ERRNO, 0], which matches the kernel convention of 0 for success and negative errno for error. BPF_CGROUP_GETSOCKOPT is an exception, since valid getsockopt implementations may return positive values, as allowed by commit c4dcfdd406aa ("bpf: Move getsockopt retval to struct bpf_cg_run_ctx"). Also refine the return value range of bpf_get_retval() so that values returned by bpf_get_retval() can be passed directly to bpf_set_retval() without extra manual bounds checking. 
Fixes: b44123b4a3dc ("bpf: Add cgroup helpers bpf_{get,set}_retval to get/set syscall return value") Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Reported-by: Quan Sun <2022090917019@std.uestc.edu.cn> Closes: https://lore.kernel.org/all/567d3206-74a5-44e5-99c6-779c425f399e@std.uestc.edu.cn Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260605140243.664590-3-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3b874bbbaac0..935595138aa0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9770,7 +9770,9 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, int func_id, struct bpf_call_arg_meta *meta) { + struct bpf_retval_range range; struct bpf_reg_state *ret_reg = &regs[BPF_REG_0]; + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); if (ret_type != RET_INTEGER) return 0; @@ -9790,6 +9792,29 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, reg_set_urange32(ret_reg, 0, nr_cpu_ids - 1); reg_bounds_sync(ret_reg); break; + case BPF_FUNC_get_retval: + /* + * bpf_get_retval may see arbitrary value passed by bpf_prog_run_array_cg for + * CGROUP_GETSOCKOPT type. 
+ */ + if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT && + env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT) + break; + + if (prog_type == BPF_PROG_TYPE_LSM && + env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (!env->prog->aux->attach_func_proto->type) + break; + bpf_lsm_get_retval_range(env->prog, &range); + } else { + range.minval = -MAX_ERRNO; + range.maxval = 0; + } + + reg_set_srange64(ret_reg, range.minval, range.maxval); + reg_set_srange32(ret_reg, range.minval, range.maxval); + reg_bounds_sync(ret_reg); + break; } return reg_bounds_sanity_check(env, ret_reg, "retval"); @@ -10259,6 +10284,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_set_retval: + { + struct bpf_retval_range range = { + .minval = -MAX_ERRNO, + .maxval = 0, + .return_32bit = true + }; + struct bpf_reg_state *r1 = &regs[BPF_REG_1]; + + if (r1->type != SCALAR_VALUE) { + verbose(env, "R1 is not a scalar\n"); + return -EINVAL; + } + + /* CGROUP_GETSOCKOPT is allowed to return arbitrary value */ + if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT && + env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT) + break; + if (prog_type == BPF_PROG_TYPE_LSM && env->prog->expected_attach_type == BPF_LSM_CGROUP) { if (!env->prog->aux->attach_func_proto->type) { @@ -10268,8 +10311,20 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } + bpf_lsm_get_retval_range(env->prog, &range); } + + err = mark_chain_precision(env, BPF_REG_1); + if (err) + return err; + + if (!retval_range_within(range, r1)) { + verbose_invalid_scalar(env, r1, range, "At bpf_set_retval", "R1"); + return -EINVAL; + } + break; + } case BPF_FUNC_dynptr_write: { enum bpf_dynptr_type dynptr_type = meta.dynptr.type; -- cgit v1.2.3 From 7913cdb54ee3271f608ad518bf8e75ad72cc3a3d Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 5 
Jun 2026 14:02:43 +0000 Subject: selftests/bpf: Add tests for bpf_set_retval validation Add verifier tests to validate bpf_set_retval argument for cgroup program types. Reviewed-by: Emil Tsalapatis #v1 Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260605140243.664590-4-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_set_retval.c | 107 +++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_set_retval.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 219ff2969868..89779d897aba 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -117,6 +117,7 @@ #include "verifier_xdp.skel.h" #include "verifier_xdp_direct_packet_access.skel.h" #include "verifier_bits_iter.skel.h" +#include "verifier_set_retval.skel.h" #include "verifier_lsm.skel.h" #include "verifier_jit_inline.skel.h" #include "irq.skel.h" @@ -266,6 +267,7 @@ void test_verifier_xadd(void) { RUN(verifier_xadd); } void test_verifier_xdp(void) { RUN(verifier_xdp); } void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); } void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } +void test_verifier_set_retval(void) { RUN(verifier_set_retval); } void test_verifier_lsm(void) { RUN(verifier_lsm); } void test_irq(void) { RUN(irq); } void test_verifier_mtu(void) { RUN(verifier_mtu); } diff --git a/tools/testing/selftests/bpf/progs/verifier_set_retval.c b/tools/testing/selftests/bpf/progs/verifier_set_retval.c new file mode 100644 index 000000000000..1415cd15cede --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_set_retval.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "bpf_misc.h" + 
+SEC("lsm_cgroup/socket_create") +__description("lsm_cgroup bpf_set_retval success") +__success +int BPF_PROG(lsm_cgroup_set_retval_zero_valid, int family, int type, int protocol, int kern) +{ + bpf_set_retval(0); + return 0; +} + +SEC("lsm_cgroup/socket_create") +__description("lsm_cgroup bpf_set_retval valid errno") +__success +int BPF_PROG(lsm_cgroup_set_retval_negative_valid, int family, int type, int protocol, int kern) +{ + bpf_set_retval(-12); + return 0; +} + +SEC("lsm_cgroup/socket_create") +__description("lsm_cgroup bpf_set_retval invalid negative value") +__failure __msg("should have been in [-4095, 0]") +int BPF_PROG(lsm_cgroup_set_retval_negative_invalid, int family, int type, int protocol, int kern) +{ + bpf_set_retval(-4096); + return 0; +} + +SEC("lsm_cgroup/socket_create") +__description("lsm_cgroup bpf_set_retval invalid positive value") +__failure __msg("should have been in [-4095, 0]") +int BPF_PROG(lsm_cgroup_set_retval_positive_invalid, int family, int type, int protocol, int kern) +{ + bpf_set_retval(1); + return 0; +} + +SEC("cgroup/dev") +__description("cgroup_device bpf_set_retval success") +__success +int cgroup_dev_set_retval_0(struct bpf_cgroup_dev_ctx *ctx) +{ + bpf_set_retval(0); + return 1; +} + +SEC("cgroup/dev") +__description("cgroup_device bpf_set_retval valid errno") +__success +int cgroup_dev_set_retval_neg_maxerrno(struct bpf_cgroup_dev_ctx *ctx) +{ + bpf_set_retval(-4095); + return 1; +} + +SEC("cgroup/dev") +__description("cgroup_device bpf_set_retval invalid positive value") +__failure __msg("should have been in [-4095, 0]") +int cgroup_dev_set_retval_1(struct bpf_cgroup_dev_ctx *ctx) +{ + bpf_set_retval(1); + return 1; +} + +SEC("cgroup/dev") +__description("cgroup_device bpf_set_retval invalid negative value") +__failure __msg("should have been in [-4095, 0]") +int cgroup_dev_set_retval_neg_4096(struct bpf_cgroup_dev_ctx *ctx) +{ + bpf_set_retval(-4096); + return 1; +} + +SEC("cgroup/dev") +__description("bpf_set_retval 
bounds check survives state pruning") +__failure __msg("should have been in [-4095, 0]") +__naked int cgroup_dev_set_retval_pruning_bypass(struct bpf_cgroup_dev_ctx *ctx) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "if r0 != 0 goto 1f;" + "r0 = r0;" + "r0 = r0;" + "r0 = r0;" + "r0 = r0;" + "goto 2f;" + "1:" + "call %[bpf_get_prandom_u32];" + "2:" + "r1 = r0;" + "call %[bpf_set_retval];" + "r0 = 1;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_set_retval) + : __clobber_common + ); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From d5e5745f8a1dfd0d026fe36eb1265268bce4988c Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 5 Jun 2026 16:14:16 +0100 Subject: selftests/bpf: Fix test_lirc test Since commit 68a99f6a0ebf ("media: lirc: report ir receiver overflow"), the rc-loopback driver does not accept edges over 50ms, as these are never seen in real life ir protocols. Fix this. Signed-off-by: Sean Young Link: https://lore.kernel.org/r/20260605151417.777614-1-sean@mess.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c | 4 ++-- tools/testing/selftests/bpf/test_lirc_mode2_user.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c b/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c index 7a6620671a83..cbe4284c032f 100644 --- a/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c +++ b/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c @@ -13,9 +13,9 @@ int bpf_decoder(unsigned int *sample) if (LIRC_IS_PULSE(*sample)) { unsigned int duration = LIRC_VALUE(*sample); - if (duration & 0x10000) + if (duration & 0x1000) bpf_rc_keydown(sample, 0x40, duration & 0xffff, 0); - if (duration & 0x20000) + if (duration & 0x2000) bpf_rc_pointer_rel(sample, (duration >> 8) & 0xff, duration & 0xff); } diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c 
b/tools/testing/selftests/bpf/test_lirc_mode2_user.c index 88e4aeab21b7..cd191da20d14 100644 --- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c +++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c @@ -50,8 +50,8 @@ int main(int argc, char **argv) { struct bpf_object *obj; int ret, lircfd, progfd, inputfd; - int testir1 = 0x1dead; - int testir2 = 0x20101; + int testir1 = 0x1ead; + int testir2 = 0x2101; u32 prog_ids[10], prog_flags[10], prog_cnt; if (argc != 3) { @@ -125,7 +125,7 @@ int main(int argc, char **argv) } if (event.type == EV_MSC && event.code == MSC_SCAN && - event.value == 0xdead) { + event.value == 0x1ead) { break; } } -- cgit v1.2.3 From 6c3e8a4d476521bc33362e90b2569548f1adb7a4 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Fri, 5 Jun 2026 18:20:18 -0400 Subject: selftests/bpf: libarena: Add rbtree data structure Add a native red-black tree data structure to libarena. The data structure supports multiple APIs (key-value based, node based) with which users can query and modify it. The tree uses the libarena memory allocator to manage its data. 
Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260605222020.5231-2-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- .../bpf/libarena/include/libarena/rbtree.h | 83 ++ .../bpf/libarena/selftests/test_rbtree.bpf.c | 968 ++++++++++++++++++ .../selftests/bpf/libarena/src/rbtree.bpf.c | 1047 ++++++++++++++++++++ 3 files changed, 2098 insertions(+) create mode 100644 tools/testing/selftests/bpf/libarena/include/libarena/rbtree.h create mode 100644 tools/testing/selftests/bpf/libarena/selftests/test_rbtree.bpf.c create mode 100644 tools/testing/selftests/bpf/libarena/src/rbtree.bpf.c diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/rbtree.h b/tools/testing/selftests/bpf/libarena/include/libarena/rbtree.h new file mode 100644 index 000000000000..486428911d96 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/libarena/rbtree.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause */ + +#pragma once + +#define RB_MAXLVL_PRINT (16) + +struct rbnode; + +struct rbnode { + struct rbnode __arena *parent; + union { + struct { + struct rbnode __arena *left; + struct rbnode __arena *right; + }; + + struct rbnode __arena *child[2]; + }; + uint64_t key; + /* Used as a linked list or to store KV pairs. */ + union { + struct rbnode __arena *next; + uint64_t value; + }; + bool is_red; +}; + +/* + * Does the rbtree allocate its own nodes, or do they get + * allocated by the caller? + */ +enum rbtree_alloc { + RB_ALLOC, + RB_NOALLOC, +}; + +/* + * Specify the behavior of rbtree insertions when the key is + * already present in the tree. + * + * RB_DEFAULT: Default behavior, reject the new insert. + * + * RB_UPDATE: Update the existing value in the rbtree. + * This updates the node itself, not just the value in + * the existing node. + * + * RB_DUPLICATE: Allow nodes with identical keys in the rbtree. 
+ * Finding/popping/removing a key acts on any of the nodes + * with the appropriate key - there is no ordering by time + * of insertion. + */ +enum rbtree_insert_mode { + RB_DEFAULT, + RB_UPDATE, + RB_DUPLICATE, +}; + +struct rbtree { + struct rbnode __arena *root; + enum rbtree_alloc alloc; + enum rbtree_insert_mode insert; +}; + +#ifdef __BPF__ +struct rbtree __arena *rb_create(enum rbtree_alloc alloc, enum rbtree_insert_mode insert); + +int rb_destroy(struct rbtree __arena *rbtree); +int rb_insert(struct rbtree __arena *rbtree, u64 key, u64 value); +int rb_remove(struct rbtree __arena *rbtree, u64 key); +int rb_find(struct rbtree __arena *rbtree, u64 key, u64 *value); +int rb_print(struct rbtree __arena *rbtree); +int rb_least(struct rbtree __arena *rbtree, u64 *key, u64 *value); +int rb_pop(struct rbtree __arena *rbtree, u64 *key, u64 *value); + +int rb_insert_node(struct rbtree __arena *rbtree, struct rbnode __arena *node); +int rb_remove_node(struct rbtree __arena *rbtree, struct rbnode __arena *node); + +struct rbnode __arena *rb_node_alloc(u64 key, u64 value); +void rb_node_free(struct rbnode __arena *rbnode); + +int rb_integrity_check(struct rbtree __arena *rbtree); + +#endif /* __BPF__ */ diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_rbtree.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/test_rbtree.bpf.c new file mode 100644 index 000000000000..856c484a009a --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/test_rbtree.bpf.c @@ -0,0 +1,968 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause + +#include + +#include +#include + +typedef struct node_ctx __arena *node_ctx; + +struct node_ctx { + struct rbnode rbnode; + node_ctx next; +}; + +static const u64 keys[] = { 51, 43, 37, 3, 301, 46, 383, 990, 776, 729, 871, 96, 189, 213, + 376, 167, 131, 939, 626, 119, 374, 700, 772, 154, 883, 620, 641, 5, + 428, 516, 105, 622, 988, 811, 931, 973, 246, 690, 934, 744, 210, 311, + 32, 255, 960, 830, 523, 429, 541, 738, 
705, 774, 715, 446, 98, 578, + 777, 191, 279, 91, 767 }; + +static const u64 morekeys[] = { 173, 636, 1201, 8642, 5957, 3617, 4586, 8053, 6551, 7592, 1748, 1589, 8644, 9918, 6977, + 4448, 5852, 4640, 9717, 2303, 7424, 7695, 2334, 8876, 8618, 5745, 7134, 2178, 5280, 2140, 1138, + 5083, 8922, 1516, 2437, 2488, 4307, 4329, 5088, 8456, 5938, 1441, 1684, 5750, 721, 1107, 2089, + 9737, 4687, 5016, 4849, 8193, 9603, 9147, 5992, 166, 6721, 812, 4144, 6237, 6509, 3466, 9255, + 7767, 3960, 6759, 2968, 6046, 9784, 8395, 2619, 1711, 528, 6424, 9084, 3179, 1342, 5676, 9445, + 5691, 6678, 8487, 1627, 998, 6178, 2229, 1987, 3319, 572, 169, 2161, 3018, 5439, 7287, 7265, 5995, + 5003, 5857, 2836, 5634, 4735, 9261, 8287, 5359, 533, 1406, 9573, 4026, 714, 3956, 1722, 6395, + 9648, 3887, 7185, 470, 4482, 4997, 841, 8913, 9946, 3999, 9357, 9847, 277, 8184, 8704, 6766, 3323, + 5468, 8638, 7905, 8858, 6142, 3685, 3452, 4689, 8878, 8836, 158, 831, 7914, 3031, 8374, 4921, + 4207, 3460, 5547, 3358, 1083, 4619, 7818, 2962, 4879, 4583, 2172, 8819, 9830, 1194, 2666, 9812, + 5704, 8432, 5916, 6007, 6609, 4791, 1985, 3226, 2478, 9605, 5236, 8079, 3042, 1965, 3539, 9704, + 4267, 6416, 760, 9968, 2983, 1190, 1964, 3211, 2870, 3106, 2794, 1542, 6916, 5986, 9096, 441, + 5894, 8353, 7765, 3757, 5732, 88, 3091, 5637, 6042, 8447, 4073, 6923, 5491, 7010, 3663, 5029, + 6162, 822, 4874, 7491, 5100, 3461, 6983, 2170, 1458, 1856, 648, 6272, 4887, 976, 2369, 5909, 4274, + 3324, 6968, 2312, 2271, 8891, 6268, 6581, 1610, 8880, 6194, 6144, 9764, 6915, 829, 3774, 2265, + 1752, 1314, 6377, 8760, 8004, 501, 4912, 9278, 1425, 9578, 7337, 307, 1885, 3151, 9617, 1647, + 2458, 3702, 6091, 8902, 5663, 9378, 7640, 3336, 557, 1644, 6848, 1559, 8821, 266, 4330, 9790, + 5920, 4222, 1143, 6248, 5792, 4847, 9726, 6303, 821, 6839, 6062, 7133, 3649, 9888, 2528, 1966, + 5456, 4914, 3615, 1543, 3206, 3353, 6097, 2800, 1424, 9094, 7920, 7243, 1394, 5464, 1707, 576, + 6524, 4261, 4187, 7889, 5336, 3377, 2921, 7244, 2766, 6584, 
5514, 1387, 2957, 2258, 1077, 9979, + 1128, 876, 4056, 4668, 4532, 1982, 7093, 4184, 5460, 7588, 4704, 6717, 61, 3959, 1826, 2294, 18, + 8170, 9394, 8796, 7288, 7285, 7143, 148, 6676, 6603, 1051, 8225, 4169, 3230, 7697, 6971, 3454, + 7501, 9514, 394, 2339, 4993, 5606, 6060, 1297, 8273, 3012, 157, 8181, 6765, 7207, 1005, 8833, 1914, + 7456, 1846, 8375, 2741, 2074, 1712, 5286 }; + +SEC("syscall") +__weak int test_rbtree_find_nonexistent(void) +{ + u64 key = 0xdeadbeef; + u64 value = 0; + int ret; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_DEFAULT); + if (!rbtree) + return 1; + + /* Should return -EINVAL */ + ret = rb_find(rbtree, key, &value); + if (!ret) + return 2; + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_insert_existing(void) +{ + u64 key = 525252; + u64 value = 24; + int ret; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_DEFAULT); + if (!rbtree) + return 1; + + ret = rb_insert(rbtree, key, value); + if (ret) + return 2; + + /* Should return -EALREADY. */ + ret = rb_insert(rbtree, key, value); + if (ret != -EALREADY) { + return 3; + } + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_update_existing(void) +{ + u64 key = 33333; + u64 value; + int ret; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_UPDATE); + if (!rbtree) + return 1; + + value = 52; + ret = rb_insert(rbtree, key, value); + if (ret) + return 2; + + ret = rb_find(rbtree, key, &value); + if (ret) + return 3; + + if (value != 52) + return 4; + + value = 65; + + /* Should succeed. */ + ret = rb_insert(rbtree, key, value); + if (ret) + return 5; + + /* Should be updated. 
*/ + ret = rb_find(rbtree, key, &value); + if (ret) + return 6; + + if (value != 65) + return 7; + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_insert_one(void) +{ + u64 key = 202020; + u64 value = 0xbadcafe; + int ret; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_UPDATE); + if (!rbtree) + return 1; + + ret = rb_insert(rbtree, key, value); + if (ret) + return 2; + + ret = rb_find(rbtree, key, &value); + if (ret) + return 3; + + if (value != 0xbadcafe) + return 4; + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_insert_ten(void) +{ + u64 key, value; + int ret, i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_UPDATE); + if (!rbtree) + return 1; + + for (i = 0; i < 10 && can_loop; i++) { + key = keys[i]; + ret = rb_insert(rbtree, key, 2 * key); + if (ret) + return 2 + 3 * i; + + /* Read it back. */ + ret = rb_find(rbtree, key, &value); + if (ret) + return 2 + 3 * i + 1; + + if (value != 2 * key) + return 2 + 3 * i + 2; + } + + /* Go find all inserted pairs. */ + for (i = 0; i < 10 && can_loop; i++) { + key = keys[i]; + + ret = rb_find(rbtree, key, &value); + if (ret) + return 35 + 2 * i; + + if (value != 2 * key) + return 35 + 2 * i + 1; + } + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_duplicate(void) +{ + u64 key = 0x121212; + u64 value; + int ret, i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_DUPLICATE); + if (!rbtree) + return 1; + + for (i = 0; i < 10 && can_loop; i++) { + ret = rb_insert(rbtree, key, 2 * key); + if (ret) + return 2 + 3 * i; + + /* Read it back. */ + ret = rb_find(rbtree, key, &value); + if (ret) + return 2 + 3 * i + 1; + + if (value != 2 * key) + return 2 + 3 * i + 2; + } + + /* Go find all inserted copies and remove them. 
*/ + for (i = 0; i < 10 && can_loop; i++) { + ret = rb_find(rbtree, key, &value); + if (ret) { + rb_print(rbtree); + return 35 + 3 * i; + } + + if (value != 2 * key) + return 35 + 3 * i + 1; + + ret = rb_remove(rbtree, key); + if (ret) + return 35 + 3 * i + 2; + } + + return rb_destroy(rbtree); +} + +static inline int +clean_up_noalloc_tree(struct rbtree __arena *rbtree) +{ + node_ctx nodec; + int ret; + + if (rbtree->alloc != RB_NOALLOC) + return -EINVAL; + + /* Can't destroy an RB_NOALLOC tree that still has nodes. */ + if (rb_destroy(rbtree) != -EBUSY) + return -EINVAL; + + while (rbtree->root && can_loop) { + nodec = (node_ctx)arena_container_of(rbtree->root, struct node_ctx, rbnode); + ret = rb_remove_node(rbtree, &nodec->rbnode); + if (ret) + return ret; + + arena_free(nodec); + } + + return 0; +} + +int insert_many(enum rbtree_alloc alloc, enum rbtree_insert_mode insert) +{ + const size_t numkeys = sizeof(keys) / sizeof(keys[0]); + node_ctx nodec; + u64 key, value; + int ret; + int i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(alloc, insert); + if (!rbtree) + return 1; + + for (i = 0; i < numkeys && can_loop; i++) { + key = keys[i]; + if (rbtree->alloc != RB_ALLOC) { + nodec = arena_malloc(sizeof(*nodec)); + if (!nodec) { + arena_stderr("out of memory\n"); + return -ENOMEM; + } + nodec->rbnode.key = key; + nodec->rbnode.value = 2 * key; + ret = rb_insert_node(rbtree, &nodec->rbnode); + } else { + ret = rb_insert(rbtree, key, 2 * key); + } + if (ret) + return 2 + 3 * i; + + /* Read it back. */ + ret = rb_find(rbtree, key, &value); + if (ret) + return 2 + 3 * i + 1; + + if (value != 2 * key) + return 2 + 3 * i + 2; + } + + /* Go find all inserted pairs. */ + for (i = 0; i < numkeys && can_loop; i++) { + key = keys[i]; + + ret = rb_find(rbtree, key, &value); + if (ret) + return 302 + 2 * i; + + if (value != 2 * key) + return 302 + 2 * i + 1; + } + + /* RB_ALLOC trees are destroyed while still having elements. 
*/ + if (rbtree->alloc == RB_ALLOC) + return rb_destroy(rbtree); + + /* Otherwise manually clean up the tree. */ + if (clean_up_noalloc_tree(rbtree)) + return 5; + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_remove_one(void) +{ + u64 key = 20, value = 5, newvalue; + int ret; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_DEFAULT); + if (!rbtree) + return 1; + + ret = rb_find(rbtree, key, &newvalue); + if (!ret) + return 2; + + ret = rb_insert(rbtree, key, value); + if (ret) + return 3; + + ret = rb_find(rbtree, key, &newvalue); + if (ret || value != newvalue) + return 4; + + ret = rb_remove(rbtree, key); + if (ret) + return 5; + + ret = rb_find(rbtree, key, &newvalue); + if (!ret) + return 6; + + return rb_destroy(rbtree); +} + +static __always_inline int remove_many_verify_all_present(struct rbtree __arena *rbtree) +{ + const size_t numkeys = sizeof(morekeys) / sizeof(morekeys[0]); + u64 value; + int ret; + int i; + + for (i = 0; i < numkeys && can_loop; i++) { + u64 key = morekeys[i]; + + ret = rb_find(rbtree, key, &value); + if (ret) + return -1; + + if (value != 2 * key) + return -1; + } + + return 0; +} + +static __always_inline int remove_many_verify_remaining(struct rbtree __arena *rbtree) +{ + const size_t numkeys = sizeof(morekeys) / sizeof(morekeys[0]); + u64 value; + int ret; + int i; + + for (i = 0; i < numkeys && can_loop; i += 2) { + u64 key = morekeys[i]; + + ret = rb_find(rbtree, key, &value); + if (!ret) + return -1; + + if (i + 1 >= numkeys) + break; + + key = morekeys[i + 1]; + ret = rb_find(rbtree, key, &value); + if (ret) + return -1; + + if (value != 2 * key) + return -1; + } + + for (i = 1; i < numkeys && can_loop; i += 2) { + u64 key = morekeys[i]; + + ret = rb_find(rbtree, key, &value); + if (ret) + return -1; + + if (value != 2 * key) + return -1; + } + + return 0; +} + +static __noinline int remove_many_alloc(struct rbtree __arena *rbtree) +{ + const size_t numkeys = sizeof(morekeys) / 
sizeof(morekeys[0]); + u64 value; + int ret; + int i; + + for (i = 0; i < numkeys && can_loop; i++) { + u64 key = morekeys[i]; + + ret = rb_insert(rbtree, key, 2 * key); + if (ret) + return -1; + + if (rb_integrity_check(rbtree)) { + arena_stderr("iteration %d\n", i); + return -EINVAL; + } + + ret = rb_find(rbtree, key, &value); + if (ret) + return -1; + + if (value != 2 * key) + return -1; + } + + ret = remove_many_verify_all_present(rbtree); + if (ret) + return ret; + + for (i = 0; i < numkeys && can_loop; i += 2) { + u64 key = morekeys[i]; + + ret = rb_remove(rbtree, key); + if (ret) { + arena_stderr("Failed to remove %ld\n", key); + return -1; + } + + ret = rb_find(rbtree, key, &value); + if (!ret) + return -1; + } + + return remove_many_verify_remaining(rbtree); +} + +static __noinline int remove_many_noalloc(struct rbtree __arena *rbtree) +{ + const size_t numkeys = sizeof(morekeys) / sizeof(morekeys[0]); + node_ctx first = NULL, last = NULL; + u64 value; + int ret; + int i; + + for (i = 0; i < numkeys && can_loop; i++) { + u64 key = morekeys[i]; + node_ctx nodec = arena_malloc(sizeof(*nodec)); + + if (!nodec) { + arena_stderr("out of memory\n"); + return -ENOMEM; + } + nodec->rbnode.key = key; + nodec->rbnode.value = 2 * key; + nodec->next = NULL; + + if (!first) + first = nodec; + + if (last) + last->next = nodec; + last = nodec; + + ret = rb_insert_node(rbtree, &nodec->rbnode); + if (ret) + return -1; + + if (rb_integrity_check(rbtree)) { + arena_stderr("iteration %d\n", i); + return -EINVAL; + } + + ret = rb_find(rbtree, key, &value); + if (ret) + return -1; + + if (value != 2 * key) + return -1; + } + + ret = remove_many_verify_all_present(rbtree); + if (ret) + return ret; + + for (i = 0; i < numkeys && can_loop; i += 2) { + u64 key = morekeys[i]; + node_ctx nodec = first; + + if (!nodec || key != nodec->rbnode.key) + return -1; + + first = nodec->next ? 
nodec->next->next : NULL; + ret = rb_remove_node(rbtree, &nodec->rbnode); + if (ret) { + arena_stderr("Failed to remove %ld\n", key); + return -1; + } + + ret = rb_find(rbtree, key, &value); + if (!ret) + return -1; + } + + return remove_many_verify_remaining(rbtree); +} + +static inline int remove_many(enum rbtree_alloc alloc, + enum rbtree_insert_mode insert) +{ + int ret; + struct rbtree __arena *rbtree; + + rbtree = rb_create(alloc, insert); + if (!rbtree) + return -ENOMEM; + + ret = (alloc == RB_ALLOC) ? remove_many_alloc(rbtree) + : remove_many_noalloc(rbtree); + if (ret) + return ret; + + if (alloc == RB_ALLOC) + return rb_destroy(rbtree); + + ret = clean_up_noalloc_tree(rbtree); + if (ret) + return ret; + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_insert_many_update(void) +{ + return insert_many(RB_ALLOC, RB_UPDATE); +} + +SEC("syscall") +__weak int test_rbtree_insert_many_noalloc(void) +{ + return insert_many(RB_NOALLOC, RB_DUPLICATE); +} + +SEC("syscall") +__weak int test_rbtree_remove_many_update(void) +{ + return remove_many(RB_ALLOC, RB_UPDATE); +} + +SEC("syscall") +__weak int test_rbtree_remove_many_noalloc(void) +{ + return remove_many(RB_NOALLOC, RB_DUPLICATE); +} + +SEC("syscall") +__weak int test_rbtree_add_remove_circular(void) +{ + const size_t iters = 60; + const size_t prefill = 10; + const size_t numkeys = 50; + const size_t prefix = 400000; + u64 value, rmval; + int errval = 1; + u64 key; + int ret; + int i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_UPDATE); + if (!rbtree) + return 1; + + for (i = 0; i < prefill && can_loop; i++) { + ret = rb_insert(rbtree, prefix + (i % numkeys), i); + if (ret) + return errval; + + errval += 1; + } + + errval = 2 * 1000 * 1000; + + for (i = 0; i < prefill && can_loop; i++) { + /* Read it back. 
*/ + ret = rb_find(rbtree, prefix + (i % numkeys), &value); + if (ret) + return errval; + + if (value != i) + return errval; + } + + errval = 3 * 1000 * 1000; + + for (i = prefill; i < iters && can_loop; i++) { + key = prefix + (i % numkeys); + + ret = rb_find(rbtree, key, &value); + if (!ret) { + arena_stderr("Key %d already present\n", key); + return errval; + } + + errval += 1; + + ret = rb_insert(rbtree, key, i); + if (ret) { + arena_stderr("ITERATION %d\n", i); + rb_print(rbtree); + return errval; + } + + rmval = i - prefill; + + errval += 1; + + ret = rb_find(rbtree, prefix + (rmval % numkeys), &value); + if (ret) + return errval; + + errval += 1; + + if (value != rmval) + return errval; + + errval += 1; + + ret = rb_remove(rbtree, prefix + (rmval % numkeys)); + if (ret) { + arena_stderr("ITERATION %d\n", i); + return errval; + } + + errval += 1; + } + + for (i = 0; i < numkeys && can_loop; i++) { + rb_remove(rbtree, prefix + i); + } + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_add_remove_circular_reverse(void) +{ + const size_t iters = 110; + const size_t prefill = 10; + const size_t numkeys = 50; + const size_t prefix = 500000; + u64 value, rmval; + int errval = 1; + u64 key; + int ret; + int i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_UPDATE); + if (!rbtree) + return 1; + + for (i = 0; i < prefill && can_loop; i++) { + ret = rb_insert(rbtree, prefix - (i % numkeys), i); + if (ret) + return errval; + + errval += 1; + } + + errval = 2 * 1000 * 1000; + + for (i = 0; i < prefill && can_loop; i++) { + /* Read it back. 
*/ + ret = rb_find(rbtree, prefix - (i % numkeys), &value); + if (ret) + return errval; + + if (value != i) + return errval; + } + + errval = 3 * 1000 * 1000; + + for (i = prefill; i < iters && can_loop; i++) { + key = prefix - (i % numkeys); + + ret = rb_find(rbtree, key, &value); + if (!ret) { + arena_stderr("Key %d already present\n", key); + return errval; + } + + errval += 1; + + ret = rb_insert(rbtree, key, i); + if (ret) { + arena_stderr("error %d on insert\n", ret); + rb_print(rbtree); + return errval; + } + + rmval = i - prefill; + + errval += 1; + + ret = rb_find(rbtree, prefix - (rmval % numkeys), &value); + if (ret) + return errval; + + errval += 1; + + if (value != rmval) + return errval; + + errval += 1; + + ret = rb_remove(rbtree, prefix - (rmval % numkeys)); + if (ret) + return errval; + + errval += 1; + } + + + errval = 4 * 1000 * 1000; + for (i = 0; i < prefill && can_loop; i++) { + ret = rb_remove(rbtree, prefix - i); + if (ret) { + arena_stderr("Did not remove %d, error %d\n", prefix - i, ret); + return errval + i; + } + } + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_least_pop(void) +{ + const size_t keys = 10; + u64 key, value; + int errval = 1; + int ret, i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_DEFAULT); + if (!rbtree) + return errval; + + errval += 1; + + for (i = 0; i < keys / 2 && can_loop; i++) { + ret = rb_insert(rbtree, i, i); + if (ret) + return errval; + + errval += 1; + + ret = rb_insert(rbtree, keys - 1 - i, keys - 1 - i); + if (ret) + return errval; + + errval += 1; + + ret = rb_least(rbtree, &key, &value); + if (ret) + return errval; + + errval += 1; + + if (key != 0 || value != 0) + return errval; + + errval += 1; + } + + errval = 1000; + + for (i = 0; i < keys && can_loop; i++) { + ret = rb_least(rbtree, &key, &value); + if (ret) { + arena_stderr("rb_least failed with %d\n", ret); + return errval; + } + + errval += 1; + + if (key != i || value != i) { + 
arena_stderr("Got KV %ld/%ld expected %d\n", key, value, i); + return errval; + } + + errval += 1; + + ret = rb_pop(rbtree, &key, &value); + if (ret) { + arena_stderr("Error %d during pop on iter %d\n", ret, i); + return errval; + } + + errval += 1; + + if (key != i || value != i) + return errval; + } + + return rb_destroy(rbtree); +} + +/* Reject rb_pop() for RB_NOALLOC trees. */ +SEC("syscall") +__weak int test_rbtree_noalloc_pop(void) +{ + const u64 expect_value = 1; + const u64 expect_key = 0; + struct rbtree __arena *rbtree; + struct rbnode __arena *node; + u64 value = 0; + int ret; + + rbtree = rb_create(RB_NOALLOC, RB_DEFAULT); + if (!rbtree) + return 1; + + node = rb_node_alloc(expect_key, expect_value); + if (!node) { + rb_destroy(rbtree); + return 2; + } + + ret = rb_insert_node(rbtree, node); + if (ret) { + rb_node_free(node); + rb_destroy(rbtree); + return 3; + } + + ret = rb_pop(rbtree, NULL, &value); + if (ret != -EINVAL) + return 4; + + ret = rb_find(rbtree, expect_key, &value); + if (ret) + return 5; + + if (value != expect_value) + return 6; + + ret = rb_remove_node(rbtree, node); + if (ret) + return 7; + + rb_node_free(node); + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_alloc_check(void) +{ + struct rbtree __arena *alloc, *noalloc; + struct rbnode __arena *node; + int ret; + + alloc = rb_create(RB_ALLOC, RB_DEFAULT); + if (!alloc) + return 1; + + noalloc = rb_create(RB_NOALLOC, RB_DEFAULT); + if (!noalloc) + return 2; + + + node = rb_node_alloc(0, 0); + if (!node) + return 3; + + /* + * RB_ALLOC trees can use rb_insert, RB_NOALLOC trees can + * use rb_insert_node. RB_ALLOC and RB_NOALLOC trees cannot + * use each other's APIs. + * + * NOTE: This begs the question, why not different types? We + * want to partially share the API and that would require us + * to duplicate it. 
+ */ + if (rb_insert(alloc, 0, 0)) + return 4; + + if (!rb_insert_node(alloc, node)) + return 5; + + if (!rb_remove_node(alloc, node)) + return 6; + + if (rb_remove(alloc, 0)) + return 7; + + if (rb_insert_node(noalloc, node)) + return 8; + + if (!rb_insert(noalloc, 0, 0)) + return 9; + + if (!rb_remove(noalloc, 0)) + return 10; + + if (rb_remove_node(noalloc, node)) + return 11; + + rb_node_free(node); + + ret = rb_destroy(alloc); + if (ret) + return ret; + + return rb_destroy(noalloc); +} diff --git a/tools/testing/selftests/bpf/libarena/src/rbtree.bpf.c b/tools/testing/selftests/bpf/libarena/src/rbtree.bpf.c new file mode 100644 index 000000000000..7f0f6dc3e17d --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/src/rbtree.bpf.c @@ -0,0 +1,1047 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* + * Copyright (c) 2025-2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025-2026 Emil Tsalapatis + */ + +#include + +#include +#include + +int rb_integrity_check(struct rbtree __arena *rbtree); +void rbnode_print(size_t depth, struct rbnode __arena *rbn); +static int rbnode_replace(struct rbtree __arena *rbtree, + struct rbnode __arena *existing, + struct rbnode __arena *replacement); + +struct rbtree __arena *rb_create(enum rbtree_alloc alloc, + enum rbtree_insert_mode insert) +{ + struct rbtree __arena *rbtree; + + rbtree = arena_malloc(sizeof(*rbtree)); + if (unlikely(!rbtree)) + return NULL; + + /* + * RB_UPDATE overwrites existing values in the nodes, but RB_NOALLOC + * trees manage the tree nodes directly (including holding pointers + * to them). Disallow mixing the two modes to avoid dealing with + * unintuitive semantics. 
+ */ + if (alloc == RB_NOALLOC && insert == RB_UPDATE) { + arena_stderr("WARNING: Cannot combine RB_NOALLOC and RB_UPDATE"); + arena_free(rbtree); + return NULL; + } + + rbtree->alloc = alloc; + rbtree->insert = insert; + rbtree->root = NULL; + + return rbtree; +} + +__weak +int rb_destroy(struct rbtree __arena *rbtree) +{ + int ret = 0; + + arena_subprog_init(); + + if (unlikely(!rbtree)) + return -EINVAL; + + if (rbtree->alloc == RB_NOALLOC) { + /* + * We cannot do anything about RB_NOALLOC nodes. The whole + * point of RB_NOALLOC is that the nodes are directly owned + * by the caller that allocates and inserts them. We could + * unilaterally grab all nodes and free them anyway, but that + * would almost certainly cause UAF as the callers keep accessing + * the now freed nodes. Throw an error instead. + */ + if (rbtree->root) { + arena_stderr("WARNING: Destroying RB_NOALLOC tree with > 0 nodes"); + return -EBUSY; + } + + goto out; + } + + while (rbtree->root && can_loop) { + ret = rb_remove(rbtree, rbtree->root->key); + if (ret) + break; + } + +out: + arena_free(rbtree); + return ret; +} + +static inline int rbnode_dir(struct rbnode __arena *node) +{ + /* Arbitrarily choose a direction for the root. */ + if (unlikely(!node->parent)) + return 0; + + return (node->parent->left == node) ? 0 : 1; +} + +/* + * The __noinline is to prevent inlining from bloating the add + * remove calls, in turn causing register splits and increasing + * stack usage above what is permitted. + */ +__noinline +int rbnode_rotate(struct rbtree __arena *rbtree, + struct rbnode __arena *node, int dir) +{ + struct rbnode __arena *tmp, *parent; + int parentdir; + + parent = node->parent; + if (parent) + parentdir = rbnode_dir(node); + + /* If we're doing a root change, are we the root? */ + if (unlikely(!parent && rbtree->root != node)) + return -EINVAL; + + /* + * Does the node we're turning into the root into exist? 
+ * Note that the new root is on the opposite side of the + * rotation's direction. + */ + tmp = node->child[1 - dir]; + if (unlikely(!tmp)) + return -EINVAL; + + /* Steal the closest child of the new root. */ + node->child[1 - dir] = tmp->child[dir]; + if (node->child[1 - dir]) + node->child[1 - dir]->parent = node; + + /* Put the node below the new root.*/ + tmp->child[dir] = node; + node->parent = tmp; + + tmp->parent = parent; + if (parent) + parent->child[parentdir] = tmp; + else + rbtree->root = tmp; + + return 0; +} + +static +struct rbnode __arena *rbnode_find(struct rbnode __arena *subtree, u64 key) +{ + struct rbnode __arena *node = subtree; + int dir; + + if (!subtree) + return NULL; + + while (can_loop) { + if (node->key == key) + break; + + dir = (key < node->key) ? 0 : 1; + + if (!node->child[dir]) + break; + + node = node->child[dir]; + } + + return node; +} + +static +struct rbnode __arena *rbnode_least_upper_bound(struct rbnode __arena *subtree, uint64_t key) +{ + struct rbnode __arena *node = subtree; + int dir; + + if (!subtree) + return NULL; + + while (can_loop) { + dir = (key <= node->key) ? 0 : 1; + + if (!node->child[dir]) + break; + + node = node->child[dir]; + } + + return node; +} + +__weak +int rb_find(struct rbtree __arena *rbtree, u64 key, u64 *value) +{ + struct rbnode __arena *node; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (unlikely(!value)) + return -EINVAL; + + node = rbnode_find(rbtree->root, key); + if (!node || node->key != key) + return -ENOENT; + + *value = node->value; + + return 0; +} + +__weak +struct rbnode __arena *rb_node_alloc(u64 key, u64 value) +{ + struct rbnode __arena *rbnode = NULL; + + rbnode = (struct rbnode __arena *)arena_malloc(sizeof(*rbnode)); + if (!rbnode) + return NULL; + + /* + * WARNING: The order of assignments is weird on purpose. + * See comment in rb_insert_node() for more context. 
+ * TL;DR: Prevent consecutive 0 assignments from being + * promoted into an unverifiable memset by the compiler. + */ + + rbnode->key = key; + rbnode->parent = NULL; + rbnode->value = value; + rbnode->left = NULL; + rbnode->is_red = true; + rbnode->right = NULL; + + return rbnode; +} + +__weak +void rb_node_free(struct rbnode __arena *rbnode) +{ + arena_free(rbnode); +} + +static +int rb_node_insert(struct rbtree __arena *rbtree, + struct rbnode __arena *node) +{ + struct rbnode __arena *grandparent, *parent = rbtree->root; + u64 key = node->key; + struct rbnode __arena *uncle; + int dir; + int ret; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (!parent) { + rbtree->root = node; + return 0; + } + + if (rbtree->insert != RB_DUPLICATE) + parent = rbnode_find(parent, key); + else + parent = rbnode_least_upper_bound(parent, key); + + if (key == parent->key && rbtree->insert != RB_DUPLICATE) { + if (rbtree->insert == RB_UPDATE) { + /* + * Replace the old node with the new one. + * Free up the old node. + */ + ret = rbnode_replace(rbtree, parent, node); + if (ret) + return ret; + + if (rbtree->alloc == RB_ALLOC) + rb_node_free(parent); + + return 0; + } + + /* Otherwise it's RB_DEFAULT. */ + return -EALREADY; + } + + node->parent = parent; + /* Also works if key == parent->key. */ + if (key <= parent->key) + parent->left = node; + else + parent->right = node; + + while (can_loop) { + parent = node->parent; + if (!parent) + return 0; + + if (!parent->is_red) + return 0; + + grandparent = parent->parent; + if (!grandparent) { + parent->is_red = false; + return 0; + } + + dir = rbnode_dir(parent); + uncle = grandparent->child[1 - dir]; + + if (!uncle || !uncle->is_red) { + if (node == parent->child[1 - dir]) { + rbnode_rotate(rbtree, parent, dir); + node = parent; + parent = grandparent->child[dir]; + } + + rbnode_rotate(rbtree, grandparent, 1 - dir); + parent->is_red = false; + grandparent->is_red = true; + + return 0; + } + + /* Uncle is red. 
*/ + + parent->is_red = false; + uncle->is_red = false; + grandparent->is_red = true; + + node = grandparent; + } + + return 0; +} + +int rb_insert_node(struct rbtree __arena *rbtree, + struct rbnode __arena *node) +{ + if (unlikely(!rbtree)) + return -EINVAL; + + if (unlikely(rbtree->alloc == RB_ALLOC)) + return -EINVAL; + + node->left = NULL; + + /* + * Workaround to break an optimization that causes + * verification failures on some compilers. Assignments + * of the kind + * + * *(r0 + 0) = 0; + * *(r0 + 8) = 0; + * *(r0 + 16) = 0; + * + * get promoted into a memset, and that in turn is not + * handled properly for arena memory by LLVM 21 and GCC 15. + * Add a barrier for now to prevent the assignments from being fused. + */ + barrier(); + + node->parent = NULL; + node->right = NULL; + + node->is_red = true; + + return rb_node_insert(rbtree, node); +} + +__weak +int rb_insert(struct rbtree __arena *rbtree, u64 key, u64 value) +{ + struct rbnode __arena *node; + int ret; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (unlikely(rbtree->alloc != RB_ALLOC)) + return -EINVAL; + + node = rb_node_alloc(key, value); + if (!node) + return -ENOMEM; + + ret = rb_node_insert(rbtree, node); + if (ret) { + rb_node_free(node); + return ret; + } + + return 0; +} + +static inline struct rbnode __arena *rbnode_least(struct rbnode __arena *subtree) +{ + while (subtree->left && can_loop) + subtree = subtree->left; + + return subtree; +} + +__weak int rb_least(struct rbtree __arena *rbtree, u64 *key, u64 *value) +{ + struct rbnode __arena *least; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (!rbtree->root) + return -ENOENT; + + least = rbnode_least(rbtree->root); + if (key) + *key = least->key; + if (value) + *value = least->value; + + return 0; +} + + +/* + * If we are referencing ourselves, a and b have a parent-child relation, + * and we should be pointing at the other node instead. 
+ */ +static inline void rbnode_fixup_pointers(struct rbnode __arena *a, + struct rbnode __arena *b) +{ +#define fixup(n1, n2, member) do { if (n1->member == n1) n1->member = n2; } while (0) + fixup(a, b, left); + fixup(a, b, right); + fixup(a, b, parent); +#undef fixup +} + +static inline void rbnode_swap_values(struct rbnode __arena *a, + struct rbnode __arena *b) +{ +#define swap(n1, n2, tmp) do { (tmp) = (n1); (n1) = (n2); (n2) = (tmp); } while (0) + struct rbnode __arena *tmpnode; + u64 tmp; + + /* Swap the pointers. */ + swap(a->is_red, b->is_red, tmp); + + swap(a->left, b->left, tmpnode); + swap(a->right, b->right, tmpnode); + swap(a->parent, b->parent, tmpnode); +#undef swap + + /* Account for the nodes being parent and child. */ + rbnode_fixup_pointers(b, a); + rbnode_fixup_pointers(a, b); +} + +static inline void rbnode_adjust_neighbors(struct rbtree __arena *rbtree, + struct rbnode __arena *node, int dir) +{ + if (node->left) + node->left->parent = node; + if (node->right) + node->right->parent = node; + + if (node->parent) { + node->parent->child[dir] = node; + return; + } + + rbtree->root = node; +} + +/* + * Directly replace an existing node with a replacement. The replacement node + * should not already be in the tree. + */ +static int rbnode_replace(struct rbtree __arena *rbtree, + struct rbnode __arena *existing, + struct rbnode __arena *replacement) +{ + int dir = 0; + + if (unlikely(replacement->parent || replacement->left || replacement->right)) + return -EINVAL; + + if (existing->parent) + dir = rbnode_dir(existing); + + replacement->is_red = existing->is_red; + replacement->left = existing->left; + replacement->right = existing->right; + replacement->parent = existing->parent; + + /* Fix up the new node's neighbors. */ + rbnode_adjust_neighbors(rbtree, replacement, dir); + + return 0; +} + +/* + * Switch two nodes in the tree in place. This is useful during node deletion. 
+ * This is more involved than switching the values of the two nodes because we + * must update all tree pointers. + */ +static void rbnode_switch(struct rbtree __arena *rbtree, + struct rbnode __arena *a, + struct rbnode __arena *b) +{ + int adir = 0, bdir = 0; + + /* + * Store the direction in the parent because we will not + * be able to recompute it once we start swapping values. + */ + if (a->parent) + adir = rbnode_dir(a); + + if (b->parent) + bdir = rbnode_dir(b); + + rbnode_swap_values(a, b); + + /* + * Fix up the pointers from the children/parent to the + * new nodes. + */ + rbnode_adjust_neighbors(rbtree, a, bdir); + rbnode_adjust_neighbors(rbtree, b, adir); +} + +static inline int rbnode_remove_node_single_child(struct rbtree __arena *rbtree, + struct rbnode __arena *node, + bool free) +{ + struct rbnode __arena *child; + int dir; + + if (unlikely(node->is_red)) { + arena_stderr("Node unexpectedly red\n"); + return -EINVAL; + } + + child = node->left ? node->left : node->right; + if (unlikely(!child->is_red)) { + arena_stderr("Only child is black\n"); + return -EINVAL; + } + + /* + * Since it's the immediate child, we can just + * remove the parent. + */ + child->parent = node->parent; + + if (node->parent) { + dir = rbnode_dir(node); + node->parent->child[dir] = child; + } else { + rbtree->root = child; + } + + /* Color the child black. */ + child->is_red = false; + + /* Only free if called from rb_remove. 
*/ + if (free) + rb_node_free(node); + + return 0; +} + +static inline bool rbnode_has_red_children(struct rbnode __arena *node) +{ + if (node->left && node->left->is_red) + return true; + + return node->right && node->right->is_red; +} + +static +int rb_node_remove(struct rbtree __arena *rbtree, + struct rbnode __arena *node) +{ + struct rbnode __arena *parent, *sibling, *close_nephew, *distant_nephew; + bool free = (rbtree->alloc == RB_ALLOC); + struct rbnode __arena *replace, *initial; + bool is_red; + int dir; + + /* Both children present, replace with next largest key. */ + if (node->left && node->right) { + /* + * Swap the node itself instead of just the + * key/value pair to account for nodes embedded + * in other structs. + */ + + replace = rbnode_least(node->right); + rbnode_switch(rbtree, replace, node); + + /* + * FALLTHROUGH: We moved the node we are removing to + * the leftmost position of the subtree. We can now + * remove it as if it was always where we moved it to. + */ + } + + initial = node; + + /* Only one child present, replace with child and paint it black. */ + if (!node->left != !node->right) + return rbnode_remove_node_single_child(rbtree, node, free); + + /* (!node->left && !node->right) */ + + parent = node->parent; + if (!parent) { + /* Check that we're _actually_ the root. */ + if (rbtree->root == node) + rbtree->root = NULL; + else + arena_stderr("WARNING: Attempting to remove detached node from rbtree\n"); + + if (free) + rb_node_free(node); + return 0; + } + + dir = rbnode_dir(node); + parent->child[dir] = NULL; + is_red = node->is_red; + + if (free) + rb_node_free(node); + + /* If we removed a red node, we did not unbalance the tree.*/ + if (is_red) + return 0; + + sibling = parent->child[1 - dir]; + if (unlikely(!sibling)) { + arena_stderr("rbtree: removed black node has no sibling\n"); + return -EINVAL; + } + + /* + * We removed a black node, causing a change in path + * weight. Start rebalancing. 
The invariant is that + * all paths going through the node are shortened + * by one, and the current node is black. + */ + while (can_loop) { + + /* Balancing reached the root, there can be no imbalance. */ + if (!parent) + return 0; + + /* + * We already determined the dir, either above or + * at the end of the loop. + */ + + /* + * If we have no sibling, the tree was + * already unbalanced. + */ + sibling = parent->child[1 - dir]; + if (unlikely(!sibling)) { + arena_stderr("rbtree: removed black node has no sibling\n"); + return -EINVAL; + } + + /* Sibling is red, turn it into the grandparent. */ + if (sibling->is_red) { + /* + * Sibling is red. Transform the tree to turn + * the sibling into the parent's position, and + * repaint them. This does not balance the tree + * but makes it so we know the sibling is black + * and so can use the transformations to balance. + */ + rbnode_rotate(rbtree, parent, dir); + parent->is_red = true; + sibling->is_red = false; + + /* Our new sibling is now the close nephew. */ + sibling = parent->child[1 - dir]; + /* If the new sibling has any red children, break out. */ + if (rbnode_has_red_children(sibling)) + break; + + /* We can repaint the sibling and parent, we're done. */ + sibling->is_red = true; + parent->is_red = false; + + return 0; + } + + /* Sibling guaranteed to be black. If it has red children, break out. */ + if (rbnode_has_red_children(sibling)) + break; + + /* + * Both sibling and its children are black. If parent is red, swap + * colors with the sibling and stop. Otherwise continue below. + */ + if (parent->is_red) { + parent->is_red = false; + sibling->is_red = true; + return 0; + } + + /* + * Parent, sibling, and all its children are black. Repaint the sibling. + * This shortens the paths through it, so pop up a level in the + * tree and repeat the balancing. 
+ */ + sibling->is_red = true; + node = parent; + parent = node->parent; + dir = rbnode_dir(node); + } + + if (node != initial) { /* the loop climbed at least one level: recompute from the current node */ + dir = rbnode_dir(node); + parent = node->parent; + sibling = parent->child[1-dir]; + } + /* + * Almost there. We know between the parent, sibling, + * and nephews only one or two of the nephews are red. If + * it is the close one, rotate it to the sibling position, + * paint it black, and paint the previous sibling red. + */ + + close_nephew = sibling->child[dir]; + distant_nephew = sibling->child[1 - dir]; + + /* + * If the distant nephew is not red, rotate + * and repaint. We need the distant nephew + * to be red. We know the close nephew is red + * because at least one of them is, so the + * distant one is black if it exists. + */ + if (!distant_nephew || !distant_nephew->is_red) { + rbnode_rotate(rbtree, sibling, 1 - dir); + sibling->is_red = true; + close_nephew->is_red = false; + distant_nephew = sibling; + sibling = close_nephew; + } + + /* + * We now know it's the distant nephew that's red. + * Rotate the sibling into our parent's position + * and paint both black. 
+ */ + + rbnode_rotate(rbtree, parent, dir); + sibling->is_red = parent->is_red; + parent->is_red = false; + distant_nephew->is_red = false; + + return 0; +} + +__weak +int rb_remove_node(struct rbtree __arena *rbtree, + struct rbnode __arena *node) +{ + if (unlikely(!rbtree)) + return -EINVAL; + + if (unlikely(rbtree->alloc == RB_ALLOC)) + return -EINVAL; + + return rb_node_remove(rbtree, node); +} + +__weak +int rb_remove(struct rbtree __arena *rbtree, u64 key) +{ + struct rbnode __arena *node; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (unlikely(rbtree->alloc != RB_ALLOC)) + return -EINVAL; + + if (!rbtree->root) + return -ENOENT; + + node = rbnode_find(rbtree->root, key); + if (!node || node->key != key) + return -ENOENT; + + return rb_node_remove(rbtree, node); +} + +__weak +int rb_pop(struct rbtree __arena *rbtree, u64 *key, u64 *value) +{ /* Copy out the smallest key/value (either output pointer may be NULL) and remove that node. RB_ALLOC trees only. */ + struct rbnode __arena *node; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (rbtree->alloc != RB_ALLOC) /* reject API misuse before the emptiness check, same order as rb_remove() */ + return -EINVAL; + + if (!rbtree->root) + return -ENOENT; + + node = rbnode_least(rbtree->root); + if (unlikely(!node)) + return -ENOENT; + + if (key) + *key = node->key; + if (value) + *value = node->value; + + return rb_node_remove(rbtree, node); +} + +inline void rbnode_print(size_t depth, struct rbnode __arena *rbn) +{ + arena_stderr("[DEPTH %d] %p (%s)\n PARENT %p", depth, rbn, rbn->is_red ? 
"red" : "black", rbn->parent); + arena_stderr("\tKV (%ld, %ld)\n LEFT %p RIGHT %p]\n", rbn->key, rbn->value, rbn->left, rbn->right); +} + +enum rb_print_state { + RB_NONE_VISITED, + RB_LEFT_VISITED, + RB_RIGHT_VISITED, +}; + +__weak +enum rb_print_state rb_print_next_state(struct rbnode __arena *rbnode, + enum rb_print_state state, u64 *next) +{ /* Store in *next the next unvisited child of rbnode (0 if none) and return the updated visit state. */ + if (unlikely(!next)) + return RB_NONE_VISITED; + + switch (state) { + case RB_NONE_VISITED: + if (rbnode->left) { + *next = (u64)rbnode->left; + state = RB_LEFT_VISITED; + break; + } + + /* FALLTHROUGH */ + + case RB_LEFT_VISITED: + if (rbnode->right) { + *next = (u64)rbnode->right; + state = RB_RIGHT_VISITED; + break; + } + + /* FALLTHROUGH */ + + default: + *next = 0; + state = RB_RIGHT_VISITED; + } + + return state; +} + +__weak +int rb_print_pop_up(struct rbnode __arena **rbnodep, u8 *depthp, enum rb_print_state (*stack)[RB_MAXLVL_PRINT], enum rb_print_state *state) +{ /* Climb towards the root while the current node is fully visited, restoring each ancestor's saved state from the stack. */ + struct rbnode __arena *rbnode; + volatile u8 depth; + int j; + + if (unlikely(!rbnodep || !depthp || !stack || !state)) + return -EINVAL; + + rbnode = *rbnodep; + depth = *depthp; + + for (j = 0; j < RB_MAXLVL_PRINT && can_loop; j++) { + if (*state != RB_RIGHT_VISITED) + break; + + depth -= 1; /* u8: decrementing 0 wraps, which the range check below catches */ + if (depth < 0 || depth >= RB_MAXLVL_PRINT) + break; + + *state = (*stack)[depth % RB_MAXLVL_PRINT]; + rbnode = rbnode->parent; + } + + *rbnodep = rbnode; + *depthp = depth; + + return 0; +} + +__weak +int rb_print(struct rbtree __arena *rbtree) +{ /* Print the tree to arena_stderr, walking it iteratively with an explicit per-level state stack. Returns -EINVAL for a NULL tree. */ + enum rb_print_state stack[RB_MAXLVL_PRINT]; + struct rbnode __arena *rbnode = rbtree ? rbtree->root : NULL; /* rbtree is only validated below, so do not dereference it unconditionally here */ + enum rb_print_state state; + struct rbnode __arena *next; + u64 next_addr; + u8 depth; + int ret; + + if (unlikely(!rbtree)) + return -EINVAL; + + depth = 0; + state = RB_NONE_VISITED; + + arena_stderr("=== RB TREE START ===\n"); + + if (!rbtree->root) + goto out; + + /* Even with can_loop, the verifier doesn't like infinite loops. 
*/ + while (can_loop) { + if (state == RB_NONE_VISITED) + rbnode_print(depth, rbnode); + + /* Find which child to traverse next. */ + state = rb_print_next_state(rbnode, state, &next_addr); + next = (struct rbnode __arena *)next_addr; + + /* Child found. Store the node state and go on. */ + if (next) { + if (depth < 0 || depth >= RB_MAXLVL_PRINT) + return 0; + + stack[depth++] = state; + + rbnode = next; + state = RB_NONE_VISITED; + + continue; + } + + /* Otherwise, go as far up as possible. */ + ret = rb_print_pop_up(&rbnode, &depth, &stack, &state); + if (ret) + return -EINVAL; + + if (depth < 0 || depth >= RB_MAXLVL_PRINT) { /* NOTE(review): this appears to be the usual exit too, since popping above the root leaves depth out of range */ + arena_stderr("=== RB TREE END (depth %d) ===\n", depth); + return 0; + } + + } + +out: + arena_stderr("=== RB TREE END ===\n"); + + return 0; +} + +__weak +int rb_integrity_check(struct rbtree __arena *rbtree) +{ /* Walk the tree with the same iterative scheme as rb_print() and return -EINVAL on the first broken link or coloring rule; 0 if none is found or the walk stops at the RB_MAXLVL_PRINT depth limit. */ + enum rb_print_state stack[RB_MAXLVL_PRINT]; + struct rbnode __arena *rbnode = rbtree ? rbtree->root : NULL; /* rbtree is only validated below, so do not dereference it unconditionally here */ + enum rb_print_state state; + struct rbnode __arena *next; + u64 next_addr; + u8 depth; + int ret; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (!rbtree->root) + return 0; + + depth = 0; + state = RB_NONE_VISITED; + + /* Even with can_loop, the verifier doesn't like infinite loops. */ + while (can_loop) { + if (rbnode->parent && rbnode->parent->left != rbnode + && rbnode->parent->right != rbnode) { + arena_stderr("WARNING: Inconsistent tree. 
Parent %p has no child %p\n", rbnode->parent, rbnode); + return -EINVAL; + } + + if (rbnode->parent == rbnode) { + arena_stderr("WARNING: Inconsistent tree, node %p is its own parent\n", rbnode); + return -EINVAL; + } + + if (rbnode->left == rbnode) { + arena_stderr("WARNING: Inconsistent tree, node %p is its own left child\n", rbnode); + return -EINVAL; + } + + if (rbnode->right == rbnode) { + arena_stderr("WARNING: Inconsistent tree, node %p is its own right child\n", rbnode); + return -EINVAL; + } + + if (rbnode->is_red) { + if (rbnode->left && rbnode->left->is_red) { + arena_stderr("WARNING: Inconsistent tree. Parent has %p has red child %p\n", rbnode, rbnode->left); + return -EINVAL; + } + if (rbnode->right && rbnode->right->is_red) { + arena_stderr("WARNING: Inconsistent tree. Parent has %p has red child %p\n", rbnode, rbnode->right); + return -EINVAL; + } + } else if (rbnode->parent && rbnode->parent->child[1 - rbnode_dir(rbnode)] == NULL) { + arena_stderr("WARNING: Inconsistent tree. Black node %p has no sibling\n", rbnode); + return -EINVAL; + } + + /* Find which child to traverse next. */ + state = rb_print_next_state(rbnode, state, &next_addr); + next = (struct rbnode __arena *)next_addr; + + /* Child found. Store the node state and go on. */ + if (next) { + if (depth < 0 || depth >= RB_MAXLVL_PRINT) + return 0; + + stack[depth++] = state; + + rbnode = next; + state = RB_NONE_VISITED; + + continue; + } + + /* Otherwise, go as far up as possible. */ + ret = rb_print_pop_up(&rbnode, &depth, &stack, &state); + if (ret) + return -EINVAL; + + if (depth < 0 || depth >= RB_MAXLVL_PRINT) { + return 0; + } + + } + + return 0; +} -- cgit v1.2.3 From 57c6ace8395d53b9bae6fb21e0bd3f536342c16e Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Fri, 5 Jun 2026 18:20:19 -0400 Subject: selftests/bpf: libarena: Add spmc queue data structure Expand libarena with a single producer multiple consumer deque data structure. 
This is a single producer, multiple consumer lockless structure that permits efficient work stealing. The structure is a Chase-Lev queue, so it is lock-free and wait-free. The data structure exposes three main calls. Two of them are available to the thread owning the queue and one is available to all threads in the program: spmc_owned_add(): Push an item to the bottom of the queue. spmc_owned_remove(): Pop an item from the bottom of the queue. spmc_steal(): Steal an item from the top of the queue from any thread. Note that the queue is not really FIFO for all consumers, since non-owners of the queue can only work steal from the top. Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260605222020.5231-3-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/libarena/include/libarena/spmc.h | 27 +++ .../bpf/libarena/selftests/test_spmc.bpf.c | 194 +++++++++++++++++ .../testing/selftests/bpf/libarena/src/spmc.bpf.c | 234 +++++++++++++++++++++ 3 files changed, 455 insertions(+) create mode 100644 tools/testing/selftests/bpf/libarena/include/libarena/spmc.h create mode 100644 tools/testing/selftests/bpf/libarena/selftests/test_spmc.bpf.c create mode 100644 tools/testing/selftests/bpf/libarena/src/spmc.bpf.c diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/spmc.h b/tools/testing/selftests/bpf/libarena/include/libarena/spmc.h new file mode 100644 index 000000000000..75611276ce13 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/libarena/spmc.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause */ + +#pragma once + +struct spmc_arr; + +#define SPMC_ARR_BASESZ 128 +#define SPMC_ARR_ORDERS 10 + +struct spmc_arr { + u64 __arena *data; + u64 order; +}; + +struct spmc { + volatile struct spmc_arr __arena *cur; + volatile u64 top; + volatile u64 bottom; + struct spmc_arr arr[SPMC_ARR_ORDERS]; +}; + +int spmc_owned_add(struct spmc __arena *spmc, u64 val); +int spmc_owned_remove(struct spmc 
__arena *spmc, u64 *val); +int spmc_steal(struct spmc __arena *spmc, u64 *val); + +struct spmc __arena *spmc_create(void); +int spmc_destroy(struct spmc __arena *spmc); diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_spmc.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/test_spmc.bpf.c new file mode 100644 index 000000000000..4d7a520115d1 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/test_spmc.bpf.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause + +#include + +#include +#include + +/* + * NOTE: These selftests only test for the single-threaded use case, which for + * Lev-Chase queues is obviously the simplest one. Still, it is important to + * exercise the API to ensure it passes verification and basic checks. + */ + +SEC("syscall") +int test_spmc_remove_empty(void) +{ + u64 val; + int ret; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + ret = spmc_owned_remove(spmc, &val); + if (ret != -ENOENT) + return 1; + + spmc_destroy(spmc); + + return 0; +} + +SEC("syscall") +int test_spmc_steal_empty(void) +{ + u64 val; + int ret; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + ret = spmc_steal(spmc, &val); + if (ret != -ENOENT) + return 1; + + spmc_destroy(spmc); + + return 0; +} + +SEC("syscall") +int test_spmc_steal_one(void) +{ + u64 val, newval; + int ret, i; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + for (i = 0; i < 10 && can_loop; i++) { + val = i; + + ret = spmc_owned_add(spmc, val); + if (ret) + return 1; + + ret = spmc_steal(spmc, &newval); + if (ret) + return 2; + + if (val != newval) + return 3; + } + + spmc_destroy(spmc); + + return 0; +} + +SEC("syscall") +int test_spmc_remove_one(void) +{ + u64 val, newval; + int ret, i; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + for (i = 0; i < 10 && can_loop; i++) { + val = i; + + ret = spmc_owned_add(spmc, val); + if (ret) + 
return 1; + + ret = spmc_owned_remove(spmc, &newval); + if (ret) + return 2; + + if (val != newval) + return 3; + } + + spmc_destroy(spmc); + + return 0; +} + +SEC("syscall") +int test_spmc_remove_many(void) +{ + u64 val, newval; + int ret, i; + u64 expected; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + for (i = 0; i < 500 && can_loop; i++) { + val = i; + + ret = spmc_owned_add(spmc, val); + if (ret) { + arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret); + return 1; + } + } + + for (i = 0; i < 500 && can_loop; i++) { + ret = spmc_owned_remove(spmc, &newval); + if (ret) { + arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret); + return 1; + } + + expected = 500 - 1 - i; + if (newval != expected) { + arena_stderr("%s:%d expected %llu found %llu\n", __func__, __LINE__, expected, newval); + return 1; + } + } + + spmc_destroy(spmc); + + return 0; +} + +SEC("syscall") +int test_spmc_steal_many(void) +{ + u64 val, newval; + int ret, i; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + for (i = 0; i < 500 && can_loop; i++) { + val = i; + + ret = spmc_owned_add(spmc, val); + if (ret) { + arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret); + return 1; + } + } + + for (i = 0; i < 500 && can_loop; i++) { + ret = spmc_steal(spmc, &newval); + if (ret) { + arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret); + return 1; + } + + if (newval != i) { + arena_stderr("%s:%d expected %d found %llu\n", __func__, __LINE__, i, newval); + return 1; + } + } + + spmc_destroy(spmc); + + return 0; +} diff --git a/tools/testing/selftests/bpf/libarena/src/spmc.bpf.c b/tools/testing/selftests/bpf/libarena/src/spmc.bpf.c new file mode 100644 index 000000000000..42732b7d29a6 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/src/spmc.bpf.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* + * Copyright (c) 2025-2026 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2025-2026 Emil Tsalapatis + */ + +#include + +#include + +#include +#include + +static inline +u64 spmc_arr_size(volatile struct spmc_arr __arena *spmc_arr) +{ + return SPMC_ARR_BASESZ << spmc_arr->order; +} + +static inline +u64 spmc_arr_get(volatile struct spmc_arr __arena *spmc_arr, u64 ind) +{ + u64 ret = READ_ONCE(spmc_arr->data[ind % spmc_arr_size(spmc_arr)]); + + return ret; +} + +static inline +void spmc_arr_put(volatile struct spmc_arr __arena *spmc_arr, u64 ind, u64 value) +{ + WRITE_ONCE(spmc_arr->data[ind % spmc_arr_size(spmc_arr)], value); +} + +static inline +void spmc_arr_copy(volatile struct spmc_arr __arena *dst, + volatile struct spmc_arr __arena *src, u64 b, u64 t) +{ + u64 i; + + for (i = t; i < b && can_loop; i++) + spmc_arr_put(dst, i, spmc_arr_get(src, i)); +} + +static inline +int spmc_order_init(struct spmc __arena *spmc, int order) +{ + volatile struct spmc_arr __arena *arr = &spmc->arr[order]; + + if (unlikely(!spmc)) + return -EINVAL; + + if (order >= SPMC_ARR_ORDERS) + return -E2BIG; + + /* Already allocated? */ + if (arr->data) + return 0; + + arr->data = arena_malloc((SPMC_ARR_BASESZ << order) * sizeof(*arr->data)); + if (!arr->data) + return -ENOMEM; + + return 0; +} + +__weak +int spmc_owned_add(struct spmc __arena *spmc, u64 val) +{ + volatile struct spmc_arr __arena *newarr; + volatile struct spmc_arr __arena *arr; + ssize_t sz; + u64 b, t; + int ret; + + if (unlikely(!spmc)) + return -EINVAL; + + /* + * Bottom must always be read first, also + * see spmc_steal(). 
+ */ + b = smp_load_acquire(&spmc->bottom); + t = READ_ONCE(spmc->top); + arr = READ_ONCE(spmc->cur); + + sz = b - t; + if (sz >= spmc_arr_size(arr) - 1) { + ret = spmc_order_init(spmc, arr->order + 1); + if (ret) + return ret; + + newarr = &spmc->arr[arr->order + 1]; + + spmc_arr_copy(newarr, arr, b, t); + smp_store_release(&spmc->cur, newarr); + arr = newarr; + } + + spmc_arr_put(arr, b, val); + smp_store_release(&spmc->bottom, b + 1); + + return 0; +} + + +__weak +int spmc_owned_remove(struct spmc __arena *spmc, u64 *val) +{ + volatile struct spmc_arr __arena *arr; + int ret = 0; + ssize_t sz; + u64 value; + u64 b, t; + + if (unlikely(!spmc || !val)) + return -EINVAL; + + b = READ_ONCE(spmc->bottom) - 1; + WRITE_ONCE(spmc->bottom, b); + smp_mb(); + + t = READ_ONCE(spmc->top); + arr = READ_ONCE(spmc->cur); + + sz = b - t; + if (sz < 0) { + WRITE_ONCE(spmc->bottom, t); + return -ENOENT; + } + + value = spmc_arr_get(arr, b); + if (sz > 0) { + *val = value; + return 0; + } + + if (cmpxchg(&spmc->top, t, t + 1) != t) + ret = -EAGAIN; + + WRITE_ONCE(spmc->bottom, t + 1); + + if (ret) + return ret; + + *val = value; + + return 0; +} + +__weak +int spmc_steal(struct spmc __arena *spmc, u64 *val) +{ + volatile struct spmc_arr __arena *arr; + ssize_t sz; + u64 value; + u64 b, t; + + if (unlikely(!spmc || !val)) + return -EINVAL; + + /* + * It is important that t is read before b for + * stealers to avoid racing with the owner. + * Races between stealers are dealt with using + * CAS to increment the top value below. 
+ */ + t = smp_load_acquire(&spmc->top); + b = smp_load_acquire(&spmc->bottom); + + sz = b - t; + if (sz <= 0) + return -ENOENT; + + arr = smp_load_acquire(&spmc->cur); + value = spmc_arr_get(arr, t); + + if (cmpxchg(&spmc->top, t, t + 1) != t) + return -EAGAIN; + + *val = value; + + return 0; +} + + +__weak +struct spmc __arena *spmc_create(void) +{ + /* + * Marked as volatile because otherwise the array + * reference in the internal loop gets demoted to + * scalar and the program fails verification. + */ + struct spmc __arena *volatile spmc; + int ret, i; + + spmc = arena_malloc(sizeof(*spmc)); + if (!spmc) + return NULL; + + spmc->bottom = 0; + spmc->top = 0; + + for (i = 0; i < SPMC_ARR_ORDERS && can_loop; i++) { + spmc->arr[i].data = NULL; + spmc->arr[i].order = i; + } + + ret = spmc_order_init((struct spmc __arena *)spmc, 0); + if (ret) { + arena_free(spmc); + return NULL; + } + + spmc->cur = &spmc->arr[0]; + + return (struct spmc __arena *)spmc; +} + +__weak +int spmc_destroy(struct spmc __arena *spmc) +{ + int i; + + if (unlikely(!spmc)) + return -EINVAL; + + for (i = 0; i < SPMC_ARR_ORDERS && can_loop; i++) + arena_free(spmc->arr[i].data); + + arena_free(spmc); + + return 0; +} -- cgit v1.2.3 From 42998f819256ef272b6a445310e2b64a3729a139 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Fri, 5 Jun 2026 18:20:20 -0400 Subject: selftests/bpf: libarena: parallel test harness and spmc parallel selftest Add a parallel test for the SPMC Lev-Chase workstealing queue. The queue is built to be wait-free even when there are multiple consumers, and the parallel selftest provides a signal on whether the queue behaves correctly when stress tested. To support the test, this patch includes a test harness for parallel selftests. The spmc selftest acts as an example of the naming and other conventions expected by the harness. 
Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260605222020.5231-4-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- .../bpf/libarena/include/libarena/userspace.h | 6 + .../libarena/selftests/test_parallel_spmc.bpf.c | 673 +++++++++++++++++++++ tools/testing/selftests/bpf/prog_tests/libarena.c | 187 ++++++ 3 files changed, 866 insertions(+) create mode 100644 tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h b/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h index 88b68ac73cca..fc27a4bcf5d7 100644 --- a/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h +++ b/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h @@ -32,6 +32,12 @@ static inline bool libarena_is_asan_test_prog(const char *name) return strstr(name, "asan_test") == name; } +static inline bool libarena_is_parallel_test_prog(const char *name) +{ + return strstr(name, "parallel_test") == name; +} + + static inline int libarena_run_prog_args(int prog_fd, void *args, size_t argsize) { LIBBPF_OPTS(bpf_test_run_opts, opts); diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c new file mode 100644 index 000000000000..981c845e2d15 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c @@ -0,0 +1,673 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause + +#include + +#include + +#include +#include + +#define TEST_SPMC_THREADS 4 +#define TEST_SPMC_STEALERS (TEST_SPMC_THREADS - 1) + +/* + * The test requires the stealers/owners to sometimes quiesce + * before continuing the benchmark. Normally we'd use something + * like a condition variable, but since the benchmark is short-lived + * and operations are wait-free we just spin around the quiescence + * point instead. 
If we time out, we just fail the benchmark. + */ +#define TEST_SPMC_SYNC_SPINS (1U << 18) + +/* + * We track all the values we retrieve from the queue + * to get some guarantee we're not corrupting data, + * e.g., accidentally reusing a past value from a slot. + */ +#define TEST_SPMC_MAX_VALUES (1024) +static u64 __arena seen[TEST_SPMC_MAX_VALUES]; + +/* The single spmc queue for the benchmark. */ +static struct spmc __arena *spmc; + +/* Owner and stealer epochs, used as quiescence points between test phases. */ +static volatile u64 owner_epoch; +static volatile u64 stealer_epoch; + +/* Map owner epochs to stealer epochs (simply scale by # of stealers). */ +#define STEALER_EPOCH(owner_epoch) ((owner_epoch) * TEST_SPMC_STEALERS) + +/* Global abort switch. If any thread fails, all others exit ASAP. */ +static volatile bool test_abort; + +/* + * Counters useful for ensuring conservation of pushes/pops of unique values + * (we're not stealing/popping more/fewer items than were pushed). + */ +static volatile u64 expected_total; +static volatile u64 total_seen; + +/* Measure how many pops and steals we've made (irrespective of retrieved value). */ +static volatile u64 pops; +static volatile u64 steals; + +/* Used for the resize selftest, see below. */ +static volatile u64 stealers_started; + +/* Used for the burst selftest, see below. */ +static volatile u64 round_steals; + +/* + * We have multiple stealers and a single owner. We sometimes want the owner + * to successfully outproduce the stealers, so we add a busy loop in the stealers. + */ +#define TEST_SPMC_WASTE_ROUNDS (1024) + +/* + * The spmc data structure depends on the runtime fully + * supporting acquire/release semantics, which is not + * the case for all architectures. 
+ */ +#if defined(ENABLE_ATOMICS_TESTS) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) +static bool spmc_tests_enabled(void) +{ + return true; +} +#else +static bool spmc_tests_enabled(void) +{ + return false; +} +#endif + +/* + * Scaffolding for each parallel test. Each test has setup/teardown, + * a single owner thread that owns the queue, and TEST_SPMC_STEALER + * threads that try to steal. + */ +#define DEFINE_PARALLEL_SPMC_TEST(prefix, expected_total) \ + SEC("syscall") int parallel_test_spmc_##prefix##__enabled(void) \ + { \ + return spmc_tests_enabled() ? 0 : -EOPNOTSUPP; \ + } \ + SEC("syscall") int parallel_test_spmc_##prefix##__init(void) \ + { \ + return spmc_common_init(expected_total); \ + } \ + SEC("syscall") int parallel_test_spmc_##prefix##__fini(void) \ + { \ + return spmc_common_fini(); \ + } \ + SEC("syscall") int parallel_test_spmc_##prefix##__0(void) \ + { \ + return spmc_##prefix##_owner(); \ + } \ + SEC("syscall") int parallel_test_spmc_##prefix##__1(void) \ + { \ + return spmc_##prefix##_stealer(); \ + } \ + SEC("syscall") int parallel_test_spmc_##prefix##__2(void) \ + { \ + return spmc_##prefix##_stealer(); \ + } \ + SEC("syscall") int parallel_test_spmc_##prefix##__3(void) \ + { \ + return spmc_##prefix##_stealer(); \ + } + +static int spmc_common_init(u64 total) +{ + u64 i; + + if (total > TEST_SPMC_MAX_VALUES) + return -E2BIG; + + owner_epoch = 0; + stealer_epoch = 0; + test_abort = false; + expected_total = total; + total_seen = 0; + pops = 0; + steals = 0; + stealers_started = 0; + round_steals = 0; + + for (i = zero; i < TEST_SPMC_MAX_VALUES && can_loop; i++) + seen[i] = 0; + + spmc = spmc_create(); + if (!spmc) + return -ENOMEM; + + return 0; +} + +static int spmc_common_fini(void) +{ + int ret; + + ret = spmc_destroy(spmc); + spmc = NULL; + + return ret; +} + +__weak +int spmc_quiesce_on_owner(u64 epoch) +{ + u64 i; + + bpf_for(i, 0, 
TEST_SPMC_SYNC_SPINS) { + if (test_abort) + return -EINTR; + if (smp_load_acquire(&owner_epoch) >= epoch) + return 0; + } + + test_abort = true; + + return -ETIMEDOUT; +} + +__weak +int spmc_quiesce_on_stealer(u64 epoch) +{ + u64 target, cur; + unsigned int i; + int err = -ETIMEDOUT; + + target = STEALER_EPOCH(epoch); + bpf_for(i, 0, TEST_SPMC_SYNC_SPINS) { + + if (test_abort) { + err = -EINTR; + break; + } + + cur = smp_load_acquire(&stealer_epoch); + if (cur > target) { + err = -EINVAL; + test_abort = true; + break; + } + + if (cur == target) + return 0; + } + + test_abort = true; + + return err; +} + +static int spmc_update_stats(u64 val, bool owner) +{ + u64 total; + + total = expected_total; + if (val >= total || val >= TEST_SPMC_MAX_VALUES) { + test_abort = true; + return -EINVAL; + } + + if (__sync_fetch_and_add(&seen[val], 1) != 0) { + test_abort = true; + return -EINVAL; + } + + __sync_fetch_and_add(&total_seen, 1); + if (owner) + __sync_fetch_and_add(&pops, 1); + else + __sync_fetch_and_add(&steals, 1); + + return 0; +} + +static int spmc_validate_owner_empty(void) +{ + u64 val; + int ret; + + ret = spmc_owned_remove(spmc, &val); + if (ret != -ENOENT) { + test_abort = true; + /* Change a 0 return value into -EINVAL. */ + return ret ?: -EINVAL; + } + + return 0; +} + +__weak +int spmc_validate_all_seen(void) +{ + u64 i, total; + + total = expected_total; + if (total_seen != total) + goto err; + + if (pops + steals != total) + goto err; + + for (i = zero; i < total && can_loop; i++) { + if (seen[i % TEST_SPMC_MAX_VALUES] != 1) + goto err; + } + + return 0; + +err: + test_abort = true; + + return -EINVAL; +} + +/* + * Single value benchmark. The owner adds an item then races with + * the stealers for it. This way directly race between owner and + * stealers on the same slot. 
+ */ + + +#define TEST_SPMC_SINGLEVAL_ITERS (64) + +__weak +int spmc_singleval_tryconsume(u64 expected, bool steal) +{ + u64 val; + int ret; + + while (can_loop) { + if (steal) + ret = spmc_steal(spmc, &val); + else + ret = spmc_owned_remove(spmc, &val); + + /* Success. Update and validate. */ + if (!ret) { + if (val != expected) + return -EINVAL; + + ret = spmc_update_stats(val, !steal); + if (ret) + return ret; + + return 0; + } + + /* + * If we got -ENOENT, the queue is empty + * and we're good to go. + */ + if (ret != -EAGAIN) + return (ret == -ENOENT) ? 0 : ret; + } + + /* Impossible. */ + return -EINVAL; +} + +static int spmc_singleval_owner(void) +{ + int ret; + u64 i; + + for (i = zero; i < TEST_SPMC_SINGLEVAL_ITERS && can_loop; i++) { + ret = spmc_quiesce_on_stealer(i); + if (ret) + goto err; + + ret = spmc_owned_add(spmc, i); + if (ret) + goto err; + + __sync_fetch_and_add(&owner_epoch, 1); + + ret = spmc_singleval_tryconsume(i, false); + if (ret) + goto err; + + ret = spmc_quiesce_on_stealer(i + 1); + if (ret) + goto err; + } + + ret = spmc_validate_owner_empty(); + if (ret) + return ret; + + return spmc_validate_all_seen(); + +err: + test_abort = true; + return -EINVAL; +} + +static int spmc_singleval_stealer(void) +{ + int ret; + u64 i; + + for (i = zero; i < TEST_SPMC_SINGLEVAL_ITERS && can_loop; i++) { + ret = spmc_quiesce_on_owner(i + 1); + if (ret) + goto err; + + ret = spmc_singleval_tryconsume(i, true); + if (ret) + goto err; + + __sync_fetch_and_add(&stealer_epoch, 1); + } + + return 0; + +err: + test_abort = true; + return -EINVAL; +} + +DEFINE_PARALLEL_SPMC_TEST(singleval, TEST_SPMC_SINGLEVAL_ITERS) + +/* + * The resize test. Force a resize from the owner even while the stealers + * are trying to consume. Then make sure the queue is still consistent + * after the resize. + * + * The owner _doesn't_ consume from the queue. The test makes sure that + * switching the array from underneath the stealers works. 
+ */ + +/* Force 2 resizes (since the rate of resize is logarithmic). */ +#define TEST_SPMC_RESIZE_ORDER (2) +#define TEST_SPMC_RESIZE_PREFILL ((SPMC_ARR_BASESZ << TEST_SPMC_RESIZE_ORDER) - 1) + +/* Number of items added after the prefill, while racing with the stealers. */ +#define TEST_SPMC_RESIZE_TAIL (SPMC_ARR_BASESZ << TEST_SPMC_RESIZE_ORDER) +#define TEST_SPMC_RESIZE_TOTAL (TEST_SPMC_RESIZE_PREFILL + TEST_SPMC_RESIZE_TAIL) + +__weak +int spmc_wait_for_stealers_to_start(u64 target) +{ + u64 i; + + bpf_for(i, 0, TEST_SPMC_SYNC_SPINS) { + if (test_abort) + return -EINTR; + if (READ_ONCE(stealers_started) >= target) + return 0; + } + + test_abort = true; + + return -ETIMEDOUT; +} + +__weak +void spmc_waste_time(void) +{ + int i; + int j; + + for (i = zero; i < TEST_SPMC_WASTE_ROUNDS && can_loop; i++) { + /* Random computation. */ + WRITE_ONCE(j, i * 17 + 23); + } +} + +static int spmc_resize_owner(void) +{ + bool resized = false; + u64 i; + int ret; + + /* Get a head start vs the consumers. */ + for (i = zero; i < TEST_SPMC_RESIZE_PREFILL && can_loop; i++) { + ret = spmc_owned_add(spmc, i); + if (ret) { + test_abort = true; + return ret; + } + } + + __sync_fetch_and_add(&owner_epoch, 1); + + /* Wait for stealers to start then start racing. */ + ret = spmc_wait_for_stealers_to_start(TEST_SPMC_STEALERS); + if (ret) + return ret; + + for (i = TEST_SPMC_RESIZE_PREFILL; i < TEST_SPMC_RESIZE_TOTAL && can_loop; i++) { + ret = spmc_owned_add(spmc, i); + if (ret) { + test_abort = true; + return ret; + } + + if (spmc->cur->order > TEST_SPMC_RESIZE_ORDER) + resized = true; + } + + /* Did we get to resize while racing? */ + if (!resized) { + test_abort = true; + return -153; + } + + /* + * Wait for the stealers to drain and make sure + * we didn't lose any items along the way. 
+ */ + __sync_fetch_and_add(&owner_epoch, 1); + + ret = spmc_quiesce_on_stealer(1); + if (ret) + return ret; + + ret = spmc_validate_owner_empty(); + if (ret) + return ret; + + return spmc_validate_all_seen(); +} + +static int spmc_resize_stealer(void) +{ + bool owner_done = false; + u64 val; + int ret; + + arena_subprog_init(); + + ret = spmc_quiesce_on_owner(1); + if (ret) + return ret; + + __sync_fetch_and_add(&stealers_started, 1); + + while (can_loop) { + spmc_waste_time(); + if (test_abort) + return -EINTR; + + ret = spmc_steal(spmc, &val); + if (!ret) { + ret = spmc_update_stats(val, false); + if (ret) + return ret; + continue; + } + + if (ret == -EAGAIN) + continue; + + if (ret == -ENOENT) { + if (owner_done) + break; + owner_done = owner_epoch >= 2; + continue; + } + + test_abort = true; + return ret; + } + + __sync_fetch_and_add(&stealer_epoch, 1); + + return 0; +} + +DEFINE_PARALLEL_SPMC_TEST(resize, TEST_SPMC_RESIZE_TOTAL) + +/* + * The burst benchmark. The owner generates data all at once, + * then waits for the stealers to steal half then starts removing + * items until the queue empties. The owner also makes sure the + * item order is not jumbled. 
+ */ + +#define TEST_SPMC_BURST_ROUNDS (4) +#define TEST_SPMC_BURST_BURST (64) +#define TEST_SPMC_BURST_TOTAL (TEST_SPMC_BURST_ROUNDS * TEST_SPMC_BURST_BURST) +#define TEST_SPMC_BURST_STEAL_TARGET (TEST_SPMC_BURST_BURST / 2) + +static int spmc_wait_for_round_steals(u64 target) +{ + u64 i; + + arena_subprog_init(); + + bpf_for(i, 0, TEST_SPMC_SYNC_SPINS) { + if (test_abort) + return -EINTR; + if (round_steals >= target) + return 0; + } + + test_abort = true; + + return -ETIMEDOUT; +} + +__weak int +spmc_burst_owner_round(u64 round) +{ + u64 i, base, stolen, expected, val; + int ret; + + base = round * TEST_SPMC_BURST_BURST; + round_steals = 0; + + for (i = zero; i < TEST_SPMC_BURST_BURST && can_loop; i++) { + ret = spmc_owned_add(spmc, base + i); + if (ret) + return ret; + } + + __sync_fetch_and_add(&owner_epoch, 1); + + ret = spmc_wait_for_round_steals(TEST_SPMC_BURST_STEAL_TARGET); + if (ret == -EINTR || ret == -ETIMEDOUT) + return ret; + + __sync_fetch_and_add(&owner_epoch, 1); + + ret = spmc_quiesce_on_stealer(round + 1); + if (ret) + return ret; + + stolen = round_steals; + if (stolen > TEST_SPMC_BURST_BURST) + return -EINVAL; + + for (i = zero; i < TEST_SPMC_BURST_BURST - stolen && can_loop; i++) { + ret = spmc_owned_remove(spmc, &val); + if (ret) + return ret; + + expected = base + TEST_SPMC_BURST_BURST - 1 - i; + if (val != expected) + return -EINVAL; + + ret = spmc_update_stats(val, true); + if (ret) { + test_abort = true; + return -EINVAL; + } + } + + ret = spmc_validate_owner_empty(); + if (ret) + return ret; + + return 0; +} + +static int spmc_burst_owner(void) +{ + u64 round; + int ret; + + arena_subprog_init(); + + for (round = zero; round < TEST_SPMC_BURST_ROUNDS && can_loop; round++) { + ret = spmc_burst_owner_round(round); + if (ret) + goto err; + } + + return spmc_validate_all_seen(); + +err: + test_abort = true; + return -EINVAL; +} + +static int spmc_burst_stealer(void) +{ + u64 round, val, active_epoch; + int ret; + + arena_subprog_init(); + + 
for (round = zero; round < TEST_SPMC_BURST_ROUNDS && can_loop; round++) { + active_epoch = round * 2 + 1; + + /* + * Wait till the owner prefills the queue then + * start stealing. + */ + ret = spmc_quiesce_on_owner(active_epoch); + if (ret) + return ret; + + while (owner_epoch == active_epoch && can_loop) { + if (test_abort) + return -EINTR; + + ret = spmc_steal(spmc, &val); + if (!ret) { + ret = spmc_update_stats(val, false); + if (ret) + return ret; + __sync_fetch_and_add(&round_steals, 1); + continue; + } + if (ret == -EAGAIN || ret == -ENOENT) + continue; + + test_abort = true; + return ret; + } + + __sync_fetch_and_add(&stealer_epoch, 1); + } + + return 0; +} + +DEFINE_PARALLEL_SPMC_TEST(burst, TEST_SPMC_BURST_TOTAL) diff --git a/tools/testing/selftests/bpf/prog_tests/libarena.c b/tools/testing/selftests/bpf/prog_tests/libarena.c index 81bdb084c271..61ea68dce410 100644 --- a/tools/testing/selftests/bpf/prog_tests/libarena.c +++ b/tools/testing/selftests/bpf/prog_tests/libarena.c @@ -27,6 +27,177 @@ static void run_libarena_test(struct libarena *skel, struct bpf_program *prog, } +static void *run_libarena_parallel_prog(void *arg) +{ + struct bpf_program *prog = arg; + + return (void *)(long)libarena_run_prog(bpf_program__fd(prog)); +} + +/* Max suffix is ceil((lg 2^32) / (lg 10)) + sizeof("__") = 10 + 2 = 12. 
*/ +#define MAX_PARTEST_SUFFIX (12) +#define MAX_PARTEST_NAME (1024) +#define MAX_PARTEST_PREFIX (MAX_PARTEST_NAME - MAX_PARTEST_SUFFIX) + +static int run_libarena_parallel_fini(struct libarena *skel, const char *name, + size_t prefixlen) +{ + char tdname[MAX_PARTEST_NAME]; + struct bpf_program *fini_prog; + int ret; + + ret = snprintf(tdname, sizeof(tdname), "%.*s__fini", (int)prefixlen, name); + if (!ASSERT_LT(ret, sizeof(tdname), "partest fini name")) + return -ENAMETOOLONG; + + fini_prog = bpf_object__find_program_by_name(skel->obj, tdname); + if (!ASSERT_TRUE(fini_prog, "partest fini prog")) + return -ENOENT; + + ret = libarena_run_prog(bpf_program__fd(fini_prog)); + ASSERT_OK(ret, tdname); + + return ret; +} + +static int run_libarena_parallel_test_workers(struct libarena *skel, + const char *name, size_t prefixlen) +{ + pthread_t *threads = NULL, *tmp_threads; + char tdname[MAX_PARTEST_NAME]; + struct bpf_program *tdprog; + uint32_t nthreads; + void *thread_ret; + int ret, err = 0; + int i; + + for (nthreads = 0; nthreads < UINT_MAX; nthreads++) { + ret = snprintf(tdname, sizeof(tdname), "%.*s__%u", (int)prefixlen, + name, nthreads); + if (!ASSERT_LT(ret, sizeof(tdname), "test worker name")) { + err = -ENAMETOOLONG; + break; + } + + /* + * We enumerate the worker threads for a given test with __0, __1, + * and so on. The suffixes always start from 0 and are contiguous, + * so if we don't find a program with the requested name we have + * discovered all available worker programs. + */ + tdprog = bpf_object__find_program_by_name(skel->obj, tdname); + if (!tdprog) + break; + + /* Bump the alloc array to accommodate the new thread. 
*/ + tmp_threads = realloc(threads, (nthreads + 1) * sizeof(*threads)); + if (!ASSERT_TRUE(tmp_threads, "realloc")) { + err = -ENOMEM; + break; + } + threads = tmp_threads; + + ret = pthread_create(&threads[nthreads], NULL, + run_libarena_parallel_prog, + tdprog); + if (!ASSERT_OK(ret, "pthread_create")) { + err = ret; + break; + } + } + + + for (i = 0; i < nthreads; i++) { + ret = pthread_join(threads[i], &thread_ret); + if (!ASSERT_OK(ret, "pthread_join")) { + err = err ?: ret; + continue; + } + + err = err ?: (long)thread_ret; + } + + free(threads); + + return err; +} + +static bool libarena_parallel_test_enabled(struct libarena *skel, + const char *prefix, + size_t prefixlen) +{ + struct bpf_program *prog; + char progname[MAX_PARTEST_NAME]; + int ret; + + ret = snprintf(progname, sizeof(progname), "%.*s__enabled", (int)prefixlen, + prefix); + if (!ASSERT_LT(ret, sizeof(progname), "partest enabled name")) + return false; + + prog = bpf_object__find_program_by_name(skel->obj, progname); + if (!prog) + return true; + + ret = libarena_run_prog(bpf_program__fd(prog)); + if (ret == -EOPNOTSUPP) + return false; + if (!ASSERT_OK(ret, progname)) + return false; + return true; +} + +static void run_libarena_parallel_test(struct libarena *skel, struct bpf_program *prog, + const char *name) +{ + char testname[MAX_PARTEST_NAME]; + size_t prefixlen; + const char *pos; + int ret; + + /* + * We annotate the initialization prog with __init. If the current prog does + * not match, it is one of the parallel threads instead and is ignored. + * + * We assume the test writer knows what they are doing and do not add __init + * randomly in the middle of a test name. + */ + pos = strstr(name, "__init"); + if (!pos) + return; + + prefixlen = pos - name; + if (!ASSERT_LT(prefixlen, MAX_PARTEST_PREFIX, "partest prefix too long")) + return; + + /* The name of the test without the __init suffix. Looks nicer in the test log. 
*/ + ret = snprintf(testname, sizeof(testname), "%.*s", (int)prefixlen, name); + if (!ASSERT_LT(ret, sizeof(testname), "partest test name")) + return; + + if (!test__start_subtest(testname)) + return; + + if (!libarena_parallel_test_enabled(skel, testname, prefixlen)) { + test__skip(); + return; + } + + ret = libarena_run_prog(bpf_program__fd(skel->progs.arena_buddy_reset)); + if (!ASSERT_OK(ret, "arena_buddy_reset")) + return; + + ret = libarena_run_prog(bpf_program__fd(prog)); + if (!ASSERT_OK(ret, testname)) + return; + + ret = run_libarena_parallel_test_workers(skel, name, prefixlen); + + ASSERT_OK(ret, testname); + + run_libarena_parallel_fini(skel, name, prefixlen); +} + void test_libarena(void) { struct arena_alloc_reserve_args args; @@ -52,6 +223,22 @@ void test_libarena(void) bpf_object__for_each_program(prog, skel->obj) { const char *name = bpf_program__name(prog); + /* + * Handle parallel test progs separately. For those + * progs it's not a matter of test/skip, because each + * parallel test prog includes an initialization prog + * and a set of progs to be run in parallel. For the + * latter we do not record them as skipped or run, + * because we run them all at once when we come across + * the initialization prog. For more details on how we + * discover the progs see the comment on + * run_libarena_parallel_test. + */ + if (libarena_is_parallel_test_prog(name)) { + run_libarena_parallel_test(skel, prog, name); + continue; + } + if (!libarena_is_test_prog(name)) continue; -- cgit v1.2.3 From 63a673e8a4112af267106264f50584947786845a Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 5 Jun 2026 23:35:17 +0200 Subject: bpf: Expose signature verdict via bpf_prog_aux BPF_PROG_LOAD verifies the loader signature but does not record the outcome on the BPF program. [BPF] LSMs and audit can read attr->signature and attr->keyring_id to infer "was this signed, and if so, against which keyring". 
Add prog->aux->sig (verdict + keyring_{type,serial}), populated by bpf_prog_load before the LSM hook. keyring_type classifies the keyring the load referenced (builtin, secondary, platform or user), while keyring_serial records the serial of the keyring the signature was actually validated against. System keyrings carry a pseudo key pointer with no user-visible serial and are reported as 0, as are unsigned loads. Failed verifications reject the load before the hook runs, so it observes only either UNSIGNED or VERIFIED. Signed-off-by: KP Singh Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260605213518.544262-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 45 +++++++++++++++++++++++++++++++++++---------- kernel/bpf/syscall.c | 29 ++++++++++++++++++++++++----- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8599b451dd7a..f615b56730d2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -32,6 +32,7 @@ #include #include #include +#include #include struct bpf_verifier_env; @@ -1674,6 +1675,19 @@ struct bpf_stream_stage { int len; }; +enum bpf_sig_verdict { + BPF_SIG_UNSIGNED = 0, + BPF_SIG_VERIFIED, +}; + +enum bpf_sig_keyring { + BPF_SIG_KEYRING_NONE = 0, + BPF_SIG_KEYRING_BUILTIN, + BPF_SIG_KEYRING_SECONDARY, + BPF_SIG_KEYRING_PLATFORM, + BPF_SIG_KEYRING_USER, +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -1716,6 +1730,11 @@ struct bpf_prog_aux { bool changes_pkt_data; bool might_sleep; bool kprobe_write_ctx; + struct { + s32 keyring_serial; + u8 keyring_type; + u8 verdict; + } sig; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; @@ -3697,8 +3716,14 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, #endif /* CONFIG_BPF_SYSCALL */ 
#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ -#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) +#ifdef CONFIG_KEYS +struct bpf_key { + struct key *key; + bool has_ref; +}; +#endif /* CONFIG_KEYS */ +#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags); struct bpf_key *bpf_lookup_system_key(u64 id); void bpf_key_put(struct bpf_key *bkey); @@ -3706,6 +3731,10 @@ int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, const struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring); +static inline s32 bpf_key_serial(const struct bpf_key *key) +{ + return key->has_ref ? key->key->serial : 0; +} #else static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) { @@ -3727,6 +3756,11 @@ static inline int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, { return -EOPNOTSUPP; } + +static inline s32 bpf_key_serial(const struct bpf_key *key) +{ + return 0; +} #endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */ /* verifier prototypes for helper functions called from eBPF programs */ @@ -4002,15 +4036,6 @@ static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ -struct key; - -#ifdef CONFIG_KEYS -struct bpf_key { - struct key *key; - bool has_ref; -}; -#endif /* CONFIG_KEYS */ - static inline bool type_is_alloc(u32 type) { return type & MEM_ALLOC; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c5d4ae957e87..5fcfc32c7cb4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2871,8 +2871,22 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } } +static enum bpf_sig_keyring bpf_classify_keyring(s32 keyring_id) +{ + switch (keyring_id) { + case 0: + return BPF_SIG_KEYRING_BUILTIN; + case (s32)(unsigned long)VERIFY_USE_SECONDARY_KEYRING: + return BPF_SIG_KEYRING_SECONDARY; + case (s32)(unsigned 
long)VERIFY_USE_PLATFORM_KEYRING: + return BPF_SIG_KEYRING_PLATFORM; + default: + return BPF_SIG_KEYRING_USER; + } +} + static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, - bool is_kernel) + bool is_kernel, s32 *keyring_serial) { bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); struct bpf_dynptr_kern sig_ptr, insns_ptr; @@ -2908,7 +2922,8 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, (struct bpf_dynptr *)&sig_ptr, key); - + if (!err) + *keyring_serial = bpf_key_serial(key); bpf_key_put(key); kvfree(sig); return err; @@ -3095,13 +3110,17 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; - if (attr->signature) { - err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); + err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel, + &prog->aux->sig.keyring_serial); if (err) goto free_prog; + prog->aux->sig.keyring_type = bpf_classify_keyring(attr->keyring_id); + prog->aux->sig.verdict = BPF_SIG_VERIFIED; + } else { + prog->aux->sig.keyring_type = BPF_SIG_KEYRING_NONE; + prog->aux->sig.verdict = BPF_SIG_UNSIGNED; } - prog->orig_prog = NULL; prog->jited = 0; -- cgit v1.2.3 From 8ddce416797b7454ba1df855821b02c6e43b5a0e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 5 Jun 2026 23:35:18 +0200 Subject: selftests/bpf: Inspect the signature verdict exposed to BPF LSM Add a minimal BPF LSM program on lsm/bpf_prog_load that, for loads on the monitored thread, reads back prog->aux->sig.{verdict,keyring_type, keyring_serial}, and a signed_loader subtest that drives the same gen_loader loader through the hook twice: i) /unsigned/ where the LSM must observe UNSIGNED, no keyring and serial 0; ii) /signed/ where the very same insns signed against the session 
keyring must be observed as VERIFIED with a user keyring, and the recorded keyring_serial must be equal to the resolved session keyring serial. Loading (not running) the loader is sufficient since the verdict is attached at load time. # LDLIBS=-static PKG_CONFIG='pkg-config --static' ./vmtest.sh -- ./test_progs -t signed_loader [ 1.970530] clocksource: Switched to clocksource tsc #405/1 signed_loader/metadata_check_shape:OK #405/2 signed_loader/metadata_match:OK #405/3 signed_loader/metadata_sha_mismatch:OK #405/4 signed_loader/metadata_not_exclusive:OK #405/5 signed_loader/metadata_hash_not_computed:OK #405/6 signed_loader/signature_enforced:OK #405/7 signed_loader/signature_too_large:OK #405/8 signed_loader/signature_bad_keyring:OK #405/9 signed_loader/metadata_ctx_max_entries_ignored:OK #405/10 signed_loader/metadata_ctx_initial_value_ignored:OK #405/11 signed_loader/signature_authenticates_insns:OK #405/12 signed_loader/hash_requires_frozen:OK #405/13 signed_loader/no_update_after_freeze:OK #405/14 signed_loader/freeze_writable_mmap:OK #405/15 signed_loader/no_writable_mmap_frozen:OK #405/16 signed_loader/map_hash_matches_libbpf:OK #405/17 signed_loader/map_hash_multi_element:OK #405/18 signed_loader/map_hash_bad_size:OK #405/19 signed_loader/map_hash_unsupported_type:OK #405/20 signed_loader/lsm_signature_verdict:OK #405 signed_loader:OK Summary: 1/20 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260605213518.544262-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/signed_loader.c | 122 +++++++++++++++++++++ .../selftests/bpf/progs/test_signed_loader_lsm.c | 30 +++++ 2 files changed, 152 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_signed_loader_lsm.c diff --git a/tools/testing/selftests/bpf/prog_tests/signed_loader.c b/tools/testing/selftests/bpf/prog_tests/signed_loader.c index dcfdd2d96b05..5fc417e31fc6 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/signed_loader.c +++ b/tools/testing/selftests/bpf/prog_tests/signed_loader.c @@ -17,9 +17,23 @@ #include "test_signed_loader.skel.h" #include "test_signed_loader_map.skel.h" #include "test_signed_loader_data.skel.h" +#include "test_signed_loader_lsm.skel.h" #define SIG_MATCH_INSNS 33 /* excl (5) + 4 * sha-dword (7) */ +enum { + BPF_SIG_UNSIGNED = 0, + BPF_SIG_VERIFIED, +}; + +enum { + BPF_SIG_KEYRING_NONE = 0, + BPF_SIG_KEYRING_BUILTIN, + BPF_SIG_KEYRING_SECONDARY, + BPF_SIG_KEYRING_PLATFORM, + BPF_SIG_KEYRING_USER, +}; + static int load_loader(const void *insns, __u32 insns_sz, int map_fd, const void *sig, __u32 sig_sz, __s32 keyring_id) { @@ -970,6 +984,112 @@ static void map_hash_unsupported_type(void) close(fd); } +static int setup_meta_map(const struct gen_loader_fixture *f) +{ + LIBBPF_OPTS(bpf_map_create_opts, mopts, + .excl_prog_hash = f->excl, + .excl_prog_hash_size = sizeof(f->excl)); + __u32 key = 0; + int fd; + + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, + f->data_sz, 1, &mopts); + if (fd < 0) + return -errno; + if (bpf_map_update_elem(fd, &key, f->blob, 0) || bpf_map_freeze(fd)) { + close(fd); + return -errno; + } + return fd; +} + +static void lsm_signature_verdict(void) +{ + char dir_tmpl[] = "/tmp/signed_loader_lsmXXXXXX", *dir = NULL; + struct test_signed_loader_lsm *lsm = NULL; + int map_fd = -1, prog_fd = -1; + bool have_fixture = false; + struct gen_loader_fixture f; + __u32 sig_sz = 8192; + __s32 ses_serial; + __u8 sig[8192]; + + lsm = test_signed_loader_lsm__open_and_load(); + if (!ASSERT_OK_PTR(lsm, "lsm_skel_load")) + return; + lsm->bss->monitored_tid = sys_gettid(); + if (!ASSERT_OK(test_signed_loader_lsm__attach(lsm), "lsm_attach")) + goto out; + + have_fixture = true; + if (gen_loader_fixture_init(&f) != 0) + goto out; + + map_fd = setup_meta_map(&f); + if (!ASSERT_OK_FD(map_fd, "meta_map_unsigned")) + goto out; + lsm->bss->seen = 0; + prog_fd = load_loader(f.gopts.insns, 
f.gopts.insns_sz, map_fd, NULL, 0, 0); + close(map_fd); + map_fd = -1; + if (!ASSERT_OK_FD(prog_fd, "unsigned loader load")) + goto out; + close(prog_fd); + prog_fd = -1; + if (!ASSERT_NEQ(lsm->bss->seen, 0, "bpf LSM in the active LSM set")) + goto out; + ASSERT_EQ(lsm->bss->seen, 1, "unsigned: one observed load"); + ASSERT_EQ(lsm->bss->sig_verdict, BPF_SIG_UNSIGNED, "unsigned verdict"); + ASSERT_EQ(lsm->bss->sig_keyring_type, BPF_SIG_KEYRING_NONE, "unsigned keyring type"); + ASSERT_EQ(lsm->bss->sig_keyring_serial, 0, "unsigned: no keyring serial"); + + syscall(__NR_request_key, "keyring", "_uid.0", NULL, + KEY_SPEC_SESSION_KEYRING); + dir = mkdtemp(dir_tmpl); + if (!ASSERT_OK_PTR(dir, "mkdtemp")) + goto out; + if (!ASSERT_OK(run_setup("setup", dir), "verify_sig_setup")) { + rmdir(dir); + dir = NULL; + goto out; + } + if (!ASSERT_OK(sign_buf(dir, f.gopts.insns, f.gopts.insns_sz, sig, + &sig_sz), "sign-file")) + goto out; + + map_fd = setup_meta_map(&f); + if (!ASSERT_OK_FD(map_fd, "meta_map_signed")) + goto out; + lsm->bss->seen = 0; + prog_fd = load_loader(f.gopts.insns, f.gopts.insns_sz, map_fd, sig, + sig_sz, KEY_SPEC_SESSION_KEYRING); + close(map_fd); + map_fd = -1; + if (!ASSERT_OK_FD(prog_fd, "signed loader load")) + goto out; + close(prog_fd); + prog_fd = -1; + + ses_serial = syscall(__NR_keyctl, KEYCTL_GET_KEYRING_ID, + KEY_SPEC_SESSION_KEYRING, 0); + ASSERT_EQ(lsm->bss->seen, 1, "signed: one observed load"); + ASSERT_EQ(lsm->bss->sig_verdict, BPF_SIG_VERIFIED, "signed verdict"); + ASSERT_EQ(lsm->bss->sig_keyring_type, BPF_SIG_KEYRING_USER, "signed keyring type"); + ASSERT_GT(ses_serial, 0, "session keyring serial resolved"); + ASSERT_EQ(lsm->bss->sig_keyring_serial, ses_serial, + "signed: validated against session keyring"); +out: + if (map_fd >= 0) + close(map_fd); + if (prog_fd >= 0) + close(prog_fd); + if (have_fixture) + gen_loader_fixture_fini(&f); + if (dir) + run_setup("cleanup", dir); + test_signed_loader_lsm__destroy(lsm); +} + void 
test_signed_loader(void) { if (test__start_subtest("metadata_check_shape")) @@ -1010,4 +1130,6 @@ void test_signed_loader(void) map_hash_bad_size(); if (test__start_subtest("map_hash_unsupported_type")) map_hash_unsupported_type(); + if (test__start_subtest("lsm_signature_verdict")) + lsm_signature_verdict(); } diff --git a/tools/testing/selftests/bpf/progs/test_signed_loader_lsm.c b/tools/testing/selftests/bpf/progs/test_signed_loader_lsm.c new file mode 100644 index 000000000000..575a9b7910c8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_signed_loader_lsm.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u32 monitored_tid; + +int sig_keyring_serial; +int sig_keyring_type; +int sig_verdict; +int seen; + +SEC("lsm/bpf_prog_load") +int BPF_PROG(inspect_prog_load, struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token, bool kernel) +{ + __u32 tid = bpf_get_current_pid_tgid() & 0xffffffff; + + if (!monitored_tid || tid != monitored_tid) + return 0; + + seen++; + sig_keyring_serial = prog->aux->sig.keyring_serial; + sig_keyring_type = prog->aux->sig.keyring_type; + sig_verdict = prog->aux->sig.verdict; + return 0; +} -- cgit v1.2.3 From 37363191cbe8f83586ad6a818460d010070ead00 Mon Sep 17 00:00:00 2001 From: Nuoqi Gui Date: Sat, 6 Jun 2026 18:50:37 +0800 Subject: bpf: Fold reg->var_off into PTR_TO_FLOW_KEYS bounds check Constant pointer arithmetic on a PTR_TO_FLOW_KEYS register lands the constant in reg->var_off (e.g. flow_keys(imm=4096)), but the PTR_TO_FLOW_KEYS path in check_mem_access() passes only insn->off to check_flow_keys_access() and never folds reg->var_off.value. The verifier therefore accepts an access that, at runtime, dereferences past struct bpf_flow_keys -- a verifier/runtime divergence that yields an out-of-bounds read and write of kernel stack memory. 
Commit 022ac0750883 ("bpf: use reg->var_off instead of reg->off for pointers") removed the generic "off += reg->off" that check_mem_access() applied before the per-type dispatch and replaced it with per-path folding of reg->var_off.value (for example the ctx path now folds the register offset via check_ctx_access()). The PTR_TO_FLOW_KEYS path was not given the equivalent fold, so a constant offset that used to be folded and rejected is now silently accepted: before 022ac0750883: the offset stays in reg->off and is folded generically, so the access is checked with off=4096 and rejected. after 022ac0750883: the offset lands in reg->var_off, the flow_keys path checks off=0 and accepts; at runtime the access dereferences base + 0x1000. For a BPF_PROG_TYPE_FLOW_DISSECTOR program the following is accepted: r2 = *(u64 *)(r1 + 144) ; R2=flow_keys (PTR_TO_FLOW_KEYS) r2 += 0x1000 ; R2=flow_keys(imm=4096), accepted r0 = *(u64 *)(r2 + 0) ; accepted, var_off.value=0x1000 ignored while the equivalent insn->off form r0 = *(u64 *)(r2 + 0x1000) has the same effective offset but is correctly rejected with "invalid access to flow keys off=4096 size=8", which isolates the defect to the missing var_off fold. Once attached as a flow dissector, the accepted program reads kernel stack past struct bpf_flow_keys (a kernel-stack / KASLR information leak) and can likewise write past it, corrupting kernel memory. Fix it by folding reg->var_off.value into the offset before the bounds check and rejecting non-constant offsets, mirroring the other pointer types (e.g. check_ctx_access()). 
Fixes: 022ac0750883 ("bpf: use reg->var_off instead of reg->off for pointers") Signed-off-by: Nuoqi Gui Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260606-c3-01-v3-v3-1-97c51f592f15@mails.tsinghua.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 935595138aa0..68ddd465584c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4728,9 +4728,21 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct b return err; } -static int check_flow_keys_access(struct bpf_verifier_env *env, int off, - int size) +static int check_flow_keys_access(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, argno_t argno, + int off, int size) { + /* Only a constant offset is allowed here; fold it into off. */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "%s invalid variable offset to flow keys: off=%d, var_off=%s\n", + reg_arg_name(env, argno), off, tn_buf); + return -EACCES; + } + off += reg->var_off.value; + if (size < 0 || off < 0 || (u64)off + size > sizeof(struct bpf_flow_keys)) { verbose(env, "invalid access to flow keys off=%d size=%d\n", @@ -6239,7 +6251,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b return -EACCES; } - err = check_flow_keys_access(env, off, size); + err = check_flow_keys_access(env, reg, argno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (type_is_sk_pointer(reg->type)) { -- cgit v1.2.3 From 3ce6b42458f0e2176350fccf86b954d322591ff7 Mon Sep 17 00:00:00 2001 From: Nuoqi Gui Date: Sat, 6 Jun 2026 18:50:38 +0800 Subject: selftests/bpf: add tests for PTR_TO_FLOW_KEYS offset bounds Add verifier tests covering pointer arithmetic on a PTR_TO_FLOW_KEYS register. 
This covers the bpf-next regression where an out-of-bounds constant offset introduced as flow_keys += K and then dereferenced at insn->off 0 was accepted, while the equivalent flow_keys + K direct offset was rejected. The tests check that in-bounds constant arithmetic on the keys pointer is still accepted, out-of-bounds constant arithmetic is rejected for both read and write, and a truly varying offset from bpf_get_prandom_u32() remains rejected by the existing PTR_TO_FLOW_KEYS pointer arithmetic rules. Signed-off-by: Nuoqi Gui Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260606-c3-01-v3-v3-2-97c51f592f15@mails.tsinghua.edu.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_flow_keys.c | 97 ++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_flow_keys.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 89779d897aba..8a3d69e2453c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -38,6 +38,7 @@ #include "verifier_div0.skel.h" #include "verifier_div_mod_bounds.skel.h" #include "verifier_div_overflow.skel.h" +#include "verifier_flow_keys.skel.h" #include "verifier_global_subprogs.skel.h" #include "verifier_global_ptr_args.skel.h" #include "verifier_gotol.skel.h" @@ -190,6 +191,7 @@ void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_st void test_verifier_div0(void) { RUN(verifier_div0); } void test_verifier_div_mod_bounds(void) { RUN(verifier_div_mod_bounds); } void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); } +void test_verifier_flow_keys(void) { RUN(verifier_flow_keys); } void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); } void test_verifier_global_ptr_args(void) { RUN(verifier_global_ptr_args); } 
void test_verifier_gotol(void) { RUN(verifier_gotol); } diff --git a/tools/testing/selftests/bpf/progs/verifier_flow_keys.c b/tools/testing/selftests/bpf/progs/verifier_flow_keys.c new file mode 100644 index 000000000000..d780a36a6e9a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_flow_keys.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Bounds checks for PTR_TO_FLOW_KEYS pointer arithmetic. */ + +#include "vmlinux.h" +#include +#include "bpf_misc.h" + +/* sizeof(struct bpf_flow_keys) is well under 4096, so +0x1000 is OOB. */ + +SEC("flow_dissector") +__description("flow_keys: in-bounds constant pointer arithmetic accepted") +__success +__naked void flow_keys_const_inbounds(void) +{ + asm volatile (" \ + r1 = *(u64 *)(r1 + %[flow_keys]); \ + r1 += 8; \ + r0 = *(u64 *)(r1 + 0); \ + r0 = 0; \ + exit; \ +" : + : __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("flow_keys: OOB via constant pointer arithmetic rejected") +__failure __msg("invalid access to flow keys off=4096 size=8") +__naked void flow_keys_const_oob_read(void) +{ + asm volatile (" \ + r1 = *(u64 *)(r1 + %[flow_keys]); \ + r1 += 4096; \ + r0 = *(u64 *)(r1 + 0); \ + r0 = 0; \ + exit; \ +" : + : __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("flow_keys: OOB write via constant pointer arithmetic rejected") +__failure __msg("invalid access to flow keys off=4096 size=8") +__naked void flow_keys_const_oob_write(void) +{ + asm volatile (" \ + r1 = *(u64 *)(r1 + %[flow_keys]); \ + r1 += 4096; \ + r2 = 0; \ + *(u64 *)(r1 + 0) = r2; \ + r0 = 0; \ + exit; \ +" : + : __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys)) + : __clobber_all); +} + +/* Equivalent OOB expressed directly in insn->off; this form was always + * rejected and is kept to show both forms now share one diagnostic. 
+ */ +SEC("flow_dissector") +__description("flow_keys: OOB via insn->off rejected") +__failure __msg("invalid access to flow keys off=4096 size=8") +__naked void flow_keys_insn_off_oob(void) +{ + asm volatile (" \ + r1 = *(u64 *)(r1 + %[flow_keys]); \ + r0 = *(u64 *)(r1 + 4096); \ + r0 = 0; \ + exit; \ +" : + : __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("flow_keys: variable pointer arithmetic rejected") +__failure __msg("R1 pointer arithmetic on flow_keys prohibited") +__naked void flow_keys_var_read(void) +{ + asm volatile (" \ + r6 = r1; \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xFFFF; \ + r1 = *(u64 *)(r6 + %[flow_keys]); \ + r1 += r0; \ + r0 = *(u64 *)(r1 + 0); \ + r0 = 0; \ + exit; \ +" : + : __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys)), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 5b038319be442c620f774e6fc9e9283deeca1c75 Mon Sep 17 00:00:00 2001 From: David Windsor Date: Fri, 5 Jun 2026 10:57:07 -0400 Subject: bpf: Reject sleepable BPF_LSM_CGROUP programs at load time The cgroup shim runs under rcu_read_lock_dont_migrate(), so we should not attach any sleepable BPF programs there. Add support to the verifier to explicitly reject attempts to load sleepable BPF programs destined for LSM cgroup attachment. 
Without this, we get the following splat from a BPF_LSM_CGROUP program marked BPF_F_SLEEPABLE attached to file_open when it calls bpf_get_dentry_xattr(): BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:1567 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 34317, name: load preempt_count: 0, expected: 0 RCU nest depth: 2, expected: 0 Call Trace: down_read+0x76/0x480 ext4_xattr_get+0x11f/0x700 __vfs_getxattr+0xf0/0x150 bpf_get_dentry_xattr+0xbb/0xf0 bpf_prog_e76a298dac9218c6_test_open+0x6a/0x85 __cgroup_bpf_run_lsm_current+0x326/0x840 bpf_trampoline_6442534646+0x62/0x14d security_file_open+0x34/0x60 do_dentry_open+0x340/0x1260 vfs_open+0x7a/0x440 path_openat+0x1bac/0x30a0 libbpf provides a .s named section variant for every sleepable program type except lsm_cgroup, reflecting that per-cgroup LSM programs are intended to only run in a non-sleepable context. The above splat was obtained by bypassing libbpf by using bpf(2) directly. Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Signed-off-by: David Windsor Acked-by: Yonghong Song Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20260605145707.608579-1-dwindsor@gmail.com Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 68ddd465584c..926ff63a0b61 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19172,8 +19172,10 @@ static bool can_be_sleepable(struct bpf_prog *prog) return false; } } - return prog->type == BPF_PROG_TYPE_LSM || - prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || + if (prog->type == BPF_PROG_TYPE_LSM) + return prog->expected_attach_type != BPF_LSM_CGROUP; + + return prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || prog->type == BPF_PROG_TYPE_STRUCT_OPS || prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT || prog->type == BPF_PROG_TYPE_TRACEPOINT; -- cgit v1.2.3 From 
e57f13eaab259ece7c9e8d81ba2c40c4f057ca2c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:26 +0200 Subject: ftrace: Add ftrace_hash_count function Adding external ftrace_hash_count function so we could get hash count outside of ftrace object. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/ftrace.h | 7 +++++++ kernel/trace/ftrace.c | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 28b30c6f1031..02c24bf766ce 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -551,6 +551,8 @@ int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, b void ftrace_stub_direct_tramp(void); +unsigned long ftrace_hash_count(struct ftrace_hash *hash); + #else struct ftrace_ops; static inline unsigned long ftrace_find_rec_direct(unsigned long ip) @@ -590,6 +592,11 @@ static inline int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace return -ENODEV; } +static inline unsigned long ftrace_hash_count(struct ftrace_hash *hash) +{ + return 0; +} + /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b2611de3f594..57ab01fd00bd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6288,11 +6288,16 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) } EXPORT_SYMBOL_GPL(modify_ftrace_direct); -static unsigned long hash_count(struct ftrace_hash *hash) +static inline unsigned long hash_count(struct ftrace_hash *hash) { return hash ? 
hash->count : 0; } +unsigned long ftrace_hash_count(struct ftrace_hash *hash) +{ + return hash_count(hash); +} + /** * hash_add - adds two struct ftrace_hash and returns the result * @a: struct ftrace_hash object -- cgit v1.2.3 From af7c32365090a1a8ff981f85d7c24b344a2eaa75 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:27 +0200 Subject: ftrace: Add ftrace_hash_remove function Add the ftrace_hash_remove function, which removes all entries from a struct ftrace_hash object without freeing them. It will be used in the following changes, where entries are allocated as part of another structure and are freed separately. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 02c24bf766ce..b55ec9b25bb3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -415,6 +415,7 @@ struct ftrace_hash *alloc_ftrace_hash(int size_bits); void free_ftrace_hash(struct ftrace_hash *hash); struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct); +void ftrace_hash_remove(struct ftrace_hash *hash); /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57ab01fd00bd..45548b0200eb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1249,6 +1249,25 @@ remove_hash_entry(struct ftrace_hash *hash, hash->count--; } +void ftrace_hash_remove(struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + struct hlist_node *tn; + int size; + int i; + + if (!hash || !hash->count) + return; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hhd = &hash->buckets[i]; + hlist_for_each_entry_safe(entry, tn, hhd, 
hlist) + remove_hash_entry(hash, entry); + } + FTRACE_WARN_ON(hash->count); +} + static void ftrace_hash_clear(struct ftrace_hash *hash) { struct hlist_head *hhd; -- cgit v1.2.3 From 2cd298c106e00ba1d8799b022594f131703f32fa Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:28 +0200 Subject: ftrace: Add add_ftrace_hash_entry function Renaming __add_hash_entry to add_ftrace_hash_entry and making it global, it will be used in following changes outside ftrace.c object. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b55ec9b25bb3..02bc5027523a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -415,6 +415,7 @@ struct ftrace_hash *alloc_ftrace_hash(int size_bits); void free_ftrace_hash(struct ftrace_hash *hash); struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct); +void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry); void ftrace_hash_remove(struct ftrace_hash *hash); /* The hash used to know what functions callbacks trace */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 45548b0200eb..f93e34dd2328 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1198,8 +1198,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) return __ftrace_lookup_ip(hash, ip); } -static void __add_hash_entry(struct ftrace_hash *hash, - struct ftrace_func_entry *entry) +void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { struct hlist_head *hhd; unsigned long key; @@ -1221,7 +1220,7 @@ add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigne entry->ip = ip; entry->direct = direct; - 
__add_hash_entry(hash, entry); + add_ftrace_hash_entry(hash, entry); return entry; } @@ -1477,7 +1476,7 @@ static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size) hhd = &src->buckets[i]; hlist_for_each_entry_safe(entry, tn, hhd, hlist) { remove_hash_entry(src, entry); - __add_hash_entry(new_hash, entry); + add_ftrace_hash_entry(new_hash, entry); } } return new_hash; @@ -5360,7 +5359,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, map->entry.ip = ip; map->data = data; - __add_hash_entry(&mapper->hash, &map->entry); + add_ftrace_hash_entry(&mapper->hash, &map->entry); return 0; } -- cgit v1.2.3 From e6abd4cd157bf63cd89c74f8f10abae76e7b0359 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:29 +0200 Subject: bpf: Use mutex lock pool for bpf trampolines Add a mutex lock pool that replaces the bpf trampolines mutex. For the tracing_multi link coming in following changes we need to lock all the involved trampolines during the attachment. This could mean thousands of mutex locks, which is not convenient. As suggested by Andrii we can replace the bpf trampolines mutex with a mutex pool, where each trampoline is hashed to one of the locks from the pool. It's better to lock all the pool mutexes (32 at the moment) than thousands of them. There is a limit of 48 (MAX_LOCK_DEPTH) locks that a task is allowed to hold simultaneously, so we need to keep 32 mutexes (5 bits) in the pool, so that when we lock them all in following changes lockdep won't scream. Remove the mutex_is_locked check in bpf_trampoline_put, because we removed the mutex from bpf_trampoline. 
Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 -- kernel/bpf/trampoline.c | 77 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 53 insertions(+), 26 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f615b56730d2..f6056bab6f23 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1353,8 +1353,6 @@ struct bpf_trampoline { /* hlist for trampoline_ip_table */ struct hlist_node hlist_ip; struct ftrace_ops *fops; - /* serializes access to fields of this trampoline */ - struct mutex mutex; refcount_t refcnt; u32 flags; u64 key; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index a4298a25d4ba..c0b4732627be 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -30,6 +30,35 @@ static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline tables */ static DEFINE_MUTEX(trampoline_mutex); +/* + * Keep 32 trampoline locks (5 bits) in the pool so trampoline_lock_all() + * stays below MAX_LOCK_DEPTH. Each pool slot has a distinct lockdep + * class because trampoline_lock_all() takes all pool mutexes at once; + * otherwise lockdep would report recursive locking on same-class mutexes. 
+ */ +#define TRAMPOLINE_LOCKS_BITS 5 +#define TRAMPOLINE_LOCKS_TABLE_SIZE (1 << TRAMPOLINE_LOCKS_BITS) + +static struct { + struct mutex mutex; + struct lock_class_key key; +} trampoline_locks[TRAMPOLINE_LOCKS_TABLE_SIZE]; + +static struct mutex *select_trampoline_lock(struct bpf_trampoline *tr) +{ + return &trampoline_locks[hash_ptr(tr, TRAMPOLINE_LOCKS_BITS)].mutex; +} + +static void trampoline_lock(struct bpf_trampoline *tr) +{ + mutex_lock(select_trampoline_lock(tr)); +} + +static void trampoline_unlock(struct bpf_trampoline *tr) +{ + mutex_unlock(select_trampoline_lock(tr)); +} + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); @@ -69,9 +98,9 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { /* This is called inside register_ftrace_direct_multi(), so - * tr->mutex is already locked. + * trampoline's mutex is already locked. */ - lockdep_assert_held_once(&tr->mutex); + lockdep_assert_held_once(select_trampoline_lock(tr)); /* Instead of updating the trampoline here, we propagate * -EAGAIN to register_ftrace_direct(). Then we can @@ -91,7 +120,7 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, } /* The normal locking order is - * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) + * select_trampoline_lock(tr) => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) * * The following two commands are called from * @@ -99,12 +128,12 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, * cleanup_direct_functions_after_ipmodify * * In both cases, direct_mutex is already locked. Use - * mutex_trylock(&tr->mutex) to avoid deadlock in race condition - * (something else is making changes to this same trampoline). 
+ * mutex_trylock(select_trampoline_lock(tr)) to avoid deadlock in race condition + * (something else holds the same pool lock). */ - if (!mutex_trylock(&tr->mutex)) { - /* sleep 1 ms to make sure whatever holding tr->mutex makes - * some progress. + if (!mutex_trylock(select_trampoline_lock(tr))) { + /* sleep 1 ms to make sure whatever holding select_trampoline_lock(tr) + * makes some progress. */ msleep(1); return -EAGAIN; @@ -129,7 +158,7 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, break; } - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return ret; } #endif @@ -359,7 +388,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)]; hlist_add_head(&tr->hlist_ip, head); refcount_set(&tr->refcnt, 1); - mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); out: @@ -843,9 +871,9 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, { int err; - mutex_lock(&tr->mutex); + trampoline_lock(tr); err = __bpf_trampoline_link_prog(link, tr, tgt_prog); - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return err; } @@ -886,9 +914,9 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, { int err; - mutex_lock(&tr->mutex); + trampoline_lock(tr); err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog); - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return err; } @@ -998,12 +1026,12 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, if (!tr) return -ENOMEM; - mutex_lock(&tr->mutex); + trampoline_lock(tr); shim_link = cgroup_shim_find(tr, bpf_func); if (shim_link && !IS_ERR(bpf_link_inc_not_zero(&shim_link->link.link))) { /* Reusing existing shim attached by the other program. 
*/ - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return 0; } @@ -1023,16 +1051,16 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, shim_link->trampoline = tr; /* note, we're still holding tr refcnt from above */ - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return 0; err: - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); if (shim_link) bpf_link_put(&shim_link->link.link); - /* have to release tr while _not_ holding its mutex */ + /* have to release tr while _not_ holding pool mutex for trampoline */ bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return err; @@ -1053,9 +1081,9 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) if (WARN_ON_ONCE(!tr)) return; - mutex_lock(&tr->mutex); + trampoline_lock(tr); shim_link = cgroup_shim_find(tr, bpf_func); - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); if (shim_link) bpf_link_put(&shim_link->link.link); @@ -1073,14 +1101,14 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, if (!tr) return NULL; - mutex_lock(&tr->mutex); + trampoline_lock(tr); if (tr->func.addr) goto out; memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); tr->func.addr = (void *)tgt_info->tgt_addr; out: - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return tr; } @@ -1093,7 +1121,6 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) mutex_lock(&trampoline_mutex); if (!refcount_dec_and_test(&tr->refcnt)) goto out; - WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); for (i = 0; i < BPF_TRAMP_MAX; i++) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) @@ -1379,6 +1406,8 @@ static int __init init_trampolines(void) INIT_HLIST_HEAD(&trampoline_key_table[i]); for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) INIT_HLIST_HEAD(&trampoline_ip_table[i]); + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + __mutex_init(&trampoline_locks[i].mutex, "trampoline_lock", &trampoline_locks[i].key); return 0; } late_initcall(init_trampolines); -- 
cgit v1.2.3 From 8a35e8db740f96ec17b85db5a0f83c028c707a3e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:30 +0200 Subject: bpf: Add struct bpf_trampoline_ops object In following changes we will need to override ftrace direct attachment behaviour. In order to do that we are adding struct bpf_trampoline_ops object that defines callbacks for ftrace direct attachment: register_fentry unregister_fentry modify_fentry The new struct bpf_trampoline_ops object is passed as an argument to __bpf_trampoline_link/unlink_prog functions. At the moment the default trampoline_ops is set to the current ftrace direct attachment functions, so there's no functional change for the current code. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-6-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 59 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index c0b4732627be..5c943832fb9d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -59,8 +59,18 @@ static void trampoline_unlock(struct bpf_trampoline *tr) mutex_unlock(select_trampoline_lock(tr)); } +struct bpf_trampoline_ops { + int (*register_fentry)(struct bpf_trampoline *tr, void *new_addr, void *data); + int (*unregister_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *old_addr, + void *data); + int (*modify_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *old_addr, + void *new_addr, bool lock_direct_mutex, void *data); +}; + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex, + const struct bpf_trampoline_ops *ops, void *data); +static const struct bpf_trampoline_ops trampoline_ops; #ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS static struct 
bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip) @@ -145,13 +155,15 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) - ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */, + &trampoline_ops, NULL); break; case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER: tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY; if (tr->flags & BPF_TRAMP_F_ORIG_STACK) - ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */, + &trampoline_ops, NULL); break; default: ret = -EINVAL; @@ -415,7 +427,7 @@ static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flag } static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, - void *old_addr) + void *old_addr, void *data __maybe_unused) { int ret; @@ -429,7 +441,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, void *old_addr, void *new_addr, - bool lock_direct_mutex) + bool lock_direct_mutex, void *data __maybe_unused) { int ret; @@ -443,7 +455,7 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, } /* first time registering */ -static int register_fentry(struct bpf_trampoline *tr, void *new_addr) +static int register_fentry(struct bpf_trampoline *tr, void *new_addr, void *data __maybe_unused) { void *ip = tr->func.addr; unsigned long faddr; @@ -465,6 +477,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; } +static const struct bpf_trampoline_ops trampoline_ops = { + .register_fentry = register_fentry, + .unregister_fentry = unregister_fentry, + .modify_fentry = modify_fentry, +}; + static struct bpf_tramp_links * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, 
bool *ip_arg) { @@ -632,7 +650,8 @@ out: return ERR_PTR(err); } -static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex, + const struct bpf_trampoline_ops *ops, void *data) { struct bpf_tramp_image *im; struct bpf_tramp_links *tlinks; @@ -645,7 +664,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut return PTR_ERR(tlinks); if (total == 0) { - err = unregister_fentry(tr, orig_flags, tr->cur_image->image); + err = ops->unregister_fentry(tr, orig_flags, tr->cur_image->image, data); bpf_tramp_image_put(tr->cur_image); tr->cur_image = NULL; goto out; @@ -715,11 +734,11 @@ again: if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, orig_flags, tr->cur_image->image, - im->image, lock_direct_mutex); + err = ops->modify_fentry(tr, orig_flags, tr->cur_image->image, + im->image, lock_direct_mutex, data); else /* first time registering */ - err = register_fentry(tr, im->image); + err = ops->register_fentry(tr, im->image, data); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if (err == -EAGAIN) { @@ -793,7 +812,9 @@ static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, - struct bpf_prog *tgt_prog) + struct bpf_prog *tgt_prog, + const struct bpf_trampoline_ops *ops, + void *data) { struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; @@ -851,7 +872,7 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, } else { tr->progs_cnt[kind]++; } - err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); if (err) { hlist_del_init(&link->tramp_hlist); if (kind == BPF_TRAMP_FSESSION) { @@ -872,14 +893,16 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, int err; 
trampoline_lock(tr); - err = __bpf_trampoline_link_prog(link, tr, tgt_prog); + err = __bpf_trampoline_link_prog(link, tr, tgt_prog, &trampoline_ops, NULL); trampoline_unlock(tr); return err; } static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, - struct bpf_prog *tgt_prog) + struct bpf_prog *tgt_prog, + const struct bpf_trampoline_ops *ops, + void *data) { enum bpf_tramp_prog_type kind; int err; @@ -904,7 +927,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; - return bpf_trampoline_update(tr, true /* lock_direct_mutex */); + return bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); } /* bpf_trampoline_unlink_prog() should never fail. */ @@ -915,7 +938,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, int err; trampoline_lock(tr); - err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog); + err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog, &trampoline_ops, NULL); trampoline_unlock(tr); return err; } @@ -1044,7 +1067,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, goto err; } - err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL); + err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL, &trampoline_ops, NULL); if (err) goto err; -- cgit v1.2.3 From bf4bc3e11c4195123055780b84dccfb8d2569535 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:31 +0200 Subject: bpf: Move trampoline image setup into bpf_trampoline_ops callbacks Moving trampoline image setup into bpf_trampoline_ops callbacks, so we can have different image handling for multi attachment which is coming in following changes. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-7-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 66 ++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5c943832fb9d..1006031ea021 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -60,11 +60,10 @@ static void trampoline_unlock(struct bpf_trampoline *tr) } struct bpf_trampoline_ops { - int (*register_fentry)(struct bpf_trampoline *tr, void *new_addr, void *data); - int (*unregister_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *old_addr, - void *data); - int (*modify_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *old_addr, - void *new_addr, bool lock_direct_mutex, void *data); + int (*register_fentry)(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *data); + int (*unregister_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *data); + int (*modify_fentry)(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, + bool lock_direct_mutex, void *data); }; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -426,9 +425,11 @@ static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flag return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr); } -static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, - void *old_addr, void *data __maybe_unused) +static void bpf_tramp_image_put(struct bpf_tramp_image *im); + +static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, void *data __maybe_unused) { + void *old_addr = tr->cur_image->image; int ret; if (tr->func.ftrace_managed) @@ -436,13 +437,19 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, else ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL); - return ret; + if (ret) + return ret; + + bpf_tramp_image_put(tr->cur_image); + 
tr->cur_image = NULL; + return 0; } -static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, - void *old_addr, void *new_addr, +static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, bool lock_direct_mutex, void *data __maybe_unused) { + void *old_addr = tr->cur_image->image; + void *new_addr = im->image; int ret; if (tr->func.ftrace_managed) { @@ -451,12 +458,20 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, new_addr); } - return ret; + + if (ret) + return ret; + + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = im; + return 0; } /* first time registering */ -static int register_fentry(struct bpf_trampoline *tr, void *new_addr, void *data __maybe_unused) +static int register_fentry(struct bpf_trampoline *tr, struct bpf_tramp_image *im, + void *data __maybe_unused) { + void *new_addr = im->image; void *ip = tr->func.addr; unsigned long faddr; int ret; @@ -474,7 +489,11 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr, void *data ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } - return ret; + if (ret) + return ret; + + tr->cur_image = im; + return 0; } static const struct bpf_trampoline_ops trampoline_ops = { @@ -664,9 +683,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut return PTR_ERR(tlinks); if (total == 0) { - err = ops->unregister_fentry(tr, orig_flags, tr->cur_image->image, data); - bpf_tramp_image_put(tr->cur_image); - tr->cur_image = NULL; + err = ops->unregister_fentry(tr, orig_flags, data); goto out; } @@ -734,11 +751,10 @@ again: if (tr->cur_image) /* progs already running at this address */ - err = ops->modify_fentry(tr, orig_flags, tr->cur_image->image, - im->image, lock_direct_mutex, data); + err = ops->modify_fentry(tr, orig_flags, im, lock_direct_mutex, data); else /* first time registering */ - err = ops->register_fentry(tr, im->image, 
data); + err = ops->register_fentry(tr, im, data); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if (err == -EAGAIN) { @@ -750,22 +766,16 @@ again: goto again; } #endif - if (err) - goto out_free; - if (tr->cur_image) - bpf_tramp_image_put(tr->cur_image); - tr->cur_image = im; +out_free: + if (err) + bpf_tramp_image_free(im); out: /* If any error happens, restore previous flags */ if (err) tr->flags = orig_flags; kfree(tlinks); return err; - -out_free: - bpf_tramp_image_free(im); - goto out; } static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) -- cgit v1.2.3 From e6cc9ed677e622265bbd015892be58a1eece6238 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:32 +0200 Subject: bpf: Add bpf_trampoline_add/remove_prog functions Separate bpf_trampoline_add/remove_prog functions from __bpf_trampoline_link/unlink functions to be able to add/remove trampoline programs without the image being updated in following changes. No functional change is intended. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-8-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 108 +++++++++++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 47 deletions(-) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 1006031ea021..701138ef424a 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -820,41 +820,16 @@ static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) return 0; } -static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, - struct bpf_trampoline *tr, - struct bpf_prog *tgt_prog, - const struct bpf_trampoline_ops *ops, - void *data) +static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, + struct bpf_tramp_link *link, + int cnt) { struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; struct hlist_head *prog_list; - int err = 0; - int cnt = 0, i; kind = bpf_attach_type_to_tramp(link->link.prog); - if (tr->extension_prog) - /* cannot attach fentry/fexit if extension prog is attached. - * cannot overwrite extension prog either. - */ - return -EBUSY; - - for (i = 0; i < BPF_TRAMP_MAX; i++) - cnt += tr->progs_cnt[i]; - - if (kind == BPF_TRAMP_REPLACE) { - /* Cannot attach extension if fentry/fexit are in use. 
*/ - if (cnt) - return -EBUSY; - err = bpf_freplace_check_tgt_prog(tgt_prog); - if (err) - return err; - tr->extension_prog = link->link.prog; - return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, - BPF_MOD_JUMP, NULL, - link->link.prog->bpf_func); - } if (kind == BPF_TRAMP_FSESSION) { prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; cnt++; @@ -882,17 +857,64 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, } else { tr->progs_cnt[kind]++; } - err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); - if (err) { - hlist_del_init(&link->tramp_hlist); - if (kind == BPF_TRAMP_FSESSION) { - tr->progs_cnt[BPF_TRAMP_FENTRY]--; - hlist_del_init(&fslink->fexit.tramp_hlist); - tr->progs_cnt[BPF_TRAMP_FEXIT]--; - } else { - tr->progs_cnt[kind]--; - } + return 0; +} + +static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr, + struct bpf_tramp_link *link) +{ + struct bpf_fsession_link *fslink; + enum bpf_tramp_prog_type kind; + + kind = bpf_attach_type_to_tramp(link->link.prog); + if (kind == BPF_TRAMP_FSESSION) { + fslink = container_of(link, struct bpf_fsession_link, link.link); + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; + } + hlist_del_init(&link->tramp_hlist); + tr->progs_cnt[kind]--; +} + +static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog, + const struct bpf_trampoline_ops *ops, + void *data) +{ + enum bpf_tramp_prog_type kind; + int err = 0; + int cnt = 0, i; + + kind = bpf_attach_type_to_tramp(link->link.prog); + if (tr->extension_prog) + /* cannot attach fentry/fexit if extension prog is attached. + * cannot overwrite extension prog either. + */ + return -EBUSY; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + cnt += tr->progs_cnt[i]; + + if (kind == BPF_TRAMP_REPLACE) { + /* Cannot attach extension if fentry/fexit are in use. 
*/ + if (cnt) + return -EBUSY; + err = bpf_freplace_check_tgt_prog(tgt_prog); + if (err) + return err; + tr->extension_prog = link->link.prog; + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, + link->link.prog->bpf_func); } + err = bpf_trampoline_add_prog(tr, link, cnt); + if (err) + return err; + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); + if (err) + bpf_trampoline_remove_prog(tr, link); return err; } @@ -927,16 +949,8 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; - } else if (kind == BPF_TRAMP_FSESSION) { - struct bpf_fsession_link *fslink = - container_of(link, struct bpf_fsession_link, link.link); - - hlist_del_init(&fslink->fexit.tramp_hlist); - tr->progs_cnt[BPF_TRAMP_FEXIT]--; - kind = BPF_TRAMP_FENTRY; } - hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; + bpf_trampoline_remove_prog(tr, link); return bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); } -- cgit v1.2.3 From 65499074efaf574fef6365ac63b785a3ec98913d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:33 +0200 Subject: bpf: Add struct bpf_tramp_node object Adding struct bpf_tramp_node to decouple the link out of the trampoline attachment info. At the moment the object for attaching bpf program to the trampoline is 'struct bpf_tramp_link': struct bpf_tramp_link { struct bpf_link link; struct hlist_node tramp_hlist; u64 cookie; } The link holds the bpf_prog pointer and forces one link - one program binding logic. In following changes we want to attach program to multiple trampolines but we want to keep just one bpf_link object. 
Splitting struct bpf_tramp_link into: struct bpf_tramp_link { struct bpf_link link; struct bpf_tramp_node node; }; struct bpf_tramp_node { struct bpf_link *link; struct hlist_node tramp_hlist; u64 cookie; }; The 'struct bpf_tramp_link' defines standard single trampoline link and 'struct bpf_tramp_node' is the attachment trampoline object with pointer to the bpf_link object. This will allow us to define link for multiple trampolines, like: struct bpf_tracing_multi_link { struct bpf_link link; ... int nodes_cnt; struct bpf_tracing_multi_node nodes[] __counted_by(nodes_cnt); }; Cc: Hengqi Chen Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-9-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 58 ++++++++++---------- arch/loongarch/net/bpf_jit.c | 52 +++++++++--------- arch/powerpc/net/bpf_jit_comp.c | 54 +++++++++---------- arch/riscv/net/bpf_jit_comp64.c | 52 +++++++++--------- arch/s390/net/bpf_jit_comp.c | 44 +++++++-------- arch/x86/net/bpf_jit_comp.c | 54 +++++++++---------- include/linux/bpf.h | 60 +++++++++++++-------- kernel/bpf/bpf_struct_ops.c | 27 +++++----- kernel/bpf/syscall.c | 39 ++++++++------ kernel/bpf/trampoline.c | 115 ++++++++++++++++++++-------------------- net/bpf/bpf_dummy_struct_ops.c | 14 ++--- 11 files changed, 294 insertions(+), 275 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index b4abc3138f37..f6bcc0e1a950 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2335,24 +2335,24 @@ bool bpf_jit_supports_subprog_tailcalls(void) return true; } -static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, +static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_node *node, int bargs_off, int retval_off, int run_ctx_off, bool save_ret) { __le32 *branch; u64 enter_prog; u64 exit_prog; - struct bpf_prog *p = l->link.prog; + struct bpf_prog *p = node->link->prog; int cookie_off = 
offsetof(struct bpf_tramp_run_ctx, bpf_cookie); enter_prog = (u64)bpf_trampoline_enter(p); exit_prog = (u64)bpf_trampoline_exit(p); - if (l->cookie == 0) { + if (node->cookie == 0) { /* if cookie is zero, one instruction is enough to store it */ emit(A64_STR64I(A64_ZR, A64_SP, run_ctx_off + cookie_off), ctx); } else { - emit_a64_mov_i64(A64_R(10), l->cookie, ctx); + emit_a64_mov_i64(A64_R(10), node->cookie, ctx); emit(A64_STR64I(A64_R(10), A64_SP, run_ctx_off + cookie_off), ctx); } @@ -2402,7 +2402,7 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, emit_call(exit_prog, ctx); } -static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl, +static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_nodes *tn, int bargs_off, int retval_off, int run_ctx_off, __le32 **branches) { @@ -2412,8 +2412,8 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl, * Set this to 0 to avoid confusing the program. */ emit(A64_STR64I(A64_ZR, A64_SP, retval_off), ctx); - for (i = 0; i < tl->nr_links; i++) { - invoke_bpf_prog(ctx, tl->links[i], bargs_off, retval_off, + for (i = 0; i < tn->nr_nodes; i++) { + invoke_bpf_prog(ctx, tn->nodes[i], bargs_off, retval_off, run_ctx_off, true); /* if (*(u64 *)(sp + retval_off) != 0) * goto do_fexit; @@ -2544,10 +2544,10 @@ static void restore_args(struct jit_ctx *ctx, int bargs_off, int nregs) } } -static bool is_struct_ops_tramp(const struct bpf_tramp_links *fentry_links) +static bool is_struct_ops_tramp(const struct bpf_tramp_nodes *fentry_nodes) { - return fentry_links->nr_links == 1 && - fentry_links->links[0]->link.type == BPF_LINK_TYPE_STRUCT_OPS; + return fentry_nodes->nr_nodes == 1 && + fentry_nodes->nodes[0]->link->type == BPF_LINK_TYPE_STRUCT_OPS; } static void store_func_meta(struct jit_ctx *ctx, u64 func_meta, int func_meta_off) @@ -2568,7 +2568,7 @@ static void store_func_meta(struct jit_ctx *ctx, u64 func_meta, int func_meta_of * */ static int 
prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, - struct bpf_tramp_links *tlinks, void *func_addr, + struct bpf_tramp_nodes *tnodes, void *func_addr, const struct btf_func_model *m, const struct arg_aux *a, u32 flags) @@ -2584,14 +2584,14 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, int run_ctx_off; int oargs_off; int nfuncargs; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; bool save_ret; __le32 **branches = NULL; bool is_struct_ops = is_struct_ops_tramp(fentry); int cookie_off, cookie_cnt, cookie_bargs_off; - int fsession_cnt = bpf_fsession_cnt(tlinks); + int fsession_cnt = bpf_fsession_cnt(tnodes); u64 func_meta; /* trampoline stack layout: @@ -2637,7 +2637,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, cookie_off = stack_size; /* room for session cookies */ - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); stack_size += cookie_cnt * 8; ip_off = stack_size; @@ -2734,20 +2734,20 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } cookie_bargs_off = (bargs_off - cookie_off) / 8; - for (i = 0; i < fentry->nr_links; i++) { - if (bpf_prog_calls_session_cookie(fentry->links[i])) { + for (i = 0; i < fentry->nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(fentry->nodes[i])) { u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); store_func_meta(ctx, meta, func_meta_off); cookie_bargs_off--; } - invoke_bpf_prog(ctx, fentry->links[i], bargs_off, + invoke_bpf_prog(ctx, fentry->nodes[i], bargs_off, retval_off, run_ctx_off, flags & 
BPF_TRAMP_F_RET_FENTRY_RET); } - if (fmod_ret->nr_links) { - branches = kcalloc(fmod_ret->nr_links, sizeof(__le32 *), + if (fmod_ret->nr_nodes) { + branches = kcalloc(fmod_ret->nr_nodes, sizeof(__le32 *), GFP_KERNEL); if (!branches) return -ENOMEM; @@ -2771,7 +2771,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } /* update the branches saved in invoke_bpf_mod_ret with cbnz */ - for (i = 0; i < fmod_ret->nr_links && ctx->image != NULL; i++) { + for (i = 0; i < fmod_ret->nr_nodes && ctx->image != NULL; i++) { int offset = &ctx->image[ctx->idx] - branches[i]; *branches[i] = cpu_to_le32(A64_CBNZ(1, A64_R(10), offset)); } @@ -2782,14 +2782,14 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, store_func_meta(ctx, func_meta, func_meta_off); cookie_bargs_off = (bargs_off - cookie_off) / 8; - for (i = 0; i < fexit->nr_links; i++) { - if (bpf_prog_calls_session_cookie(fexit->links[i])) { + for (i = 0; i < fexit->nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(fexit->nodes[i])) { u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); store_func_meta(ctx, meta, func_meta_off); cookie_bargs_off--; } - invoke_bpf_prog(ctx, fexit->links[i], bargs_off, retval_off, + invoke_bpf_prog(ctx, fexit->nodes[i], bargs_off, retval_off, run_ctx_off, false); } @@ -2847,7 +2847,7 @@ bool bpf_jit_supports_fsession(void) } int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { struct jit_ctx ctx = { .image = NULL, @@ -2861,7 +2861,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, if (ret < 0) return ret; - ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, m, &aaux, flags); + ret = prepare_trampoline(&ctx, &im, tnodes, func_addr, m, &aaux, flags); if (ret < 0) return ret; @@ -2885,7 +2885,7 @@ int arch_protect_bpf_trampoline(void *image, unsigned int 
size) int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, void *ro_image_end, const struct btf_func_model *m, - u32 flags, struct bpf_tramp_links *tlinks, + u32 flags, struct bpf_tramp_nodes *tnodes, void *func_addr) { u32 size = ro_image_end - ro_image; @@ -2912,7 +2912,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, ret = calc_arg_aux(m, &aaux); if (ret) goto out; - ret = prepare_trampoline(&ctx, im, tlinks, func_addr, m, &aaux, flags); + ret = prepare_trampoline(&ctx, im, tnodes, func_addr, m, &aaux, flags); if (ret > 0 && validate_code(&ctx) < 0) { ret = -EINVAL; diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index 24913dc7f4e8..058ffbbaad85 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1674,17 +1674,17 @@ static void restore_stk_args(struct jit_ctx *ctx, int nr_stk_args, int args_off, } } -static int invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, +static int invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_node *n, int args_off, int retval_off, int run_ctx_off, bool save_ret) { int ret; u32 *branch; - struct bpf_prog *p = l->link.prog; + struct bpf_prog *p = n->link->prog; int cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); - if (l->cookie) + if (n->cookie) emit_store_stack_imm64(ctx, LOONGARCH_GPR_T1, - -run_ctx_off + cookie_off, l->cookie); + -run_ctx_off + cookie_off, n->cookie); else emit_insn(ctx, std, LOONGARCH_GPR_ZERO, LOONGARCH_GPR_FP, -run_ctx_off + cookie_off); @@ -1737,22 +1737,22 @@ static int invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, return ret; } -static int invoke_bpf(struct jit_ctx *ctx, struct bpf_tramp_links *tl, +static int invoke_bpf(struct jit_ctx *ctx, struct bpf_tramp_nodes *tn, int args_off, int retval_off, int run_ctx_off, int func_meta_off, bool save_ret, u64 func_meta, int cookie_off) { int i, cur_cookie = (cookie_off - args_off) / 8; - for (i = 0; i < 
tl->nr_links; i++) { + for (i = 0; i < tn->nr_nodes; i++) { int err; - if (bpf_prog_calls_session_cookie(tl->links[i])) { + if (bpf_prog_calls_session_cookie(tn->nodes[i])) { u64 meta = func_meta | ((u64)cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT); emit_store_stack_imm64(ctx, LOONGARCH_GPR_T1, -func_meta_off, meta); cur_cookie--; } - err = invoke_bpf_prog(ctx, tl->links[i], args_off, retval_off, run_ctx_off, save_ret); + err = invoke_bpf_prog(ctx, tn->nodes[i], args_off, retval_off, run_ctx_off, save_ret); if (err) return err; } @@ -1807,7 +1807,7 @@ static void sign_extend(struct jit_ctx *ctx, int rd, int rj, u8 size, bool sign) } static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, - const struct btf_func_model *m, struct bpf_tramp_links *tlinks, + const struct btf_func_model *m, struct bpf_tramp_nodes *tnodes, void *func_addr, u32 flags) { int i, ret, save_ret; @@ -1817,9 +1817,9 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i unsigned long long func_meta; bool is_struct_ops = flags & BPF_TRAMP_F_INDIRECT; void *orig_call = func_addr; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; u32 **branches = NULL; /* @@ -1898,7 +1898,7 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i ip_off = stack_size; } - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); /* Room for session cookies */ stack_size += cookie_cnt * 8; @@ -1969,7 +1969,7 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i store_args(ctx, nr_arg_slots, args_off); - if (bpf_fsession_cnt(tlinks)) { 
+ if (bpf_fsession_cnt(tnodes)) { /* clear all session cookies' value */ for (i = 0; i < cookie_cnt; i++) emit_insn(ctx, std, LOONGARCH_GPR_ZERO, LOONGARCH_GPR_FP, -cookie_off + 8 * i); @@ -1994,20 +1994,20 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i return ret; } - if (fentry->nr_links) { + if (fentry->nr_nodes) { ret = invoke_bpf(ctx, fentry, args_off, retval_off, run_ctx_off, func_meta_off, flags & BPF_TRAMP_F_RET_FENTRY_RET, func_meta, cookie_off); if (ret) return ret; } - if (fmod_ret->nr_links) { - branches = kcalloc(fmod_ret->nr_links, sizeof(u32 *), GFP_KERNEL); + if (fmod_ret->nr_nodes) { + branches = kcalloc(fmod_ret->nr_nodes, sizeof(u32 *), GFP_KERNEL); if (!branches) return -ENOMEM; emit_insn(ctx, std, LOONGARCH_GPR_ZERO, LOONGARCH_GPR_FP, -retval_off); - for (i = 0; i < fmod_ret->nr_links; i++) { - ret = invoke_bpf_prog(ctx, fmod_ret->links[i], + for (i = 0; i < fmod_ret->nr_nodes; i++) { + ret = invoke_bpf_prog(ctx, fmod_ret->nodes[i], args_off, retval_off, run_ctx_off, true); if (ret) goto out; @@ -2035,17 +2035,17 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i emit_insn(ctx, nop); } - for (i = 0; ctx->image && i < fmod_ret->nr_links; i++) { + for (i = 0; ctx->image && i < fmod_ret->nr_nodes; i++) { int offset = (void *)(&ctx->image[ctx->idx]) - (void *)branches[i]; *branches[i] = larch_insn_gen_bne(LOONGARCH_GPR_T1, LOONGARCH_GPR_ZERO, offset); } /* Set "is_return" flag for fsession */ func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT); - if (bpf_fsession_cnt(tlinks)) + if (bpf_fsession_cnt(tnodes)) emit_store_stack_imm64(ctx, LOONGARCH_GPR_T1, -func_meta_off, func_meta); - if (fexit->nr_links) { + if (fexit->nr_nodes) { ret = invoke_bpf(ctx, fexit, args_off, retval_off, run_ctx_off, func_meta_off, false, func_meta, cookie_off); if (ret) @@ -2115,7 +2115,7 @@ out: int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, void *ro_image_end, const struct 
btf_func_model *m, - u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) + u32 flags, struct bpf_tramp_nodes *tnodes, void *func_addr) { int ret, size; void *image, *tmp; @@ -2131,7 +2131,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, ctx.idx = 0; jit_fill_hole(image, (unsigned int)(ro_image_end - ro_image)); - ret = __arch_prepare_bpf_trampoline(&ctx, im, m, tlinks, func_addr, flags); + ret = __arch_prepare_bpf_trampoline(&ctx, im, m, tnodes, func_addr, flags); if (ret < 0) goto out; @@ -2152,7 +2152,7 @@ out: } int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { int ret; struct jit_ctx ctx; @@ -2161,7 +2161,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, ctx.image = NULL; ctx.idx = 0; - ret = __arch_prepare_bpf_trampoline(&ctx, &im, m, tlinks, func_addr, flags); + ret = __arch_prepare_bpf_trampoline(&ctx, &im, m, tnodes, func_addr, flags); return ret < 0 ? 
ret : ret * LOONGARCH_INSN_SIZE; } diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 53ab97ad6074..6351a187ca61 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -597,22 +597,22 @@ int arch_protect_bpf_trampoline(void *image, unsigned int size) } static int invoke_bpf_prog(u32 *image, u32 *ro_image, struct codegen_context *ctx, - struct bpf_tramp_link *l, int regs_off, int retval_off, + struct bpf_tramp_node *n, int regs_off, int retval_off, int run_ctx_off, bool save_ret) { - struct bpf_prog *p = l->link.prog; + struct bpf_prog *p = n->link->prog; ppc_inst_t branch_insn; u32 jmp_idx; int ret = 0; /* Save cookie */ if (IS_ENABLED(CONFIG_PPC64)) { - PPC_LI64(_R3, l->cookie); + PPC_LI64(_R3, n->cookie); EMIT(PPC_RAW_STD(_R3, _R1, run_ctx_off + offsetof(struct bpf_tramp_run_ctx, bpf_cookie))); } else { - PPC_LI32(_R3, l->cookie >> 32); - PPC_LI32(_R4, l->cookie); + PPC_LI32(_R3, n->cookie >> 32); + PPC_LI32(_R4, n->cookie); EMIT(PPC_RAW_STW(_R3, _R1, run_ctx_off + offsetof(struct bpf_tramp_run_ctx, bpf_cookie))); EMIT(PPC_RAW_STW(_R4, _R1, @@ -679,7 +679,7 @@ static int invoke_bpf_prog(u32 *image, u32 *ro_image, struct codegen_context *ct } static int invoke_bpf_mod_ret(u32 *image, u32 *ro_image, struct codegen_context *ctx, - struct bpf_tramp_links *tl, int regs_off, int retval_off, + struct bpf_tramp_nodes *tn, int regs_off, int retval_off, int run_ctx_off, u32 *branches) { int i; @@ -690,8 +690,8 @@ static int invoke_bpf_mod_ret(u32 *image, u32 *ro_image, struct codegen_context */ EMIT(PPC_RAW_LI(_R3, 0)); EMIT(PPC_RAW_STL(_R3, _R1, retval_off)); - for (i = 0; i < tl->nr_links; i++) { - if (invoke_bpf_prog(image, ro_image, ctx, tl->links[i], regs_off, retval_off, + for (i = 0; i < tn->nr_nodes; i++) { + if (invoke_bpf_prog(image, ro_image, ctx, tn->nodes[i], regs_off, retval_off, run_ctx_off, true)) return -EINVAL; @@ -807,18 +807,18 @@ static void bpf_trampoline_restore_args_stack(u32 *image, 
struct codegen_context static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image, void *rw_image_end, void *ro_image, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { int regs_off, func_meta_off, ip_off, run_ctx_off, retval_off; int nvr_off, alt_lr_off, r4_off = 0; - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; int i, ret, nr_regs, retaddr_off, bpf_frame_size = 0; struct codegen_context codegen_ctx, *ctx; int cookie_off, cookie_cnt, cookie_ctx_off; - int fsession_cnt = bpf_fsession_cnt(tlinks); + int fsession_cnt = bpf_fsession_cnt(tnodes); u64 func_meta; u32 *image = (u32 *)rw_image; ppc_inst_t branch_insn; @@ -893,7 +893,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* room for session cookies */ cookie_off = bpf_frame_size; - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); bpf_frame_size += cookie_cnt * 8; /* Room for IP address argument */ @@ -1030,21 +1030,21 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im cookie_ctx_off = (regs_off - cookie_off) / 8; - for (i = 0; i < fentry->nr_links; i++) { - if (bpf_prog_calls_session_cookie(fentry->links[i])) { + for (i = 0; i < fentry->nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(fentry->nodes[i])) { u64 meta = func_meta | (cookie_ctx_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); store_func_meta(image, ctx, meta, func_meta_off); cookie_ctx_off--; } - if (invoke_bpf_prog(image, ro_image, ctx, fentry->links[i], regs_off, retval_off, + if (invoke_bpf_prog(image, 
ro_image, ctx, fentry->nodes[i], regs_off, retval_off, run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; } - if (fmod_ret->nr_links) { - branches = kcalloc(fmod_ret->nr_links, sizeof(u32), GFP_KERNEL); + if (fmod_ret->nr_nodes) { + branches = kcalloc(fmod_ret->nr_nodes, sizeof(u32), GFP_KERNEL); if (!branches) return -ENOMEM; @@ -1093,7 +1093,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } /* Update branches saved in invoke_bpf_mod_ret with address of do_fexit */ - for (i = 0; i < fmod_ret->nr_links && image; i++) { + for (i = 0; i < fmod_ret->nr_nodes && image; i++) { if (create_cond_branch(&branch_insn, &image[branches[i]], (unsigned long)&image[ctx->idx], COND_NE << 16)) { ret = -EINVAL; @@ -1110,15 +1110,15 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im cookie_ctx_off = (regs_off - cookie_off) / 8; - for (i = 0; i < fexit->nr_links; i++) { - if (bpf_prog_calls_session_cookie(fexit->links[i])) { + for (i = 0; i < fexit->nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(fexit->nodes[i])) { u64 meta = func_meta | (cookie_ctx_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); store_func_meta(image, ctx, meta, func_meta_off); cookie_ctx_off--; } - if (invoke_bpf_prog(image, ro_image, ctx, fexit->links[i], regs_off, retval_off, + if (invoke_bpf_prog(image, ro_image, ctx, fexit->nodes[i], regs_off, retval_off, run_ctx_off, false)) { ret = -EINVAL; goto cleanup; @@ -1185,18 +1185,18 @@ cleanup: } int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { struct bpf_tramp_image im; int ret; - ret = __arch_prepare_bpf_trampoline(&im, NULL, NULL, NULL, m, flags, tlinks, func_addr); + ret = __arch_prepare_bpf_trampoline(&im, NULL, NULL, NULL, m, flags, tnodes, func_addr); return ret; } int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, 
const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { u32 size = image_end - image; @@ -1212,7 +1212,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i return -ENOMEM; ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m, - flags, tlinks, func_addr); + flags, tnodes, func_addr); if (ret < 0) goto out; diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index e2c70c70cca8..c03c1de16b79 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -934,15 +934,15 @@ static void emit_store_stack_imm64(u8 reg, int stack_off, u64 imm64, emit_sd(RV_REG_FP, stack_off, reg, ctx); } -static int invoke_bpf_prog(struct bpf_tramp_link *l, int args_off, int retval_off, +static int invoke_bpf_prog(struct bpf_tramp_node *node, int args_off, int retval_off, int run_ctx_off, bool save_ret, struct rv_jit_context *ctx) { int ret, branch_off; - struct bpf_prog *p = l->link.prog; + struct bpf_prog *p = node->link->prog; int cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); - if (l->cookie) - emit_store_stack_imm64(RV_REG_T1, -run_ctx_off + cookie_off, l->cookie, ctx); + if (node->cookie) + emit_store_stack_imm64(RV_REG_T1, -run_ctx_off + cookie_off, node->cookie, ctx); else emit_sd(RV_REG_FP, -run_ctx_off + cookie_off, RV_REG_ZERO, ctx); @@ -996,22 +996,22 @@ static int invoke_bpf_prog(struct bpf_tramp_link *l, int args_off, int retval_of return ret; } -static int invoke_bpf(struct bpf_tramp_links *tl, int args_off, int retval_off, +static int invoke_bpf(struct bpf_tramp_nodes *tn, int args_off, int retval_off, int run_ctx_off, int func_meta_off, bool save_ret, u64 func_meta, int cookie_off, struct rv_jit_context *ctx) { int i, cur_cookie = (cookie_off - args_off) / 8; - for (i = 0; i < tl->nr_links; i++) { + for (i = 0; i < tn->nr_nodes; i++) { int err; - if 
(bpf_prog_calls_session_cookie(tl->links[i])) { + if (bpf_prog_calls_session_cookie(tn->nodes[i])) { u64 meta = func_meta | ((u64)cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT); emit_store_stack_imm64(RV_REG_T1, -func_meta_off, meta, ctx); cur_cookie--; } - err = invoke_bpf_prog(tl->links[i], args_off, retval_off, run_ctx_off, + err = invoke_bpf_prog(tn->nodes[i], args_off, retval_off, run_ctx_off, save_ret, ctx); if (err) return err; @@ -1021,7 +1021,7 @@ static int invoke_bpf(struct bpf_tramp_links *tl, int args_off, int retval_off, static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, const struct btf_func_model *m, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr, u32 flags, struct rv_jit_context *ctx) { @@ -1030,9 +1030,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, int stack_size = 0, nr_arg_slots = 0; int retval_off, args_off, func_meta_off, ip_off, run_ctx_off, sreg_off, stk_arg_off; int cookie_off, cookie_cnt; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; bool is_struct_ops = flags & BPF_TRAMP_F_INDIRECT; void *orig_call = func_addr; bool save_ret; @@ -1115,7 +1115,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, ip_off = stack_size; } - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); /* room for session cookies */ stack_size += cookie_cnt * 8; cookie_off = stack_size; @@ -1172,7 +1172,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, store_args(nr_arg_slots, args_off, ctx); - if (bpf_fsession_cnt(tlinks)) { + if (bpf_fsession_cnt(tnodes)) { /* clear all 
session cookies' value */ for (i = 0; i < cookie_cnt; i++) emit_sd(RV_REG_FP, -cookie_off + 8 * i, RV_REG_ZERO, ctx); @@ -1187,22 +1187,22 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, return ret; } - if (fentry->nr_links) { + if (fentry->nr_nodes) { ret = invoke_bpf(fentry, args_off, retval_off, run_ctx_off, func_meta_off, flags & BPF_TRAMP_F_RET_FENTRY_RET, func_meta, cookie_off, ctx); if (ret) return ret; } - if (fmod_ret->nr_links) { - branches_off = kzalloc_objs(int, fmod_ret->nr_links); + if (fmod_ret->nr_nodes) { + branches_off = kzalloc_objs(int, fmod_ret->nr_nodes); if (!branches_off) return -ENOMEM; /* cleanup to avoid garbage return value confusion */ emit_sd(RV_REG_FP, -retval_off, RV_REG_ZERO, ctx); - for (i = 0; i < fmod_ret->nr_links; i++) { - ret = invoke_bpf_prog(fmod_ret->links[i], args_off, retval_off, + for (i = 0; i < fmod_ret->nr_nodes; i++) { + ret = invoke_bpf_prog(fmod_ret->nodes[i], args_off, retval_off, run_ctx_off, true, ctx); if (ret) goto out; @@ -1230,7 +1230,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, } /* update branches saved in invoke_bpf_mod_ret with bnez */ - for (i = 0; ctx->insns && i < fmod_ret->nr_links; i++) { + for (i = 0; ctx->insns && i < fmod_ret->nr_nodes; i++) { offset = ninsns_rvoff(ctx->ninsns - branches_off[i]); insn = rv_bne(RV_REG_T1, RV_REG_ZERO, offset >> 1); *(u32 *)(ctx->insns + branches_off[i]) = insn; @@ -1238,10 +1238,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, /* set "is_return" flag for fsession */ func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT); - if (bpf_fsession_cnt(tlinks)) + if (bpf_fsession_cnt(tnodes)) emit_store_stack_imm64(RV_REG_T1, -func_meta_off, func_meta, ctx); - if (fexit->nr_links) { + if (fexit->nr_nodes) { ret = invoke_bpf(fexit, args_off, retval_off, run_ctx_off, func_meta_off, false, func_meta, cookie_off, ctx); if (ret) @@ -1305,7 +1305,7 @@ out: } int arch_bpf_trampoline_size(const struct 
btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { struct bpf_tramp_image im; struct rv_jit_context ctx; @@ -1314,7 +1314,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, ctx.ninsns = 0; ctx.insns = NULL; ctx.ro_insns = NULL; - ret = __arch_prepare_bpf_trampoline(&im, m, tlinks, func_addr, flags, &ctx); + ret = __arch_prepare_bpf_trampoline(&im, m, tnodes, func_addr, flags, &ctx); return ret < 0 ? ret : ninsns_rvoff(ctx.ninsns); } @@ -1331,7 +1331,7 @@ void arch_free_bpf_trampoline(void *image, unsigned int size) int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, void *ro_image_end, const struct btf_func_model *m, - u32 flags, struct bpf_tramp_links *tlinks, + u32 flags, struct bpf_tramp_nodes *tnodes, void *func_addr) { int ret; @@ -1346,7 +1346,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, ctx.ninsns = 0; ctx.insns = image; ctx.ro_insns = ro_image; - ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx); + ret = __arch_prepare_bpf_trampoline(im, m, tnodes, func_addr, flags, &ctx); if (ret < 0) goto out; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 14eaaa5b2185..31749c0362ca 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2537,19 +2537,19 @@ static void emit_store_stack_imm64(struct bpf_jit *jit, int tmp_reg, int stack_o static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, const struct btf_func_model *m, - struct bpf_tramp_link *tlink, bool save_ret) + struct bpf_tramp_node *node, bool save_ret) { struct bpf_jit *jit = &tjit->common; int cookie_off = tjit->run_ctx_off + offsetof(struct bpf_tramp_run_ctx, bpf_cookie); - struct bpf_prog *p = tlink->link.prog; + struct bpf_prog *p = node->link->prog; int patch; /* - * run_ctx.cookie = tlink->cookie; + * run_ctx.cookie = node->cookie; */ - 
emit_store_stack_imm64(jit, REG_W0, cookie_off, tlink->cookie); + emit_store_stack_imm64(jit, REG_W0, cookie_off, node->cookie); /* * if ((start = __bpf_prog_enter(p, &run_ctx)) == 0) @@ -2609,20 +2609,20 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, static int invoke_bpf(struct bpf_tramp_jit *tjit, const struct btf_func_model *m, - struct bpf_tramp_links *tl, bool save_ret, + struct bpf_tramp_nodes *tn, bool save_ret, u64 func_meta, int cookie_off) { int i, cur_cookie = (tjit->bpf_args_off - cookie_off) / sizeof(u64); struct bpf_jit *jit = &tjit->common; - for (i = 0; i < tl->nr_links; i++) { - if (bpf_prog_calls_session_cookie(tl->links[i])) { + for (i = 0; i < tn->nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(tn->nodes[i])) { u64 meta = func_meta | ((u64)cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT); emit_store_stack_imm64(jit, REG_0, tjit->func_meta_off, meta); cur_cookie--; } - if (invoke_bpf_prog(tjit, m, tl->links[i], save_ret)) + if (invoke_bpf_prog(tjit, m, tn->nodes[i], save_ret)) return -EINVAL; } @@ -2651,12 +2651,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, struct bpf_tramp_jit *tjit, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; int nr_bpf_args, nr_reg_args, nr_stack_args; int cookie_cnt, cookie_off, fsession_cnt; struct bpf_jit *jit = &tjit->common; @@ -2693,8 +2693,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, return -ENOTSUPP; } - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); - fsession_cnt = 
bpf_fsession_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); + fsession_cnt = bpf_fsession_cnt(tnodes); /* * Calculate the stack layout. @@ -2829,7 +2829,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, func_meta, cookie_off)) return -EINVAL; - if (fmod_ret->nr_links) { + if (fmod_ret->nr_nodes) { /* * retval = 0; */ @@ -2838,8 +2838,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, _EMIT6(0xd707f000 | tjit->retval_off, 0xf000 | tjit->retval_off); - for (i = 0; i < fmod_ret->nr_links; i++) { - if (invoke_bpf_prog(tjit, m, fmod_ret->links[i], true)) + for (i = 0; i < fmod_ret->nr_nodes; i++) { + if (invoke_bpf_prog(tjit, m, fmod_ret->nodes[i], true)) return -EINVAL; /* @@ -2964,7 +2964,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, } int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *orig_call) + struct bpf_tramp_nodes *tnodes, void *orig_call) { struct bpf_tramp_image im; struct bpf_tramp_jit tjit; @@ -2973,14 +2973,14 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, memset(&tjit, 0, sizeof(tjit)); ret = __arch_prepare_bpf_trampoline(&im, &tjit, m, flags, - tlinks, orig_call); + tnodes, orig_call); return ret < 0 ? ret : tjit.common.prg; } int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, - u32 flags, struct bpf_tramp_links *tlinks, + u32 flags, struct bpf_tramp_nodes *tnodes, void *func_addr) { struct bpf_tramp_jit tjit; @@ -2989,7 +2989,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, /* Compute offsets, check whether the code fits. 
*/ memset(&tjit, 0, sizeof(tjit)); ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, - tlinks, func_addr); + tnodes, func_addr); if (ret < 0) return ret; @@ -3003,7 +3003,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, tjit.common.prg = 0; tjit.common.prg_buf = image; ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, - tlinks, func_addr); + tnodes, func_addr); return ret < 0 ? ret : tjit.common.prg; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a0c541a441cf..054e043ffcd2 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3104,15 +3104,15 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_link *l, int stack_size, + struct bpf_tramp_node *node, int stack_size, int run_ctx_off, bool save_ret, void *image, void *rw_image) { u8 *prog = *pprog; u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); - struct bpf_prog *p = l->link.prog; - u64 cookie = l->cookie; + struct bpf_prog *p = node->link->prog; + u64 cookie = node->cookie; /* mov rdi, cookie */ emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie); @@ -3219,7 +3219,7 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_links *tl, int stack_size, + struct bpf_tramp_nodes *tl, int stack_size, int run_ctx_off, int func_meta_off, bool save_ret, void *image, void *rw_image, u64 func_meta, int cookie_off) @@ -3227,13 +3227,13 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, int i, cur_cookie = (cookie_off - stack_size) / 8; u8 *prog = *pprog; - for (i = 0; i < tl->nr_links; i++) { - if (tl->links[i]->link.prog->call_session_cookie) { + for (i = 0; i < tl->nr_nodes; i++) { + if (tl->nodes[i]->link->prog->call_session_cookie) { 
emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, func_meta | (cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT)); cur_cookie--; } - if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, + if (invoke_bpf_prog(m, &prog, tl->nodes[i], stack_size, run_ctx_off, save_ret, image, rw_image)) return -EINVAL; } @@ -3242,7 +3242,7 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, } static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_links *tl, int stack_size, + struct bpf_tramp_nodes *tl, int stack_size, int run_ctx_off, u8 **branches, void *image, void *rw_image) { @@ -3254,8 +3254,8 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, */ emit_mov_imm32(&prog, false, BPF_REG_0, 0); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - for (i = 0; i < tl->nr_links; i++) { - if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true, + for (i = 0; i < tl->nr_nodes; i++) { + if (invoke_bpf_prog(m, &prog, tl->nodes[i], stack_size, run_ctx_off, true, image, rw_image)) return -EINVAL; @@ -3346,14 +3346,14 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image, void *rw_image_end, void *image, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { int i, ret, nr_regs = m->nr_args, stack_size = 0; int regs_off, func_meta_off, ip_off, run_ctx_off, arg_stack_off, rbx_off; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; void *orig_call = func_addr; int cookie_off, cookie_cnt; 
u8 **branches = NULL; @@ -3425,7 +3425,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im ip_off = stack_size; - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); /* room for session cookies */ stack_size += cookie_cnt * 8; cookie_off = stack_size; @@ -3518,7 +3518,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } } - if (bpf_fsession_cnt(tlinks)) { + if (bpf_fsession_cnt(tnodes)) { /* clear all the session cookies' value */ for (int i = 0; i < cookie_cnt; i++) emit_store_stack_imm64(&prog, BPF_REG_0, -cookie_off + 8 * i, 0); @@ -3526,15 +3526,15 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im emit_store_stack_imm64(&prog, BPF_REG_0, -8, 0); } - if (fentry->nr_links) { + if (fentry->nr_nodes) { if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, func_meta_off, flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image, func_meta, cookie_off)) return -EINVAL; } - if (fmod_ret->nr_links) { - branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *), + if (fmod_ret->nr_nodes) { + branches = kcalloc(fmod_ret->nr_nodes, sizeof(u8 *), GFP_KERNEL); if (!branches) return -ENOMEM; @@ -3573,7 +3573,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im emit_nops(&prog, X86_PATCH_SIZE); } - if (fmod_ret->nr_links) { + if (fmod_ret->nr_nodes) { /* From Intel 64 and IA-32 Architectures Optimization * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler * Coding Rule 11: All branch targets should be 16-byte @@ -3583,7 +3583,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* Update the branches saved in invoke_bpf_mod_ret with the * aligned address of do_fexit. 
*/ - for (i = 0; i < fmod_ret->nr_links; i++) { + for (i = 0; i < fmod_ret->nr_nodes; i++) { emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image), image + (branches[i] - (u8 *)rw_image), X86_JNE); } @@ -3591,10 +3591,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* set the "is_return" flag for fsession */ func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT); - if (bpf_fsession_cnt(tlinks)) + if (bpf_fsession_cnt(tnodes)) emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, func_meta); - if (fexit->nr_links) { + if (fexit->nr_nodes) { if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, func_meta_off, false, image, rw_image, func_meta, cookie_off)) { ret = -EINVAL; @@ -3668,7 +3668,7 @@ int arch_protect_bpf_trampoline(void *image, unsigned int size) int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { void *rw_image, *tmp; @@ -3683,7 +3683,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i return -ENOMEM; ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m, - flags, tlinks, func_addr); + flags, tnodes, func_addr); if (ret < 0) goto out; @@ -3696,7 +3696,7 @@ out: } int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { struct bpf_tramp_image im; void *image; @@ -3714,7 +3714,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, return -ENOMEM; ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image, - m, flags, tlinks, func_addr); + m, flags, tnodes, func_addr); bpf_jit_free_exec(image); return ret; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f6056bab6f23..6ff35491d9c0 100644 --- a/include/linux/bpf.h +++ 
b/include/linux/bpf.h @@ -1251,9 +1251,9 @@ enum { #define BPF_TRAMP_COOKIE_INDEX_SHIFT 8 #define BPF_TRAMP_IS_RETURN_SHIFT 63 -struct bpf_tramp_links { - struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; - int nr_links; +struct bpf_tramp_nodes { + struct bpf_tramp_node *nodes[BPF_MAX_TRAMP_LINKS]; + int nr_nodes; }; struct bpf_tramp_run_ctx; @@ -1281,13 +1281,13 @@ struct bpf_tramp_run_ctx; struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr); void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size); int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr); + struct bpf_tramp_nodes *tnodes, void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); @@ -1471,10 +1471,10 @@ static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u6 } #ifdef CONFIG_BPF_JIT -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, +int bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog); -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog); struct bpf_trampoline *bpf_trampoline_get(u64 key, @@ -1561,13 +1561,13 @@ bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struc int insn_idx); u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog); #else -static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, +static inline int bpf_trampoline_link_prog(struct bpf_tramp_node 
*node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { return -ENOTSUPP; } -static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { @@ -1909,12 +1909,17 @@ struct bpf_link_ops { __poll_t (*poll)(struct file *file, struct poll_table_struct *pts); }; -struct bpf_tramp_link { - struct bpf_link link; +struct bpf_tramp_node { + struct bpf_link *link; struct hlist_node tramp_hlist; u64 cookie; }; +struct bpf_tramp_link { + struct bpf_link link; + struct bpf_tramp_node node; +}; + struct bpf_shim_tramp_link { struct bpf_tramp_link link; struct bpf_trampoline *trampoline; @@ -2132,8 +2137,8 @@ void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, - struct bpf_tramp_link *link, +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_nodes *tnodes, + struct bpf_tramp_node *node, const struct btf_func_model *model, void *stub_func, void **image, u32 *image_off, @@ -2228,31 +2233,31 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif -static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) +static inline int bpf_fsession_cnt(struct bpf_tramp_nodes *nodes) { - struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes fentries = nodes[BPF_TRAMP_FENTRY]; int cnt = 0; - for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { - if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) + for (int i = 0; i < nodes[BPF_TRAMP_FENTRY].nr_nodes; i++) { + if (fentries.nodes[i]->link->prog->expected_attach_type == BPF_TRACE_FSESSION) cnt++; } return cnt; } -static inline bool bpf_prog_calls_session_cookie(struct 
bpf_tramp_link *link) +static inline bool bpf_prog_calls_session_cookie(struct bpf_tramp_node *node) { - return link->link.prog->call_session_cookie; + return node->link->prog->call_session_cookie; } -static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links) +static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_nodes *nodes) { - struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes fentries = nodes[BPF_TRAMP_FENTRY]; int cnt = 0; - for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { - if (bpf_prog_calls_session_cookie(fentries.links[i])) + for (int i = 0; i < nodes[BPF_TRAMP_FENTRY].nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(fentries.nodes[i])) cnt++; } @@ -2800,6 +2805,9 @@ void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type, bool sleepable); +void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type, u64 cookie); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); @@ -3223,6 +3231,12 @@ static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_ { } +static inline void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type, u64 cookie) +{ +} + static inline int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 5e51c1211673..51b16e5f5534 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -594,8 +594,8 @@ const struct 
bpf_link_ops bpf_struct_ops_link_lops = { .dealloc = bpf_struct_ops_link_dealloc, }; -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, - struct bpf_tramp_link *link, +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_nodes *tnodes, + struct bpf_tramp_node *node, const struct btf_func_model *model, void *stub_func, void **_image, u32 *_image_off, @@ -605,13 +605,13 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, void *image = *_image; int size; - tlinks[BPF_TRAMP_FENTRY].links[0] = link; - tlinks[BPF_TRAMP_FENTRY].nr_links = 1; + tnodes[BPF_TRAMP_FENTRY].nodes[0] = node; + tnodes[BPF_TRAMP_FENTRY].nr_nodes = 1; if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; - size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func); + size = arch_bpf_trampoline_size(model, flags, tnodes, stub_func); if (size <= 0) return size ? : -EFAULT; @@ -628,7 +628,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, size = arch_prepare_bpf_trampoline(NULL, image + image_off, image + image_off + size, - model, flags, tlinks, stub_func); + model, flags, tnodes, stub_func); if (size <= 0) { if (image != *_image) bpf_struct_ops_image_free(image); @@ -693,7 +693,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, const struct btf_type *module_type; const struct btf_member *member; const struct btf_type *t = st_ops_desc->type; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_nodes *tnodes; void *udata, *kdata; int prog_fd, err; u32 i, trampoline_start, image_off = 0; @@ -720,8 +720,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->common.state || refcount_read(&uvalue->common.refcnt)) return -EINVAL; - tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX); - if (!tlinks) + tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX); + if (!tnodes) return -ENOMEM; uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; @@ -817,8 +817,9 @@ static long 
bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = -ENOMEM; goto reset_unlock; } - bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, - &bpf_struct_ops_link_lops, prog, prog->expected_attach_type); + bpf_tramp_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, + &bpf_struct_ops_link_lops, prog, prog->expected_attach_type, 0); + *plink++ = &link->link; /* Poison pointer on error instead of return for backward compatibility */ @@ -832,7 +833,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, *pksym++ = ksym; trampoline_start = image_off; - err = bpf_struct_ops_prepare_trampoline(tlinks, link, + err = bpf_struct_ops_prepare_trampoline(tnodes, &link->node, &st_ops->func_models[i], *(void **)(st_ops->cfi_stubs + moff), &image, &image_off, @@ -911,7 +912,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: - kfree(tlinks); + kfree(tnodes); mutex_unlock(&st_map->lock); if (!err) bpf_struct_ops_map_add_ksyms(st_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5fcfc32c7cb4..fd69fdb9290b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3288,6 +3288,15 @@ void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); } +void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type, u64 cookie) +{ + bpf_link_init(&link->link, type, ops, prog, attach_type); + link->node.link = &link->link; + link->node.cookie = cookie; +} + static void bpf_link_free_id(int id) { if (!id) @@ -3595,7 +3604,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); - WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link.node, tr_link->trampoline, 
tr_link->tgt_prog)); @@ -3608,8 +3617,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) static void bpf_tracing_link_dealloc(struct bpf_link *link) { - struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } @@ -3617,8 +3625,8 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { - struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); + u32 target_btf_id, target_obj_id; bpf_trampoline_unpack_key(tr_link->trampoline->key, @@ -3631,17 +3639,16 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, link->attach_type, target_obj_id, target_btf_id, - tr_link->link.cookie); + tr_link->link.node.cookie); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { - struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = link->attach_type; - info->tracing.cookie = tr_link->link.cookie; + info->tracing.cookie = tr_link->link.node.cookie; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, &info->tracing.target_btf_id); @@ -3728,9 +3735,9 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, fslink = kzalloc_obj(*fslink, GFP_USER); if (fslink) { - bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog, attach_type); - fslink->fexit.cookie = bpf_cookie; + bpf_tramp_link_init(&fslink->fexit, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type, + bpf_cookie); link = &fslink->link; } else 
{ link = NULL; @@ -3742,10 +3749,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog, attach_type); - - link->link.cookie = bpf_cookie; + bpf_tramp_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type, bpf_cookie); mutex_lock(&prog->aux->dst_mutex); @@ -3848,7 +3853,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, if (err) goto out_unlock; - err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog); + err = bpf_trampoline_link_prog(&link->link.node, tr, tgt_prog); if (err) { bpf_link_cleanup(&link_primer); link = NULL; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 701138ef424a..6a45c09fc0d8 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -502,30 +502,29 @@ static const struct bpf_trampoline_ops trampoline_ops = { .modify_fentry = modify_fentry, }; -static struct bpf_tramp_links * +static struct bpf_tramp_nodes * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { - struct bpf_tramp_link *link; - struct bpf_tramp_links *tlinks; - struct bpf_tramp_link **links; + struct bpf_tramp_node *node, **nodes; + struct bpf_tramp_nodes *tnodes; int kind; *total = 0; - tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX); - if (!tlinks) + tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX); + if (!tnodes) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - tlinks[kind].nr_links = tr->progs_cnt[kind]; + tnodes[kind].nr_nodes = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; - links = tlinks[kind].links; + nodes = tnodes[kind].nodes; - hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { - *ip_arg |= link->link.prog->call_get_func_ip; - *links++ = link; + hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) { + *ip_arg |= node->link->prog->call_get_func_ip; + *nodes++ = node; } } - 
return tlinks; + return tnodes; } static void bpf_tramp_image_free(struct bpf_tramp_image *im) @@ -673,14 +672,14 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut const struct bpf_trampoline_ops *ops, void *data) { struct bpf_tramp_image *im; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_nodes *tnodes; u32 orig_flags = tr->flags; bool ip_arg = false; int err, total, size; - tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); - if (IS_ERR(tlinks)) - return PTR_ERR(tlinks); + tnodes = bpf_trampoline_get_progs(tr, &total, &ip_arg); + if (IS_ERR(tnodes)) + return PTR_ERR(tnodes); if (total == 0) { err = ops->unregister_fentry(tr, orig_flags, data); @@ -690,8 +689,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); - if (tlinks[BPF_TRAMP_FEXIT].nr_links || - tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { + if (tnodes[BPF_TRAMP_FEXIT].nr_nodes || + tnodes[BPF_TRAMP_MODIFY_RETURN].nr_nodes) { /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME * should not be set together. 
*/ @@ -722,7 +721,7 @@ again: #endif size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, - tlinks, tr->func.addr); + tnodes, tr->func.addr); if (size < 0) { err = size; goto out; @@ -740,7 +739,7 @@ again: } err = arch_prepare_bpf_trampoline(im, im->image, im->image + size, - &tr->func.model, tr->flags, tlinks, + &tr->func.model, tr->flags, tnodes, tr->func.addr); if (err < 0) goto out_free; @@ -774,7 +773,7 @@ out: /* If any error happens, restore previous flags */ if (err) tr->flags = orig_flags; - kfree(tlinks); + kfree(tnodes); return err; } @@ -821,15 +820,15 @@ static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) } static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, - struct bpf_tramp_link *link, + struct bpf_tramp_node *node, int cnt) { struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; - struct bpf_tramp_link *link_exiting; + struct bpf_tramp_node *node_existing; struct hlist_head *prog_list; - kind = bpf_attach_type_to_tramp(link->link.prog); + kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_FSESSION) { prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; cnt++; @@ -838,21 +837,21 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; - if (!hlist_unhashed(&link->tramp_hlist)) + if (!hlist_unhashed(&node->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { - if (link_exiting->link.prog != link->link.prog) + hlist_for_each_entry(node_existing, prog_list, tramp_hlist) { + if (node_existing->link->prog != node->link->prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, prog_list); + hlist_add_head(&node->tramp_hlist, prog_list); if (kind == BPF_TRAMP_FSESSION) { tr->progs_cnt[BPF_TRAMP_FENTRY]++; - fslink = container_of(link, struct bpf_fsession_link, link.link); - hlist_add_head(&fslink->fexit.tramp_hlist, 
&tr->progs_hlist[BPF_TRAMP_FEXIT]); + fslink = container_of(node, struct bpf_fsession_link, link.link.node); + hlist_add_head(&fslink->fexit.node.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); tr->progs_cnt[BPF_TRAMP_FEXIT]++; } else { tr->progs_cnt[kind]++; @@ -861,23 +860,23 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, } static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr, - struct bpf_tramp_link *link) + struct bpf_tramp_node *node) { struct bpf_fsession_link *fslink; enum bpf_tramp_prog_type kind; - kind = bpf_attach_type_to_tramp(link->link.prog); + kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_FSESSION) { - fslink = container_of(link, struct bpf_fsession_link, link.link); - hlist_del_init(&fslink->fexit.tramp_hlist); + fslink = container_of(node, struct bpf_fsession_link, link.link.node); + hlist_del_init(&fslink->fexit.node.tramp_hlist); tr->progs_cnt[BPF_TRAMP_FEXIT]--; kind = BPF_TRAMP_FENTRY; } - hlist_del_init(&link->tramp_hlist); + hlist_del_init(&node->tramp_hlist); tr->progs_cnt[kind]--; } -static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, +static int __bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog, const struct bpf_trampoline_ops *ops, @@ -887,7 +886,7 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, int err = 0; int cnt = 0, i; - kind = bpf_attach_type_to_tramp(link->link.prog); + kind = bpf_attach_type_to_tramp(node->link->prog); if (tr->extension_prog) /* cannot attach fentry/fexit if extension prog is attached. * cannot overwrite extension prog either. 
@@ -904,33 +903,33 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, err = bpf_freplace_check_tgt_prog(tgt_prog); if (err) return err; - tr->extension_prog = link->link.prog; + tr->extension_prog = node->link->prog; return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, BPF_MOD_JUMP, NULL, - link->link.prog->bpf_func); + node->link->prog->bpf_func); } - err = bpf_trampoline_add_prog(tr, link, cnt); + err = bpf_trampoline_add_prog(tr, node, cnt); if (err) return err; err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); if (err) - bpf_trampoline_remove_prog(tr, link); + bpf_trampoline_remove_prog(tr, node); return err; } -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, +int bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; trampoline_lock(tr); - err = __bpf_trampoline_link_prog(link, tr, tgt_prog, &trampoline_ops, NULL); + err = __bpf_trampoline_link_prog(node, tr, tgt_prog, &trampoline_ops, NULL); trampoline_unlock(tr); return err; } -static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog, const struct bpf_trampoline_ops *ops, @@ -939,7 +938,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, enum bpf_tramp_prog_type kind; int err; - kind = bpf_attach_type_to_tramp(link->link.prog); + kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, @@ -950,19 +949,19 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, tgt_prog->aux->is_extended = false; return err; } - bpf_trampoline_remove_prog(tr, link); + bpf_trampoline_remove_prog(tr, node); return bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); } /* bpf_trampoline_unlink_prog() should never 
fail. */ -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; trampoline_lock(tr); - err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog, &trampoline_ops, NULL); + err = __bpf_trampoline_unlink_prog(node, tr, tgt_prog, &trampoline_ops, NULL); trampoline_unlock(tr); return err; } @@ -977,7 +976,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link) if (!shim_link->trampoline) return; - WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL)); + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link.node, shim_link->trampoline, NULL)); bpf_trampoline_put(shim_link->trampoline); } @@ -1023,8 +1022,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog p->type = BPF_PROG_TYPE_LSM; p->expected_attach_type = BPF_LSM_MAC; bpf_prog_inc(p); - bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, - &bpf_shim_tramp_link_lops, p, attach_type); + bpf_tramp_link_init(&shim_link->link, BPF_LINK_TYPE_UNSPEC, + &bpf_shim_tramp_link_lops, p, attach_type, 0); bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; @@ -1033,15 +1032,15 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, bpf_func_t bpf_func) { - struct bpf_tramp_link *link; + struct bpf_tramp_node *node; int kind; for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { - struct bpf_prog *p = link->link.prog; + hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) { + struct bpf_prog *p = node->link->prog; if (p->bpf_func == bpf_func) - return container_of(link, struct bpf_shim_tramp_link, link); + return container_of(node, struct bpf_shim_tramp_link, link.node); } } @@ -1091,7 +1090,7 @@ int 
bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, goto err; } - err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL, &trampoline_ops, NULL); + err = __bpf_trampoline_link_prog(&shim_link->link.node, tr, NULL, &trampoline_ops, NULL); if (err) goto err; @@ -1406,7 +1405,7 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { return -ENOTSUPP; @@ -1440,7 +1439,7 @@ int __weak arch_protect_bpf_trampoline(void *image, unsigned int size) } int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { return -ENOTSUPP; } diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index ae5a54c350b9..191a6b3ee254 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -132,7 +132,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops; const struct btf_type *func_proto; struct bpf_dummy_ops_test_args *args; - struct bpf_tramp_links *tlinks = NULL; + struct bpf_tramp_nodes *tnodes = NULL; struct bpf_tramp_link *link = NULL; void *image = NULL; unsigned int op_idx; @@ -158,8 +158,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (err) goto out; - tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX); - if (!tlinks) { + tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX); + if (!tnodes) { err = -ENOMEM; goto out; } @@ -171,11 +171,11 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, } /* prog doesn't take the ownership of the reference from caller */ bpf_prog_inc(prog); - bpf_link_init(&link->link, 
BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog, - prog->expected_attach_type); + bpf_tramp_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, + prog, prog->expected_attach_type, 0); op_idx = prog->expected_attach_type; - err = bpf_struct_ops_prepare_trampoline(tlinks, link, + err = bpf_struct_ops_prepare_trampoline(tnodes, &link->node, &st_ops->func_models[op_idx], &dummy_ops_test_ret_function, &image, &image_off, @@ -198,7 +198,7 @@ out: bpf_struct_ops_image_free(image); if (link) bpf_link_put(&link->link); - kfree(tlinks); + kfree(tnodes); return err; } -- cgit v1.2.3 From 880db5d4abb29e931d82b9feefb4382f76fcf9e5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:34 +0200 Subject: bpf: Factor fsession link to use struct bpf_tramp_node Now that we split trampoline attachment object (bpf_tramp_node) from the link object (bpf_tramp_link) we can use bpf_tramp_node as fsession's fexit attachment object and get rid of the bpf_fsession_link object. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-10-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +----- kernel/bpf/syscall.c | 21 ++++++--------------- kernel/bpf/trampoline.c | 12 ++++++------ 3 files changed, 13 insertions(+), 26 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6ff35491d9c0..428789a9e736 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1927,15 +1927,11 @@ struct bpf_shim_tramp_link { struct bpf_tracing_link { struct bpf_tramp_link link; + struct bpf_tramp_node fexit; struct bpf_trampoline *trampoline; struct bpf_prog *tgt_prog; }; -struct bpf_fsession_link { - struct bpf_tracing_link link; - struct bpf_tramp_link fexit; -}; - struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fd69fdb9290b..0cfc8bcb3dc9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3730,21 +3730,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - if (prog->expected_attach_type == BPF_TRACE_FSESSION) { - struct bpf_fsession_link *fslink; - - fslink = kzalloc_obj(*fslink, GFP_USER); - if (fslink) { - bpf_tramp_link_init(&fslink->fexit, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog, attach_type, - bpf_cookie); - link = &fslink->link; - } else { - link = NULL; - } - } else { - link = kzalloc_obj(*link, GFP_USER); - } + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; goto out_put_prog; @@ -3752,6 +3738,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, bpf_tramp_link_init(&link->link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog, attach_type, bpf_cookie); + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + link->fexit.link = &link->link.link; + link->fexit.cookie = bpf_cookie; + } + mutex_lock(&prog->aux->dst_mutex); /* There are a few possible cases 
here: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6a45c09fc0d8..5776d2b8e36e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -823,7 +823,7 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, struct bpf_tramp_node *node, int cnt) { - struct bpf_fsession_link *fslink = NULL; + struct bpf_tracing_link *tr_link = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_node *node_existing; struct hlist_head *prog_list; @@ -850,8 +850,8 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, hlist_add_head(&node->tramp_hlist, prog_list); if (kind == BPF_TRAMP_FSESSION) { tr->progs_cnt[BPF_TRAMP_FENTRY]++; - fslink = container_of(node, struct bpf_fsession_link, link.link.node); - hlist_add_head(&fslink->fexit.node.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + tr_link = container_of(node, struct bpf_tracing_link, link.node); + hlist_add_head(&tr_link->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); tr->progs_cnt[BPF_TRAMP_FEXIT]++; } else { tr->progs_cnt[kind]++; @@ -862,13 +862,13 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr, struct bpf_tramp_node *node) { - struct bpf_fsession_link *fslink; + struct bpf_tracing_link *tr_link; enum bpf_tramp_prog_type kind; kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_FSESSION) { - fslink = container_of(node, struct bpf_fsession_link, link.link.node); - hlist_del_init(&fslink->fexit.node.tramp_hlist); + tr_link = container_of(node, struct bpf_tracing_link, link.node); + hlist_del_init(&tr_link->fexit.tramp_hlist); tr->progs_cnt[BPF_TRAMP_FEXIT]--; kind = BPF_TRAMP_FENTRY; } -- cgit v1.2.3 From d14e6b4346bf397eca7cb5f4b7b0b8054be632d8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:35 +0200 Subject: bpf: Add multi tracing attach types Adding new program attach types multi tracing attachment: BPF_TRACE_FENTRY_MULTI 
BPF_TRACE_FEXIT_MULTI and their base support in verifier code. Programs with such an attach type will use a specific link attachment interface coming in the following changes. This was suggested by Andrii some (long) time ago and turned out to be easier than having a special program flag for that. BPF programs with such types have the 'bpf_multi_func' function set as their attach_btf_id and keep a module reference when it's specified by attach_prog_fd. They are also accepted as sleepable programs during verification, and the real validation for specific BTF_IDs/functions will happen during the multi link attachment in the following changes. Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-11-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ include/linux/btf_ids.h | 1 + include/uapi/linux/bpf.h | 2 ++ kernel/bpf/fixups.c | 1 + kernel/bpf/syscall.c | 28 ++++++++++++++++++++++++---- kernel/bpf/trampoline.c | 5 ++++- kernel/bpf/verifier.c | 40 +++++++++++++++++++++++++++++++++++++++- net/bpf/test_run.c | 2 ++ tools/include/uapi/linux/bpf.h | 2 ++ tools/lib/bpf/libbpf.c | 2 ++ 10 files changed, 82 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 428789a9e736..b52dc64ec92d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2113,6 +2113,11 @@ static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) #endif } +static inline bool is_tracing_multi(enum bpf_attach_type type) +{ + return type == BPF_TRACE_FENTRY_MULTI || type == BPF_TRACE_FEXIT_MULTI; +} + #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) /* This macro helps developer to register a struct_ops type and generate * type information correctly.
Developers should use this macro to register diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index af011db39ab3..8b5a9ee92513 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -284,5 +284,6 @@ extern u32 bpf_cgroup_btf_id[]; extern u32 bpf_local_storage_map_btf_id[]; extern u32 btf_bpf_map_id[]; extern u32 bpf_kmem_cache_btf_id[]; +extern u32 bpf_multi_func_btf_id[]; #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d5238df5e5eb..28d127e5040a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1156,6 +1156,8 @@ enum bpf_attach_type { BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, BPF_TRACE_FSESSION, + BPF_TRACE_FENTRY_MULTI, + BPF_TRACE_FEXIT_MULTI, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 5aa3f7d99ac9..0cf9735929f5 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -2186,6 +2186,7 @@ patch_map_ops_generic: insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || eatype == BPF_TRACE_FSESSION || + eatype == BPF_TRACE_FEXIT_MULTI || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0cfc8bcb3dc9..efdd6639a598 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -2719,7 +2720,8 @@ static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, struct btf *attach_btf, u32 btf_id, - struct bpf_prog *dst_prog) + struct bpf_prog *dst_prog, + bool multi_func) { if (btf_id) { if (btf_id > BTF_MAX_TYPE) @@ -2739,6 +2741,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } + if (multi_func) { + if (prog_type != BPF_PROG_TYPE_TRACING) + return -EINVAL; + if (!attach_btf || btf_id) + return -EINVAL; + return 0; + } + if (attach_btf && (!btf_id || 
dst_prog)) return -EINVAL; @@ -2946,6 +2956,11 @@ static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) return 0; } +extern int bpf_multi_func(void); +int __init __used bpf_multi_func(void) { return 0; } + +BTF_ID_LIST_GLOBAL_SINGLE(bpf_multi_func_btf_id, func, bpf_multi_func) + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id @@ -2958,6 +2973,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at bool bpf_cap; int err; char license[128]; + bool multi_func; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; @@ -3024,6 +3040,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) goto put_token; + multi_func = is_tracing_multi(attr->expected_attach_type); + /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog * or btf, we need to check which one it is */ @@ -3045,7 +3063,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at goto put_token; } } - } else if (attr->attach_btf_id) { + } else if (attr->attach_btf_id || multi_func) { /* fall back to vmlinux BTF, if BTF type ID is specified */ attach_btf = bpf_get_btf_vmlinux(); if (IS_ERR(attach_btf)) { @@ -3061,7 +3079,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at if (bpf_prog_load_check_attach(type, attr->expected_attach_type, attach_btf, attr->attach_btf_id, - dst_prog)) { + dst_prog, multi_func)) { if (dst_prog) bpf_prog_put(dst_prog); if (attach_btf) @@ -3084,7 +3102,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at prog->expected_attach_type = attr->expected_attach_type; prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE); prog->aux->attach_btf = attach_btf; - prog->aux->attach_btf_id = attr->attach_btf_id; + prog->aux->attach_btf_id = multi_func ? 
bpf_multi_func_btf_id[0] : attr->attach_btf_id; prog->aux->dst_prog = dst_prog; prog->aux->dev_bound = !!attr->prog_ifindex; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; @@ -4480,6 +4498,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5776d2b8e36e..ae7e4fdfe2a3 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -182,7 +182,8 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) switch (ptype) { case BPF_PROG_TYPE_TRACING: if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION || + eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI) return true; return false; case BPF_PROG_TYPE_LSM: @@ -781,10 +782,12 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) { switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: + case BPF_TRACE_FENTRY_MULTI: return BPF_TRAMP_FENTRY; case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: + case BPF_TRACE_FEXIT_MULTI: return BPF_TRAMP_FEXIT; case BPF_TRACE_FSESSION: return BPF_TRAMP_FSESSION; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 926ff63a0b61..0e593f3335e9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16382,6 +16382,8 @@ static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_ case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: *range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -18772,6 +18774,11 @@ static int 
check_attach_modify_return(unsigned long addr, const char *func_name) #endif /* CONFIG_FUNCTION_ERROR_INJECTION */ +static bool is_tracing_multi_id(const struct bpf_prog *prog, u32 btf_id) +{ + return is_tracing_multi(prog->expected_attach_type) && bpf_multi_func_btf_id[0] == btf_id; +} + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, @@ -18894,6 +18901,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || + tgt_prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI || + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI || tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. @@ -19000,6 +19009,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: if (prog->expected_attach_type == BPF_TRACE_FSESSION && !bpf_jit_supports_fsession()) { bpf_log(log, "JIT does not support fsession\n"); @@ -19029,7 +19040,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (ret < 0) return ret; - if (tgt_prog) { + /* + * *.multi programs don't need an address during program + * verification, we just take the module ref if needed. 
+ */ + if (is_tracing_multi_id(prog, btf_id)) { + if (btf_is_module(btf)) { + mod = btf_try_get_module(btf); + if (!mod) + return -ENOENT; + } + addr = 0; + } else if (tgt_prog) { if (subprog == 0) addr = (long) tgt_prog->bpf_func; else @@ -19057,6 +19079,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: + /* *.multi sleepable programs will pass initial sleepable check, + * the actual attached btf ids are checked later during the link + * attachment. + */ + if (is_tracing_multi_id(prog, btf_id)) + ret = 0; if (!check_attach_sleepable(btf_id, addr, tname)) ret = 0; /* fentry/fexit/fmod_ret progs can also be sleepable if they are @@ -19167,6 +19195,8 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_ITER: case BPF_TRACE_FSESSION: case BPF_TRACE_RAW_TP: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: return true; default: return false; @@ -19260,6 +19290,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } + /* + * We don't get trampoline for tracing_multi programs at this point, + * it's done when tracing_multi link is created. 
+ */ + if (prog->type == BPF_PROG_TYPE_TRACING && + is_tracing_multi(prog->expected_attach_type)) + return 0; + key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c9aea7052ba7..67769c700cae 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -703,6 +703,8 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3829db087449..1b9aacf468e5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1156,6 +1156,8 @@ enum bpf_attach_type { BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, BPF_TRACE_FSESSION, + BPF_TRACE_FENTRY_MULTI, + BPF_TRACE_FEXIT_MULTI, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1354bcbc8b30..1b09381d16ff 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -136,6 +136,8 @@ static const char * const attach_type_name[] = { [BPF_NETKIT_PEER] = "netkit_peer", [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session", [BPF_TRACE_UPROBE_SESSION] = "trace_uprobe_session", + [BPF_TRACE_FENTRY_MULTI] = "trace_fentry_multi", + [BPF_TRACE_FEXIT_MULTI] = "trace_fexit_multi", }; static const char * const link_type_name[] = { -- cgit v1.2.3 From bd06659d3b8abe7a79ae473209ee89bf3a23af36 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:36 +0200 Subject: bpf: Move sleepable verification code to btf_id_allow_sleepable Move sleepable verification code to btf_id_allow_sleepable function. It will be used in following changes. 
Adding code to retrieve type's name instead of passing it from bpf_check_attach_target function, because this function will be called from another place in following changes and it's easier to retrieve the name directly in here. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-12-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 82 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 32 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e593f3335e9..df21592fc560 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18779,6 +18779,55 @@ static bool is_tracing_multi_id(const struct bpf_prog *prog, u32 btf_id) return is_tracing_multi(prog->expected_attach_type) && bpf_multi_func_btf_id[0] == btf_id; } +static int btf_id_allow_sleepable(u32 btf_id, unsigned long addr, const struct bpf_prog *prog, + const struct btf *btf) +{ + const struct btf_type *t; + const char *tname; + + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + t = btf_type_by_id(btf, btf_id); + if (!t) + return -EINVAL; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return -EINVAL; + + /* + * *.multi sleepable programs will pass initial sleepable check, + * the actual attached btf ids are checked later during the link + * attachment. + */ + if (is_tracing_multi_id(prog, btf_id)) + return 0; + if (!check_attach_sleepable(btf_id, addr, tname)) + return 0; + /* + * fentry/fexit/fmod_ret progs can also be sleepable if they are + * in the fmodret id set with the KF_SLEEPABLE flag. + */ + else { + u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, prog); + + if (flags && (*flags & KF_SLEEPABLE)) + return 0; + } + break; + case BPF_PROG_TYPE_LSM: + /* + * LSM progs check that they are attached to bpf_lsm_*() funcs. + * Only some of them are sleepable. 
+ */ + if (bpf_lsm_is_sleepable_hook(btf_id)) + return 0; + break; + default: + break; + } + return -EINVAL; +} + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, @@ -19076,38 +19125,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, } if (prog->sleepable) { - ret = -EINVAL; - switch (prog->type) { - case BPF_PROG_TYPE_TRACING: - /* *.multi sleepable programs will pass initial sleepable check, - * the actual attached btf ids are checked later during the link - * attachment. - */ - if (is_tracing_multi_id(prog, btf_id)) - ret = 0; - if (!check_attach_sleepable(btf_id, addr, tname)) - ret = 0; - /* fentry/fexit/fmod_ret progs can also be sleepable if they are - * in the fmodret id set with the KF_SLEEPABLE flag. - */ - else { - u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, - prog); - - if (flags && (*flags & KF_SLEEPABLE)) - ret = 0; - } - break; - case BPF_PROG_TYPE_LSM: - /* LSM progs check that they are attached to bpf_lsm_*() funcs. - * Only some of them are sleepable. - */ - if (bpf_lsm_is_sleepable_hook(btf_id)) - ret = 0; - break; - default: - break; - } + ret = btf_id_allow_sleepable(btf_id, addr, prog, btf); if (ret) { module_put(mod); bpf_log(log, "%s is not sleepable\n", tname); -- cgit v1.2.3 From aef4dfa790b22d8052cfb78044eadbe03c876c39 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:37 +0200 Subject: bpf: Add bpf_trampoline_multi_attach/detach functions Adding bpf_trampoline_multi_attach/detach functions that allows to attach/detach tracing program to multiple functions/trampolines. The attachment is defined with bpf_program and array of BTF ids of functions to attach the bpf program to. Adding bpf_tracing_multi_link object that holds all the attached trampolines and is initialized in attach and used in detach. The attachment allocates or uses currently existing trampoline for each function to attach and links it with the bpf program. 
The attach works as follows:
- we get all the needed trampolines
- lock them and add the bpf program to each (__bpf_trampoline_link_prog)
- the trampoline_multi_ops passed in __bpf_trampoline_link_prog gathers ftrace_hash (ip -> trampoline) objects
- we call update_ftrace_direct_add/mod to update needed locations
- we unlock all the trampolines

The detach works as follows:
- we lock all the needed trampolines
- remove the program from each (__bpf_trampoline_unlink_prog)
- the trampoline_multi_ops passed in __bpf_trampoline_unlink_prog gathers ftrace_hash (ip -> trampoline) objects
- we call update_ftrace_direct_del/mod to update needed locations
- we unlock and put all the trampolines

We store the old image/flags in the trampoline before the update and use them in case we need to roll back the attachment.

We keep the ftrace_hash objects allocated during attach in the link so they can be used for detach as well.

Add trampoline_(un)lock_all functions to (un)lock all trampolines to gate the tracing_multi attachment.

Note this is supported only on archs (x86_64) that have ftrace direct and single ops support:

  CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS

It also needs CONFIG_BPF_SYSCALL enabled.
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-13-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 43 +++++++ include/linux/bpf_verifier.h | 4 + kernel/bpf/trampoline.c | 271 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 55 +++++++++ 4 files changed, 373 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b52dc64ec92d..bcf70f810d2c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -33,6 +33,7 @@ #include #include #include +#include #include struct bpf_verifier_env; @@ -1373,6 +1374,11 @@ struct bpf_trampoline { int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ struct bpf_tramp_image *cur_image; + /* Used as temporary old image storage for multi_attach */ + struct { + struct bpf_tramp_image *old_image; + u32 old_flags; + } multi_attach; }; struct bpf_attach_target_info { @@ -1470,6 +1476,8 @@ static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u6 return 0; } +struct bpf_tracing_multi_link; + #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, @@ -1482,6 +1490,11 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, void bpf_trampoline_put(struct bpf_trampoline *tr); int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); +int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, + struct bpf_tracing_multi_link *link); +int bpf_trampoline_multi_detach(struct bpf_prog *prog, + struct bpf_tracing_multi_link *link); + /* * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn * indirection with a direct call to the bpf program. 
If the architecture does @@ -1594,6 +1607,16 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { return false; } +static inline int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, + struct bpf_tracing_multi_link *link) +{ + return -ENOTSUPP; +} +static inline int bpf_trampoline_multi_detach(struct bpf_prog *prog, + struct bpf_tracing_multi_link *link) +{ + return -ENOTSUPP; +} #endif struct bpf_func_info_aux { @@ -1932,6 +1955,26 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_tracing_multi_node { + struct bpf_tramp_node node; + struct bpf_trampoline *trampoline; + struct ftrace_func_entry entry; +}; + +struct bpf_tracing_multi_data { + struct ftrace_hash *unreg; + struct ftrace_hash *modify; + struct ftrace_hash *reg; + struct ftrace_func_entry *entry; +}; + +struct bpf_tracing_multi_link { + struct bpf_link link; + struct bpf_tracing_multi_data data; + int nodes_cnt; + struct bpf_tracing_multi_node nodes[] __counted_by(nodes_cnt); +}; + struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c248ff41f42a..d57b339a8cb8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1591,6 +1591,10 @@ int bpf_add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, u16 offset); int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt); +/* Functions exported from verifier.c, used by trampoline.c */ +int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id, + struct bpf_attach_target_info *tgt_info); + /* Functions in fixups.c, called from bpf_check() */ int bpf_remove_fastcall_spills_fills(struct bpf_verifier_env *env); int bpf_optimize_bpf_loop(struct bpf_verifier_env *env); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ae7e4fdfe2a3..957e5d7f9554 100644 --- 
a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1447,6 +1447,277 @@ int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, return -ENOTSUPP; } +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \ + defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) && \ + defined(CONFIG_BPF_SYSCALL) + +static void trampoline_lock_all(void) +{ + int i; + + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + mutex_lock(&trampoline_locks[i].mutex); +} + +static void trampoline_unlock_all(void) +{ + int i; + + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + mutex_unlock(&trampoline_locks[i].mutex); +} + +static void remove_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + ftrace_hash_remove(data->reg); + ftrace_hash_remove(data->unreg); + ftrace_hash_remove(data->modify); +} + +static void clear_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + remove_tracing_multi_data(data); + + free_ftrace_hash(data->reg); + free_ftrace_hash(data->unreg); + free_ftrace_hash(data->modify); +} + +static int init_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + data->reg = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + data->unreg = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + data->modify = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + + if (!data->reg || !data->unreg || !data->modify) { + clear_tracing_multi_data(data); + return -ENOMEM; + } + return 0; +} + +static void ftrace_hash_add(struct ftrace_hash *hash, struct ftrace_func_entry *entry, + unsigned long ip, unsigned long direct) +{ + entry->ip = ip; + entry->direct = direct; + add_ftrace_hash_entry(hash, entry); +} + +static int register_fentry_multi(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *ptr) +{ + unsigned long addr = (unsigned long) im->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->reg, 
data->entry, ip, addr); + tr->cur_image = im; + return 0; +} + +static int unregister_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, void *ptr) +{ + unsigned long addr = (unsigned long) tr->cur_image->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->unreg, data->entry, ip, addr); + tr->cur_image = NULL; + return 0; +} + +static int modify_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, + bool lock_direct_mutex, void *ptr) +{ + unsigned long addr = (unsigned long) im->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->modify, data->entry, ip, addr); + tr->cur_image = im; + return 0; +} + +static const struct bpf_trampoline_ops trampoline_multi_ops = { + .register_fentry = register_fentry_multi, + .unregister_fentry = unregister_fentry_multi, + .modify_fentry = modify_fentry_multi, +}; + +static void bpf_trampoline_multi_attach_init(struct bpf_trampoline *tr) +{ + tr->multi_attach.old_image = tr->cur_image; + tr->multi_attach.old_flags = tr->flags; +} + +static void bpf_trampoline_multi_attach_free(struct bpf_trampoline *tr) +{ + if (tr->multi_attach.old_image) + bpf_tramp_image_put(tr->multi_attach.old_image); + + tr->multi_attach.old_image = NULL; + tr->multi_attach.old_flags = 0; +} + +static void bpf_trampoline_multi_attach_rollback(struct bpf_trampoline *tr) +{ + if (tr->cur_image) + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = tr->multi_attach.old_image; + tr->flags = tr->multi_attach.old_flags; + + tr->multi_attach.old_image = NULL; + tr->multi_attach.old_flags = 0; +} + +#define for_each_mnode_cnt(mnode, link, cnt) \ + for (i = 0, mnode = &link->nodes[i]; i < cnt; i++, mnode = &link->nodes[i]) + +#define 
for_each_mnode(mnode, link) \ + for_each_mnode_cnt(mnode, link, link->nodes_cnt) + +int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, + struct bpf_tracing_multi_link *link) +{ + struct bpf_tracing_multi_data *data = &link->data; + struct bpf_attach_target_info tgt_info = {}; + struct btf *btf = prog->aux->attach_btf; + struct bpf_tracing_multi_node *mnode; + struct bpf_trampoline *tr; + int i, err, rollback_cnt; + u64 key; + + for_each_mnode(mnode, link) { + rollback_cnt = i; + + err = bpf_check_attach_btf_id_multi(btf, prog, ids[i], &tgt_info); + if (err) + goto rollback_put; + + key = bpf_trampoline_compute_key(NULL, btf, ids[i]); + + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) { + err = -ENOMEM; + goto rollback_put; + } + + mnode->trampoline = tr; + mnode->node.link = &link->link; + + cond_resched(); + } + + err = init_tracing_multi_data(data); + if (err) { + rollback_cnt = link->nodes_cnt; + goto rollback_put; + } + + trampoline_lock_all(); + + for_each_mnode(mnode, link) { + bpf_trampoline_multi_attach_init(mnode->trampoline); + + data->entry = &mnode->entry; + err = __bpf_trampoline_link_prog(&mnode->node, mnode->trampoline, NULL, + &trampoline_multi_ops, data); + if (err) { + rollback_cnt = i; + goto rollback_unlink; + } + } + + rollback_cnt = link->nodes_cnt; + if (ftrace_hash_count(data->reg)) { + err = update_ftrace_direct_add(&direct_ops, data->reg); + if (err) + goto rollback_unlink; + } + + if (ftrace_hash_count(data->modify)) { + err = update_ftrace_direct_mod(&direct_ops, data->modify, true); + if (err) { + if (ftrace_hash_count(data->reg)) + WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg)); + goto rollback_unlink; + } + } + + for_each_mnode(mnode, link) + bpf_trampoline_multi_attach_free(mnode->trampoline); + + trampoline_unlock_all(); + + remove_tracing_multi_data(data); + return 0; + +rollback_unlink: + for_each_mnode_cnt(mnode, link, rollback_cnt) { + bpf_trampoline_remove_prog(mnode->trampoline, 
&mnode->node); + bpf_trampoline_multi_attach_rollback(mnode->trampoline); + } + + trampoline_unlock_all(); + + clear_tracing_multi_data(data); + rollback_cnt = link->nodes_cnt; + +rollback_put: + for_each_mnode_cnt(mnode, link, rollback_cnt) + bpf_trampoline_put(mnode->trampoline); + + return err; +} + +int bpf_trampoline_multi_detach(struct bpf_prog *prog, struct bpf_tracing_multi_link *link) +{ + struct bpf_tracing_multi_data *data = &link->data; + struct bpf_tracing_multi_node *mnode; + int i; + + trampoline_lock_all(); + + for_each_mnode(mnode, link) { + data->entry = &mnode->entry; + bpf_trampoline_multi_attach_init(mnode->trampoline); + WARN_ON_ONCE(__bpf_trampoline_unlink_prog(&mnode->node, mnode->trampoline, + NULL, &trampoline_multi_ops, data)); + } + + if (ftrace_hash_count(data->unreg)) + WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->unreg)); + if (ftrace_hash_count(data->modify)) + WARN_ON_ONCE(update_ftrace_direct_mod(&direct_ops, data->modify, true)); + + for_each_mnode(mnode, link) + bpf_trampoline_multi_attach_free(mnode->trampoline); + + trampoline_unlock_all(); + + for_each_mnode(mnode, link) + bpf_trampoline_put(mnode->trampoline); + + clear_tracing_multi_data(data); + return 0; +} + +#undef for_each_mnode_cnt +#undef for_each_mnode + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && + CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS && + CONFIG_BPF_SYSCALL */ + static int __init init_trampolines(void) { int i; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df21592fc560..5c594047ff0a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19328,6 +19328,61 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return 0; } +int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id, + struct bpf_attach_target_info *tgt_info) +{ + const struct btf_type *t; + unsigned long addr; + const char *tname; + int err; + + if (!btf_id || !btf) + return -EINVAL; + + /* Check noreturn 
attachment. */ + if (prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI && + btf_id_set_contains(&noreturn_deny, btf_id)) + return -EINVAL; + /* Check denied attachment. */ + if (btf_id_set_contains(&btf_id_deny, btf_id)) + return -EINVAL; + + /* Check and get function target data. */ + t = btf_type_by_id(btf, btf_id); + if (!t) + return -EINVAL; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return -EINVAL; + if (!btf_type_is_func(t)) + return -EINVAL; + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + err = btf_distill_func_proto(NULL, btf, t, tname, &tgt_info->fmodel); + if (err < 0) + return err; + if (btf_is_module(btf)) { + /* The bpf program already holds reference to module. */ + if (WARN_ON_ONCE(!prog->aux->mod)) + return -EINVAL; + addr = find_kallsyms_symbol_value(prog->aux->mod, tname); + } else { + addr = kallsyms_lookup_name(tname); + } + if (!addr || !ftrace_location(addr)) + return -ENOENT; + + /* Check sleepable program attachment. */ + if (prog->sleepable) { + err = btf_id_allow_sleepable(btf_id, addr, prog, btf); + if (err) + return err; + } + tgt_info->tgt_addr = addr; + return 0; +} + struct btf *bpf_get_btf_vmlinux(void) { if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { -- cgit v1.2.3 From c1d32dea5d4694c1a6c14d1d1c3192d0e18ffc7b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:38 +0200 Subject: bpf: Add support for tracing multi link Adding new link to allow to attach program to multiple function BTF IDs. The link is represented by struct bpf_tracing_multi_link. To configure the link, new fields are added to bpf_attr::link_create to pass array of BTF IDs; struct { __aligned_u64 ids; __u32 cnt; } tracing_multi; Each BTF ID represents function (BTF_KIND_FUNC) that the link will attach bpf program to. We use previously added bpf_trampoline_multi_attach/detach functions to attach/detach the link. 
The linkinfo/fdinfo callbacks will be implemented in following changes. Note this is supported only for archs (x86_64) with ftrace direct and have single ops support. CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS Note using sort_r (instead of plain sort) in check_dup_ids, because we will use the swap callback in following changes. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-14-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_types.h | 1 + include/linux/trace_events.h | 6 ++ include/uapi/linux/bpf.h | 5 ++ kernel/bpf/syscall.c | 2 + kernel/trace/bpf_trace.c | 130 +++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 6 ++ tools/lib/bpf/libbpf.c | 1 + 7 files changed, 151 insertions(+) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 56e4c3f983d3..e5906829aa6f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -156,3 +156,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi) BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_UPROBE_MULTI, uprobe_multi) +BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING_MULTI, tracing_multi) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index d49338c44014..308c76b57d13 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -787,6 +787,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, unsigned long *missed); int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -844,6 +845,11 @@ bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct 
bpf_prog *prog) { return -EOPNOTSUPP; } +static inline int +bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} #endif enum { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 28d127e5040a..9f603731d267 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1182,6 +1182,7 @@ enum bpf_link_type { BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, BPF_LINK_TYPE_SOCKMAP = 14, + BPF_LINK_TYPE_TRACING_MULTI = 15, __MAX_BPF_LINK_TYPE, }; @@ -1877,6 +1878,10 @@ union bpf_attr { }; __u64 expected_revision; } cgroup; + struct { + __aligned_u64 ids; + __u32 cnt; + } tracing_multi; }; } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index efdd6639a598..d551b9da0cfb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5885,6 +5885,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) ret = cgroup_bpf_link_attach(attr, prog); + else if (is_tracing_multi(prog->expected_attach_type)) + ret = bpf_tracing_multi_attach(prog, attr); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d853f97bd154..9e3cb547651e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -42,6 +42,7 @@ #define MAX_UPROBE_MULTI_CNT (1U << 20) #define MAX_KPROBE_MULTI_CNT (1U << 20) +#define MAX_TRACING_MULTI_CNT (1U << 20) #ifdef CONFIG_MODULES struct bpf_trace_module { @@ -3641,3 +3642,132 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(const struct bpf_dynptr *dptr } __bpf_kfunc_end_defs(); + +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \ + defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) + +static void bpf_tracing_multi_link_release(struct bpf_link *link) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct 
bpf_tracing_multi_link, link); + + WARN_ON_ONCE(bpf_trampoline_multi_detach(link->prog, tr_link)); +} + +static void bpf_tracing_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + + kvfree(tr_link); +} + +static const struct bpf_link_ops bpf_tracing_multi_link_lops = { + .release = bpf_tracing_multi_link_release, + .dealloc_deferred = bpf_tracing_multi_link_dealloc, +}; + +static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_unused) +{ + u32 a = *(u32 *) pa; + u32 b = *(u32 *) pb; + + return (a > b) - (a < b); +} + +static void ids_swap_r(void *a, void *b, int size __maybe_unused, + const void *priv __maybe_unused) +{ + u32 *id_a = a, *id_b = b; + + swap(*id_a, *id_b); +} + +static int check_dup_ids(u32 *ids, u32 cnt) +{ + int err = 0; + + /* + * Sort ids array (together with cookies array if defined) + * and check it for duplicates. The ids and cookies arrays + * are left sorted. 
+ */ + sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, NULL); + + for (int i = 1; i < cnt; i++) { + if (ids[i] == ids[i - 1]) { + err = -EINVAL; + break; + } + } + return err; +} + +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + struct bpf_tracing_multi_link *link = NULL; + struct bpf_link_primer link_primer; + u32 cnt, *ids = NULL; + u32 __user *uids; + int err; + + uids = u64_to_user_ptr(attr->link_create.tracing_multi.ids); + cnt = attr->link_create.tracing_multi.cnt; + + if (!cnt || !uids) + return -EINVAL; + if (cnt > MAX_TRACING_MULTI_CNT) + return -E2BIG; + if (attr->link_create.flags || attr->link_create.target_fd) + return -EINVAL; + + ids = kvmalloc_objs(*ids, cnt); + if (!ids) + return -ENOMEM; + + if (copy_from_user(ids, uids, cnt * sizeof(*ids))) { + err = -EFAULT; + goto error; + } + + err = check_dup_ids(ids, cnt); + if (err) + goto error; + + link = kvzalloc_flex(*link, nodes, cnt); + if (!link) { + err = -ENOMEM; + goto error; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING_MULTI, + &bpf_tracing_multi_link_lops, prog, prog->expected_attach_type); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error; + + link->nodes_cnt = cnt; + + err = bpf_trampoline_multi_attach(prog, ids, link); + kvfree(ids); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + return bpf_link_settle(&link_primer); + +error: + kvfree(ids); + kvfree(link); + return err; +} + +#else + +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1b9aacf468e5..9f603731d267 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1182,6 +1182,7 @@ enum bpf_link_type { BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, 
BPF_LINK_TYPE_SOCKMAP = 14, + BPF_LINK_TYPE_TRACING_MULTI = 15, __MAX_BPF_LINK_TYPE, }; @@ -1877,6 +1878,10 @@ union bpf_attr { }; __u64 expected_revision; } cgroup; + struct { + __aligned_u64 ids; + __u32 cnt; + } tracing_multi; }; } link_create; @@ -7254,6 +7259,7 @@ enum { TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */ SK_BPF_BYPASS_PROT_MEM = 1010, /* Get or Set sk->sk_bypass_prot_mem */ + }; enum { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1b09381d16ff..59405d318624 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -156,6 +156,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_UPROBE_MULTI] = "uprobe_multi", [BPF_LINK_TYPE_NETKIT] = "netkit", [BPF_LINK_TYPE_SOCKMAP] = "sockmap", + [BPF_LINK_TYPE_TRACING_MULTI] = "tracing_multi", }; static const char * const map_type_name[] = { -- cgit v1.2.3 From 46b42af27d40021a97c147d23de8cb29eb5020df Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:39 +0200 Subject: bpf: Add support for tracing_multi link cookies Add support to specify cookies for tracing_multi link. Cookies are provided in array where each value is paired with provided BTF ID value with the same array index. Such cookie can be retrieved by bpf program with bpf_get_attach_cookie helper call. We need to sort cookies array together with ids array in check_dup_ids, to keep the id->cookie relation. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-15-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/trampoline.c | 1 + kernel/trace/bpf_trace.c | 37 +++++++++++++++++++++++++++++++++---- tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 37 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bcf70f810d2c..e9d2b42a3981 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1971,6 +1971,7 @@ struct bpf_tracing_multi_data { struct bpf_tracing_multi_link { struct bpf_link link; struct bpf_tracing_multi_data data; + u64 *cookies; int nodes_cnt; struct bpf_tracing_multi_node nodes[] __counted_by(nodes_cnt); }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9f603731d267..569c15e1cae3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1880,6 +1880,7 @@ union bpf_attr { } cgroup; struct { __aligned_u64 ids; + __aligned_u64 cookies; __u32 cnt; } tracing_multi; }; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 957e5d7f9554..a3537fda50cf 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1613,6 +1613,7 @@ int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, mnode->trampoline = tr; mnode->node.link = &link->link; + mnode->node.cookie = link->cookies ? 
link->cookies[i] : 0; cond_resched(); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9e3cb547651e..e33492739ed1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3659,6 +3659,7 @@ static void bpf_tracing_multi_link_dealloc(struct bpf_link *link) struct bpf_tracing_multi_link *tr_link = container_of(link, struct bpf_tracing_multi_link, link); + kvfree(tr_link->cookies); kvfree(tr_link); } @@ -3678,13 +3679,24 @@ static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_un static void ids_swap_r(void *a, void *b, int size __maybe_unused, const void *priv __maybe_unused) { - u32 *id_a = a, *id_b = b; + u64 *cookie_a, *cookie_b, *cookies; + u32 *id_a = a, *id_b = b, *ids; + void **data = (void **) priv; + ids = data[0]; + cookies = data[1]; + + if (cookies) { + cookie_a = cookies + (id_a - ids); + cookie_b = cookies + (id_b - ids); + swap(*cookie_a, *cookie_b); + } swap(*id_a, *id_b); } -static int check_dup_ids(u32 *ids, u32 cnt) +static int check_dup_ids(u32 *ids, u64 *cookies, u32 cnt) { + void *data[2] = { ids, cookies }; int err = 0; /* @@ -3692,7 +3704,7 @@ static int check_dup_ids(u32 *ids, u32 cnt) * and check it for duplicates. The ids and cookies arrays * are left sorted. 
*/ - sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, NULL); + sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, data); for (int i = 1; i < cnt; i++) { if (ids[i] == ids[i - 1]) { @@ -3708,6 +3720,8 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) struct bpf_tracing_multi_link *link = NULL; struct bpf_link_primer link_primer; u32 cnt, *ids = NULL; + u64 __user *ucookies; + u64 *cookies = NULL; u32 __user *uids; int err; @@ -3730,7 +3744,20 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) goto error; } - err = check_dup_ids(ids, cnt); + ucookies = u64_to_user_ptr(attr->link_create.tracing_multi.cookies); + if (ucookies) { + cookies = kvmalloc_objs(*cookies, cnt); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, cnt * sizeof(*cookies))) { + err = -EFAULT; + goto error; + } + } + + err = check_dup_ids(ids, cookies, cnt); if (err) goto error; @@ -3748,6 +3775,7 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) goto error; link->nodes_cnt = cnt; + link->cookies = cookies; err = bpf_trampoline_multi_attach(prog, ids, link); kvfree(ids); @@ -3758,6 +3786,7 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) return bpf_link_settle(&link_primer); error: + kvfree(cookies); kvfree(ids); kvfree(link); return err; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9f603731d267..569c15e1cae3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1880,6 +1880,7 @@ union bpf_attr { } cgroup; struct { __aligned_u64 ids; + __aligned_u64 cookies; __u32 cnt; } tracing_multi; }; -- cgit v1.2.3 From ba042ed6446fc524c1d804227765b45616f9cba3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:40 +0200 Subject: bpf: Add support for tracing_multi link session Adding support to use session attachment with 
tracing_multi link. Adding new BPF_TRACE_FSESSION_MULTI program attach type, that follows the BPF_TRACE_FSESSION behaviour but on the tracing_multi link. Such program is called on entry and exit of the attached function and allows to pass cookie value from entry to exit execution. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-16-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++++- include/uapi/linux/bpf.h | 1 + kernel/bpf/fixups.c | 1 + kernel/bpf/syscall.c | 1 + kernel/bpf/trampoline.c | 44 ++++++++++++++++++++++++++++++++++-------- kernel/bpf/verifier.c | 20 ++++++++++++++----- kernel/trace/bpf_trace.c | 15 +++++++++++++- net/bpf/test_run.c | 1 + tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/libbpf.c | 1 + 10 files changed, 76 insertions(+), 15 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e9d2b42a3981..62bba7a4876f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1972,6 +1972,7 @@ struct bpf_tracing_multi_link { struct bpf_link link; struct bpf_tracing_multi_data data; u64 *cookies; + struct bpf_tramp_node *fexits; int nodes_cnt; struct bpf_tracing_multi_node nodes[] __counted_by(nodes_cnt); }; @@ -2159,7 +2160,8 @@ static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) static inline bool is_tracing_multi(enum bpf_attach_type type) { - return type == BPF_TRACE_FENTRY_MULTI || type == BPF_TRACE_FEXIT_MULTI; + return type == BPF_TRACE_FENTRY_MULTI || type == BPF_TRACE_FEXIT_MULTI || + type == BPF_TRACE_FSESSION_MULTI; } #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) @@ -2286,6 +2288,8 @@ static inline int bpf_fsession_cnt(struct bpf_tramp_nodes *nodes) for (int i = 0; i < nodes[BPF_TRAMP_FENTRY].nr_nodes; i++) { if (fentries.nodes[i]->link->prog->expected_attach_type == BPF_TRACE_FSESSION) cnt++; + if (fentries.nodes[i]->link->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) + cnt++; } return cnt; diff --git 
a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 569c15e1cae3..11dd610fa5fa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1158,6 +1158,7 @@ enum bpf_attach_type { BPF_TRACE_FSESSION, BPF_TRACE_FENTRY_MULTI, BPF_TRACE_FEXIT_MULTI, + BPF_TRACE_FSESSION_MULTI, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 0cf9735929f5..3cf2cc6e3ab6 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -2187,6 +2187,7 @@ patch_map_ops_generic: if (eatype == BPF_TRACE_FEXIT || eatype == BPF_TRACE_FSESSION || eatype == BPF_TRACE_FEXIT_MULTI || + eatype == BPF_TRACE_FSESSION_MULTI || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d551b9da0cfb..d4188a992bd8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4498,6 +4498,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FSESSION_MULTI: case BPF_TRACE_FENTRY_MULTI: case BPF_TRACE_FEXIT_MULTI: case BPF_MODIFY_RETURN: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index a3537fda50cf..1a721fc4bef5 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -183,7 +183,8 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) case BPF_PROG_TYPE_TRACING: if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION || - eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI) + eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI || + eatype == BPF_TRACE_FSESSION_MULTI) return true; return false; case BPF_PROG_TYPE_LSM: @@ -790,6 +791,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) case BPF_TRACE_FEXIT_MULTI: return BPF_TRAMP_FEXIT; case BPF_TRACE_FSESSION: + 
case BPF_TRACE_FSESSION_MULTI: return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) @@ -822,13 +824,30 @@ static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) return 0; } +static struct bpf_tramp_node *fsession_exit(struct bpf_tramp_node *node) +{ + if (node->link->type == BPF_LINK_TYPE_TRACING) { + struct bpf_tracing_link *link; + + link = container_of(node->link, struct bpf_tracing_link, link.link); + return &link->fexit; + } else if (node->link->type == BPF_LINK_TYPE_TRACING_MULTI) { + struct bpf_tracing_multi_link *link; + struct bpf_tracing_multi_node *mnode; + + link = container_of(node->link, struct bpf_tracing_multi_link, link); + mnode = container_of(node, struct bpf_tracing_multi_node, node); + return &link->fexits[mnode - link->nodes]; + } + return NULL; +} + static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, struct bpf_tramp_node *node, int cnt) { - struct bpf_tracing_link *tr_link = NULL; enum bpf_tramp_prog_type kind; - struct bpf_tramp_node *node_existing; + struct bpf_tramp_node *node_existing, *fexit; struct hlist_head *prog_list; kind = bpf_attach_type_to_tramp(node->link->prog); @@ -853,8 +872,10 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, hlist_add_head(&node->tramp_hlist, prog_list); if (kind == BPF_TRAMP_FSESSION) { tr->progs_cnt[BPF_TRAMP_FENTRY]++; - tr_link = container_of(node, struct bpf_tracing_link, link.node); - hlist_add_head(&tr_link->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + fexit = fsession_exit(node); + if (WARN_ON_ONCE(!fexit)) + return -EINVAL; + hlist_add_head(&fexit->tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); tr->progs_cnt[BPF_TRAMP_FEXIT]++; } else { tr->progs_cnt[kind]++; @@ -865,13 +886,15 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr, struct bpf_tramp_node *node) { - struct bpf_tracing_link *tr_link; enum bpf_tramp_prog_type kind; + struct 
bpf_tramp_node *fexit; kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_FSESSION) { - tr_link = container_of(node, struct bpf_tracing_link, link.node); - hlist_del_init(&tr_link->fexit.tramp_hlist); + fexit = fsession_exit(node); + if (WARN_ON_ONCE(!fexit)) + return; + hlist_del_init(&fexit->tramp_hlist); tr->progs_cnt[BPF_TRAMP_FEXIT]--; kind = BPF_TRAMP_FENTRY; } @@ -1615,6 +1638,11 @@ int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, mnode->node.link = &link->link; mnode->node.cookie = link->cookies ? link->cookies[i] : 0; + if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) { + link->fexits[i].link = &link->link; + link->fexits[i].cookie = link->cookies ? link->cookies[i] : 0; + } + cond_resched(); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c594047ff0a..0c1cf506c219 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16384,6 +16384,7 @@ static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_ case BPF_TRACE_FSESSION: case BPF_TRACE_FENTRY_MULTI: case BPF_TRACE_FEXIT_MULTI: + case BPF_TRACE_FSESSION_MULTI: *range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -18952,7 +18953,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || tgt_prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI || tgt_prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI || - tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. 
* The fentry/fexit programs are used for performance @@ -19058,9 +19060,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FSESSION_MULTI: case BPF_TRACE_FENTRY_MULTI: case BPF_TRACE_FEXIT_MULTI: - if (prog->expected_attach_type == BPF_TRACE_FSESSION && + if ((prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) && !bpf_jit_supports_fsession()) { bpf_log(log, "JIT does not support fsession\n"); return -EOPNOTSUPP; @@ -19215,6 +19219,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY_MULTI: case BPF_TRACE_FEXIT_MULTI: + case BPF_TRACE_FSESSION_MULTI: return true; default: return false; @@ -19301,6 +19306,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", @@ -19340,7 +19346,8 @@ int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 bt return -EINVAL; /* Check noreturn attachment. */ - if (prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI && + if ((prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) && btf_id_set_contains(&noreturn_deny, btf_id)) return -EINVAL; /* Check denied attachment. 
*/ @@ -19623,7 +19630,9 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && - env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + (env->prog->expected_attach_type == BPF_TRACE_FSESSION || + env->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) { + /* * inline the bpf_session_is_return() for fsession: * bool bpf_session_is_return(void *ctx) @@ -19636,7 +19645,8 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); *cnt = 3; } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && - env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + (env->prog->expected_attach_type == BPF_TRACE_FSESSION || + env->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) { /* * inline bpf_session_cookie() for fsession: * __u64 *bpf_session_cookie(void *ctx) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e33492739ed1..a0d688fffc5a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1334,7 +1334,8 @@ static inline bool is_uprobe_session(const struct bpf_prog *prog) static inline bool is_trace_fsession(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_TRACING && - prog->expected_attach_type == BPF_TRACE_FSESSION; + (prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI); } static const struct bpf_func_proto * @@ -3659,6 +3660,7 @@ static void bpf_tracing_multi_link_dealloc(struct bpf_link *link) struct bpf_tracing_multi_link *tr_link = container_of(link, struct bpf_tracing_multi_link, link); + kvfree(tr_link->fexits); kvfree(tr_link->cookies); kvfree(tr_link); } @@ -3718,6 +3720,7 @@ static int check_dup_ids(u32 *ids, u64 *cookies, u32 cnt) int bpf_tracing_multi_attach(struct bpf_prog *prog, 
const union bpf_attr *attr) { struct bpf_tracing_multi_link *link = NULL; + struct bpf_tramp_node *fexits = NULL; struct bpf_link_primer link_primer; u32 cnt, *ids = NULL; u64 __user *ucookies; @@ -3761,6 +3764,14 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) if (err) goto error; + if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) { + fexits = kvmalloc_objs(*fexits, cnt); + if (!fexits) { + err = -ENOMEM; + goto error; + } + } + link = kvzalloc_flex(*link, nodes, cnt); if (!link) { err = -ENOMEM; @@ -3776,6 +3787,7 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) link->nodes_cnt = cnt; link->cookies = cookies; + link->fexits = fexits; err = bpf_trampoline_multi_attach(prog, ids, link); kvfree(ids); @@ -3786,6 +3798,7 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) return bpf_link_settle(&link_primer); error: + kvfree(fexits); kvfree(cookies); kvfree(ids); kvfree(link); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 67769c700cae..a831682ee982 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -705,6 +705,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, case BPF_TRACE_FSESSION: case BPF_TRACE_FENTRY_MULTI: case BPF_TRACE_FEXIT_MULTI: + case BPF_TRACE_FSESSION_MULTI: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 569c15e1cae3..11dd610fa5fa 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1158,6 +1158,7 @@ enum bpf_attach_type { BPF_TRACE_FSESSION, BPF_TRACE_FENTRY_MULTI, BPF_TRACE_FEXIT_MULTI, + BPF_TRACE_FSESSION_MULTI, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 59405d318624..62f088359c5e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -138,6 +138,7 @@ static const char * const 
attach_type_name[] = { [BPF_TRACE_UPROBE_SESSION] = "trace_uprobe_session", [BPF_TRACE_FENTRY_MULTI] = "trace_fentry_multi", [BPF_TRACE_FEXIT_MULTI] = "trace_fexit_multi", + [BPF_TRACE_FSESSION_MULTI] = "trace_fsession_multi", }; static const char * const link_type_name[] = { -- cgit v1.2.3 From 8abecdafd57553c053bb68db47ed32a54972d5f4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:41 +0200 Subject: bpf: Add support for tracing_multi link fdinfo Adding tracing_multi link fdinfo support with following output: pos: 0 flags: 02000000 mnt_id: 19 ino: 3087 link_type: tracing_multi link_id: 9 prog_tag: 599ba0e317244f86 prog_id: 94 attach_type: 59 cnt: 10 obj-id btf-id cookie func 1 91593 8 bpf_fentry_test1+0x4/0x10 1 91595 9 bpf_fentry_test2+0x4/0x10 1 91596 7 bpf_fentry_test3+0x4/0x20 1 91597 5 bpf_fentry_test4+0x4/0x20 1 91598 4 bpf_fentry_test5+0x4/0x20 1 91599 2 bpf_fentry_test6+0x4/0x20 1 91600 3 bpf_fentry_test7+0x4/0x10 1 91601 1 bpf_fentry_test8+0x4/0x10 1 91602 10 bpf_fentry_test9+0x4/0x10 1 91594 6 bpf_fentry_test10+0x4/0x10 Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-17-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a0d688fffc5a..90432f0fc2a8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3665,9 +3665,39 @@ static void bpf_tracing_multi_link_dealloc(struct bpf_link *link) kvfree(tr_link); } +#ifdef CONFIG_PROC_FS +static void bpf_tracing_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + bool has_cookies = !!tr_link->cookies; + + seq_printf(seq, "attach_type:\t%u\n", tr_link->link.attach_type); + seq_printf(seq, "cnt:\t%u\n", tr_link->nodes_cnt); + + seq_printf(seq, "%s\t %s\t %s\t 
%s\n", "obj-id", "btf-id", "cookie", "func"); + for (int i = 0; i < tr_link->nodes_cnt; i++) { + struct bpf_tracing_multi_node *mnode = &tr_link->nodes[i]; + u32 btf_id, obj_id; + + bpf_trampoline_unpack_key(mnode->trampoline->key, &obj_id, &btf_id); + seq_printf(seq, "%u\t %u\t %llu\t %pS\n", + obj_id, btf_id, + has_cookies ? tr_link->cookies[i] : 0, + (void *) mnode->trampoline->ip); + + cond_resched(); + } +} +#endif + static const struct bpf_link_ops bpf_tracing_multi_link_lops = { .release = bpf_tracing_multi_link_release, .dealloc_deferred = bpf_tracing_multi_link_dealloc, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_tracing_multi_show_fdinfo, +#endif }; static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_unused) -- cgit v1.2.3 From fe9c8cb2b52b455149d363bbca0fc3648ba0cea6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:42 +0200 Subject: libbpf: Add bpf_object_cleanup_btf function Adding bpf_object_cleanup_btf function to cleanup btf objects. It will be used in following changes. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-18-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 62f088359c5e..5bdaa5eb1f50 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8941,13 +8941,10 @@ static void bpf_object_unpin(struct bpf_object *obj) bpf_map__unpin(&obj->maps[i], NULL); } -static void bpf_object_post_load_cleanup(struct bpf_object *obj) +static void bpf_object_cleanup_btf(struct bpf_object *obj) { int i; - /* clean up fd_array */ - zfree(&obj->fd_array); - /* clean up module BTFs */ for (i = 0; i < obj->btf_module_cnt; i++) { close(obj->btf_modules[i].fd); @@ -8955,6 +8952,8 @@ static void bpf_object_post_load_cleanup(struct bpf_object *obj) free(obj->btf_modules[i].name); } obj->btf_module_cnt = 0; + obj->btf_module_cap = 0; + obj->btf_modules_loaded = false; zfree(&obj->btf_modules); /* clean up vmlinux BTF */ @@ -8962,6 +8961,15 @@ static void bpf_object_post_load_cleanup(struct bpf_object *obj) obj->btf_vmlinux = NULL; } +static void bpf_object_post_load_cleanup(struct bpf_object *obj) +{ + /* clean up fd_array */ + zfree(&obj->fd_array); + + /* clean up BTF */ + bpf_object_cleanup_btf(obj); +} + static int bpf_object_prepare(struct bpf_object *obj, const char *target_btf_path) { int err; -- cgit v1.2.3 From 630e85a9f0056a7534601ed1ec2532d6ac85b7d7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:43 +0200 Subject: libbpf: Add bpf_link_create support for tracing_multi link Adding bpf_link_create support for tracing_multi link with new tracing_multi record in struct bpf_link_create_opts. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-19-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 9 +++++++++ tools/lib/bpf/bpf.h | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index bc513aa8f404..f37e3416f61a 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -845,6 +845,15 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, uprobe_multi)) return libbpf_err(-EINVAL); break; + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: + case BPF_TRACE_FSESSION_MULTI: + attr.link_create.tracing_multi.ids = ptr_to_u64(OPTS_GET(opts, tracing_multi.ids, 0)); + attr.link_create.tracing_multi.cookies = ptr_to_u64(OPTS_GET(opts, tracing_multi.cookies, 0)); + attr.link_create.tracing_multi.cnt = OPTS_GET(opts, tracing_multi.cnt, 0); + if (!OPTS_ZEROED(opts, tracing_multi)) + return libbpf_err(-EINVAL); + break; case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 2312900a3263..012354131cf6 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -469,6 +469,11 @@ struct bpf_link_create_opts { __u32 relative_id; __u64 expected_revision; } cgroup; + struct { + const __u32 *ids; + const __u64 *cookies; + __u32 cnt; + } tracing_multi; }; size_t :0; }; -- cgit v1.2.3 From 616a93b473a6ab33494db27057f8a413f375ac4f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:44 +0200 Subject: libbpf: Add btf_type_is_traceable_func function Adding btf_type_is_traceable_func function to perform same checks as the kernel's btf_distill_func_proto function to prevent attachment on some of the functions. Exporting the function via libbpf_internal.h because it will be used by benchmark test in following changes. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-20-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 79 +++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf_internal.h | 1 + 2 files changed, 80 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 5bdaa5eb1f50..42f0efd70327 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -12450,6 +12450,85 @@ static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, stru return ret; } +#define MAX_BPF_FUNC_ARGS 12 + +static bool btf_type_is_modifier(const struct btf_type *t) +{ + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: + return true; + default: + return false; + } +} + +#define MAX_RESOLVE_DEPTH 32 + +static int btf_get_type_size(const struct btf *btf, __u32 type_id, + const struct btf_type **ret_type) +{ + const struct btf_type *t; + int i; + + *ret_type = btf__type_by_id(btf, 0); + if (!type_id) + return 0; + t = btf__type_by_id(btf, type_id); + for (i = 0; i < MAX_RESOLVE_DEPTH && t && btf_type_is_modifier(t); i++) + t = btf__type_by_id(btf, t->type); + if (!t || i == MAX_RESOLVE_DEPTH) + return -EINVAL; + *ret_type = t; + if (btf_is_ptr(t)) + return btf__pointer_size(btf); + if (btf_is_int(t) || btf_is_any_enum(t) || btf_is_struct(t) || btf_is_union(t)) + return t->size; + return -EINVAL; +} + +bool btf_type_is_traceable_func(const struct btf *btf, const struct btf_type *t) +{ + const struct btf_param *args; + const struct btf_type *proto; + __u32 i, nargs; + int ret; + + if (!btf_is_func(t)) + return false; + proto = btf__type_by_id(btf, t->type); + if (!proto || !btf_is_func_proto(proto)) + return false; + + args = (const struct btf_param *)(proto + 1); + nargs = btf_vlen(proto); + if (nargs > MAX_BPF_FUNC_ARGS) + return false; + + /* No support for struct return type. 
*/ + ret = btf_get_type_size(btf, proto->type, &t); + if (ret < 0 || btf_is_struct(t) || btf_is_union(t)) + return false; + + for (i = 0; i < nargs; i++) { + /* No support for variable args. */ + if (i == nargs - 1 && args[i].type == 0) + return false; + ret = btf_get_type_size(btf, args[i].type, &t); + /* No support of struct argument size greater than 16 bytes. */ + if (ret < 0 || ret > 16) + return false; + /* No support for void argument. */ + if (ret == 0) + return false; + } + + return true; +} + static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe, const char *binary_path, size_t offset) { diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 7d93c6c01d60..04cd303fb5a8 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -250,6 +250,7 @@ const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, _ const struct btf_header *btf_header(const struct btf *btf); void btf_set_base_btf(struct btf *btf, const struct btf *base_btf); int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **id_map); +bool btf_type_is_traceable_func(const struct btf *btf, const struct btf_type *t); static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t) { -- cgit v1.2.3 From f2aa370dfe571abf51631c1ac27bb58d5d0e3466 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:45 +0200 Subject: libbpf: Add support to create tracing multi link Adding bpf_program__attach_tracing_multi function for attaching tracing program to multiple functions. struct bpf_link * bpf_program__attach_tracing_multi(const struct bpf_program *prog, const char *pattern, const struct bpf_tracing_multi_opts *opts); User can specify functions to attach with 'pattern' argument that allows wildcards ('*' and '?' supported) or provide BTF ids of functions in array directly via opts argument. These options are mutually exclusive.
When using BTF ids, user can also provide cookie value for each provided id/function, that can be retrieved later in bpf program with bpf_get_attach_cookie helper. Each cookie value is paired with provided BTF id with the same array index. Adding support to auto attach programs with following sections: fsession.multi/<pattern> fsession.multi.s/<pattern> fentry.multi/<pattern> fexit.multi/<pattern> fentry.multi.s/<pattern> fexit.multi.s/<pattern> The provided <pattern> is used as 'pattern' argument in bpf_program__attach_tracing_multi function. The <pattern> allows to specify optional kernel module name with following syntax: <module>:<function_pattern> In order to attach tracing_multi link to module functions: - program must be loaded with 'module' btf fd (in attr::attach_btf_obj_fd) - bpf_program__attach_tracing_multi must either have pattern with module spec or BTF ids from the module Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-21-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 276 +++++++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 15 +++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 292 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 42f0efd70327..1368752aa13c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7772,6 +7772,69 @@ static int bpf_object__sanitize_prog(struct bpf_object *obj, struct bpf_program static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attach_name, int *btf_obj_fd, int *btf_type_id); +static inline bool is_tracing_multi(enum bpf_attach_type type) +{ + return type == BPF_TRACE_FENTRY_MULTI || type == BPF_TRACE_FEXIT_MULTI || + type == BPF_TRACE_FSESSION_MULTI; +} + +static const struct module_btf *find_attach_module(struct bpf_object *obj, const char *attach) +{ + const char *sep, *mod_name = NULL; + int i, mod_len, err; + + /* + * We expect attach string in the form of either + * - function_pattern or + * - <module>:function_pattern + */ + sep = strchr(attach, ':'); + if (sep)
{ + mod_name = attach; + mod_len = sep - mod_name; + } + if (!mod_name) + return NULL; + + err = load_module_btfs(obj); + if (err) + return NULL; + + for (i = 0; i < obj->btf_module_cnt; i++) { + const struct module_btf *mod = &obj->btf_modules[i]; + + if (strncmp(mod->name, mod_name, mod_len) == 0 && mod->name[mod_len] == '\0') + return mod; + } + return NULL; +} + +static int tracing_multi_mod_fd(struct bpf_program *prog, int *btf_obj_fd) +{ + const char *attach_name, *sep; + const struct module_btf *mod; + + *btf_obj_fd = 0; + attach_name = strchr(prog->sec_name, '/'); + + /* Program with no details in spec, using kernel btf. */ + if (!attach_name) + return 0; + + /* Program with no module section, using kernel btf. */ + sep = strchr(++attach_name, ':'); + if (!sep) + return 0; + + /* Program with module specified, get its btf fd. */ + mod = find_attach_module(prog->obj, attach_name); + if (!mod) + return -EINVAL; + + *btf_obj_fd = mod->fd; + return 0; +} + /* this is called as prog->sec_def->prog_prepare_load_fn for libbpf-supported sec_defs */ static int libbpf_prepare_prog_load(struct bpf_program *prog, struct bpf_prog_load_opts *opts, long cookie) @@ -7835,6 +7898,18 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog, opts->attach_btf_obj_fd = btf_obj_fd; opts->attach_btf_id = btf_type_id; } + + if (is_tracing_multi(prog->expected_attach_type)) { + int err, btf_obj_fd = 0; + + err = tracing_multi_mod_fd(prog, &btf_obj_fd); + if (err < 0) + return err; + + prog->attach_btf_obj_fd = btf_obj_fd; + opts->attach_btf_obj_fd = btf_obj_fd; + } + return 0; } @@ -9996,6 +10071,7 @@ static int attach_kprobe_session(const struct bpf_program *prog, long cookie, st static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int 
attach_tracing_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); static const struct bpf_sec_def section_defs[] = { SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE), @@ -10049,6 +10125,12 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("fsession+", TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF, attach_trace), SEC_DEF("fsession.s+", TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fsession.multi+", TRACING, BPF_TRACE_FSESSION_MULTI, 0, attach_tracing_multi), + SEC_DEF("fsession.multi.s+", TRACING, BPF_TRACE_FSESSION_MULTI, SEC_SLEEPABLE, attach_tracing_multi), + SEC_DEF("fentry.multi+", TRACING, BPF_TRACE_FENTRY_MULTI, 0, attach_tracing_multi), + SEC_DEF("fexit.multi+", TRACING, BPF_TRACE_FEXIT_MULTI, 0, attach_tracing_multi), + SEC_DEF("fentry.multi.s+", TRACING, BPF_TRACE_FENTRY_MULTI, SEC_SLEEPABLE, attach_tracing_multi), + SEC_DEF("fexit.multi.s+", TRACING, BPF_TRACE_FEXIT_MULTI, SEC_SLEEPABLE, attach_tracing_multi), SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace), SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm), SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm), @@ -12529,6 +12611,200 @@ bool btf_type_is_traceable_func(const struct btf *btf, const struct btf_type *t) return true; } +static int +collect_btf_func_ids_by_glob(const struct btf *btf, const char *pattern, __u32 **ids) +{ + __u32 type_id, nr_types = btf__type_cnt(btf); + size_t cap = 0, cnt = 0; + + if (!pattern) + return -EINVAL; + + for (type_id = 1; type_id < nr_types; type_id++) { + const struct btf_type *t = btf__type_by_id(btf, type_id); + const char *name; + int err; + + if (btf_kind(t) != BTF_KIND_FUNC) + continue; + name = btf__name_by_offset(btf, t->name_off); + if (!name) + continue; + + if (!glob_match(name, pattern)) + continue; + if (!btf_type_is_traceable_func(btf, t)) 
+ continue; + + err = libbpf_ensure_mem((void **) ids, &cap, sizeof(**ids), cnt + 1); + if (err) { + free(*ids); + return -ENOMEM; + } + (*ids)[cnt++] = type_id; + } + + return cnt; +} + +static int collect_func_ids_by_glob(const struct bpf_program *prog, const char *pattern, __u32 **ids) +{ + struct bpf_object *obj = prog->obj; + const struct module_btf *mod; + struct btf *btf = NULL; + const char *sep; + int err; + + err = bpf_object__load_vmlinux_btf(obj, true); + if (err) + return err; + + /* In case we have module specified, we will find its btf and use that. */ + sep = strchr(pattern, ':'); + if (sep) { + mod = find_attach_module(obj, pattern); + if (!mod) { + err = -EINVAL; + goto cleanup; + } + btf = mod->btf; + pattern = sep + 1; + } else { + /* Program is loaded for kernel module. */ + if (prog->attach_btf_obj_fd) { + err = -EINVAL; + goto cleanup; + } + btf = obj->btf_vmlinux; + } + + err = collect_btf_func_ids_by_glob(btf, pattern, ids); + +cleanup: + bpf_object_cleanup_btf(obj); + return err; +} + +struct bpf_link * +bpf_program__attach_tracing_multi(const struct bpf_program *prog, const char *pattern, + const struct bpf_tracing_multi_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, lopts); + int prog_fd, link_fd, err, cnt; + __u32 *free_ids = NULL; + struct bpf_link *link; + const __u64 *cookies; + const __u32 *ids; + + if (!OPTS_VALID(opts, bpf_tracing_multi_opts)) + return libbpf_err_ptr(-EINVAL); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + cnt = OPTS_GET(opts, cnt, 0); + ids = OPTS_GET(opts, ids, NULL); + cookies = OPTS_GET(opts, cookies, NULL); + + if (!!ids != !!cnt) + return libbpf_err_ptr(-EINVAL); + if (pattern && (ids || cookies)) + return libbpf_err_ptr(-EINVAL); + if (!pattern && !ids) + return libbpf_err_ptr(-EINVAL); + + if (pattern) { + cnt = collect_func_ids_by_glob(prog, pattern, 
&free_ids); + if (cnt < 0) + return libbpf_err_ptr(cnt); + if (cnt == 0) + return libbpf_err_ptr(-EINVAL); + ids = (const __u32 *) free_ids; + } + + lopts.tracing_multi.ids = ids; + lopts.tracing_multi.cookies = cookies; + lopts.tracing_multi.cnt = cnt; + + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto error; + } + link->detach = &bpf_link__detach_fd; + + link_fd = bpf_link_create(prog_fd, 0, prog->expected_attach_type, &lopts); + if (link_fd < 0) { + err = -errno; + pr_warn("prog '%s': failed to attach: %s\n", prog->name, errstr(err)); + goto error; + } + link->fd = link_fd; + free(free_ids); + return link; + +error: + free(link); + free(free_ids); + return libbpf_err_ptr(err); +} + +static int attach_tracing_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + static const char *const prefixes[] = { + "fentry.multi", + "fexit.multi", + "fsession.multi", + "fentry.multi.s", + "fexit.multi.s", + "fsession.multi.s", + }; + const char *spec = NULL; + char *pattern; + size_t i; + int n; + + *link = NULL; + + for (i = 0; i < ARRAY_SIZE(prefixes); i++) { + size_t pfx_len; + + if (!str_has_pfx(prog->sec_name, prefixes[i])) + continue; + + pfx_len = strlen(prefixes[i]); + /* no auto-attach case of, e.g., SEC("fentry.multi") */ + if (prog->sec_name[pfx_len] == '\0') + return 0; + + if (prog->sec_name[pfx_len] != '/') + continue; + + spec = prog->sec_name + pfx_len + 1; + break; + } + + if (!spec) { + pr_warn("prog '%s': invalid section name '%s'\n", + prog->name, prog->sec_name); + return -EINVAL; + } + + n = sscanf(spec, "%m[a-zA-Z0-9_.*?:]", &pattern); + if (n < 1) { + pr_warn("tracing multi pattern is invalid: %s\n", spec); + return -EINVAL; + } + + *link = bpf_program__attach_tracing_multi(prog, pattern, NULL); + free(pattern); + return libbpf_get_error(*link); +} + static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe, const char *binary_path, size_t offset) { diff --git 
a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index bba4e8464396..b965ad571540 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -726,6 +726,21 @@ bpf_program__attach_ksyscall(const struct bpf_program *prog, const char *syscall_name, const struct bpf_ksyscall_opts *opts); +struct bpf_tracing_multi_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + const __u32 *ids; + const __u64 *cookies; + size_t cnt; + size_t :0; +}; + +#define bpf_tracing_multi_opts__last_field cnt + +LIBBPF_API struct bpf_link * +bpf_program__attach_tracing_multi(const struct bpf_program *prog, const char *pattern, + const struct bpf_tracing_multi_opts *opts); + struct bpf_uprobe_opts { /* size of this struct, for forward/backward compatibility */ size_t sz; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index dfed8d60af05..b731df19ae69 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -458,6 +458,7 @@ LIBBPF_1.7.0 { LIBBPF_1.8.0 { global: + bpf_program__attach_tracing_multi; bpf_program__clone; btf__new_empty_opts; } LIBBPF_1.7.0; -- cgit v1.2.3 From 2922dd58413cd9a7d9cbe029e7d60f3bc432c553 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:46 +0200 Subject: selftests/bpf: Add tracing multi skel/pattern/ids attach tests Adding tests for tracing_multi link attachment via all possible libbpf apis - skeleton, function pattern and btf ids. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-22-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/prog_tests/tracing_multi.c | 258 +++++++++++++++++++++ .../selftests/bpf/progs/tracing_multi_attach.c | 39 ++++ .../selftests/bpf/progs/tracing_multi_check.c | 151 ++++++++++++ 4 files changed, 450 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/tracing_multi.c create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_attach.c create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_check.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 42d9cf848b25..fd885beee0fd 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -512,7 +512,7 @@ SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ linked_vars.skel.h linked_maps.skel.h \ test_subskeleton.skel.h test_subskeleton_lib.skel.h \ - test_usdt.skel.h + test_usdt.skel.h tracing_multi.skel.h LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ @@ -538,6 +538,7 @@ test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o xdp_features.skel.h-deps := xdp_features.bpf.o +tracing_multi.skel.h-deps := tracing_multi_attach.bpf.o tracing_multi_check.bpf.o LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps)) LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c new file mode 100644 index 000000000000..f333b2514b34 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c @@ -0,0 
+1,258 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "bpf/libbpf_internal.h" +#include "tracing_multi.skel.h" +#include "trace_helpers.h" + +static const char * const bpf_fentry_test[] = { + "bpf_fentry_test1", + "bpf_fentry_test2", + "bpf_fentry_test3", + "bpf_fentry_test4", + "bpf_fentry_test5", + "bpf_fentry_test6", + "bpf_fentry_test7", + "bpf_fentry_test8", + "bpf_fentry_test9", + "bpf_fentry_test10", +}; + +#define FUNCS_CNT (ARRAY_SIZE(bpf_fentry_test)) + +static int compare(const void *ppa, const void *ppb) +{ + const char *pa = *(const char **) ppa; + const char *pb = *(const char **) ppb; + + return strcmp(pa, pb); +} + +static void tdestroy_free_nop(void *ptr) +{ +} + +static __u32 *get_ids(const char * const funcs[], int funcs_cnt, const char *mod) +{ + struct btf *btf, *vmlinux_btf = NULL; + __u32 nr, type_id, cnt = 0; + void *root = NULL; + __u32 *ids = NULL; + int i, err = 0; + + btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf")) + return NULL; + + if (mod) { + vmlinux_btf = btf; + btf = btf__load_module_btf(mod, vmlinux_btf); + if (!ASSERT_OK_PTR(btf, "btf__load_module_btf")) { + btf__free(vmlinux_btf); + return NULL; + } + } + + ids = calloc(funcs_cnt, sizeof(ids[0])); + if (!ids) + goto out; + + /* + * We sort function names by name and search them + * below for each function. 
+ */ + for (i = 0; i < funcs_cnt; i++) { + if (!tsearch(&funcs[i], &root, compare)) { + ASSERT_FAIL("tsearch failed"); + err = -1; + goto error; + } + } + + nr = btf__type_cnt(btf); + for (type_id = 1; type_id < nr && cnt < funcs_cnt; type_id++) { + const struct btf_type *type; + const char *str, ***val; + unsigned int idx; + + type = btf__type_by_id(btf, type_id); + if (!type) { + err = -1; + break; + } + + if (BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) + continue; + + str = btf__name_by_offset(btf, type->name_off); + if (!str) { + err = -1; + break; + } + + val = tfind(&str, &root, compare); + if (!val) + continue; + + /* + * We keep pointer for each function name so we can get the original + * array index and have the resulting ids array matching the original + * function array. + * + * Doing it this way allow us to easily test the cookies support, + * because each cookie is attached to particular function/id. + */ + idx = *val - funcs; + ids[idx] = type_id; + cnt++; + } + +error: + if (err) { + free(ids); + ids = NULL; + } + +out: + tdestroy(root, tdestroy_free_nop); + btf__free(vmlinux_btf); + btf__free(btf); + return ids; +} + +static void tracing_multi_test_run(struct tracing_multi *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + prog_fd = bpf_program__fd(skel->progs.test_fentry); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + + /* extra +1 count for sleepable programs */ + ASSERT_EQ(skel->bss->test_result_fentry, FUNCS_CNT + 1, "test_result_fentry"); + ASSERT_EQ(skel->bss->test_result_fexit, FUNCS_CNT + 1, "test_result_fexit"); +} + +static void test_skel_api(void) +{ + struct tracing_multi *skel; + int err; + + skel = tracing_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load")) + return; + + skel->bss->pid = getpid(); + + err = tracing_multi__attach(skel); + if (!ASSERT_OK(err, "tracing_multi__attach")) + goto cleanup; + + tracing_multi_test_run(skel); + +cleanup: + 
tracing_multi__destroy(skel); +} + +static void test_link_api_pattern(void) +{ + struct tracing_multi *skel; + + skel = tracing_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load")) + return; + + skel->bss->pid = getpid(); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + "bpf_fentry_test*", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + "bpf_fentry_test*", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s, + "bpf_fentry_test1", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fentry_s, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit_s = bpf_program__attach_tracing_multi(skel->progs.test_fexit_s, + "bpf_fentry_test1", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fexit_s, "bpf_program__attach_tracing_multi")) + goto cleanup; + + tracing_multi_test_run(skel); + +cleanup: + tracing_multi__destroy(skel); +} + +static void test_link_api_ids(void) +{ + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + struct tracing_multi *skel; + size_t cnt = FUNCS_CNT; + __u32 *ids; + + skel = tracing_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load")) + return; + + skel->bss->pid = getpid(); + + ids = get_ids(bpf_fentry_test, cnt, NULL); + if (!ASSERT_OK_PTR(ids, "get_ids")) + goto cleanup; + + opts.ids = ids; + opts.cnt = cnt; + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + NULL, &opts); + if 
(!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + /* Only bpf_fentry_test1 is allowed for sleepable programs. */ + opts.cnt = 1; + skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s, + NULL, &opts); + if (!ASSERT_OK_PTR(skel->links.test_fentry_s, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit_s = bpf_program__attach_tracing_multi(skel->progs.test_fexit_s, + NULL, &opts); + if (!ASSERT_OK_PTR(skel->links.test_fexit_s, "bpf_program__attach_tracing_multi")) + goto cleanup; + + tracing_multi_test_run(skel); + +cleanup: + tracing_multi__destroy(skel); + free(ids); +} + +void test_tracing_multi_test(void) +{ +#ifndef __x86_64__ + test__skip(); + return; +#endif + + if (test__start_subtest("skel_api")) + test_skel_api(); + if (test__start_subtest("link_api_pattern")) + test_link_api_pattern(); + if (test__start_subtest("link_api_ids")) + test_link_api_ids(); +} diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_attach.c new file mode 100644 index 000000000000..332d0a423a43 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_attach.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return); + +__u64 test_result_fentry = 0; +__u64 test_result_fexit = 0; + +SEC("fentry.multi/bpf_fentry_test*") +int BPF_PROG(test_fentry) +{ + tracing_multi_arg_check(ctx, &test_result_fentry, false); + return 0; +} + +SEC("fexit.multi/bpf_fentry_test*") +int BPF_PROG(test_fexit) +{ + tracing_multi_arg_check(ctx, &test_result_fexit, true); + return 0; +} + +SEC("fentry.multi.s/bpf_fentry_test1") +int BPF_PROG(test_fentry_s) +{ + tracing_multi_arg_check(ctx, &test_result_fentry, false); + return 0; +} + 
+SEC("fexit.multi.s/bpf_fentry_test1") +int BPF_PROG(test_fexit_s) +{ + tracing_multi_arg_check(ctx, &test_result_fexit, true); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_check.c b/tools/testing/selftests/bpf/progs/tracing_multi_check.c new file mode 100644 index 000000000000..333a3a7bae8a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_check.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +int pid = 0; + +/* bpf_fentry_test1 is exported as kfunc via vmlinux.h */ +extern const void bpf_fentry_test2 __ksym; +extern const void bpf_fentry_test3 __ksym; +extern const void bpf_fentry_test4 __ksym; +extern const void bpf_fentry_test5 __ksym; +extern const void bpf_fentry_test6 __ksym; +extern const void bpf_fentry_test7 __ksym; +extern const void bpf_fentry_test8 __ksym; +extern const void bpf_fentry_test9 __ksym; +extern const void bpf_fentry_test10 __ksym; + +int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) +{ + void *ip = (void *) bpf_get_func_ip(ctx); + __u64 value = 0, ret = 0; + long err = 0; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 1; + + if (is_return) + err |= bpf_get_func_ret(ctx, &ret); + + if (ip == &bpf_fentry_test1) { + int a; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (int) value; + + err |= is_return ? ret != 2 : 0; + + *test_result += err == 0 && a == 1; + } else if (ip == &bpf_fentry_test2) { + __u64 b; + int a; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (int) value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = value; + + err |= is_return ? 
ret != 5 : 0; + + *test_result += err == 0 && a == 2 && b == 3; + } else if (ip == &bpf_fentry_test3) { + __u64 c; + char a; + int b; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (char) value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (int) value; + err |= bpf_get_func_arg(ctx, 2, &value); + c = value; + + err |= is_return ? ret != 15 : 0; + + *test_result += err == 0 && a == 4 && b == 5 && c == 6; + } else if (ip == &bpf_fentry_test4) { + void *a; + char b; + int c; + __u64 d; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (void *) value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (char) value; + err |= bpf_get_func_arg(ctx, 2, &value); + c = (int) value; + err |= bpf_get_func_arg(ctx, 3, &value); + d = value; + + err |= is_return ? ret != 34 : 0; + + *test_result += err == 0 && a == (void *) 7 && b == 8 && c == 9 && d == 10; + } else if (ip == &bpf_fentry_test5) { + __u64 a; + void *b; + short c; + int d; + __u64 e; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (void *) value; + err |= bpf_get_func_arg(ctx, 2, &value); + c = (short) value; + err |= bpf_get_func_arg(ctx, 3, &value); + d = (int) value; + err |= bpf_get_func_arg(ctx, 4, &value); + e = value; + + err |= is_return ? ret != 65 : 0; + + *test_result += err == 0 && a == 11 && b == (void *) 12 && c == 13 && d == 14 && e == 15; + } else if (ip == &bpf_fentry_test6) { + __u64 a; + void *b; + short c; + int d; + void *e; + __u64 f; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (void *) value; + err |= bpf_get_func_arg(ctx, 2, &value); + c = (short) value; + err |= bpf_get_func_arg(ctx, 3, &value); + d = (int) value; + err |= bpf_get_func_arg(ctx, 4, &value); + e = (void *) value; + err |= bpf_get_func_arg(ctx, 5, &value); + f = value; + + err |= is_return ? 
ret != 111 : 0; + + *test_result += err == 0 && a == 16 && b == (void *) 17 && c == 18 && d == 19 && e == (void *) 20 && f == 21; + } else if (ip == &bpf_fentry_test7) { + err |= is_return ? ret != 0 : 0; + + *test_result += err == 0 ? 1 : 0; + } else if (ip == &bpf_fentry_test8) { + err |= is_return ? ret != 0 : 0; + + *test_result += err == 0 ? 1 : 0; + } else if (ip == &bpf_fentry_test9) { + err |= is_return ? ret != 0 : 0; + + *test_result += err == 0 ? 1 : 0; + } else if (ip == &bpf_fentry_test10) { + err |= is_return ? ret != 0 : 0; + + *test_result += err == 0 ? 1 : 0; + } + + return 0; +} -- cgit v1.2.3 From 2863f074f146adf7f63bd567de05ae03fad64a01 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:47 +0200 Subject: selftests/bpf: Add tracing multi skel/pattern/ids module attach tests Adding tests for tracing_multi link attachment via all possible libbpf apis - skeleton, function pattern and btf ids on top of bpf_testmod kernel module. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-23-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 4 +- .../selftests/bpf/prog_tests/tracing_multi.c | 105 +++++++++++++++++++++ .../bpf/progs/tracing_multi_attach_module.c | 25 +++++ .../selftests/bpf/progs/tracing_multi_check.c | 50 ++++++++++ 4 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index fd885beee0fd..ed220558d41b 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -512,7 +512,8 @@ SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ linked_vars.skel.h linked_maps.skel.h \ test_subskeleton.skel.h test_subskeleton_lib.skel.h \ - test_usdt.skel.h tracing_multi.skel.h + test_usdt.skel.h 
tracing_multi.skel.h \ + tracing_multi_module.skel.h LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ @@ -539,6 +540,7 @@ xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o xdp_features.skel.h-deps := xdp_features.bpf.o tracing_multi.skel.h-deps := tracing_multi_attach.bpf.o tracing_multi_check.bpf.o +tracing_multi_module.skel.h-deps := tracing_multi_attach_module.bpf.o tracing_multi_check.bpf.o LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps)) LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c index f333b2514b34..77134f1e2dc3 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c @@ -5,6 +5,7 @@ #include #include "bpf/libbpf_internal.h" #include "tracing_multi.skel.h" +#include "tracing_multi_module.skel.h" #include "trace_helpers.h" static const char * const bpf_fentry_test[] = { @@ -20,6 +21,14 @@ static const char * const bpf_fentry_test[] = { "bpf_fentry_test10", }; +static const char * const bpf_testmod_fentry_test[] = { + "bpf_testmod_fentry_test1", + "bpf_testmod_fentry_test2", + "bpf_testmod_fentry_test3", + "bpf_testmod_fentry_test7", + "bpf_testmod_fentry_test11", +}; + #define FUNCS_CNT (ARRAY_SIZE(bpf_fentry_test)) static int compare(const void *ppa, const void *ppb) @@ -242,6 +251,96 @@ cleanup: free(ids); } +static void test_module_skel_api(void) +{ + struct tracing_multi_module *skel = NULL; + int err; + + skel = tracing_multi_module__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load")) + return; + + skel->bss->pid = getpid(); + + err = tracing_multi_module__attach(skel); + if (!ASSERT_OK(err, "tracing_multi__attach")) + goto cleanup; + + ASSERT_OK(trigger_module_test_read(1), 
"trigger_read"); + ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry"); + ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit"); + +cleanup: + tracing_multi_module__destroy(skel); +} + +static void test_module_link_api_pattern(void) +{ + struct tracing_multi_module *skel = NULL; + + skel = tracing_multi_module__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_module__open_and_load")) + return; + + skel->bss->pid = getpid(); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + "bpf_testmod:bpf_testmod_fentry_test*", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + "bpf_testmod:bpf_testmod_fentry_test*", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); + ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry"); + ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit"); + +cleanup: + tracing_multi_module__destroy(skel); +} + +static void test_module_link_api_ids(void) +{ + size_t cnt = ARRAY_SIZE(bpf_testmod_fentry_test); + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + struct tracing_multi_module *skel = NULL; + __u32 *ids; + + skel = tracing_multi_module__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_module__open_and_load")) + return; + + skel->bss->pid = getpid(); + + ids = get_ids(bpf_testmod_fentry_test, cnt, "bpf_testmod"); + if (!ASSERT_OK_PTR(ids, "get_ids")) + goto cleanup; + + opts.ids = ids; + opts.cnt = cnt; + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + 
NULL, &opts); + if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); + ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry"); + ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit"); + +cleanup: + tracing_multi_module__destroy(skel); + free(ids); +} + void test_tracing_multi_test(void) { #ifndef __x86_64__ @@ -255,4 +354,10 @@ void test_tracing_multi_test(void) test_link_api_pattern(); if (test__start_subtest("link_api_ids")) test_link_api_ids(); + if (test__start_subtest("module_skel_api")) + test_module_skel_api(); + if (test__start_subtest("module_link_api_pattern")) + test_module_link_api_pattern(); + if (test__start_subtest("module_link_api_ids")) + test_module_link_api_ids(); } diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c b/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c new file mode 100644 index 000000000000..b3374f2db450 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return); + +__u64 test_result_fentry = 0; +__u64 test_result_fexit = 0; + +SEC("fentry.multi/bpf_testmod:bpf_testmod_fentry_test*") +int BPF_PROG(test_fentry) +{ + tracing_multi_arg_check(ctx, &test_result_fentry, false); + return 0; +} + +SEC("fexit.multi/bpf_testmod:bpf_testmod_fentry_test*") +int BPF_PROG(test_fexit) +{ + tracing_multi_arg_check(ctx, &test_result_fexit, true); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_check.c b/tools/testing/selftests/bpf/progs/tracing_multi_check.c index 333a3a7bae8a..7ede84c50cb6 100644 --- a/tools/testing/selftests/bpf/progs/tracing_multi_check.c +++ 
b/tools/testing/selftests/bpf/progs/tracing_multi_check.c @@ -18,6 +18,12 @@ extern const void bpf_fentry_test8 __ksym; extern const void bpf_fentry_test9 __ksym; extern const void bpf_fentry_test10 __ksym; +extern const void bpf_testmod_fentry_test1 __ksym; +extern const void bpf_testmod_fentry_test2 __ksym; +extern const void bpf_testmod_fentry_test3 __ksym; +extern const void bpf_testmod_fentry_test7 __ksym; +extern const void bpf_testmod_fentry_test11 __ksym; + int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) { void *ip = (void *) bpf_get_func_ip(ctx); @@ -145,6 +151,50 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) err |= is_return ? ret != 0 : 0; *test_result += err == 0 ? 1 : 0; + } else if (ip == &bpf_testmod_fentry_test1) { + int a; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (int) value; + + err |= is_return ? ret != 2 : 0; + + *test_result += err == 0 && a == 1; + } else if (ip == &bpf_testmod_fentry_test2) { + int a; + __u64 b; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (int) value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (__u64) value; + + err |= is_return ? ret != 5 : 0; + + *test_result += err == 0 && a == 2 && b == 3; + } else if (ip == &bpf_testmod_fentry_test3) { + char a; + int b; + __u64 c; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (char) value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (int) value; + err |= bpf_get_func_arg(ctx, 2, &value); + c = (__u64) value; + + err |= is_return ? ret != 15 : 0; + + *test_result += err == 0 && a == 4 && b == 5 && c == 6; + } else if (ip == &bpf_testmod_fentry_test7) { + err |= is_return ? ret != 133 : 0; + + *test_result += err == 0; + } else if (ip == &bpf_testmod_fentry_test11) { + err |= is_return ? 
ret != 231 : 0; + + *test_result += err == 0; } return 0; -- cgit v1.2.3 From 4309f580a0a6608bd0c0fe090ef5283173ff4f1a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:48 +0200 Subject: selftests/bpf: Add tracing multi intersect tests Adding tracing multi tests for intersecting attached functions. Using the bits of a mask (values 1 to 15) to select which of the (up to 4) programs get attached, and randomly choosing the bpf_fentry_test* functions they are attached to. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-24-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 4 +- .../selftests/bpf/prog_tests/tracing_multi.c | 99 ++++++++++++++++++++++ .../bpf/progs/tracing_multi_intersect_attach.c | 41 +++++++++ 3 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ed220558d41b..2b5688c97006 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -513,7 +513,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ linked_vars.skel.h linked_maps.skel.h \ test_subskeleton.skel.h test_subskeleton_lib.skel.h \ test_usdt.skel.h tracing_multi.skel.h \ - tracing_multi_module.skel.h + tracing_multi_module.skel.h \ + tracing_multi_intersect.skel.h LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ @@ -541,6 +542,7 @@ xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o xdp_features.skel.h-deps := xdp_features.bpf.o tracing_multi.skel.h-deps := tracing_multi_attach.bpf.o tracing_multi_check.bpf.o tracing_multi_module.skel.h-deps := tracing_multi_attach_module.bpf.o tracing_multi_check.bpf.o +tracing_multi_intersect.skel.h-deps := tracing_multi_intersect_attach.bpf.o tracing_multi_check.bpf.o LINKED_BPF_OBJS := $(foreach 
skel,$(LINKED_SKELS),$($(skel)-deps)) LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c index 77134f1e2dc3..4dd610e74f9a 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c @@ -6,6 +6,7 @@ #include "bpf/libbpf_internal.h" #include "tracing_multi.skel.h" #include "tracing_multi_module.skel.h" +#include "tracing_multi_intersect.skel.h" #include "trace_helpers.h" static const char * const bpf_fentry_test[] = { @@ -31,6 +32,20 @@ static const char * const bpf_testmod_fentry_test[] = { #define FUNCS_CNT (ARRAY_SIZE(bpf_fentry_test)) +static int get_random_funcs(const char **funcs) +{ + int i, cnt = 0; + + for (i = 0; i < FUNCS_CNT; i++) { + if (rand() % 2) + funcs[cnt++] = bpf_fentry_test[i]; + } + /* we always need at least one.. */ + if (!cnt) + funcs[cnt++] = bpf_fentry_test[rand() % FUNCS_CNT]; + return cnt; +} + static int compare(const void *ppa, const void *ppb) { const char *pa = *(const char **) ppa; @@ -341,6 +356,88 @@ cleanup: free(ids); } +static bool is_set(__u32 mask, __u32 bit) +{ + return (1 << bit) & mask; +} + +static void __test_intersect(__u32 mask, const struct bpf_program *progs[4], __u64 *test_results[4]) +{ + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_link *links[4] = { NULL }; + const char *funcs[FUNCS_CNT]; + __u64 expected[4]; + __u32 *ids, i; + int err, cnt; + + /* + * We have 4 programs in progs and the mask bits pick which + * of them gets attached to randomly chosen functions. 
+ */ + for (i = 0; i < 4; i++) { + if (!is_set(mask, i)) + continue; + + cnt = get_random_funcs(funcs); + ids = get_ids(funcs, cnt, NULL); + if (!ASSERT_OK_PTR(ids, "get_ids")) + goto cleanup; + + opts.ids = ids; + opts.cnt = cnt; + links[i] = bpf_program__attach_tracing_multi(progs[i], NULL, &opts); + free(ids); + + if (!ASSERT_OK_PTR(links[i], "bpf_program__attach_tracing_multi")) + goto cleanup; + + expected[i] = *test_results[i] + cnt; + } + + err = bpf_prog_test_run_opts(bpf_program__fd(progs[0]), &topts); + ASSERT_OK(err, "test_run"); + + for (i = 0; i < 4; i++) { + if (!is_set(mask, i)) + continue; + ASSERT_EQ(*test_results[i], expected[i], "test_results"); + } + +cleanup: + for (i = 0; i < 4; i++) + bpf_link__destroy(links[i]); +} + +static void test_intersect(void) +{ + struct tracing_multi_intersect *skel; + const struct bpf_program *progs[4]; + __u64 *test_results[4]; + __u32 i; + + skel = tracing_multi_intersect__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_intersect__open_and_load")) + return; + + skel->bss->pid = getpid(); + + progs[0] = skel->progs.fentry_1; + progs[1] = skel->progs.fexit_1; + progs[2] = skel->progs.fentry_2; + progs[3] = skel->progs.fexit_2; + + test_results[0] = &skel->bss->test_result_fentry_1; + test_results[1] = &skel->bss->test_result_fexit_1; + test_results[2] = &skel->bss->test_result_fentry_2; + test_results[3] = &skel->bss->test_result_fexit_2; + + for (i = 1; i < 16; i++) + __test_intersect(i, progs, test_results); + + tracing_multi_intersect__destroy(skel); +} + void test_tracing_multi_test(void) { #ifndef __x86_64__ @@ -360,4 +457,6 @@ void test_tracing_multi_test(void) test_module_link_api_pattern(); if (test__start_subtest("module_link_api_ids")) test_module_link_api_ids(); + if (test__start_subtest("intersect")) + test_intersect(); } diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c new file mode 100644 
index 000000000000..cd5be0bb6ffd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return); + +__u64 test_result_fentry_1 = 0; +__u64 test_result_fentry_2 = 0; +__u64 test_result_fexit_1 = 0; +__u64 test_result_fexit_2 = 0; + +SEC("fentry.multi") +int BPF_PROG(fentry_1) +{ + tracing_multi_arg_check(ctx, &test_result_fentry_1, false); + return 0; +} + +SEC("fentry.multi") +int BPF_PROG(fentry_2) +{ + tracing_multi_arg_check(ctx, &test_result_fentry_2, false); + return 0; +} + +SEC("fexit.multi") +int BPF_PROG(fexit_1) +{ + tracing_multi_arg_check(ctx, &test_result_fexit_1, true); + return 0; +} + +SEC("fexit.multi") +int BPF_PROG(fexit_2) +{ + tracing_multi_arg_check(ctx, &test_result_fexit_2, true); + return 0; +} -- cgit v1.2.3 From 1b938f42f5fa1789d0dcc2b9aa6262edba3a7f51 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:49 +0200 Subject: selftests/bpf: Add tracing multi cookies test Adding tests for using cookies on tracing multi link. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-25-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/tracing_multi.c | 23 ++++++++++++++++++++-- .../selftests/bpf/progs/tracing_multi_check.c | 15 +++++++++++++- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c index 4dd610e74f9a..0f066063cb82 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c @@ -9,6 +9,19 @@ #include "tracing_multi_intersect.skel.h" #include "trace_helpers.h" +static __u64 bpf_fentry_test_cookies[] = { + 8, /* bpf_fentry_test1 */ + 9, /* bpf_fentry_test2 */ + 7, /* bpf_fentry_test3 */ + 5, /* bpf_fentry_test4 */ + 4, /* bpf_fentry_test5 */ + 2, /* bpf_fentry_test6 */ + 3, /* bpf_fentry_test7 */ + 1, /* bpf_fentry_test8 */ + 10, /* bpf_fentry_test9 */ + 6, /* bpf_fentry_test10 */ +}; + static const char * const bpf_fentry_test[] = { "bpf_fentry_test1", "bpf_fentry_test2", @@ -217,7 +230,7 @@ cleanup: tracing_multi__destroy(skel); } -static void test_link_api_ids(void) +static void test_link_api_ids(bool test_cookies) { LIBBPF_OPTS(bpf_tracing_multi_opts, opts); struct tracing_multi *skel; @@ -229,6 +242,7 @@ static void test_link_api_ids(void) return; skel->bss->pid = getpid(); + skel->bss->test_cookies = test_cookies; ids = get_ids(bpf_fentry_test, cnt, NULL); if (!ASSERT_OK_PTR(ids, "get_ids")) @@ -237,6 +251,9 @@ static void test_link_api_ids(void) opts.ids = ids; opts.cnt = cnt; + if (test_cookies) + opts.cookies = bpf_fentry_test_cookies; + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, NULL, &opts); if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) @@ -450,7 +467,7 @@ void test_tracing_multi_test(void) if (test__start_subtest("link_api_pattern")) 
test_link_api_pattern(); if (test__start_subtest("link_api_ids")) - test_link_api_ids(); + test_link_api_ids(false); if (test__start_subtest("module_skel_api")) test_module_skel_api(); if (test__start_subtest("module_link_api_pattern")) @@ -459,4 +476,6 @@ void test_tracing_multi_test(void) test_module_link_api_ids(); if (test__start_subtest("intersect")) test_intersect(); + if (test__start_subtest("cookies")) + test_link_api_ids(true); } diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_check.c b/tools/testing/selftests/bpf/progs/tracing_multi_check.c index 7ede84c50cb6..b2959ba71179 100644 --- a/tools/testing/selftests/bpf/progs/tracing_multi_check.c +++ b/tools/testing/selftests/bpf/progs/tracing_multi_check.c @@ -6,6 +6,7 @@ char _license[] SEC("license") = "GPL"; int pid = 0; +bool test_cookies = false; /* bpf_fentry_test1 is exported as kfunc via vmlinux.h */ extern const void bpf_fentry_test2 __ksym; @@ -27,7 +28,7 @@ extern const void bpf_testmod_fentry_test11 __ksym; int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) { void *ip = (void *) bpf_get_func_ip(ctx); - __u64 value = 0, ret = 0; + __u64 value = 0, ret = 0, cookie = 0; long err = 0; if (bpf_get_current_pid_tgid() >> 32 != pid) @@ -35,6 +36,8 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) if (is_return) err |= bpf_get_func_ret(ctx, &ret); + if (test_cookies) + cookie = bpf_get_attach_cookie(ctx); if (ip == &bpf_fentry_test1) { int a; @@ -43,6 +46,7 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) a = (int) value; err |= is_return ? ret != 2 : 0; + err |= test_cookies ? cookie != 8 : 0; *test_result += err == 0 && a == 1; } else if (ip == &bpf_fentry_test2) { @@ -55,6 +59,7 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) b = value; err |= is_return ? ret != 5 : 0; + err |= test_cookies ? 
cookie != 9 : 0; *test_result += err == 0 && a == 2 && b == 3; } else if (ip == &bpf_fentry_test3) { @@ -70,6 +75,7 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) c = value; err |= is_return ? ret != 15 : 0; + err |= test_cookies ? cookie != 7 : 0; *test_result += err == 0 && a == 4 && b == 5 && c == 6; } else if (ip == &bpf_fentry_test4) { @@ -88,6 +94,7 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) d = value; err |= is_return ? ret != 34 : 0; + err |= test_cookies ? cookie != 5 : 0; *test_result += err == 0 && a == (void *) 7 && b == 8 && c == 9 && d == 10; } else if (ip == &bpf_fentry_test5) { @@ -109,6 +116,7 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) e = value; err |= is_return ? ret != 65 : 0; + err |= test_cookies ? cookie != 4 : 0; *test_result += err == 0 && a == 11 && b == (void *) 12 && c == 13 && d == 14 && e == 15; } else if (ip == &bpf_fentry_test6) { @@ -133,22 +141,27 @@ int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) f = value; err |= is_return ? ret != 111 : 0; + err |= test_cookies ? cookie != 2 : 0; *test_result += err == 0 && a == 16 && b == (void *) 17 && c == 18 && d == 19 && e == (void *) 20 && f == 21; } else if (ip == &bpf_fentry_test7) { err |= is_return ? ret != 0 : 0; + err |= test_cookies ? cookie != 3 : 0; *test_result += err == 0 ? 1 : 0; } else if (ip == &bpf_fentry_test8) { err |= is_return ? ret != 0 : 0; + err |= test_cookies ? cookie != 1 : 0; *test_result += err == 0 ? 1 : 0; } else if (ip == &bpf_fentry_test9) { err |= is_return ? ret != 0 : 0; + err |= test_cookies ? cookie != 10 : 0; *test_result += err == 0 ? 1 : 0; } else if (ip == &bpf_fentry_test10) { err |= is_return ? ret != 0 : 0; + err |= test_cookies ? cookie != 6 : 0; *test_result += err == 0 ? 
1 : 0; } else if (ip == &bpf_testmod_fentry_test1) { -- cgit v1.2.3 From 69f25d4b0c17cc947ce26391cac0015182b07dc0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:50 +0200 Subject: selftests/bpf: Add tracing multi session test Adding tests for tracing multi link session. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-26-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 4 +- .../selftests/bpf/prog_tests/tracing_multi.c | 45 +++++++++++++++ .../bpf/progs/tracing_multi_session_attach.c | 65 ++++++++++++++++++++++ 3 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2b5688c97006..d53b7e496ac9 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -514,7 +514,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ test_subskeleton.skel.h test_subskeleton_lib.skel.h \ test_usdt.skel.h tracing_multi.skel.h \ tracing_multi_module.skel.h \ - tracing_multi_intersect.skel.h + tracing_multi_intersect.skel.h \ + tracing_multi_session.skel.h LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ @@ -543,6 +544,7 @@ xdp_features.skel.h-deps := xdp_features.bpf.o tracing_multi.skel.h-deps := tracing_multi_attach.bpf.o tracing_multi_check.bpf.o tracing_multi_module.skel.h-deps := tracing_multi_attach_module.bpf.o tracing_multi_check.bpf.o tracing_multi_intersect.skel.h-deps := tracing_multi_intersect_attach.bpf.o tracing_multi_check.bpf.o +tracing_multi_session.skel.h-deps := tracing_multi_session_attach.bpf.o tracing_multi_check.bpf.o LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps)) LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) diff --git 
a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c index 0f066063cb82..05683b8d0680 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c @@ -7,6 +7,7 @@ #include "tracing_multi.skel.h" #include "tracing_multi_module.skel.h" #include "tracing_multi_intersect.skel.h" +#include "tracing_multi_session.skel.h" #include "trace_helpers.h" static __u64 bpf_fentry_test_cookies[] = { @@ -455,6 +456,48 @@ static void test_intersect(void) tracing_multi_intersect__destroy(skel); } +static void test_session(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct tracing_multi_session *skel; + int err, prog_fd; + + skel = tracing_multi_session__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_session__open_and_load")) + return; + + skel->bss->pid = getpid(); + + err = tracing_multi_session__attach(skel); + if (!ASSERT_OK(err, "tracing_multi_session__attach")) + goto cleanup; + + /* execute kernel session */ + prog_fd = bpf_program__fd(skel->progs.test_session_1); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + + /* 10 for test_session_1, 1 for test_fsession_s */ + ASSERT_EQ(skel->bss->test_result_fentry, 11, "test_result_fentry"); + /* extra count (+1 for each fexit execution) for test_result_fexit cookie check/inc */ + ASSERT_EQ(skel->bss->test_result_fexit, 22, "test_result_fexit"); + + skel->bss->test_result_fentry = 0; + skel->bss->test_result_fexit = 0; + + /* execute bpf_testmo.ko session */ + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); + + /* 5 for test_session_2 */ + ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry"); + /* extra count (+1 for each fexit execution) for test_result_fexit cookie */ + ASSERT_EQ(skel->bss->test_result_fexit, 10, "test_result_fexit"); + + +cleanup: + tracing_multi_session__destroy(skel); +} + void test_tracing_multi_test(void) { #ifndef 
__x86_64__ @@ -478,4 +521,6 @@ void test_tracing_multi_test(void) test_intersect(); if (test__start_subtest("cookies")) test_link_api_ids(true); + if (test__start_subtest("session")) + test_session(); } diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c new file mode 100644 index 000000000000..7c9a46016ccd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return); + +__u64 test_result_fentry = 0; +__u64 test_result_fexit = 0; + +SEC("fsession.multi/bpf_fentry_test*") +int BPF_PROG(test_session_1) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (bpf_session_is_return(ctx)) { + if (tracing_multi_arg_check(ctx, &test_result_fexit, true)) + return 0; + /* extra count for test_result_fexit cookie */ + test_result_fexit += *cookie == 0xbeafbeafbeafbeaf; + } else { + if (tracing_multi_arg_check(ctx, &test_result_fentry, false)) + return 0; + *cookie = 0xbeafbeafbeafbeaf; + } + return 0; +} + +SEC("fsession.multi.s/bpf_fentry_test1") +int BPF_PROG(test_fsession_s) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (bpf_session_is_return(ctx)) { + if (tracing_multi_arg_check(ctx, &test_result_fexit, true)) + return 0; + /* extra count for test_result_fexit cookie */ + test_result_fexit += *cookie == 0xbeafbeafbeafbeaf; + } else { + if (tracing_multi_arg_check(ctx, &test_result_fentry, false)) + return 0; + *cookie = 0xbeafbeafbeafbeaf; + } + return 0; +} + +SEC("fsession.multi/bpf_testmod:bpf_testmod_fentry_test*") +int BPF_PROG(test_session_2) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (bpf_session_is_return(ctx)) { + if (tracing_multi_arg_check(ctx, &test_result_fexit, true)) + return 0; + /* 
extra count for test_result_fexit cookie */ + test_result_fexit += *cookie == 0xbeafbeafbeafbeaf; + } else { + if (tracing_multi_arg_check(ctx, &test_result_fentry, false)) + return 0; + *cookie = 0xbeafbeafbeafbeaf; + } + return 0; +} -- cgit v1.2.3 From 1fd8328549979d96540252fa826481df93885a5a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:51 +0200 Subject: selftests/bpf: Add tracing multi attach fails test Adding tests for attach fails on tracing multi link. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-27-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/tracing_multi.c | 96 ++++++++++++++++++++++ .../selftests/bpf/progs/tracing_multi_fail.c | 18 ++++ 2 files changed, 114 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_fail.c diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c index 05683b8d0680..7e1bb071ce2a 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c @@ -8,6 +8,7 @@ #include "tracing_multi_module.skel.h" #include "tracing_multi_intersect.skel.h" #include "tracing_multi_session.skel.h" +#include "tracing_multi_fail.skel.h" #include "trace_helpers.h" static __u64 bpf_fentry_test_cookies[] = { @@ -498,6 +499,99 @@ cleanup: tracing_multi_session__destroy(skel); } +static void test_attach_api_fails(void) +{ + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + static const char * const func[] = { + "bpf_fentry_test2", + }; + struct tracing_multi_fail *skel = NULL; + __u32 ids[2] = {}, *ids2 = NULL; + __u64 cookies[2]; + + skel = tracing_multi_fail__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_fail__open_and_load")) + return; + + /* fail#1 (libbpf) pattern and opts NULL */ + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, NULL); + if 
(!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_1")) + goto cleanup; + + /* fail#2 (libbpf) pattern and ids */ + LIBBPF_OPTS_RESET(opts, + .ids = ids, + .cnt = 2, + ); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + "bpf_fentry_test*", &opts); + if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_2")) + goto cleanup; + + /* fail#3 (libbpf) pattern and cookies */ + LIBBPF_OPTS_RESET(opts, + .ids = NULL, + .cnt = 2, + .cookies = cookies, + ); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + "bpf_fentry_test*", &opts); + if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_3")) + goto cleanup; + + /* fail#4 (libbpf) bogus pattern */ + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + "bpf_not_really_a_function*", NULL); + if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_4")) + goto cleanup; + + /* fail#5 (kernel) abnormal cnt */ + LIBBPF_OPTS_RESET(opts, + .ids = ids, + .cnt = INT_MAX, + ); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -E2BIG, "fail_5")) + goto cleanup; + + /* fail#6 (kernel) attach sleepable program to not-allowed function */ + ids2 = get_ids(func, 1, NULL); + if (!ASSERT_OK_PTR(ids2, "get_ids")) + goto cleanup; + + LIBBPF_OPTS_RESET(opts, + .ids = ids2, + .cnt = 1, + ); + + skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s, + NULL, &opts); + if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry_s), -EINVAL, "fail_6")) + goto cleanup; + + /* fail#7 (kernel) attach with duplicate id */ + ids[0] = ids2[0]; + ids[1] = ids2[0]; + + LIBBPF_OPTS_RESET(opts, + .ids = ids, + .cnt = 2, + ); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + 
ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_7"); + +cleanup: + tracing_multi_fail__destroy(skel); + free(ids2); +} + void test_tracing_multi_test(void) { #ifndef __x86_64__ @@ -523,4 +617,6 @@ void test_tracing_multi_test(void) test_link_api_ids(true); if (test__start_subtest("session")) test_session(); + if (test__start_subtest("attach_api_fails")) + test_attach_api_fails(); } diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_fail.c b/tools/testing/selftests/bpf/progs/tracing_multi_fail.c new file mode 100644 index 000000000000..7f0375f4213d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_fail.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +SEC("fentry.multi") +int BPF_PROG(test_fentry) +{ + return 0; +} + +SEC("fentry.multi.s") +int BPF_PROG(test_fentry_s) +{ + return 0; +} -- cgit v1.2.3 From 443c91d08c4bf48caeab6243edaca4e987573d8a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:52 +0200 Subject: selftests/bpf: Add tracing multi verifier fails test Adding tests for verifier fails on tracing multi programs. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-28-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/tracing_multi.c | 2 ++ .../selftests/bpf/progs/tracing_multi_verifier.c | 31 ++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_verifier.c diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c index 7e1bb071ce2a..9e026f2b254d 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c @@ -9,6 +9,7 @@ #include "tracing_multi_intersect.skel.h" #include "tracing_multi_session.skel.h" #include "tracing_multi_fail.skel.h" +#include "tracing_multi_verifier.skel.h" #include "trace_helpers.h" static __u64 bpf_fentry_test_cookies[] = { @@ -619,4 +620,5 @@ void test_tracing_multi_test(void) test_session(); if (test__start_subtest("attach_api_fails")) test_attach_api_fails(); + RUN_TESTS(tracing_multi_verifier); } diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c b/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c new file mode 100644 index 000000000000..7b6ed41bf452 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +SEC("fentry.multi/bpf_fentry_test1") +__failure +__msg("func 'bpf_multi_func' doesn't have 1-th argument") +int BPF_PROG(fentry_direct_access, int a) +{ + return a; +} + +SEC("fexit.multi/bpf_fentry_test3") +__failure +__msg("invalid bpf_context access off=24 size=8") +int BPF_PROG(fexit_direct_access, char a, int b, __u64 c, int ret) +{ + return ret; +} + +SEC("fsession.multi/bpf_fentry_test4") +__failure +__msg("invalid bpf_context access off=16 size=8") +int 
BPF_PROG(fsession_direct_access, void *a, char b, int c, __u64 d, int ret) +{ + return c; +} -- cgit v1.2.3 From 4db8f60b6baf64f4f405bc8eb92a36315b353481 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:53 +0200 Subject: selftests/bpf: Add tracing multi attach benchmark test Adding benchmark test that attaches to (almost) all allowed tracing functions and displays attach/detach times. # ./test_progs -t tracing_multi_bench_attach -v bpf_testmod.ko is already unloaded. Loading bpf_testmod.ko... Successfully loaded bpf_testmod.ko. serial_test_tracing_multi_bench_attach:PASS:btf__load_vmlinux_btf 0 nsec serial_test_tracing_multi_bench_attach:PASS:tracing_multi_bench__open_and_load 0 nsec serial_test_tracing_multi_bench_attach:PASS:get_syms 0 nsec serial_test_tracing_multi_bench_attach:PASS:bpf_program__attach_tracing_multi 0 nsec serial_test_tracing_multi_bench_attach: found 51186 functions serial_test_tracing_multi_bench_attach: attached in 1.295s serial_test_tracing_multi_bench_attach: detached in 0.243s #507 tracing_multi_bench_attach:OK Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Successfully unloaded bpf_testmod.ko. Exporting skip_entry as is_unsafe_function and using it in the test. Also updating trace_blacklist with ___migrate_enable to be in sync with kernel functions deny list. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-29-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/tracing_multi.c | 124 +++++++++++++++++++++ .../selftests/bpf/progs/tracing_multi_bench.c | 12 ++ tools/testing/selftests/bpf/trace_helpers.c | 7 +- tools/testing/selftests/bpf/trace_helpers.h | 1 + 4 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_bench.c diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c index 9e026f2b254d..cb39bf610823 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c @@ -10,6 +10,7 @@ #include "tracing_multi_session.skel.h" #include "tracing_multi_fail.skel.h" #include "tracing_multi_verifier.skel.h" +#include "tracing_multi_bench.skel.h" #include "trace_helpers.h" static __u64 bpf_fentry_test_cookies[] = { @@ -593,6 +594,129 @@ cleanup: free(ids2); } +void serial_test_tracing_multi_bench_attach(void) +{ + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + struct tracing_multi_bench *skel = NULL; + long attach_start_ns, attach_end_ns; + long detach_start_ns, detach_end_ns; + double attach_delta, detach_delta; + struct bpf_link *link = NULL; + size_t i, cap = 0, cnt = 0; + struct ksyms *ksyms = NULL; + void *root = NULL; + void *dups = NULL; + __u32 *ids = NULL; + __u32 nr, type_id; + struct btf *btf; + int err; + +#ifndef __x86_64__ + test__skip(); + return; +#endif + + btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf")) + return; + + skel = tracing_multi_bench__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_bench__open_and_load")) + goto cleanup; + + if (!ASSERT_OK(bpf_get_ksyms(&ksyms, true), "get_syms")) + goto cleanup; + + /* Get all ftrace 'safe' symbols.. 
*/ + for (i = 0; i < ksyms->filtered_cnt; i++) { + if (!tsearch(&ksyms->filtered_syms[i], &root, compare)) { + ASSERT_FAIL("tsearch failed"); + goto cleanup; + } + } + + /* + * Collect names that are not unique in kallsyms. The kernel resolves a + * tracing-multi BTF id to an address with kallsyms_lookup_name(), which + * returns the first symbol of that name. For a duplicate name that may + * be a different (non-ftrace-able) instance than the ftrace-able one in + * available_filter_functions, so attaching to it by BTF id fails with + * -ENOENT (e.g. t_start/t_next/t_stop). ksyms->syms is sorted by name, + * so equal names are adjacent. + */ + for (i = 1; i < ksyms->sym_cnt; i++) { + if (strcmp(ksyms->syms[i].name, ksyms->syms[i - 1].name)) + continue; + if (!tsearch(&ksyms->syms[i].name, &dups, compare)) { + ASSERT_FAIL("tsearch failed"); + goto cleanup; + } + } + + /* ..and filter them through BTF and btf_type_is_traceable_func. */ + nr = btf__type_cnt(btf); + for (type_id = 1; type_id < nr; type_id++) { + const struct btf_type *type; + const char *str; + + type = btf__type_by_id(btf, type_id); + if (!type) + break; + + if (BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) + continue; + + str = btf__name_by_offset(btf, type->name_off); + if (!str) + break; + + if (!tfind(&str, &root, compare)) + continue; + + /* Skip names that are not unique in kallsyms, see above. 
*/ + if (tfind(&str, &dups, compare)) + continue; + + if (!btf_type_is_traceable_func(btf, type)) + continue; + + err = libbpf_ensure_mem((void **) &ids, &cap, sizeof(*ids), cnt + 1); + if (err) + goto cleanup; + + ids[cnt++] = type_id; + } + + opts.ids = ids; + opts.cnt = cnt; + + attach_start_ns = get_time_ns(); + link = bpf_program__attach_tracing_multi(skel->progs.bench, NULL, &opts); + attach_end_ns = get_time_ns(); + + if (!ASSERT_OK_PTR(link, "bpf_program__attach_tracing_multi")) + goto cleanup; + + detach_start_ns = get_time_ns(); + bpf_link__destroy(link); + detach_end_ns = get_time_ns(); + + attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0; + detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0; + + printf("%s: found %lu functions\n", __func__, cnt); + printf("%s: attached in %7.3lfs\n", __func__, attach_delta); + printf("%s: detached in %7.3lfs\n", __func__, detach_delta); + +cleanup: + tracing_multi_bench__destroy(skel); + tdestroy(root, tdestroy_free_nop); + tdestroy(dups, tdestroy_free_nop); + free_kallsyms_local(ksyms); + free(ids); + btf__free(btf); +} + void test_tracing_multi_test(void) { #ifndef __x86_64__ diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_bench.c b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c new file mode 100644 index 000000000000..beae946cb8c4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +SEC("fentry.multi") +int BPF_PROG(bench) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 0e63daf83ed5..679008b310d9 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -546,9 +546,10 @@ static const char * const trace_blacklist[] = { "__rcu_read_lock", "__rcu_read_unlock", "bpf_get_numa_node_id", + 
"___migrate_enable", }; -static bool skip_entry(char *name) +bool is_unsafe_function(const char *name) { int i; @@ -651,7 +652,7 @@ int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel) free(name); if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) continue; - if (skip_entry(name)) + if (is_unsafe_function(name)) continue; ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); @@ -728,7 +729,7 @@ int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) free(name); if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) continue; - if (skip_entry(name)) + if (is_unsafe_function(name)) continue; if (cnt == max_cnt) { diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index d5bf1433675d..01c8ecc45627 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -63,4 +63,5 @@ int read_build_id(const char *path, char *build_id, size_t size); int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel); int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel); +bool is_unsafe_function(const char *name); #endif -- cgit v1.2.3 From b349efe49a123f032e54d7e894d708ea5daa10d2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:54 +0200 Subject: selftests/bpf: Add tracing multi attach rollback tests Adding tests for the rollback code when the tracing_multi link won't get attached, covering 2 reasons: - wrong btf id passed by user, where all previously allocated trampolines will be released - trampoline for requested function is fully attached (has already maximum programs attached) and the link fails, the rollback code needs to unlink all previously link-ed trampolines and release them We need the bpf_fentry_test* unattached for the tests to pass, so the rollback tests are serial. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-30-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/tracing_multi.c | 212 +++++++++++++++++++++ .../selftests/bpf/progs/tracing_multi_rollback.c | 43 +++++ 2 files changed, 255 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_rollback.c diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c index cb39bf610823..f02ffc7f41d7 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c @@ -11,6 +11,7 @@ #include "tracing_multi_fail.skel.h" #include "tracing_multi_verifier.skel.h" #include "tracing_multi_bench.skel.h" +#include "tracing_multi_rollback.skel.h" #include "trace_helpers.h" static __u64 bpf_fentry_test_cookies[] = { @@ -717,6 +718,217 @@ cleanup: btf__free(btf); } +static void tracing_multi_rollback_run(struct tracing_multi_rollback *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + prog_fd = bpf_program__fd(skel->progs.test_fentry); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + + /* make sure the rollback code did not leave any program attached */ + ASSERT_EQ(skel->bss->test_result_fentry, 0, "test_result_fentry"); + ASSERT_EQ(skel->bss->test_result_fexit, 0, "test_result_fexit"); +} + +static void test_rollback_put(void) +{ + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + struct tracing_multi_rollback *skel = NULL; + size_t cnt = FUNCS_CNT; + __u32 *ids = NULL; + int err; + + skel = tracing_multi_rollback__open(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open")) + return; + + bpf_program__set_autoload(skel->progs.test_fentry, true); + bpf_program__set_autoload(skel->progs.test_fexit, true); + + err = tracing_multi_rollback__load(skel); + if (!ASSERT_OK(err, "tracing_multi_rollback__load")) + goto cleanup; + + 
ids = get_ids(bpf_fentry_test, cnt, NULL); + if (!ASSERT_OK_PTR(ids, "get_ids")) + goto cleanup; + + /* + * Mangle last id to trigger rollback, which needs to do put + * on get-ed trampolines. + */ + ids[9] = 0; + + opts.ids = ids; + opts.cnt = cnt; + + skel->bss->pid = getpid(); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + if (!ASSERT_ERR_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + NULL, &opts); + if (!ASSERT_ERR_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + /* We don't really attach any program, but let's make sure. */ + tracing_multi_rollback_run(skel); + +cleanup: + tracing_multi_rollback__destroy(skel); + free(ids); +} + +static void fillers_cleanup(struct tracing_multi_rollback **skels, int cnt) +{ + int i; + + for (i = 0; i < cnt; i++) + tracing_multi_rollback__destroy(skels[i]); + + free(skels); +} + +static struct tracing_multi_rollback *extra_load_and_link(void) +{ + struct tracing_multi_rollback *skel; + int err; + + skel = tracing_multi_rollback__open(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open")) + goto cleanup; + + bpf_program__set_autoload(skel->progs.extra, true); + + err = tracing_multi_rollback__load(skel); + if (!ASSERT_OK(err, "tracing_multi_rollback__load")) + goto cleanup; + + skel->links.extra = bpf_program__attach_trace(skel->progs.extra); + if (!ASSERT_OK_PTR(skel->links.extra, "bpf_program__attach_trace")) + goto cleanup; + + return skel; + +cleanup: + tracing_multi_rollback__destroy(skel); + return NULL; +} + +static struct tracing_multi_rollback **fillers_load_and_link(int max) +{ + struct tracing_multi_rollback **skels, *skel; + int i, err; + + skels = calloc(max + 1, sizeof(*skels)); + if (!ASSERT_OK_PTR(skels, "calloc")) + return NULL; + + for (i = 0; i < max; i++) { + skel = skels[i] = 
tracing_multi_rollback__open(); + if (!ASSERT_OK_PTR(skels[i], "tracing_multi_rollback__open")) + goto cleanup; + + bpf_program__set_autoload(skel->progs.filler, true); + + err = tracing_multi_rollback__load(skel); + if (!ASSERT_OK(err, "tracing_multi_rollback__load")) + goto cleanup; + + skel->links.filler = bpf_program__attach_trace(skel->progs.filler); + if (!ASSERT_OK_PTR(skels[i]->links.filler, "bpf_program__attach_trace")) + goto cleanup; + } + + return skels; + +cleanup: + fillers_cleanup(skels, i + 1); + return NULL; +} + +static void test_rollback_unlink(void) +{ + struct tracing_multi_rollback *skel = NULL, *extra; + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + struct tracing_multi_rollback **fillers; + size_t cnt = FUNCS_CNT; + __u32 *ids = NULL; + int err, max; + + max = get_bpf_max_tramp_links(); + if (!ASSERT_GE(max, 1, "bpf_max_tramp_links")) + return; + + /* Attach maximum allowed programs to bpf_fentry_test10 */ + fillers = fillers_load_and_link(max); + if (!ASSERT_OK_PTR(fillers, "fillers_load_and_link")) + return; + + extra = extra_load_and_link(); + if (!ASSERT_OK_PTR(extra, "extra_load_and_link")) + goto cleanup; + + skel = tracing_multi_rollback__open(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open")) + goto cleanup; + + bpf_program__set_autoload(skel->progs.test_fentry, true); + bpf_program__set_autoload(skel->progs.test_fexit, true); + + /* + * Attach tracing_multi link on bpf_fentry_test1-10, which will + * fail on bpf_fentry_test10 function, because it already has + * maximum allowed programs attached. + * + * The rollback needs to unlink already link-ed trampolines and + * put all of them. 
+ */ + err = tracing_multi_rollback__load(skel); + if (!ASSERT_OK(err, "tracing_multi_rollback__load")) + goto cleanup; + + ids = get_ids(bpf_fentry_test, cnt, NULL); + if (!ASSERT_OK_PTR(ids, "get_ids")) + goto cleanup; + + opts.ids = ids; + opts.cnt = cnt; + + skel->bss->pid = getpid(); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + if (!ASSERT_ERR_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + NULL, &opts); + if (!ASSERT_ERR_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + tracing_multi_rollback_run(skel); + +cleanup: + fillers_cleanup(fillers, max); + tracing_multi_rollback__destroy(extra); + tracing_multi_rollback__destroy(skel); + free(ids); +} + +void serial_test_tracing_multi_attach_rollback(void) +{ + if (test__start_subtest("put")) + test_rollback_put(); + if (test__start_subtest("unlink")) + test_rollback_unlink(); +} + void test_tracing_multi_test(void) { #ifndef __x86_64__ diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c b/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c new file mode 100644 index 000000000000..a49d1d841f3a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +int pid = 0; + +__u64 test_result_fentry = 0; +__u64 test_result_fexit = 0; + +SEC("?fentry.multi") +int BPF_PROG(test_fentry) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + test_result_fentry++; + return 0; +} + +SEC("?fexit.multi") +int BPF_PROG(test_fexit) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + test_result_fexit++; + return 0; +} + +SEC("?fentry/bpf_fentry_test1") +int BPF_PROG(extra) +{ + return 0; +} + 
+SEC("?fentry/bpf_fentry_test10") +int BPF_PROG(filler) +{ + return 0; +} -- cgit v1.2.3 From 1444ee886e6fedf20b9c5bc74a273c6b7d100fdc Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sat, 6 Jun 2026 10:30:32 -0700 Subject: rhashtable: Fix rhashtable_next_key() build warnings rhashtable.o builds with warnings as rhashtable_next_key() kdoc from lib/rhashtable.c does not have the argument descriptions. Move rhashtable_next_key() kdoc from the header to the C file, matching other functions. Move rhashtable_next_key() next to the other forward declarations in the header file. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202606061925.WI4bYI8k-lkp@intel.com/ Fixes: 8f4fa9f89b72 ("rhashtable: Add rhashtable_next_key() API") Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260606-rhash_fixes_1-v1-1-932ab036e6bc@meta.com Signed-off-by: Alexei Starovoitov --- include/linux/rhashtable.h | 42 ++---------------------------------------- lib/rhashtable.c | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 41 deletions(-) diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 3de3412d53c8..79f83b6eec27 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -263,6 +263,8 @@ struct rhash_lock_head __rcu **__rht_bucket_nested( struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); +void *rhashtable_next_key(struct rhashtable *ht, const void *prev_key); + #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) @@ -650,46 +652,6 @@ restart: return NULL; } -/** - * rhashtable_next_key - return next element after a given key - * @ht: hash table - * @prev_key: pointer to previous key, or NULL for the first element - * - * WARNING: this walk is highly unstable.
Unlike rhashtable_walk_*(), - * it cannot detect a concurrent resize or rehash, so a full iteration - * is NOT guaranteed to terminate under adversarial or sustained - * rehashing. Callers MUST tolerate skipped and duplicated elements and - * SHOULD bound their loop externally. - * - * Returns the next element in best-effort iteration order, walking the - * @tbl chain (including any future_tbl in flight). Caller must hold RCU. - * - * Pass @prev_key == NULL to obtain the first element. To iterate, set - * @prev_key to the key of the previously returned element on each call, - * and stop when NULL is returned. - * - * Best-effort semantics: - * - Across the tbl->future_tbl chain, an element being migrated may - * transiently appear in both tables and be observed twice. - * - Concurrent inserts may or may not be observed. - * - Termination of a full iteration loop is NOT guaranteed under - * adversarial continuous rehash; callers MUST tolerate skips and - * repeats and SHOULD bound their loop externally. - * - Behavior on tables that contain duplicate keys is undefined: - * duplicates may be skipped, repeated, or trap the walk in a - * cycle. Callers requiring duplicate-key iteration must use - * rhashtable_walk_*() instead. - * - rhltable instances are not supported and return - * ERR_PTR(-EOPNOTSUPP). - * - If prev_key was concurrently deleted and is not present in any - * in-flight table, returns ERR_PTR(-ENOENT). - * - * Returns entry of the next element, or NULL when iteration is exhausted, - * or ERR_PTR(-ENOENT) if prev_key is not found, or - * ERR_PTR(-EOPNOTSUPP) if @ht is an rhltable. 
- */ -void *rhashtable_next_key(struct rhashtable *ht, const void *prev_key); - /** * rhashtable_lookup - search hash table * @ht: hash table diff --git a/lib/rhashtable.c b/lib/rhashtable.c index dd6eaa09c55d..907637967c0b 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -730,8 +730,41 @@ static struct rhash_head *__rhashtable_next_in_table( /** * rhashtable_next_key - return next element after a given key + * @ht: hash table + * @prev_key: pointer to previous key, or NULL for the first element * - * See include/linux/rhashtable.h for the full contract. + * WARNING: this walk is highly unstable. Unlike rhashtable_walk_*(), + * it cannot detect a concurrent resize or rehash, so a full iteration + * is NOT guaranteed to terminate under adversarial or sustained + * rehashing. Callers MUST tolerate skipped and duplicated elements and + * SHOULD bound their loop externally. + * + * Returns the next element in best-effort iteration order, walking the + * @tbl chain (including any future_tbl in flight). Caller must hold RCU. + * + * Pass @prev_key == NULL to obtain the first element. To iterate, set + * @prev_key to the key of the previously returned element on each call, + * and stop when NULL is returned. + * + * Best-effort semantics: + * - Across the tbl->future_tbl chain, an element being migrated may + * transiently appear in both tables and be observed twice. + * - Concurrent inserts may or may not be observed. + * - Termination of a full iteration loop is NOT guaranteed under + * adversarial continuous rehash; callers MUST tolerate skips and + * repeats and SHOULD bound their loop externally. + * - Behavior on tables that contain duplicate keys is undefined: + * duplicates may be skipped, repeated, or trap the walk in a + * cycle. Callers requiring duplicate-key iteration must use + * rhashtable_walk_*() instead. + * - rhltable instances are not supported and return + * ERR_PTR(-EOPNOTSUPP). 
+ * - If prev_key was concurrently deleted and is not present in any + * in-flight table, returns ERR_PTR(-ENOENT). + * + * Returns entry of the next element, or NULL when iteration is exhausted, + * or ERR_PTR(-ENOENT) if prev_key is not found, or + * ERR_PTR(-EOPNOTSUPP) if @ht is an rhltable. */ void *rhashtable_next_key(struct rhashtable *ht, const void *prev_key) { -- cgit v1.2.3 From 89edbdfc5d0308cef57b71359331de5c4ddbf763 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 7 Jun 2026 13:30:41 -0700 Subject: bpf: Fix NMI/tracepoint re-entry deadlock on lru locks NMI and tracepoint BPF programs can re-enter the per-CPU or global LRU lock that bpf_lru_pop_free()/push_free() already hold on the same CPU, AA-deadlocking. Lockdep reports "inconsistent {INITIAL USE} -> {IN-NMI}" on &l->lock (syzbot c69a0a2c816716f1e0d5) and "possible recursive locking detected" on &loc_l->lock (syzbot 18b26edb69b2e19f3b33). Prior trylock and rqspinlock based fixes (see links) were nacked because they compromised on reliability. This patch converts every LRU lock site to rqspinlock_t and adds a recovery path for some failure windows to avoid node leaks. Failure recovery: - *_pop_free top-level: return NULL; prealloc_lru_pop() already treats that as no-free-element (-ENOMEM). - Cross-CPU steal: skip the victim's locked loc_l, try next CPU. - Post-steal local lock fail: publish stolen node to lockless per-CPU free_llist; next pop on this CPU picks it up. - push_free fail: mark node pending_free=1. __local_list_flush(), __local_list_pop_pending() reclaim the node from pending_list. __bpf_lru_list_shrink_inactive() reclaims the node from the inactive list. Nodes from the active list are reclaimed by __bpf_lru_list_shrink() or after __bpf_lru_list_rotate_active() demotes them to the inactive list.
Fixes: 3a08c2fd7634 ("bpf: LRU List") Reported-by: syzbot+c69a0a2c816716f1e0d5@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c69a0a2c816716f1e0d5 Reported-by: syzbot+18b26edb69b2e19f3b33@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=18b26edb69b2e19f3b33 Link: https://lore.kernel.org/bpf/CAPPBnEYO4R+m+SpVc2gNj_x31R6fo1uJvj2bK2YS1P09GWT6kQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/CAPPBnEZmFA3ab8Uc=PEm0bdojZy=7T_F5_+eyZSHyZR3MBG4Vw@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20251030030010.95352-1-dongml2@chinatelecom.cn/ Link: https://lore.kernel.org/bpf/20260119142120.28170-1-leon.hwang@linux.dev/ Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260607-lru_map_spin-v3-1-bcd9332e911b@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lru_list.c | 165 ++++++++++++++++++++++++++++------------------ kernel/bpf/bpf_lru_list.h | 25 +++++-- 2 files changed, 119 insertions(+), 71 deletions(-) diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index e7a2fc60523f..5ed7cb4b98c0 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -13,23 +13,8 @@ #define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET -/* Helpers to get the local list index */ -#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) -#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) -#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) -/* Local list helpers */ -static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) -{ - return &loc_l->lists[LOCAL_FREE_LIST_IDX]; -} - -static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) -{ - return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; -} - /* bpf_lru_node helpers */ static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) { @@ -72,6 +57,7 @@ static void 
__bpf_lru_node_move_to_free(struct bpf_lru_list *l, bpf_lru_list_count_dec(l, node->type); node->type = tgt_free_type; + WRITE_ONCE(node->pending_free, 0); list_move(&node->list, free_list); } @@ -87,6 +73,9 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l, bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; bpf_lru_node_clear_ref(node); + /* Reset pending_free only when moving to the free list */ + if (tgt_type == BPF_LRU_LIST_T_FREE) + WRITE_ONCE(node->pending_free, 0); list_move(&node->list, &l->lists[tgt_type]); } @@ -212,9 +201,11 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, unsigned int i = 0; list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { - if (bpf_lru_node_is_ref(node)) { + if (bpf_lru_node_is_ref(node) && + !READ_ONCE(node->pending_free)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); - } else if (lru->del_from_htab(lru->del_arg, node)) { + } else if (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); if (++nshrinked == tgt_nshrink) @@ -273,7 +264,8 @@ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, list) { - if (lru->del_from_htab(lru->del_arg, node)) { + if (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); return 1; @@ -290,8 +282,10 @@ static void __local_list_flush(struct bpf_lru_list *l, struct bpf_lru_node *node, *tmp_node; list_for_each_entry_safe_reverse(node, tmp_node, - local_pending_list(loc_l), list) { - if (bpf_lru_node_is_ref(node)) + &loc_l->pending_list, list) { + if (READ_ONCE(node->pending_free)) + __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_FREE); + else if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move_in(l, node, @@ -307,9 +301,12 @@ static void 
bpf_lru_list_push_free(struct bpf_lru_list *l, if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); } static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, @@ -318,8 +315,10 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, struct bpf_lru_list *l = &lru->common_lru.lru_list; struct bpf_lru_node *node, *tmp_node; unsigned int nfree = 0; + LIST_HEAD(tmp_free); - raw_spin_lock(&l->lock); + if (raw_res_spin_lock(&l->lock)) + return; __local_list_flush(l, loc_l); @@ -327,7 +326,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], list) { - __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), + __bpf_lru_node_move_to_free(l, node, &tmp_free, BPF_LRU_LOCAL_LIST_T_FREE); if (++nfree == lru->target_free) break; @@ -335,10 +334,19 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, if (nfree < lru->target_free) __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, - local_free_list(loc_l), + &tmp_free, BPF_LRU_LOCAL_LIST_T_FREE); - raw_spin_unlock(&l->lock); + raw_res_spin_unlock(&l->lock); + + /* + * Transfer the harvested nodes from the temporary list_head into + * the lockless per-CPU free llist. 
+ */ + list_for_each_entry_safe(node, tmp_node, &tmp_free, list) { + list_del(&node->list); + llist_add(&node->llist, &loc_l->free_llist); + } } static void __local_list_add_pending(struct bpf_lru *lru, @@ -350,22 +358,21 @@ static void __local_list_add_pending(struct bpf_lru *lru, *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; + WRITE_ONCE(node->pending_free, 0); bpf_lru_node_clear_ref(node); - list_add(&node->list, local_pending_list(loc_l)); + list_add(&node->list, &loc_l->pending_list); } static struct bpf_lru_node * __local_list_pop_free(struct bpf_lru_locallist *loc_l) { - struct bpf_lru_node *node; + struct llist_node *llnode; - node = list_first_entry_or_null(local_free_list(loc_l), - struct bpf_lru_node, - list); - if (node) - list_del(&node->list); + llnode = llist_del_first(&loc_l->free_llist); + if (!llnode) + return NULL; - return node; + return container_of(llnode, struct bpf_lru_node, llist); } static struct bpf_lru_node * @@ -376,10 +383,10 @@ __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) ignore_ref: /* Get from the tail (i.e. older element) of the pending list. 
*/ - list_for_each_entry_reverse(node, local_pending_list(loc_l), - list) { + list_for_each_entry_reverse(node, &loc_l->pending_list, list) { if ((!bpf_lru_node_is_ref(node) || force) && - lru->del_from_htab(lru->del_arg, node)) { + (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node))) { list_del(&node->list); return node; } @@ -404,7 +411,8 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, l = per_cpu_ptr(lru->percpu_lru, cpu); - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) + return NULL; __bpf_lru_list_rotate(lru, l); @@ -420,7 +428,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); return node; } @@ -437,7 +445,8 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, loc_l = per_cpu_ptr(clru->local_list, cpu); - raw_spin_lock_irqsave(&loc_l->lock, flags); + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) + return NULL; node = __local_list_pop_free(loc_l); if (!node) { @@ -448,17 +457,22 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, if (node) __local_list_add_pending(lru, loc_l, cpu, node, hash); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); if (node) return node; - /* No free nodes found from the local free list and + /* + * No free nodes found from the local free list and * the global LRU list. * * Steal from the local free/pending list of the * current CPU and remote CPU in RR. It starts * with the loc_l->next_steal CPU. + * + * Acquire the victim's lock before touching either list. On + * acquisition failure (rqspinlock AA or timeout) skip the victim + * and try the next CPU. 
*/ first_steal = loc_l->next_steal; @@ -466,24 +480,36 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, do { steal_loc_l = per_cpu_ptr(clru->local_list, steal); - raw_spin_lock_irqsave(&steal_loc_l->lock, flags); - - node = __local_list_pop_free(steal_loc_l); - if (!node) - node = __local_list_pop_pending(lru, steal_loc_l); - - raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + if (!raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags)) { + node = __local_list_pop_free(steal_loc_l); + if (!node) + node = __local_list_pop_pending(lru, steal_loc_l); + raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + } steal = cpumask_next_wrap(steal, cpu_possible_mask); } while (!node && steal != first_steal); loc_l->next_steal = steal; - if (node) { - raw_spin_lock_irqsave(&loc_l->lock, flags); - __local_list_add_pending(lru, loc_l, cpu, node, hash); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + if (!node) + return NULL; + + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) { + /* + * The local pending lock can't be acquired (rqspinlock AA + * or timeout). Return the stolen node to the per-CPU + * free_llist instead of orphaning it; the next pop_free on + * this CPU will pick it up. 
+ */ + node->type = BPF_LRU_LOCAL_LIST_T_FREE; + bpf_lru_node_clear_ref(node); + WRITE_ONCE(node->pending_free, 0); + llist_add(&node->llist, &loc_l->free_llist); + return NULL; } + __local_list_add_pending(lru, loc_l, cpu, node, hash); + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); return node; } @@ -511,18 +537,24 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru, loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); - raw_spin_lock_irqsave(&loc_l->lock, flags); + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + raw_res_spin_unlock_irqrestore(&loc_l->lock, + flags); goto check_lru_list; } node->type = BPF_LRU_LOCAL_LIST_T_FREE; bpf_lru_node_clear_ref(node); - list_move(&node->list, local_free_list(loc_l)); + list_del(&node->list); + + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + llist_add(&node->llist, &loc_l->free_llist); return; } @@ -538,11 +570,14 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru, l = per_cpu_ptr(lru->percpu_lru, node->cpu); - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); } void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) @@ -565,6 +600,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, node = (struct bpf_lru_node *)(buf + node_offset); node->type = BPF_LRU_LIST_T_FREE; + node->pending_free = 0; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; @@ -594,6 +630,7 @@ again: node = (struct bpf_lru_node *)(buf + node_offset); node->cpu = cpu; 
node->type = BPF_LRU_LIST_T_FREE; + node->pending_free = 0; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); i++; @@ -618,14 +655,12 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) { - int i; - - for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) - INIT_LIST_HEAD(&loc_l->lists[i]); + INIT_LIST_HEAD(&loc_l->pending_list); + init_llist_head(&loc_l->free_llist); loc_l->next_steal = cpu; - raw_spin_lock_init(&loc_l->lock); + raw_res_spin_lock_init(&loc_l->lock); } static void bpf_lru_list_init(struct bpf_lru_list *l) @@ -640,7 +675,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l) l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; - raw_spin_lock_init(&l->lock); + raw_res_spin_lock_init(&l->lock); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index fe2661a58ea9..8d0ee61622af 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -6,11 +6,11 @@ #include #include -#include +#include +#include #define NR_BPF_LRU_LIST_T (3) #define NR_BPF_LRU_LIST_COUNT (2) -#define NR_BPF_LRU_LOCAL_LIST_T (2) #define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T enum bpf_lru_list_type { @@ -22,10 +22,22 @@ enum bpf_lru_list_type { }; struct bpf_lru_node { - struct list_head list; + /* + * A node is in at most one list at a time. The free path on the + * per-CPU locallist uses an llist, so share storage via a union. + */ + union { + struct list_head list; + struct llist_node llist; + }; u16 cpu; u8 type; u8 ref; + /* + * Marks nodes whose *_push_free() lock acquire failed; reclaimed + * by flush/shrink which honor the flag instead of del_from_htab(). 
+ */ + u8 pending_free; }; struct bpf_lru_list { @@ -34,13 +46,14 @@ struct bpf_lru_list { /* The next inactive list rotation starts from here */ struct list_head *next_inactive_rotation; - raw_spinlock_t lock ____cacheline_aligned_in_smp; + rqspinlock_t lock ____cacheline_aligned_in_smp; }; struct bpf_lru_locallist { - struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; + struct list_head pending_list; + struct llist_head free_llist; u16 next_steal; - raw_spinlock_t lock; + rqspinlock_t lock; }; struct bpf_common_lru { -- cgit v1.2.3 From 8f6802d26d96ef424fc9fc9e2e68c43b6cf0fa59 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 7 Jun 2026 13:30:42 -0700 Subject: Documentation/bpf: Refresh map_lru_hash_update.dot for rqspinlock Reflect the rqspinlock conversion and orphan-recovery paths added in the previous commit: - All LRU locks are rqspinlock_t; any acquire can fail (AA or timeout). A shared "rqspinlock acquire failed" terminal collapses to the existing -ENOMEM exit. Dashed arrows from each acquire site mark the failure paths. - The per-CPU local freelist is now lockless (free_llist). - Post-steal: re-acquiring loc_l->lock to insert the stolen node into the local pending list can fail; on failure the node is published to free_llist instead of being orphaned, and the call returns -ENOMEM. - Steal-loop victim lock failure is silent: skip the victim and try the next CPU. 
Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260607-lru_map_spin-v3-2-bcd9332e911b@meta.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/map_lru_hash_update.dot | 44 +++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/Documentation/bpf/map_lru_hash_update.dot b/Documentation/bpf/map_lru_hash_update.dot index ab10058f5b79..412bc8b3b57e 100644 --- a/Documentation/bpf/map_lru_hash_update.dot +++ b/Documentation/bpf/map_lru_hash_update.dot @@ -21,10 +21,18 @@ digraph { // names that initiate the corresponding logic in kernel/bpf/bpf_lru_list.c. // Number suffixes and errno suffixes handle subsections of the corresponding // logic in the function as of the writing of this dot. + // + // All LRU locks are rqspinlock_t. Every acquire can fail (AA self-deadlock + // or contention timeout); on failure the corresponding helper returns NULL + // and the caller propagates -ENOMEM. The "rqspinlock acquire failed" + // terminal below is reached via the dashed arrows from each acquire site. + + rqspinlock_failed [shape=rectangle, + label="Any LRU rqspinlock\nacquire fails\n(AA or timeout)"] // cf. __local_list_pop_free() / bpf_percpu_lru_pop_free() local_freelist_check [shape=diamond,fillcolor=1, - label="Local freelist\nnode available?"]; + label="Local freelist\nnode available?\n(lockless free_llist)"]; use_local_node [shape=rectangle, label="Use node owned\nby this CPU"] @@ -82,6 +90,15 @@ digraph { // fn__local_list_pop_pending() } + // Post-steal: re-acquire local loc_l->lock to insert the stolen node into + // the local pending list. If the acquire fails, the stolen node is published + // to the lockless local free_llist so the next pop on this CPU picks it up + // instead of orphaning it. 
+ post_steal_lock [shape=diamond,fillcolor=1, + label="Acquire local\nloc_l->lock\nto add pending"] + post_steal_to_free_llist [shape=rectangle, + label="Publish stolen node to\nlocal free_llist (lockless)"] + fn_bpf_lru_list_pop_free_to_local2 [shape=rectangle, label="Use node that was\nnot recently referenced"] local_freelist_check4 [shape=rectangle, @@ -97,10 +114,19 @@ digraph { fn_htab_lru_map_update_elem_ENOENT [shape=oval,label="return -ENOENT"] begin -> local_freelist_check + // The initial per-CPU lock (loc_l->lock for common, l->lock for percpu) is + // acquired before the local freelist check; rqspinlock failure here exits + // directly to -ENOMEM (no recovery needed: nothing was removed yet). + local_freelist_check -> rqspinlock_failed [style=dashed, + xlabel="acquire fails"] local_freelist_check -> use_local_node [xlabel="Y"] local_freelist_check -> common_lru_check [xlabel="N"] common_lru_check -> fn_bpf_lru_list_pop_free_to_local [xlabel="Y"] common_lru_check -> fn___bpf_lru_list_shrink_inactive [xlabel="N"] + // Global lru_list lock acquire failure in pop_free_to_local: skip refill, + // fall through to the steal path. Not ENOMEM by itself. + fn_bpf_lru_list_pop_free_to_local -> common_lru_check2 [style=dashed, + xlabel="global lru_lock\nacquire fails"] fn_bpf_lru_list_pop_free_to_local -> fn___bpf_lru_node_move_to_free fn___bpf_lru_node_move_to_free -> fn_bpf_lru_list_pop_free_to_local2 [xlabel="Y"] @@ -120,13 +146,27 @@ digraph { local_freelist_check6 -> local_freelist_check7 local_freelist_check7 -> fn_htab_lru_map_update_elem - fn_htab_lru_map_update_elem -> fn_htab_lru_map_update_elem3 [xlabel = "Y"] + // Steal-loop victim lock failure is silent: treat as "no node found here" + // and continue to next CPU; same edge as the existing "N" path. + local_freelist_check5 -> fn_htab_lru_map_update_elem2 [style=dashed, + xlabel="victim's lock\nfails: skip"] + // After a successful steal, re-acquire the local loc_l->lock. 
On failure + // the stolen node is published to free_llist (recovered, not orphaned) + // and the update returns -ENOMEM. + fn_htab_lru_map_update_elem -> post_steal_lock [xlabel = "Y"] + post_steal_lock -> fn_htab_lru_map_update_elem3 [xlabel = "OK"] + post_steal_lock -> post_steal_to_free_llist [style=dashed, + xlabel="loc_l->lock\nacquire fails"] + post_steal_to_free_llist -> fn_htab_lru_map_update_elem_ENOMEM fn_htab_lru_map_update_elem -> fn_htab_lru_map_update_elem2 [xlabel = "N"] fn_htab_lru_map_update_elem2 -> fn_htab_lru_map_update_elem_ENOMEM [xlabel = "Y"] fn_htab_lru_map_update_elem2 -> local_freelist_check5 [xlabel = "N"] fn_htab_lru_map_update_elem3 -> fn_htab_lru_map_update_elem4 + // Shared rqspinlock-failure terminal collapses to the same -ENOMEM exit. + rqspinlock_failed -> fn_htab_lru_map_update_elem_ENOMEM + use_local_node -> fn_htab_lru_map_update_elem4 fn_bpf_lru_list_pop_free_to_local2 -> fn_htab_lru_map_update_elem4 local_freelist_check4 -> fn_htab_lru_map_update_elem4 -- cgit v1.2.3 From 6e1e4a9d60edb0e12d373fb6f2b55d90d20a363b Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 7 Jun 2026 13:30:43 -0700 Subject: selftests/bpf: Stress LRU rqspinlock recovery paths Introduces stress test for bpf_lru_list that exercises lock-failures and orphan-recovery, added by the LRU rqspinlock conversion. Runs three subtests: common LRU, per-CPU LRU lists (BPF_F_NO_COMMON_LRU), and per-CPU LRU map. Each pins one userspace hammer per CPU and attaches the perf_event NMI BPF prog (update+delete mix) on every online CPU. Pre-fix, lockdep fires the "INITIAL USE -> IN-NMI" splat during stress. After stress test, drain_then_verify_capacity() drains every key and refills the lru map. A stranded node on any CPU's pool would have forced eviction of a just-inserted key on that CPU, surfacing here as a missing lookup. Marked serial_ because per-CPU pinning and high-rate HW perf events would perturb parallel tests. 
Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260607-lru_map_spin-v3-3-bcd9332e911b@meta.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/lru_lock_nmi.c | 243 +++++++++++++++++++++ tools/testing/selftests/bpf/progs/lru_lock_nmi.c | 33 +++ 2 files changed, 276 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/lru_lock_nmi.c create mode 100644 tools/testing/selftests/bpf/progs/lru_lock_nmi.c diff --git a/tools/testing/selftests/bpf/prog_tests/lru_lock_nmi.c b/tools/testing/selftests/bpf/prog_tests/lru_lock_nmi.c new file mode 100644 index 000000000000..60666a9ba41f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/lru_lock_nmi.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Stress every LRU lock-failure and orphan-recovery. + * perf_event NMI BPF on every online CPU does + * update+delete on a small LRU map; userspace threads on every CPU do + * the same from syscall context. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include "testing_helpers.h" +#include "lru_lock_nmi.skel.h" + +#define MAP_ENTRIES 64 +#define KEY_RANGE (MAP_ENTRIES * 2) +#define STRESS_NS (500 * 1000 * 1000ULL) + +struct hammer_arg { + int map_fd; + int cpu; + __u64 deadline_ns; +}; + +struct refill_arg { + int map_fd; + int cpu; + int per_cpu_quota; + int update_errors; +}; + +/* + * Pin the calling thread to @cpu. Uses dynamically-allocated CPU sets so + * we stay correct on hosts with @cpu >= CPU_SETSIZE (default 1024). 
+ */ +static int pin_to_cpu(int cpu) +{ + cpu_set_t *cs; + size_t cs_size; + int err; + + cs = CPU_ALLOC(cpu + 1); + if (!cs) + return -ENOMEM; + cs_size = CPU_ALLOC_SIZE(cpu + 1); + + CPU_ZERO_S(cs_size, cs); + CPU_SET_S(cpu, cs_size, cs); + err = pthread_setaffinity_np(pthread_self(), cs_size, cs); + CPU_FREE(cs); + return err; +} + +static void *hammer_thread(void *p) +{ + struct hammer_arg *a = p; + int nr_possible_cpus = libbpf_num_possible_cpus(); + __u64 val[nr_possible_cpus]; + unsigned int seed; + __u32 key; + + memset(val, 0, sizeof(val)); + pin_to_cpu(a->cpu); + + seed = (unsigned int)a->cpu ^ (unsigned int)(uintptr_t)pthread_self(); + + while (get_time_ns() < a->deadline_ns) { + bool do_update = rand_r(&seed) & 1; + + key = rand_r(&seed) % KEY_RANGE; + if (do_update) + bpf_map_update_elem(a->map_fd, &key, val, BPF_ANY); + else + bpf_map_delete_elem(a->map_fd, &key); + } + return NULL; +} + +static void *refill_thread(void *p) +{ + struct refill_arg *a = p; + int nr_possible_cpus = libbpf_num_possible_cpus(); + __u64 val[nr_possible_cpus]; + __u32 start, end, key; + + memset(val, 0, sizeof(val)); + pin_to_cpu(a->cpu); + + start = (__u32)a->cpu * (__u32)a->per_cpu_quota; + end = start + (__u32)a->per_cpu_quota; + for (key = start; key < end; key++) + if (bpf_map_update_elem(a->map_fd, &key, val, BPF_ANY)) + a->update_errors++; + return NULL; +} + +/* + * Drain the map, then refill it with each CPU inserting only its own + * quota of keys. + * After refill, lookup every key we inserted - a stranded node on any + * CPU's pool would have forced eviction. 
+ */ +static int drain_then_verify_capacity(int map_fd, int nr_cpus) +{ + int per_cpu_quota = MAP_ENTRIES / nr_cpus; + int total = per_cpu_quota * nr_cpus; + int nr_possible_cpus = libbpf_num_possible_cpus(); + pthread_t threads[nr_cpus]; + struct refill_arg args[nr_cpus]; + __u64 val[nr_possible_cpus]; + int i, hits = 0, nthreads = 0; + __u32 key; + + memset(val, 0, sizeof(val)); + + for (key = 0; key < KEY_RANGE; key++) + bpf_map_delete_elem(map_fd, &key); + + for (i = 0; i < nr_cpus; i++) { + args[i] = (struct refill_arg){ + .map_fd = map_fd, + .cpu = i, + .per_cpu_quota = per_cpu_quota, + }; + if (pthread_create(&threads[nthreads], NULL, refill_thread, &args[i]) == 0) + nthreads++; + } + for (i = 0; i < nthreads; i++) + pthread_join(threads[i], NULL); + + for (i = 0; i < nr_cpus; i++) + if (args[i].update_errors) + return -ENOMEM; + + for (key = 0; key < (__u32)total; key++) + if (bpf_map_lookup_elem(map_fd, &key, val) == 0) + hits++; + + return hits == total ? 0 : -EIO; +} + +static void run_variant(enum bpf_map_type type, __u32 map_flags, const char *name) +{ + struct perf_event_attr attr = { + .size = sizeof(attr), + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .freq = 1, + }; + int nr_cpus, max_cpus = 64; + struct bpf_link *links[max_cpus]; + pthread_t threads[max_cpus]; + struct hammer_arg args[max_cpus]; + struct lru_lock_nmi *skel = NULL; + int map_fd, i, err, nr_threads = 0, pmu_fd = -1; + __u64 deadline; + + nr_cpus = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_cpus, 0, "num_cpus")) + return; + + if (nr_cpus > max_cpus) + nr_cpus = max_cpus; + + if (!test__start_subtest(name)) + return; + + memset(links, 0, sizeof(links)); + skel = lru_lock_nmi__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + goto cleanup; + + err = bpf_map__set_type(skel->maps.lru_map, type); + if (!ASSERT_OK(err, "set_type")) + goto cleanup; + err = bpf_map__set_map_flags(skel->maps.lru_map, map_flags); + if (!ASSERT_OK(err, "set_flags")) + goto cleanup; 
+ err = bpf_map__set_max_entries(skel->maps.lru_map, MAP_ENTRIES); + if (!ASSERT_OK(err, "set_max_entries")) + goto cleanup; + + err = lru_lock_nmi__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + skel->bss->hits = 0; + map_fd = bpf_map__fd(skel->maps.lru_map); + attr.sample_freq = read_perf_max_sample_freq(); + + for (i = 0; i < nr_cpus; i++) { + pmu_fd = syscall(__NR_perf_event_open, &attr, -1, i, -1, 0); + if (pmu_fd < 0) { + if (i == 0 && + (errno == ENOENT || errno == EOPNOTSUPP)) { + test__skip(); + goto cleanup; + } + continue; + } + /* libbpf takes ownership of pfd on success */ + links[i] = bpf_program__attach_perf_event(skel->progs.oncpu, pmu_fd); + if (!links[i]) + close(pmu_fd); + } + + deadline = get_time_ns() + STRESS_NS; + for (i = 0; i < nr_cpus; i++) { + args[i].map_fd = map_fd; + args[i].cpu = i; + args[i].deadline_ns = deadline; + if (pthread_create(&threads[nr_threads], NULL, hammer_thread, &args[i]) == 0) + nr_threads++; + } + for (i = 0; i < nr_threads; i++) + pthread_join(threads[i], NULL); + + for (i = 0; i < nr_cpus; i++) { + if (links[i]) { + bpf_link__destroy(links[i]); + links[i] = NULL; + } + } + + ASSERT_GT(skel->bss->hits, 0, "nmi_bpf_ran"); + ASSERT_OK(drain_then_verify_capacity(map_fd, nr_cpus), "drain_then_verify_capacity"); + +cleanup: + for (i = 0; i < nr_cpus; i++) { + if (links[i]) + bpf_link__destroy(links[i]); + } + lru_lock_nmi__destroy(skel); +} + +void serial_test_lru_lock_nmi(void) +{ + run_variant(BPF_MAP_TYPE_LRU_HASH, 0, "common_lru"); + run_variant(BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU, "no_common_lru"); + run_variant(BPF_MAP_TYPE_LRU_PERCPU_HASH, 0, "percpu_lru"); +} diff --git a/tools/testing/selftests/bpf/progs/lru_lock_nmi.c b/tools/testing/selftests/bpf/progs/lru_lock_nmi.c new file mode 100644 index 000000000000..c0692cd54237 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lru_lock_nmi.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +struct { + 
__uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 64); + __type(key, __u32); + __type(value, __u64); +} lru_map SEC(".maps"); + +int hits; + +SEC("perf_event") +int oncpu(void *ctx) +{ + /* + * Key range deliberately wider than max_entries to force LRU + * eviction on every other update. + */ + __u32 key = bpf_get_prandom_u32() % 128; + bool do_update = bpf_get_prandom_u32() & 1; + __u64 val = 1; + + if (do_update) + bpf_map_update_elem(&lru_map, &key, &val, BPF_ANY); + else + bpf_map_delete_elem(&lru_map, &key); + __sync_fetch_and_add(&hits, 1); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 71385b78dbc290328e3b04ebd9b27786642afaca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 7 Jun 2026 21:25:47 -1000 Subject: arm64: mm: Complete the PTE store in ptep_try_set() ptep_try_set() installs a kernel PTE with try_cmpxchg() but, unlike __set_pte(), skips the barriers that arm64 requires after writing a valid kernel PTE. Without them a subsequent access can fault instead of seeing the new mapping. Issue them with emit_pte_barriers() rather than __set_pte_complete(). ptep_try_set() must finish the store before it returns, but __set_pte_complete() would defer the barriers when the calling context is in lazy MMU mode. v2: Emit the barriers directly instead of __set_pte_complete(). 
(Catalin) Fixes: 258df8fce42f ("mm: Add ptep_try_set() for lockless empty-slot installs") Suggested-by: Catalin Marinas Signed-off-by: Tejun Heo Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/all/aiRFcz78QTZdIHHB@arm.com/ Link: https://lore.kernel.org/bpf/7f5f7c94601312c1a401fb18998291cc@kernel.org Signed-off-by: Kumar Kartikeya Dwivedi --- arch/arm64/include/asm/pgtable.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 3ce0f2a6cab6..3e579c26b383 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1838,7 +1838,16 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) { pteval_t old = 0; - return try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte)); + if (!try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte))) + return false; + + /* + * The store must be complete by the time this returns, but the caller + * may be in lazy MMU mode, where __set_pte_complete() would defer the + * barriers. Issue them directly. + */ + emit_pte_barriers(); + return true; } #define ptep_try_set ptep_try_set -- cgit v1.2.3 From 53040a81ae57cdca8af8ac36fe4e661730cf7c6b Mon Sep 17 00:00:00 2001 From: Nuoqi Gui Date: Sun, 7 Jun 2026 21:24:13 +0800 Subject: bpf: Keep dynamic inner array lookups nullable An ARRAY_OF_MAPS can use an array created with BPF_F_INNER_MAP as its inner map template. A concrete inner array with a different max_entries value can then replace the template. After a successful outer map lookup, the verifier represents the resulting map pointer using the inner map template. Const-key lookup nullness elision consequently uses the template max_entries even though the runtime helper uses the concrete inner map max_entries. Do not elide lookup result nullness for maps marked with BPF_F_INNER_MAP, because the template max_entries does not prove that the key is in bounds for the concrete runtime map. 
Fixes: d2102f2f5d75 ("bpf: verifier: Support eliding map lookup nullness") Signed-off-by: Nuoqi Gui Acked-by: Eduard Zingerman Acked-by: Jiri Olsa Cc: stable@vger.kernel.org Link: https://lore.kernel.org/bpf/20260607-f01-v2-v2-1-da48453146e8@mails.tsinghua.edu.cn Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/verifier.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0c1cf506c219..ed7ba0e6a9ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8179,7 +8179,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, return 0; } -static bool can_elide_value_nullness(enum bpf_map_type type); +static bool can_elide_value_nullness(const struct bpf_map *map); static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, @@ -8298,7 +8298,7 @@ skip_type_check: err = check_helper_mem_access(env, reg, argno_from_reg(regno), key_size, BPF_READ, false, NULL); if (err) return err; - if (can_elide_value_nullness(meta->map.ptr->map_type)) { + if (can_elide_value_nullness(meta->map.ptr)) { err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); if (err < 0) { meta->const_map_key = -1; @@ -10068,13 +10068,16 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno state->callback_subprogno == subprogno); } -/* Returns whether or not the given map type can potentially elide +/* Returns whether or not the given map can potentially elide * lookup return value nullness check. This is possible if the key * is statically known. 
*/ -static bool can_elide_value_nullness(enum bpf_map_type type) +static bool can_elide_value_nullness(const struct bpf_map *map) { - switch (type) { + if (map->map_flags & BPF_F_INNER_MAP) + return false; + + switch (map->map_type) { case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_ARRAY: return true; @@ -10414,7 +10417,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } if (func_id == BPF_FUNC_map_lookup_elem && - can_elide_value_nullness(meta.map.ptr->map_type) && + can_elide_value_nullness(meta.map.ptr) && meta.const_map_key >= 0 && meta.const_map_key < meta.map.ptr->max_entries) ret_flag &= ~PTR_MAYBE_NULL; -- cgit v1.2.3 From a3847994b4d20c0701ccc54fe110920ea78e73dc Mon Sep 17 00:00:00 2001 From: Nuoqi Gui Date: Sun, 7 Jun 2026 21:24:14 +0800 Subject: selftests/bpf: Cover dynamic inner array lookup nullability Add a verifier regression test that looks up a constant key through a dynamic inner array template and dereferences the result without a NULL check. The verifier must reject the program because BPF_F_INNER_MAP allows the concrete runtime array to have fewer entries than the template. 
Signed-off-by: Nuoqi Gui Acked-by: Eduard Zingerman Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20260607-f01-v2-v2-2-da48453146e8@mails.tsinghua.edu.cn Signed-off-by: Kumar Kartikeya Dwivedi --- .../selftests/bpf/progs/verifier_map_in_map.c | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_map_in_map.c b/tools/testing/selftests/bpf/progs/verifier_map_in_map.c index 16b761e510f0..b606b5dca734 100644 --- a/tools/testing/selftests/bpf/progs/verifier_map_in_map.c +++ b/tools/testing/selftests/bpf/progs/verifier_map_in_map.c @@ -18,6 +18,20 @@ struct { }); } map_in_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); + __array(values, struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_INNER_MAP); + __uint(max_entries, 8); + __type(key, int); + __type(value, long); + }); +} map_in_map_dyn SEC(".maps"); + SEC("socket") __description("map in map access") __success __success_unpriv __retval(0) @@ -45,6 +59,32 @@ l0_%=: r0 = 0; \ : __clobber_all); } +SEC("socket") +__description("map in map dynamic inner array lookup is nullable") +__failure __msg("invalid mem access 'map_value_or_null'") +__naked void map_in_map_dynamic_inner_array_lookup_is_nullable(void) +{ + asm volatile (" \ + r1 = 0; \ + *(u32*)(r10 - 4) = r1; \ + r2 = r10; \ + r2 += -4; \ + r1 = %[map_in_map_dyn] ll; \ + call %[bpf_map_lookup_elem]; \ + if r0 == 0 goto l0_%=; \ + *(u32*)(r10 - 8) = 4; \ + r2 = r10; \ + r2 += -8; \ + r1 = r0; \ + call %[bpf_map_lookup_elem]; \ + r0 = *(u64 *)(r0 + 0); \ +l0_%=: exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_in_map_dyn) + : __clobber_all); +} + SEC("xdp") __description("map in map state pruning") __success __msg("processed 15 insns") -- cgit v1.2.3 From 50dff00615522f3ec03449680ca23beb4cfc549c Mon Sep 17 00:00:00 2001 From: Sechang Lim Date: Mon, 8 Jun 2026 05:00:00 +0000 
Subject: bpf: Fix NULL pointer dereference in bpf_task_from_vpid() bpf_task_from_vpid() looks up a task in the pid namespace of the current task, via find_task_by_vpid(): find_task_by_vpid(vpid) find_task_by_pid_ns(vpid, task_active_pid_ns(current)) find_pid_ns(nr, ns) -> idr_find(&ns->idr, nr) cgroup_skb programs run in softirq, which may interrupt a task that is itself in do_exit(). Once that task has passed exit_notify() -> release_task() -> __unhash_process(), its thread_pid is cleared, so task_active_pid_ns(current) returns NULL and find_pid_ns() dereferences &NULL->idr: BUG: kernel NULL pointer dereference, address: 0000000000000050 RIP: 0010:idr_find+0x11/0x30 lib/idr.c:176 Call Trace: find_pid_ns kernel/pid.c:370 [inline] find_task_by_pid_ns+0x3b/0xe0 kernel/pid.c:485 bpf_task_from_vpid+0x5b/0x200 kernel/bpf/helpers.c:2916 bpf_prog_run_array_cg+0x17e/0x530 kernel/bpf/cgroup.c:81 __cgroup_bpf_run_filter_skb+0x12b/0x250 kernel/bpf/cgroup.c:1612 sk_filter_trim_cap+0x1dc/0x4c0 net/core/filter.c:148 tcp_v4_rcv+0x18d1/0x2200 net/ipv4/tcp_ipv4.c:2223 do_exit+0xa63/0x1270 kernel/exit.c:1010 get_signal+0x141c/0x1530 kernel/signal.c:3037 Bail out when current has no pid namespace. 
Fixes: 675c3596ff32 ("bpf: Add bpf_task_from_vpid() kfunc") Signed-off-by: Sechang Lim Acked-by: Leon Hwang Link: https://lore.kernel.org/bpf/20260608050001.2545245-1-rhkrqnwk98@gmail.com Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/helpers.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8ba2b8965caf..8e196c9b7c50 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3009,11 +3009,13 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) { struct task_struct *p; - rcu_read_lock(); + guard(rcu)(); + if (!task_active_pid_ns(current)) + return NULL; + p = find_task_by_vpid(vpid); if (p) p = bpf_task_acquire(p); - rcu_read_unlock(); return p; } -- cgit v1.2.3 From b9452b594fd3aecbfd4aa0a6a1f741330a37dab7 Mon Sep 17 00:00:00 2001 From: Paul Moses Date: Fri, 5 Jun 2026 23:43:09 +0000 Subject: bpf: Validate BTF repeated field counts before expansion btf_parse_struct_metas() walks user-supplied BTF during BPF_BTF_LOAD, and btf_repeat_fields() expands repeatable fields from array elements into the fixed BTF_FIELDS_MAX scratch array used by btf_parse_fields(). The remaining-capacity check performs the expanded field count calculation in u32. A malformed BTF can wrap that calculation, causing the check to pass even when the expanded field count exceeds the scratch array capacity. The following memcpy() can then write past the end of the array. Use checked addition and multiplication before copying repeated fields and reject impossible counts. 
Fixes: 797d73ee232d ("bpf: Check the remaining info_cnt before repeating btf fields") Cc: stable@vger.kernel.org Signed-off-by: Paul Moses Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260605234301.1109063-1-p@1g4.org Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/btf.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ef4402274786..15ae7c43f594 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3667,7 +3667,7 @@ end: static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { - u32 i, j; + u32 i, j, total_cnt, total_repeats; u32 cur; /* Ensure not repeating fields that should not be repeated. */ @@ -3685,10 +3685,9 @@ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, } } - /* The type of struct size or variable size is u32, - * so the multiplication will not overflow. - */ - if (field_cnt * (repeat_cnt + 1) > info_cnt) + if (check_add_overflow(repeat_cnt, 1, &total_repeats) || + check_mul_overflow(field_cnt, total_repeats, &total_cnt) || + total_cnt > (u32)info_cnt) return -E2BIG; cur = field_cnt; -- cgit v1.2.3 From dd0f9684d2f7d3f99aee63f5fa80562f2207b964 Mon Sep 17 00:00:00 2001 From: Paul Moses Date: Tue, 9 Jun 2026 05:08:54 -0500 Subject: selftests/bpf: Add BTF repeated field count overflow test Add a raw BTF test that exercises repeated special-field expansion with a large array count. The compact element layout keeps the array byte size representable while the repeated field count overflows the old u32 capacity calculation in btf_repeat_fields(). 
Signed-off-by: Paul Moses Link: https://lore.kernel.org/bpf/SzebdWqm2zREZBf8Tc5Kc-JDWbh9nBztnk4PUu5kRSD1OOdr_ESVTt__2Hd3-lClr47jIjJCXfOH0RHsMpjjpEUh_R2v30nh3T1IXNT6Pbo=@1g4.org Signed-off-by: Kumar Kartikeya Dwivedi --- tools/testing/selftests/bpf/prog_tests/btf.c | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index a9de328a8697..96f719a0cec9 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4258,6 +4258,43 @@ static struct btf_raw_test raw_tests[] = { .max_entries = 1, }, +{ + .descr = "struct test repeated fields count overflow", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_STRUCT_ENC(NAME_TBD, 0, 0), /* [2] */ + BTF_TYPE_TAG_ENC(NAME_TBD, 2), /* [3] */ + BTF_PTR_ENC(3), /* [4] */ + BTF_TYPE_ARRAY_ENC(4, 1, 1), /* [5] */ + BTF_STRUCT_ENC(NAME_TBD, 10, 8), /* [6] */ + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_TYPE_ARRAY_ENC(6, 1, 0x1999999aU), /* [7] */ + BTF_STRUCT_ENC(NAME_TBD, 2, 8 + 8 * 0x1999999aU), /* [8] */ + BTF_MEMBER_ENC(NAME_TBD, 4, 0), + BTF_MEMBER_ENC(NAME_TBD, 7, 64), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0prog_test_ref_kfunc\0kptr_untrusted\0elem" + "\0p0\0p1\0p2\0p3\0p4\0p5\0p6\0p7\0p8\0p9" + "\0outer\0trigger\0elems"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "repeat_fields", + .key_size = sizeof(int), + .value_size = 8 + 8 * 0x1999999aU, + .key_type_id = 1, + .value_type_id = 8, + .max_entries = 1, + .btf_load_err = true, +}, }; /* struct btf_raw_test raw_tests[] */ static const char *get_next_str(const char 
*start, const char *end) -- cgit v1.2.3 From fa75b7c85b0d2b6ab1c3ee0f06d35e2b98078c45 Mon Sep 17 00:00:00 2001 From: Nuoqi Gui Date: Tue, 9 Jun 2026 22:43:50 +0800 Subject: bpf: Enforce write checks for BTF pointer helper access check_mem_reg() verifies both read and write access for global subprogram memory arguments. When the caller register is PTR_TO_BTF_ID, check_helper_mem_access() currently forwards the access to check_ptr_to_btf_access() as BPF_READ regardless of the requested access type. This lets a BTF-backed kernel object field pointer pass the caller-side writable memory check for a global subprogram argument. The callee is then validated with a generic writable PTR_TO_MEM argument and can store through it, even though an equivalent direct BTF field store is rejected with "only read is supported". Forward the requested access type to check_ptr_to_btf_access(). This enforces existing BTF write restrictions for global subprogram memory arguments as well. Fixes: 3e30be4288b3 ("bpf: Allow helpers access trusted PTR_TO_BTF_ID.") Signed-off-by: Nuoqi Gui Link: https://lore.kernel.org/bpf/20260609-f01-04-btf-writable-arg-v1-1-f449cd970669@mails.tsinghua.edu.cn Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ed7ba0e6a9ce..cdff3e6eb96e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6777,7 +6777,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: return check_ptr_to_btf_access(env, regs, reg, argno, 0, - access_size, BPF_READ, -1); + access_size, access_type, -1); case PTR_TO_CTX: /* Only permit reading or writing syscall context using helper calls. 
*/ if (is_var_ctx_off_allowed(env->prog)) { -- cgit v1.2.3 From af8c3f170f7314d316023efc0ae670384e220b09 Mon Sep 17 00:00:00 2001 From: Nuoqi Gui Date: Tue, 9 Jun 2026 22:43:51 +0800 Subject: selftests/bpf: Cover writable BTF field global subprog args Add a verifier test for passing a BTF-backed task_struct field pointer to a global subprogram argument typed as writable memory. The direct field store is already rejected. The global subprogram path should be rejected too. The callee must not lose the BTF pointer's read-only provenance. It must not validate the argument as ordinary writable memory. Signed-off-by: Nuoqi Gui Link: https://lore.kernel.org/bpf/20260609-f01-04-btf-writable-arg-v1-2-f449cd970669@mails.tsinghua.edu.cn Signed-off-by: Kumar Kartikeya Dwivedi --- .../selftests/bpf/progs/verifier_global_ptr_args.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index ea273e152209..0bdeb7bc4687 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -287,6 +287,25 @@ int trusted_to_untrusted_mem(void *ctx) return subprog_void_untrusted(bpf_get_current_task_btf()); } +__weak int subprog_write_mem_arg(int *p) +{ + if (!p) + return 0; + + *p = 42; + return 0; +} + +SEC("?tp_btf/task_newtask") +__failure +__msg("only read is supported") +int trusted_btf_field_to_writable_mem(void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + + return subprog_write_mem_arg(&task->prio); +} + SEC("tp_btf/sys_enter") __success int anything_to_untrusted_mem(void *ctx) -- cgit v1.2.3 From 68f4e480b089abae26fbab0c38c3df3cbac3d79d Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 9 Jun 2026 02:36:30 -0400 Subject: selftests/bpf: Avoid spurious spmc parallel selftest errors in libarena The libarena parallel spmc selftest is 
nondeterministic by design. As a result it depends up to a point on the relative timing between the producer and consumer threads. This introduces the possibility for two kinds of spurious failures that this patch addresses. 1) Spurious timeouts. The test proceeds in phases, and threads use a common counter as a barrier to avoid proceeding to the next phase until all threads are ready to do so. If a thread takes too long to reach the barrier, the already waiting threads may time out. Increase the current timeout. The timeout's value is a balance between the maximum amount of time spent on the test and the possibility of spurious failures. Right now the timeout is too short. Err on the side of caution and significantly increase it to avoid spurious failures. 2) Spurious resize failures. Some selftests require the spmc queue to resize itself. This in turn requires the producer side to be materially faster than the consumer side so that the queue gets full enough for a resize. However, in the benchmark the spmc queue's producer is outnumbered 3:1. To offset it we add busy waits for consume queues. However, we still see occasional failures due to the queue never resizing. Minimize the possibility for this in two ways: First, remove one of the consumers. The 2 consumers still exercise the "race between consumers" scenario. Second, increase the busy wait duration to decrease the rate at which the consumers act on the queue. While at it, also replace a stray invalid error value "153" with EINVAL. 
Fixes: 42998f819256 ("selftests/bpf: libarena: parallel test harness and spmc parallel selftest") Reported-by: Jakub Kicinski Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260609063630.10245-1-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- .../bpf/libarena/selftests/test_parallel_spmc.bpf.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c index 981c845e2d15..f08f2a92e194 100644 --- a/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c +++ b/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c @@ -7,7 +7,7 @@ #include #include -#define TEST_SPMC_THREADS 4 +#define TEST_SPMC_THREADS 3 #define TEST_SPMC_STEALERS (TEST_SPMC_THREADS - 1) /* @@ -17,7 +17,7 @@ * and operations are wait-free we just spin around the quiescence * point instead. If we time out, we just fail the benchmark. */ -#define TEST_SPMC_SYNC_SPINS (1U << 18) +#define TEST_SPMC_SYNC_SPINS BPF_MAX_LOOPS /* * We track all the values we retrieve from the queue @@ -61,7 +61,7 @@ static volatile u64 round_steals; * We have multiple stealers and a single owner. We sometimes want the owner * to successfully outproduce the stealers, we add a busy loop in them. */ -#define TEST_SPMC_WASTE_ROUNDS (1024) +#define TEST_SPMC_WASTE_ROUNDS (1UL << 12) /* * The spmc data structure depends on the runtime fully @@ -112,10 +112,6 @@ static bool spmc_tests_enabled(void) { \ return spmc_##prefix##_stealer(); \ } \ - SEC("syscall") int parallel_test_spmc_##prefix##__3(void) \ - { \ - return spmc_##prefix##_stealer(); \ - } static int spmc_common_init(u64 total) { @@ -452,10 +448,10 @@ static int spmc_resize_owner(void) resized = true; } - /* Did we get to resize while racing/ */ + /* Did we get to resize while racing? 
*/ if (!resized) { test_abort = true; - return -153; + return -EINVAL; } /* -- cgit v1.2.3 From 2f884d371fafea137afea504d49ee4a7c8d7985b Mon Sep 17 00:00:00 2001 From: Vlad Poenaru Date: Tue, 9 Jun 2026 06:55:57 -0700 Subject: bpf: Allow LPM map access from sleepable BPF programs trie_lookup_elem() annotates its rcu_dereference_check() walks with only rcu_read_lock_bh_held(). Because rcu_dereference_check(p, c) resolves to "c || rcu_read_lock_held()", this passes for XDP/NAPI and classic RCU readers but fails for sleepable BPF programs, which enter via __bpf_prog_enter_sleepable() and hold only rcu_read_lock_trace(). trie_update_elem() and trie_delete_elem() have the same problem in a different form: they walk the trie with plain rcu_dereference(), which asserts rcu_read_lock_held() unconditionally. Both are reachable from sleepable BPF programs via the bpf_map_update_elem / bpf_map_delete_elem helpers, and from the syscall path under classic rcu_read_lock(). In the writer paths the trie is actually protected by trie->lock (an rqspinlock taken across the walk); we never relied on the RCU read-side lock to keep nodes alive there. A sleepable LSM hook that ends up touching an LPM trie therefore triggers lockdep on debug kernels: ============================= WARNING: suspicious RCU usage 7.1.0-... Tainted: G E ----------------------------- kernel/bpf/lpm_trie.c:249 suspicious rcu_dereference_check() usage! 1 lock held by net_tests/540: #0: (rcu_tasks_trace_srcu_struct){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x26/0x280 Call Trace: dump_stack_lvl lockdep_rcu_suspicious trie_lookup_elem bpf_prog_..._enforce_security_socket_connect bpf_trampoline_... security_socket_connect __sys_connect do_syscall_64 This is lockdep-only -- no UAF, since Tasks Trace RCU does serialize against the trie's reclaim path -- but it spams the console once per distinct callsite on every debug kernel running a sleepable BPF LSM that touches an LPM trie, which is increasingly common. 
For the lookup path, switch the rcu_dereference_check() annotation from rcu_read_lock_bh_held() to bpf_rcu_lock_held(), which accepts all three contexts (classic, BH, Tasks Trace). Other map types already follow this convention. For trie_update_elem() and trie_delete_elem(), annotate the walks as rcu_dereference_protected(*p, 1) -- matching trie_free() in the same file -- since trie->lock is held across the walk. rqspinlock has no lockdep_map, so the predicate degenerates to '1' rather than lockdep_is_held(&trie->lock); the protection is real but not machine-verifiable. trie_get_next_key() also uses bare rcu_dereference() but is reachable only from the BPF syscall, which holds classic rcu_read_lock() before dispatching, so it is left untouched. Fixes: 694cea395fde ("bpf: Allow RCU-protected lookups to happen from bh context") Cc: stable@vger.kernel.org Signed-off-by: Vlad Poenaru Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260609135558.193287-2-vlad.wing@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/lpm_trie.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 0f57608b385d..4d6f25db9ba1 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -246,7 +246,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) /* Start walking the trie from the root node ... 
*/ - for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held()); + for (node = rcu_dereference_check(trie->root, bpf_rcu_lock_held()); node;) { unsigned int next_bit; size_t matchlen; @@ -280,7 +280,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) */ next_bit = extract_bit(key->data, node->prefixlen); node = rcu_dereference_check(node->child[next_bit], - rcu_read_lock_bh_held()); + bpf_rcu_lock_held()); } if (!found) @@ -359,7 +359,7 @@ static long trie_update_elem(struct bpf_map *map, */ slot = &trie->root; - while ((node = rcu_dereference(*slot))) { + while ((node = rcu_dereference_protected(*slot, 1))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -482,7 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) trim = &trie->root; trim2 = trim; parent = NULL; - while ((node = rcu_dereference(*trim))) { + while ((node = rcu_dereference_protected(*trim, 1))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || -- cgit v1.2.3 From a3d76e27bbbf91d1025ce99eb55068ae0aa14322 Mon Sep 17 00:00:00 2001 From: Vlad Poenaru Date: Tue, 9 Jun 2026 06:55:58 -0700 Subject: bpf: Allow sleepable programs to use LPM trie maps directly The previous change relaxed the rcu_dereference annotations in lpm_trie.c so the trie walks no longer trip lockdep when reached from a sleepable BPF program holding only rcu_read_lock_trace(). 
By itself that only helps tries reached as the inner map of a map-of-maps, or from the classic-RCU syscall path: a sleepable program that references an LPM trie directly is still rejected at load time by check_map_prog_compatibility(), whose sleepable whitelist omits BPF_MAP_TYPE_LPM_TRIE: Sleepable programs can only use array, hash, ringbuf and local storage maps LPM trie nodes are allocated from a bpf_mem_alloc (trie->ma) and freed with bpf_mem_cache_free_rcu(), which chains a regular RCU grace period into a Tasks Trace grace period before the node -- and the value embedded in it that trie_lookup_elem() returns to the program -- is released. That is the same reclaim discipline BPF_MAP_TYPE_HASH relies on for sleepable access, so a value handed to a sleepable reader cannot be freed while the program is still running under rcu_read_lock_trace(). The writer paths take trie->lock across the walk and never relied on the RCU read-side lock to keep nodes alive. Add BPF_MAP_TYPE_LPM_TRIE to the sleepable map whitelist so these programs can use LPM tries directly. 
Signed-off-by: Vlad Poenaru Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260609135558.193287-3-vlad.wing@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cdff3e6eb96e..954b85609f32 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17737,6 +17737,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_LPM_TRIE: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: -- cgit v1.2.3 From be1d838b88e445fa6edfb9f98af1603cbf2ee94d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Jun 2026 16:34:07 -0700 Subject: selftests/bpf: Keep int return type for tailcall subprogs LLVM23 ([1]) supports 'true' function signature in BTF. The return type of the caller of a tailcall must be an 'int'. Otherwise, verification will fail (see check_btf_func() in check_btf.c). So with llvm23, it is possible that the compiler may change the caller's return type from 'int' to 'void'. To prevent this, barrier_var() and __sink() are used to avoid returning a constant prone to be optimized. 
[1] https://github.com/llvm/llvm-project/pull/198426 Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260609233407.2711577-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/tailcall_bpf2bpf2.c | 5 ++++- .../bpf/progs/tailcall_bpf2bpf_hierarchy1.c | 13 ++++++++---- .../bpf/progs/tailcall_bpf2bpf_hierarchy2.c | 24 +++++++++++++++------- .../bpf/progs/tailcall_bpf2bpf_hierarchy3.c | 13 +++++++++--- .../bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c | 13 +++++++++--- tools/testing/selftests/bpf/progs/verifier_sock.c | 9 ++++++-- 6 files changed, 57 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c index ce97d141daee..c4fadee5aadc 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c @@ -13,11 +13,14 @@ struct { static __noinline int subprog_tail(struct __sk_buff *skb) { + int ret = 1; + if (load_byte(skb, 0)) bpf_tail_call_static(skb, &jmp_table, 1); else bpf_tail_call_static(skb, &jmp_table, 0); - return 1; + barrier_var(ret); + return ret; } int count = 0; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c index d556b19413d7..1fd07824d88a 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c @@ -16,20 +16,25 @@ int count = 0; static __noinline int subprog_tail(struct __sk_buff *skb) { + int ret = 0; + bpf_tail_call_static(skb, &jmp_table, 0); - return 0; + barrier_var(ret); + return ret; } SEC("tc") int entry(struct __sk_buff *skb) { - int ret = 1; + int ret = 1, ret1, ret2; clobber_regs_stack(); count++; - subprog_tail(skb); - subprog_tail(skb); + ret1 = subprog_tail(skb); + ret2 = subprog_tail(skb); + __sink(ret1); + __sink(ret2); return ret; } diff --git 
a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c index ae94c9c70ab7..6fde0ab92148 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c @@ -25,8 +25,11 @@ int count1 = 0; static __noinline int subprog_tail0(struct __sk_buff *skb) { + int ret = 0; + bpf_tail_call_static(skb, &jmp_table, 0); - return 0; + barrier_var(ret); + return ret; } __auxiliary @@ -41,16 +44,22 @@ int classifier_0(struct __sk_buff *skb) static __noinline int subprog_tail1(struct __sk_buff *skb) { + int ret = 0; + bpf_tail_call_static(skb, &jmp_table, 1); - return 0; + barrier_var(ret); + return ret; } __auxiliary SEC("tc") int classifier_1(struct __sk_buff *skb) { + int ret; + count1++; - subprog_tail1(skb); + ret = subprog_tail1(skb); + __sink(ret); return 0; } @@ -59,13 +68,14 @@ __retval(33) SEC("tc") int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb) { - int ret = 0; + int ret = 0, ret1, ret2; clobber_regs_stack(); - subprog_tail0(skb); - subprog_tail1(skb); - + ret1 = subprog_tail0(skb); + ret2 = subprog_tail1(skb); + __sink(ret1); + __sink(ret2); __sink(ret); return (count1 << 16) | count0; } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c index 56b6b0099840..0ef9cfb2da8d 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c @@ -33,17 +33,24 @@ int count = 0; static __noinline int subprog_tail(struct __sk_buff *skb, void *jmp_table) { + int ret = 0; + bpf_tail_call_static(skb, jmp_table, 0); - return 0; + barrier_var(ret); + return ret; } __auxiliary SEC("tc") int classifier_0(struct __sk_buff *skb) { + int ret1, ret2; + count++; - subprog_tail(skb, &jmp_table0); - subprog_tail(skb, &jmp_table1); + ret1 = subprog_tail(skb, 
&jmp_table0); + ret2 = subprog_tail(skb, &jmp_table1); + __sink(ret1); + __sink(ret2); return count; } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c index 5261395713cd..6db9afee2095 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c @@ -18,18 +18,25 @@ int count = 0; static __noinline int subprog_tail(void *ctx) { + int ret = 0; + bpf_tail_call_static(ctx, &jmp_table, 0); - return 0; + barrier_var(ret); + return ret; } SEC("fentry/dummy") int BPF_PROG(fentry, struct sk_buff *skb) { + int ret1, ret2; + clobber_regs_stack(); count++; - subprog_tail(ctx); - subprog_tail(ctx); + ret1 = subprog_tail(ctx); + ret2 = subprog_tail(ctx); + __sink(ret1); + __sink(ret2); return 0; } diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index 9f680cf44512..4f2f3209eec8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -1120,8 +1120,11 @@ int tail_call(struct __sk_buff *sk) static __noinline int static_tail_call(struct __sk_buff *sk) { + int ret = 0; + bpf_tail_call_static(sk, &jmp_table, 0); - return 0; + barrier_var(ret); + return ret; } /* Tail calls in sub-programs invalidate packet pointers. 
*/ @@ -1144,10 +1147,12 @@ __failure __msg("invalid mem access") int invalidate_pkt_pointers_by_static_tail_call(struct __sk_buff *sk) { int *p = (void *)(long)sk->data; + int ret; if ((void *)(p + 1) > (void *)(long)sk->data_end) return TCX_DROP; - static_tail_call(sk); + ret = static_tail_call(sk); + __sink(ret); *p = 42; /* this is unsafe */ return TCX_PASS; } -- cgit v1.2.3 From e775c522a455b97db7e0a466c400f74672990bad Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Jun 2026 16:34:12 -0700 Subject: selftests/bpf: Adjust fexit_bpf2bpf ctx layout for llvm23 true signature test_pkt_access_subprog2() is defined in C as int test_pkt_access_subprog2(int val, volatile struct __sk_buff *skb) but llvm optimizes away the unused 'int val' argument. Before llvm23 the BTF signature did not match the optimized assembly, so the verifier set attach_func_proto to NULL and fell back to MAX_BPF_FUNC_REG_ARGS (5) u64 arguments (see btf_ctx_access()). The fexit ctx struct therefore placed the return value after args[5]. With llvm23 the 'true' signature int test_pkt_access_subprog2(volatile struct __sk_buff *skb) is recorded in BTF, so nr_args becomes 1 and the return value moves to the slot right after args[1]. Select the matching args_subprog2 layout based on __clang_major__ so the test works with both old and new llvm. 
Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260609233412.2712178-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c index 983b7c233382..f4bbf87b82dd 100644 --- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c @@ -53,14 +53,23 @@ int BPF_PROG(test_subprog1, struct sk_buff *skb, int ret) * r0 = *(u32 *)(r1 + 0) * w0 <<= 1 * exit - * In such case the verifier falls back to conservative and + * Before llvm23, in such case the verifier falls back to conservative and * tracing program can access arguments and return value as u64 - * instead of accurate types. + * instead of accurate types. With llvm23, the true signature + * int test_pkt_access_subprog2(volatile struct __sk_buff *skb) + * is available in btf. */ +#if __clang_major__ >= 23 +struct args_subprog2 { + __u64 args[1]; + __u64 ret; +}; +#else struct args_subprog2 { __u64 args[5]; __u64 ret; }; +#endif __u64 test_result_subprog2 = 0; SEC("fexit/test_pkt_access_subprog2") int test_subprog2(struct args_subprog2 *ctx) -- cgit v1.2.3 From 94c8d1c21be40a845357854f98ec07e21bb14bc9 Mon Sep 17 00:00:00 2001 From: Justin Suess Date: Tue, 9 Jun 2026 22:25:43 +0200 Subject: bpf: Reject bpf_obj_drop() from tracing progs bpf_obj_drop() runs bpf_obj_free_fields() synchronously for program-allocated objects. When such an object contains NMI unsafe fields, tracing programs that can run from arbitrary instrumented context can reach that destruction from unsafe contexts, including NMI. NMI is likely one instance of this problem, and other instances would include possible unsafe reentrancy. 
Deferring bpf_obj_drop() is not appealing either: it would add delayed-free machinery to a release operation that otherwise has straightforward synchronous ownership semantics. Reject bpf_obj_drop() and bpf_percpu_obj_drop() from tracing programs that may run from unsafe contexts unless every field in the object's BTF record is explicitly NMI safe. Do not reject sleepable BPF_PROG_TYPE_TRACING programs, since they are not the arbitrary/NMI contexts that motivate the restriction. Note that while bpf_rb_root and bpf_list_head would be NMI safe on their own to free, the objects recursively held by them may not be; be conservative and just mark them as not NMI safe for now. Use a whitelist for the NMI-safe field set instead of listing only known NMI unsafe fields. Locks, async fields, unreferenced kptrs, and refcounts are known to be NMI safe because their destruction is either a no-op, simple state reset, or async cancellation. Referenced kptrs, percpu referenced kptrs, uptrs, graph roots, graph nodes, and any future field type are rejected until audited for arbitrary tracing and NMI contexts. This is less susceptible to future changes in fields that were previously safe by exclusion, and to new fields being added without updating this check. Convert the existing recursive local-object drop success case to a syscall program in the same commit, since this verifier change makes the old tracing program form invalid. The test still exercises bpf_obj_drop() releasing a referenced task kptr from a safe program type. 
Fixes: ac9f06050a35 ("bpf: Introduce bpf_obj_drop") Signed-off-by: Justin Suess Co-developed-by: Kumar Kartikeya Dwivedi Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260609202548.3571690-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 29 +++++++++++++++ kernel/bpf/verifier.c | 17 +++++++++ .../testing/selftests/bpf/prog_tests/task_kfunc.c | 42 +++++++++++++++++++++- .../selftests/bpf/progs/task_kfunc_success.c | 13 ++++--- 4 files changed, 93 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 62bba7a4876f..0654d2ffadc1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -492,6 +492,35 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } +static inline bool btf_field_is_nmi_safe(enum btf_field_type type) +{ + switch (type) { + case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: + case BPF_TIMER: + case BPF_WORKQUEUE: + case BPF_TASK_WORK: + case BPF_KPTR_UNREF: + case BPF_REFCOUNT: + return true; + default: + return false; + } +} + +static inline bool btf_record_has_nmi_unsafe_fields(const struct btf_record *rec) +{ + int i; + + if (IS_ERR_OR_NULL(rec)) + return false; + for (i = 0; i < rec->cnt; i++) { + if (!btf_field_is_nmi_safe(rec->fields[i].type)) + return true; + } + return false; +} + static inline void bpf_obj_init(const struct btf_record *rec, void *obj) { int i; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 954b85609f32..eb46a81a8c51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -205,6 +205,7 @@ static int release_reference_nomark(struct bpf_verifier_state *state, int id); static int release_reference(struct bpf_verifier_env *env, int id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); +static bool is_tracing_prog_type(enum bpf_prog_type type); static int 
ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg); @@ -12881,6 +12882,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; const struct btf_type *t, *ptr_type; @@ -12957,6 +12959,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (err < 0) return err; + if ((is_bpf_obj_drop_kfunc(meta.func_id) || + is_bpf_percpu_obj_drop_kfunc(meta.func_id)) && (is_tracing_prog_type(prog_type) || + /* is_tracing_prog_type() for now doesn't cover non-iterator tracing progs. */ + (prog_type == BPF_PROG_TYPE_TRACING && env->prog->expected_attach_type != BPF_TRACE_ITER + && !env->prog->sleepable))) { + struct btf_struct_meta *struct_meta; + + struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); + if (struct_meta && btf_record_has_nmi_unsafe_fields(struct_meta->record)) { + verbose(env, "%s cannot be used in tracing programs on types with NMI unsafe fields\n", + func_name); + return -EINVAL; + } + } + if (is_bpf_rbtree_add_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_rbtree_add_callback_state); diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c index 83b90335967a..e6e95c1416e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c +++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c @@ -68,6 +68,36 @@ cleanup: task_kfunc_success__destroy(skel); } +static void run_syscall_success_test(const char *prog_name) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct task_kfunc_success *skel; + struct bpf_program *prog; + int err; + + skel = 
open_load_task_kfunc_skel(); + if (!ASSERT_OK_PTR(skel, "open_load_skel")) + return; + + if (!ASSERT_OK(skel->bss->err, "pre_run_err")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto cleanup; + + err = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + if (!ASSERT_OK(err, "bpf_prog_test_run_opts")) + goto cleanup; + if (!ASSERT_EQ(opts.retval, 0, "retval")) + goto cleanup; + + ASSERT_OK(skel->bss->err, "post_run_err"); + +cleanup: + task_kfunc_success__destroy(skel); +} + static int run_vpid_test(void *prog_name) { struct task_kfunc_success *skel; @@ -140,7 +170,6 @@ static const char * const success_tests[] = { "test_task_acquire_release_argument", "test_task_acquire_release_current", "test_task_acquire_leave_in_map", - "test_task_xchg_release", "test_task_map_acquire_release", "test_task_current_acquire_release", "test_task_from_pid_arg", @@ -151,6 +180,10 @@ static const char * const success_tests[] = { "test_task_kfunc_flavor_relo_not_found", }; +static const char * const syscall_success_tests[] = { + "test_task_xchg_release", +}; + static const char * const vpid_success_tests[] = { "test_task_from_vpid_current", "test_task_from_vpid_invalid", @@ -167,6 +200,13 @@ void test_task_kfunc(void) run_success_test(success_tests[i]); } + for (i = 0; i < ARRAY_SIZE(syscall_success_tests); i++) { + if (!test__start_subtest(syscall_success_tests[i])) + continue; + + run_syscall_success_test(syscall_success_tests[i]); + } + for (i = 0; i < ARRAY_SIZE(vpid_success_tests); i++) { if (!test__start_subtest(vpid_success_tests[i])) continue; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index 5fb4fc19d26a..d63a79ee33dc 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -140,17 +140,17 @@ int 
BPF_PROG(test_task_acquire_leave_in_map, struct task_struct *task, u64 clone return 0; } -SEC("tp_btf/task_newtask") -int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) +SEC("syscall") +int test_task_xchg_release(const void *ctx) { - struct task_struct *kptr, *acquired; + struct task_struct *task, *kptr, *acquired; struct __tasks_kfunc_map_value *v, *local; int refcnt, refcnt_after_drop; long status; - if (!is_test_kfunc_task()) - return 0; + (void)ctx; + task = bpf_get_current_task_btf(); status = tasks_kfunc_map_insert(task); if (status) { err = 1; @@ -191,7 +191,7 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) return 0; } - /* Stash a copy into local kptr and check if it is released recursively */ + /* Stash a copy into local kptr and check if it is released recursively. */ acquired = bpf_task_acquire(kptr); if (!acquired) { err = 7; @@ -220,7 +220,6 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) } bpf_task_release(kptr); - return 0; } -- cgit v1.2.3 From a3a81d247651218e47153f2d2afd7aee236726fd Mon Sep 17 00:00:00 2001 From: Justin Suess Date: Tue, 9 Jun 2026 22:25:44 +0200 Subject: bpf: Cancel special fields on map value recycle Map update and delete paths currently call bpf_obj_free_fields() when a value is being replaced or recycled. That makes field destruction depend on the context of the update/delete operation. For tracing programs this can include NMI context, where referenced kptr destructors, uptr unpinning, and graph root destruction are not generally safe. Introduce bpf_obj_cancel_fields() for the reusable-value path. It only performs NMI-safe cleanup for timer, workqueue, and task_work fields. Fields that need full destruction are left attached to the recycled value and are destroyed by the final cleanup path instead. Switch array and hashtab update/delete/recycle paths to this cancel helper. 
Keep bpf_obj_free_fields() for final map destruction and for bpf_mem_alloc destructors. Preallocated hashtabs do not have allocator destructors, so teardown continues to walk the normal and extra elements and fully destroy their fields. This deliberately relaxes the eager-free semantics of map update/delete for special fields. Programs that relied on a recycled map slot becoming empty immediately after update/delete were relying on behavior that cannot be implemented safely from every BPF execution context without offloading arbitrary destructors. There is a chance this change breaks programs making assumptions regarding the eager freeing of fields. If so, we can relax semantics to cancellation only when irqs_disabled() is true in the future. However, theoretically, map values that get reused eagerly already have weaker guarantees as parallel users can recreate freed fields before the new element becomes visible again. Fixes: 14a324f6a67e ("bpf: Wire up freeing of referenced kptr") Signed-off-by: Justin Suess Co-developed-by: Kumar Kartikeya Dwivedi Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260609202548.3571690-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 8 +-- kernel/bpf/hashtab.c | 32 +++++----- kernel/bpf/syscall.c | 5 ++ .../testing/selftests/bpf/prog_tests/htab_update.c | 4 +- .../testing/selftests/bpf/prog_tests/linked_list.c | 33 +++++----- tools/testing/selftests/bpf/prog_tests/map_kptr.c | 10 +-- .../selftests/bpf/prog_tests/refcounted_kptr.c | 8 ++- tools/testing/selftests/bpf/progs/htab_update.c | 4 +- tools/testing/selftests/bpf/progs/linked_list.c | 71 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/refcounted_kptr.c | 20 +++++- 11 files changed, 146 insertions(+), 50 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0654d2ffadc1..56f5da2b437f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2717,6 +2717,7 @@ bool 
btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); +void bpf_obj_cancel_fields(struct bpf_map *map, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e6271a2bf6d6..248b4818178c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -384,7 +384,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); - bpf_obj_free_fields(array->map.record, val); + bpf_obj_cancel_fields(map, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -392,7 +392,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - bpf_obj_free_fields(array->map.record, val); + bpf_obj_cancel_fields(map, val); } return 0; } @@ -432,14 +432,14 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, cpu = map_flags >> 32; ptr = per_cpu_ptr(pptr, cpu); copy_map_value(map, ptr, value); - bpf_obj_free_fields(array->map.record, ptr); + bpf_obj_cancel_fields(map, ptr); goto unlock; } for_each_possible_cpu(cpu) { ptr = per_cpu_ptr(pptr, cpu); val = (map_flags & BPF_F_ALL_CPUS) ? 
value : value + size * cpu; copy_map_value(map, ptr, val); - bpf_obj_free_fields(array->map.record, ptr); + bpf_obj_cancel_fields(map, ptr); } unlock: rcu_read_unlock(); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b4366cad3cfa..9f394e1aa2e8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -243,6 +243,10 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) if (IS_ERR_OR_NULL(htab->map.record)) return; + /* + * Preallocated maps do not have a bpf_mem_alloc destructor, so fully + * destroy every element, including the extra elements. + */ if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); for (i = 0; i < num_entries; i++) { @@ -833,8 +837,8 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static void check_and_free_fields(struct bpf_htab *htab, - struct htab_elem *elem) +static void check_and_cancel_fields(struct bpf_htab *htab, + struct htab_elem *elem) { if (IS_ERR_OR_NULL(htab->map.record)) return; @@ -844,11 +848,11 @@ static void check_and_free_fields(struct bpf_htab *htab, int cpu; for_each_possible_cpu(cpu) - bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); + bpf_obj_cancel_fields(&htab->map, per_cpu_ptr(pptr, cpu)); } else { void *map_value = htab_elem_value(elem, htab->map.key_size); - bpf_obj_free_fields(htab->map.record, map_value); + bpf_obj_cancel_fields(&htab->map, map_value); } } @@ -883,7 +887,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) htab_unlock_bucket(b, flags); if (l == tgt_l) - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); return l == tgt_l; } @@ -948,7 +952,7 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); @@ -1001,7 +1005,7 @@ static void free_htab_elem(struct 
bpf_htab *htab, struct htab_elem *l) if (htab_is_prealloc(htab)) { bpf_map_dec_elem_count(&htab->map); - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); pcpu_freelist_push(&htab->freelist, &l->fnode); } else { dec_elem_count(htab); @@ -1018,7 +1022,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, /* copy true value_size bytes */ ptr = this_cpu_ptr(pptr); copy_map_value(&htab->map, ptr, value); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); } else { u32 size = round_up(htab->map.value_size, 8); void *val; @@ -1028,7 +1032,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, cpu = map_flags >> 32; ptr = per_cpu_ptr(pptr, cpu); copy_map_value(&htab->map, ptr, value); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); return; } @@ -1036,7 +1040,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, ptr = per_cpu_ptr(pptr, cpu); val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; copy_map_value(&htab->map, ptr, val); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); } } } @@ -1252,11 +1256,11 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); - /* l_old has already been stashed in htab->extra_elems, free - * its special fields before it is available for reuse. + /* l_old has already been stashed in htab->extra_elems, cancel + * its reusable special fields before it is available for reuse. 
*/ if (htab_is_prealloc(htab)) - check_and_free_fields(htab, l_old); + check_and_cancel_fields(htab, l_old); } htab_unlock_bucket(b, flags); if (l_old && !htab_is_prealloc(htab)) @@ -1269,7 +1273,7 @@ err: static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { - check_and_free_fields(htab, elem); + check_and_cancel_fields(htab, elem); bpf_map_dec_elem_count(&htab->map); bpf_lru_push_free(&htab->lru, &elem->lru_node); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d4188a992bd8..7ed949f70f82 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -808,6 +808,11 @@ void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) bpf_task_work_cancel_and_free(obj + rec->task_work_off); } +void bpf_obj_cancel_fields(struct bpf_map *map, void *obj) +{ + bpf_map_free_internal_structs(map, obj); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c index ea1a6766fbe9..0a28d4346924 100644 --- a/tools/testing/selftests/bpf/prog_tests/htab_update.c +++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c @@ -23,7 +23,7 @@ static void test_reenter_update(void) if (!ASSERT_OK_PTR(skel, "htab_update__open")) return; - bpf_program__set_autoload(skel->progs.bpf_obj_free_fields, true); + bpf_program__set_autoload(skel->progs.bpf_obj_cancel_fields, true); err = htab_update__load(skel); if (!ASSERT_TRUE(!err, "htab_update__load") || err) goto out; @@ -50,7 +50,7 @@ static void test_reenter_update(void) /* * Second update: replace existing element with same key and trigger * the reentrancy of bpf_map_update_elem(). - * check_and_free_fields() calls bpf_obj_free_fields() on the old + * check_and_cancel_fields() calls bpf_obj_cancel_fields() on the old * value, which is where fentry program runs and performs a nested * bpf_map_update_elem(), triggering -EDEADLK. 
*/ diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index dbff099860ba..8defea0253ed 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -131,13 +131,14 @@ end: linked_list_fail__destroy(skel); } -static void clear_fields(struct bpf_map *map) +static void clear_fields(struct bpf_program *prog) { - char buf[24]; - int key = 0; + LIBBPF_OPTS(bpf_test_run_opts, opts); + int ret; - memset(buf, 0xff, sizeof(buf)); - ASSERT_OK(bpf_map__update_elem(map, &key, sizeof(key), buf, sizeof(buf), 0), "check_and_free_fields"); + ret = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + ASSERT_OK(ret, "clear_fields"); + ASSERT_OK(opts.retval, "clear_fields retval"); } enum { @@ -170,31 +171,31 @@ static void test_linked_list_success(int mode, bool leave_in_map) ASSERT_OK(ret, "map_list_push_pop"); ASSERT_OK(opts.retval, "map_list_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.array_map); + clear_fields(skel->progs.clear_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_push_pop), &opts); ASSERT_OK(ret, "inner_map_list_push_pop"); ASSERT_OK(opts.retval, "inner_map_list_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.inner_map); + clear_fields(skel->progs.clear_inner_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop), &opts); ASSERT_OK(ret, "global_list_push_pop"); ASSERT_OK(opts.retval, "global_list_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop_nested), &opts); ASSERT_OK(ret, "global_list_push_pop_nested"); ASSERT_OK(opts.retval, "global_list_push_pop_nested retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_nested_list); ret = 
bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_array_push_pop), &opts); ASSERT_OK(ret, "global_list_array_push_pop"); ASSERT_OK(opts.retval, "global_list_array_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_array_list); if (mode == PUSH_POP) goto end; @@ -204,19 +205,19 @@ ppm: ASSERT_OK(ret, "map_list_push_pop_multiple"); ASSERT_OK(opts.retval, "map_list_push_pop_multiple retval"); if (!leave_in_map) - clear_fields(skel->maps.array_map); + clear_fields(skel->progs.clear_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_push_pop_multiple), &opts); ASSERT_OK(ret, "inner_map_list_push_pop_multiple"); ASSERT_OK(opts.retval, "inner_map_list_push_pop_multiple retval"); if (!leave_in_map) - clear_fields(skel->maps.inner_map); + clear_fields(skel->progs.clear_inner_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop_multiple), &opts); ASSERT_OK(ret, "global_list_push_pop_multiple"); ASSERT_OK(opts.retval, "global_list_push_pop_multiple retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_list); if (mode == PUSH_POP_MULT) goto end; @@ -226,19 +227,19 @@ lil: ASSERT_OK(ret, "map_list_in_list"); ASSERT_OK(opts.retval, "map_list_in_list retval"); if (!leave_in_map) - clear_fields(skel->maps.array_map); + clear_fields(skel->progs.clear_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_in_list), &opts); ASSERT_OK(ret, "inner_map_list_in_list"); ASSERT_OK(opts.retval, "inner_map_list_in_list retval"); if (!leave_in_map) - clear_fields(skel->maps.inner_map); + clear_fields(skel->progs.clear_inner_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_in_list), &opts); ASSERT_OK(ret, "global_list_in_list"); ASSERT_OK(opts.retval, "global_list_in_list retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + 
clear_fields(skel->progs.clear_global_list); end: linked_list__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index 03b46f17cf53..ec6f2f2e8308 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -51,7 +51,6 @@ static void test_map_kptr_success(bool test_run) ret = bpf_map__update_elem(skel->maps.array_map, &key, sizeof(key), buf, sizeof(buf), 0); ASSERT_OK(ret, "array_map update"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); @@ -59,49 +58,42 @@ static void test_map_kptr_success(bool test_run) ret = bpf_map__update_elem(skel->maps.pcpu_array_map, &key, sizeof(key), pbuf, cpu * sizeof(buf), 0); ASSERT_OK(ret, "pcpu_array_map update"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "hash_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.pcpu_hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "pcpu_hash_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.hash_malloc_map, &key, sizeof(key), 0); ASSERT_OK(ret, "hash_malloc_map delete"); - skel->data->ref--; ret = 
bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.pcpu_hash_malloc_map, &key, sizeof(key), 0); ASSERT_OK(ret, "pcpu_hash_malloc_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.lru_hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "lru_hash_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.lru_pcpu_hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "lru_pcpu_hash_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); @@ -175,7 +167,7 @@ void serial_test_map_kptr(void) ASSERT_OK(kern_sync_rcu(), "sync rcu"); wait_for_map_release(); - /* Observe refcount dropping to 1 on synchronous delete elem */ + /* Observe refcount dropping to 1 on map release. 
*/ test_map_kptr_success(true); } diff --git a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c index d2c0542716a8..1737eba34323 100644 --- a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c @@ -57,6 +57,7 @@ void test_percpu_hash_refcounted_kptr_refcount_leak(void) .data_size_in = sizeof(pkt_v4), .repeat = 1, ); + LIBBPF_OPTS(bpf_test_run_opts, syscall_opts); cpu_nr = libbpf_num_possible_cpus(); if (!ASSERT_GT(cpu_nr, 0, "libbpf_num_possible_cpus")) @@ -87,8 +88,11 @@ void test_percpu_hash_refcounted_kptr_refcount_leak(void) if (!ASSERT_EQ(opts.retval, 2, "opts.retval")) goto out; - err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0); - if (!ASSERT_OK(err, "bpf_map__update_elem")) + fd = bpf_program__fd(skel->progs.clear_percpu_hash_kptr); + err = bpf_prog_test_run_opts(fd, &syscall_opts); + if (!ASSERT_OK(err, "bpf_prog_test_run_opts")) + goto out; + if (!ASSERT_EQ(syscall_opts.retval, 1, "syscall_opts.retval")) goto out; fd = bpf_program__fd(skel->progs.check_percpu_hash_refcount); diff --git a/tools/testing/selftests/bpf/progs/htab_update.c b/tools/testing/selftests/bpf/progs/htab_update.c index 195d3b2fba00..62c1b1325ec2 100644 --- a/tools/testing/selftests/bpf/progs/htab_update.c +++ b/tools/testing/selftests/bpf/progs/htab_update.c @@ -22,8 +22,8 @@ struct { int pid = 0; int update_err = 0; -SEC("?fentry/bpf_obj_free_fields") -int bpf_obj_free_fields(void *ctx) +SEC("?fentry/bpf_obj_cancel_fields") +int bpf_obj_cancel_fields(void *ctx) { __u32 key = 0; struct val value = { .payload = 1 }; diff --git a/tools/testing/selftests/bpf/progs/linked_list.c b/tools/testing/selftests/bpf/progs/linked_list.c index 421f40835acd..fa97faa5358b 100644 --- a/tools/testing/selftests/bpf/progs/linked_list.c +++ b/tools/testing/selftests/bpf/progs/linked_list.c @@ -290,6 +290,77 @@ int test_list_in_list(struct 
bpf_spin_lock *lock, struct bpf_list_head *head) return list_in_list(lock, head, true); } +#define MAX_LIST_CLEAR_NODES 256 + +static __always_inline +int clear_list(struct bpf_spin_lock *lock, struct bpf_list_head *head) +{ + struct bpf_list_node *n; + int i; + + for (i = 0; i < MAX_LIST_CLEAR_NODES; i++) { + bpf_spin_lock(lock); + n = bpf_list_pop_front(head); + bpf_spin_unlock(lock); + if (!n) + return 0; + bpf_obj_drop(container_of(n, struct foo, node2)); + } + return 1; +} + +SEC("syscall") +int clear_map_list(void *ctx) +{ + struct map_value *v; + + v = bpf_map_lookup_elem(&array_map, &(int){0}); + if (!v) + return 1; + return clear_list(&v->lock, &v->head); +} + +SEC("syscall") +int clear_inner_map_list(void *ctx) +{ + struct map_value *v; + void *map; + + map = bpf_map_lookup_elem(&map_of_maps, &(int){0}); + if (!map) + return 1; + v = bpf_map_lookup_elem(map, &(int){0}); + if (!v) + return 1; + return clear_list(&v->lock, &v->head); +} + +SEC("syscall") +int clear_global_list(void *ctx) +{ + return clear_list(&glock, &ghead); +} + +SEC("syscall") +int clear_global_nested_list(void *ctx) +{ + return clear_list(&ghead_nested.inner.lock, &ghead_nested.inner.head); +} + +SEC("syscall") +int clear_global_array_list(void *ctx) +{ + int ret; + + ret = clear_list(&glock_c, &ghead_array[0]); + if (ret) + return ret; + ret = clear_list(&glock_c, &ghead_array[1]); + if (ret) + return ret; + return clear_list(&glock_c, &ghead_array_one[0]); +} + SEC("tc") int map_list_push_pop(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c index 13de169ad68f..61906f48025c 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c @@ -1036,13 +1036,31 @@ int percpu_hash_refcount_leak(void *ctx) struct map_value *v; int key = 0; - v = bpf_map_lookup_elem(&percpu_hash, &key); + v = bpf_map_lookup_percpu_elem(&percpu_hash, &key, 0); if (!v) 
return 0; return __insert_in_list(&head, &lock, &v->node); } +SEC("syscall") +int clear_percpu_hash_kptr(void *ctx) +{ + struct node_data *n; + struct map_value *v; + int key = 0; + + v = bpf_map_lookup_percpu_elem(&percpu_hash, &key, 0); + if (!v) + return 0; + + n = bpf_kptr_xchg(&v->node, NULL); + if (!n) + return 0; + bpf_obj_drop(n); + return probe_read_refcount(); +} + SEC("tc") int check_percpu_hash_refcount(void *ctx) { -- cgit v1.2.3 From 4b84518137ce841eca2acae83096adb829dad05c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 9 Jun 2026 22:25:45 +0200 Subject: selftests/bpf: Exercise unsafe obj drops from tracing progs Add task_kfunc failure cases for bpf_obj_drop() on local objects with referenced kptr fields from tracing and NMI tracing programs. These programs must be rejected because dropping the object would run full special-field destruction synchronously in an unsafe context. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260609202548.3571690-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/task_kfunc_failure.c | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index 8e947d445f8e..8942b5478129 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -5,6 +5,7 @@ #include #include +#include "../bpf_experimental.h" #include "bpf_misc.h" #include "task_kfunc_common.h" @@ -233,6 +234,45 @@ int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_ return 0; } +SEC("tp_btf/task_newtask") +__failure __msg("bpf_obj_drop cannot be used in tracing programs on types with NMI unsafe fields") +int BPF_PROG(task_kfunc_obj_drop_with_kptr, struct task_struct *task, u64 clone_flags) +{ + struct __tasks_kfunc_map_value *local; + + local = 
bpf_obj_new(typeof(*local)); + if (!local) + return 0; + + bpf_obj_drop(local); + return 0; +} + +SEC("tp_btf/task_newtask") +__failure __msg("bpf_obj_drop cannot be used in tracing programs on types with NMI unsafe fields") +int BPF_PROG(task_kfunc_obj_drop_nmi_with_kptr, struct task_struct *task, + u64 clone_flags) +{ + struct __tasks_kfunc_map_value *local; + struct task_struct *acquired, *old; + + (void)clone_flags; + + local = bpf_obj_new(typeof(*local)); + if (!local) + return 0; + + acquired = bpf_task_acquire(task); + if (acquired) { + old = bpf_kptr_xchg(&local->task, acquired); + if (old) + bpf_task_release(old); + } + + bpf_obj_drop(local); + return 0; +} + SEC("tp_btf/task_newtask") __failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 clone_flags) -- cgit v1.2.3 From 2e7c6cb4d8437a2fe7cd95aac7ca53d7eb05e9f4 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 9 Jun 2026 22:25:46 +0200 Subject: selftests/bpf: Exercise kptr map update lifetime Add focused map_kptr coverage for BPF-side map updates that touch values containing referenced kptrs. The new syscall programs stash the testmod refcounted object in an array map, a preallocated hash map, and a no-prealloc hash map, then update the same map from BPF. The refcount must remain elevated after the update, while the userspace runner destroys the skeleton and reuses the existing refcount wait to confirm map teardown releases the kptr. 
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260609202548.3571690-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/map_kptr.c | 56 ++++++++++++++ tools/testing/selftests/bpf/progs/map_kptr.c | 89 ++++++++++++++++++++++- 2 files changed, 142 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index ec6f2f2e8308..17e707dddda8 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -143,12 +143,68 @@ static void wait_for_map_release(void) map_kptr__destroy(skel); } +enum map_update_kptr_case { + MAP_UPDATE_KPTR_ARRAY, + MAP_UPDATE_KPTR_HASH, + MAP_UPDATE_KPTR_HASH_MALLOC, +}; + +static struct bpf_program *map_update_kptr_prog(struct map_kptr *skel, + enum map_update_kptr_case test) +{ + switch (test) { + case MAP_UPDATE_KPTR_ARRAY: + return skel->progs.test_array_map_update_kptr; + case MAP_UPDATE_KPTR_HASH: + return skel->progs.test_hash_map_update_kptr; + case MAP_UPDATE_KPTR_HASH_MALLOC: + return skel->progs.test_hash_malloc_map_update_kptr; + } + + return NULL; +} + +static void test_map_update_kptr(enum map_update_kptr_case test) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct map_kptr *skel; + struct bpf_program *prog; + int ret; + + skel = map_kptr__open_and_load(); + if (!ASSERT_OK_PTR(skel, "map_kptr__open_and_load")) + return; + + prog = map_update_kptr_prog(skel, test); + if (!ASSERT_OK_PTR(prog, "map_update_kptr_prog")) + goto out; + + ret = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + if (!ASSERT_OK(ret, "map_update_kptr")) + goto out; + if (!ASSERT_OK(opts.retval, "map_update_kptr retval")) + goto out; + + ASSERT_EQ(skel->bss->num_of_refs, 3, "refs_after_update"); + +out: + map_kptr__destroy(skel); + wait_for_map_release(); +} + void serial_test_map_kptr(void) { struct rcu_tasks_trace_gp *skel; 
RUN_TESTS(map_kptr_fail); + if (test__start_subtest("update_array_map_kptr")) + test_map_update_kptr(MAP_UPDATE_KPTR_ARRAY); + if (test__start_subtest("update_hash_map_kptr")) + test_map_update_kptr(MAP_UPDATE_KPTR_HASH); + if (test__start_subtest("update_hash_malloc_map_kptr")) + test_map_update_kptr(MAP_UPDATE_KPTR_HASH_MALLOC); + skel = rcu_tasks_trace_gp__open_and_load(); if (!ASSERT_OK_PTR(skel, "rcu_tasks_trace_gp__open_and_load")) return; diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index e708ffbe1f61..3fbefc568e0a 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -489,8 +489,7 @@ int test_map_kptr_ref3(struct __sk_buff *ctx) int num_of_refs; -SEC("syscall") -int count_ref(void *ctx) +static __always_inline int read_ref_count(void) { struct prog_test_ref_kfunc *p; unsigned long arg = 0; @@ -500,11 +499,95 @@ int count_ref(void *ctx) return 1; num_of_refs = p->cnt.refs.counter; - bpf_kfunc_call_test_release(p); return 0; } +SEC("syscall") +int count_ref(void *ctx) +{ + return read_ref_count(); +} + +static __always_inline int stash_ref_ptr(struct map_value *v) +{ + struct prog_test_ref_kfunc *p, *old; + unsigned long arg = 0; + + p = bpf_kfunc_call_test_acquire(&arg); + if (!p) + return 1; + + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) { + bpf_kfunc_call_test_release(old); + old = bpf_kptr_xchg(&v->ref_ptr, NULL); + if (old) + bpf_kfunc_call_test_release(old); + return 2; + } + return 0; +} + +static __always_inline int check_refs(int expected) +{ + int ret; + + ret = read_ref_count(); + if (ret) + return ret; + return num_of_refs == expected ? 
0 : 3; +} + +SEC("syscall") +int test_array_map_update_kptr(void *ctx) +{ + struct map_value init = {}, *v; + int key = 0, ret; + + v = bpf_map_lookup_elem(&array_map, &key); + if (!v) + return 1; + ret = stash_ref_ptr(v); + if (ret) + return ret; + ret = check_refs(3); + if (ret) + return ret; + ret = bpf_map_update_elem(&array_map, &key, &init, BPF_EXIST); + if (ret) + return 4; + return check_refs(3); +} + +#define DEFINE_HASH_UPDATE_KPTR_TEST(name, map) \ +SEC("syscall") \ +int name(void *ctx) \ +{ \ + struct map_value init = {}, *v; \ + int key = 0, ret; \ + \ + ret = bpf_map_update_elem(&map, &key, &init, BPF_NOEXIST); \ + if (ret) \ + return 1; \ + v = bpf_map_lookup_elem(&map, &key); \ + if (!v) \ + return 2; \ + ret = stash_ref_ptr(v); \ + if (ret) \ + return ret; \ + ret = check_refs(3); \ + if (ret) \ + return ret; \ + ret = bpf_map_update_elem(&map, &key, &init, BPF_EXIST); \ + if (ret) \ + return 4; \ + return check_refs(3); \ +} + +DEFINE_HASH_UPDATE_KPTR_TEST(test_hash_map_update_kptr, hash_map) +DEFINE_HASH_UPDATE_KPTR_TEST(test_hash_malloc_map_update_kptr, hash_malloc_map) + SEC("syscall") int test_ls_map_kptr_ref1(void *ctx) { -- cgit v1.2.3 From 2e8ad1ff712d2a397e407c9fde60901f68d077dc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 9 Jun 2026 22:18:31 -0700 Subject: selftests/bpf: Fix bpf_iter/task_vma test For selftest bpf_iter/task_vma, I got a failure like below on my qemu run: test_task_vma_common:FAIL:compare_output unexpected compare_output: actual '561593546000-561593585000r--p0000000000:241256579534/root/devshare/bpf-next/tools/testing/selftests/bpf/test_progs' != expected '561593546000-561593585000r--p0000000000:245551546830/root/devshare/bpf-next/tools/testing/selftests/bpf/test_progs' Further debugging found out file->f_inode->i_ino value may exceed 32bit, e.g., i_ino = 0x14c2eae35, but the format string is '%u'. This caused inode mismatch between bpf iter and proc result. 
Fix the issue by using format string '%llu' to accommodate 64bit i_ino. Fixes: e8168840e16c ("selftests/bpf: Add test for bpf_iter_task_vma") Signed-off-by: Yonghong Song Acked-by: Leon Hwang Link: https://lore.kernel.org/r/20260610051831.1346659-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c index d64ba7ddaed5..d7fb561ed4fb 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c @@ -52,7 +52,7 @@ SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx) bpf_d_path(&file->f_path, d_path_buf, D_PATH_BUF_SIZE); BPF_SEQ_PRINTF(seq, "%08llx ", vma->vm_pgoff << 12); - BPF_SEQ_PRINTF(seq, "%02x:%02x %u", MAJOR(dev), MINOR(dev), + BPF_SEQ_PRINTF(seq, "%02x:%02x %llu", MAJOR(dev), MINOR(dev), file->f_inode->i_ino); BPF_SEQ_PRINTF(seq, "\t%s\n", d_path_buf); } else { -- cgit v1.2.3 From 10627ddc0167aab5c1c390a10ef461e9937aba08 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Jun 2026 12:55:38 +0200 Subject: bpf: Tighten cgroup storage cookie checks for prog arrays The fix in commit abad3d0bad72 ("bpf: Fix oob access in cgroup local storage") is still incomplete. The prog-array compatibility check treats a program with no cgroup storage as compatible with any stored storage cookie. This allows a storage-less program to bridge a tail call chain between an entry program and a storage-using callee even though cgroup local storage at runtime still follows the caller's context, that is, A -> B(no storage) -> C(storage) path. Requiring exact cookie equality would break the legitimate case of a storage-less leaf program being tail called from a storage-using one. 
Instead, only accept a zero storage cookie if the program cannot perform tail calls itself. This keeps A -> B(no storage) working while rejecting the A -> B(no storage) -> C(storage) bridge. Fixes: abad3d0bad72 ("bpf: Fix oob access in cgroup local storage") Reported-by: Lin Ma Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260610105539.705887-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a656a8572bdb..649cce41e13f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2481,7 +2481,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, cookie = aux->cgroup_storage[i] ? aux->cgroup_storage[i]->cookie : 0; ret = map->owner->storage_cookie[i] == cookie || - !cookie; + (!cookie && !aux->tail_call_reachable); } if (ret && map->owner->attach_func_proto != aux->attach_func_proto) { -- cgit v1.2.3 From 30dee2c176e7954f63d1fa3e52d172f30beb9bfb Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 10 Jun 2026 12:55:39 +0200 Subject: selftests/bpf: Cover tail-call cgroup storage prog-array checks Add tail-call selftests for prog-array ownership when cgroup storage is in use. Verify that loading succeeds when callers and callees reuse the owner's cgroup storage map, and that loading fails for a different storage map and for the A(storage) -> B(no storage) -> C(storage) bridge case addressed in the previous commit. Also verify that a storage-less leaf program which cannot perform tail calls itself is still allowed to join a storage-owned prog array, while a storage-less tail-caller is rejected also at map update time. # LDLIBS=-static PKG_CONFIG='pkg-config --static' ./vmtest.sh -- ./test_progs -t tailcalls [...] 
#475/25 tailcalls/tailcall_freplace:OK #475/26 tailcalls/tailcall_bpf2bpf_freplace:OK #475/27 tailcalls/tailcall_failure:OK #475/28 tailcalls/reject_tail_call_spin_lock:OK #475/29 tailcalls/reject_tail_call_rcu_lock:OK #475/30 tailcalls/reject_tail_call_preempt_lock:OK #475/31 tailcalls/reject_tail_call_ref:OK #475/32 tailcalls/tailcall_sleepable:OK #475/33 tailcalls/tailcall_cgrp_storage:OK #475/34 tailcalls/tailcall_cgrp_storage_diff_storage:OK #475/35 tailcalls/tailcall_cgrp_storage_no_storage:OK #475/36 tailcalls/tailcall_cgrp_storage_no_storage_leaf:OK #475/37 tailcalls/tailcall_cgrp_storage_no_storage_bridge:OK #475 tailcalls:OK Summary: 1/37 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Lin Ma Signed-off-by: Rongzhen Cui Signed-off-by: Jingguo Tan Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260610105539.705887-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/tailcalls.c | 186 +++++++++++++++++++++ .../selftests/bpf/progs/tailcall_cgrp_storage.c | 44 +++++ .../bpf/progs/tailcall_cgrp_storage_no_storage.c | 26 +++ .../bpf/progs/tailcall_cgrp_storage_owner.c | 32 ++++ 4 files changed, 288 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_cgrp_storage.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_no_storage.c create mode 100644 tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_owner.c diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index 7d534fde0af9..a5a226d0104c 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -8,6 +8,9 @@ #include "tailcall_freplace.skel.h" #include "tc_bpf2bpf.skel.h" #include "tailcall_fail.skel.h" +#include "tailcall_cgrp_storage_owner.skel.h" +#include "tailcall_cgrp_storage_no_storage.skel.h" +#include "tailcall_cgrp_storage.skel.h" #include 
"tailcall_sleepable.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations @@ -1654,6 +1657,179 @@ static void test_tailcall_failure() RUN_TESTS(tailcall_fail); } +static void test_tailcall_cgrp_storage(void) +{ + struct tailcall_cgrp_storage_owner *owner_skel = NULL; + struct tailcall_cgrp_storage *skel = NULL; + int err, key = 0, prog_array_fd, prog_fd, storage_map_fd; + + owner_skel = tailcall_cgrp_storage_owner__open_and_load(); + if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load")) + return; + + prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array); + storage_map_fd = bpf_map__fd(owner_skel->maps.storage_map); + + skel = tailcall_cgrp_storage__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage__open")) + goto out; + + err = bpf_map__reuse_fd(skel->maps.prog_array, prog_array_fd); + if (!ASSERT_OK(err, "reuse_prog_array")) + goto out; + + err = bpf_map__reuse_fd(skel->maps.storage_map, storage_map_fd); + if (!ASSERT_OK(err, "reuse_storage_map")) + goto out; + + err = bpf_object__load(skel->obj); + if (!ASSERT_OK(err, "tailcall_cgrp_storage__load")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.callee_prog); + err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY); + ASSERT_OK(err, "update_prog_array"); +out: + tailcall_cgrp_storage__destroy(skel); + tailcall_cgrp_storage_owner__destroy(owner_skel); +} + +static void test_tailcall_cgrp_storage_diff_storage(void) +{ + struct tailcall_cgrp_storage_owner *owner_skel = NULL; + struct tailcall_cgrp_storage *skel = NULL; + int err, prog_array_fd; + + owner_skel = tailcall_cgrp_storage_owner__open_and_load(); + if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load")) + return; + + prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array); + + skel = tailcall_cgrp_storage__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage__open")) + goto out; + + err = bpf_map__reuse_fd(skel->maps.prog_array, prog_array_fd); + if (!ASSERT_OK(err, "reuse_prog_array")) + 
goto out; + + err = bpf_object__load(skel->obj); + ASSERT_ERR(err, "tailcall_cgrp_storage__load"); +out: + tailcall_cgrp_storage__destroy(skel); + tailcall_cgrp_storage_owner__destroy(owner_skel); +} + +static void test_tailcall_cgrp_storage_no_storage(void) +{ + struct tailcall_cgrp_storage_owner *owner_skel = NULL; + struct tailcall_cgrp_storage_no_storage *skel = NULL; + int err, prog_array_fd; + + owner_skel = tailcall_cgrp_storage_owner__open_and_load(); + if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load")) + return; + + prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array); + + skel = tailcall_cgrp_storage_no_storage__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage_no_storage__open")) + goto out; + + err = bpf_map__reuse_fd(skel->maps.prog_array, prog_array_fd); + if (!ASSERT_OK(err, "reuse_prog_array")) + goto out; + + err = bpf_object__load(skel->obj); + ASSERT_ERR(err, "tailcall_cgrp_storage_no_storage__load"); +out: + tailcall_cgrp_storage_no_storage__destroy(skel); + tailcall_cgrp_storage_owner__destroy(owner_skel); +} + +static void test_tailcall_cgrp_storage_no_storage_leaf(void) +{ + struct tailcall_cgrp_storage_owner *owner_skel = NULL; + struct tailcall_cgrp_storage_no_storage *skel = NULL; + int err, key = 0, prog_array_fd, prog_fd; + + owner_skel = tailcall_cgrp_storage_owner__open_and_load(); + if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load")) + return; + + prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array); + + skel = tailcall_cgrp_storage_no_storage__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage_no_storage__open_and_load")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.leaf_prog); + err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update_prog_array_leaf")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.caller_prog); + err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY); + ASSERT_ERR(err, "update_prog_array_bridge"); +out: + 
tailcall_cgrp_storage_no_storage__destroy(skel); + tailcall_cgrp_storage_owner__destroy(owner_skel); +} + +static void test_tailcall_cgrp_storage_no_storage_bridge(void) +{ + struct tailcall_cgrp_storage_owner *owner_skel = NULL; + struct tailcall_cgrp_storage_no_storage *bridge_skel = NULL; + struct tailcall_cgrp_storage *callee_skel = NULL; + int err, key = 0, prog_array_fd, prog_fd, storage_map_fd; + + owner_skel = tailcall_cgrp_storage_owner__open_and_load(); + if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load")) + return; + + prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array); + storage_map_fd = bpf_map__fd(owner_skel->maps.storage_map); + + callee_skel = tailcall_cgrp_storage__open(); + if (!ASSERT_OK_PTR(callee_skel, "tailcall_cgrp_storage__open")) + goto out; + + bpf_program__set_autoload(callee_skel->progs.caller_prog, false); + + err = bpf_map__reuse_fd(callee_skel->maps.prog_array, prog_array_fd); + if (!ASSERT_OK(err, "reuse_prog_array")) + goto out; + + err = bpf_map__reuse_fd(callee_skel->maps.storage_map, storage_map_fd); + if (!ASSERT_OK(err, "reuse_storage_map")) + goto out; + + err = bpf_object__load(callee_skel->obj); + if (!ASSERT_OK(err, "tailcall_cgrp_storage__load")) + goto out; + + prog_fd = bpf_program__fd(callee_skel->progs.callee_prog); + err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update_prog_array")) + goto out; + + bridge_skel = tailcall_cgrp_storage_no_storage__open(); + if (!ASSERT_OK_PTR(bridge_skel, "tailcall_cgrp_storage_no_storage__open")) + goto out; + + err = bpf_map__reuse_fd(bridge_skel->maps.prog_array, prog_array_fd); + if (!ASSERT_OK(err, "reuse_prog_array")) + goto out; + + err = bpf_object__load(bridge_skel->obj); + ASSERT_ERR(err, "tailcall_cgrp_storage_no_storage_bridge__load"); +out: + tailcall_cgrp_storage_no_storage__destroy(bridge_skel); + tailcall_cgrp_storage__destroy(callee_skel); + tailcall_cgrp_storage_owner__destroy(owner_skel); +} + noinline void 
uprobe_sleepable_trigger(void) { asm volatile (""); @@ -1781,4 +1957,14 @@ void test_tailcalls(void) test_tailcall_failure(); if (test__start_subtest("tailcall_sleepable")) test_tailcall_sleepable(); + if (test__start_subtest("tailcall_cgrp_storage")) + test_tailcall_cgrp_storage(); + if (test__start_subtest("tailcall_cgrp_storage_diff_storage")) + test_tailcall_cgrp_storage_diff_storage(); + if (test__start_subtest("tailcall_cgrp_storage_no_storage")) + test_tailcall_cgrp_storage_no_storage(); + if (test__start_subtest("tailcall_cgrp_storage_no_storage_leaf")) + test_tailcall_cgrp_storage_no_storage_leaf(); + if (test__start_subtest("tailcall_cgrp_storage_no_storage_bridge")) + test_tailcall_cgrp_storage_no_storage_bridge(); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage.c b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage.c new file mode 100644 index 000000000000..4dd3a0033d75 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, __u64); +} storage_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} prog_array SEC(".maps"); + +SEC("cgroup_skb/egress") +int caller_prog(struct __sk_buff *skb) +{ + __u64 *storage; + + storage = bpf_get_local_storage(&storage_map, 0); + if (storage) + *storage = 1; + + bpf_tail_call(skb, &prog_array, 0); + return 1; +} + +SEC("cgroup_skb/egress") +int callee_prog(struct __sk_buff *skb) +{ + __u64 *storage; + + storage = bpf_get_local_storage(&storage_map, 0); + if (storage) + *storage = 1; + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_no_storage.c 
b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_no_storage.c new file mode 100644 index 000000000000..5c69b0af6ff9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_no_storage.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} prog_array SEC(".maps"); + +SEC("cgroup_skb/egress") +int caller_prog(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &prog_array, 0); + return 1; +} + +SEC("cgroup_skb/egress") +int leaf_prog(struct __sk_buff *skb) +{ + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_owner.c b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_owner.c new file mode 100644 index 000000000000..d7e8ec9855c5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_owner.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, __u64); +} storage_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} prog_array SEC(".maps"); + +SEC("cgroup_skb/egress") +int prog_array_owner(struct __sk_buff *skb) +{ + __u64 *storage; + + storage = bpf_get_local_storage(&storage_map, 0); + if (storage) + *storage = 1; + + bpf_tail_call(skb, &prog_array, 0); + return 1; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 6001896f00984d317fb75160ba05c4a885fbe2a0 Mon Sep 17 00:00:00 2001 From: Sun Jian Date: Fri, 12 Jun 2026 19:40:31 +0800 Subject: bpf: Run generic devmap egress prog on private skb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generic XDP 
devmap multi redirect uses skb_clone() for intermediate destinations and sends the last destination with the original skb. This can leave multiple destinations sharing the same packet data. This becomes visible after generic devmap egress-program support was added: a devmap egress program may mutate packet data, and another destination sharing the same data can observe that mutation. Native XDP broadcast redirect does not have this issue because xdpf_clone() copies the frame data for each destination. Generic XDP should provide the same per-destination isolation before running a devmap egress program. Fix this by making cloned skbs private before running the generic devmap egress program. Use skb_copy() instead of skb_unshare() so allocation failure does not consume the skb and the existing caller error paths keep their ownership semantics. Fixes: 2ea5eabaf04a ("bpf: devmap: Implement devmap prog execution for generic XDP") Suggested-by: Jiayuan Chen Suggested-by: Jakub Kicinski Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Sun Jian Link: https://lore.kernel.org/r/20260612114032.244616-2-sun.jian.kdev@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 5b9eac5342a9..dc7b859e8bbf 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -710,6 +710,18 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, if (unlikely(err)) return err; + if (dst->xdp_prog && skb_cloned(skb)) { + struct sk_buff *nskb; + + nskb = skb_copy(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + nskb->mac_len = skb->mac_len; + consume_skb(skb); + skb = nskb; + } + /* Redirect has already succeeded semantically at this point, so we just * return 0 even if packet is dropped. Helper below takes care of * freeing skb. 
-- cgit v1.2.3 From f0eff94d07cda9bd71754d95af4301cd437020b8 Mon Sep 17 00:00:00 2001 From: Sun Jian Date: Fri, 12 Jun 2026 19:40:32 +0800 Subject: selftests/bpf: Cover generic devmap egress last-dst rewrite Strengthen xdp_veth_egress to check that each destination observes the MAC selected for its own egress ifindex, instead of only checking that the observed MAC differs from a single magic value. Add a generic XDP last-destination test where an earlier destination does not have a devmap egress program while the final destination does. This covers the case where the final destination runs on the original skb and could otherwise rewrite packet data still shared with an earlier cloned skb. Use deterministic DEVMAP_HASH keys for the egress map so the intended last destination is stable. Initialize the result map with a sentinel value and check that store_mac_1 overwrites it before checking that the earlier destination did not observe the MAC written by the final destination. Suggested-by: Jiayuan Chen Signed-off-by: Sun Jian Link: https://lore.kernel.org/r/20260612114032.244616-3-sun.jian.kdev@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/test_xdp_veth.c | 166 ++++++++++++++++++++- 1 file changed, 163 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c index 3e98a1665936..1675b32753a8 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c +++ b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c @@ -456,7 +456,11 @@ static void xdp_veth_egress(u32 flags) .remote_flags = flags, } }; - const char magic_mac[6] = { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF}; + const unsigned char egress_macs[VETH_PAIRS_COUNT][ETH_ALEN] = { + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x01 }, + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x02 }, + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x03 }, + }; struct xdp_redirect_multi_kern *xdp_redirect_multi_kern; struct bpf_object 
*bpf_objs[VETH_EGRESS_SKEL_NB]; struct xdp_redirect_map *xdp_redirect_map; @@ -512,7 +516,13 @@ static void xdp_veth_egress(u32 flags) &net_config, prog_cfg, i)) goto destroy_xdp_redirect_map; - err = bpf_map_update_elem(mac_map, &ifindex, magic_mac, 0); + { + __be64 mac = 0; + + memcpy(&mac, egress_macs[i], ETH_ALEN); + err = bpf_map_update_elem(mac_map, &ifindex, &mac, 0); + } + if (!ASSERT_OK(err, "bpf_map_update_elem")) goto destroy_xdp_redirect_map; @@ -531,15 +541,162 @@ static void xdp_veth_egress(u32 flags) for (i = 0; i < 2; i++) { u32 key = i; + __be64 expected = 0; u64 res; err = bpf_map_lookup_elem(res_map, &key, &res); if (!ASSERT_OK(err, "get MAC res")) goto destroy_xdp_redirect_map; - ASSERT_STRNEQ((const char *)&res, magic_mac, ETH_ALEN, "compare mac"); + /* store_mac_1/2 run on the second/third remote veths. */ + memcpy(&expected, egress_macs[i + 1], ETH_ALEN); + ASSERT_EQ(res, expected, "compare mac"); + } + +destroy_xdp_redirect_map: + close_netns(nstoken); + xdp_redirect_map__destroy(xdp_redirect_map); +destroy_xdp_redirect_multi_kern: + xdp_redirect_multi_kern__destroy(xdp_redirect_multi_kern); +destroy_xdp_dummy: + xdp_dummy__destroy(xdp_dummy); + + cleanup_network(&net_config); +} + +static void xdp_veth_egress_last_dst(u32 flags) +{ + struct prog_configuration prog_cfg[VETH_PAIRS_COUNT] = { + { + .local_name = "xdp_redirect_map_all_prog", + .remote_name = "xdp_dummy_prog", + .local_flags = flags, + .remote_flags = flags, + }, + { + .local_name = "xdp_redirect_map_all_prog", + .remote_name = "store_mac_1", + .local_flags = flags, + .remote_flags = flags, + }, + { + .local_name = "xdp_redirect_map_all_prog", + .remote_name = "xdp_dummy_prog", + .local_flags = flags, + .remote_flags = flags, + } + }; + const unsigned char egress_macs[VETH_PAIRS_COUNT][ETH_ALEN] = { + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x01 }, + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x02 }, + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x03 }, + }; + struct xdp_redirect_multi_kern 
*xdp_redirect_multi_kern; + struct bpf_object *bpf_objs[VETH_EGRESS_SKEL_NB]; + struct xdp_redirect_map *xdp_redirect_map; + struct net_configuration net_config = {}; + int mac_map, egress_map, res_map; + struct nstoken *nstoken = NULL; + struct xdp_dummy *xdp_dummy; + __be64 sentinel_mac = 0; + __be64 last_mac = 0; + __be64 res; + u32 key; + int err; + int i; + + xdp_dummy = xdp_dummy__open_and_load(); + if (!ASSERT_OK_PTR(xdp_dummy, "xdp_dummy__open_and_load")) + return; + + xdp_redirect_multi_kern = xdp_redirect_multi_kern__open_and_load(); + if (!ASSERT_OK_PTR(xdp_redirect_multi_kern, "xdp_redirect_multi_kern__open_and_load")) + goto destroy_xdp_dummy; + + xdp_redirect_map = xdp_redirect_map__open_and_load(); + if (!ASSERT_OK_PTR(xdp_redirect_map, "xdp_redirect_map__open_and_load")) + goto destroy_xdp_redirect_multi_kern; + + if (!ASSERT_OK(create_network(&net_config), "create network")) + goto destroy_xdp_redirect_map; + + mac_map = bpf_map__fd(xdp_redirect_multi_kern->maps.mac_map); + if (!ASSERT_OK_FD(mac_map, "open mac_map")) + goto destroy_xdp_redirect_map; + + egress_map = bpf_map__fd(xdp_redirect_multi_kern->maps.map_egress); + if (!ASSERT_OK_FD(egress_map, "open map_egress")) + goto destroy_xdp_redirect_map; + + bpf_objs[0] = xdp_dummy->obj; + bpf_objs[1] = xdp_redirect_multi_kern->obj; + bpf_objs[2] = xdp_redirect_map->obj; + + nstoken = open_netns(net_config.ns0_name); + if (!ASSERT_OK_PTR(nstoken, "open NS0")) + goto destroy_xdp_redirect_map; + + for (i = 0; i < VETH_PAIRS_COUNT; i++) { + struct bpf_devmap_val devmap_val = {}; + int ifindex = if_nametoindex(net_config.veth_cfg[i].local_veth); + u32 key = i; + + SYS(destroy_xdp_redirect_map, + "ip -n %s neigh add %s lladdr 00:00:00:00:00:01 dev %s", + net_config.veth_cfg[i].namespace, IP_NEIGH, + net_config.veth_cfg[i].remote_veth); + + if (attach_programs_to_veth_pair(bpf_objs, VETH_EGRESS_SKEL_NB, + &net_config, prog_cfg, i)) + goto destroy_xdp_redirect_map; + + { + __be64 mac = 0; + + memcpy(&mac, 
egress_macs[i], ETH_ALEN); + err = bpf_map_update_elem(mac_map, &ifindex, &mac, 0); + } + + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto destroy_xdp_redirect_map; + + devmap_val.ifindex = ifindex; + devmap_val.bpf_prog.fd = -1; + + if (i == VETH_PAIRS_COUNT - 1) + devmap_val.bpf_prog.fd = + bpf_program__fd(xdp_redirect_multi_kern->progs.xdp_devmap_prog); + + err = bpf_map_update_elem(egress_map, &key, &devmap_val, 0); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto destroy_xdp_redirect_map; } + res_map = bpf_map__fd(xdp_redirect_map->maps.rx_mac); + if (!ASSERT_OK_FD(res_map, "open rx_map")) + goto destroy_xdp_redirect_map; + + memcpy(&sentinel_mac, egress_macs[0], ETH_ALEN); + memcpy(&last_mac, egress_macs[VETH_PAIRS_COUNT - 1], ETH_ALEN); + + key = 0; + err = bpf_map_update_elem(res_map, &key, &sentinel_mac, 0); + if (!ASSERT_OK(err, "init rx mac")) + goto destroy_xdp_redirect_map; + + SYS_NOFAIL("ip netns exec %s ping %s -i 0.1 -c 4 -W1 > /dev/null ", + net_config.veth_cfg[0].namespace, IP_NEIGH); + + err = bpf_map_lookup_elem(res_map, &key, &res); + if (!ASSERT_OK(err, "get MAC res")) + goto destroy_xdp_redirect_map; + + if (!ASSERT_NEQ(res, sentinel_mac, "rx_mac overwritten by store_mac_1")) + goto destroy_xdp_redirect_map; + + if (!ASSERT_NEQ(res, last_mac, "earlier dst not rewritten by last dst")) + goto destroy_xdp_redirect_map; + destroy_xdp_redirect_map: close_netns(nstoken); xdp_redirect_map__destroy(xdp_redirect_map); @@ -596,4 +753,7 @@ void test_xdp_veth_egress(void) if (test__start_subtest("SKB_MODE/egress")) xdp_veth_egress(XDP_FLAGS_SKB_MODE); + + if (test__start_subtest("SKB_MODE/egress_last_dst")) + xdp_veth_egress_last_dst(XDP_FLAGS_SKB_MODE); } -- cgit v1.2.3 From 4c71303c837449158815c521fcee4ec3b8721dbd Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Wed, 10 Jun 2026 20:17:23 +0000 Subject: bpf: Fix setting retval to -EPERM for cgroup hooks not returning errno When a cgroup BPF program exits with 0, 
bpf_prog_run_array_cg() sets the hook return value to -EPERM if it is not a valid errno. This is correct for errno-based hooks, which return 0 on success and negative errno on failure, but wrong for boolean and void LSM hooks. Boolean LSM hooks should only return true or false, and void LSM hooks have no return value at all. Fix it by skipping setting -EPERM for hooks not returning errno. Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260610201724.733943-2-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_lsm.h | 6 ++++++ kernel/bpf/bpf_lsm.c | 20 ++++++++++++++++++++ kernel/bpf/cgroup.c | 47 ++++++++++++++++++++++++++++++++++------------- 3 files changed, 60 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 643809cc78c3..143775a27a2a 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -52,6 +52,7 @@ int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, const struct bpf_dynptr *value_p, int flags); int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str); bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog); +bool bpf_lsm_hook_returns_errno(u32 btf_id); #else /* !CONFIG_BPF_LSM */ @@ -104,6 +105,11 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog) { return false; } + +static inline bool bpf_lsm_hook_returns_errno(u32 btf_id) +{ + return true; +} #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index c5c925f00202..564071a92d7d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -427,6 +427,26 @@ BTF_ID(func, bpf_lsm_audit_rule_known) BTF_ID(func, bpf_lsm_inode_xattr_skipcap) BTF_SET_END(bool_lsm_hooks) +/* hooks returning void */ +#define LSM_HOOK_void(DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME) +#define LSM_HOOK_int(DEFAULT, NAME, ...) 
/* nothing */ +#define LSM_HOOK(RET, DEFAULT, NAME, ...) LSM_HOOK_##RET(DEFAULT, NAME, __VA_ARGS__) +BTF_SET_START(void_lsm_hooks) +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK +#undef LSM_HOOK_void +#undef LSM_HOOK_int +BTF_SET_END(void_lsm_hooks) + +bool bpf_lsm_hook_returns_errno(u32 btf_id) +{ + if (btf_id_set_contains(&bool_lsm_hooks, btf_id)) + return false; + if (btf_id_set_contains(&void_lsm_hooks, btf_id)) + return false; + return true; +} + int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *retval_range) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 35d1f1428ef3..83ce66296ac1 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -55,6 +55,28 @@ void __init cgroup_bpf_lifetime_notifier_init(void) &cgroup_bpf_lifetime_nb)); } +#ifdef CONFIG_BPF_LSM +struct cgroup_lsm_atype { + u32 attach_btf_id; + int refcnt; + bool returns_errno; +}; + +static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; + +static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) +{ + if (atype >= CGROUP_LSM_START && atype <= CGROUP_LSM_END) + return READ_ONCE(cgroup_lsm_atype[atype - CGROUP_LSM_START].returns_errno); + return true; +} +#else +static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) +{ + return true; +} +#endif + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. 
*/ @@ -83,7 +105,8 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, *(ret_flags) |= (func_ret >> 1); func_ret &= 1; } - if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval)) + if (!func_ret && cgroup_bpf_hook_returns_errno(atype) && + !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; item++; } @@ -156,13 +179,6 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, } #ifdef CONFIG_BPF_LSM -struct cgroup_lsm_atype { - u32 attach_btf_id; - int refcnt; -}; - -static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; - static enum cgroup_bpf_attach_type bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) { @@ -191,10 +207,13 @@ void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) lockdep_assert_held(&cgroup_mutex); - WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id && - cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); - - cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + if (!cgroup_lsm_atype[i].attach_btf_id) { + cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, + bpf_lsm_hook_returns_errno(attach_btf_id)); + } else { + WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); + } cgroup_lsm_atype[i].refcnt++; } @@ -203,8 +222,10 @@ void bpf_cgroup_atype_put(int cgroup_atype) int i = cgroup_atype - CGROUP_LSM_START; cgroup_lock(); - if (--cgroup_lsm_atype[i].refcnt <= 0) + if (--cgroup_lsm_atype[i].refcnt <= 0) { + WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, true); cgroup_lsm_atype[i].attach_btf_id = 0; + } WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); cgroup_unlock(); } -- cgit v1.2.3 From cec8423776176eb73429443ecb859789af9602e5 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Wed, 10 Jun 2026 20:17:24 +0000 Subject: selftests/bpf: Add retval test for bool and errno LSM cgroup hooks Add test to check the return value when a BPF program exits with 0 for a boolean and an errno LSM hook. For each hook, two BPF programs are attached. 
The first program returns 0 without calling bpf_set_retval() to exercise the return value translation logic, while the second program reads the retval via bpf_get_retval(). Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260610201724.733943-3-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/lsm_cgroup.c | 79 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/lsm_cgroup.c | 30 ++++++++ 2 files changed, 109 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c index 6df25de8f080..41e867467f6c 100644 --- a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -309,11 +310,89 @@ static void test_lsm_cgroup_nonvoid(void) lsm_cgroup_nonvoid__destroy(skel); } +static void test_lsm_cgroup_retval(void) +{ + struct lsm_cgroup *skel = NULL; + int skipcap_prog_fd1, skipcap_prog_fd2, socket_prog_fd1, socket_prog_fd2; + int cgroup_fd = -1; + int err, fd; + char tmpfile[] = "/tmp/test_lsm_cgroup_retval.XXXXXX"; + + fd = mkstemp(tmpfile); + if (!ASSERT_OK_FD(fd, "mkstemp")) + return; + close(fd); + + cgroup_fd = test__join_cgroup("/default_retval"); + if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup")) + goto cleanup_tmpfile; + + skel = lsm_cgroup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + goto cleanup_cgroup; + + skipcap_prog_fd1 = bpf_program__fd(skel->progs.skipcap_first); + skipcap_prog_fd2 = bpf_program__fd(skel->progs.skipcap_second); + socket_prog_fd1 = bpf_program__fd(skel->progs.socket_first); + socket_prog_fd2 = bpf_program__fd(skel->progs.socket_second); + + err = bpf_prog_attach(skipcap_prog_fd1, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI); + if (err == -ENOTSUPP) { + test__skip(); + goto cleanup_skeleton; + } + if (!ASSERT_OK(err, "attach first skipcap prog")) + goto cleanup_skeleton; + + 
err = bpf_prog_attach(skipcap_prog_fd2, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI); + if (!ASSERT_OK(err, "attach second skipcap prog")) + goto cleanup_skipcap1; + + err = bpf_prog_attach(socket_prog_fd1, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI); + if (!ASSERT_OK(err, "attach first sock_create prog")) + goto cleanup_skipcap2; + + err = bpf_prog_attach(socket_prog_fd2, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI); + if (!ASSERT_OK(err, "attach second sock_create prog")) + goto cleanup_sock_create1; + + /* trigger the bool hook by setxattr */ + err = setxattr(tmpfile, "user.test", "value", 5, 0); + if (!ASSERT_OK(err, "setxattr")) + goto cleanup_sock_create2; + + /* trigger the errno hook by creating a socket */ + fd = socket(AF_INET, SOCK_STREAM, 0); + if (!ASSERT_OK_FD(fd, "socket")) + goto cleanup_sock_create2; + close(fd); + + ASSERT_EQ(skel->data->skipcap_retval, 0, "bool_hook_retval_should_be_0"); + ASSERT_EQ(skel->data->socket_retval, -EPERM, "errno_hook_retval_should_be_EPERM"); + +cleanup_sock_create2: + bpf_prog_detach2(socket_prog_fd2, cgroup_fd, BPF_LSM_CGROUP); +cleanup_sock_create1: + bpf_prog_detach2(socket_prog_fd1, cgroup_fd, BPF_LSM_CGROUP); +cleanup_skipcap2: + bpf_prog_detach2(skipcap_prog_fd2, cgroup_fd, BPF_LSM_CGROUP); +cleanup_skipcap1: + bpf_prog_detach2(skipcap_prog_fd1, cgroup_fd, BPF_LSM_CGROUP); +cleanup_skeleton: + lsm_cgroup__destroy(skel); +cleanup_cgroup: + close(cgroup_fd); +cleanup_tmpfile: + unlink(tmpfile); +} + void test_lsm_cgroup(void) { if (test__start_subtest("functional")) test_lsm_cgroup_functional(); if (test__start_subtest("nonvoid")) test_lsm_cgroup_nonvoid(); + if (test__start_subtest("retval")) + test_lsm_cgroup_retval(); btf__free(btf); } diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c index d7598538aa2d..3bfa479104be 100644 --- a/tools/testing/selftests/bpf/progs/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c @@ -35,6 +35,8 @@ 
int called_socket_bind; int called_socket_bind2; int called_socket_alloc; int called_socket_clone; +int skipcap_retval = -4095; +int socket_retval = -4095; static __always_inline int test_local_storage(void) { @@ -190,3 +192,31 @@ int BPF_PROG(socket_clone, struct sock *newsk, const struct request_sock *req) return 1; } + +SEC("lsm_cgroup/inode_xattr_skipcap") +int BPF_PROG(skipcap_first, const char *name) +{ + return 0; +} + +SEC("lsm_cgroup/inode_xattr_skipcap") +int BPF_PROG(skipcap_second, const char *name) +{ + skipcap_retval = bpf_get_retval(); + bpf_set_retval(0); + return 1; +} + +SEC("lsm_cgroup/socket_create") +int BPF_PROG(socket_first, int family, int type, int protocol, int kern) +{ + return 0; +} + +SEC("lsm_cgroup/socket_create") +int BPF_PROG(socket_second, int family, int type, int protocol, int kern) +{ + socket_retval = bpf_get_retval(); + bpf_set_retval(0); + return 1; +} -- cgit v1.2.3 From b48bd16eb9fc57a463a337ca148516cdf3212d61 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 10 Jun 2026 11:04:29 +0200 Subject: rqspinlock: Fix order in raw_res_spin_(un)lock_irq to allow schedule raw_res_spin_unlock_irqrestore() calls raw_res_spin_unlock() and then restores interrupts, this means preemption is enabled when interrupts are still disabled (as part of raw_res_spin_unlock()) so this cannot trigger an actual preemption. This is inconsistent with other spinlock implementations (raw_spin_unlock_irqrestore() and bpf_res_spin_unlock_irqrestore() itself). Adjust the macro to ensure interrupts are enabled before enabling preemption, allowing to schedule at that point. Make the same modification in the error path of raw_res_spin_lock_irqsave(). 
Fixes: 101acd2e78b1 ("rqspinlock: Add macros for rqspinlock usage") Cc: stable@vger.kernel.org Acked-by: Arnd Bergmann # asm-generic Acked-by: Waiman Long Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Gabriele Monaco Link: https://lore.kernel.org/r/20260610090431.32427-1-gmonaco@redhat.com Signed-off-by: Alexei Starovoitov --- include/asm-generic/rqspinlock.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h index 151d267a496b..4d46643f46cb 100644 --- a/include/asm-generic/rqspinlock.h +++ b/include/asm-generic/rqspinlock.h @@ -243,12 +243,20 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock) ({ \ int __ret; \ local_irq_save(flags); \ - __ret = raw_res_spin_lock(lock); \ - if (__ret) \ + preempt_disable(); \ + __ret = res_spin_lock(lock); \ + if (__ret) { \ local_irq_restore(flags); \ + preempt_enable(); \ + } \ __ret; \ }) -#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); }) +#define raw_res_spin_unlock_irqrestore(lock, flags) \ + ({ \ + res_spin_unlock(lock); \ + local_irq_restore(flags); \ + preempt_enable(); \ + }) #endif /* __ASM_GENERIC_RQSPINLOCK_H */ -- cgit v1.2.3 From 7bfb93e3475be9de894f1cecd3a727d3e1649b03 Mon Sep 17 00:00:00 2001 From: Woojin Ji Date: Fri, 12 Jun 2026 14:26:55 +0900 Subject: selftests/bpf: Add arena direct-value one-past-end reject test BPF_MAP_TYPE_ARENA supports direct-value pseudo loads, but unlike array maps its map value_size is zero and the valid direct-value range is the arena mmap size, max_entries * PAGE_SIZE. Commit 3ac1a467e376 ("bpf: Fix off-by-one boundary validation in arena direct-value access") fixed arena_map_direct_value_addr() to reject an offset exactly at the end of the arena mapping. Add a regression test that loads a BPF_PSEUDO_MAP_VALUE with off == arena_size and verifies that the verifier rejects it with the expected offset in the log. 
This is intentionally kept as a userspace raw-instruction test. I tried expressing the same BPF_PSEUDO_MAP_VALUE + off == arena_size case in verifier_arena.c with inline assembly. The only form that produces the desired instruction bytes uses __imm_addr(arena), but that emits R_BPF_64_NODYLD32, which the libbpf/bpftool link step rejects. Other register, immediate, and memory constraints either fail in the BPF backend or lower to a normal R_BPF_64_64 load followed by an ALU add, which does not exercise arena_map_direct_value_addr() with the boundary offset in the second ldimm64 slot. A legacy test_verifier fixture can express the raw instruction directly, but it needs arena map creation, mmap, and fixup plumbing in the legacy runner. That is more intrusive than the small prog_tests raw-instruction test. Use the userspace raw-instruction test, following the existing selftests pattern used for direct map-value pseudo loads, so insns[1].imm can be set to arena_size precisely. Assisted-by: ChatGPT:gpt-5.5 Signed-off-by: Woojin Ji Reviewed-by: Emil Tsalapatis Cc: Emil Tsalapatis Cc: Junyoung Jang Link: https://lore.kernel.org/r/20260612-arena-direct-value-v1-v4-1-b81b642f5277@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/arena_direct_value.c | 73 ++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_direct_value.c diff --git a/tools/testing/selftests/bpf/prog_tests/arena_direct_value.c b/tools/testing/selftests/bpf/prog_tests/arena_direct_value.c new file mode 100644 index 000000000000..4b4adb3f4b71 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_direct_value.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#define ARENA_PAGES 32 + +static char log_buf[16384]; + +static void test_arena_direct_value_one_past_end(void) +{ + char expected[128]; + __u32 arena_sz = ARENA_PAGES * getpagesize(); + struct 
bpf_insn insns[] = { + BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + LIBBPF_OPTS(bpf_map_create_opts, map_opts); + LIBBPF_OPTS(bpf_prog_load_opts, prog_opts); + void *arena; + int map_fd, prog_fd; + + map_opts.map_flags = BPF_F_MMAPABLE; + prog_opts.log_buf = log_buf; + prog_opts.log_size = sizeof(log_buf); + prog_opts.log_level = 1; + + map_fd = bpf_map_create(BPF_MAP_TYPE_ARENA, "arena_direct_value", + 0, 0, ARENA_PAGES, &map_opts); + if (map_fd < 0) { + if (errno == EOPNOTSUPP) { + test__skip(); + return; + } + ASSERT_GE(map_fd, 0, "bpf_map_create"); + return; + } + + arena = mmap(NULL, arena_sz, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0); + if (!ASSERT_NEQ(arena, MAP_FAILED, "arena_mmap")) + goto cleanup; + + insns[0].imm = map_fd; + insns[1].imm = arena_sz; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT, + "arena_direct_value", "GPL", insns, + ARRAY_SIZE(insns), &prog_opts); + if (!ASSERT_LT(prog_fd, 0, "prog_load")) { + close(prog_fd); + goto cleanup; + } + + snprintf(expected, sizeof(expected), + "invalid access to map value pointer, value_size=0 off=%u", + arena_sz); + ASSERT_HAS_SUBSTR(log_buf, expected, "verifier_log"); + +cleanup: + if (arena != MAP_FAILED) + munmap(arena, arena_sz); + close(map_fd); +} + +void test_arena_direct_value(void) +{ + if (test__start_subtest("one_past_end")) + test_arena_direct_value_one_past_end(); +} -- cgit v1.2.3 From 2148794eeaf2a898adc791e9472eb80ea55984da Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 13 Jun 2026 11:07:55 -0700 Subject: bpf: Raise maximum call chain depth to 16 frames Bump MAX_CALL_FRAMES from 8 to 16 to allow deeper call chains that Rust-BPF requires and update selftests. 
Link: https://lore.kernel.org/r/20260613180755.29671-1-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 13 +++--- kernel/bpf/verifier.c | 15 ++++--- .../selftests/bpf/progs/test_global_func3.c | 52 +++++++++++++++++++++- .../selftests/bpf/progs/verifier_liveness_exp.c | 2 +- .../selftests/bpf/progs/verifier_scalar_ids.c | 25 +++++------ tools/testing/selftests/bpf/verifier/calls.c | 48 ++++++++++++++++++++ 6 files changed, 128 insertions(+), 27 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d57b339a8cb8..39a851e690ec 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -404,7 +404,7 @@ struct bpf_func_state { struct bpf_reg_state *stack_arg_regs; /* Outgoing on-stack arguments */ }; -#define MAX_CALL_FRAMES 8 +#define MAX_CALL_FRAMES 16 /* instruction history flags, used in bpf_jmp_history_entry.flags field. * Frame number and SPI are stored in dedicated fields of bpf_jmp_history_entry. 
@@ -421,20 +421,21 @@ enum { struct bpf_jmp_history_entry { /* insn idx can't be bigger than 1 million */ u32 idx : 20; - u32 frame : 3; /* stack access frame number */ + u32 frame : 4; /* stack access frame number */ u32 spi : 6; /* stack slot index (0..63) */ - u32 : 3; + u32 : 2; u32 prev_idx : 20; /* special INSN_F_xxx flags */ u32 flags : 4; u32 : 8; - /* additional registers that need precision tracking when this - * jump is backtracked, vector of six 10-bit records + /* + * additional registers that need precision tracking when this + * jump is backtracked, vector of five 11-bit records */ u64 linked_regs; }; -static_assert(MAX_CALL_FRAMES <= (1 << 3)); +static_assert(MAX_CALL_FRAMES <= (1 << 4)); static_assert(MAX_BPF_STACK / 8 <= (1 << 6)); /* Maximum number of bpf_reg_state objects that can exist at once */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb46a81a8c51..2abc79dbf281 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3144,7 +3144,7 @@ static void mark_indirect_target(struct bpf_verifier_env *env, int idx) env->insn_aux_data[idx].indirect_target = true; } -#define LR_FRAMENO_BITS 3 +#define LR_FRAMENO_BITS 4 #define LR_SPI_BITS 6 #define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1) #define LR_SIZE_BITS 4 @@ -3153,7 +3153,11 @@ static void mark_indirect_target(struct bpf_verifier_env *env, int idx) #define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1) #define LR_SPI_OFF LR_FRAMENO_BITS #define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS) -#define LINKED_REGS_MAX 6 +#define LINKED_REGS_MAX 5 + +static_assert(MAX_CALL_FRAMES <= (1 << LR_FRAMENO_BITS)); +static_assert(LINKED_REGS_MAX < (1 << LR_SIZE_BITS)); +static_assert(LINKED_REGS_MAX * LR_ENTRY_BITS + LR_SIZE_BITS <= 64); struct linked_reg { u8 frameno; @@ -3177,10 +3181,11 @@ static struct linked_reg *linked_regs_push(struct linked_regs *s) return NULL; } -/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track +/* + * Use u64 as a vector of 5 
11-bit values, use first 4-bits to track * number of elements currently in stack. - * Pack one history entry for linked registers as 10 bits in the following format: - * - 3-bits frameno + * Pack one history entry for linked registers as 11 bits in the following format: + * - 4-bits frameno * - 6-bits spi_or_reg * - 1-bit is_reg */ diff --git a/tools/testing/selftests/bpf/progs/test_global_func3.c b/tools/testing/selftests/bpf/progs/test_global_func3.c index 974fd8c19561..b66abb350fb0 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func3.c +++ b/tools/testing/selftests/bpf/progs/test_global_func3.c @@ -53,9 +53,57 @@ int f8(struct __sk_buff *skb) return f7(skb); } +static __attribute__ ((noinline)) +int f9(struct __sk_buff *skb) +{ + return f8(skb); +} + +static __attribute__ ((noinline)) +int f10(struct __sk_buff *skb) +{ + return f9(skb); +} + +static __attribute__ ((noinline)) +int f11(struct __sk_buff *skb) +{ + return f10(skb); +} + +static __attribute__ ((noinline)) +int f12(struct __sk_buff *skb) +{ + return f11(skb); +} + +static __attribute__ ((noinline)) +int f13(struct __sk_buff *skb) +{ + return f12(skb); +} + +static __attribute__ ((noinline)) +int f14(struct __sk_buff *skb) +{ + return f13(skb); +} + +static __attribute__ ((noinline)) +int f15(struct __sk_buff *skb) +{ + return f14(skb); +} + +static __attribute__ ((noinline)) +int f16(struct __sk_buff *skb) +{ + return f15(skb); +} + SEC("tc") -__failure __msg("the call stack of 9 frames") +__failure __msg("the call stack of 17 frames") int global_func3(struct __sk_buff *skb) { - return f8(skb); + return f16(skb); } diff --git a/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c b/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c index b058de623200..72646fa2745e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c +++ b/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c @@ -15,7 +15,7 @@ * FP offset at each call site. 
arg_track keys on (frame, off[]), so * r1=fp-8, r1=fp-16, ... r1=fp-400 produce 50 unique cache keys per level. * - * This test chains 8 subprograms (the MAX_CALL_FRAMES limit). Each + * This test chains 8 subprograms (within the MAX_CALL_FRAMES limit). Each * intermediate function calls the next one 50 times, each time with a * different FP-relative offset in r1. * diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index 70ae14d6084f..e38f102da45f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -372,37 +372,36 @@ __naked void precision_two_ids(void) SEC("socket") __success __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) -/* check that r0 and r6 have different IDs after 'if', - * collect_linked_regs() can't tie more than 6 registers for a single insn. +/* + * check that r0 and r5 have different IDs after 'if', + * collect_linked_regs() can't tie more than 5 registers for a single insn. 
*/ -__msg("8: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") -__msg("14: (bf) r6 = r6 ; R6=scalar(id=2") -/* check that r{0-5} are marked precise after 'if' */ -__msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0") -__msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:") +__msg("7: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") +__msg("12: (bf) r5 = r5 ; R5=scalar(id=2") +/* check that r{0-4} are marked precise after 'if' */ +__msg("frame0: regs=r0 stack= before 7: (25) if r0 > 0x7 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3,r4 stack=:") __naked void linked_regs_too_many_regs(void) { asm volatile ( /* r0 = random number up to 0xff */ "call %[bpf_ktime_get_ns];" "r0 &= 0xff;" - /* tie r{0-6} IDs */ + /* tie r{0-5} IDs */ "r1 = r0;" "r2 = r0;" "r3 = r0;" "r4 = r0;" "r5 = r0;" - "r6 = r0;" - /* propagate range for r{0-6} */ + /* propagate range for r{0-5} */ "if r0 > 7 goto +0;" - /* keep r{1-5} live */ + /* keep r{1-4} live */ "r1 = r1;" "r2 = r2;" "r3 = r3;" "r4 = r4;" + /* make r5 appear in the log */ "r5 = r5;" - /* make r6 appear in the log */ - "r6 = r6;" /* force r0 to be precise, * this would cause r{0-4} to be precise because of shared IDs */ diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 42d523a21a43..302d712e0d7e 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -1219,6 +1219,30 @@ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */ BPF_EXIT_INSN(), /* H */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call I */ + BPF_EXIT_INSN(), + /* I */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call J */ + BPF_EXIT_INSN(), + /* J */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call K */ + BPF_EXIT_INSN(), + /* K */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call L */ + BPF_EXIT_INSN(), + /* L */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call M */ + BPF_EXIT_INSN(), + /* M */ + 
BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call N */ + BPF_EXIT_INSN(), + /* N */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call O */ + BPF_EXIT_INSN(), + /* O */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call P */ + BPF_EXIT_INSN(), + /* P */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -1257,6 +1281,30 @@ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */ BPF_EXIT_INSN(), /* H */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call I */ + BPF_EXIT_INSN(), + /* I */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call J */ + BPF_EXIT_INSN(), + /* J */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call K */ + BPF_EXIT_INSN(), + /* K */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call L */ + BPF_EXIT_INSN(), + /* L */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call M */ + BPF_EXIT_INSN(), + /* M */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call N */ + BPF_EXIT_INSN(), + /* N */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call O */ + BPF_EXIT_INSN(), + /* O */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call P */ + BPF_EXIT_INSN(), + /* P */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, -- cgit v1.2.3 From 16deef8de06ed69aa79d037a168a70407a84a5ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Jun 2026 11:02:09 -1000 Subject: arm64: mm: Remove misleading pte_none() comment from ptep_try_set() This comment was thoughtlessly copied from the x86 version and doesn't apply to arm64. Remove it. 
Reported-by: Will Deacon Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/20260614210209.2371030-1-tj@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/pgtable.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 3e579c26b383..61ba781061e8 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1830,10 +1830,6 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, return __ptep_get_and_clear(mm, addr, ptep); } -/* - * Note: strictly-zero compare is narrower than pte_none(), but the gap is - * harmless: a fresh kernel PTE has no software bits set. - */ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) { pteval_t old = 0; -- cgit v1.2.3 From 4d87a251d45b4a95eb4c0abcfab809c9f231258a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Jun 2026 13:42:24 +0200 Subject: bpf: Guard __get_user access with access_ok for uprobe_multi data As reported by sashiko [1] we need to use access_ok to check the user space data bounds before we use __get_user to get it. 
[1] https://lore.kernel.org/bpf/20260610145235.CB1441F00893@smtp.kernel.org/ Fixes: 0b779b61f651 ("bpf: Add cookies support for uprobe_multi link") Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260611114230.950379-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 90432f0fc2a8..b5a12af2d3f8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3224,6 +3224,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr unsigned long __user *uoffsets; u64 __user *ucookies; void __user *upath; + unsigned long size; u32 flags, cnt, i; struct path path; char *name; @@ -3261,6 +3262,16 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); + /* + * All uoffsets/uref_ctr_offsets/ucookies arrays have the same value + * size, we need to check their address range is safe for __get_user + * calls. + */ + size = sizeof(*uoffsets) * cnt; + if (!access_ok(uoffsets, size) || !access_ok(uref_ctr_offsets, size) || + !access_ok(ucookies, size)) + return -EFAULT; + name = strndup_user(upath, PATH_MAX); if (IS_ERR(name)) { err = PTR_ERR(name); -- cgit v1.2.3 From 65d81609e93140d8dd745fd41eb8a195f83ba7cd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Jun 2026 13:42:25 +0200 Subject: bpf: Use user_path_at for path resolution in uprobe_multi Resolve the uprobe_multi user path with user_path_at() instead of copying the string with strndup_user() and passing it to kern_path(). This removes the temporary allocation and keeps the lookup logic in one helper. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260611114230.950379-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b5a12af2d3f8..f8990bc6b64c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3227,7 +3227,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr unsigned long size; u32 flags, cnt, i; struct path path; - char *name; pid_t pid; int err; @@ -3272,14 +3271,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr !access_ok(ucookies, size)) return -EFAULT; - name = strndup_user(upath, PATH_MAX); - if (IS_ERR(name)) { - err = PTR_ERR(name); - return err; - } - - err = kern_path(name, LOOKUP_FOLLOW, &path); - kfree(name); + err = user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, &path); if (err) return err; -- cgit v1.2.3 From 26330a9226417c9a3395db9fdb403f7d7371e6b7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Jun 2026 13:42:26 +0200 Subject: bpf: Add support to specify uprobe_multi target via file descriptor Allow uprobe_multi link to identify the target binary by an already opened file descriptor. Adding new BPF_F_UPROBE_MULTI_PATH_FD flag and the path_fd field for the attr.link_create.uprobe_multi struct. When the flag is set, we resolve the target from path_fd, without the flag, we keep the existing string path behavior. I don't see a use case for supporting O_PATH file descriptors, because we need to read the binary first to get probes offsets, so I'm using the CLASS(fd, f), which fails for O_PATH fds. 
Assisted-by: Codex:GPT-5.4 Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260611114230.950379-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 7 ++++++- kernel/bpf/syscall.c | 4 ++-- kernel/trace/bpf_trace.c | 43 ++++++++++++++++++++++++++++++++++++------ tools/include/uapi/linux/bpf.h | 7 ++++++- 4 files changed, 51 insertions(+), 10 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 11dd610fa5fa..89b36de5fdbb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1327,7 +1327,11 @@ enum { * BPF_TRACE_UPROBE_MULTI attach type to create return probe. */ enum { - BPF_F_UPROBE_MULTI_RETURN = (1U << 0) + /* Get return uprobe. */ + BPF_F_UPROBE_MULTI_RETURN = (1U << 0), + + /* Get path from provided path_fd. */ + BPF_F_UPROBE_MULTI_PATH_FD = (1U << 1), }; /* link_create.netfilter.flags used in LINK_CREATE command for @@ -1864,6 +1868,7 @@ union bpf_attr { __u32 cnt; __u32 flags; __u32 pid; + __u32 path_fd; } uprobe_multi; struct { union { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ed949f70f82..b44106c8ea75 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3480,7 +3480,7 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? "kretprobe_multi" : "kprobe_multi"); else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) - seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ? + seq_printf(m, "link_type:\t%s\n", link->flags & BPF_F_UPROBE_MULTI_RETURN ? 
"uretprobe_multi" : "uprobe_multi"); else seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); @@ -5840,7 +5840,7 @@ err_put: return err; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid +#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.path_fd static int link_create(union bpf_attr *attr, bpfptr_t uattr) { struct bpf_prog *prog; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f8990bc6b64c..82f8feea6931 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -3214,6 +3215,38 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) return run_ctx->uprobe->cookie; } +static int bpf_uprobe_multi_get_path(const union bpf_attr *attr, struct path *path) +{ + void __user *upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); + u32 path_fd = attr->link_create.uprobe_multi.path_fd; + u32 flags = attr->link_create.uprobe_multi.flags; + + if (flags & BPF_F_UPROBE_MULTI_PATH_FD) { + /* + * When BPF_F_UPROBE_MULTI_PATH_FD is set, the executable is + * identified by path_fd, upath must be NULL. + */ + if (upath) + return -EINVAL; + + CLASS(fd, f)(path_fd); + if (fd_empty(f)) + return -EBADF; + *path = fd_file(f)->f_path; + path_get(path); + return 0; + } + + /* + * When BPF_F_UPROBE_MULTI_PATH_FD is not set, the path is resolved + * relative to the cwd (AT_FDCWD) or absolute using the upath string. 
+ */ + if (!upath || path_fd) + return -EINVAL; + + return user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, path); +} + int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_uprobe_multi_link *link = NULL; @@ -3223,7 +3256,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr struct task_struct *task = NULL; unsigned long __user *uoffsets; u64 __user *ucookies; - void __user *upath; unsigned long size; u32 flags, cnt, i; struct path path; @@ -3241,19 +3273,18 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr return -EINVAL; flags = attr->link_create.uprobe_multi.flags; - if (flags & ~BPF_F_UPROBE_MULTI_RETURN) + if (flags & ~(BPF_F_UPROBE_MULTI_RETURN | BPF_F_UPROBE_MULTI_PATH_FD)) return -EINVAL; /* - * path, offsets and cnt are mandatory, + * offsets and cnt are mandatory, * ref_ctr_offsets and cookies are optional */ - upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); cnt = attr->link_create.uprobe_multi.cnt; pid = attr->link_create.uprobe_multi.pid; - if (!upath || !uoffsets || !cnt || pid < 0) + if (!uoffsets || !cnt || pid < 0) return -EINVAL; if (cnt > MAX_UPROBE_MULTI_CNT) return -E2BIG; @@ -3271,7 +3302,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr !access_ok(ucookies, size)) return -EFAULT; - err = user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, &path); + err = bpf_uprobe_multi_get_path(attr, &path); if (err) return err; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 11dd610fa5fa..89b36de5fdbb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1327,7 +1327,11 @@ enum { * BPF_TRACE_UPROBE_MULTI attach type to create return probe. */ enum { - BPF_F_UPROBE_MULTI_RETURN = (1U << 0) + /* Get return uprobe. 
*/ + BPF_F_UPROBE_MULTI_RETURN = (1U << 0), + + /* Get path from provided path_fd. */ + BPF_F_UPROBE_MULTI_PATH_FD = (1U << 1), }; /* link_create.netfilter.flags used in LINK_CREATE command for @@ -1864,6 +1868,7 @@ union bpf_attr { __u32 cnt; __u32 flags; __u32 pid; + __u32 path_fd; } uprobe_multi; struct { union { -- cgit v1.2.3 From da3a4c3ec7ed746f7060b69c49a77602931b3dd3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Jun 2026 13:42:27 +0200 Subject: libbpf: Add path_fd to struct bpf_link_create_opts Adding the path_fd field to struct bpf_link_create_opts and passing it through kernel attr interface. Assisted-by: Codex:GPT-5.4 Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260611114230.950379-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 1 + tools/lib/bpf/bpf.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index f37e3416f61a..96819c082c77 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -842,6 +842,7 @@ int bpf_link_create(int prog_fd, int target_fd, attr.link_create.uprobe_multi.ref_ctr_offsets = ptr_to_u64(OPTS_GET(opts, uprobe_multi.ref_ctr_offsets, 0)); attr.link_create.uprobe_multi.cookies = ptr_to_u64(OPTS_GET(opts, uprobe_multi.cookies, 0)); attr.link_create.uprobe_multi.pid = OPTS_GET(opts, uprobe_multi.pid, 0); + attr.link_create.uprobe_multi.path_fd = OPTS_GET(opts, uprobe_multi.path_fd, 0); if (!OPTS_ZEROED(opts, uprobe_multi)) return libbpf_err(-EINVAL); break; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 012354131cf6..7534a593edae 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -444,6 +444,7 @@ struct bpf_link_create_opts { const unsigned long *ref_ctr_offsets; const __u64 *cookies; __u32 pid; + __u32 path_fd; } uprobe_multi; struct { __u64 cookie; @@ -477,7 +478,7 @@ struct bpf_link_create_opts { }; size_t :0; }; -#define bpf_link_create_opts__last_field uprobe_multi.pid +#define 
bpf_link_create_opts__last_field uprobe_multi.path_fd LIBBPF_API int bpf_link_create(int prog_fd, int target_fd, enum bpf_attach_type attach_type, -- cgit v1.2.3 From d5026e6bfc70daca4d88a732cae30b72451fdd0c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Jun 2026 13:42:28 +0200 Subject: selftests/bpf: Add uprobe_multi path_fd test Add a uprobe_multi link API selftest that opens /proc/self/exe and passes the resulting descriptor through opts.uprobe_multi.path_fd with BPF_F_UPROBE_MULTI_PATH_FD set. Assisted-by: Codex:GPT-5.4 Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260611114230.950379-6-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 62 ++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 56cbea280fbd..ffcf3c92f047 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -2,6 +2,7 @@ #include #include +#include #include #include "uprobe_multi.skel.h" #include "uprobe_multi_bench.skel.h" @@ -757,6 +758,65 @@ static void test_link_api(void) __test_link_api(&child); } +static void test_link_api_path_fd(void) +{ + LIBBPF_OPTS(bpf_link_create_opts, opts); + const char *resolve_path = "/proc/self/exe"; + int prog_fd, link_fd = -1, path_fd = -1; + struct uprobe_multi *skel = NULL; + unsigned long *offsets = NULL; + const char *syms[3] = { + "uprobe_multi_func_1", + "uprobe_multi_func_2", + "uprobe_multi_func_3", + }; + int err; + + err = elf_resolve_syms_offsets(resolve_path, ARRAY_SIZE(syms), syms, + &offsets, STT_FUNC); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets")) + return; + + path_fd = open(resolve_path, O_RDONLY); + if (!ASSERT_GE(path_fd, 0, "path_fd")) + goto cleanup; + + opts.uprobe_multi.path_fd = path_fd; + opts.uprobe_multi.offsets = offsets; + 
opts.uprobe_multi.cnt = ARRAY_SIZE(syms); + opts.uprobe_multi.flags = BPF_F_UPROBE_MULTI_PATH_FD; + + skel = uprobe_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_multi__open_and_load")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.uprobe); + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); + if (!ASSERT_GE(link_fd, 0, "bpf_link_create")) + goto cleanup; + + skel->bss->uprobe_multi_func_1_addr = (__u64)uprobe_multi_func_1; + skel->bss->uprobe_multi_func_2_addr = (__u64)uprobe_multi_func_2; + skel->bss->uprobe_multi_func_3_addr = (__u64)uprobe_multi_func_3; + skel->bss->pid = getpid(); + + uprobe_multi_func_1(); + uprobe_multi_func_2(); + uprobe_multi_func_3(); + + ASSERT_EQ(skel->bss->uprobe_multi_func_1_result, 1, "uprobe_multi_func_1_result"); + ASSERT_EQ(skel->bss->uprobe_multi_func_2_result, 1, "uprobe_multi_func_2_result"); + ASSERT_EQ(skel->bss->uprobe_multi_func_3_result, 1, "uprobe_multi_func_3_result"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + if (path_fd >= 0) + close(path_fd); + uprobe_multi__destroy(skel); + free(offsets); +} + static struct bpf_program * get_program(struct uprobe_multi_consumers *skel, int prog) { @@ -1354,6 +1414,8 @@ void test_uprobe_multi_test(void) test_attach_api_syms(); if (test__start_subtest("link_api")) test_link_api(); + if (test__start_subtest("link_api_path_fd")) + test_link_api_path_fd(); if (test__start_subtest("bench_uprobe")) test_bench_attach_uprobe(); if (test__start_subtest("bench_usdt")) -- cgit v1.2.3 From 3229675be841932879d5f1b2fb38ba9c2777a088 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Jun 2026 13:42:29 +0200 Subject: selftests/bpf: Add uprobe_multi path_fd fail tests Adding tests to attach_api_fails suite to make sure we fail wrong setup for path_fd usage. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260611114230.950379-7-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/uprobe_multi_test.c | 32 +++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index ffcf3c92f047..f0baf5738b75 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -537,7 +537,37 @@ static void test_attach_api_fails(void) link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); if (!ASSERT_ERR(link_fd, "link_fd")) goto cleanup; - ASSERT_EQ(link_fd, -EINVAL, "pid_is_wrong"); + if (!ASSERT_EQ(link_fd, -EINVAL, "pid_is_wrong")) + goto cleanup; + + /* wrong path_fd */ + LIBBPF_OPTS_RESET(opts, + .uprobe_multi.path = NULL, + .uprobe_multi.path_fd = -1, + .uprobe_multi.flags = BPF_F_UPROBE_MULTI_PATH_FD, + .uprobe_multi.offsets = (unsigned long *)&offset, + .uprobe_multi.cnt = 1, + ); + + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); + if (!ASSERT_ERR(link_fd, "link_fd")) + goto cleanup; + if (!ASSERT_EQ(link_fd, -EBADF, "path_fd_is_wrong")) + goto cleanup; + + /* path and path_fd both set with BPF_F_UPROBE_MULTI_PATH_FD flag */ + LIBBPF_OPTS_RESET(opts, + .uprobe_multi.path = path, + .uprobe_multi.path_fd = 1, + .uprobe_multi.flags = BPF_F_UPROBE_MULTI_PATH_FD, + .uprobe_multi.offsets = (unsigned long *)&offset, + .uprobe_multi.cnt = 1, + ); + + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); + if (!ASSERT_ERR(link_fd, "link_fd")) + goto cleanup; + ASSERT_EQ(link_fd, -EINVAL, "path_and_path_fd_together"); cleanup: if (link_fd >= 0) -- cgit v1.2.3 From df29003c55115737a8fb4f8a60c6c2bba4c4a484 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Jun 2026 13:42:30 +0200 Subject: selftests/bpf: Fix typo in 
verify_umulti_link_info We verify info.uprobe_multi.flags against the wrong kprobe-multi flag (BPF_F_KPROBE_MULTI_RETURN). It's the same value as the correct flag (BPF_F_UPROBE_MULTI_RETURN), so there's no functional change. Fixes: 147c69307bcf ("selftests/bpf: Add link_info test for uprobe_multi link") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260611114230.950379-8-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/fill_link_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c index e40114620751..f589eefbf9fb 100644 --- a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c +++ b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c @@ -469,7 +469,7 @@ verify_umulti_link_info(int fd, bool retprobe, __u64 *offsets, ASSERT_EQ(info.uprobe_multi.pid, getpid(), "info.uprobe_multi.pid"); ASSERT_EQ(info.uprobe_multi.count, 3, "info.uprobe_multi.count"); - ASSERT_EQ(info.uprobe_multi.flags & BPF_F_KPROBE_MULTI_RETURN, + ASSERT_EQ(info.uprobe_multi.flags & BPF_F_UPROBE_MULTI_RETURN, retprobe, "info.uprobe_multi.flags.retprobe"); ASSERT_EQ(info.uprobe_multi.path_size, strlen(path) + 1, "info.uprobe_multi.path_size"); ASSERT_STREQ(path_buf, path, "info.uprobe_multi.path"); -- cgit v1.2.3 From 1f24de6b2c81f71f90a7c02be516da99f00d11c7 Mon Sep 17 00:00:00 2001 From: Ethan Nelson-Moore Date: Tue, 9 Jun 2026 21:40:20 -0700 Subject: selftests/bpf: correct CONFIG_PPC64 macro name in comment A comment in tools/testing/selftests/bpf/progs/test_fill_link_info.c incorrectly refers to CONFIG_PPC6 instead of CONFIG_PPC64. Correct it. Discovered while searching for CONFIG_* symbols referenced in code but not defined in any Kconfig file. 
Signed-off-by: Ethan Nelson-Moore Link: https://lore.kernel.org/r/20260610044023.225820-1-enelsonmoore@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_fill_link_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_fill_link_info.c b/tools/testing/selftests/bpf/progs/test_fill_link_info.c index fac33a14f200..137bd6292163 100644 --- a/tools/testing/selftests/bpf/progs/test_fill_link_info.c +++ b/tools/testing/selftests/bpf/progs/test_fill_link_info.c @@ -12,7 +12,7 @@ extern bool CONFIG_PPC64 __kconfig __weak; /* This function is here to have CONFIG_X86_KERNEL_IBT, * CONFIG_PPC_FTRACE_OUT_OF_LINE, CONFIG_KPROBES_ON_FTRACE, - * CONFIG_PPC6 used and added to object BTF. + * CONFIG_PPC64 used and added to object BTF. */ int unused(void) { -- cgit v1.2.3 From 9080b97689dbf8d2c338a9af97cd2b4a714f25bf Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jun 2026 15:47:10 +0100 Subject: bpftool: Pass host flags to bootstrap libbpf bpftool builds a bootstrap libbpf with HOSTCC, but the libbpf submake can still inherit target build flags through CFLAGS. This can break cross builds when host objects are compiled with target-only options. Since HOST_CFLAGS contains warning options that are not suitable for building libbpf, use LIBBPF_BOOTSTRAP_CFLAGS with the warning options removed to build the bootstrap libbpf. Clear EXTRA_CFLAGS so target extra flags are not mixed into the host bootstrap libbpf build. 
Signed-off-by: Leo Yan Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20260602-tools_build_fix_zero_init_bpf_only-v2-1-c76e5250ea1c@arm.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 8f50bc163bb2..c070111df22d 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -47,7 +47,8 @@ $(LIBBPF_INTERNAL_HDRS): $(LIBBPF_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_HDRS_ $(LIBBPF_BOOTSTRAP): $(wildcard $(BPF_DIR)/*.[ch] $(BPF_DIR)/Makefile) | $(LIBBPF_BOOTSTRAP_OUTPUT) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) \ DESTDIR=$(LIBBPF_BOOTSTRAP_DESTDIR:/=) prefix= \ - ARCH= CROSS_COMPILE= CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" $@ install_headers + ARCH= CROSS_COMPILE= CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" \ + CFLAGS="$(LIBBPF_BOOTSTRAP_CFLAGS)" EXTRA_CFLAGS= $@ install_headers $(LIBBPF_BOOTSTRAP_INTERNAL_HDRS): $(LIBBPF_BOOTSTRAP_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_BOOTSTRAP_HDRS_DIR) $(call QUIET_INSTALL, $@) @@ -92,6 +93,9 @@ HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) HOST_LDFLAGS := $(LDFLAGS) +# Remove warnings for libbpf bootstrap build +LIBBPF_BOOTSTRAP_CFLAGS := $(filter-out -W -Wall -Wextra -Wformat -Wformat-signedness,$(HOST_CFLAGS)) + INSTALL ?= install RM ?= rm -f -- cgit v1.2.3 From 956841cbc3d77a9e687182a8bba316e9a2665a50 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jun 2026 15:47:11 +0100 Subject: bpftool: Avoid adding EXTRA_CFLAGS to HOST_CFLAGS Prepare for future changes where EXTRA_CFLAGS may include flags not applicable to the host compiler. Move the HOST_CFLAGS assignment before appending EXTRA_CFLAGS to CFLAGS so that HOST_CFLAGS does not inherit flags from EXTRA_CFLAGS. 
Acked-by: Quentin Monnet Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20260602-tools_build_fix_zero_init_bpf_only-v2-2-c76e5250ea1c@arm.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index c070111df22d..49bae0359144 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -82,6 +82,12 @@ CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \ ifneq ($(BPFTOOL_VERSION),) CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' endif + +# This must be done before appending EXTRA_CFLAGS to CFLAGS to avoid +# including flags that are not applicable to the host compiler. +HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ + $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) + ifneq ($(EXTRA_CFLAGS),) CFLAGS += $(EXTRA_CFLAGS) endif @@ -89,8 +95,6 @@ ifneq ($(EXTRA_LDFLAGS),) LDFLAGS += $(EXTRA_LDFLAGS) endif -HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ - $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) HOST_LDFLAGS := $(LDFLAGS) # Remove warnings for libbpf bootstrap build -- cgit v1.2.3 From 3f2fec5b02b6efa1aad3238943858234751ac0f3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jun 2026 15:47:12 +0100 Subject: bpftool: Append extra host flags Append HOST_EXTRACFLAGS to HOST_CFLAGS so that additional flags can be applied to the host compiler. Acked-by: Quentin Monnet Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20260602-tools_build_fix_zero_init_bpf_only-v2-3-c76e5250ea1c@arm.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 49bae0359144..271a7dc77273 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -87,6 +87,7 @@ endif # including flags that are not applicable to the host compiler. 
HOST_CFLAGS := $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) +HOST_CFLAGS += $(HOST_EXTRACFLAGS) ifneq ($(EXTRA_CFLAGS),) CFLAGS += $(EXTRA_CFLAGS) -- cgit v1.2.3 From f3846b3800a2cfda9c900b2e94525a1027b04424 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jun 2026 15:47:13 +0100 Subject: libbpf: Initialize CFLAGS before including Makefile.include tools/scripts/Makefile.include may expand EXTRA_CFLAGS in a future change. This could alter the initialization of CFLAGS, as the default options "-g -O2" would never be set once EXTRA_CFLAGS is expanded. Prepare for this by moving the CFLAGS initialization before including tools/scripts/Makefile.include, so it is not affected by the extended EXTRA_CFLAGS. Append EXTRA_CFLAGS to CFLAGS only after including Makefile.include and place it last so that the extra flags propagate properly and can override the default options. tools/scripts/Makefile.include already appends $(CLANG_CROSS_FLAGS) to CFLAGS, the Makefile appends $(CLANG_CROSS_FLAGS) again, remove the redundant append. Signed-off-by: Leo Yan Acked-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260602-tools_build_fix_zero_init_bpf_only-v2-4-c76e5250ea1c@arm.com Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Makefile | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 168140f8e646..eca584fb061e 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -49,6 +49,14 @@ man_dir_SQ = '$(subst ','\'',$(man_dir))' export man_dir man_dir_SQ INSTALL export DESTDIR DESTDIR_SQ +# Defer assigning EXTRA_CFLAGS to CFLAGS until after including +# tools/scripts/Makefile.include, as it may add flags to EXTRA_CFLAGS. 
+ifdef EXTRA_CFLAGS + CFLAGS := +else + CFLAGS := -g -O2 +endif + include $(srctree)/tools/scripts/Makefile.include # copy a bit from Linux kbuild @@ -70,13 +78,6 @@ LIB_TARGET = libbpf.a libbpf.so.$(LIBBPF_VERSION) LIB_FILE = libbpf.a libbpf.so* PC_FILE = libbpf.pc -# Set compile option CFLAGS -ifdef EXTRA_CFLAGS - CFLAGS := $(EXTRA_CFLAGS) -else - CFLAGS := -g -O2 -endif - # Append required CFLAGS override CFLAGS += -std=gnu89 override CFLAGS += $(EXTRA_WARNINGS) -Wno-switch-enum @@ -84,7 +85,7 @@ override CFLAGS += -Werror -Wall override CFLAGS += $(INCLUDES) override CFLAGS += -fvisibility=hidden override CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -override CFLAGS += $(CLANG_CROSS_FLAGS) +override CFLAGS += $(EXTRA_CFLAGS) # flags specific for shared library SHLIB_FLAGS := -DSHARED -fPIC -- cgit v1.2.3 From b40ba139371c2ba4beffe0533c6d85fda9bc932c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jun 2026 15:47:14 +0100 Subject: tools/bpf: build: Append extra cflags Append EXTRA_CFLAGS to CFLAGS so that additional flags can be applied to the compiler. Signed-off-by: Leo Yan Acked-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260602-tools_build_fix_zero_init_bpf_only-v2-5-c76e5250ea1c@arm.com Signed-off-by: Alexei Starovoitov --- tools/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index fd2585af1252..9c19e81f3c27 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -11,6 +11,7 @@ INSTALL ?= install CFLAGS += -Wall -O2 CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi \ -I$(srctree)/tools/include +CFLAGS += $(EXTRA_CFLAGS) # This will work when bpf is built in tools env. 
where srctree # isn't set and when invoked from selftests build, where srctree -- cgit v1.2.3 From 55ffbe8a15b1254f44d56952fb425a10e3f15c31 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jun 2026 15:47:15 +0100 Subject: selftests/bpf: Initialize operation name before use ASAN reports stack-buffer-overflow due to the uninitialized op_name. Initialize it to fix the issue. Fixes: 054b6c7866c7 ("selftests/bpf: Add verifier log tests for BPF_BTF_LOAD command") Signed-off-by: Leo Yan Acked-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260602-tools_build_fix_zero_init_bpf_only-v2-6-c76e5250ea1c@arm.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/verifier_log.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_log.c b/tools/testing/selftests/bpf/prog_tests/verifier_log.c index c01c0114af1b..4542bb586d72 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier_log.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier_log.c @@ -317,6 +317,7 @@ static void verif_btf_log_subtest(bool bad_btf) res = load_btf(&opts, true); ASSERT_EQ(res, -ENOSPC, "half_log_fd"); ASSERT_EQ(strlen(logs.buf), 24, "log_fixed_25"); + strscpy(op_name, "log_fixed", sizeof(op_name)); ASSERT_STRNEQ(logs.buf, logs.reference, 24, op_name); /* validate rolling verifier log logic: try all variations of log buf -- cgit v1.2.3 From 584f3b7a352586ddf9464faaedea57ac880e0e6d Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jun 2026 15:47:16 +0100 Subject: selftests/bpf: Use common CFLAGS for urandom_read The urandom_read helper and its shared library are built with $(CLANG) directly rather than through the normal selftest $(CC) rules. The CFLAGS variable can contain specific flags only for $(CC) but might be incompatible with $(CLANG) and those flags are not necessarily valid for the clang-only urandom_read build.
Split the BPF selftest local flags into COMMON_CFLAGS and append them to CFLAGS for the normal build path. Use COMMON_CFLAGS directly for urandom_read and liburandom_read.so, while still filtering out -static as before. Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20260602-tools_build_fix_zero_init_bpf_only-v2-7-c76e5250ea1c@arm.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index d53b7e496ac9..302a8aed3bf9 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -57,7 +57,7 @@ srctree := $(patsubst %/,%,$(dir $(srctree))) srctree := $(patsubst %/,%,$(dir $(srctree))) endif -CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11 \ +COMMON_CFLAGS = -g $(OPT_FLAGS) -rdynamic -std=gnu11 \ -Wall -Werror -fno-omit-frame-pointer \ -Wno-unused-but-set-variable \ $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ @@ -70,7 +70,7 @@ LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread PCAP_CFLAGS := $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1") PCAP_LIBS := $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) LDLIBS += $(PCAP_LIBS) -CFLAGS += $(PCAP_CFLAGS) +CFLAGS += $(COMMON_CFLAGS) $(PCAP_CFLAGS) # Some utility functions use LLVM libraries jit_disasm_helpers.c-CFLAGS = $(LLVM_CFLAGS) @@ -267,7 +267,7 @@ endif $(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c liburandom_read.map $(call msg,LIB,,$@) $(Q)$(CLANG) $(CLANG_TARGET_ARCH) \ - $(filter-out -static,$(CFLAGS) $(LDFLAGS)) \ + $(filter-out -static,$(COMMON_CFLAGS) $(LDFLAGS)) \ $(filter %.c,$^) $(filter-out -static,$(LDLIBS)) \ -Wno-unused-command-line-argument \ -fuse-ld=$(LLD) -Wl,-znoseparate-code -Wl,--build-id=sha1 \ @@ -277,7 +277,7 @@ $(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c liburandom $(OUTPUT)/urandom_read: urandom_read.c 
urandom_read_aux.c $(OUTPUT)/liburandom_read.so $(call msg,BINARY,,$@) $(Q)$(CLANG) $(CLANG_TARGET_ARCH) \ - $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \ + $(filter-out -static,$(COMMON_CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \ -Wno-unused-command-line-argument \ -lurandom_read $(filter-out -static,$(LDLIBS)) -L$(OUTPUT) \ -fuse-ld=$(LLD) -Wl,-znoseparate-code -Wl,--build-id=sha1 \ -- cgit v1.2.3 From 62617d28d9ae123c0d6ba51035caa3ca52b94f7a Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 2 Jun 2026 15:47:17 +0100 Subject: selftests/bpf: Avoid static LLVM linking for cross builds The BPF selftests prefer static LLVM linking, which works for native builds but can break cross builds. Its --link-static output may include host-only libraries that are unavailable for the cross compilation, causing link failures. Avoid static LLVM linking for cross builds and use shared LLVM libraries instead. Native builds keep the existing behavior. Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20260602-tools_build_fix_zero_init_bpf_only-v2-8-c76e5250ea1c@arm.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 302a8aed3bf9..b642ee489ea6 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -194,8 +194,15 @@ ifeq ($(feature-llvm),1) LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets # both llvm-config and lib.mk add -D_GNU_SOURCE, which ends up as conflict LLVM_CFLAGS += $(filter-out -D_GNU_SOURCE,$(shell $(LLVM_CONFIG) --cflags)) - # Prefer linking statically if it's available, otherwise fallback to shared - ifeq ($(shell $(LLVM_CONFIG) --link-static --libs >/dev/null 2>&1 && echo static),static) + # Cross compilation must use dynamic linking to avoid unresolved library + # dependencies. 
For native build, prefer linking statically if it's + # available, otherwise fallback to shared. + ifneq ($(ARCH), $(HOSTARCH)) + LLVM_LINK_STATIC := + else + LLVM_LINK_STATIC := $(shell $(LLVM_CONFIG) --link-static --libs >/dev/null 2>&1 && echo y) + endif + ifeq ($(LLVM_LINK_STATIC),y) LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --link-static --libs $(LLVM_CONFIG_LIB_COMPONENTS)) LLVM_LDLIBS += $(filter-out -lxml2,$(shell $(LLVM_CONFIG) --link-static --system-libs $(LLVM_CONFIG_LIB_COMPONENTS))) LLVM_LDLIBS += -lstdc++ -- cgit v1.2.3 From ca0f587c029afa66227f7b932450b1c417403394 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 14 Jun 2026 00:24:42 +0800 Subject: bpf: Fix bpf_get/setsockopt to tos for ipv4-mapped ipv6 socket When TCP runs over IPv4 via the INET6 API, bpf_get/setsockopt with ipv4 will fail, because sk->sk_family is AF_INET6. With ipv6 it will succeed but not take effect, because inet_csk(sk)->icsk_af_ops is ipv6_mapped and uses ip_queue_xmit and inet_sk(sk)->tos. To relax this restriction, allow getting/setting tos for those possible ipv4-mapped ipv6 sockets. Fixes: ee7f1e1302f5 ("bpf: Change bpf_setsockopt(SOL_IP) to reuse do_ip_setsockopt()") Signed-off-by: Feng Zhou Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260613162443.60515-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 9590877b0714..57b00c6cc8cc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5544,11 +5544,24 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, KERNEL_SOCKPTR(optval), *optlen); } +static bool sk_allows_sol_ip_sockopt(struct sock *sk) +{ + switch (sk->sk_family) { + case AF_INET: + return true; + case AF_INET6: + /* Allow getting/setting sockopt for possible ipv4-mapped ipv6 socket.
*/ + return sk->sk_type != SOCK_RAW && !ipv6_only_sock(sk); + default: + return false; + } +} + static int sol_ip_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { - if (sk->sk_family != AF_INET) + if (!sk_allows_sol_ip_sockopt(sk)) return -EINVAL; switch (optname) { -- cgit v1.2.3 From 5cf2c21ab0900b41c0e29c925b9a640a92340d40 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 14 Jun 2026 00:24:43 +0800 Subject: selftests/bpf: Add test to verify the fix for bpf_setsockopt() helper Verify the fix by: 1. Attach cgroup sockops prog. 2. Build a tcp connection using ipv4 addr in ipv6 socket. 3. Verify the return value of bpf_setsockopt() helper. Assisted-by: Codex:gpt-5.5-xhigh Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260613162443.60515-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/setget_sockopt.c | 78 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/setget_sockopt.c | 23 +++++++ 2 files changed, 101 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c index 77fe1bfb7504..4e91d9b615ce 100644 --- a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c @@ -199,6 +199,83 @@ err_out: bpf_link__destroy(getsockopt_link); } +static int connect_to_v4mapped_v6_fd(int server_fd) +{ + struct sockaddr_storage addr; + struct sockaddr_in *addr4 = (void *)&addr; + socklen_t addrlen = sizeof(addr); + struct sockaddr_in6 addr6 = {}; + int fd = -1, v6only = 0, err; + + err = getsockname(server_fd, (struct sockaddr *)&addr, &addrlen); + if (!ASSERT_OK(err, "getsockname")) + return -1; + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (!ASSERT_GE(fd, 0, "socket")) + return -1; + + err = settimeo(fd, 0); + if (!ASSERT_OK(err, "settimeo")) + goto err_out; + + err = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, sizeof(v6only)); + if 
(!ASSERT_OK(err, "clear_v6only")) + goto err_out; + + addr6.sin6_family = AF_INET6; + addr6.sin6_port = addr4->sin_port; + addr6.sin6_addr.s6_addr[10] = 0xff; + addr6.sin6_addr.s6_addr[11] = 0xff; + memcpy(&addr6.sin6_addr.s6_addr[12], &addr4->sin_addr, sizeof(addr4->sin_addr)); + + err = connect(fd, (struct sockaddr *)&addr6, sizeof(addr6)); + if (!ASSERT_OK(err, "connect")) + goto err_out; + + return fd; + +err_out: + close(fd); + return -1; +} + +static void test_v4mapped_v6_ip_tos(void) +{ + struct setget_sockopt__bss *bss = skel->bss; + int sfd = -1, fd = -1, got = 0, exp = 0x1c; + socklen_t optlen; + + memset(bss, 0, sizeof(*bss)); + bss->v4mapped_v6_ip_tos_enable = 1; + bss->v4mapped_v6_ip_tos_ret = -1; + bss->v4mapped_v6_ip_tos_val = exp; + + sfd = start_server(AF_INET, SOCK_STREAM, addr4_str, 0, 0); + if (!ASSERT_GE(sfd, 0, "start_server")) + goto err_out; + + fd = connect_to_v4mapped_v6_fd(sfd); + if (!ASSERT_GE(fd, 0, "connect_to_v4mapped_v6_fd")) + goto err_out; + + ASSERT_GT(bss->v4mapped_v6_ip_tos_cnt, 0, "v4mapped_v6_ip_tos_cnt"); + ASSERT_EQ(bss->v4mapped_v6_ip_tos_ret, 0, "v4mapped_v6_ip_tos_ret"); + + optlen = sizeof(got); + if (!ASSERT_OK(getsockopt(fd, SOL_IP, IP_TOS, &got, &optlen), "getsockopt_ip_tos")) + goto err_out; + + ASSERT_EQ(got, exp, "ip_tos"); + +err_out: + bss->v4mapped_v6_ip_tos_enable = 0; + if (fd >= 0) + close(fd); + if (sfd >= 0) + close(sfd); +} + void test_setget_sockopt(void) { cg_fd = test__join_cgroup(CG_NAME); @@ -238,6 +315,7 @@ void test_setget_sockopt(void) test_ktls(AF_INET); test_nonstandard_opt(AF_INET); test_nonstandard_opt(AF_INET6); + test_v4mapped_v6_ip_tos(); done: setget_sockopt__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c index d330b1511979..636a7cd8e2fa 100644 --- a/tools/testing/selftests/bpf/progs/setget_sockopt.c +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -387,6 +387,24 @@ int _getsockopt(struct 
bpf_sockopt *ctx) return 1; } +int v4mapped_v6_ip_tos_enable; +int v4mapped_v6_ip_tos_ret; +int v4mapped_v6_ip_tos_cnt; +int v4mapped_v6_ip_tos_val; + +static void test_v4mapped_v6_ip_tos(struct bpf_sock_ops *skops) +{ + int tos = v4mapped_v6_ip_tos_val; + + if (!v4mapped_v6_ip_tos_enable || skops->op != BPF_SOCK_OPS_TCP_CONNECT_CB) + return; + if (skops->family != AF_INET6) + return; + + v4mapped_v6_ip_tos_cnt++; + v4mapped_v6_ip_tos_ret = bpf_setsockopt(skops, IPPROTO_IP, IP_TOS, &tos, sizeof(tos)); +} + SEC("sockops") int skops_sockopt(struct bpf_sock_ops *skops) { @@ -401,6 +419,11 @@ int skops_sockopt(struct bpf_sock_ops *skops) if (!sk) return 1; + if (v4mapped_v6_ip_tos_enable) { + test_v4mapped_v6_ip_tos(skops); + return 1; + } + switch (skops->op) { case BPF_SOCK_OPS_TCP_LISTEN_CB: nr_listen += !(bpf_test_sockopt(skops, sk) || -- cgit v1.2.3 From 05ae621d4e3c7bfdcc0a4eef1d66eccfc789ee62 Mon Sep 17 00:00:00 2001 From: David Windsor Date: Thu, 11 Jun 2026 10:35:49 -0400 Subject: selftests/bpf: Add test for sleepable lsm_cgroup rejection Confirm the verifier rejects loading a sleepable BPF_LSM_CGROUP program, as introduced in commit 5b038319be44 ("bpf: Reject sleepable BPF_LSM_CGROUP programs at load time"). 
Signed-off-by: David Windsor Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260611143549.703914-1-dwindsor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_lsm.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c index 38e8e9176862..2f8103bfa14e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_lsm.c +++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c @@ -188,4 +188,13 @@ int BPF_PROG(null_check, struct file *file) return 0; } +SEC("lsm_cgroup/file_open") +__description("sleepable lsm_cgroup program is rejected") +__failure __msg("Program of this type cannot be sleepable") +__flag(BPF_F_SLEEPABLE) +int BPF_PROG(sleepable_lsm_cgroup) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 1f32c0d619d996b395f36a920f58159949be922a Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Thu, 11 Jun 2026 17:07:03 +0200 Subject: selftests/bpf: Retry map update on helper_fill_hashmap() helper_fill_hashmap() is also used in parallel and stress map tests. Those are consistently failing with ENOMEM on kernels built with PREEMPT_RT if preallocation is disabled. The failure is transient and only caused by the memory cache refill running in a preemptible irq_work, which can easily stall in case of contention. Use a retriable update in those cases to handle transient ENOMEM and make the test more stable also on PREEMPT_RT. Also fix the sign of the value printed in case of error (strerror() expects a positive errno while updates return it negative).
Signed-off-by: Gabriele Monaco Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260611150704.95133-1-gmonaco@redhat.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_maps.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index ccc5acd55ff9..c32da7bd8be2 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -260,6 +260,16 @@ static void test_hashmap_percpu(unsigned int task, void *data) close(fd); } +#define MAP_RETRIES 20 + +static bool can_retry(int err) +{ + return (err == EAGAIN || err == EBUSY || + ((err == ENOMEM || err == E2BIG) && + map_opts.map_flags == BPF_F_NO_PREALLOC)); +} + + #define VALUE_SIZE 3 static int helper_fill_hashmap(int max_entries) { @@ -274,10 +284,11 @@ static int helper_fill_hashmap(int max_entries) for (i = 0; i < max_entries; i++) { key = i; value[0] = key; - ret = bpf_map_update_elem(fd, &key, value, BPF_NOEXIST); + ret = map_update_retriable(fd, &key, value, BPF_NOEXIST, + MAP_RETRIES, can_retry); CHECK(ret != 0, "can't update hashmap", - "err: %s\n", strerror(ret)); + "err: %s\n", strerror(-ret)); } return fd; @@ -1392,17 +1403,9 @@ static void test_map_stress(void) #define DO_UPDATE 1 #define DO_DELETE 0 -#define MAP_RETRIES 20 #define MAX_DELAY_US 50000 #define MIN_DELAY_RANGE_US 5000 -static bool can_retry(int err) -{ - return (err == EAGAIN || err == EBUSY || - ((err == ENOMEM || err == E2BIG) && - map_opts.map_flags == BPF_F_NO_PREALLOC)); -} - int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts, retry_for_error_fn need_retry) { -- cgit v1.2.3 From 0c0a8ed85349dae298712d79cb276acfeb794d82 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Mon, 15 Jun 2026 10:19:54 +0800 Subject: bpf, sockmap: reject overflowing copy + len in bpf_msg_push_data() When the scatterlist ring is 
full or nearly full, bpf_msg_push_data() enters a copy fallback path and computes copy + len for the page allocation size. Since len comes from BPF with arg3_type = ARG_ANYTHING and both are u32, a crafted len can wrap the sum to a small value, causing an undersized allocation followed by an out-of-bounds memcpy. BUG: unable to handle page fault for address: ffffed104089a402 Oops: Oops: 0000 [#1] SMP KASAN NOPTI Call Trace: __asan_memcpy (mm/kasan/shadow.c:105) bpf_msg_push_data (net/core/filter.c:2852 net/core/filter.c:2788) bpf_prog_9ed8b5711920a7d7+0x2e/0x36 sk_psock_msg_verdict (net/core/skmsg.c:934) tcp_bpf_sendmsg (net/ipv4/tcp_bpf.c:421 net/ipv4/tcp_bpf.c:584) __sys_sendto (net/socket.c:2206) do_syscall_64 (arch/x86/entry/syscall_64.c:94) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Add an overflow check before the allocation. Link: https://lore.kernel.org/all/20260424155913.A19FDC19425@smtp.kernel.org Fixes: 6fff607e2f14 ("bpf: sk_msg program helper bpf_msg_push_data") Tested-by: Xiang Mei Tested-by: Xinyu Ma Reviewed-by: Jiayuan Chen Reviewed-by: Emil Tsalapatis Reviewed-by: Kuniyuki Iwashima Signed-off-by: Weiming Shi Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260615021959.140010-2-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 57b00c6cc8cc..4b159045881d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2829,6 +2829,9 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, if (!space || (space == 1 && start != offset)) copy = msg->sg.data[i].length; + if (unlikely(copy + len < copy)) + return -EINVAL; + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy + len)); if (unlikely(!page)) -- cgit v1.2.3 From f3f34ca45b96c21722204e30576fd29db8f1aff7 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Mon, 15 Jun 2026 10:19:55 +0800 Subject: bpf, sockmap: Fix 
wrong rsge offset in bpf_msg_push_data() When bpf_msg_push_data() splits a scatterlist element into head and tail, the tail's page offset is advanced by `start` (absolute message byte offset) instead of `start - offset` (byte position within the element). This makes rsge.offset overshoot by `offset` bytes, pointing to the wrong location within the page or beyond its boundary. Consumers of the corrupted entry either silently read wrong data or trigger an out-of-bounds access. BUG: KASAN: slab-use-after-free in bpf_msg_pull_data (net/core/filter.c:2728) Read of size 32752 at addr ffff8881042f0010 by task poc/130 Call Trace: __asan_memcpy (mm/kasan/shadow.c:105) bpf_msg_pull_data (net/core/filter.c:2728) bpf_prog_run_pin_on_cpu (include/linux/bpf.h:1402) sk_psock_msg_verdict (net/core/skmsg.c:934) tcp_bpf_send_verdict (net/ipv4/tcp_bpf.c:421) sock_sendmsg_nosec (net/socket.c:727) Fixes: 6fff607e2f14 ("bpf: sk_msg program helper bpf_msg_push_data") Reported-by: Xiang Mei Reviewed-by: Jiayuan Chen Reviewed-by: Emil Tsalapatis Reviewed-by: Kuniyuki Iwashima Signed-off-by: Weiming Shi Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260615021959.140010-3-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 4b159045881d..978e740792be 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2872,7 +2872,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, psge->length = start - offset; rsge.length -= psge->length; - rsge.offset += start; + rsge.offset += start - offset; sk_msg_iter_var_next(i); sg_unmark_end(psge); -- cgit v1.2.3 From 2ccbc9a3874620c9623419034f572e4507c33e4f Mon Sep 17 00:00:00 2001 From: Zhang Cen Date: Mon, 15 Jun 2026 10:19:56 +0800 Subject: bpf, sockmap: keep sk_msg copy state in sync SK_MSG uses msg->sg.copy as per-scatterlist-entry provenance. 
Entries with this bit set are copied before data/data_end are exposed to SK_MSG BPF programs for direct packet access. bpf_msg_pull_data(), bpf_msg_push_data(), and bpf_msg_pop_data() rewrite the sk_msg scatterlist ring by collapsing, splitting, and shifting entries. These operations move msg->sg.data[] entries, but the parallel copy bitmap can be left behind on the old slot. A copied entry can then return to msg->sg.start with its copy bit clear and be exposed as directly writable packet data. This corruption path requires an attached SK_MSG BPF program that calls the mutating helpers; ordinary sockmap/TLS traffic that never runs push/pop/pull helper sequences is not affected. Keep msg->sg.copy synchronized with scatterlist entry moves, preserve the copy bit when an entry is split, clear it when a helper replaces an entry with a private page, and clear slots vacated by pull-data compaction. Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data") Fixes: 6fff607e2f14 ("bpf: sk_msg program helper bpf_msg_push_data") Fixes: 7246d8ed4dcc ("bpf: helper to pop data from messages") Cc: stable@vger.kernel.org Co-developed-by: Han Guidong <2045gemini@gmail.com> Reviewed-by: John Fastabend Reviewed-by: Emil Tsalapatis Reviewed-by: Kuniyuki Iwashima Signed-off-by: Han Guidong <2045gemini@gmail.com> Signed-off-by: Zhang Cen Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260615021959.140010-4-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 5 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 978e740792be..3ecac0eb7da1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2654,6 +2654,38 @@ static void sk_msg_reset_curr(struct sk_msg *msg) } } +static bool sk_msg_elem_is_copy(const struct sk_msg *msg, u32 i) +{ + return test_bit(i, msg->sg.copy); +} + +static void sk_msg_clear_elem_copy(struct sk_msg 
*msg, u32 i) +{ + __clear_bit(i, msg->sg.copy); +} + +static void sk_msg_set_elem_copy(struct sk_msg *msg, u32 i) +{ + __set_bit(i, msg->sg.copy); +} + +static void sk_msg_clear_copy_range(struct sk_msg *msg, u32 start, u32 end) +{ + while (start != end) { + sk_msg_clear_elem_copy(msg, start); + sk_msg_iter_var_next(start); + } +} + +static void sk_msg_sg_move(struct sk_msg *msg, u32 dst, u32 src) +{ + msg->sg.data[dst] = msg->sg.data[src]; + if (sk_msg_elem_is_copy(msg, src)) + sk_msg_set_elem_copy(msg, dst); + else + sk_msg_clear_elem_copy(msg, dst); +} + static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .func = bpf_msg_cork_bytes, .gpl_only = false, @@ -2692,7 +2724,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) + if (!sk_msg_elem_is_copy(msg, i) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2738,6 +2770,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, } while (i != last_sge); sg_set_page(&msg->sg.data[first_sge], page, copy, 0); + sk_msg_clear_elem_copy(msg, first_sge); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and @@ -2747,8 +2780,14 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, shift = last_sge > first_sge ? 
last_sge - first_sge - 1 : NR_MSG_FRAG_IDS - first_sge + last_sge - 1; - if (!shift) + if (!shift) { + sk_msg_clear_elem_copy(msg, msg->sg.end); goto out; + } + + i = first_sge; + sk_msg_iter_var_next(i); + sk_msg_clear_copy_range(msg, i, last_sge); i = first_sge; sk_msg_iter_var_next(i); @@ -2762,16 +2801,18 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, if (move_from == msg->sg.end) break; - msg->sg.data[i] = msg->sg.data[move_from]; + sk_msg_sg_move(msg, i, move_from); msg->sg.data[move_from].length = 0; msg->sg.data[move_from].page_link = 0; msg->sg.data[move_from].offset = 0; + sk_msg_clear_elem_copy(msg, move_from); sk_msg_iter_var_next(i); } while (1); msg->sg.end = msg->sg.end - shift > msg->sg.end ? msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; + sk_msg_clear_elem_copy(msg, msg->sg.end); out: sk_msg_reset_curr(msg); msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; @@ -2792,8 +2833,10 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = { BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { + bool sge_copy = false, nsge_copy = false, nnsge_copy = false; struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; u32 new, i = 0, l = 0, space, copy = 0, offset = 0; + bool rsge_copy = false; u8 *raw, *to, *from; struct page *page; @@ -2869,6 +2912,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); + rsge_copy = sk_msg_elem_is_copy(msg, i); psge->length = start - offset; rsge.length -= psge->length; @@ -2894,23 +2938,34 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Shift one or two slots as needed */ sge = sk_msg_elem_cpy(msg, new); sg_unmark_end(&sge); + sge_copy = sk_msg_elem_is_copy(msg, new); nsge = sk_msg_elem_cpy(msg, i); + nsge_copy = sk_msg_elem_is_copy(msg, i); if (rsge.length) { sk_msg_iter_var_next(i); nnsge = sk_msg_elem_cpy(msg, i); 
+ nnsge_copy = sk_msg_elem_is_copy(msg, i); sk_msg_iter_next(msg, end); } while (i != msg->sg.end) { msg->sg.data[i] = sge; + if (sge_copy) + sk_msg_set_elem_copy(msg, i); + else + sk_msg_clear_elem_copy(msg, i); sge = nsge; + sge_copy = nsge_copy; sk_msg_iter_var_next(i); if (rsge.length) { nsge = nnsge; + nsge_copy = nnsge_copy; nnsge = sk_msg_elem_cpy(msg, i); + nnsge_copy = sk_msg_elem_is_copy(msg, i); } else { nsge = sk_msg_elem_cpy(msg, i); + nsge_copy = sk_msg_elem_is_copy(msg, i); } } @@ -2918,13 +2973,18 @@ place_new: /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; - __clear_bit(new, msg->sg.copy); + sk_msg_clear_elem_copy(msg, new); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); sk_msg_iter_var_next(new); msg->sg.data[new] = rsge; + if (rsge_copy) + sk_msg_set_elem_copy(msg, new); + else + sk_msg_clear_elem_copy(msg, new); } + sk_msg_clear_elem_copy(msg, msg->sg.end); sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); @@ -2950,27 +3010,38 @@ static void sk_msg_shift_left(struct sk_msg *msg, int i) do { prev = i; sk_msg_iter_var_next(i); - msg->sg.data[prev] = msg->sg.data[i]; + sk_msg_sg_move(msg, prev, i); } while (i != msg->sg.end); sk_msg_iter_prev(msg, end); + sk_msg_clear_elem_copy(msg, msg->sg.end); } static void sk_msg_shift_right(struct sk_msg *msg, int i) { struct scatterlist tmp, sge; + bool tmp_copy, sge_copy; sk_msg_iter_next(msg, end); sge = sk_msg_elem_cpy(msg, i); + sge_copy = sk_msg_elem_is_copy(msg, i); sk_msg_iter_var_next(i); tmp = sk_msg_elem_cpy(msg, i); + tmp_copy = sk_msg_elem_is_copy(msg, i); while (i != msg->sg.end) { msg->sg.data[i] = sge; + if (sge_copy) + sk_msg_set_elem_copy(msg, i); + else + sk_msg_clear_elem_copy(msg, i); sk_msg_iter_var_next(i); sge = tmp; + sge_copy = tmp_copy; tmp = sk_msg_elem_cpy(msg, i); + tmp_copy = sk_msg_elem_is_copy(msg, i); } + sk_msg_clear_elem_copy(msg, msg->sg.end); } 
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, @@ -3027,8 +3098,10 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, */ if (start != offset) { struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); + bool sge_copy = sk_msg_elem_is_copy(msg, i); int a = start - offset; int b = sge->length - pop - a; + u32 sge_idx = i; sk_msg_iter_var_next(i); @@ -3041,6 +3114,10 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, sg_set_page(nsge, sg_page(sge), b, sge->offset + pop + a); + if (sge_copy) + sk_msg_set_elem_copy(msg, i); + else + sk_msg_clear_elem_copy(msg, i); } else { struct page *page, *orig; u8 *to, *from; @@ -3057,6 +3134,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, memcpy(to, from, a); memcpy(to + a, from + a + pop, b); sg_set_page(sge, page, a + b, 0); + sk_msg_clear_elem_copy(msg, sge_idx); put_page(orig); } pop = 0; -- cgit v1.2.3 From c010995b29c8939c6aa69e3cb26f8dbee163d156 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 15 Jun 2026 10:19:57 +0800 Subject: sockmap: Fix use-after-free in udp_bpf_recvmsg() syzbot reported use-after-free of struct sk_msg in sk_msg_recvmsg(). [0] sk_msg_recvmsg() peeks sk_msg from psock->ingress_msg under a lock, but its processing is lockless. Thus, sk_msg_recvmsg() must be serialised by callers, otherwise multiple threads could touch the same sk_msg. For example, TCP uses lock_sock(), and AF_UNIX uses unix_sk(sk)->iolock. Initially, udp_bpf_recvmsg() had used lock_sock(), but the cited commit removed it. Let's serialise sk_msg_recvmsg() with lock_sock() in udp_bpf_recvmsg(). Note that holding spin_lock_bh(&sk->sk_receive_queue.lock) is not an option due to copy_page_to_iter() in sk_msg_recvmsg(). 
[0]: BUG: KASAN: slab-use-after-free in sk_msg_recvmsg+0xb54/0xc30 net/core/skmsg.c:428 Read of size 4 at addr ffff88814cdcf000 by task syz.0.24/6020 CPU: 1 UID: 0 PID: 6020 Comm: syz.0.24 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 01/13/2026 Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xba/0x230 mm/kasan/report.c:482 kasan_report+0x117/0x150 mm/kasan/report.c:595 sk_msg_recvmsg+0xb54/0xc30 net/core/skmsg.c:428 udp_bpf_recvmsg+0x4bd/0xe00 net/ipv4/udp_bpf.c:84 inet_recvmsg+0x260/0x270 net/ipv4/af_inet.c:891 sock_recvmsg_nosec net/socket.c:1078 [inline] sock_recvmsg+0x1a8/0x270 net/socket.c:1100 ____sys_recvmsg+0x1e6/0x4a0 net/socket.c:2812 ___sys_recvmsg+0x215/0x590 net/socket.c:2854 do_recvmmsg+0x334/0x800 net/socket.c:2949 __sys_recvmmsg net/socket.c:3023 [inline] __do_sys_recvmmsg net/socket.c:3046 [inline] __se_sys_recvmmsg net/socket.c:3039 [inline] __x64_sys_recvmmsg+0x198/0x250 net/socket.c:3039 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fb319f9aeb9 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fb31ad97028 EFLAGS: 00000246 ORIG_RAX: 000000000000012b RAX: ffffffffffffffda RBX: 00007fb31a216090 RCX: 00007fb319f9aeb9 RDX: 0000000000000001 RSI: 0000200000000400 RDI: 0000000000000004 RBP: 00007fb31a008c1f R08: 0000000000000000 R09: 0000000000000000 R10: 0000000040000021 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fb31a216128 R14: 00007fb31a216090 R15: 00007ffe21dd0a98 Allocated by task 6019: kasan_save_stack mm/kasan/common.c:57 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:78 poison_kmalloc_redzone 
mm/kasan/common.c:398 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:415 kasan_kmalloc include/linux/kasan.h:263 [inline] __kmalloc_cache_noprof+0x3d1/0x6e0 mm/slub.c:5780 kmalloc_noprof include/linux/slab.h:957 [inline] kzalloc_noprof include/linux/slab.h:1094 [inline] alloc_sk_msg net/core/skmsg.c:510 [inline] sk_psock_skb_ingress_self+0x60/0x350 net/core/skmsg.c:612 sk_psock_verdict_apply net/core/skmsg.c:1038 [inline] sk_psock_verdict_recv+0x7d9/0x8d0 net/core/skmsg.c:1236 udp_read_skb+0x73e/0x7e0 net/ipv4/udp.c:2045 sk_psock_verdict_data_ready+0x12d/0x550 net/core/skmsg.c:1257 __udp_enqueue_schedule_skb+0xc54/0x10b0 net/ipv4/udp.c:1789 __udp_queue_rcv_skb net/ipv4/udp.c:2346 [inline] udp_queue_rcv_one_skb+0xac5/0x19c0 net/ipv4/udp.c:2475 __udp4_lib_mcast_deliver+0xc06/0xcf0 net/ipv4/udp.c:2585 __udp4_lib_rcv+0x10f6/0x2620 net/ipv4/udp.c:2724 ip_protocol_deliver_rcu+0x282/0x440 net/ipv4/ip_input.c:207 ip_local_deliver_finish+0x3bb/0x6f0 net/ipv4/ip_input.c:241 NF_HOOK+0x336/0x3c0 include/linux/netfilter.h:318 dst_input include/net/dst.h:474 [inline] ip_sublist_rcv_finish+0x221/0x2a0 net/ipv4/ip_input.c:584 ip_list_rcv_finish net/ipv4/ip_input.c:628 [inline] ip_sublist_rcv+0x5c6/0xa70 net/ipv4/ip_input.c:644 ip_list_rcv+0x3f1/0x450 net/ipv4/ip_input.c:678 __netif_receive_skb_list_ptype net/core/dev.c:6195 [inline] __netif_receive_skb_list_core+0x7e5/0x810 net/core/dev.c:6242 __netif_receive_skb_list net/core/dev.c:6294 [inline] netif_receive_skb_list_internal+0x995/0xcf0 net/core/dev.c:6385 netif_receive_skb_list+0x54/0x410 net/core/dev.c:6437 xdp_recv_frames net/bpf/test_run.c:269 [inline] xdp_test_run_batch net/bpf/test_run.c:350 [inline] bpf_test_run_xdp_live+0x1946/0x1cf0 net/bpf/test_run.c:379 bpf_prog_test_run_xdp+0x81c/0x1160 net/bpf/test_run.c:1396 bpf_prog_test_run+0x2c7/0x340 kernel/bpf/syscall.c:4703 __sys_bpf+0x5cb/0x920 kernel/bpf/syscall.c:6182 __do_sys_bpf kernel/bpf/syscall.c:6274 [inline] __se_sys_bpf kernel/bpf/syscall.c:6272 [inline] 
__x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:6272 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 6021: kasan_save_stack mm/kasan/common.c:57 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:78 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:584 poison_slab_object mm/kasan/common.c:253 [inline] __kasan_slab_free+0x5c/0x80 mm/kasan/common.c:285 kasan_slab_free include/linux/kasan.h:235 [inline] slab_free_hook mm/slub.c:2540 [inline] slab_free mm/slub.c:6674 [inline] kfree+0x1be/0x650 mm/slub.c:6882 kfree_sk_msg include/linux/skmsg.h:385 [inline] sk_msg_recvmsg+0xaa8/0xc30 net/core/skmsg.c:483 udp_bpf_recvmsg+0x4bd/0xe00 net/ipv4/udp_bpf.c:84 inet_recvmsg+0x260/0x270 net/ipv4/af_inet.c:891 sock_recvmsg_nosec net/socket.c:1078 [inline] sock_recvmsg+0x1a8/0x270 net/socket.c:1100 ____sys_recvmsg+0x1e6/0x4a0 net/socket.c:2812 ___sys_recvmsg+0x215/0x590 net/socket.c:2854 do_recvmmsg+0x334/0x800 net/socket.c:2949 __sys_recvmmsg net/socket.c:3023 [inline] __do_sys_recvmmsg net/socket.c:3046 [inline] __se_sys_recvmmsg net/socket.c:3039 [inline] __x64_sys_recvmmsg+0x198/0x250 net/socket.c:3039 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 9f2470fbc4cb ("skmsg: Improve udp_bpf_recvmsg() accuracy") Reported-by: syzbot+9307c991a6d07ce6e6d8@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/69922ac9.a70a0220.2c38d7.00e0.GAE@google.com/ Reviewed-by: Jiayuan Chen Reviewed-by: Jakub Sitnicki Reviewed-by: Emil Tsalapatis Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260615021959.140010-5-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- net/ipv4/udp_bpf.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 
9f33b07b1481..ad57c4c9eaab 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -50,7 +50,9 @@ static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = udp_msg_has_data(sk, psock); if (!ret) { + release_sock(sk); wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + lock_sock(sk); ret = udp_msg_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); @@ -79,6 +81,7 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out; } + lock_sock(sk); msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { @@ -90,11 +93,17 @@ msg_bytes_ready: if (data) { if (psock_has_data(psock)) goto msg_bytes_ready; + + release_sock(sk); + ret = sk_udp_recvmsg(sk, msg, len, flags); goto out; } copied = -EAGAIN; } + + release_sock(sk); + ret = copied; out: sk_psock_put(sk, psock); -- cgit v1.2.3 From a48802fb2cd2d1e23651989f8ff4d15e9d5dad54 Mon Sep 17 00:00:00 2001 From: Sechang Lim Date: Mon, 15 Jun 2026 10:19:58 +0800 Subject: bpf, sockmap: fix integer overflow in bpf_msg_pop_data() bounds check start and len are u32, so u64 last = start + len; evaluates start + len in 32-bit and wraps before storing it in last. 
The bounds check if (start >= offset + l || last > msg->sg.size) return -EINVAL; can then be passed with an out-of-range start/len, after which the pop loop runs off the end of the scatterlist and sk_msg_shift_left() calls put_page() on the empty msg->sg.end slot: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] RIP: 0010:sk_msg_shift_left net/core/filter.c:2957 [inline] RIP: 0010:____bpf_msg_pop_data net/core/filter.c:3103 [inline] RIP: 0010:bpf_msg_pop_data+0x753/0x1a10 net/core/filter.c:2984 Call Trace: bpf_prog_4cc92c278f4d5d56+0x1b1/0x1e8 bpf_prog_run_pin_on_cpu+0x107/0x320 include/linux/filter.h:746 sk_psock_msg_verdict+0x357/0x7f0 net/core/skmsg.c:934 tcp_bpf_send_verdict net/ipv4/tcp_bpf.c:420 [inline] tcp_bpf_sendmsg+0x766/0x1ae0 net/ipv4/tcp_bpf.c:583 __sock_sendmsg+0x153/0x1c0 net/socket.c:802 __sys_sendto+0x326/0x430 net/socket.c:2265 __x64_sys_sendto+0xe3/0x100 net/socket.c:2268 do_syscall_64+0x14c/0x480 entry_SYSCALL_64_after_hwframe+0x77/0x7f Widen the addition with a (u64) cast so the bound is evaluated in 64-bit and a len near U32_MAX no longer wraps below msg->sg.size. While here, change pop from int to u32. It counts bytes against the unsigned scatterlist lengths and can never be negative, so the signed type only invites sign-confusion in the pop loop. 
Fixes: 7246d8ed4dcc ("bpf: helper to pop data from messages") Reviewed-by: Jiayuan Chen Reviewed-by: Emil Tsalapatis Reviewed-by: Kuniyuki Iwashima Signed-off-by: Sechang Lim Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260615021959.140010-6-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 3ecac0eb7da1..126aba56f1c0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3048,8 +3048,8 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { u32 i = 0, l = 0, space, offset = 0; - u64 last = start + len; - int pop; + u64 last = (u64)start + len; + u32 pop; if (unlikely(flags)) return -EINVAL; -- cgit v1.2.3 From 70b139d0483cd42808326c36c4b63d5be4a3cccb Mon Sep 17 00:00:00 2001 From: Sechang Lim Date: Mon, 15 Jun 2026 10:19:59 +0800 Subject: selftests/bpf: add test for bpf_msg_pop_data() overflow Add a test in sockmap_basic.c that calls bpf_msg_pop_data() with a length close to U32_MAX, which overflows the start + len bounds check. The sk_msg program records the return value over a sendmsg and the test checks that the call is rejected with -EINVAL. 
Reviewed-by: Jiayuan Chen Reviewed-by: Emil Tsalapatis Cc: Jiayuan Chen Signed-off-by: Sechang Lim Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260615021959.140010-7-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_basic.c | 48 ++++++++++++++++++++++ .../bpf/progs/test_sockmap_msg_pop_data.c | 27 ++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index d2846579285f..cb3229711f93 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -14,6 +14,7 @@ #include "test_sockmap_pass_prog.skel.h" #include "test_sockmap_drop_prog.skel.h" #include "test_sockmap_change_tail.skel.h" +#include "test_sockmap_msg_pop_data.skel.h" #include "bpf_iter_sockmap.skel.h" #include "sockmap_helpers.h" @@ -666,6 +667,51 @@ out: test_sockmap_change_tail__destroy(skel); } +static void test_sockmap_msg_verdict_pop_data(void) +{ + struct test_sockmap_msg_pop_data *skel; + int err, map, verdict; + int c1 = -1, p1 = -1, sent; + int zero = 0; + char *buf; + const size_t len = 32 * 1024; + + skel = test_sockmap_msg_pop_data__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + verdict = bpf_program__fd(skel->progs.prog_msg_pop_data); + map = bpf_map__fd(skel->maps.sock_map); + + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); + if (!ASSERT_OK(err, "create_pair")) + goto out; + + err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out_close; + + buf = calloc(len, 1); + if (!ASSERT_OK_PTR(buf, "calloc")) + goto out_close; + + sent = xsend(c1, buf, len, 0); + 
ASSERT_EQ(sent, (ssize_t)len, "xsend"); + ASSERT_EQ(skel->data->pop_data_ret, -EINVAL, "pop_data_rejects overflow"); + + free(buf); + +out_close: + close(c1); + close(p1); +out: + test_sockmap_msg_pop_data__destroy(skel); +} + static void test_sockmap_skb_verdict_peek_helper(int map) { int err, c1, p1, zero = 0, sent, recvd, avail; @@ -1373,6 +1419,8 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_fionread(false); if (test__start_subtest("sockmap skb_verdict change tail")) test_sockmap_skb_verdict_change_tail(); + if (test__start_subtest("sockmap msg_verdict pop_data overflow")) + test_sockmap_msg_verdict_pop_data(); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); if (test__start_subtest("sockmap skb_verdict msg_f_peek with link")) diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c b/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c new file mode 100644 index 000000000000..301e65b95256 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sock_map SEC(".maps"); + +#define POP_START 0x48a3 +#define POP_LEN 0xfffffffd + +long pop_data_ret = 1; + +SEC("sk_msg") +int prog_msg_pop_data(struct sk_msg_md *msg) +{ + if (msg->size <= POP_START) + return SK_PASS; + + pop_data_ret = bpf_msg_pop_data(msg, POP_START, POP_LEN, 0); + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From e4287bf34f97a88c7d9322f5bde828724c073a6b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 15 Jun 2026 00:17:58 -0700 Subject: selftests/bpf: Work around llvm stack overflow in crypto progs clang 23 fails to build crypto_bench.c and crypto_sanity.c with "BPF stack limit exceeded".
The progs fill a 408-byte bpf_crypto_params on the stack and pass it to bpf_crypto_ctx_create(). clang 23 copies the byte-aligned cipher/key globals into it one byte at a time through the stack, and keeps more than one copy of the struct around. Together that blows the 512-byte limit. Align the source arrays to 8 bytes so the copy is word-wise, and move params off the stack into a static .bss var. static keeps it out of the skeleton, where bpf_crypto_params is an incomplete type. Either change alone is not enough. Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/crypto_bench.c | 21 ++++++++++++++------- tools/testing/selftests/bpf/progs/crypto_sanity.c | 21 ++++++++++++++------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/crypto_bench.c b/tools/testing/selftests/bpf/progs/crypto_bench.c index 4ac956b26240..4c0a09aa1e6c 100644 --- a/tools/testing/selftests/bpf/progs/crypto_bench.c +++ b/tools/testing/selftests/bpf/progs/crypto_bench.c @@ -11,10 +11,19 @@ #include "crypto_common.h" const volatile unsigned int len = 16; -char cipher[128] = {}; +/* + * cipher[] and key[] are 8-byte aligned and 'params' is kept off the stack to + * work around an LLVM code generation bug. clang lowers the memcpy() of these + * byte-aligned globals into a per-byte load/store sequence staged on the stack, + * and additionally materializes the on-stack 'struct bpf_crypto_params' twice. + * Both blow the 512-byte BPF stack limit. Aligning the sources lets clang copy + * word-wise, and a global 'params' removes the large object from the stack. 
+ */ +char cipher[128] __attribute__((aligned(8))) = {}; u32 key_len, authsize; char dst[256] = {}; -u8 key[256] = {}; +u8 key[256] __attribute__((aligned(8))) = {}; +static struct bpf_crypto_params params; long hits = 0; int status; @@ -22,11 +31,6 @@ SEC("syscall") int crypto_setup(void *args) { struct bpf_crypto_ctx *cctx; - struct bpf_crypto_params params = { - .type = "skcipher", - .key_len = key_len, - .authsize = authsize, - }; int err = 0; status = 0; @@ -36,6 +40,9 @@ int crypto_setup(void *args) return 0; } + __builtin_memcpy(&params.type, "skcipher", sizeof("skcipher")); + params.key_len = key_len; + params.authsize = authsize; __builtin_memcpy(&params.algo, cipher, sizeof(cipher)); __builtin_memcpy(&params.key, key, sizeof(key)); cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err); diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c index dfd8a258f14a..e81f5ac3b1ae 100644 --- a/tools/testing/selftests/bpf/progs/crypto_sanity.c +++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c @@ -10,11 +10,20 @@ #include "bpf_kfuncs.h" #include "crypto_common.h" -unsigned char key[256] = {}; +/* + * key[] and algo[] are 8-byte aligned and 'params' is kept off the stack to + * work around an LLVM code generation bug. clang lowers the memcpy() of these + * byte-aligned globals into a per-byte load/store sequence staged on the stack, + * and additionally materializes the on-stack 'struct bpf_crypto_params' twice. + * Both blow the 512-byte BPF stack limit. Aligning the sources lets clang copy + * word-wise, and a global 'params' removes the large object from the stack.
+ */ +unsigned char key[256] __attribute__((aligned(8))) = {}; u16 udp_test_port = 7777; u32 authsize, key_len; -char algo[128] = {}; +char algo[128] __attribute__((aligned(8))) = {}; char dst[16] = {}, dst_bad[8] = {}; +static struct bpf_crypto_params params; int status; static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) @@ -53,11 +62,6 @@ static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) SEC("syscall") int skb_crypto_setup(void *ctx) { - struct bpf_crypto_params params = { - .type = "skcipher", - .key_len = key_len, - .authsize = authsize, - }; struct bpf_crypto_ctx *cctx; int err; @@ -67,6 +71,9 @@ int skb_crypto_setup(void *ctx) return 0; } + __builtin_memcpy(&params.type, "skcipher", sizeof("skcipher")); + params.key_len = key_len; + params.authsize = authsize; __builtin_memcpy(&params.algo, algo, sizeof(algo)); __builtin_memcpy(&params.key, key, sizeof(key)); -- cgit v1.2.3