From b5709f6d26d65f6bb9711f4b5f98469fd507cb5b Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Wed, 3 Dec 2025 15:37:44 -0800
Subject: bpf: Support associating BPF program with struct_ops

Add a new BPF command BPF_PROG_ASSOC_STRUCT_OPS to allow associating a
BPF program with a struct_ops map. This command takes the file
descriptors of a struct_ops map and a BPF program and sets
prog->aux->st_ops_assoc to the kdata of the struct_ops map. The command
accepts neither a struct_ops program nor a non-struct_ops map.

Programs of a struct_ops map are automatically associated with the map
during map update. If a program is shared between two struct_ops maps,
prog->aux->st_ops_assoc will be poisoned to indicate that the associated
struct_ops is ambiguous. The pointer, once poisoned, cannot be reset,
since we have lost track of the associated struct_ops. For other program
types, the associated struct_ops map, once set, cannot be changed later.
This restriction may be lifted in the future if there is a use case.

A kernel helper bpf_prog_get_assoc_struct_ops() can be used to retrieve
the associated struct_ops pointer. The returned pointer, if not NULL, is
guaranteed to be valid and to point to a fully updated struct_ops
struct. For a struct_ops program reused in multiple struct_ops maps, the
return value will be NULL.

prog->aux->st_ops_assoc is protected by bumping the refcount for
non-struct_ops programs and by RCU for struct_ops programs. Since it
would be inefficient to track the programs associated with a struct_ops
map, every non-struct_ops program bumps the refcount of the map to make
sure st_ops_assoc stays valid. A struct_ops program is instead protected
by RCU, as map_free waits for an RCU grace period before disassociating
the program from the map. The helper must be called in BPF program
context or an RCU read-side critical section.

struct_ops implementers should note that the struct_ops returned may not
be initialized or attached yet. The struct_ops implementer is
responsible for tracking and checking the state of the associated
struct_ops map if the use case expects an initialized or attached
struct_ops.
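As an illustration of the new UAPI surface, a minimal userspace wrapper
for the command could look like the sketch below. The command value and
the prog_assoc_struct_ops attr fields are the ones added by this patch;
the wrapper function itself is hypothetical, not part of the patch.

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    /* Associate the BPF program in prog_fd with the struct_ops map in
     * map_fd. Returns 0 on success, -1 with errno set on error.
     */
    static int prog_assoc_struct_ops(int prog_fd, int map_fd)
    {
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.prog_assoc_struct_ops.map_fd = map_fd;
        attr.prog_assoc_struct_ops.prog_fd = prog_fd;
        attr.prog_assoc_struct_ops.flags = 0;

        return syscall(__NR_bpf, BPF_PROG_ASSOC_STRUCT_OPS,
                       &attr, sizeof(attr));
    }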
Signed-off-by: Amery Hung
Signed-off-by: Andrii Nakryiko
Acked-by: Andrii Nakryiko
Acked-by: Martin KaFai Lau
Link: https://lore.kernel.org/bpf/20251203233748.668365-3-ameryhung@gmail.com
--- include/linux/bpf.h | 16 ++++++++++++++++ include/uapi/linux/bpf.h | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'include')
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6498be4c44f8..28d8d6b7bb1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1739,6 +1739,8 @@ struct bpf_prog_aux { struct rcu_head rcu; }; struct bpf_stream stream[2]; + struct mutex st_ops_assoc_mutex; + struct bpf_map __rcu *st_ops_assoc; }; struct bpf_prog { @@ -2041,6 +2043,9 @@ static inline void bpf_module_put(const void *data, struct module *owner) module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); +int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map); +void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog); +void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux); u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET @@ -2088,6 +2093,17 @@ static inline int bpf_struct_ops_link_create(union bpf_attr *attr) { return -EOPNOTSUPP; } +static inline int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) +{ + return -EOPNOTSUPP; +} +static inline void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) +{ +} +static inline void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) +{ + return NULL; +} static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f8d8513eda27..84ced3ed2d21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -918,6 +918,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -974,6 +984,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1894,6 +1905,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF
-- cgit v1.2.3

From 93f0d09697613beba922a387d21a09a41eeefef5 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan
Date: Fri, 19 Dec 2025 10:44:17 -0800
Subject: bpf: move recursion detection logic to helpers

BPF programs detect recursion by doing atomic inc/dec on a per-cpu
active counter from the trampoline. Create two helpers for operations on
this active counter; this makes it easy to change the recursion
detection logic in the future. This commit makes no functional changes.
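For orientation, an illustrative sketch (not from this patch) of how a
call site uses the new pair instead of open-coded
this_cpu_inc_return()/this_cpu_dec(), modeled on __bpf_trace_run(); the
wrapper name is made up:

    #include <linux/bpf.h>

    static void run_prog_once(struct bpf_prog *prog, void *ctx)
    {
        /* false means the program is already active on this CPU */
        if (unlikely(!bpf_prog_get_recursion_context(prog))) {
            bpf_prog_inc_misses_counter(prog);
            goto out;
        }
        prog->bpf_func(ctx, prog->insnsi);
    out:
        /* the counter was incremented either way, so always drop it */
        bpf_prog_put_recursion_context(prog);
    }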
Acked-by: Yonghong Song Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251219184422.2899902-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++++++ kernel/bpf/trampoline.c | 8 ++++---- kernel/trace/bpf_trace.c | 4 ++-- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bb3847caeae1..2da986136d26 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2004,6 +2004,16 @@ struct bpf_struct_ops_common_value { enum bpf_struct_ops_state state; }; +static inline bool bpf_prog_get_recursion_context(struct bpf_prog *prog) +{ + return this_cpu_inc_return(*(prog->active)) == 1; +} + +static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) +{ + this_cpu_dec(*(prog->active)); +} + #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) /* This macro helps developer to register a struct_ops type and generate * type information correctly. Developers should use this macro to register diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 976d89011b15..2a125d063e62 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -949,7 +949,7 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); @@ -993,7 +993,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); rcu_read_unlock_migrate(); } @@ -1029,7 +1029,7 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); @@ -1044,7 +1044,7 @@ void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); migrate_enable(); rcu_read_unlock_trace(); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fe28d86f7c35..6e076485bf70 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2063,7 +2063,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) struct bpf_trace_run_ctx run_ctx; cant_sleep(); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; } @@ -2077,7 +2077,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) bpf_reset_run_ctx(old_run_ctx); out: - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); } #define UNPACK(...) __VA_ARGS__ -- cgit v1.2.3 From c3e34f88f9992866a1fb510850921a8fe299a97b Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 19 Dec 2025 10:44:18 -0800 Subject: bpf: arm64: Optimize recursion detection by not using atomics BPF programs detect recursion using a per-CPU 'active' flag in struct bpf_prog. 
The trampoline currently sets/clears this flag with atomic operations. On some arm64 platforms (e.g., Neoverse V2 with LSE), per-CPU atomic operations are relatively slow. Unlike x86_64 - where per-CPU updates can avoid cross-core atomicity, arm64 LSE atomics are always atomic across all cores, which is unnecessary overhead for strictly per-CPU state. This patch removes atomics from the recursion detection path on arm64 by changing 'active' to a per-CPU array of four u8 counters, one per context: {NMI, hard-irq, soft-irq, normal}. The running context uses a non-atomic increment/decrement on its element. After increment, recursion is detected by reading the array as a u32 and verifying that only the expected element changed; any change in another element indicates inter-context recursion, and a value > 1 in the same element indicates same-context recursion. For example, starting from {0,0,0,0}, a normal-context trigger changes the array to {0,0,0,1}. If an NMI arrives on the same CPU and triggers the program, the array becomes {1,0,0,1}. When the NMI context checks the u32 against the expected mask for normal (0x00000001), it observes 0x01000001 and correctly reports recursion. Same-context recursion is detected analogously. Acked-by: Yonghong Song Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251219184422.2899902-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 32 +++++++++++++++++++++++++++++--- kernel/bpf/core.c | 3 ++- 2 files changed, 31 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2da986136d26..da6a00dd313f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1746,6 +1746,8 @@ struct bpf_prog_aux { struct bpf_map __rcu *st_ops_assoc; }; +#define BPF_NR_CONTEXTS 4 /* normal, softirq, hardirq, NMI */ + struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? 
*/ @@ -1772,7 +1774,7 @@ struct bpf_prog { u8 tag[BPF_TAG_SIZE]; }; struct bpf_prog_stats __percpu *stats; - int __percpu *active; + u8 __percpu *active; /* u8[BPF_NR_CONTEXTS] for recursion protection */ unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); struct bpf_prog_aux *aux; /* Auxiliary fields */ @@ -2006,12 +2008,36 @@ struct bpf_struct_ops_common_value { static inline bool bpf_prog_get_recursion_context(struct bpf_prog *prog) { - return this_cpu_inc_return(*(prog->active)) == 1; +#ifdef CONFIG_ARM64 + u8 rctx = interrupt_context_level(); + u8 *active = this_cpu_ptr(prog->active); + u32 val; + + preempt_disable(); + active[rctx]++; + val = le32_to_cpu(*(__le32 *)active); + preempt_enable(); + if (val != BIT(rctx * 8)) + return false; + + return true; +#else + return this_cpu_inc_return(*(int __percpu *)(prog->active)) == 1; +#endif } static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) { - this_cpu_dec(*(prog->active)); +#ifdef CONFIG_ARM64 + u8 rctx = interrupt_context_level(); + u8 *active = this_cpu_ptr(prog->active); + + preempt_disable(); + active[rctx]--; + preempt_enable(); +#else + this_cpu_dec(*(int __percpu *)(prog->active)); +#endif } #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c66316e32563..e0b8a8a5aaa9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -112,7 +112,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag vfree(fp); return NULL; } - fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); + fp->active = __alloc_percpu_gfp(sizeof(u8[BPF_NR_CONTEXTS]), 4, + bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (!fp->active) { vfree(fp); kfree(aux); -- cgit v1.2.3 From 4221de8c410e7adb1114a4374c78fa4269175343 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 22 Dec 2025 20:41:51 -0800 Subject: mm: declare memcg_page_state_output() in memcontrol.h To use memcg_page_state_output() in bpf_memcontrol.c move the declaration from v1-specific memcontrol-v1.h to memcontrol.h. 
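For context (illustrative only): with the declaration in memcontrol.h,
which pairs it with a static inline stub returning 0 for !CONFIG_MEMCG
builds, a hypothetical caller outside mm/ compiles under either
configuration:

    #include <linux/memcontrol.h>

    /* Builds with CONFIG_MEMCG=y (real implementation) and with
     * CONFIG_MEMCG=n (inline stub returning 0) alike.
     */
    static unsigned long peek_memcg_stat(struct mem_cgroup *memcg, int item)
    {
        return memcg_page_state_output(memcg, item);
    }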
Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Link: https://lore.kernel.org/r/20251223044156.208250-2-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol-v1.h | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0651865a4564..7bef427d5a82 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -950,6 +950,7 @@ static inline void mod_memcg_page_state(struct page *page, } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); +unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); @@ -1373,6 +1374,11 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) return 0; } +static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item) +{ + return 0; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index 6358464bb416..a304ad418cdf 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -27,7 +27,6 @@ unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); void drain_all_stock(struct mem_cgroup *root_memcg); unsigned long memcg_events(struct mem_cgroup *memcg, int event); -unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); int memory_stat_show(struct seq_file *m, void *v); void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n); -- cgit v1.2.3 From 99430ab8b804c26b8a0dec93fcbfe75469f3edc7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 22 Dec 2025 20:41:54 -0800 Subject: mm: introduce BPF kfuncs to access memcg statistics and events Introduce BPF kfuncs to conveniently access memcg data: - bpf_mem_cgroup_vm_events(), - bpf_mem_cgroup_memory_events(), - bpf_mem_cgroup_usage(), - bpf_mem_cgroup_page_state(), - bpf_mem_cgroup_flush_stats(). These functions are useful for implementing BPF OOM policies, but also can be used to accelerate access to the memcg data. Reading it through cgroupfs is much more expensive, roughly 5x, mostly because of the need to convert the data into the text and back. JP Kobryn: An experiment was setup to compare the performance of a program that uses the traditional method of reading memory.stat vs a program using the new kfuncs. The control program opens up the root memory.stat file and for 1M iterations reads, converts the string values to numeric data, then seeks back to the beginning. The experimental program sets up the requisite libbpf objects and for 1M iterations invokes a bpf program which uses the kfuncs to fetch all available stats for node_stat_item, memcg_stat_item, and vm_event_item types. The results showed a significant perf benefit on the experimental side, outperforming the control side by a margin of 93%. In kernel mode, elapsed time was reduced by 80%, while in user mode, over 99% of time was saved. control: elapsed time real 0m38.318s user 0m25.131s sys 0m13.070s experiment: elapsed time real 0m2.789s user 0m0.187s sys 0m2.512s control: perf data 33.43% a.out libc.so.6 [.] __vfscanf_internal 6.88% a.out [kernel.kallsyms] [k] vsnprintf 6.33% a.out libc.so.6 [.] 
_IO_fgets 5.51% a.out [kernel.kallsyms] [k] format_decode 4.31% a.out libc.so.6 [.] __GI_____strtoull_l_internal 3.78% a.out [kernel.kallsyms] [k] string 3.53% a.out [kernel.kallsyms] [k] number 2.71% a.out libc.so.6 [.] _IO_sputbackc 2.41% a.out [kernel.kallsyms] [k] strlen 1.98% a.out a.out [.] main 1.70% a.out libc.so.6 [.] _IO_getline_info 1.51% a.out libc.so.6 [.] __isoc99_sscanf 1.47% a.out [kernel.kallsyms] [k] memory_stat_format 1.47% a.out [kernel.kallsyms] [k] memcpy_orig 1.41% a.out [kernel.kallsyms] [k] seq_buf_printf experiment: perf data 10.55% memcgstat bpf_prog_..._query [k] bpf_prog_16aab2f19fa982a7_query 6.90% memcgstat [kernel.kallsyms] [k] memcg_page_state_output 3.55% memcgstat [kernel.kallsyms] [k] _raw_spin_lock 3.12% memcgstat [kernel.kallsyms] [k] memcg_events 2.87% memcgstat [kernel.kallsyms] [k] __memcg_slab_post_alloc_hook 2.73% memcgstat [kernel.kallsyms] [k] kmem_cache_free 2.70% memcgstat [kernel.kallsyms] [k] entry_SYSRETQ_unsafe_stack 2.25% memcgstat [kernel.kallsyms] [k] __memcg_slab_free_hook 2.06% memcgstat [kernel.kallsyms] [k] get_page_from_freelist Signed-off-by: Roman Gushchin Co-developed-by: JP Kobryn Signed-off-by: JP Kobryn Link: https://lore.kernel.org/r/20251223044156.208250-5-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/memcontrol.h | 14 ++++++++ mm/bpf_memcontrol.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 16 +++++++++ 3 files changed, 115 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7bef427d5a82..6a5d65487b70 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -949,8 +949,12 @@ static inline void mod_memcg_page_state(struct page *page, rcu_read_unlock(); } +unsigned long memcg_events(struct mem_cgroup *memcg, int event); +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); +bool memcg_stat_item_valid(int idx); +bool memcg_vm_event_item_valid(enum vm_event_item idx); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); @@ -1379,6 +1383,16 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, in return 0; } +static inline bool memcg_stat_item_valid(int idx) +{ + return false; +} + +static inline bool memcg_vm_event_item_valid(enum vm_event_item idx) +{ + return false; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 187919eb2fe2..e8fa7f5855f9 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -80,6 +80,85 @@ __bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg) css_put(&memcg->css); } +/** + * bpf_mem_cgroup_vm_events - Read memory cgroup's vm event counter + * @memcg: memory cgroup + * @event: event id + * + * Allows to read memory cgroup event counters. + * + * Return: The current value of the corresponding events counter. 
+ */ +__bpf_kfunc unsigned long bpf_mem_cgroup_vm_events(struct mem_cgroup *memcg, + enum vm_event_item event) +{ + if (unlikely(!memcg_vm_event_item_valid(event))) + return (unsigned long)-1; + + return memcg_events(memcg, event); +} + +/** + * bpf_mem_cgroup_usage - Read memory cgroup's usage + * @memcg: memory cgroup + * + * Please note that the root memory cgroup is special and is exempt + * from memory accounting. The returned value is a sum of sub-cgroups' + * usages and does not reflect the size of the root memory cgroup itself. + * If you need an approximation, you can use root-level statistics: + * e.g. NR_FILE_PAGES + NR_ANON_MAPPED. + * + * Return: The current memory cgroup size in bytes. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_usage(struct mem_cgroup *memcg) +{ + return page_counter_read(&memcg->memory) * PAGE_SIZE; +} + +/** + * bpf_mem_cgroup_memory_events - Read memory cgroup's memory event value + * @memcg: memory cgroup + * @event: memory event id + * + * Return: The current value of the memory event counter. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_memory_events(struct mem_cgroup *memcg, + enum memcg_memory_event event) +{ + if (unlikely(event >= MEMCG_NR_MEMORY_EVENTS)) + return (unsigned long)-1; + + return atomic_long_read(&memcg->memory_events[event]); +} + +/** + * bpf_mem_cgroup_page_state - Read memory cgroup's page state counter + * @memcg: memory cgroup + * @idx: counter idx + * + * Allows to read memory cgroup statistics. The output is in bytes. + * + * Return: The value of the page state counter in bytes. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_page_state(struct mem_cgroup *memcg, int idx) +{ + if (unlikely(!memcg_stat_item_valid(idx))) + return (unsigned long)-1; + + return memcg_page_state_output(memcg, idx); +} + +/** + * bpf_mem_cgroup_flush_stats - Flush memory cgroup's statistics + * @memcg: memory cgroup + * + * Propagate memory cgroup's statistics up the cgroup tree.
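For orientation, a BPF-side consumer of these kfuncs could look roughly
like the sketch below. The kfunc names and the acquire/release pairing
of bpf_get_root_mem_cgroup()/bpf_put_mem_cgroup() come from this series;
the program section and overall shape are assumptions for illustration:

    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>

    extern struct mem_cgroup *bpf_get_root_mem_cgroup(void) __ksym;
    extern void bpf_put_mem_cgroup(struct mem_cgroup *memcg) __ksym;
    extern unsigned long bpf_mem_cgroup_usage(struct mem_cgroup *memcg) __ksym;

    SEC("syscall")
    int dump_root_usage(void *ctx)
    {
        struct mem_cgroup *memcg = bpf_get_root_mem_cgroup();
        unsigned long bytes;

        if (!memcg)     /* the acquire kfunc is KF_RET_NULL */
            return 0;
        bytes = bpf_mem_cgroup_usage(memcg);
        bpf_put_mem_cgroup(memcg);
        bpf_printk("root memcg usage: %lu bytes", bytes);
        return 0;
    }

    char LICENSE[] SEC("license") = "GPL";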
+ */ +__bpf_kfunc void bpf_mem_cgroup_flush_stats(struct mem_cgroup *memcg) +{ + mem_cgroup_flush_stats(memcg); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_memcontrol_kfuncs) @@ -87,6 +166,12 @@ BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE) + BTF_KFUNCS_END(bpf_memcontrol_kfuncs) static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index be810c1fbfc3..bae4eb72da61 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -663,6 +663,14 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) return x; } +bool memcg_stat_item_valid(int idx) +{ + if ((u32)idx >= MEMCG_NR_STAT) + return false; + + return !BAD_STAT_IDX(memcg_stats_index(idx)); +} + static int memcg_page_state_unit(int item); /* @@ -860,6 +868,14 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event) return READ_ONCE(memcg->vmstats->events[i]); } +bool memcg_vm_event_item_valid(enum vm_event_item idx) +{ + if (idx >= NR_VM_EVENT_ITEMS) + return false; + + return !BAD_STAT_IDX(memcg_events_index(idx)); +} + #ifdef CONFIG_MEMCG_V1 unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) {
-- cgit v1.2.3

From b8467290edab4bafae352bf3f317055669a1a458 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan
Date: Mon, 22 Dec 2025 11:50:18 -0800
Subject: bpf: arena: make arena kfuncs any context safe

Make arena-related kfuncs safe to call from any context by the
following changes:

bpf_arena_alloc_pages() and bpf_arena_reserve_pages(): Replace the usage
of the mutex with a rqspinlock for the range tree and use
kmalloc_nolock() wherever needed. Use free_pages_nolock() to free pages
from any context. apply_range_set/clear_cb() with apply_to_page_range()
has already made populating the vm_area in bpf_arena_alloc_pages() safe
for any context.

bpf_arena_free_pages(): Defer the main logic to a workqueue if it is
called from a non-sleepable context. specialize_kfunc() is used to
replace the sleepable arena_free_pages() with
bpf_arena_free_pages_non_sleepable() when the verifier detects the call
is from a non-sleepable context. In the non-sleepable case,
arena_free_pages() queues the address and the page count to be freed to
a lock-less list of struct arena_free_span entries and raises an
irq_work. The irq_work handler calls schedule_work(), as it is safe to
call from irq context. arena_free_worker() (the workqueue handler)
iterates these spans and clears ptes, flushes the tlb, zaps pages, and
calls __free_page().
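The deferred-free path follows a standard any-context pattern: a
lock-less list push plus irq_work, with the irq_work handler doing
nothing but schedule_work(). A simplified sketch with hypothetical type
and function names, mirroring the fields this patch adds to struct
bpf_arena:

    #include <linux/slab.h>
    #include <linux/llist.h>
    #include <linux/irq_work.h>
    #include <linux/workqueue.h>

    /* Hypothetical container, standing in for struct bpf_arena */
    struct arena_like {
        struct llist_head free_spans;
        struct irq_work free_irq;
        struct work_struct free_work;
    };

    struct free_span {
        struct llist_node node;
        unsigned long uaddr;
        u32 page_cnt;
    };

    /* Safe in any context: no locks, no sleeping allocations */
    static void defer_free(struct arena_like *a, unsigned long uaddr,
                           u32 page_cnt)
    {
        struct free_span *s = kmalloc_nolock(sizeof(*s), 0, -1);

        if (!s)
            return;     /* span stays inaccessible until arena teardown */
        s->uaddr = uaddr;
        s->page_cnt = page_cnt;
        llist_add(&s->node, &a->free_spans);
        irq_work_queue(&a->free_irq);
    }

    static void free_irq_fn(struct irq_work *iw)
    {
        struct arena_like *a = container_of(iw, struct arena_like, free_irq);

        /* irq context: only kick the sleepable worker */
        schedule_work(&a->free_work);
    }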
Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251222195022.431211-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 ++++ kernel/bpf/arena.c | 248 +++++++++++++++++++++++++++++++++++++++++--------- kernel/bpf/verifier.c | 10 ++ 3 files changed, 233 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index da6a00dd313f..4e7d72dfbcd4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -673,6 +673,22 @@ void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit); +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, + u64 flags); +void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt); +#else +static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + return NULL; +} + +static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) +{ +} +#endif + extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 128efb68d47b..456ac989269d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -4,7 +4,9 @@ #include #include #include +#include #include "linux/filter.h" +#include #include #include #include @@ -44,7 +46,7 @@ #define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) -static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt); +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable); struct bpf_arena { struct bpf_map map; @@ -52,8 +54,23 @@ struct bpf_arena { u64 user_vm_end; struct vm_struct *kern_vm; struct range_tree rt; + /* protects rt */ + rqspinlock_t spinlock; struct list_head vma_list; + /* protects vma_list */ struct mutex lock; + struct irq_work free_irq; + struct work_struct free_work; + struct llist_head free_spans; +}; + +static void arena_free_worker(struct work_struct *work); +static void arena_free_irq(struct irq_work *iw); + +struct arena_free_span { + struct llist_node node; + unsigned long uaddr; + u32 page_cnt; }; u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) @@ -127,7 +144,7 @@ static void flush_vmap_cache(unsigned long start, unsigned long size) flush_cache_vmap(start, start + size); } -static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages) { pte_t old_pte; struct page *page; @@ -137,17 +154,15 @@ static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) if (pte_none(old_pte) || !pte_present(old_pte)) return 0; /* nothing to do */ - /* get page and free it */ page = pte_page(old_pte); if (WARN_ON_ONCE(!page)) return -EINVAL; pte_clear(&init_mm, addr, pte); - /* ensure no stale TLB entries */ - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - - __free_page(page); + /* Add page to the list so it is freed later */ + if (free_pages) + __llist_add(&page->pcp_llist, free_pages); return 0; } @@ -202,6 +217,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) arena->user_vm_end = arena->user_vm_start + vm_range; 
INIT_LIST_HEAD(&arena->vma_list); + init_llist_head(&arena->free_spans); + init_irq_work(&arena->free_irq, arena_free_irq); + INIT_WORK(&arena->free_work, arena_free_worker); bpf_map_init_from_attr(&arena->map, attr); range_tree_init(&arena->rt); err = range_tree_set(&arena->rt, 0, attr->max_entries); @@ -210,6 +228,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) goto err; } mutex_init(&arena->lock); + raw_res_spin_lock_init(&arena->spinlock); err = populate_pgtable_except_pte(arena); if (err) { range_tree_destroy(&arena->rt); @@ -256,6 +275,10 @@ static void arena_map_free(struct bpf_map *map) if (WARN_ON_ONCE(!list_empty(&arena->vma_list))) return; + /* Ensure no pending deferred frees */ + irq_work_sync(&arena->free_irq); + flush_work(&arena->free_work); + /* * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area(). * It unmaps everything from vmalloc area and clears pgtables. @@ -339,12 +362,16 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) struct bpf_arena *arena = container_of(map, struct bpf_arena, map); struct page *page; long kbase, kaddr; + unsigned long flags; int ret; kbase = bpf_arena_get_kern_vm_start(arena); kaddr = kbase + (u32)(vmf->address); - guard(mutex)(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + /* Make a reasonable effort to address impossible case */ + return VM_FAULT_RETRY; + page = vmalloc_to_page((void *)kaddr); if (page) /* already have a page vmap-ed */ @@ -352,31 +379,35 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; struct apply_range_data data = { .pages = &page, .i = 0 }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; } ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - __free_page(page); - return VM_FAULT_SIGSEGV; + free_pages_nolock(page, 0); + goto out_unlock_sigsegv; } flush_vmap_cache(kaddr, PAGE_SIZE); out: page_ref_add(page, 1); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); vmf->page = page; return 0; +out_unlock_sigsegv: + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + return VM_FAULT_SIGSEGV; } static const struct vm_operations_struct arena_vm_ops = { @@ -497,7 +528,8 @@ static u64 clear_lo32(u64 val) * Allocate pages and vmap them into kernel vmalloc area. * Later the pages will be mmaped into user space vma. 
*/ -static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id) +static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id, + bool sleepable) { /* user_vm_end/start are fixed before bpf prog runs */ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; @@ -506,6 +538,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt struct page **pages = NULL; long remaining, mapped = 0; long alloc_pages; + unsigned long flags; long pgoff = 0; u32 uaddr32; int ret, i; @@ -529,7 +562,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; data.pages = pages; - mutex_lock(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + goto out_free_pages; if (uaddr) { ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); @@ -573,7 +607,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt /* data.i pages were mapped, account them and free the remaining */ mapped += data.i; for (i = data.i; i < this_batch; i++) - __free_page(pages[i]); + free_pages_nolock(pages[i], 0); goto out; } @@ -581,19 +615,19 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt remaining -= this_batch; } flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); - mutex_unlock(&arena->lock); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); kfree_nolock(pages); return clear_lo32(arena->user_vm_start) + uaddr32; out: range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped); - mutex_unlock(&arena->lock); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); if (mapped) { flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); - arena_free_pages(arena, uaddr32, mapped); + arena_free_pages(arena, uaddr32, mapped, sleepable); } goto out_free_pages; out_unlock_free_pages: - mutex_unlock(&arena->lock); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); out_free_pages: kfree_nolock(pages); return 0; @@ -608,42 +642,64 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { struct vma_list *vml; + guard(mutex)(&arena->lock); + /* iterate link list under lock */ list_for_each_entry(vml, &arena->vma_list, head) zap_page_range_single(vml->vma, uaddr, PAGE_SIZE * page_cnt, NULL); } -static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) { u64 full_uaddr, uaddr_end; - long kaddr, pgoff, i; + long kaddr, pgoff; struct page *page; + struct llist_head free_pages; + struct llist_node *pos, *t; + struct arena_free_span *s; + unsigned long flags; + int ret = 0; /* only aligned lower 32-bit are relevant */ uaddr = (u32)uaddr; uaddr &= PAGE_MASK; + kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; full_uaddr = clear_lo32(arena->user_vm_start) + uaddr; uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT)); if (full_uaddr >= uaddr_end) return; page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; + pgoff = compute_pgoff(arena, uaddr); - guard(mutex)(&arena->lock); + if (!sleepable) + goto defer; + + ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags); + + /* Can't proceed without holding the spinlock so defer the free */ + if (ret) + goto defer; - pgoff = compute_pgoff(arena, uaddr); - /* clear range */ range_tree_set(&arena->rt, pgoff, page_cnt); + init_llist_head(&free_pages); + /* clear ptes and collect 
struct pages */ + apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, + apply_range_clear_cb, &free_pages); + + /* drop the lock to do the tlb flush and zap pages */ + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + + /* ensure no stale TLB entries */ + flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE)); + if (page_cnt > 1) /* bulk zap if multiple pages being freed */ zap_pages(arena, full_uaddr, page_cnt); - kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; - for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) { - page = vmalloc_to_page((void *)kaddr); - if (!page) - continue; + llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { + page = llist_entry(pos, struct page, pcp_llist); if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ /* Optimization for the common case of page_cnt==1: * If page wasn't mapped into some user vma there @@ -651,9 +707,25 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) * page_cnt is big it's faster to do the batched zap. */ zap_pages(arena, full_uaddr, 1); - apply_to_existing_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_clear_cb, - NULL); + __free_page(page); } + + return; + +defer: + s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1); + if (!s) + /* + * If allocation fails in non-sleepable context, pages are intentionally left + * inaccessible (leaked) until the arena is destroyed. Cleanup or retries are not + * possible here, so we intentionally omit them for safety. + */ + return; + + s->page_cnt = page_cnt; + s->uaddr = uaddr; + llist_add(&s->node, &arena->free_spans); + irq_work_queue(&arena->free_irq); } /* @@ -663,6 +735,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt) { long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + unsigned long flags; long pgoff; int ret; @@ -673,15 +746,87 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt if (pgoff + page_cnt > page_cnt_max) return -EINVAL; - guard(mutex)(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + return -EBUSY; /* Cannot guard already allocated pages. */ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); - if (ret) - return -EBUSY; + if (ret) { + ret = -EBUSY; + goto out; + } /* "Allocate" the region to prevent it from being allocated. 
*/ - return range_tree_clear(&arena->rt, pgoff, page_cnt); + ret = range_tree_clear(&arena->rt, pgoff, page_cnt); +out: + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + return ret; +} + +static void arena_free_worker(struct work_struct *work) +{ + struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work); + struct llist_node *list, *pos, *t; + struct arena_free_span *s; + u64 arena_vm_start, user_vm_start; + struct llist_head free_pages; + struct page *page; + unsigned long full_uaddr; + long kaddr, page_cnt, pgoff; + unsigned long flags; + + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) { + schedule_work(work); + return; + } + + init_llist_head(&free_pages); + arena_vm_start = bpf_arena_get_kern_vm_start(arena); + user_vm_start = bpf_arena_get_user_vm_start(arena); + + list = llist_del_all(&arena->free_spans); + llist_for_each(pos, list) { + s = llist_entry(pos, struct arena_free_span, node); + page_cnt = s->page_cnt; + kaddr = arena_vm_start + s->uaddr; + pgoff = compute_pgoff(arena, s->uaddr); + + /* clear ptes and collect pages in free_pages llist */ + apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, + apply_range_clear_cb, &free_pages); + + range_tree_set(&arena->rt, pgoff, page_cnt); + } + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + + /* Iterate the list again without holding spinlock to do the tlb flush and zap_pages */ + llist_for_each_safe(pos, t, list) { + s = llist_entry(pos, struct arena_free_span, node); + page_cnt = s->page_cnt; + full_uaddr = clear_lo32(user_vm_start) + s->uaddr; + kaddr = arena_vm_start + s->uaddr; + + /* ensure no stale TLB entries */ + flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE)); + + /* remove pages from user vmas */ + zap_pages(arena, full_uaddr, page_cnt); + + kfree_nolock(s); + } + + /* free all pages collected by apply_to_existing_page_range() in the first loop */ + llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { + page = llist_entry(pos, struct page, pcp_llist); + __free_page(page); + } +} + +static void arena_free_irq(struct irq_work *iw) +{ + struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq); + + schedule_work(&arena->free_work); } __bpf_kfunc_start_defs(); @@ -695,9 +840,20 @@ __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) return NULL; - return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id); + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true); } +void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false); +} __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) { struct bpf_map *map = p__map; @@ -705,7 +861,17 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) return; - arena_free_pages(arena, (long)ptr__ign, page_cnt); + arena_free_pages(arena, (long)ptr__ign, page_cnt, true); +} + +void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = 
container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) + return; + arena_free_pages(arena, (long)ptr__ign, page_cnt, false); } __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt) @@ -724,9 +890,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d6b8a77fbe3b..2de1a736ef69 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12380,6 +12380,8 @@ enum special_kfunc_type { KF___bpf_trap, KF_bpf_task_work_schedule_signal_impl, KF_bpf_task_work_schedule_resume_impl, + KF_bpf_arena_alloc_pages, + KF_bpf_arena_free_pages, }; BTF_ID_LIST(special_kfunc_list) @@ -12454,6 +12456,8 @@ BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) BTF_ID(func, bpf_task_work_schedule_signal_impl) BTF_ID(func, bpf_task_work_schedule_resume_impl) +BTF_ID(func, bpf_arena_alloc_pages) +BTF_ID(func, bpf_arena_free_pages) static bool is_task_work_add_kfunc(u32 func_id) { @@ -22432,6 +22436,12 @@ static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { if (!env->insn_aux_data[insn_idx].non_sleepable) addr = (unsigned long)bpf_dynptr_from_file_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_free_pages_non_sleepable; } desc->addr = addr; return 0; -- cgit v1.2.3 From 7646c7afd9a95db0b0cb4ad066ed90f6024da67d Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:28 -0800 Subject: bpf: Remove redundant KF_TRUSTED_ARGS flag from all kfuncs Now that KF_TRUSTED_ARGS is the default for all kfuncs, remove the explicit KF_TRUSTED_ARGS flag from all kfunc definitions and remove the flag itself. 
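In practice the conversion is mechanical; an illustrative before/after
for a made-up kfunc name (trusted-args enforcement itself is unchanged,
only the explicit opt-in goes away):

    /* before: trusted arguments had to be requested explicitly */
    BTF_ID_FLAGS(func, bpf_example_kfunc, KF_TRUSTED_ARGS)

    /* after: the same enforcement is the default; other flags stay */
    BTF_ID_FLAGS(func, bpf_example_kfunc)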
Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- fs/bpf_fs_kfuncs.c | 13 ++++++------- fs/verity/measure.c | 2 +- include/linux/bpf.h | 2 +- include/linux/btf.h | 3 +-- kernel/bpf/arena.c | 6 +++--- kernel/bpf/cpumask.c | 2 +- kernel/bpf/helpers.c | 20 ++++++++++---------- kernel/bpf/map_iter.c | 2 +- kernel/bpf/verifier.c | 2 +- kernel/sched/ext.c | 8 ++++---- mm/bpf_memcontrol.c | 10 +++++----- net/core/filter.c | 10 +++++----- net/core/xdp.c | 2 +- net/netfilter/nf_conntrack_bpf.c | 8 ++++---- net/netfilter/nf_flow_table_bpf.c | 2 +- net/netfilter/nf_nat_bpf.c | 2 +- net/sched/bpf_qdisc.c | 12 ++++++------ tools/testing/selftests/bpf/progs/cpumask_failure.c | 2 +- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 20 ++++++++++---------- 19 files changed, 63 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index abd5eaa4892e..e4e51a1d0de2 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -356,14 +356,13 @@ __bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__s __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_fs_kfunc_set_ids) -BTF_ID_FLAGS(func, bpf_get_task_exe_file, - KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_get_task_exe_file, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_path_d_path) +BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 388734132f01..6a35623ebdf0 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -162,7 +162,7 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *di __bpf_kfunc_end_defs(); BTF_KFUNCS_START(fsverity_set_ids) -BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_get_fsverity_digest) BTF_KFUNCS_END(fsverity_set_ids) static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e7d72dfbcd4..9efb2ddf331c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -753,7 +753,7 @@ enum bpf_type_flag { MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), /* PTR was passed from the kernel in a trusted context, and may be - * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. + * passed to kfuncs or BPF helper functions. * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. * PTR_UNTRUSTED refers to a kptr that was read directly from a map * without invoking bpf_kptr_xchg(). 
What we really need to know is diff --git a/include/linux/btf.h b/include/linux/btf.h index f06976ffb63f..691f09784933 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -34,7 +34,7 @@ * * And the following kfunc: * - * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) + * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE) * * All invocations to the kfunc must pass the unmodified, unwalked task: * @@ -66,7 +66,6 @@ * return 0; * } */ -#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 456ac989269d..2274319a95e6 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -890,9 +890,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_ARENA_RET | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 9876c5fe6c2a..b8c805b4b06a 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -477,7 +477,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index db72b96f9c8c..2c15f77c74db 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4427,7 +4427,7 @@ BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS -BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_send_signal_task) #endif #ifdef CONFIG_KEYS BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) @@ -4467,14 +4467,14 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) #ifdef CONFIG_CGROUPS -BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) 
#endif -BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) @@ -4510,8 +4510,8 @@ BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE) #endif #ifdef CONFIG_DMA_SHARED_BUFFER BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) @@ -4536,10 +4536,10 @@ BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl) +BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 9575314f40a6..261a03ea73d3 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -214,7 +214,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_map_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_map_sum_elem_count) BTF_KFUNCS_END(bpf_map_iter_kfunc_ids) static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 359a962d69a1..c9da70dd3e72 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12619,7 +12619,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, /* Enforce strict type matching for calls to kfuncs that are acquiring * or releasing a reference, or are no-cast aliases. We do _not_ - * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default, + * enforce strict matching for kfuncs by default, * as we want to enable BPF programs to pass types that are bitwise * equivalent without forcing them to explicitly cast with something * like bpf_cast_to_kern_ctx(). 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 94164f2dec6d..fd5423428dde 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7229,9 +7229,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_exit_bstr) +BTF_ID_FLAGS(func, scx_bpf_error_bstr) +BTF_ID_FLAGS(func, scx_bpf_dump_bstr) BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) @@ -7250,7 +7250,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif BTF_ID_FLAGS(func, scx_bpf_now) -BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_events) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index e8fa7f5855f9..716df49d7647 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -166,11 +166,11 @@ BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events) +BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events) +BTF_ID_FLAGS(func, bpf_mem_cgroup_usage) +BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state) +BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_memcontrol_kfuncs) diff --git a/net/core/filter.c b/net/core/filter.c index 616e0520a0bb..d43df98e1ded 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12438,11 +12438,11 @@ int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta) BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) @@ -12455,11 +12455,11 @@ BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) -BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) -BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { @@ -12554,7 +12554,7 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) 
__bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_destroy) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/net/core/xdp.c b/net/core/xdp.c index 9100e160113a..fee6d080ee85 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -964,7 +964,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_KFUNCS_END(xdp_metadata_kfunc_ids) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 4a136fc3a9c0..a630139bd0c3 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -516,10 +516,10 @@ BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE) BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_timeout) +BTF_ID_FLAGS(func, bpf_ct_change_timeout) +BTF_ID_FLAGS(func, bpf_ct_set_status) +BTF_ID_FLAGS(func, bpf_ct_change_status) BTF_KFUNCS_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c index 4a5f5195f2d2..cbd5b97a6329 100644 --- a/net/netfilter/nf_flow_table_bpf.c +++ b/net/netfilter/nf_flow_table_bpf.c @@ -105,7 +105,7 @@ __diag_pop() __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_ft_kfunc_set) -BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_RET_NULL) BTF_KFUNCS_END(nf_ft_kfunc_set) static const struct btf_kfunc_id_set nf_flow_kfunc_set = { diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 481be15609b1..f9dd85ccea01 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -55,7 +55,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_nat_kfunc_set) -BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_nat_info) BTF_KFUNCS_END(nf_nat_kfunc_set) static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index adcb618a2bfc..b9771788b9b3 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -271,14 +271,14 @@ __bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff __bpf_kfunc_end_defs(); BTF_KFUNCS_START(qdisc_kfunc_ids) -BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_skb_get_hash) BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) 
-BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue) +BTF_ID_FLAGS(func, bpf_qdisc_bstats_update) BTF_KFUNCS_END(qdisc_kfunc_ids) BTF_SET_START(qdisc_common_kfunc_set) diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index 8a2fd596c8a3..61c32e91e8c3 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -110,7 +110,7 @@ SEC("tp_btf/task_newtask") __failure __msg("NULL pointer passed to trusted arg0") int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags) { - /* NULL passed to KF_TRUSTED_ARGS kfunc. */ + /* NULL passed to kfunc. */ bpf_cpumask_empty(NULL); return 0; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 90c4b1a51de6..1c41d03bd5a1 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -693,9 +693,9 @@ BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test) BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test_nostruct, KF_RET_NULL | KF_RCU_PROTECTED) @@ -1158,7 +1158,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) @@ -1172,12 +1172,12 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) +BTF_ID_FLAGS(func, 
bpf_kfunc_multi_st_ops_test_1) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) -- cgit v1.2.3 From 817593af7b9ba56c23d9dd1858232c5493a14f55 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 12:02:27 -0800 Subject: bpf: syscall: Introduce memcg enter/exit helpers Introduce bpf_map_memcg_enter() and bpf_map_memcg_exit() helpers to reduce code duplication in memcg context management. bpf_map_memcg_enter() gets the memcg from the map, sets it as active, and returns both the previous and the now active memcg. bpf_map_memcg_exit() restores the previous active memcg and releases the reference obtained during enter. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102200230.25168-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 15 +++++++++++++++ kernel/bpf/syscall.c | 54 +++++++++++++++++++++++++--------------------------- 2 files changed, 41 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9efb2ddf331c..4e9667ed6630 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2608,6 +2608,10 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG +void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg); +void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *memcg); void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, @@ -2632,6 +2636,17 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, kvcalloc(_n, _size, _flags) #define bpf_map_alloc_percpu(_map, _size, _align, _flags) \ __alloc_percpu_gfp(_size, _align, _flags) +static inline void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg) +{ + *new_memcg = NULL; + *old_memcg = NULL; +} + +static inline void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *memcg) +{ +} #endif static inline int diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a4d38272d8bc..c77ab2e32659 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -505,17 +505,29 @@ static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) return root_mem_cgroup; } +void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg) +{ + *new_memcg = bpf_map_get_memcg(map); + *old_memcg = set_active_memcg(*new_memcg); +} + +void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *new_memcg) +{ + set_active_memcg(old_memcg); + mem_cgroup_put(new_memcg); +} + void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -526,11 +538,9 @@ void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags struct mem_cgroup *memcg, *old_memcg; void *ptr; - 
memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -540,11 +550,9 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -555,11 +563,9 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -570,11 +576,9 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -612,12 +616,9 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long i, j; struct page *pg; int ret = 0; -#ifdef CONFIG_MEMCG struct mem_cgroup *memcg, *old_memcg; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); -#endif + bpf_map_memcg_enter(map, &old_memcg, &memcg); for (i = 0; i < nr_pages; i++) { pg = __bpf_alloc_page(nid); @@ -631,10 +632,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, break; } -#ifdef CONFIG_MEMCG - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); -#endif + bpf_map_memcg_exit(old_memcg, memcg); return ret; } -- cgit v1.2.3 From a069190b590e108223cd841a1c2d0bfb92230ecc Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 14:15:12 -0800 Subject: bpf: Replace __opt annotation with __nullable for kfuncs The __opt annotation was originally introduced specifically for buffer/size argument pairs in bpf_dynptr_slice() and bpf_dynptr_slice_rdwr(), allowing the buffer pointer to be NULL while still validating the size as a constant. The __nullable annotation serves the same purpose but is more general and is already used throughout the BPF subsystem for raw tracepoints, struct_ops, and other kfuncs. This patch unifies the two annotations by replacing __opt with __nullable. The key change is in the verifier's get_kfunc_ptr_arg_type() function, where mem/size pair detection is now performed before the nullable check. This ensures that buffer/size pairs are correctly classified as KF_ARG_PTR_TO_MEM_SIZE even when the buffer is nullable, while adding an !arg_mem_size condition to the nullable check prevents interference with mem/size pair handling. When processing KF_ARG_PTR_TO_MEM_SIZE arguments, the verifier now uses is_kfunc_arg_nullable() instead of the removed is_kfunc_arg_optional() to determine whether to skip size validation for NULL buffers. 
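As an illustration, a minimal hypothetical kfunc combining the two annotations could look like this (sketch only, not part of this patch; the kfunc name is made up):

    __bpf_kfunc int my_fill_buf(void *buf__nullable, u32 buf__szk)
    {
    	/* The verifier accepts NULL for buf__nullable but still requires
    	 * buf__szk to be a constant; handling NULL is the kfunc's job.
    	 */
    	if (!buf__nullable)
    		return 0;
    	memset(buf__nullable, 0, buf__szk);
    	return buf__szk;
    }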
This patch also adds the first documentation for the __nullable annotation, which had been in use but was undocumented until now. No functional changes to verifier behavior: nullable buffer/size pairs continue to work exactly as before. Acked-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102221513.1961781-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 31 ++++++++++++++++++++----------- include/linux/bpf.h | 2 +- kernel/bpf/helpers.c | 28 ++++++++++++++-------------- kernel/bpf/verifier.c | 23 +++++++++-------------- 4 files changed, 44 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 6cb6857bfa6f..3eb59a8f9f34 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -178,25 +178,34 @@ Here, the dynptr will be treated as an uninitialized dynptr. Without this annotation, the verifier will reject the program if the dynptr passed in is not initialized. -2.3.4 __opt Annotation -------------------------- +2.3.4 __nullable Annotation +--------------------------- -This annotation is used to indicate that the buffer associated with an __sz or __szk -argument may be null. If the function is passed a nullptr in place of the buffer, -the verifier will not check that length is appropriate for the buffer. The kfunc is -responsible for checking if this buffer is null before using it. +This annotation is used to indicate that the pointer argument may be NULL. +The verifier will allow passing NULL for such arguments. An example is given below:: - __bpf_kfunc void *bpf_dynptr_slice(..., void *buffer__opt, u32 buffer__szk) + __bpf_kfunc void bpf_task_release(struct task_struct *task__nullable) { ... } -Here, the buffer may be null. If buffer is not null, it at least of size buffer_szk. -Either way, the returned buffer is either NULL, or of size buffer_szk. Without this -annotation, the verifier will reject the program if a null pointer is passed in with -a nonzero size. +Here, the task pointer may be NULL. The kfunc is responsible for checking if +the pointer is NULL before dereferencing it. + +The __nullable annotation can be combined with other annotations. For example, +when used with __sz or __szk annotations for memory and size pairs, the +verifier will skip size validation when a NULL pointer is passed, but will +still process the size argument to extract constant size information when +needed:: + + __bpf_kfunc void *bpf_dynptr_slice(..., void *buffer__nullable, + u32 buffer__szk) + +Here, the buffer may be NULL. If the buffer is not NULL, it must be at least +buffer__szk bytes in size. The kfunc is responsible for checking if the buffer +is NULL before using it.
2.3.5 __str Annotation ---------------------------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e9667ed6630..a63e47d2109c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1434,7 +1434,7 @@ bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, u64 len, u64 flags); void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk); + void *buffer__nullable, u64 buffer__szk); static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2c15f77c74db..9eaa4185e0a7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2709,14 +2709,14 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer__opt: User-provided buffer to copy contents into. May be NULL + * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * - * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * If buffer__nullable is NULL, the call will fail if the buffer was needed. * * If the intention is to write to the data slice, please use * bpf_dynptr_slice_rdwr. @@ -2734,7 +2734,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk) + void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; @@ -2755,8 +2755,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: - if (buffer__opt) - return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); + if (buffer__nullable) + return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__nullable); else return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len); case BPF_DYNPTR_TYPE_XDP: @@ -2765,16 +2765,16 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, if (!IS_ERR_OR_NULL(xdp_ptr)) return xdp_ptr; - if (!buffer__opt) + if (!buffer__nullable) return NULL; - bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); - return buffer__opt; + bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__nullable, len, false); + return buffer__nullable; } case BPF_DYNPTR_TYPE_SKB_META: return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); case BPF_DYNPTR_TYPE_FILE: - err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk); - return err ? NULL : buffer__opt; + err = bpf_file_fetch_bytes(ptr->data, offset, buffer__nullable, buffer__szk); + return err ? NULL : buffer__nullable; default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; @@ -2785,14 +2785,14 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
* @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer__opt: User-provided buffer to copy contents into. May be NULL + * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * - * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * If buffer__nullable is NULL, the call will fail if the buffer was needed. * * The returned pointer is writable and may point to either directly the dynptr * data at the requested offset or to the buffer if unable to obtain a direct @@ -2824,7 +2824,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk) + void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2853,7 +2853,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes. */ - return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); + return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk); } __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c9da70dd3e72..9394b0de2ef0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12086,11 +12086,6 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf, return btf_param_match_suffix(btf, arg, "__szk"); } -static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg) -{ - return btf_param_match_suffix(btf, arg, "__opt"); -} - static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__k"); @@ -12510,6 +12505,11 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) return KF_ARG_PTR_TO_CTX; + if (argno + 1 < nargs && + (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) || + is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))) + arg_mem_size = true; + /* In this function, we verify the kfunc's BTF as per the argument type, * leaving the rest of the verification with respect to the register * type to our caller.
When a set of conditions hold in the BTF type of @@ -12518,7 +12518,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) return KF_ARG_PTR_TO_CTX; - if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) + if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg) && + !arg_mem_size) return KF_ARG_PTR_TO_NULL; if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) @@ -12575,11 +12576,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; - if (argno + 1 < nargs && - (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) || - is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))) - arg_mem_size = true; - /* This is the catch all argument type of register types supported by * check_helper_mem_access. However, we only allow when argument type is * pointer to scalar, or struct composed (recursively) of scalars. When @@ -13249,8 +13245,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if ((register_is_null(reg) || type_may_be_null(reg->type)) && - !is_kfunc_arg_nullable(meta->btf, &args[i]) && - !is_kfunc_arg_optional(meta->btf, &args[i])) { + !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; } @@ -13566,7 +13561,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ struct bpf_reg_state *size_reg = &regs[regno + 1]; const struct btf_param *size_arg = &args[i + 1]; - if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) { + if (!register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); if (ret < 0) { verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); -- cgit v1.2.3 From ea180ffbd27ce5abf2a06329fe1fc8d20dc9becf Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 5 Jan 2026 20:23:13 -0800 Subject: mm: drop mem_cgroup_usage() declaration from memcontrol.h mem_cgroup_usage() is not used outside of memcg-v1 code; the declaration was added by mistake.
Signed-off-by: Roman Gushchin Link: https://lore.kernel.org/r/20260106042313.140256-1-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/memcontrol.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6a5d65487b70..229ac9835adb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -950,7 +950,6 @@ static inline void mod_memcg_page_state(struct page *page, } unsigned long memcg_events(struct mem_cgroup *memcg, int event); -unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); bool memcg_stat_item_valid(int idx); -- cgit v1.2.3 From 2b421662c7887a0649fe409155a1f101562d0fa9 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:16 +0800 Subject: bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags and check them for following APIs: * 'map_lookup_elem()' * 'map_update_elem()' * 'generic_map_lookup_batch()' * 'generic_map_update_batch()' And, get the correct value size for these APIs. Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 23 ++++++++++++++++++++++- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 31 +++++++++++++++++-------------- tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 43 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a63e47d2109c..108bab1bda9d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3915,14 +3915,35 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) } #endif +static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) +{ + return false; +} + static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) { - if (flags & ~allowed_flags) + u32 cpu; + + if ((u32)flags & ~allowed_flags) return -EINVAL; if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; + if (!(flags & BPF_F_CPU) && flags >> 32) + return -EINVAL; + + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) { + if (!bpf_map_supports_cpu_flags(map->map_type)) + return -EINVAL; + if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) + return -EINVAL; + + cpu = flags >> 32; + if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus()) + return -ERANGE; + } + return 0; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 84ced3ed2d21..2a2ade4be60f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6dd2ad2f9e81..e8cfe9d67e64 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -133,12 +133,14 @@ bool bpf_map_write_active(const struct bpf_map *map) return atomic64_read(&map->writecnt) != 0; } -static u32 bpf_map_value_size(const struct bpf_map *map) -{ - 
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) +static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) +{ + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) + return map->value_size; + else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); @@ -1729,7 +1731,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; @@ -1737,7 +1739,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(key)) return PTR_ERR(key); - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); @@ -1804,7 +1806,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); value = kvmemdup_bpfptr(uvalue, value_size); if (IS_ERR(value)) { err = PTR_ERR(value); @@ -2000,11 +2002,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, + BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2059,11 +2062,11 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2185,7 +2188,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, 0); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6b92b0847ec2..b816bc53d2e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ -- cgit v1.2.3 From 8eb76cb03f0f6c2bd7b15cf45dcffcd6bd07a360 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:17 +0800 Subject: bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_array maps Introduce support for the BPF_F_ALL_CPUS flag in percpu_array 
maps to allow updating values for all CPUs with a single value for both update_elem and update_batch APIs. Introduce support for the BPF_F_CPU flag in percpu_array maps to allow: * update value for specified CPU for both update_elem and update_batch APIs. * lookup value for specified CPU for both lookup_elem and lookup_batch APIs. The BPF_F_CPU flag is passed via: * map_flags of lookup_elem and update_elem APIs along with embedded cpu info. * elem_flags of lookup_batch and update_batch APIs along with embedded cpu info. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++-- kernel/bpf/arraymap.c | 29 +++++++++++++++++++++++------ kernel/bpf/syscall.c | 2 +- 3 files changed, 31 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 108bab1bda9d..dc92d001d978 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2848,7 +2848,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *callee); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); -int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, @@ -3917,7 +3917,12 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) { - return false; + switch (map_type) { + case BPF_MAP_TYPE_PERCPU_ARRAY: + return true; + default: + return false; + } } static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1eeb31c5b317..67e9e811de3a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -307,7 +307,7 @@ static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); } -int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; @@ -325,11 +325,18 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(map, value, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value); + goto unlock; + } for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } +unlock: rcu_read_unlock(); return 0; } @@ -398,10 +405,11 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; - int cpu, off = 0; + void *ptr, *val; u32 size; + int cpu; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS)) /* unknown flags */ return -EINVAL; @@ -422,11 +430,20 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, size = 
array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + ptr = per_cpu_ptr(pptr, cpu); + copy_map_value(map, ptr, value); + bpf_obj_free_fields(array->map.record, ptr); + goto unlock; + } for_each_possible_cpu(cpu) { - copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); - bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); - off += size; + ptr = per_cpu_ptr(pptr, cpu); + val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; + copy_map_value(map, ptr, val); + bpf_obj_free_fields(array->map.record, ptr); } +unlock: rcu_read_unlock(); return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e8cfe9d67e64..b5341ab1eb84 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -318,7 +318,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_copy(map, key, value); + err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { -- cgit v1.2.3 From c6936161fd55d0c6ffde01da726e66ff5f2934e8 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:18 +0800 Subject: bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_hash and lru_percpu_hash maps Introduce BPF_F_ALL_CPUS flag support for percpu_hash and lru_percpu_hash maps to allow updating values for all CPUs with a single value for both update_elem and update_batch APIs. Introduce BPF_F_CPU flag support for percpu_hash and lru_percpu_hash maps to allow: * update value for specified CPU for both update_elem and update_batch APIs. * lookup value for specified CPU for both lookup_elem and lookup_batch APIs. The BPF_F_CPU flag is passed via: * map_flags along with embedded cpu info. * elem_flags along with embedded cpu info. 
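As a usage sketch (userspace via libbpf; map_fd, key and value are assumed to be set up elsewhere), the cpu number is embedded in the upper 32 bits of the flags:

    __u32 cpu = 2;
    __u64 flags = BPF_F_CPU | ((__u64)cpu << 32);

    /* With BPF_F_CPU the value buffer holds value_size bytes for one CPU,
     * not one slot per possible CPU.
     */
    bpf_map_update_elem(map_fd, &key, &value, flags);
    bpf_map_lookup_elem_flags(map_fd, &key, &value, flags);

    /* Replicate a single value into every CPU's slot (update only). */
    bpf_map_update_elem(map_fd, &key, &value, BPF_F_ALL_CPUS);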
Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-4-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++- kernel/bpf/hashtab.c | 94 +++++++++++++++++++++++++++++++++++----------------- kernel/bpf/syscall.c | 2 +- 3 files changed, 68 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc92d001d978..f5c259dcc425 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2847,7 +2847,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); @@ -3919,6 +3919,8 @@ static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) { switch (map_type) { case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: return true; default: return false; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c8a9b27f8663..441ff5bc54ac 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -932,7 +932,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, - void *value, bool onallcpus) + void *value, bool onallcpus, u64 map_flags) { void *ptr; @@ -943,19 +943,28 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, bpf_obj_free_fields(htab->map.record, ptr); } else { u32 size = round_up(htab->map.value_size, 8); - int off = 0, cpu; + void *val; + int cpu; + + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + ptr = per_cpu_ptr(pptr, cpu); + copy_map_value(&htab->map, ptr, value); + bpf_obj_free_fields(htab->map.record, ptr); + return; + } for_each_possible_cpu(cpu) { ptr = per_cpu_ptr(pptr, cpu); - copy_map_value_long(&htab->map, ptr, value + off); + val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; + copy_map_value(&htab->map, ptr, val); bpf_obj_free_fields(htab->map.record, ptr); - off += size; } } } static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, - void *value, bool onallcpus) + void *value, bool onallcpus, u64 map_flags) { /* When not setting the initial value on all cpus, zero-fill element * values for other cpus. 
Otherwise, bpf program has no way to ensure @@ -973,7 +982,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); } } else { - pcpu_copy_value(htab, pptr, value, onallcpus); + pcpu_copy_value(htab, pptr, value, onallcpus, map_flags); } } @@ -985,7 +994,7 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, - struct htab_elem *old_elem) + struct htab_elem *old_elem, u64 map_flags) { u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); @@ -1043,7 +1052,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, pptr = *(void __percpu **)ptr; } - pcpu_init_value(htab, pptr, value, onallcpus); + pcpu_init_value(htab, pptr, value, onallcpus, map_flags); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); @@ -1147,7 +1156,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, } l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, - l_old); + l_old, map_flags); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -1249,6 +1258,15 @@ err_lock_bucket: return ret; } +static int htab_map_check_update_flags(bool onallcpus, u64 map_flags) +{ + if (unlikely(!onallcpus && map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(onallcpus && ((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS))) + return -EINVAL; + return 0; +} + static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, bool percpu, bool onallcpus) @@ -1262,9 +1280,9 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) - /* unknown flags */ - return -EINVAL; + ret = htab_map_check_update_flags(onallcpus, map_flags); + if (unlikely(ret)) + return ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); @@ -1289,7 +1307,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, /* Update value in-place */ if (percpu) { pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + value, onallcpus, map_flags); } else { void **inner_map_pptr = htab_elem_value(l_old, key_size); @@ -1298,7 +1316,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, percpu, onallcpus, NULL); + hash, percpu, onallcpus, NULL, map_flags); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; @@ -1324,9 +1342,9 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) - /* unknown flags */ - return -EINVAL; + ret = htab_map_check_update_flags(onallcpus, map_flags); + if (unlikely(ret)) + return ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); @@ -1363,10 +1381,10 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* per-cpu hash map can update value in-place */ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + value, onallcpus, map_flags); } else { pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), - value, onallcpus); + value, onallcpus, map_flags); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; } @@ -1678,9 +1696,9 @@ __htab_map_lookup_and_delete_batch(struct bpf_map 
*map, void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; + u64 elem_map_flags, map_flags, allowed_flags; u32 bucket_cnt, total, key_size, value_size; struct htab_elem *node_to_free = NULL; - u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; unsigned long flags = 0; @@ -1690,9 +1708,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, int ret = 0; elem_map_flags = attr->batch.elem_flags; - if ((elem_map_flags & ~BPF_F_LOCK) || - ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) - return -EINVAL; + allowed_flags = BPF_F_LOCK; + if (!do_delete && is_percpu) + allowed_flags |= BPF_F_CPU; + ret = bpf_map_check_op_flags(map, elem_map_flags, allowed_flags); + if (ret) + return ret; map_flags = attr->batch.flags; if (map_flags) @@ -1715,7 +1736,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, key_size = htab->map.key_size; value_size = htab->map.value_size; size = round_up(value_size, 8); - if (is_percpu) + if (is_percpu && !(elem_map_flags & BPF_F_CPU)) value_size = size * num_possible_cpus(); total = 0; /* while experimenting with hash tables with sizes ranging from 10 to @@ -1798,10 +1819,17 @@ again_nocopy: void __percpu *pptr; pptr = htab_elem_get_ptr(l, map->key_size); - for_each_possible_cpu(cpu) { - copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu)); - check_and_init_map_value(&htab->map, dst_val + off); - off += size; + if (elem_map_flags & BPF_F_CPU) { + cpu = elem_map_flags >> 32; + copy_map_value(&htab->map, dst_val, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, dst_val); + } else { + for_each_possible_cpu(cpu) { + copy_map_value_long(&htab->map, dst_val + off, + per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, dst_val + off); + off += size; + } } } else { value = htab_elem_value(l, key_size); @@ -2357,7 +2385,7 @@ static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *k return NULL; } -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct htab_elem *l; void __percpu *pptr; @@ -2374,16 +2402,22 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; + ret = 0; /* We do not mark LRU map element here in order to not mess up * eviction heuristics when user space does a map walk. 
*/ pptr = htab_elem_get_ptr(l, map->key_size); + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(map, value, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value); + goto out; + } for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } - ret = 0; out: rcu_read_unlock(); return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b5341ab1eb84..5e3b5d828856 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -316,7 +316,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_copy(map, key, value); + err = bpf_percpu_hash_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { -- cgit v1.2.3 From 47c79f05aa0d289f760caf5ad6521fd8dbae37a1 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:20 +0800 Subject: bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_cgroup_storage maps Introduce BPF_F_ALL_CPUS flag support for percpu_cgroup_storage maps to allow updating values for all CPUs with a single value for update_elem API. Introduce BPF_F_CPU flag support for percpu_cgroup_storage maps to allow: * update value for specified CPU for update_elem API. * lookup value for specified CPU for lookup_elem API. The BPF_F_CPU flag is passed via map_flags along with embedded cpu info. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-6-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 4 ++-- include/linux/bpf.h | 1 + kernel/bpf/local_storage.c | 23 ++++++++++++++++++----- kernel/bpf/syscall.c | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d1eb5c7729cb..2f535331f926 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -172,7 +172,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); -int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags); @@ -470,7 +470,7 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, - void *value) { + void *value, u64 flags) { return 0; } static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f5c259dcc425..5936f8e2996f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3921,6 +3921,7 @@ static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: return true; default: return false; diff --git 
a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 2ab4b60ffe61..1ccbf28b2ad9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -180,7 +180,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key, } int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, - void *value) + void *value, u64 map_flags) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; @@ -198,11 +198,17 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(_map, value, per_cpu_ptr(storage->percpu_buf, cpu)); + goto unlock; + } size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { copy_map_value_long(_map, value + off, per_cpu_ptr(storage->percpu_buf, cpu)); off += size; } +unlock: rcu_read_unlock(); return 0; } @@ -212,10 +218,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; - int cpu, off = 0; + void *val; u32 size; + int cpu; - if (map_flags != BPF_ANY && map_flags != BPF_EXIST) + if ((u32)map_flags & ~(BPF_ANY | BPF_EXIST | BPF_F_CPU | BPF_F_ALL_CPUS)) return -EINVAL; rcu_read_lock(); @@ -231,11 +238,17 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), value); + goto unlock; + } size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { - copy_map_value_long(_map, per_cpu_ptr(storage->percpu_buf, cpu), value + off); - off += size; + val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; + copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), val); } +unlock: rcu_read_unlock(); return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5e3b5d828856..ecc0929ce462 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -320,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_copy(map, key, value); + err = bpf_percpu_cgroup_storage_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_extract(map, key, value, false); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { -- cgit v1.2.3 From 8c3070e159ba00424f0389ead694cacd85af260e Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:58 +0800 Subject: btf: Optimize type lookup with binary search Improve btf_find_by_name_kind() performance by adding binary search support for sorted types. Falls back to linear search for compatibility. 
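Callers are unchanged. A lookup such as the following (illustrative) takes the binary-search path when the BTF carries a nonzero named_start_id and degrades to the old linear scan otherwise:

    const struct btf_type *t;
    s32 id;

    id = btf_find_by_name_kind(btf, "tcp_congestion_ops", BTF_KIND_STRUCT);
    if (id < 0)
    	return id; /* -ENOENT if no type with that name and kind exists */
    t = btf_type_by_id(btf, id);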
Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260109130003.3313716-7-dolinux.peng@gmail.com --- include/linux/btf.h | 1 + kernel/bpf/btf.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 82 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 691f09784933..78dc79810c7d 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -219,6 +219,7 @@ bool btf_is_module(const struct btf *btf); bool btf_is_vmlinux(const struct btf *btf); struct module *btf_try_get_module(const struct btf *btf); u32 btf_nr_types(const struct btf *btf); +u32 btf_named_start_id(const struct btf *btf, bool own); struct btf *btf_base_btf(const struct btf *btf); bool btf_type_is_i32(const struct btf_type *t); bool btf_type_is_i64(const struct btf_type *t); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2c6076fc29b9..be8aa31e9e94 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -259,6 +259,7 @@ struct btf { void *nohdr_data; struct btf_header hdr; u32 nr_types; /* includes VOID for base BTF */ + u32 named_start_id; u32 types_size; u32 data_size; refcount_t refcnt; @@ -494,6 +495,11 @@ static bool btf_type_is_modifier(const struct btf_type *t) return false; } +static int btf_start_id(const struct btf *btf) +{ + return btf->start_id + (btf->base_btf ? 0 : 1); +} + bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; @@ -544,21 +550,84 @@ u32 btf_nr_types(const struct btf *btf) return total; } +/* + * btf_named_start_id - Get the named starting ID for the BTF + * @btf: Pointer to the target BTF object + * @own: Flag indicating whether to query only the current BTF (true = current BTF only, + * false = recursively traverse the base BTF chain) + * + * Return value rules: + * 1. For a sorted btf, return its named_start_id + * 2. Else for a split BTF, return its start_id + * 3. 
Else for a base BTF, return 1 + */ +u32 btf_named_start_id(const struct btf *btf, bool own) +{ + const struct btf *base_btf = btf; + + while (!own && base_btf->base_btf) + base_btf = base_btf->base_btf; + + return base_btf->named_start_id ?: (base_btf->start_id ?: 1); +} + +static s32 btf_find_by_name_kind_bsearch(const struct btf *btf, const char *name) +{ + const struct btf_type *t; + const char *tname; + s32 l, r, m; + + l = btf_named_start_id(btf, true); + r = btf_nr_types(btf) - 1; + while (l <= r) { + m = l + (r - l) / 2; + t = btf_type_by_id(btf, m); + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) >= 0) { + if (l == r) + return r; + r = m; + } else { + l = m + 1; + } + } + + return btf_nr_types(btf); +} + s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) { + const struct btf *base_btf = btf_base_btf(btf); const struct btf_type *t; const char *tname; - u32 i, total; + s32 id, total; - total = btf_nr_types(btf); - for (i = 1; i < total; i++) { - t = btf_type_by_id(btf, i); - if (BTF_INFO_KIND(t->info) != kind) - continue; + if (base_btf) { + id = btf_find_by_name_kind(base_btf, name, kind); + if (id > 0) + return id; + } - tname = btf_name_by_offset(btf, t->name_off); - if (!strcmp(tname, name)) - return i; + total = btf_nr_types(btf); + if (btf->named_start_id > 0 && name[0]) { + id = btf_find_by_name_kind_bsearch(btf, name); + for (; id < total; id++) { + t = btf_type_by_id(btf, id); + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) != 0) + return -ENOENT; + if (BTF_INFO_KIND(t->info) == kind) + return id; + } + } else { + for (id = btf_start_id(btf); id < total; id++) { + t = btf_type_by_id(btf, id); + if (BTF_INFO_KIND(t->info) != kind) + continue; + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) == 0) + return id; + } } return -ENOENT; @@ -5791,6 +5860,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat goto errout; } env->btf = btf; + btf->named_start_id = 0; data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { @@ -6210,6 +6280,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name btf->data = data; btf->data_size = data_size; btf->kernel_btf = true; + btf->named_start_id = 0; snprintf(btf->name, sizeof(btf->name), "%s", name); err = btf_parse_hdr(env); @@ -6327,6 +6398,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, btf->start_id = base_btf->nr_types; btf->start_str_off = base_btf->hdr.str_len; btf->kernel_btf = true; + btf->named_start_id = 0; snprintf(btf->name, sizeof(btf->name), "%s", module_name); btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN); -- cgit v1.2.3 From 276f3b6daf6024ae2742afd161e7418a5584a660 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Jan 2026 13:11:56 +0100 Subject: arm64/ftrace,bpf: Fix partial regs after bpf_prog_run Mahe reported an issue with the bpf_override_return helper not working when executed from a kprobe.multi BPF program on arm64. The problem is that on arm64 we use an alternate storage for the pt_regs object that is passed to bpf_prog_run, and if any register is changed (which is the case for bpf_override_return) the change is not propagated back to the actual pt_regs object. Fix this by introducing the ftrace_partial_regs_update() function and calling it to propagate the values of the changed registers (ip and return value).
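The resulting call pattern in kprobe_multi_link_prog_run() is roughly the following (sketch; on architectures whose ftrace_regs already embeds a full pt_regs the new helper is a no-op):

    struct pt_regs *regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr());

    err = bpf_prog_run(link->link.prog, regs);
    /* bpf_override_return() may have rewritten the copy; propagate the
     * changed ip/return value back into the real ftrace_regs.
     */
    ftrace_partial_regs_update(fregs, regs);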
Reported-by: Mahe Tardy Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Acked-by: Will Deacon Link: https://lore.kernel.org/bpf/20260112121157.854473-1-jolsa@kernel.org --- include/linux/ftrace_regs.h | 25 +++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 1 + 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h index 15627ceea9bc..386fa48c4a95 100644 --- a/include/linux/ftrace_regs.h +++ b/include/linux/ftrace_regs.h @@ -33,6 +33,31 @@ struct ftrace_regs; #define ftrace_regs_get_frame_pointer(fregs) \ frame_pointer(&arch_ftrace_regs(fregs)->regs) +static __always_inline void +ftrace_partial_regs_update(struct ftrace_regs *fregs, struct pt_regs *regs) { } + +#else + +/* + * ftrace_partial_regs_update - update the original ftrace_regs from regs + * @fregs: The ftrace_regs to update from @regs + * @regs: The partial regs from ftrace_partial_regs() that was updated + * + * Some architectures have the partial regs living in the ftrace_regs + * structure, whereas other architectures need to make a different copy + * of the @regs. If a partial @regs is retrieved by ftrace_partial_regs() and + * if the code using @regs updates a field (like the instruction pointer or + * stack pointer) it may need to propagate that change to the original @fregs + * it retrieved the partial @regs from. Use this function to guarantee that + * update happens. + */ +static __always_inline void +ftrace_partial_regs_update(struct ftrace_regs *fregs, struct pt_regs *regs) +{ + ftrace_regs_set_instruction_pointer(fregs, instruction_pointer(regs)); + ftrace_regs_set_return_value(fregs, regs_return_value(regs)); +} + #endif /* HAVE_ARCH_FTRACE_REGS */ /* This can be overridden by the architectures */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6e076485bf70..3a17f79b20c2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2564,6 +2564,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); + ftrace_partial_regs_update(fregs, bpf_kprobe_multi_pt_regs_ptr()); rcu_read_unlock(); out: -- cgit v1.2.3 From f81c07a6e98e3171d0c4c5ab79f5aeff71b42c44 Mon Sep 17 00:00:00 2001 From: Qiliang Yuan Date: Tue, 20 Jan 2026 10:32:34 +0800 Subject: bpf/verifier: Optimize ID mapping reset in states_equal Currently, reset_idmap_scratch() performs a 4.7KB memset() in every states_equal() call. Optimize this by using a counter to track used ID mappings, replacing the O(N) memset() with an O(1) reset and bounding the search loop in check_ids(). 
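The underlying pattern, shown in isolation (generic sketch, not the kernel structures):

    #define SCRATCH_SIZE 64

    struct pair { u32 old, cur; };

    struct scratch {
    	u32 cnt;                       /* number of live entries */
    	struct pair map[SCRATCH_SIZE]; /* only map[0..cnt) is ever read */
    };

    static void reset(struct scratch *s)
    {
    	s->cnt = 0; /* O(1); replaces memset(s->map, 0, sizeof(s->map)) */
    }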
Signed-off-by: Qiliang Yuan Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260120023234.77673-1-realwujing@gmail.com --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 130bcbd66f60..8355b585cd18 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -692,6 +692,7 @@ struct bpf_id_pair { struct bpf_idmap { u32 tmp_id_gen; + u32 cnt; struct bpf_id_pair map[BPF_ID_MAP_SIZE]; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 76e02e13890f..b67d8981b058 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19000,18 +19000,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) if (old_id == 0) /* cur_id == 0 as well */ return true; - for (i = 0; i < BPF_ID_MAP_SIZE; i++) { - if (!map[i].old) { - /* Reached an empty slot; haven't seen this id before */ - map[i].old = old_id; - map[i].cur = cur_id; - return true; - } + for (i = 0; i < idmap->cnt; i++) { if (map[i].old == old_id) return map[i].cur == cur_id; if (map[i].cur == cur_id) return false; } + + /* Reached the end of known mappings; haven't seen this id before */ + if (idmap->cnt < BPF_ID_MAP_SIZE) { + map[idmap->cnt].old = old_id; + map[idmap->cnt].cur = cur_id; + idmap->cnt++; + return true; + } + /* We ran out of idmap slots, which should be impossible */ WARN_ON_ONCE(1); return false; @@ -19520,8 +19523,10 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat static void reset_idmap_scratch(struct bpf_verifier_env *env) { - env->idmap_scratch.tmp_id_gen = env->id_gen; - memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); + struct bpf_idmap *idmap = &env->idmap_scratch; + + idmap->tmp_id_gen = env->id_gen; + idmap->cnt = 0; } static bool states_equal(struct bpf_verifier_env *env, -- cgit v1.2.3 From ea073d1818e228440275cc90047b4ef0fddd6eb5 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:26 -0800 Subject: bpf: Refactor btf_kfunc_id_set_contains btf_kfunc_id_set_contains() is called by fetch_kfunc_meta() in the BPF verifier to get the kfunc flags stored in the .BTF_ids ELF section. If it returns NULL instead of a valid pointer, it's interpreted as an illegal kfunc usage failing the verification. There are two potential reasons for btf_kfunc_id_set_contains() to return NULL: 1. Provided kfunc BTF id is not present in relevant kfunc id sets. 2. The kfunc is not allowed, as determined by the program type specific filter [1]. The filter functions accept a pointer to `struct bpf_prog`, so they might implicitly depend on earlier stages of verification, when bpf_prog members are set. For example, bpf_qdisc_kfunc_filter() in linux/net/sched/bpf_qdisc.c inspects prog->aux->st_ops [2], which is initialized in: check_attach_btf_id() -> check_struct_ops_btf_id() So far this hasn't been an issue, because fetch_kfunc_meta() is the only caller of btf_kfunc_id_set_contains(). However in subsequent patches of this series it is necessary to inspect kfunc flags earlier in BPF verifier, in the add_kfunc_call(). 
To resolve this, refactor btf_kfunc_id_set_contains() into two interface functions: * btf_kfunc_flags() that simply returns pointer to kfunc_flags without applying the filters * btf_kfunc_is_allowed() that both checks for kfunc_flags existence (which is a requirement for a kfunc to be allowed) and applies the prog filters See [3] for the previous version of this patch. [1] https://lore.kernel.org/all/20230519225157.760788-7-aditi.ghag@isovalent.com/ [2] https://lore.kernel.org/all/20250409214606.2000194-4-ameryhung@gmail.com/ [3] https://lore.kernel.org/bpf/20251029190113.3323406-3-ihor.solodrai@linux.dev/ Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-2-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 4 +-- kernel/bpf/btf.c | 70 ++++++++++++++++++++++++++++++++++++++------------- kernel/bpf/verifier.c | 6 ++--- 3 files changed, 58 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 78dc79810c7d..a2f4f383f5b6 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -575,8 +575,8 @@ const char *btf_name_by_offset(const struct btf *btf, u32 offset); const char *btf_str_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); -u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id, - const struct bpf_prog *prog); +u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); +bool btf_kfunc_is_allowed(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 364dd84bfc5a..d10b3404260f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8757,24 +8757,17 @@ end: return ret; } -static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, - enum btf_kfunc_hook hook, - u32 kfunc_btf_id, - const struct bpf_prog *prog) +static u32 *btf_kfunc_id_set_contains(const struct btf *btf, + enum btf_kfunc_hook hook, + u32 kfunc_btf_id) { - struct btf_kfunc_hook_filter *hook_filter; struct btf_id_set8 *set; - u32 *id, i; + u32 *id; if (hook >= BTF_KFUNC_HOOK_MAX) return NULL; if (!btf->kfunc_set_tab) return NULL; - hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; - for (i = 0; i < hook_filter->nr_filters; i++) { - if (hook_filter->filters[i](prog, kfunc_btf_id)) - return NULL; - } set = btf->kfunc_set_tab->sets[hook]; if (!set) return NULL; @@ -8785,6 +8778,28 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, return id + 1; } +static bool __btf_kfunc_is_allowed(const struct btf *btf, + enum btf_kfunc_hook hook, + u32 kfunc_btf_id, + const struct bpf_prog *prog) +{ + struct btf_kfunc_hook_filter *hook_filter; + int i; + + if (hook >= BTF_KFUNC_HOOK_MAX) + return false; + if (!btf->kfunc_set_tab) + return false; + + hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; + for (i = 0; i < hook_filter->nr_filters; i++) { + if (hook_filter->filters[i](prog, kfunc_btf_id)) + return false; + } + + return true; +} + static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) { switch (prog_type) { @@ -8832,6 +8847,26 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) } } +bool btf_kfunc_is_allowed(const struct btf *btf, + u32 
kfunc_btf_id, + const struct bpf_prog *prog) +{ + enum bpf_prog_type prog_type = resolve_prog_type(prog); + enum btf_kfunc_hook hook; + u32 *kfunc_flags; + + kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); + if (kfunc_flags && __btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog)) + return true; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + kfunc_flags = btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); + if (kfunc_flags && __btf_kfunc_is_allowed(btf, hook, kfunc_btf_id, prog)) + return true; + + return false; +} + /* Caution: * Reference to the module (obtained using btf_try_get_module) corresponding to * the struct btf *MUST* be held when calling this function from verifier @@ -8839,26 +8874,27 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) * keeping the reference for the duration of the call provides the necessary * protection for looking up a well-formed btf->kfunc_set_tab. */ -u32 *btf_kfunc_id_set_contains(const struct btf *btf, - u32 kfunc_btf_id, - const struct bpf_prog *prog) +u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); enum btf_kfunc_hook hook; u32 *kfunc_flags; - kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog); + kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); if (kfunc_flags) return kfunc_flags; hook = bpf_prog_type_to_kfunc_hook(prog_type); - return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog); + return btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { - return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog); + if (!__btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog)) + return NULL; + + return btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); } static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b67d8981b058..c344371feb33 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13723,10 +13723,10 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, *kfunc_name = func_name; func_proto = btf_type_by_id(desc_btf, func->type); - kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog); - if (!kfunc_flags) { + if (!btf_kfunc_is_allowed(desc_btf, func_id, env->prog)) return -EACCES; - } + + kfunc_flags = btf_kfunc_flags(desc_btf, func_id, env->prog); memset(meta, 0, sizeof(*meta)); meta->btf = desc_btf; -- cgit v1.2.3 From 64e1360524b9ef5835714669b5876e122a23e6fc Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:28 -0800 Subject: bpf: Verifier support for KF_IMPLICIT_ARGS A kernel function bpf_foo marked with KF_IMPLICIT_ARGS flag is expected to have two associated types in BTF: * `bpf_foo` with a function prototype that omits implicit arguments * `bpf_foo_impl` with a function prototype that matches the kernel declaration of `bpf_foo`, but doesn't have a ksym associated with its name In order to support kfuncs with implicit arguments, the verifier has to know how to resolve a call of `bpf_foo` to the correct BTF function prototype and address. To implement this, in add_kfunc_call() kfunc flags are checked for KF_IMPLICIT_ARGS. 
For such kfuncs a BTF func prototype is adjusted to the one found for the `bpf_foo_impl` function (func_name + "_impl" suffix, by convention) in BTF. This effectively changes the signature of the `bpf_foo` kfunc in the context of verification: from one without implicit args to one with the full argument list. The values of implicit arguments by design are provided by the verifier, and so they can only be of particular types. In this patch the only allowed implicit arg type is a pointer to struct bpf_prog_aux. In order for the verifier to correctly set an implicit bpf_prog_aux arg value at runtime, is_kfunc_arg_prog() is extended to check for the arg type. At the point when the prog arg is determined in check_kfunc_args(), the kfunc with implicit args already has a prototype with the full argument list, so the existing value patch mechanism just works. If a new kfunc with KF_IMPLICIT_ARGS is declared for an existing kfunc that uses a __prog argument (a legacy case), the prototype substitution works in exactly the same way, assuming the kfunc follows the _impl naming convention. The difference is only in how the _impl prototype is added to the BTF, which is not the verifier's concern. See a subsequent resolve_btfids patch for details. The __prog suffix is still supported at this point, but will be removed in a subsequent patch, after current users are moved to KF_IMPLICIT_ARGS. Introduction of KF_IMPLICIT_ARGS revealed an issue with zero-extension tracking, because an explicit rX = 0 in place of the verifier-supplied argument is now absent if the arg is implicit (the BPF prog doesn't pass a dummy NULL anymore). To mitigate this, reset the subreg_def of all caller-saved registers in check_kfunc_call() [1]. [1] https://lore.kernel.org/bpf/b4a760ef828d40dac7ea6074d39452bb0dc82caa.camel@gmail.com/ Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-4-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + kernel/bpf/verifier.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 55 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index a2f4f383f5b6..48108471c5b1 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -78,6 +78,7 @@ #define KF_ARENA_RET (1 << 13) /* kfunc returns an arena pointer */ #define KF_ARENA_ARG1 (1 << 14) /* kfunc takes an arena pointer as its first argument */ #define KF_ARENA_ARG2 (1 << 15) /* kfunc takes an arena pointer as its second argument */ +#define KF_IMPLICIT_ARGS (1 << 16) /* kfunc has implicit arguments supplied by the verifier */ /* * Tag marking a kernel function as a kfunc.
This is meant to minimize the diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87f0e113b356..adc24a2ce5b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3271,6 +3271,34 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) return btf_vmlinux ?: ERR_PTR(-ENOENT); } +#define KF_IMPL_SUFFIX "_impl" + +static const struct btf_type *find_kfunc_impl_proto(struct bpf_verifier_env *env, + struct btf *btf, + const char *func_name) +{ + char *buf = env->tmp_str_buf; + const struct btf_type *func; + s32 impl_id; + int len; + + len = snprintf(buf, TMP_STR_BUF_LEN, "%s%s", func_name, KF_IMPL_SUFFIX); + if (len < 0 || len >= TMP_STR_BUF_LEN) { + verbose(env, "function name %s%s is too long\n", func_name, KF_IMPL_SUFFIX); + return NULL; + } + + impl_id = btf_find_by_name_kind(btf, buf, BTF_KIND_FUNC); + if (impl_id <= 0) { + verbose(env, "cannot find function %s in BTF\n", buf); + return NULL; + } + + func = btf_type_by_id(btf, impl_id); + + return btf_type_by_id(btf, func->type); +} + static int fetch_kfunc_meta(struct bpf_verifier_env *env, s32 func_id, s16 offset, @@ -3308,7 +3336,16 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, } func_name = btf_name_by_offset(btf, func->name_off); - func_proto = btf_type_by_id(btf, func->type); + + /* + * An actual prototype of a kfunc with KF_IMPLICIT_ARGS flag + * can be found through the counterpart _impl kfunc. + */ + if (kfunc_flags && (*kfunc_flags & KF_IMPLICIT_ARGS)) + func_proto = find_kfunc_impl_proto(env, btf, func_name); + else + func_proto = btf_type_by_id(btf, func->type); + if (!func_proto || !btf_type_is_func_proto(func_proto)) { verbose(env, "kernel function btf_id %d does not have a valid func_proto\n", func_id); @@ -12174,9 +12211,11 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__irq_flag"); } +static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg); + static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg) { - return btf_param_match_suffix(btf, arg, "__prog"); + return btf_param_match_suffix(btf, arg, "__prog") || is_kfunc_arg_prog_aux(btf, arg); } static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, @@ -12207,6 +12246,7 @@ enum { KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, KF_ARG_TASK_WORK_ID, + KF_ARG_PROG_AUX_ID }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12218,6 +12258,7 @@ BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) BTF_ID(struct, bpf_task_work) +BTF_ID(struct, bpf_prog_aux) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12298,6 +12339,11 @@ static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf return true; } +static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_PROG_AUX_ID); +} + /* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, const struct btf *btf, @@ -14177,8 +14223,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - for (i = 0; i < CALLER_SAVED_REGS; i++) - mark_reg_not_init(env, regs, caller_saved[i]); + for (i = 0; i < CALLER_SAVED_REGS; i++) { + u32 regno = caller_saved[i]; + + mark_reg_not_init(env, regs, regno); + regs[regno].subreg_def = DEF_NOT_SUBREG; + } /* Check 
return type */ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); -- cgit v1.2.3 From 82f3b142c99cf44c7b1e70b7720169c646b9760f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 22 Jan 2026 03:59:11 -0800 Subject: rqspinlock: Fix TAS fallback lock entry creation The TAS fallback can be invoked directly when queued spin locks are disabled, and through the slow path when paravirt is enabled for queued spin locks. In the latter case, the res_spin_lock macro will attempt the fast path and already hold the entry when entering the slow path. This leads to the creation of extraneous entries that are not released, which may cause false positives for deadlock detection. Fix this by grabbing the held lock entry before every invocation of the TAS fallback, and add a comment noting this. Fixes: c9102a68c070 ("rqspinlock: Add a test-and-set fallback") Reported-by: Amery Hung Signed-off-by: Kumar Kartikeya Dwivedi Tested-by: Amery Hung Link: https://lore.kernel.org/r/20260122115911.3668985-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/asm-generic/rqspinlock.h | 2 +- kernel/bpf/rqspinlock.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h index 0f2dcbbfee2f..5c5cf2f7fc39 100644 --- a/include/asm-generic/rqspinlock.h +++ b/include/asm-generic/rqspinlock.h @@ -191,7 +191,7 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock) #else -#define res_spin_lock(lock) resilient_tas_spin_lock(lock) +#define res_spin_lock(lock) ({ grab_held_lock_entry(lock); resilient_tas_spin_lock(lock); }) #endif /* CONFIG_QUEUED_SPINLOCKS */ diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index f7d0c8d4644e..2fdfa828e3d3 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -265,10 +265,11 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) RES_INIT_TIMEOUT(ts); /* - * The fast path is not invoked for the TAS fallback, so we must grab - * the deadlock detection entry here. + * We are either called directly from res_spin_lock after grabbing the + * deadlock detection entry when queued spinlocks are disabled, or from + * resilient_queued_spin_lock_slowpath after grabbing the deadlock + * detection entry. No need to obtain it here. */ - grab_held_lock_entry(lock); /* * Since the waiting loop's time is dependent on the amount of -- cgit v1.2.3 From 2d419c44658f75e7655794341a95c0687830f3df Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:56 +0800 Subject: bpf: add fsession support The fsession is similar to the kprobe session. It allows attaching a single BPF program to both the entry and the exit of the target functions. Introduce struct bpf_fsession_link, which allows adding the link to both the fentry and fexit progs_hlist of the trampoline.
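A hedged sketch of the dual-list idea: one allocation carries two hlist nodes, so a single fsession program appears on both trampoline prog lists, and the exit-side node is recovered from the entry-side link via container_of. The types and names follow the patch below; the real unlink path also triggers a trampoline update, which is omitted here.

/*
 * link.link.tramp_hlist sits on progs_hlist[BPF_TRAMP_FENTRY];
 * fexit.tramp_hlist sits on progs_hlist[BPF_TRAMP_FEXIT].
 */
struct bpf_fsession_link {
	struct bpf_tracing_link link;	/* entry-side tracing link */
	struct bpf_tramp_link fexit;	/* extra node for the fexit list */
};

static void fsession_unlink_both(struct bpf_tramp_link *entry,
				 struct bpf_trampoline *tr)
{
	struct bpf_fsession_link *fslink =
		container_of(entry, struct bpf_fsession_link, link.link);

	hlist_del_init(&entry->tramp_hlist);
	tr->progs_cnt[BPF_TRAMP_FENTRY]--;
	hlist_del_init(&fslink->fexit.tramp_hlist);
	tr->progs_cnt[BPF_TRAMP_FEXIT]--;
}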
Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 ++++++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 2 + kernel/bpf/syscall.c | 18 +++++++- kernel/bpf/trampoline.c | 53 ++++++++++++++++++---- kernel/bpf/verifier.c | 12 +++-- net/bpf/test_run.c | 1 + net/core/bpf_sk_storage.c | 1 + tools/include/uapi/linux/bpf.h | 1 + .../selftests/bpf/prog_tests/tracing_failure.c | 2 +- 10 files changed, 97 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5936f8e2996f..41228b0add52 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1309,6 +1309,7 @@ enum bpf_tramp_prog_type { BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ + BPF_TRAMP_FSESSION, }; struct bpf_tramp_image { @@ -1875,6 +1876,11 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_fsession_link { + struct bpf_tracing_link link; + struct bpf_tramp_link fexit; +}; + struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; @@ -2169,6 +2175,19 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif +static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a2ade4be60f..44e7dbc278e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d10b3404260f..8959f3bc1e92 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6219,6 +6219,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_TRACE_FSESSION: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; @@ -6820,6 +6821,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, fallthrough; case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to * int LSM hooks, they use MODIFY_RETURN trampolines. 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3c5c03d43f5f..b9184545c3fd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3577,6 +3577,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_TRACE_FSESSION && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; @@ -3626,7 +3627,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - link = kzalloc(sizeof(*link), GFP_USER); + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + struct bpf_fsession_link *fslink; + + fslink = kzalloc(sizeof(*fslink), GFP_USER); + if (fslink) { + bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type); + fslink->fexit.cookie = bpf_cookie; + link = &fslink->link; + } else { + link = NULL; + } + } else { + link = kzalloc(sizeof(*link), GFP_USER); + } if (!link) { err = -ENOMEM; goto out_put_prog; @@ -4350,6 +4365,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 2a125d063e62..edf9da43762d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -109,10 +109,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type ptype = prog->type; - return (ptype == BPF_PROG_TYPE_TRACING && - (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN)) || - (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); + switch (ptype) { + case BPF_PROG_TYPE_TRACING: + if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + return true; + return false; + case BPF_PROG_TYPE_LSM: + return eatype == BPF_LSM_MAC; + default: + return false; + } } void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) @@ -559,6 +566,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + case BPF_TRACE_FSESSION: + return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) /* The function returns void, we cannot modify its @@ -594,8 +603,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { + struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; + struct hlist_head *prog_list; int err = 0; int cnt = 0, i; @@ -621,24 +632,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } + if (kind == BPF_TRAMP_FSESSION) { + prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; + cnt++; + } else { + prog_list = &tr->progs_hlist[kind]; + } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { if (link_exiting->link.prog 
!= link->link.prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); - tr->progs_cnt[kind]++; + hlist_add_head(&link->tramp_hlist, prog_list); + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]++; + fslink = container_of(link, struct bpf_fsession_link, link.link); + hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + tr->progs_cnt[BPF_TRAMP_FEXIT]++; + } else { + tr->progs_cnt[kind]++; + } err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]--; + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + } else { + tr->progs_cnt[kind]--; + } } return err; } @@ -672,6 +702,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; + } else if (kind == BPF_TRAMP_FSESSION) { + struct bpf_fsession_link *fslink = + container_of(link, struct bpf_fsession_link, link.link); + + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c7f5234d5fd2..41bbed6418b5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17848,6 +17848,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -23774,6 +23775,7 @@ patch_map_ops_generic: if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_TRACE_FSESSION || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); @@ -24725,7 +24727,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog->type == BPF_PROG_TYPE_TRACING && prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || - tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance @@ -24740,7 +24743,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * beyond reasonable stack size. Hence extending fentry * is not allowed. 
*/ - bpf_log(log, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit/fsession\n"); return -EINVAL; } } else { @@ -24824,6 +24827,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -24990,6 +24994,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: + case BPF_TRACE_FSESSION: return true; default: return false; @@ -25071,9 +25076,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) tgt_info.tgt_name); return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_TRACE_FSESSION || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { - verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n", + verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", tgt_info.tgt_name); return -EINVAL; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 26cfcfdc45eb..178c4738e63b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -685,6 +685,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 850dd736ccd1..de111818f3a0 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -365,6 +365,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b816bc53d2e1..3ca7d76e05f0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c index 10e231965589..f9f9e1cb87bf 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c @@ -73,7 +73,7 @@ static void test_tracing_deny(void) static void test_fexit_noreturns(void) { test_tracing_fail_prog("fexit_noreturns", - "Attaching fexit/fmod_ret to __noreturn function 'do_exit' is rejected."); + "Attaching fexit/fsession/fmod_ret to __noreturn function 'do_exit' is rejected."); } void test_tracing_failure(void) -- cgit v1.2.3 From 27d89baa6da8e5e546585c53a959176d1302d46e Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:59 +0800 Subject: bpf: support fsession for bpf_session_is_return If fsession exists, we will use the bit (1 << BPF_TRAMP_IS_RETURN_SHIFT) in ((u64 *)ctx)[-1] to store the "is_return" flag. 
The logic of bpf_session_is_return() for fsession is implemented in the verifier by inlining the following code: bool bpf_session_is_return(void *ctx) { return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; } Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-5-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/verifier.c | 13 +++++++++++++ kernel/trace/bpf_trace.c | 39 ++++++++++++++++++++++++++------------- 3 files changed, 41 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 41228b0add52..29eecd79352e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1229,6 +1229,8 @@ enum { #endif }; +#define BPF_TRAMP_IS_RETURN_SHIFT 63 + struct bpf_tramp_links { struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; int nr_links; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0fa73d56cb8b..d04aea235a12 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23011,6 +23011,19 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline the bpf_session_is_return() for fsession: + * bool bpf_session_is_return(void *ctx) + * { + * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT); + insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); + *cnt = 3; } if (env->insn_aux_data[insn_idx].arg_prog) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 13f0a2de33b7..f7baeb8278ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1286,7 +1286,8 @@ static bool is_kprobe_multi(const struct bpf_prog *prog) static inline bool is_kprobe_session(const struct bpf_prog *prog) { - return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; + return prog->type == BPF_PROG_TYPE_KPROBE && + prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } static inline bool is_uprobe_multi(const struct bpf_prog *prog) @@ -1297,7 +1298,14 @@ static inline bool is_uprobe_multi(const struct bpf_prog *prog) static inline bool is_uprobe_session(const struct bpf_prog *prog) { - return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; + return prog->type == BPF_PROG_TYPE_KPROBE && + prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + +static inline bool is_trace_fsession(const struct bpf_prog *prog) +{ + return prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_FSESSION; } static const struct bpf_func_proto * @@ -3341,34 +3349,39 @@ __bpf_kfunc __u64 *bpf_session_cookie(void *ctx) __bpf_kfunc_end_defs(); -BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_session_is_return) BTF_ID_FLAGS(func, bpf_session_cookie) -BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) +BTF_KFUNCS_START(session_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_session_is_return) BTF_ID_FLAGS(func, bpf_session_cookie) +BTF_KFUNCS_END(session_kfunc_set_ids) -static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) +static int bpf_session_filter(const struct bpf_prog *prog, u32 kfunc_id) { - if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) + if
(!btf_id_set8_contains(&session_kfunc_set_ids, kfunc_id)) return 0; - if (!is_kprobe_session(prog) && !is_uprobe_session(prog)) + if (!is_kprobe_session(prog) && !is_uprobe_session(prog) && !is_trace_fsession(prog)) return -EACCES; return 0; } -static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = { +static const struct btf_kfunc_id_set bpf_session_kfunc_set = { .owner = THIS_MODULE, - .set = &kprobe_multi_kfunc_set_ids, - .filter = bpf_kprobe_multi_filter, + .set = &session_kfunc_set_ids, + .filter = bpf_session_filter, }; -static int __init bpf_kprobe_multi_kfuncs_init(void) +static int __init bpf_trace_kfuncs_init(void) { - return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set); + int err = 0; + + err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_session_kfunc_set); + err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_session_kfunc_set); + + return err; } -late_initcall(bpf_kprobe_multi_kfuncs_init); +late_initcall(bpf_trace_kfuncs_init); typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); -- cgit v1.2.3 From eeee4239dbb155a98f8ba737324ac081acde8417 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:00 +0800 Subject: bpf: support fsession for bpf_session_cookie Implement the session cookie for fsession. The session cookies are stored on the stack, with the following layout:

return value  -> 8 bytes
argN          -> 8 bytes
...
arg1          -> 8 bytes
nr_args       -> 8 bytes
ip (optional) -> 8 bytes
cookie2       -> 8 bytes
cookie1       -> 8 bytes

The offset of the cookie for the current bpf program, in 8-byte units, is stored in "(((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF". Therefore, we can get the session cookie with ((u64 *)ctx)[-offset]. Implement and inline bpf_session_cookie() for fsession in the verifier.
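In C terms, the verifier-inlined lookup is equivalent to the following sketch (the packing of the ((u64 *)ctx)[-1] metadata word follows the layout described above; the function name is illustrative, not a kernel symbol).

/* Sketch of what the inlined BPF instruction sequence computes. */
static inline u64 *fsession_cookie_ptr(void *ctx)
{
	/*
	 * Bits 8..15 of the metadata word hold the cookie slot index,
	 * counted in 8-byte units below the ctx pointer.
	 */
	u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF;

	return &((u64 *)ctx)[-(long)off];
}

The six-instruction sequence in the patch below (load, shift, mask, scale by 8, subtract, negate) is this same computation expressed in BPF assembly.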
Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-6-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 15 +++++++++++++++ kernel/bpf/verifier.c | 20 ++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 29eecd79352e..4427c6e98331 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1229,6 +1229,7 @@ enum { #endif }; +#define BPF_TRAMP_COOKIE_INDEX_SHIFT 8 #define BPF_TRAMP_IS_RETURN_SHIFT 63 struct bpf_tramp_links { @@ -1782,6 +1783,7 @@ struct bpf_prog { enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ + call_session_cookie:1, /* Do we call bpf_session_cookie() */ tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */ sleepable:1; /* BPF program is sleepable */ enum bpf_prog_type type; /* Type of BPF program */ @@ -2190,6 +2192,19 @@ static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) return cnt; } +static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->call_session_cookie) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d04aea235a12..c2f2650db9fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14406,6 +14406,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } + if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) + env->prog->call_session_cookie = true; + return 0; } @@ -23024,6 +23027,23 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT); insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); *cnt = 3; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline bpf_session_cookie() for fsession: + * __u64 *bpf_session_cookie(void *ctx) + * { + * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF; + * return &((u64 *)ctx)[-off]; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT); + insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1); + insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0); + *cnt = 6; } if (env->insn_aux_data[insn_idx].arg_prog) { -- cgit v1.2.3 From 752b807028e63f1473b84eb1350e131eca5e5249 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 27 Jan 2026 08:51:10 +0000 Subject: bpf: add new BPF_CGROUP_ITER_CHILDREN control option Currently, the BPF cgroup iterator supports walking descendants in either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order (BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive depth-first search (DFS) of the hierarchy. 
In scenarios where a BPF program may need to inspect only the direct children of a given parent cgroup, a full DFS is unnecessarily expensive. This patch introduces a new BPF cgroup iterator control option, BPF_CGROUP_ITER_CHILDREN. This control option restricts the traversal to the immediate children of a specified parent cgroup, allowing for more targeted and efficient iteration, particularly when exhaustive depth-first search (DFS) traversal is not required. Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/cgroup_iter.c | 26 +++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 8 ++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 44e7dbc278e3..c8d400b7680a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index f04a468cf6a7..fd51fe3d92cc 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -8,12 +8,13 @@ #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ -/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. +/* cgroup_iter provides five modes of traversal to the cgroup hierarchy. * * 1. Walk the descendants of a cgroup in pre-order. * 2. Walk the descendants of a cgroup in post-order. * 3. Walk the ancestors of a cgroup. * 4. Show the given cgroup only. + * 5. Walk the children of a given parent cgroup. * * For walking descendants, cgroup_iter can walk in either pre-order or * post-order. 
For walking ancestors, the iter walks up from a cgroup to @@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) return css_next_descendant_pre(NULL, p->start_css); else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(NULL, p->start_css); + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(NULL, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ return p->start_css; } @@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) return css_next_descendant_post(curr, p->start_css); else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return curr->parent; + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(curr, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY */ return NULL; } @@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, int order = linfo->cgroup.order; struct cgroup *cgrp; - if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && - order != BPF_CGROUP_ITER_DESCENDANTS_POST && - order != BPF_CGROUP_ITER_ANCESTORS_UP && - order != BPF_CGROUP_ITER_SELF_ONLY) + switch (order) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + case BPF_CGROUP_ITER_DESCENDANTS_POST: + case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_SELF_ONLY: + case BPF_CGROUP_ITER_CHILDREN: + break; + default: return -EINVAL; + } if (fd && id) return -EINVAL; @@ -257,6 +267,8 @@ show_order: seq_puts(seq, "order: descendants_post\n"); else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) seq_puts(seq, "order: ancestors_up\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN) + seq_puts(seq, "order: children\n"); else /* BPF_CGROUP_ITER_SELF_ONLY */ seq_puts(seq, "order: self_only\n"); } @@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, case BPF_CGROUP_ITER_DESCENDANTS_PRE: case BPF_CGROUP_ITER_DESCENDANTS_POST: case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_CHILDREN: break; default: return -EINVAL; @@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i case BPF_CGROUP_ITER_DESCENDANTS_POST: kit->pos = css_next_descendant_post(kit->pos, kit->start); break; + case BPF_CGROUP_ITER_CHILDREN: + kit->pos = css_next_child(kit->pos, kit->start); + break; case BPF_CGROUP_ITER_ANCESTORS_UP: kit->pos = kit->pos ? kit->pos->parent : kit->start; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ca7d76e05f0..5e38b4887de6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { -- cgit v1.2.3 From b40cc5adaa80e1471095a62d78233b611d7a558c Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Sat, 24 Jan 2026 19:32:43 +0800 Subject: bpf, sockmap: Fix incorrect copied_seq calculation A socket using sockmap has its own independent receive queue: ingress_msg. 
This queue may contain data from its own protocol stack or from other sockets. The issue is that when reading from ingress_msg, we update tp->copied_seq by default. However, if the data is not from its own protocol stack, tcp->rcv_nxt is not increased. Later, if we convert this socket to a native socket, reading from this socket may fail because copied_seq might be significantly larger than rcv_nxt. This fix also addresses the syzkaller-reported bug referenced in the Closes tag. This patch marks the skmsg objects in ingress_msg. When reading, we update copied_seq only if the data is from its own protocol stack. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... | [sockmap] FD2 native stack Closes: https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983 Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Reviewed-by: Jakub Sitnicki Reviewed-by: John Fastabend Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260124113314.113584-2-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 2 ++ net/core/skmsg.c | 27 ++++++++++++++++++++++++--- net/ipv4/tcp_bpf.c | 5 +++-- 3 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 49847888c287..dfdc158ab88c 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -141,6 +141,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); +int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags, int *copied_from_self); bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 2ac7731e1e0a..d402da5caadd 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -409,22 +409,26 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -/* Receive sk_msg from psock->ingress_msg to @msg. */ -int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, - int len, int flags) +int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags, int *copied_from_self) { struct iov_iter *iter = &msg->msg_iter; int peek = flags & MSG_PEEK; struct sk_msg *msg_rx; int i, copied = 0; + bool from_self; msg_rx = sk_psock_peek_msg(psock); + if (copied_from_self) + *copied_from_self = 0; + while (copied != len) { struct scatterlist *sge; if (unlikely(!msg_rx)) break; + from_self = msg_rx->sk == sk; i = msg_rx->sg.start; do { struct page *page; @@ -443,6 +447,9 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } copied += copy; + if (from_self && copied_from_self) + *copied_from_self += copy; + if (likely(!peek)) { sge->offset += copy; sge->length -= copy; @@ -487,6 +494,13 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, out: return copied; } + +/* Receive sk_msg from psock->ingress_msg to @msg. 
*/ +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags) +{ + return __sk_msg_recvmsg(sk, psock, msg, len, flags, NULL); +} EXPORT_SYMBOL_GPL(sk_msg_recvmsg); bool sk_msg_is_readable(struct sock *sk) @@ -616,6 +630,12 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb if (unlikely(!msg)) return -EAGAIN; skb_set_owner_r(skb, sk); + + /* This is used in tcp_bpf_recvmsg_parser() to determine whether the + * data originates from the socket's own protocol stack. No need to + * refcount sk because msg's lifetime is bound to sk via the ingress_msg. + */ + msg->sk = sk; err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); if (err < 0) kfree(msg); @@ -909,6 +929,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, sk_msg_compute_data_pointers(msg); msg->sk = sk; ret = bpf_prog_run_pin_on_cpu(prog, msg); + msg->sk = NULL; ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index a268e1595b22..5c698fd7fbf8 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -226,6 +226,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, int peek = flags & MSG_PEEK; struct sk_psock *psock; struct tcp_sock *tcp; + int copied_from_self = 0; int copied = 0; u32 seq; @@ -262,7 +263,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, } msg_bytes_ready: - copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + copied = __sk_msg_recvmsg(sk, psock, msg, len, flags, &copied_from_self); /* The typical case for EFAULT is the socket was gracefully * shutdown with a FIN pkt. So check here the other case is * some error on copy_page_to_iter which would be unexpected. @@ -277,7 +278,7 @@ msg_bytes_ready: goto out; } } - seq += copied; + seq += copied_from_self; if (!copied) { long timeo; int data; -- cgit v1.2.3 From 929e30f9312514902133c45e51c79088421ab084 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Sat, 24 Jan 2026 19:32:44 +0800 Subject: bpf, sockmap: Fix FIONREAD for sockmap A socket using sockmap has its own independent receive queue: ingress_msg. This queue may contain data from its own protocol stack or from other sockets. Therefore, for sockmap, relying solely on copied_seq and rcv_nxt to calculate FIONREAD is not enough. This patch adds a new msg_tot_len field in the psock structure to record the data length in ingress_msg. Additionally, we implement new ioctl interfaces for TCP and UDP to intercept FIONREAD operations. Note that we intentionally do not include sk_receive_queue data in the FIONREAD result. Data in sk_receive_queue has not yet been processed by the BPF verdict program, and may be redirected to other sockets or dropped. Including it would create semantic ambiguity since this data may never be readable by the user. Unix and VSOCK sockets have similar issues, but fixing them is outside the scope of this patch as it would require more intrusive changes. Previous work by John Fastabend made some efforts towards FIONREAD support: commit e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq") Although the current patch is based on the previous work by John Fastabend, it is acceptable for our Fixes tag to point to the same commit. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... 
| [sockmap] FD2 native stack Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: Jiayuan Chen Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20260124113314.113584-3-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 68 +++++++++++++++++++++++++++++++++++++++++++++++++-- net/core/skmsg.c | 3 +++ net/ipv4/tcp_bpf.c | 20 +++++++++++++++ net/ipv4/udp_bpf.c | 23 ++++++++++++++--- 4 files changed, 108 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index dfdc158ab88c..829b281d6c9c 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -97,6 +97,8 @@ struct sk_psock { struct sk_buff_head ingress_skb; struct list_head ingress_msg; spinlock_t ingress_lock; + /** @msg_tot_len: Total bytes queued in ingress_msg list. */ + u32 msg_tot_len; unsigned long state; struct list_head link; spinlock_t link_lock; @@ -321,6 +323,27 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } +static inline u32 sk_psock_get_msg_len_nolock(struct sk_psock *psock) +{ + /* Used by ioctl to read msg_tot_len only; lock-free for performance */ + return READ_ONCE(psock->msg_tot_len); +} + +static inline void sk_psock_msg_len_add_locked(struct sk_psock *psock, int diff) +{ + /* Use WRITE_ONCE to ensure correct read in sk_psock_get_msg_len_nolock(). + * ingress_lock should be held to prevent concurrent updates to msg_tot_len + */ + WRITE_ONCE(psock->msg_tot_len, psock->msg_tot_len + diff); +} + +static inline void sk_psock_msg_len_add(struct sk_psock *psock, int diff) +{ + spin_lock_bh(&psock->ingress_lock); + sk_psock_msg_len_add_locked(psock, diff); + spin_unlock_bh(&psock->ingress_lock); +} + static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { @@ -329,6 +352,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock, spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); + sk_psock_msg_len_add_locked(psock, msg->sg.size); ret = true; } else { sk_msg_free(psock->sk, msg); @@ -345,18 +369,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); - if (msg) + if (msg) { list_del(&msg->list); + sk_psock_msg_len_add_locked(psock, -msg->sg.size); + } spin_unlock_bh(&psock->ingress_lock); return msg; } +static inline struct sk_msg *sk_psock_peek_msg_locked(struct sk_psock *psock) +{ + return list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); +} + static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); - msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + msg = sk_psock_peek_msg_locked(psock); spin_unlock_bh(&psock->ingress_lock); return msg; } @@ -523,6 +554,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } +/* for tcp only, sk is locked */ +static inline ssize_t sk_psock_msg_inq(struct sock *sk) +{ + struct sk_psock *psock; + ssize_t inq = 0; + + psock = sk_psock_get(sk); + if (likely(psock)) { + inq = sk_psock_get_msg_len_nolock(psock); + sk_psock_put(sk, psock); + } + return inq; +} + +/* for udp only, sk is not locked */ +static inline ssize_t sk_msg_first_len(struct sock *sk) +{ + struct sk_psock *psock; + struct sk_msg *msg; + ssize_t 
inq = 0; + + psock = sk_psock_get(sk); + if (likely(psock)) { + spin_lock_bh(&psock->ingress_lock); + msg = sk_psock_peek_msg_locked(psock); + if (msg) + inq = msg->sg.size; + spin_unlock_bh(&psock->ingress_lock); + sk_psock_put(sk, psock); + } + return inq; +} + #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d402da5caadd..ddde93dd8bc6 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -458,6 +458,7 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg atomic_sub(copy, &sk->sk_rmem_alloc); } msg_rx->sg.size -= copy; + sk_psock_msg_len_add(psock, -copy); if (!sge->length) { sk_msg_iter_var_next(i); @@ -821,9 +822,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_del(&msg->list); if (!msg->skb) atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); + sk_psock_msg_len_add(psock, -msg->sg.size); sk_msg_free(psock->sk, msg); kfree(msg); } + WARN_ON_ONCE(psock->msg_tot_len); } static void __sk_psock_zap_ingress(struct sk_psock *psock) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 5c698fd7fbf8..ca8a5cb8e569 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,6 +10,7 @@ #include #include +#include void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { @@ -332,6 +333,24 @@ unlock: return copied; } +static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg) +{ + bool slow; + + if (cmd != SIOCINQ) + return tcp_ioctl(sk, cmd, karg); + + /* works similar as tcp_ioctl */ + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + slow = lock_sock_fast(sk); + *karg = sk_psock_msg_inq(sk); + unlock_sock_fast(sk, slow); + + return 0; +} + static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { @@ -610,6 +629,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; + prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 0735d820e413..91233e37cd97 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "udp_impl.h" @@ -111,12 +112,26 @@ enum { static DEFINE_SPINLOCK(udpv6_prot_lock); static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; +static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg) +{ + if (cmd != SIOCINQ) + return udp_ioctl(sk, cmd, karg); + + /* Since we don't hold a lock, sk_receive_queue may contain data. + * BPF might only be processing this data at the moment. We only + * care about the data in the ingress_msg here. 
+ */ + *karg = sk_msg_first_len(sk); + return 0; +} + static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { - *prot = *base; - prot->close = sock_map_close; - prot->recvmsg = udp_bpf_recvmsg; - prot->sock_is_readable = sk_msg_is_readable; + *prot = *base; + prot->close = sock_map_close; + prot->recvmsg = udp_bpf_recvmsg; + prot->sock_is_readable = sk_msg_is_readable; + prot->ioctl = udp_bpf_ioctl; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) -- cgit v1.2.3 From ae23bc81ddf7c17b663c4ed1b21e35527b0a7131 Mon Sep 17 00:00:00 2001 From: Guillaume Gonnet Date: Tue, 27 Jan 2026 17:02:00 +0100 Subject: bpf: Fix tcx/netkit detach permissions when prog fd isn't given This commit fixes a security issue where BPF_PROG_DETACH on tcx or netkit devices could be executed by any user when no program fd was provided, bypassing permission checks. The fix adds a capability check for CAP_NET_ADMIN or CAP_SYS_ADMIN in this case. Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support") Signed-off-by: Guillaume Gonnet Link: https://lore.kernel.org/r/20260127160200.10395-1-ggonnet.linux@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ include/linux/bpf_mprog.h | 10 ++++++++++ kernel/bpf/syscall.c | 7 ++----- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4427c6e98331..9272a237cced 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3362,6 +3362,11 @@ static inline void bpf_prog_report_arena_violation(bool write, unsigned long add } #endif /* CONFIG_BPF_SYSCALL */ +static inline bool bpf_net_capable(void) +{ + return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); +} + static __always_inline int bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { diff --git a/include/linux/bpf_mprog.h b/include/linux/bpf_mprog.h index 929225f7b095..0b9f4caeeb0a 100644 --- a/include/linux/bpf_mprog.h +++ b/include/linux/bpf_mprog.h @@ -340,4 +340,14 @@ static inline bool bpf_mprog_supported(enum bpf_prog_type type) return false; } } + +static inline bool bpf_mprog_detach_empty(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SCHED_CLS: + return bpf_net_capable(); + default: + return false; + } +} #endif /* __BPF_MPROG_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b9184545c3fd..5f59dd47a5b1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1363,11 +1363,6 @@ free_map_tab: return ret; } -static bool bpf_net_capable(void) -{ - return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); -} - #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ static int map_create(union bpf_attr *attr, bpfptr_t uattr) @@ -4579,6 +4574,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); + } else if (!bpf_mprog_detach_empty(ptype)) { + return -EPERM; } } else if (is_cgroup_prog_type(ptype, 0, false)) { if (attr->attach_flags || attr->relative_fd) -- cgit v1.2.3 From 4be42c92220128b3128854a2d6b0f0ad0bcedbdb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:02 +0100 Subject: ftrace,bpf: Remove FTRACE_OPS_FL_JMP ftrace_ops flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment we allow the jmp attach only for ftrace_ops that has FTRACE_OPS_FL_JMP set.
This conflicts with the following changes where we use a single ftrace_ops object for all direct call sites, so all could be attached via just call or jmp. We already limit the jmp attach support with a config option and bit (LSB) set on the trampoline address. It turns out that's actually enough to limit the jmp attach per architecture and only for chosen addresses (with LSB bit set). Each user of register_ftrace_direct or modify_ftrace_direct can set the trampoline bit (LSB) to indicate it has to be attached by jmp. The bpf trampoline generation code uses trampoline flags to generate jmp-attach specific code and ftrace inner code uses the trampoline bit (LSB) to handle return from jmp attachment, so there's no harm in removing the FTRACE_OPS_FL_JMP bit. The fexit/fmodret performance stays the same (did not drop), current code: fentry : 77.904 ± 0.546M/s fexit : 62.430 ± 0.554M/s fmodret : 66.503 ± 0.902M/s with this change: fentry : 80.472 ± 0.061M/s fexit : 63.995 ± 0.127M/s fmodret : 67.362 ± 0.175M/s Fixes: 25e4e3565d45 ("ftrace: Introduce FTRACE_OPS_FL_JMP") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-2-jolsa@kernel.org --- include/linux/ftrace.h | 1 - kernel/bpf/trampoline.c | 32 ++++++++++++++------------------ kernel/trace/ftrace.c | 14 -------------- 3 files changed, 14 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a3a8989e3268..cc869c59c1a6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -359,7 +359,6 @@ enum { FTRACE_OPS_FL_DIRECT = BIT(17), FTRACE_OPS_FL_SUBOP = BIT(18), FTRACE_OPS_FL_GRAPH = BIT(19), - FTRACE_OPS_FL_JMP = BIT(20), }; #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index edf9da43762d..468de930b2ed 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -221,10 +221,15 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, int ret; if (tr->func.ftrace_managed) { + unsigned long addr = (unsigned long) new_addr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + if (lock_direct_mutex) - ret = modify_ftrace_direct(tr->fops, (long)new_addr); + ret = modify_ftrace_direct(tr->fops, addr); else - ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); + ret = modify_ftrace_direct_nolock(tr->fops, addr); } else { ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, new_addr); @@ -247,10 +252,15 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } if (tr->func.ftrace_managed) { + unsigned long addr = (unsigned long) new_addr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); if (ret) return ret; - ret = register_ftrace_direct(tr->fops, (long)new_addr); + ret = register_ftrace_direct(tr->fops, addr); } else { ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } @@ -506,13 +516,6 @@ again: if (err) goto out_free; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP - if (bpf_trampoline_use_jmp(tr->flags)) - tr->fops->flags |= FTRACE_OPS_FL_JMP; - else - tr->fops->flags &= ~FTRACE_OPS_FL_JMP; -#endif - WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ @@ -540,15 +543,8 @@ again: tr->cur_image = im; out: /* If any error happens, restore previous flags */ - if (err) { + if (err)
tr->flags = orig_flags; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP - if (bpf_trampoline_use_jmp(tr->flags)) - tr->fops->flags |= FTRACE_OPS_FL_JMP; - else - tr->fops->flags &= ~FTRACE_OPS_FL_JMP; -#endif - } kfree(tlinks); return err; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ef2d5dca6f70..6040aacc97da 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6046,15 +6046,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) if (ftrace_hash_empty(hash)) return -EINVAL; - /* This is a "raw" address, and this should never happen. */ - if (WARN_ON_ONCE(ftrace_is_jmp(addr))) - return -EINVAL; - mutex_lock(&direct_mutex); - if (ops->flags & FTRACE_OPS_FL_JMP) - addr = ftrace_jmp_set(addr); - /* Make sure requested entries are not already registered.. */ size = 1 << hash->size_bits; for (i = 0; i < size; i++) { @@ -6175,13 +6168,6 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) lockdep_assert_held_once(&direct_mutex); - /* This is a "raw" address, and this should never happen. */ - if (WARN_ON_ONCE(ftrace_is_jmp(addr))) - return -EINVAL; - - if (ops->flags & FTRACE_OPS_FL_JMP) - addr = ftrace_jmp_set(addr); - /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; -- cgit v1.2.3 From 0e860d07c29d70205d5ad48456ac7a133ccfb293 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:04 +0100 Subject: ftrace: Export some of hash related functions We are going to use these functions in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-4-jolsa@kernel.org --- include/linux/ftrace.h | 9 +++++++++ kernel/trace/ftrace.c | 13 ++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index cc869c59c1a6..a77b57a4aba2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -82,6 +82,7 @@ static inline void early_trace_init(void) { } struct module; struct ftrace_hash; +struct ftrace_func_entry; #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ defined(CONFIG_DYNAMIC_FTRACE) @@ -405,6 +406,14 @@ enum ftrace_ops_cmd { typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd); #ifdef CONFIG_DYNAMIC_FTRACE + +#define FTRACE_HASH_DEFAULT_BITS 10 + +struct ftrace_hash *alloc_ftrace_hash(int size_bits); +void free_ftrace_hash(struct ftrace_hash *hash); +struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash, + unsigned long ip, unsigned long direct); + /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { struct ftrace_hash __rcu *notrace_hash; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2a63860c172e..bf0ca563e91f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -68,7 +68,6 @@ }) /* hash bits for specific function selection */ -#define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 #ifdef CONFIG_DYNAMIC_FTRACE @@ -1211,8 +1210,8 @@ static void __add_hash_entry(struct ftrace_hash *hash, hash->count++; } -static struct ftrace_func_entry * -add_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct) +struct ftrace_func_entry * +add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct) { struct ftrace_func_entry *entry; @@ -1230,7 
+1229,7 @@ add_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long static struct ftrace_func_entry * add_hash_entry(struct ftrace_hash *hash, unsigned long ip) { - return add_hash_entry_direct(hash, ip, 0); + return add_ftrace_hash_entry_direct(hash, ip, 0); } static void @@ -1291,7 +1290,7 @@ static void clear_ftrace_mod_list(struct list_head *head) mutex_unlock(&ftrace_lock); } -static void free_ftrace_hash(struct ftrace_hash *hash) +void free_ftrace_hash(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) return; @@ -1331,7 +1330,7 @@ void ftrace_free_filter(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(ftrace_free_filter); -static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; int size; @@ -1405,7 +1404,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - if (add_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL) + if (add_ftrace_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL) goto free_hash; } } -- cgit v1.2.3 From 05dc5e9c1fe156fd9dddc4c2f81e8fc6c7e50eb5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:05 +0100 Subject: ftrace: Add update_ftrace_direct_add function Adding update_ftrace_direct_add function that adds all entries (ip -> addr) provided in hash argument to direct ftrace ops and updates its attachments. The difference to current register_ftrace_direct is - hash argument that allows to register multiple ip -> direct entries at once - we can call update_ftrace_direct_add multiple times on the same ftrace_ops object, because after first registration with register_ftrace_function_nolock, it uses ftrace_update_ops to update the ftrace_ops object This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-5-jolsa@kernel.org --- include/linux/ftrace.h | 7 +++ kernel/trace/ftrace.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a77b57a4aba2..43a6cfaf5aae 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -543,6 +543,8 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr); int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); +int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); + void ftrace_stub_direct_tramp(void); #else @@ -569,6 +571,11 @@ static inline int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned l return -ENODEV; } +static inline int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture.
* It is the way the ftrace direct_ops helper, when called diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bf0ca563e91f..97bd988b0a62 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6278,6 +6278,146 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) return err; } EXPORT_SYMBOL_GPL(modify_ftrace_direct); + +static unsigned long hash_count(struct ftrace_hash *hash) +{ + return hash ? hash->count : 0; +} + +/** + * hash_add - adds two struct ftrace_hash and returns the result + * @a: struct ftrace_hash object + * @b: struct ftrace_hash object + * + * Returns struct ftrace_hash object on success, NULL on error. + */ +static struct ftrace_hash *hash_add(struct ftrace_hash *a, struct ftrace_hash *b) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *add; + int size; + + size = hash_count(a) + hash_count(b); + if (size > 32) + size = 32; + + add = alloc_and_copy_ftrace_hash(fls(size), a); + if (!add) + return NULL; + + size = 1 << b->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &b->buckets[i], hlist) { + if (add_ftrace_hash_entry_direct(add, entry->ip, entry->direct) == NULL) { + free_ftrace_hash(add); + return NULL; + } + } + } + return add; +} + +/** + * update_ftrace_direct_add - Updates @ops by adding direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * + * This is used to add custom direct callers (ip -> addr) to @ops, + * specified in @hash. The @ops will be either registered or updated. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + */ +int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + struct ftrace_hash *old_direct_functions = NULL; + struct ftrace_hash *new_direct_functions; + struct ftrace_hash *old_filter_hash; + struct ftrace_hash *new_filter_hash = NULL; + struct ftrace_func_entry *entry; + int err = -EINVAL; + int size; + bool reg; + + if (!hash_count(hash)) + return -EINVAL; + + mutex_lock(&direct_mutex); + + /* Make sure requested entries are not already registered. */ + size = 1 << hash->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (__ftrace_lookup_ip(direct_functions, entry->ip)) + goto out_unlock; + } + } + + old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + + /* If there's nothing in filter_hash we need to register the ops. 
*/ + reg = hash_count(old_filter_hash) == 0; + if (reg) { + if (ops->func || ops->trampoline) + goto out_unlock; + if (ops->flags & FTRACE_OPS_FL_ENABLED) + goto out_unlock; + } + + err = -ENOMEM; + new_filter_hash = hash_add(old_filter_hash, hash); + if (!new_filter_hash) + goto out_unlock; + + new_direct_functions = hash_add(direct_functions, hash); + if (!new_direct_functions) + goto out_unlock; + + old_direct_functions = direct_functions; + rcu_assign_pointer(direct_functions, new_direct_functions); + + if (reg) { + ops->func = call_direct_funcs; + ops->flags |= MULTI_FLAGS; + ops->trampoline = FTRACE_REGS_ADDR; + ops->local_hash.filter_hash = new_filter_hash; + + err = register_ftrace_function_nolock(ops); + if (err) { + /* restore old filter on error */ + ops->local_hash.filter_hash = old_filter_hash; + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; + } else { + new_filter_hash = old_filter_hash; + } + } else { + err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); + /* + * new_filter_hash is dup-ed, so we need to release it anyway, + * old_filter_hash either stays on error or is already released + */ + } + + if (err) { + /* reset direct_functions and free the new one */ + rcu_assign_pointer(direct_functions, old_direct_functions); + old_direct_functions = new_direct_functions; + } + + out_unlock: + mutex_unlock(&direct_mutex); + + if (old_direct_functions && old_direct_functions != EMPTY_HASH) + call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb); + free_ftrace_hash(new_filter_hash); + + return err; +} + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From 8d2c1233f37149e4d1223d06ca7054a7f88c0a24 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:06 +0100 Subject: ftrace: Add update_ftrace_direct_del function Adding update_ftrace_direct_del function that removes all entries (ip -> addr) provided in hash argument from the direct ftrace ops and updates its attachments. The difference to current unregister_ftrace_direct is - hash argument that allows to unregister multiple ip -> direct entries at once - we can call update_ftrace_direct_del multiple times on the same ftrace_ops object, because we do not need to unregister all entries at once, we can do it gradually with the help of ftrace_update_ops function This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes.
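To make the intended calling convention of the add/del pair concrete, here is a minimal sketch of how a caller could batch two call sites into one hash and hand them to the new helpers. It only uses interfaces exported earlier in this series (alloc_ftrace_hash, add_ftrace_hash_entry_direct, free_ftrace_hash); the surrounding function and its names are illustrative, not taken from any patch:

	static struct ftrace_ops my_direct_ops;	/* illustrative; starts zeroed */

	static int attach_two_sites(unsigned long ip1, unsigned long ip2,
				    unsigned long tramp)
	{
		struct ftrace_hash *hash;
		int err = -ENOMEM;

		hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
		if (!hash)
			return -ENOMEM;

		/* each entry maps a traced ip to its direct trampoline */
		if (!add_ftrace_hash_entry_direct(hash, ip1, tramp) ||
		    !add_ftrace_hash_entry_direct(hash, ip2, tramp))
			goto out;

		/* registers my_direct_ops on first use, updates it afterwards */
		err = update_ftrace_direct_add(&my_direct_ops, hash);
	out:
		/* the helper dups the entries, so the local hash can be freed */
		free_ftrace_hash(hash);
		return err;
	}

Tearing the same entries down is symmetric: build a hash with the same ip -> trampoline pairs and call update_ftrace_direct_del(), which unregisters the ops once its filter hash becomes empty.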
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-6-jolsa@kernel.org --- include/linux/ftrace.h | 6 +++ kernel/trace/ftrace.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 43a6cfaf5aae..cfb957731433 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -544,6 +544,7 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr); int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); +int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash); void ftrace_stub_direct_tramp(void); @@ -576,6 +577,11 @@ static inline int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace return -ENODEV; } +static inline int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 97bd988b0a62..244bb7553d3d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6418,6 +6418,133 @@ int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) return err; } +/** + * hash_sub - subtracts @b from @a and returns the result + * @a: struct ftrace_hash object + * @b: struct ftrace_hash object + * + * Returns struct ftrace_hash object on success, NULL on error. + */ +static struct ftrace_hash *hash_sub(struct ftrace_hash *a, struct ftrace_hash *b) +{ + struct ftrace_func_entry *entry, *del; + struct ftrace_hash *sub; + int size; + + sub = alloc_and_copy_ftrace_hash(a->size_bits, a); + if (!sub) + return NULL; + + size = 1 << b->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &b->buckets[i], hlist) { + del = __ftrace_lookup_ip(sub, entry->ip); + if (WARN_ON_ONCE(!del)) { + free_ftrace_hash(sub); + return NULL; + } + remove_hash_entry(sub, del); + kfree(del); + } + } + return sub; +} + +/** + * update_ftrace_direct_del - Updates @ops by removing its direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * + * This is used to delete custom direct callers (ip -> addr) in + * @ops specified via @hash. The @ops will be either unregistered or + * updated. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + * -EINVAL - The @ops is not registered + */ +int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + struct ftrace_hash *old_direct_functions = NULL; + struct ftrace_hash *new_direct_functions; + struct ftrace_hash *new_filter_hash = NULL; + struct ftrace_hash *old_filter_hash; + struct ftrace_func_entry *entry; + struct ftrace_func_entry *del; + unsigned long size; + int err = -EINVAL; + + if (!hash_count(hash)) + return -EINVAL; + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + if (direct_functions == EMPTY_HASH) + return -EINVAL; + + mutex_lock(&direct_mutex); + + old_filter_hash = ops->func_hash ?
ops->func_hash->filter_hash : NULL; + + if (!hash_count(old_filter_hash)) + goto out_unlock; + + /* Make sure requested entries are already registered. */ + size = 1 << hash->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + del = __ftrace_lookup_ip(direct_functions, entry->ip); + if (!del || del->direct != entry->direct) + goto out_unlock; + } + } + + err = -ENOMEM; + new_filter_hash = hash_sub(old_filter_hash, hash); + if (!new_filter_hash) + goto out_unlock; + + new_direct_functions = hash_sub(direct_functions, hash); + if (!new_direct_functions) + goto out_unlock; + + /* If there's nothing left, we need to unregister the ops. */ + if (ftrace_hash_empty(new_filter_hash)) { + err = unregister_ftrace_function(ops); + if (!err) { + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; + ftrace_free_filter(ops); + ops->func_hash->filter_hash = NULL; + } + } else { + err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); + /* + * new_filter_hash is dup-ed, so we need to release it anyway, + * old_filter_hash either stays on error or is already released + */ + } + + if (err) { + /* free the new_direct_functions */ + old_direct_functions = new_direct_functions; + } else { + rcu_assign_pointer(direct_functions, new_direct_functions); + } + + out_unlock: + mutex_unlock(&direct_mutex); + + if (old_direct_functions && old_direct_functions != EMPTY_HASH) + call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb); + free_ftrace_hash(new_filter_hash); + + return err; +} + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From e93672f770d72f4c33d1dd9fb0633f95948c0cc9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:07 +0100 Subject: ftrace: Add update_ftrace_direct_mod function Adding update_ftrace_direct_mod function that modifies all entries (ip -> direct) provided in hash argument to direct ftrace ops and updates its attachments. The difference to current modify_ftrace_direct is: - hash argument that allows to modify multiple ip -> direct entries at once This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-7-jolsa@kernel.org --- include/linux/ftrace.h | 6 ++++ kernel/trace/ftrace.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index cfb957731433..b8461e541b9d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -545,6 +545,7 @@ int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash); +int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock); void ftrace_stub_direct_tramp(void); @@ -582,6 +583,11 @@ static inline int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace return -ENODEV; } +static inline int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture. 
* It is the way the ftrace direct_ops helper, when called diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 244bb7553d3d..794ab5e22398 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6545,6 +6545,100 @@ int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) return err; } +/** + * update_ftrace_direct_mod - Updates @ops by modifying its direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * @do_direct_lock: If true, lock the direct_mutex + * + * This is used to modify custom direct callers (ip -> addr) in + * @ops specified via @hash. + * + * This can be called from within ftrace ops_func callback with + * direct_mutex already locked, in which case @do_direct_lock + * needs to be false. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + * -EINVAL - The @ops is not registered + */ +int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock) +{ + struct ftrace_func_entry *entry, *tmp; + static struct ftrace_ops tmp_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB, + }; + struct ftrace_hash *orig_hash; + unsigned long size, i; + int err = -EINVAL; + + if (!hash_count(hash)) + return -EINVAL; + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + if (direct_functions == EMPTY_HASH) + return -EINVAL; + + /* + * We can be called from within ops_func callback with direct_mutex + * already taken. + */ + if (do_direct_lock) + mutex_lock(&direct_mutex); + + orig_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + if (!orig_hash) + goto unlock; + + /* Enable the tmp_ops to have the same functions as the direct ops */ + ftrace_ops_init(&tmp_ops); + tmp_ops.func_hash = ops->func_hash; + + err = register_ftrace_function_nolock(&tmp_ops); + if (err) + goto unlock; + + /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, orig_hash, orig_hash, true); + if (err) + goto out; + + /* + * Now the ftrace_ops_list_func() is called to do the direct callers. + * We can safely change the direct functions attached to each entry. + */ + mutex_lock(&ftrace_lock); + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + tmp = __ftrace_lookup_ip(direct_functions, entry->ip); + if (!tmp) + continue; + tmp->direct = entry->direct; + } + } + + mutex_unlock(&ftrace_lock); + +out: + /* Removing the tmp_ops will add the updated direct callers to the functions */ + unregister_ftrace_function(&tmp_ops); + +unlock: + if (do_direct_lock) + mutex_unlock(&direct_mutex); + return err; +} + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From 7d0452497c292153e690652e6df218fead21185f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:08 +0100 Subject: bpf: Add trampoline ip hash table Following changes need to look up a trampoline based on its ip address, so add a hash table for that.
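The ip-keyed lookup itself only arrives with the follow-up changes; under the layout added below it would look roughly like the sketch here. The helper name bpf_trampoline_lookup_ip is hypothetical and not part of this patch:

	/* hypothetical helper; callers would hold trampoline_mutex */
	static struct bpf_trampoline *bpf_trampoline_lookup_ip(unsigned long ip)
	{
		struct hlist_head *head;
		struct bpf_trampoline *tr;

		head = &trampoline_ip_table[hash_64(ip, TRAMPOLINE_HASH_BITS)];
		hlist_for_each_entry(tr, head, hlist_ip) {
			if (tr->ip == ip)
				return tr;
		}
		return NULL;
	}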
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251230145010.103439-8-jolsa@kernel.org --- include/linux/bpf.h | 7 +++++-- kernel/bpf/trampoline.c | 30 +++++++++++++++++++----------- 2 files changed, 24 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9272a237cced..5524f9429e76 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1329,14 +1329,17 @@ struct bpf_tramp_image { }; struct bpf_trampoline { - /* hlist for trampoline_table */ - struct hlist_node hlist; + /* hlist for trampoline_key_table */ + struct hlist_node hlist_key; + /* hlist for trampoline_ip_table */ + struct hlist_node hlist_ip; struct ftrace_ops *fops; /* serializes access to fields of this trampoline */ struct mutex mutex; refcount_t refcnt; u32 flags; u64 key; + unsigned long ip; struct { struct btf_func_model model; void *addr; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 468de930b2ed..1d4f618c2402 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -24,9 +24,10 @@ const struct bpf_prog_ops bpf_extension_prog_ops = { #define TRAMPOLINE_HASH_BITS 10 #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) -static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; +static struct hlist_head trampoline_key_table[TRAMPOLINE_TABLE_SIZE]; +static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE]; -/* serializes access to trampoline_table */ +/* serializes access to trampoline tables */ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -142,15 +143,15 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym) PAGE_SIZE, true, ksym->name); } -static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) { struct bpf_trampoline *tr; struct hlist_head *head; int i; mutex_lock(&trampoline_mutex); - head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; - hlist_for_each_entry(tr, head, hlist) { + head = &trampoline_key_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; + hlist_for_each_entry(tr, head, hlist_key) { if (tr->key == key) { refcount_inc(&tr->refcnt); goto out; @@ -171,8 +172,12 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) #endif tr->key = key; - INIT_HLIST_NODE(&tr->hlist); - hlist_add_head(&tr->hlist, head); + tr->ip = ftrace_location(ip); + INIT_HLIST_NODE(&tr->hlist_key); + INIT_HLIST_NODE(&tr->hlist_ip); + hlist_add_head(&tr->hlist_key, head); + head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)]; + hlist_add_head(&tr->hlist_ip, head); refcount_set(&tr->refcnt, 1); mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) @@ -883,7 +888,7 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) prog->aux->attach_btf_id); bpf_lsm_find_cgroup_shim(prog, &bpf_func); - tr = bpf_trampoline_lookup(key); + tr = bpf_trampoline_lookup(key, 0); if (WARN_ON_ONCE(!tr)) return; @@ -903,7 +908,7 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, { struct bpf_trampoline *tr; - tr = bpf_trampoline_lookup(key); + tr = bpf_trampoline_lookup(key, tgt_info->tgt_addr); if (!tr) return NULL; @@ -939,7 +944,8 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) * fexit progs. The fentry-only trampoline will be freed via * multiple rcu callbacks. 
*/ - hlist_del(&tr->hlist); + hlist_del(&tr->hlist_key); + hlist_del(&tr->hlist_ip); if (tr->fops) { ftrace_free_filter(tr->fops); kfree(tr->fops); @@ -1212,7 +1218,9 @@ static int __init init_trampolines(void) int i; for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) - INIT_HLIST_HEAD(&trampoline_table[i]); + INIT_HLIST_HEAD(&trampoline_key_table[i]); + for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&trampoline_ip_table[i]); return 0; } late_initcall(init_trampolines); -- cgit v1.2.3 From 956747efd82aa60e0ac3e6aef2b9a17adda6f3b1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:09 +0100 Subject: ftrace: Factor ftrace_ops ops_func interface We are going to remove "ftrace_ops->private == bpf_trampoline" setup in following changes. Adding ip argument to ftrace_ops_func_t callback function, so we can use it to look up the trampoline. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-9-jolsa@kernel.org --- include/linux/ftrace.h | 2 +- kernel/bpf/trampoline.c | 3 ++- kernel/trace/ftrace.c | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b8461e541b9d..705db0a6d995 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -403,7 +403,7 @@ enum ftrace_ops_cmd { * Negative on failure. The return value is dependent on the * callback. */ -typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd); +typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, unsigned long ip, enum ftrace_ops_cmd cmd); #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 1d4f618c2402..64556aa0007b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -33,7 +33,8 @@ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); -static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, + enum ftrace_ops_cmd cmd) { struct bpf_trampoline *tr = ops->private; int ret = 0; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 794ab5e22398..ee2d7b8a4372 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2075,7 +2075,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, */ if (!ops->ops_func) return -EBUSY; - ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); + ret = ops->ops_func(ops, rec->ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); if (ret) return ret; } else if (is_ipmodify) { @@ -9061,7 +9061,7 @@ static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) if (!op->ops_func) return -EBUSY; - ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); + ret = op->ops_func(op, ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); if (ret) return ret; } @@ -9108,7 +9108,7 @@ static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) /* The cleanup is optional, ignore any errors */ if (found_op && op->ops_func) - op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); + op->ops_func(op, ip, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); } } mutex_unlock(&direct_mutex); -- cgit v1.2.3 From 0f0c332992b8a5d2ae7b611b94c4e02ef8d54b97 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 30 Jan 2026 09:12:07 
+0100 Subject: bpf: Allow sleepable programs to use tail calls Allowing sleepable programs to use tail calls. Making sure we can't mix sleepable and non-sleepable bpf programs in a tail call map (BPF_MAP_TYPE_PROG_ARRAY) and allowing it to be used in sleepable programs. Sleepable programs can be preempted and sleep, which might bring a new source of race conditions, but both direct and indirect tail calls should not be affected. Direct tail calls work by patching a direct jump to the callee into the bpf caller program, so no problem there. We atomically switch from nop to jump instruction. An indirect tail call reads the callee from the map and then jumps to it. The callee bpf program can't disappear (be released) from the caller, because it is executed under rcu lock (rcu_read_lock_trace). Signed-off-by: Jiri Olsa Acked-by: Leon Hwang Link: https://lore.kernel.org/r/20260130081208.1130204-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 4 +++- kernel/bpf/verifier.c | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5524f9429e76..3b0ceb759075 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -287,6 +287,7 @@ struct bpf_map_owner { enum bpf_prog_type type; bool jited; bool xdp_has_frags; + bool sleepable; u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; const struct btf_type *attach_func_proto; enum bpf_attach_type expected_attach_type; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e0b8a8a5aaa9..5ebece600aeb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2401,6 +2401,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, map->owner->type = prog_type; map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->sleepable = fp->sleepable; map->owner->expected_attach_type = fp->expected_attach_type; map->owner->attach_func_proto = aux->attach_func_proto; for_each_cgroup_storage_type(i) { @@ -2412,7 +2413,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, } else { ret = map->owner->type == prog_type && map->owner->jited == fp->jited && - map->owner->xdp_has_frags == aux->xdp_has_frags; + map->owner->xdp_has_frags == aux->xdp_has_frags && + map->owner->sleepable == fp->sleepable; if (ret && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && map->owner->expected_attach_type != fp->expected_attach_type) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e7ff8394e0da..f185ebc6748d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21386,6 +21386,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_ARENA: case BPF_MAP_TYPE_INSN_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: break; default: verbose(env, -- cgit v1.2.3 From 8798902f2b8bcae6f90229a1a1496b48ddda2972 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:48 +0800 Subject: bpf: Add bpf_jit_supports_fsession() The added fsession does not prevent running on architectures that haven't added fsession support. For example, try to run fsession tests on arm64: test_fsession_basic:PASS:fsession_test__open_and_load 0 nsec test_fsession_basic:PASS:fsession_attach 0 nsec check_result:FAIL:test_run_opts err unexpected error: -14 (errno 14) In order to prevent such errors, add bpf_jit_supports_fsession() to guard those architectures.
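The guard uses the usual weak-symbol pattern for JIT capability probes, condensed here from the diff below: a conservative __weak default in the core that an implementing JIT overrides with a strong definition, which wins at link time:

	/* kernel/bpf/core.c: default for JITs without fsession support */
	bool __weak bpf_jit_supports_fsession(void)
	{
		return false;
	}

	/* arch/x86/net/bpf_jit_comp.c: strong definition overrides the weak one */
	bool bpf_jit_supports_fsession(void)
	{
		return true;
	}

With this in place, the verifier can reject BPF_TRACE_FSESSION attachment with -EOPNOTSUPP at load time instead of faulting at run time, and the selftests key off that error to skip.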
Fixes: 2d419c44658f ("bpf: add fsession support") Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 5 ++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 5 ++++ kernel/bpf/verifier.c | 5 ++++ .../selftests/bpf/prog_tests/fsession_test.c | 32 ++++++++++++++++------ 5 files changed, 40 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5a075e06cf45..070ba80e39d7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -4112,3 +4112,8 @@ bool bpf_jit_supports_timed_may_goto(void) { return true; } + +bool bpf_jit_supports_fsession(void) +{ + return true; +} diff --git a/include/linux/filter.h b/include/linux/filter.h index fd54fed8f95f..4e1cb4f91f49 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1167,6 +1167,7 @@ bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); bool bpf_jit_supports_timed_may_goto(void); +bool bpf_jit_supports_fsession(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); u64 arch_bpf_timed_may_goto(void); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5ebece600aeb..dc906dfdff94 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3144,6 +3144,11 @@ bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) return false; } +bool __weak bpf_jit_supports_fsession(void) +{ + return false; +} + u64 __weak bpf_arch_uaddress_limit(void) { #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 256cc5c1a7df..6b62b6d57175 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24828,6 +24828,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + if (prog->expected_attach_type == BPF_TRACE_FSESSION && + !bpf_jit_supports_fsession()) { + bpf_log(log, "JIT does not support fsession\n"); + return -EOPNOTSUPP; + } if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c index 0c4b428e1cee..a299aeb8cc2e 100644 --- a/tools/testing/selftests/bpf/prog_tests/fsession_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -29,8 +29,16 @@ static void test_fsession_basic(void) struct fsession_test *skel = NULL; int err; - skel = fsession_test__open_and_load(); - if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; err = fsession_test__attach(skel); @@ -47,8 +55,16 @@ static void test_fsession_reattach(void) struct fsession_test *skel = NULL; int err; - skel = fsession_test__open_and_load(); - if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == 
-EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; /* first attach */ @@ -94,6 +110,10 @@ static void test_fsession_cookie(void) bpf_program__set_autoload(skel->progs.test6, false); err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; @@ -111,10 +131,6 @@ cleanup: void test_fsession_test(void) { -#if !defined(__x86_64__) - test__skip(); - return; -#endif if (test__start_subtest("fsession_test")) test_fsession_basic(); if (test__start_subtest("fsession_reattach")) -- cgit v1.2.3 From e3aa56b3ac175bccb8fe60d652a3df2ea6a68a1e Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:49 +0800 Subject: bpf, arm64: Add fsession support Implement fsession support in the arm64 BPF JIT trampoline. Extend the trampoline stack layout to store function metadata and session cookies, and pass the appropriate metadata to fentry and fexit programs. This mirrors the existing x86 behavior and enables session cookies on arm64. Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 71 +++++++++++++++++++++++++++++++++++++------ include/linux/bpf.h | 7 ++++- 2 files changed, 68 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 0c4d44bcfbf4..2dc5037694ba 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2510,6 +2510,12 @@ static bool is_struct_ops_tramp(const struct bpf_tramp_links *fentry_links) fentry_links->links[0]->link.type == BPF_LINK_TYPE_STRUCT_OPS; } +static void store_func_meta(struct jit_ctx *ctx, u64 func_meta, int func_meta_off) +{ + emit_a64_mov_i64(A64_R(10), func_meta, ctx); + emit(A64_STR64I(A64_R(10), A64_SP, func_meta_off), ctx); +} + /* Based on the x86's implementation of arch_prepare_bpf_trampoline(). * * bpf prog and function entry before bpf trampoline hooked: @@ -2533,7 +2539,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, int regs_off; int retval_off; int bargs_off; - int nfuncargs_off; + int func_meta_off; int ip_off; int run_ctx_off; int oargs_off; @@ -2544,6 +2550,9 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, bool save_ret; __le32 **branches = NULL; bool is_struct_ops = is_struct_ops_tramp(fentry); + int cookie_off, cookie_cnt, cookie_bargs_off; + int fsession_cnt = bpf_fsession_cnt(tlinks); + u64 func_meta; /* trampoline stack layout: * [ parent ip ] @@ -2562,10 +2571,14 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, * [ ... ] * SP + bargs_off [ arg reg 1 ] for bpf * - * SP + nfuncargs_off [ arg regs count ] + * SP + func_meta_off [ regs count, etc ] * * SP + ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag * + * [ stack cookie N ] + * [ ... 
] + * SP + cookie_off [ stack cookie 1 ] + * * SP + run_ctx_off [ bpf_tramp_run_ctx ] * * [ stack arg N ] @@ -2582,13 +2595,18 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, /* room for bpf_tramp_run_ctx */ stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8); + cookie_off = stack_size; + /* room for session cookies */ + cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + stack_size += cookie_cnt * 8; + ip_off = stack_size; /* room for IP address argument */ if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; - nfuncargs_off = stack_size; - /* room for args count */ + func_meta_off = stack_size; + /* room for function metadata, such as regs count */ stack_size += 8; bargs_off = stack_size; @@ -2646,9 +2664,9 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, emit(A64_STR64I(A64_R(10), A64_SP, ip_off), ctx); } - /* save arg regs count*/ - emit(A64_MOVZ(1, A64_R(10), nfuncargs, 0), ctx); - emit(A64_STR64I(A64_R(10), A64_SP, nfuncargs_off), ctx); + /* save function metadata */ + func_meta = nfuncargs; + store_func_meta(ctx, func_meta, func_meta_off); /* save args for bpf */ save_args(ctx, bargs_off, oargs_off, m, a, false); @@ -2666,10 +2684,27 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, emit_call((const u64)__bpf_tramp_enter, ctx); } - for (i = 0; i < fentry->nr_links; i++) + if (fsession_cnt) { + /* clear all the session cookies' value */ + emit(A64_MOVZ(1, A64_R(10), 0, 0), ctx); + for (int i = 0; i < cookie_cnt; i++) + emit(A64_STR64I(A64_R(10), A64_SP, cookie_off + 8 * i), ctx); + /* clear the return value to make sure fentry always gets 0 */ + emit(A64_STR64I(A64_R(10), A64_SP, retval_off), ctx); + } + + cookie_bargs_off = (bargs_off - cookie_off) / 8; + for (i = 0; i < fentry->nr_links; i++) { + if (bpf_prog_calls_session_cookie(fentry->links[i])) { + u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); + + store_func_meta(ctx, meta, func_meta_off); + cookie_bargs_off--; + } invoke_bpf_prog(ctx, fentry->links[i], bargs_off, retval_off, run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET); + } if (fmod_ret->nr_links) { branches = kcalloc(fmod_ret->nr_links, sizeof(__le32 *), @@ -2701,9 +2736,22 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, *branches[i] = cpu_to_le32(A64_CBNZ(1, A64_R(10), offset)); } - for (i = 0; i < fexit->nr_links; i++) + /* set the "is_return" flag for fsession */ + func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT); + if (fsession_cnt) + store_func_meta(ctx, func_meta, func_meta_off); + + cookie_bargs_off = (bargs_off - cookie_off) / 8; + for (i = 0; i < fexit->nr_links; i++) { + if (bpf_prog_calls_session_cookie(fexit->links[i])) { + u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); + + store_func_meta(ctx, meta, func_meta_off); + cookie_bargs_off--; + } invoke_bpf_prog(ctx, fexit->links[i], bargs_off, retval_off, run_ctx_off, false); + } if (flags & BPF_TRAMP_F_CALL_ORIG) { im->ip_epilogue = ctx->ro_image + ctx->idx; @@ -2753,6 +2801,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, return ctx->idx; } +bool bpf_jit_supports_fsession(void) +{ + return true; +} + int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3b0ceb759075..cd9b96434904 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2196,13 +2196,18 @@ 
static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) return cnt; } +static inline bool bpf_prog_calls_session_cookie(struct bpf_tramp_link *link) +{ + return link->link.prog->call_session_cookie; +} + static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links) { struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; int cnt = 0; for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { - if (fentries.links[i]->link.prog->call_session_cookie) + if (bpf_prog_calls_session_cookie(fentries.links[i])) cnt++; } -- cgit v1.2.3 From b2a0aa3a87396483b468b7c81be2fddb29171d74 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 3 Feb 2026 08:50:58 -0800 Subject: bpf: Clear singular ids for scalars in is_state_visited() The verifier assigns ids to scalar registers/stack slots when they are linked through a mov or stack spill/fill instruction. These ids are later used to propagate newly found bounds from one register to all registers that share the same id. The verifier also compares the ids of these registers in current state and cached state when making pruning decisions. When an ID becomes singular (i.e., only a single register or stack slot has that ID), it can no longer participate in bounds propagation. During comparisons between current and cached states for pruning decisions, however, such stale IDs can prevent pruning of otherwise equivalent states. Find and clear all singular ids before caching a state in is_state_visited(). struct bpf_idset which is currently unused has been repurposed for this use case. Acked-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260203165102.2302462-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 +++-- kernel/bpf/verifier.c | 68 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8355b585cd18..746025df82c8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -697,8 +697,11 @@ struct bpf_idmap { }; struct bpf_idset { - u32 count; - u32 ids[BPF_ID_MAP_SIZE]; + u32 num_ids; + struct { + u32 id; + u32 cnt; + } entries[BPF_ID_MAP_SIZE]; }; /* see verifier.c:compute_scc_callchain() */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17b499956156..d92e10d4c2cc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19461,6 +19461,72 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * doesn't meant that the states are DONE. The verifier has to compare * the callsites */ + +/* Find id in idset and increment its count, or add new entry */ +static void idset_cnt_inc(struct bpf_idset *idset, u32 id) +{ + u32 i; + + for (i = 0; i < idset->num_ids; i++) { + if (idset->entries[i].id == id) { + idset->entries[i].cnt++; + return; + } + } + /* New id */ + if (idset->num_ids < BPF_ID_MAP_SIZE) { + idset->entries[idset->num_ids].id = id; + idset->entries[idset->num_ids].cnt = 1; + idset->num_ids++; + } +} + +/* Find id in idset and return its count, or 0 if not found */ +static u32 idset_cnt_get(struct bpf_idset *idset, u32 id) +{ + u32 i; + + for (i = 0; i < idset->num_ids; i++) { + if (idset->entries[i].id == id) + return idset->entries[i].cnt; + } + return 0; +} + +/* + * Clear singular scalar ids in a state. + * A register with a non-zero id is called singular if no other register shares + * the same base id. 
Such registers can be treated as independent (id=0). + */ +static void clear_singular_ids(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_idset *idset = &env->idset_scratch; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + + idset->num_ids = 0; + + bpf_for_each_reg_in_vstate(st, func, reg, ({ + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->id) + continue; + idset_cnt_inc(idset, reg->id & ~BPF_ADD_CONST); + })); + + bpf_for_each_reg_in_vstate(st, func, reg, ({ + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->id) + continue; + if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) { + reg->id = 0; + reg->off = 0; + } + })); +} + static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { @@ -20459,6 +20525,8 @@ miss: if (env->bpf_capable) mark_all_scalars_imprecise(env, cur); + clear_singular_ids(env, cur); + /* add new state to the head of linked list */ new = &new_sl->state; err = copy_verifier_state(new, cur); -- cgit v1.2.3 From 9d21199842247ab05c675fb9b6c6ca393a5c0024 Mon Sep 17 00:00:00 2001 From: Tianci Cao Date: Wed, 4 Feb 2026 19:15:02 +0800 Subject: bpf: Add bitwise tracking for BPF_END This patch implements bitwise tracking (tnum analysis) for BPF_END (byte swap) operation. Currently, the BPF verifier does not track value for BPF_END operation, treating the result as completely unknown. This limits the verifier's ability to prove safety of programs that perform endianness conversions, which are common in networking code. For example, the following code pattern for port number validation: int test(struct pt_regs *ctx) { __u64 x = bpf_get_prandom_u32(); x &= 0x3f00; // Range: [0, 0x3f00], var_off: (0x0; 0x3f00) x = bswap16(x); // Should swap to range [0, 0x3f], var_off: (0x0; 0x3f) if (x > 0x3f) goto trap; return 0; trap: return *(u64 *)NULL; // Should be unreachable } Currently generates verifier output: 1: (54) w0 &= 16128 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=16128,var_off=(0x0; 0x3f00)) 2: (d7) r0 = bswap16 r0 ; R0=scalar() 3: (25) if r0 > 0x3f goto pc+2 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=63,var_off=(0x0; 0x3f)) Without this patch, even though the verifier knows `x` has certain bits set, after bswap16, it loses all tracking information and treats port as having a completely unknown value [0, 65535]. According to the BPF instruction set[1], there are 3 kinds of BPF_END: 1. `bswap(16|32|64)`: opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE) - do unconditional swap 2. `le(16|32|64)`: opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE) - on big-endian: do swap - on little-endian: truncation (16/32-bit) or no-op (64-bit) 3. `be(16|32|64)`: opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE) - on little-endian: do swap - on big-endian: truncation (16/32-bit) or no-op (64-bit) Since BPF_END operations are inherently bit-wise permutations, tnum (bitwise tracking) offers the most efficient and precise mechanism for value analysis. By implementing `tnum_bswap16`, `tnum_bswap32`, and `tnum_bswap64`, we can derive exact `var_off` values concisely, directly reflecting the bit-level changes. Here is the overview of changes: 1. In `tnum_bswap(16|32|64)` (kernel/bpf/tnum.c): Call `swab(16|32|64)` function on the value and mask of `var_off`, and do truncation for 16/32-bit cases. 2. In `adjust_scalar_min_max_vals` (kernel/bpf/verifier.c): Call helper function `scalar_byte_swap`. 
- Only do byte swap when * alu64 (unconditional swap) OR * switching between big-endian and little-endian machines. - If need do byte swap: * Firstly call `tnum_bswap(16|32|64)` to update `var_off`. * Then reset the bound since byte swap scrambles the range. - For 16/32-bit cases, truncate dst register to match the swapped size. This enables better verification of networking code that frequently uses byte swaps for protocol processing, reducing false positive rejections. [1] https://www.kernel.org/doc/Documentation/bpf/standardization/instruction-set.rst Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Yazhou Tang Signed-off-by: Yazhou Tang Signed-off-by: Tianci Cao Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260204111503.77871-2-ziye@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 5 +++++ kernel/bpf/tnum.c | 16 ++++++++++++++ kernel/bpf/verifier.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 78 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index c52b862dad45..fa4654ffb621 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -63,6 +63,11 @@ struct tnum tnum_union(struct tnum t1, struct tnum t2); /* Return @a with all but the lowest @size bytes cleared */ struct tnum tnum_cast(struct tnum a, u8 size); +/* Swap the bytes of a tnum */ +struct tnum tnum_bswap16(struct tnum a); +struct tnum tnum_bswap32(struct tnum a); +struct tnum tnum_bswap64(struct tnum a); + /* Returns true if @a is a known constant */ static inline bool tnum_is_const(struct tnum a) { diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index f8e70e9c3998..26fbfbb01700 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -8,6 +8,7 @@ */ #include #include +#include #define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} /* A completely unknown value */ @@ -253,3 +254,18 @@ struct tnum tnum_const_subreg(struct tnum a, u32 value) { return tnum_with_subreg(a, tnum_const(value)); } + +struct tnum tnum_bswap16(struct tnum a) +{ + return TNUM(swab16(a.value & 0xFFFF), swab16(a.mask & 0xFFFF)); +} + +struct tnum tnum_bswap32(struct tnum a) +{ + return TNUM(swab32(a.value & 0xFFFFFFFF), swab32(a.mask & 0xFFFFFFFF)); +} + +struct tnum tnum_bswap64(struct tnum a) +{ + return TNUM(swab64(a.value), swab64(a.mask)); +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 40a8252140fb..92e03a5a50f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15832,6 +15832,48 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *insn) +{ + /* + * Byte swap operation - update var_off using tnum_bswap. + * Three cases: + * 1. bswap(16|32|64): opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE) + * unconditional swap + * 2. to_le(16|32|64): opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE) + * swap on big-endian, truncation or no-op on little-endian + * 3. 
to_be(16|32|64): opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE) + * swap on little-endian, truncation or no-op on big-endian + */ + + bool alu64 = BPF_CLASS(insn->code) == BPF_ALU64; + bool to_le = BPF_SRC(insn->code) == BPF_TO_LE; + bool is_big_endian; +#ifdef CONFIG_CPU_BIG_ENDIAN + is_big_endian = true; +#else + is_big_endian = false; +#endif + /* Apply bswap if alu64 or switch between big-endian and little-endian machines */ + bool need_bswap = alu64 || (to_le == is_big_endian); + + if (need_bswap) { + if (insn->imm == 16) + dst_reg->var_off = tnum_bswap16(dst_reg->var_off); + else if (insn->imm == 32) + dst_reg->var_off = tnum_bswap32(dst_reg->var_off); + else if (insn->imm == 64) + dst_reg->var_off = tnum_bswap64(dst_reg->var_off); + /* + * Byteswap scrambles the range, so we must reset bounds. + * Bounds will be re-derived from the new tnum later. + */ + __mark_reg_unbounded(dst_reg); + } + /* For bswap16/32, truncate dst register to match the swapped size */ + if (insn->imm == 16 || insn->imm == 32) + coerce_reg_to_size(dst_reg, insn->imm / 8); +} + static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, const struct bpf_reg_state *src_reg) { @@ -15858,6 +15900,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_XOR: case BPF_OR: case BPF_MUL: + case BPF_END: return true; /* @@ -16047,12 +16090,23 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, else scalar_min_max_arsh(dst_reg, &src_reg); break; + case BPF_END: + scalar_byte_swap(dst_reg, insn); + break; default: break; } - /* ALU32 ops are zero extended into 64bit register */ - if (alu32) + /* + * ALU32 ops are zero extended into 64bit register. + * + * BPF_END is already handled inside the helper (truncation), + * so skip zext here to avoid unexpected zero extension. + * e.g., le64: opcode=(BPF_END|BPF_ALU|BPF_TO_LE), imm=0x40 + * This is a 64bit byte swap operation with alu32==true, + * but we should not zero extend the result. + */ + if (alu32 && opcode != BPF_END) zext_32_to_64(dst_reg); reg_bounds_sync(dst_reg); return 0; @@ -16232,7 +16286,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - if (opcode == BPF_NEG && + if ((opcode == BPF_NEG || opcode == BPF_END) && regs[insn->dst_reg].type == SCALAR_VALUE) { err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); err = err ?: adjust_scalar_min_max_vals(env, insn, -- cgit v1.2.3 From 7a433e519364c3c19643e5c857f4fbfaebec441c Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 4 Feb 2026 07:17:37 -0800 Subject: bpf: Support negative offsets, BPF_SUB, and alu32 for linked register tracking Previously, the verifier only tracked positive constant deltas between linked registers using BPF_ADD. This limitation meant patterns like: r1 = r0; r1 += -4; if r1 s>= 0 goto l0_%=; // r1 >= 0 implies r0 >= 4 // verifier couldn't propagate bounds back to r0 if r0 != 0 goto l0_%=; r0 /= 0; // Verifier thinks this is reachable l0_%=: Similar limitation exists for 32-bit registers. With this change, the verifier can now track negative deltas in reg->off enabling bound propagation for the above pattern. For alu32, we make sure the destination register has the upper 32 bits as 0s before creating the link. BPF_ADD_CONST is split into BPF_ADD_CONST64 and BPF_ADD_CONST32, the latter is used in case of alu32 and sync_linked_regs uses this to zext the result if known_reg has this flag. 
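As an illustration of the delta bookkeeping described above, here is a minimal userspace C sketch. This is not kernel code: link_with_delta() and struct reg are invented stand-ins for the verifier's logic, showing how a signed 32-bit delta is recorded (or the link refused) for BPF_ADD/BPF_SUB in both the 64-bit and alu32 cases:

    #include <stdbool.h>
    #include <stdint.h>

    #define ADD_CONST64 (1u << 31)
    #define ADD_CONST32 (1u << 30)
    #define ADD_CONST   (ADD_CONST64 | ADD_CONST32)

    struct reg {
        uint32_t id;  /* link id plus ADD_CONST* flag bits */
        int32_t  off; /* constant delta from the linked register */
    };

    /* Record "dst = linked_reg +/- cval" for later bound propagation.
     * Returns false when the delta cannot be tracked safely; the caller
     * would then clear id/off instead. */
    static bool link_with_delta(struct reg *dst, uint64_t dst_umax_before,
                                int64_t cval, bool is_sub, bool alu32)
    {
        int32_t off;

        /* 64-bit op: the delta must fit in the s32 reg->off field */
        if (!alu32 && (cval < INT32_MIN || cval > INT32_MAX))
            return false;
        /* alu32 op: the upper 32 bits of dst must already be zero */
        if (alu32 && dst_umax_before > UINT32_MAX)
            return false;

        off = (int32_t)cval;
        if (is_sub) {
            if (off == INT32_MIN)  /* negating INT32_MIN would overflow */
                return false;
            off = -off;
        }
        if (dst->id & ADD_CONST)   /* cannot accumulate a second delta */
            return false;

        dst->id |= alu32 ? ADD_CONST32 : ADD_CONST64;
        dst->off = off;
        return true;
    }

A BPF_SUB by a constant is thus folded into a negative off, which is what lets the verifier later propagate "r1 >= 0 implies r0 >= 4" back through sync_linked_regs().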
Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260204151741.2678118-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 ++- kernel/bpf/verifier.c | 50 +++++++++++++++++----- .../testing/selftests/bpf/progs/verifier_bounds.c | 2 +- 3 files changed, 45 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 746025df82c8..ef8e45a362d9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -147,8 +147,12 @@ struct bpf_reg_state { * registers. Example: * r1 = r2; both will have r1->id == r2->id == N * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->off == 10 + * r3 = r2; both will have r3->id == r2->id == N + * w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->off == 10 */ -#define BPF_ADD_CONST (1U << 31) +#define BPF_ADD_CONST64 (1U << 31) +#define BPF_ADD_CONST32 (1U << 30) +#define BPF_ADD_CONST (BPF_ADD_CONST64 | BPF_ADD_CONST32) u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned * from a pointer-cast helper, bpf_sk_fullsock() and diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92e03a5a50f5..edf5342b982f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16209,6 +16209,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, verbose(env, "verifier internal error: no src_reg\n"); return -EFAULT; } + /* + * For alu32 linked register tracking, we need to check dst_reg's + * umax_value before the ALU operation. After adjust_scalar_min_max_vals(), + * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX. + */ + u64 dst_umax = dst_reg->umax_value; + err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) return err; @@ -16218,26 +16225,44 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access - * So for 64-bit alu remember constant delta between r2 and r1 and - * update r1 after 'if' condition. + * So remember constant delta between r2 and r1 and update r1 after + * 'if' condition. */ if (env->bpf_capable && - BPF_OP(insn->code) == BPF_ADD && !alu32 && - dst_reg->id && is_reg_const(src_reg, false)) { - u64 val = reg_const_value(src_reg, false); + (BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) && + dst_reg->id && is_reg_const(src_reg, alu32)) { + u64 val = reg_const_value(src_reg, alu32); + s32 off; + + if (!alu32 && ((s64)val < S32_MIN || (s64)val > S32_MAX)) + goto clear_id; + + if (alu32 && (dst_umax > U32_MAX)) + goto clear_id; - if ((dst_reg->id & BPF_ADD_CONST) || - /* prevent overflow in sync_linked_regs() later */ - val > (u32)S32_MAX) { + off = (s32)val; + + if (BPF_OP(insn->code) == BPF_SUB) { + /* Negating S32_MIN would overflow */ + if (off == S32_MIN) + goto clear_id; + off = -off; + } + + if (dst_reg->id & BPF_ADD_CONST) { /* * If the register already went through rX += val * we cannot accumulate another val into rx->off. 
*/ +clear_id: dst_reg->off = 0; dst_reg->id = 0; } else { - dst_reg->id |= BPF_ADD_CONST; - dst_reg->off = val; + if (alu32) + dst_reg->id |= BPF_ADD_CONST32; + else + dst_reg->id |= BPF_ADD_CONST64; + dst_reg->off = off; } } else { /* @@ -17334,7 +17359,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s u32 saved_id = reg->id; fake_reg.type = SCALAR_VALUE; - __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off); + __mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off); /* reg = known_reg; reg += delta */ copy_register_state(reg, known_reg); @@ -17349,6 +17374,9 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); + if (known_reg->id & BPF_ADD_CONST32) + zext_32_to_64(reg); + reg_bounds_sync(reg); } if (e->is_reg) mark_reg_scratched(env, e->regno); diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 411a18437d7e..560531404bce 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1477,7 +1477,7 @@ __naked void sub64_full_overflow(void) SEC("socket") __description("64-bit subtraction, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar()") +__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar(id=1-1)") __retval(0) __naked void sub64_partial_overflow(void) { -- cgit v1.2.3 From 0ccef7079ea8d5f7b896b14be7e400022ff240c6 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:28:59 -0800 Subject: bpf: Select bpf_local_storage_map_bucket based on bpf_local_storage A later bpf_local_storage refactor will acquire all locks before performing any update. To reduce the number of locks that need to be taken in bpf_local_storage_map_update(), determine the bucket based on the local_storage an selem belongs to instead of the selem pointer. Currently, when a new selem needs to be created to replace the old selem in bpf_local_storage_map_update(), the locks of both buckets need to be acquired to prevent racing. This can be simplified if the two selems belong to the same bucket, so that only one bucket needs to be locked. Therefore, instead of hashing the selem pointer, hash the local_storage pointer the selem belongs to. Performance-wise, this is slightly better as an update now requires locking only one bucket. It should not change the level of contention on a bucket, as the pointers to the local storages of selems in a map are just as unique as the pointers to the selems themselves.
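The invariant this buys can be sketched in a few lines of userspace C. This is illustrative only: toy_hash_ptr() is an invented stand-in for the kernel's hash_ptr(), but the point carries over; two selems belonging to the same local_storage always hash to the same bucket, so replacing one with the other needs a single bucket lock:

    #include <stdint.h>
    #include <stdio.h>

    #define BUCKET_LOG 4  /* 16 buckets, arbitrary for this sketch */

    /* stand-in for hash_ptr(): multiplicative hash folded to 'bits' bits */
    static unsigned int toy_hash_ptr(const void *p, unsigned int bits)
    {
        uintptr_t v = (uintptr_t)p * (uintptr_t)0x9E3779B97F4A7C15ull;
        return (unsigned int)(v >> (sizeof(v) * 8 - bits));
    }

    struct local_storage { int dummy; };
    struct selem { struct local_storage *storage; };

    static unsigned int select_bucket(const struct local_storage *ls)
    {
        return toy_hash_ptr(ls, BUCKET_LOG);
    }

    int main(void)
    {
        struct local_storage ls = { 0 };
        struct selem old_selem = { &ls }, new_selem = { &ls };

        /* Both selems of the same local_storage share a bucket, unlike
         * hashing the selem pointers themselves. */
        printf("old bucket %u, new bucket %u\n",
               select_bucket(old_selem.storage),
               select_bucket(new_selem.storage));
        return 0;
    }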
Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-2-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 1 + kernel/bpf/bpf_local_storage.c | 17 +++++++++++------ net/core/bpf_sk_storage.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 66432248cd81..2638487425b8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -179,6 +179,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index e2fe6c32822b..91b28f4e3130 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -19,9 +19,9 @@ static struct bpf_local_storage_map_bucket * select_bucket(struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *selem) + struct bpf_local_storage *local_storage) { - return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; + return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)]; } static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) @@ -349,6 +349,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { + struct bpf_local_storage *local_storage; struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; @@ -357,8 +358,10 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) /* selem has already be unlinked from smap */ return; + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - b = select_bucket(smap, selem); + b = select_bucket(smap, local_storage); raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); @@ -366,11 +369,13 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) } void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { - struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); + struct bpf_local_storage_map_bucket *b; unsigned long flags; + b = select_bucket(smap, local_storage); raw_spin_lock_irqsave(&b->lock, flags); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); @@ -448,7 +453,7 @@ int bpf_local_storage_alloc(void *owner, storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); - bpf_selem_link_map(smap, first_selem); + bpf_selem_link_map(smap, storage, first_selem); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); @@ -576,7 +581,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, alloc_selem = NULL; /* First, link the new selem to the map */ - bpf_selem_link_map(smap, selem); + bpf_selem_link_map(smap, local_storage, selem); /* Second, link (and publish) the new selem to local_storage */ 
bpf_selem_link_storage_nolock(local_storage, selem); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index de111818f3a0..e36273e4fcbd 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -191,7 +191,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } if (new_sk_storage) { - bpf_selem_link_map(smap, copy_selem); + bpf_selem_link_map(smap, new_sk_storage, copy_selem); bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); -- cgit v1.2.3 From fd103ffc57c9a3b8b76bc852ffae5eb630a6ded4 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:01 -0800 Subject: bpf: Convert bpf_selem_link_map to failable To prepare for changing bpf_local_storage_map_bucket::lock to rqspinlock, convert bpf_selem_link_map() to failable. It still always succeeds and returns 0 until the change happens. No functional change. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-4-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 6 +++--- kernel/bpf/bpf_local_storage.c | 8 +++++--- net/core/bpf_sk_storage.c | 9 ++++++++- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 2638487425b8..709506e982a6 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -178,9 +178,9 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem); +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 6fa71502c7d7..2f94ca4a4475 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -365,9 +365,9 @@ static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem) hlist_del_init_rcu(&selem->map_node); } -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem) +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b; unsigned long flags; @@ -376,6 +376,8 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_lock_irqsave(&b->lock, flags); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); + + return 0; } static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index e36273e4fcbd..0b85d8f2c17e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -191,7 +191,14 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } if (new_sk_storage) { - bpf_selem_link_map(smap, new_sk_storage, copy_selem); + ret = bpf_selem_link_map(smap, new_sk_storage, copy_selem); + if (ret) { + bpf_selem_free(copy_selem, true); + 
atomic_sub(smap->elem_size, + &newsk->sk_omem_alloc); + bpf_map_put(map); + goto out; + } bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); -- cgit v1.2.3 From 403e935f915896243ff93f9a2ff44e5bb6619032 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:02 -0800 Subject: bpf: Convert bpf_selem_unlink to failable To prepare changing both bpf_local_storage_map_bucket::lock and bpf_local_storage::lock to rqspinlock, convert bpf_selem_unlink() to failable. It still always succeeds and returns 0 until the change happens. No functional change. Open code bpf_selem_unlink_storage() in the only caller, bpf_selem_unlink(), since unlink_map and unlink_storage must be done together after all the necessary locks are acquired. For bpf_local_storage_map_free(), ignore the return from bpf_selem_unlink() for now. A later patch will allow it to unlink selems even when failing to acquire locks. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-5-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 2 +- kernel/bpf/bpf_cgrp_storage.c | 3 +- kernel/bpf/bpf_inode_storage.c | 4 +-- kernel/bpf/bpf_local_storage.c | 71 +++++++++++++++++++-------------------- kernel/bpf/bpf_task_storage.c | 4 +-- net/core/bpf_sk_storage.c | 4 +-- 6 files changed, 39 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 709506e982a6..f74e0f7656a1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -176,7 +176,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 0687a760974a..8fef24fcac68 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -118,8 +118,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e54cce2b9175..cedc99184dad 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -110,9 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 2f94ca4a4475..38f5c3db2957 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -308,33 +308,6 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor return free_local_storage; } -static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, - bool reuse_now) 
-{ - struct bpf_local_storage *local_storage; - bool free_local_storage = false; - HLIST_HEAD(selem_free_list); - unsigned long flags; - - if (unlikely(!selem_linked_to_storage_lockless(selem))) - /* selem has already been unlinked from sk */ - return; - - local_storage = rcu_dereference_check(selem->local_storage, - bpf_rcu_lock_held()); - - raw_spin_lock_irqsave(&local_storage->lock, flags); - if (likely(selem_linked_to_storage(selem))) - free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, &selem_free_list); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - - bpf_selem_free_list(&selem_free_list, reuse_now); - - if (free_local_storage) - bpf_local_storage_free(local_storage, reuse_now); -} - void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { @@ -386,19 +359,43 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, hlist_add_head_rcu(&selem->map_node, &b->list); } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { - int err; + struct bpf_local_storage *local_storage; + bool free_local_storage = false; + HLIST_HEAD(selem_free_list); + unsigned long flags; + int err = 0; - /* Always unlink from map before unlinking from local_storage - * because selem will be freed after successfully unlinked from - * the local_storage. - */ - err = bpf_selem_unlink_map(selem); - if (err) - return; + if (unlikely(!selem_linked_to_storage_lockless(selem))) + /* selem has already been unlinked from sk */ + return 0; + + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); + + raw_spin_lock_irqsave(&local_storage->lock, flags); + if (likely(selem_linked_to_storage(selem))) { + /* Always unlink from map before unlinking from local_storage + * because selem will be freed after successfully unlinked from + * the local_storage. 
+ */ + err = bpf_selem_unlink_map(selem); + if (err) + goto out; + + free_local_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, &selem_free_list); + } +out: + raw_spin_unlock_irqrestore(&local_storage->lock, flags); - bpf_selem_unlink_storage(selem, reuse_now); + bpf_selem_free_list(&selem_free_list, reuse_now); + + if (free_local_storage) + bpf_local_storage_free(local_storage, reuse_now); + + return err; } void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index a1dc1bf0848a..ab902364ac23 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -167,9 +167,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, if (!nobusy) return -EBUSY; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 0b85d8f2c17e..d7b5c4551997 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,9 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ -- cgit v1.2.3 From 8dabe34b9d5b1135b084268396ed2267107e64c1 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:03 -0800 Subject: bpf: Change local_storage->lock and b->lock to rqspinlock Change bpf_local_storage::lock and bpf_local_storage_map_bucket::lock from raw_spin_lock to rqspinlock. Finally, propagate errors from raw_res_spin_lock_irqsave() to syscall return or BPF helper return. In bpf_local_storage_destroy(), ignore the return from raw_res_spin_lock_irqsave() for now. A later patch will handle errors correctly in bpf_local_storage_destroy() so that it can unlink selems even when failing to acquire locks. For __bpf_local_storage_insert_cache(), instead of handling the error, skip updating the cache. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-6-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 5 +-- kernel/bpf/bpf_local_storage.c | 64 +++++++++++++++++++++++++++------------ 2 files changed, 47 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index f74e0f7656a1..fa50b7afee18 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -15,12 +15,13 @@ #include #include #include +#include #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 struct bpf_local_storage_map_bucket { struct hlist_head list; - raw_spinlock_t lock; + rqspinlock_t lock; }; /* The map is not the primary owner of a bpf_local_storage_elem. @@ -94,7 +95,7 @@ struct bpf_local_storage { * bpf_local_storage_elem.
*/ struct rcu_head rcu; - raw_spinlock_t lock; /* Protect adding/removing from the "list" */ + rqspinlock_t lock; /* Protect adding/removing from the "list" */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 38f5c3db2957..1138e2293b50 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -321,14 +321,18 @@ static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; + int err; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, flags); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; + hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } @@ -344,11 +348,16 @@ int bpf_selem_link_map(struct bpf_local_storage_map *smap, { struct bpf_local_storage_map_bucket *b; unsigned long flags; + int err; b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, flags); + + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; + hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } @@ -365,7 +374,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) bool free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; - int err = 0; + int err; if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ @@ -374,7 +383,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return err; + if (likely(selem_linked_to_storage(selem))) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from @@ -388,7 +400,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) local_storage, selem, &selem_free_list); } out: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, reuse_now); @@ -403,16 +415,20 @@ void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { unsigned long flags; + int err; /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). 
*/ - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return; + if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem)); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); } static int check_flags(const struct bpf_local_storage_data *old_sdata, @@ -457,14 +473,17 @@ int bpf_local_storage_alloc(void *owner, RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); - raw_spin_lock_init(&storage->lock); + raw_res_spin_lock_init(&storage->lock); storage->owner = owner; storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); b = select_bucket(smap, storage); - raw_spin_lock_irqsave(&b->lock, flags); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + goto uncharge; + bpf_selem_link_map_nolock(b, first_selem); owner_storage_ptr = @@ -482,11 +501,11 @@ int bpf_local_storage_alloc(void *owner, prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { bpf_selem_unlink_map_nolock(first_selem); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); err = -EAGAIN; goto uncharge; } - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; @@ -569,7 +588,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (!alloc_selem) return ERR_PTR(-ENOMEM); - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + goto free_selem; /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { @@ -596,7 +617,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, b_flags); + err = raw_res_spin_lock_irqsave(&b->lock, b_flags); + if (err) + goto unlock; alloc_selem = NULL; /* First, link the new selem to the map */ @@ -612,9 +635,10 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, &old_selem_free_list); } - raw_spin_unlock_irqrestore(&b->lock, b_flags); + raw_res_spin_unlock_irqrestore(&b->lock, b_flags); unlock: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); +free_selem: bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); @@ -699,7 +723,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_spin_lock_irqsave(&local_storage->lock, flags); + raw_res_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. 
@@ -714,7 +738,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) free_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, &free_selem_list); } - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&free_selem_list, true); @@ -761,7 +785,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); - raw_spin_lock_init(&smap->buckets[i].lock); + raw_res_spin_lock_init(&smap->buckets[i].lock); } smap->elem_size = offsetof(struct bpf_local_storage_elem, -- cgit v1.2.3 From 3417dffb58331d1e7e4f3e30ec95cc0f8114ff10 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:06 -0800 Subject: bpf: Remove unused percpu counter from bpf_local_storage_map_free Percpu locks have been removed from cgroup and task local storage. Now that all local storage no longer use percpu variables as locks preventing recursion, there is no need to pass them to bpf_local_storage_map_free(). Remove the argument from the function. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-9-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 3 +-- kernel/bpf/bpf_cgrp_storage.c | 2 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 7 +------ kernel/bpf/bpf_task_storage.c | 2 +- net/core/bpf_sk_storage.c | 2 +- 6 files changed, 6 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index fa50b7afee18..fba3354988d3 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -166,8 +166,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu *busy_counter); + struct bpf_local_storage_cache *cache); int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 4d84611d8222..853183eead2c 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -119,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, NULL); + bpf_local_storage_map_free(map, &cgroup_cache); } /* *gfp_flags* is a hidden argument provided by the verifier */ diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index cedc99184dad..470f4b02c79e 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -184,7 +184,7 @@ static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) static void inode_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &inode_cache, NULL); + bpf_local_storage_map_free(map, &inode_cache); } const struct bpf_map_ops inode_storage_map_ops = { diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 1138e2293b50..76e812a40380 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -807,8 +807,7 @@ free_smap: } void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu 
*busy_counter) + struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map_bucket *b; struct bpf_local_storage_elem *selem; @@ -841,11 +840,7 @@ void bpf_local_storage_map_free(struct bpf_map *map, while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { - if (busy_counter) - this_cpu_inc(*busy_counter); bpf_selem_unlink(selem, true); - if (busy_counter) - this_cpu_dec(*busy_counter); cond_resched_rcu(); } rcu_read_unlock(); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index dd858226ada2..4d53aebe6784 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -217,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) static void task_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &task_cache, NULL); + bpf_local_storage_map_free(map, &task_cache); } BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d7b5c4551997..d2164165a994 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -60,7 +60,7 @@ out: static void bpf_sk_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &sk_cache, NULL); + bpf_local_storage_map_free(map, &sk_cache); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) -- cgit v1.2.3 From c8be3da14718f1e732afcb61e8ee5b78e8d93808 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:07 -0800 Subject: bpf: Prepare for bpf_selem_unlink_nofail() The next patch will introduce bpf_selem_unlink_nofail() to handle rqspinlock errors. bpf_selem_unlink_nofail() will allow an selem to be partially unlinked from map or local storage. Save memory allocation method in selem so that later an selem can be correctly freed even when SDATA(selem)->smap is init to NULL. In addition, keep track of memory charge to the owner in local storage so that later bpf_selem_unlink_nofail() can return the correct memory charge to the owner. Updating local_storage->mem_charge is protected by local_storage->lock. Finally, extract miscellaneous tasks performed when unlinking an selem from local_storage into bpf_selem_unlink_storage_nolock_misc(). It will be reused by bpf_selem_unlink_nofail(). This patch also takes the chance to remove local_storage->smap, which is no longer used since commit f484f4a3e058 ("bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage"). Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-10-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 5 +-- kernel/bpf/bpf_local_storage.c | 69 +++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index fba3354988d3..a34ed7fa81d8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -80,7 +80,8 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; - /* 8 bytes hole */ + bool use_kmalloc_nolock; + /* 7 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. 
*/ @@ -89,13 +90,13 @@ struct bpf_local_storage_elem { struct bpf_local_storage { struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; - struct bpf_local_storage_map __rcu *smap; struct hlist_head list; /* List of bpf_local_storage_elem */ void *owner; /* The object that owns the above "list" of * bpf_local_storage_elem. */ struct rcu_head rcu; rqspinlock_t lock; /* Protect adding/removing from the "list" */ + u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 76e812a40380..0e9ae41a9759 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); + selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; if (value) { /* No need to call check_and_init_map_value as memory is zero init */ @@ -214,7 +215,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - if (!smap->use_kmalloc_nolock) { + if (!selem->use_kmalloc_nolock) { /* * No uptr will be unpin even when reuse_now == false since uptr * is only supported in task local storage, where @@ -251,6 +252,30 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) bpf_selem_free(selem, reuse_now); } +static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + bool free_local_storage) +{ + void *owner = local_storage->owner; + u32 uncharge = smap->elem_size; + + if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == + SDATA(selem)) + RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + + uncharge += free_local_storage ? sizeof(*local_storage) : 0; + mem_uncharge(smap, local_storage->owner, uncharge); + local_storage->mem_charge -= uncharge; + + if (free_local_storage) { + local_storage->owner = NULL; + + /* After this RCU_INIT, owner may be freed and cannot be used */ + RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + } +} + /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. @@ -261,56 +286,30 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor { struct bpf_local_storage_map *smap; bool free_local_storage; - void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - owner = local_storage->owner; - - /* All uncharging on the owner must be done first. - * The owner may be freed once the last selem is unlinked - * from local_storage. - */ - mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); - if (free_local_storage) { - mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); - local_storage->owner = NULL; - /* After this RCU_INIT, owner may be freed and cannot be used */ - RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, + free_local_storage); - /* local_storage is not freed now. local_storage->lock is - * still held and raw_spin_unlock_bh(&local_storage->lock) - * will be done by the caller. 
- * - * Although the unlock will be done under - * rcu_read_lock(), it is more intuitive to - * read if the freeing of the storage is done - * after the raw_spin_unlock_bh(&local_storage->lock). - * - * Hence, a "bool free_local_storage" is returned - * to the caller which then calls then frees the storage after - * all the RCU grace periods have expired. - */ - } hlist_del_init_rcu(&selem->snode); - if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == - SDATA(selem)) - RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); hlist_add_head(&selem->free_node, free_selem_list); - if (rcu_access_pointer(local_storage->smap) == smap) - RCU_INIT_POINTER(local_storage->smap, NULL); - return free_local_storage; } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { + struct bpf_local_storage_map *smap; + + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + local_storage->mem_charge += smap->elem_size; + RCU_INIT_POINTER(selem->local_storage, local_storage); hlist_add_head_rcu(&selem->snode, &local_storage->list); } @@ -471,10 +470,10 @@ int bpf_local_storage_alloc(void *owner, goto uncharge; } - RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); raw_res_spin_lock_init(&storage->lock); storage->owner = owner; + storage->mem_charge = sizeof(*storage); storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); -- cgit v1.2.3 From 5d800f87d0a5ea1b156c47a4b9fd128479335153 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:08 -0800 Subject: bpf: Support lockless unlink when freeing map or local storage Introduce bpf_selem_unlink_nofail() to properly handle errors returned from rqspinlock in bpf_local_storage_map_free() and bpf_local_storage_destroy(), where the operation must succeed. The idea of bpf_selem_unlink_nofail() is to allow an selem to be partially unlinked and to use atomic operations on a bit field, selem->state, to determine when and who can free the selem if any unlink under lock fails. An selem is initially fully linked to a map and a local storage. Under normal circumstances, bpf_selem_unlink_nofail() will be able to grab the locks and unlink an selem from the map and the local storage in sequence, just like bpf_selem_unlink(), and then free it after an RCU grace period. However, if any of the lock attempts fails, it will only clear SDATA(selem)->smap or selem->local_storage and set SELEM_MAP_UNLINKED or SELEM_STORAGE_UNLINKED, depending on the caller. Then, after both map_free() and destroy() have seen the selem and the state becomes SELEM_UNLINKED, one of the two racing callers can succeed in cmpxchg-ing the state from SELEM_UNLINKED to SELEM_TOFREE, ensuring no double free or memory leak. To make sure bpf_obj_free_fields() is done only once and while the map is still present, it is called when unlinking an selem from b->list under b->lock. To make sure uncharging memory is done only while the owner is still present in map_free(), block destroy() from returning until there is no pending map_free(). Since smap may not be valid in destroy(), bpf_selem_unlink_nofail() skips bpf_selem_unlink_storage_nolock_misc() when called from destroy(). This is okay as bpf_local_storage_destroy() will return the remaining amount of memory charge, tracked by mem_charge, to the owner to uncharge.
It is also safe to skip clearing local_storage->owner and owner_storage as the owner is being freed and no users or bpf programs should be able to reference the owner and using local_storage. Finally, access of selem, SDATA(selem)->smap and selem->local_storage are racy. Callers will protect these fields with RCU. Acked-by: Alexei Starovoitov Co-developed-by: Martin KaFai Lau Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-11-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 9 ++- kernel/bpf/bpf_local_storage.c | 116 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 118 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a34ed7fa81d8..69a5d8aa765d 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -68,6 +68,11 @@ struct bpf_local_storage_data { u8 data[] __aligned(8); }; +#define SELEM_MAP_UNLINKED (1 << 0) +#define SELEM_STORAGE_UNLINKED (1 << 1) +#define SELEM_UNLINKED (SELEM_MAP_UNLINKED | SELEM_STORAGE_UNLINKED) +#define SELEM_TOFREE (1 << 2) + /* Linked to bpf_local_storage and bpf_local_storage_map */ struct bpf_local_storage_elem { struct hlist_node map_node; /* Linked to bpf_local_storage_map */ @@ -80,8 +85,9 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; + atomic_t state; bool use_kmalloc_nolock; - /* 7 bytes hole */ + /* 3 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. */ @@ -97,6 +103,7 @@ struct bpf_local_storage { struct rcu_head rcu; rqspinlock_t lock; /* Protect adding/removing from the "list" */ u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */ + refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 0e9ae41a9759..294605c78271 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); + atomic_set(&selem->state, 0); selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; if (value) { @@ -194,9 +195,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); - migrate_disable(); - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - migrate_enable(); + if (smap) { + migrate_disable(); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + migrate_enable(); + } kfree_nolock(selem); } @@ -221,7 +224,8 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, * is only supported in task local storage, where * smap->use_kmalloc_nolock == true. 
*/ - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + if (smap) + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } @@ -255,7 +259,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, - bool free_local_storage) + bool free_local_storage, bool pin_owner) { void *owner = local_storage->owner; u32 uncharge = smap->elem_size; @@ -264,6 +268,9 @@ static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem * SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt)) + return; + uncharge += free_local_storage ? sizeof(*local_storage) : 0; mem_uncharge(smap, local_storage->owner, uncharge); local_storage->mem_charge -= uncharge; @@ -274,6 +281,9 @@ static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem * /* After this RCU_INIT, owner may be freed and cannot be used */ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); } + + if (pin_owner) + refcount_dec(&local_storage->owner_refcnt); } /* local_storage->lock must be held and selem->local_storage == local_storage. @@ -293,7 +303,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor &local_storage->list); bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, - free_local_storage); + free_local_storage, false); hlist_del_init_rcu(&selem->snode); @@ -409,6 +419,94 @@ out: return err; } +/* + * Unlink an selem from map and local storage with lockless fallback if callers + * are racing or rqspinlock returns error. It should only be called by + * bpf_local_storage_destroy() or bpf_local_storage_map_free(). + */ +static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map_bucket *b) +{ + bool in_map_free = !!b, free_storage = false; + struct bpf_local_storage *local_storage; + struct bpf_local_storage_map *smap; + unsigned long flags; + int err, unlink = 0; + + local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + + if (smap) { + b = b ? : select_bucket(smap, local_storage); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (!err) { + /* + * Call bpf_obj_free_fields() under b->lock to make sure it is done + * exactly once for an selem. Safe to free special fields immediately + * as no BPF program should be referencing the selem. + */ + if (likely(selem_linked_to_map(selem))) { + hlist_del_init_rcu(&selem->map_node); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + unlink++; + } + raw_res_spin_unlock_irqrestore(&b->lock, flags); + } + /* + * Highly unlikely scenario: resource leak + * + * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing + * and both selem belong to the same bucket, if destroy(selem2) acquired + * b->lock and block for too long, neither map_free(selem1) and + * destroy(selem1) will be able to free the special field associated + * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT. 
+ */ + WARN_ON_ONCE(err && in_map_free); + if (!err || in_map_free) + RCU_INIT_POINTER(SDATA(selem)->smap, NULL); + } + + if (local_storage) { + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (!err) { + if (likely(selem_linked_to_storage(selem))) { + free_storage = hlist_is_singular_node(&selem->snode, + &local_storage->list); + /* + * Okay to skip clearing owner_storage and storage->owner in + * destroy() since the owner is going away. No user or bpf + * programs should be able to reference it. + */ + if (smap && in_map_free) + bpf_selem_unlink_storage_nolock_misc( + selem, smap, local_storage, + free_storage, true); + hlist_del_init_rcu(&selem->snode); + unlink++; + } + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); + } + if (!err || !in_map_free) + RCU_INIT_POINTER(selem->local_storage, NULL); + } + + if (unlink != 2) + atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state); + + /* + * Normally, an selem can be unlinked under local_storage->lock and b->lock, and + * then freed after an RCU grace period. However, if destroy() and map_free() are + * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free + * the selem only after both map_free() and destroy() see the selem. + */ + if (unlink == 2 || + atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED) + bpf_selem_free(selem, true); + + if (free_storage) + bpf_local_storage_free(local_storage, true); +} + void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) @@ -475,6 +573,7 @@ int bpf_local_storage_alloc(void *owner, storage->owner = owner; storage->mem_charge = sizeof(*storage); storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; + refcount_set(&storage->owner_refcnt, 1); bpf_selem_link_storage_nolock(storage, first_selem); @@ -743,6 +842,11 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) if (free_storage) bpf_local_storage_free(local_storage, true); + + if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { + while (refcount_read(&local_storage->owner_refcnt)) + cpu_relax(); + } } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) -- cgit v1.2.3 From 0be08389c7f2f9db0194ee666d2e4870886e6ffb Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:09 -0800 Subject: bpf: Switch to bpf_selem_unlink_nofail in bpf_local_storage_{map_free, destroy} Take care of rqspinlock error in bpf_local_storage_{map_free, destroy}() properly by switching to bpf_selem_unlink_nofail(). Both functions iterate their own RCU-protected list of selems and call bpf_selem_unlink_nofail(). In map_free(), to prevent infinite loop when both map_free() and destroy() fail to remove a selem from b->list (extremely unlikely), switch to hlist_for_each_entry_rcu(). In destroy(), also switch to hlist_for_each_entry_rcu() since we no longer iterate local_storage->list under local_storage->lock. bpf_selem_unlink() now becomes dedicated to helpers and syscalls paths so reuse_now should always be false. Remove it from the argument and hardcode it. 
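The selem->state handshake built up by the two patches above can be condensed into a small userspace sketch. C11 atomics stand in for the kernel's atomic_or()/atomic_cmpxchg(), and mark_half_unlinked() is an invented name; the point is that the element is freed exactly once, by whichever teardown path flips SELEM_UNLINKED to SELEM_TOFREE:

    #include <stdatomic.h>
    #include <stdbool.h>

    #define SELEM_MAP_UNLINKED     (1 << 0)
    #define SELEM_STORAGE_UNLINKED (1 << 1)
    #define SELEM_UNLINKED (SELEM_MAP_UNLINKED | SELEM_STORAGE_UNLINKED)
    #define SELEM_TOFREE           (1 << 2)

    struct selem { atomic_int state; };

    /* Called by map_free() (map_side == true) or destroy() (map_side ==
     * false) when its locked unlink could not fully complete. Returns
     * true if this caller won the right to free the selem. */
    static bool mark_half_unlinked(struct selem *s, bool map_side)
    {
        int expected = SELEM_UNLINKED;

        atomic_fetch_or(&s->state, map_side ? SELEM_MAP_UNLINKED :
                                              SELEM_STORAGE_UNLINKED);
        /* Succeeds for exactly one caller, and only once both halves
         * have been marked unlinked. */
        return atomic_compare_exchange_strong(&s->state, &expected,
                                              SELEM_TOFREE);
    }

In the fast path, where both locks are taken, the kernel code skips this machinery entirely (unlink == 2 in bpf_selem_unlink_nofail()) and frees the selem directly after an RCU grace period.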
Acked-by: Alexei Starovoitov Co-developed-by: Martin KaFai Lau Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-12-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 4 +-- kernel/bpf/bpf_cgrp_storage.c | 2 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 63 ++++++++++++++++++--------------------- kernel/bpf/bpf_task_storage.c | 2 +- net/core/bpf_sk_storage.c | 7 +++-- 6 files changed, 39 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 69a5d8aa765d..85efa9772530 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -171,7 +171,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, return SDATA(selem); } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache); @@ -184,7 +184,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); +int bpf_selem_unlink(struct bpf_local_storage_elem *selem); int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 853183eead2c..c2a2ead1f466 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -89,7 +89,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 470f4b02c79e..e86734609f3d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -110,7 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 294605c78271..b28f07d3a0db 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -377,7 +377,11 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, hlist_add_head_rcu(&selem->map_node, &b->list); } -int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) +/* + * Unlink an selem from the map and local storage with the locks held. + * This is the common path used by local storages to delete an selem.
+ */ +int bpf_selem_unlink(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; @@ -411,10 +415,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) out: raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); - bpf_selem_free_list(&selem_free_list, reuse_now); + bpf_selem_free_list(&selem_free_list, false); if (free_local_storage) - bpf_local_storage_free(local_storage, reuse_now); + bpf_local_storage_free(local_storage, false); return err; } @@ -804,13 +808,13 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) +/* + * Destroy local storage when the owner is going away. Caller must uncharge memory + * if memory charging is used. + */ +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_elem *selem; - bool free_storage = false; - HLIST_HEAD(free_selem_list); - struct hlist_node *n; - unsigned long flags; /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. @@ -821,32 +825,20 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_res_spin_lock_irqsave(&local_storage->lock, flags); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - bpf_selem_unlink_map(selem); - /* If local_storage list has only one element, the - * bpf_selem_unlink_storage_nolock() will return true. - * Otherwise, it will return false. The current loop iteration - * intends to remove all local storage. So the last iteration - * of the loop will set the free_cgroup_storage to true. - */ - free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, &free_selem_list); - } - raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); - - bpf_selem_free_list(&free_selem_list, true); - - if (free_storage) - bpf_local_storage_free(local_storage, true); + hlist_for_each_entry_rcu(selem, &local_storage->list, snode) + bpf_selem_unlink_nofail(selem, NULL); if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { while (refcount_read(&local_storage->owner_refcnt)) cpu_relax(); + /* + * Paired with refcount_dec() in bpf_selem_unlink_nofail() + * to make sure destroy() sees the correct local_storage->mem_charge. 
+ */ + smp_mb(); } + + return local_storage->mem_charge; } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) @@ -940,11 +932,14 @@ void bpf_local_storage_map_free(struct bpf_map *map, rcu_read_lock(); /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_local_storage_elem, map_node))) { - bpf_selem_unlink(selem, true); - cond_resched_rcu(); +restart: + hlist_for_each_entry_rcu(selem, &b->list, map_node) { + bpf_selem_unlink_nofail(selem, b); + + if (need_resched()) { + cond_resched_rcu(); + goto restart; + } } rcu_read_unlock(); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 4d53aebe6784..605506792b5b 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -134,7 +134,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d2164165a994..1eb3e060994e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,20 +40,23 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; + u32 uncharge; rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage) goto out; - bpf_local_storage_destroy(sk_storage); + uncharge = bpf_local_storage_destroy(sk_storage); + if (uncharge) + atomic_sub(uncharge, &sk->sk_omem_alloc); out: rcu_read_unlock_migrate(); } -- cgit v1.2.3
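To close, the owner_refcnt handshake used by destroy() and the uncharge path can be modeled in user space as follows. This is a hedged sketch: pin_owner(), unpin_owner() and destroy_read_charge() are hypothetical names, C11 atomics stand in for the kernel's refcount_t, and sched_yield() stands in for cpu_relax(); the seq_cst atomics also play the role of the smp_mb() pairing in the patch above.

    /* pin.c - build with: cc -std=c11 -o pin pin.c */
    #include <stdatomic.h>
    #include <sched.h>
    #include <stdio.h>

    static _Atomic unsigned int owner_refcnt = 1; /* initial ref, dropped by destroy() */
    static unsigned int mem_charge = 64;          /* stable once all pins are gone */

    /* map_free() side: like refcount_inc_not_zero(); never pin a dead owner. */
    static int pin_owner(void)
    {
            unsigned int old = atomic_load(&owner_refcnt);

            while (old) {
                    if (atomic_compare_exchange_weak(&owner_refcnt, &old, old + 1))
                            return 1;
            }
            return 0;
    }

    static void unpin_owner(void)
    {
            atomic_fetch_sub(&owner_refcnt, 1); /* refcount_dec() */
    }

    /*
     * destroy() side: drop the initial ref, then drain in-flight pins so the
     * final read of mem_charge cannot race with a concurrent uncharge.
     */
    static unsigned int destroy_read_charge(void)
    {
            if (atomic_fetch_sub(&owner_refcnt, 1) != 1) {
                    while (atomic_load(&owner_refcnt))
                            sched_yield();
            }
            return mem_charge;
    }

    int main(void)
    {
            if (pin_owner()) {      /* map_free() uncharges an selem... */
                    mem_charge -= 16;
                    unpin_owner();  /* ...then drops its pin */
            }
            printf("final charge: %u\n", destroy_read_charge());
            return 0;
    }

The caller of destroy() then subtracts the returned value from its own accounting, as bpf_sk_storage_free() does with sk->sk_omem_alloc in the hunk above.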