From 38163af068810b388f6723a681dfd8c7b3680d38 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 14 Oct 2025 23:54:58 +0000 Subject: bpf: Introduce SK_BPF_BYPASS_PROT_MEM. If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out of the global protocol memory accounting. This is easily controlled by net.core.bypass_prot_mem sysctl, but it lacks flexibility. Let's support flagging (and clearing) sk->sk_bypass_prot_mem via bpf_setsockopt() at the BPF_CGROUP_INET_SOCK_CREATE hook. int val = 1; bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM, &val, sizeof(val)); As with net.core.bypass_prot_mem, this is inherited to child sockets, and BPF always takes precedence over sysctl at socket(2) and accept(2). SK_BPF_BYPASS_PROT_MEM is only supported at BPF_CGROUP_INET_SOCK_CREATE and not supported on other hooks for some reasons: 1. UDP charges memory under sk->sk_receive_queue.lock instead of lock_sock() 2. Modifying the flag after skb is charged to sk requires such adjustment during bpf_setsockopt() and complicates the logic unnecessarily We can support other hooks later if a real use case justifies that. Most changes are inline and hard to trace, but a microbenchmark on __sk_mem_raise_allocated() during neper/tcp_stream showed that more samples completed faster with sk->sk_bypass_prot_mem == 1. This will be more visible under tcp_mem pressure (but it's not a fair comparison). # bpftrace -e 'kprobe:__sk_mem_raise_allocated { @start[tid] = nsecs; } kretprobe:__sk_mem_raise_allocated /@start[tid]/ { @end[tid] = nsecs - @start[tid]; @times = hist(@end[tid]); delete(@start[tid]); }' # tcp_stream -6 -F 1000 -N -T 256 Without bpf prog: [128, 256) 3846 | | [256, 512) 1505326 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [512, 1K) 1371006 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [1K, 2K) 198207 |@@@@@@ | [2K, 4K) 31199 |@ | With bpf prog in the next patch: (must be attached before tcp_stream) # bpftool prog load sk_bypass_prot_mem.bpf.o /sys/fs/bpf/test type cgroup/sock_create # bpftool cgroup attach /sys/fs/cgroup/test cgroup_inet_sock_create pinned /sys/fs/bpf/test [128, 256) 6413 | | [256, 512) 1868425 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [512, 1K) 1101697 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [1K, 2K) 117031 |@@@@ | [2K, 4K) 11773 | | Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Acked-by: Roman Gushchin Link: https://patch.msgid.link/20251014235604.3057003-6-kuniyu@google.com --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6829936d33f5..9b17d937edf7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7200,6 +7200,7 @@ enum { TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */ + SK_BPF_BYPASS_PROT_MEM = 1010, /* Get or Set sk->sk_bypass_prot_mem */ }; enum { -- cgit v1.2.3 From 531b87d865eb9e625c2e46ec8f06a65a6157ee45 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 26 Oct 2025 20:38:45 +0000 Subject: bpf: widen dynptr size/offset to 64 bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dynptr currently caps size and offset at 24 bits, which isn’t sufficient for file-backed use cases; even 32 bits can be limiting. Refactor dynptr helpers/kfuncs to use 64-bit size and offset, ensuring consistency across the APIs. This change does not affect internals of xdp, skb or other dynptrs, which continue to behave as before. Also it does not break binary compatibility. The widening enables large-file access support via dynptr, implemented in the next patches. Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251026203853.135105-3-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 20 +++---- include/uapi/linux/bpf.h | 8 +-- kernel/bpf/helpers.c | 66 +++++++++++----------- kernel/trace/bpf_trace.c | 46 +++++++-------- tools/include/uapi/linux/bpf.h | 8 +-- tools/testing/selftests/bpf/bpf_kfuncs.h | 12 ++-- tools/testing/selftests/bpf/progs/dynptr_success.c | 12 ++-- 7 files changed, 86 insertions(+), 86 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e53cda0aabb6..907c69295293 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1387,19 +1387,19 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_SKB_META, }; -int bpf_dynptr_check_size(u32 size); -u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); -const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); -void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); +int bpf_dynptr_check_size(u64 size); +u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len); +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len); bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); -int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, - void *src, u32 len, u64 flags); -void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk); +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, + void *src, u64 len, u64 flags); +void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk); -static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len) { - u32 size = __bpf_dynptr_size(ptr); + u64 size = __bpf_dynptr_size(ptr); if (len > size || offset > size - len) return -E2BIG; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6829936d33f5..77edd0253989 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5618,7 +5618,7 @@ union bpf_attr { * Return * *sk* if casting is valid, or **NULL** otherwise. * - * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr) * Description * Get a dynptr to local memory *data*. * @@ -5661,7 +5661,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5671,7 +5671,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5692,7 +5692,7 @@ union bpf_attr { * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * - * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len) * Description * Get a pointer to the underlying dynptr data. * diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b9ec6ee21c94..a2ce17ea5edb 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1684,19 +1684,19 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } -u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) +u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } -static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) +static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; - ptr->size = new_size | metadata; + ptr->size = (u32)new_size | metadata; } -int bpf_dynptr_check_size(u32 size) +int bpf_dynptr_check_size(u64 size) { return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; } @@ -1715,7 +1715,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) +BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; @@ -1750,8 +1750,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, - u32 offset, u64 flags) +static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src, + u64 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1787,8 +1787,8 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s } } -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src, + u64, offset, u64, flags) { return __bpf_dynptr_read(dst, len, src, offset, flags); } @@ -1804,8 +1804,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, - u32 len, u64 flags) +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, + u64 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1848,8 +1848,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, } } -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src, + u64, len, u64, flags) { return __bpf_dynptr_write(dst, offset, src, len, flags); } @@ -1865,7 +1865,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len) { enum bpf_dynptr_type type; int err; @@ -2680,12 +2680,12 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; - u32 len = buffer__szk; + u64 len = buffer__szk; int err; if (!ptr->data) @@ -2767,8 +2767,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ -__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk) +__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2800,10 +2800,10 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); } -__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end) +__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; - u32 size; + u64 size; if (!ptr->data || start > end) return -EINVAL; @@ -2836,7 +2836,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) return __bpf_dynptr_is_rdonly(ptr); } -__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p) +__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2873,14 +2873,14 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, - struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, + struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; - u32 off; + u64 off; src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); @@ -2902,7 +2902,7 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, off = 0; while (off < size) { - u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + u64 chunk_sz = min_t(u64, sizeof(buf), size - off); int err; err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); @@ -2928,10 +2928,10 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. */ - __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val) - { +__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) +{ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; - u32 chunk_sz, write_off; + u64 chunk_sz, write_off; char buf[256]; void* slice; int err; @@ -2950,11 +2950,11 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, return err; /* Non-linear data under the dynptr, write from a local buffer */ - chunk_sz = min_t(u32, sizeof(buf), size); + chunk_sz = min_t(u64, sizeof(buf), size); memset(buf, val, chunk_sz); for (write_off = 0; write_off < size; write_off += chunk_sz) { - chunk_sz = min_t(u32, sizeof(buf), size - write_off); + chunk_sz = min_t(u64, sizeof(buf), size - write_off); err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); if (err) return err; @@ -4469,7 +4469,7 @@ late_initcall(kfunc_init); /* Get a pointer to dynptr data up to len bytes for read only access. If * the dynptr doesn't have continuous data up to len bytes, return NULL. */ -const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len) { const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr; @@ -4480,7 +4480,7 @@ const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) * the dynptr doesn't have continuous data up to len bytes, or the dynptr * is read only, return NULL. */ -void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len) +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len) { if (__bpf_dynptr_is_rdonly(ptr)) return NULL; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4f87c16d915a..a795f7afbf3d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3372,13 +3372,13 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ -static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size, +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; - u32 chunk_sz, off; + u64 chunk_sz, off; void *dst_slice; int cnt, err; char buf[256]; @@ -3392,7 +3392,7 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do return -E2BIG; for (off = 0; off < size; off += chunk_sz - 1) { - chunk_sz = min_t(u32, sizeof(buf), size - off); + chunk_sz = min_t(u64, sizeof(buf), size - off); /* Expect str_copy_fn to return count of copied bytes, including * zero terminator. Next iteration increment off by chunk_sz - 1 to * overwrite NUL. @@ -3409,14 +3409,14 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do return off; } -static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff, - u32 size, const void *unsafe_src, +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff, + u64 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; void *dst_slice; char buf[256]; - u32 off, chunk_sz; + u64 off, chunk_sz; int err; dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); @@ -3428,7 +3428,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 return -E2BIG; for (off = 0; off < size; off += chunk_sz) { - chunk_sz = min_t(u32, sizeof(buf), size - off); + chunk_sz = min_t(u64, sizeof(buf), size - off); err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); if (err) return err; @@ -3514,58 +3514,58 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } -__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr__ign) +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } -__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign) +__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign, +__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, tsk); } -__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off, - u32 size, const void __user *unsafe_ptr__ign, +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, + u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6829936d33f5..77edd0253989 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5618,7 +5618,7 @@ union bpf_attr { * Return * *sk* if casting is valid, or **NULL** otherwise. * - * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr) * Description * Get a dynptr to local memory *data*. * @@ -5661,7 +5661,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5671,7 +5671,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5692,7 +5692,7 @@ union bpf_attr { * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * - * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len) * Description * Get a pointer to the underlying dynptr data. * diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 794d44d19c88..e0189254bb6e 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -28,8 +28,8 @@ extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags, * Either a direct pointer to the dynptr data or a pointer to the user-provided * buffer if unable to obtain a direct pointer */ -extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset, - void *buffer, __u32 buffer__szk) __ksym __weak; +extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u64 offset, + void *buffer, __u64 buffer__szk) __ksym __weak; /* Description * Obtain a read-write pointer to the dynptr's data @@ -37,13 +37,13 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset, * Either a direct pointer to the dynptr data or a pointer to the user-provided * buffer if unable to obtain a direct pointer */ -extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u32 offset, - void *buffer, __u32 buffer__szk) __ksym __weak; +extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u64 offset, void *buffer, + __u64 buffer__szk) __ksym __weak; -extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u32 start, __u32 end) __ksym __weak; +extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak; extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym __weak; extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym __weak; -extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak; +extern __u64 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak; extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym __weak; /* Description diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index 127dea342e5a..e0d672d93adf 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -914,8 +914,8 @@ void *user_ptr; char expected_str[384]; __u32 test_len[7] = {0/* placeholder */, 0, 1, 2, 255, 256, 257}; -typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr); +typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr); /* Returns the offset just before the end of the maximum sized xdp fragment. * Any write larger than 32 bytes will be split between 2 fragments. @@ -1106,16 +1106,16 @@ int test_copy_from_user_str_dynptr(void *ctx) return 0; } -static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr) +static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); return bpf_copy_from_user_task_dynptr(dptr, off, size, unsafe_ptr, task); } -static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u32 off, - u32 size, const void *unsafe_ptr) +static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u64 off, + u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); -- cgit v1.2.3 From feeaf1346f80ffb181b6f9b739628103aa73b067 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Sat, 18 Oct 2025 11:57:36 +0800 Subject: bpf: Add overwrite mode for BPF ring buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the BPF ring buffer is full, a new event cannot be recorded until one or more old events are consumed to make enough space for it. In cases such as fault diagnostics, where recent events are more useful than older ones, this mechanism may lead to critical events being lost. So add overwrite mode for BPF ring buffer to address it. In this mode, the new event overwrites the oldest event when the buffer is full. The basic idea is as follows: 1. producer_pos tracks the next position to record new event. When there is enough free space, producer_pos is simply advanced by producer to make space for the new event. 2. To avoid waiting for consumer when the buffer is full, a new variable, overwrite_pos, is introduced for producer. It points to the oldest event committed in the buffer. It is advanced by producer to discard one or more oldest events to make space for the new event when the buffer is full. 3. pending_pos tracks the oldest event to be committed. pending_pos is never passed by producer_pos, so multiple producers never write to the same position at the same time. The following example diagrams show how it works in a 4096-byte ring buffer. 1. At first, {producer,overwrite,pending,consumer}_pos are all set to 0. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | +-----------------------------------------------------------------------+ ^ | | producer_pos = 0 overwrite_pos = 0 pending_pos = 0 consumer_pos = 0 2. Now reserve a 512-byte event A. There is enough free space, so A is allocated at offset 0. And producer_pos is advanced to 512, the end of A. Since A is not submitted, the BUSY bit is set. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | A | | | [BUSY] | | +-----------------------------------------------------------------------+ ^ ^ | | | | | producer_pos = 512 | overwrite_pos = 0 pending_pos = 0 consumer_pos = 0 3. Reserve event B, size 1024. B is allocated at offset 512 with BUSY bit set, and producer_pos is advanced to the end of B. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | A | B | | | [BUSY] | [BUSY] | | +-----------------------------------------------------------------------+ ^ ^ | | | | | producer_pos = 1536 | overwrite_pos = 0 pending_pos = 0 consumer_pos = 0 4. Reserve event C, size 2048. C is allocated at offset 1536, and producer_pos is advanced to 3584. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | A | B | C | | | [BUSY] | [BUSY] | [BUSY] | | +-----------------------------------------------------------------------+ ^ ^ | | | | | producer_pos = 3584 | overwrite_pos = 0 pending_pos = 0 consumer_pos = 0 5. Submit event A. The BUSY bit of A is cleared. B becomes the oldest event to be committed, so pending_pos is advanced to 512, the start of B. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | A | B | C | | | | [BUSY] | [BUSY] | | +-----------------------------------------------------------------------+ ^ ^ ^ | | | | | | | pending_pos = 512 producer_pos = 3584 | overwrite_pos = 0 consumer_pos = 0 6. Submit event B. The BUSY bit of B is cleared, and pending_pos is advanced to the start of C, which is now the oldest event to be committed. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | A | B | C | | | | | [BUSY] | | +-----------------------------------------------------------------------+ ^ ^ ^ | | | | | | | pending_pos = 1536 producer_pos = 3584 | overwrite_pos = 0 consumer_pos = 0 7. Reserve event D, size 1536 (3 * 512). There are 2048 bytes not being written between producer_pos (currently 3584) and pending_pos, so D is allocated at offset 3584, and producer_pos is advanced by 1536 (from 3584 to 5120). Since event D will overwrite all bytes of event A and the first 512 bytes of event B, overwrite_pos is advanced to the start of event C, the oldest event that is not overwritten. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | D End | | C | D Begin| | [BUSY] | | [BUSY] | [BUSY] | +-----------------------------------------------------------------------+ ^ ^ ^ | | | | | pending_pos = 1536 | | overwrite_pos = 1536 | | | producer_pos=5120 | consumer_pos = 0 8. Reserve event E, size 1024. Although there are 512 bytes not being written between producer_pos and pending_pos, E cannot be reserved, as it would overwrite the first 512 bytes of event C, which is still being written. 9. Submit event C and D. pending_pos is advanced to the end of D. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | D End | | C | D Begin| | | | | | +-----------------------------------------------------------------------+ ^ ^ ^ | | | | | overwrite_pos = 1536 | | | producer_pos=5120 | pending_pos=5120 | consumer_pos = 0 The performance data for overwrite mode will be provided in a follow-up patch that adds overwrite-mode benchmarks. A sample of performance data for non-overwrite mode, collected on an x86_64 CPU and an arm64 CPU, before and after this patch, is shown below. As we can see, no obvious performance regression occurs. - x86_64 (AMD EPYC 9654) Before: Ringbuf, multi-producer contention ================================== rb-libbpf nr_prod 1 11.623 ± 0.027M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 2 15.812 ± 0.014M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 3 7.871 ± 0.003M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 4 6.703 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 8 2.896 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 12 2.054 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 16 1.864 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 20 1.580 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 24 1.484 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 28 1.369 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 32 1.316 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 36 1.272 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 40 1.239 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 44 1.226 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 48 1.213 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 52 1.193 ± 0.001M/s (drops 0.000 ± 0.000M/s) After: Ringbuf, multi-producer contention ================================== rb-libbpf nr_prod 1 11.845 ± 0.036M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 2 15.889 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 3 8.155 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 4 6.708 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 8 2.918 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 12 2.065 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 16 1.870 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 20 1.582 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 24 1.482 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 28 1.372 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 32 1.323 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 36 1.264 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 40 1.236 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 44 1.209 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 48 1.189 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 52 1.165 ± 0.002M/s (drops 0.000 ± 0.000M/s) - arm64 (HiSilicon Kunpeng 920) Before: Ringbuf, multi-producer contention ================================== rb-libbpf nr_prod 1 11.310 ± 0.623M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 2 9.947 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 3 6.634 ± 0.011M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 4 4.502 ± 0.003M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 8 3.888 ± 0.003M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 12 3.372 ± 0.005M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 16 3.189 ± 0.010M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 20 2.998 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 24 3.086 ± 0.018M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 28 2.845 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 32 2.815 ± 0.008M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 36 2.771 ± 0.009M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 40 2.814 ± 0.011M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 44 2.752 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 48 2.695 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 52 2.710 ± 0.006M/s (drops 0.000 ± 0.000M/s) After: Ringbuf, multi-producer contention ================================== rb-libbpf nr_prod 1 11.283 ± 0.550M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 2 9.993 ± 0.003M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 3 6.898 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 4 5.257 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 8 3.830 ± 0.005M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 12 3.528 ± 0.013M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 16 3.265 ± 0.018M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 20 2.990 ± 0.007M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 24 2.929 ± 0.014M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 28 2.898 ± 0.010M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 32 2.818 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 36 2.789 ± 0.012M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 40 2.770 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 44 2.651 ± 0.007M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 48 2.669 ± 0.005M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 52 2.695 ± 0.009M/s (drops 0.000 ± 0.000M/s) Signed-off-by: Xu Kuohai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251018035738.4039621-2-xukuohai@huaweicloud.com --- include/uapi/linux/bpf.h | 4 ++ kernel/bpf/ringbuf.c | 114 ++++++++++++++++++++++++++++++++++------- tools/include/uapi/linux/bpf.h | 4 ++ 3 files changed, 103 insertions(+), 19 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77edd0253989..1d73f165394d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1430,6 +1430,9 @@ enum { /* Do not translate kernel bpf_arena pointers to user pointers */ BPF_F_NO_USER_CONV = (1U << 18), + +/* Enable BPF ringbuf overwrite mode */ + BPF_F_RB_OVERWRITE = (1U << 19), }; /* Flags for BPF_PROG_QUERY. */ @@ -6231,6 +6234,7 @@ enum { BPF_RB_RING_SIZE = 1, BPF_RB_CONS_POS = 2, BPF_RB_PROD_POS = 3, + BPF_RB_OVERWRITE_POS = 4, }; /* BPF ring buffer constants */ diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 719d73299397..cbfa109e907e 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -13,7 +13,7 @@ #include #include -#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE) /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ #define RINGBUF_PGOFF \ @@ -30,6 +30,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; + bool overwrite_mode; rqspinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than @@ -73,6 +74,7 @@ struct bpf_ringbuf { unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); unsigned long pending_pos; + unsigned long overwrite_pos; /* position after the last overwritten record */ char data[] __aligned(PAGE_SIZE); }; @@ -166,7 +168,7 @@ static void bpf_ringbuf_notify(struct irq_work *work) * considering that the maximum value of data_sz is (4GB - 1), there * will be no overflow, so just note the size limit in the comments. */ -static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode) { struct bpf_ringbuf *rb; @@ -183,17 +185,25 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) rb->consumer_pos = 0; rb->producer_pos = 0; rb->pending_pos = 0; + rb->overwrite_mode = overwrite_mode; return rb; } static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) { + bool overwrite_mode = false; struct bpf_ringbuf_map *rb_map; if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + if (attr->map_flags & BPF_F_RB_OVERWRITE) { + if (attr->map_type != BPF_MAP_TYPE_RINGBUF) + return ERR_PTR(-EINVAL); + overwrite_mode = true; + } + if (attr->key_size || attr->value_size || !is_power_of_2(attr->max_entries) || !PAGE_ALIGNED(attr->max_entries)) @@ -205,7 +215,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&rb_map->map, attr); - rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); + rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode); if (!rb_map->rb) { bpf_map_area_free(rb_map); return ERR_PTR(-ENOMEM); @@ -293,13 +303,26 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } +/* + * Return an estimate of the available data in the ring buffer. + * Note: the returned value can exceed the actual ring buffer size because the + * function is not synchronized with the producer. The producer acquires the + * ring buffer's spinlock, but this function does not. + */ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { - unsigned long cons_pos, prod_pos; + unsigned long cons_pos, prod_pos, over_pos; cons_pos = smp_load_acquire(&rb->consumer_pos); - prod_pos = smp_load_acquire(&rb->producer_pos); - return prod_pos - cons_pos; + + if (unlikely(rb->overwrite_mode)) { + over_pos = smp_load_acquire(&rb->overwrite_pos); + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - max(cons_pos, over_pos); + } else { + prod_pos = smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; + } } static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) @@ -402,11 +425,43 @@ bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) return (void*)((addr & PAGE_MASK) - off); } +static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb, + unsigned long new_prod_pos, + unsigned long cons_pos, + unsigned long pend_pos) +{ + /* + * No space if oldest not yet committed record until the newest + * record span more than (ringbuf_size - 1). + */ + if (new_prod_pos - pend_pos > rb->mask) + return false; + + /* Ok, we have space in overwrite mode */ + if (unlikely(rb->overwrite_mode)) + return true; + + /* + * No space if producer position advances more than (ringbuf_size - 1) + * ahead of consumer position when not in overwrite mode. + */ + if (new_prod_pos - cons_pos > rb->mask) + return false; + + return true; +} + +static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len) +{ + hdr_len &= ~BPF_RINGBUF_DISCARD_BIT; + return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8); +} + static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) { - unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags; + unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags; struct bpf_ringbuf_hdr *hdr; - u32 len, pg_off, tmp_size, hdr_len; + u32 len, pg_off, hdr_len; if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) return NULL; @@ -429,24 +484,43 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) hdr_len = READ_ONCE(hdr->len); if (hdr_len & BPF_RINGBUF_BUSY_BIT) break; - tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT; - tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8); - pend_pos += tmp_size; + pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len); } rb->pending_pos = pend_pos; - /* check for out of ringbuf space: - * - by ensuring producer position doesn't advance more than - * (ringbuf_size - 1) ahead - * - by ensuring oldest not yet committed record until newest - * record does not span more than (ringbuf_size - 1) - */ - if (new_prod_pos - cons_pos > rb->mask || - new_prod_pos - pend_pos > rb->mask) { + if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) { raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } + /* + * In overwrite mode, advance overwrite_pos when the ring buffer is full. + * The key points are to stay on record boundaries and consume enough records + * to fit the new one. + */ + if (unlikely(rb->overwrite_mode)) { + over_pos = rb->overwrite_pos; + while (new_prod_pos - over_pos > rb->mask) { + hdr = (void *)rb->data + (over_pos & rb->mask); + hdr_len = READ_ONCE(hdr->len); + /* + * The bpf_ringbuf_has_space() check above ensures we won’t + * step over a record currently being worked on by another + * producer. + */ + over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len); + } + /* + * smp_store_release(&rb->producer_pos, new_prod_pos) at + * the end of the function ensures that when consumer sees + * the updated rb->producer_pos, it always sees the updated + * rb->overwrite_pos, so when consumer reads overwrite_pos + * after smp_load_acquire(r->producer_pos), the overwrite_pos + * will always be valid. + */ + WRITE_ONCE(rb->overwrite_pos, over_pos); + } + hdr = (void *)rb->data + (prod_pos & rb->mask); pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); hdr->len = size | BPF_RINGBUF_BUSY_BIT; @@ -576,6 +650,8 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: return smp_load_acquire(&rb->producer_pos); + case BPF_RB_OVERWRITE_POS: + return smp_load_acquire(&rb->overwrite_pos); default: return 0; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 77edd0253989..1d73f165394d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1430,6 +1430,9 @@ enum { /* Do not translate kernel bpf_arena pointers to user pointers */ BPF_F_NO_USER_CONV = (1U << 18), + +/* Enable BPF ringbuf overwrite mode */ + BPF_F_RB_OVERWRITE = (1U << 19), }; /* Flags for BPF_PROG_QUERY. */ @@ -6231,6 +6234,7 @@ enum { BPF_RB_RING_SIZE = 1, BPF_RB_CONS_POS = 2, BPF_RB_PROD_POS = 3, + BPF_RB_OVERWRITE_POS = 4, }; /* BPF ring buffer constants */ -- cgit v1.2.3 From c69993ecdd4dfde2b7da08b022052a33b203da07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Oct 2025 15:17:05 +0200 Subject: perf: Support deferred user unwind Add support for deferred userspace unwind to perf. Where perf currently relies on in-place stack unwinding; from NMI context and all that. This moves the userspace part of the unwind to right before the return-to-userspace. This has two distinct benefits, the biggest is that it moves the unwind to a faultable context. It becomes possible to fault in debug info (.eh_frame, SFrame etc.) that might not otherwise be readily available. And secondly, it de-duplicates the user callchain where multiple samples happen during the same kernel entry. To facilitate this the perf interface is extended with a new record type: PERF_RECORD_CALLCHAIN_DEFERRED and two new attribute flags: perf_event_attr::defer_callchain - to request the user unwind be deferred perf_event_attr::defer_output - to request PERF_RECORD_CALLCHAIN_DEFERRED records The existing PERF_RECORD_SAMPLE callchain section gets a new context type: PERF_CONTEXT_USER_DEFERRED After which will come a single entry, denoting the 'cookie' of the deferred callchain that should be attached here, matching the 'cookie' field of the above mentioned PERF_RECORD_CALLCHAIN_DEFERRED. The 'defer_callchain' flag is expected on all events with PERF_SAMPLE_CALLCHAIN. The 'defer_output' flag is expect on the event responsible for collecting side-band events (like mmap, comm etc.). Setting 'defer_output' on multiple events will get you duplicated PERF_RECORD_CALLCHAIN_DEFERRED records. Based on earlier patches by Josh and Steven. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251023150002.GR4067720@noisy.programming.kicks-ass.net --- include/linux/perf_event.h | 2 +- include/linux/unwind_deferred.h | 12 ------ include/linux/unwind_deferred_types.h | 13 ++++++ include/uapi/linux/perf_event.h | 21 +++++++++- kernel/bpf/stackmap.c | 4 +- kernel/events/callchain.c | 14 ++++++- kernel/events/core.c | 78 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/perf_event.h | 21 +++++++++- 8 files changed, 145 insertions(+), 20 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fd1d91017b99..9870d768db4c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1720,7 +1720,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark); + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h index f4743c8cff4c..bc7ae7d21900 100644 --- a/include/linux/unwind_deferred.h +++ b/include/linux/unwind_deferred.h @@ -6,18 +6,6 @@ #include #include -struct unwind_work; - -typedef void (*unwind_callback_t)(struct unwind_work *work, - struct unwind_stacktrace *trace, - u64 cookie); - -struct unwind_work { - struct list_head list; - unwind_callback_t func; - int bit; -}; - #ifdef CONFIG_UNWIND_USER enum { diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h index 0a4c8ddbbc57..18fa3932f61c 100644 --- a/include/linux/unwind_deferred_types.h +++ b/include/linux/unwind_deferred_types.h @@ -39,4 +39,17 @@ struct unwind_task_info { union unwind_task_id id; }; +struct unwind_work; +struct unwind_stacktrace; + +typedef void (*unwind_callback_t)(struct unwind_work *work, + struct unwind_stacktrace *trace, + u64 cookie); + +struct unwind_work { + struct list_head list; + unwind_callback_t func; + int bit; +}; + #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 78a362b80027..d292f96bc06f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -463,7 +463,9 @@ struct perf_event_attr { inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ - __reserved_1 : 26; + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ + __reserved_1 : 24; union { __u32 wakeup_events; /* wake up every n events */ @@ -1239,6 +1241,22 @@ enum perf_event_type { */ PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + /* + * This user callchain capture was deferred until shortly before + * returning to user space. Previous samples would have kernel + * callchains only and they need to be stitched with this to make full + * callchains. + * + * struct { + * struct perf_event_header header; + * u64 cookie; + * u64 nr; + * u64 ips[nr]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CALLCHAIN_DEFERRED = 22, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -1269,6 +1287,7 @@ enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, PERF_CONTEXT_GUEST = (__u64)-2048, PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4d53cdd1374c..8f1dacaf01fe 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, max_depth = sysctl_perf_event_max_stack; trace = get_perf_callchain(regs, kernel, user, max_depth, - false, false); + false, false, 0); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, kernel, user, max_depth, - crosstask, false); + crosstask, false, 0); if (unlikely(!trace) || trace->nr < skip) { if (may_fault) diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 808c0d7a31fa..b9c7e00725d6 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark) + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; @@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, regs = task_pt_regs(current); } + if (defer_cookie) { + /* + * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED + * which can be stitched to this one, and add + * the cookie after it (it will be cut off when the + * user stack is copied to the callchain). + */ + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED); + perf_callchain_store_context(&ctx, defer_cookie); + goto exit_put; + } + if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); diff --git a/kernel/events/core.c b/kernel/events/core.c index 7541f6f85fcb..f6a08c73f783 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" @@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; +static struct unwind_work perf_unwind_work; + struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { @@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && + event->attr.defer_callchain; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + u64 defer_cookie; if (!current->mm) user = false; @@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, kernel, user, - max_stack, crosstask, true); + if (!(user && defer_user && !crosstask && + unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0)) + defer_cookie = 0; + + callchain = get_perf_callchain(regs, kernel, user, max_stack, + crosstask, true, defer_cookie); + return callchain ?: &__empty_callchain; } @@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_callchain_deferred_event { + struct unwind_stacktrace *trace; + struct { + struct perf_event_header header; + u64 cookie; + u64 nr; + u64 ips[]; + } event; +}; + +static void perf_callchain_deferred_output(struct perf_event *event, void *data) +{ + struct perf_callchain_deferred_event *deferred_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret, size = deferred_event->event.header.size; + + if (!event->attr.defer_output) + return; + + /* XXX do we really need sample_id_all for this ??? */ + perf_event_header__init_id(&deferred_event->event.header, &sample, event); + + ret = perf_output_begin(&handle, &sample, event, + deferred_event->event.header.size); + if (ret) + goto out; + + perf_output_put(&handle, deferred_event->event); + for (int i = 0; i < deferred_event->trace->nr; i++) { + u64 entry = deferred_event->trace->entries[i]; + perf_output_put(&handle, entry); + } + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + deferred_event->event.header.size = size; +} + +static void perf_unwind_deferred_callback(struct unwind_work *work, + struct unwind_stacktrace *trace, u64 cookie) +{ + struct perf_callchain_deferred_event deferred_event = { + .trace = trace, + .event = { + .header = { + .type = PERF_RECORD_CALLCHAIN_DEFERRED, + .misc = PERF_RECORD_MISC_USER, + .size = sizeof(deferred_event.event) + + (trace->nr * sizeof(u64)), + }, + .cookie = cookie, + .nr = trace->nr, + }, + }; + + perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL); +} + struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; @@ -14799,6 +14870,9 @@ void __init perf_event_init(void) idr_init(&pmu_idr); + unwind_deferred_init(&perf_unwind_work, + perf_unwind_deferred_callback); + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 78a362b80027..d292f96bc06f 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -463,7 +463,9 @@ struct perf_event_attr { inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ - __reserved_1 : 26; + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ + __reserved_1 : 24; union { __u32 wakeup_events; /* wake up every n events */ @@ -1239,6 +1241,22 @@ enum perf_event_type { */ PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + /* + * This user callchain capture was deferred until shortly before + * returning to user space. Previous samples would have kernel + * callchains only and they need to be stitched with this to make full + * callchains. + * + * struct { + * struct perf_event_header header; + * u64 cookie; + * u64 nr; + * u64 ips[nr]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CALLCHAIN_DEFERRED = 22, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -1269,6 +1287,7 @@ enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, PERF_CONTEXT_GUEST = (__u64)-2048, PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, -- cgit v1.2.3 From 6fc9baa49d0ce36e4c2c27dd9e5fbd387240d8e7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:34 +0100 Subject: nsfs: update tools header Ensure all the new uapi bits are visible for the selftests. Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-21-2e6f823ebdc0@kernel.org Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- tools/include/uapi/linux/nsfs.h | 70 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h index 33c9b578b3b2..a25e38d1c874 100644 --- a/tools/include/uapi/linux/nsfs.h +++ b/tools/include/uapi/linux/nsfs.h @@ -53,6 +53,76 @@ enum init_ns_ino { TIME_NS_INIT_INO = 0xEFFFFFFAU, NET_NS_INIT_INO = 0xEFFFFFF9U, MNT_NS_INIT_INO = 0xEFFFFFF8U, +#ifdef __KERNEL__ + MNT_NS_ANON_INO = 0xEFFFFFF7U, +#endif }; +struct nsfs_file_handle { + __u64 ns_id; + __u32 ns_type; + __u32 ns_inum; +}; + +#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ +#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ + +enum init_ns_id { + IPC_NS_INIT_ID = 1ULL, + UTS_NS_INIT_ID = 2ULL, + USER_NS_INIT_ID = 3ULL, + PID_NS_INIT_ID = 4ULL, + CGROUP_NS_INIT_ID = 5ULL, + TIME_NS_INIT_ID = 6ULL, + NET_NS_INIT_ID = 7ULL, + MNT_NS_INIT_ID = 8ULL, +#ifdef __KERNEL__ + NS_LAST_INIT_ID = MNT_NS_INIT_ID, +#endif +}; + +enum ns_type { + TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */ + MNT_NS = (1ULL << 17), /* CLONE_NEWNS */ + CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */ + UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */ + IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */ + USER_NS = (1ULL << 28), /* CLONE_NEWUSER */ + PID_NS = (1ULL << 29), /* CLONE_NEWPID */ + NET_NS = (1ULL << 30), /* CLONE_NEWNET */ +}; + +/** + * struct ns_id_req - namespace ID request structure + * @size: size of this structure + * @spare: reserved for future use + * @filter: filter mask + * @ns_id: last namespace id + * @user_ns_id: owning user namespace ID + * + * Structure for passing namespace ID and miscellaneous parameters to + * statns(2) and listns(2). + * + * For statns(2) @param represents the request mask. + * For listns(2) @param represents the last listed mount id (or zero). + */ +struct ns_id_req { + __u32 size; + __u32 spare; + __u64 ns_id; + struct /* listns */ { + __u32 ns_type; + __u32 spare2; + __u64 user_ns_id; + }; +}; + +/* + * Special @user_ns_id value that can be passed to listns() + */ +#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */ + +/* List of all ns_id_req versions. */ +#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */ + #endif /* __LINUX_NSFS_H */ -- cgit v1.2.3 From c18d4b190a46651726c9a952667c74d2deb33c28 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Tue, 28 Oct 2025 20:30:05 +0000 Subject: net: Extend NAPI threaded polling to allow kthread based busy polling Add a new state NAPI_STATE_THREADED_BUSY_POLL to the NAPI state enum to enable and disable threaded busy polling. When threaded busy polling is enabled for a NAPI, enable NAPI_STATE_THREADED also. When the threaded NAPI is scheduled, set NAPI_STATE_IN_BUSY_POLL to signal napi_complete_done not to rearm interrupts. Whenever NAPI_STATE_THREADED_BUSY_POLL is unset, the NAPI_STATE_IN_BUSY_POLL will be unset, napi_complete_done unsets the NAPI_STATE_SCHED_THREADED bit also, which in turn will make the kthread go to sleep. Signed-off-by: Samiullah Khawaja Reviewed-by: Willem de Bruijn Acked-by: Martin Karsten Tested-by: Martin Karsten Link: https://patch.msgid.link/20251028203007.575686-2-skhawaja@google.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 5 +-- Documentation/networking/napi.rst | 50 +++++++++++++++++++++++++++- include/linux/netdevice.h | 4 ++- include/uapi/linux/netdev.h | 1 + net/core/dev.c | 58 +++++++++++++++++++++++++++------ net/core/dev.h | 3 ++ net/core/netdev-genl-gen.c | 2 +- tools/include/uapi/linux/netdev.h | 1 + 8 files changed, 109 insertions(+), 15 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index e00d3fa1c152..10c412b7433f 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -88,7 +88,7 @@ definitions: - name: napi-threaded type: enum - entries: [disabled, enabled] + entries: [disabled, enabled, busy-poll] attribute-sets: - @@ -291,7 +291,8 @@ attribute-sets: name: threaded doc: Whether the NAPI is configured to operate in threaded polling mode. If this is set to enabled then the NAPI context operates - in threaded polling mode. + in threaded polling mode. If this is set to busy-poll, then the + threaded polling mode also busy polls. type: u32 enum: napi-threaded - diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst index 7dd60366f4ff..4e008efebb35 100644 --- a/Documentation/networking/napi.rst +++ b/Documentation/networking/napi.rst @@ -263,7 +263,9 @@ are not well known). Busy polling is enabled by either setting ``SO_BUSY_POLL`` on selected sockets or using the global ``net.core.busy_poll`` and ``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling -also exists. +also exists. Threaded polling of NAPI also has a mode to busy poll for +packets (:ref:`threaded busy polling`) using the NAPI +processing kthread. epoll-based busy polling ------------------------ @@ -426,6 +428,52 @@ Therefore, setting ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` is the recommended usage, because otherwise setting ``irq-suspend-timeout`` might not have any discernible effect. +.. _threaded_busy_poll: + +Threaded NAPI busy polling +-------------------------- + +Threaded NAPI busy polling extends threaded NAPI and adds support to do +continuous busy polling of the NAPI. This can be useful for forwarding or +AF_XDP applications. + +Threaded NAPI busy polling can be enabled on per NIC queue basis using Netlink. + +For example, using the following script: + +.. code-block:: bash + + $ ynl --family netdev --do napi-set \ + --json='{"id": 66, "threaded": "busy-poll"}' + +The kernel will create a kthread that busy polls on this NAPI. + +The user may elect to set the CPU affinity of this kthread to an unused CPU +core to improve how often the NAPI is polled at the expense of wasted CPU +cycles. Note that this will keep the CPU core busy with 100% usage. + +Once threaded busy polling is enabled for a NAPI, PID of the kthread can be +retrieved using Netlink so the affinity of the kthread can be set up. + +For example, the following script can be used to fetch the PID: + +.. code-block:: bash + + $ ynl --family netdev --do napi-get --json='{"id": 66}' + +This will output something like following, the pid `258` is the PID of the +kthread that is polling this NAPI. + +.. code-block:: bash + + $ {'defer-hard-irqs': 0, + 'gro-flush-timeout': 0, + 'id': 66, + 'ifindex': 2, + 'irq-suspend-timeout': 0, + 'pid': 258, + 'threaded': 'busy-poll'} + .. _threaded: Threaded NAPI diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9c1e5042c5e7..e808071dbb7d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -423,11 +423,12 @@ enum { NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_LISTED, /* NAPI added to system lists */ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ - NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ + NAPI_STATE_IN_BUSY_POLL, /* Do not rearm NAPI interrupt */ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */ + NAPI_STATE_THREADED_BUSY_POLL, /* The threaded NAPI poller will busy poll */ }; enum { @@ -442,6 +443,7 @@ enum { NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER), + NAPIF_STATE_THREADED_BUSY_POLL = BIT(NAPI_STATE_THREADED_BUSY_POLL), }; enum gro_result { diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 48eb49aa03d4..048c8de1a130 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -80,6 +80,7 @@ enum netdev_qstats_scope { enum netdev_napi_threaded { NETDEV_NAPI_THREADED_DISABLED, NETDEV_NAPI_THREADED_ENABLED, + NETDEV_NAPI_THREADED_BUSY_POLL, }; enum { diff --git a/net/core/dev.c b/net/core/dev.c index dccc1176f3c6..2c1de5fb97d9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7089,7 +7089,8 @@ static void napi_stop_kthread(struct napi_struct *napi) */ if ((val & NAPIF_STATE_SCHED_THREADED) || !(val & NAPIF_STATE_SCHED)) { - new = val & (~NAPIF_STATE_THREADED); + new = val & (~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL)); } else { msleep(20); continue; @@ -7113,6 +7114,16 @@ static void napi_stop_kthread(struct napi_struct *napi) napi->thread = NULL; } +static void napi_set_threaded_state(struct napi_struct *napi, + enum netdev_napi_threaded threaded_mode) +{ + bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED; + bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL; + + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll); +} + int napi_set_threaded(struct napi_struct *napi, enum netdev_napi_threaded threaded) { @@ -7139,7 +7150,7 @@ int napi_set_threaded(struct napi_struct *napi, } else { /* Make sure kthread is created before THREADED bit is set. */ smp_mb__before_atomic(); - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + napi_set_threaded_state(napi, threaded); } return 0; @@ -7531,7 +7542,9 @@ void napi_disable_locked(struct napi_struct *n) } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; - new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); + new &= ~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL | + NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); @@ -7743,7 +7756,7 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static void napi_threaded_poll_loop(struct napi_struct *napi) +static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; @@ -7772,22 +7785,47 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) } skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); + + /* When busy poll is enabled, the old packets are not flushed in + * napi_complete_done. So flush them here. + */ + if (busy_poll) + gro_flush_normal(&napi->gro, HZ >= 1000); local_bh_enable(); + /* Call cond_resched here to avoid watchdog warnings. */ + if (repoll || busy_poll) { + rcu_softirq_qs_periodic(last_qs); + cond_resched(); + } + if (!repoll) break; - - rcu_softirq_qs_periodic(last_qs); - cond_resched(); } } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; + bool want_busy_poll; + bool in_busy_poll; + unsigned long val; + + while (!napi_thread_wait(napi)) { + val = READ_ONCE(napi->state); + + want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL; + in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL; - while (!napi_thread_wait(napi)) - napi_threaded_poll_loop(napi); + if (unlikely(val & NAPIF_STATE_DISABLE)) + want_busy_poll = false; + + if (want_busy_poll != in_busy_poll) + assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state, + want_busy_poll); + + napi_threaded_poll_loop(napi, want_busy_poll); + } return 0; } @@ -13097,7 +13135,7 @@ static void run_backlog_napi(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); - napi_threaded_poll_loop(&sd->backlog); + napi_threaded_poll_loop(&sd->backlog, false); } static void backlog_napi_setup(unsigned int cpu) diff --git a/net/core/dev.h b/net/core/dev.h index 900880e8b5b4..4d872a79bafb 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -317,6 +317,9 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) { + if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state)) + return NETDEV_NAPI_THREADED_BUSY_POLL; + if (test_bit(NAPI_STATE_THREADED, &n->state)) return NETDEV_NAPI_THREADED_ENABLED; diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index e9a2a6f26cb7..ff20435c45d2 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -97,7 +97,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, - [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2), }; /* NETDEV_CMD_BIND_TX - do */ diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 48eb49aa03d4..048c8de1a130 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -80,6 +80,7 @@ enum netdev_qstats_scope { enum netdev_napi_threaded { NETDEV_NAPI_THREADED_DISABLED, NETDEV_NAPI_THREADED_ENABLED, + NETDEV_NAPI_THREADED_BUSY_POLL, }; enum { -- cgit v1.2.3 From b4ce5923e780d6896d4aaf19de5a27652b8bf1ea Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 5 Nov 2025 09:03:59 +0000 Subject: bpf, x86: add new map type: instructions array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On bpf(BPF_PROG_LOAD) syscall user-supplied BPF programs are translated by the verifier into "xlated" BPF programs. During this process the original instructions offsets might be adjusted and/or individual instructions might be replaced by new sets of instructions, or deleted. Add a new BPF map type which is aimed to keep track of how, for a given program, the original instructions were relocated during the verification. Also, besides keeping track of the original -> xlated mapping, make x86 JIT to build the xlated -> jitted mapping for every instruction listed in an instruction array. This is required for every future application of instruction arrays: static keys, indirect jumps and indirect calls. A map of the BPF_MAP_TYPE_INSN_ARRAY type must be created with a u32 keys and value of size 8. The values have different semantics for userspace and for BPF space. For userspace a value consists of two u32 values – xlated and jitted offsets. For BPF side the value is a real pointer to a jitted instruction. On map creation/initialization, before loading the program, each element of the map should be initialized to point to an instruction offset within the program. Before the program load such maps should be made frozen. After the program verification xlated and jitted offsets can be read via the bpf(2) syscall. If a tracked instruction is removed by the verifier, then the xlated offset is set to (u32)-1 which is considered to be too big for a valid BPF program offset. One such a map can, obviously, be used to track one and only one BPF program. If the verification process was unsuccessful, then the same map can be re-used to verify the program with a different log level. However, if the program was loaded fine, then such a map, being frozen in any case, can't be reused by other programs even after the program release. Example. Consider the following original and xlated programs: Original prog: Xlated prog: 0: r1 = 0x0 0: r1 = 0 1: *(u32 *)(r10 - 0x4) = r1 1: *(u32 *)(r10 -4) = r1 2: r2 = r10 2: r2 = r10 3: r2 += -0x4 3: r2 += -4 4: r1 = 0x0 ll 4: r1 = map[id:88] 6: call 0x1 6: r1 += 272 7: r0 = *(u32 *)(r2 +0) 8: if r0 >= 0x1 goto pc+3 9: r0 <<= 3 10: r0 += r1 11: goto pc+1 12: r0 = 0 7: r6 = r0 13: r6 = r0 8: if r6 == 0x0 goto +0x2 14: if r6 == 0x0 goto pc+4 9: call 0x76 15: r0 = 0xffffffff8d2079c0 17: r0 = *(u64 *)(r0 +0) 10: *(u64 *)(r6 + 0x0) = r0 18: *(u64 *)(r6 +0) = r0 11: r0 = 0x0 19: r0 = 0x0 12: exit 20: exit An instruction array map, containing, e.g., instructions [0,4,7,12] will be translated by the verifier to [0,4,13,20]. A map with index 5 (the middle of 16-byte instruction) or indexes greater than 12 (outside the program boundaries) would be rejected. The functionality provided by this patch will be extended in consequent patches to implement BPF Static Keys, indirect jumps, and indirect calls. Signed-off-by: Anton Protopopov Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251105090410.1250500-2-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 9 ++ include/linux/bpf.h | 15 +++ include/linux/bpf_types.h | 1 + include/linux/bpf_verifier.h | 2 + include/uapi/linux/bpf.h | 21 +++ kernel/bpf/Makefile | 2 +- kernel/bpf/bpf_insn_array.c | 286 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 22 ++++ kernel/bpf/verifier.c | 45 +++++++ tools/include/uapi/linux/bpf.h | 21 +++ 10 files changed, 423 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_insn_array.c (limited to 'tools/include/uapi/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index de5083cb1d37..91f92d65ae83 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3827,6 +3827,15 @@ out_image: jit_data->header = header; jit_data->rw_header = rw_header; } + + /* + * The bpf_prog_update_insn_ptrs function expects addrs to + * point to the first byte of the jitted instruction (unlike + * the bpf_prog_fill_jited_linfo below, which, for historical + * reasons, expects to point to the next instruction) + */ + bpf_prog_update_insn_ptrs(prog, addrs, image); + /* * ctx.prog_offset is used when CFI preambles put code *before* * the function. See emit_cfi(). For FineIBT specifically this code diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a47d67db3be5..9d41a6affcef 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3797,4 +3797,19 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * const char **linep, int *nump); struct bpf_prog *bpf_prog_find_from_stack(void); +int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog); +int bpf_insn_array_ready(struct bpf_map *map); +void bpf_insn_array_release(struct bpf_map *map); +void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len); +void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len); + +#ifdef CONFIG_BPF_SYSCALL +void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image); +#else +static inline void +bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) +{ +} +#endif + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa78f49d4a9a..b13de31e163f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -133,6 +133,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c6eb68b6389c..6b820d8d77af 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -754,8 +754,10 @@ struct bpf_verifier_env { struct list_head free_list; /* list of struct bpf_verifier_state_list */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ + struct bpf_map *insn_array_maps[MAX_USED_MAPS]; /* array of INSN_ARRAY map's to be relocated */ u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ + u32 insn_array_map_cnt; /* number of used maps of type BPF_MAP_TYPE_INSN_ARRAY */ u32 id_gen; /* used to generate unique reg IDs */ u32 hidden_subprog_cnt; /* number of hidden subprogs */ int exception_callback_subprog; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d73f165394d..f5713f59ac10 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1026,6 +1026,7 @@ enum bpf_map_type { BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, + BPF_MAP_TYPE_INSN_ARRAY, __MAX_BPF_MAP_TYPE }; @@ -7649,4 +7650,24 @@ enum bpf_kfunc_flags { BPF_F_PAD_ZEROS = (1ULL << 0), }; +/* + * Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type. + * + * Before the map is used the orig_off field should point to an + * instruction inside the program being loaded. The other fields + * must be set to 0. + * + * After the program is loaded, the xlated_off will be adjusted + * by the verifier to point to the index of the original instruction + * in the xlated program. If the instruction is deleted, it will + * be set to (u32)-1. The jitted_off will be set to the corresponding + * offset in the jitted image of the program. + */ +struct bpf_insn_array_value { + __u32 orig_off; + __u32 xlated_off; + __u32 jitted_off; + __u32 :32; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 7fd0badfacb1..232cbc97434d 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,7 +9,7 @@ CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c new file mode 100644 index 000000000000..2053fda377bb --- /dev/null +++ b/kernel/bpf/bpf_insn_array.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Isovalent */ + +#include + +struct bpf_insn_array { + struct bpf_map map; + atomic_t used; + long *ips; + DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values); +}; + +#define cast_insn_array(MAP_PTR) \ + container_of((MAP_PTR), struct bpf_insn_array, map) + +#define INSN_DELETED ((u32)-1) + +static inline u64 insn_array_alloc_size(u32 max_entries) +{ + const u64 base_size = sizeof(struct bpf_insn_array); + const u64 entry_size = sizeof(struct bpf_insn_array_value); + + return base_size + max_entries * (entry_size + sizeof(long)); +} + +static int insn_array_alloc_check(union bpf_attr *attr) +{ + u32 value_size = sizeof(struct bpf_insn_array_value); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != value_size || attr->map_flags != 0) + return -EINVAL; + + return 0; +} + +static void insn_array_free(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + bpf_map_area_free(insn_array); +} + +static struct bpf_map *insn_array_alloc(union bpf_attr *attr) +{ + u64 size = insn_array_alloc_size(attr->max_entries); + struct bpf_insn_array *insn_array; + + insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE); + if (!insn_array) + return ERR_PTR(-ENOMEM); + + /* ips are allocated right after the insn_array->values[] array */ + insn_array->ips = (void *)&insn_array->values[attr->max_entries]; + + bpf_map_init_from_attr(&insn_array->map, attr); + + return &insn_array->map; +} + +static void *insn_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= insn_array->map.max_entries)) + return NULL; + + return &insn_array->values[index]; +} + +static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + u32 index = *(u32 *)key; + struct bpf_insn_array_value val = {}; + + if (unlikely(index >= insn_array->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags & BPF_NOEXIST)) + return -EEXIST; + + copy_map_value(map, &val, value); + if (val.jitted_off || val.xlated_off) + return -EINVAL; + + insn_array->values[index].orig_off = val.orig_off; + + return 0; +} + +static long insn_array_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +static int insn_array_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + if (!btf_type_is_i32(key_type)) + return -EINVAL; + + if (!btf_type_is_i64(value_type)) + return -EINVAL; + + return 0; +} + +static u64 insn_array_mem_usage(const struct bpf_map *map) +{ + return insn_array_alloc_size(map->max_entries); +} + +BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array) + +const struct bpf_map_ops insn_array_map_ops = { + .map_alloc_check = insn_array_alloc_check, + .map_alloc = insn_array_alloc, + .map_free = insn_array_free, + .map_get_next_key = bpf_array_get_next_key, + .map_lookup_elem = insn_array_lookup_elem, + .map_update_elem = insn_array_update_elem, + .map_delete_elem = insn_array_delete_elem, + .map_check_btf = insn_array_check_btf, + .map_mem_usage = insn_array_mem_usage, + .map_btf_id = &insn_array_btf_ids[0], +}; + +static inline bool is_frozen(struct bpf_map *map) +{ + guard(mutex)(&map->freeze_mutex); + + return map->frozen; +} + +static bool is_insn_array(const struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_INSN_ARRAY; +} + +static inline bool valid_offsets(const struct bpf_insn_array *insn_array, + const struct bpf_prog *prog) +{ + u32 off; + int i; + + for (i = 0; i < insn_array->map.max_entries; i++) { + off = insn_array->values[i].orig_off; + + if (off >= prog->len) + return false; + + if (off > 0) { + if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM)) + return false; + } + } + + return true; +} + +int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + struct bpf_insn_array_value *values = insn_array->values; + int i; + + if (!is_frozen(map)) + return -EINVAL; + + if (!valid_offsets(insn_array, prog)) + return -EINVAL; + + /* + * There can be only one program using the map + */ + if (atomic_xchg(&insn_array->used, 1)) + return -EBUSY; + + /* + * Reset all the map indexes to the original values. This is needed, + * e.g., when a replay of verification with different log level should + * be performed. + */ + for (i = 0; i < map->max_entries; i++) + values[i].xlated_off = values[i].orig_off; + + return 0; +} + +int bpf_insn_array_ready(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (!insn_array->ips[i]) + return -EFAULT; + } + + return 0; +} + +void bpf_insn_array_release(struct bpf_map *map) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + + atomic_set(&insn_array->used, 0); +} + +void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + if (len <= 1) + return; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off <= off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + insn_array->values[i].xlated_off += len - 1; + } +} + +void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len) +{ + struct bpf_insn_array *insn_array = cast_insn_array(map); + int i; + + for (i = 0; i < map->max_entries; i++) { + if (insn_array->values[i].xlated_off < off) + continue; + if (insn_array->values[i].xlated_off == INSN_DELETED) + continue; + if (insn_array->values[i].xlated_off < off + len) + insn_array->values[i].xlated_off = INSN_DELETED; + else + insn_array->values[i].xlated_off -= len; + } +} + +/* + * This function is called by JITs. The image is the real program + * image, the offsets array set up the xlated -> jitted mapping. + * The offsets[xlated] offset should point to the beginning of + * the jitted instruction. + */ +void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) +{ + struct bpf_insn_array *insn_array; + struct bpf_map *map; + u32 xlated_off; + int i, j; + + if (!offsets || !image) + return; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (!is_insn_array(map)) + continue; + + insn_array = cast_insn_array(map); + for (j = 0; j < map->max_entries; j++) { + xlated_off = insn_array->values[j].xlated_off; + if (xlated_off == INSN_DELETED) + continue; + if (xlated_off < prog->aux->subprog_start) + continue; + xlated_off -= prog->aux->subprog_start; + if (xlated_off >= prog->len) + continue; + + insn_array->values[j].jitted_off = offsets[xlated_off]; + insn_array->ips[j] = (long)(image + offsets[xlated_off]); + } + } +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a129746bd6c..f62d61b6730a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1493,6 +1493,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: case BPF_MAP_TYPE_ARENA: + case BPF_MAP_TYPE_INSN_ARRAY: if (!bpf_token_capable(token, CAP_BPF)) goto put_token; break; @@ -2853,6 +2854,23 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr return err; } +static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) +{ + int err; + int i; + + for (i = 0; i < prog->aux->used_map_cnt; i++) { + if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY) + continue; + + err = bpf_insn_array_ready(prog->aux->used_maps[i]); + if (err) + return err; + } + + return 0; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id @@ -3082,6 +3100,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_used_maps; + err = bpf_prog_mark_insn_arrays_ready(prog); + if (err < 0) + goto free_used_maps; + err = bpf_prog_alloc_id(prog); if (err) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e4928846e763..dfe5741812b9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10086,6 +10086,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_INSN_ARRAY: + goto error; default: break; } @@ -20582,6 +20584,15 @@ static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map) env->used_maps[env->used_map_cnt++] = map; + if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { + err = bpf_insn_array_init(map, env->prog); + if (err) { + verbose(env, "Failed to properly initialize insn array\n"); + return err; + } + env->insn_array_maps[env->insn_array_map_cnt++] = map; + } + return env->used_map_cnt - 1; } @@ -20828,6 +20839,33 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } +static void release_insn_arrays(struct bpf_verifier_env *env) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_release(env->insn_array_maps[i]); +} + +static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust(env->insn_array_maps[i], off, len); +} + +static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len); +} + static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; @@ -20869,6 +20907,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of } adjust_insn_aux_data(env, new_prog, off, len); adjust_subprog_starts(env, off, len); + adjust_insn_arrays(env, off, len); adjust_poke_descs(new_prog, off, len); return new_prog; } @@ -21052,6 +21091,8 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) if (err) return err; + adjust_insn_arrays_after_remove(env, off, cnt); + memmove(aux_data + off, aux_data + off + cnt, sizeof(*aux_data) * (orig_prog_len - off - cnt)); @@ -21695,6 +21736,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; func[i]->aux->arena = prog->aux->arena; + func[i]->aux->used_maps = env->used_maps; + func[i]->aux->used_map_cnt = env->used_map_cnt; num_exentries = 0; insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { @@ -24871,6 +24914,8 @@ skip_full_check: adjust_btf_func(env); err_release_maps: + if (ret) + release_insn_arrays(env); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_used_maps() will release them. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1d73f165394d..f5713f59ac10 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1026,6 +1026,7 @@ enum bpf_map_type { BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, + BPF_MAP_TYPE_INSN_ARRAY, __MAX_BPF_MAP_TYPE }; @@ -7649,4 +7650,24 @@ enum bpf_kfunc_flags { BPF_F_PAD_ZEROS = (1ULL << 0), }; +/* + * Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type. + * + * Before the map is used the orig_off field should point to an + * instruction inside the program being loaded. The other fields + * must be set to 0. + * + * After the program is loaded, the xlated_off will be adjusted + * by the verifier to point to the index of the original instruction + * in the xlated program. If the instruction is deleted, it will + * be set to (u32)-1. The jitted_off will be set to the corresponding + * offset in the jitted image of the program. + */ +struct bpf_insn_array_value { + __u32 orig_off; + __u32 xlated_off; + __u32 jitted_off; + __u32 :32; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 80cdf208117a36de82a30d210b3b8df0f193e75b Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 11 Nov 2025 11:37:57 +0000 Subject: tools headers UAPI: Sync linux/perf_event.h with the kernel sources To pickup config4 changes. Tested-by: Leo Yan Reviewed-by: Ian Rogers Signed-off-by: James Clark Acked-by: Peter Zijlstra (Intel) Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 78a362b80027..0d0ed85ad8cb 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -382,6 +382,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */ #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */ #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */ +#define PERF_ATTR_SIZE_VER9 144 /* add: config4 */ /* * 'struct perf_event_attr' contains various attributes that define @@ -543,6 +544,7 @@ struct perf_event_attr { __u64 sig_data; __u64 config3; /* extension of config2 */ + __u64 config4; /* extension of config3 */ }; /* -- cgit v1.2.3 From 68e83f3472667aac18d577587102f4bf77d0bd06 Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Thu, 20 Nov 2025 17:44:27 +0000 Subject: tools: ynl-gen: add regeneration comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a comment on regeneration to the generated files. The comment is placed after the YNL-GEN line[1], as to not interfere with ynl-regen.sh's detection logic. [1] and after the optional YNL-ARG line. Link: https://lore.kernel.org/r/aR5m174O7pklKrMR@zx2c4.com/ Suggested-by: Jason A. Donenfeld Signed-off-by: Asbjørn Sloth Tønnesen Acked-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251120174429.390574-3-ast@fiberby.net Signed-off-by: Jakub Kicinski --- drivers/android/binder_netlink.c | 1 + drivers/android/binder_netlink.h | 1 + drivers/dpll/dpll_nl.c | 1 + drivers/dpll/dpll_nl.h | 1 + drivers/net/ovpn/netlink-gen.c | 1 + drivers/net/ovpn/netlink-gen.h | 1 + drivers/net/team/team_nl.c | 1 + drivers/net/team/team_nl.h | 1 + fs/lockd/netlink.c | 1 + fs/lockd/netlink.h | 1 + fs/nfsd/netlink.c | 1 + fs/nfsd/netlink.h | 1 + include/uapi/linux/android/binder_netlink.h | 1 + include/uapi/linux/dpll.h | 1 + include/uapi/linux/ethtool_netlink_generated.h | 1 + include/uapi/linux/fou.h | 1 + include/uapi/linux/handshake.h | 1 + include/uapi/linux/if_team.h | 1 + include/uapi/linux/lockd_netlink.h | 1 + include/uapi/linux/mptcp_pm.h | 1 + include/uapi/linux/net_shaper.h | 1 + include/uapi/linux/netdev.h | 1 + include/uapi/linux/nfsd_netlink.h | 1 + include/uapi/linux/ovpn.h | 1 + include/uapi/linux/psp.h | 1 + net/core/netdev-genl-gen.c | 1 + net/core/netdev-genl-gen.h | 1 + net/devlink/netlink_gen.c | 1 + net/devlink/netlink_gen.h | 1 + net/handshake/genl.c | 1 + net/handshake/genl.h | 1 + net/ipv4/fou_nl.c | 1 + net/ipv4/fou_nl.h | 1 + net/mptcp/mptcp_pm_gen.c | 1 + net/mptcp/mptcp_pm_gen.h | 1 + net/psp/psp-nl-gen.c | 1 + net/psp/psp-nl-gen.h | 1 + net/shaper/shaper_nl_gen.c | 1 + net/shaper/shaper_nl_gen.h | 1 + tools/include/uapi/linux/netdev.h | 1 + tools/net/ynl/pyynl/ynl_gen_c.py | 1 + 41 files changed, 41 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/drivers/android/binder_netlink.c b/drivers/android/binder_netlink.c index d05397a50ca6..81e8432b5904 100644 --- a/drivers/android/binder_netlink.c +++ b/drivers/android/binder_netlink.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/binder.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/drivers/android/binder_netlink.h b/drivers/android/binder_netlink.h index 882c7a6b537e..57399942a5e3 100644 --- a/drivers/android/binder_netlink.h +++ b/drivers/android/binder_netlink.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/binder.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_BINDER_GEN_H #define _LINUX_BINDER_GEN_H diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c index 3c6d570babf8..36d11ff195df 100644 --- a/drivers/dpll/dpll_nl.c +++ b/drivers/dpll/dpll_nl.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/dpll.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/drivers/dpll/dpll_nl.h b/drivers/dpll/dpll_nl.h index 3da10cfe9a6e..7419679b6977 100644 --- a/drivers/dpll/dpll_nl.h +++ b/drivers/dpll/dpll_nl.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/dpll.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_DPLL_GEN_H #define _LINUX_DPLL_GEN_H diff --git a/drivers/net/ovpn/netlink-gen.c b/drivers/net/ovpn/netlink-gen.c index 14298188c5f1..ecbe9dcf4f7d 100644 --- a/drivers/net/ovpn/netlink-gen.c +++ b/drivers/net/ovpn/netlink-gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/ovpn.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/drivers/net/ovpn/netlink-gen.h b/drivers/net/ovpn/netlink-gen.h index 220b5b2fdd4f..b2301580770f 100644 --- a/drivers/net/ovpn/netlink-gen.h +++ b/drivers/net/ovpn/netlink-gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/ovpn.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_OVPN_GEN_H #define _LINUX_OVPN_GEN_H diff --git a/drivers/net/team/team_nl.c b/drivers/net/team/team_nl.c index 208424ab78f5..6db21725f9cc 100644 --- a/drivers/net/team/team_nl.c +++ b/drivers/net/team/team_nl.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/team.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/drivers/net/team/team_nl.h b/drivers/net/team/team_nl.h index c9ec1b22ac4d..74816b193475 100644 --- a/drivers/net/team/team_nl.h +++ b/drivers/net/team/team_nl.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/team.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_TEAM_GEN_H #define _LINUX_TEAM_GEN_H diff --git a/fs/lockd/netlink.c b/fs/lockd/netlink.c index 6e00b02cad90..880c42b4f8c3 100644 --- a/fs/lockd/netlink.c +++ b/fs/lockd/netlink.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/lockd.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/fs/lockd/netlink.h b/fs/lockd/netlink.h index 1920543a7955..d8408f077dd8 100644 --- a/fs/lockd/netlink.h +++ b/fs/lockd/netlink.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/lockd.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_LOCKD_GEN_H #define _LINUX_LOCKD_GEN_H diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c index ca54aa583530..ac51a44e1065 100644 --- a/fs/nfsd/netlink.c +++ b/fs/nfsd/netlink.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/nfsd.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h index 8eb903f24c41..478117ff6b8c 100644 --- a/fs/nfsd/netlink.h +++ b/fs/nfsd/netlink.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/nfsd.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_NFSD_GEN_H #define _LINUX_NFSD_GEN_H diff --git a/include/uapi/linux/android/binder_netlink.h b/include/uapi/linux/android/binder_netlink.h index b218f96d6668..bf69833c9a19 100644 --- a/include/uapi/linux/android/binder_netlink.h +++ b/include/uapi/linux/android/binder_netlink.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/binder.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_ANDROID_BINDER_NETLINK_H #define _UAPI_LINUX_ANDROID_BINDER_NETLINK_H diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index 69d35570ac4f..b7ff9c44f9aa 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/dpll.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_DPLL_H #define _UAPI_LINUX_DPLL_H diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index b71b175df46d..556a0c834df5 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/ethtool.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H #define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h index b5cd3e7b3775..bb6bef74d2d1 100644 --- a/include/uapi/linux/fou.h +++ b/include/uapi/linux/fou.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/fou.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_FOU_H #define _UAPI_LINUX_FOU_H diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h index 662e7de46c54..d7e40f594888 100644 --- a/include/uapi/linux/handshake.h +++ b/include/uapi/linux/handshake.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/handshake.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_HANDSHAKE_H #define _UAPI_LINUX_HANDSHAKE_H diff --git a/include/uapi/linux/if_team.h b/include/uapi/linux/if_team.h index a5c06243a435..f4cd839ae725 100644 --- a/include/uapi/linux/if_team.h +++ b/include/uapi/linux/if_team.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/team.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_IF_TEAM_H #define _UAPI_LINUX_IF_TEAM_H diff --git a/include/uapi/linux/lockd_netlink.h b/include/uapi/linux/lockd_netlink.h index 21c65aec3bc6..2d766a0fa6ea 100644 --- a/include/uapi/linux/lockd_netlink.h +++ b/include/uapi/linux/lockd_netlink.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/lockd.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_LOCKD_NETLINK_H #define _UAPI_LINUX_LOCKD_NETLINK_H diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index bf44a5cf5b5a..c97d060ee90b 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/mptcp_pm.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_MPTCP_PM_H #define _UAPI_LINUX_MPTCP_PM_H diff --git a/include/uapi/linux/net_shaper.h b/include/uapi/linux/net_shaper.h index d8834b59f7d7..3dd22c2930d9 100644 --- a/include/uapi/linux/net_shaper.h +++ b/include/uapi/linux/net_shaper.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/net_shaper.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_NET_SHAPER_H #define _UAPI_LINUX_NET_SHAPER_H diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 048c8de1a130..e0b579a1df4f 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_NETDEV_H #define _UAPI_LINUX_NETDEV_H diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h index 887cbd12b695..e157e2009ea8 100644 --- a/include/uapi/linux/nfsd_netlink.h +++ b/include/uapi/linux/nfsd_netlink.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/nfsd.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_NFSD_NETLINK_H #define _UAPI_LINUX_NFSD_NETLINK_H diff --git a/include/uapi/linux/ovpn.h b/include/uapi/linux/ovpn.h index 680d1522dc87..959b41def61f 100644 --- a/include/uapi/linux/ovpn.h +++ b/include/uapi/linux/ovpn.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/ovpn.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_OVPN_H #define _UAPI_LINUX_OVPN_H diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index d8449c043ba1..a3a336488dc3 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/psp.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_PSP_H #define _UAPI_LINUX_PSP_H diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index ff20435c45d2..ba673e81716f 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index cf3fad74511f..cffc08517a41 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_NETDEV_GEN_H #define _LINUX_NETDEV_GEN_H diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index 580985025f49..f4c61c2b4f22 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/devlink.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index 09cc6f264ccf..2817d53a0eba 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/devlink.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_DEVLINK_GEN_H #define _LINUX_DEVLINK_GEN_H diff --git a/net/handshake/genl.c b/net/handshake/genl.c index f55d14d7b726..870612609491 100644 --- a/net/handshake/genl.c +++ b/net/handshake/genl.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/handshake.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/net/handshake/genl.h b/net/handshake/genl.h index ae72a596f6cc..8d3e18672daf 100644 --- a/net/handshake/genl.h +++ b/net/handshake/genl.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/handshake.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_HANDSHAKE_GEN_H #define _LINUX_HANDSHAKE_GEN_H diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c index 506260b4a4dc..7a99639204b1 100644 --- a/net/ipv4/fou_nl.c +++ b/net/ipv4/fou_nl.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/fou.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h index 63a6c4ed803d..438342dc8507 100644 --- a/net/ipv4/fou_nl.h +++ b/net/ipv4/fou_nl.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/fou.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_FOU_GEN_H #define _LINUX_FOU_GEN_H diff --git a/net/mptcp/mptcp_pm_gen.c b/net/mptcp/mptcp_pm_gen.c index dcffd847af33..c180930a8e0e 100644 --- a/net/mptcp/mptcp_pm_gen.c +++ b/net/mptcp/mptcp_pm_gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/mptcp_pm.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/net/mptcp/mptcp_pm_gen.h b/net/mptcp/mptcp_pm_gen.h index e24258f6f819..b9280bcb43f5 100644 --- a/net/mptcp/mptcp_pm_gen.h +++ b/net/mptcp/mptcp_pm_gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/mptcp_pm.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_MPTCP_PM_GEN_H #define _LINUX_MPTCP_PM_GEN_H diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 73f8b06d66f0..22a48d0fa378 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/psp.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index 5bc3b5d5a53e..599c5f1c82f2 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/psp.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_PSP_GEN_H #define _LINUX_PSP_GEN_H diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c index 204c8ae8c7b1..e8cccc4c1180 100644 --- a/net/shaper/shaper_nl_gen.c +++ b/net/shaper/shaper_nl_gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/net_shaper.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include #include diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h index cb7f9026fc23..ec41c90431a4 100644 --- a/net/shaper/shaper_nl_gen.h +++ b/net/shaper/shaper_nl_gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/net_shaper.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_NET_SHAPER_GEN_H #define _LINUX_NET_SHAPER_GEN_H diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 048c8de1a130..e0b579a1df4f 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_NETDEV_H #define _UAPI_LINUX_NETDEV_H diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 0dd589e502cb..b517d0c605ad 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -3442,6 +3442,7 @@ def main(): if args.fn_prefix: line += f' --function-prefix {args.fn_prefix}' cw.p(f'/* YNL-ARG{line} */') + cw.p('/* To regenerate run: tools/net/ynl/ynl-regen.sh */') cw.nl() if args.mode == 'uapi': -- cgit v1.2.3 From 22b0ceee1c48178ed3a69a17ea36150ab1f1eb9a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 20 Nov 2025 15:47:59 -0800 Subject: tools headers UAPI: Sync linux/perf_event.h for deferred callchains It needs to sync with the kernel to support user space changes for the deferred callchains. Reviewed-by: Ian Rogers Signed-off-by: Namhyung Kim --- tools/include/uapi/linux/perf_event.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 0d0ed85ad8cb..c44a8fb3e418 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -464,7 +464,9 @@ struct perf_event_attr { inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ - __reserved_1 : 26; + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ + __reserved_1 : 24; union { __u32 wakeup_events; /* wake up every n events */ @@ -1241,6 +1243,22 @@ enum perf_event_type { */ PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + /* + * This user callchain capture was deferred until shortly before + * returning to user space. Previous samples would have kernel + * callchains only and they need to be stitched with this to make full + * callchains. + * + * struct { + * struct perf_event_header header; + * u64 cookie; + * u64 nr; + * u64 ips[nr]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CALLCHAIN_DEFERRED = 22, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -1271,6 +1289,7 @@ enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, PERF_CONTEXT_GUEST = (__u64)-2048, PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, -- cgit v1.2.3 From b5709f6d26d65f6bb9711f4b5f98469fd507cb5b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:44 -0800 Subject: bpf: Support associating BPF program with struct_ops Add a new BPF command BPF_PROG_ASSOC_STRUCT_OPS to allow associating a BPF program with a struct_ops map. This command takes a file descriptor of a struct_ops map and a BPF program and set prog->aux->st_ops_assoc to the kdata of the struct_ops map. The command does not accept a struct_ops program nor a non-struct_ops map. Programs of a struct_ops map is automatically associated with the map during map update. If a program is shared between two struct_ops maps, prog->aux->st_ops_assoc will be poisoned to indicate that the associated struct_ops is ambiguous. The pointer, once poisoned, cannot be reset since we have lost track of associated struct_ops. For other program types, the associated struct_ops map, once set, cannot be changed later. This restriction may be lifted in the future if there is a use case. A kernel helper bpf_prog_get_assoc_struct_ops() can be used to retrieve the associated struct_ops pointer. The returned pointer, if not NULL, is guaranteed to be valid and point to a fully updated struct_ops struct. For struct_ops program reused in multiple struct_ops map, the return will be NULL. prog->aux->st_ops_assoc is protected by bumping the refcount for non-struct_ops programs and RCU for struct_ops programs. Since it would be inefficient to track programs associated with a struct_ops map, every non-struct_ops program will bump the refcount of the map to make sure st_ops_assoc stays valid. For a struct_ops program, it is protected by RCU as map_free will wait for an RCU grace period before disassociating the program with the map. The helper must be called in BPF program context or RCU read-side critical section. struct_ops implementers should note that the struct_ops returned may not be initialized nor attached yet. The struct_ops implementer will be responsible for tracking and checking the state of the associated struct_ops map if the use case expects an initialized or attached struct_ops. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20251203233748.668365-3-ameryhung@gmail.com --- include/linux/bpf.h | 16 ++++++++ include/uapi/linux/bpf.h | 17 ++++++++ kernel/bpf/bpf_struct_ops.c | 88 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 3 ++ kernel/bpf/syscall.c | 46 ++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 17 ++++++++ 6 files changed, 187 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6498be4c44f8..28d8d6b7bb1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1739,6 +1739,8 @@ struct bpf_prog_aux { struct rcu_head rcu; }; struct bpf_stream stream[2]; + struct mutex st_ops_assoc_mutex; + struct bpf_map __rcu *st_ops_assoc; }; struct bpf_prog { @@ -2041,6 +2043,9 @@ static inline void bpf_module_put(const void *data, struct module *owner) module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); +int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map); +void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog); +void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux); u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET @@ -2088,6 +2093,17 @@ static inline int bpf_struct_ops_link_create(union bpf_attr *attr) { return -EOPNOTSUPP; } +static inline int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) +{ + return -EOPNOTSUPP; +} +static inline void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) +{ +} +static inline void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) +{ + return NULL; +} static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f8d8513eda27..84ced3ed2d21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -918,6 +918,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -974,6 +984,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1894,6 +1905,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 278490683d28..c43346cb3d76 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -533,6 +533,17 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) } } +static void bpf_struct_ops_map_dissoc_progs(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + break; + bpf_prog_disassoc_struct_ops(st_map->links[i]->prog); + } +} + static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map) { int i; @@ -801,6 +812,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } + /* Poison pointer on error instead of return for backward compatibility */ + bpf_prog_assoc_struct_ops(prog, &st_map->map); + link = kzalloc(sizeof(*link), GFP_USER); if (!link) { bpf_prog_put(prog); @@ -980,6 +994,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) if (btf_is_module(st_map->btf)) module_put(st_map->st_ops_desc->st_ops->owner); + bpf_struct_ops_map_dissoc_progs(st_map); + bpf_struct_ops_map_del_ksyms(st_map); /* The struct_ops's function may switch to another struct_ops. @@ -1396,6 +1412,78 @@ err_out: return err; } +int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_map *st_ops_assoc; + + guard(mutex)(&prog->aux->st_ops_assoc_mutex); + + st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc, + lockdep_is_held(&prog->aux->st_ops_assoc_mutex)); + if (st_ops_assoc && st_ops_assoc == map) + return 0; + + if (st_ops_assoc) { + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + return -EBUSY; + + rcu_assign_pointer(prog->aux->st_ops_assoc, BPF_PTR_POISON); + } else { + /* + * struct_ops map does not track associated non-struct_ops programs. + * Bump the refcount to make sure st_ops_assoc is always valid. + */ + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + bpf_map_inc(map); + + rcu_assign_pointer(prog->aux->st_ops_assoc, map); + } + + return 0; +} + +void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) +{ + struct bpf_map *st_ops_assoc; + + guard(mutex)(&prog->aux->st_ops_assoc_mutex); + + st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc, + lockdep_is_held(&prog->aux->st_ops_assoc_mutex)); + if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON) + return; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + bpf_map_put(st_ops_assoc); + + RCU_INIT_POINTER(prog->aux->st_ops_assoc, NULL); +} + +/* + * Get a reference to the struct_ops struct (i.e., kdata) associated with a + * program. Should only be called in BPF program context (e.g., in a kfunc). + * + * If the returned pointer is not NULL, it must points to a valid struct_ops. + * The struct_ops map is not guaranteed to be initialized nor attached. + * Kernel struct_ops implementers are responsible for tracking and checking + * the state of the struct_ops if the use case requires an initialized or + * attached struct_ops. + */ +void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) +{ + struct bpf_struct_ops_map *st_map; + struct bpf_map *st_ops_assoc; + + st_ops_assoc = rcu_dereference_check(aux->st_ops_assoc, bpf_rcu_lock_held()); + if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON) + return NULL; + + st_map = (struct bpf_struct_ops_map *)st_ops_assoc; + + return &st_map->kvalue.data; +} +EXPORT_SYMBOL_GPL(bpf_prog_get_assoc_struct_ops); + void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c8ae6ab31651..67226145a4db 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); + mutex_init(&fp->aux->st_ops_assoc_mutex); #ifdef CONFIG_BPF_SYSCALL bpf_prog_stream_init(fp); @@ -286,6 +287,7 @@ void __bpf_prog_free(struct bpf_prog *fp) if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); + mutex_destroy(&fp->aux->st_ops_assoc_mutex); kfree(fp->aux->poke_tab); kfree(fp->aux); } @@ -2896,6 +2898,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); + bpf_prog_disassoc_struct_ops(aux->prog); if (bpf_prog_is_dev_bound(aux)) bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6589acc89ef8..3080cc48bfc3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6122,6 +6122,49 @@ static int prog_stream_read(union bpf_attr *attr) return ret; } +#define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd + +static int prog_assoc_struct_ops(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + int ret; + + if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS)) + return -EINVAL; + + if (attr->prog_assoc_struct_ops.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto put_prog; + } + + map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto put_prog; + } + + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto put_map; + } + + ret = bpf_prog_assoc_struct_ops(prog, map); + +put_map: + bpf_map_put(map); +put_prog: + bpf_prog_put(prog); + return ret; +} + static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -6261,6 +6304,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) case BPF_PROG_STREAM_READ_BY_FD: err = prog_stream_read(&attr); break; + case BPF_PROG_ASSOC_STRUCT_OPS: + err = prog_assoc_struct_ops(&attr); + break; default: err = -EINVAL; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index be7d8e060e10..6b92b0847ec2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -918,6 +918,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -974,6 +984,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1894,6 +1905,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 34524cde88c9137ef134df25ded59520c2fb307a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 22 Dec 2025 14:57:07 -0800 Subject: tools headers: Sync UAPI KVM headers with kernel sources To pick up changes from: ad9c62bd8946621e ("KVM: arm64: VM exit to userspace to handle SEA") 8e8678e740ecde2a ("KVM: s390: Add capability that forwards operation exceptions") e0c26d47def7382d ("Merge tag 'kvm-s390-next-6.19-1' of https://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD") 7a61d61396b97fd6 ("KVM: SEV: Publish supported SEV-SNP policy bits") This should be used to beautify DRM syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h Please see tools/include/uapi/README. Cc: kvm@vger.kernel.org Signed-off-by: Namhyung Kim --- tools/arch/x86/include/uapi/asm/kvm.h | 1 + tools/include/uapi/linux/kvm.h | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index d420c9c066d4..7ceff6583652 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -502,6 +502,7 @@ struct kvm_sync_regs { /* vendor-specific groups and attributes for system fd */ #define KVM_X86_GRP_SEV 1 # define KVM_X86_SEV_VMSA_FEATURES 0 +# define KVM_X86_SNP_POLICY_BITS 1 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 52f6000ab020..dddb781b0507 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -179,6 +179,7 @@ struct kvm_xen_exit { #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 #define KVM_EXIT_TDX 40 +#define KVM_EXIT_ARM_SEA 41 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -473,6 +474,14 @@ struct kvm_run { } setup_event_notify; }; } tdx; + /* KVM_EXIT_ARM_SEA */ + struct { +#define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID (1ULL << 0) + __u64 flags; + __u64 esr; + __u64 gva; + __u64 gpa; + } arm_sea; /* Fix the size of the union. */ char padding[256]; }; @@ -963,6 +972,8 @@ struct kvm_enable_cap { #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 #define KVM_CAP_GUEST_MEMFD_FLAGS 244 +#define KVM_CAP_ARM_SEA_TO_USER 245 +#define KVM_CAP_S390_USER_OPEREXEC 246 struct kvm_irq_routing_irqchip { __u32 irqchip; -- cgit v1.2.3 From 2b421662c7887a0649fe409155a1f101562d0fa9 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:16 +0800 Subject: bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags and check them for following APIs: * 'map_lookup_elem()' * 'map_update_elem()' * 'generic_map_lookup_batch()' * 'generic_map_update_batch()' And, get the correct value size for these APIs. Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 23 ++++++++++++++++++++++- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 31 +++++++++++++++++-------------- tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 43 insertions(+), 15 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a63e47d2109c..108bab1bda9d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3915,14 +3915,35 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) } #endif +static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) +{ + return false; +} + static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) { - if (flags & ~allowed_flags) + u32 cpu; + + if ((u32)flags & ~allowed_flags) return -EINVAL; if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; + if (!(flags & BPF_F_CPU) && flags >> 32) + return -EINVAL; + + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) { + if (!bpf_map_supports_cpu_flags(map->map_type)) + return -EINVAL; + if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) + return -EINVAL; + + cpu = flags >> 32; + if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus()) + return -ERANGE; + } + return 0; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 84ced3ed2d21..2a2ade4be60f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6dd2ad2f9e81..e8cfe9d67e64 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -133,12 +133,14 @@ bool bpf_map_write_active(const struct bpf_map *map) return atomic64_read(&map->writecnt) != 0; } -static u32 bpf_map_value_size(const struct bpf_map *map) -{ - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) +static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) +{ + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) + return map->value_size; + else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); @@ -1729,7 +1731,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; @@ -1737,7 +1739,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(key)) return PTR_ERR(key); - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); @@ -1804,7 +1806,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); value = kvmemdup_bpfptr(uvalue, value_size); if (IS_ERR(value)) { err = PTR_ERR(value); @@ -2000,11 +2002,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, + BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2059,11 +2062,11 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2185,7 +2188,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, 0); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6b92b0847ec2..b816bc53d2e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ -- cgit v1.2.3 From 2e4b28c48f88ce9e263957b1d944cf5349952f88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 11 Jan 2026 16:53:48 +0100 Subject: treewide: Update email address In a vain attempt to consolidate the email zoo switch everything to the kernel.org account. Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- .mailmap | 1 + CREDITS | 2 +- .../ABI/stable/sysfs-kernel-time-aux-clocks | 2 +- Documentation/arch/x86/topology.rst | 2 +- Documentation/core-api/cpu_hotplug.rst | 2 +- Documentation/core-api/genericirq.rst | 2 +- Documentation/core-api/librs.rst | 2 +- .../devicetree/bindings/timer/mrvl,mmp-timer.yaml | 2 +- Documentation/driver-api/mtdnand.rst | 4 +-- .../translations/zh_CN/core-api/cpu_hotplug.rst | 2 +- .../translations/zh_CN/core-api/genericirq.rst | 2 +- MAINTAINERS | 36 +++++++++++----------- arch/sh/kernel/perf_event.c | 2 +- arch/sparc/kernel/perf_event.c | 2 +- arch/x86/events/core.c | 2 +- arch/x86/events/perf_event.h | 2 +- arch/x86/kernel/x86_init.c | 2 +- arch/x86/mm/pti.c | 2 +- drivers/mtd/nand/ecc-sw-hamming.c | 2 +- drivers/mtd/nand/raw/diskonchip.c | 2 +- drivers/mtd/nand/raw/nand_base.c | 4 +-- drivers/mtd/nand/raw/nand_bbt.c | 2 +- drivers/mtd/nand/raw/nand_ids.c | 2 +- drivers/mtd/nand/raw/nand_jedec.c | 2 +- drivers/mtd/nand/raw/nand_legacy.c | 2 +- drivers/mtd/nand/raw/nand_onfi.c | 2 +- drivers/mtd/nand/raw/ndfc.c | 2 +- drivers/uio/uio.c | 2 +- fs/jffs2/wbuf.c | 4 +-- include/linux/hrtimer.h | 2 +- include/linux/ktime.h | 2 +- include/linux/mtd/jedec.h | 2 +- include/linux/mtd/nand-ecc-sw-hamming.h | 2 +- include/linux/mtd/ndfc.h | 2 +- include/linux/mtd/onfi.h | 2 +- include/linux/mtd/platnand.h | 2 +- include/linux/mtd/rawnand.h | 2 +- include/linux/perf_event.h | 2 +- include/linux/plist.h | 2 +- include/linux/rslib.h | 2 +- include/linux/uio_driver.h | 2 +- include/uapi/linux/perf_event.h | 2 +- kernel/events/callchain.c | 2 +- kernel/events/core.c | 2 +- kernel/events/ring_buffer.c | 2 +- kernel/irq/debugfs.c | 2 +- kernel/irq/matrix.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/pelt.c | 2 +- kernel/time/clockevents.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 2 +- kernel/time/tick-oneshot.c | 2 +- kernel/time/tick-sched.c | 2 +- lib/debugobjects.c | 2 +- lib/plist.c | 2 +- lib/reed_solomon/decode_rs.c | 2 +- lib/reed_solomon/encode_rs.c | 2 +- lib/reed_solomon/reed_solomon.c | 2 +- scripts/spdxcheck.py | 2 +- tools/include/uapi/linux/perf_event.h | 2 +- tools/perf/builtin-list.c | 2 +- 63 files changed, 83 insertions(+), 82 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/.mailmap b/.mailmap index b23e0853d636..fa018b5bd533 100644 --- a/.mailmap +++ b/.mailmap @@ -801,6 +801,7 @@ Tanzir Hasan Tejun Heo Tomeu Vizoso Thomas Graf +Thomas Gleixner Thomas Körper Thomas Pedersen Thorsten Blum diff --git a/CREDITS b/CREDITS index ca75f110edb6..383809bc4b7a 100644 --- a/CREDITS +++ b/CREDITS @@ -1398,7 +1398,7 @@ D: SRM environment driver (for Alpha systems) P: 1024D/8399E1BB 250D 3BCF 7127 0D8C A444 A961 1DBD 5E75 8399 E1BB N: Thomas Gleixner -E: tglx@linutronix.de +E: tglx@kernel.org D: NAND flash hardware support, JFFS2 on NAND flash N: Jérôme Glisse diff --git a/Documentation/ABI/stable/sysfs-kernel-time-aux-clocks b/Documentation/ABI/stable/sysfs-kernel-time-aux-clocks index 825508f42af6..e1a894c8dd1b 100644 --- a/Documentation/ABI/stable/sysfs-kernel-time-aux-clocks +++ b/Documentation/ABI/stable/sysfs-kernel-time-aux-clocks @@ -1,5 +1,5 @@ What: /sys/kernel/time/aux_clocks//enable Date: May 2025 -Contact: Thomas Gleixner +Contact: Thomas Gleixner Description: Controls the enablement of auxiliary clock timekeepers. diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst index 86bec8ac2c4d..f779a68875c5 100644 --- a/Documentation/arch/x86/topology.rst +++ b/Documentation/arch/x86/topology.rst @@ -17,7 +17,7 @@ with the generic one and look at this one in parallel for the x86 specifics. Needless to say, code should use the generic functions - this file is *only* here to *document* the inner workings of x86 topology. -Started by Thomas Gleixner and Borislav Petkov . +Started by Thomas Gleixner and Borislav Petkov . The main aim of the topology facilities is to present adequate interfaces to code which needs to know/query/use the structure of the running system wrt diff --git a/Documentation/core-api/cpu_hotplug.rst b/Documentation/core-api/cpu_hotplug.rst index e1b0eeabbb5e..9b4afca9fd09 100644 --- a/Documentation/core-api/cpu_hotplug.rst +++ b/Documentation/core-api/cpu_hotplug.rst @@ -8,7 +8,7 @@ CPU hotplug in the Kernel Srivatsa Vaddagiri , Ashok Raj , Joel Schopp , - Thomas Gleixner + Thomas Gleixner Introduction ============ diff --git a/Documentation/core-api/genericirq.rst b/Documentation/core-api/genericirq.rst index 582bde9bf5a9..b16d751d4b98 100644 --- a/Documentation/core-api/genericirq.rst +++ b/Documentation/core-api/genericirq.rst @@ -439,6 +439,6 @@ Credits The following people have contributed to this document: -1. Thomas Gleixner tglx@linutronix.de +1. Thomas Gleixner tglx@kernel.org 2. Ingo Molnar mingo@elte.hu diff --git a/Documentation/core-api/librs.rst b/Documentation/core-api/librs.rst index 6010f5bc5bf9..0d88893dbc03 100644 --- a/Documentation/core-api/librs.rst +++ b/Documentation/core-api/librs.rst @@ -209,4 +209,4 @@ testing. Thanks a lot. The following people have contributed to this document: -Thomas Gleixner\ tglx@linutronix.de +Thomas Gleixner\ tglx@kernel.org diff --git a/Documentation/devicetree/bindings/timer/mrvl,mmp-timer.yaml b/Documentation/devicetree/bindings/timer/mrvl,mmp-timer.yaml index fe6bc4173789..0643cfcc6bc7 100644 --- a/Documentation/devicetree/bindings/timer/mrvl,mmp-timer.yaml +++ b/Documentation/devicetree/bindings/timer/mrvl,mmp-timer.yaml @@ -8,7 +8,7 @@ title: Marvell MMP Timer maintainers: - Daniel Lezcano - - Thomas Gleixner + - Thomas Gleixner - Rob Herring properties: diff --git a/Documentation/driver-api/mtdnand.rst b/Documentation/driver-api/mtdnand.rst index ce77e024c4f1..adf03983f1ba 100644 --- a/Documentation/driver-api/mtdnand.rst +++ b/Documentation/driver-api/mtdnand.rst @@ -996,11 +996,11 @@ The following people have contributed to the NAND driver: 2. David Woodhouse\ dwmw2@infradead.org -3. Thomas Gleixner\ tglx@linutronix.de +3. Thomas Gleixner\ tglx@kernel.org A lot of users have provided bugfixes, improvements and helping hands for testing. Thanks a lot. The following people have contributed to this document: -1. Thomas Gleixner\ tglx@linutronix.de +1. Thomas Gleixner\ tglx@kernel.org diff --git a/Documentation/translations/zh_CN/core-api/cpu_hotplug.rst b/Documentation/translations/zh_CN/core-api/cpu_hotplug.rst index bc0d7ea6d834..3447fbf0e695 100644 --- a/Documentation/translations/zh_CN/core-api/cpu_hotplug.rst +++ b/Documentation/translations/zh_CN/core-api/cpu_hotplug.rst @@ -22,7 +22,7 @@ Srivatsa Vaddagiri , Ashok Raj , Joel Schopp , - Thomas Gleixner + Thomas Gleixner 简介 ==== diff --git a/Documentation/translations/zh_CN/core-api/genericirq.rst b/Documentation/translations/zh_CN/core-api/genericirq.rst index 05ccb954c18d..d2c1bd94bb97 100644 --- a/Documentation/translations/zh_CN/core-api/genericirq.rst +++ b/Documentation/translations/zh_CN/core-api/genericirq.rst @@ -404,6 +404,6 @@ kernel/irq/chip.c 感谢以下人士对本文档作出的贡献: -1. Thomas Gleixner tglx@linutronix.de +1. Thomas Gleixner tglx@kernel.org 2. Ingo Molnar mingo@elte.hu diff --git a/MAINTAINERS b/MAINTAINERS index 32b5e41d9849..ee036e0a3ef6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6175,7 +6175,7 @@ F: include/linux/clk.h CLOCKSOURCE, CLOCKEVENT DRIVERS M: Daniel Lezcano -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core @@ -6541,7 +6541,7 @@ S: Maintained F: drivers/cpufreq/virtual-cpufreq.c CPU HOTPLUG -M: Thomas Gleixner +M: Thomas Gleixner M: Peter Zijlstra L: linux-kernel@vger.kernel.org S: Maintained @@ -6968,7 +6968,7 @@ F: Documentation/scsi/dc395x.rst F: drivers/scsi/dc395x.* DEBUGOBJECTS: -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/debugobjects @@ -10371,7 +10371,7 @@ F: include/uapi/linux/fuse.h F: tools/testing/selftests/filesystems/fuse/ FUTEX SUBSYSTEM -M: Thomas Gleixner +M: Thomas Gleixner M: Ingo Molnar R: Peter Zijlstra R: Darren Hart @@ -10515,7 +10515,7 @@ F: drivers/base/arch_topology.c F: include/linux/arch_topology.h GENERIC ENTRY CODE -M: Thomas Gleixner +M: Thomas Gleixner M: Peter Zijlstra M: Andy Lutomirski L: linux-kernel@vger.kernel.org @@ -10628,7 +10628,7 @@ F: drivers/uio/uio_pci_generic.c GENERIC VDSO LIBRARY M: Andy Lutomirski -M: Thomas Gleixner +M: Thomas Gleixner M: Vincenzo Frascino L: linux-kernel@vger.kernel.org S: Maintained @@ -11241,7 +11241,7 @@ F: drivers/hid/hid-logitech-hidpp.c HIGH-RESOLUTION TIMERS, TIMER WHEEL, CLOCKEVENTS M: Anna-Maria Behnsen M: Frederic Weisbecker -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core @@ -11264,7 +11264,7 @@ R: Boqun Feng R: FUJITA Tomonori R: Frederic Weisbecker R: Lyude Paul -R: Thomas Gleixner +R: Thomas Gleixner R: Anna-Maria Behnsen R: John Stultz R: Stephen Boyd @@ -13334,7 +13334,7 @@ F: Documentation/devicetree/bindings/sound/irondevice,* F: sound/soc/codecs/sma* IRQ DOMAINS (IRQ NUMBER MAPPING LIBRARY) -M: Thomas Gleixner +M: Thomas Gleixner S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core F: Documentation/core-api/irq/irq-domain.rst @@ -13344,7 +13344,7 @@ F: kernel/irq/irqdomain.c F: kernel/irq/msi.c IRQ SUBSYSTEM -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core @@ -13357,7 +13357,7 @@ F: kernel/irq/ F: lib/group_cpus.c IRQCHIP DRIVERS -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core @@ -14451,7 +14451,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-nonmm-unstab F: lib/* LICENSES and SPDX stuff -M: Thomas Gleixner +M: Thomas Gleixner M: Greg Kroah-Hartman L: linux-spdx@vger.kernel.org S: Maintained @@ -18576,7 +18576,7 @@ NOHZ, DYNTICKS SUPPORT M: Anna-Maria Behnsen M: Frederic Weisbecker M: Ingo Molnar -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/nohz @@ -20761,7 +20761,7 @@ F: drivers/platform/x86/portwell-ec.c POSIX CLOCKS and TIMERS M: Anna-Maria Behnsen M: Frederic Weisbecker -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core @@ -26272,7 +26272,7 @@ F: drivers/net/wireless/ti/ TIMEKEEPING, CLOCKSOURCE CORE, NTP, ALARMTIMER M: John Stultz -M: Thomas Gleixner +M: Thomas Gleixner R: Stephen Boyd L: linux-kernel@vger.kernel.org S: Supported @@ -28203,7 +28203,7 @@ F: net/lapb/ F: net/x25/ X86 ARCHITECTURE (32-BIT AND 64-BIT) -M: Thomas Gleixner +M: Thomas Gleixner M: Ingo Molnar M: Borislav Petkov M: Dave Hansen @@ -28219,7 +28219,7 @@ F: tools/testing/selftests/x86 X86 CPUID DATABASE M: Borislav Petkov -M: Thomas Gleixner +M: Thomas Gleixner M: x86@kernel.org R: Ahmed S. Darwish L: x86-cpuid@lists.linux.dev @@ -28235,7 +28235,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/asm F: arch/x86/entry/ X86 HARDWARE VULNERABILITIES -M: Thomas Gleixner +M: Thomas Gleixner M: Borislav Petkov M: Peter Zijlstra M: Josh Poimboeuf diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 1d2507f22437..1fbb7d46e484 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -7,7 +7,7 @@ * Heavily based on the x86 and PowerPC implementations. * * x86: - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index cae4d33002a5..0ce4ae343531 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -6,7 +6,7 @@ * This code is based almost entirely upon the x86 perf event * code, which is: * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 0c38a31d5fc7..576baa9a52c5 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1,7 +1,7 @@ /* * Performance events x86 architecture code * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 3161ec0a3416..62963022b517 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1,7 +1,7 @@ /* * Performance events x86 architecture header * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 0a2bbd674a6d..ebefb77c37bb 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Thomas Gleixner + * Copyright (C) 2009 Linutronix GmbH, Thomas Gleixner * * For licencing details see kernel-base/COPYING */ diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index b10d4d131dce..f7546e9e8e89 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -15,7 +15,7 @@ * Signed-off-by: Michael Schwarz * * Major changes to the original code by: Dave Hansen - * Mostly rewritten by Thomas Gleixner and + * Mostly rewritten by Thomas Gleixner and * Andy Lutomirsky */ #include diff --git a/drivers/mtd/nand/ecc-sw-hamming.c b/drivers/mtd/nand/ecc-sw-hamming.c index f2d0effad9d2..bc62a71f9fdd 100644 --- a/drivers/mtd/nand/ecc-sw-hamming.c +++ b/drivers/mtd/nand/ecc-sw-hamming.c @@ -8,7 +8,7 @@ * * Completely replaces the previous ECC implementation which was written by: * Steven J. Hill (sjhill@realitydiluted.com) - * Thomas Gleixner (tglx@linutronix.de) + * Thomas Gleixner (tglx@kernel.org) * * Information on how this algorithm works and how it was developed * can be found in Documentation/driver-api/mtd/nand_ecc.rst diff --git a/drivers/mtd/nand/raw/diskonchip.c b/drivers/mtd/nand/raw/diskonchip.c index 70d6c2250f32..540b6baf8bb1 100644 --- a/drivers/mtd/nand/raw/diskonchip.c +++ b/drivers/mtd/nand/raw/diskonchip.c @@ -11,7 +11,7 @@ * Error correction code lifted from the old docecc code * Author: Fabrice Bellard (fabrice.bellard@netgem.com) * Copyright (C) 2000 Netgem S.A. - * converted to the generic Reed-Solomon library by Thomas Gleixner + * converted to the generic Reed-Solomon library by Thomas Gleixner * * Interface to generic NAND code for M-Systems DiskOnChip devices */ diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index ad6d66309597..f2322de93ab4 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -8,7 +8,7 @@ * http://www.linux-mtd.infradead.org/doc/nand.html * * Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com) - * 2002-2006 Thomas Gleixner (tglx@linutronix.de) + * 2002-2006 Thomas Gleixner (tglx@kernel.org) * * Credits: * David Woodhouse for adding multichip support @@ -6594,5 +6594,5 @@ EXPORT_SYMBOL_GPL(nand_cleanup); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steven J. Hill "); -MODULE_AUTHOR("Thomas Gleixner "); +MODULE_AUTHOR("Thomas Gleixner "); MODULE_DESCRIPTION("Generic NAND flash driver code"); diff --git a/drivers/mtd/nand/raw/nand_bbt.c b/drivers/mtd/nand/raw/nand_bbt.c index a8fba5f39f59..3050ab7e6eb6 100644 --- a/drivers/mtd/nand/raw/nand_bbt.c +++ b/drivers/mtd/nand/raw/nand_bbt.c @@ -3,7 +3,7 @@ * Overview: * Bad block table support for the NAND driver * - * Copyright © 2004 Thomas Gleixner (tglx@linutronix.de) + * Copyright © 2004 Thomas Gleixner (tglx@kernel.org) * * Description: * diff --git a/drivers/mtd/nand/raw/nand_ids.c b/drivers/mtd/nand/raw/nand_ids.c index 650351c62af6..62a8cf86d9e2 100644 --- a/drivers/mtd/nand/raw/nand_ids.c +++ b/drivers/mtd/nand/raw/nand_ids.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2002 Thomas Gleixner (tglx@linutronix.de) + * Copyright (C) 2002 Thomas Gleixner (tglx@kernel.org) */ #include diff --git a/drivers/mtd/nand/raw/nand_jedec.c b/drivers/mtd/nand/raw/nand_jedec.c index b3cc8f360529..89e6dd8ed1a8 100644 --- a/drivers/mtd/nand/raw/nand_jedec.c +++ b/drivers/mtd/nand/raw/nand_jedec.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com) - * 2002-2006 Thomas Gleixner (tglx@linutronix.de) + * 2002-2006 Thomas Gleixner (tglx@kernel.org) * * Credits: * David Woodhouse for adding multichip support diff --git a/drivers/mtd/nand/raw/nand_legacy.c b/drivers/mtd/nand/raw/nand_legacy.c index 743792edf98d..97700f80d5b8 100644 --- a/drivers/mtd/nand/raw/nand_legacy.c +++ b/drivers/mtd/nand/raw/nand_legacy.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com) - * 2002-2006 Thomas Gleixner (tglx@linutronix.de) + * 2002-2006 Thomas Gleixner (tglx@kernel.org) * * Credits: * David Woodhouse for adding multichip support diff --git a/drivers/mtd/nand/raw/nand_onfi.c b/drivers/mtd/nand/raw/nand_onfi.c index 861975e44b55..11954440e4de 100644 --- a/drivers/mtd/nand/raw/nand_onfi.c +++ b/drivers/mtd/nand/raw/nand_onfi.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com) - * 2002-2006 Thomas Gleixner (tglx@linutronix.de) + * 2002-2006 Thomas Gleixner (tglx@kernel.org) * * Credits: * David Woodhouse for adding multichip support diff --git a/drivers/mtd/nand/raw/ndfc.c b/drivers/mtd/nand/raw/ndfc.c index 13365128194d..7ad8bc04be1a 100644 --- a/drivers/mtd/nand/raw/ndfc.c +++ b/drivers/mtd/nand/raw/ndfc.c @@ -272,5 +272,5 @@ static struct platform_driver ndfc_driver = { module_platform_driver(ndfc_driver); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Thomas Gleixner "); +MODULE_AUTHOR("Thomas Gleixner "); MODULE_DESCRIPTION("OF Platform driver for NDFC"); diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index d93ed4e86a17..fa0d4e6aee16 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -3,7 +3,7 @@ * drivers/uio/uio.c * * Copyright(C) 2005, Benedikt Spranger - * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2006, Hans J. Koch * Copyright(C) 2006, Greg Kroah-Hartman * diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index bb815a002984..3ab3f0ff7ebb 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -2,10 +2,10 @@ * JFFS2 -- Journalling Flash File System, Version 2. * * Copyright © 2001-2007 Red Hat, Inc. - * Copyright © 2004 Thomas Gleixner + * Copyright © 2004 Thomas Gleixner * * Created by David Woodhouse - * Modified debugged and enhanced by Thomas Gleixner + * Modified debugged and enhanced by Thomas Gleixner * * For licensing information, see the file 'LICENCE' in this directory. * diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2cf1bf65b225..0de12f14d6a4 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -2,7 +2,7 @@ /* * hrtimers - High-resolution kernel timers * - * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 383ed9985802..f247e564602f 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -3,7 +3,7 @@ * * ktime_t - nanosecond-resolution time format. * - * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes and macros. diff --git a/include/linux/mtd/jedec.h b/include/linux/mtd/jedec.h index 56047a4e54c9..255972f3d88d 100644 --- a/include/linux/mtd/jedec.h +++ b/include/linux/mtd/jedec.h @@ -2,7 +2,7 @@ /* * Copyright © 2000-2010 David Woodhouse * Steven J. Hill - * Thomas Gleixner + * Thomas Gleixner * * Contains all JEDEC related definitions */ diff --git a/include/linux/mtd/nand-ecc-sw-hamming.h b/include/linux/mtd/nand-ecc-sw-hamming.h index c6c71894c575..2aa2f8ef68d2 100644 --- a/include/linux/mtd/nand-ecc-sw-hamming.h +++ b/include/linux/mtd/nand-ecc-sw-hamming.h @@ -2,7 +2,7 @@ /* * Copyright (C) 2000-2010 Steven J. Hill * David Woodhouse - * Thomas Gleixner + * Thomas Gleixner * * This file is the header for the NAND Hamming ECC implementation. */ diff --git a/include/linux/mtd/ndfc.h b/include/linux/mtd/ndfc.h index 98f075b86931..622891191e9c 100644 --- a/include/linux/mtd/ndfc.h +++ b/include/linux/mtd/ndfc.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2006 Thomas Gleixner + * Copyright (c) 2006 Linutronix GmbH, Thomas Gleixner * * Info: * Contains defines, datastructures for ndfc nand controller diff --git a/include/linux/mtd/onfi.h b/include/linux/mtd/onfi.h index 55ab2e4d62f9..09a5cbd8f232 100644 --- a/include/linux/mtd/onfi.h +++ b/include/linux/mtd/onfi.h @@ -2,7 +2,7 @@ /* * Copyright © 2000-2010 David Woodhouse * Steven J. Hill - * Thomas Gleixner + * Thomas Gleixner * * Contains all ONFI related definitions */ diff --git a/include/linux/mtd/platnand.h b/include/linux/mtd/platnand.h index bc11eb6b593b..2df6fba699f2 100644 --- a/include/linux/mtd/platnand.h +++ b/include/linux/mtd/platnand.h @@ -2,7 +2,7 @@ /* * Copyright © 2000-2010 David Woodhouse * Steven J. Hill - * Thomas Gleixner + * Thomas Gleixner * * Contains all platform NAND related definitions. */ diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index d30bdc3fcfd7..5c70e7bd3ed5 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -2,7 +2,7 @@ /* * Copyright © 2000-2010 David Woodhouse * Steven J. Hill - * Thomas Gleixner + * Thomas Gleixner * * Info: * Contains standard defines and IDs for NAND flash devices diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9870d768db4c..9ded2e582c60 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1,7 +1,7 @@ /* * Performance events: * - * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * diff --git a/include/linux/plist.h b/include/linux/plist.h index 8c1c8adf7fe9..16cf4355b5c1 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -8,7 +8,7 @@ * 2001-2005 (c) MontaVista Software, Inc. * Daniel Walker * - * (C) 2005 Thomas Gleixner + * (C) 2005 Linutronix GmbH, Thomas Gleixner * * Simplifications of the original code by * Oleg Nesterov diff --git a/include/linux/rslib.h b/include/linux/rslib.h index a04dacbdc8ae..a2848f6907e3 100644 --- a/include/linux/rslib.h +++ b/include/linux/rslib.h @@ -2,7 +2,7 @@ /* * Generic Reed Solomon encoder / decoder library * - * Copyright (C) 2004 Thomas Gleixner (tglx@linutronix.de) + * Copyright (C) 2004 Thomas Gleixner (tglx@kernel.org) * * RS code lifted from reed solomon library written by Phil Karn * Copyright 2002 Phil Karn, KA9Q diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 18238dc8bfd3..334641e20fb1 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -3,7 +3,7 @@ * include/linux/uio_driver.h * * Copyright(C) 2005, Benedikt Spranger - * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2006, Hans J. Koch * Copyright(C) 2006, Greg Kroah-Hartman * diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index c44a8fb3e418..72f03153dd32 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -2,7 +2,7 @@ /* * Performance events: * - * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index b9c7e00725d6..1f6589578703 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -2,7 +2,7 @@ /* * Performance events callchain code, extracted from core.c: * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. diff --git a/kernel/events/core.c b/kernel/events/core.c index dad0d3d2e85f..f5e9d30e4fa9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2,7 +2,7 @@ /* * Performance events core code: * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 20a905023736..3e7de2661417 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -2,7 +2,7 @@ /* * Performance events ring-buffer code: * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 3527defd2890..5c5ebaee35f2 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -// Copyright 2017 Thomas Gleixner +// Copyright 2017 Linutronix GmbH, Thomas Gleixner #include #include diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 8f222d1cccec..a50f2305a8dc 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2017 Thomas Gleixner +// Copyright (C) 2017 Linutronix GmbH, Thomas Gleixner #include #include diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da46c3164537..e71302282671 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -15,7 +15,7 @@ * Author: Srivatsa Vaddagiri * * Scaled math optimizations by Thomas Gleixner - * Copyright (C) 2007, Thomas Gleixner + * Copyright (C) 2007, Linutronix GmbH, Thomas Gleixner * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index fa83bbaf4f3e..897790889ba3 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -15,7 +15,7 @@ * Author: Srivatsa Vaddagiri * * Scaled math optimizations by Thomas Gleixner - * Copyright (C) 2007, Thomas Gleixner + * Copyright (C) 2007, Linutronix GmbH, Thomas Gleixner * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a59bc75ab7c5..eaae1ce9f060 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -2,7 +2,7 @@ /* * This file contains functions which manage clock event devices. * - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f8ea8c8fc895..bdb30cc5e873 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0207868c8b4d..f63c65881364 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -3,7 +3,7 @@ * This file contains functions which emulate a local clock-event * device via a broadcast event source. * - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7e33d3f2e889..d305d8521896 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -3,7 +3,7 @@ * This file contains the base functions to manage periodic tick * related events. * - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index ffee943d796d..7472597f3225 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -3,7 +3,7 @@ * This file contains functions which manage high resolution tick * related events. * - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8ddf74e705d3..2f8a7923fa27 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * diff --git a/lib/debugobjects.c b/lib/debugobjects.c index ecf8e7f978e3..89a1d6745dc2 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -2,7 +2,7 @@ /* * Generic infrastructure for lifetime debugging of objects. * - * Copyright (C) 2008, Thomas Gleixner + * Copyright (C) 2008, Linutronix GmbH, Thomas Gleixner */ #define pr_fmt(fmt) "ODEBUG: " fmt diff --git a/lib/plist.c b/lib/plist.c index ba677c31e8f3..a5bef38add43 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -10,7 +10,7 @@ * 2001-2005 (c) MontaVista Software, Inc. * Daniel Walker * - * (C) 2005 Thomas Gleixner + * (C) 2005 Linutronix GmbH, Thomas Gleixner * * Simplifications of the original code by * Oleg Nesterov diff --git a/lib/reed_solomon/decode_rs.c b/lib/reed_solomon/decode_rs.c index 805de84ae83d..ef86ee2aec58 100644 --- a/lib/reed_solomon/decode_rs.c +++ b/lib/reed_solomon/decode_rs.c @@ -5,7 +5,7 @@ * Copyright 2002, Phil Karn, KA9Q * May be used under the terms of the GNU General Public License (GPL) * - * Adaption to the kernel by Thomas Gleixner (tglx@linutronix.de) + * Adaption to the kernel by Thomas Gleixner (tglx@kernel.org) * * Generic data width independent code which is included by the wrappers. */ diff --git a/lib/reed_solomon/encode_rs.c b/lib/reed_solomon/encode_rs.c index 9112d46e869e..1d9e51dcc83d 100644 --- a/lib/reed_solomon/encode_rs.c +++ b/lib/reed_solomon/encode_rs.c @@ -5,7 +5,7 @@ * Copyright 2002, Phil Karn, KA9Q * May be used under the terms of the GNU General Public License (GPL) * - * Adaption to the kernel by Thomas Gleixner (tglx@linutronix.de) + * Adaption to the kernel by Thomas Gleixner (tglx@kernel.org) * * Generic data width independent code which is included by the wrappers. */ diff --git a/lib/reed_solomon/reed_solomon.c b/lib/reed_solomon/reed_solomon.c index bbc01bad3053..a9e2dcb6f2a7 100644 --- a/lib/reed_solomon/reed_solomon.c +++ b/lib/reed_solomon/reed_solomon.c @@ -2,7 +2,7 @@ /* * Generic Reed Solomon encoder / decoder library * - * Copyright (C) 2004 Thomas Gleixner (tglx@linutronix.de) + * Copyright (C) 2004 Thomas Gleixner (tglx@kernel.org) * * Reed Solomon code lifted from reed solomon library written by Phil Karn * Copyright 2002 Phil Karn, KA9Q diff --git a/scripts/spdxcheck.py b/scripts/spdxcheck.py index 8d608f61bf37..908029e45ca2 100755 --- a/scripts/spdxcheck.py +++ b/scripts/spdxcheck.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -# Copyright Thomas Gleixner +# Copyright Linutronix GmbH, Thomas Gleixner from argparse import ArgumentParser from ply import lex, yacc diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index c44a8fb3e418..72f03153dd32 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -2,7 +2,7 @@ /* * Performance events: * - * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index 5cbca0bacd35..87a5491048ac 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -4,7 +4,7 @@ * * Builtin list command: list all event types * - * Copyright (C) 2009, Thomas Gleixner + * Copyright (C) 2009, Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo */ -- cgit v1.2.3 From d2bdcde9626cbea0c44a6aaa33b440c8adf81e09 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 14 Jan 2026 09:17:45 +0800 Subject: perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR With the introduction of the OMR feature, the PEBS memory auxiliary info field for load and store latency events has been restructured for DMR. The memory auxiliary info field's bit[8] indicates whether a L2 cache miss occurred for a memory load or store instruction. If bit[8] is 0, it signifies no L2 cache miss, and bits[7:0] specify the exact cache data source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent the OMR encoding, indicating the specific L3 cache or memory region involved in the memory access. A significant enhancement is OMR encoding provides up to 8 fine-grained memory regions besides the cache region. A significant enhancement for OMR encoding is the ability to provide up to 8 fine-grained memory regions in addition to the cache region, offering more detailed insights into memory access regions. For detailed information on the memory auxiliary info encoding, please refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in the ISE documentation. This patch ensures that the PEBS memory auxiliary info field is correctly interpreted and utilized in DMR. Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/ds.c | 140 ++++++++++++++++++++++++++++++++++ arch/x86/events/perf_event.h | 2 + include/uapi/linux/perf_event.h | 27 ++++++- tools/include/uapi/linux/perf_event.h | 27 ++++++- 4 files changed, 190 insertions(+), 6 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index feb1c3cf63e4..272e652f25fc 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -34,6 +34,17 @@ struct pebs_record_32 { */ +union omr_encoding { + struct { + u8 omr_source : 4; + u8 omr_remote : 1; + u8 omr_hitm : 1; + u8 omr_snoop : 1; + u8 omr_promoted : 1; + }; + u8 omr_full; +}; + union intel_x86_pebs_dse { u64 val; struct { @@ -73,6 +84,18 @@ union intel_x86_pebs_dse { unsigned int lnc_addr_blk:1; unsigned int ld_reserved6:18; }; + struct { + unsigned int pnc_dse: 8; + unsigned int pnc_l2_miss:1; + unsigned int pnc_stlb_clean_hit:1; + unsigned int pnc_stlb_any_hit:1; + unsigned int pnc_stlb_miss:1; + unsigned int pnc_locked:1; + unsigned int pnc_data_blk:1; + unsigned int pnc_addr_blk:1; + unsigned int pnc_fb_full:1; + unsigned int ld_reserved8:16; + }; }; @@ -228,6 +251,85 @@ void __init intel_pmu_pebs_data_source_lnl(void) __intel_pmu_pebs_data_source_cmt(data_source); } +/* Version for Panthercove and later */ + +/* L2 hit */ +#define PNC_PEBS_DATA_SOURCE_MAX 16 +static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = { + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: non-cache access */ + OP_LH | LEVEL(L0) | P(SNOOP, NONE), /* 0x01: L0 hit */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: L1 Miss Handling Buffer hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x04: L2 Hit Clean */ + 0, /* 0x05: Reserved */ + 0, /* 0x06: Reserved */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT), /* 0x07: L2 Hit Snoop HIT */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM), /* 0x08: L2 Hit Snoop Hit Modified */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x09: Prefetch Promotion */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x0a: Cross Core Prefetch Promotion */ + 0, /* 0x0b: Reserved */ + 0, /* 0x0c: Reserved */ + 0, /* 0x0d: Reserved */ + 0, /* 0x0e: Reserved */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ +}; + +/* L2 miss */ +#define OMR_DATA_SOURCE_MAX 16 +static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = { + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: invalid */ + 0, /* 0x01: Reserved */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE), /* 0x02: local CA shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO), /* 0x04: other CA IO agent */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE), /* 0x05: other CA shared cache */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */ + OP_LH | LEVEL(RAM) | P(REGION, MMIO), /* 0x07: MMIO */ + OP_LH | LEVEL(RAM) | P(REGION, MEM0), /* 0x08: Memory region 0 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM1), /* 0x09: Memory region 1 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM2), /* 0x0a: Memory region 2 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM3), /* 0x0b: Memory region 3 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM4), /* 0x0c: Memory region 4 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM5), /* 0x0d: Memory region 5 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM6), /* 0x0e: Memory region 6 */ + OP_LH | LEVEL(RAM) | P(REGION, MEM7), /* 0x0f: Memory region 7 */ +}; + +static u64 parse_omr_data_source(u8 dse) +{ + union omr_encoding omr; + u64 val = 0; + + omr.omr_full = dse; + val = omr_data_source[omr.omr_source]; + if (omr.omr_source > 0x1 && omr.omr_source < 0x7) + val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0; + else if (omr.omr_source > 0x7) + val |= omr.omr_remote ? P(LVL, REM_RAM1) : P(LVL, LOC_RAM); + + if (omr.omr_remote) + val |= REM; + + val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT); + + if (omr.omr_source == 0x2) { + u8 snoop = omr.omr_snoop | omr.omr_promoted; + + if (snoop == 0x0) + val |= P(SNOOP, NA); + else if (snoop == 0x1) + val |= P(SNOOP, MISS); + else if (snoop == 0x2) + val |= P(SNOOP, HIT); + else if (snoop == 0x3) + val |= P(SNOOP, NONE); + } else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) { + val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0; + } + + return val; +} + static u64 precise_store_data(u64 status) { union intel_x86_pebs_dse dse; @@ -411,6 +513,44 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status) return lnl_latency_data(event, status); } +u64 pnc_latency_data(struct perf_event *event, u64 status) +{ + union intel_x86_pebs_dse dse; + union perf_mem_data_src src; + u64 val; + + dse.val = status; + + if (!dse.pnc_l2_miss) + val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf]; + else + val = parse_omr_data_source(dse.pnc_dse); + + if (!val) + val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA); + + if (dse.pnc_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + if (dse.pnc_locked) + val |= P(LOCK, LOCKED); + + if (dse.pnc_data_blk) + val |= P(BLK, DATA); + if (dse.pnc_addr_blk) + val |= P(BLK, ADDR); + if (!dse.pnc_data_blk && !dse.pnc_addr_blk) + val |= P(BLK, NA); + + src.val = val; + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + src.mem_op = P(OP, STORE); + + return src.val; +} + static u64 load_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 586e3fdfe6d8..bd501c2a0f73 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1664,6 +1664,8 @@ u64 lnl_latency_data(struct perf_event *event, u64 status); u64 arl_h_latency_data(struct perf_event *event, u64 status); +u64 pnc_latency_data(struct perf_event *event, u64 status); + extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index c44a8fb3e418..533393ec94d0 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1330,14 +1330,16 @@ union perf_mem_data_src { mem_snoopx : 2, /* Snoop mode, ext */ mem_blk : 3, /* Access blocked */ mem_hops : 3, /* Hop level */ - mem_rsvd : 18; + mem_region : 5, /* cache/memory regions */ + mem_rsvd : 13; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd : 18, + __u64 mem_rsvd : 13, + mem_region : 5, /* cache/memory regions */ mem_hops : 3, /* Hop level */ mem_blk : 3, /* Access blocked */ mem_snoopx : 2, /* Snoop mode, ext */ @@ -1394,7 +1396,7 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ -/* 0x007 available */ +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ @@ -1447,6 +1449,25 @@ union perf_mem_data_src { /* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 +/* Cache/Memory region */ +#define PERF_MEM_REGION_NA 0x0 /* Invalid */ +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */ +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */ +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */ +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */ +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */ +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */ +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */ +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */ +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */ +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */ +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */ +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */ +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */ +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */ +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */ +#define PERF_MEM_REGION_SHIFT 46 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index c44a8fb3e418..d4b99610a3b0 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1330,14 +1330,16 @@ union perf_mem_data_src { mem_snoopx : 2, /* Snoop mode, ext */ mem_blk : 3, /* Access blocked */ mem_hops : 3, /* Hop level */ - mem_rsvd : 18; + mem_region : 5, /* cache/memory regions */ + mem_rsvd : 13; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd : 18, + __u64 mem_rsvd : 13, + mem_region : 5, /* cache/memory regions */ mem_hops : 3, /* Hop level */ mem_blk : 3, /* Access blocked */ mem_snoopx : 2, /* Snoop mode, ext */ @@ -1394,7 +1396,7 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ -/* 0x007 available */ +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ @@ -1447,6 +1449,25 @@ union perf_mem_data_src { /* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 +/* Cache/Memory region */ +#define PERF_MEM_REGION_NA 0x0 /* Invalid */ +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */ +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */ +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */ +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */ +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */ +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */ +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */ +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */ +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */ +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */ +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */ +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */ +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */ +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */ +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */ +#define PERF_MEM_REGION_SHIFT 46 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) -- cgit v1.2.3 From 2d419c44658f75e7655794341a95c0687830f3df Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:56 +0800 Subject: bpf: add fsession support The fsession is something that similar to kprobe session. It allow to attach a single BPF program to both the entry and the exit of the target functions. Introduce the struct bpf_fsession_link, which allows to add the link to both the fentry and fexit progs_hlist of the trampoline. Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 ++++++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 2 + kernel/bpf/syscall.c | 18 +++++++- kernel/bpf/trampoline.c | 53 ++++++++++++++++++---- kernel/bpf/verifier.c | 12 +++-- net/bpf/test_run.c | 1 + net/core/bpf_sk_storage.c | 1 + tools/include/uapi/linux/bpf.h | 1 + .../selftests/bpf/prog_tests/tracing_failure.c | 2 +- 10 files changed, 97 insertions(+), 13 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5936f8e2996f..41228b0add52 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1309,6 +1309,7 @@ enum bpf_tramp_prog_type { BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ + BPF_TRAMP_FSESSION, }; struct bpf_tramp_image { @@ -1875,6 +1876,11 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_fsession_link { + struct bpf_tracing_link link; + struct bpf_tramp_link fexit; +}; + struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; @@ -2169,6 +2175,19 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif +static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a2ade4be60f..44e7dbc278e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d10b3404260f..8959f3bc1e92 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6219,6 +6219,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_TRACE_FSESSION: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; @@ -6820,6 +6821,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, fallthrough; case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to * int LSM hooks, they use MODIFY_RETURN trampolines. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3c5c03d43f5f..b9184545c3fd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3577,6 +3577,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_TRACE_FSESSION && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; @@ -3626,7 +3627,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - link = kzalloc(sizeof(*link), GFP_USER); + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + struct bpf_fsession_link *fslink; + + fslink = kzalloc(sizeof(*fslink), GFP_USER); + if (fslink) { + bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type); + fslink->fexit.cookie = bpf_cookie; + link = &fslink->link; + } else { + link = NULL; + } + } else { + link = kzalloc(sizeof(*link), GFP_USER); + } if (!link) { err = -ENOMEM; goto out_put_prog; @@ -4350,6 +4365,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 2a125d063e62..edf9da43762d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -109,10 +109,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type ptype = prog->type; - return (ptype == BPF_PROG_TYPE_TRACING && - (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN)) || - (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); + switch (ptype) { + case BPF_PROG_TYPE_TRACING: + if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + return true; + return false; + case BPF_PROG_TYPE_LSM: + return eatype == BPF_LSM_MAC; + default: + return false; + } } void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) @@ -559,6 +566,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + case BPF_TRACE_FSESSION: + return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) /* The function returns void, we cannot modify its @@ -594,8 +603,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { + struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; + struct hlist_head *prog_list; int err = 0; int cnt = 0, i; @@ -621,24 +632,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } + if (kind == BPF_TRAMP_FSESSION) { + prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; + cnt++; + } else { + prog_list = &tr->progs_hlist[kind]; + } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); - tr->progs_cnt[kind]++; + hlist_add_head(&link->tramp_hlist, prog_list); + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]++; + fslink = container_of(link, struct bpf_fsession_link, link.link); + hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + tr->progs_cnt[BPF_TRAMP_FEXIT]++; + } else { + tr->progs_cnt[kind]++; + } err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]--; + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + } else { + tr->progs_cnt[kind]--; + } } return err; } @@ -672,6 +702,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; + } else if (kind == BPF_TRAMP_FSESSION) { + struct bpf_fsession_link *fslink = + container_of(link, struct bpf_fsession_link, link.link); + + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c7f5234d5fd2..41bbed6418b5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17848,6 +17848,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -23774,6 +23775,7 @@ patch_map_ops_generic: if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_TRACE_FSESSION || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); @@ -24725,7 +24727,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog->type == BPF_PROG_TYPE_TRACING && prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || - tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance @@ -24740,7 +24743,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * beyond reasonable stack size. Hence extending fentry * is not allowed. */ - bpf_log(log, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit/fsession\n"); return -EINVAL; } } else { @@ -24824,6 +24827,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -24990,6 +24994,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: + case BPF_TRACE_FSESSION: return true; default: return false; @@ -25071,9 +25076,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) tgt_info.tgt_name); return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_TRACE_FSESSION || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { - verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n", + verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", tgt_info.tgt_name); return -EINVAL; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 26cfcfdc45eb..178c4738e63b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -685,6 +685,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 850dd736ccd1..de111818f3a0 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -365,6 +365,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b816bc53d2e1..3ca7d76e05f0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c index 10e231965589..f9f9e1cb87bf 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c @@ -73,7 +73,7 @@ static void test_tracing_deny(void) static void test_fexit_noreturns(void) { test_tracing_fail_prog("fexit_noreturns", - "Attaching fexit/fmod_ret to __noreturn function 'do_exit' is rejected."); + "Attaching fexit/fsession/fmod_ret to __noreturn function 'do_exit' is rejected."); } void test_tracing_failure(void) -- cgit v1.2.3 From 752b807028e63f1473b84eb1350e131eca5e5249 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 27 Jan 2026 08:51:10 +0000 Subject: bpf: add new BPF_CGROUP_ITER_CHILDREN control option Currently, the BPF cgroup iterator supports walking descendants in either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order (BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive depth-first search (DFS) of the hierarchy. In scenarios where a BPF program may need to inspect only the direct children of a given parent cgroup, a full DFS is unnecessarily expensive. This patch introduces a new BPF cgroup iterator control option, BPF_CGROUP_ITER_CHILDREN. This control option restricts the traversal to the immediate children of a specified parent cgroup, allowing for more targeted and efficient iteration, particularly when exhaustive depth-first search (DFS) traversal is not required. Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/cgroup_iter.c | 26 +++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 8 ++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 44e7dbc278e3..c8d400b7680a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index f04a468cf6a7..fd51fe3d92cc 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -8,12 +8,13 @@ #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ -/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. +/* cgroup_iter provides five modes of traversal to the cgroup hierarchy. * * 1. Walk the descendants of a cgroup in pre-order. * 2. Walk the descendants of a cgroup in post-order. * 3. Walk the ancestors of a cgroup. * 4. Show the given cgroup only. + * 5. Walk the children of a given parent cgroup. * * For walking descendants, cgroup_iter can walk in either pre-order or * post-order. For walking ancestors, the iter walks up from a cgroup to @@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) return css_next_descendant_pre(NULL, p->start_css); else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(NULL, p->start_css); + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(NULL, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ return p->start_css; } @@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) return css_next_descendant_post(curr, p->start_css); else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return curr->parent; + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(curr, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY */ return NULL; } @@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, int order = linfo->cgroup.order; struct cgroup *cgrp; - if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && - order != BPF_CGROUP_ITER_DESCENDANTS_POST && - order != BPF_CGROUP_ITER_ANCESTORS_UP && - order != BPF_CGROUP_ITER_SELF_ONLY) + switch (order) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + case BPF_CGROUP_ITER_DESCENDANTS_POST: + case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_SELF_ONLY: + case BPF_CGROUP_ITER_CHILDREN: + break; + default: return -EINVAL; + } if (fd && id) return -EINVAL; @@ -257,6 +267,8 @@ show_order: seq_puts(seq, "order: descendants_post\n"); else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) seq_puts(seq, "order: ancestors_up\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN) + seq_puts(seq, "order: children\n"); else /* BPF_CGROUP_ITER_SELF_ONLY */ seq_puts(seq, "order: self_only\n"); } @@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, case BPF_CGROUP_ITER_DESCENDANTS_PRE: case BPF_CGROUP_ITER_DESCENDANTS_POST: case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_CHILDREN: break; default: return -EINVAL; @@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i case BPF_CGROUP_ITER_DESCENDANTS_POST: kit->pos = css_next_descendant_post(kit->pos, kit->start); break; + case BPF_CGROUP_ITER_CHILDREN: + kit->pos = css_next_child(kit->pos, kit->start); + break; case BPF_CGROUP_ITER_ANCESTORS_UP: kit->pos = kit->pos ? kit->pos->parent : kit->start; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ca7d76e05f0..5e38b4887de6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { -- cgit v1.2.3 From ecd5a2fd4c7495a1a923c754c47cdd500f5b30df Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 4 Mar 2026 11:49:24 -0300 Subject: perf beauty: Update the linux/perf_event.h copy with the kernel sources Update it as one comment got realigned, probably in a merge, so no changes in perf tooling, just silences this build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 76e9d0664d0c..fd10aa8d697f 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1396,7 +1396,7 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ -#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ -- cgit v1.2.3 From 9cd284105bb77b063b61523f62096e853b8b890b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 4 Mar 2026 12:32:19 -0300 Subject: tools headers UAPI: Sync linux/kvm.h with the kernel sources To pick the changes in: f7ab71f178d56447 ("KVM: s390: Add explicit padding to struct kvm_s390_keyop") 0ee4ddc1647b8b3b ("KVM: s390: Storage key manipulation IOCTL") fa9893fadbc245e1 ("KVM: Introduce KVM_EXIT_SNP_REQ_CERTS for SNP certificate-fetching") f174a9ffcd48d78a ("KVM: arm64: Add exit to userspace on {LD,ST}64B* outside of memslots") That just rebuilds perf, as these patches add just one new KVM ioctl, but for S390, that is not being considered by tools/perf/trace/beauty/kvm_ioctl.sh so far. This addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Please see tools/include/uapi/README for further details. Cc: Arnd Bergmann Cc: Claudio Imbrenda Cc: Marc Zyngier Cc: Michael Roth Cc: Sean Christopherson Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index dddb781b0507..65500f5db379 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -135,6 +135,12 @@ struct kvm_xen_exit { } u; }; +struct kvm_exit_snp_req_certs { + __u64 gpa; + __u64 npages; + __u64 ret; +}; + #define KVM_S390_GET_SKEYS_NONE 1 #define KVM_S390_SKEYS_MAX 1048576 @@ -180,6 +186,8 @@ struct kvm_xen_exit { #define KVM_EXIT_MEMORY_FAULT 39 #define KVM_EXIT_TDX 40 #define KVM_EXIT_ARM_SEA 41 +#define KVM_EXIT_ARM_LDST64B 42 +#define KVM_EXIT_SNP_REQ_CERTS 43 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -402,7 +410,7 @@ struct kvm_run { } eoi; /* KVM_EXIT_HYPERV */ struct kvm_hyperv_exit hyperv; - /* KVM_EXIT_ARM_NISV */ + /* KVM_EXIT_ARM_NISV / KVM_EXIT_ARM_LDST64B */ struct { __u64 esr_iss; __u64 fault_ipa; @@ -482,6 +490,8 @@ struct kvm_run { __u64 gva; __u64 gpa; } arm_sea; + /* KVM_EXIT_SNP_REQ_CERTS */ + struct kvm_exit_snp_req_certs snp_req_certs; /* Fix the size of the union. */ char padding[256]; }; @@ -974,6 +984,7 @@ struct kvm_enable_cap { #define KVM_CAP_GUEST_MEMFD_FLAGS 244 #define KVM_CAP_ARM_SEA_TO_USER 245 #define KVM_CAP_S390_USER_OPEREXEC 246 +#define KVM_CAP_S390_KEYOP 247 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1219,6 +1230,16 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; +#define KVM_S390_KEYOP_ISKE 0x01 +#define KVM_S390_KEYOP_RRBE 0x02 +#define KVM_S390_KEYOP_SSKE 0x03 +struct kvm_s390_keyop { + __u64 guest_addr; + __u8 key; + __u8 operation; + __u8 pad[6]; +}; + /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. @@ -1238,6 +1259,7 @@ struct kvm_vfio_spapr_tce { #define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping) #define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping) #define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long) +#define KVM_S390_KEYOP _IOWR(KVMIO, 0x53, struct kvm_s390_keyop) /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) -- cgit v1.2.3